[Midnightbsd-cvs] src: sys/kern: Sync with freebsd
laffer1 at midnightbsd.org
Sat Sep 27 19:02:19 EDT 2008
Log Message:
-----------
Sync with freebsd
Modified Files:
--------------
src/sys/kern:
Makefile (r1.1.1.1 -> r1.2)
bus_if.m (r1.1.1.1 -> r1.2)
imgact_aout.c (r1.2 -> r1.3)
imgact_elf.c (r1.2 -> r1.3)
imgact_gzip.c (r1.2 -> r1.3)
inflate.c (r1.1.1.1 -> r1.2)
init_main.c (r1.3 -> r1.4)
init_sysent.c (r1.2 -> r1.3)
kern_acct.c (r1.1.1.2 -> r1.2)
kern_alq.c (r1.1.1.1 -> r1.2)
kern_clock.c (r1.2 -> r1.3)
kern_condvar.c (r1.2 -> r1.3)
kern_conf.c (r1.1.1.1 -> r1.2)
kern_context.c (r1.1.1.1 -> r1.2)
kern_cpu.c (r1.2 -> r1.3)
kern_descrip.c (r1.5 -> r1.6)
kern_environment.c (r1.1.1.1 -> r1.2)
kern_event.c (r1.3 -> r1.4)
kern_exec.c (r1.2 -> r1.3)
kern_exit.c (r1.2 -> r1.3)
kern_fork.c (r1.2 -> r1.3)
kern_idle.c (r1.1.1.1 -> r1.2)
kern_intr.c (r1.2 -> r1.3)
kern_jail.c (r1.1.1.1 -> r1.2)
kern_kse.c (r1.3 -> r1.4)
kern_kthread.c (r1.2 -> r1.3)
kern_ktr.c (r1.1.1.1 -> r1.2)
kern_ktrace.c (r1.2 -> r1.3)
kern_linker.c (r1.1.1.1 -> r1.2)
kern_lock.c (r1.2 -> r1.3)
kern_lockf.c (r1.1.1.1 -> r1.2)
kern_malloc.c (r1.2 -> r1.3)
kern_mbuf.c (r1.3 -> r1.4)
kern_mib.c (r1.1.1.1 -> r1.2)
kern_module.c (r1.1.1.1 -> r1.2)
kern_mtxpool.c (r1.1.1.1 -> r1.2)
kern_mutex.c (r1.1.1.2 -> r1.2)
kern_ntptime.c (r1.1.1.1 -> r1.2)
kern_pmc.c (r1.1.1.1 -> r1.2)
kern_poll.c (r1.2 -> r1.3)
kern_proc.c (r1.2 -> r1.3)
kern_prot.c (r1.1.1.1 -> r1.2)
kern_resource.c (r1.1.1.2 -> r1.2)
kern_shutdown.c (r1.2 -> r1.3)
kern_sig.c (r1.2 -> r1.3)
kern_subr.c (r1.1.1.1 -> r1.2)
kern_switch.c (r1.1.1.1 -> r1.2)
kern_sx.c (r1.1.1.2 -> r1.2)
kern_synch.c (r1.2 -> r1.3)
kern_syscalls.c (r1.1.1.1 -> r1.2)
kern_sysctl.c (r1.2 -> r1.3)
kern_tc.c (r1.1.1.1 -> r1.2)
kern_thr.c (r1.2 -> r1.3)
kern_thread.c (r1.4 -> r1.5)
kern_time.c (r1.1.1.2 -> r1.2)
kern_timeout.c (r1.1.1.1 -> r1.2)
kern_umtx.c (r1.2 -> r1.3)
kern_uuid.c (r1.1.1.1 -> r1.2)
kern_xxx.c (r1.1.1.1 -> r1.2)
link_elf.c (r1.1.1.2 -> r1.2)
link_elf_obj.c (r1.1.1.2 -> r1.2)
makesyscalls.sh (r1.1.1.1 -> r1.2)
md5c.c (r1.1.1.1 -> r1.2)
sched_4bsd.c (r1.2 -> r1.3)
sched_ule.c (r1.1.1.1 -> r1.2)
subr_autoconf.c (r1.1.1.1 -> r1.2)
subr_bus.c (r1.1.1.1 -> r1.2)
subr_clock.c (r1.1.1.1 -> r1.2)
subr_disk.c (r1.1.1.2 -> r1.2)
subr_firmware.c (r1.1.1.1 -> r1.2)
subr_hints.c (r1.1.1.1 -> r1.2)
subr_kdb.c (r1.1.1.1 -> r1.2)
subr_kobj.c (r1.1.1.1 -> r1.2)
subr_mbpool.c (r1.1.1.1 -> r1.2)
subr_mchain.c (r1.1.1.1 -> r1.2)
subr_param.c (r1.1.1.1 -> r1.2)
subr_pcpu.c (r1.1.1.1 -> r1.2)
subr_power.c (r1.1.1.1 -> r1.2)
subr_prf.c (r1.2 -> r1.3)
subr_prof.c (r1.1.1.1 -> r1.2)
subr_rman.c (r1.1.1.2 -> r1.2)
subr_sbuf.c (r1.1.1.1 -> r1.2)
subr_sleepqueue.c (r1.2 -> r1.3)
subr_smp.c (r1.2 -> r1.3)
subr_stack.c (r1.1 -> r1.2)
subr_taskqueue.c (r1.2 -> r1.3)
subr_trap.c (r1.1.1.1 -> r1.2)
subr_turnstile.c (r1.1.1.2 -> r1.2)
subr_unit.c (r1.1.1.1 -> r1.2)
subr_witness.c (r1.3 -> r1.4)
sys_generic.c (r1.2 -> r1.3)
sys_pipe.c (r1.2 -> r1.3)
sys_process.c (r1.2 -> r1.3)
sys_socket.c (r1.1.1.1 -> r1.2)
syscalls.c (r1.2 -> r1.3)
syscalls.master (r1.2 -> r1.3)
sysv_ipc.c (r1.1.1.1 -> r1.2)
sysv_msg.c (r1.1.1.1 -> r1.2)
sysv_sem.c (r1.1.1.1 -> r1.2)
sysv_shm.c (r1.1.1.1 -> r1.2)
tty.c (r1.3 -> r1.4)
tty_compat.c (r1.1.1.1 -> r1.2)
tty_cons.c (r1.1.1.1 -> r1.2)
tty_pty.c (r1.2 -> r1.3)
tty_tty.c (r1.1.1.1 -> r1.2)
uipc_cow.c (r1.1.1.1 -> r1.2)
uipc_domain.c (r1.2 -> r1.3)
uipc_mbuf.c (r1.5 -> r1.6)
uipc_mbuf2.c (r1.1.1.1 -> r1.2)
uipc_sem.c (r1.1.1.2 -> r1.2)
uipc_socket.c (r1.2 -> r1.3)
uipc_syscalls.c (r1.3 -> r1.4)
uipc_usrreq.c (r1.2 -> r1.3)
vfs_aio.c (r1.3 -> r1.4)
vfs_bio.c (r1.6 -> r1.7)
vfs_cache.c (r1.2 -> r1.3)
vfs_cluster.c (r1.2 -> r1.3)
vfs_default.c (r1.2 -> r1.3)
vfs_export.c (r1.1.1.1 -> r1.2)
vfs_hash.c (r1.1.1.1 -> r1.2)
vfs_init.c (r1.1.1.1 -> r1.2)
vfs_lookup.c (r1.2 -> r1.3)
vfs_mount.c (r1.4 -> r1.5)
vfs_subr.c (r1.2 -> r1.3)
vfs_syscalls.c (r1.2 -> r1.3)
vfs_vnops.c (r1.2 -> r1.3)
vnode_if.src (r1.1.1.1 -> r1.2)
Added Files:
-----------
src/sys/kern:
kern_priv.c (r1.1)
kern_rwlock.c (r1.1)
ksched.c (r1.1)
p1003_1b.c (r1.1)
posix4_mib.c (r1.1)
serdev_if.m (r1.1)
subr_acl_posix1e.c (r1.1)
subr_fattime.c (r1.1)
subr_lock.c (r1.1)
subr_rtc.c (r1.1)
systrace_args.c (r1.1)
tty_pts.c (r1.1)
uipc_debug.c (r1.1)
uipc_mqueue.c (r1.1)
uipc_sockbuf.c (r1.1)
vfs_acl.c (r1.1)
vfs_extattr.c (r1.1)
Removed Files:
-------------
src/sys/kern:
kern_acl.c
kern_mac.c
uipc_proto.c
uipc_socket2.c
-------------- next part --------------
--- /dev/null
+++ sys/kern/uipc_debug.c
@@ -0,0 +1,522 @@
+/*-
+ * Copyright (c) 2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Debugger routines relating to sockets, protocols, etc, for use in DDB.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/uipc_debug.c,v 1.2 2007/05/03 14:42:41 rwatson Exp $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void
+db_print_sotype(short so_type)
+{
+
+ switch (so_type) {
+ case SOCK_STREAM:
+ db_printf("SOCK_STREAM");
+ break;
+
+ case SOCK_DGRAM:
+ db_printf("SOCK_DGRAM");
+ break;
+
+ case SOCK_RAW:
+ db_printf("SOCK_RAW");
+ break;
+
+ case SOCK_RDM:
+ db_printf("SOCK_RDM");
+ break;
+
+ case SOCK_SEQPACKET:
+ db_printf("SOCK_SEQPACKET");
+ break;
+
+ default:
+ db_printf("unknown");
+ break;
+ }
+}
+
+static void
+db_print_sooptions(short so_options)
+{
+ int comma;
+
+ comma = 0;
+ if (so_options & SO_DEBUG) {
+ db_printf("%sSO_DEBUG", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_ACCEPTCONN) {
+ db_printf("%sSO_ACCEPTCONN", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEADDR) {
+ db_printf("%sSO_REUSEADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_KEEPALIVE) {
+ db_printf("%sSO_KEEPALIVE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_DONTROUTE) {
+ db_printf("%sSO_DONTROUTE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_BROADCAST) {
+ db_printf("%sSO_BROADCAST", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_USELOOPBACK) {
+ db_printf("%sSO_USELOOPBACK", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_LINGER) {
+ db_printf("%sSO_LINGER", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_OOBINLINE) {
+ db_printf("%sSO_OOBINLINE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEPORT) {
+ db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_TIMESTAMP) {
+ db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NOSIGPIPE) {
+ db_printf("%sSO_NOSIGPIPE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_ACCEPTFILTER) {
+ db_printf("%sSO_ACCEPTFILTER", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_BINTIME) {
+ db_printf("%sSO_BINTIME", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sostate(short so_state)
+{
+ int comma;
+
+ comma = 0;
+ if (so_state & SS_NOFDREF) {
+ db_printf("%sSS_FDREF", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONNECTED) {
+ db_printf("%sSS_ISCONNECTED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONNECTING) {
+ db_printf("%sSS_ISCONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISDISCONNECTING) {
+ db_printf("%sSS_ISDISCONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_NBIO) {
+ db_printf("%sSS_NBIO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ASYNC) {
+ db_printf("%sSS_ASYNC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONFIRMING) {
+ db_printf("%sSS_ISCONFIRMING", comma ? ", " : "");
+ comma = 1;
+ }
+ comma = 0;
+ if (so_state & SS_PROTOREF) {
+ db_printf("%sSS_PROTOREF", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_soqstate(int so_qstate)
+{
+ int comma;
+
+ comma = 0;
+ if (so_qstate & SQ_INCOMP) {
+ db_printf("%sSQ_INCOMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_qstate & SQ_COMP) {
+ db_printf("%sSQ_COMP", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sbstate(short sb_state)
+{
+ int comma;
+
+ comma = 0;
+ if (sb_state & SBS_CANTSENDMORE) {
+ db_printf("%sSS_CANTSENDMORE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_state & SBS_CANTRCVMORE) {
+ db_printf("%sSS_CANTRCVMORE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_state & SBS_RCVATMARK) {
+ db_printf("%sSS_RCVATMARK", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_domain(struct domain *d, const char *domainname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", domainname, d);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("dom_family: %d ", d->dom_family);
+ db_printf("dom_name: %s\n", d->dom_name);
+
+ db_print_indent(indent);
+ db_printf("dom_init: %p ", d->dom_init);
+ db_printf("dom_externalize: %p ", d->dom_externalize);
+ db_printf("dom_dispose: %p\n", d->dom_dispose);
+
+ db_print_indent(indent);
+ db_printf("dom_protosw: %p ", d->dom_protosw);
+ db_printf("dom_next: %p\n", d->dom_next);
+
+ db_print_indent(indent);
+ db_printf("dom_rtattach: %p ", d->dom_rtattach);
+ db_printf("dom_rtoffset: %d ", d->dom_rtoffset);
+ db_printf("dom_maxrtkey: %d\n", d->dom_maxrtkey);
+
+ db_print_indent(indent);
+ db_printf("dom_ifattach: %p ", d->dom_ifattach);
+ db_printf("dom_ifdetach: %p\n", d->dom_ifdetach);
+}
+
+static void
+db_print_prflags(short pr_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (pr_flags & PR_ATOMIC) {
+ db_printf("%sPR_ATOMIC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_ADDR) {
+ db_printf("%sPR_ADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_CONNREQUIRED) {
+ db_printf("%sPR_CONNREQUIRED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_WANTRCVD) {
+ db_printf("%sPR_WANTRCVD", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_RIGHTS) {
+ db_printf("%sPR_RIGHTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_IMPLOPCL) {
+ db_printf("%sPR_IMPLOPCL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_LASTHDR) {
+ db_printf("%sPR_LASTHDR", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_protosw(struct protosw *pr, const char *prname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", prname, pr);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("pr_type: %d ", pr->pr_type);
+ db_printf("pr_domain: %p\n", pr->pr_domain);
+ if (pr->pr_domain != NULL)
+ db_print_domain(pr->pr_domain, "pr_domain", indent);
+
+ db_print_indent(indent);
+ db_printf("pr_protocol: %d\n", pr->pr_protocol);
+
+ db_print_indent(indent);
+ db_printf("pr_flags: %d (", pr->pr_flags);
+ db_print_prflags(pr->pr_flags);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("pr_input: %p ", pr->pr_input);
+ db_printf("pr_output: %p ", pr->pr_output);
+ db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput);
+
+ db_print_indent(indent);
+ db_printf("pr_ctloutput: %p ", pr->pr_ctloutput);
+ db_printf("pr_ousrreq: %p ", pr->pr_ousrreq);
+ db_printf("pr_init: %p\n", pr->pr_init);
+
+ db_print_indent(indent);
+ db_printf("pr_fasttimo: %p ", pr->pr_fasttimo);
+ db_printf("pr_slowtimo: %p ", pr->pr_slowtimo);
+ db_printf("pr_drain: %p\n", pr->pr_drain);
+
+ db_print_indent(indent);
+ db_printf("pr_ousrreq: %p\n", pr->pr_ousrreq);
+}
+
+static void
+db_print_sbflags(short sb_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (sb_flags & SB_WAIT) {
+ db_printf("%sSB_WAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_SEL) {
+ db_printf("%sSB_SEL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_ASYNC) {
+ db_printf("%sSB_ASYNC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_UPCALL) {
+ db_printf("%sSB_UPCALL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_NOINTR) {
+ db_printf("%sSB_NOINTR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_AIO) {
+ db_printf("%sSB_AIO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_KNOTE) {
+ db_printf("%sSB_KNOTE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_AUTOSIZE) {
+ db_printf("%sSB_AUTOSIZE", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", sockbufname, sb);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("sb_state: 0x%x (", sb->sb_state);
+ db_print_sbstate(sb->sb_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("sb_mb: %p ", sb->sb_mb);
+ db_printf("sb_mbtail: %p ", sb->sb_mbtail);
+ db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord);
+
+ db_print_indent(indent);
+ db_printf("sb_cc: %d ", sb->sb_cc);
+ db_printf("sb_hiwat: %d ", sb->sb_hiwat);
+ db_printf("sb_mbcnt: %d ", sb->sb_mbcnt);
+ db_printf("sb_mbmax: %d\n", sb->sb_mbmax);
+
+ db_print_indent(indent);
+ db_printf("sb_ctl: %d ", sb->sb_ctl);
+ db_printf("sb_lowat: %d ", sb->sb_lowat);
+ db_printf("sb_timeo: %d\n", sb->sb_timeo);
+
+ db_print_indent(indent);
+ db_printf("sb_flags: 0x%x (", sb->sb_flags);
+ db_print_sbflags(sb->sb_flags);
+ db_printf(")\n");
+}
+
+static void
+db_print_socket(struct socket *so, const char *socketname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", socketname, so);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("so_count: %d ", so->so_count);
+ db_printf("so_type: %d (", so->so_type);
+ db_print_sotype(so->so_type);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_options: 0x%x (", so->so_options);
+ db_print_sooptions(so->so_options);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_linger: %d ", so->so_linger);
+ db_printf("so_state: 0x%x (", so->so_state);
+ db_print_sostate(so->so_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_qstate: 0x%x (", so->so_qstate);
+ db_print_soqstate(so->so_qstate);
+ db_printf(") ");
+ db_printf("so_pcb: %p ", so->so_pcb);
+ db_printf("so_proto: %p\n", so->so_proto);
+
+ if (so->so_proto != NULL)
+ db_print_protosw(so->so_proto, "so_proto", indent);
+
+ db_print_indent(indent);
+ db_printf("so_head: %p ", so->so_head);
+ db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp));
+ db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp));
+
+ db_print_indent(indent);
+ /* so_list skipped */
+ db_printf("so_qlen: %d ", so->so_qlen);
+ db_printf("so_incqlen: %d ", so->so_incqlen);
+ db_printf("so_qlimit: %d ", so->so_qlimit);
+ db_printf("so_timeo: %d ", so->so_timeo);
+ db_printf("so_error: %d\n", so->so_error);
+
+ db_print_indent(indent);
+ db_printf("so_sigio: %p ", so->so_sigio);
+ db_printf("so_oobmark: %lu ", so->so_oobmark);
+ db_printf("so_aiojobq first: %p\n", TAILQ_FIRST(&so->so_aiojobq));
+
+ db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
+ db_print_sockbuf(&so->so_snd, "so_snd", indent);
+}
+
+DB_SHOW_COMMAND(socket, db_show_socket)
+{
+ struct socket *so;
+
+ if (!have_addr) {
+ db_printf("usage: show socket <addr>\n");
+ return;
+ }
+ so = (struct socket *)addr;
+
+ db_print_socket(so, "socket", 0);
+}
+
+DB_SHOW_COMMAND(sockbuf, db_show_sockbuf)
+{
+ struct sockbuf *sb;
+
+ if (!have_addr) {
+ db_printf("usage: show sockbuf <addr>\n");
+ return;
+ }
+ sb = (struct sockbuf *)addr;
+
+ db_print_sockbuf(sb, "sockbuf", 0);
+}
+
+DB_SHOW_COMMAND(protosw, db_show_protosw)
+{
+ struct protosw *pr;
+
+ if (!have_addr) {
+ db_printf("usage: show protosw <addr>\n");
+ return;
+ }
+ pr = (struct protosw *)addr;
+
+ db_print_protosw(pr, "protosw", 0);
+}
+
+DB_SHOW_COMMAND(domain, db_show_domain)
+{
+ struct domain *d;
+
+ if (!have_addr) {
+ db_printf("usage: show protosw <addr>\n");
+ return;
+ }
+ d = (struct domain *)addr;
+
+ db_print_domain(d, "domain", 0);
+}
+#endif
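The new uipc_debug.c above adds DDB "show socket", "show sockbuf", "show protosw" and "show domain" commands for inspecting socket-layer state from the kernel debugger. For context, a minimal sketch of the DB_SHOW_COMMAND pattern the file uses, written for a hypothetical structure (the "foo" names are illustrative only and not part of this commit):

#ifdef DDB
#include <ddb/ddb.h>

struct foo {
    int foo_refs;
    int foo_flags;
};

/* Invoked from the DDB prompt as "show foo <addr>". */
DB_SHOW_COMMAND(foo, db_show_foo)
{
    struct foo *fp;

    if (!have_addr) {
        db_printf("usage: show foo <addr>\n");
        return;
    }
    fp = (struct foo *)addr;    /* addr is the address typed at the prompt */
    db_printf("foo at %p: refs %d flags 0x%x\n",
        fp, fp->foo_refs, fp->foo_flags);
}
#endif

The commands added by this file are used the same way from the debugger prompt, e.g. "show socket 0xc1234567".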
Index: subr_smp.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_smp.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_smp.c -L sys/kern/subr_smp.c -u -r1.2 -r1.3
--- sys/kern/subr_smp.c
+++ sys/kern/subr_smp.c
@@ -33,9 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_smp.c,v 1.196 2005/06/30 03:38:10 peter Exp $");
-
-#include "opt_kdb.h"
+__FBSDID("$FreeBSD: src/sys/kern/subr_smp.c,v 1.201 2007/09/11 22:54:09 attilio Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,6 +47,7 @@
#include <sys/smp.h>
#include <sys/sysctl.h>
+#include <machine/cpu.h>
#include <machine/smp.h>
#include "opt_sched.h"
@@ -109,7 +108,7 @@
static void (*smp_rv_action_func)(void *arg);
static void (*smp_rv_teardown_func)(void *arg);
static void *smp_rv_func_arg;
-static volatile int smp_rv_waiters[2];
+static volatile int smp_rv_waiters[3];
/*
* Shared mutex to restrict busywaits between smp_rendezvous() and
@@ -145,11 +144,11 @@
mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
cpu_mp_start();
- printf("MidnightBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
+ printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
mp_ncpus);
cpu_mp_announce();
}
-SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_SECOND, mp_start, NULL)
+SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL)
void
forward_signal(struct thread *td)
@@ -161,7 +160,7 @@
* this thread, so all we need to do is poke it if it is currently
* executing so that it executes ast().
*/
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(TD_IS_RUNNING(td),
("forward_signal: thread is not TDS_RUNNING"));
@@ -189,8 +188,6 @@
struct thread *td;
cpumask_t id, map, me;
- mtx_assert(&sched_lock, MA_OWNED);
-
CTR0(KTR_SMP, "forward_roundrobin()");
if (!smp_started || cold || panicstr)
@@ -203,7 +200,7 @@
td = pc->pc_curthread;
id = pc->pc_cpumask;
if (id != me && (id & stopped_cpus) == 0 &&
- td != pc->pc_idlethread) {
+ !TD_IS_IDLETHREAD(td)) {
td->td_flags |= TDF_NEEDRESCHED;
map |= id;
}
@@ -242,37 +239,9 @@
ipi_selected(map, IPI_STOP);
i = 0;
- while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
- /* spin */
- i++;
-#ifdef DIAGNOSTIC
- if (i == 100000) {
- printf("timeout stopping cpus\n");
- break;
- }
-#endif
- }
-
- return 1;
-}
-
-#ifdef KDB_STOP_NMI
-int
-stop_cpus_nmi(cpumask_t map)
-{
- int i;
-
- if (!smp_started)
- return 0;
-
- CTR1(KTR_SMP, "stop_cpus(%x)", map);
-
- /* send the stop IPI to all CPUs in map */
- ipi_nmi_selected(map);
-
- i = 0;
- while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+ while ((stopped_cpus & map) != map) {
/* spin */
+ cpu_spinwait();
i++;
#ifdef DIAGNOSTIC
if (i == 100000) {
@@ -284,7 +253,6 @@
return 1;
}
-#endif /* KDB_STOP_NMI */
/*
* Called by a CPU to restart stopped CPUs.
@@ -312,8 +280,8 @@
atomic_store_rel_int(&started_cpus, map);
/* wait for each to clear its bit */
- while ((atomic_load_acq_int(&stopped_cpus) & map) != 0)
- ; /* nothing */
+ while ((stopped_cpus & map) != 0)
+ cpu_spinwait();
return 1;
}
@@ -331,20 +299,29 @@
smp_rendezvous_action(void)
{
+ /* Ensure we have up-to-date values. */
+ atomic_add_acq_int(&smp_rv_waiters[0], 1);
+ while (smp_rv_waiters[0] < mp_ncpus)
+ cpu_spinwait();
+
/* setup function */
if (smp_rv_setup_func != NULL)
smp_rv_setup_func(smp_rv_func_arg);
+
/* spin on entry rendezvous */
- atomic_add_int(&smp_rv_waiters[0], 1);
- while (atomic_load_acq_int(&smp_rv_waiters[0]) < mp_ncpus)
- ; /* nothing */
+ atomic_add_int(&smp_rv_waiters[1], 1);
+ while (smp_rv_waiters[1] < mp_ncpus)
+ cpu_spinwait();
+
/* action function */
if (smp_rv_action_func != NULL)
smp_rv_action_func(smp_rv_func_arg);
+
/* spin on exit rendezvous */
- atomic_add_int(&smp_rv_waiters[1], 1);
- while (atomic_load_acq_int(&smp_rv_waiters[1]) < mp_ncpus)
- ; /* nothing */
+ atomic_add_int(&smp_rv_waiters[2], 1);
+ while (smp_rv_waiters[2] < mp_ncpus)
+ cpu_spinwait();
+
/* teardown function */
if (smp_rv_teardown_func != NULL)
smp_rv_teardown_func(smp_rv_func_arg);
@@ -375,8 +352,9 @@
smp_rv_action_func = action_func;
smp_rv_teardown_func = teardown_func;
smp_rv_func_arg = arg;
- smp_rv_waiters[0] = 0;
smp_rv_waiters[1] = 0;
+ smp_rv_waiters[2] = 0;
+ atomic_store_rel_int(&smp_rv_waiters[0], 0);
/* signal other processors, which will enter the IPI with interrupts off */
ipi_all_but_self(IPI_RENDEZVOUS);
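The subr_smp.c changes above add a third smp_rv_waiters[] stage so each CPU synchronizes with acquire semantics before the setup function runs, replace the sched_lock assertions with per-thread lock assertions, and use cpu_spinwait() in the busy-wait loops. For context, a sketch of how a typical caller drives smp_rendezvous(); the callback names here are illustrative, not from this commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <machine/atomic.h>

static u_int rendezvous_hits;

static void
count_cpu(void *arg)
{
    u_int *counter = arg;

    atomic_add_int(counter, 1);     /* runs once on every CPU */
}

static void
count_all_cpus(void)
{

    rendezvous_hits = 0;
    smp_rendezvous(NULL,            /* no setup function */
        count_cpu,                  /* action, run on all CPUs in lockstep */
        NULL,                       /* no teardown function */
        &rendezvous_hits);
    /*
     * Every CPU has passed the exit rendezvous (and hence run the
     * action) by the time smp_rendezvous() returns, so rendezvous_hits
     * equals mp_ncpus here.
     */
}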
Index: subr_mbpool.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_mbpool.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_mbpool.c -L sys/kern/subr_mbpool.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_mbpool.c
+++ sys/kern/subr_mbpool.c
@@ -28,7 +28,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_mbpool.c,v 1.3 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_mbpool.c,v 1.4 2007/05/27 17:38:36 rwatson Exp $");
#include <sys/param.h>
#include <sys/lock.h>
@@ -338,7 +338,7 @@
}
mtx_lock(&p->free_lock);
SLIST_FOREACH(cf, &p->free_list, link)
- *free++;
+ (*free)++;
mtx_unlock(&p->free_lock);
}
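The subr_mbpool.c change above is a one-token precedence fix: in the free-buffer counting loop, *free++ parses as *(free++), which advances the caller's pointer and discards the loaded value, whereas (*free)++ increments the counter the pointer refers to. A standalone userland illustration of the difference (not from the commit; the variable is renamed so it does not shadow free(3)):

#include <stdio.h>

int
main(void)
{
    int count = 0;
    int *cnt = &count;

    /*
     * "*cnt++" would parse as "*(cnt++)": the pointer advances and the
     * value read is thrown away, so count would remain 0.
     */
    (*cnt)++;                       /* increments what cnt points at */
    (*cnt)++;
    printf("count = %d\n", count);  /* prints "count = 2" */
    return (0);
}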
Index: kern_conf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_conf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_conf.c -L sys/kern/kern_conf.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_conf.c
+++ sys/kern/kern_conf.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_conf.c,v 1.186.2.5 2005/11/06 15:58:06 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_conf.c,v 1.208.2.1 2007/12/07 03:45:16 thompsa Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -39,9 +39,11 @@
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/poll.h>
+#include <sys/sx.h>
#include <sys/ctype.h>
#include <sys/tty.h>
#include <sys/ucred.h>
+#include <sys/taskqueue.h>
#include <machine/stdarg.h>
#include <fs/devfs/devfs_int.h>
@@ -50,9 +52,15 @@
struct mtx devmtx;
static void destroy_devl(struct cdev *dev);
-static struct cdev *make_dev_credv(struct cdevsw *devsw, int minornr,
- struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
- va_list ap);
+static int destroy_dev_sched_cbl(struct cdev *dev,
+ void (*cb)(void *), void *arg);
+static struct cdev *make_dev_credv(int flags,
+ struct cdevsw *devsw, int minornr,
+ struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
+ va_list ap);
+
+static struct cdev_priv_list cdevp_free_list =
+ TAILQ_HEAD_INITIALIZER(cdevp_free_list);
void
dev_lock(void)
@@ -61,6 +69,31 @@
mtx_lock(&devmtx);
}
+static void
+dev_unlock_and_free(void)
+{
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ while ((cdp = TAILQ_FIRST(&cdevp_free_list)) != NULL) {
+ TAILQ_REMOVE(&cdevp_free_list, cdp, cdp_list);
+ mtx_unlock(&devmtx);
+ devfs_free(&cdp->cdp_c);
+ mtx_lock(&devmtx);
+ }
+ mtx_unlock(&devmtx);
+}
+
+static void
+dev_free_devlocked(struct cdev *cdev)
+{
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ cdp = cdev->si_priv;
+ TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list);
+}
+
void
dev_unlock(void)
{
@@ -102,7 +135,7 @@
;
else
#endif
-if (dev->si_devsw == NULL && dev->si_refcount == 0) {
+ if (dev->si_devsw == NULL && dev->si_refcount == 0) {
LIST_REMOVE(dev, si_list);
flag = 1;
}
@@ -115,12 +148,40 @@
dev_refthread(struct cdev *dev)
{
struct cdevsw *csw;
+ struct cdev_priv *cdp;
mtx_assert(&devmtx, MA_NOTOWNED);
dev_lock();
csw = dev->si_devsw;
- if (csw != NULL)
- dev->si_threadcount++;
+ if (csw != NULL) {
+ cdp = dev->si_priv;
+ if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0)
+ dev->si_threadcount++;
+ else
+ csw = NULL;
+ }
+ dev_unlock();
+ return (csw);
+}
+
+struct cdevsw *
+devvn_refthread(struct vnode *vp, struct cdev **devp)
+{
+ struct cdevsw *csw;
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ csw = NULL;
+ dev_lock();
+ *devp = vp->v_rdev;
+ if (*devp != NULL) {
+ cdp = (*devp)->si_priv;
+ if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) {
+ csw = (*devp)->si_devsw;
+ if (csw != NULL)
+ (*devp)->si_threadcount++;
+ }
+ }
dev_unlock();
return (csw);
}
@@ -246,13 +307,13 @@
}
static int
-giant_fdopen(struct cdev *dev, int oflags, struct thread *td, int fdidx)
+giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp)
{
int retval;
mtx_lock(&Giant);
retval = dev->si_devsw->d_gianttrick->
- d_fdopen(dev, oflags, td, fdidx);
+ d_fdopen(dev, oflags, td, fp);
mtx_unlock(&Giant);
return (retval);
}
@@ -399,7 +460,7 @@
udev = y;
LIST_FOREACH(si2, &csw->d_devs, si_list) {
if (si2->si_drv0 == udev) {
- devfs_free(si);
+ dev_free_devlocked(si);
return (si2);
}
}
@@ -449,7 +510,8 @@
if (devsw->d_version != D_VERSION_01) {
printf(
"WARNING: Device driver \"%s\" has wrong version %s\n",
- devsw->d_name, "and is disabled. Recompile KLD module.");
+ devsw->d_name == NULL ? "???" : devsw->d_name,
+ "and is disabled. Recompile KLD module.");
devsw->d_open = dead_open;
devsw->d_close = dead_close;
devsw->d_read = dead_read;
@@ -507,8 +569,9 @@
dev_unlock();
}
-static struct cdev *
-make_dev_credv(struct cdevsw *devsw, int minornr, struct ucred *cr, uid_t uid,
+struct cdev *
+make_dev_credv(int flags, struct cdevsw *devsw, int minornr,
+ struct ucred *cr, uid_t uid,
gid_t gid, int mode, const char *fmt, va_list ap)
{
struct cdev *dev;
@@ -522,6 +585,8 @@
dev = devfs_alloc();
dev_lock();
dev = newdev(devsw, minornr, dev);
+ if (flags & MAKEDEV_REF)
+ dev_refl(dev);
if (dev->si_flags & SI_CHEAPCLONE &&
dev->si_flags & SI_NAMED) {
/*
@@ -529,7 +594,7 @@
* simplifies cloning devices.
* XXX: still ??
*/
- dev_unlock();
+ dev_unlock_and_free();
return (dev);
}
KASSERT(!(dev->si_flags & SI_NAMED),
@@ -543,15 +608,18 @@
}
dev->si_flags |= SI_NAMED;
+#ifdef MAC
if (cr != NULL)
dev->si_cred = crhold(cr);
else
+#endif
dev->si_cred = NULL;
dev->si_uid = uid;
dev->si_gid = gid;
dev->si_mode = mode;
devfs_create(dev);
+ clean_unrhdrl(devfs_inos);
dev_unlock();
return (dev);
}
@@ -564,7 +632,7 @@
va_list ap;
va_start(ap, fmt);
- dev = make_dev_credv(devsw, minornr, NULL, uid, gid, mode, fmt, ap);
+ dev = make_dev_credv(0, devsw, minornr, NULL, uid, gid, mode, fmt, ap);
va_end(ap);
return (dev);
}
@@ -577,7 +645,23 @@
va_list ap;
va_start(ap, fmt);
- dev = make_dev_credv(devsw, minornr, cr, uid, gid, mode, fmt, ap);
+ dev = make_dev_credv(0, devsw, minornr, cr, uid, gid, mode, fmt, ap);
+ va_end(ap);
+
+ return (dev);
+}
+
+struct cdev *
+make_dev_credf(int flags, struct cdevsw *devsw, int minornr,
+ struct ucred *cr, uid_t uid,
+ gid_t gid, int mode, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+
+ va_start(ap, fmt);
+ dev = make_dev_credv(flags, devsw, minornr, cr, uid, gid, mode,
+ fmt, ap);
va_end(ap);
return (dev);
@@ -622,6 +706,7 @@
va_end(ap);
devfs_create(dev);
+ clean_unrhdrl(devfs_inos);
dev_unlock();
dev_depends(pdev, dev);
return (dev);
@@ -635,7 +720,7 @@
mtx_assert(&devmtx, MA_OWNED);
KASSERT(dev->si_flags & SI_NAMED,
("WARNING: Driver mistake: destroy_dev on %d\n", minor(dev)));
-
+
devfs_destroy(dev);
/* Remove name marking */
@@ -657,16 +742,20 @@
dev->si_flags &= ~SI_CLONELIST;
}
+ dev->si_refcount++; /* Avoid race with dev_rel() */
csw = dev->si_devsw;
dev->si_devsw = NULL; /* already NULL for SI_ALIAS */
while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) {
- printf("Purging %lu threads from %s\n",
- dev->si_threadcount, devtoname(dev));
csw->d_purge(dev);
msleep(csw, &devmtx, PRIBIO, "devprg", hz/10);
+ if (dev->si_threadcount)
+ printf("Still %lu threads in %s\n",
+ dev->si_threadcount, devtoname(dev));
+ }
+ while (dev->si_threadcount != 0) {
+ /* Use unique dummy wait ident */
+ msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10);
}
- if (csw != NULL && csw->d_purge != NULL)
- printf("All threads purged from %s\n", devtoname(dev));
dev->si_drv1 = 0;
dev->si_drv2 = 0;
@@ -677,15 +766,18 @@
LIST_REMOVE(dev, si_list);
/* If cdevsw has no more struct cdev *'s, clean it */
- if (LIST_EMPTY(&csw->d_devs))
+ if (LIST_EMPTY(&csw->d_devs)) {
fini_cdevsw(csw);
+ wakeup(&csw->d_devs);
+ }
}
dev->si_flags &= ~SI_ALIAS;
+ dev->si_refcount--; /* Avoid race with dev_rel() */
if (dev->si_refcount > 0) {
LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list);
} else {
- devfs_free(dev);
+ dev_free_devlocked(dev);
}
}
@@ -695,7 +787,7 @@
dev_lock();
destroy_devl(dev);
- dev_unlock();
+ dev_unlock_and_free();
}
const char *
@@ -779,7 +871,7 @@
}
int
-clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, u_int extra)
+clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, int extra)
{
struct clonedevs *cd;
struct cdev *dev, *ndev, *dl, *de;
@@ -815,8 +907,8 @@
u = dev2unit(dev);
if (u == (unit | extra)) {
*dp = dev;
- devfs_free(ndev);
dev_unlock();
+ devfs_free(ndev);
return (0);
}
if (unit == -1 && u == low) {
@@ -852,7 +944,7 @@
LIST_INSERT_HEAD(&cd->head, dev, si_clone);
dev->si_flags |= SI_CLONELIST;
*up = unit;
- dev_unlock();
+ dev_unlock_and_free();
return (1);
}
@@ -863,21 +955,126 @@
void
clone_cleanup(struct clonedevs **cdp)
{
- struct cdev *dev, *tdev;
+ struct cdev *dev;
+ struct cdev_priv *cp;
struct clonedevs *cd;
cd = *cdp;
if (cd == NULL)
return;
dev_lock();
- LIST_FOREACH_SAFE(dev, &cd->head, si_clone, tdev) {
+ while (!LIST_EMPTY(&cd->head)) {
+ dev = LIST_FIRST(&cd->head);
+ LIST_REMOVE(dev, si_clone);
KASSERT(dev->si_flags & SI_CLONELIST,
("Dev %p(%s) should be on clonelist", dev, dev->si_name));
- KASSERT(dev->si_flags & SI_NAMED,
- ("Driver has goofed in cloning underways udev %x", dev->si_drv0));
- destroy_devl(dev);
+ dev->si_flags &= ~SI_CLONELIST;
+ cp = dev->si_priv;
+ if (!(cp->cdp_flags & CDP_SCHED_DTR)) {
+ cp->cdp_flags |= CDP_SCHED_DTR;
+ KASSERT(dev->si_flags & SI_NAMED,
+ ("Driver has goofed in cloning underways udev %x", dev->si_drv0));
+ destroy_devl(dev);
+ }
}
dev_unlock();
free(cd, M_DEVBUF);
*cdp = NULL;
}
+
+static TAILQ_HEAD(, cdev_priv) dev_ddtr =
+ TAILQ_HEAD_INITIALIZER(dev_ddtr);
+static struct task dev_dtr_task;
+
+static void
+destroy_dev_tq(void *ctx, int pending)
+{
+ struct cdev_priv *cp;
+ struct cdev *dev;
+ void (*cb)(void *);
+ void *cb_arg;
+
+ dev_lock();
+ while (!TAILQ_EMPTY(&dev_ddtr)) {
+ cp = TAILQ_FIRST(&dev_ddtr);
+ dev = &cp->cdp_c;
+ KASSERT(cp->cdp_flags & CDP_SCHED_DTR,
+ ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp));
+ TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list);
+ cb = cp->cdp_dtr_cb;
+ cb_arg = cp->cdp_dtr_cb_arg;
+ destroy_devl(dev);
+ dev_unlock();
+ dev_rel(dev);
+ if (cb != NULL)
+ cb(cb_arg);
+ dev_lock();
+ }
+ dev_unlock();
+}
+
+/*
+ * devmtx shall be locked on entry. devmtx will be unlocked after
+ * function return.
+ */
+static int
+destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+ struct cdev_priv *cp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ cp = dev->si_priv;
+ if (cp->cdp_flags & CDP_SCHED_DTR) {
+ dev_unlock();
+ return (0);
+ }
+ dev_refl(dev);
+ cp->cdp_flags |= CDP_SCHED_DTR;
+ cp->cdp_dtr_cb = cb;
+ cp->cdp_dtr_cb_arg = arg;
+ TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list);
+ dev_unlock();
+ taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task);
+ return (1);
+}
+
+int
+destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+ dev_lock();
+ return (destroy_dev_sched_cbl(dev, cb, arg));
+}
+
+int
+destroy_dev_sched(struct cdev *dev)
+{
+ return (destroy_dev_sched_cb(dev, NULL, NULL));
+}
+
+void
+destroy_dev_drain(struct cdevsw *csw)
+{
+
+ dev_lock();
+ while (!LIST_EMPTY(&csw->d_devs)) {
+ msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10);
+ }
+ dev_unlock();
+}
+
+void
+drain_dev_clone_events(void)
+{
+
+ sx_xlock(&clone_drain_lock);
+ sx_xunlock(&clone_drain_lock);
+}
+
+static void
+devdtr_init(void *dummy __unused)
+{
+
+ TASK_INIT(&dev_dtr_task, 0, destroy_dev_tq, NULL);
+}
+
+SYSINIT(devdtr, SI_SUB_DEVFS, SI_ORDER_SECOND, devdtr_init, NULL);
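The kern_conf.c changes above introduce deferred cdev destruction: destroy_dev_sched() and destroy_dev_sched_cb() mark the device CDP_SCHED_DTR, queue it for a taskqueue, and run destroy_devl() plus an optional callback later, while destroy_dev_drain() lets a driver wait until all of its cdevs are gone. A sketch of how a driver might use the new interface; the "mydrv" names are hypothetical and not part of this commit, and the usual <sys/conf.h>/<sys/malloc.h> includes are assumed:

struct mydrv_softc {
    struct cdev *sc_dev;
    /* ... driver state ... */
};

static void
mydrv_dtr(void *arg)
{
    struct mydrv_softc *sc = arg;

    free(sc, M_DEVBUF);     /* runs only after the cdev is fully destroyed */
}

static void
mydrv_teardown(struct mydrv_softc *sc)
{

    /*
     * Intended to be safe even from the device's own cdevsw methods:
     * the destroy_dev() work is deferred to a taskqueue instead of
     * sleeping here for the device's remaining threads.
     */
    destroy_dev_sched_cb(sc->sc_dev, mydrv_dtr, sc);
}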
Index: kern_mbuf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mbuf.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/kern_mbuf.c -L sys/kern/kern_mbuf.c -u -r1.3 -r1.4
--- sys/kern/kern_mbuf.c
+++ sys/kern/kern_mbuf.c
@@ -26,13 +26,12 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mbuf.c,v 1.9.2.8 2006/05/16 07:27:48 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mbuf.c,v 1.32.2.1 2007/12/15 23:16:04 rrs Exp $");
#include "opt_mac.h"
#include "opt_param.h"
#include <sys/param.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
@@ -43,6 +42,8 @@
#include <sys/smp.h>
#include <sys/sysctl.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
@@ -106,19 +107,98 @@
/* This has to be done before VM init. */
nmbclusters = 1024 + maxusers * 64;
+ nmbjumbop = nmbclusters / 2;
+ nmbjumbo9 = nmbjumbop / 2;
+ nmbjumbo16 = nmbjumbo9 / 2;
TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
}
SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
- "Maximum number of mbuf clusters allowed.");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbop, CTLFLAG_RW, &nmbjumbop, 0,
- "Maximum number of mbuf page size jumbo clusters allowed");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbo9, CTLFLAG_RW, &nmbjumbo9, 0,
- "Maximum number of mbuf 9k jumbo clusters allowed");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbo16, CTLFLAG_RW, &nmbjumbo16, 0,
+/* XXX: These should be tuneables. Can't change UMA limits on the fly. */
+static int
+sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbclusters;
+
+ newnmbclusters = nmbclusters;
+ error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbclusters > nmbclusters) {
+ nmbclusters = newnmbclusters;
+ uma_zone_set_max(zone_clust, nmbclusters);
+ EVENTHANDLER_INVOKE(nmbclusters_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
+&nmbclusters, 0, sysctl_nmbclusters, "IU",
+ "Maximum number of mbuf clusters allowed");
+
+static int
+sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbop;
+
+ newnmbjumbop = nmbjumbop;
+ error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbop> nmbjumbop) {
+ nmbjumbop = newnmbjumbop;
+ uma_zone_set_max(zone_jumbop, nmbjumbop);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbop, 0, sysctl_nmbjumbop, "IU",
+ "Maximum number of mbuf page size jumbo clusters allowed");
+
+
+static int
+sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbo9;
+
+ newnmbjumbo9 = nmbjumbo9;
+ error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbo9> nmbjumbo9) {
+ nmbjumbo9 = newnmbjumbo9;
+ uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
+ "Maximum number of mbuf 9k jumbo clusters allowed");
+
+static int
+sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbo16;
+
+ newnmbjumbo16 = nmbjumbo16;
+ error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbo16> nmbjumbo16) {
+ nmbjumbo16 = newnmbjumbo16;
+ uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
"Maximum number of mbuf 16k jumbo clusters allowed");
+
+
+
SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
"Mbuf general information and statistics");
@@ -131,6 +211,7 @@
uma_zone_t zone_jumbop;
uma_zone_t zone_jumbo9;
uma_zone_t zone_jumbo16;
+uma_zone_t zone_ext_refcnt;
/*
* Local prototypes.
@@ -178,7 +259,6 @@
NULL, NULL,
#endif
UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
-
if (nmbclusters > 0)
uma_zone_set_max(zone_clust, nmbclusters);
@@ -219,6 +299,11 @@
if (nmbjumbo16 > 0)
uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+ zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
+ NULL, NULL,
+ NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+
/* uma_prealloc() goes here... */
/*
@@ -294,6 +379,8 @@
m->m_pkthdr.header = NULL;
m->m_pkthdr.csum_flags = 0;
m->m_pkthdr.csum_data = 0;
+ m->m_pkthdr.tso_segsz = 0;
+ m->m_pkthdr.ether_vtag = 0;
SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
/* If the label init fails, fail the alloc */
@@ -303,7 +390,6 @@
#endif
} else
m->m_data = m->m_dat;
- mbstat.m_mbufs += 1; /* XXX */
return (0);
}
@@ -314,14 +400,18 @@
mb_dtor_mbuf(void *mem, int size, void *arg)
{
struct mbuf *m;
+ unsigned long flags;
m = (struct mbuf *)mem;
- if ((m->m_flags & M_PKTHDR) != 0)
+ flags = (unsigned long)arg;
+
+ if ((flags & MB_NOTAGS) == 0 && (m->m_flags & M_PKTHDR) != 0)
m_tag_delete_chain(m, NULL);
+ KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+ KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
#ifdef INVARIANTS
trash_dtor(mem, size, arg);
#endif
- mbstat.m_mbufs -= 1; /* XXX */
}
/*
@@ -343,11 +433,18 @@
KASSERT(m->m_ext.ext_args == NULL, ("%s: ext_args != NULL", __func__));
KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
+ KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__));
#ifdef INVARIANTS
trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
- mbstat.m_mbufs -= 1; /* XXX */
- mbstat.m_mclusts -= 1; /* XXX */
+ /*
+ * If there are processes blocked on zone_clust, waiting for pages to be freed up,
+ * cause them to be woken up by draining the packet zone. We are exposed to a race here
+ * (in the check for the UMA_ZFLAG_FULL) where we might miss the flag set, but that is
+ * deliberate. We don't want to acquire the zone lock for every mbuf free.
+ */
+ if (uma_zone_exhausted_nolock(zone_clust))
+ zone_drain(zone_pack);
}
/*
@@ -362,32 +459,41 @@
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
struct mbuf *m;
- int type = 0;
-
+ u_int *refcnt;
+ int type;
+ uma_zone_t zone;
+
#ifdef INVARIANTS
trash_ctor(mem, size, arg, how);
#endif
- m = (struct mbuf *)arg;
- if (m != NULL) {
- switch (size) {
- case MCLBYTES:
- type = EXT_CLUSTER;
- break;
+ switch (size) {
+ case MCLBYTES:
+ type = EXT_CLUSTER;
+ zone = zone_clust;
+ break;
#if MJUMPAGESIZE != MCLBYTES
- case MJUMPAGESIZE:
- type = EXT_JUMBOP;
- break;
-#endif
- case MJUM9BYTES:
- type = EXT_JUMBO9;
- break;
- case MJUM16BYTES:
- type = EXT_JUMBO16;
- break;
- default:
- panic("unknown cluster size");
- break;
- }
+ case MJUMPAGESIZE:
+ type = EXT_JUMBOP;
+ zone = zone_jumbop;
+ break;
+#endif
+ case MJUM9BYTES:
+ type = EXT_JUMBO9;
+ zone = zone_jumbo9;
+ break;
+ case MJUM16BYTES:
+ type = EXT_JUMBO16;
+ zone = zone_jumbo16;
+ break;
+ default:
+ panic("unknown cluster size");
+ break;
+ }
+
+ m = (struct mbuf *)arg;
+ refcnt = uma_find_refcnt(zone, mem);
+ *refcnt = 1;
+ if (m != NULL) {
m->m_ext.ext_buf = (caddr_t)mem;
m->m_data = m->m_ext.ext_buf;
m->m_flags |= M_EXT;
@@ -395,9 +501,9 @@
m->m_ext.ext_args = NULL;
m->m_ext.ext_size = size;
m->m_ext.ext_type = type;
- m->m_ext.ref_cnt = NULL; /* Lazy counter assign. */
+ m->m_ext.ref_cnt = refcnt;
}
- mbstat.m_mclusts += 1; /* XXX */
+
return (0);
}
@@ -408,9 +514,15 @@
mb_dtor_clust(void *mem, int size, void *arg)
{
#ifdef INVARIANTS
+ uma_zone_t zone;
+
+ zone = m_getzone(size);
+ KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
+ ("%s: refcnt incorrect %u", __func__,
+ *(uma_find_refcnt(zone, mem))) );
+
trash_dtor(mem, size, arg);
#endif
- mbstat.m_mclusts -= 1; /* XXX */
}
/*
@@ -422,7 +534,7 @@
{
struct mbuf *m;
- m = (struct mbuf *)mem;
+ m = (struct mbuf *)mem; /* m is virgin. */
if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
m->m_ext.ext_buf == NULL)
return (ENOMEM);
@@ -430,7 +542,6 @@
#ifdef INVARIANTS
trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
- mbstat.m_mclusts -= 1; /* XXX */
return (0);
}
@@ -448,8 +559,6 @@
trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
- m->m_ext.ext_buf = NULL;
- mbstat.m_mclusts += 1; /* XXX */
#ifdef INVARIANTS
trash_dtor(mem, size, NULL);
#endif
@@ -483,14 +592,15 @@
m->m_len = 0;
m->m_flags = (flags | M_EXT);
m->m_type = type;
- m->m_ext.ref_cnt = NULL; /* Lazy counter assign. */
-
+
if (flags & M_PKTHDR) {
m->m_pkthdr.rcvif = NULL;
m->m_pkthdr.len = 0;
m->m_pkthdr.header = NULL;
m->m_pkthdr.csum_flags = 0;
m->m_pkthdr.csum_data = 0;
+ m->m_pkthdr.tso_segsz = 0;
+ m->m_pkthdr.ether_vtag = 0;
SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
/* If the label init fails, fail the alloc */
@@ -501,8 +611,6 @@
}
/* m_ext is already initialized. */
- mbstat.m_mbufs += 1; /* XXX */
- mbstat.m_mclusts += 1; /* XXX */
return (0);
}
@@ -522,7 +630,6 @@
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
"mb_reclaim()");
- mbstat.m_drain++;
for (dp = domains; dp != NULL; dp = dp->dom_next)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_drain != NULL)
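The kern_mbuf.c changes above turn the nmbclusters and jumbo-cluster limits from plain SYSCTL_INT knobs into SYSCTL_PROC handlers, so raising a limit at runtime actually propagates into the corresponding UMA zone via uma_zone_set_max(), while attempts to lower it are rejected with EINVAL. A reduced sketch of that grow-only handler pattern, for a hypothetical limit and zone (names are illustrative only):

static int my_limit;
static uma_zone_t my_zone;

static int
sysctl_my_limit(SYSCTL_HANDLER_ARGS)
{
    int error, newval;

    newval = my_limit;
    error = sysctl_handle_int(oidp, &newval, 0, req);
    if (error == 0 && req->newptr != NULL) {
        if (newval > my_limit) {
            /* The limit may only grow; push it into UMA. */
            my_limit = newval;
            uma_zone_set_max(my_zone, my_limit);
        } else
            error = EINVAL;
    }
    return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, my_limit, CTLTYPE_INT | CTLFLAG_RW,
    &my_limit, 0, sysctl_my_limit, "I", "Hypothetical grow-only limit");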
Index: vfs_syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_syscalls.c -L sys/kern/vfs_syscalls.c -u -r1.2 -r1.3
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.392.2.7 2006/03/13 03:06:39 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.443 2007/09/10 00:00:16 rwatson Exp $");
#include "opt_compat.h"
#include "opt_mac.h"
@@ -45,7 +45,6 @@
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysent.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
@@ -55,21 +54,25 @@
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
+#include <sys/filio.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/dirent.h>
-#include <sys/extattr.h>
#include <sys/jail.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <machine/stdarg.h>
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -85,11 +88,6 @@
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
struct thread *td);
-static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
- size_t nbytes, struct thread *td);
-
-int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *);
-
/*
* The module initialization routine for POSIX asynchronous I/O will
* set this to the version of AIO that it implements. (Zero means
@@ -98,6 +96,11 @@
*/
int async_io_version;
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
/*
* Sync each mounted filesystem.
*/
@@ -106,12 +109,6 @@
int dummy;
};
#endif
-
-#ifdef DEBUG
-static int syncprt = 0;
-SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
-#endif
-
/* ARGSUSED */
int
sync(td, uap)
@@ -119,40 +116,37 @@
struct sync_args *uap;
{
struct mount *mp, *nmp;
- int asyncflag;
+ int vfslocked;
- mtx_lock(&Giant);
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
+ vfslocked = VFS_LOCK_GIANT(mp);
if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
- asyncflag = mp->mnt_flag & MNT_ASYNC;
- mp->mnt_flag &= ~MNT_ASYNC;
+ MNT_ILOCK(mp);
+ mp->mnt_noasync++;
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
vfs_msync(mp, MNT_NOWAIT);
VFS_SYNC(mp, MNT_NOWAIT, td);
- mp->mnt_flag |= asyncflag;
+ MNT_ILOCK(mp);
+ mp->mnt_noasync--;
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ mp->mnt_noasync == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
vn_finished_write(mp);
}
+ VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp, td);
}
mtx_unlock(&mountlist_mtx);
-#if 0
-/*
- * XXX don't call vfs_bufstats() yet because that routine
- * was not imported in the Lite2 merge.
- */
-#ifdef DIAGNOSTIC
- if (syncprt)
- vfs_bufstats();
-#endif /* DIAGNOSTIC */
-#endif
- mtx_unlock(&Giant);
return (0);
}
@@ -164,8 +158,6 @@
/*
* Change filesystem quotas.
- *
- * MP SAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
@@ -185,33 +177,76 @@
caddr_t arg;
} */ *uap;
{
- struct mount *mp, *vmp;
+ struct mount *mp;
+ int vfslocked;
int error;
struct nameidata nd;
+ AUDIT_ARG(cmd, uap->cmd);
+ AUDIT_ARG(uid, uap->uid);
if (jailed(td->td_ucred) && !prison_quotas)
return (EPERM);
- mtx_lock(&Giant);
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
- if ((error = namei(&nd)) != 0) {
- mtx_unlock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
return (error);
- }
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
- error = vn_start_write(nd.ni_vp, &vmp, V_WAIT | PCATCH);
mp = nd.ni_vp->v_mount;
- vrele(nd.ni_vp);
- if (error) {
- mtx_unlock(&Giant);
+ if ((error = vfs_busy(mp, 0, NULL, td))) {
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
+ vrele(nd.ni_vp);
error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td);
- vn_finished_write(vmp);
- mtx_unlock(&Giant);
+ vfs_unbusy(mp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
+ * Used by statfs conversion routines to scale the block size up if
+ * necessary so that all of the block counts are <= 'max_size'. Note
+ * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
+ * value of 'n'.
+ */
+void
+statfs_scale_blocks(struct statfs *sf, long max_size)
+{
+ uint64_t count;
+ int shift;
+
+ KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
+
+ /*
+ * Attempt to scale the block counts to give a more accurate
+ * overview to userland of the ratio of free space to used
+ * space. To do this, find the largest block count and compute
+ * a divisor that lets it fit into a signed integer <= max_size.
+ */
+ if (sf->f_bavail < 0)
+ count = -sf->f_bavail;
+ else
+ count = sf->f_bavail;
+ count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
+ if (count <= max_size)
+ return;
+
+ count >>= flsl(max_size);
+ shift = 0;
+ while (count > 0) {
+ shift++;
+ count >>=1;
+ }
+
+ sf->f_bsize <<= shift;
+ sf->f_blocks >>= shift;
+ sf->f_bfree >>= shift;
+ sf->f_bavail >>= shift;
+}
+
+/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
@@ -243,27 +278,24 @@
{
struct mount *mp;
struct statfs *sp, sb;
+ int vfslocked;
int error;
struct nameidata nd;
- mtx_lock(&Giant);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
error = namei(&nd);
- if (error) {
- mtx_unlock(&Giant);
+ if (error)
return (error);
- }
+ vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
#ifdef MAC
error = mac_check_mount_stat(td->td_ucred, mp);
- if (error) {
- vfs_rel(mp);
- mtx_unlock(&Giant);
- return (error);
- }
+ if (error)
+ goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
@@ -273,20 +305,21 @@
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp, td);
- vfs_rel(mp);
- if (error) {
- mtx_unlock(&Giant);
- return (error);
- }
- if (suser(td)) {
+ if (error)
+ goto out;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
- mtx_unlock(&Giant);
*buf = *sp;
- return (0);
+out:
+ vfs_rel(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ if (mtx_owned(&Giant))
+ printf("statfs(%d): %s: %d\n", vfslocked, path, error);
+ return (error);
}
/*
@@ -321,14 +354,16 @@
struct file *fp;
struct mount *mp;
struct statfs *sp, sb;
+ int vfslocked;
struct vnode *vp;
int error;
+ AUDIT_ARG(fd, fd);
error = getvnode(td->td_proc->p_fd, fd, &fp);
if (error)
return (error);
- mtx_lock(&Giant);
vp = fp->f_vnode;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
#ifdef AUDIT
AUDIT_ARG(vnode, vp, ARG_VNODE1);
@@ -339,18 +374,13 @@
VOP_UNLOCK(vp, 0, td);
fdrop(fp, td);
if (vp->v_iflag & VI_DOOMED) {
- if (mp)
- vfs_rel(mp);
- mtx_unlock(&Giant);
- return (EBADF);
+ error = EBADF;
+ goto out;
}
#ifdef MAC
error = mac_check_mount_stat(td->td_ucred, mp);
- if (error) {
- vfs_rel(mp);
- mtx_unlock(&Giant);
- return (error);
- }
+ if (error)
+ goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
@@ -360,20 +390,20 @@
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp, td);
- vfs_rel(mp);
- if (error) {
- mtx_unlock(&Giant);
- return (error);
- }
- if (suser(td)) {
+ if (error)
+ goto out;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
- mtx_unlock(&Giant);
*buf = *sp;
- return (0);
+out:
+ if (mp)
+ vfs_rel(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
}
/*
@@ -412,6 +442,7 @@
struct mount *mp, *nmp;
struct statfs *sfsp, *sp, sb;
size_t count, maxcount;
+ int vfslocked;
int error;
maxcount = bufsize / sizeof(struct statfs);
@@ -432,7 +463,6 @@
M_WAITOK);
}
count = 0;
- mtx_lock(&Giant);
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (prison_canseemount(td->td_ucred, mp) != 0) {
@@ -449,6 +479,7 @@
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
+ vfslocked = VFS_LOCK_GIANT(mp);
if (sfsp && count < maxcount) {
sp = &mp->mnt_stat;
/*
@@ -466,12 +497,13 @@
if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
(flags & MNT_WAIT)) &&
(error = VFS_STATFS(mp, sp, td))) {
+ VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp, td);
continue;
}
- if (suser(td)) {
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
@@ -483,19 +515,19 @@
error = copyout(sp, sfsp, sizeof(*sp));
if (error) {
vfs_unbusy(mp, td);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
}
sfsp++;
}
+ VFS_UNLOCK_GIANT(vfslocked);
count++;
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp, td);
}
mtx_unlock(&mountlist_mtx);
- mtx_unlock(&Giant);
if (sfsp && count > maxcount)
td->td_retval[0] = maxcount;
else
@@ -645,12 +677,13 @@
struct ostatfs *osp;
{
+ statfs_scale_blocks(nsp, LONG_MAX);
bzero(osp, sizeof(*osp));
- osp->f_bsize = MIN(nsp->f_bsize, LONG_MAX);
+ osp->f_bsize = nsp->f_bsize;
osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
- osp->f_blocks = MIN(nsp->f_blocks, LONG_MAX);
- osp->f_bfree = MIN(nsp->f_bfree, LONG_MAX);
- osp->f_bavail = MIN(nsp->f_bavail, LONG_MAX);
+ osp->f_blocks = nsp->f_blocks;
+ osp->f_bfree = nsp->f_bfree;
+ osp->f_bavail = nsp->f_bavail;
osp->f_files = MIN(nsp->f_files, LONG_MAX);
osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
osp->f_owner = nsp->f_owner;
@@ -692,21 +725,16 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, uap->fd);
if ((error = getvnode(fdp, uap->fd, &fp)) != 0)
return (error);
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
VREF(vp);
fdrop(fp, td);
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- if (vp->v_type != VDIR)
- error = ENOTDIR;
-#ifdef MAC
- else if ((error = mac_check_vnode_chdir(td->td_ucred, vp)) != 0) {
- }
-#endif
- else
- error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ AUDIT_ARG(vnode, vp, ARG_VNODE1);
+ error = change_dir(vp, td);
while (!error && (mp = vp->v_mountedhere) != NULL) {
int tvfslocked;
if (vfs_busy(mp, 0, 0, td))
@@ -730,10 +758,10 @@
}
VOP_UNLOCK(vp, 0, td);
VFS_UNLOCK_GIANT(vfslocked);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
vpold = fdp->fd_cdir;
fdp->fd_cdir = vp;
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
vrele(vpold);
VFS_UNLOCK_GIANT(vfslocked);
@@ -768,7 +796,8 @@
struct vnode *vp;
int vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -781,10 +810,10 @@
VOP_UNLOCK(nd.ni_vp, 0, td);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
vp = fdp->fd_cdir;
fdp->fd_cdir = nd.ni_vp;
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
@@ -803,7 +832,8 @@
struct file *fp;
int fd;
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+ FILEDESC_LOCK_ASSERT(fdp);
+
for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
fp = fget_locked(fdp, fd);
if (fp == NULL)
@@ -849,10 +879,10 @@
struct nameidata nd;
int vfslocked;
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_VFS_CHROOT);
if (error)
return (error);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
@@ -903,8 +933,8 @@
/*
* Common routine for kern_chroot() and jail_attach(). The caller is
- * responsible for invoking suser() and mac_check_chroot() to authorize this
- * operation.
+ * responsible for invoking priv_check() and mac_check_chroot() to authorize
+ * this operation.
*/
int
change_root(vp, td)
@@ -918,12 +948,12 @@
VFS_ASSERT_GIANT(vp->v_mount);
fdp = td->td_proc->p_fd;
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
if (chroot_allow_open_directories == 0 ||
(chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
error = chroot_refuse_vdir_fds(fdp);
if (error) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (error);
}
}
@@ -934,7 +964,7 @@
fdp->fd_jdir = vp;
VREF(fdp->fd_jdir);
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
vrele(oldvp);
VFS_UNLOCK_GIANT(vfslocked);
@@ -942,10 +972,8 @@
}
/*
- * Check permissions, allocate an open file structure,
- * and call the device open routine if any.
- *
- * MP SAFE
+ * Check permissions, allocate an open file structure, and call the device
+ * open routine if any.
*/
#ifndef _SYS_SYSPROTO_H_
struct open_args {
@@ -963,12 +991,8 @@
int mode;
} */ *uap;
{
- int error;
- error = kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode);
- if (mtx_owned(&Giant))
- printf("open: %s: %d\n", uap->path, error);
- return (error);
+ return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode);
}
int
@@ -988,6 +1012,8 @@
struct nameidata nd;
int vfslocked;
+ AUDIT_ARG(fflags, flags);
+ AUDIT_ARG(mode, mode);
if ((flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
flags = FFLAGS(flags);
@@ -997,9 +1023,9 @@
/* An extra reference on `nfp' has been held for us by falloc(). */
fp = nfp;
cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
- NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td);
td->td_dupfd = -1; /* XXX check for fdopen */
- error = vn_open(&nd, &flags, cmode, indx);
+ error = vn_open(&nd, &flags, cmode, fp);
if (error) {
/*
* If the vn_open replaced the method vector, something
@@ -1013,11 +1039,6 @@
}
/*
- * release our own reference
- */
- fdrop(fp, td);
-
- /*
* handle special fdopen() case. bleh. dupfdopen() is
* responsible for dropping the old contents of ofiles[indx]
* if it succeeds.
@@ -1027,6 +1048,7 @@
(error =
dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) {
td->td_retval[0] = indx;
+ fdrop(fp, td);
return (0);
}
/*
@@ -1034,6 +1056,7 @@
* replaced or closed it.
*/
fdclose(fdp, fp, indx, td);
+ fdrop(fp, td);
if (error == ERESTART)
error = EINTR;
@@ -1044,41 +1067,16 @@
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
- /*
- * There should be 2 references on the file, one from the descriptor
- * table, and one for us.
- *
- * Handle the case where someone closed the file (via its file
- * descriptor) while we were blocked. The end result should look
- * like opening the file succeeded but it was immediately closed.
- * We call vn_close() manually because we haven't yet hooked up
- * the various 'struct file' fields.
- */
- FILEDESC_LOCK(fdp);
FILE_LOCK(fp);
- if (fp->f_count == 1) {
- mp = vp->v_mount;
- KASSERT(fdp->fd_ofiles[indx] != fp,
- ("Open file descriptor lost all refs"));
- FILE_UNLOCK(fp);
- FILEDESC_UNLOCK(fdp);
- VOP_UNLOCK(vp, 0, td);
- vn_close(vp, flags & FMASK, fp->f_cred, td);
- VFS_UNLOCK_GIANT(vfslocked);
- fdrop(fp, td);
- td->td_retval[0] = indx;
- return (0);
- }
fp->f_vnode = vp;
if (fp->f_data == NULL)
fp->f_data = vp;
fp->f_flag = flags & FMASK;
- if (fp->f_ops == &badfileops)
- fp->f_ops = &vnops;
fp->f_seqcount = 1;
fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+ if (fp->f_ops == &badfileops)
+ fp->f_ops = &vnops;
FILE_UNLOCK(fp);
- FILEDESC_UNLOCK(fdp);
VOP_UNLOCK(vp, 0, td);
if (flags & (O_EXLOCK | O_SHLOCK)) {
@@ -1132,8 +1130,6 @@
#ifdef COMPAT_43
/*
* Create a file.
- *
- * MP SAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
@@ -1190,20 +1186,29 @@
struct nameidata nd;
int vfslocked;
+ AUDIT_ARG(mode, mode);
+ AUDIT_ARG(dev, dev);
switch (mode & S_IFMT) {
case S_IFCHR:
case S_IFBLK:
- error = suser(td);
+ error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+ break;
+ case S_IFMT:
+ error = priv_check(td, PRIV_VFS_MKNOD_BAD);
+ break;
+ case S_IFWHT:
+ error = priv_check(td, PRIV_VFS_MKNOD_WHT);
break;
default:
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = EINVAL;
break;
}
if (error)
return (error);
restart:
bwillwrite();
- NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -1219,10 +1224,10 @@
return (EEXIST);
} else {
VATTR_NULL(&vattr);
- FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SLOCK(td->td_proc->p_fd);
vattr.va_mode = (mode & ALLPERMS) &
~td->td_proc->p_fd->fd_cmask;
- FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SUNLOCK(td->td_proc->p_fd);
vattr.va_rdev = dev;
whiteout = 0;
@@ -1240,8 +1245,7 @@
whiteout = 1;
break;
default:
- error = EINVAL;
- break;
+ panic("kern_mknod: invalid mode");
}
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
@@ -1305,9 +1309,11 @@
struct nameidata nd;
int vfslocked;
+ AUDIT_ARG(mode, mode);
restart:
bwillwrite();
- NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -1331,9 +1337,9 @@
}
VATTR_NULL(&vattr);
vattr.va_type = VFIFO;
- FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SLOCK(td->td_proc->p_fd);
vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
- FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SUNLOCK(td->td_proc->p_fd);
#ifdef MAC
error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
@@ -1377,8 +1383,6 @@
return (error);
}
-SYSCTL_DECL(_security_bsd);
-
static int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
&hardlink_check_uid, 0,
@@ -1396,9 +1400,6 @@
struct vattr va;
int error;
- if (suser_cred(cred, SUSER_ALLOWJAIL) == 0)
- return (0);
-
if (!hardlink_check_uid && !hardlink_check_gid)
return (0);
@@ -1406,14 +1407,16 @@
if (error != 0)
return (error);
- if (hardlink_check_uid) {
- if (cred->cr_uid != va.va_uid)
- return (EPERM);
+ if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error)
+ return (error);
}
- if (hardlink_check_gid) {
- if (!groupmember(va.va_gid, cred))
- return (EPERM);
+ if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error)
+ return (error);
}
return (0);
@@ -1430,7 +1433,7 @@
int error;
bwillwrite();
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, segflg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -1446,7 +1449,8 @@
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
- NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, segflg, link, td);
+ NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
+ segflg, link, td);
if ((error = namei(&nd)) == 0) {
lvfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
@@ -1518,9 +1522,10 @@
if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0)
goto out;
}
+ AUDIT_ARG(text, syspath);
restart:
bwillwrite();
- NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE,
+ NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
segflg, link, td);
if ((error = namei(&nd)) != 0)
goto out;
@@ -1545,9 +1550,9 @@
goto restart;
}
VATTR_NULL(&vattr);
- FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SLOCK(td->td_proc->p_fd);
vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
- FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SUNLOCK(td->td_proc->p_fd);
#ifdef MAC
vattr.va_type = VLNK;
error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
@@ -1589,8 +1594,8 @@
restart:
bwillwrite();
- NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return (error);
@@ -1656,7 +1661,8 @@
restart:
bwillwrite();
- NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error == EINVAL ? EPERM : error);
vfslocked = NDHASGIANT(&nd);
@@ -1687,7 +1693,7 @@
goto restart;
}
#ifdef MAC
- error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp,
+ error = mac_check_vnode_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
if (error)
goto out;
@@ -1774,6 +1780,12 @@
break;
case L_SET:
break;
+ case SEEK_DATA:
+ error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
+ break;
+ case SEEK_HOLE:
+ error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
+ break;
default:
error = EINVAL;
}
@@ -1815,16 +1827,28 @@
off_t offset;
int whence;
} */ nuap;
- int error;
nuap.fd = uap->fd;
nuap.offset = uap->offset;
nuap.whence = uap->whence;
- error = lseek(td, &nuap);
- return (error);
+ return (lseek(td, &nuap));
}
#endif /* COMPAT_43 */
+/* Version with the 'pad' argument */
+int
+freebsd6_lseek(td, uap)
+ struct thread *td;
+ register struct freebsd6_lseek_args *uap;
+{
+ struct lseek_args ouap;
+
+ ouap.fd = uap->fd;
+ ouap.offset = uap->offset;
+ ouap.whence = uap->whence;
+ return (lseek(td, &ouap));
+}
+
/*
* Check access permissions using passed credentials.
*/
@@ -1898,7 +1922,8 @@
tmpcred->cr_uid = cred->cr_ruid;
tmpcred->cr_groups[0] = cred->cr_rgid;
td->td_ucred = tmpcred;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
goto out1;
vfslocked = NDHASGIANT(&nd);
@@ -1931,18 +1956,25 @@
int flags;
} */ *uap;
{
+
+ return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
+}
+
+int
+kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
+{
struct nameidata nd;
struct vnode *vp;
int vfslocked;
int error;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vp = nd.ni_vp;
vfslocked = NDHASGIANT(&nd);
- error = vn_access(vp, uap->flags, td->td_ucred, td);
+ error = vn_access(vp, flags, td->td_ucred, td);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
@@ -2071,7 +2103,8 @@
struct stat sb;
int error, vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE,
+ NDINIT(&nd, LOOKUP,
+ FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1,
pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
@@ -2080,6 +2113,8 @@
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
+ if (mtx_owned(&Giant))
+ printf("stat(%d): %s\n", vfslocked, path);
if (error)
return (error);
*sbp = sb;
@@ -2120,7 +2155,8 @@
struct nameidata nd;
int error, vfslocked;
- NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
+ NDINIT(&nd, LOOKUP,
+ NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1,
pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
@@ -2245,7 +2281,8 @@
struct nameidata nd;
int error, vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2296,7 +2333,8 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -2350,7 +2388,7 @@
* chown can't fail when done as root.
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
if (error)
return (error);
}
@@ -2392,7 +2430,9 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, uap->path, td);
+ AUDIT_ARG(fflags, uap->flags);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -2418,7 +2458,9 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->path, td);
+ AUDIT_ARG(fflags, uap->flags);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2450,12 +2492,19 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(fflags, uap->flags);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+ VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
error = setfflags(td, fp->f_vnode, uap->flags);
- fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
+ fdrop(fp, td);
return (error);
}
@@ -2516,7 +2565,8 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+ AUDIT_ARG(mode, mode);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2548,7 +2598,9 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->path, td);
+ AUDIT_ARG(mode, (mode_t)uap->mode);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2580,12 +2632,19 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(mode, uap->mode);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+ VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
error = setfmode(td, fp->f_vnode, uap->mode);
- fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
+ fdrop(fp, td);
return (error);
}
@@ -2652,7 +2711,8 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+ AUDIT_ARG(owner, uid, gid);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2694,7 +2754,8 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, pathseg, path, td);
+ AUDIT_ARG(owner, uid, gid);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2728,12 +2789,19 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(owner, uap->uid, uap->gid);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+ VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
error = setfown(td, fp->f_vnode, uap->uid, uap->gid);
- fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
+ fdrop(fp, td);
return (error);
}
@@ -2848,7 +2916,7 @@
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2892,7 +2960,7 @@
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
- NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -2933,14 +3001,20 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, fd);
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+ VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
- fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
+ fdrop(fp, td);
return (error);
}
@@ -2979,7 +3053,7 @@
if (length < 0)
return(EINVAL);
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -3036,6 +3110,7 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, uap->fd);
if (uap->length < 0)
return(EINVAL);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
@@ -3050,6 +3125,7 @@
goto drop;
VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, vp, ARG_VNODE1);
if (vp->v_type == VDIR)
error = EISDIR;
#ifdef MAC
@@ -3128,6 +3204,27 @@
}
#endif /* COMPAT_43 */
+/* Versions with the pad argument */
+int
+freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
+{
+ struct truncate_args ouap;
+
+ ouap.path = uap->path;
+ ouap.length = uap->length;
+ return (truncate(td, &ouap));
+}
+
+int
+freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
+{
+ struct ftruncate_args ouap;
+
+ ouap.fd = uap->fd;
+ ouap.length = uap->length;
+ return (ftruncate(td, &ouap));
+}
+
/*
* Sync an open file.
*/
@@ -3149,6 +3246,7 @@
int vfslocked;
int error;
+ AUDIT_ARG(fd, uap->fd);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
return (error);
vp = fp->f_vnode;
@@ -3156,6 +3254,7 @@
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, vp, ARG_VNODE1);
if (vp->v_object != NULL) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
@@ -3172,8 +3271,8 @@
}
/*
- * Rename files. Source and destination must either both be directories,
- * or both not be directories. If target is a directory, it must be empty.
+ * Rename files. Source and destination must either both be directories, or
+ * both not be directories. If target is a directory, it must be empty.
*/
#ifndef _SYS_SYSPROTO_H_
struct rename_args {
@@ -3205,11 +3304,11 @@
bwillwrite();
#ifdef MAC
- NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE,
- pathseg, from, td);
+ NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE |
+ AUDITVNODE1, pathseg, from, td);
#else
- NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE,
- pathseg, from, td);
+ NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
+ AUDITVNODE1, pathseg, from, td);
#endif
if ((error = namei(&fromnd)) != 0)
return (error);
@@ -3219,7 +3318,8 @@
error = mac_check_vnode_rename_from(td->td_ucred, fromnd.ni_dvp,
fromnd.ni_vp, &fromnd.ni_cnd);
VOP_UNLOCK(fromnd.ni_dvp, 0, td);
- VOP_UNLOCK(fromnd.ni_vp, 0, td);
+ if (fromnd.ni_dvp != fromnd.ni_vp)
+ VOP_UNLOCK(fromnd.ni_vp, 0, td);
#endif
fvp = fromnd.ni_vp;
if (error == 0)
@@ -3231,7 +3331,7 @@
goto out1;
}
NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART |
- MPSAFE, pathseg, to, td);
+ MPSAFE | AUDITVNODE2, pathseg, to, td);
if (fromnd.ni_vp->v_type == VDIR)
tond.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&tond)) != 0) {
@@ -3337,9 +3437,11 @@
struct nameidata nd;
int vfslocked;
+ AUDIT_ARG(mode, mode);
restart:
bwillwrite();
- NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, segflg, path, td);
+ NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
+ segflg, path, td);
nd.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&nd)) != 0)
return (error);
@@ -3370,9 +3472,9 @@
}
VATTR_NULL(&vattr);
vattr.va_type = VDIR;
- FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SLOCK(td->td_proc->p_fd);
vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
- FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_SUNLOCK(td->td_proc->p_fd);
#ifdef MAC
error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
@@ -3423,7 +3525,8 @@
restart:
bwillwrite();
- NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE, pathseg, path, td);
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -3447,7 +3550,7 @@
goto out;
}
#ifdef MAC
- error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp,
+ error = mac_check_vnode_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
if (error)
goto out;
@@ -3507,7 +3610,7 @@
struct iovec aiov, kiov;
struct dirent *dp, *edp;
caddr_t dirbuf;
- int error, eofflag, readcnt;
+ int error, eofflag, readcnt, vfslocked;
long loff;
/* XXX arbitrary sanity limit on `count'. */
@@ -3521,7 +3624,9 @@
}
vp = fp->f_vnode;
unionread:
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
+ VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
@@ -3539,6 +3644,7 @@
error = mac_check_vnode_readdir(td->td_ucred, vp);
if (error) {
VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
@@ -3594,39 +3700,27 @@
}
FREE(dirbuf, M_TEMP);
}
- VOP_UNLOCK(vp, 0, td);
if (error) {
+ VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
- if (uap->count == auio.uio_resid) {
- if (union_dircheckp) {
- error = union_dircheckp(td, &vp, fp);
- if (error == -1)
- goto unionread;
- if (error) {
- fdrop(fp, td);
- return (error);
- }
- }
- /*
- * XXX We could delay dropping the lock above but
- * union_dircheckp complicates things.
- */
- vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, td);
- if ((vp->v_vflag & VV_ROOT) &&
- (vp->v_mount->mnt_flag & MNT_UNION)) {
- struct vnode *tvp = vp;
- vp = vp->v_mount->mnt_vnodecovered;
- VREF(vp);
- fp->f_vnode = vp;
- fp->f_data = vp;
- fp->f_offset = 0;
- vput(tvp);
- goto unionread;
- }
- VOP_UNLOCK(vp, 0, td);
+ if (uap->count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ fp->f_offset = 0;
+ vput(tvp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ goto unionread;
}
+ VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
error = copyout(&loff, uap->basep, sizeof(long));
fdrop(fp, td);
td->td_retval[0] = uap->count - auio.uio_resid;
@@ -3663,6 +3757,7 @@
long loff;
int error, eofflag;
+ AUDIT_ARG(fd, uap->fd);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
@@ -3673,6 +3768,7 @@
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
+ VFS_UNLOCK_GIANT(vfslocked);
error = EINVAL;
goto fail;
}
@@ -3686,6 +3782,7 @@
auio.uio_resid = uap->count;
/* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ AUDIT_ARG(vnode, vp, ARG_VNODE1);
loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
error = mac_check_vnode_readdir(td->td_ucred, vp);
@@ -3694,47 +3791,35 @@
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
NULL);
fp->f_offset = auio.uio_offset;
- VOP_UNLOCK(vp, 0, td);
- if (error)
- goto fail;
- if (uap->count == auio.uio_resid) {
- if (union_dircheckp) {
- error = union_dircheckp(td, &vp, fp);
- if (error == -1) {
- VFS_UNLOCK_GIANT(vfslocked);
- goto unionread;
- }
- if (error)
- goto fail;
- }
- /*
- * XXX We could delay dropping the lock above but
- * union_dircheckp complicates things.
- */
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- if ((vp->v_vflag & VV_ROOT) &&
- (vp->v_mount->mnt_flag & MNT_UNION)) {
- struct vnode *tvp = vp;
- vp = vp->v_mount->mnt_vnodecovered;
- VREF(vp);
- fp->f_vnode = vp;
- fp->f_data = vp;
- fp->f_offset = 0;
- vput(tvp);
- VFS_UNLOCK_GIANT(vfslocked);
- goto unionread;
- }
+ if (error) {
VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ goto fail;
}
+ if (uap->count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ fp->f_offset = 0;
+ vput(tvp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ goto unionread;
+ }
+ VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
if (uap->basep != NULL) {
error = copyout(&loff, uap->basep, sizeof(long));
}
td->td_retval[0] = uap->count - auio.uio_resid;
fail:
- VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
+
#ifndef _SYS_SYSPROTO_H_
struct getdents_args {
int fd;
@@ -3761,8 +3846,6 @@
/*
* Set the mode mask for creation of filesystem nodes.
- *
- * MP SAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct umask_args {
@@ -3778,17 +3861,17 @@
{
register struct filedesc *fdp;
- FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_XLOCK(td->td_proc->p_fd);
fdp = td->td_proc->p_fd;
td->td_retval[0] = fdp->fd_cmask;
fdp->fd_cmask = uap->newmask & ALLPERMS;
- FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
return (0);
}
/*
- * Void all references to file by ripping underlying filesystem
- * away from vnode.
+ * Void all references to file by ripping underlying filesystem away from
+ * vnode.
*/
#ifndef _SYS_SYSPROTO_H_
struct revoke_args {
@@ -3808,8 +3891,8 @@
struct nameidata nd;
int vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
@@ -3828,7 +3911,7 @@
if (error)
goto out;
if (td->td_ucred->cr_uid != vattr.va_uid) {
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_VFS_ADMIN);
if (error)
goto out;
}
@@ -3857,7 +3940,7 @@
if (fdp == NULL)
error = EBADF;
else {
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
if ((u_int)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL)
error = EBADF;
@@ -3868,14 +3951,14 @@
fhold(fp);
error = 0;
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
}
*fpp = fp;
return (error);
}
/*
- * Get (NFS) file handle
+ * Get an (NFS) file handle.
*/
#ifndef _SYS_SYSPROTO_H_
struct lgetfh_args {
@@ -3894,10 +3977,10 @@
int vfslocked;
int error;
- error = suser(td);
+ error = priv_check(td, PRIV_VFS_GETFH);
if (error)
return (error);
- NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE,
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->fname, td);
error = namei(&nd);
if (error)
@@ -3907,7 +3990,7 @@
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
- error = VFS_VPTOFH(vp, &fh.fh_fid);
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
@@ -3933,10 +4016,10 @@
int vfslocked;
int error;
- error = suser(td);
+ error = priv_check(td, PRIV_VFS_GETFH);
if (error)
return (error);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->fname, td);
error = namei(&nd);
if (error)
@@ -3946,7 +4029,7 @@
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
- error = VFS_VPTOFH(vp, &fh.fh_fid);
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
@@ -3956,13 +4039,11 @@
}
/*
- * syscall for the rpc.lockd to use to translate a NFS file handle into
- * an open descriptor.
+ * syscall for the rpc.lockd to use to translate a NFS file handle into an
+ * open descriptor.
*
- * warning: do not remove the suser() call or this becomes one giant
+ * warning: do not remove the priv_check() call or this becomes one giant
* security hole.
- *
- * MP SAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct fhopen_args {
@@ -3989,9 +4070,10 @@
register struct filedesc *fdp = p->p_fd;
int fmode, mode, error, type;
struct file *nfp;
+ int vfslocked;
int indx;
- error = suser(td);
+ error = priv_check(td, PRIV_VFS_FHOPEN);
if (error)
return (error);
fmode = FFLAGS(uap->flags);
@@ -4002,12 +4084,10 @@
if (error)
return(error);
/* find the mount point */
- mtx_lock(&Giant);
mp = vfs_getvfs(&fhp.fh_fsid);
- if (mp == NULL) {
- error = ESTALE;
- goto out;
- }
+ if (mp == NULL)
+ return (ESTALE);
+ vfslocked = VFS_LOCK_GIANT(mp);
/* now give me my vnode, it gets returned to me locked */
error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
if (error)
@@ -4081,7 +4161,7 @@
if (error)
goto bad;
}
- error = VOP_OPEN(vp, fmode, td->td_ucred, td, -1);
+ error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
if (error)
goto bad;
@@ -4100,11 +4180,13 @@
/* An extra reference on `nfp' has been held for us by falloc(). */
fp = nfp;
+ FILE_LOCK(nfp);
nfp->f_vnode = vp;
nfp->f_data = vp;
nfp->f_flag = fmode & FMASK;
- nfp->f_ops = &vnops;
nfp->f_type = DTYPE_VNODE;
+ nfp->f_ops = &vnops;
+ FILE_UNLOCK(nfp);
if (fmode & (O_EXLOCK | O_SHLOCK)) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
@@ -4138,21 +4220,21 @@
VOP_UNLOCK(vp, 0, td);
fdrop(fp, td);
- mtx_unlock(&Giant);
+ vfs_rel(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
td->td_retval[0] = indx;
return (0);
bad:
vput(vp);
out:
- mtx_unlock(&Giant);
+ vfs_rel(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Stat an (NFS) file handle.
- *
- * MP SAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct fhstat_args {
@@ -4172,26 +4254,27 @@
fhandle_t fh;
struct mount *mp;
struct vnode *vp;
+ int vfslocked;
int error;
- error = suser(td);
+ error = priv_check(td, PRIV_VFS_FHSTAT);
if (error)
return (error);
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
- mtx_lock(&Giant);
- if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
- mtx_unlock(&Giant);
+ if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
- }
+ vfslocked = VFS_LOCK_GIANT(mp);
if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) {
- mtx_unlock(&Giant);
+ vfs_rel(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
vput(vp);
- mtx_unlock(&Giant);
+ vfs_rel(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&sb, uap->sb, sizeof(sb));
@@ -4200,8 +4283,6 @@
/*
* Implement fstatfs() for (NFS) file handles.
- *
- * MP SAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct fhstatfs_args {
@@ -4236,39 +4317,29 @@
struct statfs *sp;
struct mount *mp;
struct vnode *vp;
+ int vfslocked;
int error;
- error = suser(td);
+ error = priv_check(td, PRIV_VFS_FHSTATFS);
if (error)
return (error);
- mtx_lock(&Giant);
- if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
- mtx_unlock(&Giant);
+ if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
- }
+ vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, &vp);
if (error) {
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
+ vfs_rel(mp);
return (error);
}
- mp = vp->v_mount;
- if (mp)
- vfs_ref(mp);
vput(vp);
- if (mp == NULL)
- return (EBADF);
error = prison_canseemount(td->td_ucred, mp);
- if (error) {
- vfs_rel(mp);
- return (error);
- }
+ if (error)
+ goto out;
#ifdef MAC
error = mac_check_mount_stat(td->td_ucred, mp);
- if (error) {
- vfs_rel(mp);
- mtx_unlock(&Giant);
- return (error);
- }
+ if (error)
+ goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
@@ -4278,714 +4349,10 @@
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp, td);
- vfs_rel(mp);
- mtx_unlock(&Giant);
- if (error)
- return (error);
- *buf = *sp;
- return (0);
-}
-
-/*
- * Syscall to push extended attribute configuration information into the
- * VFS. Accepts a path, which it converts to a mountpoint, as well as
- * a command (int cmd), and attribute name and misc data. For now, the
- * attribute name is left in userspace for consumption by the VFS_op.
- * It will probably be changed to be copied into sysspace by the
- * syscall in the future, once issues with various consumers of the
- * attribute code have raised their hands.
- *
- * Currently this is used only by UFS Extended Attributes.
- */
-int
-extattrctl(td, uap)
- struct thread *td;
- struct extattrctl_args /* {
- const char *path;
- int cmd;
- const char *filename;
- int attrnamespace;
- const char *attrname;
- } */ *uap;
-{
- struct vnode *filename_vp;
- struct nameidata nd;
- struct mount *mp, *mp_writable;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, fnvfslocked, error;
-
- /*
- * uap->attrname is not always defined. We check again later when we
- * invoke the VFS call so as to pass in NULL there if needed.
- */
- if (uap->attrname != NULL) {
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
- NULL);
- if (error)
- return (error);
- }
-
- vfslocked = fnvfslocked = 0;
- /*
- * uap->filename is not always defined. If it is, grab a vnode lock,
- * which VFS_EXTATTRCTL() will later release.
- */
- filename_vp = NULL;
- if (uap->filename != NULL) {
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF,
- UIO_USERSPACE, uap->filename, td);
- error = namei(&nd);
- if (error)
- return (error);
- fnvfslocked = NDHASGIANT(&nd);
- filename_vp = nd.ni_vp;
- NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
- }
-
- /* uap->path is always defined. */
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error) {
- if (filename_vp != NULL)
- vput(filename_vp);
- goto out;
- }
- vfslocked = NDHASGIANT(&nd);
- mp = nd.ni_vp->v_mount;
- error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
- NDFREE(&nd, 0);
- if (error) {
- if (filename_vp != NULL)
- vput(filename_vp);
- goto out;
- }
-
- error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
- uap->attrname != NULL ? attrname : NULL, td);
-
- vn_finished_write(mp_writable);
- /*
- * VFS_EXTATTRCTL will have unlocked, but not de-ref'd,
- * filename_vp, so vrele it if it is defined.
- */
- if (filename_vp != NULL)
- vrele(filename_vp);
+ if (error == 0)
+ *buf = *sp;
out:
- VFS_UNLOCK_GIANT(fnvfslocked);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*-
- * Set a named extended attribute on a file or directory
- *
- * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
- * kernelspace string pointer "attrname", userspace buffer
- * pointer "data", buffer length "nbytes", thread "td".
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
- void *data, size_t nbytes, struct thread *td)
-{
- struct mount *mp;
- struct uio auio;
- struct iovec aiov;
- ssize_t cnt;
- int error;
-
- VFS_ASSERT_GIANT(vp->v_mount);
- error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error)
- return (error);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
- aiov.iov_base = data;
- aiov.iov_len = nbytes;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = 0;
- if (nbytes > INT_MAX) {
- error = EINVAL;
- goto done;
- }
- auio.uio_resid = nbytes;
- auio.uio_rw = UIO_WRITE;
- auio.uio_segflg = UIO_USERSPACE;
- auio.uio_td = td;
- cnt = nbytes;
-
-#ifdef MAC
- error = mac_check_vnode_setextattr(td->td_ucred, vp, attrnamespace,
- attrname, &auio);
- if (error)
- goto done;
-#endif
-
- error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
- td->td_ucred, td);
- cnt -= auio.uio_resid;
- td->td_retval[0] = cnt;
-
-done:
- VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
- return (error);
-}
-
-int
-extattr_set_fd(td, uap)
- struct thread *td;
- struct extattr_set_fd_args /* {
- int fd;
- int attrnamespace;
- const char *attrname;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct file *fp;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
- if (error)
- return (error);
-
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
- attrname, uap->data, uap->nbytes, td);
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
-
- return (error);
-}
-
-int
-extattr_set_file(td, uap)
- struct thread *td;
- struct extattr_set_file_args /* {
- const char *path;
- int attrnamespace;
- const char *attrname;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct nameidata nd;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
- uap->data, uap->nbytes, td);
-
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-int
-extattr_set_link(td, uap)
- struct thread *td;
- struct extattr_set_link_args /* {
- const char *path;
- int attrnamespace;
- const char *attrname;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct nameidata nd;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
- uap->data, uap->nbytes, td);
-
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*-
- * Get a named extended attribute on a file or directory
- *
- * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
- * kernelspace string pointer "attrname", userspace buffer
- * pointer "data", buffer length "nbytes", thread "td".
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
- void *data, size_t nbytes, struct thread *td)
-{
- struct uio auio, *auiop;
- struct iovec aiov;
- ssize_t cnt;
- size_t size, *sizep;
- int error;
-
- VFS_ASSERT_GIANT(vp->v_mount);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
- /*
- * Slightly unusual semantics: if the user provides a NULL data
- * pointer, they don't want to receive the data, just the
- * maximum read length.
- */
- auiop = NULL;
- sizep = NULL;
- cnt = 0;
- if (data != NULL) {
- aiov.iov_base = data;
- aiov.iov_len = nbytes;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = 0;
- if (nbytes > INT_MAX) {
- error = EINVAL;
- goto done;
- }
- auio.uio_resid = nbytes;
- auio.uio_rw = UIO_READ;
- auio.uio_segflg = UIO_USERSPACE;
- auio.uio_td = td;
- auiop = &auio;
- cnt = nbytes;
- } else
- sizep = &size;
-
-#ifdef MAC
- error = mac_check_vnode_getextattr(td->td_ucred, vp, attrnamespace,
- attrname, &auio);
- if (error)
- goto done;
-#endif
-
- error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
- td->td_ucred, td);
-
- if (auiop != NULL) {
- cnt -= auio.uio_resid;
- td->td_retval[0] = cnt;
- } else
- td->td_retval[0] = size;
-
-done:
- VOP_UNLOCK(vp, 0, td);
- return (error);
-}
-
-int
-extattr_get_fd(td, uap)
- struct thread *td;
- struct extattr_get_fd_args /* {
- int fd;
- int attrnamespace;
- const char *attrname;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct file *fp;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
- if (error)
- return (error);
-
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
- attrname, uap->data, uap->nbytes, td);
-
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-int
-extattr_get_file(td, uap)
- struct thread *td;
- struct extattr_get_file_args /* {
- const char *path;
- int attrnamespace;
- const char *attrname;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct nameidata nd;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
- uap->data, uap->nbytes, td);
-
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-int
-extattr_get_link(td, uap)
- struct thread *td;
- struct extattr_get_link_args /* {
- const char *path;
- int attrnamespace;
- const char *attrname;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct nameidata nd;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
- uap->data, uap->nbytes, td);
-
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * extattr_delete_vp(): Delete a named extended attribute on a file or
- * directory
- *
- * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
- * kernelspace string pointer "attrname", proc "p"
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
- struct thread *td)
-{
- struct mount *mp;
- int error;
-
- VFS_ASSERT_GIANT(vp->v_mount);
- error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error)
- return (error);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
-#ifdef MAC
- error = mac_check_vnode_deleteextattr(td->td_ucred, vp, attrnamespace,
- attrname);
- if (error)
- goto done;
-#endif
-
- error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
- td);
- if (error == EOPNOTSUPP)
- error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
- td->td_ucred, td);
-#ifdef MAC
-done:
-#endif
- VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
- return (error);
-}
-
-int
-extattr_delete_fd(td, uap)
- struct thread *td;
- struct extattr_delete_fd_args /* {
- int fd;
- int attrnamespace;
- const char *attrname;
- } */ *uap;
-{
- struct file *fp;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return (error);
-
- error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
- if (error)
- return (error);
-
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
- attrname, td);
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-int
-extattr_delete_file(td, uap)
- struct thread *td;
- struct extattr_delete_file_args /* {
- const char *path;
- int attrnamespace;
- const char *attrname;
- } */ *uap;
-{
- struct nameidata nd;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return(error);
-
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return(error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return(error);
-}
-
-int
-extattr_delete_link(td, uap)
- struct thread *td;
- struct extattr_delete_link_args /* {
- const char *path;
- int attrnamespace;
- const char *attrname;
- } */ *uap;
-{
- struct nameidata nd;
- char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
-
- error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
- if (error)
- return(error);
-
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return(error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return(error);
-}
-
-/*-
- * Retrieve a list of extended attributes on a file or directory.
- *
- * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace",
- * userspace buffer pointer "data", buffer length "nbytes",
- * thread "td".
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
- size_t nbytes, struct thread *td)
-{
- struct uio auio, *auiop;
- size_t size, *sizep;
- struct iovec aiov;
- ssize_t cnt;
- int error;
-
- VFS_ASSERT_GIANT(vp->v_mount);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
- auiop = NULL;
- sizep = NULL;
- cnt = 0;
- if (data != NULL) {
- aiov.iov_base = data;
- aiov.iov_len = nbytes;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = 0;
- if (nbytes > INT_MAX) {
- error = EINVAL;
- goto done;
- }
- auio.uio_resid = nbytes;
- auio.uio_rw = UIO_READ;
- auio.uio_segflg = UIO_USERSPACE;
- auio.uio_td = td;
- auiop = &auio;
- cnt = nbytes;
- } else
- sizep = &size;
-
-#ifdef MAC
- error = mac_check_vnode_listextattr(td->td_ucred, vp, attrnamespace);
- if (error)
- goto done;
-#endif
-
- error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
- td->td_ucred, td);
-
- if (auiop != NULL) {
- cnt -= auio.uio_resid;
- td->td_retval[0] = cnt;
- } else
- td->td_retval[0] = size;
-
-done:
- VOP_UNLOCK(vp, 0, td);
- return (error);
-}
-
-
-int
-extattr_list_fd(td, uap)
- struct thread *td;
- struct extattr_list_fd_args /* {
- int fd;
- int attrnamespace;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct file *fp;
- int vfslocked, error;
-
- error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
- if (error)
- return (error);
-
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
- uap->nbytes, td);
-
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-int
-extattr_list_file(td, uap)
- struct thread*td;
- struct extattr_list_file_args /* {
- const char *path;
- int attrnamespace;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
- uap->nbytes, td);
-
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-int
-extattr_list_link(td, uap)
- struct thread*td;
- struct extattr_list_link_args /* {
- const char *path;
- int attrnamespace;
- void *data;
- size_t nbytes;
- } */ *uap;
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- if (error)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vfslocked = NDHASGIANT(&nd);
- error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
- uap->nbytes, td);
-
- vrele(nd.ni_vp);
+ vfs_rel(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
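
The vfs_syscalls.c hunks above repeatedly replace blanket suser()/suser_cred() tests with checks against named privileges. A minimal sketch of the new priv(9) idiom, assuming the sys/priv.h interface these hunks target; the helper name below is illustrative and not part of the commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>

/*
 * Sketch only: the old blanket superuser test becomes a check against a
 * specific privilege.  PRIV_VFS_CHROOT is one of the privileges actually
 * used in the diff; the function name here is invented for illustration.
 */
static int
chroot_priv_sketch(struct thread *td)
{
	int error;

	/* Was: error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); */
	error = priv_check(td, PRIV_VFS_CHROOT);
	if (error)
		return (error);
	return (0);
}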
Index: sys_pipe.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_pipe.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sys_pipe.c -L sys/kern/sys_pipe.c -u -r1.2 -r1.3
--- sys/kern/sys_pipe.c
+++ sys/kern/sys_pipe.c
@@ -89,7 +89,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_pipe.c,v 1.184.2.2 2006/01/31 15:44:51 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_pipe.c,v 1.191.2.1 2007/11/25 11:11:28 dumbbell Exp $");
#include "opt_mac.h"
@@ -101,7 +101,6 @@
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
@@ -117,6 +116,8 @@
#include <sys/uio.h>
#include <sys/event.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
@@ -174,19 +175,14 @@
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)
-static int amountpipes;
static int amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;
-SYSCTL_DECL(_kern_ipc);
-
SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
&maxpipekva, 0, "Pipe KVA limit");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
- &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
&amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
@@ -215,7 +211,6 @@
static int pipespace_new(struct pipe *cpipe, int size);
static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
-static void pipe_zone_dtor(void *mem, int size, void *arg);
static int pipe_zone_init(void *mem, int size, int flags);
static void pipe_zone_fini(void *mem, int size);
@@ -227,8 +222,8 @@
pipeinit(void *dummy __unused)
{
- pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
- pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
+ pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
+ pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
UMA_ALIGN_PTR, 0);
KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}
@@ -278,22 +273,9 @@
*/
pp->pp_label = NULL;
- atomic_add_int(&amountpipes, 2);
return (0);
}
-static void
-pipe_zone_dtor(void *mem, int size, void *arg)
-{
- struct pipepair *pp;
-
- KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
-
- pp = (struct pipepair *)mem;
-
- atomic_subtract_int(&amountpipes, 2);
-}
-
static int
pipe_zone_init(void *mem, int size, int flags)
{
@@ -320,10 +302,9 @@
}
/*
- * The pipe system call for the DTYPE_PIPE type of pipes. If we fail,
- * let the zone pick up the pieces via pipeclose().
+ * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let
+ * the zone pick up the pieces via pipeclose().
*/
-
/* ARGSUSED */
int
pipe(td, uap)
@@ -897,9 +878,9 @@
while (wpipe->pipe_state & PIPE_DIRECTW) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
- pipeselwakeup(wpipe);
wakeup(wpipe);
}
+ pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe),
@@ -913,9 +894,9 @@
if (wpipe->pipe_buffer.cnt > 0) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
- pipeselwakeup(wpipe);
wakeup(wpipe);
}
+ pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe),
@@ -1077,8 +1058,9 @@
* The direct write mechanism will detect the reader going
* away on us.
*/
- if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
- (wpipe->pipe_buffer.size >= PIPE_MINDIRECT) &&
+ if (uio->uio_segflg == UIO_USERSPACE &&
+ uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
+ wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
(fp->f_flag & FNONBLOCK) == 0) {
pipeunlock(wpipe);
error = pipe_direct_write(wpipe, uio);
@@ -1098,9 +1080,10 @@
if (wpipe->pipe_state & PIPE_DIRECTW) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
- pipeselwakeup(wpipe);
wakeup(wpipe);
}
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
"pipbww", 0);
Index: sysv_shm.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_shm.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_shm.c -L sys/kern/sysv_shm.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_shm.c
+++ sys/kern/sysv_shm.c
@@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_shm.c,v 1.102 2005/05/12 20:04:48 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_shm.c,v 1.111 2007/03/05 13:10:57 rwatson Exp $");
#include "opt_compat.h"
#include "opt_sysvipc.h"
@@ -84,7 +84,8 @@
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
-#include <sys/mac.h>
+
+#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -94,28 +95,26 @@
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
-#ifdef MAC_DEBUG
-#define MPRINTF(a) printf a
-#else
-#define MPRINTF(a)
-#endif
-
static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
struct oshmctl_args;
static int oshmctl(struct thread *td, struct oshmctl_args *uap);
+#endif
static int shmget_allocate_segment(struct thread *td,
struct shmget_args *uap, int mode);
static int shmget_existing(struct thread *td, struct shmget_args *uap,
int mode, int segnum);
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *shmcalls[] = {
(sy_call_t *)shmat, (sy_call_t *)oshmctl,
(sy_call_t *)shmdt, (sy_call_t *)shmget,
(sy_call_t *)shmctl
};
+#endif
#define SHMSEG_FREE 0x0200
#define SHMSEG_REMOVED 0x0400
@@ -176,16 +175,15 @@
static int shm_use_phys;
static int shm_allow_removed;
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
"Maximum shared memory segment size");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
"Minimum shared memory segment size");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
"Number of shared memory identifiers");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
"Number of segments per process");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
"Maximum number of pages available for shared memory");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
&shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
@@ -291,10 +289,6 @@
const void *shmaddr;
};
#endif
-
-/*
- * MPSAFE
- */
int
shmdt(td, uap)
struct thread *td;
@@ -329,10 +323,8 @@
#ifdef MAC
shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
error = mac_check_sysv_shmdt(td->td_ucred, shmsegptr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_shmdt returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
error = shm_delete_mapping(p->p_vmspace, shmmap_s);
done2:
@@ -347,10 +339,6 @@
int shmflg;
};
#endif
-
-/*
- * MPSAFE
- */
int
kern_shmat(td, shmid, shmaddr, shmflg)
struct thread *td;
@@ -390,10 +378,8 @@
goto done2;
#ifdef MAC
error = mac_check_sysv_shmat(td->td_ucred, shmseg, shmflg);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_shmat returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
for (i = 0; i < shminfo.shmseg; i++) {
if (shmmap_s->shmid == -1)
@@ -464,6 +450,7 @@
return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
}
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
struct oshmid_ds {
struct ipc_perm shm_perm; /* operation perms */
int shm_segsz; /* size of segment (bytes) */
@@ -481,10 +468,6 @@
int cmd;
struct oshmid_ds *ubuf;
};
-
-/*
- * MPSAFE
- */
static int
oshmctl(td, uap)
struct thread *td;
@@ -510,11 +493,8 @@
goto done2;
#ifdef MAC
error = mac_check_sysv_shmctl(td->td_ucred, shmseg, uap->cmd);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_shmctl returned %d\n",
- error));
+ if (error != 0)
goto done2;
- }
#endif
outbuf.shm_perm = shmseg->u.shm_perm;
outbuf.shm_segsz = shmseg->u.shm_segsz;
@@ -540,6 +520,7 @@
return (EINVAL);
#endif
}
+#endif
#ifndef _SYS_SYSPROTO_H_
struct shmctl_args {
@@ -548,10 +529,6 @@
struct shmid_ds *buf;
};
#endif
-
-/*
- * MPSAFE
- */
int
kern_shmctl(td, shmid, cmd, buf, bufsz)
struct thread *td;
@@ -599,10 +576,8 @@
}
#ifdef MAC
error = mac_check_sysv_shmctl(td->td_ucred, shmseg, cmd);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_shmctl returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
switch (cmd) {
case SHM_STAT:
@@ -700,7 +675,6 @@
int shmflg;
};
#endif
-
static int
shmget_existing(td, uap, mode, segnum)
struct thread *td;
@@ -726,14 +700,11 @@
}
if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
return (EEXIST);
- error = ipcperm(td, &shmseg->u.shm_perm, mode);
#ifdef MAC
error = mac_check_sysv_shmget(td->td_ucred, shmseg, uap->shmflg);
if (error != 0)
- MPRINTF(("mac_check_sysv_shmget returned %d\n", error));
-#endif
- if (error)
return (error);
+#endif
if (uap->size && uap->size > shmseg->u.shm_segsz)
return (EINVAL);
td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
@@ -825,9 +796,6 @@
return (0);
}
-/*
- * MPSAFE
- */
int
shmget(td, uap)
struct thread *td;
@@ -860,9 +828,6 @@
return (error);
}
-/*
- * MPSAFE
- */
int
shmsys(td, uap)
struct thread *td;
@@ -874,6 +839,7 @@
int a4;
} */ *uap;
{
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
int error;
if (!jail_sysvipc_allowed && jailed(td->td_ucred))
@@ -885,6 +851,9 @@
error = (*shmcalls[uap->which])(td, &uap->a2);
mtx_unlock(&Giant);
return (error);
+#else
+ return (nosys(td, NULL));
+#endif
}
static void
@@ -955,15 +924,15 @@
{
int i;
- TUNABLE_INT_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall);
for (i = PAGE_SIZE; i > 0; i--) {
shminfo.shmmax = shminfo.shmall * i;
if (shminfo.shmmax >= shminfo.shmall)
break;
}
- TUNABLE_INT_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
- TUNABLE_INT_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
- TUNABLE_INT_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
shmalloced = shminfo.shmmni;
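
For context, the shminit() hunk above widens the shm tunables to u_long and keeps the loop that derives shmmax (bytes) from shmall (pages): the multiplier starts at PAGE_SIZE and shrinks while the unsigned product compares below shmall, the simple wrap check used by this code. A standalone model of that computation follows; the PAGE_SIZE value and the function name are illustrative, not taken from the source.

#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL	/* illustrative; the real value is per-arch */

/*
 * Model of the shminit() loop: shrink the multiplier until the u_long
 * product no longer compares below the page count (the wrap heuristic
 * used by the kernel code above).
 */
static unsigned long
scaled_shmmax(unsigned long shmall_pages)
{
	unsigned long shmmax = 0;
	unsigned long i;

	for (i = MODEL_PAGE_SIZE; i > 0; i--) {
		shmmax = shmall_pages * i;
		if (shmmax >= shmall_pages)
			break;
	}
	return (shmmax);
}

int
main(void)
{
	/* 8192 pages of shared memory -> 33554432 bytes with 4 KB pages. */
	printf("%lu\n", scaled_shmmax(8192UL));
	return (0);
}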
Index: kern_lock.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_lock.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_lock.c -L sys/kern/kern_lock.c -u -r1.2 -r1.3
--- sys/kern/kern_lock.c
+++ sys/kern/kern_lock.c
@@ -41,7 +41,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_lock.c,v 1.89.2.3 2006/03/13 03:05:50 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_lock.c,v 1.110 2007/05/18 15:04:59 jhb Exp $");
+
+#include "opt_ddb.h"
+#include "opt_global.h"
#include <sys/param.h>
#include <sys/kdb.h>
@@ -52,20 +55,52 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/systm.h>
+#include <sys/lock_profile.h>
#ifdef DEBUG_LOCKS
#include <sys/stack.h>
#endif
+#ifdef DDB
+#include <ddb/ddb.h>
+static void db_show_lockmgr(struct lock_object *lock);
+#endif
+static void lock_lockmgr(struct lock_object *lock, int how);
+static int unlock_lockmgr(struct lock_object *lock);
+
+struct lock_class lock_class_lockmgr = {
+ .lc_name = "lockmgr",
+ .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
+#ifdef DDB
+ .lc_ddb_show = db_show_lockmgr,
+#endif
+ .lc_lock = lock_lockmgr,
+ .lc_unlock = unlock_lockmgr,
+};
+
/*
* Locking primitives implementation.
 * Locks provide shared/exclusive synchronization.
*/
+void
+lock_lockmgr(struct lock_object *lock, int how)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+int
+unlock_lockmgr(struct lock_object *lock)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
#define COUNT(td, x) if ((td)) (td)->td_locks += (x)
#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \
LK_SHARE_NONZERO | LK_WAIT_NONZERO)
-static int acquire(struct lock **lkpp, int extflags, int wanted);
+static int acquire(struct lock **lkpp, int extflags, int wanted, int *contested, uint64_t *waittime);
static int acquiredrain(struct lock *lkp, int extflags) ;
static __inline void
@@ -93,7 +128,7 @@
}
static int
-acquire(struct lock **lkpp, int extflags, int wanted)
+acquire(struct lock **lkpp, int extflags, int wanted, int *contested, uint64_t *waittime)
{
struct lock *lkp = *lkpp;
int error;
@@ -104,6 +139,9 @@
if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted))
return EBUSY;
error = 0;
+ if ((lkp->lk_flags & wanted) != 0)
+ lock_profile_obtain_lock_failed(&lkp->lk_object, contested, waittime);
+
while ((lkp->lk_flags & wanted) != 0) {
CTR2(KTR_LOCK,
"acquire(): lkp == %p, lk_flags == 0x%x sleeping",
@@ -142,16 +180,16 @@
* accepted shared locks and shared-to-exclusive upgrades to go away.
*/
int
-lockmgr(lkp, flags, interlkp, td)
- struct lock *lkp;
- u_int flags;
- struct mtx *interlkp;
- struct thread *td;
+_lockmgr(struct lock *lkp, u_int flags, struct mtx *interlkp,
+ struct thread *td, char *file, int line)
+
{
int error;
struct thread *thr;
int extflags, lockflags;
-
+ int contested = 0;
+ uint64_t waitstart = 0;
+
error = 0;
if (td == NULL)
thr = LK_KERNPROC;
@@ -179,7 +217,7 @@
if ((flags & (LK_NOWAIT|LK_RELEASE)) == 0)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
- &lkp->lk_interlock->mtx_object,
+ &lkp->lk_interlock->lock_object,
"Acquiring lockmgr lock \"%s\"", lkp->lk_wmesg);
if (panicstr != NULL) {
@@ -209,10 +247,13 @@
lockflags = LK_HAVE_EXCL;
if (td != NULL && !(td->td_pflags & TDP_DEADLKTREAT))
lockflags |= LK_WANT_EXCL | LK_WANT_UPGRADE;
- error = acquire(&lkp, extflags, lockflags);
+ error = acquire(&lkp, extflags, lockflags, &contested, &waitstart);
if (error)
break;
sharelock(td, lkp, 1);
+ if (lkp->lk_sharecount == 1)
+ lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
+
#if defined(DEBUG_LOCKS)
stack_save(&lkp->lk_stack);
#endif
@@ -223,6 +264,8 @@
* An alternative would be to fail with EDEADLK.
*/
sharelock(td, lkp, 1);
+ if (lkp->lk_sharecount == 1)
+ lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
/* FALLTHROUGH downgrade */
case LK_DOWNGRADE:
@@ -266,6 +309,8 @@
if (lkp->lk_sharecount <= 0)
panic("lockmgr: upgrade without shared");
shareunlock(td, lkp, 1);
+ if (lkp->lk_sharecount == 0)
+ lock_profile_release_lock(&lkp->lk_object);
/*
* If we are just polling, check to see if we will block.
*/
@@ -282,7 +327,7 @@
* drop to zero, then take exclusive lock.
*/
lkp->lk_flags |= LK_WANT_UPGRADE;
- error = acquire(&lkp, extflags, LK_SHARE_NONZERO);
+ error = acquire(&lkp, extflags, LK_SHARE_NONZERO, &contested, &waitstart);
lkp->lk_flags &= ~LK_WANT_UPGRADE;
if (error) {
@@ -296,6 +341,7 @@
lkp->lk_lockholder = thr;
lkp->lk_exclusivecount = 1;
COUNT(td, 1);
+ lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
#if defined(DEBUG_LOCKS)
stack_save(&lkp->lk_stack);
#endif
@@ -335,14 +381,14 @@
/*
* Try to acquire the want_exclusive flag.
*/
- error = acquire(&lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL));
+ error = acquire(&lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL), &contested, &waitstart);
if (error)
break;
lkp->lk_flags |= LK_WANT_EXCL;
/*
* Wait for shared locks and upgrades to finish.
*/
- error = acquire(&lkp, extflags, LK_HAVE_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO);
+ error = acquire(&lkp, extflags, LK_HAVE_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO, &contested, &waitstart);
lkp->lk_flags &= ~LK_WANT_EXCL;
if (error) {
if (lkp->lk_flags & LK_WAIT_NONZERO)
@@ -355,6 +401,7 @@
panic("lockmgr: non-zero exclusive count");
lkp->lk_exclusivecount = 1;
COUNT(td, 1);
+ lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
#if defined(DEBUG_LOCKS)
stack_save(&lkp->lk_stack);
#endif
@@ -374,11 +421,18 @@
lkp->lk_flags &= ~LK_HAVE_EXCL;
lkp->lk_lockholder = LK_NOPROC;
lkp->lk_exclusivecount = 0;
+ lock_profile_release_lock(&lkp->lk_object);
} else {
lkp->lk_exclusivecount--;
}
} else if (lkp->lk_flags & LK_SHARE_NONZERO)
shareunlock(td, lkp, 1);
+ else {
+ printf("lockmgr: thread %p unlocking unheld lock\n",
+ thr);
+ kdb_backtrace();
+ }
+
if (lkp->lk_flags & LK_WAIT_NONZERO)
wakeup((void *)lkp);
break;
@@ -490,13 +544,14 @@
lkp->lk_waitcount = 0;
lkp->lk_exclusivecount = 0;
lkp->lk_prio = prio;
- lkp->lk_wmesg = wmesg;
lkp->lk_timo = timo;
lkp->lk_lockholder = LK_NOPROC;
lkp->lk_newlock = NULL;
#ifdef DEBUG_LOCKS
stack_zero(&lkp->lk_stack);
#endif
+ lock_init(&lkp->lk_object, &lock_class_lockmgr, wmesg, NULL,
+ LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE);
}
/*
@@ -506,8 +561,10 @@
lockdestroy(lkp)
struct lock *lkp;
{
+
CTR2(KTR_LOCK, "lockdestroy(): lkp == %p (lk_wmesg == \"%s\")",
lkp, lkp->lk_wmesg);
+ lock_destroy(&lkp->lk_object);
}
/*
@@ -554,6 +611,21 @@
}
/*
+ * Determine the number of waiters on a lock.
+ */
+int
+lockwaiters(lkp)
+ struct lock *lkp;
+{
+ int count;
+
+ mtx_lock(lkp->lk_interlock);
+ count = lkp->lk_waitcount;
+ mtx_unlock(lkp->lk_interlock);
+ return (count);
+}
+
+/*
* Print out information about state of a lock. Used by VOP_PRINT
* routines to display status about contained locks.
*/
@@ -575,3 +647,71 @@
stack_print(&lkp->lk_stack);
#endif
}
+
+#ifdef DDB
+/*
+ * Check to see if a thread that is blocked on a sleep queue is actually
+ * blocked on a 'struct lock'. If so, output some details and return true.
+ * If the lock has an exclusive owner, return that in *ownerp.
+ */
+int
+lockmgr_chain(struct thread *td, struct thread **ownerp)
+{
+ struct lock *lkp;
+
+ lkp = td->td_wchan;
+
+ /* Simple test to see if wchan points to a lockmgr lock. */
+ if (LOCK_CLASS(&lkp->lk_object) == &lock_class_lockmgr &&
+ lkp->lk_wmesg == td->td_wmesg)
+ goto ok;
+
+ /*
+ * If this thread is doing a DRAIN, then it would be asleep on
+ * &lkp->lk_flags rather than lkp.
+ */
+ lkp = (struct lock *)((char *)td->td_wchan -
+ offsetof(struct lock, lk_flags));
+ if (LOCK_CLASS(&lkp->lk_object) == &lock_class_lockmgr &&
+ lkp->lk_wmesg == td->td_wmesg && (lkp->lk_flags & LK_WAITDRAIN))
+ goto ok;
+
+ /* Doesn't seem to be a lockmgr lock. */
+ return (0);
+
+ok:
+ /* Ok, we think we have a lockmgr lock, so output some details. */
+ db_printf("blocked on lk \"%s\" ", lkp->lk_wmesg);
+ if (lkp->lk_sharecount) {
+ db_printf("SHARED (count %d)\n", lkp->lk_sharecount);
+ *ownerp = NULL;
+ } else {
+ db_printf("EXCL (count %d)\n", lkp->lk_exclusivecount);
+ *ownerp = lkp->lk_lockholder;
+ }
+ return (1);
+}
+
+void
+db_show_lockmgr(struct lock_object *lock)
+{
+ struct thread *td;
+ struct lock *lkp;
+
+ lkp = (struct lock *)lock;
+
+ db_printf(" lock type: %s\n", lkp->lk_wmesg);
+ db_printf(" state: ");
+ if (lkp->lk_sharecount)
+ db_printf("SHARED (count %d)\n", lkp->lk_sharecount);
+ else if (lkp->lk_flags & LK_HAVE_EXCL) {
+ td = lkp->lk_lockholder;
+ db_printf("EXCL (count %d) %p ", lkp->lk_exclusivecount, td);
+ db_printf("(tid %d, pid %d, \"%s\")\n", td->td_tid,
+ td->td_proc->p_pid, td->td_proc->p_comm);
+ } else
+ db_printf("UNLOCKED\n");
+ if (lkp->lk_waitcount > 0)
+ db_printf(" waiters: %d\n", lkp->lk_waitcount);
+}
+#endif
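
The recurring change through the kern_lock.c hunks is that acquire() now carries a contested flag and a wait-start timestamp, lock_profile_obtain_lock_failed() is called when the lock is found held, and lock_profile_obtain_lock_success()/lock_profile_release_lock() bracket acquisition and release. Below is a rough userland model of that bookkeeping, using pthreads and CLOCK_MONOTONIC in place of the kernel-internal lock_profile hooks; every name in it is illustrative. Build with -pthread.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t
now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
}

/* Model of acquire(): note contention and the wait start only when the
 * fast path fails, mirroring lock_profile_obtain_lock_failed(). */
static void
acquire_profiled(pthread_mutex_t *m, int *contested, uint64_t *waitstart)
{
	if (pthread_mutex_trylock(m) == 0)
		return;				/* uncontested fast path */
	*contested = 1;
	*waitstart = now_ns();
	pthread_mutex_lock(m);			/* blocks, like the sleep loop */
}

int
main(void)
{
	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	int contested = 0;
	uint64_t waitstart = 0;

	acquire_profiled(&m, &contested, &waitstart);
	/* Model of lock_profile_obtain_lock_success(): report at acquisition. */
	if (contested)
		printf("waited %llu ns\n",
		    (unsigned long long)(now_ns() - waitstart));
	pthread_mutex_unlock(&m);
	return (0);
}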
Index: kern_shutdown.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_shutdown.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_shutdown.c -L sys/kern/kern_shutdown.c -u -r1.2 -r1.3
--- sys/kern/kern_shutdown.c
+++ sys/kern/kern_shutdown.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_shutdown.c,v 1.174.2.3 2006/03/13 03:05:54 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_shutdown.c,v 1.182.4.1 2008/01/30 21:21:50 ru Exp $");
#include "opt_kdb.h"
#include "opt_mac.h"
@@ -53,9 +53,9 @@
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
@@ -68,6 +68,14 @@
#include <machine/pcb.h>
#include <machine/smp.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+
#include <sys/signalvar.h>
#ifndef PANIC_REBOOT_WAIT_TIME
@@ -142,9 +150,7 @@
SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL)
/*
- * The system call that results in a reboot
- *
- * MPSAFE
+ * The system call that results in a reboot.
*/
/* ARGSUSED */
int
@@ -157,7 +163,7 @@
error = mac_check_system_reboot(td->td_ucred, uap->opt);
#endif
if (error == 0)
- error = suser(td);
+ error = priv_check(td, PRIV_REBOOT);
if (error == 0) {
mtx_lock(&Giant);
boot(uap->opt);
@@ -261,9 +267,9 @@
* systems don't shutdown properly (i.e., ACPI power off) if we
* run on another processor.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(curthread);
sched_bind(curthread, 0);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(curthread);
KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0"));
#endif
/* We're in the process of rebooting. */
@@ -334,9 +340,9 @@
*/
DROP_GIANT();
for (subiter = 0; subiter < 50 * iter; subiter++) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(curthread);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(curthread);
DELAY(1000);
}
PICKUP_GIANT();
@@ -384,6 +390,7 @@
if (panicstr == 0)
vfs_unmountall();
}
+ swapoff_all();
DELAY(100000); /* wait for console output to finish */
}
@@ -486,8 +493,6 @@
* Panic is called on unresolvable fatal errors. It prints "panic: mesg",
* and then reboots. If we are called twice, then we avoid trying to sync
* the disks as this often leads to recursive panics.
- *
- * MPSAFE
*/
void
panic(const char *fmt, ...)
@@ -550,9 +555,9 @@
}
#endif
#endif
- mtx_lock_spin(&sched_lock);
+ /*thread_lock(td); */
td->td_flags |= TDF_INPANIC;
- mtx_unlock_spin(&sched_lock);
+ /* thread_unlock(td); */
if (!sync_on_panic)
bootopt |= RB_NOSYNC;
boot(bootopt);
@@ -626,6 +631,20 @@
return (0);
}
+/* Call dumper with bounds checking. */
+int
+dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
+ off_t offset, size_t length)
+{
+
+ if (length != 0 && (offset < di->mediaoffset ||
+ offset - di->mediaoffset + length > di->mediasize)) {
+ printf("Attempt to write outside dump device boundaries.\n");
+ return (ENXIO);
+ }
+ return (di->dumper(di->priv, virtual, physical, offset, length));
+}
+
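The new dump_write() wrapper refuses any write that does not fall entirely inside [mediaoffset, mediaoffset + mediasize) before handing it to the dumper. A small standalone version of the same bounds arithmetic with made-up numbers; only the field names mirror struct dumperinfo, everything else is illustrative.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct bounds {
	uint64_t mediaoffset;		/* first byte of the dump area */
	uint64_t mediasize;		/* size of the dump area in bytes */
};

static bool
write_in_bounds(const struct bounds *b, uint64_t offset, size_t length)
{
	if (length == 0)
		return (true);		/* zero-length writes always pass */
	return (offset >= b->mediaoffset &&
	    offset - b->mediaoffset + length <= b->mediasize);
}

int
main(void)
{
	struct bounds b = { .mediaoffset = 65536, .mediasize = 1048576 };

	printf("%d\n", write_in_bounds(&b, 65536, 512));		 /* 1 */
	printf("%d\n", write_in_bounds(&b, 65536 + 1048576 - 512, 512)); /* 1 */
	printf("%d\n", write_in_bounds(&b, 4096, 512));		 /* 0: before area */
	printf("%d\n", write_in_bounds(&b, 65536 + 1048576, 1));	 /* 0: past the end */
	return (0);
}
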
#if defined(__powerpc__)
void
dumpsys(struct dumperinfo *di __unused)
Index: subr_autoconf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_autoconf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_autoconf.c -L sys/kern/subr_autoconf.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_autoconf.c
+++ sys/kern/subr_autoconf.c
@@ -35,10 +35,12 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_autoconf.c,v 1.22 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_autoconf.c,v 1.23 2006/07/19 18:53:56 jhb Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
#include <sys/systm.h>
/*
@@ -50,26 +52,32 @@
*/
static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list =
TAILQ_HEAD_INITIALIZER(intr_config_hook_list);
-
+static struct mtx intr_config_hook_lock;
+MTX_SYSINIT(intr_config_hook, &intr_config_hook_lock, "intr config", MTX_DEF);
/* ARGSUSED */
static void run_interrupt_driven_config_hooks(void *dummy);
+
static void
run_interrupt_driven_config_hooks(dummy)
void *dummy;
{
struct intr_config_hook *hook_entry, *next_entry;
- for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
- hook_entry != NULL;
- hook_entry = next_entry) {
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH_SAFE(hook_entry, &intr_config_hook_list, ich_links,
+ next_entry) {
next_entry = TAILQ_NEXT(hook_entry, ich_links);
+ mtx_unlock(&intr_config_hook_lock);
(*hook_entry->ich_func)(hook_entry->ich_arg);
+ mtx_lock(&intr_config_hook_lock);
}
while (!TAILQ_EMPTY(&intr_config_hook_list)) {
- tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0);
+ msleep(&intr_config_hook_list, &intr_config_hook_lock, PCONFIG,
+ "conifhk", 0);
}
+ mtx_unlock(&intr_config_hook_lock);
}
SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST,
run_interrupt_driven_config_hooks, NULL)
@@ -85,17 +93,18 @@
{
struct intr_config_hook *hook_entry;
- for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
- hook_entry != NULL;
- hook_entry = TAILQ_NEXT(hook_entry, ich_links))
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
if (hook_entry == hook)
break;
if (hook_entry != NULL) {
+ mtx_unlock(&intr_config_hook_lock);
printf("config_intrhook_establish: establishing an "
"already established hook.\n");
return (1);
}
TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links);
+ mtx_unlock(&intr_config_hook_lock);
if (cold == 0)
/* XXX Sufficient for modules loaded after initial config??? */
run_interrupt_driven_config_hooks(NULL);
@@ -108,9 +117,8 @@
{
struct intr_config_hook *hook_entry;
- for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
- hook_entry != NULL;
- hook_entry = TAILQ_NEXT(hook_entry, ich_links))
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
if (hook_entry == hook)
break;
if (hook_entry == NULL)
@@ -118,6 +126,8 @@
"unestablished hook");
TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links);
+
/* Wakeup anyone watching the list */
wakeup(&intr_config_hook_list);
+ mtx_unlock(&intr_config_hook_lock);
}
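
The subr_autoconf.c change puts the intr_config_hook list under a mutex, drops that mutex around each callback so a hook may disestablish itself (or others), and replaces the bare tsleep() with msleep() on the same mutex while waiting for the list to drain. Here is a compressed userland model of the drop-lock-around-callback plus wait-for-empty pattern, using pthreads and the BSD <sys/queue.h> macros; all names are illustrative.

#include <pthread.h>
#include <stdio.h>
#include <sys/queue.h>

struct hook {
	TAILQ_ENTRY(hook) link;
	void (*func)(void *);
	void *arg;
};

static TAILQ_HEAD(, hook) hooks = TAILQ_HEAD_INITIALIZER(hooks);
static pthread_mutex_t hooks_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t hooks_drained = PTHREAD_COND_INITIALIZER;

/* Model of config_intrhook_disestablish(): remove the hook, wake waiters. */
static void
hook_done(struct hook *h)
{
	pthread_mutex_lock(&hooks_lock);
	TAILQ_REMOVE(&hooks, h, link);
	pthread_cond_broadcast(&hooks_drained);
	pthread_mutex_unlock(&hooks_lock);
}

/* Model of run_interrupt_driven_config_hooks(): run each callback with the
 * list lock dropped, then sleep until every hook has disestablished itself. */
static void
run_hooks(void)
{
	struct hook *h, *next;

	pthread_mutex_lock(&hooks_lock);
	TAILQ_FOREACH_SAFE(h, &hooks, link, next) {
		pthread_mutex_unlock(&hooks_lock);
		h->func(h->arg);		/* may call hook_done() */
		pthread_mutex_lock(&hooks_lock);
	}
	while (!TAILQ_EMPTY(&hooks))
		pthread_cond_wait(&hooks_drained, &hooks_lock);
	pthread_mutex_unlock(&hooks_lock);
}

static void
sample_hook(void *arg)
{
	printf("probe finished\n");
	hook_done(arg);			/* a real driver calls this when ready */
}

int
main(void)
{
	struct hook h = { .func = sample_hook };

	h.arg = &h;
	TAILQ_INSERT_TAIL(&hooks, &h, link);
	run_hooks();
	return (0);
}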
Index: kern_umtx.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_umtx.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_umtx.c -L sys/kern/kern_umtx.c -u -r1.2 -r1.3
--- sys/kern/kern_umtx.c
+++ sys/kern/kern_umtx.c
@@ -26,20 +26,24 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_umtx.c,v 1.33.2.1 2006/01/16 05:48:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_umtx.c,v 1.61.2.1 2007/12/20 07:15:40 davidxu Exp $");
+#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
-#include <sys/thr.h>
#include <sys/umtx.h>
#include <vm/vm.h>
@@ -48,81 +52,204 @@
#include <vm/vm_map.h>
#include <vm/vm_object.h>
-#define UMTX_PRIVATE 0
-#define UMTX_SHARED 1
+#include <machine/cpu.h>
-#define UMTX_STATIC_SHARED
+#ifdef COMPAT_IA32
+#include <compat/freebsd32/freebsd32_proto.h>
+#endif
+
+#define TYPE_SIMPLE_LOCK 0
+#define TYPE_SIMPLE_WAIT 1
+#define TYPE_NORMAL_UMUTEX 2
+#define TYPE_PI_UMUTEX 3
+#define TYPE_PP_UMUTEX 4
+#define TYPE_CV 5
+/* Key to represent a unique userland synchronization object */
struct umtx_key {
+ int hash;
int type;
+ int shared;
union {
struct {
vm_object_t object;
- long offset;
+ uintptr_t offset;
} shared;
struct {
- struct umtx *umtx;
- long pid;
+ struct vmspace *vs;
+ uintptr_t addr;
} private;
struct {
- void *ptr;
- long word;
+ void *a;
+ uintptr_t b;
} both;
} info;
};
+/* Priority inheritance mutex info. */
+struct umtx_pi {
+ /* Owner thread */
+ struct thread *pi_owner;
+
+ /* Reference count */
+ int pi_refcount;
+
+ /* List entry to link umtx holding by thread */
+ TAILQ_ENTRY(umtx_pi) pi_link;
+
+ /* List entry in hash */
+ TAILQ_ENTRY(umtx_pi) pi_hashlink;
+
+ /* List for waiters */
+ TAILQ_HEAD(,umtx_q) pi_blocked;
+
+ /* Identify a userland lock object */
+ struct umtx_key pi_key;
+};
+
+/* A user (waiter) of a userland synchronization object. */
struct umtx_q {
- LIST_ENTRY(umtx_q) uq_next; /* Linked list for the hash. */
- struct umtx_key uq_key; /* Umtx key. */
- struct thread *uq_thread; /* The thread waits on. */
- LIST_ENTRY(umtx_q) uq_rqnext; /* Linked list for requeuing. */
- vm_offset_t uq_addr; /* Umtx's virtual address. */
+ /* Linked list for the hash. */
+ TAILQ_ENTRY(umtx_q) uq_link;
+
+ /* Umtx key. */
+ struct umtx_key uq_key;
+
+ /* Umtx flags. */
+ int uq_flags;
+#define UQF_UMTXQ 0x0001
+
+ /* The thread waits on. */
+ struct thread *uq_thread;
+
+ /*
+ * Blocked on PI mutex.  Readers may hold either the chain lock
+ * or umtx_lock; writers must hold both the chain lock and
+ * umtx_lock.
+ */
+ struct umtx_pi *uq_pi_blocked;
+
+ /* On blocked list */
+ TAILQ_ENTRY(umtx_q) uq_lockq;
+
+ /* Thread contending with us */
+ TAILQ_HEAD(,umtx_pi) uq_pi_contested;
+
+ /* Inherited priority from PP mutex */
+ u_char uq_inherited_pri;
};
-LIST_HEAD(umtx_head, umtx_q);
+TAILQ_HEAD(umtxq_head, umtx_q);
+
+/* Userland lock object's wait-queue chain */
struct umtxq_chain {
- struct mtx uc_lock; /* Lock for this chain. */
- struct umtx_head uc_queue; /* List of sleep queues. */
-#define UCF_BUSY 0x01
-#define UCF_WANT 0x02
- int uc_flags;
+ /* Lock for this chain. */
+ struct mtx uc_lock;
+
+ /* List of sleep queues. */
+ struct umtxq_head uc_queue;
+
+ /* Busy flag */
+ char uc_busy;
+
+ /* Chain lock waiters */
+ int uc_waiters;
+
+ /* All PI in the list */
+ TAILQ_HEAD(,umtx_pi) uc_pi_list;
};
+#define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED)
+
+/*
+ * Don't propagate time-sharing priority; there is a security reason.
+ * A user could simply create a PI-mutex, let thread A lock it, and let
+ * another thread B block on it.  Because B is sleeping, its priority
+ * would be boosted, and priority propagation would then boost A's
+ * priority as well; A's priority would never be lowered, even while it
+ * uses 100% of a CPU, which is unfair to other processes.
+ */
+
+#define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
+ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
+ PRI_MAX_TIMESHARE : (td)->td_user_pri)
+
#define GOLDEN_RATIO_PRIME 2654404609U
#define UMTX_CHAINS 128
#define UMTX_SHIFTS (__WORD_BIT - 7)
-static struct umtxq_chain umtxq_chains[UMTX_CHAINS];
+#define THREAD_SHARE 0
+#define PROCESS_SHARE 1
+#define AUTO_SHARE 2
+
+#define GET_SHARE(flags) \
+ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
+
+static uma_zone_t umtx_pi_zone;
+static struct umtxq_chain umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
+static int umtx_pi_allocated;
-static void umtxq_init_chains(void *);
-static int umtxq_hash(struct umtx_key *key);
-static struct mtx *umtxq_mtx(int chain);
+SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
+SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
+ &umtx_pi_allocated, 0, "Allocated umtx_pi");
+
+static void umtxq_sysinit(void *);
+static void umtxq_hash(struct umtx_key *key);
+static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
-static int umtxq_sleep(struct thread *td, struct umtx_key *key,
- int prio, const char *wmesg, int timo);
+static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
-#ifdef UMTX_DYNAMIC_SHARED
-static void fork_handler(void *arg, struct proc *p1, struct proc *p2,
- int flags);
-#endif
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
-static int umtx_key_get(struct thread *td, struct umtx *umtx,
+static int umtx_key_get(void *addr, int type, int share,
struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
+static struct umtx_pi *umtx_pi_alloc(int);
+static void umtx_pi_free(struct umtx_pi *pi);
+static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
+static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
+static void umtx_thread_cleanup(struct thread *td);
+static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+ struct image_params *imgp __unused);
+SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
+
+static struct mtx umtx_lock;
-SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_init_chains, NULL);
+static void
+umtxq_sysinit(void *arg __unused)
+{
+ int i;
+
+ umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ for (i = 0; i < UMTX_CHAINS; ++i) {
+ mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
+ MTX_DEF | MTX_DUPOK);
+ TAILQ_INIT(&umtxq_chains[i].uc_queue);
+ TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
+ umtxq_chains[i].uc_busy = 0;
+ umtxq_chains[i].uc_waiters = 0;
+ }
+ mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
+ EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
struct umtx_q *
umtxq_alloc(void)
{
- return (malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK));
+ struct umtx_q *uq;
+
+ uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&uq->uq_pi_contested);
+ uq->uq_inherited_pri = PRI_MAX;
+ return (uq);
}
void
@@ -131,83 +258,84 @@
free(uq, M_UMTX);
}
-static void
-umtxq_init_chains(void *arg __unused)
-{
- int i;
-
- for (i = 0; i < UMTX_CHAINS; ++i) {
- mtx_init(&umtxq_chains[i].uc_lock, "umtxq_lock", NULL,
- MTX_DEF | MTX_DUPOK);
- LIST_INIT(&umtxq_chains[i].uc_queue);
- umtxq_chains[i].uc_flags = 0;
- }
-#ifdef UMTX_DYNAMIC_SHARED
- EVENTHANDLER_REGISTER(process_fork, fork_handler, 0, 10000);
-#endif
-}
-
-static inline int
+static inline void
umtxq_hash(struct umtx_key *key)
{
- unsigned n = (uintptr_t)key->info.both.ptr + key->info.both.word;
- return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
+ unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
+ key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
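
The rewritten umtxq_hash() stores the chain index in the key instead of recomputing it. It is plain multiplicative (golden-ratio) hashing: fold the two key words together, multiply by a 32-bit golden-ratio-derived constant, and keep the top bits to pick one of the UMTX_CHAINS buckets. A standalone version of the computation, assuming __WORD_BIT is 32 so the shift keeps the top seven bits of the product:

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_PRIME	2654404609U
#define UMTX_CHAINS		128
#define UMTX_SHIFTS		(32 - 7)	/* __WORD_BIT - 7 */

static unsigned
chain_of(void *a, uintptr_t b)
{
	unsigned n = (unsigned)((uintptr_t)a + b);	/* fold the key halves */

	return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
}

int
main(void)
{
	/* Nearby addresses should land on different chains. */
	printf("%u %u %u\n",
	    chain_of((void *)(uintptr_t)0x800aa000, 0),
	    chain_of((void *)(uintptr_t)0x800aa040, 0),
	    chain_of((void *)(uintptr_t)0x800aa080, 0));
	return (0);
}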
static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
return (k1->type == k2->type &&
- k1->info.both.ptr == k2->info.both.ptr &&
- k1->info.both.word == k2->info.both.word);
+ k1->info.both.a == k2->info.both.a &&
+ k1->info.both.b == k2->info.both.b);
}
-static inline struct mtx *
-umtxq_mtx(int chain)
+static inline struct umtxq_chain *
+umtxq_getchain(struct umtx_key *key)
{
- return (&umtxq_chains[chain].uc_lock);
+ return (&umtxq_chains[key->hash]);
}
+/*
+ * Set the chain to the busy state when the following operation
+ * may block (a kernel mutex cannot be used).
+ */
static inline void
umtxq_busy(struct umtx_key *key)
{
- int chain = umtxq_hash(key);
+ struct umtxq_chain *uc;
- mtx_assert(umtxq_mtx(chain), MA_OWNED);
- while (umtxq_chains[chain].uc_flags & UCF_BUSY) {
- umtxq_chains[chain].uc_flags |= UCF_WANT;
- msleep(&umtxq_chains[chain], umtxq_mtx(chain),
- 0, "umtxq_busy", 0);
+ uc = umtxq_getchain(key);
+ mtx_assert(&uc->uc_lock, MA_OWNED);
+ while (uc->uc_busy != 0) {
+ uc->uc_waiters++;
+ msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
+ uc->uc_waiters--;
}
- umtxq_chains[chain].uc_flags |= UCF_BUSY;
+ uc->uc_busy = 1;
}
+/*
+ * Unbusy a chain.
+ */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
- int chain = umtxq_hash(key);
+ struct umtxq_chain *uc;
- mtx_assert(umtxq_mtx(chain), MA_OWNED);
- KASSERT(umtxq_chains[chain].uc_flags & UCF_BUSY, ("not busy"));
- umtxq_chains[chain].uc_flags &= ~UCF_BUSY;
- if (umtxq_chains[chain].uc_flags & UCF_WANT) {
- umtxq_chains[chain].uc_flags &= ~UCF_WANT;
- wakeup(&umtxq_chains[chain]);
- }
+ uc = umtxq_getchain(key);
+ mtx_assert(&uc->uc_lock, MA_OWNED);
+ KASSERT(uc->uc_busy != 0, ("not busy"));
+ uc->uc_busy = 0;
+ if (uc->uc_waiters)
+ wakeup_one(uc);
}
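
umtxq_busy() and umtxq_unbusy() build a sleepable "busy" flag on top of the chain mutex: a thread that must block while logically holding the chain marks it busy, later threads msleep() until it is cleared, and wakeup_one() hands the flag to a single waiter. The pthread equivalent is a boolean guarded by a mutex and condition variable; the sketch below is only a model, not the kernel API.

#include <pthread.h>
#include <stdbool.h>

struct chain {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	bool busy;
	int waiters;
};

static struct chain example_chain = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cv = PTHREAD_COND_INITIALIZER,
};

/* Model of umtxq_busy(): called with chain->lock held. */
static void
chain_busy(struct chain *c)
{
	while (c->busy) {
		c->waiters++;
		pthread_cond_wait(&c->cv, &c->lock);
		c->waiters--;
	}
	c->busy = true;
}

/* Model of umtxq_unbusy(): called with chain->lock held. */
static void
chain_unbusy(struct chain *c)
{
	c->busy = false;
	if (c->waiters > 0)
		pthread_cond_signal(&c->cv);	/* wakeup_one() analogue */
}

int
main(void)
{
	pthread_mutex_lock(&example_chain.lock);
	chain_busy(&example_chain);
	chain_unbusy(&example_chain);
	pthread_mutex_unlock(&example_chain.lock);
	return (0);
}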
+/*
+ * Lock a chain.
+ */
static inline void
umtxq_lock(struct umtx_key *key)
{
- int chain = umtxq_hash(key);
- mtx_lock(umtxq_mtx(chain));
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_lock(&uc->uc_lock);
}
+/*
+ * Unlock a chain.
+ */
static inline void
umtxq_unlock(struct umtx_key *key)
{
- int chain = umtxq_hash(key);
- mtx_unlock(umtxq_mtx(chain));
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_unlock(&uc->uc_lock);
}
/*
@@ -216,15 +344,12 @@
static inline void
umtxq_insert(struct umtx_q *uq)
{
- struct umtx_head *head;
- int chain = umtxq_hash(&uq->uq_key);
+ struct umtxq_chain *uc;
- mtx_assert(umtxq_mtx(chain), MA_OWNED);
- head = &umtxq_chains[chain].uc_queue;
- LIST_INSERT_HEAD(head, uq, uq_next);
- mtx_lock_spin(&sched_lock);
- uq->uq_thread->td_flags |= TDF_UMTXQ;
- mtx_unlock_spin(&sched_lock);
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
+ uq->uq_flags |= UQF_UMTXQ;
}
/*
@@ -233,53 +358,78 @@
static inline void
umtxq_remove(struct umtx_q *uq)
{
- mtx_assert(umtxq_mtx(umtxq_hash(&uq->uq_key)), MA_OWNED);
- if (uq->uq_thread->td_flags & TDF_UMTXQ) {
- LIST_REMOVE(uq, uq_next);
- /* turning off TDF_UMTXQ should be the last thing. */
- mtx_lock_spin(&sched_lock);
- uq->uq_thread->td_flags &= ~TDF_UMTXQ;
- mtx_unlock_spin(&sched_lock);
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ if (uq->uq_flags & UQF_UMTXQ) {
+ TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
+ uq->uq_flags &= ~UQF_UMTXQ;
}
}
+/*
+ * Check if there are multiple waiters
+ */
static int
umtxq_count(struct umtx_key *key)
{
+ struct umtxq_chain *uc;
+ struct umtx_q *uq;
+ int count = 0;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
+ if (umtx_key_match(&uq->uq_key, key)) {
+ if (++count > 1)
+ break;
+ }
+ }
+ return (count);
+}
+
+/*
+ * Check if there are multiple PI waiters and return the first
+ * waiter.
+ */
+static int
+umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
+{
+ struct umtxq_chain *uc;
struct umtx_q *uq;
- struct umtx_head *head;
- int chain, count = 0;
+ int count = 0;
- chain = umtxq_hash(key);
- mtx_assert(umtxq_mtx(chain), MA_OWNED);
- head = &umtxq_chains[chain].uc_queue;
- LIST_FOREACH(uq, head, uq_next) {
+ *first = NULL;
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
if (umtx_key_match(&uq->uq_key, key)) {
if (++count > 1)
break;
+ *first = uq;
}
}
return (count);
}
+/*
+ * Wake up threads waiting on a userland object.
+ */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
+ struct umtxq_chain *uc;
struct umtx_q *uq, *next;
- struct umtx_head *head;
- struct thread *blocked = NULL;
- int chain, ret;
+ int ret;
ret = 0;
- chain = umtxq_hash(key);
- mtx_assert(umtxq_mtx(chain), MA_OWNED);
- head = &umtxq_chains[chain].uc_queue;
- for (uq = LIST_FIRST(head); uq; uq = next) {
- next = LIST_NEXT(uq, uq_next);
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
if (umtx_key_match(&uq->uq_key, key)) {
- blocked = uq->uq_thread;
umtxq_remove(uq);
- wakeup(blocked);
+ wakeup(uq);
if (++ret >= n_wake)
break;
}
@@ -287,180 +437,118 @@
return (ret);
}
+/*
+ * Wake up specified thread.
+ */
+static inline void
+umtxq_signal_thread(struct umtx_q *uq)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ umtxq_remove(uq);
+ wakeup(uq);
+}
+
+/*
+ * Put the thread into a sleep state.  Before sleeping, check
+ * whether the thread was already removed from the umtx queue.
+ */
static inline int
-umtxq_sleep(struct thread *td, struct umtx_key *key, int priority,
- const char *wmesg, int timo)
+umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
- int chain = umtxq_hash(key);
- int error = msleep(td, umtxq_mtx(chain), priority, wmesg, timo);
+ struct umtxq_chain *uc;
+ int error;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ if (!(uq->uq_flags & UQF_UMTXQ))
+ return (0);
+ error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
return (error);
}
+/*
+ * Convert userspace address into unique logical address.
+ */
static int
-umtx_key_get(struct thread *td, struct umtx *umtx, struct umtx_key *key)
+umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
-#if defined(UMTX_DYNAMIC_SHARED) || defined(UMTX_STATIC_SHARED)
+ struct thread *td = curthread;
vm_map_t map;
vm_map_entry_t entry;
vm_pindex_t pindex;
vm_prot_t prot;
boolean_t wired;
- map = &td->td_proc->p_vmspace->vm_map;
- if (vm_map_lookup(&map, (vm_offset_t)umtx, VM_PROT_WRITE,
- &entry, &key->info.shared.object, &pindex, &prot,
- &wired) != KERN_SUCCESS) {
- return EFAULT;
+ key->type = type;
+ if (share == THREAD_SHARE) {
+ key->shared = 0;
+ key->info.private.vs = td->td_proc->p_vmspace;
+ key->info.private.addr = (uintptr_t)addr;
+ } else {
+ MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
+ map = &td->td_proc->p_vmspace->vm_map;
+ if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
+ &entry, &key->info.shared.object, &pindex, &prot,
+ &wired) != KERN_SUCCESS) {
+ return EFAULT;
+ }
+
+ if ((share == PROCESS_SHARE) ||
+ (share == AUTO_SHARE &&
+ VM_INHERIT_SHARE == entry->inheritance)) {
+ key->shared = 1;
+ key->info.shared.offset = entry->offset + entry->start -
+ (vm_offset_t)addr;
+ vm_object_reference(key->info.shared.object);
+ } else {
+ key->shared = 0;
+ key->info.private.vs = td->td_proc->p_vmspace;
+ key->info.private.addr = (uintptr_t)addr;
+ }
+ vm_map_lookup_done(map, entry);
}
-#endif
-#if defined(UMTX_DYNAMIC_SHARED)
- key->type = UMTX_SHARED;
- key->info.shared.offset = entry->offset + entry->start -
- (vm_offset_t)umtx;
- /*
- * Add object reference, if we don't do this, a buggy application
- * deallocates the object, the object will be reused by other
- * applications, then unlock will wake wrong thread.
- */
- vm_object_reference(key->info.shared.object);
- vm_map_lookup_done(map, entry);
-#elif defined(UMTX_STATIC_SHARED)
- if (VM_INHERIT_SHARE == entry->inheritance) {
- key->type = UMTX_SHARED;
- key->info.shared.offset = entry->offset + entry->start -
- (vm_offset_t)umtx;
- vm_object_reference(key->info.shared.object);
- } else {
- key->type = UMTX_PRIVATE;
- key->info.private.umtx = umtx;
- key->info.private.pid = td->td_proc->p_pid;
- }
- vm_map_lookup_done(map, entry);
-#else
- key->type = UMTX_PRIVATE;
- key->info.private.umtx = umtx;
- key->info.private.pid = td->td_proc->p_pid;
-#endif
+ umtxq_hash(key);
return (0);
}
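
umtx_key_get() gives every watched word a canonical identity: process-shared words are keyed by (vm_object, offset within the object) so different mappings of the same page agree, while thread- or process-private words are keyed by (vmspace, virtual address). Because both variants overlay the same union members a and b, umtx_key_match() needs only one comparison path. A standalone illustration of that layout with the VM types reduced to plain pointers; this is a sketch, not the kernel structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct key {
	int shared;		/* 1: keyed by object/offset, 0: vmspace/addr */
	union {
		struct {
			void *object;		/* stands in for vm_object_t */
			uintptr_t offset;
		} shared;
		struct {
			void *vs;		/* stands in for struct vmspace * */
			uintptr_t addr;
		} private;
		struct {
			void *a;
			uintptr_t b;
		} both;
	} info;
};

/* Model of umtx_key_match(): one comparison covers both variants because
 * they alias the same storage. */
static bool
key_match(const struct key *k1, const struct key *k2)
{
	return (k1->info.both.a == k2->info.both.a &&
	    k1->info.both.b == k2->info.both.b);
}

int
main(void)
{
	struct key k1 = { .shared = 0 }, k2 = { .shared = 0 };

	k1.info.private.vs = (void *)(uintptr_t)0x1000;
	k1.info.private.addr = 0x7fff0000;
	k2.info.both.a = (void *)(uintptr_t)0x1000;	/* same storage, viewed as 'both' */
	k2.info.both.b = 0x7fff0000;
	printf("%d\n", key_match(&k1, &k2));		/* 1 */
	return (0);
}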
+/*
+ * Release key.
+ */
static inline void
umtx_key_release(struct umtx_key *key)
{
- if (key->type == UMTX_SHARED)
+ if (key->shared)
vm_object_deallocate(key->info.shared.object);
}
-static inline int
-umtxq_queue_me(struct thread *td, struct umtx *umtx, struct umtx_q *uq)
-{
- int error;
-
- if ((error = umtx_key_get(td, umtx, &uq->uq_key)) != 0)
- return (error);
-
- uq->uq_addr = (vm_offset_t)umtx;
- uq->uq_thread = td;
- umtxq_lock(&uq->uq_key);
- /* hmm, for condition variable, we don't need busy flag. */
- umtxq_busy(&uq->uq_key);
- umtxq_insert(uq);
- umtxq_unbusy(&uq->uq_key);
- umtxq_unlock(&uq->uq_key);
- return (0);
-}
-
-#if defined(UMTX_DYNAMIC_SHARED)
-static void
-fork_handler(void *arg, struct proc *p1, struct proc *p2, int flags)
-{
- vm_map_t map;
- vm_map_entry_t entry;
- vm_object_t object;
- vm_pindex_t pindex;
- vm_prot_t prot;
- boolean_t wired;
- struct umtx_key key;
- LIST_HEAD(, umtx_q) workq;
- struct umtx_q *uq;
- struct thread *td;
- int onq;
-
- LIST_INIT(&workq);
-
- /* Collect threads waiting on umtxq */
- PROC_LOCK(p1);
- FOREACH_THREAD_IN_PROC(p1, td) {
- if (td->td_flags & TDF_UMTXQ) {
- uq = td->td_umtxq;
- if (uq)
- LIST_INSERT_HEAD(&workq, uq, uq_rqnext);
- }
- }
- PROC_UNLOCK(p1);
-
- LIST_FOREACH(uq, &workq, uq_rqnext) {
- map = &p1->p_vmspace->vm_map;
- if (vm_map_lookup(&map, uq->uq_addr, VM_PROT_WRITE,
- &entry, &object, &pindex, &prot, &wired) != KERN_SUCCESS) {
- continue;
- }
- key.type = UMTX_SHARED;
- key.info.shared.object = object;
- key.info.shared.offset = entry->offset + entry->start -
- uq->uq_addr;
- if (umtx_key_match(&key, &uq->uq_key)) {
- vm_map_lookup_done(map, entry);
- continue;
- }
-
- umtxq_lock(&uq->uq_key);
- umtxq_busy(&uq->uq_key);
- if (uq->uq_thread->td_flags & TDF_UMTXQ) {
- umtxq_remove(uq);
- onq = 1;
- } else
- onq = 0;
- umtxq_unbusy(&uq->uq_key);
- umtxq_unlock(&uq->uq_key);
- if (onq) {
- vm_object_deallocate(uq->uq_key.info.shared.object);
- uq->uq_key = key;
- umtxq_lock(&uq->uq_key);
- umtxq_busy(&uq->uq_key);
- umtxq_insert(uq);
- umtxq_unbusy(&uq->uq_key);
- umtxq_unlock(&uq->uq_key);
- vm_object_reference(uq->uq_key.info.shared.object);
- }
- vm_map_lookup_done(map, entry);
- }
-}
-#endif
-
+/*
+ * Lock a umtx object.
+ */
static int
-_do_lock(struct thread *td, struct umtx *umtx, long id, int timo)
+_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
struct umtx_q *uq;
- intptr_t owner;
- intptr_t old;
+ u_long owner;
+ u_long old;
int error = 0;
uq = td->td_umtxq;
+
/*
- * Care must be exercised when dealing with umtx structure. It
+ * Care must be exercised when dealing with umtx structure. It
* can fault on any access.
*/
-
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
- owner = casuptr((intptr_t *)&umtx->u_owner,
- UMTX_UNOWNED, id);
+ owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMTX_UNOWNED)
@@ -472,7 +560,7 @@
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMTX_CONTESTED) {
- owner = casuptr((intptr_t *)&umtx->u_owner,
+ owner = casuword(&umtx->u_owner,
UMTX_CONTESTED, id | UMTX_CONTESTED);
if (owner == UMTX_CONTESTED)
@@ -490,24 +578,31 @@
* If we caught a signal, we have retried and now
* exit immediately.
*/
- if (error || (error = umtxq_queue_me(td, umtx, uq)) != 0)
+ if (error != 0)
return (error);
+ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
+ AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either some one else has acquired the lock or it has been
* released.
*/
- old = casuptr((intptr_t *)&umtx->u_owner, owner,
- owner | UMTX_CONTESTED);
+ old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
- umtxq_busy(&uq->uq_key);
umtxq_remove(uq);
- umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
@@ -519,14 +614,9 @@
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
- if (old == owner && (td->td_flags & TDF_UMTXQ)) {
- error = umtxq_sleep(td, &uq->uq_key,
- PCATCH,
- "umtx", timo);
- }
- umtxq_busy(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtx", timo);
umtxq_remove(uq);
- umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
@@ -534,8 +624,11 @@
return (0);
}
+/*
+ * Lock a umtx object.
+ */
static int
-do_lock(struct thread *td, struct umtx *umtx, long id,
+do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
@@ -543,13 +636,16 @@
int error;
if (timeout == NULL) {
- error = _do_lock(td, umtx, id, 0);
+ error = _do_lock_umtx(td, umtx, id, 0);
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
- error = _do_lock(td, umtx, id, tvtohz(&tv));
+ error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
@@ -561,41 +657,48 @@
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
}
- /*
- * This lets userland back off critical region if needed.
- */
- if (error == ERESTART)
- error = EINTR;
return (error);
}
+/*
+ * Unlock a umtx object.
+ */
static int
-do_unlock(struct thread *td, struct umtx *umtx, long id)
+do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
struct umtx_key key;
- intptr_t owner;
- intptr_t old;
+ u_long owner;
+ u_long old;
int error;
int count;
/*
* Make sure we own this mtx.
- *
- * XXX Need a {fu,su}ptr this is not correct on arch where
- * sizeof(intptr_t) != sizeof(long).
*/
- if ((owner = fuword(&umtx->u_owner)) == -1)
+ owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
+ if (owner == -1)
return (EFAULT);
if ((owner & ~UMTX_CONTESTED) != id)
return (EPERM);
- /* We should only ever be in here for contested locks */
- if ((owner & UMTX_CONTESTED) == 0)
- return (EINVAL);
+ /* This should be done in userland */
+ if ((owner & UMTX_CONTESTED) == 0) {
+ old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
- if ((error = umtx_key_get(td, umtx, &key)) != 0)
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+ &key)) != 0)
return (error);
umtxq_lock(&key);
@@ -608,10 +711,10 @@
* there is zero or one thread only waiting for it.
* Otherwise, it must be marked as contested.
*/
- old = casuptr((intptr_t *)&umtx->u_owner, owner,
- count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
+ old = casuword(&umtx->u_owner, owner,
+ count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
umtxq_lock(&key);
- umtxq_signal(&key, 0);
+ umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
@@ -622,49 +725,126 @@
return (0);
}
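
Together, _do_lock_umtx() and do_unlock_umtx() form the kernel half of a futex-style protocol: userland tries to take or drop the lock word with a compare-and-swap and only enters the kernel once the contested bit is involved; on unlock the word is reset to UMTX_UNOWNED when at most one waiter remains, otherwise left marked UMTX_CONTESTED, and one waiter is woken. A compressed single-threaded model of just the userland fast paths, using C11 atomics; the real UMTX_* values and the _umtx_op() interface are not reproduced here.

#include <stdatomic.h>
#include <stdio.h>

#define MODEL_UNOWNED	0UL

static atomic_ulong owner = MODEL_UNOWNED;

/* Userland lock fast path: CAS from UNOWNED to our thread id; on failure
 * the kernel path (do_lock_umtx) would set the contested bit and sleep. */
static void
lock_fast(unsigned long id)
{
	unsigned long expect = MODEL_UNOWNED;

	if (atomic_compare_exchange_strong(&owner, &expect, id))
		printf("acquired uncontested\n");
	else
		printf("contested: kernel lock path would run\n");
}

/* Userland unlock fast path: only the contested case needs the kernel
 * (do_unlock_umtx), which also decides UNOWNED vs. CONTESTED from the
 * remaining waiter count. */
static void
unlock_fast(unsigned long id)
{
	unsigned long expect = id;

	if (atomic_compare_exchange_strong(&owner, &expect, MODEL_UNOWNED))
		printf("released uncontested\n");
	else
		printf("contested: kernel unlock path would run\n");
}

int
main(void)
{
	lock_fast(1);
	unlock_fast(1);
	return (0);
}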
+#ifdef COMPAT_IA32
+
+/*
+ * Lock a umtx object.
+ */
static int
-do_wait(struct thread *td, struct umtx *umtx, long id, struct timespec *timeout)
+_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
struct umtx_q *uq;
- struct timespec ts, ts2, ts3;
- struct timeval tv;
- long tmp;
+ uint32_t owner;
+ uint32_t old;
int error = 0;
uq = td->td_umtxq;
- if ((error = umtxq_queue_me(td, umtx, uq)) != 0)
- return (error);
- tmp = fuword(&umtx->u_owner);
- if (tmp != id) {
- umtxq_lock(&uq->uq_key);
- umtxq_remove(uq);
- umtxq_unlock(&uq->uq_key);
- } else if (timeout == NULL) {
+
+ /*
+ * Care must be exercised when dealing with umtx structure. It
+ * can fault on any access.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(m, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(m,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+ if (owner == UMUTEX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ return (error);
+
+ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
+ AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
umtxq_lock(&uq->uq_key);
- if (td->td_flags & TDF_UMTXQ)
- error = umtxq_sleep(td, &uq->uq_key,
- PCATCH, "ucond", 0);
- if (!(td->td_flags & TDF_UMTXQ))
- error = 0;
- else
- umtxq_remove(uq);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
- } else {
- getnanouptime(&ts);
- timespecadd(&ts, timeout);
- TIMESPEC_TO_TIMEVAL(&tv, timeout);
- for (;;) {
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+ * either some one else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
umtxq_lock(&uq->uq_key);
- if (td->td_flags & TDF_UMTXQ) {
- error = umtxq_sleep(td, &uq->uq_key,
- PCATCH,
- "ucond", tvtohz(&tv));
- }
- if (!(td->td_flags & TDF_UMTXQ)) {
- umtxq_unlock(&uq->uq_key);
- goto out;
- }
+ umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+ * We set the contested bit, sleep. Otherwise the lock changed
+ * and we need to retry or we lost a race to the thread
+ * unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtx", timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ }
+
+ return (0);
+}
+
+/*
+ * Lock a umtx object.
+ */
+static int
+do_lock_umtx32(struct thread *td, void *m, uint32_t id,
+ struct timespec *timeout)
+{
+ struct timespec ts, ts2, ts3;
+ struct timeval tv;
+ int error;
+
+ if (timeout == NULL) {
+ error = _do_lock_umtx32(td, m, id, 0);
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
+ } else {
+ getnanouptime(&ts);
+ timespecadd(&ts, timeout);
+ TIMESPEC_TO_TIMEVAL(&tv, timeout);
+ for (;;) {
+ error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
@@ -676,24 +856,152 @@
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a umtx object.
+ */
+static int
+do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
+{
+ struct umtx_key key;
+ uint32_t owner;
+ uint32_t old;
+ int error;
+ int count;
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(m);
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(m, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * there is zero or one thread only waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword32(m, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key,1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+#endif
+
+/*
+ * Fetch and compare value, sleep on the address if value is not changed.
+ */
+static int
+do_wait(struct thread *td, void *addr, u_long id,
+ struct timespec *timeout, int compat32)
+{
+ struct umtx_q *uq;
+ struct timespec ts, ts2, ts3;
+ struct timeval tv;
+ u_long tmp;
+ int error = 0;
+
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
+ &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+ if (compat32 == 0)
+ tmp = fuword(addr);
+ else
+ tmp = fuword32(addr);
+ if (tmp != id) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ } else if (timeout == NULL) {
+ umtxq_lock(&uq->uq_key);
+ error = umtxq_sleep(uq, "uwait", 0);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ } else {
+ getnanouptime(&ts);
+ timespecadd(&ts, timeout);
+ TIMESPEC_TO_TIMEVAL(&tv, timeout);
umtxq_lock(&uq->uq_key);
+ for (;;) {
+ error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
+ if (!(uq->uq_flags & UQF_UMTXQ))
+ break;
+ if (error != ETIMEDOUT)
+ break;
+ umtxq_unlock(&uq->uq_key);
+ getnanouptime(&ts2);
+ if (timespeccmp(&ts2, &ts, >=)) {
+ error = ETIMEDOUT;
+ umtxq_lock(&uq->uq_key);
+ break;
+ }
+ ts3 = ts;
+ timespecsub(&ts3, &ts2);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+ umtxq_lock(&uq->uq_key);
+ }
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
}
-out:
umtx_key_release(&uq->uq_key);
if (error == ERESTART)
error = EINTR;
return (error);
}
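
do_wait() is the wait half of the kernel's futex-like primitive: the thread is inserted on the wait queue first, then the user word is re-read, and the thread only sleeps if the word still holds the value the caller passed in; queueing before the re-read is what closes the window against a waker that changes the word and signals in between. The sketch below collapses the umtxq machinery into the standard mutex/condition-variable idiom, which gives the same no-lost-wakeup guarantee by holding one lock across the check; names and types are illustrative.

#include <pthread.h>

static pthread_mutex_t watch_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t watch_cv = PTHREAD_COND_INITIALIZER;
static unsigned long watched_word = 1;

/* Model of do_wait(): sleep only while the word still has the expected
 * value; the check and the sleep are atomic with respect to wakers. */
static void
wait_on_word(unsigned long expected)
{
	pthread_mutex_lock(&watch_lock);
	while (watched_word == expected)
		pthread_cond_wait(&watch_cv, &watch_lock);
	pthread_mutex_unlock(&watch_lock);
}

/* Model of the waker (userland stores a new value, then asks the kernel,
 * via kern_umtx_wake(), to wake the sleepers). */
static void
wake_waiters(unsigned long newval)
{
	pthread_mutex_lock(&watch_lock);
	watched_word = newval;
	pthread_cond_broadcast(&watch_cv);
	pthread_mutex_unlock(&watch_lock);
}

int
main(void)
{
	wake_waiters(2);	/* word no longer matches... */
	wait_on_word(1);	/* ...so this returns immediately */
	return (0);
}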
+/*
+ * Wake up threads sleeping on the specified address.
+ */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
struct umtx_key key;
int ret;
- if ((ret = umtx_key_get(td, uaddr, &key)) != 0)
+ if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
+ &key)) != 0)
return (ret);
umtxq_lock(&key);
ret = umtxq_signal(&key, n_wake);
@@ -702,71 +1010,1726 @@
return (0);
}
-int
-_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
- /* struct umtx *umtx */
+/*
+ * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
+ int try)
{
- return _do_lock(td, uap->umtx, td->td_tid, 0);
-}
+ struct umtx_q *uq;
+ uint32_t owner, old, id;
+ int error = 0;
-int
-_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
- /* struct umtx *umtx */
-{
- return do_unlock(td, uap->umtx, td->td_tid);
+ id = td->td_tid;
+ uq = td->td_umtxq;
+
+ /*
+ * Care must be exercised when dealing with umtx structure. It
+ * can fault on any access.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id)
+ return (EDEADLK);
+
+ if (try != 0)
+ return (EBUSY);
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ return (error);
+
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
+ GET_SHARE(flags), &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+ * either some one else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+ * We set the contested bit, sleep. Otherwise the lock changed
+ * and we need to retry or we lost a race to the thread
+ * unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtxn", timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ }
+
+ return (0);
}
-int
-_umtx_op(struct thread *td, struct _umtx_op_args *uap)
+/*
+ * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
- struct timespec timeout;
- struct timespec *ts;
+ struct umtx_key key;
+ uint32_t owner, old, id;
int error;
+ int count;
+
+ id = td->td_tid;
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
- switch(uap->op) {
- case UMTX_OP_LOCK:
- /* Allow a null timespec (wait forever). */
- if (uap->uaddr2 == NULL)
- ts = NULL;
- else {
- error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
- if (error != 0)
- break;
- if (timeout.tv_nsec >= 1000000000 ||
- timeout.tv_nsec < 0) {
- error = EINVAL;
- break;
- }
- ts = &timeout;
- }
- error = do_lock(td, uap->umtx, uap->id, ts);
- break;
- case UMTX_OP_UNLOCK:
- error = do_unlock(td, uap->umtx, uap->id);
- break;
- case UMTX_OP_WAIT:
- /* Allow a null timespec (wait forever). */
- if (uap->uaddr2 == NULL)
- ts = NULL;
- else {
- error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
- if (error != 0)
- break;
- if (timeout.tv_nsec >= 1000000000 ||
- timeout.tv_nsec < 0) {
- error = EINVAL;
- break;
- }
- ts = &timeout;
- }
- error = do_wait(td, uap->umtx, uap->id, ts);
- break;
- case UMTX_OP_WAKE:
- error = kern_umtx_wake(td, uap->umtx, uap->id);
- break;
- default:
- error = EINVAL;
- break;
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
}
- return (error);
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * there is zero or one thread only waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword32(&m->m_owner, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key,1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+static inline struct umtx_pi *
+umtx_pi_alloc(int flags)
+{
+ struct umtx_pi *pi;
+
+ pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
+ TAILQ_INIT(&pi->pi_blocked);
+ atomic_add_int(&umtx_pi_allocated, 1);
+ return (pi);
+}
+
+static inline void
+umtx_pi_free(struct umtx_pi *pi)
+{
+ uma_zfree(umtx_pi_zone, pi);
+ atomic_add_int(&umtx_pi_allocated, -1);
+}
+
+/*
+ * Adjust the thread's position on a pi_state after its priority has been
+ * changed.
+ */
+static int
+umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
+{
+ struct umtx_q *uq, *uq1, *uq2;
+ struct thread *td1;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ if (pi == NULL)
+ return (0);
+
+ uq = td->td_umtxq;
+
+ /*
+ * Check if the thread needs to be moved on the blocked chain.
+ * It needs to be moved if either its priority is lower than
+ * the previous thread or higher than the next thread.
+ */
+ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
+ uq2 = TAILQ_NEXT(uq, uq_lockq);
+ if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
+ (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved to.
+ */
+ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+ TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+ td1 = uq1->uq_thread;
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (UPRI(td1) > UPRI(td))
+ break;
+ }
+
+ if (uq1 == NULL)
+ TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+ else
+ TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+ }
+ return (1);
+}
+
+/*
+ * Propagate priority when a thread is blocked on POSIX
+ * PI mutex.
+ */
+static void
+umtx_propagate_priority(struct thread *td)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+ int pri;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ pri = UPRI(td);
+ uq = td->td_umtxq;
+ pi = uq->uq_pi_blocked;
+ if (pi == NULL)
+ return;
+
+ for (;;) {
+ td = pi->pi_owner;
+ if (td == NULL)
+ return;
+
+ MPASS(td->td_proc != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+
+ if (UPRI(td) <= pri)
+ return;
+
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ uq = td->td_umtxq;
+ pi = uq->uq_pi_blocked;
+ /* Resort td on the list if needed. */
+ if (!umtx_pi_adjust_thread(pi, td))
+ break;
+ }
+}
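+
+/* Annotation: umtx_propagate_priority() walks the chain "waiter -> PI mutex
+ * -> owner -> PI mutex that owner is blocked on -> ..." and lends the
+ * waiter's priority at every hop, stopping once an owner already runs at
+ * that priority or better.  A toy userland model of the walk, where a
+ * smaller number means a more urgent priority as with the kernel's pri
+ * values (all structures and names below are illustrative):
+ *
+ *	#include <stdio.h>
+ *
+ *	struct toy_mutex;
+ *
+ *	struct toy_thread {
+ *		int pri;			/+ smaller is more urgent +/
+ *		struct toy_mutex *blocked_on;	/+ PI mutex waited on +/
+ *	};
+ *
+ *	struct toy_mutex {
+ *		struct toy_thread *owner;
+ *	};
+ *
+ *	/+ Push the waiter's priority up the ownership chain until it
+ *	   stops improving anyone (sched_lend_user_prio() analogue). +/
+ *	static void
+ *	propagate(struct toy_thread *waiter)
+ *	{
+ *		int pri = waiter->pri;
+ *		struct toy_mutex *m = waiter->blocked_on;
+ *
+ *		while (m != NULL && m->owner != NULL && m->owner->pri > pri) {
+ *			m->owner->pri = pri;
+ *			m = m->owner->blocked_on;
+ *		}
+ *	}
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		struct toy_mutex m1 = { NULL }, m2 = { NULL };
+ *		struct toy_thread a = { 120, NULL };	/+ owns m1 +/
+ *		struct toy_thread b = { 100, &m1 };	/+ owns m2, waits on m1 +/
+ *		struct toy_thread c = { 80, &m2 };	/+ waits on m2 +/
+ *
+ *		m1.owner = &a;
+ *		m2.owner = &b;
+ *		propagate(&c);
+ *		printf("a=%d b=%d c=%d\n", a.pri, b.pri, c.pri); /+ 80 80 80 +/
+ *		return (0);
+ *	}
+ *
+ * (The /+ +/ pairs mark comments inside the example; copy it out of the
+ * diff and restore them to normal C comments before compiling.)
+ */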
+
+/*
+ * Unpropagate priority for a PI mutex when a thread blocked on
+ * it is interrupted by a signal or resumed by others.
+ */
+static void
+umtx_unpropagate_priority(struct umtx_pi *pi)
+{
+ struct umtx_q *uq, *uq_owner;
+ struct umtx_pi *pi2;
+ int pri, oldpri;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+
+ while (pi != NULL && pi->pi_owner != NULL) {
+ pri = PRI_MAX;
+ uq_owner = pi->pi_owner->td_umtxq;
+
+ TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
+ uq = TAILQ_FIRST(&pi2->pi_blocked);
+ if (uq != NULL) {
+ if (pri > UPRI(uq->uq_thread))
+ pri = UPRI(uq->uq_thread);
+ }
+ }
+
+ if (pri > uq_owner->uq_inherited_pri)
+ pri = uq_owner->uq_inherited_pri;
+ thread_lock(pi->pi_owner);
+ oldpri = pi->pi_owner->td_user_pri;
+ sched_unlend_user_prio(pi->pi_owner, pri);
+ thread_unlock(pi->pi_owner);
+ umtx_pi_adjust_locked(pi->pi_owner, oldpri);
+ pi = uq_owner->uq_pi_blocked;
+ }
+}
+
+/*
+ * Insert a PI mutex into owned list.
+ */
+static void
+umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
+{
+ struct umtx_q *uq_owner;
+
+ uq_owner = owner->td_umtxq;
+ mtx_assert(&umtx_lock, MA_OWNED);
+ if (pi->pi_owner != NULL)
+ panic("pi_ower != NULL");
+ pi->pi_owner = owner;
+ TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
+}
+
+/*
+ * Claim ownership of a PI mutex.
+ */
+static int
+umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
+{
+ struct umtx_q *uq, *uq_owner;
+
+ uq_owner = owner->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner == owner) {
+ mtx_unlock_spin(&umtx_lock);
+ return (0);
+ }
+
+ if (pi->pi_owner != NULL) {
+ /*
+ * userland may have already messed the mutex, sigh.
+ */
+ mtx_unlock_spin(&umtx_lock);
+ return (EPERM);
+ }
+ umtx_pi_setowner(pi, owner);
+ uq = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq != NULL) {
+ int pri;
+
+ pri = UPRI(uq->uq_thread);
+ thread_lock(owner);
+ if (pri < UPRI(owner))
+ sched_lend_user_prio(owner, pri);
+ thread_unlock(owner);
+ }
+ mtx_unlock_spin(&umtx_lock);
+ return (0);
+}
+
+static void
+umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ uq = td->td_umtxq;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ MPASS(TD_ON_UPILOCK(td));
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ pi = uq->uq_pi_blocked;
+ MPASS(pi != NULL);
+
+ /* Resort the turnstile on the list. */
+ if (!umtx_pi_adjust_thread(pi, td))
+ return;
+
+ /*
+ * If our priority was lowered and we are at the head of the
+ * turnstile, then propagate our new priority up the chain.
+ */
+ if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
+ umtx_propagate_priority(td);
+}
+
+/*
+ * Adjust a thread's order position in its blocked PI mutex,
+ * this may result new priority propagating process.
+ */
+void
+umtx_pi_adjust(struct thread *td, u_char oldpri)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ uq = td->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ pi = uq->uq_pi_blocked;
+ if (pi != NULL)
+ umtx_pi_adjust_locked(td, oldpri);
+ mtx_unlock_spin(&umtx_lock);
+}
+
+/*
+ * Sleep on a PI mutex.
+ */
+static int
+umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
+ uint32_t owner, const char *wmesg, int timo)
+{
+ struct umtxq_chain *uc;
+ struct thread *td, *td1;
+ struct umtx_q *uq1;
+ int pri;
+ int error = 0;
+
+ td = uq->uq_thread;
+ KASSERT(td == curthread, ("inconsistent uq_thread"));
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ umtxq_insert(uq);
+ if (pi->pi_owner == NULL) {
+ /* XXX
+ * Currently we only support process-private PI mutexes;
+ * non-contended PI mutexes are locked in userland.
+ * A process-shared PI mutex should always be initialized
+ * and registered in the kernel, and locking should always
+ * be done by the kernel to avoid security problems.
+ * For a process-private PI mutex, we can find the owner
+ * thread and boost its priority safely.
+ */
+ PROC_LOCK(curproc);
+ td1 = thread_find(curproc, owner);
+ mtx_lock_spin(&umtx_lock);
+ if (td1 != NULL && pi->pi_owner == NULL) {
+ uq1 = td1->td_umtxq;
+ umtx_pi_setowner(pi, td1);
+ }
+ PROC_UNLOCK(curproc);
+ } else {
+ mtx_lock_spin(&umtx_lock);
+ }
+
+ TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+ pri = UPRI(uq1->uq_thread);
+ if (pri > UPRI(td))
+ break;
+ }
+
+ if (uq1 != NULL)
+ TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+ else
+ TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+
+ uq->uq_pi_blocked = pi;
+ td->td_flags |= TDF_UPIBLOCKED;
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unlock(&uq->uq_key);
+
+ mtx_lock_spin(&umtx_lock);
+ umtx_propagate_priority(td);
+ mtx_unlock_spin(&umtx_lock);
+
+ umtxq_lock(&uq->uq_key);
+ if (uq->uq_flags & UQF_UMTXQ) {
+ error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
+ if (error == EWOULDBLOCK)
+ error = ETIMEDOUT;
+ if (uq->uq_flags & UQF_UMTXQ) {
+ umtxq_busy(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unbusy(&uq->uq_key);
+ }
+ }
+ umtxq_unlock(&uq->uq_key);
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_pi_blocked = NULL;
+ thread_lock(td);
+ td->td_flags &= ~TDF_UPIBLOCKED;
+ thread_unlock(td);
+ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+ umtx_unpropagate_priority(pi);
+ mtx_unlock_spin(&umtx_lock);
+
+ umtxq_lock(&uq->uq_key);
+
+ return (error);
+}
+
+/*
+ * Add reference count for a PI mutex.
+ */
+static void
+umtx_pi_ref(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ pi->pi_refcount++;
+}
+
+/*
+ * Decrease reference count for a PI mutex, if the counter
+ * is decreased to zero, its memory space is freed.
+ */
+static void
+umtx_pi_unref(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+ int free = 0;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
+ if (--pi->pi_refcount == 0) {
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner != NULL) {
+ TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
+ pi, pi_link);
+ pi->pi_owner = NULL;
+ }
+ KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
+ ("blocked queue not empty"));
+ mtx_unlock_spin(&umtx_lock);
+ TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
+ free = 1;
+ }
+ if (free)
+ umtx_pi_free(pi);
+}
+
+/*
+ * Find a PI mutex in hash table.
+ */
+static struct umtx_pi *
+umtx_pi_lookup(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+ struct umtx_pi *pi;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+
+ TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
+ if (umtx_key_match(&pi->pi_key, key)) {
+ return (pi);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Insert a PI mutex into hash table.
+ */
+static inline void
+umtx_pi_insert(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
+}
+
+/*
+ * Lock a PI mutex.
+ */
+static int
+_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
+ int try)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi, *new_pi;
+ uint32_t id, owner, old;
+ int error;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+
+ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+ umtxq_lock(&uq->uq_key);
+ pi = umtx_pi_lookup(&uq->uq_key);
+ if (pi == NULL) {
+ new_pi = umtx_pi_alloc(M_NOWAIT);
+ if (new_pi == NULL) {
+ umtxq_unlock(&uq->uq_key);
+ new_pi = umtx_pi_alloc(M_WAITOK);
+ new_pi->pi_key = uq->uq_key;
+ umtxq_lock(&uq->uq_key);
+ pi = umtx_pi_lookup(&uq->uq_key);
+ if (pi != NULL) {
+ umtx_pi_free(new_pi);
+ new_pi = NULL;
+ }
+ }
+ if (new_pi != NULL) {
+ new_pi->pi_key = uq->uq_key;
+ umtx_pi_insert(new_pi);
+ pi = new_pi;
+ }
+ }
+ umtx_pi_ref(pi);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Care must be exercised when dealing with the umtx structure; it
+ * can fault on any access.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED) {
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ umtxq_lock(&uq->uq_key);
+ error = umtx_pi_claim(pi, td);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id) {
+ error = EDEADLK;
+ break;
+ }
+
+ if (try != 0) {
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+ * either some one else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = EFAULT;
+ break;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ /*
+ * If we set the contested bit, sleep. Otherwise the lock changed
+ * and we need to retry, or we lost a race to the thread
+ * unlocking the umtx.
+ */
+ if (old == owner)
+ error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
+ "umtxpi", timo);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtx_pi_unref(pi);
+ umtxq_unlock(&uq->uq_key);
+
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
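
The comment in the loop above notes that the uncontested acquire "should be done in userland". A rough sketch of that fast path, written with C11 atomics rather than the actual libthr code (the names umutex_word and my_tid are invented for the example), looks like this; only when the compare-and-swap fails does the thread fall back to the _umtx_op() system call and reach _do_lock_pi().

/*
 * Illustrative userland fast path for an uncontested umutex acquire,
 * written with C11 atomics.  Hypothetical names; not the libthr code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define UMUTEX_UNOWNED	0U

static bool
try_lock_fast(_Atomic uint32_t *umutex_word, uint32_t my_tid)
{
	uint32_t expected = UMUTEX_UNOWNED;

	/* Userland analogue of casuword32(&m->m_owner, UMUTEX_UNOWNED, id),
	 * operating on the process's own mapping of the mutex word. */
	return (atomic_compare_exchange_strong(umutex_word, &expected,
	    my_tid));
}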
+
+/*
+ * Unlock a PI mutex.
+ */
+static int
+do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ struct umtx_q *uq_first, *uq_first2, *uq_me;
+ struct umtx_pi *pi, *pi2;
+ uint32_t owner, old, id;
+ int error;
+ int count;
+ int pri;
+
+ id = td->td_tid;
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count_pi(&key, &uq_first);
+ if (uq_first != NULL) {
+ pi = uq_first->uq_pi_blocked;
+ if (pi->pi_owner != curthread) {
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ /* userland messed up the mutex */
+ return (EPERM);
+ }
+ uq_me = curthread->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ pi->pi_owner = NULL;
+ TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
+ uq_first = TAILQ_FIRST(&pi->pi_blocked);
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
+ uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
+ if (uq_first2 != NULL) {
+ if (pri > UPRI(uq_first2->uq_thread))
+ pri = UPRI(uq_first2->uq_thread);
+ }
+ }
+ thread_lock(curthread);
+ sched_unlend_user_prio(curthread, pri);
+ thread_unlock(curthread);
+ mtx_unlock_spin(&umtx_lock);
+ }
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * zero or one thread is waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword32(&m->m_owner, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+
+ umtxq_lock(&key);
+ if (uq_first != NULL)
+ umtxq_signal_thread(uq_first);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Lock a PP mutex.
+ */
+static int
+_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
+ int try)
+{
+ struct umtx_q *uq, *uq2;
+ struct umtx_pi *pi;
+ uint32_t ceiling;
+ uint32_t owner, id;
+ int error, pri, old_inherited_pri, su;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+ su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+ for (;;) {
+ old_inherited_pri = uq->uq_inherited_pri;
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
+ if (ceiling > RTP_PRIO_MAX) {
+ error = EINVAL;
+ goto out;
+ }
+
+ mtx_lock_spin(&umtx_lock);
+ if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
+ mtx_unlock_spin(&umtx_lock);
+ error = EINVAL;
+ goto out;
+ }
+ if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
+ uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
+ thread_lock(td);
+ if (uq->uq_inherited_pri < UPRI(td))
+ sched_lend_user_prio(td, uq->uq_inherited_pri);
+ thread_unlock(td);
+ }
+ mtx_unlock_spin(&umtx_lock);
+
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id) {
+ error = EDEADLK;
+ break;
+ }
+
+ if (try != 0) {
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ error = umtxq_sleep(uq, "umtxpp", timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = old_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_unlend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+
+ if (error != 0) {
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = old_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_unlend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+
+out:
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Unlock a PP mutex.
+ */
+static int
+do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ struct umtx_q *uq, *uq2;
+ struct umtx_pi *pi;
+ uint32_t owner, id;
+ uint32_t rceiling;
+ int error, pri, new_inherited_pri, su;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
+ if (error != 0)
+ return (error);
+
+ if (rceiling == -1)
+ new_inherited_pri = PRI_MAX;
+ else {
+ rceiling = RTP_PRIO_MAX - rceiling;
+ if (rceiling > RTP_PRIO_MAX)
+ return (EINVAL);
+ new_inherited_pri = PRI_MIN_REALTIME + rceiling;
+ }
+
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ umtxq_unlock(&key);
+ /*
+ * For a priority-protected mutex, always set the unlocked state
+ * to UMUTEX_CONTESTED so that userland always enters the kernel
+ * to lock the mutex; this is necessary because the thread's
+ * priority has to be adjusted for such a mutex.
+ */
+ error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+ UMUTEX_CONTESTED);
+
+ umtxq_lock(&key);
+ if (error == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+
+ if (error == -1)
+ error = EFAULT;
+ else {
+ mtx_lock_spin(&umtx_lock);
+ if (su != 0)
+ uq->uq_inherited_pri = new_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_unlend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
+ uint32_t *old_ceiling)
+{
+ struct umtx_q *uq;
+ uint32_t save_ceiling;
+ uint32_t owner, id;
+ uint32_t flags;
+ int error;
+
+ flags = fuword32(&m->m_flags);
+ if ((flags & UMUTEX_PRIO_PROTECT) == 0)
+ return (EINVAL);
+ if (ceiling > RTP_PRIO_MAX)
+ return (EINVAL);
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+ for (;;) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ save_ceiling = fuword32(&m->m_ceilings[0]);
+
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ suword32(&m->m_ceilings[0], ceiling);
+ suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+ UMUTEX_CONTESTED);
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((owner & ~UMUTEX_CONTESTED) == id) {
+ suword32(&m->m_ceilings[0], ceiling);
+ error = 0;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ /*
+ * If we set the contested bit, sleep. Otherwise the lock changed
+ * and we need to retry, or we lost a race to the thread
+ * unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ error = umtxq_sleep(uq, "umtxpp", 0);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ }
+ umtxq_lock(&uq->uq_key);
+ if (error == 0)
+ umtxq_signal(&uq->uq_key, INT_MAX);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ if (error == 0 && old_ceiling != NULL)
+ suword32(old_ceiling, save_ceiling);
+ return (error);
+}
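
do_set_ceiling() is the kernel side of changing a priority-protection ceiling. A minimal userland sketch follows, using only standard pthread interfaces and assuming a thread library that maps PTHREAD_PRIO_PROTECT and the prioceiling calls onto the PP-umutex paths above; the ceiling values are arbitrary examples.

/*
 * Minimal sketch: a priority-protected (ceiling) mutex from userland.
 * Ceiling values 10 and 20 are example numbers only.
 */
#include <pthread.h>
#include <stdio.h>

int
main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;
	int old;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
	pthread_mutexattr_setprioceiling(&attr, 10);	/* initial ceiling */
	pthread_mutex_init(&m, &attr);

	/* Raise the ceiling later; this is the pthread-level analogue of
	 * UMTX_OP_SET_CEILING / do_set_ceiling(). */
	pthread_mutex_setprioceiling(&m, 20, &old);
	printf("old ceiling was %d\n", old);

	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return (0);
}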
+
+static int
+_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
+ int try)
+{
+ switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ return (_do_lock_normal(td, m, flags, timo, try));
+ case UMUTEX_PRIO_INHERIT:
+ return (_do_lock_pi(td, m, flags, timo, try));
+ case UMUTEX_PRIO_PROTECT:
+ return (_do_lock_pp(td, m, flags, timo, try));
+ }
+ return (EINVAL);
+}
+
+/*
+ * Lock a userland POSIX mutex.
+ */
+static int
+do_lock_umutex(struct thread *td, struct umutex *m,
+ struct timespec *timeout, int try)
+{
+ struct timespec ts, ts2, ts3;
+ struct timeval tv;
+ uint32_t flags;
+ int error;
+
+ flags = fuword32(&m->m_flags);
+ if (flags == -1)
+ return (EFAULT);
+
+ if (timeout == NULL) {
+ error = _do_lock_umutex(td, m, flags, 0, try);
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
+ } else {
+ getnanouptime(&ts);
+ timespecadd(&ts, timeout);
+ TIMESPEC_TO_TIMEVAL(&tv, timeout);
+ for (;;) {
+ error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
+ if (error != ETIMEDOUT)
+ break;
+ getnanouptime(&ts2);
+ if (timespeccmp(&ts2, &ts, >=)) {
+ error = ETIMEDOUT;
+ break;
+ }
+ ts3 = ts;
+ timespecsub(&ts3, &ts2);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+ }
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
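
The timed branch above converts the relative timeout into an absolute deadline once, then recomputes the remaining time after every ETIMEDOUT wakeup so that repeated sleeps do not stretch the total wait. The same pattern, expressed in plain userland C purely for illustration (not kernel code), looks like this:

/*
 * Relative timeout -> absolute deadline -> remaining time, recomputed
 * after every early wakeup.  Illustrative only.
 */
#include <stdbool.h>
#include <time.h>

static bool
deadline_expired(const struct timespec *deadline, struct timespec *left)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	if (now.tv_sec > deadline->tv_sec ||
	    (now.tv_sec == deadline->tv_sec &&
	     now.tv_nsec >= deadline->tv_nsec))
		return (true);

	/* Remaining time for the next sleep. */
	left->tv_sec = deadline->tv_sec - now.tv_sec;
	left->tv_nsec = deadline->tv_nsec - now.tv_nsec;
	if (left->tv_nsec < 0) {
		left->tv_sec--;
		left->tv_nsec += 1000000000L;
	}
	return (false);
}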
+
+/*
+ * Unlock a userland POSIX mutex.
+ */
+static int
+do_unlock_umutex(struct thread *td, struct umutex *m)
+{
+ uint32_t flags;
+
+ flags = fuword32(&m->m_flags);
+ if (flags == -1)
+ return (EFAULT);
+
+ switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ return (do_unlock_normal(td, m, flags));
+ case UMUTEX_PRIO_INHERIT:
+ return (do_unlock_pi(td, m, flags));
+ case UMUTEX_PRIO_PROTECT:
+ return (do_unlock_pp(td, m, flags));
+ }
+
+ return (EINVAL);
+}
+
+static int
+do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
+ struct timespec *timeout, u_long wflags)
+{
+ struct umtx_q *uq;
+ struct timeval tv;
+ struct timespec cts, ets, tts;
+ uint32_t flags;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&cv->c_flags);
+ error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * The key point is that c_has_waiters must be set to 1 before
+ * the user mutex is released.
+ */
+ suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ error = do_unlock_umutex(td, m);
+
+ umtxq_lock(&uq->uq_key);
+ if (error == 0) {
+ if ((wflags & UMTX_CHECK_UNPARKING) &&
+ (td->td_pflags & TDP_WAKEUP)) {
+ td->td_pflags &= ~TDP_WAKEUP;
+ error = EINTR;
+ } else if (timeout == NULL) {
+ error = umtxq_sleep(uq, "ucond", 0);
+ } else {
+ getnanouptime(&ets);
+ timespecadd(&ets, timeout);
+ TIMESPEC_TO_TIMEVAL(&tv, timeout);
+ for (;;) {
+ error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
+ if (error != ETIMEDOUT)
+ break;
+ getnanouptime(&cts);
+ if (timespeccmp(&cts, &ets, >=)) {
+ error = ETIMEDOUT;
+ break;
+ }
+ tts = ets;
+ timespecsub(&tts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &tts);
+ }
+ }
+ }
+
+ if (error != 0) {
+ if ((uq->uq_flags & UQF_UMTXQ) == 0) {
+ /*
+ * If we were concurrently hit by do_cv_signal() but are
+ * returning with an error, a UNIX signal or a timeout,
+ * perform another umtxq_signal() so that the wakeup is
+ * not consumed. This may cause a spurious wakeup for
+ * another thread which was just queued, but SUSv3
+ * explicitly allows spurious wakeups to occur, and
+ * indeed a kernel-based implementation cannot avoid
+ * them.
+ */
+ if (!umtxq_signal(&uq->uq_key, 1))
+ error = 0;
+ }
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
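
The ordering that do_cv_wait() depends on — announce the waiter, release the mutex, then sleep — is what keeps a signaller from missing a waiter that is between the unlock and the sleep. A compressed sketch of that protocol follows; the helper names are hypothetical and this is not the libthr implementation.

/*
 * Sketch of the waiter/signaller ordering do_cv_wait() relies on.
 * The extern helpers are hypothetical placeholders for the real
 * mutex unlock and the _umtx_op() sleep/wake calls.
 */
#include <stdatomic.h>

extern void user_mutex_unlock(void *m);
extern void kernel_cv_sleep(void *cv);	/* would be a _umtx_op() call */
extern void kernel_cv_wake(void *cv);	/* would be a _umtx_op() call */

_Atomic int c_has_waiters;

void
cv_wait_sketch(void *cv, void *m)
{
	/* 1. Announce the waiter... */
	atomic_store(&c_has_waiters, 1);
	/* 2. ...then release the mutex... */
	user_mutex_unlock(m);
	/* 3. ...then sleep.  A signaller that acquires the mutex after
	 * step 2 is guaranteed to see the flag set in step 1, so its
	 * wakeup cannot be lost. */
	kernel_cv_sleep(cv);
}

void
cv_signal_sketch(void *cv)
{
	if (atomic_load(&c_has_waiters))
		kernel_cv_wake(cv);	/* only trap when someone waits */
}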
+
+/*
+ * Signal a userland condition variable.
+ */
+static int
+do_cv_signal(struct thread *td, struct ucond *cv)
+{
+ struct umtx_key key;
+ int error, cnt, nwake;
+ uint32_t flags;
+
+ flags = fuword32(&cv->c_flags);
+ if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ cnt = umtxq_count(&key);
+ nwake = umtxq_signal(&key, 1);
+ if (cnt <= nwake) {
+ umtxq_unlock(&key);
+ error = suword32(
+ __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+ umtxq_lock(&key);
+ }
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_cv_broadcast(struct thread *td, struct ucond *cv)
+{
+ struct umtx_key key;
+ int error;
+ uint32_t flags;
+
+ flags = fuword32(&cv->c_flags);
+ if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ umtxq_signal(&key, INT_MAX);
+ umtxq_unlock(&key);
+
+ error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+
+ umtxq_lock(&key);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+
+ umtx_key_release(&key);
+ return (error);
+}
+
+int
+_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
+ /* struct umtx *umtx */
+{
+ return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
+}
+
+int
+_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
+ /* struct umtx *umtx */
+{
+ return do_unlock_umtx(td, uap->umtx, td->td_tid);
+}
+
+static int
+__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0) {
+ return (EINVAL);
+ }
+ ts = &timeout;
+ }
+ return (do_lock_umtx(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (do_unlock_umtx(td, uap->obj, uap->val));
+}
+
+static int
+__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0)
+ return (EINVAL);
+ ts = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, ts, 0);
+}
+
+static int
+__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (kern_umtx_wake(td, uap->obj, uap->val));
+}
+
+static int
+__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->uaddr2, &timeout,
+ sizeof(timeout));
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0) {
+ return (EINVAL);
+ }
+ ts = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, ts, 0);
+}
+
+static int
+__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_lock_umutex(td, uap->obj, NULL, 1);
+}
+
+static int
+__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_unlock_umutex(td, uap->obj);
+}
+
+static int
+__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
+}
+
+static int
+__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->uaddr2, &timeout,
+ sizeof(timeout));
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0) {
+ return (EINVAL);
+ }
+ ts = &timeout;
+ }
+ return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static int
+__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_cv_signal(td, uap->obj);
+}
+
+static int
+__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_cv_broadcast(td, uap->obj);
+}
+
+typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
+
+static _umtx_op_func op_table[] = {
+ __umtx_op_lock_umtx, /* UMTX_OP_LOCK */
+ __umtx_op_unlock_umtx, /* UMTX_OP_UNLOCK */
+ __umtx_op_wait, /* UMTX_OP_WAIT */
+ __umtx_op_wake, /* UMTX_OP_WAKE */
+ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
+ __umtx_op_lock_umutex, /* UMTX_OP_MUTEX_LOCK */
+ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
+ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
+ __umtx_op_cv_wait, /* UMTX_OP_CV_WAIT */
+ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
+ __umtx_op_cv_broadcast /* UMTX_OP_CV_BROADCAST */
+};
+
+int
+_umtx_op(struct thread *td, struct _umtx_op_args *uap)
+{
+ if ((unsigned)uap->op < UMTX_OP_MAX)
+ return (*op_table[uap->op])(td, uap);
+ return (EINVAL);
+}
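
The table above is what userland reaches through the _umtx_op(2) system call. A minimal sketch of driving two of its entries directly follows, assuming the _umtx_op() wrapper and UMTX_OP_* constants exposed by <sys/umtx.h>; error handling is omitted.

/*
 * Minimal sketch of calling into the dispatch table from userland.
 * Assumes the _umtx_op(2) wrapper and constants from <sys/umtx.h>.
 */
#include <sys/types.h>
#include <sys/umtx.h>

static u_long futex_word;

void
wait_until_changed(u_long expected)
{
	/* Sleeps only while futex_word still equals 'expected'
	 * (UMTX_OP_WAIT -> __umtx_op_wait -> do_wait). */
	_umtx_op(&futex_word, UMTX_OP_WAIT, expected, NULL, NULL);
}

void
wake_one_waiter(void)
{
	/* UMTX_OP_WAKE -> __umtx_op_wake -> kern_umtx_wake. */
	_umtx_op(&futex_word, UMTX_OP_WAKE, 1, NULL, NULL);
}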
+
+#ifdef COMPAT_IA32
+int
+freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
+ /* struct umtx *umtx */
+{
+ return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
+}
+
+int
+freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
+ /* struct umtx *umtx */
+{
+ return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
+}
+
+struct timespec32 {
+ u_int32_t tv_sec;
+ u_int32_t tv_nsec;
+};
+
+static inline int
+copyin_timeout32(void *addr, struct timespec *tsp)
+{
+ struct timespec32 ts32;
+ int error;
+
+ error = copyin(addr, &ts32, sizeof(struct timespec32));
+ if (error == 0) {
+ tsp->tv_sec = ts32.tv_sec;
+ tsp->tv_nsec = ts32.tv_nsec;
+ }
+ return (error);
+}
+
+static int
+__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0) {
+ return (EINVAL);
+ }
+ ts = &timeout;
+ }
+ return (do_lock_umtx32(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
+}
+
+static int
+__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0)
+ return (EINVAL);
+ ts = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, ts, 1);
+}
+
+static int
+__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0)
+ return (EINVAL);
+ ts = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, ts, 0);
+}
+
+static int
+__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ if (timeout.tv_nsec >= 1000000000 ||
+ timeout.tv_nsec < 0)
+ return (EINVAL);
+ ts = &timeout;
+ }
+ return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static _umtx_op_func op_table_compat32[] = {
+ __umtx_op_lock_umtx_compat32, /* UMTX_OP_LOCK */
+ __umtx_op_unlock_umtx_compat32, /* UMTX_OP_UNLOCK */
+ __umtx_op_wait_compat32, /* UMTX_OP_WAIT */
+ __umtx_op_wake, /* UMTX_OP_WAKE */
+ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
+ __umtx_op_lock_umutex_compat32, /* UMTX_OP_MUTEX_LOCK */
+ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
+ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
+ __umtx_op_cv_wait_compat32, /* UMTX_OP_CV_WAIT */
+ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
+ __umtx_op_cv_broadcast /* UMTX_OP_CV_BROADCAST */
+};
+
+int
+freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
+{
+ if ((unsigned)uap->op < UMTX_OP_MAX)
+ return (*op_table_compat32[uap->op])(td,
+ (struct _umtx_op_args *)uap);
+ return (EINVAL);
+}
+#endif
+
+void
+umtx_thread_init(struct thread *td)
+{
+ td->td_umtxq = umtxq_alloc();
+ td->td_umtxq->uq_thread = td;
+}
+
+void
+umtx_thread_fini(struct thread *td)
+{
+ umtxq_free(td->td_umtxq);
+}
+
+/*
+ * Called when a new thread is created, e.g. by fork().
+ */
+void
+umtx_thread_alloc(struct thread *td)
+{
+ struct umtx_q *uq;
+
+ uq = td->td_umtxq;
+ uq->uq_inherited_pri = PRI_MAX;
+
+ KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
+ KASSERT(uq->uq_thread == td, ("uq_thread != td"));
+ KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
+ KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
+}
+
+/*
+ * exec() hook.
+ */
+static void
+umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+ struct image_params *imgp __unused)
+{
+ umtx_thread_cleanup(curthread);
+}
+
+/*
+ * thread_exit() hook.
+ */
+void
+umtx_thread_exit(struct thread *td)
+{
+ umtx_thread_cleanup(td);
+}
+
+/*
+ * clean up umtx data.
+ */
+static void
+umtx_thread_cleanup(struct thread *td)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ if ((uq = td->td_umtxq) == NULL)
+ return;
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = PRI_MAX;
+ while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
+ pi->pi_owner = NULL;
+ TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
+ }
+ thread_lock(td);
+ td->td_flags &= ~TDF_UBORROWING;
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
}
Index: kern_sig.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_sig.c -L sys/kern/kern_sig.c -u -r1.2 -r1.3
--- sys/kern/kern_sig.c
+++ sys/kern/kern_sig.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_sig.c,v 1.306.2.2 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_sig.c,v 1.349.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
#include "opt_compat.h"
#include "opt_ktrace.h"
@@ -57,9 +57,9 @@
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
+#include <sys/posix4.h>
#include <sys/pioctl.h>
#include <sys/resourcevar.h>
-#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
@@ -69,14 +69,16 @@
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
+#include <sys/timers.h>
#include <sys/unistd.h>
#include <sys/wait.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
#include <machine/cpu.h>
-#if defined (__alpha__) && !defined(COMPAT_43)
-#error "You *really* need COMPAT_43 on the alpha for longjmp(3)"
-#endif
+#include <security/audit/audit.h>
#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */
@@ -91,18 +93,45 @@
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
static struct thread *sigtd(struct proc *p, int sig, int prop);
-static int kern_sigtimedwait(struct thread *td, sigset_t set,
- siginfo_t *info, struct timespec *timeout);
-static void do_tdsignal(struct thread *td, int sig, sigtarget_t target);
+#ifdef KSE
+static int do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *);
+#endif
+static void sigqueue_start(void);
+static uma_zone_t ksiginfo_zone = NULL;
struct filterops sig_filtops =
{ 0, filt_sigattach, filt_sigdetach, filt_signal };
-static int kern_logsigexit = 1;
+int kern_logsigexit = 1;
SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
&kern_logsigexit, 0,
"Log processes quitting on abnormal signals to syslog(3)");
+static int kern_forcesigexit = 1;
+SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
+ &kern_forcesigexit, 0, "Force trap signal to be handled");
+
+SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
+
+static int max_pending_per_proc = 128;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
+ &max_pending_per_proc, 0, "Max pending signals per proc");
+
+static int preallocate_siginfo = 1024;
+TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
+ &preallocate_siginfo, 0, "Preallocated signal memory size");
+
+static int signal_overflow = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
+ &signal_overflow, 0, "Number of signals that overflowed the queue limit");
+
+static int signal_alloc_fail = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
+ &signal_alloc_fail, 0, "Number of signals that failed allocation");
+
+SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
+
/*
* Policy -- Can ucred cr1 send SIGIO to process cr2?
* Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
@@ -175,33 +204,372 @@
SA_KILL|SA_PROC, /* SIGUSR2 */
};
+static void
+sigqueue_start(void)
+{
+ ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_prealloc(ksiginfo_zone, preallocate_siginfo);
+ p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
+ p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
+ p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
+}
+
+ksiginfo_t *
+ksiginfo_alloc(int wait)
+{
+ int flags;
+
+ flags = M_ZERO;
+ if (! wait)
+ flags |= M_NOWAIT;
+ if (ksiginfo_zone != NULL)
+ return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
+ return (NULL);
+}
+
+void
+ksiginfo_free(ksiginfo_t *ksi)
+{
+ uma_zfree(ksiginfo_zone, ksi);
+}
+
+static __inline int
+ksiginfo_tryfree(ksiginfo_t *ksi)
+{
+ if (!(ksi->ksi_flags & KSI_EXT)) {
+ uma_zfree(ksiginfo_zone, ksi);
+ return (1);
+ }
+ return (0);
+}
+
+void
+sigqueue_init(sigqueue_t *list, struct proc *p)
+{
+ SIGEMPTYSET(list->sq_signals);
+ SIGEMPTYSET(list->sq_kill);
+ TAILQ_INIT(&list->sq_list);
+ list->sq_proc = p;
+ list->sq_flags = SQ_INIT;
+}
+
+/*
+ * Get a signal's ksiginfo.
+ * Return:
+ * 0 - signal not found
+ * otherwise - the signal number
+ */
+int
+sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+ struct proc *p = sq->sq_proc;
+ struct ksiginfo *ksi, *next;
+ int count = 0;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (!SIGISMEMBER(sq->sq_signals, signo))
+ return (0);
+
+ if (SIGISMEMBER(sq->sq_kill, signo)) {
+ count++;
+ SIGDELSET(sq->sq_kill, signo);
+ }
+
+ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+ if (ksi->ksi_signo == signo) {
+ if (count == 0) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ ksiginfo_copy(ksi, si);
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+ if (++count > 1)
+ break;
+ }
+ }
+
+ if (count <= 1)
+ SIGDELSET(sq->sq_signals, signo);
+ si->ksi_signo = signo;
+ return (signo);
+}
+
+void
+sigqueue_take(ksiginfo_t *ksi)
+{
+ struct ksiginfo *kp;
+ struct proc *p;
+ sigqueue_t *sq;
+
+ if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
+ return;
+
+ p = sq->sq_proc;
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
+ p->p_pendingcnt--;
+
+ for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
+ kp = TAILQ_NEXT(kp, ksi_link)) {
+ if (kp->ksi_signo == ksi->ksi_signo)
+ break;
+ }
+ if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
+ SIGDELSET(sq->sq_signals, ksi->ksi_signo);
+}
+
+int
+sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+ struct proc *p = sq->sq_proc;
+ struct ksiginfo *ksi;
+ int ret = 0;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
+ SIGADDSET(sq->sq_kill, signo);
+ goto out_set_bit;
+ }
+
+ /* directly insert the ksi, don't copy it */
+ if (si->ksi_flags & KSI_INS) {
+ TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
+ si->ksi_sigq = sq;
+ goto out_set_bit;
+ }
+
+ if (__predict_false(ksiginfo_zone == NULL)) {
+ SIGADDSET(sq->sq_kill, signo);
+ goto out_set_bit;
+ }
+
+ if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
+ signal_overflow++;
+ ret = EAGAIN;
+ } else if ((ksi = ksiginfo_alloc(0)) == NULL) {
+ signal_alloc_fail++;
+ ret = EAGAIN;
+ } else {
+ if (p != NULL)
+ p->p_pendingcnt++;
+ ksiginfo_copy(si, ksi);
+ ksi->ksi_signo = signo;
+ TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = sq;
+ }
+
+ if ((si->ksi_flags & KSI_TRAP) != 0) {
+ if (ret != 0)
+ SIGADDSET(sq->sq_kill, signo);
+ ret = 0;
+ goto out_set_bit;
+ }
+
+ if (ret != 0)
+ return (ret);
+
+out_set_bit:
+ SIGADDSET(sq->sq_signals, signo);
+ return (ret);
+}
+
+void
+sigqueue_flush(sigqueue_t *sq)
+{
+ struct proc *p = sq->sq_proc;
+ ksiginfo_t *ksi;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+
+ SIGEMPTYSET(sq->sq_signals);
+ SIGEMPTYSET(sq->sq_kill);
+}
+
+void
+sigqueue_collect_set(sigqueue_t *sq, sigset_t *set)
+{
+ ksiginfo_t *ksi;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link)
+ SIGADDSET(*set, ksi->ksi_signo);
+ SIGSETOR(*set, sq->sq_kill);
+}
+
+void
+sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp)
+{
+ sigset_t tmp, set;
+ struct proc *p1, *p2;
+ ksiginfo_t *ksi, *next;
+
+ KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
+ KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
+ /*
+ * Make a copy; this allows setp to point to src or dst
+ * sq_signals without trouble.
+ */
+ set = *setp;
+ p1 = src->sq_proc;
+ p2 = dst->sq_proc;
+ /* Move siginfo to target list */
+ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
+ if (SIGISMEMBER(set, ksi->ksi_signo)) {
+ TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
+ if (p1 != NULL)
+ p1->p_pendingcnt--;
+ TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = dst;
+ if (p2 != NULL)
+ p2->p_pendingcnt++;
+ }
+ }
+
+ /* Move pending bits to target list */
+ tmp = src->sq_kill;
+ SIGSETAND(tmp, set);
+ SIGSETOR(dst->sq_kill, tmp);
+ SIGSETNAND(src->sq_kill, tmp);
+
+ tmp = src->sq_signals;
+ SIGSETAND(tmp, set);
+ SIGSETOR(dst->sq_signals, tmp);
+ SIGSETNAND(src->sq_signals, tmp);
+
+ /* Finally, rescan src queue and set pending bits for it */
+ sigqueue_collect_set(src, &src->sq_signals);
+}
+
+void
+sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_move_set(src, dst, &set);
+}
+
+void
+sigqueue_delete_set(sigqueue_t *sq, sigset_t *set)
+{
+ struct proc *p = sq->sq_proc;
+ ksiginfo_t *ksi, *next;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
+
+ /* Remove siginfo queue */
+ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+ if (SIGISMEMBER(*set, ksi->ksi_signo)) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+ }
+ SIGSETNAND(sq->sq_kill, *set);
+ SIGSETNAND(sq->sq_signals, *set);
+ /* Finally, rescan queue and set pending bits for it */
+ sigqueue_collect_set(sq, &sq->sq_signals);
+}
+
+void
+sigqueue_delete(sigqueue_t *sq, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_delete_set(sq, &set);
+}
+
+/* Remove a set of signals for a process */
+void
+sigqueue_delete_set_proc(struct proc *p, sigset_t *set)
+{
+ sigqueue_t worklist;
+ struct thread *td0;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ sigqueue_init(&worklist, NULL);
+ sigqueue_move_set(&p->p_sigqueue, &worklist, set);
+
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td0)
+ sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
+ PROC_SUNLOCK(p);
+
+ sigqueue_flush(&worklist);
+}
+
+void
+sigqueue_delete_proc(struct proc *p, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_delete_set_proc(p, &set);
+}
+
+void
+sigqueue_delete_stopmask_proc(struct proc *p)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, SIGSTOP);
+ SIGADDSET(set, SIGTSTP);
+ SIGADDSET(set, SIGTTIN);
+ SIGADDSET(set, SIGTTOU);
+ sigqueue_delete_set_proc(p, &set);
+}
+
/*
* Determine signal that should be delivered to process p, the current
* process, 0 if none. If there is a pending stop signal with default
* action, the process stops in issignal().
- * XXXKSE the check for a pending stop is not done under KSE
- *
- * MP SAFE.
*/
int
cursig(struct thread *td)
{
PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
- mtx_assert(&sched_lock, MA_NOTOWNED);
+ THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
return (SIGPENDING(td) ? issignal(td) : 0);
}
/*
* Arrange for ast() to handle unmasked pending signals on return to user
- * mode. This must be called whenever a signal is added to td_siglist or
+ * mode. This must be called whenever a signal is added to td_sigqueue or
* unmasked in td_sigmask.
*/
void
signotify(struct thread *td)
{
struct proc *p;
+#ifdef KSE
sigset_t set, saved;
+#else
+ sigset_t set;
+#endif
p = td->td_proc;
@@ -209,27 +577,30 @@
/*
* If our mask changed we may have to move signal that were
- * previously masked by all threads to our siglist.
+ * previously masked by all threads to our sigqueue.
*/
- set = p->p_siglist;
+ set = p->p_sigqueue.sq_signals;
+#ifdef KSE
if (p->p_flag & P_SA)
- saved = p->p_siglist;
+ saved = p->p_sigqueue.sq_signals;
+#endif
SIGSETNAND(set, td->td_sigmask);
- SIGSETNAND(p->p_siglist, set);
- SIGSETOR(td->td_siglist, set);
-
+ if (! SIGISEMPTY(set))
+ sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set);
if (SIGPENDING(td)) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
+#ifdef KSE
if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
- if (!SIGSETEQ(saved, p->p_siglist)) {
+ if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
/* pending set changed */
p->p_flag |= P_SIGEVENT;
wakeup(&p->p_siglist);
}
}
+#endif
}
int
@@ -273,8 +644,6 @@
* sigaction
* freebsd4_sigaction
* osigaction
- *
- * MPSAFE
*/
int
kern_sigaction(td, sig, act, oact, flags)
@@ -284,7 +653,6 @@
int flags;
{
struct sigacts *ps;
- struct thread *td0;
struct proc *p = td->td_proc;
if (!_SIG_VALID(sig))
@@ -382,17 +750,17 @@
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
(sigprop(sig) & SA_IGNORE &&
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
+#ifdef KSE
if ((p->p_flag & P_SA) &&
- SIGISMEMBER(p->p_siglist, sig)) {
+ SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) {
p->p_flag |= P_SIGEVENT;
wakeup(&p->p_siglist);
}
+#endif
/* never to be seen again */
- SIGDELSET(p->p_siglist, sig);
- mtx_lock_spin(&sched_lock);
- FOREACH_THREAD_IN_PROC(p, td0)
- SIGDELSET(td0->td_siglist, sig);
- mtx_unlock_spin(&sched_lock);
+ PROC_SLOCK(p);
+ sigqueue_delete_proc(p, sig);
+ PROC_SUNLOCK(p);
if (sig != SIGCONT)
/* easier in psignal */
SIGADDSET(ps->ps_sigignore, sig);
@@ -433,9 +801,6 @@
struct sigaction *oact;
};
#endif
-/*
- * MPSAFE
- */
int
sigaction(td, uap)
struct thread *td;
@@ -466,9 +831,6 @@
struct sigaction *oact;
};
#endif
-/*
- * MPSAFE
- */
int
freebsd4_sigaction(td, uap)
struct thread *td;
@@ -501,9 +863,6 @@
struct osigaction *osa;
};
#endif
-/*
- * MPSAFE
- */
int
osigaction(td, uap)
struct thread *td;
@@ -538,7 +897,7 @@
return (error);
}
-#if !defined(__i386__) && !defined(__alpha__)
+#if !defined(__i386__)
/* Avoid replicating the same stub everywhere */
int
osigreturn(td, uap)
@@ -597,11 +956,9 @@
if (sigprop(sig) & SA_IGNORE) {
if (sig != SIGCONT)
SIGADDSET(ps->ps_sigignore, sig);
- SIGDELSET(p->p_siglist, sig);
- /*
- * There is only one thread at this point.
- */
- SIGDELSET(td->td_siglist, sig);
+ PROC_SLOCK(p);
+ sigqueue_delete_proc(p, sig);
+ PROC_SUNLOCK(p);
}
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
@@ -668,10 +1025,6 @@
return (error);
}
-/*
- * sigprocmask() - MP SAFE
- */
-
#ifndef _SYS_SYSPROTO_H_
struct sigprocmask_args {
int how;
@@ -703,9 +1056,6 @@
}
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
-/*
- * osigprocmask() - MP SAFE
- */
#ifndef _SYS_SYSPROTO_H_
struct osigprocmask_args {
int how;
@@ -727,18 +1077,10 @@
}
#endif /* COMPAT_43 */
-#ifndef _SYS_SYSPROTO_H_
-struct sigpending_args {
- sigset_t *set;
-};
-#endif
-/*
- * MPSAFE
- */
int
sigwait(struct thread *td, struct sigwait_args *uap)
{
- siginfo_t info;
+ ksiginfo_t ksi;
sigset_t set;
int error;
@@ -748,7 +1090,7 @@
return (0);
}
- error = kern_sigtimedwait(td, set, &info, NULL);
+ error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error) {
if (error == ERESTART)
return (error);
@@ -756,26 +1098,18 @@
return (0);
}
- error = copyout(&info.si_signo, uap->sig, sizeof(info.si_signo));
- /* Repost if we got an error. */
- if (error && info.si_signo) {
- PROC_LOCK(td->td_proc);
- tdsignal(td, info.si_signo, SIGTARGET_TD);
- PROC_UNLOCK(td->td_proc);
- }
+ error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
td->td_retval[0] = error;
return (0);
}
-/*
- * MPSAFE
- */
+
int
sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
{
struct timespec ts;
struct timespec *timeout;
sigset_t set;
- siginfo_t info;
+ ksiginfo_t ksi;
int error;
if (uap->timeout) {
@@ -791,30 +1125,22 @@
if (error)
return (error);
- error = kern_sigtimedwait(td, set, &info, timeout);
+ error = kern_sigtimedwait(td, set, &ksi, timeout);
if (error)
return (error);
if (uap->info)
- error = copyout(&info, uap->info, sizeof(info));
- /* Repost if we got an error. */
- if (error && info.si_signo) {
- PROC_LOCK(td->td_proc);
- tdsignal(td, info.si_signo, SIGTARGET_TD);
- PROC_UNLOCK(td->td_proc);
- } else {
- td->td_retval[0] = info.si_signo;
- }
+ error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+ if (error == 0)
+ td->td_retval[0] = ksi.ksi_signo;
return (error);
}
-/*
- * MPSAFE
- */
int
sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
{
- siginfo_t info;
+ ksiginfo_t ksi;
sigset_t set;
int error;
@@ -822,26 +1148,21 @@
if (error)
return (error);
- error = kern_sigtimedwait(td, set, &info, NULL);
+ error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error)
return (error);
if (uap->info)
- error = copyout(&info, uap->info, sizeof(info));
- /* Repost if we got an error. */
- if (error && info.si_signo) {
- PROC_LOCK(td->td_proc);
- tdsignal(td, info.si_signo, SIGTARGET_TD);
- PROC_UNLOCK(td->td_proc);
- } else {
- td->td_retval[0] = info.si_signo;
- }
+ error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+ if (error == 0)
+ td->td_retval[0] = ksi.ksi_signo;
return (error);
}
-static int
-kern_sigtimedwait(struct thread *td, sigset_t waitset, siginfo_t *info,
- struct timespec *timeout)
+int
+kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
+ struct timespec *timeout)
{
struct sigacts *ps;
sigset_t savedmask;
@@ -853,6 +1174,8 @@
p = td->td_proc;
error = 0;
sig = 0;
+ ets.tv_sec = 0;
+ ets.tv_nsec = 0;
SIG_CANTMASK(waitset);
PROC_LOCK(p);
@@ -867,36 +1190,42 @@
}
}
-again:
+restart:
for (i = 1; i <= _SIG_MAXSIG; ++i) {
if (!SIGISMEMBER(waitset, i))
continue;
- if (SIGISMEMBER(td->td_siglist, i)) {
- SIGFILLSET(td->td_sigmask);
- SIG_CANTMASK(td->td_sigmask);
- SIGDELSET(td->td_sigmask, i);
- mtx_lock(&ps->ps_mtx);
- sig = cursig(td);
- i = 0;
- mtx_unlock(&ps->ps_mtx);
- } else if (SIGISMEMBER(p->p_siglist, i)) {
- if (p->p_flag & P_SA) {
- p->p_flag |= P_SIGEVENT;
- wakeup(&p->p_siglist);
- }
- SIGDELSET(p->p_siglist, i);
- SIGADDSET(td->td_siglist, i);
- SIGFILLSET(td->td_sigmask);
- SIG_CANTMASK(td->td_sigmask);
- SIGDELSET(td->td_sigmask, i);
- mtx_lock(&ps->ps_mtx);
- sig = cursig(td);
- i = 0;
- mtx_unlock(&ps->ps_mtx);
+ if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) {
+ if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) {
+#ifdef KSE
+ if (p->p_flag & P_SA) {
+ p->p_flag |= P_SIGEVENT;
+ wakeup(&p->p_siglist);
+ }
+#endif
+ sigqueue_move(&p->p_sigqueue,
+ &td->td_sigqueue, i);
+ } else
+ continue;
}
+
+ SIGFILLSET(td->td_sigmask);
+ SIG_CANTMASK(td->td_sigmask);
+ SIGDELSET(td->td_sigmask, i);
+ mtx_lock(&ps->ps_mtx);
+ sig = cursig(td);
+ mtx_unlock(&ps->ps_mtx);
if (sig)
goto out;
+ else {
+ /*
+ * Because cursig() may have stopped the current thread,
+ * things may already have changed by the time it is
+ * resumed, so rescan any pending signals.
+ */
+ goto restart;
+ }
}
+
if (error)
goto out;
@@ -934,49 +1263,54 @@
error = 0;
}
}
- goto again;
+ goto restart;
out:
td->td_sigmask = savedmask;
signotify(td);
if (sig) {
- sig_t action;
-
+ ksiginfo_init(ksi);
+ sigqueue_get(&td->td_sigqueue, sig, ksi);
+ ksi->ksi_signo = sig;
+ if (ksi->ksi_code == SI_TIMER)
+ itimer_accept(p, ksi->ksi_timerid, ksi);
error = 0;
- mtx_lock(&ps->ps_mtx);
- action = ps->ps_sigact[_SIG_IDX(sig)];
- mtx_unlock(&ps->ps_mtx);
+
#ifdef KTRACE
- if (KTRPOINT(td, KTR_PSIG))
+ if (KTRPOINT(td, KTR_PSIG)) {
+ sig_t action;
+
+ mtx_lock(&ps->ps_mtx);
+ action = ps->ps_sigact[_SIG_IDX(sig)];
+ mtx_unlock(&ps->ps_mtx);
ktrpsig(sig, action, &td->td_sigmask, 0);
+ }
#endif
- _STOPEVENT(p, S_SIG, sig);
-
- SIGDELSET(td->td_siglist, sig);
- bzero(info, sizeof(*info));
- info->si_signo = sig;
- info->si_code = 0;
+ if (sig == SIGKILL)
+ sigexit(td, sig);
}
PROC_UNLOCK(p);
return (error);
}
-/*
- * MPSAFE
- */
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+ sigset_t *set;
+};
+#endif
int
sigpending(td, uap)
struct thread *td;
struct sigpending_args *uap;
{
struct proc *p = td->td_proc;
- sigset_t siglist;
+ sigset_t pending;
PROC_LOCK(p);
- siglist = p->p_siglist;
- SIGSETOR(siglist, td->td_siglist);
+ pending = p->p_sigqueue.sq_signals;
+ SIGSETOR(pending, td->td_sigqueue.sq_signals);
PROC_UNLOCK(p);
- return (copyout(&siglist, uap->set, sizeof(sigset_t)));
+ return (copyout(&pending, uap->set, sizeof(sigset_t)));
}
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
@@ -985,22 +1319,19 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
int
osigpending(td, uap)
struct thread *td;
struct osigpending_args *uap;
{
struct proc *p = td->td_proc;
- sigset_t siglist;
+ sigset_t pending;
PROC_LOCK(p);
- siglist = p->p_siglist;
- SIGSETOR(siglist, td->td_siglist);
+ pending = p->p_sigqueue.sq_signals;
+ SIGSETOR(pending, td->td_sigqueue.sq_signals);
PROC_UNLOCK(p);
- SIG2OSIG(siglist, td->td_retval[0]);
+ SIG2OSIG(pending, td->td_retval[0]);
return (0);
}
#endif /* COMPAT_43 */
@@ -1016,9 +1347,6 @@
struct sigvec *osv;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
osigvec(td, uap)
@@ -1060,9 +1388,6 @@
int mask;
};
#endif
-/*
- * MPSAFE
- */
int
osigblock(td, uap)
register struct thread *td;
@@ -1085,9 +1410,6 @@
int mask;
};
#endif
-/*
- * MPSAFE
- */
int
osigsetmask(td, uap)
struct thread *td;
@@ -1108,20 +1430,14 @@
#endif /* COMPAT_43 */
/*
- * Suspend process until signal, providing mask to be set
- * in the meantime.
- ***** XXXKSE this doesn't make sense under KSE.
- ***** Do we suspend the thread or all threads in the process?
- ***** How do we suspend threads running NOW on another processor?
+ * Suspend calling thread until signal, providing mask to be set in the
+ * meantime.
*/
#ifndef _SYS_SYSPROTO_H_
struct sigsuspend_args {
const sigset_t *sigmask;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
sigsuspend(td, uap)
@@ -1172,9 +1488,6 @@
osigset_t mask;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
osigsuspend(td, uap)
@@ -1206,9 +1519,6 @@
struct sigstack *oss;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
osigstack(td, uap)
@@ -1244,9 +1554,6 @@
stack_t *oss;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
sigaltstack(td, uap)
@@ -1290,9 +1597,9 @@
if ((ss->ss_flags & ~SS_DISABLE) != 0)
return (EINVAL);
if (!(ss->ss_flags & SS_DISABLE)) {
- if (ss->ss_size < p->p_sysent->sv_minsigstksz) {
+ if (ss->ss_size < p->p_sysent->sv_minsigstksz)
return (ENOMEM);
- }
+
td->td_sigstk = *ss;
td->td_pflags |= TDP_ALTSTACK;
} else {
@@ -1320,10 +1627,10 @@
* broadcast
*/
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
- p == td->td_proc) {
+ p == td->td_proc || p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
@@ -1353,7 +1660,8 @@
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
- if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) {
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p->p_state == PRS_NEW ) {
PROC_UNLOCK(p);
continue;
}
@@ -1375,9 +1683,6 @@
int signum;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
kill(td, uap)
@@ -1387,6 +1692,8 @@
register struct proc *p;
int error;
+ AUDIT_ARG(signum, uap->signum);
+ AUDIT_ARG(pid, uap->pid);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
@@ -1396,6 +1703,7 @@
if ((p = zpfind(uap->pid)) == NULL)
return (ESRCH);
}
+ AUDIT_ARG(process, p);
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum)
psignal(p, uap->signum);
@@ -1420,9 +1728,6 @@
int signum;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
okillpg(td, uap)
@@ -1430,12 +1735,57 @@
register struct okillpg_args *uap;
{
+ AUDIT_ARG(signum, uap->signum);
+ AUDIT_ARG(pid, uap->pgid);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
+
return (killpg1(td, uap->signum, uap->pgid, 0));
}
#endif /* COMPAT_43 */
+#ifndef _SYS_SYSPROTO_H_
+struct sigqueue_args {
+ pid_t pid;
+ int signum;
+ /* union sigval */ void *value;
+};
+#endif
+int
+sigqueue(struct thread *td, struct sigqueue_args *uap)
+{
+ ksiginfo_t ksi;
+ struct proc *p;
+ int error;
+
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ /*
+ * The specification says sigqueue can only send a signal to
+ * a single process.
+ */
+ if (uap->pid <= 0)
+ return (EINVAL);
+
+ if ((p = pfind(uap->pid)) == NULL) {
+ if ((p = zpfind(uap->pid)) == NULL)
+ return (ESRCH);
+ }
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum != 0) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_QUEUE;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ ksi.ksi_value.sival_ptr = uap->value;
+ error = tdsignal(p, NULL, ksi.ksi_signo, &ksi);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
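
The syscall above backs POSIX sigqueue(3). A small self-contained userland example follows; it blocks SIGUSR1, queues a signal carrying a value to itself, and then fetches it with sigwaitinfo(), which in turn goes through the kern_sigtimedwait() path reworked in this diff. Standard interfaces only.

/*
 * sigqueue(3) round trip: queue a value-carrying signal to ourselves
 * and dequeue it synchronously with sigwaitinfo().
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	siginfo_t si;
	union sigval v;

	/* Block SIGUSR1 so it stays queued until we fetch it. */
	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);

	v.sival_int = 42;
	sigqueue(getpid(), SIGUSR1, v);		/* the syscall above */

	/* Dequeue it; the value set via ksi_value arrives as si_value. */
	sigwaitinfo(&set, &si);
	printf("sig %d, value %d\n", si.si_signo, si.si_value.sival_int);
	return (0);
}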
+
/*
* Send a signal to a process group.
*/
@@ -1479,27 +1829,33 @@
}
/*
- * Send a signal caused by a trap to the current thread.
- * If it will be caught immediately, deliver it with correct code.
- * Otherwise, post it normally.
- *
- * MPSAFE
+ * Send a signal caused by a trap to the current thread. If it will be
+ * caught immediately, deliver it with correct code. Otherwise, post it
+ * normally.
*/
void
-trapsignal(struct thread *td, int sig, u_long code)
+trapsignal(struct thread *td, ksiginfo_t *ksi)
{
struct sigacts *ps;
struct proc *p;
- siginfo_t siginfo;
+#ifdef KSE
int error;
+#endif
+ int sig;
+ int code;
p = td->td_proc;
+ sig = ksi->ksi_signo;
+ code = ksi->ksi_code;
+ KASSERT(_SIG_VALID(sig), ("invalid signal"));
+
+#ifdef KSE
if (td->td_pflags & TDP_SA) {
if (td->td_mailbox == NULL)
thread_user_enter(td);
PROC_LOCK(p);
SIGDELSET(td->td_sigmask, sig);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
/*
* Force scheduling an upcall, so UTS has chance to
* process the signal before thread runs again in
@@ -1507,24 +1863,32 @@
*/
if (td->td_upcall)
td->td_upcall->ku_flags |= KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
} else {
PROC_LOCK(p);
}
+#else
+ PROC_LOCK(p);
+#endif
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
!SIGISMEMBER(td->td_sigmask, sig)) {
- p->p_stats->p_ru.ru_nsignals++;
+ td->td_ru.ru_nsignals++;
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_PSIG))
ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
&td->td_sigmask, code);
#endif
+#ifdef KSE
if (!(td->td_pflags & TDP_SA))
- (*p->p_sysent->sv_sendsig)(
- ps->ps_sigact[_SIG_IDX(sig)], sig,
- &td->td_sigmask, code);
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
+ ksi, &td->td_sigmask);
+#else
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
+ ksi, &td->td_sigmask);
+#endif
+#ifdef KSE
else if (td->td_mailbox == NULL) {
mtx_unlock(&ps->ps_mtx);
/* UTS caused a sync signal */
@@ -1532,18 +1896,18 @@
p->p_sig = sig; /* XXX to verify code */
sigexit(td, sig);
} else {
- cpu_thread_siginfo(sig, code, &siginfo);
mtx_unlock(&ps->ps_mtx);
SIGADDSET(td->td_sigmask, sig);
PROC_UNLOCK(p);
- error = copyout(&siginfo, &td->td_mailbox->tm_syncsig,
- sizeof(siginfo));
+ error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
+ sizeof(siginfo_t));
PROC_LOCK(p);
/* UTS memory corrupted */
if (error)
sigexit(td, SIGSEGV);
mtx_lock(&ps->ps_mtx);
}
+#endif
SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
if (!SIGISMEMBER(ps->ps_signodefer, sig))
SIGADDSET(td->td_sigmask, sig);
@@ -1559,10 +1923,23 @@
}
mtx_unlock(&ps->ps_mtx);
} else {
+ /*
+ * Avoid a possible infinite loop if the thread
+ * masking the signal or process is ignoring the
+ * signal.
+ */
+ if (kern_forcesigexit &&
+ (SIGISMEMBER(td->td_sigmask, sig) ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
+ SIGDELSET(td->td_sigmask, sig);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ SIGDELSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
mtx_unlock(&ps->ps_mtx);
p->p_code = code; /* XXX for core dump/debugger */
p->p_sig = sig; /* XXX to verify code */
- tdsignal(td, sig, SIGTARGET_TD);
+ tdsignal(p, td, sig, ksi);
}
PROC_UNLOCK(p);
}
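trapsignal() now takes a ksiginfo_t instead of a bare (sig, code) pair, so
machine-dependent trap handlers can carry full siginfo through to delivery.
A rough sketch of the new calling convention from a fault handler, using the
ksiginfo_t field names visible elsewhere in this diff (illustrative fragment
only; the actual MD trap code is not part of this file, and the fault_addr
variable is hypothetical):

	ksiginfo_t ksi;

	ksiginfo_init(&ksi);
	ksi.ksi_signo = SIGSEGV;
	ksi.ksi_code = SEGV_MAPERR;	/* why the fault happened */
	ksi.ksi_addr = fault_addr;	/* faulting address (hypothetical variable) */
	trapsignal(td, &ksi);		/* was trapsignal(td, SIGSEGV, code) */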
@@ -1581,7 +1958,7 @@
if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
return (curthread);
signal_td = NULL;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td) {
if (!SIGISMEMBER(td->td_sigmask, sig)) {
signal_td = td;
@@ -1590,7 +1967,7 @@
}
if (signal_td == NULL)
signal_td = FIRST_THREAD_IN_PROC(p);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
return (signal_td);
}
@@ -1606,75 +1983,97 @@
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
- *
- * MPSAFE
+ *
+ * NB: This function may be entered from the debugger via the "kill" DDB
+ * command. There is little that can be done to mitigate the possibly messy
+ * side effects of this unwise possibility.
*/
void
psignal(struct proc *p, int sig)
{
- struct thread *td;
- int prop;
+ (void) tdsignal(p, NULL, sig, NULL);
+}
- if (!_SIG_VALID(sig))
- panic("psignal(): invalid signal");
+int
+psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+ struct thread *td = NULL;
PROC_LOCK_ASSERT(p, MA_OWNED);
- /*
- * IEEE Std 1003.1-2001: return success when killing a zombie.
- */
- if (p->p_state == PRS_ZOMBIE)
- return;
- prop = sigprop(sig);
+
+ KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue"));
/*
- * Find a thread to deliver the signal to.
+ * ksi_code and other fields should be set before
+ * calling this function.
*/
- td = sigtd(p, sig, prop);
-
- tdsignal(td, sig, SIGTARGET_P);
+ ksi->ksi_signo = sigev->sigev_signo;
+ ksi->ksi_value = sigev->sigev_value;
+ if (sigev->sigev_notify == SIGEV_THREAD_ID) {
+ td = thread_find(p, sigev->sigev_notify_thread_id);
+ if (td == NULL)
+ return (ESRCH);
+ }
+ return (tdsignal(p, td, ksi->ksi_signo, ksi));
}
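psignal_event() is the new entry point for sigevent-described notifications
(POSIX timers and similar): the caller fills in a ksiginfo_t, and the
sigevent selects between process-wide delivery and SIGEV_THREAD_ID delivery
to one thread. A hedged sketch of the calling convention, assembled from the
fields used above (illustrative fragment, not code from this commit):

	struct sigevent ev;
	ksiginfo_t ksi;
	int error;

	ksiginfo_init(&ksi);
	ksi.ksi_code = SI_TIMER;	/* caller supplies ksi_code, per the comment above */
	ev.sigev_notify = SIGEV_SIGNAL;	/* or SIGEV_THREAD_ID plus sigev_notify_thread_id */
	ev.sigev_signo = SIGALRM;
	ev.sigev_value.sival_int = 1;

	PROC_LOCK(p);
	error = psignal_event(p, &ev, &ksi);	/* asserts the proc lock is held */
	PROC_UNLOCK(p);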
-/*
- * MPSAFE
- */
-void
-tdsignal(struct thread *td, int sig, sigtarget_t target)
+int
+tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
{
+#ifdef KSE
sigset_t saved;
- struct proc *p = td->td_proc;
+ int ret;
if (p->p_flag & P_SA)
- saved = p->p_siglist;
- do_tdsignal(td, sig, target);
+ saved = p->p_sigqueue.sq_signals;
+ ret = do_tdsignal(p, td, sig, ksi);
if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
- if (!SIGSETEQ(saved, p->p_siglist)) {
+ if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
/* pending set changed */
p->p_flag |= P_SIGEVENT;
wakeup(&p->p_siglist);
}
}
+ return (ret);
}
-static void
-do_tdsignal(struct thread *td, int sig, sigtarget_t target)
+static int
+do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
{
- struct proc *p;
- register sig_t action;
- sigset_t *siglist;
- struct thread *td0;
- register int prop;
+#endif
+ sig_t action;
+ sigqueue_t *sigqueue;
+ int prop;
struct sigacts *ps;
int intrval;
+ int ret = 0;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
if (!_SIG_VALID(sig))
- panic("do_tdsignal(): invalid signal");
+#ifdef KSE
+ panic("do_tdsignal(): invalid signal %d", sig);
+#else
+ panic("tdsignal(): invalid signal %d", sig);
+#endif
- p = td->td_proc;
- ps = p->p_sigacts;
+#ifdef KSE
+ KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue"));
+#else
+ KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue"));
+#endif
- PROC_LOCK_ASSERT(p, MA_OWNED);
- KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
+ /*
+ * IEEE Std 1003.1-2001: return success when killing a zombie.
+ */
+ if (p->p_state == PRS_ZOMBIE) {
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+ ps = p->p_sigacts;
+ KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
prop = sigprop(sig);
/*
@@ -1682,13 +2081,15 @@
* assign it to the process so that we can find it later in the first
* thread that unblocks it. Otherwise, assign it to this thread now.
*/
- if (target == SIGTARGET_TD) {
- siglist = &td->td_siglist;
- } else {
- if (!SIGISMEMBER(td->td_sigmask, sig))
- siglist = &td->td_siglist;
+ if (td == NULL) {
+ td = sigtd(p, sig, prop);
+ if (SIGISMEMBER(td->td_sigmask, sig))
+ sigqueue = &p->p_sigqueue;
else
- siglist = &p->p_siglist;
+ sigqueue = &td->td_sigqueue;
+ } else {
+ KASSERT(td->td_proc == p, ("invalid thread"));
+ sigqueue = &td->td_sigqueue;
}
/*
@@ -1699,10 +2100,11 @@
* action will be SIG_DFL here.)
*/
mtx_lock(&ps->ps_mtx);
- if (SIGISMEMBER(ps->ps_sigignore, sig) ||
- (p->p_flag & P_WEXIT)) {
+ if (SIGISMEMBER(ps->ps_sigignore, sig)) {
mtx_unlock(&ps->ps_mtx);
- return;
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
}
if (SIGISMEMBER(td->td_sigmask, sig))
action = SIG_HOLD;
@@ -1716,19 +2118,9 @@
intrval = ERESTART;
mtx_unlock(&ps->ps_mtx);
- if (prop & SA_CONT) {
- SIG_STOPSIGMASK(p->p_siglist);
- /*
- * XXX Should investigate leaving STOP and CONT sigs only in
- * the proc's siglist.
- */
- mtx_lock_spin(&sched_lock);
- FOREACH_THREAD_IN_PROC(p, td0)
- SIG_STOPSIGMASK(td0->td_siglist);
- mtx_unlock_spin(&sched_lock);
- }
-
- if (prop & SA_STOP) {
+ if (prop & SA_CONT)
+ sigqueue_delete_stopmask_proc(p);
+ else if (prop & SA_STOP) {
/*
* If sending a tty stop signal to a member of an orphaned
* process group, discard the signal here if the action
@@ -1737,25 +2129,33 @@
*/
if ((prop & SA_TTYSTOP) &&
(p->p_pgrp->pg_jobc == 0) &&
- (action == SIG_DFL))
- return;
- SIG_CONTSIGMASK(p->p_siglist);
- mtx_lock_spin(&sched_lock);
- FOREACH_THREAD_IN_PROC(p, td0)
- SIG_CONTSIGMASK(td0->td_siglist);
- mtx_unlock_spin(&sched_lock);
- p->p_flag &= ~P_CONTINUED;
+ (action == SIG_DFL)) {
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+ PROC_SLOCK(p);
+ sigqueue_delete_proc(p, SIGCONT);
+ PROC_SUNLOCK(p);
+ if (p->p_flag & P_CONTINUED) {
+ p->p_flag &= ~P_CONTINUED;
+ PROC_LOCK(p->p_pptr);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(p->p_pptr);
+ }
}
- SIGADDSET(*siglist, sig);
- signotify(td); /* uses schedlock */
+ ret = sigqueue_add(sigqueue, sig, ksi);
+ if (ret != 0)
+ return (ret);
+ signotify(td);
/*
* Defer further processing for signals which are held,
* except that stopped processes must be continued by SIGCONT.
*/
if (action == SIG_HOLD &&
!((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
- return;
+ return (ret);
/*
* SIGKILL: Remove procfs STOPEVENTs.
*/
@@ -1774,6 +2174,7 @@
* waking up threads so that they can cross the user boundary.
* We try do the per-process part here.
*/
+ PROC_SLOCK(p);
if (P_SHOULDSTOP(p)) {
/*
* The process is in stopped mode. All the threads should be
@@ -1785,6 +2186,7 @@
* so no further action is necessary.
* No signal can restart us.
*/
+ PROC_SUNLOCK(p);
goto out;
}
@@ -1801,19 +2203,32 @@
if (prop & SA_CONT) {
/*
* If SIGCONT is default (or ignored), we continue the
- * process but don't leave the signal in siglist as
+ * process but don't leave the signal in sigqueue as
* it has no further action. If SIGCONT is held, we
* continue the process and leave the signal in
- * siglist. If the process catches SIGCONT, let it
+ * sigqueue. If the process catches SIGCONT, let it
* handle the signal itself. If it isn't waiting on
* an event, it goes back to run state.
* Otherwise, process goes back to sleep state.
*/
p->p_flag &= ~P_STOPPED_SIG;
- p->p_flag |= P_CONTINUED;
+ if (p->p_numthreads == p->p_suspcount) {
+ PROC_SUNLOCK(p);
+ p->p_flag |= P_CONTINUED;
+ p->p_xstat = SIGCONT;
+ PROC_LOCK(p->p_pptr);
+ childproc_continued(p);
+ PROC_UNLOCK(p->p_pptr);
+ PROC_SLOCK(p);
+ }
if (action == SIG_DFL) {
- SIGDELSET(*siglist, sig);
- } else if (action == SIG_CATCH) {
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+ if (action == SIG_CATCH) {
+#ifdef KSE
/*
* The process wants to catch it so it needs
* to run at least one thread, but which one?
@@ -1824,14 +2239,18 @@
* single thread is runnable asap.
* XXXKSE for now however, make them all run.
*/
+#endif
+ /*
+ * The process wants to catch it so it needs
+ * to run at least one thread, but which one?
+ */
goto runfast;
}
/*
* The signal is not ignored or caught.
*/
- mtx_lock_spin(&sched_lock);
thread_unsuspend(p);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
goto out;
}
@@ -1841,8 +2260,9 @@
* (If we did the shell could get confused).
* Just make sure the signal STOP bit set.
*/
+ PROC_SUNLOCK(p);
p->p_flag |= P_STOPPED_SIG;
- SIGDELSET(*siglist, sig);
+ sigqueue_delete(sigqueue, sig);
goto out;
}
@@ -1854,10 +2274,11 @@
* the PROCESS runnable, leave it stopped.
* It may run a bit until it hits a thread_suspend_check().
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
sleepq_abort(td, intrval);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
goto out;
/*
* Mutexes are short lived. Threads waiting on them will
@@ -1865,28 +2286,36 @@
*/
} else if (p->p_state == PRS_NORMAL) {
if (p->p_flag & P_TRACED || action == SIG_CATCH) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
tdsigwakeup(td, sig, action, intrval);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
goto out;
}
MPASS(action == SIG_DFL);
if (prop & SA_STOP) {
- if (p->p_flag & P_PPWAIT)
+ if (p->p_flag & P_PPWAIT) {
+ PROC_SUNLOCK(p);
goto out;
+ }
p->p_flag |= P_STOPPED_SIG;
p->p_xstat = sig;
- mtx_lock_spin(&sched_lock);
sig_suspend_threads(td, p, 1);
- thread_stopped(p);
if (p->p_numthreads == p->p_suspcount) {
- SIGDELSET(p->p_siglist, p->p_xstat);
- FOREACH_THREAD_IN_PROC(p, td0)
- SIGDELSET(td0->td_siglist, p->p_xstat);
- }
- mtx_unlock_spin(&sched_lock);
+ /*
+ * Only a thread sending a signal to another
+ * process can reach here; a thread signalling
+ * its own process does not suspend itself
+ * here, so p_numthreads should never equal
+ * p_suspcount in that case.
+ */
+ thread_stopped(p);
+ PROC_SUNLOCK(p);
+ sigqueue_delete_proc(p, p->p_xstat);
+ } else
+ PROC_SUNLOCK(p);
goto out;
}
else
@@ -1894,7 +2323,8 @@
/* NOTREACHED */
} else {
/* Not in "NORMAL" state. discard the signal. */
- SIGDELSET(*siglist, sig);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(sigqueue, sig);
goto out;
}
@@ -1904,13 +2334,15 @@
*/
runfast:
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
tdsigwakeup(td, sig, action, intrval);
+ thread_unlock(td);
thread_unsuspend(p);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
out:
- /* If we jump here, sched_lock should not be owned. */
- mtx_assert(&sched_lock, MA_NOTOWNED);
+ /* If we jump here, proc slock should not be owned. */
+ PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
+ return (ret);
}
/*
@@ -1925,19 +2357,16 @@
register int prop;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
prop = sigprop(sig);
/*
* Bring the priority of a thread up if we want it to get
* killed in this lifetime.
*/
- if (action == SIG_DFL && (prop & SA_KILL)) {
- if (p->p_nice > 0)
- sched_nice(td->td_proc, 0);
- if (td->td_priority > PUSER)
- sched_prio(td, PUSER);
- }
+ if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
+ sched_prio(td, PUSER);
if (TD_ON_SLEEPQ(td)) {
/*
@@ -1954,12 +2383,16 @@
* be awakened.
*/
if ((prop & SA_CONT) && action == SIG_DFL) {
- SIGDELSET(p->p_siglist, sig);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(&p->p_sigqueue, sig);
/*
* It may be on either list in this state.
* Remove from both for now.
*/
- SIGDELSET(td->td_siglist, sig);
+ sigqueue_delete(&td->td_sigqueue, sig);
+ PROC_SLOCK(p);
+ thread_lock(td);
return;
}
@@ -1989,9 +2422,10 @@
struct thread *td2;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
FOREACH_THREAD_IN_PROC(p, td2) {
+ thread_lock(td2);
if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
(td2->td_flags & TDF_SINTR) &&
!TD_IS_SUSPENDED(td2)) {
@@ -2004,6 +2438,7 @@
forward_signal(td2);
#endif
}
+ thread_unlock(td2);
}
}
@@ -2014,17 +2449,19 @@
PROC_LOCK_ASSERT(p, MA_OWNED);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
- &p->p_mtx.mtx_object, "Stopping for traced signal");
+ &p->p_mtx.lock_object, "Stopping for traced signal");
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags |= TDF_XSIG;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
td->td_xsig = sig;
+ PROC_SLOCK(p);
while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) {
if (p->p_flag & P_SINGLE_EXIT) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags &= ~TDF_XSIG;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
return (sig);
}
/*
@@ -2034,26 +2471,19 @@
p->p_xstat = sig;
p->p_xthread = td;
p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
- mtx_lock_spin(&sched_lock);
sig_suspend_threads(td, p, 0);
stopme:
- thread_stopped(p);
- thread_suspend_one(td);
- PROC_UNLOCK(p);
- DROP_GIANT();
- mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
- PICKUP_GIANT();
- PROC_LOCK(p);
- if (!(p->p_flag & P_TRACED))
+ thread_suspend_switch(td);
+ if (!(p->p_flag & P_TRACED)) {
break;
+ }
if (td->td_flags & TDF_DBSUSPEND) {
if (p->p_flag & P_SINGLE_EXIT)
break;
- mtx_lock_spin(&sched_lock);
goto stopme;
}
}
+ PROC_SUNLOCK(p);
return (td->td_xsig);
}
@@ -2085,7 +2515,7 @@
for (;;) {
int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
- sigpending = td->td_siglist;
+ sigpending = td->td_sigqueue.sq_signals;
SIGSETNAND(sigpending, td->td_sigmask);
if (p->p_flag & P_PPWAIT)
@@ -2105,9 +2535,11 @@
* only if P_TRACED was on when they were posted.
*/
if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
- SIGDELSET(td->td_siglist, sig);
+ sigqueue_delete(&td->td_sigqueue, sig);
+#ifdef KSE
if (td->td_pflags & TDP_SA)
SIGADDSET(td->td_sigmask, sig);
+#endif
continue;
}
if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
@@ -2118,17 +2550,43 @@
newsig = ptracestop(td, sig);
mtx_lock(&ps->ps_mtx);
- /*
- * If parent wants us to take the signal,
- * then it will leave it in p->p_xstat;
- * otherwise we just look for signals again.
- */
- SIGDELSET(td->td_siglist, sig); /* clear old signal */
+#ifdef KSE
if (td->td_pflags & TDP_SA)
SIGADDSET(td->td_sigmask, sig);
- if (newsig == 0)
- continue;
- sig = newsig;
+
+#endif
+ if (sig != newsig) {
+ ksiginfo_t ksi;
+ /*
+ * clear old signal.
+ * XXX shrug off debugger, it causes siginfo to
+ * be thrown away.
+ */
+ sigqueue_get(&td->td_sigqueue, sig, &ksi);
+
+ /*
+ * If parent wants us to take the signal,
+ * then it will leave it in p->p_xstat;
+ * otherwise we just look for signals again.
+ */
+ if (newsig == 0)
+ continue;
+ sig = newsig;
+
+ /*
+ * Put the new signal into td_sigqueue. If the
+ * signal is being masked, look for other signals.
+ */
+ SIGADDSET(td->td_sigqueue.sq_signals, sig);
+#ifdef KSE
+ if (td->td_pflags & TDP_SA)
+ SIGDELSET(td->td_sigmask, sig);
+#endif
+ if (SIGISMEMBER(td->td_sigmask, sig))
+ continue;
+ signotify(td);
+ }
+
/*
* If the traced bit got turned off, go back up
* to the top to rescan signals. This ensures
@@ -2136,17 +2594,6 @@
*/
if ((p->p_flag & P_TRACED) == 0)
continue;
-
- /*
- * Put the new signal into td_siglist. If the
- * signal is being masked, look for other signals.
- */
- SIGADDSET(td->td_siglist, sig);
- if (td->td_pflags & TDP_SA)
- SIGDELSET(td->td_sigmask, sig);
- if (SIGISMEMBER(td->td_sigmask, sig))
- continue;
- signotify(td);
}
prop = sigprop(sig);
@@ -2187,19 +2634,13 @@
break; /* == ignore */
mtx_unlock(&ps->ps_mtx);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
- &p->p_mtx.mtx_object, "Catching SIGSTOP");
+ &p->p_mtx.lock_object, "Catching SIGSTOP");
p->p_flag |= P_STOPPED_SIG;
p->p_xstat = sig;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
sig_suspend_threads(td, p, 0);
- thread_stopped(p);
- thread_suspend_one(td);
- PROC_UNLOCK(p);
- DROP_GIANT();
- mi_switch(SW_INVOL, NULL);
- mtx_unlock_spin(&sched_lock);
- PICKUP_GIANT();
- PROC_LOCK(p);
+ thread_suspend_switch(td);
+ PROC_SUNLOCK(p);
mtx_lock(&ps->ps_mtx);
break;
} else if (prop & SA_IGNORE) {
@@ -2230,47 +2671,29 @@
*/
return (sig);
}
- SIGDELSET(td->td_siglist, sig); /* take the signal! */
+ sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */
}
/* NOTREACHED */
}
-/*
- * MPSAFE
- */
void
thread_stopped(struct proc *p)
{
- struct proc *p1 = curthread->td_proc;
- struct sigacts *ps;
int n;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
n = p->p_suspcount;
- if (p == p1)
+ if (p == curproc)
n++;
if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
p->p_flag &= ~P_WAITED;
PROC_LOCK(p->p_pptr);
- /*
- * Wake up parent sleeping in kern_wait(), also send
- * SIGCHLD to parent, but SIGCHLD does not guarantee
- * that parent will awake, because parent may masked
- * the signal.
- */
- p->p_pptr->p_flag |= P_STATCHILD;
- wakeup(p->p_pptr);
- ps = p->p_pptr->p_sigacts;
- mtx_lock(&ps->ps_mtx);
- if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
- mtx_unlock(&ps->ps_mtx);
- psignal(p->p_pptr, SIGCHLD);
- } else
- mtx_unlock(&ps->ps_mtx);
+ childproc_stopped(p, (p->p_flag & P_TRACED) ?
+ CLD_TRAPPED : CLD_STOPPED);
PROC_UNLOCK(p->p_pptr);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
}
}
@@ -2286,6 +2709,7 @@
register struct proc *p = td->td_proc;
struct sigacts *ps;
sig_t action;
+ ksiginfo_t ksi;
sigset_t returnmask;
int code;
@@ -2294,7 +2718,11 @@
PROC_LOCK_ASSERT(p, MA_OWNED);
ps = p->p_sigacts;
mtx_assert(&ps->ps_mtx, MA_OWNED);
- SIGDELSET(td->td_siglist, sig);
+ ksiginfo_init(&ksi);
+ sigqueue_get(&td->td_sigqueue, sig, &ksi);
+ ksi.ksi_signo = sig;
+ if (ksi.ksi_code == SI_TIMER)
+ itimer_accept(p, ksi.ksi_timerid, &ksi);
action = ps->ps_sigact[_SIG_IDX(sig)];
#ifdef KTRACE
if (KTRPOINT(td, KTR_PSIG))
@@ -2307,7 +2735,11 @@
mtx_lock(&ps->ps_mtx);
}
+#ifdef KSE
if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) {
+#else
+ if (action == SIG_DFL) {
+#endif
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
@@ -2316,6 +2748,7 @@
sigexit(td, sig);
/* NOTREACHED */
} else {
+#ifdef KSE
if (td->td_pflags & TDP_SA) {
if (sig == SIGKILL) {
mtx_unlock(&ps->ps_mtx);
@@ -2323,6 +2756,7 @@
}
}
+#endif
/*
* If we get here, the signal must be caught.
*/
@@ -2357,7 +2791,7 @@
SIGADDSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
- p->p_stats->p_ru.ru_nsignals++;
+ td->td_ru.ru_nsignals++;
if (p->p_sig != sig) {
code = 0;
} else {
@@ -2365,11 +2799,14 @@
p->p_code = 0;
p->p_sig = 0;
}
+#ifdef KSE
if (td->td_pflags & TDP_SA)
- thread_signal_add(curthread, sig);
+ thread_signal_add(curthread, &ksi);
else
- (*p->p_sysent->sv_sendsig)(action, sig,
- &returnmask, code);
+ (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
+#else
+ (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
+#endif
}
}
@@ -2397,8 +2834,6 @@
* signal state. Mark the accounting record with the signal termination.
* If dumping core, save the signal number for the debugger. Calls exit and
* does not return.
- *
- * MPSAFE
*/
void
sigexit(td, sig)
@@ -2442,6 +2877,84 @@
/* NOTREACHED */
}
+/*
+ * Send queued SIGCHLD to parent when child process's state
+ * is changed.
+ */
+static void
+sigparent(struct proc *p, int reason, int status)
+{
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+ if (p->p_ksi != NULL) {
+ p->p_ksi->ksi_signo = SIGCHLD;
+ p->p_ksi->ksi_code = reason;
+ p->p_ksi->ksi_status = status;
+ p->p_ksi->ksi_pid = p->p_pid;
+ p->p_ksi->ksi_uid = p->p_ucred->cr_ruid;
+ if (KSI_ONQ(p->p_ksi))
+ return;
+ }
+ tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi);
+}
+
+static void
+childproc_jobstate(struct proc *p, int reason, int status)
+{
+ struct sigacts *ps;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+ /*
+ * Wake up the parent sleeping in kern_wait(), and also send
+ * SIGCHLD to the parent; SIGCHLD alone does not guarantee
+ * that the parent will awake, because the parent may have
+ * masked the signal.
+ */
+ p->p_pptr->p_flag |= P_STATCHILD;
+ wakeup(p->p_pptr);
+
+ ps = p->p_pptr->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
+ mtx_unlock(&ps->ps_mtx);
+ sigparent(p, reason, status);
+ } else
+ mtx_unlock(&ps->ps_mtx);
+}
+
+void
+childproc_stopped(struct proc *p, int reason)
+{
+ childproc_jobstate(p, reason, p->p_xstat);
+}
+
+void
+childproc_continued(struct proc *p)
+{
+ childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
+}
+
+void
+childproc_exited(struct proc *p)
+{
+ int reason;
+ int status = p->p_xstat; /* convert to int */
+
+ reason = CLD_EXITED;
+ if (WCOREDUMP(status))
+ reason = CLD_DUMPED;
+ else if (WIFSIGNALED(status))
+ reason = CLD_KILLED;
+ /*
+ * XXX avoid calling wakeup(p->p_pptr), the work is
+ * done in exit1().
+ */
+ sigparent(p, reason, status);
+}
+
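The reason selection in childproc_exited() mirrors what the parent sees when
decoding the wait status. A small userland illustration of the same mapping
(not part of the commit; it relies only on the standard <sys/wait.h> macros):

	#include <sys/types.h>
	#include <sys/wait.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		pid_t pid = fork();
		int status;

		if (pid == 0) {
			/* The child kills itself so the parent sees CLD_KILLED. */
			raise(SIGTERM);
			_exit(0);
		}
		waitpid(pid, &status, 0);
		if (WIFEXITED(status))
			printf("CLD_EXITED, status %d\n", WEXITSTATUS(status));
		else if (WIFSIGNALED(status))
			printf("%s, signal %d\n",
			    WCOREDUMP(status) ? "CLD_DUMPED" : "CLD_KILLED",
			    WTERMSIG(status));
		return (0);
	}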
static char corefilename[MAXPATHLEN] = {"%N.core"};
SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
sizeof(corefilename), "process corefile name format string");
@@ -2539,6 +3052,7 @@
struct mount *mp;
char *name; /* name of corefile */
off_t limit;
+ int vfslocked;
PROC_LOCK_ASSERT(p, MA_OWNED);
MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
@@ -2562,21 +3076,17 @@
if (limit == 0)
return (EFBIG);
- mtx_lock(&Giant);
restart:
name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
- if (name == NULL) {
- mtx_unlock(&Giant);
+ if (name == NULL)
return (EINVAL);
- }
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
flags = O_CREAT | FWRITE | O_NOFOLLOW;
- error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, -1);
+ error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, NULL);
free(name, M_TEMP);
- if (error) {
- mtx_unlock(&Giant);
+ if (error)
return (error);
- }
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
@@ -2585,7 +3095,7 @@
VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) {
VOP_UNLOCK(vp, 0, td);
error = EFAULT;
- goto out;
+ goto close;
}
VOP_UNLOCK(vp, 0, td);
@@ -2600,9 +3110,10 @@
if (locked)
VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
- return (error);
+ goto out;
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
- return (error);
+ goto out;
+ VFS_UNLOCK_GIANT(vfslocked);
goto restart;
}
@@ -2614,6 +3125,7 @@
VOP_LEASE(vp, td, cred, LEASE_WRITE);
VOP_SETATTR(vp, &vattr, cred, td);
VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
PROC_LOCK(p);
p->p_acflag |= ACORE;
PROC_UNLOCK(p);
@@ -2626,27 +3138,24 @@
lf.l_type = F_UNLCK;
VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
}
- vn_finished_write(mp);
-out:
+close:
error1 = vn_close(vp, FWRITE, cred, td);
- mtx_unlock(&Giant);
if (error == 0)
error = error1;
+out:
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
- * Nonexistent system call-- signal process (may want to handle it).
- * Flag error in case process won't see signal immediately (blocked or ignored).
+ * Nonexistent system call-- signal process (may want to handle it). Flag
+ * error in case process won't see signal immediately (blocked or ignored).
*/
#ifndef _SYS_SYSPROTO_H_
struct nosys_args {
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
nosys(td, args)
@@ -2662,8 +3171,8 @@
}
/*
- * Send a SIGIO or SIGURG signal to a process or process group using
- * stored credentials rather than those of the current process.
+ * Send a SIGIO or SIGURG signal to a process or process group using stored
+ * credentials rather than those of the current process.
*/
void
pgsigio(sigiop, sig, checkctty)
Index: kern_environment.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_environment.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_environment.c -L sys/kern/kern_environment.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_environment.c
+++ sys/kern/kern_environment.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_environment.c,v 1.39.2.2 2005/10/09 03:29:03 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_environment.c,v 1.47 2007/03/05 13:10:57 rwatson Exp $");
#include "opt_mac.h"
@@ -44,17 +44,18 @@
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/kernel.h>
-#include <sys/sx.h>
#include <sys/systm.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/libkern.h>
#include <sys/kenv.h>
+#include <security/mac/mac_framework.h>
+
static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
#define KENV_SIZE 512 /* Maximum number of environment strings */
@@ -65,11 +66,10 @@
/* dynamic environment variables */
char **kenvp;
-struct sx kenv_lock;
+struct mtx kenv_lock;
/*
- * No need to protect this with a mutex
- * since SYSINITS are single threaded.
+ * No need to protect this with a mutex since SYSINITS are single threaded.
*/
int dynamic_kenv = 0;
@@ -86,7 +86,7 @@
int len;
} */ *uap;
{
- char *name, *value;
+ char *name, *value, *buffer = NULL;
size_t len, done, needed;
int error, i;
@@ -100,7 +100,9 @@
return (error);
#endif
done = needed = 0;
- sx_slock(&kenv_lock);
+ if (uap->len > 0 && uap->value != NULL)
+ buffer = malloc(uap->len, M_TEMP, M_WAITOK|M_ZERO);
+ mtx_lock(&kenv_lock);
for (i = 0; kenvp[i] != NULL; i++) {
len = strlen(kenvp[i]) + 1;
needed += len;
@@ -109,24 +111,32 @@
* If called with a NULL or insufficiently large
* buffer, just keep computing the required size.
*/
- if (uap->value != NULL && len > 0) {
- error = copyout(kenvp[i], uap->value + done,
- len);
- if (error)
- break;
+ if (uap->value != NULL && buffer != NULL && len > 0) {
+ bcopy(kenvp[i], buffer + done, len);
done += len;
}
}
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
+ if (buffer != NULL) {
+ error = copyout(buffer, uap->value, done);
+ free(buffer, M_TEMP);
+ }
td->td_retval[0] = ((done == needed) ? 0 : needed);
return (error);
}
- if ((uap->what == KENV_SET) ||
- (uap->what == KENV_UNSET)) {
- error = suser(td);
+ switch (uap->what) {
+ case KENV_SET:
+ error = priv_check(td, PRIV_KENV_SET);
+ if (error)
+ return (error);
+ break;
+
+ case KENV_UNSET:
+ error = priv_check(td, PRIV_KENV_UNSET);
if (error)
return (error);
+ break;
}
name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK);
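The KENV_DUMP path above switches kenv_lock from an sx lock to a default
mutex, so copyout() (which can fault and sleep) may no longer be called with
the lock held; the strings are instead snapshotted into a pre-allocated
temporary buffer under the mutex and copied out afterwards. Distilled shape
of that pattern (fragment for illustration only, not literal code from this
file):

	buffer = malloc(len, M_TEMP, M_WAITOK | M_ZERO); /* may sleep: allocate before locking */
	mtx_lock(&kenv_lock);
	bcopy(src, buffer, len);		/* no faults or sleeps under the mutex */
	mtx_unlock(&kenv_lock);
	error = copyout(buffer, uaddr, len);	/* may fault; the lock is already dropped */
	free(buffer, M_TEMP);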
@@ -210,12 +220,17 @@
i = 0;
for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
len = strlen(cp) + 1;
- kenvp[i] = malloc(len, M_KENV, M_WAITOK);
- strcpy(kenvp[i++], cp);
+ if (i < KENV_SIZE) {
+ kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+ strcpy(kenvp[i++], cp);
+ } else
+ printf(
+ "WARNING: too many kenv strings, ignoring %s\n",
+ cp);
}
kenvp[i] = NULL;
- sx_init(&kenv_lock, "kernel environment");
+ mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
dynamic_kenv = 1;
}
SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
@@ -237,7 +252,7 @@
char *cp;
int len, i;
- sx_assert(&kenv_lock, SX_LOCKED);
+ mtx_assert(&kenv_lock, MA_OWNED);
len = strlen(name);
for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
if ((strncmp(cp, name, len) == 0) &&
@@ -283,16 +298,16 @@
int len;
if (dynamic_kenv) {
- sx_slock(&kenv_lock);
+ mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, NULL);
if (cp != NULL) {
strcpy(buf, cp);
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
len = strlen(buf) + 1;
ret = malloc(len, M_KENV, M_WAITOK);
strcpy(ret, buf);
} else {
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
ret = NULL;
}
} else
@@ -309,9 +324,9 @@
char *cp;
if (dynamic_kenv) {
- sx_slock(&kenv_lock);
+ mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, NULL);
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
} else
cp = _getenv_static(name);
if (cp != NULL)
@@ -339,12 +354,12 @@
buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
sprintf(buf, "%s=%s", name, value);
- sx_xlock(&kenv_lock);
+ mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, &i);
if (cp != NULL) {
oldenv = kenvp[i];
kenvp[i] = buf;
- sx_xunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
free(oldenv, M_KENV);
} else {
/* We add the option if it wasn't found */
@@ -354,13 +369,13 @@
/* Bounds checking */
if (i < 0 || i >= KENV_SIZE) {
free(buf, M_KENV);
- sx_xunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
return (-1);
}
kenvp[i] = buf;
kenvp[i + 1] = NULL;
- sx_xunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
}
return (0);
}
@@ -376,18 +391,18 @@
KENV_CHECK;
- sx_xlock(&kenv_lock);
+ mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, &i);
if (cp != NULL) {
oldenv = kenvp[i];
for (j = i + 1; kenvp[j] != NULL; j++)
kenvp[i++] = kenvp[j];
kenvp[i] = NULL;
- sx_xunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
free(oldenv, M_KENV);
return (0);
}
- sx_xunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
return (-1);
}
Index: kern_descrip.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_descrip.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -L sys/kern/kern_descrip.c -L sys/kern/kern_descrip.c -u -r1.5 -r1.6
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.279.2.5 2005/11/17 13:11:36 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.313.4.1 2008/02/14 11:45:41 simon Exp $");
#include "opt_compat.h"
#include "opt_ddb.h"
@@ -54,8 +54,10 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
@@ -68,12 +70,14 @@
#include <sys/unistd.h>
#include <sys/vnode.h>
+#include <security/audit/audit.h>
+
#include <vm/uma.h>
#include <ddb/ddb.h>
-static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
-static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
+static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
+static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
"file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
@@ -134,6 +138,7 @@
int openfiles; /* actual number of open files */
struct sx filelist_lock; /* sx to protect filelist */
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
/* A mutex to protect the association between a proc and filedesc. */
static struct mtx fdesc_mtx;
@@ -206,9 +211,11 @@
static void
fdused(struct filedesc *fdp, int fd)
{
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(!fdisused(fdp, fd),
("fd already used"));
+
fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
if (fd > fdp->fd_lastfile)
fdp->fd_lastfile = fd;
@@ -222,11 +229,13 @@
static void
fdunused(struct filedesc *fdp, int fd)
{
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdisused(fdp, fd),
("fd is already unused"));
KASSERT(fdp->fd_ofiles[fd] == NULL,
("fd is still in use"));
+
fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
if (fd < fdp->fd_freefile)
fdp->fd_freefile = fd;
@@ -242,9 +251,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getdtablesize(struct thread *td, struct getdtablesize_args *uap)
@@ -261,7 +267,7 @@
/*
* Duplicate a file descriptor to a particular value.
*
- * note: keep in mind that a potential race condition exists when closing
+ * Note: keep in mind that a potential race condition exists when closing
* descriptors from a shared descriptor table (via rfork).
*/
#ifndef _SYS_SYSPROTO_H_
@@ -270,9 +276,6 @@
u_int to;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
dup2(struct thread *td, struct dup2_args *uap)
@@ -290,9 +293,6 @@
u_int fd;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
dup(struct thread *td, struct dup_args *uap)
@@ -311,9 +311,6 @@
long arg;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
fcntl(struct thread *td, struct fcntl_args *uap)
@@ -344,6 +341,18 @@
return (error);
}
+static inline struct file *
+fdtofp(int fd, struct filedesc *fdp)
+{
+ struct file *fp;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+ if ((unsigned)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ return (NULL);
+ return (fp);
+}
+
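kern_fcntl() now performs the descriptor lookup per command with the new
fdtofp() helper, taking the filedesc lock shared or exclusive as each command
requires instead of locking once (and grabbing Giant) up front. Typical shape
of a command case under the new scheme, based on the cases below (illustrative
fragment only):

	FILEDESC_SLOCK(fdp);
	if ((fp = fdtofp(fd, fdp)) == NULL) {
		FILEDESC_SUNLOCK(fdp);
		error = EBADF;
		break;
	}
	fhold(fp);			/* keep fp alive once the lock is dropped */
	FILEDESC_SUNLOCK(fdp);
	/* ... operate on fp, possibly sleeping ... */
	fdrop(fp, td);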
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
@@ -355,42 +364,23 @@
struct vnode *vp;
u_int newmin;
int error, flg, tmp;
- int giant_locked;
-
- /*
- * XXXRW: Some fcntl() calls require Giant -- others don't. Try to
- * avoid grabbing Giant for calls we know don't need it.
- */
- switch (cmd) {
- case F_DUPFD:
- case F_GETFD:
- case F_SETFD:
- case F_GETFL:
- giant_locked = 0;
- break;
-
- default:
- giant_locked = 1;
- mtx_lock(&Giant);
- }
+ int vfslocked;
+ vfslocked = 0;
error = 0;
flg = F_POSIX;
p = td->td_proc;
fdp = p->p_fd;
- FILEDESC_LOCK(fdp);
- if ((unsigned)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL) {
- FILEDESC_UNLOCK(fdp);
- error = EBADF;
- goto done2;
- }
- pop = &fdp->fd_ofileflags[fd];
switch (cmd) {
case F_DUPFD:
- /* mtx_assert(&Giant, MA_NOTOWNED); */
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
newmin = arg;
PROC_LOCK(p);
if (newmin >= lim_cur(p, RLIMIT_NOFILE) ||
@@ -404,34 +394,56 @@
break;
case F_GETFD:
- /* mtx_assert(&Giant, MA_NOTOWNED); */
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ pop = &fdp->fd_ofileflags[fd];
td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
break;
case F_SETFD:
- /* mtx_assert(&Giant, MA_NOTOWNED); */
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ pop = &fdp->fd_ofileflags[fd];
*pop = (*pop &~ UF_EXCLOSE) |
(arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
break;
case F_GETFL:
- /* mtx_assert(&Giant, MA_NOTOWNED); */
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
FILE_LOCK(fp);
td->td_retval[0] = OFLAGS(fp->f_flag);
FILE_UNLOCK(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
break;
case F_SETFL:
- mtx_assert(&Giant, MA_OWNED);
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
FILE_LOCK(fp);
fhold_locked(fp);
fp->f_flag &= ~FCNTLFLAGS;
fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
FILE_UNLOCK(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
tmp = fp->f_flag & FNONBLOCK;
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
if (error) {
@@ -453,9 +465,14 @@
break;
case F_GETOWN:
- mtx_assert(&Giant, MA_OWNED);
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
fhold(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
if (error == 0)
td->td_retval[0] = tmp;
@@ -463,33 +480,41 @@
break;
case F_SETOWN:
- mtx_assert(&Giant, MA_OWNED);
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
fhold(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
tmp = arg;
error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
fdrop(fp, td);
break;
case F_SETLKW:
- mtx_assert(&Giant, MA_OWNED);
flg |= F_WAIT;
/* FALLTHROUGH F_SETLK */
case F_SETLK:
- mtx_assert(&Giant, MA_OWNED);
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
if (fp->f_type != DTYPE_VNODE) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
-
flp = (struct flock *)arg;
if (flp->l_whence == SEEK_CUR) {
if (fp->f_offset < 0 ||
(flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start)) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
@@ -500,9 +525,9 @@
* VOP_ADVLOCK() may block.
*/
fhold(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
-
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
switch (flp->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
@@ -534,33 +559,43 @@
error = EINVAL;
break;
}
+ VFS_UNLOCK_GIANT(vfslocked);
+ vfslocked = 0;
/* Check for race with close */
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_SLOCK(fdp);
if ((unsigned) fd >= fdp->fd_nfiles ||
fp != fdp->fd_ofiles[fd]) {
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_SUNLOCK(fdp);
flp->l_whence = SEEK_SET;
flp->l_start = 0;
flp->l_len = 0;
flp->l_type = F_UNLCK;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
F_UNLCK, flp, F_POSIX);
+ VFS_UNLOCK_GIANT(vfslocked);
+ vfslocked = 0;
} else
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_SUNLOCK(fdp);
fdrop(fp, td);
break;
case F_GETLK:
- mtx_assert(&Giant, MA_OWNED);
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fdtofp(fd, fdp)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
if (fp->f_type != DTYPE_VNODE) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
flp = (struct flock *)arg;
if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
flp->l_type != F_UNLCK) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
error = EINVAL;
break;
}
@@ -569,7 +604,7 @@
fp->f_offset > OFF_MAX - flp->l_start) ||
(flp->l_start < 0 &&
fp->f_offset < OFF_MIN - flp->l_start)) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
@@ -579,20 +614,20 @@
* VOP_ADVLOCK() may block.
*/
fhold(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
F_POSIX);
+ VFS_UNLOCK_GIANT(vfslocked);
+ vfslocked = 0;
fdrop(fp, td);
break;
default:
- FILEDESC_UNLOCK(fdp);
error = EINVAL;
break;
}
-done2:
- if (giant_locked)
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -600,7 +635,8 @@
* Common code for dup, dup2, and fcntl(F_DUPFD).
*/
static int
-do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval)
+do_dup(struct thread *td, enum dup_type type, int old, int new,
+ register_t *retval)
{
struct filedesc *fdp;
struct proc *p;
@@ -626,14 +662,14 @@
if (new >= maxfd)
return (EMFILE);
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
if (type == DUP_FIXED && old == new) {
*retval = new;
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (0);
}
fp = fdp->fd_ofiles[old];
@@ -653,7 +689,7 @@
fdused(fdp, new);
} else {
if ((error = fdalloc(td, new, &new)) != 0) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (error);
}
@@ -668,7 +704,7 @@
/* we've allocated a descriptor which we won't use */
if (fdp->fd_ofiles[new] == NULL)
fdunused(fdp, new);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (EBADF);
}
@@ -713,20 +749,22 @@
*/
if (delfp != NULL) {
knote_fdclose(td, new);
- FILEDESC_UNLOCK(fdp);
+ if (delfp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, new, delfp);
+ FILEDESC_XUNLOCK(fdp);
(void) closef(delfp, td);
if (holdleaders) {
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
fdp->fd_holdleaderscount--;
if (fdp->fd_holdleaderscount == 0 &&
fdp->fd_holdleaderswakeup != 0) {
fdp->fd_holdleaderswakeup = 0;
wakeup(&fdp->fd_holdleaderscount);
}
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
} else {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
return (0);
}
@@ -958,28 +996,36 @@
int fd;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
close(td, uap)
struct thread *td;
struct close_args *uap;
{
+
+ return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(td, fd)
+ struct thread *td;
+ int fd;
+{
struct filedesc *fdp;
struct file *fp;
- int fd, error;
+ int error;
int holdleaders;
- fd = uap->fd;
error = 0;
holdleaders = 0;
fdp = td->td_proc->p_fd;
- FILEDESC_LOCK(fdp);
+
+ AUDIT_SYSCLOSE(td, fd);
+
+ FILEDESC_XLOCK(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
fdp->fd_ofiles[fd] = NULL;
@@ -995,25 +1041,26 @@
}
/*
- * We now hold the fp reference that used to be owned by the descriptor
- * array.
- * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a
- * race of the fd getting opened, a knote added, and deleteing a knote
- * for the new fd.
+ * We now hold the fp reference that used to be owned by the
+ * descriptor array. We have to unlock the FILEDESC *AFTER*
+ * knote_fdclose to prevent a race of the fd getting opened, a knote
+ * added, and deleting a knote for the new fd.
*/
knote_fdclose(td, fd);
- FILEDESC_UNLOCK(fdp);
+ if (fp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, fd, fp);
+ FILEDESC_XUNLOCK(fdp);
error = closef(fp, td);
if (holdleaders) {
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
fdp->fd_holdleaderscount--;
if (fdp->fd_holdleaderscount == 0 &&
fdp->fd_holdleaderswakeup != 0) {
fdp->fd_holdleaderswakeup = 0;
wakeup(&fdp->fd_holdleaderscount);
}
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
return (error);
}
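close() is now a thin wrapper around the new kern_close(), which gives other
kernel code a way to close a descriptor without building a struct close_args.
A hedged example of an in-kernel consumer's error path (illustrative only;
the setup function name is hypothetical):

	error = some_setup_that_installed_fd(td, &fd);	/* hypothetical helper */
	if (error != 0) {
		kern_close(td, fd);	/* tear down the descriptor we installed */
		return (error);
	}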
@@ -1028,9 +1075,6 @@
struct ostat *sb;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
@@ -1057,9 +1101,6 @@
struct stat *sb;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
fstat(struct thread *td, struct fstat_args *uap)
@@ -1079,8 +1120,13 @@
struct file *fp;
int error;
+ AUDIT_ARG(fd, fd);
+
if ((error = fget(td, fd, &fp)) != 0)
return (error);
+
+ AUDIT_ARG(file, td->td_proc, fp);
+
error = fo_stat(fp, sbp, td->td_ucred, td);
fdrop(fp, td);
return (error);
@@ -1095,9 +1141,6 @@
struct nstat *sb;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
nfstat(struct thread *td, struct nfstat_args *uap)
@@ -1123,9 +1166,6 @@
int name;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
fpathconf(struct thread *td, struct fpathconf_args *uap)
@@ -1178,7 +1218,7 @@
int nnfiles, onfiles;
NDSLOTTYPE *nmap;
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+ FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdp->fd_nfiles > 0,
("zero-length file table"));
@@ -1191,7 +1231,7 @@
return;
/* allocate a new table and (if required) new bitmaps */
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
M_FILEDESC, M_ZERO | M_WAITOK);
nfileflags = (char *)&ntable[nnfiles];
@@ -1200,7 +1240,7 @@
M_FILEDESC, M_ZERO | M_WAITOK);
else
nmap = NULL;
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
/*
* We now have new tables ready to go. Since we dropped the
@@ -1239,7 +1279,7 @@
struct filedesc *fdp = p->p_fd;
int fd = -1, maxfd;
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+ FILEDESC_XLOCK_ASSERT(fdp);
if (fdp->fd_freefile > minfd)
minfd = fdp->fd_freefile;
@@ -1278,8 +1318,8 @@
}
/*
- * Check to see whether n user file descriptors
- * are available to the process p.
+ * Check to see whether n user file descriptors are available to the process
+ * p.
*/
int
fdavail(struct thread *td, int n)
@@ -1289,7 +1329,7 @@
struct file **fpp;
int i, lim, last;
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+ FILEDESC_LOCK_ASSERT(fdp);
PROC_LOCK(p);
lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
@@ -1306,12 +1346,11 @@
}
/*
- * Create a new open file structure and allocate
- * a file decriptor for the process that refers to it.
- * We add one reference to the file for the descriptor table
- * and one reference for resultfp. This is to prevent us being
- * preempted and the entry in the descriptor table closed after
- * we release the FILEDESC lock.
+ * Create a new open file structure and allocate a file descriptor for the
+ * process that refers to it. We add one reference to the file for the
+ * descriptor table and one reference for resultfp. This is to prevent us
+ * being preempted and the entry in the descriptor table closed after we
+ * release the FILEDESC lock.
*/
int
falloc(struct thread *td, struct file **resultfp, int *resultfd)
@@ -1325,8 +1364,10 @@
fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
sx_xlock(&filelist_lock);
- if ((openfiles >= maxuserfiles && (td->td_ucred->cr_ruid != 0 ||
- jailed(td->td_ucred))) || openfiles >= maxfiles) {
+
+ if ((openfiles >= maxuserfiles &&
+ priv_check(td, PRIV_MAXFILES) != 0) ||
+ openfiles >= maxfiles) {
if (ppsratecheck(&lastfail, &curfail, 1)) {
printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
td->td_ucred->cr_ruid);
@@ -1350,7 +1391,7 @@
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp->f_vnode = NULL;
- FILEDESC_LOCK(p->p_fd);
+ FILEDESC_XLOCK(p->p_fd);
if ((fq = p->p_fd->fd_ofiles[0])) {
LIST_INSERT_AFTER(fq, fp, f_list);
} else {
@@ -1358,14 +1399,14 @@
}
sx_xunlock(&filelist_lock);
if ((error = fdalloc(td, 0, &i))) {
- FILEDESC_UNLOCK(p->p_fd);
+ FILEDESC_XUNLOCK(p->p_fd);
fdrop(fp, td);
if (resultfp)
fdrop(fp, td);
return (error);
}
p->p_fd->fd_ofiles[i] = fp;
- FILEDESC_UNLOCK(p->p_fd);
+ FILEDESC_XUNLOCK(p->p_fd);
if (resultfp)
*resultfp = fp;
if (resultfd)
@@ -1383,9 +1424,9 @@
struct filedesc0 *newfdp;
newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
- mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
+ FILEDESC_LOCK_INIT(&newfdp->fd_fd);
if (fdp != NULL) {
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
if (newfdp->fd_fd.fd_cdir)
VREF(newfdp->fd_fd.fd_cdir);
@@ -1395,7 +1436,7 @@
newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
if (newfdp->fd_fd.fd_jdir)
VREF(newfdp->fd_fd.fd_jdir);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
/* Create the file descriptor table. */
@@ -1434,7 +1475,7 @@
if (i > 0)
return;
- mtx_destroy(&fdp->fd_mtx);
+ FILEDESC_LOCK_DESTROY(fdp);
FREE(fdp, M_FILEDESC);
}
@@ -1444,9 +1485,10 @@
struct filedesc *
fdshare(struct filedesc *fdp)
{
- FILEDESC_LOCK_FAST(fdp);
+
+ FILEDESC_XLOCK(fdp);
fdp->fd_refcnt++;
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (fdp);
}
@@ -1457,22 +1499,21 @@
fdunshare(struct proc *p, struct thread *td)
{
- FILEDESC_LOCK_FAST(p->p_fd);
+ FILEDESC_XLOCK(p->p_fd);
if (p->p_fd->fd_refcnt > 1) {
struct filedesc *tmp;
- FILEDESC_UNLOCK_FAST(p->p_fd);
+ FILEDESC_XUNLOCK(p->p_fd);
tmp = fdcopy(p->p_fd);
fdfree(td);
p->p_fd = tmp;
} else
- FILEDESC_UNLOCK_FAST(p->p_fd);
+ FILEDESC_XUNLOCK(p->p_fd);
}
/*
- * Copy a filedesc structure.
- * A NULL pointer in returns a NULL reference, this is to ease callers,
- * not catch errors.
+ * Copy a filedesc structure. A NULL pointer in returns a NULL reference,
+ * this is to ease callers, not catch errors.
*/
struct filedesc *
fdcopy(struct filedesc *fdp)
@@ -1485,13 +1526,13 @@
return (NULL);
newfdp = fdinit(fdp);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_SLOCK(fdp);
while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
- FILEDESC_UNLOCK_FAST(fdp);
- FILEDESC_LOCK(newfdp);
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
fdgrowtable(newfdp, fdp->fd_lastfile + 1);
- FILEDESC_UNLOCK(newfdp);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(newfdp);
+ FILEDESC_SLOCK(fdp);
}
/* copy everything except kqueue descriptors */
newfdp->fd_freefile = -1;
@@ -1507,17 +1548,17 @@
newfdp->fd_freefile = i;
}
}
- FILEDESC_UNLOCK_FAST(fdp);
- FILEDESC_LOCK(newfdp);
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
for (i = 0; i <= newfdp->fd_lastfile; ++i)
if (newfdp->fd_ofiles[i] != NULL)
fdused(newfdp, i);
- FILEDESC_UNLOCK(newfdp);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(newfdp);
+ FILEDESC_SLOCK(fdp);
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
newfdp->fd_cmask = fdp->fd_cmask;
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_SUNLOCK(fdp);
return (newfdp);
}
@@ -1543,7 +1584,7 @@
/* Check for special need to clear POSIX style locks */
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
KASSERT(fdtol->fdl_refcount > 0,
("filedesc_to_refcount botch: fdl_refcount=%d",
fdtol->fdl_refcount));
@@ -1557,7 +1598,7 @@
continue;
fp = *fpp;
fhold(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
@@ -1571,7 +1612,7 @@
&lf,
F_POSIX);
VFS_UNLOCK_GIANT(locked);
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
fdrop(fp, td);
fpp = fdp->fd_ofiles + i;
}
@@ -1585,18 +1626,18 @@
* in a shared file descriptor table.
*/
fdp->fd_holdleaderswakeup = 1;
- msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
- PLOCK, "fdlhold", 0);
+ sx_sleep(&fdp->fd_holdleaderscount,
+ FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
goto retry;
}
if (fdtol->fdl_holdcount > 0) {
/*
- * Ensure that fdtol->fdl_leader
- * remains valid in closef().
+ * Ensure that fdtol->fdl_leader remains
+ * valid in closef().
*/
fdtol->fdl_wakeup = 1;
- msleep(fdtol, &fdp->fd_mtx,
- PLOCK, "fdlhold", 0);
+ sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
+ "fdlhold", 0);
goto retry;
}
}
@@ -1608,13 +1649,13 @@
} else
fdtol = NULL;
td->td_proc->p_fdtol = NULL;
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
if (fdtol != NULL)
FREE(fdtol, M_FILEDESC_TO_LEADER);
}
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
i = --fdp->fd_refcnt;
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
if (i > 0)
return;
/*
@@ -1626,7 +1667,7 @@
if (*fpp)
(void) closef(*fpp, td);
}
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
/* XXX This should happen earlier. */
mtx_lock(&fdesc_mtx);
@@ -1646,7 +1687,7 @@
fdp->fd_rdir = NULL;
jdir = fdp->fd_jdir;
fdp->fd_jdir = NULL;
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
if (cdir) {
locked = VFS_LOCK_GIANT(cdir->v_mount);
@@ -1706,7 +1747,7 @@
* Note: fdp->fd_ofiles may be reallocated out from under us while
* we are blocked in a close. Be careful!
*/
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
break;
@@ -1722,27 +1763,33 @@
fdp->fd_ofiles[i] = NULL;
fdp->fd_ofileflags[i] = 0;
fdunused(fdp, i);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
}
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
+/*
+ * If a specific file object occupies a specific file descriptor, close the
+ * file descriptor entry and drop a reference on the file object. This is a
+ * convenience function to handle a subsequent error in a function that calls
+ * falloc(); it handles the race where another thread might have closed the
+ * file descriptor out from under the thread creating the file object.
+ */
void
fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
{
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[idx] == fp) {
fdp->fd_ofiles[idx] = NULL;
fdunused(fdp, idx);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
- } else {
- FILEDESC_UNLOCK(fdp);
- }
+ } else
+ FILEDESC_XUNLOCK(fdp);
}
/*
@@ -1759,7 +1806,7 @@
if (fdp == NULL)
return;
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
/*
* We cannot cache fd_ofiles or fd_ofileflags since operations
@@ -1767,7 +1814,8 @@
*/
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (fdp->fd_ofiles[i] != NULL &&
- (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
+ (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
+ (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
struct file *fp;
knote_fdclose(td, i);
@@ -1779,12 +1827,14 @@
fdp->fd_ofiles[i] = NULL;
fdp->fd_ofileflags[i] = 0;
fdunused(fdp, i);
- FILEDESC_UNLOCK(fdp);
+ if (fp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, i, fp);
+ FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
}
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
/*
@@ -1797,11 +1847,9 @@
int
fdcheckstd(struct thread *td)
{
- struct nameidata nd;
struct filedesc *fdp;
- struct file *fp;
- register_t retval;
- int fd, i, error, flags, devnull;
+ register_t retval, save;
+ int i, error, devnull;
fdp = td->td_proc->p_fd;
if (fdp == NULL)
@@ -1813,45 +1861,14 @@
if (fdp->fd_ofiles[i] != NULL)
continue;
if (devnull < 0) {
- int vfslocked;
- error = falloc(td, &fp, &fd);
- if (error != 0)
- break;
- /* Note extra ref on `fp' held for us by falloc(). */
- KASSERT(fd == i, ("oof, we didn't get our fd"));
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
- "/dev/null", td);
- flags = FREAD | FWRITE;
- error = vn_open(&nd, &flags, 0, fd);
- if (error != 0) {
- /*
- * Someone may have closed the entry in the
- * file descriptor table, so check it hasn't
- * changed before dropping the reference count.
- */
- FILEDESC_LOCK(fdp);
- KASSERT(fdp->fd_ofiles[fd] == fp,
- ("table not shared, how did it change?"));
- fdp->fd_ofiles[fd] = NULL;
- fdunused(fdp, fd);
- FILEDESC_UNLOCK(fdp);
- fdrop(fp, td);
- fdrop(fp, td);
+ save = td->td_retval[0];
+ error = kern_open(td, "/dev/null", UIO_SYSSPACE,
+ O_RDWR, 0);
+ devnull = td->td_retval[0];
+ KASSERT(devnull == i, ("oof, we didn't get our fd"));
+ td->td_retval[0] = save;
+ if (error)
break;
- }
- vfslocked = NDHASGIANT(&nd);
- NDFREE(&nd, NDF_ONLY_PNBUF);
- fp->f_flag = flags;
- fp->f_vnode = nd.ni_vp;
- if (fp->f_data == NULL)
- fp->f_data = nd.ni_vp;
- if (fp->f_ops == &badfileops)
- fp->f_ops = &vnops;
- fp->f_type = DTYPE_VNODE;
- VOP_UNLOCK(nd.ni_vp, 0, td);
- VFS_UNLOCK_GIANT(vfslocked);
- devnull = fd;
- fdrop(fp, td);
} else {
error = do_dup(td, DUP_FIXED, devnull, i, &retval);
if (error != 0)
@@ -1862,8 +1879,7 @@
}
/*
- * Internal form of close.
- * Decrement reference count on file structure.
+ * Internal form of close. Decrement reference count on file structure.
* Note: td may be NULL when closing a file that was being passed in a
* message.
*
@@ -1906,11 +1922,11 @@
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
/*
- * Handle special case where file descriptor table
- * is shared between multiple process leaders.
+ * Handle special case where file descriptor table is
+ * shared between multiple process leaders.
*/
fdp = td->td_proc->p_fd;
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
for (fdtol = fdtol->fdl_next;
fdtol != td->td_proc->p_fdtol;
fdtol = fdtol->fdl_next) {
@@ -1918,7 +1934,7 @@
P_ADVLOCK) == 0)
continue;
fdtol->fdl_holdcount++;
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
@@ -1927,7 +1943,7 @@
(void) VOP_ADVLOCK(vp,
(caddr_t)fdtol->fdl_leader,
F_UNLCK, &lf, F_POSIX);
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
fdtol->fdl_holdcount--;
if (fdtol->fdl_holdcount == 0 &&
fdtol->fdl_wakeup != 0) {
@@ -1935,7 +1951,7 @@
wakeup(fdtol);
}
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
}
VFS_UNLOCK_GIANT(vfslocked);
}
@@ -1943,21 +1959,21 @@
}
/*
- * Extract the file pointer associated with the specified descriptor for
- * the current user process.
+ * Extract the file pointer associated with the specified descriptor for the
+ * current user process.
*
* If the descriptor doesn't exist, EBADF is returned.
*
- * If the descriptor exists but doesn't match 'flags' then
- * return EBADF for read attempts and EINVAL for write attempts.
+ * If the descriptor exists but doesn't match 'flags' then return EBADF for
+ * read attempts and EINVAL for write attempts.
*
* If 'hold' is set (non-zero) the file's refcount will be bumped on return.
- * It should be dropped with fdrop().
- * If it is not set, then the refcount will not be bumped however the
- * thread's filedesc struct will be returned locked (for fgetsock).
+ * It should be dropped with fdrop(). If it is not set, then the refcount
+ * will not be bumped however the thread's filedesc struct will be returned
+ * locked (for fgetsock).
*
- * If an error occured the non-zero error is returned and *fpp is set to NULL.
- * Otherwise *fpp is set and zero is returned.
+ * If an error occurred, the non-zero error is returned and *fpp is set to
+ * NULL. Otherwise *fpp is set and zero is returned.
*/
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
@@ -1968,29 +1984,28 @@
*fpp = NULL;
if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
return (EBADF);
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
return (EBADF);
}
/*
- * Note: FREAD failure returns EBADF to maintain backwards
- * compatibility with what routines returned before.
+ * FREAD and FWRITE failures return EBADF as per POSIX.
*
* Only one flag, or 0, may be specified.
*/
if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
return (EBADF);
}
if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
- FILEDESC_UNLOCK(fdp);
- return (EINVAL);
+ FILEDESC_SUNLOCK(fdp);
+ return (EBADF);
}
if (hold) {
fhold(fp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
}
*fpp = fp;
return (0);
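
To illustrate the contract spelled out above, here is a minimal sketch of the
usual consumer pattern around fget()/fdrop(); the function name and the elided
body are hypothetical, the calls themselves mirror the ones used elsewhere in
this file:

static int
example_fd_use(struct thread *td, int fd)
{
	struct file *fp;
	int error;

	/* fget() returns EBADF for a bad descriptor and bumps f_count. */
	if ((error = fget(td, fd, &fp)) != 0)
		return (error);

	/* ... operate on fp->f_vnode, fp->f_data, ... */

	/* Drop the reference taken by fget(). */
	fdrop(fp, td);
	return (0);
}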
@@ -2018,9 +2033,9 @@
}
/*
- * Like fget() but loads the underlying vnode, or returns an error if
- * the descriptor does not represent a vnode. Note that pipes use vnodes
- * but never have VM objects. The returned vnode will be vref()d.
+ * Like fget() but loads the underlying vnode, or returns an error if the
+ * descriptor does not represent a vnode. Note that pipes use vnodes but
+ * never have VM objects. The returned vnode will be vref()'d.
*
* XXX: what about the unused flags ?
*/
@@ -2039,7 +2054,7 @@
*vpp = fp->f_vnode;
vref(*vpp);
}
- FILEDESC_UNLOCK(td->td_proc->p_fd);
+ FILEDESC_SUNLOCK(td->td_proc->p_fd);
return (error);
}
@@ -2067,11 +2082,15 @@
#endif
/*
- * Like fget() but loads the underlying socket, or returns an error if
- * the descriptor does not represent a socket.
+ * Like fget() but loads the underlying socket, or returns an error if the
+ * descriptor does not represent a socket.
*
- * We bump the ref count on the returned socket. XXX Also obtain the SX
- * lock in the future.
+ * We bump the ref count on the returned socket. XXX Also obtain the SX lock
+ * in the future.
+ *
+ * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely
+ * on their file descriptor reference to prevent the socket from being free'd
+ * during use.
*/
int
fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
@@ -2079,8 +2098,6 @@
struct file *fp;
int error;
- NET_ASSERT_GIANT();
-
*spp = NULL;
if (fflagp != NULL)
*fflagp = 0;
@@ -2096,19 +2113,20 @@
soref(*spp);
SOCK_UNLOCK(*spp);
}
- FILEDESC_UNLOCK(td->td_proc->p_fd);
+ FILEDESC_SUNLOCK(td->td_proc->p_fd);
return (error);
}
/*
- * Drop the reference count on the socket and XXX release the SX lock in
- * the future. The last reference closes the socket.
+ * Drop the reference count on the socket and XXX release the SX lock in the
+ * future. The last reference closes the socket.
+ *
+ * XXXRW: fputsock() is deprecated, see comment for fgetsock().
*/
void
fputsock(struct socket *so)
{
- NET_ASSERT_GIANT();
ACCEPT_LOCK();
SOCK_LOCK(so);
sorele(so);
@@ -2138,6 +2156,17 @@
FILE_UNLOCK(fp);
return (0);
}
+
+ /*
+ * We might have just dropped the last reference to a file
+ * object that is for a UNIX domain socket whose message
+ * buffers are being examined in unp_gc(). If that is the
+ * case, FWAIT will be set in f_gcflag and we need to wait for
+ * unp_gc() to finish its scan.
+ */
+ while (fp->f_gcflag & FWAIT)
+ msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0);
+
/* We have the last ref so we can proceed without the file lock. */
FILE_UNLOCK(fp);
if (fp->f_count < 0)
@@ -2160,8 +2189,8 @@
/*
* Apply an advisory lock on a file descriptor.
*
- * Just attempt to get a record lock of the requested type on
- * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
#ifndef _SYS_SYSPROTO_H_
struct flock_args {
@@ -2169,9 +2198,6 @@
int how;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
flock(struct thread *td, struct flock_args *uap)
@@ -2179,6 +2205,7 @@
struct file *fp;
struct vnode *vp;
struct flock lf;
+ int vfslocked;
int error;
if ((error = fget(td, uap->fd, &fp)) != 0)
@@ -2188,8 +2215,8 @@
return (EOPNOTSUPP);
}
- mtx_lock(&Giant);
vp = fp->f_vnode;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
@@ -2216,7 +2243,7 @@
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
fdrop(fp, td);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
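
Seen from userland, the whole-file semantics above are simply flock(2); a
minimal sketch, with error handling trimmed and the locking protocol itself
left out:

#include <sys/file.h>
#include <fcntl.h>
#include <unistd.h>

int
lock_whole_file(const char *path)
{
	int error, fd;

	fd = open(path, O_RDWR);
	if (fd < 0)
		return (-1);
	/* Exclusive advisory lock on the entire file; fail rather than block. */
	error = flock(fd, LOCK_EX | LOCK_NB);
	if (error == 0) {
		/* ... do work while holding the lock ... */
		(void)flock(fd, LOCK_UN);
	}
	close(fd);
	return (error);
}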
/*
@@ -2233,22 +2260,20 @@
* of file descriptors, or the fd to be dup'd has already been
* closed, then reject.
*/
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
if (dfd < 0 || dfd >= fdp->fd_nfiles ||
(wfp = fdp->fd_ofiles[dfd]) == NULL) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
/*
* There are two cases of interest here.
*
- * For ENODEV simply dup (dfd) to file descriptor
- * (indx) and return.
+ * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
*
- * For ENXIO steal away the file structure from (dfd) and
- * store it in (indx). (dfd) is effectively closed by
- * this operation.
+ * For ENXIO steal away the file structure from (dfd) and store it in
+ * (indx). (dfd) is effectively closed by this operation.
*
* Any other error code is just returned.
*/
@@ -2261,7 +2286,7 @@
FILE_LOCK(wfp);
if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
FILE_UNLOCK(wfp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (EACCES);
}
fp = fdp->fd_ofiles[indx];
@@ -2271,15 +2296,13 @@
fdused(fdp, indx);
fhold_locked(wfp);
FILE_UNLOCK(wfp);
- FILEDESC_UNLOCK(fdp);
- if (fp != NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ if (fp != NULL)
/*
* We now own the reference to fp that the ofiles[]
* array used to own. Release it.
*/
- FILE_LOCK(fp);
- fdrop_locked(fp, td);
- }
+ fdrop(fp, td);
return (0);
case ENXIO:
@@ -2294,31 +2317,26 @@
fdunused(fdp, dfd);
if (fp == NULL)
fdused(fdp, indx);
- if (fp != NULL)
- FILE_LOCK(fp);
+ FILEDESC_XUNLOCK(fdp);
/*
* We now own the reference to fp that the ofiles[] array
* used to own. Release it.
*/
if (fp != NULL)
- fdrop_locked(fp, td);
-
- FILEDESC_UNLOCK(fdp);
-
+ fdrop(fp, td);
return (0);
default:
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
return (error);
}
/* NOTREACHED */
}
/*
- * Scan all active processes to see if any of them have a current
- * or root directory of `olddp'. If so, replace them with the new
- * mount point.
+ * Scan all active processes to see if any of them have a current or root
+ * directory of `olddp'. If so, replace them with the new mount point.
*/
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
@@ -2330,12 +2348,12 @@
if (vrefcnt(olddp) == 1)
return;
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
fdp = fdhold(p);
if (fdp == NULL)
continue;
nrele = 0;
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
if (fdp->fd_cdir == olddp) {
vref(newdp);
fdp->fd_cdir = newdp;
@@ -2346,7 +2364,7 @@
fdp->fd_rdir = newdp;
nrele++;
}
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
fddrop(fdp);
while (nrele--)
vrele(olddp);
@@ -2373,12 +2391,12 @@
fdtol->fdl_wakeup = 0;
fdtol->fdl_leader = leader;
if (old != NULL) {
- FILEDESC_LOCK(fdp);
+ FILEDESC_XLOCK(fdp);
fdtol->fdl_next = old->fdl_next;
fdtol->fdl_prev = old;
old->fdl_next = fdtol;
fdtol->fdl_next->fdl_prev = fdtol;
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_XUNLOCK(fdp);
} else {
fdtol->fdl_next = fdtol;
fdtol->fdl_prev = fdtol;
@@ -2427,7 +2445,7 @@
bzero(&xf, sizeof(xf));
xf.xf_size = sizeof(xf);
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
PROC_LOCK(p);
@@ -2441,7 +2459,7 @@
fdp = fdhold(p);
if (fdp == NULL)
continue;
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_SLOCK(fdp);
for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
if ((fp = fdp->fd_ofiles[n]) == NULL)
continue;
@@ -2458,7 +2476,7 @@
if (error)
break;
}
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
if (error)
break;
@@ -2490,8 +2508,12 @@
return ("pipe");
case DTYPE_FIFO:
return ("fifo");
+ case DTYPE_KQUEUE:
+ return ("kque");
case DTYPE_CRYPTO:
return ("crpt");
+ case DTYPE_MQUEUE:
+ return ("mque");
default:
return ("unkn");
}
@@ -2509,7 +2531,7 @@
struct proc *p;
int n;
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
fdp = p->p_fd;
@@ -2523,20 +2545,43 @@
return (NULL);
}
+static void
+db_print_file(struct file *fp, int header)
+{
+ struct proc *p;
+
+ if (header)
+ db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
+ "File", "Type", "Data", "Flag", "GCFl", "Count",
+ "MCount", "Vnode", "FPID", "FCmd");
+ p = file_to_first_proc(fp);
+ db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
+ file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
+ fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
+ p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+}
+
+DB_SHOW_COMMAND(file, db_show_file)
+{
+ struct file *fp;
+
+ if (!have_addr) {
+ db_printf("usage: show file <addr>\n");
+ return;
+ }
+ fp = (struct file *)addr;
+ db_print_file(fp, 1);
+}
+
DB_SHOW_COMMAND(files, db_show_files)
{
struct file *fp;
- struct proc *p;
+ int header;
- db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File",
- "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode",
- "FPID", "FCmd");
+ header = 1;
LIST_FOREACH(fp, &filehead, f_list) {
- p = file_to_first_proc(fp);
- db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
- file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
- fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
- p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+ db_print_file(fp, header);
+ header = 0;
}
}
#endif
Index: kern_resource.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_resource.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_resource.c -L sys/kern/kern_resource.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_resource.c
+++ sys/kern/kern_resource.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.148.2.1 2005/12/28 17:35:55 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.180.2.1 2007/12/20 07:15:40 davidxu Exp $");
#include "opt_compat.h"
@@ -43,18 +43,20 @@
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/file.h>
-#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/time.h>
+#include <sys/umtx.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -77,16 +79,12 @@
/*
* Resource controls and accounting.
*/
-
#ifndef _SYS_SYSPROTO_H_
struct getpriority_args {
int which;
int who;
};
#endif
-/*
- * MPSAFE
- */
int
getpriority(td, uap)
struct thread *td;
@@ -141,7 +139,10 @@
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /* Do not bother to check PRS_NEW processes */
+ if (p->p_state == PRS_NEW)
+ continue;
PROC_LOCK(p);
if (!p_cansee(td, p) &&
p->p_ucred->cr_uid == uap->who) {
@@ -170,9 +171,6 @@
int prio;
};
#endif
-/*
- * MPSAFE
- */
int
setpriority(td, uap)
struct thread *td;
@@ -264,18 +262,106 @@
n = PRIO_MAX;
if (n < PRIO_MIN)
n = PRIO_MIN;
- if (n < p->p_nice && suser(td) != 0)
+ if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
return (EACCES);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
sched_nice(p, n);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
return (0);
}
/*
+ * Set realtime priority for LWP.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_thread_args {
+ int function;
+ lwpid_t lwpid;
+ struct rtprio *rtp;
+};
+#endif
+int
+rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
+{
+ struct proc *curp;
+ struct proc *p;
+ struct rtprio rtp;
+ struct thread *td1;
+ int cierror, error;
+
+ /* Perform copyin before acquiring locks if needed. */
+ if (uap->function == RTP_SET)
+ cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ else
+ cierror = 0;
+
+ curp = td->td_proc;
+ /*
+ * Though lwpid is unique, only current process is supported
+ * since there is no efficient way to look up a LWP yet.
+ */
+ p = curp;
+ PROC_LOCK(p);
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ if ((error = p_cansee(td, p)))
+ break;
+ PROC_SLOCK(p);
+ if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
+ td1 = td;
+ else
+ td1 = thread_find(p, uap->lwpid);
+ if (td1 != NULL)
+ pri_to_rtp(td1, &rtp);
+ else
+ error = ESRCH;
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if ((error = p_cansched(td, p)) || (error = cierror))
+ break;
+
+ /* Disallow setting rtprio in most cases if not superuser. */
+/*
+ * Realtime priority has to be restricted for reasons which should be
+ * obvious. However, for idle priority, there is a potential for
+ * system deadlock if an idleprio process gains a lock on a resource
+ * that other processes need (and the idleprio process can't run
+ * due to a CPU-bound normal process). Fix me! XXX
+ */
+#if 0
+ if (RTP_PRIO_IS_REALTIME(rtp.type)) {
+#else
+ if (rtp.type != RTP_PRIO_NORMAL) {
+#endif
+ error = priv_check(td, PRIV_SCHED_RTPRIO);
+ if (error)
+ break;
+ }
+
+ PROC_SLOCK(p);
+ if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
+ td1 = td;
+ else
+ td1 = thread_find(p, uap->lwpid);
+ if (td1 != NULL)
+ error = rtp_to_pri(&rtp, td1);
+ else
+ error = ESRCH;
+ PROC_SUNLOCK(p);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
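
For completeness, a sketch of how the new interface would be driven from
userland via the rtprio_thread(2) wrapper; the header and constants below are
the standard rtprio(2) ones, and the choice of the idle class is arbitrary
(remember that anything other than RTP_PRIO_NORMAL needs PRIV_SCHED_RTPRIO per
the check above):

#include <sys/types.h>
#include <sys/rtprio.h>

int
make_self_idle(void)
{
	struct rtprio rtp;

	/* lwpid 0 refers to the calling thread, as in the kernel code. */
	if (rtprio_thread(RTP_LOOKUP, 0, &rtp) != 0)
		return (-1);
	/* Move this thread to the idle class at the weakest priority. */
	rtp.type = RTP_PRIO_IDLE;
	rtp.prio = RTP_PRIO_MAX;
	return (rtprio_thread(RTP_SET, 0, &rtp));
}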
+
+/*
* Set realtime priority.
- *
- * MPSAFE
*/
#ifndef _SYS_SYSPROTO_H_
struct rtprio_args {
@@ -284,7 +370,6 @@
struct rtprio *rtp;
};
#endif
-
int
rtprio(td, uap)
struct thread *td; /* curthread */
@@ -292,7 +377,7 @@
{
struct proc *curp;
struct proc *p;
- struct ksegrp *kg;
+ struct thread *tdp;
struct rtprio rtp;
int cierror, error;
@@ -316,7 +401,7 @@
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
/*
* Return OUR priority if no pid specified,
* or if one is, report the highest priority
@@ -328,14 +413,14 @@
* as leaving it zero.
*/
if (uap->pid == 0) {
- pri_to_rtp(td->td_ksegrp, &rtp);
+ pri_to_rtp(td, &rtp);
} else {
struct rtprio rtp2;
rtp.type = RTP_PRIO_IDLE;
rtp.prio = RTP_PRIO_MAX;
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- pri_to_rtp(kg, &rtp2);
+ FOREACH_THREAD_IN_PROC(p, tdp) {
+ pri_to_rtp(tdp, &rtp2);
if (rtp2.type < rtp.type ||
(rtp2.type == rtp.type &&
rtp2.prio < rtp.prio)) {
@@ -344,7 +429,7 @@
}
}
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
@@ -352,13 +437,6 @@
break;
/* Disallow setting rtprio in most cases if not superuser. */
- if (suser(td) != 0) {
- /* can't set someone else's */
- if (uap->pid) {
- error = EPERM;
- break;
- }
- /* can't set realtime priority */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
@@ -367,32 +445,31 @@
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
- if (RTP_PRIO_IS_REALTIME(rtp.type)) {
+ if (RTP_PRIO_IS_REALTIME(rtp.type)) {
#else
- if (rtp.type != RTP_PRIO_NORMAL) {
+ if (rtp.type != RTP_PRIO_NORMAL) {
#endif
- error = EPERM;
+ error = priv_check(td, PRIV_SCHED_RTPRIO);
+ if (error)
break;
- }
}
/*
* If we are setting our own priority, set just our
- * KSEGRP but if we are doing another process,
- * do all the groups on that process. If we
+ * thread but if we are doing another process,
+ * do all the threads on that process. If we
* specify our own pid we do the latter.
*/
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
if (uap->pid == 0) {
- error = rtp_to_pri(&rtp, td->td_ksegrp);
+ error = rtp_to_pri(&rtp, td);
} else {
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- if ((error = rtp_to_pri(&rtp, kg)) != 0) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((error = rtp_to_pri(&rtp, td)) != 0)
break;
- }
}
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
break;
default:
error = EINVAL;
@@ -403,51 +480,61 @@
}
int
-rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
+rtp_to_pri(struct rtprio *rtp, struct thread *td)
{
+ u_char newpri;
+ u_char oldpri;
- mtx_assert(&sched_lock, MA_OWNED);
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
+ thread_lock(td);
switch (RTP_PRIO_BASE(rtp->type)) {
case RTP_PRIO_REALTIME:
- kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio;
+ newpri = PRI_MIN_REALTIME + rtp->prio;
break;
case RTP_PRIO_NORMAL:
- kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
+ newpri = PRI_MIN_TIMESHARE + rtp->prio;
break;
case RTP_PRIO_IDLE:
- kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio;
+ newpri = PRI_MIN_IDLE + rtp->prio;
break;
default:
+ thread_unlock(td);
return (EINVAL);
}
- sched_class(kg, rtp->type);
- if (curthread->td_ksegrp == kg) {
- sched_prio(curthread, kg->kg_user_pri); /* XXX dubious */
- }
+ sched_class(td, rtp->type); /* XXX fix */
+ oldpri = td->td_user_pri;
+ sched_user_prio(td, newpri);
+ if (curthread == td)
+ sched_prio(curthread, td->td_user_pri); /* XXX dubious */
+ if (TD_ON_UPILOCK(td) && oldpri != newpri) {
+ thread_unlock(td);
+ umtx_pi_adjust(td, oldpri);
+ } else
+ thread_unlock(td);
return (0);
}
void
-pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
+pri_to_rtp(struct thread *td, struct rtprio *rtp)
{
- mtx_assert(&sched_lock, MA_OWNED);
- switch (PRI_BASE(kg->kg_pri_class)) {
+ thread_lock(td);
+ switch (PRI_BASE(td->td_pri_class)) {
case PRI_REALTIME:
- rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
+ rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
break;
case PRI_TIMESHARE:
- rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
+ rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
break;
case PRI_IDLE:
- rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
+ rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
break;
default:
break;
}
- rtp->type = kg->kg_pri_class;
+ rtp->type = td->td_pri_class;
+ thread_unlock(td);
}
#if defined(COMPAT_43)
@@ -457,9 +544,6 @@
struct orlimit *rlp;
};
#endif
-/*
- * MPSAFE
- */
int
osetrlimit(td, uap)
struct thread *td;
@@ -483,9 +567,6 @@
struct orlimit *rlp;
};
#endif
-/*
- * MPSAFE
- */
int
ogetrlimit(td, uap)
struct thread *td;
@@ -525,9 +606,6 @@
struct rlimit *rlp;
};
#endif
-/*
- * MPSAFE
- */
int
setrlimit(td, uap)
struct thread *td;
@@ -542,6 +620,41 @@
return (error);
}
+static void
+lim_cb(void *arg)
+{
+ struct rlimit rlim;
+ struct thread *td;
+ struct proc *p;
+
+ p = arg;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ /*
+ * Check if the process exceeds its cpu resource allocation. If
+ * it reaches the max, arrange to kill the process in ast().
+ */
+ if (p->p_cpulimit == RLIM_INFINITY)
+ return;
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ ruxagg(&p->p_rux, td);
+ thread_unlock(td);
+ }
+ PROC_SUNLOCK(p);
+ if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
+ lim_rlimit(p, RLIMIT_CPU, &rlim);
+ if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
+ killproc(p, "exceeded maximum CPU limit");
+ } else {
+ if (p->p_cpulimit < rlim.rlim_max)
+ p->p_cpulimit += 5;
+ psignal(p, SIGXCPU);
+ }
+ }
+ callout_reset(&p->p_limco, hz, lim_cb, p);
+}
+
int
kern_setrlimit(td, which, limp)
struct thread *td;
@@ -551,7 +664,7 @@
struct plimit *newlim, *oldlim;
struct proc *p;
register struct rlimit *alimp;
- rlim_t oldssiz;
+ struct rlimit oldssiz;
int error;
if (which >= RLIM_NLIMITS)
@@ -565,7 +678,7 @@
if (limp->rlim_max < 0)
limp->rlim_max = RLIM_INFINITY;
- oldssiz = 0;
+ oldssiz.rlim_cur = 0;
p = td->td_proc;
newlim = lim_alloc();
PROC_LOCK(p);
@@ -573,7 +686,7 @@
alimp = &oldlim->pl_rlimit[which];
if (limp->rlim_cur > alimp->rlim_max ||
limp->rlim_max > alimp->rlim_max)
- if ((error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL))) {
+ if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
PROC_UNLOCK(p);
lim_free(newlim);
return (error);
@@ -586,9 +699,12 @@
switch (which) {
case RLIMIT_CPU:
- mtx_lock_spin(&sched_lock);
+ if (limp->rlim_cur != RLIM_INFINITY &&
+ p->p_cpulimit == RLIM_INFINITY)
+ callout_reset(&p->p_limco, hz, lim_cb, p);
+ PROC_SLOCK(p);
p->p_cpulimit = limp->rlim_cur;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
break;
case RLIMIT_DATA:
if (limp->rlim_cur > maxdsiz)
@@ -602,7 +718,10 @@
limp->rlim_cur = maxssiz;
if (limp->rlim_max > maxssiz)
limp->rlim_max = maxssiz;
- oldssiz = alimp->rlim_cur;
+ oldssiz = *alimp;
+ if (td->td_proc->p_sysent->sv_fixlimit != NULL)
+ td->td_proc->p_sysent->sv_fixlimit(&oldssiz,
+ RLIMIT_STACK);
break;
case RLIMIT_NOFILE:
@@ -623,6 +742,8 @@
limp->rlim_max = 1;
break;
}
+ if (td->td_proc->p_sysent->sv_fixlimit != NULL)
+ td->td_proc->p_sysent->sv_fixlimit(limp, which);
*alimp = *limp;
p->p_limit = newlim;
PROC_UNLOCK(p);
@@ -634,20 +755,21 @@
* "rlim_cur" bytes accessible. If stack limit is going
* up make more accessible, if going down make inaccessible.
*/
- if (limp->rlim_cur != oldssiz) {
+ if (limp->rlim_cur != oldssiz.rlim_cur) {
vm_offset_t addr;
vm_size_t size;
vm_prot_t prot;
- if (limp->rlim_cur > oldssiz) {
+ if (limp->rlim_cur > oldssiz.rlim_cur) {
prot = p->p_sysent->sv_stackprot;
- size = limp->rlim_cur - oldssiz;
+ size = limp->rlim_cur - oldssiz.rlim_cur;
addr = p->p_sysent->sv_usrstack -
limp->rlim_cur;
} else {
prot = VM_PROT_NONE;
- size = oldssiz - limp->rlim_cur;
- addr = p->p_sysent->sv_usrstack - oldssiz;
+ size = oldssiz.rlim_cur - limp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ oldssiz.rlim_cur;
}
addr = trunc_page(addr);
size = round_page(size);
@@ -656,12 +778,6 @@
}
}
- if (td->td_proc->p_sysent->sv_fixlimits != NULL) {
- struct image_params imgp;
-
- imgp.proc = td->td_proc;
- td->td_proc->p_sysent->sv_fixlimits(&imgp);
- }
return (0);
}
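
The userland face of this code is plain getrlimit(2)/setrlimit(2); a minimal
sketch of lowering the soft stack limit (the 1 MB figure is arbitrary), which
ends up in the RLIMIT_STACK case above:

#include <sys/resource.h>

int
shrink_stack_limit(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl) != 0)
		return (-1);
	/* Lowering rlim_cur makes the tail of the stack region inaccessible. */
	rl.rlim_cur = 1024 * 1024;
	if (rl.rlim_cur > rl.rlim_max)
		rl.rlim_cur = rl.rlim_max;
	return (setrlimit(RLIMIT_STACK, &rl));
}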
@@ -671,9 +787,6 @@
struct rlimit *rlp;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getrlimit(td, uap)
@@ -695,125 +808,124 @@
}
/*
- * Transform the running time and tick information in proc p into user,
- * system, and interrupt time usage.
+ * Transform the running time and tick information for children of proc p
+ * into user and system time usage.
*/
void
-calcru(p, up, sp)
+calccru(p, up, sp)
struct proc *p;
struct timeval *up;
struct timeval *sp;
{
- struct bintime bt;
- struct rusage_ext rux;
- struct thread *td;
- int bt_valid;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_NOTOWNED);
- bt_valid = 0;
- mtx_lock_spin(&sched_lock);
- rux = p->p_rux;
- FOREACH_THREAD_IN_PROC(p, td) {
- if (TD_IS_RUNNING(td)) {
- /*
- * Adjust for the current time slice. This is
- * actually fairly important since the error here is
- * on the order of a time quantum which is much
- * greater than the precision of binuptime().
- */
- KASSERT(td->td_oncpu != NOCPU,
- ("%s: running thread has no CPU", __func__));
- if (!bt_valid) {
- binuptime(&bt);
- bt_valid = 1;
- }
- bintime_add(&rux.rux_runtime, &bt);
- bintime_sub(&rux.rux_runtime,
- &pcpu_find(td->td_oncpu)->pc_switchtime);
- }
- }
- mtx_unlock_spin(&sched_lock);
- calcru1(p, &rux, up, sp);
- p->p_rux.rux_uu = rux.rux_uu;
- p->p_rux.rux_su = rux.rux_su;
- p->p_rux.rux_iu = rux.rux_iu;
+ calcru1(p, &p->p_crux, up, sp);
}
+/*
+ * Transform the running time and tick information in proc p into user
+ * and system time usage. If appropriate, include the current time slice
+ * on this CPU.
+ */
void
-calccru(p, up, sp)
- struct proc *p;
- struct timeval *up;
- struct timeval *sp;
+calcru(struct proc *p, struct timeval *up, struct timeval *sp)
{
+ struct thread *td;
+ uint64_t u;
PROC_LOCK_ASSERT(p, MA_OWNED);
- calcru1(p, &p->p_crux, up, sp);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ /*
+ * If we are getting stats for the current process, then add in the
+ * stats that this thread has accumulated in its current time slice.
+ * We reset the thread and CPU state as if we had performed a context
+ * switch right here.
+ */
+ td = curthread;
+ if (td->td_proc == p) {
+ u = cpu_ticks();
+ p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
+ PCPU_SET(switchtime, u);
+ }
+ /* Make sure the per-thread stats are current. */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_runtime == 0)
+ continue;
+ thread_lock(td);
+ ruxagg(&p->p_rux, td);
+ thread_unlock(td);
+ }
+ calcru1(p, &p->p_rux, up, sp);
}
static void
-calcru1(p, ruxp, up, sp)
- struct proc *p;
- struct rusage_ext *ruxp;
- struct timeval *up;
- struct timeval *sp;
+calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
+ struct timeval *sp)
{
- struct timeval tv;
- /* {user, system, interrupt, total} {ticks, usec}; previous tu: */
- u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
+ /* {user, system, interrupt, total} {ticks, usec}: */
+ u_int64_t ut, uu, st, su, it, tt, tu;
ut = ruxp->rux_uticks;
st = ruxp->rux_sticks;
it = ruxp->rux_iticks;
tt = ut + st + it;
if (tt == 0) {
+ /* Avoid divide by zero */
st = 1;
tt = 1;
}
- bintime2timeval(&ruxp->rux_runtime, &tv);
- tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
- ptu = ruxp->rux_uu + ruxp->rux_su + ruxp->rux_iu;
- if (tu < ptu) {
- printf(
-"calcru: runtime went backwards from %ju usec to %ju usec for pid %d (%s)\n",
- (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
- tu = ptu;
- }
+ tu = cputick2usec(ruxp->rux_runtime);
if ((int64_t)tu < 0) {
+ /* XXX: this should be an assert /phk */
printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
(intmax_t)tu, p->p_pid, p->p_comm);
- tu = ptu;
+ tu = ruxp->rux_tu;
}
- /* Subdivide tu. */
- uu = (tu * ut) / tt;
- su = (tu * st) / tt;
- iu = tu - uu - su;
-
- /* Enforce monotonicity. */
- if (uu < ruxp->rux_uu || su < ruxp->rux_su || iu < ruxp->rux_iu) {
+ if (tu >= ruxp->rux_tu) {
+ /*
+ * The normal case, time increased.
+ * Enforce monotonicity of bucketed numbers.
+ */
+ uu = (tu * ut) / tt;
if (uu < ruxp->rux_uu)
uu = ruxp->rux_uu;
- else if (uu + ruxp->rux_su + ruxp->rux_iu > tu)
- uu = tu - ruxp->rux_su - ruxp->rux_iu;
- if (st == 0)
+ su = (tu * st) / tt;
+ if (su < ruxp->rux_su)
su = ruxp->rux_su;
- else {
- su = ((tu - uu) * st) / (st + it);
- if (su < ruxp->rux_su)
- su = ruxp->rux_su;
- else if (uu + su + ruxp->rux_iu > tu)
- su = tu - uu - ruxp->rux_iu;
- }
- KASSERT(uu + su + ruxp->rux_iu <= tu,
- ("calcru: monotonisation botch 1"));
- iu = tu - uu - su;
- KASSERT(iu >= ruxp->rux_iu,
- ("calcru: monotonisation botch 2"));
+ } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
+ /*
+ * When we calibrate the cputicker, it is not uncommon to
+ * see the presumably fixed frequency increase slightly over
+ * time as a result of thermal stabilization and NTP
+ * discipline (of the reference clock). We therefore ignore
+ * a bit of backwards slop because we expect to catch up
+ * shortly. We use a 3 microsecond limit to catch low
+ * counts and a 1% limit for high counts.
+ */
+ uu = ruxp->rux_uu;
+ su = ruxp->rux_su;
+ tu = ruxp->rux_tu;
+ } else { /* tu < ruxp->rux_tu */
+ /*
+		 * What happened here was likely that a laptop, which ran at
+		 * a reduced clock frequency at boot, kicked into high gear.
+		 * The wisdom of spamming this message in that case is
+		 * dubious, but it might also be indicative of something
+		 * serious, so let's keep it and hope laptops can be made
+ * more truthful about their CPU speed via ACPI.
+ */
+ printf("calcru: runtime went backwards from %ju usec "
+ "to %ju usec for pid %d (%s)\n",
+ (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
+ p->p_pid, p->p_comm);
+ uu = (tu * ut) / tt;
+ su = (tu * st) / tt;
}
+
ruxp->rux_uu = uu;
ruxp->rux_su = su;
- ruxp->rux_iu = iu;
+ ruxp->rux_tu = tu;
up->tv_sec = uu / 1000000;
up->tv_usec = uu % 1000000;
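
As a worked example of the subdivision above: with rux_uticks = 300,
rux_sticks = 100 and rux_iticks = 0, tt = 400, so a runtime of
tu = 4,000,000 usec splits into uu = 4,000,000 * 300 / 400 = 3,000,000 usec of
user time and su = 1,000,000 usec of system time; the clamps against rux_uu
and rux_su then only ever raise these figures to the values reported
previously, preserving monotonicity.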
@@ -827,9 +939,6 @@
struct rusage *rusage;
};
#endif
-/*
- * MPSAFE
- */
int
getrusage(td, uap)
register struct thread *td;
@@ -857,8 +966,8 @@
switch (who) {
case RUSAGE_SELF:
- *rup = p->p_stats->p_ru;
- calcru(p, &rup->ru_utime, &rup->ru_stime);
+ rufetchcalc(p, rup, &rup->ru_utime,
+ &rup->ru_stime);
break;
case RUSAGE_CHILDREN:
@@ -875,22 +984,11 @@
}
void
-ruadd(ru, rux, ru2, rux2)
- struct rusage *ru;
- struct rusage_ext *rux;
- struct rusage *ru2;
- struct rusage_ext *rux2;
+rucollect(struct rusage *ru, struct rusage *ru2)
{
- register long *ip, *ip2;
- register int i;
+ long *ip, *ip2;
+ int i;
- bintime_add(&rux->rux_runtime, &rux2->rux_runtime);
- rux->rux_uticks += rux2->rux_uticks;
- rux->rux_sticks += rux2->rux_sticks;
- rux->rux_iticks += rux2->rux_iticks;
- rux->rux_uu += rux2->rux_uu;
- rux->rux_su += rux2->rux_su;
- rux->rux_iu += rux2->rux_iu;
if (ru->ru_maxrss < ru2->ru_maxrss)
ru->ru_maxrss = ru2->ru_maxrss;
ip = &ru->ru_first;
@@ -899,6 +997,78 @@
*ip++ += *ip2++;
}
+void
+ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
+ struct rusage_ext *rux2)
+{
+
+ rux->rux_runtime += rux2->rux_runtime;
+ rux->rux_uticks += rux2->rux_uticks;
+ rux->rux_sticks += rux2->rux_sticks;
+ rux->rux_iticks += rux2->rux_iticks;
+ rux->rux_uu += rux2->rux_uu;
+ rux->rux_su += rux2->rux_su;
+ rux->rux_tu += rux2->rux_tu;
+ rucollect(ru, ru2);
+}
+
+/*
+ * Aggregate tick counts into the proc's rusage_ext.
+ */
+void
+ruxagg(struct rusage_ext *rux, struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
+ rux->rux_runtime += td->td_runtime;
+ rux->rux_uticks += td->td_uticks;
+ rux->rux_sticks += td->td_sticks;
+ rux->rux_iticks += td->td_iticks;
+ td->td_runtime = 0;
+ td->td_uticks = 0;
+ td->td_iticks = 0;
+ td->td_sticks = 0;
+}
+
+/*
+ * Update the rusage_ext structure and fetch a valid aggregate rusage
+ * for proc p if storage for one is supplied.
+ */
+void
+rufetch(struct proc *p, struct rusage *ru)
+{
+ struct thread *td;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+ *ru = p->p_ru;
+ if (p->p_numthreads > 0) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ ruxagg(&p->p_rux, td);
+ thread_unlock(td);
+ rucollect(ru, &td->td_ru);
+ }
+ }
+}
+
+/*
+ * Atomically perform a rufetch and a calcru together.
+ * Consumers can safely assume that calcru is executed only once
+ * rufetch is completed.
+ */
+void
+rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
+ struct timeval *sp)
+{
+
+ PROC_SLOCK(p);
+ rufetch(p, ru);
+ calcru(p, up, sp);
+ PROC_SUNLOCK(p);
+}
+
/*
* Allocate a new resource limits structure and initialize its
* reference count and mutex pointer.
@@ -909,8 +1079,7 @@
struct plimit *limp;
limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
- limp->pl_refcnt = 1;
- limp->pl_mtx = mtx_pool_alloc(mtxpool_sleep);
+ refcount_init(&limp->pl_refcnt, 1);
return (limp);
}
@@ -919,25 +1088,27 @@
struct plimit *limp;
{
- LIM_LOCK(limp);
- limp->pl_refcnt++;
- LIM_UNLOCK(limp);
+ refcount_acquire(&limp->pl_refcnt);
return (limp);
}
void
+lim_fork(struct proc *p1, struct proc *p2)
+{
+ p2->p_limit = lim_hold(p1->p_limit);
+ callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
+ if (p1->p_cpulimit != RLIM_INFINITY)
+ callout_reset(&p2->p_limco, hz, lim_cb, p2);
+}
+
+void
lim_free(limp)
struct plimit *limp;
{
- LIM_LOCK(limp);
KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
- if (--limp->pl_refcnt == 0) {
- LIM_UNLOCK(limp);
+ if (refcount_release(&limp->pl_refcnt))
free((void *)limp, M_PLIMIT);
- return;
- }
- LIM_UNLOCK(limp);
}
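
The plimit conversion above is the stock refcount(9) idiom; a minimal sketch
of the same pattern on a made-up structure (struct foo and M_FOO are
hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/refcount.h>

static MALLOC_DEFINE(M_FOO, "foo", "example refcounted objects");

struct foo {
	u_int	foo_refcnt;
	/* ... payload ... */
};

static struct foo *
foo_alloc(void)
{
	struct foo *fp;

	fp = malloc(sizeof(*fp), M_FOO, M_WAITOK | M_ZERO);
	refcount_init(&fp->foo_refcnt, 1);
	return (fp);
}

static struct foo *
foo_hold(struct foo *fp)
{

	refcount_acquire(&fp->foo_refcnt);
	return (fp);
}

static void
foo_free(struct foo *fp)
{

	/* The atomic refcount makes a per-object mutex unnecessary. */
	if (refcount_release(&fp->foo_refcnt))
		free(fp, M_FOO);
}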
/*
@@ -991,6 +1162,8 @@
KASSERT(which >= 0 && which < RLIM_NLIMITS,
("request for invalid resource limit"));
*rlp = p->p_limit->pl_rlimit[which];
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(rlp, which);
}
/*
@@ -1088,7 +1261,7 @@
* that we don't need to free, simply unlock and return.
* Suboptimal case:
* If refcount lowering results in need to free, bump the count
- * back up, loose the lock and aquire the locks in the proper
+ * back up, lose the lock and acquire the locks in the proper
* order to try again.
*/
void
--- /dev/null
+++ sys/kern/vfs_extattr.c
@@ -0,0 +1,785 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/vfs_extattr.c,v 1.431 2006/12/23 00:30:03 rwatson Exp $");
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/limits.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/extattr.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+/*
+ * Syscall to push extended attribute configuration information into the VFS.
+ * Accepts a path, which it converts to a mountpoint, as well as a command
+ * (int cmd), and attribute name and misc data.
+ *
+ * Currently this is used only by UFS1 extended attributes.
+ */
+int
+extattrctl(td, uap)
+ struct thread *td;
+ struct extattrctl_args /* {
+ const char *path;
+ int cmd;
+ const char *filename;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct vnode *filename_vp;
+ struct nameidata nd;
+ struct mount *mp, *mp_writable;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, fnvfslocked, error;
+
+ AUDIT_ARG(cmd, uap->cmd);
+ AUDIT_ARG(value, uap->attrnamespace);
+ /*
+ * uap->attrname is not always defined. We check again later when we
+ * invoke the VFS call so as to pass in NULL there if needed.
+ */
+ if (uap->attrname != NULL) {
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+ NULL);
+ if (error)
+ return (error);
+ }
+ AUDIT_ARG(text, attrname);
+
+ vfslocked = fnvfslocked = 0;
+ /*
+ * uap->filename is not always defined. If it is, grab a vnode lock,
+ * which VFS_EXTATTRCTL() will later release.
+ */
+ filename_vp = NULL;
+ if (uap->filename != NULL) {
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF |
+ AUDITVNODE2, UIO_USERSPACE, uap->filename, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ fnvfslocked = NDHASGIANT(&nd);
+ filename_vp = nd.ni_vp;
+ NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
+ }
+
+ /* uap->path is always defined. */
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error) {
+ if (filename_vp != NULL)
+ vput(filename_vp);
+ goto out;
+ }
+ vfslocked = NDHASGIANT(&nd);
+ mp = nd.ni_vp->v_mount;
+ error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+ NDFREE(&nd, 0);
+ if (error) {
+ if (filename_vp != NULL)
+ vput(filename_vp);
+ goto out;
+ }
+
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
+ uap->attrname != NULL ? attrname : NULL, td);
+
+ vn_finished_write(mp_writable);
+ /*
+ * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
+ * so vrele it if it is defined.
+ */
+ if (filename_vp != NULL)
+ vrele(filename_vp);
+out:
+ VFS_UNLOCK_GIANT(fnvfslocked);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct mount *mp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ VFS_ASSERT_GIANT(vp->v_mount);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ cnt = nbytes;
+
+#ifdef MAC
+ error = mac_check_vnode_setextattr(td->td_ucred, vp, attrnamespace,
+ attrname, &auio);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+ td->td_ucred, td);
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+extattr_set_fd(td, uap)
+ struct thread *td;
+ struct extattr_set_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+ if (error)
+ return (error);
+
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+
+ return (error);
+}
+
+int
+extattr_set_file(td, uap)
+ struct thread *td;
+ struct extattr_set_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+int
+extattr_set_link(td, uap)
+ struct thread *td;
+ struct extattr_set_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ struct iovec aiov;
+ ssize_t cnt;
+ size_t size, *sizep;
+ int error;
+
+ VFS_ASSERT_GIANT(vp->v_mount);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ /*
+ * Slightly unusual semantics: if the user provides a NULL data
+ * pointer, they don't want to receive the data, just the maximum
+ * read length.
+ */
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_check_vnode_getextattr(td->td_ucred, vp, attrnamespace,
+ attrname, &auio);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ return (error);
+}
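
From userland, these operations are reached through extattr_get_file(2) and
friends; a short sketch of the NULL-data size probe described above, using the
user namespace and an arbitrary attribute name:

#include <sys/types.h>
#include <sys/extattr.h>
#include <stdlib.h>

int
fetch_attr(const char *path, void **bufp, ssize_t *lenp)
{
	ssize_t size;
	void *buf;

	/* A NULL data pointer asks only for the attribute's size. */
	size = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "example",
	    NULL, 0);
	if (size < 0)
		return (-1);
	if ((buf = malloc(size)) == NULL)
		return (-1);
	size = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "example",
	    buf, size);
	if (size < 0) {
		free(buf);
		return (-1);
	}
	*bufp = buf;
	*lenp = size;
	return (0);
}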
+
+int
+extattr_get_fd(td, uap)
+ struct thread *td;
+ struct extattr_get_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+ if (error)
+ return (error);
+
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+int
+extattr_get_file(td, uap)
+ struct thread *td;
+ struct extattr_get_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+int
+extattr_get_link(td, uap)
+ struct thread *td;
+ struct extattr_get_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ * directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", proc "p"
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ VFS_ASSERT_GIANT(vp->v_mount);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+#ifdef MAC
+ error = mac_check_vnode_deleteextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
+ td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ td->td_ucred, td);
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+extattr_delete_fd(td, uap)
+ struct thread *td;
+ struct extattr_delete_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG(text, attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+ if (error)
+ return (error);
+
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, td);
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+int
+extattr_delete_file(td, uap)
+ struct thread *td;
+ struct extattr_delete_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+ AUDIT_ARG(text, attrname);
+
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return(error);
+}
+
+int
+extattr_delete_link(td, uap)
+ struct thread *td;
+ struct extattr_delete_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+ AUDIT_ARG(text, attrname);
+
+ NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return(error);
+}
+
+/*-
+ * Retrieve a list of extended attributes on a file or directory.
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * userspace buffer pointer "data", buffer length "nbytes",
+ * thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
+ size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ size_t size, *sizep;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ VFS_ASSERT_GIANT(vp->v_mount);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_check_vnode_listextattr(td->td_ucred, vp, attrnamespace);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ return (error);
+}
+
+
+int
+extattr_list_fd(td, uap)
+ struct thread *td;
+ struct extattr_list_fd_args /* {
+ int fd;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ int vfslocked, error;
+
+ AUDIT_ARG(fd, uap->fd);
+ AUDIT_ARG(value, uap->attrnamespace);
+ error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+ if (error)
+ return (error);
+
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+int
+extattr_list_file(td, uap)
+ struct thread*td;
+ struct extattr_list_file_args /* {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+int
+extattr_list_link(td, uap)
+ struct thread*td;
+ struct extattr_list_link_args /* {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ AUDIT_ARG(value, uap->attrnamespace);
+ NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vfslocked = NDHASGIANT(&nd);
+ error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
--- /dev/null
+++ sys/kern/subr_fattime.c
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2006 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/kern/subr_fattime.c,v 1.2 2006/10/24 10:27:23 phk Exp $
+ *
+ * Convert MS-DOS FAT format timestamps to and from unix timespecs
+ *
+ * FAT filestamps originally consisted of two 16 bit integers, encoded like
+ * this:
+ *
+ * yyyyyyymmmmddddd (year - 1980, month, day)
+ *
+ * hhhhhmmmmmmsssss (hour, minutes, seconds divided by two)
+ *
+ * Subsequently even Microsoft realized that files could be accessed in less
+ * than two seconds and a byte was added containing:
+ *
+ * sfffffff (second mod two, 100ths of second)
+ *
+ * FAT timestamps are in the local timezone, with no indication of which
+ * timezone, much less whether daylight saving time applies.
+ *
+ * Later on again, in Windows NT, timestamps were defined relative to GMT.
+ *
+ * Purists will point out that UTC replaced GMT for such uses around
+ * a century ago, already then. Ironically "NT" was an abbreviation of
+ * "New Technology". Anyway...
+ *
+ * The 'utc' argument determines if the resulting FATTIME timestamp
+ * should be on the UTC or local timezone calendar.
+ *
+ * The conversion functions below cut time into four-year leap-second
+ * cycles rather than single years and uses table lookups inside those
+ * cycles to get the months and years sorted out.
+ *
+ * Obviously we cannot calculate the correct table index going from
+ * a posix seconds count to Y/M/D, but we can get pretty close by
+ * dividing the daycount by 32 (giving a too low index), and then
+ * adjusting upwards a couple of steps if necessary.
+ *
+ * FAT timestamps have 7 bits for the year and starts at 1980, so
+ * they can represent up to 2107 which means that the non-leap-year
+ * 2100 must be handled.
+ *
+ * XXX: As long as time_t is 32 bits this is not relevant or easily
+ * XXX: testable. Revisit when time_t grows bigger.
+ * XXX: grepfodder: 64 bit time_t, y2100, y2.1k, 2100, leap year
+ *
+ */
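
To make the bit layouts above concrete, a small hypothetical helper that
unpacks the two 16-bit words into calendar fields (the fractional-second byte
and timezone handling are ignored here):

struct fat_stamp {
	unsigned year, month, day;	/* calendar date */
	unsigned hour, min, sec;	/* time of day, 2 second resolution */
};

static void
fat_unpack(unsigned dd, unsigned dt, struct fat_stamp *fs)
{

	/* yyyyyyymmmmddddd: year - 1980, month, day */
	fs->year  = (dd >> 9) + 1980;
	fs->month = (dd >> 5) & 0xf;
	fs->day   = dd & 0x1f;

	/* hhhhhmmmmmmsssss: hour, minutes, seconds divided by two */
	fs->hour = (dt >> 11) & 0x1f;
	fs->min  = (dt >> 5) & 0x3f;
	fs->sec  = (dt & 0x1f) << 1;
}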
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/clock.h>
+
+#define DAY (24 * 60 * 60) /* Length of day in seconds */
+#define YEAR 365 /* Length of normal year */
+#define LYC (4 * YEAR + 1) /* Length of 4 year leap-year cycle */
+#define T1980 (10 * 365 + 2) /* Days from 1970 to 1980 */
+
+/* End of month is N days from start of (normal) year */
+#define JAN 31
+#define FEB (JAN + 28)
+#define MAR (FEB + 31)
+#define APR (MAR + 30)
+#define MAY (APR + 31)
+#define JUN (MAY + 30)
+#define JUL (JUN + 31)
+#define AUG (JUL + 31)
+#define SEP (AUG + 30)
+#define OCT (SEP + 31)
+#define NOV (OCT + 30)
+#define DEC (NOV + 31)
+
+/* Table of months in a 4 year leap-year cycle */
+
+#define ENC(y,m) (((y) << 9) | ((m) << 5))
+
+static const struct {
+ uint16_t days; /* month start in days relative to cycle */
+ uint16_t coded; /* encoded year + month information */
+} mtab[48] = {
+ { 0 + 0 * YEAR, ENC(0, 1) },
+
+ { JAN + 0 * YEAR, ENC(0, 2) }, { FEB + 0 * YEAR + 1, ENC(0, 3) },
+ { MAR + 0 * YEAR + 1, ENC(0, 4) }, { APR + 0 * YEAR + 1, ENC(0, 5) },
+ { MAY + 0 * YEAR + 1, ENC(0, 6) }, { JUN + 0 * YEAR + 1, ENC(0, 7) },
+ { JUL + 0 * YEAR + 1, ENC(0, 8) }, { AUG + 0 * YEAR + 1, ENC(0, 9) },
+ { SEP + 0 * YEAR + 1, ENC(0, 10) }, { OCT + 0 * YEAR + 1, ENC(0, 11) },
+ { NOV + 0 * YEAR + 1, ENC(0, 12) }, { DEC + 0 * YEAR + 1, ENC(1, 1) },
+
+ { JAN + 1 * YEAR + 1, ENC(1, 2) }, { FEB + 1 * YEAR + 1, ENC(1, 3) },
+ { MAR + 1 * YEAR + 1, ENC(1, 4) }, { APR + 1 * YEAR + 1, ENC(1, 5) },
+ { MAY + 1 * YEAR + 1, ENC(1, 6) }, { JUN + 1 * YEAR + 1, ENC(1, 7) },
+ { JUL + 1 * YEAR + 1, ENC(1, 8) }, { AUG + 1 * YEAR + 1, ENC(1, 9) },
+ { SEP + 1 * YEAR + 1, ENC(1, 10) }, { OCT + 1 * YEAR + 1, ENC(1, 11) },
+ { NOV + 1 * YEAR + 1, ENC(1, 12) }, { DEC + 1 * YEAR + 1, ENC(2, 1) },
+
+ { JAN + 2 * YEAR + 1, ENC(2, 2) }, { FEB + 2 * YEAR + 1, ENC(2, 3) },
+ { MAR + 2 * YEAR + 1, ENC(2, 4) }, { APR + 2 * YEAR + 1, ENC(2, 5) },
+ { MAY + 2 * YEAR + 1, ENC(2, 6) }, { JUN + 2 * YEAR + 1, ENC(2, 7) },
+ { JUL + 2 * YEAR + 1, ENC(2, 8) }, { AUG + 2 * YEAR + 1, ENC(2, 9) },
+ { SEP + 2 * YEAR + 1, ENC(2, 10) }, { OCT + 2 * YEAR + 1, ENC(2, 11) },
+ { NOV + 2 * YEAR + 1, ENC(2, 12) }, { DEC + 2 * YEAR + 1, ENC(3, 1) },
+
+ { JAN + 3 * YEAR + 1, ENC(3, 2) }, { FEB + 3 * YEAR + 1, ENC(3, 3) },
+ { MAR + 3 * YEAR + 1, ENC(3, 4) }, { APR + 3 * YEAR + 1, ENC(3, 5) },
+ { MAY + 3 * YEAR + 1, ENC(3, 6) }, { JUN + 3 * YEAR + 1, ENC(3, 7) },
+ { JUL + 3 * YEAR + 1, ENC(3, 8) }, { AUG + 3 * YEAR + 1, ENC(3, 9) },
+ { SEP + 3 * YEAR + 1, ENC(3, 10) }, { OCT + 3 * YEAR + 1, ENC(3, 11) },
+ { NOV + 3 * YEAR + 1, ENC(3, 12) }
+};
+
+
+void
+timespec2fattime(struct timespec *tsp, int utc, u_int16_t *ddp, u_int16_t *dtp, u_int8_t *dhp)
+{
+ time_t t1;
+ unsigned t2, l, m;
+
+ t1 = tsp->tv_sec;
+ if (!utc)
+ t1 -= utc_offset();
+
+ if (dhp != NULL)
+ *dhp = (tsp->tv_sec & 1) * 100 + tsp->tv_nsec / 10000000;
+ if (dtp != NULL) {
+ *dtp = (t1 / 2) % 30;
+ *dtp |= ((t1 / 60) % 60) << 5;
+ *dtp |= ((t1 / 3600) % 24) << 11;
+ }
+ if (ddp != NULL) {
+ t2 = t1 / DAY;
+ if (t2 < T1980) {
+ /* Impossible date, truncate to 1980-01-01 */
+ *ddp = 0x0021;
+ } else {
+ t2 -= T1980;
+
+ /*
+ * 2100 is not a leap year.
+ * XXX: a 32 bit time_t can not get us here.
+ */
+ if (t2 >= ((2100 - 1980) / 4 * LYC + FEB))
+ t2++;
+
+ /* Account for full leapyear cycles */
+ l = t2 / LYC;
+ *ddp = (l * 4) << 9;
+ t2 -= l * LYC;
+
+ /* Find approximate table entry */
+ m = t2 / 32;
+
+ /* Find correct table entry */
+ while (m < 47 && mtab[m + 1].days <= t2)
+ m++;
+
+ /* Get year + month from the table */
+ *ddp += mtab[m].coded;
+
+ /* And apply the day in the month */
+ t2 -= mtab[m].days - 1;
+ *ddp |= t2;
+ }
+ }
+}
+
+/*
+ * Table indexed by the bottom two bits of year + four bits of the month
+ * from the FAT timestamp, returning number of days into 4 year long
+ * leap-year cycle
+ */
+
+#define DCOD(m, y, l) ((m) + YEAR * (y) + (l))
+static const uint16_t daytab[64] = {
+ 0, DCOD( 0, 0, 0), DCOD(JAN, 0, 0), DCOD(FEB, 0, 1),
+ DCOD(MAR, 0, 1), DCOD(APR, 0, 1), DCOD(MAY, 0, 1), DCOD(JUN, 0, 1),
+ DCOD(JUL, 0, 1), DCOD(AUG, 0, 1), DCOD(SEP, 0, 1), DCOD(OCT, 0, 1),
+ DCOD(NOV, 0, 1), DCOD(DEC, 0, 1), 0, 0,
+ 0, DCOD( 0, 1, 1), DCOD(JAN, 1, 1), DCOD(FEB, 1, 1),
+ DCOD(MAR, 1, 1), DCOD(APR, 1, 1), DCOD(MAY, 1, 1), DCOD(JUN, 1, 1),
+ DCOD(JUL, 1, 1), DCOD(AUG, 1, 1), DCOD(SEP, 1, 1), DCOD(OCT, 1, 1),
+ DCOD(NOV, 1, 1), DCOD(DEC, 1, 1), 0, 0,
+ 0, DCOD( 0, 2, 1), DCOD(JAN, 2, 1), DCOD(FEB, 2, 1),
+ DCOD(MAR, 2, 1), DCOD(APR, 2, 1), DCOD(MAY, 2, 1), DCOD(JUN, 2, 1),
+ DCOD(JUL, 2, 1), DCOD(AUG, 2, 1), DCOD(SEP, 2, 1), DCOD(OCT, 2, 1),
+ DCOD(NOV, 2, 1), DCOD(DEC, 2, 1), 0, 0,
+ 0, DCOD( 0, 3, 1), DCOD(JAN, 3, 1), DCOD(FEB, 3, 1),
+ DCOD(MAR, 3, 1), DCOD(APR, 3, 1), DCOD(MAY, 3, 1), DCOD(JUN, 3, 1),
+ DCOD(JUL, 3, 1), DCOD(AUG, 3, 1), DCOD(SEP, 3, 1), DCOD(OCT, 3, 1),
+ DCOD(NOV, 3, 1), DCOD(DEC, 3, 1), 0, 0
+};
+
+void
+fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp)
+{
+ unsigned day;
+
+ /* Unpack time fields */
+ tsp->tv_sec = (dt & 0x1f) << 1;
+ tsp->tv_sec += ((dt & 0x7e0) >> 5) * 60;
+ tsp->tv_sec += ((dt & 0xf800) >> 11) * 3600;
+ tsp->tv_sec += dh / 100;
+ tsp->tv_nsec = (dh % 100) * 10000000;
+
+ /* Day of month */
+ day = (dd & 0x1f) - 1;
+
+ /* Full leap-year cycles */
+ day += LYC * ((dd >> 11) & 0x1f);
+
+ /* Month offset from leap-year cycle */
+ day += daytab[(dd >> 5) & 0x3f];
+
+ /*
+ * 2100 is not a leap year.
+ * XXX: a 32 bit time_t can not get us here.
+ */
+ if (day >= ((2100 - 1980) / 4 * LYC + FEB))
+ day--;
+
+ /* Align with time_t epoch */
+ day += T1980;
+
+ tsp->tv_sec += DAY * day;
+ if (!utc)
+ tsp->tv_sec += utc_offset();
+}
+
+#ifdef TEST_DRIVER
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+int
+main(int argc __unused, char **argv __unused)
+{
+ int i;
+ struct timespec ts;
+ struct tm tm;
+ double a;
+ u_int16_t d, t;
+ u_int8_t p;
+ char buf[100];
+
+ for (i = 0; i < 10000; i++) {
+ do {
+ ts.tv_sec = random();
+ } while (ts.tv_sec < T1980 * 86400);
+ ts.tv_nsec = random() % 1000000000;
+
+ printf("%10d.%03ld -- ", ts.tv_sec, ts.tv_nsec / 1000000);
+
+ gmtime_r(&ts.tv_sec, &tm);
+ strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+ printf("%s -- ", buf);
+
+ a = ts.tv_sec + ts.tv_nsec * 1e-9;
+ d = t = p = 0;
+ timespec2fattime(&ts, 1, &d, &t, &p);
+ printf("%04x %04x %02x -- ", d, t, p);
+ printf("%3d %02d %02d %02d %02d %02d -- ",
+ ((d >> 9) & 0x7f) + 1980,
+ (d >> 5) & 0x0f,
+ (d >> 0) & 0x1f,
+ (t >> 11) & 0x1f,
+ (t >> 5) & 0x3f,
+ ((t >> 0) & 0x1f) * 2);
+
+ ts.tv_sec = ts.tv_nsec = 0;
+ fattime2timespec(d, t, p, 1, &ts);
+ printf("%10d.%03ld == ", ts.tv_sec, ts.tv_nsec / 1000000);
+ gmtime_r(&ts.tv_sec, &tm);
+ strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+ printf("%s -- ", buf);
+ a -= ts.tv_sec + ts.tv_nsec * 1e-9;
+ printf("%.3f", a);
+ printf("\n");
+ }
+ return (0);
+}
+
+#endif /* TEST_DRIVER */
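
A quick worked example of the bit layout described at the top of subr_fattime.c
above: the following standalone userland sketch packs a broken-down date and
time into the two 16-bit FAT words. The fat_date()/fat_time() helpers are
invented for this illustration and are not part of the kernel file.

    #include <stdio.h>
    #include <stdint.h>

    /* yyyyyyymmmmddddd: year - 1980, month, day */
    static uint16_t
    fat_date(int year, int month, int day)
    {
            return (uint16_t)(((year - 1980) << 9) | (month << 5) | day);
    }

    /* hhhhhmmmmmmsssss: hour, minutes, seconds divided by two */
    static uint16_t
    fat_time(int hour, int min, int sec)
    {
            return (uint16_t)((hour << 11) | (min << 5) | (sec / 2));
    }

    int
    main(void)
    {
            /* 2008-09-27 19:02:18 packs to dd=0x393b, dt=0x9849 */
            printf("dd=0x%04x dt=0x%04x\n",
                fat_date(2008, 9, 27), fat_time(19, 2, 18));
            return (0);
    }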
Index: subr_stack.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_stack.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -L sys/kern/subr_stack.c -L sys/kern/subr_stack.c -u -r1.1 -r1.2
--- sys/kern/subr_stack.c
+++ sys/kern/subr_stack.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_stack.c,v 1.2.2.1 2006/03/13 03:05:58 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_stack.c,v 1.3 2006/05/28 22:15:28 kris Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -90,7 +90,7 @@
long offset;
int i;
- KASSERT(st->depth <= STACK_MAX, ("bogous stack"));
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
for (i = 0; i < st->depth; i++) {
stack_symbol(st->pcs[i], &name, &offset);
printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
@@ -105,7 +105,7 @@
long offset;
int i;
- KASSERT(st->depth <= STACK_MAX, ("bogous stack"));
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
for (i = 0; i < st->depth; i++) {
stack_symbol(st->pcs[i], &name, &offset);
sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
@@ -122,7 +122,7 @@
long offset;
int i;
- KASSERT(st->depth <= STACK_MAX, ("bogous stack"));
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
if (cheap) {
ktr_tracepoint(mask, file, line, "#0 %p %p %p %p %p %p",
st->pcs[0], st->pcs[1], st->pcs[2], st->pcs[3],
Index: kern_lockf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_lockf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_lockf.c -L sys/kern/kern_lockf.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_lockf.c
+++ sys/kern/kern_lockf.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_lockf.c,v 1.54 2005/03/29 08:13:01 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_lockf.c,v 1.57 2007/08/07 09:04:50 kib Exp $");
#include "opt_debug_lockf.h"
@@ -73,14 +73,14 @@
#define NOLOCKF (struct lockf *)0
#define SELF 0x1
#define OTHERS 0x2
-static int lf_clearlock(struct lockf *);
+static int lf_clearlock(struct lockf *, struct lockf **);
static int lf_findoverlap(struct lockf *,
struct lockf *, int, struct lockf ***, struct lockf **);
static struct lockf *
lf_getblock(struct lockf *);
static int lf_getlock(struct lockf *, struct flock *);
-static int lf_setlock(struct lockf *);
-static void lf_split(struct lockf *, struct lockf *);
+static int lf_setlock(struct lockf *, struct vnode *, struct lockf **);
+static void lf_split(struct lockf *, struct lockf *, struct lockf **);
static void lf_wakelock(struct lockf *);
#ifdef LOCKF_DEBUG
static void lf_print(char *, struct lockf *);
@@ -102,12 +102,13 @@
struct lockf **head;
u_quad_t size;
{
- register struct flock *fl = ap->a_fl;
- register struct lockf *lock;
+ struct flock *fl = ap->a_fl;
+ struct lockf *lock;
+ struct vnode *vp = ap->a_vp;
off_t start, end, oadd;
+ struct lockf *clean, *n;
int error;
- mtx_lock(&Giant);
/*
* Convert the flock structure into a start and end.
*/
@@ -124,40 +125,29 @@
case SEEK_END:
if (size > OFF_MAX ||
- (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) {
- error = EOVERFLOW;
- goto out;
- }
+ (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
+ return (EOVERFLOW);
start = size + fl->l_start;
break;
default:
- error = EINVAL;
- goto out;
- }
- if (start < 0) {
- error = EINVAL;
- goto out;
+ return (EINVAL);
}
+ if (start < 0)
+ return (EINVAL);
if (fl->l_len < 0) {
- if (start == 0) {
- error = EINVAL;
- goto out;
- }
+ if (start == 0)
+ return (EINVAL);
end = start - 1;
start += fl->l_len;
- if (start < 0) {
- error = EINVAL;
- goto out;
- }
+ if (start < 0)
+ return (EINVAL);
} else if (fl->l_len == 0)
end = -1;
else {
oadd = fl->l_len - 1;
- if (oadd > OFF_MAX - start) {
- error = EOVERFLOW;
- goto out;
- }
+ if (oadd > OFF_MAX - start)
+ return (EOVERFLOW);
end = start + oadd;
}
/*
@@ -166,11 +156,18 @@
if (*head == (struct lockf *)0) {
if (ap->a_op != F_SETLK) {
fl->l_type = F_UNLCK;
- error = 0;
- goto out;
+ return (0);
}
}
/*
+ * Allocate a spare structure in case we have to split.
+ */
+ clean = NULL;
+ if (ap->a_op == F_SETLK || ap->a_op == F_UNLCK) {
+ MALLOC(clean, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+ clean->lf_next = NULL;
+ }
+ /*
* Create the lockf structure
*/
MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
@@ -192,29 +189,36 @@
/*
* Do the requested operation.
*/
+ VI_LOCK(vp);
switch(ap->a_op) {
case F_SETLK:
- error = lf_setlock(lock);
- goto out;
+ error = lf_setlock(lock, vp, &clean);
+ break;
case F_UNLCK:
- error = lf_clearlock(lock);
- FREE(lock, M_LOCKF);
- goto out;
+ error = lf_clearlock(lock, &clean);
+ lock->lf_next = clean;
+ clean = lock;
+ break;
case F_GETLK:
error = lf_getlock(lock, fl);
- FREE(lock, M_LOCKF);
- goto out;
+ lock->lf_next = clean;
+ clean = lock;
+ break;
default:
- free(lock, M_LOCKF);
+ lock->lf_next = clean;
+ clean = lock;
error = EINVAL;
- goto out;
+ break;
+ }
+ VI_UNLOCK(vp);
+ for (lock = clean; lock != NULL; ) {
+ n = lock->lf_next;
+ free(lock, M_LOCKF);
+ lock = n;
}
- /* NOTREACHED */
-out:
- mtx_unlock(&Giant);
return (error);
}
@@ -222,10 +226,12 @@
* Set a byte-range lock.
*/
static int
-lf_setlock(lock)
- register struct lockf *lock;
+lf_setlock(lock, vp, clean)
+ struct lockf *lock;
+ struct vnode *vp;
+ struct lockf **clean;
{
- register struct lockf *block;
+ struct lockf *block;
struct lockf **head = lock->lf_head;
struct lockf **prev, *overlap, *ltmp;
static char lockstr[] = "lockf";
@@ -251,7 +257,8 @@
* Free the structure and return if nonblocking.
*/
if ((lock->lf_flags & F_WAIT) == 0) {
- FREE(lock, M_LOCKF);
+ lock->lf_next = *clean;
+ *clean = lock;
return (EAGAIN);
}
/*
@@ -266,16 +273,19 @@
*/
if ((lock->lf_flags & F_POSIX) &&
(block->lf_flags & F_POSIX)) {
- register struct proc *wproc;
+ struct proc *wproc;
+ struct proc *nproc;
struct thread *td;
- register struct lockf *waitblock;
+ struct lockf *waitblock;
int i = 0;
/* The block is waiting on something */
- /* XXXKSE this is not complete under threads */
wproc = (struct proc *)block->lf_id;
- mtx_lock_spin(&sched_lock);
+restart:
+ nproc = NULL;
+ PROC_SLOCK(wproc);
FOREACH_THREAD_IN_PROC(wproc, td) {
+ thread_lock(td);
while (td->td_wchan &&
(td->td_wmesg == lockstr) &&
(i++ < maxlockdepth)) {
@@ -284,15 +294,21 @@
waitblock = waitblock->lf_next;
if ((waitblock->lf_flags & F_POSIX) == 0)
break;
- wproc = (struct proc *)waitblock->lf_id;
- if (wproc == (struct proc *)lock->lf_id) {
- mtx_unlock_spin(&sched_lock);
- free(lock, M_LOCKF);
+ nproc = (struct proc *)waitblock->lf_id;
+ if (nproc == (struct proc *)lock->lf_id) {
+ PROC_SUNLOCK(wproc);
+ thread_unlock(td);
+ lock->lf_next = *clean;
+ *clean = lock;
return (EDEADLK);
}
}
+ thread_unlock(td);
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(wproc);
+ wproc = nproc;
+ if (wproc)
+ goto restart;
}
/*
* For flock type locks, we must first remove
@@ -302,7 +318,7 @@
if ((lock->lf_flags & F_FLOCK) &&
lock->lf_type == F_WRLCK) {
lock->lf_type = F_UNLCK;
- (void) lf_clearlock(lock);
+ (void) lf_clearlock(lock, clean);
lock->lf_type = F_WRLCK;
}
/*
@@ -317,7 +333,7 @@
lf_printlist("lf_setlock", block);
}
#endif /* LOCKF_DEBUG */
- error = tsleep(lock, priority, lockstr, 0);
+ error = msleep(lock, VI_MTX(vp), priority, lockstr, 0);
/*
* We may have been awakened by a signal and/or by a
* debugger continuing us (in which cases we must remove
@@ -331,7 +347,8 @@
lock->lf_next = NOLOCKF;
}
if (error) {
- free(lock, M_LOCKF);
+ lock->lf_next = *clean;
+ *clean = lock;
return (error);
}
}
@@ -376,7 +393,8 @@
overlap->lf_type == F_WRLCK)
lf_wakelock(overlap);
overlap->lf_type = lock->lf_type;
- FREE(lock, M_LOCKF);
+ lock->lf_next = *clean;
+ *clean = lock;
lock = overlap; /* for debug output below */
break;
@@ -385,7 +403,8 @@
* Check for common starting point and different types.
*/
if (overlap->lf_type == lock->lf_type) {
- free(lock, M_LOCKF);
+ lock->lf_next = *clean;
+ *clean = lock;
lock = overlap; /* for debug output below */
break;
}
@@ -394,7 +413,7 @@
lock->lf_next = overlap;
overlap->lf_start = lock->lf_end + 1;
} else
- lf_split(overlap, lock);
+ lf_split(overlap, lock, clean);
lf_wakelock(overlap);
break;
@@ -426,7 +445,8 @@
needtolink = 0;
} else
*prev = overlap->lf_next;
- free(overlap, M_LOCKF);
+ overlap->lf_next = *clean;
+ *clean = overlap;
continue;
case 4: /* overlap starts before lock */
@@ -471,8 +491,9 @@
* and remove it (or shrink it), then wakeup anyone we can.
*/
static int
-lf_clearlock(unlock)
- register struct lockf *unlock;
+lf_clearlock(unlock, clean)
+ struct lockf *unlock;
+ struct lockf **clean;
{
struct lockf **head = unlock->lf_head;
register struct lockf *lf = *head;
@@ -498,7 +519,8 @@
case 1: /* overlap == lock */
*prev = overlap->lf_next;
- FREE(overlap, M_LOCKF);
+ overlap->lf_next = *clean;
+ *clean = overlap;
break;
case 2: /* overlap contains lock: split it */
@@ -506,14 +528,15 @@
overlap->lf_start = unlock->lf_end + 1;
break;
}
- lf_split(overlap, unlock);
+ lf_split(overlap, unlock, clean);
overlap->lf_next = unlock->lf_next;
break;
case 3: /* lock contains overlap */
*prev = overlap->lf_next;
lf = overlap->lf_next;
- free(overlap, M_LOCKF);
+ overlap->lf_next = *clean;
+ *clean = overlap;
continue;
case 4: /* overlap starts before lock */
@@ -714,11 +737,12 @@
* two or three locks as necessary.
*/
static void
-lf_split(lock1, lock2)
- register struct lockf *lock1;
- register struct lockf *lock2;
+lf_split(lock1, lock2, split)
+ struct lockf *lock1;
+ struct lockf *lock2;
+ struct lockf **split;
{
- register struct lockf *splitlock;
+ struct lockf *splitlock;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2) {
@@ -742,9 +766,12 @@
}
/*
* Make a new lock consisting of the last part of
- * the encompassing lock
+ * the encompassing lock. We use the preallocated
+ * splitlock so we don't have to block.
*/
- MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+ splitlock = *split;
+ KASSERT(splitlock != NULL, ("no split"));
+ *split = splitlock->lf_next;
bcopy(lock1, splitlock, sizeof *splitlock);
splitlock->lf_start = lock2->lf_end + 1;
TAILQ_INIT(&splitlock->lf_blkhd);
--- /dev/null
+++ sys/kern/vfs_acl.c
@@ -0,0 +1,431 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL system calls and other functions common across different ACL types.
+ * Type-specific routines go into subr_acl_<type>.c.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/vfs_acl.c,v 1.53 2007/03/05 13:26:07 rwatson Exp $");
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/acl.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+uma_zone_t acl_zone;
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+
+/*
+ * These calls wrap the real vnode operations, and are called by the syscall
+ * code once the syscall has converted the path or file descriptor to a vnode
+ * (unlocked). The aclp pointer is assumed still to point to userland, so
+ * this should not be consumed within the kernel except by syscall code.
+ * Other code should directly invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernacl;
+ struct mount *mp;
+ int error;
+
+ error = copyin(aclp, &inkernacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+ error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return(error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+ error = mac_check_vnode_getacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0, td);
+ if (error == 0)
+ error = copyout(&inkernelacl, aclp, sizeof(struct acl));
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+ error = mac_check_vnode_deleteacl(td->td_ucred, vp, type);
+ if (error)
+ goto out;
+#endif
+ error = VOP_SETACL(vp, type, 0, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ error = copyin(aclp, &inkernelacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
+ return (error);
+}
+
+/*
+ * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't
+ * need to lock, as the vacl_ code will get/release any locks required.
+ */
+
+/*
+ * Given a file path, get an ACL for it
+ */
+int
+__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file path, get an ACL for it; don't follow links.
+ */
+int
+__acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it.
+ */
+int
+__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it; don't follow links.
+ */
+int
+__acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it.
+ */
+int
+__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ int vfslocked, error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+ if (error == 0) {
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it.
+ */
+int
+__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ int vfslocked, error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+ if (error == 0) {
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ */
+int
+__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, uap->type);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it; don't follow links.
+ */
+int
+__acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, uap->type);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, delete an ACL from it.
+ */
+int
+__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ int vfslocked, error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+ if (error == 0) {
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = vacl_delete(td, fp->f_vnode, uap->type);
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it.
+ */
+int
+__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it; don't follow links.
+ */
+int
+__acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+{
+ struct nameidata nd;
+ int vfslocked, error;
+
+ NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ vfslocked = NDHASGIANT(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it.
+ */
+int
+__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ int vfslocked, error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+ if (error == 0) {
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+ error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+
+static void
+aclinit(void *dummy __unused)
+{
+
+ acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL)
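
The __acl_get_file()/__acl_set_fd()-style syscalls defined above are normally
reached through the POSIX.1e wrappers in libc rather than invoked directly. A
small userland example of reading a file's access ACL through that path; error
handling is kept minimal and this is only an illustration of the call chain,
not code from this commit:

    #include <sys/types.h>
    #include <sys/acl.h>
    #include <stdio.h>

    int
    main(int argc, char **argv)
    {
            acl_t acl;
            char *text;

            if (argc != 2) {
                    fprintf(stderr, "usage: getacl path\n");
                    return (1);
            }
            /* libc's acl_get_file(3) ends up in __acl_get_file() above. */
            acl = acl_get_file(argv[1], ACL_TYPE_ACCESS);
            if (acl == NULL) {
                    perror("acl_get_file");
                    return (1);
            }
            text = acl_to_text(acl, NULL);
            if (text != NULL) {
                    printf("%s", text);
                    acl_free(text);
            }
            acl_free(acl);
            return (0);
    }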
Index: kern_intr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_intr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_intr.c -L sys/kern/kern_intr.c -u -r1.2 -r1.3
--- sys/kern/kern_intr.c
+++ sys/kern/kern_intr.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_intr.c,v 1.124.2.3.2.2 2006/04/15 20:08:33 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_intr.c,v 1.147 2007/06/05 00:00:54 jeff Exp $");
#include "opt_ddb.h"
@@ -83,7 +83,7 @@
static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
-static int intr_storm_threshold = 500;
+static int intr_storm_threshold = 1000;
TUNABLE_INT("hw.intr_storm_threshold", &intr_storm_threshold);
SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RW,
&intr_storm_threshold, 0,
@@ -92,13 +92,24 @@
TAILQ_HEAD_INITIALIZER(event_list);
static void intr_event_update(struct intr_event *ie);
+#ifdef INTR_FILTER
+static struct intr_thread *ithread_create(const char *name,
+ struct intr_handler *ih);
+#else
static struct intr_thread *ithread_create(const char *name);
-static void ithread_destroy2(struct intr_thread *ithread);
-static void ithread_execute_handlers(struct proc *p, struct intr_event *ie);
+#endif
+static void ithread_destroy(struct intr_thread *ithread);
+static void ithread_execute_handlers(struct proc *p,
+ struct intr_event *ie);
+#ifdef INTR_FILTER
+static void priv_ithread_execute_handler(struct proc *p,
+ struct intr_handler *ih);
+#endif
static void ithread_loop(void *);
static void ithread_update(struct intr_thread *ithd);
static void start_softintr(void *);
+/* Map an interrupt type to an ithread priority. */
u_char
intr_priority(enum intr_type flags)
{
@@ -162,9 +173,9 @@
/* Update name and priority. */
strlcpy(td->td_proc->p_comm, ie->ie_fullname,
sizeof(td->td_proc->p_comm));
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
sched_prio(td, pri);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
/*
@@ -226,6 +237,7 @@
CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
}
+#ifndef INTR_FILTER
int
intr_event_create(struct intr_event **event, void *source, int flags,
void (*enable)(void *), const char *fmt, ...)
@@ -255,6 +267,40 @@
CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
return (0);
}
+#else
+int
+intr_event_create(struct intr_event **event, void *source, int flags,
+ void (*enable)(void *), void (*eoi)(void *), void (*disab)(void *),
+ const char *fmt, ...)
+{
+ struct intr_event *ie;
+ va_list ap;
+
+ /* The only valid flag during creation is IE_SOFT. */
+ if ((flags & ~IE_SOFT) != 0)
+ return (EINVAL);
+ ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
+ ie->ie_source = source;
+ ie->ie_enable = enable;
+ ie->ie_eoi = eoi;
+ ie->ie_disab = disab;
+ ie->ie_flags = flags;
+ TAILQ_INIT(&ie->ie_handlers);
+ mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
+
+ va_start(ap, fmt);
+ vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
+ va_end(ap);
+ strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+ mtx_pool_lock(mtxpool_sleep, &event_list);
+ TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
+ mtx_pool_unlock(mtxpool_sleep, &event_list);
+ if (event != NULL)
+ *event = ie;
+ CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
+ return (0);
+}
+#endif
int
intr_event_destroy(struct intr_event *ie)
@@ -270,7 +316,7 @@
mtx_pool_unlock(mtxpool_sleep, &event_list);
#ifndef notyet
if (ie->ie_thread != NULL) {
- ithread_destroy2(ie->ie_thread);
+ ithread_destroy(ie->ie_thread);
ie->ie_thread = NULL;
}
#endif
@@ -280,6 +326,7 @@
return (0);
}
+#ifndef INTR_FILTER
static struct intr_thread *
ithread_create(const char *name)
{
@@ -295,53 +342,79 @@
if (error)
panic("kthread_create() failed with %d", error);
td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
- mtx_lock_spin(&sched_lock);
- td->td_ksegrp->kg_pri_class = PRI_ITHD;
+ thread_lock(td);
+ sched_class(td, PRI_ITHD);
+ TD_SET_IWAIT(td);
+ thread_unlock(td);
+ td->td_pflags |= TDP_ITHREAD;
+ ithd->it_thread = td;
+ CTR2(KTR_INTR, "%s: created %s", __func__, name);
+ return (ithd);
+}
+#else
+static struct intr_thread *
+ithread_create(const char *name, struct intr_handler *ih)
+{
+ struct intr_thread *ithd;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
+
+ error = kthread_create(ithread_loop, ih, &p, RFSTOPPED | RFHIGHPID,
+ 0, "%s", name);
+ if (error)
+ panic("kthread_create() failed with %d", error);
+ td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
+ thread_lock(td);
+ sched_class(td, PRI_ITHD);
TD_SET_IWAIT(td);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
td->td_pflags |= TDP_ITHREAD;
ithd->it_thread = td;
CTR2(KTR_INTR, "%s: created %s", __func__, name);
return (ithd);
}
+#endif
static void
-ithread_destroy2(struct intr_thread *ithread)
+ithread_destroy(struct intr_thread *ithread)
{
struct thread *td;
CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
td = ithread->it_thread;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
ithread->it_flags |= IT_DEAD;
if (TD_AWAITING_INTR(td)) {
TD_CLR_IWAIT(td);
- setrunqueue(td, SRQ_INTR);
+ sched_add(td, SRQ_INTR);
}
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
+#ifndef INTR_FILTER
int
intr_event_add_handler(struct intr_event *ie, const char *name,
- driver_intr_t handler, void *arg, u_char pri, enum intr_type flags,
- void **cookiep)
+ driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+ enum intr_type flags, void **cookiep)
{
struct intr_handler *ih, *temp_ih;
struct intr_thread *it;
- if (ie == NULL || name == NULL || handler == NULL)
+ if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
return (EINVAL);
/* Allocate and populate an interrupt handler structure. */
ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_filter = filter;
ih->ih_handler = handler;
ih->ih_argument = arg;
ih->ih_name = name;
ih->ih_event = ie;
ih->ih_pri = pri;
- if (flags & INTR_FAST)
- ih->ih_flags = IH_FAST;
- else if (flags & INTR_EXCL)
+ if (flags & INTR_EXCL)
ih->ih_flags = IH_EXCLUSIVE;
if (flags & INTR_MPSAFE)
ih->ih_flags |= IH_MPSAFE;
@@ -371,10 +444,9 @@
intr_event_update(ie);
/* Create a thread if we need one. */
- while (ie->ie_thread == NULL && !(flags & INTR_FAST)) {
+ while (ie->ie_thread == NULL && handler != NULL) {
if (ie->ie_flags & IE_ADDING_THREAD)
- msleep(ie, &ie->ie_lock, curthread->td_priority,
- "ithread", 0);
+ msleep(ie, &ie->ie_lock, 0, "ithread", 0);
else {
ie->ie_flags |= IE_ADDING_THREAD;
mtx_unlock(&ie->ie_lock);
@@ -395,7 +467,111 @@
*cookiep = ih;
return (0);
}
+#else
+int
+intr_event_add_handler(struct intr_event *ie, const char *name,
+ driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+ enum intr_type flags, void **cookiep)
+{
+ struct intr_handler *ih, *temp_ih;
+ struct intr_thread *it;
+
+ if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
+ return (EINVAL);
+
+ /* Allocate and populate an interrupt handler structure. */
+ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_filter = filter;
+ ih->ih_handler = handler;
+ ih->ih_argument = arg;
+ ih->ih_name = name;
+ ih->ih_event = ie;
+ ih->ih_pri = pri;
+ if (flags & INTR_EXCL)
+ ih->ih_flags = IH_EXCLUSIVE;
+ if (flags & INTR_MPSAFE)
+ ih->ih_flags |= IH_MPSAFE;
+ if (flags & INTR_ENTROPY)
+ ih->ih_flags |= IH_ENTROPY;
+
+ /* We can only have one exclusive handler in an event. */
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ if ((flags & INTR_EXCL) ||
+ (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
+ mtx_unlock(&ie->ie_lock);
+ free(ih, M_ITHREAD);
+ return (EINVAL);
+ }
+ }
+
+ /* Add the new handler to the event in priority order. */
+ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
+ if (temp_ih->ih_pri > ih->ih_pri)
+ break;
+ }
+ if (temp_ih == NULL)
+ TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
+ else
+ TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+ intr_event_update(ie);
+ /* For filtered handlers, create a private ithread to run on. */
+ if (filter != NULL && handler != NULL) {
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn", ih);
+ mtx_lock(&ie->ie_lock);
+ it->it_event = ie;
+ ih->ih_thread = it;
+ ithread_update(it); // XXX - do we really need this?!?!?
+ } else { /* Create the global per-event thread if we need one. */
+ while (ie->ie_thread == NULL && handler != NULL) {
+ if (ie->ie_flags & IE_ADDING_THREAD)
+ msleep(ie, &ie->ie_lock, 0, "ithread", 0);
+ else {
+ ie->ie_flags |= IE_ADDING_THREAD;
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn", ih);
+ mtx_lock(&ie->ie_lock);
+ ie->ie_flags &= ~IE_ADDING_THREAD;
+ ie->ie_thread = it;
+ it->it_event = ie;
+ ithread_update(it);
+ wakeup(ie);
+ }
+ }
+ }
+ CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+ ie->ie_name);
+ mtx_unlock(&ie->ie_lock);
+
+ if (cookiep != NULL)
+ *cookiep = ih;
+ return (0);
+}
+#endif
+
+/*
+ * Return the ie_source field from the intr_event an intr_handler is
+ * associated with.
+ */
+void *
+intr_handler_source(void *cookie)
+{
+ struct intr_handler *ih;
+ struct intr_event *ie;
+
+ ih = (struct intr_handler *)cookie;
+ if (ih == NULL)
+ return (NULL);
+ ie = ih->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ ih->ih_name));
+ return (ie->ie_source);
+}
+
+#ifndef INTR_FILTER
int
intr_event_remove_handler(void *cookie)
{
@@ -413,7 +589,7 @@
ie = handler->ih_event;
KASSERT(ie != NULL,
("interrupt handler \"%s\" has a NULL interrupt event",
- handler->ih_name));
+ handler->ih_name));
mtx_lock(&ie->ie_lock);
CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
ie->ie_name);
@@ -446,7 +622,7 @@
* so we have to remove the handler here rather than letting the
* thread do it.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(ie->ie_thread->it_thread);
if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) {
handler->ih_flags |= IH_DEAD;
@@ -458,10 +634,9 @@
ie->ie_thread->it_need = 1;
} else
TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(ie->ie_thread->it_thread);
while (handler->ih_flags & IH_DEAD)
- msleep(handler, &ie->ie_lock, curthread->td_priority, "iev_rmh",
- 0);
+ msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
intr_event_update(ie);
#ifdef notyet
/*
@@ -477,7 +652,7 @@
}
}
if (dead) {
- ithread_destroy2(ie->ie_thread);
+ ithread_destroy(ie->ie_thread);
ie->ie_thread = NULL;
}
#endif
@@ -524,24 +699,179 @@
/*
* Set it_need to tell the thread to keep running if it is already
- * running. Then, grab sched_lock and see if we actually need to
- * put this thread on the runqueue.
+ * running. Then, lock the thread and see if we actually need to
+ * put it on the runqueue.
+ */
+ it->it_need = 1;
+ thread_lock(td);
+ if (TD_AWAITING_INTR(td)) {
+ CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
+ p->p_comm);
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ } else {
+ CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
+ __func__, p->p_pid, p->p_comm, it->it_need, td->td_state);
+ }
+ thread_unlock(td);
+
+ return (0);
+}
+#else
+int
+intr_event_remove_handler(void *cookie)
+{
+ struct intr_handler *handler = (struct intr_handler *)cookie;
+ struct intr_event *ie;
+ struct intr_thread *it;
+#ifdef INVARIANTS
+ struct intr_handler *ih;
+#endif
+#ifdef notyet
+ int dead;
+#endif
+
+ if (handler == NULL)
+ return (EINVAL);
+ ie = handler->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ handler->ih_name));
+ mtx_lock(&ie->ie_lock);
+ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+ ie->ie_name);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ if (ih == handler)
+ goto ok;
+ mtx_unlock(&ie->ie_lock);
+ panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
+ ih->ih_name, ie->ie_name);
+ok:
+#endif
+ /*
+ * If there are no ithreads (per event and per handler), then
+ * just remove the handler and return.
+ * XXX: Note that an INTR_FAST handler might be running on another CPU!
+ */
+ if (ie->ie_thread == NULL && handler->ih_thread == NULL) {
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+ }
+
+ /* Private or global ithread? */
+ it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread;
+ /*
+ * If the interrupt thread is already running, then just mark this
+ * handler as being dead and let the ithread do the actual removal.
+ *
+ * During a cold boot while cold is set, msleep() does not sleep,
+ * so we have to remove the handler here rather than letting the
+ * thread do it.
+ */
+ thread_lock(it->it_thread);
+ if (!TD_AWAITING_INTR(it->it_thread) && !cold) {
+ handler->ih_flags |= IH_DEAD;
+
+ /*
+ * Ensure that the thread will process the handler list
+ * again and remove this handler if it has already passed
+ * it on the list.
+ */
+ it->it_need = 1;
+ } else
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ thread_unlock(it->it_thread);
+ while (handler->ih_flags & IH_DEAD)
+ msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
+ /*
+ * At this point, the handler has been disconnected from the event,
+ * so we can kill the private ithread if any.
+ */
+ if (handler->ih_thread) {
+ ithread_destroy(handler->ih_thread);
+ handler->ih_thread = NULL;
+ }
+ intr_event_update(ie);
+#ifdef notyet
+ /*
+ * XXX: This could be bad in the case of ppbus(8). Also, I think
+ * this could lead to races of stale data when servicing an
+ * interrupt.
+ */
+ dead = 1;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (handler != NULL) {
+ dead = 0;
+ break;
+ }
+ }
+ if (dead) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+}
+
+int
+intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
+{
+ struct intr_entropy entropy;
+ struct thread *td;
+ struct thread *ctd;
+ struct proc *p;
+
+ /*
+ * If no ithread or no handlers, then we have a stray interrupt.
+ */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL)
+ return (EINVAL);
+
+ ctd = curthread;
+ td = it->it_thread;
+ p = td->td_proc;
+
+ /*
+ * If any of the handlers for this ithread claim to be good
+ * sources of entropy, then gather some.
+ */
+ if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__,
+ p->p_pid, p->p_comm);
+ entropy.event = (uintptr_t)ie;
+ entropy.td = ctd;
+ random_harvest(&entropy, sizeof(entropy), 2, 0,
+ RANDOM_INTERRUPT);
+ }
+
+ KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
+
+ /*
+ * Set it_need to tell the thread to keep running if it is already
+ * running. Then, lock the thread and see if we actually need to
+ * put it on the runqueue.
*/
it->it_need = 1;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (TD_AWAITING_INTR(td)) {
CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
p->p_comm);
TD_CLR_IWAIT(td);
- setrunqueue(td, SRQ_INTR);
+ sched_add(td, SRQ_INTR);
} else {
CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
__func__, p->p_pid, p->p_comm, it->it_need, td->td_state);
}
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
return (0);
}
+#endif
/*
* Add a software interrupt handler to a specified event. If a given event
@@ -554,7 +884,7 @@
struct intr_event *ie;
int error;
- if (flags & (INTR_FAST | INTR_ENTROPY))
+ if (flags & INTR_ENTROPY)
return (EINVAL);
ie = (eventp != NULL) ? *eventp : NULL;
@@ -563,14 +893,19 @@
if (!(ie->ie_flags & IE_SOFT))
return (EINVAL);
} else {
- error = intr_event_create(&ie, NULL, IE_SOFT, NULL,
- "swi%d:", pri);
+#ifdef INTR_FILTER
+ error = intr_event_create(&ie, NULL, IE_SOFT,
+ NULL, NULL, NULL, "swi%d:", pri);
+#else
+ error = intr_event_create(&ie, NULL, IE_SOFT,
+ NULL, "swi%d:", pri);
+#endif
if (error)
return (error);
if (eventp != NULL)
*eventp = ie;
}
- return (intr_event_add_handler(ie, name, handler, arg,
+ return (intr_event_add_handler(ie, name, NULL, handler, arg,
(pri * RQ_PPQ) + PI_SOFT, flags, cookiep));
/* XXKSE.. think of a better way to get separate queues */
}
@@ -585,8 +920,6 @@
struct intr_event *ie = ih->ih_event;
int error;
- PCPU_LAZY_INC(cnt.v_intr);
-
CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
ih->ih_need);
@@ -596,8 +929,14 @@
* it will execute it the next time it runs.
*/
atomic_store_rel_int(&ih->ih_need, 1);
+
if (!(flags & SWI_DELAY)) {
+ PCPU_INC(cnt.v_soft);
+#ifdef INTR_FILTER
+ error = intr_event_schedule_thread(ie, ie->ie_thread);
+#else
error = intr_event_schedule_thread(ie);
+#endif
KASSERT(error == 0, ("stray software interrupt"));
}
}
@@ -615,25 +954,38 @@
return (intr_event_remove_handler(cookie));
}
-/* ABI compatibility shims. */
-#undef ithread_remove_handler
-#undef ithread_destroy
-int ithread_remove_handler(void *);
-int ithread_destroy(struct ithd *);
-
-int
-ithread_remove_handler(void *cookie)
-{
-
- return (intr_event_remove_handler(cookie));
-}
-
-int
-ithread_destroy(struct ithd *ithread)
+#ifdef INTR_FILTER
+static void
+priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih)
{
+ struct intr_event *ie;
- return (intr_event_destroy(ithread));
+ ie = ih->ih_event;
+ /*
+ * If this handler is marked for death, remove it from
+ * the list of handlers and wake up the sleeper.
+ */
+ if (ih->ih_flags & IH_DEAD) {
+ mtx_lock(&ie->ie_lock);
+ TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
+ ih->ih_flags &= ~IH_DEAD;
+ wakeup(ih);
+ mtx_unlock(&ie->ie_lock);
+ return;
+ }
+
+ /* Execute this handler. */
+ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
+ __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
+ ih->ih_name, ih->ih_flags);
+
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_lock(&Giant);
+ ih->ih_handler(ih->ih_argument);
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_unlock(&Giant);
}
+#endif
static void
ithread_execute_handlers(struct proc *p, struct intr_event *ie)
@@ -658,6 +1010,10 @@
continue;
}
+ /* Skip filter-only handlers. */
+ if (ih->ih_handler == NULL)
+ continue;
+
/*
* For software interrupt threads, we only execute
* handlers that have their need flag set. Hardware
@@ -670,14 +1026,10 @@
atomic_store_rel_int(&ih->ih_need, 0);
}
- /* Fast handlers are handled in primary interrupt context. */
- if (ih->ih_flags & IH_FAST)
- continue;
-
/* Execute this handler. */
CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
- __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
- ih->ih_name, ih->ih_flags);
+ __func__, p->p_pid, (void *)ih->ih_handler,
+ ih->ih_argument, ih->ih_name, ih->ih_flags);
if (!(ih->ih_flags & IH_MPSAFE))
mtx_lock(&Giant);
@@ -698,14 +1050,15 @@
* number of back to back interrupts exceeds the storm threshold,
* then enter storming mode.
*/
- if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold) {
- if (ie->ie_warned == 0) {
+ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
+ !(ie->ie_flags & IE_SOFT)) {
+ /* Report the message only once every second. */
+ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
printf(
- "Interrupt storm detected on \"%s\"; throttling interrupt source\n",
+ "interrupt storm detected on \"%s\"; throttling interrupt source\n",
ie->ie_name);
- ie->ie_warned = 1;
}
- tsleep(&ie->ie_count, curthread->td_priority, "istorm", 1);
+ pause("istorm", 1);
} else
ie->ie_count++;
@@ -717,6 +1070,7 @@
ie->ie_enable(ie->ie_source);
}
+#ifndef INTR_FILTER
/*
* This is the main code for interrupt threads.
*/
@@ -774,15 +1128,221 @@
* lock. This may take a while and it_need may get
* set again, so we have to check it again.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) {
TD_SET_IWAIT(td);
ie->ie_count = 0;
mi_switch(SW_VOL, NULL);
}
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
+#else
+/*
+ * This is the main code for interrupt threads.
+ */
+static void
+ithread_loop(void *arg)
+{
+ struct intr_thread *ithd;
+ struct intr_handler *ih;
+ struct intr_event *ie;
+ struct thread *td;
+ struct proc *p;
+ int priv;
+
+ td = curthread;
+ p = td->td_proc;
+ ih = (struct intr_handler *)arg;
+ priv = (ih->ih_thread != NULL) ? 1 : 0;
+ ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread;
+ KASSERT(ithd->it_thread == td,
+ ("%s: ithread and proc linkage out of sync", __func__));
+ ie = ithd->it_event;
+ ie->ie_count = 0;
+
+ /*
+ * As long as we have interrupts outstanding, go through the
+ * list of handlers, giving each one a go at it.
+ */
+ for (;;) {
+ /*
+ * If we are an orphaned thread, then just die.
+ */
+ if (ithd->it_flags & IT_DEAD) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
+ p->p_pid, p->p_comm);
+ free(ithd, M_ITHREAD);
+ kthread_exit(0);
+ }
+
+ /*
+ * Service interrupts. If another interrupt arrives while
+ * we are running, it will set it_need to note that we
+ * should make another pass.
+ */
+ while (ithd->it_need) {
+ /*
+ * This might need a full read and write barrier
+ * to make sure that this write posts before any
+ * of the memory or device accesses in the
+ * handlers.
+ */
+ atomic_store_rel_int(&ithd->it_need, 0);
+ if (priv)
+ priv_ithread_execute_handler(p, ih);
+ else
+ ithread_execute_handlers(p, ie);
+ }
+ WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ /*
+ * Processed all our interrupts. Now get the sched
+ * lock. This may take a while and it_need may get
+ * set again, so we have to check it again.
+ */
+ thread_lock(td);
+ if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) {
+ TD_SET_IWAIT(td);
+ ie->ie_count = 0;
+ mi_switch(SW_VOL, NULL);
+ }
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Main loop for interrupt filter.
+ *
+ * Some architectures (i386, amd64 and arm) require the optional frame
+ * parameter, and use it as the main argument for fast handler execution
+ * when ih_argument == NULL.
+ *
+ * Return value:
+ * o FILTER_STRAY: No filter recognized the event, and no
+ * filter-less handler is registered on this
+ * line.
+ * o FILTER_HANDLED: A filter claimed the event and served it.
+ * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at
+ * least one filter-less handler on this line.
+ * o FILTER_HANDLED |
+ * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for
+ * scheduling the per-handler ithread.
+ *
+ * In case an ithread has to be scheduled, in *ithd there will be a
+ * pointer to a struct intr_thread containing the thread to be
+ * scheduled.
+ */
+
+int
+intr_filter_loop(struct intr_event *ie, struct trapframe *frame,
+ struct intr_thread **ithd)
+{
+ struct intr_handler *ih;
+ void *arg;
+ int ret, thread_only;
+
+ ret = 0;
+ thread_only = 0;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ /*
+ * Execute fast interrupt handlers directly.
+ * To support clock handlers, if a handler registers
+ * with a NULL argument, then we pass it a pointer to
+ * a trapframe as its argument.
+ */
+ arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument);
+
+ CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__,
+ ih->ih_filter, ih->ih_handler, arg, ih->ih_name);
+
+ if (ih->ih_filter != NULL)
+ ret = ih->ih_filter(arg);
+ else {
+ thread_only = 1;
+ continue;
+ }
+
+ if (ret & FILTER_STRAY)
+ continue;
+ else {
+ *ithd = ih->ih_thread;
+ return (ret);
+ }
+ }
+
+ /*
+ * No filters handled the interrupt and we have at least
+ * one handler without a filter. In this case, we schedule
+ * all of the filter-less handlers to run in the ithread.
+ */
+ if (thread_only) {
+ *ithd = ie->ie_thread;
+ return (FILTER_SCHEDULE_THREAD);
+ }
+ return (FILTER_STRAY);
+}
+
+/*
+ * Main interrupt handling body.
+ *
+ * Input:
+ * o ie: the event connected to this interrupt.
+ * o frame: some archs (e.g. i386) pass a frame to some
+ * handlers as their main argument.
+ * Return value:
+ * o 0: everything ok.
+ * o EINVAL: stray interrupt.
+ */
+int
+intr_event_handle(struct intr_event *ie, struct trapframe *frame)
+{
+ struct intr_thread *ithd;
+ struct thread *td;
+ int thread;
+
+ ithd = NULL;
+ td = curthread;
+
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
+ return (EINVAL);
+
+ td->td_intr_nesting_level++;
+ thread = 0;
+ critical_enter();
+ thread = intr_filter_loop(ie, frame, &ithd);
+
+ /*
+ * If the interrupt was fully served, send it an EOI but leave
+ * it unmasked. Otherwise, mask the source as well as sending
+ * it an EOI.
+ */
+ if (thread & FILTER_HANDLED) {
+ if (ie->ie_eoi != NULL)
+ ie->ie_eoi(ie->ie_source);
+ } else {
+ if (ie->ie_disab != NULL)
+ ie->ie_disab(ie->ie_source);
+ }
+ critical_exit();
+
+ /* Interrupt storm logic */
+ if (thread & FILTER_STRAY) {
+ ie->ie_count++;
+ if (ie->ie_count < intr_storm_threshold)
+ printf("Interrupt stray detection not present\n");
+ }
+
+ /* Schedule an ithread if needed. */
+ if (thread & FILTER_SCHEDULE_THREAD) {
+ if (intr_event_schedule_thread(ie, ithd) != 0)
+ panic("%s: impossible stray interrupt", __func__);
+ }
+ td->td_intr_nesting_level--;
+ return (0);
+}
+#endif
#ifdef DDB
/*
@@ -829,14 +1389,10 @@
db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
db_printf("(%p)", ih->ih_argument);
if (ih->ih_need ||
- (ih->ih_flags & (IH_FAST | IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
+ (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
IH_MPSAFE)) != 0) {
db_printf(" {");
comma = 0;
- if (ih->ih_flags & IH_FAST) {
- db_printf("FAST");
- comma = 1;
- }
if (ih->ih_flags & IH_EXCLUSIVE) {
if (comma)
db_printf(", ");
@@ -927,16 +1483,16 @@
DB_SHOW_COMMAND(intr, db_show_intr)
{
struct intr_event *ie;
- int quit, all, verbose;
+ int all, verbose;
- quit = 0;
verbose = index(modif, 'v') != NULL;
all = index(modif, 'a') != NULL;
- db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
TAILQ_FOREACH(ie, &event_list, ie_list) {
if (!all && TAILQ_EMPTY(&ie->ie_handlers))
continue;
db_dump_intr_event(ie, verbose);
+ if (db_pager_quit)
+ break;
}
}
#endif /* DDB */
@@ -998,11 +1554,9 @@
{
u_long *i;
char *cp;
- int quit;
cp = intrnames;
- db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
- for (i = intrcnt, quit = 0; i != eintrcnt && !quit; i++) {
+ for (i = intrcnt; i != eintrcnt && !db_pager_quit; i++) {
if (*cp == '\0')
break;
if (*i != 0)
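
The INTR_FILTER comments in kern_intr.c above describe how a filter runs in
primary interrupt context and reports FILTER_STRAY, FILTER_HANDLED and/or
FILTER_SCHEDULE_THREAD back to intr_filter_loop(). For context, here is a
hedged sketch of how a driver might register a filter/handler pair through
bus_setup_intr(9); the mydev softc, register offset and status bit are
invented, so treat it as an outline rather than code from this commit:

    #include <sys/param.h>
    #include <sys/bus.h>
    #include <sys/rman.h>
    #include <machine/bus.h>
    #include <machine/resource.h>

    struct mydev_softc {
            struct resource *mem_res;      /* memory-mapped registers */
            struct resource *irq_res;
            void            *intr_cookie;
    };

    #define MYDEV_ISR       0x10           /* hypothetical status register */
    #define MYDEV_ISR_PEND  0x01

    static int
    mydev_filter(void *arg)
    {
            struct mydev_softc *sc = arg;

            /* Primary interrupt context: no sleeping, keep it short. */
            if ((bus_read_4(sc->mem_res, MYDEV_ISR) & MYDEV_ISR_PEND) == 0)
                    return (FILTER_STRAY);         /* not our interrupt */
            /* Ack/mask the source here, then hand off to the ithread. */
            return (FILTER_HANDLED | FILTER_SCHEDULE_THREAD);
    }

    static void
    mydev_intr(void *arg)
    {
            struct mydev_softc *sc = arg;

            /* The heavy lifting runs in the per-handler ithread. */
            (void)sc;
    }

    static int
    mydev_setup_intr(device_t dev, struct mydev_softc *sc)
    {

            return (bus_setup_intr(dev, sc->irq_res,
                INTR_TYPE_MISC | INTR_MPSAFE, mydev_filter, mydev_intr,
                sc, &sc->intr_cookie));
    }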
Index: vfs_bio.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -L sys/kern/vfs_bio.c -L sys/kern/vfs_bio.c -u -r1.6 -r1.7
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_bio.c,v 1.491.2.7 2006/03/13 03:06:09 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_bio.c,v 1.528 2007/09/26 11:22:23 ru Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,6 +48,7 @@
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
@@ -71,7 +72,7 @@
#include "opt_directio.h"
#include "opt_swap.h"
-static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
+static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
struct bio_ops bioops; /* I/O operation notification */
@@ -80,6 +81,7 @@
.bop_write = bufwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
};
/*
@@ -99,10 +101,11 @@
int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf *bp);
static void vfs_setdirty(struct buf *bp);
+static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int flushdeps);
+static int flushbufqueues(int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
@@ -145,10 +148,13 @@
static int hirunningspace;
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
"Maximum amount of space to use for in-progress I/O");
-static int dirtybufferflushes;
+int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
-static int altbufferflushes;
+int bdwriteskip;
+SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
+ 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
+int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
@@ -163,7 +169,7 @@
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
"When the number of dirty buffers is considered severe");
-static int dirtybufthresh;
+int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
@@ -237,15 +243,21 @@
static struct mtx bdonelock;
/*
+ * Lock that protects against bwait()/bdone()/B_DONE races.
+ */
+static struct mtx bpinlock;
+
+/*
* Definitions for the buffer free lists.
*/
-#define BUFFER_QUEUES 5 /* number of free buffer queues */
+#define BUFFER_QUEUES 6 /* number of free buffer queues */
#define QUEUE_NONE 0 /* on no queue */
#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
-#define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */
-#define QUEUE_EMPTY 4 /* empty buffer headers */
+#define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */
+#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY 5 /* empty buffer headers */
/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
@@ -408,7 +420,7 @@
}
}
-/* Wake up the buffer deamon if necessary */
+/* Wake up the buffer daemon if necessary */
static __inline
void
bd_wakeup(int dirtybuflevel)
@@ -443,6 +455,7 @@
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
+ int maxbuf;
/*
* physmem_est is in pages. Convert it to kilobytes (assumes
@@ -454,7 +467,7 @@
* The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
* For the first 64MB of ram nominally allocate sufficient buffers to
* cover 1/4 of our ram. Beyond the first 64MB allocate additional
- * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
+ * buffers to cover 1/10 of our ram over 64MB. When auto-sizing
* the buffer cache we limit the eventual kva reservation to
* maxbcache bytes.
*
@@ -472,6 +485,11 @@
if (maxbcache && nbuf > maxbcache / BKVASIZE)
nbuf = maxbcache / BKVASIZE;
+
+ /* XXX Avoid integer overflows later on with maxbufspace. */
+ maxbuf = (INT_MAX / 3) / BKVASIZE;
+ if (nbuf > maxbuf)
+ nbuf = maxbuf;
}
#if 0
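
The maxbuf clamp added above protects the later buffer-space arithmetic, which multiplies nbuf by BKVASIZE (with some headroom, hence the factor of 3). A quick userspace sketch, assuming the usual BKVASIZE of 16384, shows how an auto-sized nbuf on a large machine could otherwise push that product past INT_MAX:

/*
 * Userspace sketch only: why nbuf is clamped to (INT_MAX / 3) / BKVASIZE.
 * BKVASIZE = 16384 is an assumption (the common default); the factor of 3
 * mirrors the clamp in kern_vfs_bio_buffer_alloc() above.
 */
#include <limits.h>
#include <stdio.h>

#define BKVASIZE	16384

int
main(void)
{
	long nbuf = 500000;	/* hypothetical auto-sized value on a big box */
	long maxbuf = (INT_MAX / 3) / BKVASIZE;

	/* Without the clamp, products like nbuf * BKVASIZE * 3 overflow int. */
	printf("nbuf * BKVASIZE * 3 = %lld (INT_MAX = %d)\n",
	    (long long)nbuf * BKVASIZE * 3, INT_MAX);

	if (nbuf > maxbuf)
		nbuf = maxbuf;
	printf("clamped nbuf = %ld, product = %lld\n",
	    nbuf, (long long)nbuf * BKVASIZE * 3);
	return (0);
}
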
@@ -523,6 +541,7 @@
mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
+ mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@@ -621,12 +640,8 @@
if (bp->b_kvasize) {
atomic_add_int(&buffreekvacnt, 1);
atomic_subtract_int(&bufspace, bp->b_kvasize);
- vm_map_lock(buffer_map);
- vm_map_delete(buffer_map,
- (vm_offset_t) bp->b_kvabase,
- (vm_offset_t) bp->b_kvabase + bp->b_kvasize
- );
- vm_map_unlock(buffer_map);
+ vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
+ (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
bp->b_kvasize = 0;
bufspacewakeup();
}
@@ -720,18 +735,51 @@
}
/*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+ int cnt, struct ucred * cred)
+{
+ struct buf *rabp;
+ int i;
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ rabp->b_iooffset = dbtob(rabp->b_blkno);
+ bstrategy(rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+}
+
+/*
* Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior
- * to initiating I/O . If B_CACHE is set, the buffer is valid
- * and we do not have to do anything.
+ * read-ahead blocks.
*/
int
breadn(struct vnode * vp, daddr_t blkno, int size,
daddr_t * rablkno, int *rabsize,
int cnt, struct ucred * cred, struct buf **bpp)
{
- struct buf *bp, *rabp;
- int i;
+ struct buf *bp;
int rv = 0, readwait = 0;
CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
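
The read-ahead logic split out into breada() above fires asynchronous reads (B_ASYNC) for caller-supplied blocks that are not already in memory, so a later bread() of those blocks hits the cache. A rough userland analogue of the same pattern, purely as a sketch (names and block size are illustrative, and the prefetch here is synchronous rather than async):

/*
 * Userland sketch of the read-ahead idea behind breada(): after reading
 * block lbn, also touch the next few blocks so later reads hit the cache.
 * Plain pread() is used; the kernel version issues the I/O asynchronously.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define BLKSIZE	16384

static void
read_with_readahead(int fd, off_t lbn, char *buf, int nahead)
{
	char scratch[BLKSIZE];
	int i;

	/* The block the caller actually wants. */
	if (pread(fd, buf, BLKSIZE, lbn * BLKSIZE) == -1)
		perror("pread");

	/* Read-ahead: warm the following blocks and discard the data. */
	for (i = 1; i <= nahead; i++)
		(void)pread(fd, scratch, BLKSIZE, (lbn + i) * BLKSIZE);
}

int
main(int argc, char **argv)
{
	char buf[BLKSIZE];
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) == -1) {
		fprintf(stderr, "usage: readahead file\n");
		return (1);
	}
	read_with_readahead(fd, 0, buf, 2);
	close(fd);
	return (0);
}
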
@@ -739,8 +787,8 @@
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (curthread != PCPU_GET(idlethread))
- curthread->td_proc->p_stats->p_ru.ru_inblock++;
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_inblock++;
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
@@ -752,28 +800,7 @@
++readwait;
}
- for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
- if (inmem(vp, *rablkno))
- continue;
- rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
-
- if ((rabp->b_flags & B_CACHE) == 0) {
- if (curthread != PCPU_GET(idlethread))
- curthread->td_proc->p_stats->p_ru.ru_inblock++;
- rabp->b_flags |= B_ASYNC;
- rabp->b_flags &= ~B_INVAL;
- rabp->b_ioflags &= ~BIO_ERROR;
- rabp->b_iocmd = BIO_READ;
- if (rabp->b_rcred == NOCRED && cred != NOCRED)
- rabp->b_rcred = crhold(cred);
- vfs_busy_pages(rabp, 0);
- BUF_KERNPROC(rabp);
- rabp->b_iooffset = dbtob(rabp->b_blkno);
- bstrategy(rabp);
- } else {
- brelse(rabp);
- }
- }
+ breada(vp, rablkno, rabsize, cnt, cred);
if (readwait) {
rv = bufwait(bp);
@@ -796,6 +823,8 @@
bufwrite(struct buf *bp)
{
int oldflags;
+ struct vnode *vp;
+ int vp_md;
CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
if (bp->b_flags & B_INVAL) {
@@ -807,9 +836,19 @@
if (BUF_REFCNT(bp) == 0)
panic("bufwrite: buffer is not busy???");
+
+ if (bp->b_pin_count > 0)
+ bunpin_wait(bp);
+
KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
("FFS background buffer should not get here %p", bp));
+ vp = bp->b_vp;
+ if (vp)
+ vp_md = vp->v_vflag & VV_MD;
+ else
+ vp_md = 0;
+
/* Mark the buffer clean */
bundirty(bp);
@@ -827,8 +866,8 @@
bp->b_runningbufspace = bp->b_bufsize;
atomic_add_int(&runningbufspace, bp->b_runningbufspace);
- if (curthread != PCPU_GET(idlethread))
- curthread->td_proc->p_stats->p_ru.ru_oublock++;
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_oublock++;
if (oldflags & B_ASYNC)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
@@ -847,13 +886,54 @@
* or syncer daemon trying to clean up as that can lead
* to deadlock.
*/
- if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0)
+ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
waitrunningbufspace();
}
return (0);
}
+void
+bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+ struct buf *nbp;
+
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
+ (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
+ altbufferflushes++;
+ } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /* Don't countdeps with the bo lock held. */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
+ }
+}
+
/*
* Delayed write. (Buffer is marked dirty). Do not bother writing
* anything if the buffer is marked invalid.
@@ -868,7 +948,6 @@
{
struct thread *td = curthread;
struct vnode *vp;
- struct buf *nbp;
struct bufobj *bo;
CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -889,43 +968,10 @@
*/
vp = bp->b_vp;
bo = bp->b_bufobj;
- if ((td->td_pflags & TDP_COWINPROGRESS) == 0) {
- BO_LOCK(bo);
- if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
- BO_UNLOCK(bo);
- (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
- altbufferflushes++;
- } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
- /*
- * Try to find a buffer to flush.
- */
- TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
- if ((nbp->b_vflags & BV_BKGRDINPROG) ||
- BUF_LOCK(nbp,
- LK_EXCLUSIVE | LK_NOWAIT, NULL))
- continue;
- if (bp == nbp)
- panic("bdwrite: found ourselves");
- BO_UNLOCK(bo);
- /* Don't countdeps with the bo lock held. */
- if (buf_countdeps(nbp, 0)) {
- BO_LOCK(bo);
- BUF_UNLOCK(nbp);
- continue;
- }
- if (nbp->b_flags & B_CLUSTEROK) {
- vfs_bio_awrite(nbp);
- } else {
- bremfree(nbp);
- bawrite(nbp);
- }
- dirtybufferflushes++;
- break;
- }
- if (nbp == NULL)
- BO_UNLOCK(bo);
- } else
- BO_UNLOCK(bo);
+ if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
+ td->td_pflags |= TDP_INBDFLUSH;
+ BO_BDFLUSH(bo, bp);
+ td->td_pflags &= ~TDP_INBDFLUSH;
} else
recursiveflushes++;
@@ -1117,6 +1163,11 @@
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+ if (bp->b_flags & B_MANAGED) {
+ bqrelse(bp);
+ return;
+ }
+
if (bp->b_iocmd == BIO_WRITE &&
(bp->b_ioflags & BIO_ERROR) &&
!(bp->b_flags & B_INVAL)) {
@@ -1136,7 +1187,7 @@
* cache the buffer.
*/
bp->b_flags |= B_INVAL;
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
if (bp->b_flags & B_DELWRI) {
atomic_subtract_int(&numdirtybuffers, 1);
@@ -1329,6 +1380,9 @@
TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
/* remaining buffers */
} else {
+ if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
+ (B_DELWRI|B_NEEDSGIANT))
+ bp->b_qindex = QUEUE_DIRTY_GIANT;
if (bp->b_flags & B_DELWRI)
bp->b_qindex = QUEUE_DIRTY;
else
@@ -1399,6 +1453,18 @@
BUF_UNLOCK(bp);
return;
}
+
+ if (bp->b_flags & B_MANAGED) {
+ if (bp->b_flags & B_REMFREE) {
+ mtx_lock(&bqlock);
+ bremfreel(bp);
+ mtx_unlock(&bqlock);
+ }
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ BUF_UNLOCK(bp);
+ return;
+ }
+
mtx_lock(&bqlock);
/* Handle delayed bremfree() processing. */
if (bp->b_flags & B_REMFREE)
@@ -1407,8 +1473,11 @@
panic("bqrelse: free buffer onto another queue???");
/* buffers with stale but valid contents */
if (bp->b_flags & B_DELWRI) {
- bp->b_qindex = QUEUE_DIRTY;
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+ if (bp->b_flags & B_NEEDSGIANT)
+ bp->b_qindex = QUEUE_DIRTY_GIANT;
+ else
+ bp->b_qindex = QUEUE_DIRTY;
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
} else {
/*
* XXX This lock may not be necessary since BKGRDINPROG
@@ -1473,7 +1542,7 @@
* the responsibility of the process that
* busied the pages to deal with them.
*/
- if ((m->flags & PG_BUSY) || (m->busy != 0))
+ if ((m->oflags & VPO_BUSY) || (m->busy != 0))
continue;
if (m->wire_count == 0) {
@@ -1484,7 +1553,6 @@
*/
if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
m->hold_count == 0) {
- pmap_remove_all(m);
vm_page_free(m);
} else if (bp->b_flags & B_DIRECT) {
vm_page_try_to_free(m);
@@ -1798,7 +1866,7 @@
crfree(bp->b_wcred);
bp->b_wcred = NOCRED;
}
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 3");
@@ -1826,6 +1894,10 @@
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
LIST_INIT(&bp->b_dep);
@@ -1841,6 +1913,17 @@
}
/*
+ * Notify any waiters for the buffer lock about
+ * identity change by freeing the buffer.
+ */
+ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+
+ /*
* If we are overcomitted then recover the buffer and its
* KVM space. This occurs in rare situations when multiple
* processes are blocked in getnewbuf() or allocbuf().
@@ -1959,7 +2042,6 @@
static void
buf_daemon()
{
- mtx_lock(&Giant);
/*
* This process needs to be suspended prior to shutdown sync.
@@ -1985,13 +2067,28 @@
* normally would so they can run in parallel with our drain.
*/
while (numdirtybuffers > lodirtybuffers) {
- if (flushbufqueues(0) == 0) {
+ int flushed;
+
+ flushed = flushbufqueues(QUEUE_DIRTY, 0);
+ /* The list empty check here is slightly racy */
+ if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+ mtx_lock(&Giant);
+ flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
+ mtx_unlock(&Giant);
+ }
+ if (flushed == 0) {
/*
* Could not find any buffers without rollback
* dependencies, so just write the first one
* in the hopes of eventually making progress.
*/
- flushbufqueues(1);
+ flushbufqueues(QUEUE_DIRTY, 1);
+ if (!TAILQ_EMPTY(
+ &bufqueues[QUEUE_DIRTY_GIANT])) {
+ mtx_lock(&Giant);
+ flushbufqueues(QUEUE_DIRTY_GIANT, 1);
+ mtx_unlock(&Giant);
+ }
break;
}
uio_yield();
@@ -2039,7 +2136,7 @@
0, "Number of buffers flushed with dependecies that require rollbacks");
static int
-flushbufqueues(int flushdeps)
+flushbufqueues(int queue, int flushdeps)
{
struct thread *td = curthread;
struct buf sentinel;
@@ -2056,16 +2153,20 @@
flushed = 0;
bp = NULL;
mtx_lock(&bqlock);
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], &sentinel, b_freelist);
+ TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
while (flushed != target) {
- bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
+ bp = TAILQ_FIRST(&bufqueues[queue]);
if (bp == &sentinel)
break;
- TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+ TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
continue;
+ if (bp->b_pin_count > 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
BO_LOCK(bp->b_bufobj);
if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
(bp->b_flags & B_DELWRI) == 0) {
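
flushbufqueues() now takes the queue index as an argument but keeps the same sentinel scan: a dummy buf is appended to the queue, each examined buffer is rotated to the tail, and the walk stops when the sentinel comes back to the head. A small userspace sketch of that rotation with <sys/queue.h> (names are illustrative):

/*
 * Userspace sketch of the sentinel-rotation scan used by flushbufqueues():
 * append a sentinel, repeatedly take the head, rotate it to the tail, and
 * stop once the sentinel reaches the head again.
 */
#include <sys/queue.h>
#include <stdio.h>

struct entry {
	int id;				/* -1 marks the sentinel */
	TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(entryq, entry);

int
main(void)
{
	struct entryq q = TAILQ_HEAD_INITIALIZER(q);
	struct entry items[4] = { { 0 }, { 1 }, { 2 }, { 3 } };
	struct entry sentinel = { -1 }, *e;
	int i;

	for (i = 0; i < 4; i++)
		TAILQ_INSERT_TAIL(&q, &items[i], link);

	TAILQ_INSERT_TAIL(&q, &sentinel, link);
	for (;;) {
		e = TAILQ_FIRST(&q);
		if (e == &sentinel)
			break;
		/* Rotate to the tail, mirroring the TAILQ_REMOVE/INSERT pair. */
		TAILQ_REMOVE(&q, e, link);
		TAILQ_INSERT_TAIL(&q, e, link);
		printf("visited %d\n", e->id);	/* a flush candidate in the kernel */
	}
	TAILQ_REMOVE(&q, &sentinel, link);
	return (0);
}
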
@@ -2084,7 +2185,7 @@
continue;
}
- if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) {
+ if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
if (flushdeps == 0) {
BUF_UNLOCK(bp);
continue;
@@ -2124,7 +2225,7 @@
vn_finished_write(mp);
BUF_UNLOCK(bp);
}
- TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], &sentinel, b_freelist);
+ TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
mtx_unlock(&bqlock);
return (flushed);
}
@@ -2206,8 +2307,6 @@
static void
vfs_setdirty(struct buf *bp)
{
- int i;
- vm_object_t object;
/*
* Degenerate case - empty buffer
@@ -2218,20 +2317,25 @@
/*
* We qualify the scan for modified pages on whether the
- * object has been flushed yet. The OBJ_WRITEABLE flag
- * is not cleared simply by protecting pages off.
+ * object has been flushed yet.
*/
if ((bp->b_flags & B_VMIO) == 0)
return;
- object = bp->b_pages[0]->object;
- VM_OBJECT_LOCK(object);
- if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
- printf("Warning: object %p writeable but not mightbedirty\n", object);
- if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
- printf("Warning: object %p mightbedirty but not writeable\n", object);
+ VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ vfs_setdirty_locked_object(bp);
+ VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+}
+static void
+vfs_setdirty_locked_object(struct buf *bp)
+{
+ vm_object_t object;
+ int i;
+
+ object = bp->b_bufobj->bo_object;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
vm_offset_t boffset;
vm_offset_t eoffset;
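
The split into vfs_setdirty() and vfs_setdirty_locked_object() follows the usual foo()/foo_locked() convention: the wrapper takes the VM object lock, while the _locked variant only asserts it, so callers such as vfs_busy_pages() that already hold the lock can call it directly. A hedged userspace sketch of the same convention, with pthreads and a flag standing in for the object lock and its MA_OWNED assertion:

/*
 * Userspace sketch of the foo()/foo_locked() convention used above; the
 * names are illustrative only.
 */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
static int obj_locked;

static void
setdirty_locked(void)
{
	assert(obj_locked);		/* VM_OBJECT_LOCK_ASSERT(object, MA_OWNED) */
	/* ...scan pages and grow the buffer's dirty range... */
}

static void
setdirty(void)
{
	pthread_mutex_lock(&obj_lock);	/* VM_OBJECT_LOCK(object) */
	obj_locked = 1;
	setdirty_locked();
	obj_locked = 0;
	pthread_mutex_unlock(&obj_lock);
}

int
main(void)
{
	setdirty();			/* unlocked callers use the wrapper */
	printf("ok\n");
	return (0);
}
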
@@ -2282,7 +2386,6 @@
bp->b_dirtyend = eoffset;
}
}
- VM_OBJECT_UNLOCK(object);
}
/*
@@ -2347,14 +2450,14 @@
* XXX remove if 0 sections (clean this up after its proven)
*/
if (numfreebuffers == 0) {
- if (curthread == PCPU_GET(idlethread))
+ if (TD_IS_IDLETHREAD(curthread))
return NULL;
mtx_lock(&nblock);
needsbuffer |= VFS_BIO_NEED_ANY;
mtx_unlock(&nblock);
}
- VI_LOCK(vp);
+ BO_LOCK(bo);
bp = gbincore(bo, blkno);
if (bp != NULL) {
int lockflags;
@@ -2400,10 +2503,23 @@
if ((bp->b_flags & B_VMIO) == 0 ||
(size > bp->b_kvasize)) {
if (bp->b_flags & B_DELWRI) {
+ /*
+ * If the buffer is pinned and the caller does
+ * not want to sleep waiting for it to be
+ * unpinned, bail out.
+ */
+ if (bp->b_pin_count > 0) {
+ if (flags & GB_LOCK_NOWAIT) {
+ bqrelse(bp);
+ return (NULL);
+ } else {
+ bunpin_wait(bp);
+ }
+ }
bp->b_flags |= B_NOCACHE;
bwrite(bp);
} else {
- if (LIST_FIRST(&bp->b_dep) == NULL) {
+ if (LIST_EMPTY(&bp->b_dep)) {
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
@@ -2470,7 +2586,7 @@
* returned by getnewbuf() is locked. Note that the returned
* buffer is also considered valid (not marked B_INVAL).
*/
- VI_UNLOCK(vp);
+ BO_UNLOCK(bo);
/*
* If the user does not want us to create the buffer, bail out
* here.
@@ -2516,7 +2632,6 @@
*/
bp->b_blkno = bp->b_lblkno = blkno;
bp->b_offset = offset;
-
bgetvp(vp, bp);
BO_UNLOCK(bo);
@@ -2783,7 +2898,8 @@
VM_WAIT;
VM_OBJECT_LOCK(obj);
} else {
- bp->b_flags &= ~B_CACHE;
+ if (m->valid == 0)
+ bp->b_flags &= ~B_CACHE;
bp->b_pages[bp->b_npages] = m;
++bp->b_npages;
}
@@ -2795,26 +2911,19 @@
* retry because it might have gotten freed out
* from under us.
*
- * We can only test PG_BUSY here. Blocking on
+ * We can only test VPO_BUSY here. Blocking on
* m->busy might lead to a deadlock:
*
* vm_fault->getpages->cluster_read->allocbuf
*
*/
- vm_page_lock_queues();
if (vm_page_sleep_if_busy(m, FALSE, "pgtblk"))
continue;
/*
- * We have a good page. Should we wakeup the
- * page daemon?
+ * We have a good page.
*/
- if ((curproc != pageproc) &&
- ((m->queue - m->pc) == PQ_CACHE) &&
- ((cnt.v_free_count + cnt.v_cache_count) <
- (cnt.v_free_min + cnt.v_cache_min))) {
- pagedaemon_wakeup();
- }
+ vm_page_lock_queues();
vm_page_wire(m);
vm_page_unlock_queues();
bp->b_pages[bp->b_npages] = m;
@@ -3041,11 +3150,11 @@
struct bufobj *dropobj;
void (*biodone)(struct buf *);
-
CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
dropobj = NULL;
- KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+ BUF_REFCNT(bp)));
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
runningbufwakeup(bp);
@@ -3060,7 +3169,20 @@
bufobj_wdrop(dropobj);
return;
}
- if (LIST_FIRST(&bp->b_dep) != NULL)
+
+ bufdone_finish(bp);
+
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+}
+
+void
+bufdone_finish(struct buf *bp)
+{
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+ BUF_REFCNT(bp)));
+
+ if (!LIST_EMPTY(&bp->b_dep))
buf_complete(bp);
if (bp->b_flags & B_VMIO) {
@@ -3070,6 +3192,7 @@
vm_object_t obj;
int iosize;
struct vnode *vp = bp->b_vp;
+ boolean_t are_queues_locked;
obj = bp->b_bufobj->bo_object;
@@ -3106,7 +3229,11 @@
!(bp->b_ioflags & BIO_ERROR)) {
bp->b_flags |= B_CACHE;
}
- vm_page_lock_queues();
+ if (bp->b_iocmd == BIO_READ) {
+ vm_page_lock_queues();
+ are_queues_locked = TRUE;
+ } else
+ are_queues_locked = FALSE;
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
int resid;
@@ -3125,7 +3252,8 @@
if (m == NULL)
panic("biodone: page disappeared!");
bp->b_pages[i] = m;
- pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
}
#if defined(VFS_BIO_DEBUG)
if (OFF_TO_IDX(foff) != m->pindex) {
@@ -3174,7 +3302,8 @@
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
- vm_page_unlock_queues();
+ if (are_queues_locked)
+ vm_page_unlock_queues();
vm_object_pip_wakeupn(obj, 0);
VM_OBJECT_UNLOCK(obj);
}
@@ -3192,8 +3321,6 @@
bqrelse(bp);
} else
bdone(bp);
- if (dropobj)
- bufobj_wdrop(dropobj);
}
/*
@@ -3214,7 +3341,6 @@
obj = bp->b_bufobj->bo_object;
VM_OBJECT_LOCK(obj);
- vm_page_lock_queues();
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
if (m == bogus_page) {
@@ -3228,7 +3354,6 @@
vm_object_pip_subtract(obj, 1);
vm_page_io_finish(m);
}
- vm_page_unlock_queues();
vm_object_pip_wakeupn(obj, 0);
VM_OBJECT_UNLOCK(obj);
}
@@ -3275,7 +3400,7 @@
* This routine is called before a device strategy routine.
* It is used to tell the VM system that paging I/O is in
* progress, and treat the pages associated with the buffer
- * almost as being PG_BUSY. Also the object paging_in_progress
+ * almost as being VPO_BUSY. Also the object paging_in_progress
* flag is handled to make sure that the object doesn't become
* inconsistant.
*
@@ -3298,10 +3423,10 @@
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_busy_pages: no buffer offset"));
- vfs_setdirty(bp);
VM_OBJECT_LOCK(obj);
+ if (bp->b_bufsize != 0)
+ vfs_setdirty_locked_object(bp);
retry:
- vm_page_lock_queues();
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
@@ -3309,6 +3434,7 @@
goto retry;
}
bogus = 0;
+ vm_page_lock_queues();
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
@@ -3749,6 +3875,32 @@
return (error);
}
+void
+bpin(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ bp->b_pin_count++;
+ mtx_unlock(&bpinlock);
+}
+
+void
+bunpin(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ if (--bp->b_pin_count == 0)
+ wakeup(bp);
+ mtx_unlock(&bpinlock);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ while (bp->b_pin_count > 0)
+ msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
+ mtx_unlock(&bpinlock);
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
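
The bpin()/bunpin()/bunpin_wait() additions above are a small counter protected by bpinlock, with waiters sleeping via msleep() until the count drops to zero and bunpin() issuing the wakeup(). A userland sketch of the same pattern with pthreads, a condition variable standing in for the sleep channel:

/*
 * Userspace sketch of the buffer pin counter: pin/unpin bump a counter under
 * a mutex, and unpin_wait sleeps until it reaches zero, as bunpin_wait()
 * does with msleep()/wakeup() on the buffer address.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pinlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pincond = PTHREAD_COND_INITIALIZER;
static int pin_count;

static void
pin(void)
{
	pthread_mutex_lock(&pinlock);
	pin_count++;
	pthread_mutex_unlock(&pinlock);
}

static void
unpin(void)
{
	pthread_mutex_lock(&pinlock);
	if (--pin_count == 0)
		pthread_cond_broadcast(&pincond);	/* wakeup(bp) */
	pthread_mutex_unlock(&pinlock);
}

static void
unpin_wait(void)
{
	pthread_mutex_lock(&pinlock);
	while (pin_count > 0)
		pthread_cond_wait(&pincond, &pinlock);	/* msleep(bp, &bpinlock, ...) */
	pthread_mutex_unlock(&pinlock);
}

int
main(void)
{
	pin();
	unpin();
	unpin_wait();		/* returns at once, the count is back to zero */
	printf("pin count drained\n");
	return (0);
}
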
Index: subr_sbuf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_sbuf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_sbuf.c -L sys/kern/subr_sbuf.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_sbuf.c
+++ sys/kern/subr_sbuf.c
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_sbuf.c,v 1.29 2005/02/10 12:02:37 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_sbuf.c,v 1.30 2005/12/23 11:49:53 phk Exp $");
#include <sys/param.h>
@@ -379,7 +379,7 @@
return (-1); /* XXX */
}
- return (0);
+ return (done);
}
#endif
Index: uipc_socket.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/uipc_socket.c -L sys/kern/uipc_socket.c -u -r1.2 -r1.3
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1,8 +1,9 @@
/*-
- * Copyright (c) 2004 The FreeBSD Foundation
- * Copyright (c) 2004-2005 Robert N. M. Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
- * The Regents of the University of California. All rights reserved.
+ * The Regents of the University of California.
+ * Copyright (c) 2004 The FreeBSD Foundation
+ * Copyright (c) 2004-2007 Robert N. M. Watson
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,8 +32,70 @@
* @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
*/
+/*
+ * Comments on the socket life cycle:
+ *
+ * soalloc() sets of socket layer state for a socket, called only by
+ * socreate() and sonewconn(). Socket layer private.
+ *
+ * sodealloc() tears down socket layer state for a socket, called only by
+ * sofree() and sonewconn(). Socket layer private.
+ *
+ * pru_attach() associates protocol layer state with an allocated socket;
+ * called only once, may fail, aborting socket allocation. This is called
+ * from socreate() and sonewconn(). Socket layer private.
+ *
+ * pru_detach() disassociates protocol layer state from an attached socket,
+ * and will be called exactly once for sockets in which pru_attach() has
+ * been successfully called. If pru_attach() returned an error,
+ * pru_detach() will not be called. Socket layer private.
+ *
+ * pru_abort() and pru_close() notify the protocol layer that the last
+ * consumer of a socket is starting to tear down the socket, and that the
+ * protocol should terminate the connection. Historically, pru_abort() also
+ * detached protocol state from the socket state, but this is no longer the
+ * case.
+ *
+ * socreate() creates a socket and attaches protocol state. This is a public
+ * interface that may be used by socket layer consumers to create new
+ * sockets.
+ *
+ * sonewconn() creates a socket and attaches protocol state. This is a
+ * public interface that may be used by protocols to create new sockets when
+ * a new connection is received and will be available for accept() on a
+ * listen socket.
+ *
+ * soclose() destroys a socket after possibly waiting for it to disconnect.
+ * This is a public interface that socket consumers should use to close and
+ * release a socket when done with it.
+ *
+ * soabort() destroys a socket without waiting for it to disconnect (used
+ * only for incoming connections that are already partially or fully
+ * connected). This is used internally by the socket layer when clearing
+ * listen socket queues (due to overflow or close on the listen socket), but
+ * is also a public interface protocols may use to abort connections in
+ * their incomplete listen queues should they no longer be required. Sockets
+ * placed in completed connection listen queues should not be aborted for
+ * reasons described in the comment above the soclose() implementation. This
+ * is not a general purpose close routine, and except in the specific
+ * circumstances described here, should not be used.
+ *
+ * sofree() will free a socket and its protocol state if all references on
+ * the socket have been released, and is the public interface to attempt to
+ * free a socket when a reference is removed. This is a socket layer private
+ * interface.
+ *
+ * NOTE: In addition to socreate() and soclose(), which provide a single
+ * socket reference to the consumer to be managed as required, there are two
+ * calls to explicitly manage socket references, soref(), and sorele().
+ * Currently, these are generally required only when transitioning a socket
+ * from a listen queue to a file descriptor, in order to prevent garbage
+ * collection of the socket at an untimely moment. For a number of reasons,
+ * these interfaces are not preferred, and should be avoided.
+ */
+
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.242.2.4 2005/12/28 18:05:13 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.302.4.1 2008/02/02 12:44:13 rwatson Exp $");
#include "opt_inet.h"
#include "opt_mac.h"
@@ -52,6 +115,7 @@
#include <sys/file.h> /* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
+#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
@@ -59,10 +123,14 @@
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/uma.h>
#ifdef COMPAT_IA32
@@ -91,16 +159,16 @@
uma_zone_t socket_zone;
so_gen_t so_gencnt; /* generation count for sockets */
+int maxsockets;
+
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
-SYSCTL_DECL(_kern_ipc);
-
static int somaxconn = SOMAXCONN;
-static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
+static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
/* XXX: we dont have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
- 0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
+ 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
"queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
@@ -132,57 +200,135 @@
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
/*
- * Socket operation routines.
- * These routines are called by the routines in
- * sys_socket.c or from a system process, and
- * implement the semantics of socket operations by
- * switching out to the protocol specific routines.
+ * General IPC sysctl name space, used by sockets and a variety of other IPC
+ * types.
*/
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
/*
- * Get a socket structure from our zone, and initialize it.
- * Note that it would probably be better to allocate socket
- * and PCB at the same time, but I'm not convinced that all
- * the protocols can be easily modified to do this.
+ * Sysctl to get and set the maximum global sockets limit. Notify protocols
+ * of the change so that they can update their dependent limits as required.
+ */
+static int
+sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
+{
+ int error, newmaxsockets;
+
+ newmaxsockets = maxsockets;
+ error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newmaxsockets > maxsockets) {
+ maxsockets = newmaxsockets;
+ if (maxsockets > ((maxfiles / 4) * 3)) {
+ maxfiles = (maxsockets * 5) / 4;
+ maxfilesperproc = (maxfiles * 9) / 10;
+ }
+ EVENTHANDLER_INVOKE(maxsockets_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
+ &maxsockets, 0, sysctl_maxsockets, "IU",
+ "Maximum number of sockets avaliable");
+
+/*
+ * Initialise maxsockets.
+ */
+static void init_maxsockets(void *ignored)
+{
+ TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
+ maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
+
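+
+The handler above only lets kern.ipc.maxsockets grow, bumping maxfiles along
+with it, and init_maxsockets() seeds the value from the loader tunable. The
+limit can be read (or raised, with privilege) from userland; a small sketch
+using sysctlbyname(3):
+
+	/*
+	 * Userland sketch: read kern.ipc.maxsockets via sysctlbyname(3).
+	 * Writing a larger value (as root) goes through sysctl_maxsockets()
+	 * above; smaller values are rejected with EINVAL.
+	 */
+	#include <sys/types.h>
+	#include <sys/sysctl.h>
+	#include <stdio.h>
+
+	int
+	main(void)
+	{
+		int maxsockets;
+		size_t len = sizeof(maxsockets);
+
+		if (sysctlbyname("kern.ipc.maxsockets", &maxsockets, &len,
+		    NULL, 0) == -1) {
+			perror("sysctlbyname");
+			return (1);
+		}
+		printf("kern.ipc.maxsockets = %d\n", maxsockets);
+		return (0);
+	}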
+/*
+ * Socket operation routines. These routines are called by the routines in
+ * sys_socket.c or from a system process, and implement the semantics of
+ * socket operations by switching out to the protocol specific routines.
+ */
+
+/*
+ * Get a socket structure from our zone, and initialize it. Note that it
+ * would probably be better to allocate socket and PCB at the same time, but
+ * I'm not convinced that all the protocols can be easily modified to do
+ * this.
*
* soalloc() returns a socket with a ref count of 0.
*/
-struct socket *
-soalloc(int mflags)
+static struct socket *
+soalloc(void)
{
struct socket *so;
- so = uma_zalloc(socket_zone, mflags | M_ZERO);
- if (so != NULL) {
+ so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
+ if (so == NULL)
+ return (NULL);
#ifdef MAC
- if (mac_init_socket(so, mflags) != 0) {
- uma_zfree(socket_zone, so);
- return (NULL);
- }
-#endif
- SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
- SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
- TAILQ_INIT(&so->so_aiojobq);
- mtx_lock(&so_global_mtx);
- so->so_gencnt = ++so_gencnt;
- ++numopensockets;
- mtx_unlock(&so_global_mtx);
+ if (mac_init_socket(so, M_NOWAIT) != 0) {
+ uma_zfree(socket_zone, so);
+ return (NULL);
}
+#endif
+ SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
+ SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+ sx_init(&so->so_snd.sb_sx, "so_snd_sx");
+ sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
+ TAILQ_INIT(&so->so_aiojobq);
+ mtx_lock(&so_global_mtx);
+ so->so_gencnt = ++so_gencnt;
+ ++numopensockets;
+ mtx_unlock(&so_global_mtx);
return (so);
}
/*
+ * Free the storage associated with a socket at the socket layer, tear down
+ * locks, labels, etc. All protocol state is assumed already to have been
+ * torn down (and possibly never set up) by the caller.
+ */
+static void
+sodealloc(struct socket *so)
+{
+
+ KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
+ KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
+
+ mtx_lock(&so_global_mtx);
+ so->so_gencnt = ++so_gencnt;
+ --numopensockets; /* Could be below, but faster here. */
+ mtx_unlock(&so_global_mtx);
+ if (so->so_rcv.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+ if (so->so_snd.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+#ifdef INET
+ /* remove acccept filter if one is present. */
+ if (so->so_accf != NULL)
+ do_setopt_accept_filter(so, NULL);
+#endif
+#ifdef MAC
+ mac_destroy_socket(so);
+#endif
+ crfree(so->so_cred);
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ uma_zfree(socket_zone, so);
+}
+
+/*
* socreate returns a socket with a ref count of 1. The socket should be
* closed with soclose().
*/
int
-socreate(dom, aso, type, proto, cred, td)
- int dom;
- struct socket **aso;
- int type;
- int proto;
- struct ucred *cred;
- struct thread *td;
+socreate(int dom, struct socket **aso, int type, int proto,
+ struct ucred *cred, struct thread *td)
{
struct protosw *prp;
struct socket *so;
@@ -206,7 +352,7 @@
if (prp->pr_type != type)
return (EPROTOTYPE);
- so = soalloc(M_WAITOK);
+ so = soalloc();
if (so == NULL)
return (ENOBUFS);
@@ -229,55 +375,120 @@
*/
error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
if (error) {
- ACCEPT_LOCK();
- SOCK_LOCK(so);
- so->so_state |= SS_NOFDREF;
- sorele(so);
+ KASSERT(so->so_count == 1, ("socreate: so_count %d",
+ so->so_count));
+ so->so_count = 0;
+ sodealloc(so);
return (error);
}
*aso = so;
return (0);
}
-int
-sobind(so, nam, td)
- struct socket *so;
- struct sockaddr *nam;
- struct thread *td;
-{
-
- return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
-}
+#ifdef REGRESSION
+static int regression_sonewconn_earlytest = 1;
+SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
+ &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
+#endif
-void
-sodealloc(struct socket *so)
+/*
+ * When an attempt at a new connection is noted on a socket which accepts
+ * connections, sonewconn is called. If the connection is possible (subject
+ * to space constraints, etc.) then we allocate a new structure, properly
+ * linked into the data structure of the original socket, and return this.
+ * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
+ *
+ * Note: the ref count on the socket is 0 on return.
+ */
+struct socket *
+sonewconn(struct socket *head, int connstatus)
{
+ struct socket *so;
+ int over;
- KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
- mtx_lock(&so_global_mtx);
- so->so_gencnt = ++so_gencnt;
- mtx_unlock(&so_global_mtx);
- if (so->so_rcv.sb_hiwat)
- (void)chgsbsize(so->so_cred->cr_uidinfo,
- &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
- if (so->so_snd.sb_hiwat)
- (void)chgsbsize(so->so_cred->cr_uidinfo,
- &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
-#ifdef INET
- /* remove acccept filter if one is present. */
- if (so->so_accf != NULL)
- do_setopt_accept_filter(so, NULL);
+ ACCEPT_LOCK();
+ over = (head->so_qlen > 3 * head->so_qlimit / 2);
+ ACCEPT_UNLOCK();
+#ifdef REGRESSION
+ if (regression_sonewconn_earlytest && over)
+#else
+ if (over)
#endif
+ return (NULL);
+ so = soalloc();
+ if (so == NULL)
+ return (NULL);
+ if ((head->so_options & SO_ACCEPTFILTER) != 0)
+ connstatus = 0;
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_proto = head->so_proto;
+ so->so_cred = crhold(head->so_cred);
#ifdef MAC
- mac_destroy_socket(so);
+ SOCK_LOCK(head);
+ mac_create_socket_from_socket(head, so);
+ SOCK_UNLOCK(head);
#endif
- crfree(so->so_cred);
- SOCKBUF_LOCK_DESTROY(&so->so_snd);
- SOCKBUF_LOCK_DESTROY(&so->so_rcv);
- uma_zfree(socket_zone, so);
- mtx_lock(&so_global_mtx);
- --numopensockets;
- mtx_unlock(&so_global_mtx);
+ knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
+ NULL, NULL, NULL);
+ knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
+ NULL, NULL, NULL);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
+ (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ return (NULL);
+ }
+ so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+ so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+ so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+ so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+ so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+ so->so_state |= connstatus;
+ ACCEPT_LOCK();
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_qstate |= SQ_COMP;
+ head->so_qlen++;
+ } else {
+ /*
+ * Keep removing sockets from the head until there's room for
+ * us to insert on the tail. In pre-locking revisions, this
+ * was a simple if(), but as we could be racing with other
+ * threads and soabort() requires dropping locks, we must
+ * loop waiting for the condition to be true.
+ */
+ while (head->so_incqlen > head->so_qlimit) {
+ struct socket *sp;
+ sp = TAILQ_FIRST(&head->so_incomp);
+ TAILQ_REMOVE(&head->so_incomp, sp, so_list);
+ head->so_incqlen--;
+ sp->so_qstate &= ~SQ_INCOMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_qstate |= SQ_INCOMP;
+ head->so_incqlen++;
+ }
+ ACCEPT_UNLOCK();
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ }
+ return (so);
+}
+
+int
+sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}
/*
@@ -293,31 +504,14 @@
* socket-layer test and set to avoid races at the socket layer.
*/
int
-solisten(so, backlog, td)
- struct socket *so;
- int backlog;
- struct thread *td;
+solisten(struct socket *so, int backlog, struct thread *td)
{
- int error;
-
- error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
- if (error)
- return (error);
- /*
- * XXXRW: The following state adjustment should occur in
- * solisten_proto(), but we don't currently pass the backlog request
- * to the protocol via pru_listen().
- */
- if (backlog < 0 || backlog > somaxconn)
- backlog = somaxconn;
- so->so_qlimit = backlog;
- return (0);
+ return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
}
int
-solisten_proto_check(so)
- struct socket *so;
+solisten_proto_check(struct socket *so)
{
SOCK_LOCK_ASSERT(so);
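
With this change solisten() just forwards the backlog to pru_listen(), and the clamp against somaxconn now happens in solisten_proto() below. From userland the effect is unchanged: an out-of-range backlog passed to listen(2) is silently limited to kern.ipc.somaxconn, e.g.:

/*
 * Sketch: any negative or oversized backlog handed to listen(2) is clamped
 * to kern.ipc.somaxconn by solisten_proto().
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int s, somaxconn = -1;
	size_t len = sizeof(somaxconn);

	(void)sysctlbyname("kern.ipc.somaxconn", &somaxconn, &len, NULL, 0);
	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s == -1)
		return (1);
	if (listen(s, -1) == 0)		/* the kernel uses somaxconn instead */
		printf("listen(-1) accepted; effective backlog is %d\n",
		    somaxconn);
	close(s);
	return (0);
}
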
@@ -329,37 +523,48 @@
}
void
-solisten_proto(so)
- struct socket *so;
+solisten_proto(struct socket *so, int backlog)
{
SOCK_LOCK_ASSERT(so);
+ if (backlog < 0 || backlog > somaxconn)
+ backlog = somaxconn;
+ so->so_qlimit = backlog;
so->so_options |= SO_ACCEPTCONN;
}
/*
* Attempt to free a socket. This should really be sotryfree().
*
- * We free the socket if the protocol is no longer interested in the socket,
- * there's no file descriptor reference, and the refcount is 0. While the
- * calling macro sotryfree() tests the refcount, sofree() has to test it
- * again as it's possible to race with an accept()ing thread if the socket is
- * in an listen queue of a listen socket, as being in the listen queue
- * doesn't elevate the reference count. sofree() acquires the accept mutex
- * early for this test in order to avoid that race.
+ * sofree() will succeed if:
+ *
+ * - There are no outstanding file descriptor references or related consumers
+ * (so_count == 0).
+ *
+ * - The socket has been closed by user space, if ever open (SS_NOFDREF).
+ *
+ * - The protocol does not have an outstanding strong reference on the socket
+ * (SS_PROTOREF).
+ *
+ * - The socket is not in a completed connection queue, so a process has been
+ * notified that it is present. If it is removed, the user process may
+ * block in accept() despite select() saying the socket was ready.
+ *
+ * Otherwise, it will quietly abort so that a future call to sofree(), when
+ * conditions are right, can succeed.
*/
void
-sofree(so)
- struct socket *so;
+sofree(struct socket *so)
{
+ struct protosw *pr = so->so_proto;
struct socket *head;
ACCEPT_LOCK_ASSERT();
SOCK_LOCK_ASSERT(so);
- if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
- so->so_count != 0) {
+ if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
+ (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
SOCK_UNLOCK(so);
ACCEPT_UNLOCK();
return;
@@ -374,22 +579,6 @@
KASSERT((so->so_qstate & SQ_COMP) == 0 ||
(so->so_qstate & SQ_INCOMP) == 0,
("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
- /*
- * accept(2) is responsible draining the completed
- * connection queue and freeing those sockets, so
- * we just return here if this socket is currently
- * on the completed connection queue. Otherwise,
- * accept(2) may hang after select(2) has indicating
- * that a listening socket was ready. If it's an
- * incomplete connection, we remove it from the queue
- * and free it; otherwise, it won't be released until
- * the listening socket is closed.
- */
- if ((so->so_qstate & SQ_COMP) != 0) {
- SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
- return;
- }
TAILQ_REMOVE(&head->so_incomp, so, so_list);
head->so_incqlen--;
so->so_qstate &= ~SQ_INCOMP;
@@ -399,45 +588,77 @@
(so->so_qstate & SQ_INCOMP) == 0,
("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
+ if (so->so_options & SO_ACCEPTCONN) {
+ KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
+ KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
+ }
SOCK_UNLOCK(so);
ACCEPT_UNLOCK();
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags |= SB_NOINTR;
- (void)sblock(&so->so_snd, M_WAITOK);
+
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
+ (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+ if (pr->pr_usrreqs->pru_detach != NULL)
+ (*pr->pr_usrreqs->pru_detach)(so);
+
/*
- * socantsendmore_locked() drops the socket buffer mutex so that it
- * can safely perform wakeups. Re-acquire the mutex before
- * continuing.
+ * From this point on, we assume that no other references to this
+ * socket exist anywhere else in the stack. Therefore, no locks need
+ * to be acquired or held.
+ *
+ * We used to do a lot of socket buffer and socket locking here, as
+ * well as invoke sorflush() and perform wakeups. The direct call to
+ * dom_dispose() and sbrelease_internal() are an inlining of what was
+ * necessary from sorflush().
+ *
+ * Notice that the socket buffer and kqueue state are torn down
+ * before calling pru_detach. This means that protocols should not
+ * assume they can perform socket wakeups, etc, in their detach code.
*/
- socantsendmore_locked(so);
- SOCKBUF_LOCK(&so->so_snd);
- sbunlock(&so->so_snd);
- sbrelease_locked(&so->so_snd, so);
- SOCKBUF_UNLOCK(&so->so_snd);
- sorflush(so);
+ sbdestroy(&so->so_snd, so);
+ sbdestroy(&so->so_rcv, so);
knlist_destroy(&so->so_rcv.sb_sel.si_note);
knlist_destroy(&so->so_snd.sb_sel.si_note);
sodealloc(so);
}
/*
- * Close a socket on last file table reference removal.
- * Initiate disconnect if connected.
- * Free socket when disconnect complete.
+ * Close a socket on last file table reference removal. Initiate disconnect
+ * if connected. Free socket when disconnect complete.
*
- * This function will sorele() the socket. Note that soclose() may be
- * called prior to the ref count reaching zero. The actual socket
- * structure will not be freed until the ref count reaches zero.
+ * This function will sorele() the socket. Note that soclose() may be called
+ * prior to the ref count reaching zero. The actual socket structure will
+ * not be freed until the ref count reaches zero.
*/
int
-soclose(so)
- struct socket *so;
+soclose(struct socket *so)
{
int error = 0;
KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
funsetown(&so->so_sigio);
+ if (so->so_state & SS_ISCONNECTED) {
+ if ((so->so_state & SS_ISDISCONNECTING) == 0) {
+ error = sodisconnect(so);
+ if (error)
+ goto drop;
+ }
+ if (so->so_options & SO_LINGER) {
+ if ((so->so_state & SS_ISDISCONNECTING) &&
+ (so->so_state & SS_NBIO))
+ goto drop;
+ while (so->so_state & SS_ISCONNECTED) {
+ error = tsleep(&so->so_timeo,
+ PSOCK | PCATCH, "soclos", so->so_linger * hz);
+ if (error)
+ break;
+ }
+ }
+ }
+
+drop:
+ if (so->so_proto->pr_usrreqs->pru_close != NULL)
+ (*so->so_proto->pr_usrreqs->pru_close)(so);
if (so->so_options & SO_ACCEPTCONN) {
struct socket *sp;
ACCEPT_LOCK();
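
soclose() now performs the disconnect and optional linger wait before notifying the protocol through the new pru_close() hook, instead of keying off so_pcb. The linger sleep (wait channel "soclos") is driven by the SO_LINGER option, which userland sets as in this sketch:

/*
 * Sketch: with SO_LINGER enabled, close(2) of a connected socket sleeps in
 * soclose() until the disconnect completes or the linger interval expires.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct linger l = { 1, 5 };	/* linger on, wait up to 5 seconds */
	int s;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s == -1)
		return (1);
	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
		perror("setsockopt");
	/* ...connect and exchange data here... */
	close(s);		/* may block for up to l.l_linger seconds */
	return (0);
}
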
@@ -447,7 +668,7 @@
sp->so_qstate &= ~SQ_INCOMP;
sp->so_head = NULL;
ACCEPT_UNLOCK();
- (void) soabort(sp);
+ soabort(sp);
ACCEPT_LOCK();
}
while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
@@ -456,38 +677,11 @@
sp->so_qstate &= ~SQ_COMP;
sp->so_head = NULL;
ACCEPT_UNLOCK();
- (void) soabort(sp);
+ soabort(sp);
ACCEPT_LOCK();
}
ACCEPT_UNLOCK();
}
- if (so->so_pcb == NULL)
- goto discard;
- if (so->so_state & SS_ISCONNECTED) {
- if ((so->so_state & SS_ISDISCONNECTING) == 0) {
- error = sodisconnect(so);
- if (error)
- goto drop;
- }
- if (so->so_options & SO_LINGER) {
- if ((so->so_state & SS_ISDISCONNECTING) &&
- (so->so_state & SS_NBIO))
- goto drop;
- while (so->so_state & SS_ISCONNECTED) {
- error = tsleep(&so->so_timeo,
- PSOCK | PCATCH, "soclos", so->so_linger * hz);
- if (error)
- break;
- }
- }
- }
-drop:
- if (so->so_pcb != NULL) {
- int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
- if (error == 0)
- error = error2;
- }
-discard:
ACCEPT_LOCK();
SOCK_LOCK(so);
KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
@@ -497,31 +691,44 @@
}
/*
- * soabort() must not be called with any socket locks held, as it calls
- * into the protocol, which will call back into the socket code causing
- * it to acquire additional socket locks that may cause recursion or lock
- * order reversals.
+ * soabort() is used to abruptly tear down a connection, such as when a
+ * resource limit is reached (listen queue depth exceeded), or if a listen
+ * socket is closed while there are sockets waiting to be accepted.
+ *
+ * This interface is tricky, because it is called on an unreferenced socket,
+ * and must be called only by a thread that has actually removed the socket
+ * from the listen queue it was on, or races with other threads are risked.
+ *
+ * This interface will call into the protocol code, so must not be called
+ * with any socket locks held. Protocols do call it while holding their own
+ * recursible protocol mutexes, but this is something that should be subject
+ * to review in the future.
*/
-int
-soabort(so)
- struct socket *so;
+void
+soabort(struct socket *so)
{
- int error;
- error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
- if (error) {
- ACCEPT_LOCK();
- SOCK_LOCK(so);
- sotryfree(so); /* note: does not decrement the ref count */
- return error;
- }
- return (0);
+ /*
+ * In as much as is possible, assert that no references to this
+ * socket are held. This is not quite the same as asserting that the
+ * current thread is responsible for arranging for no references, but
+ * is as close as we can get for now.
+ */
+ KASSERT(so->so_count == 0, ("soabort: so_count"));
+ KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
+ KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
+ KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
+ KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
+
+ if (so->so_proto->pr_usrreqs->pru_abort != NULL)
+ (*so->so_proto->pr_usrreqs->pru_abort)(so);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ sofree(so);
}
int
-soaccept(so, nam)
- struct socket *so;
- struct sockaddr **nam;
+soaccept(struct socket *so, struct sockaddr **nam)
{
int error;
@@ -534,10 +741,7 @@
}
int
-soconnect(so, nam, td)
- struct socket *so;
- struct sockaddr *nam;
- struct thread *td;
+soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
int error;
@@ -545,9 +749,8 @@
return (EOPNOTSUPP);
/*
* If protocol is connection-based, can only connect once.
- * Otherwise, if connected, try to disconnect first.
- * This allows user to disconnect by connecting to, e.g.,
- * a null address.
+ * Otherwise, if connected, try to disconnect first. This allows
+ * user to disconnect by connecting to, e.g., a null address.
*/
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
@@ -555,8 +758,8 @@
error = EISCONN;
} else {
/*
- * Prevent accumulated error from previous connection
- * from biting us.
+ * Prevent accumulated error from previous connection from
+ * biting us.
*/
so->so_error = 0;
error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
@@ -566,17 +769,14 @@
}
int
-soconnect2(so1, so2)
- struct socket *so1;
- struct socket *so2;
+soconnect2(struct socket *so1, struct socket *so2)
{
return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}
int
-sodisconnect(so)
- struct socket *so;
+sodisconnect(struct socket *so)
{
int error;
@@ -588,25 +788,6 @@
return (error);
}
-#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
-/*
- * Send on a socket.
- * If send must go all at once and message is larger than
- * send buffering, then hard error.
- * Lock against other senders.
- * If must go all at once and not enough room now, then
- * inform user that this would block and do nothing.
- * Otherwise, if nonblocking, send as much as possible.
- * The data to be sent is described by "uio" if nonzero,
- * otherwise by the mbuf chain "top" (which must be null
- * if uio is not). Data provided in mbuf chain must be small
- * enough to send all at once.
- *
- * Returns nonzero on error, timeout or signal; callers
- * must check for short counts if EINTR/ERESTART are returned.
- * Data and control buffers are freed on return.
- */
-
#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
int size_ok;
@@ -620,37 +801,315 @@
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
+
+/*
+ * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
+ * sosend_dgram() and sosend_generic() use m_uiotombuf().
+ *
+ * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
+ * all of the data referenced by the uio. If desired, it uses zero-copy.
+ * *space will be updated to reflect data copied in.
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: In the event of an error, the caller may need to free the partial
+ * chain pointed to by *mpp. The contents of both *uio and *space may be
+ * modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+ int flags)
+{
+ struct mbuf *m, **mp, *top;
+ long len, resid;
+ int error;
+#ifdef ZERO_COPY_SOCKETS
+ int cow_send;
+#endif
+
+ *retmp = top = NULL;
+ mp = &top;
+ len = 0;
+ resid = uio->uio_resid;
+ error = 0;
+ do {
+#ifdef ZERO_COPY_SOCKETS
+ cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+ if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+ if (top == NULL) {
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+ } else
+ m = m_get(M_WAITOK, MT_DATA);
+ if (so_zero_copy_send &&
+ resid>=PAGE_SIZE &&
+ *space>=PAGE_SIZE &&
+ uio->uio_iov->iov_len>=PAGE_SIZE) {
+ so_zerocp_stats.size_ok++;
+ so_zerocp_stats.align_ok++;
+ cow_send = socow_setup(m, uio);
+ len = cow_send;
+ }
+ if (!cow_send) {
+ m_clget(m, M_WAITOK);
+ len = min(min(MCLBYTES, resid), *space);
+ }
+#else /* ZERO_COPY_SOCKETS */
+ if (top == NULL) {
+ m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+ } else
+ m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+ len = min(min(MCLBYTES, resid), *space);
+#endif /* ZERO_COPY_SOCKETS */
+ } else {
+ if (top == NULL) {
+ m = m_gethdr(M_TRYWAIT, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+
+ len = min(min(MHLEN, resid), *space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && m && len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ m = m_get(M_TRYWAIT, MT_DATA);
+ len = min(min(MLEN, resid), *space);
+ }
+ }
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+
+ *space -= len;
+#ifdef ZERO_COPY_SOCKETS
+ if (cow_send)
+ error = 0;
+ else
+#endif /* ZERO_COPY_SOCKETS */
+ error = uiomove(mtod(m, void *), (int)len, uio);
+ resid = uio->uio_resid;
+ m->m_len = len;
+ *mp = m;
+ top->m_pkthdr.len += len;
+ if (error)
+ goto out;
+ mp = &m->m_next;
+ if (resid <= 0) {
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ break;
+ }
+ } while (*space > 0 && atomic);
+out:
+ *retmp = top;
+ return (error);
+}
#endif /*ZERO_COPY_SOCKETS*/
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+
int
-sosend(so, addr, uio, top, control, flags, td)
- struct socket *so;
- struct sockaddr *addr;
- struct uio *uio;
- struct mbuf *top;
- struct mbuf *control;
- int flags;
- struct thread *td;
+sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
- struct mbuf **mp;
- struct mbuf *m;
- long space, len = 0, resid;
+ long space, resid;
int clen = 0, error, dontroute;
+#ifdef ZERO_COPY_SOCKETS
int atomic = sosendallatonce(so) || top;
+#endif
+
+ KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
+ KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
+ ("sodgram_send: !PR_ATOMIC"));
+
+ if (uio != NULL)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
+ *
+ * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+ * type sockets since that's an error.
+ */
+ if (resid < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
+ if (td != NULL)
+ td->td_ru.ru_msgsnd++;
+ if (control != NULL)
+ clen = control->m_len;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto out;
+ }
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-based
+ * socket if it supports implied connect. Return ENOTCONN if
+ * not connected and no address is supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto out;
+ }
+ } else if (addr == NULL) {
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+ error = ENOTCONN;
+ else
+ error = EDESTADDRREQ;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out;
+ }
+ }
+
+ /*
+ * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
+ * problem and need fixing.
+ */
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ space -= clen;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (resid > space) {
+ error = EMSGSIZE;
+ goto out;
+ }
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else {
#ifdef ZERO_COPY_SOCKETS
- int cow_send;
-#endif /* ZERO_COPY_SOCKETS */
+ error = sosend_copyin(uio, &top, atomic, &space, flags);
+ if (error)
+ goto out;
+#else
+ /*
+ * Copy the data from userland into a mbuf chain.
+ * If no data is to be copied in, a single empty mbuf
+ * is returned.
+ */
+ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
+ (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
+ if (top == NULL) {
+ error = EFAULT; /* only possible error */
+ goto out;
+ }
+ space -= resid - uio->uio_resid;
+#endif
+ resid = uio->uio_resid;
+ }
+ KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
+ /*
+ * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
+ * than with.
+ */
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously done could be out
+ * of date. We could have received a reset packet in an interrupt or
+ * maybe we slept while doing page faults in uiomove() etc. We could
+ * probably recheck again inside the locking protection here, but
+ * there are probably other places that this also happens. We must
+ * rethink this.
+ */
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol understands this flag and
+ * nothing is left to send, then use PRU_SEND_EOF instead of PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+out:
+ if (top != NULL)
+ m_freem(top);
+ if (control != NULL)
+ m_freem(control);
+ return (error);
+}
+
+/*
+ * Send on a socket. If send must go all at once and message is larger than
+ * send buffering, then hard error. Lock against other senders. If must go
+ * all at once and not enough room now, then inform user that this would
+ * block and do nothing. Otherwise, if nonblocking, send as much as
+ * possible. The data to be sent is described by "uio" if nonzero, otherwise
+ * by the mbuf chain "top" (which must be null if uio is not). Data provided
+ * in mbuf chain must be small enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for short
+ * counts if EINTR/ERESTART are returned. Data and control buffers are freed
+ * on return.
+ */
+int
+sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ long space, resid;
+ int clen = 0, error, dontroute;
+ int atomic = sosendallatonce(so) || top;
if (uio != NULL)
resid = uio->uio_resid;
else
resid = top->m_pkthdr.len;
/*
- * In theory resid should be unsigned.
- * However, space must be signed, as it might be less than 0
- * if we over-committed, and we must use a signed comparison
- * of space and resid. On the other hand, a negative resid
- * causes us to loop sending 0-length segments to the protocol.
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
*
* Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
* type sockets since that's an error.
@@ -664,24 +1123,26 @@
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
(so->so_proto->pr_flags & PR_ATOMIC);
if (td != NULL)
- td->td_proc->p_stats->p_ru.ru_msgsnd++;
+ td->td_ru.ru_msgsnd++;
if (control != NULL)
clen = control->m_len;
-#define snderr(errno) { error = (errno); goto release; }
- SOCKBUF_LOCK(&so->so_snd);
-restart:
- SOCKBUF_LOCK_ASSERT(&so->so_snd);
error = sblock(&so->so_snd, SBLOCKWAIT(flags));
if (error)
- goto out_locked;
+ goto out;
+
+restart:
do {
- SOCKBUF_LOCK_ASSERT(&so->so_snd);
- if (so->so_snd.sb_state & SBS_CANTSENDMORE)
- snderr(EPIPE);
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto release;
+ }
if (so->so_error) {
error = so->so_error;
so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
goto release;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
@@ -694,186 +1155,117 @@
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
(so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
if ((so->so_state & SS_ISCONFIRMING) == 0 &&
- !(resid == 0 && clen != 0))
- snderr(ENOTCONN);
- } else if (addr == NULL)
- snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
- ENOTCONN : EDESTADDRREQ);
+ !(resid == 0 && clen != 0)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto release;
+ }
+ } else if (addr == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+ error = ENOTCONN;
+ else
+ error = EDESTADDRREQ;
+ goto release;
+ }
}
space = sbspace(&so->so_snd);
if (flags & MSG_OOB)
space += 1024;
if ((atomic && resid > so->so_snd.sb_hiwat) ||
- clen > so->so_snd.sb_hiwat)
- snderr(EMSGSIZE);
+ clen > so->so_snd.sb_hiwat) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EMSGSIZE;
+ goto release;
+ }
if (space < resid + clen &&
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
- if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
- snderr(EWOULDBLOCK);
- sbunlock(&so->so_snd);
+ if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EWOULDBLOCK;
+ goto release;
+ }
error = sbwait(&so->so_snd);
+ SOCKBUF_UNLOCK(&so->so_snd);
if (error)
- goto out_locked;
+ goto release;
goto restart;
}
SOCKBUF_UNLOCK(&so->so_snd);
- mp = &top;
space -= clen;
do {
- if (uio == NULL) {
- /*
- * Data is prepackaged in "top".
- */
- resid = 0;
- if (flags & MSG_EOR)
- top->m_flags |= M_EOR;
- } else do {
-#ifdef ZERO_COPY_SOCKETS
- cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
- if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
- if (top == NULL) {
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
- } else {
- MGET(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- }
- if (so_zero_copy_send &&
- resid>=PAGE_SIZE &&
- space>=PAGE_SIZE &&
- uio->uio_iov->iov_len>=PAGE_SIZE) {
- so_zerocp_stats.size_ok++;
- so_zerocp_stats.align_ok++;
- cow_send = socow_setup(m, uio);
- len = cow_send;
- }
- if (!cow_send) {
- MCLGET(m, M_TRYWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- m = NULL;
- } else {
- len = min(min(MCLBYTES, resid), space);
- }
- }
-#else /* ZERO_COPY_SOCKETS */
- if (top == NULL) {
- m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
- } else
- m = m_getcl(M_TRYWAIT, MT_DATA, 0);
- len = min(min(MCLBYTES, resid), space);
-#endif /* ZERO_COPY_SOCKETS */
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
} else {
+#ifdef ZERO_COPY_SOCKETS
+ error = sosend_copyin(uio, &top, atomic,
+ &space, flags);
+ if (error != 0)
+ goto release;
+#else
+ /*
+ * Copy the data from userland into a mbuf
+ * chain. If no data is to be copied in,
+ * a single empty mbuf is returned.
+ */
+ top = m_uiotombuf(uio, M_WAITOK, space,
+ (atomic ? max_hdr : 0),
+ (atomic ? M_PKTHDR : 0) |
+ ((flags & MSG_EOR) ? M_EOR : 0));
if (top == NULL) {
- m = m_gethdr(M_TRYWAIT, MT_DATA);
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = NULL;
-
- len = min(min(MHLEN, resid), space);
- /*
- * For datagram protocols, leave room
- * for protocol headers in first mbuf.
- */
- if (atomic && m && len < MHLEN)
- MH_ALIGN(m, len);
- } else {
- m = m_get(M_TRYWAIT, MT_DATA);
- len = min(min(MLEN, resid), space);
+ error = EFAULT; /* only possible error */
+ goto release;
}
+ space -= resid - uio->uio_resid;
+#endif
+ resid = uio->uio_resid;
}
- if (m == NULL) {
- error = ENOBUFS;
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
-
- space -= len;
-#ifdef ZERO_COPY_SOCKETS
- if (cow_send)
- error = 0;
- else
-#endif /* ZERO_COPY_SOCKETS */
- error = uiomove(mtod(m, void *), (int)len, uio);
- resid = uio->uio_resid;
- m->m_len = len;
- *mp = m;
- top->m_pkthdr.len += len;
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
- mp = &m->m_next;
- if (resid <= 0) {
- if (flags & MSG_EOR)
- top->m_flags |= M_EOR;
- break;
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
}
- } while (space > 0 && atomic);
- if (dontroute) {
- SOCK_LOCK(so);
- so->so_options |= SO_DONTROUTE;
- SOCK_UNLOCK(so);
- }
- /*
- * XXX all the SBS_CANTSENDMORE checks previously
- * done could be out of date. We could have recieved
- * a reset packet in an interrupt or maybe we slept
- * while doing page faults in uiomove() etc. We could
- * probably recheck again inside the locking protection
- * here, but there are probably other places that this
- * also happens. We must rethink this.
- */
- error = (*so->so_proto->pr_usrreqs->pru_send)(so,
- (flags & MSG_OOB) ? PRUS_OOB :
/*
- * If the user set MSG_EOF, the protocol
- * understands this flag and nothing left to
- * send then use PRU_SEND_EOF instead of PRU_SEND.
+ * XXX all the SBS_CANTSENDMORE checks previously
+ * done could be out of date. We could have received
+ * a reset packet in an interrupt or maybe we slept
+ * while doing page faults in uiomove() etc. We
+ * could probably recheck again inside the locking
+ * protection here, but there are probably other
+ * places that this also happens. We must rethink
+ * this.
+ */
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol understands
+ * this flag, and there is nothing left to send, then
+ * use PRU_SEND_EOF instead of PRU_SEND.
*/
- ((flags & MSG_EOF) &&
- (so->so_proto->pr_flags & PR_IMPLOPCL) &&
- (resid <= 0)) ?
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
PRUS_EOF :
- /* If there is more to send set PRUS_MORETOCOME */
- (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
- top, addr, control, td);
- if (dontroute) {
- SOCK_LOCK(so);
- so->so_options &= ~SO_DONTROUTE;
- SOCK_UNLOCK(so);
- }
- clen = 0;
- control = NULL;
- top = NULL;
- mp = &top;
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- goto release;
- }
+ /* If there is more to send set PRUS_MORETOCOME. */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+ if (error)
+ goto release;
} while (resid && space > 0);
- SOCKBUF_LOCK(&so->so_snd);
} while (resid);
release:
- SOCKBUF_LOCK_ASSERT(&so->so_snd);
sbunlock(&so->so_snd);
-out_locked:
- SOCKBUF_LOCK_ASSERT(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
out:
if (top != NULL)
m_freem(top);
@@ -882,6 +1274,19 @@
return (error);
}
+int
+sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+
+ /* XXXRW: Temporary debugging. */
+ KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
+ ("sosend: protocol calls sosend"));
+
+ return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
+ control, flags, td));
+}
+
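
As a point of reference for the sosend()/sosend_dgram() logic above, where an atomic send larger than the send buffer is treated as a hard error, the behaviour can be observed from userland with a short C sketch. This is not part of the patch; the 32 KB payload, the 4 KB SO_SNDBUF setting, the loopback address and the discard port are arbitrary choices for illustration, and the exact errno can differ between systems.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        struct sockaddr_in dst;
        char buf[32768];
        int s, sndbuf = 4096;

        s = socket(AF_INET, SOCK_DGRAM, 0);
        if (s == -1)
                err(1, "socket");
        /* Shrink the send buffer so the datagram below cannot possibly fit. */
        if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1)
                err(1, "setsockopt");

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(9);                /* discard port; no listener needed */
        dst.sin_addr.s_addr = inet_addr("127.0.0.1");

        memset(buf, 'x', sizeof(buf));
        /* A datagram is atomic: if it cannot fit at all, the send fails outright. */
        if (sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&dst,
            sizeof(dst)) == -1)
                printf("sendto: %s (EMSGSIZE expected here)\n", strerror(errno));
        close(s);
        return (0);
}
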
/*
* The part of soreceive() that implements reading non-inline out-of-band
* data from a socket. For more complete comments, see soreceive(), from
@@ -891,10 +1296,7 @@
* unable to return an mbuf chain to the caller.
*/
static int
-soreceive_rcvoob(so, uio, flags)
- struct socket *so;
- struct uio *uio;
- int flags;
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
struct protosw *pr = so->so_proto;
struct mbuf *m;
@@ -971,29 +1373,24 @@
/*
- * Implement receive operations on a socket.
- * We depend on the way that records are added to the sockbuf
- * by sbappend*. In particular, each record (mbufs linked through m_next)
- * must begin with an address if the protocol so specifies,
- * followed by an optional mbuf or mbufs containing ancillary data,
- * and then zero or more mbufs of data.
- * In order to avoid blocking network interrupts for the entire time here,
- * we splx() while doing the actual copy to user space.
- * Although the sockbuf is locked, new data may still be appended,
- * and thus we must maintain consistency of the sockbuf during that time.
- *
- * The caller may receive the data as a single mbuf chain by supplying
- * an mbuf **mp0 for use in returning the chain. The uio is then used
- * only for the count in uio_resid.
+ * Implement receive operations on a socket. We depend on the way that
+ * records are added to the sockbuf by sbappend. In particular, each record
+ * (mbufs linked through m_next) must begin with an address if the protocol
+ * so specifies, followed by an optional mbuf or mbufs containing ancillary
+ * data, and then zero or more mbufs of data. In order to allow parallelism
+ * between network receive and copying to user space, as well as avoid
+ * sleeping with a mutex held, we release the socket buffer mutex during the
+ * user space copy. Although the sockbuf is locked, new data may still be
+ * appended, and thus we must maintain consistency of the sockbuf during that
+ * time.
+ *
+ * The caller may receive the data as a single mbuf chain by supplying an
+ * mbuf **mp0 for use in returning the chain. The uio is then used only for
+ * the count in uio_resid.
*/
int
-soreceive(so, psa, uio, mp0, controlp, flagsp)
- struct socket *so;
- struct sockaddr **psa;
- struct uio *uio;
- struct mbuf **mp0;
- struct mbuf **controlp;
- int *flagsp;
+soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
struct mbuf *m, **mp;
int flags, len, error, offset;
@@ -1019,24 +1416,23 @@
&& uio->uio_resid)
(*pr->pr_usrreqs->pru_rcvd)(so, 0);
- SOCKBUF_LOCK(&so->so_rcv);
-restart:
- SOCKBUF_LOCK_ASSERT(&so->so_rcv);
error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
if (error)
- goto out;
+ return (error);
+restart:
+ SOCKBUF_LOCK(&so->so_rcv);
m = so->so_rcv.sb_mb;
/*
- * If we have less data than requested, block awaiting more
- * (subject to any timeout) if:
+ * If we have less data than requested, block awaiting more (subject
+ * to any timeout) if:
* 1. the current count is less than the low water mark, or
* 2. MSG_WAITALL is set, and it is possible to do the entire
* receive operation at once if we block (resid <= hiwat).
* 3. MSG_DONTWAIT is not set
* If MSG_WAITALL is set but resid is larger than the receive buffer,
- * we have to do the receive in sections, and thus risk returning
- * a short count if a timeout or signal occurs after we start.
+ * we have to do the receive in sections, and thus risk returning a
+ * short count if a timeout or signal occurs after we start.
*/
if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
so->so_rcv.sb_cc < uio->uio_resid) &&
@@ -1052,14 +1448,16 @@
error = so->so_error;
if ((flags & MSG_PEEK) == 0)
so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_rcv);
goto release;
}
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
- if (m)
- goto dontblock;
- else
+ if (m == NULL) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
goto release;
+ } else
+ goto dontblock;
}
for (; m != NULL; m = m->m_next)
if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
@@ -1068,22 +1466,26 @@
}
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
error = ENOTCONN;
goto release;
}
- if (uio->uio_resid == 0)
+ if (uio->uio_resid == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
goto release;
+ }
if ((so->so_state & SS_NBIO) ||
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
error = EWOULDBLOCK;
goto release;
}
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
- sbunlock(&so->so_rcv);
error = sbwait(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
if (error)
- goto out;
+ goto release;
goto restart;
}
dontblock:
@@ -1104,7 +1506,7 @@
*/
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
if (uio->uio_td)
- uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
+ uio->uio_td->td_ru.ru_msgrcv++;
KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
@@ -1173,7 +1575,10 @@
}
cm = cmn;
}
- nextrecord = so->so_rcv.sb_mb->m_nextpkt;
+ if (m != NULL)
+ nextrecord = so->so_rcv.sb_mb->m_nextpkt;
+ else
+ nextrecord = so->so_rcv.sb_mb;
orig_resid = 0;
}
if (m != NULL) {
@@ -1226,7 +1631,7 @@
} else if (type == MT_OOBDATA)
break;
else
- KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
+ KASSERT(m->m_type == MT_DATA,
("m->m_type == %d", m->m_type));
so->so_rcv.sb_state &= ~SBS_RCVATMARK;
len = uio->uio_resid;
@@ -1235,12 +1640,11 @@
if (len > m->m_len - moff)
len = m->m_len - moff;
/*
- * If mp is set, just pass back the mbufs.
- * Otherwise copy them out via the uio, then free.
- * Sockbuf must be consistent here (points to current mbuf,
- * it points to next record) when we drop priority;
- * we must note any additions to the sockbuf when we
- * block interrupts again.
+ * If mp is set, just pass back the mbufs. Otherwise copy
+ * them out via the uio, then free. Sockbuf must be
+ * consistent here (points to current mbuf, it points to next
+ * record) when we drop priority; we must note any additions
+ * to the sockbuf when we block interrupts again.
*/
if (mp == NULL) {
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
@@ -1264,8 +1668,21 @@
#endif /* ZERO_COPY_SOCKETS */
error = uiomove(mtod(m, char *) + moff, (int)len, uio);
SOCKBUF_LOCK(&so->so_rcv);
- if (error)
+ if (error) {
+ /*
+ * The MT_SONAME mbuf has already been removed
+ * from the record, so it is necessary to
+ * remove the data mbufs, if any, to preserve
+ * the invariant in the case of PR_ADDR that
+ * requires MT_SONAME mbufs at the head of
+ * each record.
+ */
+ if (m && pr->pr_flags & PR_ATOMIC &&
+ ((flags & MSG_PEEK) == 0))
+ (void)sbdroprecord_locked(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
goto release;
+ }
} else
uio->uio_resid -= len;
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
@@ -1287,14 +1704,7 @@
so->so_rcv.sb_mb = m_free(m);
m = so->so_rcv.sb_mb;
}
- if (m != NULL) {
- m->m_nextpkt = nextrecord;
- if (nextrecord == NULL)
- so->so_rcv.sb_lastrecord = m;
- } else {
- so->so_rcv.sb_mb = nextrecord;
- SB_EMPTY_FIXUP(&so->so_rcv);
- }
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
}
@@ -1316,9 +1726,11 @@
SOCKBUF_LOCK(&so->so_rcv);
if (*mp == NULL) {
/*
- * m_copym() couldn't allocate an mbuf.
- * Adjust uio_resid back (it was adjusted
- * down by len bytes, which we didn't end
+ * m_copym() couldn't
+ * allocate an mbuf. Adjust
+ * uio_resid back (it was
+ * adjusted down by len
+ * bytes, which we didn't end
* up "copying" over).
*/
uio->uio_resid += len;
@@ -1347,11 +1759,11 @@
if (flags & MSG_EOR)
break;
/*
- * If the MSG_WAITALL flag is set (for non-atomic socket),
- * we must not quit until "uio->uio_resid == 0" or an error
- * termination. If a signal/timeout occurs, return
- * with a short count but without error.
- * Keep sockbuf locked against other readers.
+ * If the MSG_WAITALL flag is set (for non-atomic socket), we
+ * must not quit until "uio->uio_resid == 0" or an error
+ * termination. If a signal/timeout occurs, return with a
+ * short count but without error. Keep sockbuf locked
+ * against other readers.
*/
while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
!sosendallatonce(so) && nextrecord == NULL) {
@@ -1362,7 +1774,7 @@
* Notify the protocol that some data has been
* drained before blocking.
*/
- if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
+ if (pr->pr_flags & PR_WANTRCVD) {
SOCKBUF_UNLOCK(&so->so_rcv);
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
SOCKBUF_LOCK(&so->so_rcv);
@@ -1370,8 +1782,10 @@
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
error = sbwait(&so->so_rcv);
- if (error)
+ if (error) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
goto release;
+ }
m = so->so_rcv.sb_mb;
if (m != NULL)
nextrecord = m->m_nextpkt;
@@ -1401,12 +1815,12 @@
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
/*
- * If soreceive() is being done from the socket callback, then
- * don't need to generate ACK to peer to update window, since
- * ACK will be generated on return to TCP.
+ * If soreceive() is being done from the socket callback,
+ * then don't need to generate ACK to peer to update window,
+ * since ACK will be generated on return to TCP.
*/
- if (!(flags & MSG_SOCALLBCK) &&
- (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
+ if (!(flags & MSG_SOCALLBCK) &&
+ (pr->pr_flags & PR_WANTRCVD)) {
SOCKBUF_UNLOCK(&so->so_rcv);
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
SOCKBUF_LOCK(&so->so_rcv);
@@ -1415,25 +1829,33 @@
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
- sbunlock(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
goto restart;
}
+ SOCKBUF_UNLOCK(&so->so_rcv);
if (flagsp != NULL)
*flagsp |= flags;
release:
- SOCKBUF_LOCK_ASSERT(&so->so_rcv);
sbunlock(&so->so_rcv);
-out:
- SOCKBUF_LOCK_ASSERT(&so->so_rcv);
- SOCKBUF_UNLOCK(&so->so_rcv);
return (error);
}
int
-soshutdown(so, how)
- struct socket *so;
- int how;
+soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+
+ /* XXXRW: Temporary debugging. */
+ KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
+ ("soreceive: protocol calls soreceive"));
+
+ return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
+ controlp, flagsp));
+}
+
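
The receive-side comments above (record layout in the sockbuf, the MSG_WAITALL rules) correspond to behaviour that is easy to see from userland. A hedged sketch, not part of the patch; socketpair() and the 8-byte payload are just convenient stand-ins for any stream socket that already has data queued.

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        char buf[8];
        ssize_t n;
        int fds[2];

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1)
                err(1, "socketpair");
        if (send(fds[0], "12345678", 8, 0) != 8)
                err(1, "send");

        /* MSG_PEEK copies data out but leaves it queued in the receive buffer. */
        n = recv(fds[1], buf, 4, MSG_PEEK);
        printf("peeked %zd bytes\n", n);

        /* MSG_WAITALL keeps the receive going until the full count is available. */
        n = recv(fds[1], buf, sizeof(buf), MSG_WAITALL);
        printf("received %zd of %zu requested bytes\n", n, sizeof(buf));

        close(fds[0]);
        close(fds[1]);
        return (0);
}
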
+int
+soshutdown(struct socket *so, int how)
{
struct protosw *pr = so->so_proto;
@@ -1448,8 +1870,7 @@
}
void
-sorflush(so)
- struct socket *so;
+sorflush(struct socket *so)
{
struct sockbuf *sb = &so->so_rcv;
struct protosw *pr = so->so_proto;
@@ -1463,27 +1884,28 @@
* however, we have to initialize and destroy the mutex in the copy
 * so that dom_dispose() and sbrelease() can lock it as needed.
*/
- SOCKBUF_LOCK(sb);
- sb->sb_flags |= SB_NOINTR;
- (void) sblock(sb, M_WAITOK);
+
/*
- * socantrcvmore_locked() drops the socket buffer mutex so that it
- * can safely perform wakeups. Re-acquire the mutex before
- * continuing.
+ * Dislodge threads currently blocked in receive and wait to acquire
+ * a lock against other simultaneous readers before clearing the
+ * socket buffer. Don't let our acquire be interrupted by a signal
+ * despite any existing socket disposition on interruptible waiting.
*/
- socantrcvmore_locked(so);
- SOCKBUF_LOCK(sb);
- sbunlock(sb);
+ socantrcvmore(so);
+ (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
+
/*
- * Invalidate/clear most of the sockbuf structure, but leave
- * selinfo and mutex data unchanged.
+ * Invalidate/clear most of the sockbuf structure, but leave selinfo
+ * and mutex data unchanged.
*/
+ SOCKBUF_LOCK(sb);
bzero(&asb, offsetof(struct sockbuf, sb_startzero));
bcopy(&sb->sb_startzero, &asb.sb_startzero,
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
bzero(&sb->sb_startzero,
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
SOCKBUF_UNLOCK(sb);
+ sbunlock(sb);
SOCKBUF_LOCK_INIT(&asb, "so_rcv");
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
@@ -1493,26 +1915,22 @@
}
/*
- * Perhaps this routine, and sooptcopyout(), below, ought to come in
- * an additional variant to handle the case where the option value needs
- * to be some kind of integer, but not a specific size.
- * In addition to their use here, these functions are also called by the
- * protocol-level pr_ctloutput() routines.
+ * Perhaps this routine, and sooptcopyout(), below, ought to come in an
+ * additional variant to handle the case where the option value needs to be
+ * some kind of integer, but not a specific size. In addition to their use
+ * here, these functions are also called by the protocol-level pr_ctloutput()
+ * routines.
*/
int
-sooptcopyin(sopt, buf, len, minlen)
- struct sockopt *sopt;
- void *buf;
- size_t len;
- size_t minlen;
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
size_t valsize;
/*
- * If the user gives us more than we wanted, we ignore it,
- * but if we don't get the minimum length the caller
- * wants, we return EINVAL. On success, sopt->sopt_valsize
- * is set to however much we actually retrieved.
+ * If the user gives us more than we wanted, we ignore it, but if we
+ * don't get the minimum length the caller wants, we return EINVAL.
+ * On success, sopt->sopt_valsize is set to however much we actually
+ * retrieved.
*/
if ((valsize = sopt->sopt_valsize) < minlen)
return EINVAL;
@@ -1523,11 +1941,12 @@
return (copyin(sopt->sopt_val, buf, valsize));
bcopy(sopt->sopt_val, buf, valsize);
- return 0;
+ return (0);
}
/*
- * Kernel version of setsockopt(2)/
+ * Kernel version of setsockopt(2).
+ *
* XXX: optlen is size_t, not socklen_t
*/
int
@@ -1546,9 +1965,7 @@
}
int
-sosetopt(so, sopt)
- struct socket *so;
- struct sockopt *sopt;
+sosetopt(struct socket *so, struct sockopt *sopt)
{
int error, optval;
struct linger l;
@@ -1620,8 +2037,8 @@
goto bad;
/*
- * Values < 1 make no sense for any of these
- * options, so disallow them.
+ * Values < 1 make no sense for any of these options,
+ * so disallow them.
*/
if (optval < 1) {
error = EINVAL;
@@ -1642,8 +2059,8 @@
break;
/*
- * Make sure the low-water is never greater than
- * the high-water.
+ * Make sure the low-water is never greater than the
+ * high-water.
*/
case SO_SNDLOWAT:
SOCKBUF_LOCK(&so->so_snd);
@@ -1732,7 +2149,9 @@
return (error);
}
-/* Helper routine for getsockopt */
+/*
+ * Helper routine for getsockopt.
+ */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
@@ -1742,13 +2161,12 @@
error = 0;
/*
- * Documented get behavior is that we always return a value,
- * possibly truncated to fit in the user's buffer.
- * Traditional behavior is that we always tell the user
- * precisely how much we copied, rather than something useful
- * like the total amount we had available for her.
- * Note that this interface is not idempotent; the entire answer must
- * generated ahead of time.
+ * Documented get behavior is that we always return a value, possibly
+ * truncated to fit in the user's buffer. Traditional behavior is
+ * that we always tell the user precisely how much we copied, rather
+ * than something useful like the total amount we had available for
+ * her. Note that this interface is not idempotent; the entire
+ * answer must generated ahead of time.
*/
valsize = min(len, sopt->sopt_valsize);
sopt->sopt_valsize = valsize;
@@ -1758,13 +2176,11 @@
else
bcopy(buf, sopt->sopt_val, valsize);
}
- return error;
+ return (error);
}
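
The comment in sooptcopyout() above, that a get always returns a value, possibly truncated to the caller's buffer, is visible directly through getsockopt(2). A minimal sketch, not part of the patch; SO_RCVBUF and the deliberately undersized 2-byte buffer are arbitrary illustration choices.

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        socklen_t len;
        short truncated;
        int rcvbuf, s;

        s = socket(AF_INET, SOCK_DGRAM, 0);
        if (s == -1)
                err(1, "socket");

        /* Full-sized request: optlen comes back as sizeof(int). */
        len = sizeof(rcvbuf);
        if (getsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len) == -1)
                err(1, "getsockopt");
        printf("SO_RCVBUF = %d, returned optlen %u\n", rcvbuf, (unsigned)len);

        /* Undersized buffer: the value is truncated to fit rather than rejected. */
        len = sizeof(truncated);
        if (getsockopt(s, SOL_SOCKET, SO_RCVBUF, &truncated, &len) == -1)
                err(1, "getsockopt");
        printf("undersized copy reported optlen %u\n", (unsigned)len);

        close(s);
        return (0);
}
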
int
-sogetopt(so, sopt)
- struct socket *so;
- struct sockopt *sopt;
+sogetopt(struct socket *so, struct sockopt *sopt)
{
int error, optval;
struct linger l;
@@ -1817,8 +2233,10 @@
goto integer;
case SO_ERROR:
+ SOCK_LOCK(so);
optval = so->so_error;
so->so_error = 0;
+ SOCK_UNLOCK(so);
goto integer;
case SO_SNDBUF:
@@ -1954,7 +2372,7 @@
m_prev->m_next = m;
m_prev = m;
}
- return 0;
+ return (0);
}
/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
@@ -1964,7 +2382,7 @@
struct mbuf *m0 = m;
if (sopt->sopt_val == NULL)
- return 0;
+ return (0);
while (m != NULL && sopt->sopt_valsize >= m->m_len) {
if (sopt->sopt_td != NULL) {
int error;
@@ -1983,7 +2401,7 @@
}
 if (m != NULL) /* should be allocated sufficiently at ip6_sooptmcopyin() */
panic("ip6_sooptmcopyin");
- return 0;
+ return (0);
}
/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
@@ -1994,7 +2412,7 @@
size_t valsize = 0;
if (sopt->sopt_val == NULL)
- return 0;
+ return (0);
while (m != NULL && sopt->sopt_valsize >= m->m_len) {
if (sopt->sopt_td != NULL) {
int error;
@@ -2018,13 +2436,17 @@
return(EINVAL);
}
sopt->sopt_valsize = valsize;
- return 0;
+ return (0);
}
+/*
+ * sohasoutofband(): protocol notifies socket layer of the arrival of new
+ * out-of-band data, which will then notify socket consumers.
+ */
void
-sohasoutofband(so)
- struct socket *so;
+sohasoutofband(struct socket *so)
{
+
if (so->so_sigio != NULL)
pgsigio(&so->so_sigio, SIGURG, 0);
selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
@@ -2034,6 +2456,19 @@
sopoll(struct socket *so, int events, struct ucred *active_cred,
struct thread *td)
{
+
+ /* XXXRW: Temporary debugging. */
+ KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
+ ("sopoll: protocol calls sopoll"));
+
+ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
+ td));
+}
+
+int
+sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
+ struct thread *td)
+{
int revents = 0;
SOCKBUF_LOCK(&so->so_snd);
@@ -2103,6 +2538,146 @@
return (0);
}
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_disconnect_notsupp(struct socket *so)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *addr, struct mbuf *control, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one and
+ * doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+int
+pru_shutdown_notsupp(struct socket *so)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
static void
filt_sordetach(struct knote *kn)
{
@@ -2195,13 +2770,13 @@
}
static int
-somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
+sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
int error;
int val;
val = somaxconn;
- error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+ error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr )
return (error);
@@ -2211,3 +2786,172 @@
somaxconn = val;
return (0);
}
+
+/*
+ * These functions are used by protocols to notify the socket layer (and its
+ * consumers) of state changes in the sockets driven by protocol-side events.
+ */
+
+/*
+ * Procedures to manipulate state flags of socket and do appropriate wakeups.
+ *
+ * Normal sequence from the active (originating) side is that
+ * soisconnecting() is called during processing of connect() call, resulting
+ * in an eventual call to soisconnected() if/when the connection is
+ * established. When the connection is torn down soisdisconnecting() is
+ * called during processing of disconnect() call, and soisdisconnected() is
+ * called when the connection to the peer is totally severed. The semantics
+ * of these routines are such that connectionless protocols can call
+ * soisconnected() and soisdisconnected() only, bypassing the in-progress
+ * calls when setting up a ``connection'' takes no time.
+ *
+ * From the passive side, a socket is created with two queues of sockets:
+ * so_incomp for connections in progress and so_comp for connections already
+ * made and awaiting user acceptance. As a protocol is preparing incoming
+ * connections, it creates a socket structure queued on so_incomp by calling
+ * sonewconn(). When the connection is established, soisconnected() is
+ * called, and transfers the socket structure to so_comp, making it available
+ * to accept().
+ *
+ * If a socket is closed with sockets on either so_incomp or so_comp, these
+ * sockets are dropped.
+ *
+ * If higher-level protocols are implemented in the kernel, the wakeups done
+ * here will sometimes cause software-interrupt process scheduling.
+ */
+void
+soisconnecting(struct socket *so)
+{
+
+ SOCK_LOCK(so);
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+ SOCK_UNLOCK(so);
+}
+
+void
+soisconnected(struct socket *so)
+{
+ struct socket *head;
+
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ head = so->so_head;
+ if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
+ if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ SOCK_UNLOCK(so);
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_qstate &= ~SQ_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ so->so_qstate |= SQ_COMP;
+ ACCEPT_UNLOCK();
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ ACCEPT_UNLOCK();
+ so->so_upcall =
+ head->so_accf->so_accept_filter->accf_callback;
+ so->so_upcallarg = head->so_accf->so_accept_filter_arg;
+ so->so_rcv.sb_flags |= SB_UPCALL;
+ so->so_options &= ~SO_ACCEPTFILTER;
+ SOCK_UNLOCK(so);
+ so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
+ }
+ return;
+ }
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+}
+
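
The connecting/connected transitions that soisconnecting() and soisconnected() manage are exactly what a non-blocking connect(2) exposes to userland: EINPROGRESS while the socket is still connecting, then writability once the protocol reports the result, with SO_ERROR holding any failure. A small sketch, not part of the patch; the loopback address and port 80 are arbitrary, and a refused connection is a perfectly good outcome for the demonstration.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        struct sockaddr_in dst;
        struct pollfd pfd;
        socklen_t len;
        int error, s;

        s = socket(AF_INET, SOCK_STREAM, 0);
        if (s == -1)
                err(1, "socket");
        if (fcntl(s, F_SETFL, O_NONBLOCK) == -1)
                err(1, "fcntl");

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(80);
        dst.sin_addr.s_addr = inet_addr("127.0.0.1");

        /* While the socket is "connecting", connect() reports EINPROGRESS. */
        if (connect(s, (struct sockaddr *)&dst, sizeof(dst)) == -1 &&
            errno != EINPROGRESS)
                err(1, "connect");

        /* Writability means the connecting state has been left, one way or another. */
        pfd.fd = s;
        pfd.events = POLLOUT;
        if (poll(&pfd, 1, 5000) == -1)
                err(1, "poll");

        /* SO_ERROR says whether we are connected or why the attempt failed. */
        len = sizeof(error);
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &error, &len) == -1)
                err(1, "getsockopt");
        printf("connect result: %s\n", error ? strerror(error) : "connected");
        close(s);
        return (0);
}
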
+void
+soisdisconnecting(struct socket *so)
+{
+
+ /*
+ * Note: This code assumes that SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) are the same.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= SS_ISDISCONNECTING;
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sowwakeup_locked(so);
+ wakeup(&so->so_timeo);
+}
+
+void
+soisdisconnected(struct socket *so)
+{
+
+ /*
+ * Note: This code assumes that SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) are the same.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISDISCONNECTED;
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
+ sowwakeup_locked(so);
+ wakeup(&so->so_timeo);
+}
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+sodupsockaddr(const struct sockaddr *sa, int mflags)
+{
+ struct sockaddr *sa2;
+
+ sa2 = malloc(sa->sa_len, M_SONAME, mflags);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information in
+ * the kernel-format socket structure pointed to by so. This is done to
+ * reduce the spew of irrelevant information over this interface, to isolate
+ * user code from changes in the kernel structure, and potentially to provide
+ * information-hiding if we decide that some of this information should be
+ * hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_cred->cr_uid;
+}
Index: kern_condvar.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_condvar.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_condvar.c -L sys/kern/kern_condvar.c -u -r1.2 -r1.3
--- sys/kern/kern_condvar.c
+++ sys/kern/kern_condvar.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_condvar.c,v 1.52.2.1 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_condvar.c,v 1.62 2007/06/04 23:50:56 jeff Exp $");
#include "opt_ktrace.h"
@@ -49,12 +49,11 @@
/*
* Common sanity checks for cv_wait* functions.
*/
-#define CV_ASSERT(cvp, mp, td) do { \
+#define CV_ASSERT(cvp, lock, td) do { \
KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \
KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \
KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \
- KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \
- mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \
+ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \
} while (0)
/*
@@ -93,20 +92,23 @@
* held when cv_signal or cv_broadcast are called.
*/
void
-cv_wait(struct cv *cvp, struct mtx *mp)
+_cv_wait(struct cv *cvp, struct lock_object *lock)
{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
struct thread *td;
- WITNESS_SAVE_DECL(mp);
+ int lock_state;
td = curthread;
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
- CV_ASSERT(cvp, mp, td);
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Waiting on \"%s\"", cvp->cv_description);
- WITNESS_SAVE(&mp->mtx_object, mp);
+ WITNESS_SAVE(lock, lock_witness);
+ class = LOCK_CLASS(lock);
if (cold || panicstr) {
/*
@@ -122,9 +124,66 @@
cvp->cv_waiters++;
DROP_GIANT();
- mtx_unlock(mp);
- sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR);
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ sleepq_wait(cvp);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0);
+#endif
+ PICKUP_GIANT();
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+}
+
+/*
+ * Wait on a condition variable. This function differs from cv_wait by
+ * not acquiring the mutex after the condition variable was signaled.
+ */
+void
+_cv_wait_unlock(struct cv *cvp, struct lock_object *lock)
+{
+ struct lock_class *class;
+ struct thread *td;
+
+ td = curthread;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0);
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * During autoconfiguration, just give interrupts
+ * a chance, then just return. Don't run any other
+ * thread or panic below, in case this is the idle
+ * process and already asleep.
+ */
+ class->lc_unlock(lock);
+ return;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
sleepq_wait(cvp);
#ifdef KTRACE
@@ -132,8 +191,6 @@
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
- mtx_lock(mp);
- WITNESS_RESTORE(&mp->mtx_object, mp);
}
/*
@@ -143,12 +200,13 @@
* restarted if possible.
*/
int
-cv_wait_sig(struct cv *cvp, struct mtx *mp)
+_cv_wait_sig(struct cv *cvp, struct lock_object *lock)
{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
struct thread *td;
struct proc *p;
- int rval;
- WITNESS_SAVE_DECL(mp);
+ int lock_state, rval;
td = curthread;
p = td->td_proc;
@@ -156,10 +214,11 @@
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
- CV_ASSERT(cvp, mp, td);
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Waiting on \"%s\"", cvp->cv_description);
- WITNESS_SAVE(&mp->mtx_object, mp);
+ WITNESS_SAVE(lock, lock_witness);
+ class = LOCK_CLASS(lock);
if (cold || panicstr) {
/*
@@ -175,10 +234,14 @@
cvp->cv_waiters++;
DROP_GIANT();
- mtx_unlock(mp);
- sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR |
- SLEEPQ_INTERRUPTIBLE);
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+ SLEEPQ_INTERRUPTIBLE, 0);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
rval = sleepq_wait_sig(cvp);
#ifdef KTRACE
@@ -186,8 +249,8 @@
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
- mtx_lock(mp);
- WITNESS_RESTORE(&mp->mtx_object, mp);
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
return (rval);
}
@@ -198,11 +261,12 @@
* expires.
*/
int
-cv_timedwait(struct cv *cvp, struct mtx *mp, int timo)
+_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo)
{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
struct thread *td;
- int rval;
- WITNESS_SAVE_DECL(mp);
+ int lock_state, rval;
td = curthread;
rval = 0;
@@ -210,10 +274,11 @@
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
- CV_ASSERT(cvp, mp, td);
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Waiting on \"%s\"", cvp->cv_description);
- WITNESS_SAVE(&mp->mtx_object, mp);
+ WITNESS_SAVE(lock, lock_witness);
+ class = LOCK_CLASS(lock);
if (cold || panicstr) {
/*
@@ -229,10 +294,14 @@
cvp->cv_waiters++;
DROP_GIANT();
- mtx_unlock(mp);
- sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR);
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
sleepq_set_timeout(cvp, timo);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
rval = sleepq_timedwait(cvp);
#ifdef KTRACE
@@ -240,8 +309,8 @@
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
- mtx_lock(mp);
- WITNESS_RESTORE(&mp->mtx_object, mp);
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
return (rval);
}
@@ -253,12 +322,13 @@
* a signal was caught.
*/
int
-cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo)
+_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo)
{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
struct thread *td;
struct proc *p;
- int rval;
- WITNESS_SAVE_DECL(mp);
+ int lock_state, rval;
td = curthread;
p = td->td_proc;
@@ -267,10 +337,11 @@
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
- CV_ASSERT(cvp, mp, td);
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Waiting on \"%s\"", cvp->cv_description);
- WITNESS_SAVE(&mp->mtx_object, mp);
+ WITNESS_SAVE(lock, lock_witness);
+ class = LOCK_CLASS(lock);
if (cold || panicstr) {
/*
@@ -286,11 +357,15 @@
cvp->cv_waiters++;
DROP_GIANT();
- mtx_unlock(mp);
- sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR |
- SLEEPQ_INTERRUPTIBLE);
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+ SLEEPQ_INTERRUPTIBLE, 0);
sleepq_set_timeout(cvp, timo);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
rval = sleepq_timedwait_sig(cvp);
#ifdef KTRACE
@@ -298,8 +373,8 @@
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
- mtx_lock(mp);
- WITNESS_RESTORE(&mp->mtx_object, mp);
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
return (rval);
}
@@ -318,9 +393,9 @@
sleepq_lock(cvp);
if (cvp->cv_waiters > 0) {
cvp->cv_waiters--;
- sleepq_signal(cvp, SLEEPQ_CONDVAR, -1);
- } else
- sleepq_release(cvp);
+ sleepq_signal(cvp, SLEEPQ_CONDVAR, -1, 0);
+ }
+ sleepq_release(cvp);
}
/*
@@ -334,7 +409,7 @@
sleepq_lock(cvp);
if (cvp->cv_waiters > 0) {
cvp->cv_waiters = 0;
- sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri);
+ sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0);
} else
sleepq_release(cvp);
}
Index: subr_mchain.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_mchain.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_mchain.c -L sys/kern/subr_mchain.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_mchain.c
+++ sys/kern/subr_mchain.c
@@ -28,7 +28,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_mchain.c,v 1.17 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_mchain.c,v 1.18 2005/07/29 13:22:36 imura Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -183,6 +183,7 @@
caddr_t dst;
c_caddr_t src;
int cplen, error, mleft, count;
+ size_t srclen, dstlen;
m = mbp->mb_cur;
mleft = mbp->mb_mleft;
@@ -199,10 +200,13 @@
continue;
}
cplen = mleft > size ? size : mleft;
+ srclen = dstlen = cplen;
dst = mtod(m, caddr_t) + m->m_len;
switch (type) {
case MB_MCUSTOM:
- error = mbp->mb_copy(mbp, source, dst, cplen);
+ srclen = size;
+ dstlen = mleft;
+ error = mbp->mb_copy(mbp, source, dst, &srclen, &dstlen);
if (error)
return error;
break;
@@ -222,11 +226,11 @@
bzero(dst, cplen);
break;
}
- size -= cplen;
- source += cplen;
- m->m_len += cplen;
- mleft -= cplen;
- mbp->mb_count += cplen;
+ size -= srclen;
+ source += srclen;
+ m->m_len += dstlen;
+ mleft -= dstlen;
+ mbp->mb_count += dstlen;
}
mbp->mb_cur = m;
mbp->mb_mleft = mleft;
Index: uipc_syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/uipc_syscalls.c -L sys/kern/uipc_syscalls.c -u -r1.3 -r1.4
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -33,8 +33,9 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.221.2.1 2005/12/28 19:30:41 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.259.4.2 2008/02/14 11:45:41 simon Exp $");
+#include "opt_sctp.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include "opt_mac.h"
@@ -43,7 +44,6 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
@@ -68,6 +68,8 @@
#include <sys/ktrace.h>
#endif
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -75,6 +77,11 @@
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
+#ifdef SCTP
+#include <netinet/sctp.h>
+#include <netinet/sctp_peeloff.h>
+#endif /* SCTP */
+
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
@@ -92,7 +99,6 @@
int nsfbufspeak;
int nsfbufsused;
-SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
"Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
@@ -105,10 +111,11 @@
* file entry is held upon returning. This is lighter weight than
 * fgetsock(), which bumps the socket reference and drops the file reference
* count instead, as this approach avoids several additional mutex operations
- * associated with the additional reference count.
+ * associated with the additional reference count. If requested, return the
+ * open file flags.
*/
static int
-getsock(struct filedesc *fdp, int fd, struct file **fpp)
+getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
{
struct file *fp;
int error;
@@ -117,7 +124,7 @@
if (fdp == NULL)
error = EBADF;
else {
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_SLOCK(fdp);
fp = fget_locked(fdp, fd);
if (fp == NULL)
error = EBADF;
@@ -126,9 +133,11 @@
error = ENOTSOCK;
} else {
fhold(fp);
+ if (fflagp != NULL)
+ *fflagp = fp->f_flag;
error = 0;
}
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_SUNLOCK(fdp);
}
*fpp = fp;
return (error);
@@ -141,13 +150,10 @@
#define COMPAT_OLDSOCK
#endif
-/*
- * MPSAFE
- */
int
socket(td, uap)
struct thread *td;
- register struct socket_args /* {
+ struct socket_args /* {
int domain;
int type;
int protocol;
@@ -169,33 +175,28 @@
if (error)
return (error);
/* An extra reference on `fp' has been held for us by falloc(). */
- NET_LOCK_GIANT();
error = socreate(uap->domain, &so, uap->type, uap->protocol,
td->td_ucred, td);
- NET_UNLOCK_GIANT();
if (error) {
fdclose(fdp, fp, fd, td);
} else {
- FILEDESC_LOCK_FAST(fdp);
+ FILE_LOCK(fp);
fp->f_data = so; /* already has ref count */
fp->f_flag = FREAD|FWRITE;
- fp->f_ops = &socketops;
fp->f_type = DTYPE_SOCKET;
- FILEDESC_UNLOCK_FAST(fdp);
+ fp->f_ops = &socketops;
+ FILE_UNLOCK(fp);
td->td_retval[0] = fd;
}
fdrop(fp, td);
return (error);
}
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
bind(td, uap)
struct thread *td;
- register struct bind_args /* {
+ struct bind_args /* {
int s;
caddr_t name;
int namelen;
@@ -207,7 +208,9 @@
if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
return (error);
- return (kern_bind(td, uap->s, sa));
+ error = kern_bind(td, uap->s, sa);
+ free(sa, M_SONAME);
+ return (error);
}
int
@@ -220,37 +223,30 @@
struct file *fp;
int error;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, fd, &fp);
+ error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
if (error)
- goto done2;
+ return (error);
so = fp->f_data;
#ifdef MAC
SOCK_LOCK(so);
error = mac_check_socket_bind(td->td_ucred, so, sa);
SOCK_UNLOCK(so);
if (error)
- goto done1;
+ goto done;
#endif
error = sobind(so, sa, td);
#ifdef MAC
-done1:
+done:
#endif
fdrop(fp, td);
-done2:
- NET_UNLOCK_GIANT();
- FREE(sa, M_SONAME);
return (error);
}
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
listen(td, uap)
struct thread *td;
- register struct listen_args /* {
+ struct listen_args /* {
int s;
int backlog;
} */ *uap;
@@ -259,8 +255,7 @@
struct file *fp;
int error;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, uap->s, &fp);
+ error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
if (error == 0) {
so = fp->f_data;
#ifdef MAC
@@ -276,28 +271,71 @@
#endif
fdrop(fp, td);
}
- NET_UNLOCK_GIANT();
return(error);
}
/*
* accept1()
- * MPSAFE
*/
static int
accept1(td, uap, compat)
struct thread *td;
- register struct accept_args /* {
+ struct accept_args /* {
int s;
struct sockaddr * __restrict name;
socklen_t * __restrict anamelen;
} */ *uap;
int compat;
{
+ struct sockaddr *name;
+ socklen_t namelen;
+ struct file *fp;
+ int error;
+
+ if (uap->name == NULL)
+ return (kern_accept(td, uap->s, NULL, NULL, NULL));
+
+ error = copyin(uap->anamelen, &namelen, sizeof (namelen));
+ if (error)
+ return (error);
+
+ error = kern_accept(td, uap->s, &name, &namelen, &fp);
+
+ /*
+ * return a namelen of zero for older code which might
+ * ignore the return value from accept.
+ */
+ if (error) {
+ (void) copyout(&namelen,
+ uap->anamelen, sizeof(*uap->anamelen));
+ return (error);
+ }
+
+ if (error == 0 && name != NULL) {
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)name)->sa_family =
+ name->sa_family;
+#endif
+ error = copyout(name, uap->name, namelen);
+ }
+ if (error == 0)
+ error = copyout(&namelen, uap->anamelen,
+ sizeof(namelen));
+ if (error)
+ fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
+ fdrop(fp, td);
+ free(name, M_SONAME);
+ return (error);
+}
+
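
accept1() is now a thin wrapper that copies the caller's name length in and out around kern_accept(). The corresponding value-result behaviour of accept(2)'s address-length argument, including getting a usable length back on success, can be seen with a self-contained sketch like the one below. It is not part of the patch; the loopback self-connect exists only so the example needs no second program.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        struct sockaddr_in sin;
        struct sockaddr_storage peer;
        socklen_t peerlen, slen;
        int c, l, s;

        l = socket(AF_INET, SOCK_STREAM, 0);
        if (l == -1)
                err(1, "socket");
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        sin.sin_port = 0;                       /* any free port */
        if (bind(l, (struct sockaddr *)&sin, sizeof(sin)) == -1)
                err(1, "bind");
        if (listen(l, 1) == -1)
                err(1, "listen");

        /* Learn the port we were assigned and connect to ourselves. */
        slen = sizeof(sin);
        if (getsockname(l, (struct sockaddr *)&sin, &slen) == -1)
                err(1, "getsockname");
        c = socket(AF_INET, SOCK_STREAM, 0);
        if (c == -1)
                err(1, "socket");
        if (connect(c, (struct sockaddr *)&sin, sizeof(sin)) == -1)
                err(1, "connect");

        /* peerlen is value-result: in = buffer size, out = address size. */
        peerlen = sizeof(peer);
        s = accept(l, (struct sockaddr *)&peer, &peerlen);
        if (s == -1)
                err(1, "accept");
        printf("accepted; returned namelen %u\n", (unsigned)peerlen);
        close(s);
        close(c);
        close(l);
        return (0);
}
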
+int
+kern_accept(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, struct file **fp)
+{
struct filedesc *fdp;
- struct file *nfp = NULL;
+ struct file *headfp, *nfp = NULL;
struct sockaddr *sa = NULL;
- socklen_t namelen;
int error;
struct socket *head, *so;
int fd;
@@ -305,18 +343,17 @@
pid_t pgid;
int tmp;
- fdp = td->td_proc->p_fd;
- if (uap->name) {
- error = copyin(uap->anamelen, &namelen, sizeof (namelen));
- if(error)
- return (error);
- if (namelen < 0)
+ if (name) {
+ *name = NULL;
+ if (*namelen < 0)
return (EINVAL);
}
- NET_LOCK_GIANT();
- error = fgetsock(td, uap->s, &head, &fflag);
+
+ fdp = td->td_proc->p_fd;
+ error = getsock(fdp, s, &headfp, &fflag);
if (error)
- goto done2;
+ return (error);
+ head = headfp->f_data;
if ((head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto done;
@@ -389,8 +426,8 @@
FILE_LOCK(nfp);
nfp->f_data = so; /* nfp has ref count from falloc */
nfp->f_flag = fflag;
- nfp->f_ops = &socketops;
nfp->f_type = DTYPE_SOCKET;
+ nfp->f_ops = &socketops;
FILE_UNLOCK(nfp);
/* Sync socket nonblocking/async state with file flags */
tmp = fflag & FNONBLOCK;
@@ -404,34 +441,21 @@
* return a namelen of zero for older code which might
* ignore the return value from accept.
*/
- if (uap->name != NULL) {
- namelen = 0;
- (void) copyout(&namelen,
- uap->anamelen, sizeof(*uap->anamelen));
- }
+ if (name)
+ *namelen = 0;
goto noconnection;
}
if (sa == NULL) {
- namelen = 0;
- if (uap->name)
- goto gotnoname;
- error = 0;
+ if (name)
+ *namelen = 0;
goto done;
}
- if (uap->name) {
+ if (name) {
/* check sa_len before it is destroyed */
- if (namelen > sa->sa_len)
- namelen = sa->sa_len;
-#ifdef COMPAT_OLDSOCK
- if (compat)
- ((struct osockaddr *)sa)->sa_family =
- sa->sa_family;
-#endif
- error = copyout(sa, uap->name, (u_int)namelen);
- if (!error)
-gotnoname:
- error = copyout(&namelen,
- uap->anamelen, sizeof (*uap->anamelen));
+ if (*namelen > sa->sa_len)
+ *namelen = sa->sa_len;
+ *name = sa;
+ sa = NULL;
}
noconnection:
if (sa)
@@ -445,20 +469,23 @@
fdclose(fdp, nfp, fd, td);
/*
- * Release explicitly held references before returning.
+ * Release explicitly held references before returning. We return
+ * a reference on nfp to the caller on success if they request it.
*/
done:
+ if (fp != NULL) {
+ if (error == 0) {
+ *fp = nfp;
+ nfp = NULL;
+ } else
+ *fp = NULL;
+ }
if (nfp != NULL)
fdrop(nfp, td);
- fputsock(head);
-done2:
- NET_UNLOCK_GIANT();
+ fdrop(headfp, td);
return (error);
}
-/*
- * MPSAFE (accept1() is MPSAFE)
- */
int
accept(td, uap)
struct thread *td;
@@ -469,9 +496,6 @@
}
#ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE (accept1() is MPSAFE)
- */
int
oaccept(td, uap)
struct thread *td;
@@ -482,14 +506,11 @@
}
#endif /* COMPAT_OLDSOCK */
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
connect(td, uap)
struct thread *td;
- register struct connect_args /* {
+ struct connect_args /* {
int s;
caddr_t name;
int namelen;
@@ -502,7 +523,9 @@
if (error)
return (error);
- return (kern_connect(td, uap->s, sa));
+ error = kern_connect(td, uap->s, sa);
+ free(sa, M_SONAME);
+ return (error);
}
@@ -517,10 +540,9 @@
int error;
int interrupted = 0;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, fd, &fp);
+ error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
if (error)
- goto done2;
+ return (error);
so = fp->f_data;
if (so->so_state & SS_ISCONNECTING) {
error = EALREADY;
@@ -562,26 +584,20 @@
error = EINTR;
done1:
fdrop(fp, td);
-done2:
- NET_UNLOCK_GIANT();
- FREE(sa, M_SONAME);
return (error);
}
-/*
- * MPSAFE
- */
int
socketpair(td, uap)
struct thread *td;
- register struct socketpair_args /* {
+ struct socketpair_args /* {
int domain;
int type;
int protocol;
int *rsv;
} */ *uap;
{
- register struct filedesc *fdp = td->td_proc->p_fd;
+ struct filedesc *fdp = td->td_proc->p_fd;
struct file *fp1, *fp2;
struct socket *so1, *so2;
int fd, error, sv[2];
@@ -594,11 +610,10 @@
return (error);
#endif
- NET_LOCK_GIANT();
error = socreate(uap->domain, &so1, uap->type, uap->protocol,
td->td_ucred, td);
if (error)
- goto done2;
+ return (error);
error = socreate(uap->domain, &so2, uap->type, uap->protocol,
td->td_ucred, td);
if (error)
@@ -627,18 +642,21 @@
}
FILE_LOCK(fp1);
fp1->f_flag = FREAD|FWRITE;
- fp1->f_ops = &socketops;
fp1->f_type = DTYPE_SOCKET;
+ fp1->f_ops = &socketops;
FILE_UNLOCK(fp1);
FILE_LOCK(fp2);
fp2->f_flag = FREAD|FWRITE;
- fp2->f_ops = &socketops;
fp2->f_type = DTYPE_SOCKET;
+ fp2->f_ops = &socketops;
FILE_UNLOCK(fp2);
+ so1 = so2 = NULL;
error = copyout(sv, uap->rsv, 2 * sizeof (int));
+ if (error)
+ goto free4;
fdrop(fp1, td);
fdrop(fp2, td);
- goto done2;
+ return (0);
free4:
fdclose(fdp, fp2, sv[1], td);
fdrop(fp2, td);
@@ -646,19 +664,19 @@
fdclose(fdp, fp1, sv[0], td);
fdrop(fp1, td);
free2:
- (void)soclose(so2);
+ if (so2 != NULL)
+ (void)soclose(so2);
free1:
- (void)soclose(so1);
-done2:
- NET_UNLOCK_GIANT();
+ if (so1 != NULL)
+ (void)soclose(so1);
return (error);
}
static int
sendit(td, s, mp, flags)
- register struct thread *td;
+ struct thread *td;
int s;
- register struct msghdr *mp;
+ struct msghdr *mp;
int flags;
{
struct mbuf *control;
@@ -691,7 +709,7 @@
goto bad;
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags == MSG_COMPAT) {
- register struct cmsghdr *cm;
+ struct cmsghdr *cm;
M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
if (control == 0) {
@@ -736,10 +754,9 @@
struct uio *ktruio = NULL;
#endif
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, s, &fp);
+ error = getsock(td->td_proc->p_fd, s, &fp, NULL);
if (error)
- goto bad2;
+ return (error);
so = (struct socket *)fp->f_data;
#ifdef MAC
@@ -769,8 +786,7 @@
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
- error = so->so_proto->pr_usrreqs->pru_sosend(so, mp->msg_name, &auio,
- 0, control, flags, td);
+ error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
@@ -793,18 +809,13 @@
#endif
bad:
fdrop(fp, td);
-bad2:
- NET_UNLOCK_GIANT();
return (error);
}
-/*
- * MPSAFE
- */
int
sendto(td, uap)
struct thread *td;
- register struct sendto_args /* {
+ struct sendto_args /* {
int s;
caddr_t buf;
size_t len;
@@ -832,13 +843,10 @@
}
#ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
int
osend(td, uap)
struct thread *td;
- register struct osend_args /* {
+ struct osend_args /* {
int s;
caddr_t buf;
int len;
@@ -861,9 +869,6 @@
return (error);
}
-/*
- * MPSAFE
- */
int
osendmsg(td, uap)
struct thread *td;
@@ -891,9 +896,6 @@
}
#endif
-/*
- * MPSAFE
- */
int
sendmsg(td, uap)
struct thread *td;
@@ -923,12 +925,11 @@
}
int
-kern_recvit(td, s, mp, namelenp, segflg, controlp)
+kern_recvit(td, s, mp, fromseg, controlp)
struct thread *td;
int s;
struct msghdr *mp;
- void *namelenp;
- enum uio_seg segflg;
+ enum uio_seg fromseg;
struct mbuf **controlp;
{
struct uio auio;
@@ -948,12 +949,9 @@
if(controlp != NULL)
*controlp = 0;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, s, &fp);
- if (error) {
- NET_UNLOCK_GIANT();
+ error = getsock(td->td_proc->p_fd, s, &fp, NULL);
+ if (error)
return (error);
- }
so = fp->f_data;
#ifdef MAC
@@ -962,14 +960,13 @@
SOCK_UNLOCK(so);
if (error) {
fdrop(fp, td);
- NET_UNLOCK_GIANT();
return (error);
}
#endif
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
- auio.uio_segflg = segflg;
+ auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
@@ -978,7 +975,6 @@
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
fdrop(fp, td);
- NET_UNLOCK_GIANT();
return (EINVAL);
}
}
@@ -987,8 +983,8 @@
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
- error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
- (struct mbuf **)0, (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
+ error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
+ (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
&mp->msg_flags);
if (error) {
if (auio.uio_resid != (int)len && (error == ERESTART ||
@@ -1016,20 +1012,15 @@
((struct osockaddr *)fromsa)->sa_family =
fromsa->sa_family;
#endif
- error = copyout(fromsa, mp->msg_name, (unsigned)len);
- if (error)
- goto out;
+ if (fromseg == UIO_USERSPACE) {
+ error = copyout(fromsa, mp->msg_name,
+ (unsigned)len);
+ if (error)
+ goto out;
+ } else
+ bcopy(fromsa, mp->msg_name, len);
}
mp->msg_namelen = len;
- if (namelenp &&
- (error = copyout(&len, namelenp, sizeof (socklen_t)))) {
-#ifdef COMPAT_OLDSOCK
- if (mp->msg_flags & MSG_COMPAT)
- error = 0; /* old recvfrom didn't check */
- else
-#endif
- goto out;
- }
}
if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
@@ -1079,7 +1070,6 @@
}
out:
fdrop(fp, td);
- NET_UNLOCK_GIANT();
if (fromsa)
FREE(fromsa, M_SONAME);
@@ -1098,17 +1088,25 @@
struct msghdr *mp;
void *namelenp;
{
+ int error;
- return (kern_recvit(td, s, mp, namelenp, UIO_USERSPACE, NULL));
+ error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
+ if (error)
+ return (error);
+ if (namelenp) {
+ error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ error = 0; /* old recvfrom didn't check */
+#endif
+ }
+ return (error);
}
-/*
- * MPSAFE
- */
int
recvfrom(td, uap)
struct thread *td;
- register struct recvfrom_args /* {
+ struct recvfrom_args /* {
int s;
caddr_t buf;
size_t len;
@@ -1142,9 +1140,6 @@
}
#ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
int
orecvfrom(td, uap)
struct thread *td;
@@ -1156,15 +1151,11 @@
}
#endif
-
#ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
int
orecv(td, uap)
struct thread *td;
- register struct orecv_args /* {
+ struct orecv_args /* {
int s;
caddr_t buf;
int len;
@@ -1191,8 +1182,6 @@
* Old recvmsg. This code takes advantage of the fact that the old msghdr
* overlays the new one, missing only the flags, and with the (old) access
* rights where the control fields are now.
- *
- * MPSAFE
*/
int
orecvmsg(td, uap)
@@ -1224,9 +1213,6 @@
}
#endif
-/*
- * MPSAFE
- */
int
recvmsg(td, uap)
struct thread *td;
@@ -1261,14 +1247,11 @@
return (error);
}
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
shutdown(td, uap)
struct thread *td;
- register struct shutdown_args /* {
+ struct shutdown_args /* {
int s;
int how;
} */ *uap;
@@ -1277,25 +1260,20 @@
struct file *fp;
int error;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, uap->s, &fp);
+ error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = soshutdown(so, uap->how);
fdrop(fp, td);
}
- NET_UNLOCK_GIANT();
return (error);
}
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setsockopt(td, uap)
struct thread *td;
- register struct setsockopt_args /* {
+ struct setsockopt_args /* {
int s;
int level;
int name;
@@ -1344,25 +1322,20 @@
panic("kern_setsockopt called with bad valseg");
}
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, s, &fp);
+ error = getsock(td->td_proc->p_fd, s, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = sosetopt(so, &sopt);
fdrop(fp, td);
}
- NET_UNLOCK_GIANT();
return(error);
}
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getsockopt(td, uap)
struct thread *td;
- register struct getsockopt_args /* {
+ struct getsockopt_args /* {
int s;
int level;
int name;
@@ -1427,83 +1400,89 @@
panic("kern_getsockopt called with bad valseg");
}
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, s, &fp);
+ error = getsock(td->td_proc->p_fd, s, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = sogetopt(so, &sopt);
*valsize = sopt.sopt_valsize;
fdrop(fp, td);
}
- NET_UNLOCK_GIANT();
return (error);
}
/*
* getsockname1() - Get socket name.
- *
- * MPSAFE
*/
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
struct thread *td;
- register struct getsockname_args /* {
+ struct getsockname_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ *uap;
int compat;
{
- struct socket *so;
struct sockaddr *sa;
- struct file *fp;
socklen_t len;
int error;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, uap->fdes, &fp);
+ error = copyin(uap->alen, &len, sizeof(len));
if (error)
- goto done2;
- so = fp->f_data;
- error = copyin(uap->alen, &len, sizeof (len));
- if (error)
- goto done1;
- if (len < 0) {
- error = EINVAL;
- goto done1;
- }
- sa = 0;
- error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
+ return (error);
+
+ error = kern_getsockname(td, uap->fdes, &sa, &len);
if (error)
- goto bad;
- if (sa == 0) {
- len = 0;
- goto gotnothing;
- }
+ return (error);
- len = MIN(len, sa->sa_len);
+ if (len != 0) {
#ifdef COMPAT_OLDSOCK
- if (compat)
- ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
- error = copyout(sa, uap->asa, (u_int)len);
+ error = copyout(sa, uap->asa, (u_int)len);
+ }
+ free(sa, M_SONAME);
if (error == 0)
-gotnothing:
- error = copyout(&len, uap->alen, sizeof (len));
+ error = copyout(&len, uap->alen, sizeof(len));
+ return (error);
+}
+
+int
+kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen)
+{
+ struct socket *so;
+ struct file *fp;
+ socklen_t len;
+ int error;
+
+ if (*alen < 0)
+ return (EINVAL);
+
+ error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
+ if (error)
+ return (error);
+ so = fp->f_data;
+ *sa = NULL;
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
+ if (error)
+ goto bad;
+ if (*sa == NULL)
+ len = 0;
+ else
+ len = MIN(*alen, (*sa)->sa_len);
+ *alen = len;
bad:
- if (sa)
- FREE(sa, M_SONAME);
-done1:
fdrop(fp, td);
-done2:
- NET_UNLOCK_GIANT();
+ if (error && *sa) {
+ free(*sa, M_SONAME);
+ *sa = NULL;
+ }
return (error);
}
-/*
- * MPSAFE
- */
int
getsockname(td, uap)
struct thread *td;
@@ -1514,9 +1493,6 @@
}
#ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
int
ogetsockname(td, uap)
struct thread *td;
@@ -1529,74 +1505,82 @@
/*
* getpeername1() - Get name of peer for connected socket.
- *
- * MPSAFE
*/
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
struct thread *td;
- register struct getpeername_args /* {
+ struct getpeername_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ *uap;
int compat;
{
- struct socket *so;
struct sockaddr *sa;
+ socklen_t len;
+ int error;
+
+ error = copyin(uap->alen, &len, sizeof (len));
+ if (error)
+ return (error);
+
+ error = kern_getpeername(td, uap->fdes, &sa, &len);
+ if (error)
+ return (error);
+
+ if (len != 0) {
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ }
+ free(sa, M_SONAME);
+ if (error == 0)
+ error = copyout(&len, uap->alen, sizeof(len));
+ return (error);
+}
+
+int
+kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen)
+{
+ struct socket *so;
struct file *fp;
socklen_t len;
int error;
- NET_LOCK_GIANT();
- error = getsock(td->td_proc->p_fd, uap->fdes, &fp);
+ if (*alen < 0)
+ return (EINVAL);
+
+ error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
if (error)
- goto done2;
+ return (error);
so = fp->f_data;
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
error = ENOTCONN;
- goto done1;
- }
- error = copyin(uap->alen, &len, sizeof (len));
- if (error)
- goto done1;
- if (len < 0) {
- error = EINVAL;
- goto done1;
+ goto done;
}
- sa = 0;
- error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
+ *sa = NULL;
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
if (error)
goto bad;
- if (sa == 0) {
+ if (*sa == NULL)
len = 0;
- goto gotnothing;
- }
- len = MIN(len, sa->sa_len);
-#ifdef COMPAT_OLDSOCK
- if (compat)
- ((struct osockaddr *)sa)->sa_family =
- sa->sa_family;
-#endif
- error = copyout(sa, uap->asa, (u_int)len);
- if (error)
- goto bad;
-gotnothing:
- error = copyout(&len, uap->alen, sizeof (len));
+ else
+ len = MIN(*alen, (*sa)->sa_len);
+ *alen = len;
bad:
- if (sa)
- FREE(sa, M_SONAME);
-done1:
+ if (error && *sa) {
+ free(*sa, M_SONAME);
+ *sa = NULL;
+ }
+done:
fdrop(fp, td);
-done2:
- NET_UNLOCK_GIANT();
return (error);
}
-/*
- * MPSAFE
- */
int
getpeername(td, uap)
struct thread *td;
@@ -1607,9 +1591,6 @@
}
#ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
int
ogetpeername(td, uap)
struct thread *td;
@@ -1627,8 +1608,8 @@
caddr_t buf;
int buflen, type;
{
- register struct sockaddr *sa;
- register struct mbuf *m;
+ struct sockaddr *sa;
+ struct mbuf *m;
int error;
if ((u_int)buflen > MLEN) {
@@ -1722,16 +1703,13 @@
/*
* sendfile(2)
*
- * MPSAFE
- *
* int sendfile(int fd, int s, off_t offset, size_t nbytes,
* struct sf_hdtr *hdtr, off_t *sbytes, int flags)
*
* Send a file specified by 'fd' and starting at 'offset' to a socket
- * specified by 's'. Send only 'nbytes' of the file or until EOF if
- * nbytes == 0. Optionally add a header and/or trailer to the socket
- * output. If specified, write the total number of bytes sent into *sbytes.
- *
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
+ * 0. Optionally add a header and/or trailer to the socket output. If
+ * specified, write the total number of bytes sent into *sbytes.
*/
int
sendfile(struct thread *td, struct sendfile_args *uap)
@@ -1740,399 +1718,477 @@
return (do_sendfile(td, uap, 0));
}
-#ifdef COMPAT_FREEBSD4
-int
-freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
-{
- struct sendfile_args args;
-
- args.fd = uap->fd;
- args.s = uap->s;
- args.offset = uap->offset;
- args.nbytes = uap->nbytes;
- args.hdtr = uap->hdtr;
- args.sbytes = uap->sbytes;
- args.flags = uap->flags;
-
- return (do_sendfile(td, &args, 1));
-}
-#endif /* COMPAT_FREEBSD4 */
-
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
- struct vnode *vp;
- struct vm_object *obj = NULL;
- struct socket *so = NULL;
- struct mbuf *m, *m_header = NULL;
- struct sf_buf *sf;
- struct vm_page *pg;
- struct writev_args nuap;
struct sf_hdtr hdtr;
- struct uio *hdr_uio = NULL;
- off_t off, xfsize, hdtr_size, sbytes = 0;
- int error, headersize = 0, headersent = 0;
+ struct uio *hdr_uio, *trl_uio;
+ int error;
+
+ hdr_uio = trl_uio = NULL;
+
+ if (uap->hdtr != NULL) {
+ error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+ if (error)
+ goto out;
+ if (hdtr.headers != NULL) {
+ error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
+ if (error)
+ goto out;
+ }
+ if (hdtr.trailers != NULL) {
+ error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
+ if (error)
+ goto out;
+
+ }
+ }
- mtx_lock(&Giant);
+ error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
+out:
+ if (hdr_uio)
+ free(hdr_uio, M_IOV);
+ if (trl_uio)
+ free(trl_uio, M_IOV);
+ return (error);
+}
- hdtr_size = 0;
+#ifdef COMPAT_FREEBSD4
+int
+freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
+{
+ struct sendfile_args args;
+
+ args.fd = uap->fd;
+ args.s = uap->s;
+ args.offset = uap->offset;
+ args.nbytes = uap->nbytes;
+ args.hdtr = uap->hdtr;
+ args.sbytes = uap->sbytes;
+ args.flags = uap->flags;
+
+ return (do_sendfile(td, &args, 1));
+}
+#endif /* COMPAT_FREEBSD4 */
+
+int
+kern_sendfile(struct thread *td, struct sendfile_args *uap,
+ struct uio *hdr_uio, struct uio *trl_uio, int compat)
+{
+ struct file *sock_fp;
+ struct vnode *vp;
+ struct vm_object *obj = NULL;
+ struct socket *so = NULL;
+ struct mbuf *m = NULL;
+ struct sf_buf *sf;
+ struct vm_page *pg;
+ off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
+ int error, hdrlen = 0, mnw = 0;
+ int vfslocked;
/*
- * The descriptor must be a regular file and have a backing VM object.
+ * The file descriptor must be a regular file and have a
+ * backing VM object.
+ * File offset must be positive. If it goes beyond EOF
+ * we send only the header/trailer and no payload data.
*/
if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
- goto done;
+ goto out;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- if (vp->v_type == VREG)
+ if (vp->v_type == VREG) {
obj = vp->v_object;
+ if (obj != NULL) {
+ /*
+ * Temporarily increase the backing VM
+ * object's reference count so that a forced
+ * reclamation of its vnode does not
+ * immediately destroy it.
+ */
+ VM_OBJECT_LOCK(obj);
+ if ((obj->flags & OBJ_DEAD) == 0) {
+ vm_object_reference_locked(obj);
+ VM_OBJECT_UNLOCK(obj);
+ } else {
+ VM_OBJECT_UNLOCK(obj);
+ obj = NULL;
+ }
+ }
+ }
VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
if (obj == NULL) {
error = EINVAL;
- goto done;
+ goto out;
}
- if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
- goto done;
+ if (uap->offset < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * The socket must be a stream socket and connected.
+	 * Remember whether it is a blocking or non-blocking socket.
+ */
+ if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
+ NULL)) != 0)
+ goto out;
+ so = sock_fp->f_data;
if (so->so_type != SOCK_STREAM) {
error = EINVAL;
- goto done;
+ goto out;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
- goto done;
- }
- if (uap->offset < 0) {
- error = EINVAL;
- goto done;
+ goto out;
}
+ /*
+ * Do not wait on memory allocations but return ENOMEM for
+ * caller to retry later.
+ * XXX: Experimental.
+ */
+ if (uap->flags & SF_MNOWAIT)
+ mnw = 1;
#ifdef MAC
SOCK_LOCK(so);
error = mac_check_socket_send(td->td_ucred, so);
SOCK_UNLOCK(so);
if (error)
- goto done;
+ goto out;
#endif
- /*
- * If specified, get the pointer to the sf_hdtr struct for
- * any headers/trailers.
- */
- if (uap->hdtr != NULL) {
- error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
- if (error)
- goto done;
- /*
- * Send any headers.
- */
- if (hdtr.headers != NULL) {
- error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
- if (error)
- goto done;
- hdr_uio->uio_td = td;
- hdr_uio->uio_rw = UIO_WRITE;
- if (hdr_uio->uio_resid > 0) {
- m_header = m_uiotombuf(hdr_uio, M_DONTWAIT, 0, 0);
- if (m_header == NULL)
- goto done;
- headersize = m_header->m_pkthdr.len;
- if (compat)
- sbytes += headersize;
+ /* If headers are specified copy them into mbufs. */
+ if (hdr_uio != NULL) {
+ hdr_uio->uio_td = td;
+ hdr_uio->uio_rw = UIO_WRITE;
+ if (hdr_uio->uio_resid > 0) {
+ /*
+ * In FBSD < 5.0 the nbytes to send also included
+ * the header. If compat is specified subtract the
+ * header size from nbytes.
+ */
+ if (compat) {
+ if (uap->nbytes > hdr_uio->uio_resid)
+ uap->nbytes -= hdr_uio->uio_resid;
+ else
+ uap->nbytes = 0;
}
+ m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
+ 0, 0, 0);
+ if (m == NULL) {
+ error = mnw ? EAGAIN : ENOBUFS;
+ goto out;
+ }
+ hdrlen = m_length(m, NULL);
}
}
/*
* Protect against multiple writers to the socket.
+ *
+ * XXXRW: Historically this has assumed non-interruptibility, so now
+ * we implement that, but possibly shouldn't.
*/
- SOCKBUF_LOCK(&so->so_snd);
- (void) sblock(&so->so_snd, M_WAITOK);
- SOCKBUF_UNLOCK(&so->so_snd);
+ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
/*
- * Loop through the pages in the file, starting with the requested
+ * Loop through the pages of the file, starting with the requested
* offset. Get a file page (do I/O if necessary), map the file page
* into an sf_buf, attach an mbuf header to the sf_buf, and queue
* it on the socket.
+ * This is done in two loops. The inner loop turns as many pages
+ * as it can, up to available socket buffer space, without blocking
+ * into mbufs to have it bulk delivered into the socket send buffer.
+ * The outer loop checks the state and available space of the socket
+ * and takes care of the overall progress.
*/
- for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
- vm_pindex_t pindex;
- vm_offset_t pgoff;
-
- pindex = OFF_TO_IDX(off);
- VM_OBJECT_LOCK(obj);
-retry_lookup:
- /*
- * Calculate the amount to transfer. Not to exceed a page,
- * the EOF, or the passed in nbytes.
- */
- xfsize = obj->un_pager.vnp.vnp_size - off;
- VM_OBJECT_UNLOCK(obj);
- if (xfsize > PAGE_SIZE)
- xfsize = PAGE_SIZE;
- pgoff = (vm_offset_t)(off & PAGE_MASK);
- if (PAGE_SIZE - pgoff < xfsize)
- xfsize = PAGE_SIZE - pgoff;
- if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
- xfsize = uap->nbytes - sbytes;
- if (xfsize <= 0) {
- if (m_header != NULL) {
- m = m_header;
- m_header = NULL;
- SOCKBUF_LOCK(&so->so_snd);
- goto retry_space;
- } else
- break;
- }
+ for (off = uap->offset, rem = uap->nbytes; ; ) {
+ int loopbytes = 0;
+ int space = 0;
+ int done = 0;
+
/*
- * Optimize the non-blocking case by looking at the socket space
- * before going to the extra work of constituting the sf_buf.
+ * Check the socket state for ongoing connection,
+ * no errors and space in socket buffer.
+ * If space is low allow for the remainder of the
+ * file to be processed if it fits the socket buffer.
+ * Otherwise block in waiting for sufficient space
+ * to proceed, or if the socket is nonblocking, return
+ * to userland with EAGAIN while reporting how far
+ * we've come.
+ * We wait until the socket buffer has significant free
+ * space to do bulk sends. This makes good use of file
+ * system read ahead and allows packet segmentation
+ * offloading hardware to take over lots of work. If
+ * we were not careful here we would send off only one
+ * sfbuf at a time.
*/
SOCKBUF_LOCK(&so->so_snd);
- if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
- if (so->so_snd.sb_state & SBS_CANTSENDMORE)
- error = EPIPE;
- else
- error = EAGAIN;
- sbunlock(&so->so_snd);
+ if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+retry_space:
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ } else if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
- SOCKBUF_UNLOCK(&so->so_snd);
- VM_OBJECT_LOCK(obj);
- /*
- * Attempt to look up the page.
- *
- * Allocate if not found
- *
- * Wait and loop if busy.
- */
- pg = vm_page_lookup(obj, pindex);
-
- if (pg == NULL) {
- pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY |
- VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
- if (pg == NULL) {
- VM_OBJECT_UNLOCK(obj);
- VM_WAIT;
- VM_OBJECT_LOCK(obj);
- goto retry_lookup;
+ space = sbspace(&so->so_snd);
+ if (space < rem &&
+ (space <= 0 ||
+ space < so->so_snd.sb_lowat)) {
+ if (so->so_state & SS_NBIO) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EAGAIN;
+ goto done;
}
- vm_page_lock_queues();
- } else {
- vm_page_lock_queues();
- if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
- goto retry_lookup;
/*
- * Wire the page so it does not get ripped out from
- * under us.
+ * sbwait drops the lock while sleeping.
+ * When we loop back to retry_space the
+ * state may have changed and we retest
+ * for it.
+ */
+ error = sbwait(&so->so_snd);
+ /*
+ * An error from sbwait usually indicates that we've
+ * been interrupted by a signal. If we've sent anything
+ * then return bytes sent, otherwise return the error.
*/
- vm_page_wire(pg);
+ if (error) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ goto retry_space;
}
+ SOCKBUF_UNLOCK(&so->so_snd);
/*
- * If page is not valid for what we need, initiate I/O
+ * Reduce space in the socket buffer by the size of
+ * the header mbuf chain.
+ * hdrlen is set to 0 after the first loop.
*/
+ space -= hdrlen;
- if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) {
- VM_OBJECT_UNLOCK(obj);
- } else if (uap->flags & SF_NODISKIO) {
- error = EBUSY;
- } else {
- int bsize, resid;
+ /*
+ * Loop and construct maximum sized mbuf chain to be bulk
+ * dumped into socket buffer.
+ */
+ while(space > loopbytes) {
+ vm_pindex_t pindex;
+ vm_offset_t pgoff;
+ struct mbuf *m0;
+ VM_OBJECT_LOCK(obj);
+ /*
+ * Calculate the amount to transfer.
+ * Not to exceed a page, the EOF,
+ * or the passed in nbytes.
+ */
+ pgoff = (vm_offset_t)(off & PAGE_MASK);
+ xfsize = omin(PAGE_SIZE - pgoff,
+ obj->un_pager.vnp.vnp_size - uap->offset -
+ fsbytes - loopbytes);
+ if (uap->nbytes)
+ rem = (uap->nbytes - fsbytes - loopbytes);
+ else
+ rem = obj->un_pager.vnp.vnp_size -
+ uap->offset - fsbytes - loopbytes;
+ xfsize = omin(rem, xfsize);
+ if (xfsize <= 0) {
+ VM_OBJECT_UNLOCK(obj);
+ done = 1; /* all data sent */
+ break;
+ }
/*
- * Ensure that our page is still around when the I/O
- * completes.
+ * Don't overflow the send buffer.
+ * Stop here and send out what we've
+ * already got.
*/
- vm_page_io_start(pg);
- vm_page_unlock_queues();
- VM_OBJECT_UNLOCK(obj);
+ if (space < loopbytes + xfsize) {
+ VM_OBJECT_UNLOCK(obj);
+ break;
+ }
/*
- * Get the page from backing store.
+ * Attempt to look up the page. Allocate
+ * if not found or wait and loop if busy.
*/
- bsize = vp->v_mount->mnt_stat.f_iosize;
- vn_lock(vp, LK_SHARED | LK_RETRY, td);
+ pindex = OFF_TO_IDX(off);
+ pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
+
/*
- * XXXMAC: Because we don't have fp->f_cred here,
- * we pass in NOCRED. This is probably wrong, but
- * is consistent with our original implementation.
+ * Check if page is valid for what we need,
+ * otherwise initiate I/O.
+ * If we already turned some pages into mbufs,
+ * send them off before we come here again and
+ * block.
*/
- error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
- trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
- IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
- td->td_ucred, NOCRED, &resid, td);
- VOP_UNLOCK(vp, 0, td);
- VM_OBJECT_LOCK(obj);
- vm_page_lock_queues();
- vm_page_io_finish(pg);
- if (!error)
+ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
VM_OBJECT_UNLOCK(obj);
- mbstat.sf_iocnt++;
- }
-
- if (error) {
- vm_page_unwire(pg, 0);
+ else if (m != NULL)
+ error = EAGAIN; /* send what we already got */
+ else if (uap->flags & SF_NODISKIO)
+ error = EBUSY;
+ else {
+ int bsize, resid;
+
+ /*
+ * Ensure that our page is still around
+ * when the I/O completes.
+ */
+ vm_page_io_start(pg);
+ VM_OBJECT_UNLOCK(obj);
+
+ /*
+ * Get the page from backing store.
+ */
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ vn_lock(vp, LK_SHARED | LK_RETRY, td);
+
+ /*
+ * XXXMAC: Because we don't have fp->f_cred
+ * here, we pass in NOCRED. This is probably
+ * wrong, but is consistent with our original
+ * implementation.
+ */
+ error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
+ trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+ IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
+ td->td_ucred, NOCRED, &resid, td);
+ VOP_UNLOCK(vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ VM_OBJECT_LOCK(obj);
+ vm_page_io_finish(pg);
+ if (!error)
+ VM_OBJECT_UNLOCK(obj);
+ mbstat.sf_iocnt++;
+ }
+ if (error) {
+ vm_page_lock_queues();
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about
+ * this page. If not and it is not valid,
+ * then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
+ pg->hold_count == 0) {
+ vm_page_free(pg);
+ }
+ vm_page_unlock_queues();
+ VM_OBJECT_UNLOCK(obj);
+ if (error == EAGAIN)
+ error = 0; /* not a real error */
+ break;
+ }
+
/*
- * See if anyone else might know about this page.
- * If not and it is not valid, then free it.
+ * Get a sendfile buf. We usually wait as long
+ * as necessary, but this wait can be interrupted.
*/
- if (pg->wire_count == 0 && pg->valid == 0 &&
- pg->busy == 0 && !(pg->flags & PG_BUSY) &&
- pg->hold_count == 0) {
- vm_page_free(pg);
+ if ((sf = sf_buf_alloc(pg,
+ (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
+ mbstat.sf_allocfail++;
+ vm_page_lock_queues();
+ vm_page_unwire(pg, 0);
+ /*
+ * XXX: Not same check as above!?
+ */
+ if (pg->wire_count == 0 && pg->object == NULL)
+ vm_page_free(pg);
+ vm_page_unlock_queues();
+ error = (mnw ? EAGAIN : EINTR);
+ break;
}
- vm_page_unlock_queues();
- VM_OBJECT_UNLOCK(obj);
- SOCKBUF_LOCK(&so->so_snd);
- sbunlock(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- vm_page_unlock_queues();
- /*
- * Get a sendfile buf. We usually wait as long as necessary,
- * but this wait can be interrupted.
- */
- if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) {
- mbstat.sf_allocfail++;
- vm_page_lock_queues();
- vm_page_unwire(pg, 0);
- if (pg->wire_count == 0 && pg->object == NULL)
- vm_page_free(pg);
- vm_page_unlock_queues();
- SOCKBUF_LOCK(&so->so_snd);
- sbunlock(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
- error = EINTR;
- goto done;
- }
+ /*
+ * Get an mbuf and set it up as having
+ * external storage.
+ */
+ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
+ if (m0 == NULL) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ sf_buf_mext((void *)sf_buf_kva(sf), sf);
+ break;
+ }
+ MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
+ sf, M_RDONLY, EXT_SFBUF);
+ m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
+ m0->m_len = xfsize;
+
+ /* Append to mbuf chain. */
+ if (m != NULL)
+ m_cat(m, m0);
+ else
+ m = m0;
- /*
- * Get an mbuf header and set it up as having external storage.
- */
- if (m_header)
- MGET(m, M_TRYWAIT, MT_DATA);
- else
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- sf_buf_mext((void *)sf_buf_kva(sf), sf);
- SOCKBUF_LOCK(&so->so_snd);
- sbunlock(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- /*
- * Setup external storage for mbuf.
- */
- MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY,
- EXT_SFBUF);
- m->m_data = (char *)sf_buf_kva(sf) + pgoff;
- m->m_pkthdr.len = m->m_len = xfsize;
-
- if (m_header) {
- m_cat(m_header, m);
- m = m_header;
- m_header = NULL;
- m_fixhdr(m);
+			/* Keep track of bytes processed. */
+ loopbytes += xfsize;
+ off += xfsize;
}
- /*
- * Add the buffer to the socket buffer chain.
- */
- SOCKBUF_LOCK(&so->so_snd);
-retry_space:
- /*
- * Make sure that the socket is still able to take more data.
- * CANTSENDMORE being true usually means that the connection
- * was closed. so_error is true when an error was sensed after
- * a previous send.
- * The state is checked after the page mapping and buffer
- * allocation above since those operations may block and make
- * any socket checks stale. From this point forward, nothing
- * blocks before the pru_send (or more accurately, any blocking
- * results in a loop back to here to re-check).
- */
- SOCKBUF_LOCK_ASSERT(&so->so_snd);
- if ((so->so_snd.sb_state & SBS_CANTSENDMORE) || so->so_error) {
+ /* Add the buffer chain to the socket buffer. */
+ if (m != NULL) {
+ int mlen, err;
+
+ mlen = m_length(m, NULL);
+ SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
- } else {
- error = so->so_error;
- so->so_error = 0;
- }
- m_freem(m);
- sbunlock(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- /*
- * Wait for socket space to become available. We do this just
- * after checking the connection state above in order to avoid
- * a race condition with sbwait().
- */
- if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
- if (so->so_state & SS_NBIO) {
- m_freem(m);
- sbunlock(&so->so_snd);
SOCKBUF_UNLOCK(&so->so_snd);
- error = EAGAIN;
goto done;
}
- error = sbwait(&so->so_snd);
- /*
- * An error from sbwait usually indicates that we've
- * been interrupted by a signal. If we've sent anything
- * then return bytes sent, otherwise return the error.
- */
- if (error) {
- m_freem(m);
- sbunlock(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- goto retry_space;
- }
- SOCKBUF_UNLOCK(&so->so_snd);
- error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
- if (error) {
- SOCKBUF_LOCK(&so->so_snd);
- sbunlock(&so->so_snd);
SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
+ /* Avoid error aliasing. */
+ err = (*so->so_proto->pr_usrreqs->pru_send)
+ (so, 0, m, NULL, NULL, td);
+ if (err == 0) {
+ /*
+ * We need two counters to get the
+ * file offset and nbytes to send
+ * right:
+ * - sbytes contains the total amount
+ * of bytes sent, including headers.
+ * - fsbytes contains the total amount
+ * of bytes sent from the file.
+ */
+ sbytes += mlen;
+ fsbytes += mlen;
+ if (hdrlen) {
+ fsbytes -= hdrlen;
+ hdrlen = 0;
+ }
+ } else if (error == 0)
+ error = err;
+ m = NULL; /* pru_send always consumes */
}
- headersent = 1;
+
+ /* Quit outer loop on error or when we're done. */
+ if (error || done)
+ goto done;
}
- SOCKBUF_LOCK(&so->so_snd);
- sbunlock(&so->so_snd);
- SOCKBUF_UNLOCK(&so->so_snd);
/*
* Send trailers. Wimp out and use writev(2).
*/
- if (uap->hdtr != NULL && hdtr.trailers != NULL) {
- nuap.fd = uap->s;
- nuap.iovp = hdtr.trailers;
- nuap.iovcnt = hdtr.trl_cnt;
- error = writev(td, &nuap);
- if (error)
- goto done;
- if (compat)
- sbytes += td->td_retval[0];
- else
- hdtr_size += td->td_retval[0];
+ if (trl_uio != NULL) {
+ error = kern_writev(td, uap->s, trl_uio);
+ if (error)
+ goto done;
+ sbytes += td->td_retval[0];
}
done:
- if (headersent) {
- if (!compat)
- hdtr_size += headersize;
- } else {
- if (compat)
- sbytes -= headersize;
- }
+ sbunlock(&so->so_snd);
+out:
/*
* If there was no error we have to clear td->td_retval[0]
* because it may have been set by writev.
@@ -2141,23 +2197,464 @@
td->td_retval[0] = 0;
}
if (uap->sbytes != NULL) {
- if (!compat)
- sbytes += hdtr_size;
copyout(&sbytes, uap->sbytes, sizeof(off_t));
}
- if (vp)
+ if (obj != NULL)
+ vm_object_deallocate(obj);
+ if (vp != NULL) {
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
if (so)
- fputsock(so);
- if (hdr_uio != NULL)
- free(hdr_uio, M_IOV);
- if (m_header)
- m_freem(m_header);
-
- mtx_unlock(&Giant);
+ fdrop(sock_fp, td);
+ if (m)
+ m_freem(m);
if (error == ERESTART)
error = EINTR;
return (error);
}
+
+/*
+ * SCTP syscalls.
+ * Functionality only compiled in if SCTP is defined in the kernel Makefile,
+ * otherwise all return EOPNOTSUPP.
+ * XXX: We should make this loadable one day.
+ */
+int
+sctp_peeloff(td, uap)
+ struct thread *td;
+ struct sctp_peeloff_args /* {
+ int sd;
+ caddr_t name;
+ } */ *uap;
+{
+#ifdef SCTP
+ struct filedesc *fdp;
+ struct file *nfp = NULL;
+ int error;
+ struct socket *head, *so;
+ int fd;
+ u_int fflag;
+
+ fdp = td->td_proc->p_fd;
+ error = fgetsock(td, uap->sd, &head, &fflag);
+ if (error)
+ goto done2;
+ error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
+ if (error)
+ goto done2;
+ /*
+	 * At this point we know we do have an assoc to pull;
+ * we proceed to get the fd setup. This may block
+ * but that is ok.
+ */
+
+ error = falloc(td, &nfp, &fd);
+ if (error)
+ goto done;
+ td->td_retval[0] = fd;
+
+ so = sonewconn(head, SS_ISCONNECTED);
+ if (so == NULL)
+ goto noconnection;
+ /*
+ * Before changing the flags on the socket, we have to bump the
+ * reference count. Otherwise, if the protocol calls sofree(),
+ * the socket will be released due to a zero refcount.
+ */
+ SOCK_LOCK(so);
+ soref(so); /* file descriptor reference */
+ SOCK_UNLOCK(so);
+
+ ACCEPT_LOCK();
+
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+ so->so_state |= (head->so_state & SS_NBIO);
+ so->so_state &= ~SS_NOFDREF;
+ so->so_qstate &= ~SQ_COMP;
+ so->so_head = NULL;
+ ACCEPT_UNLOCK();
+ FILE_LOCK(nfp);
+ nfp->f_data = so;
+ nfp->f_flag = fflag;
+ nfp->f_type = DTYPE_SOCKET;
+ nfp->f_ops = &socketops;
+ FILE_UNLOCK(nfp);
+ error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
+ if (error)
+ goto noconnection;
+ if (head->so_sigio != NULL)
+ fsetown(fgetown(&head->so_sigio), &so->so_sigio);
+
+noconnection:
+ /*
+ * close the new descriptor, assuming someone hasn't ripped it
+ * out from under us.
+ */
+ if (error)
+ fdclose(fdp, nfp, fd, td);
+
+ /*
+ * Release explicitly held references before returning.
+ */
+done:
+ if (nfp != NULL)
+ fdrop(nfp, td);
+ fputsock(head);
+done2:
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sctp_generic_sendmsg (td, uap)
+ struct thread *td;
+ struct sctp_generic_sendmsg_args /* {
+ int sd,
+ caddr_t msg,
+ int mlen,
+ caddr_t to,
+ __socklen_t tolen,
+ struct sctp_sndrcvinfo *sinfo,
+ int flags
+ } */ *uap;
+{
+#ifdef SCTP
+ struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+ struct socket *so;
+ struct file *fp = NULL;
+ int use_rcvinfo = 1;
+ int error = 0, len;
+ struct sockaddr *to = NULL;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ struct uio auio;
+ struct iovec iov[1];
+
+ if (uap->sinfo) {
+ error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+ if (error)
+ return (error);
+ u_sinfo = &sinfo;
+ }
+ if (uap->tolen) {
+ error = getsockaddr(&to, uap->to, uap->tolen);
+ if (error) {
+ to = NULL;
+ goto sctp_bad2;
+ }
+ }
+
+ error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
+ if (error)
+ goto sctp_bad;
+
+ iov[0].iov_base = uap->msg;
+ iov[0].iov_len = uap->mlen;
+
+ so = (struct socket *)fp->f_data;
+#ifdef MAC
+ SOCK_LOCK(so);
+ error = mac_check_socket_send(td->td_ucred, so);
+ SOCK_UNLOCK(so);
+ if (error)
+ goto sctp_bad;
+#endif /* MAC */
+
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ len = auio.uio_resid = uap->mlen;
+ error = sctp_lower_sosend(so, to, &auio,
+ (struct mbuf *)NULL, (struct mbuf *)NULL,
+ uap->flags, use_rcvinfo, u_sinfo, td);
+ if (error) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket. */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(uap->flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+ }
+#endif /* KTRACE */
+sctp_bad:
+ if (fp)
+ fdrop(fp, td);
+sctp_bad2:
+ if (to)
+ free(to, M_SONAME);
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sctp_generic_sendmsg_iov(td, uap)
+ struct thread *td;
+ struct sctp_generic_sendmsg_iov_args /* {
+ int sd,
+ struct iovec *iov,
+ int iovlen,
+ caddr_t to,
+ __socklen_t tolen,
+ struct sctp_sndrcvinfo *sinfo,
+ int flags
+ } */ *uap;
+{
+#ifdef SCTP
+ struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+ struct socket *so;
+ struct file *fp = NULL;
+ int use_rcvinfo = 1;
+ int error=0, len, i;
+ struct sockaddr *to = NULL;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ struct uio auio;
+ struct iovec *iov, *tiov;
+
+ if (uap->sinfo) {
+ error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+ if (error)
+ return (error);
+ u_sinfo = &sinfo;
+ }
+ if (uap->tolen) {
+ error = getsockaddr(&to, uap->to, uap->tolen);
+ if (error) {
+ to = NULL;
+ goto sctp_bad2;
+ }
+ }
+
+ error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
+ if (error)
+ goto sctp_bad1;
+
+ error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+ if (error)
+ goto sctp_bad1;
+
+ so = (struct socket *)fp->f_data;
+#ifdef MAC
+ SOCK_LOCK(so);
+ error = mac_check_socket_send(td->td_ucred, so);
+ SOCK_UNLOCK(so);
+ if (error)
+ goto sctp_bad;
+#endif /* MAC */
+
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ tiov = iov;
+ for (i = 0; i <uap->iovlen; i++, tiov++) {
+ if ((auio.uio_resid += tiov->iov_len) < 0) {
+ error = EINVAL;
+ goto sctp_bad;
+ }
+ }
+ len = auio.uio_resid;
+ error = sctp_lower_sosend(so, to, &auio,
+ (struct mbuf *)NULL, (struct mbuf *)NULL,
+ uap->flags, use_rcvinfo, u_sinfo, td);
+ if (error) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(uap->flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+ }
+#endif /* KTRACE */
+sctp_bad:
+ free(iov, M_IOV);
+sctp_bad1:
+ if (fp)
+ fdrop(fp, td);
+sctp_bad2:
+ if (to)
+ free(to, M_SONAME);
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sctp_generic_recvmsg(td, uap)
+ struct thread *td;
+ struct sctp_generic_recvmsg_args /* {
+ int sd,
+ struct iovec *iov,
+ int iovlen,
+ struct sockaddr *from,
+ __socklen_t *fromlenaddr,
+ struct sctp_sndrcvinfo *sinfo,
+ int *msg_flags
+ } */ *uap;
+{
+#ifdef SCTP
+ u_int8_t sockbufstore[256];
+ struct uio auio;
+ struct iovec *iov, *tiov;
+ struct sctp_sndrcvinfo sinfo;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *fromsa;
+ int fromlen;
+ int len, i, msg_flags;
+ int error = 0;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
+ if (error) {
+ return (error);
+ }
+ error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+ if (error) {
+ goto out1;
+ }
+
+ so = fp->f_data;
+#ifdef MAC
+ SOCK_LOCK(so);
+ error = mac_check_socket_receive(td->td_ucred, so);
+ SOCK_UNLOCK(so);
+ if (error) {
+ goto out;
+ return (error);
+ }
+#endif /* MAC */
+
+ if (uap->fromlenaddr) {
+ error = copyin(uap->fromlenaddr,
+ &fromlen, sizeof (fromlen));
+ if (error) {
+ goto out;
+ }
+ } else {
+ fromlen = 0;
+ }
+ if(uap->msg_flags) {
+ error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
+ if (error) {
+ goto out;
+ }
+ } else {
+ msg_flags = 0;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ tiov = iov;
+ for (i = 0; i <uap->iovlen; i++, tiov++) {
+ if ((auio.uio_resid += tiov->iov_len) < 0) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+ len = auio.uio_resid;
+ fromsa = (struct sockaddr *)sockbufstore;
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif /* KTRACE */
+ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
+ fromsa, fromlen, &msg_flags,
+ (struct sctp_sndrcvinfo *)&sinfo, 1);
+ if (error) {
+ if (auio.uio_resid != (int)len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ } else {
+ if (uap->sinfo)
+ error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
+ }
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = (int)len - auio.uio_resid;
+ ktrgenio(uap->sd, UIO_READ, ktruio, error);
+ }
+#endif /* KTRACE */
+ if (error)
+ goto out;
+ td->td_retval[0] = (int)len - auio.uio_resid;
+
+ if (fromlen && uap->from) {
+ len = fromlen;
+ if (len <= 0 || fromsa == 0)
+ len = 0;
+ else {
+ len = MIN(len, fromsa->sa_len);
+ error = copyout(fromsa, uap->from, (unsigned)len);
+ if (error)
+ goto out;
+ }
+ error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
+ if (error) {
+ goto out;
+ }
+ }
+ if (uap->msg_flags) {
+ error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
+ if (error) {
+ goto out;
+ }
+ }
+out:
+ free(iov, M_IOV);
+out1:
+ if (fp)
+ fdrop(fp, td);
+
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
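
For reference, a minimal userland sketch of the sendfile(2) interface that the
comment block earlier in this file documents (headers/trailers passed through
struct sf_hdtr, total bytes reported via *sbytes, and EAGAIN with partial
progress on non-blocking sockets). It assumes the usual FreeBSD userland
headers; the helper name and the HTTP status line are illustrative assumptions
and are not taken from this commit.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <errno.h>

static int
send_with_header(int fd, int s, off_t nbytes)
{
	static char hdr[] = "HTTP/1.0 200 OK\r\n\r\n";
	struct iovec hdr_iov;
	struct sf_hdtr hdtr;
	off_t sbytes = 0;

	hdr_iov.iov_base = hdr;
	hdr_iov.iov_len = sizeof(hdr) - 1;

	hdtr.headers = &hdr_iov;	/* copied in via copyinuio() */
	hdtr.hdr_cnt = 1;
	hdtr.trailers = NULL;		/* trailers would go out via kern_writev() */
	hdtr.trl_cnt = 0;

	/* nbytes == 0 means "send until EOF". */
	if (sendfile(fd, s, 0, (size_t)nbytes, &hdtr, &sbytes, 0) == -1) {
		/* A non-blocking socket gets EAGAIN, but *sbytes still counts. */
		if (errno != EAGAIN)
			return (-1);
	}
	return ((int)sbytes);
}

The SF_MNOWAIT flag introduced by this change is marked experimental in the
code above, so the sketch does not use it.
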
Index: subr_turnstile.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_turnstile.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/subr_turnstile.c -L sys/kern/subr_turnstile.c -u -r1.1.1.2 -r1.2
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -46,27 +46,28 @@
* chain. Each chain contains a spin mutex that protects all of the
* turnstiles in the chain.
*
- * Each time a thread is created, a turnstile is malloc'd and attached to
- * that thread. When a thread blocks on a lock, if it is the first thread
- * to block, it lends its turnstile to the lock. If the lock already has
- * a turnstile, then it gives its turnstile to the lock's turnstile's free
- * list. When a thread is woken up, it takes a turnstile from the free list
- * if there are any other waiters. If it is the only thread blocked on the
- * lock, then it reclaims the turnstile associated with the lock and removes
- * it from the hash table.
+ * Each time a thread is created, a turnstile is allocated from a UMA zone
+ * and attached to that thread. When a thread blocks on a lock, if it is the
+ * first thread to block, it lends its turnstile to the lock. If the lock
+ * already has a turnstile, then it gives its turnstile to the lock's
+ * turnstile's free list. When a thread is woken up, it takes a turnstile from
+ * the free list if there are any other waiters. If it is the only thread
+ * blocked on the lock, then it reclaims the turnstile associated with the lock
+ * and removes it from the hash table.
*/
-#include "opt_turnstile_profiling.h"
-
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_turnstile.c,v 1.152.2.1 2005/10/09 03:25:37 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_turnstile.c,v 1.169 2007/06/12 23:27:31 jeff Exp $");
+
+#include "opt_ddb.h"
+#include "opt_turnstile_profiling.h"
+#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
-#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
@@ -74,6 +75,15 @@
#include <sys/sysctl.h>
#include <sys/turnstile.h>
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+#include <sys/lockmgr.h>
+#include <sys/sx.h>
+#endif
+
/*
* Constants for the hash table of turnstile chains. TC_SHIFT is a magic
* number chosen because the sleep queue's use the same value for the
@@ -95,8 +105,9 @@
* when it is attached to a lock. The second list to use ts_hash is the
* free list hung off of a turnstile that is attached to a lock.
*
- * Each turnstile contains two lists of threads. The ts_blocked list is
- * a linked list of threads blocked on the turnstile's lock. The
+ * Each turnstile contains three lists of threads. The two ts_blocked lists
+ * are linked lists of threads blocked on the turnstile's lock. One list is
+ * for exclusive waiters, and the other is for shared waiters. The
* ts_pending list is a linked list of threads previously awakened by
* turnstile_signal() or turnstile_wait() that are waiting to be put on
* the run queue.
@@ -106,8 +117,9 @@
* q - td_contested lock
*/
struct turnstile {
- TAILQ_HEAD(, thread) ts_blocked; /* (c + q) Blocked threads. */
- TAILQ_HEAD(, thread) ts_pending; /* (c) Pending threads. */
+ struct mtx ts_lock; /* Spin lock for self. */
+ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */
+ struct threadqueue ts_pending; /* (c) Pending threads. */
LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */
LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */
LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */
@@ -134,8 +146,7 @@
#endif
static struct mtx td_contested_lock;
static struct turnstile_chain turnstile_chains[TC_TABLESIZE];
-
-static MALLOC_DEFINE(M_TURNSTILE, "turnstiles", "turnstiles");
+static uma_zone_t turnstile_zone;
/*
* Prototypes for non-exported routines.
@@ -147,7 +158,13 @@
static void propagate_priority(struct thread *td);
static int turnstile_adjust_thread(struct turnstile *ts,
struct thread *td);
+static struct thread *turnstile_first_waiter(struct turnstile *ts);
static void turnstile_setowner(struct turnstile *ts, struct thread *owner);
+#ifdef INVARIANTS
+static void turnstile_dtor(void *mem, int size, void *arg);
+#endif
+static int turnstile_init(void *mem, int size, int flags);
+static void turnstile_fini(void *mem, int size);
/*
* Walks the chain of turnstiles and their owners to propagate the priority
@@ -157,45 +174,62 @@
static void
propagate_priority(struct thread *td)
{
- struct turnstile_chain *tc;
struct turnstile *ts;
int pri;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
pri = td->td_priority;
ts = td->td_blocked;
+ MPASS(td->td_lock == &ts->ts_lock);
+ /*
+ * Grab a recursive lock on this turnstile chain so it stays locked
+ * for the whole operation. The caller expects us to return with
+ * the original lock held. We only ever lock down the chain so
+ * the lock order is constant.
+ */
+ mtx_lock_spin(&ts->ts_lock);
for (;;) {
td = ts->ts_owner;
if (td == NULL) {
/*
- * This really isn't quite right. Really
- * ought to bump priority of thread that
- * next acquires the lock.
+ * This might be a read lock with no owner. There's
+ * not much we can do, so just bail.
*/
+ mtx_unlock_spin(&ts->ts_lock);
return;
}
+ thread_lock_flags(td, MTX_DUPOK);
+ mtx_unlock_spin(&ts->ts_lock);
MPASS(td->td_proc != NULL);
MPASS(td->td_proc->p_magic == P_MAGIC);
/*
- * XXX: The owner of a turnstile can be stale if it is the
- * first thread to grab a slock of a sx lock. In that case
- * it is possible for us to be at SSLEEP or some other
- * weird state. We should probably just return if the state
- * isn't SRUN or SLOCK.
+ * If the thread is asleep, then we are probably about
+ * to deadlock. To make debugging this easier, just
+ * panic and tell the user which thread misbehaved so
+ * they can hopefully get a stack trace from the truly
+ * misbehaving thread.
*/
- KASSERT(!TD_IS_SLEEPING(td),
- ("sleeping thread (tid %d) owns a non-sleepable lock",
- td->td_tid));
+ if (TD_IS_SLEEPING(td)) {
+ printf(
+ "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n",
+ td->td_tid, td->td_proc->p_pid);
+#ifdef DDB
+ db_trace_thread(td, -1);
+#endif
+ panic("sleeping thread");
+ }
/*
* If this thread already has higher priority than the
* thread that is being blocked, we are finished.
*/
- if (td->td_priority <= pri)
+ if (td->td_priority <= pri) {
+ thread_unlock(td);
return;
+ }
/*
* Bump this thread's priority.
@@ -208,6 +242,7 @@
*/
if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) {
MPASS(td->td_blocked == NULL);
+ thread_unlock(td);
return;
}
@@ -232,15 +267,13 @@
*/
ts = td->td_blocked;
MPASS(ts != NULL);
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_lock_spin(&tc->tc_lock);
-
+ MPASS(td->td_lock == &ts->ts_lock);
/* Resort td on the list if needed. */
if (!turnstile_adjust_thread(ts, td)) {
- mtx_unlock_spin(&tc->tc_lock);
+ mtx_unlock_spin(&ts->ts_lock);
return;
}
- mtx_unlock_spin(&tc->tc_lock);
+ /* The thread lock is released as ts lock above. */
}
}
@@ -251,16 +284,16 @@
static int
turnstile_adjust_thread(struct turnstile *ts, struct thread *td)
{
- struct turnstile_chain *tc;
struct thread *td1, *td2;
+ int queue;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
MPASS(TD_ON_LOCK(td));
/*
* This thread may not be blocked on this turnstile anymore
* but instead might already be woken up on another CPU
- * that is waiting on sched_lock in turnstile_unpend() to
+ * that is waiting on the thread lock in turnstile_unpend() to
* finish waking this thread up. We can detect this case
* by checking to see if this thread has been given a
* turnstile by either turnstile_signal() or
@@ -275,8 +308,7 @@
* It needs to be moved if either its priority is lower than
* the previous thread or higher than the next thread.
*/
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(td->td_lock == &ts->ts_lock);
td1 = TAILQ_PREV(td, threadqueue, td_lockq);
td2 = TAILQ_NEXT(td, td_lockq);
if ((td1 != NULL && td->td_priority < td1->td_priority) ||
@@ -286,16 +318,18 @@
* Remove thread from blocked chain and determine where
* it should be moved to.
*/
+ queue = td->td_tsqueue;
+ MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE);
mtx_lock_spin(&td_contested_lock);
- TAILQ_REMOVE(&ts->ts_blocked, td, td_lockq);
- TAILQ_FOREACH(td1, &ts->ts_blocked, td_lockq) {
+ TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
+ TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) {
MPASS(td1->td_proc->p_magic == P_MAGIC);
if (td1->td_priority > td->td_priority)
break;
}
if (td1 == NULL)
- TAILQ_INSERT_TAIL(&ts->ts_blocked, td, td_lockq);
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
else
TAILQ_INSERT_BEFORE(td1, td, td_lockq);
mtx_unlock_spin(&td_contested_lock);
@@ -328,6 +362,7 @@
NULL, MTX_SPIN);
}
mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN);
+ LIST_INIT(&thread0.td_contested);
thread0.td_turnstile = NULL;
}
@@ -360,6 +395,13 @@
init_turnstile0(void *dummy)
{
+ turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile),
+#ifdef INVARIANTS
+ NULL, turnstile_dtor, turnstile_init, turnstile_fini,
+ UMA_ALIGN_CACHE, 0);
+#else
+ NULL, NULL, turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, 0);
+#endif
thread0.td_turnstile = turnstile_alloc();
}
SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL);
@@ -371,10 +413,8 @@
void
turnstile_adjust(struct thread *td, u_char oldpri)
{
- struct turnstile_chain *tc;
struct turnstile *ts;
- mtx_assert(&sched_lock, MA_OWNED);
MPASS(TD_ON_LOCK(td));
/*
@@ -382,26 +422,24 @@
*/
ts = td->td_blocked;
MPASS(ts != NULL);
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_lock_spin(&tc->tc_lock);
+ MPASS(td->td_lock == &ts->ts_lock);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
/* Resort the turnstile on the list. */
- if (!turnstile_adjust_thread(ts, td)) {
- mtx_unlock_spin(&tc->tc_lock);
+ if (!turnstile_adjust_thread(ts, td))
return;
- }
-
/*
* If our priority was lowered and we are at the head of the
* turnstile, then propagate our new priority up the chain.
* Note that we currently don't try to revoke lent priorities
* when our priority goes up.
*/
- if (td == TAILQ_FIRST(&ts->ts_blocked) && td->td_priority < oldpri) {
- mtx_unlock_spin(&tc->tc_lock);
+ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE ||
+ td->td_tsqueue == TS_SHARED_QUEUE);
+ if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) &&
+ td->td_priority < oldpri) {
propagate_priority(td);
- } else
- mtx_unlock_spin(&tc->tc_lock);
+ }
}
/*
@@ -412,25 +450,68 @@
{
mtx_assert(&td_contested_lock, MA_OWNED);
- MPASS(owner->td_proc->p_magic == P_MAGIC);
MPASS(ts->ts_owner == NULL);
+
+ /* A shared lock might not have an owner. */
+ if (owner == NULL)
+ return;
+
+ MPASS(owner->td_proc->p_magic == P_MAGIC);
ts->ts_owner = owner;
LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link);
}
+#ifdef INVARIANTS
/*
- * Malloc a turnstile for a new thread, initialize it and return it.
+ * UMA zone item deallocator.
*/
-struct turnstile *
-turnstile_alloc(void)
+static void
+turnstile_dtor(void *mem, int size, void *arg)
{
struct turnstile *ts;
- ts = malloc(sizeof(struct turnstile), M_TURNSTILE, M_WAITOK | M_ZERO);
- TAILQ_INIT(&ts->ts_blocked);
+ ts = mem;
+ MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]));
+ MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
+ MPASS(TAILQ_EMPTY(&ts->ts_pending));
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+turnstile_init(void *mem, int size, int flags)
+{
+ struct turnstile *ts;
+
+ bzero(mem, size);
+ ts = mem;
+ TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+ TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]);
TAILQ_INIT(&ts->ts_pending);
LIST_INIT(&ts->ts_free);
- return (ts);
+ mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE);
+ return (0);
+}
+
+static void
+turnstile_fini(void *mem, int size)
+{
+ struct turnstile *ts;
+
+ ts = mem;
+ mtx_destroy(&ts->ts_lock);
+}
+
+/*
+ * Get a turnstile for a new thread.
+ */
+struct turnstile *
+turnstile_alloc(void)
+{
+
+ return (uma_zalloc(turnstile_zone, M_WAITOK));
}
/*
@@ -440,22 +521,58 @@
turnstile_free(struct turnstile *ts)
{
- MPASS(ts != NULL);
- MPASS(TAILQ_EMPTY(&ts->ts_blocked));
- MPASS(TAILQ_EMPTY(&ts->ts_pending));
- free(ts, M_TURNSTILE);
+ uma_zfree(turnstile_zone, ts);
}
/*
* Lock the turnstile chain associated with the specified lock.
*/
void
-turnstile_lock(struct lock_object *lock)
+turnstile_chain_lock(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+
+ tc = TC_LOOKUP(lock);
+ mtx_lock_spin(&tc->tc_lock);
+}
+
+struct turnstile *
+turnstile_trywait(struct lock_object *lock)
{
struct turnstile_chain *tc;
+ struct turnstile *ts;
tc = TC_LOOKUP(lock);
mtx_lock_spin(&tc->tc_lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock) {
+ mtx_lock_spin(&ts->ts_lock);
+ return (ts);
+ }
+
+ ts = curthread->td_turnstile;
+ MPASS(ts != NULL);
+ mtx_lock_spin(&ts->ts_lock);
+ KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
+ ts->ts_lockobj = lock;
+
+ return (ts);
+}
+
+void
+turnstile_cancel(struct turnstile *ts)
+{
+ struct turnstile_chain *tc;
+ struct lock_object *lock;
+
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+
+ mtx_unlock_spin(&ts->ts_lock);
+ lock = ts->ts_lockobj;
+ if (ts == curthread->td_turnstile)
+ ts->ts_lockobj = NULL;
+ tc = TC_LOOKUP(lock);
+ mtx_unlock_spin(&tc->tc_lock);
}
/*
@@ -472,8 +589,10 @@
tc = TC_LOOKUP(lock);
mtx_assert(&tc->tc_lock, MA_OWNED);
LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
- if (ts->ts_lockobj == lock)
+ if (ts->ts_lockobj == lock) {
+ mtx_lock_spin(&ts->ts_lock);
return (ts);
+ }
return (NULL);
}
@@ -481,7 +600,7 @@
* Unlock the turnstile chain associated with a given lock.
*/
void
-turnstile_release(struct lock_object *lock)
+turnstile_chain_unlock(struct lock_object *lock)
{
struct turnstile_chain *tc;
@@ -490,38 +609,54 @@
}
/*
+ * Return a pointer to the thread waiting on this turnstile with the
+ * most important priority or NULL if the turnstile has no waiters.
+ */
+static struct thread *
+turnstile_first_waiter(struct turnstile *ts)
+{
+ struct thread *std, *xtd;
+
+ std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+ if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority))
+ return (std);
+ return (xtd);
+}
+
+/*
* Take ownership of a turnstile and adjust the priority of the new
* owner appropriately.
*/
void
-turnstile_claim(struct lock_object *lock)
+turnstile_claim(struct turnstile *ts)
{
- struct turnstile_chain *tc;
- struct turnstile *ts;
struct thread *td, *owner;
+ struct turnstile_chain *tc;
- tc = TC_LOOKUP(lock);
- mtx_assert(&tc->tc_lock, MA_OWNED);
- ts = turnstile_lookup(lock);
- MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts != curthread->td_turnstile);
owner = curthread;
mtx_lock_spin(&td_contested_lock);
turnstile_setowner(ts, owner);
mtx_unlock_spin(&td_contested_lock);
- td = TAILQ_FIRST(&ts->ts_blocked);
+ td = turnstile_first_waiter(ts);
MPASS(td != NULL);
MPASS(td->td_proc->p_magic == P_MAGIC);
- mtx_unlock_spin(&tc->tc_lock);
+ MPASS(td->td_lock == &ts->ts_lock);
/*
* Update the priority of the new owner if needed.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(owner);
if (td->td_priority < owner->td_priority)
sched_lend_prio(owner, td->td_priority);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(owner);
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_unlock_spin(&ts->ts_lock);
+ mtx_unlock_spin(&tc->tc_lock);
}
/*
@@ -531,28 +666,28 @@
* turnstile chain locked and will return with it unlocked.
*/
void
-turnstile_wait(struct lock_object *lock, struct thread *owner)
+turnstile_wait(struct turnstile *ts, struct thread *owner, int queue)
{
struct turnstile_chain *tc;
- struct turnstile *ts;
struct thread *td, *td1;
+ struct lock_object *lock;
td = curthread;
- tc = TC_LOOKUP(lock);
- mtx_assert(&tc->tc_lock, MA_OWNED);
- MPASS(td->td_turnstile != NULL);
- MPASS(owner != NULL);
- MPASS(owner->td_proc->p_magic == P_MAGIC);
-
- /* Look up the turnstile associated with the lock 'lock'. */
- ts = turnstile_lookup(lock);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ if (queue == TS_SHARED_QUEUE)
+ MPASS(owner != NULL);
+ if (owner)
+ MPASS(owner->td_proc->p_magic == P_MAGIC);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
/*
* If the lock does not already have a turnstile, use this thread's
* turnstile. Otherwise insert the current thread into the
* turnstile already in use by this lock.
*/
- if (ts == NULL) {
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ if (ts == td->td_turnstile) {
+ mtx_assert(&tc->tc_lock, MA_OWNED);
#ifdef TURNSTILE_PROFILING
tc->tc_depth++;
if (tc->tc_depth > tc->tc_max_depth) {
@@ -561,83 +696,60 @@
turnstile_max_depth = tc->tc_max_depth;
}
#endif
- ts = td->td_turnstile;
+ tc = TC_LOOKUP(ts->ts_lockobj);
LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash);
KASSERT(TAILQ_EMPTY(&ts->ts_pending),
("thread's turnstile has pending threads"));
- KASSERT(TAILQ_EMPTY(&ts->ts_blocked),
- ("thread's turnstile has a non-empty queue"));
+ KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]),
+ ("thread's turnstile has exclusive waiters"));
+ KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]),
+ ("thread's turnstile has shared waiters"));
KASSERT(LIST_EMPTY(&ts->ts_free),
("thread's turnstile has a non-empty free list"));
- KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
- ts->ts_lockobj = lock;
+ MPASS(ts->ts_lockobj != NULL);
mtx_lock_spin(&td_contested_lock);
- TAILQ_INSERT_TAIL(&ts->ts_blocked, td, td_lockq);
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
turnstile_setowner(ts, owner);
mtx_unlock_spin(&td_contested_lock);
} else {
- TAILQ_FOREACH(td1, &ts->ts_blocked, td_lockq)
+ TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq)
if (td1->td_priority > td->td_priority)
break;
mtx_lock_spin(&td_contested_lock);
if (td1 != NULL)
TAILQ_INSERT_BEFORE(td1, td, td_lockq);
else
- TAILQ_INSERT_TAIL(&ts->ts_blocked, td, td_lockq);
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ MPASS(owner == ts->ts_owner);
mtx_unlock_spin(&td_contested_lock);
MPASS(td->td_turnstile != NULL);
LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash);
- MPASS(owner == ts->ts_owner);
}
+ thread_lock(td);
+ thread_lock_set(td, &ts->ts_lock);
td->td_turnstile = NULL;
- mtx_unlock_spin(&tc->tc_lock);
-
- mtx_lock_spin(&sched_lock);
- /*
- * Handle race condition where a thread on another CPU that owns
- * lock 'lock' could have woken us in between us dropping the
- * turnstile chain lock and acquiring the sched_lock.
- */
- if (td->td_flags & TDF_TSNOBLOCK) {
- td->td_flags &= ~TDF_TSNOBLOCK;
- mtx_unlock_spin(&sched_lock);
- return;
- }
-
-#ifdef notyet
- /*
- * If we're borrowing an interrupted thread's VM context, we
- * must clean up before going to sleep.
- */
- if (td->td_ithd != NULL) {
- struct ithd *it = td->td_ithd;
-
- if (it->it_interrupted) {
- if (LOCK_LOG_TEST(lock, 0))
- CTR3(KTR_LOCK, "%s: %p interrupted %p",
- __func__, it, it->it_interrupted);
- intr_thd_fixup(it);
- }
- }
-#endif
/* Save who we are blocked on and switch. */
+ lock = ts->ts_lockobj;
+ td->td_tsqueue = queue;
td->td_blocked = ts;
td->td_lockname = lock->lo_name;
TD_SET_LOCK(td);
+ mtx_unlock_spin(&tc->tc_lock);
propagate_priority(td);
if (LOCK_LOG_TEST(lock, 0))
CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
td->td_tid, lock, lock->lo_name);
+ MPASS(td->td_lock == &ts->ts_lock);
+ SCHED_STAT_INC(switch_turnstile);
mi_switch(SW_VOL, NULL);
if (LOCK_LOG_TEST(lock, 0))
CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s",
__func__, td->td_tid, lock, lock->lo_name);
-
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
/*
@@ -645,26 +757,27 @@
* pending list. This must be called with the turnstile chain locked.
*/
int
-turnstile_signal(struct turnstile *ts)
+turnstile_signal(struct turnstile *ts, int queue)
{
struct turnstile_chain *tc;
struct thread *td;
int empty;
MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
MPASS(curthread->td_proc->p_magic == P_MAGIC);
- MPASS(ts->ts_owner == curthread);
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread ||
+ (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL));
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
/*
* Pick the highest priority thread blocked on this lock and
* move it to the pending list.
*/
- td = TAILQ_FIRST(&ts->ts_blocked);
+ td = TAILQ_FIRST(&ts->ts_blocked[queue]);
MPASS(td->td_proc->p_magic == P_MAGIC);
mtx_lock_spin(&td_contested_lock);
- TAILQ_REMOVE(&ts->ts_blocked, td, td_lockq);
+ TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
mtx_unlock_spin(&td_contested_lock);
TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq);
@@ -673,8 +786,11 @@
* give it to the about-to-be-woken thread. Otherwise take a
* turnstile from the free list and give it to the thread.
*/
- empty = TAILQ_EMPTY(&ts->ts_blocked);
+ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]);
if (empty) {
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
MPASS(LIST_EMPTY(&ts->ts_free));
#ifdef TURNSTILE_PROFILING
tc->tc_depth--;
@@ -693,28 +809,35 @@
* the turnstile chain locked.
*/
void
-turnstile_broadcast(struct turnstile *ts)
+turnstile_broadcast(struct turnstile *ts, int queue)
{
struct turnstile_chain *tc;
struct turnstile *ts1;
struct thread *td;
MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
MPASS(curthread->td_proc->p_magic == P_MAGIC);
- MPASS(ts->ts_owner == curthread);
+ MPASS(ts->ts_owner == curthread ||
+ (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL));
+ /*
+ * We must have the chain locked so that we can remove the empty
+ * turnstile from the hash queue.
+ */
tc = TC_LOOKUP(ts->ts_lockobj);
mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
/*
* Transfer the blocked list to the pending list.
*/
mtx_lock_spin(&td_contested_lock);
- TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked, td_lockq);
+ TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq);
mtx_unlock_spin(&td_contested_lock);
/*
* Give a turnstile to each thread. The last thread gets
- * this turnstile.
+ * this turnstile if the turnstile is empty.
*/
TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) {
if (LIST_EMPTY(&ts->ts_free)) {
@@ -737,17 +860,17 @@
* chain locked.
*/
void
-turnstile_unpend(struct turnstile *ts)
+turnstile_unpend(struct turnstile *ts, int owner_type)
{
TAILQ_HEAD( ,thread) pending_threads;
- struct turnstile_chain *tc;
+ struct turnstile *nts;
struct thread *td;
u_char cp, pri;
MPASS(ts != NULL);
- MPASS(ts->ts_owner == curthread);
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_assert(&tc->tc_lock, MA_OWNED);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread ||
+ (owner_type == TS_SHARED_LOCK && ts->ts_owner == NULL));
MPASS(!TAILQ_EMPTY(&ts->ts_pending));
/*
@@ -757,9 +880,81 @@
TAILQ_INIT(&pending_threads);
TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq);
#ifdef INVARIANTS
- if (TAILQ_EMPTY(&ts->ts_blocked))
+ if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]))
ts->ts_lockobj = NULL;
#endif
+ /*
+ * Adjust the priority of curthread based on other contested
+ * locks it owns. Don't lower the priority below the base
+ * priority however.
+ */
+ td = curthread;
+ pri = PRI_MAX;
+ thread_lock(td);
+ mtx_lock_spin(&td_contested_lock);
+ /*
+ * Remove the turnstile from this thread's list of contested locks
+ * since this thread doesn't own it anymore. New threads will
+ * not be blocking on the turnstile until it is claimed by a new
+ * owner. There might not be a current owner if this is a shared
+ * lock.
+ */
+ if (ts->ts_owner != NULL) {
+ ts->ts_owner = NULL;
+ LIST_REMOVE(ts, ts_link);
+ }
+ LIST_FOREACH(nts, &td->td_contested, ts_link) {
+ cp = turnstile_first_waiter(nts)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+ mtx_unlock_spin(&td_contested_lock);
+ sched_unlend_prio(td, pri);
+ thread_unlock(td);
+ /*
+ * Wake up all the pending threads. If a thread is not blocked
+ * on a lock, then it is currently executing on another CPU in
+ * turnstile_wait() or sitting on a run queue waiting to resume
+ * in turnstile_wait(). Set a flag to force it to try to acquire
+ * the lock again instead of blocking.
+ */
+ while (!TAILQ_EMPTY(&pending_threads)) {
+ td = TAILQ_FIRST(&pending_threads);
+ TAILQ_REMOVE(&pending_threads, td, td_lockq);
+ thread_lock(td);
+ MPASS(td->td_lock == &ts->ts_lock);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ MPASS(TD_ON_LOCK(td));
+ TD_CLR_LOCK(td);
+ MPASS(TD_CAN_RUN(td));
+ td->td_blocked = NULL;
+ td->td_lockname = NULL;
+#ifdef INVARIANTS
+ td->td_tsqueue = 0xff;
+#endif
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+ }
+ mtx_unlock_spin(&ts->ts_lock);
+}
+
+/*
+ * Give up ownership of a turnstile. This must be called with the
+ * turnstile chain locked.
+ */
+void
+turnstile_disown(struct turnstile *ts)
+{
+ struct thread *td;
+ u_char cp, pri;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread);
+ MPASS(TAILQ_EMPTY(&ts->ts_pending));
+ MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) ||
+ !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
/*
* Remove the turnstile from this thread's list of contested locks
@@ -771,8 +966,6 @@
ts->ts_owner = NULL;
LIST_REMOVE(ts, ts_link);
mtx_unlock_spin(&td_contested_lock);
- critical_enter();
- mtx_unlock_spin(&tc->tc_lock);
/*
* Adjust the priority of curthread based on other contested
@@ -781,70 +974,330 @@
*/
td = curthread;
pri = PRI_MAX;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
+ mtx_unlock_spin(&ts->ts_lock);
mtx_lock_spin(&td_contested_lock);
LIST_FOREACH(ts, &td->td_contested, ts_link) {
- cp = TAILQ_FIRST(&ts->ts_blocked)->td_priority;
+ cp = turnstile_first_waiter(ts)->td_priority;
if (cp < pri)
pri = cp;
}
mtx_unlock_spin(&td_contested_lock);
sched_unlend_prio(td, pri);
-
- /*
- * Wake up all the pending threads. If a thread is not blocked
- * on a lock, then it is currently executing on another CPU in
- * turnstile_wait() or sitting on a run queue waiting to resume
- * in turnstile_wait(). Set a flag to force it to try to acquire
- * the lock again instead of blocking.
- */
- while (!TAILQ_EMPTY(&pending_threads)) {
- td = TAILQ_FIRST(&pending_threads);
- TAILQ_REMOVE(&pending_threads, td, td_lockq);
- MPASS(td->td_proc->p_magic == P_MAGIC);
- if (TD_ON_LOCK(td)) {
- td->td_blocked = NULL;
- td->td_lockname = NULL;
- TD_CLR_LOCK(td);
- MPASS(TD_CAN_RUN(td));
- setrunqueue(td, SRQ_BORING);
- } else {
- td->td_flags |= TDF_TSNOBLOCK;
- MPASS(TD_IS_RUNNING(td) || TD_ON_RUNQ(td));
- }
- }
- critical_exit();
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
/*
* Return the first thread in a turnstile.
*/
struct thread *
-turnstile_head(struct turnstile *ts)
+turnstile_head(struct turnstile *ts, int queue)
{
#ifdef INVARIANTS
- struct turnstile_chain *tc;
MPASS(ts != NULL);
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
#endif
- return (TAILQ_FIRST(&ts->ts_blocked));
+ return (TAILQ_FIRST(&ts->ts_blocked[queue]));
}
/*
- * Returns true if a turnstile is empty.
+ * Returns true if a sub-queue of a turnstile is empty.
*/
int
-turnstile_empty(struct turnstile *ts)
+turnstile_empty(struct turnstile *ts, int queue)
{
#ifdef INVARIANTS
- struct turnstile_chain *tc;
MPASS(ts != NULL);
- tc = TC_LOOKUP(ts->ts_lockobj);
- mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
#endif
- return (TAILQ_EMPTY(&ts->ts_blocked));
+ return (TAILQ_EMPTY(&ts->ts_blocked[queue]));
}
+
+#ifdef DDB
+static void
+print_thread(struct thread *td, const char *prefix)
+{
+
+ db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_proc->p_comm);
+}
+
+static void
+print_queue(struct threadqueue *queue, const char *header, const char *prefix)
+{
+ struct thread *td;
+
+ db_printf("%s:\n", header);
+ if (TAILQ_EMPTY(queue)) {
+ db_printf("%sempty\n", prefix);
+ return;
+ }
+ TAILQ_FOREACH(td, queue, td_lockq) {
+ print_thread(td, prefix);
+ }
+}
+
+DB_SHOW_COMMAND(turnstile, db_show_turnstile)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+ struct lock_object *lock;
+ int i;
+
+ if (!have_addr)
+ return;
+
+ /*
+ * First, see if there is an active turnstile for the lock indicated
+ * by the address.
+ */
+ lock = (struct lock_object *)addr;
+ tc = TC_LOOKUP(lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock)
+ goto found;
+
+ /*
+ * Second, see if there is an active turnstile at the address
+ * indicated.
+ */
+ for (i = 0; i < TC_TABLESIZE; i++)
+ LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) {
+ if (ts == (struct turnstile *)addr)
+ goto found;
+ }
+
+ db_printf("Unable to locate a turnstile via %p\n", (void *)addr);
+ return;
+found:
+ lock = ts->ts_lockobj;
+ db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name,
+ lock->lo_name);
+ if (ts->ts_owner)
+ print_thread(ts->ts_owner, "Lock Owner: ");
+ else
+ db_printf("Lock Owner: none\n");
+ print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t");
+ print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters",
+ "\t");
+ print_queue(&ts->ts_pending, "Pending Threads", "\t");
+
+}
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * non-sleepable and non-spin locks.
+ */
+static void
+print_lockchain(struct thread *td, const char *prefix)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct turnstile *ts;
+
+ /*
+ * Follow the chain. We keep walking as long as the thread is
+ * blocked on a turnstile that has an owner.
+ */
+ while (!db_pager_quit) {
+ db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_proc->p_comm);
+ switch (td->td_state) {
+ case TDS_INACTIVE:
+ db_printf("is inactive\n");
+ return;
+ case TDS_CAN_RUN:
+ db_printf("can run\n");
+ return;
+ case TDS_RUNQ:
+ db_printf("is on a run queue\n");
+ return;
+ case TDS_RUNNING:
+ db_printf("running on CPU %d\n", td->td_oncpu);
+ return;
+ case TDS_INHIBITED:
+ if (TD_ON_LOCK(td)) {
+ ts = td->td_blocked;
+ lock = ts->ts_lockobj;
+ class = LOCK_CLASS(lock);
+ db_printf("blocked on lock %p (%s) \"%s\"\n",
+ lock, class->lc_name, lock->lo_name);
+ if (ts->ts_owner == NULL)
+ return;
+ td = ts->ts_owner;
+ break;
+ }
+ db_printf("inhibited\n");
+ return;
+ default:
+ db_printf("??? (%#x)\n", td->td_state);
+ return;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(lockchain, db_show_lockchain)
+{
+ struct thread *td;
+
+ /* Figure out which thread to start with. */
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+
+ print_lockchain(td, "");
+}
+
+DB_SHOW_COMMAND(allchains, db_show_allchains)
+{
+ struct thread *td;
+ struct proc *p;
+ int i;
+
+ i = 1;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) {
+ db_printf("chain %d:\n", i++);
+ print_lockchain(td, " ");
+ }
+ if (db_pager_quit)
+ return;
+ }
+ }
+}
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * sleepable locks.
+ */
+static void
+print_sleepchain(struct thread *td, const char *prefix)
+{
+ struct thread *owner;
+
+ /*
+ * Follow the chain. We keep walking as long as the thread is
+ * blocked on a sleep lock that has an owner.
+ */
+ while (!db_pager_quit) {
+ db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_proc->p_comm);
+ switch (td->td_state) {
+ case TDS_INACTIVE:
+ db_printf("is inactive\n");
+ return;
+ case TDS_CAN_RUN:
+ db_printf("can run\n");
+ return;
+ case TDS_RUNQ:
+ db_printf("is on a run queue\n");
+ return;
+ case TDS_RUNNING:
+ db_printf("running on CPU %d\n", td->td_oncpu);
+ return;
+ case TDS_INHIBITED:
+ if (TD_ON_SLEEPQ(td)) {
+ if (lockmgr_chain(td, &owner) ||
+ sx_chain(td, &owner)) {
+ if (owner == NULL)
+ return;
+ td = owner;
+ break;
+ }
+ db_printf("sleeping on %p \"%s\"\n",
+ td->td_wchan, td->td_wmesg);
+ return;
+ }
+ db_printf("inhibited\n");
+ return;
+ default:
+ db_printf("??? (%#x)\n", td->td_state);
+ return;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(sleepchain, db_show_sleepchain)
+{
+ struct thread *td;
+
+ /* Figure out which thread to start with. */
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+
+ print_sleepchain(td, "");
+}
+
+static void print_waiters(struct turnstile *ts, int indent);
+
+static void
+print_waiter(struct thread *td, int indent)
+{
+ struct turnstile *ts;
+ int i;
+
+ if (db_pager_quit)
+ return;
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+ print_thread(td, "thread ");
+ LIST_FOREACH(ts, &td->td_contested, ts_link)
+ print_waiters(ts, indent + 1);
+}
+
+static void
+print_waiters(struct turnstile *ts, int indent)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct thread *td;
+ int i;
+
+ if (db_pager_quit)
+ return;
+ lock = ts->ts_lockobj;
+ class = LOCK_CLASS(lock);
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+ db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name);
+ TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq)
+ print_waiter(td, indent + 1);
+ TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq)
+ print_waiter(td, indent + 1);
+ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq)
+ print_waiter(td, indent + 1);
+}
+
+DB_SHOW_COMMAND(locktree, db_show_locktree)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ if (!have_addr)
+ return;
+ lock = (struct lock_object *)addr;
+ tc = TC_LOOKUP(lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock)
+ break;
+ if (ts == NULL) {
+ class = LOCK_CLASS(lock);
+ db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name,
+ lock->lo_name);
+ } else
+ print_waiters(ts, 0);
+}
+#endif
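
The turnstile rework above splits waiters into TS_SHARED_QUEUE and
TS_EXCLUSIVE_QUEUE and protects each turnstile with its own spin lock
(ts_lock), so lock code now passes an explicit queue to
turnstile_wait()/turnstile_signal()/turnstile_broadcast() and an owner
type to turnstile_unpend(); turnstile_first_waiter() picks the
highest-priority thread across both queues for priority propagation.
A minimal sketch of how a lock implementation might drive this
interface, assuming the turnstile_trywait()/turnstile_cancel()/
turnstile_chain_lock()/turnstile_lookup() helpers from the earlier part
of this change; my_lock_block(), my_lock_release_wakeup() and
lock_became_free() are made-up names:

static void
my_lock_block(struct lock_object *lo, struct thread *owner)
{
	struct turnstile *ts;

	ts = turnstile_trywait(lo);	/* lock the chain, stage a turnstile */
	if (lock_became_free(lo)) {	/* hypothetical re-check of the lock word */
		turnstile_cancel(ts);	/* drop the chain/turnstile locks */
		return;
	}
	/* Queue this thread as an exclusive waiter and switch away. */
	turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
}

static void
my_lock_release_wakeup(struct lock_object *lo)
{
	struct turnstile *ts;

	turnstile_chain_lock(lo);
	ts = turnstile_lookup(lo);
	if (ts != NULL) {
		/* Move the exclusive waiters to the pending list and wake them. */
		turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
	}
	turnstile_chain_unlock(lo);
}

The queue argument is what lets a single turnstile park both shared and
exclusive waiters for reader/writer-style locks while still lending the
owner the priority of the most important waiter.
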
Index: tty_pty.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_pty.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/tty_pty.c -L sys/kern/tty_pty.c -u -r1.2 -r1.3
--- sys/kern/tty_pty.c
+++ sys/kern/tty_pty.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_pty.c,v 1.137.2.2 2006/03/30 16:46:56 csjp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_pty.c,v 1.152.2.2.2.2 2008/01/28 12:47:56 kib Exp $");
/*
* Pseudo-teletype Driver
@@ -40,14 +40,14 @@
#include "opt_tty.h"
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
-#ifndef BURN_BRIDGES
-#if defined(COMPAT_43)
+#if defined(COMPAT_43TTY)
#include <sys/ioctl_compat.h>
#endif
-#endif
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/conf.h>
@@ -109,6 +109,7 @@
u_char pt_ucntl;
struct tty *pt_tty;
struct cdev *devs, *devc;
+ int pt_devs_open, pt_devc_open;
struct prison *pt_prison;
};
@@ -121,43 +122,80 @@
#define TSA_PTC_WRITE(tp) ((void *)&(tp)->t_rawq.c_cl)
#define TSA_PTS_READ(tp) ((void *)&(tp)->t_canq)
-static char *names = "pqrsPQRS";
+static const char names[] = "pqrsPQRSlmnoLMNO";
/*
* This function creates and initializes a pts/ptc pair
*
- * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
- * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
- *
- * XXX: define and add mapping of upper minor bits to allow more
- * than 256 ptys.
+ * pts == /dev/tty[pqrsPQRSlmnoLMNO][0123456789abcdefghijklmnopqrstuv]
+ * ptc == /dev/pty[pqrsPQRSlmnoLMNO][0123456789abcdefghijklmnopqrstuv]
*/
static struct cdev *
ptyinit(struct cdev *devc, struct thread *td)
{
- struct cdev *devs;
struct ptsc *pt;
int n;
- n = minor(devc);
- /* For now we only map the lower 8 bits of the minor */
- if (n & ~0xff)
+ n = minor2unit(minor(devc));
+
+ /* We only allow for up to 32 ptys per char in "names". */
+ if (n >= 32 * (sizeof(names) - 1))
return (NULL);
devc->si_flags &= ~SI_CHEAPCLONE;
+ /*
+ * Initially do not create a slave endpoint.
+ */
pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO);
- pt->devs = devs = make_dev_cred(&pts_cdevsw, n, td->td_ucred,
- UID_ROOT, GID_WHEEL, 0666, "tty%c%r", names[n / 32], n % 32);
pt->devc = devc;
- pt->pt_tty = ttymalloc(pt->pt_tty);
+ pt->pt_tty = ttyalloc();
pt->pt_tty->t_sc = pt;
- devs->si_drv1 = devc->si_drv1 = pt;
- devs->si_tty = devc->si_tty = pt->pt_tty;
- pt->pt_tty->t_dev = devs;
+ devc->si_drv1 = pt;
+ devc->si_tty = pt->pt_tty;
return (devc);
}
+static void
+pty_create_slave(struct ucred *cred, struct ptsc *pt, int m)
+{
+ int n;
+
+ n = minor2unit(m);
+ KASSERT(n >= 0 && n / 32 < sizeof(names),
+ ("pty_create_slave: n %d ptsc %p", n, pt));
+ pt->devs = make_dev_cred(&pts_cdevsw, m, cred, UID_ROOT, GID_WHEEL,
+ 0666, "tty%c%r", names[n / 32], n % 32);
+ pt->devs->si_drv1 = pt;
+ pt->devs->si_tty = pt->pt_tty;
+ pt->pt_tty->t_dev = pt->devs;
+}
+
+static void
+pty_destroy_slave(struct ptsc *pt)
+{
+
+ if (pt->pt_tty->t_refcnt > 1)
+ return;
+ pt->pt_tty->t_dev = NULL;
+ ttyrel(pt->pt_tty);
+ pt->pt_tty = NULL;
+ destroy_dev(pt->devs);
+ pt->devs = NULL;
+}
+
+static void
+pty_maybe_destroy_slave(struct ptsc *pt)
+{
+
+ /*
+	 * VFS bugs and complications near revoke() currently make it
+	 * impossible to destroy the struct cdev.
+ */
+ if (0 && pt->pt_devc_open == 0 && pt->pt_devs_open == 0)
+ pty_destroy_slave(pt);
+}
+
/*ARGSUSED*/
static int
ptsopen(struct cdev *dev, int flag, int devtype, struct thread *td)
@@ -170,11 +208,14 @@
return(ENXIO);
pt = dev->si_drv1;
tp = dev->si_tty;
+
if ((tp->t_state & TS_ISOPEN) == 0) {
ttyinitmode(tp, 1, 0);
- } else if (tp->t_state & TS_XCLUDE && suser(td))
+ } else if (tp->t_state & TS_XCLUDE && priv_check(td,
+ PRIV_TTY_EXCLUSIVE))
return (EBUSY);
- else if (pt->pt_prison != td->td_ucred->cr_prison && suser(td))
+ else if (pt->pt_prison != td->td_ucred->cr_prison &&
+ priv_check(td, PRIV_TTY_PRISON))
return (EBUSY);
if (tp->t_oproc) /* Ctrlr still around. */
(void)ttyld_modem(tp, 1);
@@ -187,20 +228,32 @@
return (error);
}
error = ttyld_open(tp, dev);
- if (error == 0)
+ if (error == 0) {
ptcwakeup(tp, FREAD|FWRITE);
+ pt->pt_devs_open = 1;
+ } else
+ pty_maybe_destroy_slave(pt);
return (error);
}
static int
ptsclose(struct cdev *dev, int flag, int mode, struct thread *td)
{
+ struct ptsc *pti;
struct tty *tp;
int err;
tp = dev->si_tty;
+ pti = dev->si_drv1;
+
+ KASSERT(dev == pti->devs, ("ptsclose: dev != pti->devs"));
+
err = ttyld_close(tp, flag);
(void) tty_close(tp);
+
+ pti->pt_devs_open = 0;
+ pty_maybe_destroy_slave(pti);
+
return (err);
}
@@ -275,7 +328,19 @@
ptyinit(dev, td);
if (!dev->si_drv1)
return(ENXIO);
+
+ pt = dev->si_drv1;
+ /*
+ * In case we have destroyed the struct tty at the last connect time,
+ * we need to recreate it.
+ */
+ if (pt->pt_tty == NULL) {
+ pt->pt_tty = ttyalloc();
+ pt->pt_tty->t_sc = pt;
+ dev->si_tty = pt->pt_tty;
+ }
tp = dev->si_tty;
+
if (tp->t_oproc)
return (EIO);
tp->t_timeout = -1;
@@ -283,17 +348,22 @@
tp->t_stop = ptsstop;
(void)ttyld_modem(tp, 1);
tp->t_lflag &= ~EXTPROC;
- pt = dev->si_drv1;
pt->pt_prison = td->td_ucred->cr_prison;
pt->pt_flags = 0;
pt->pt_send = 0;
pt->pt_ucntl = 0;
+
+ if (!pt->devs)
+ pty_create_slave(td->td_ucred, pt, minor(dev));
+ pt->pt_devc_open = 1;
+
return (0);
}
static int
ptcclose(struct cdev *dev, int flags, int fmt, struct thread *td)
{
+ struct ptsc *pti = dev->si_drv1;
struct tty *tp;
tp = dev->si_tty;
@@ -314,6 +384,8 @@
}
tp->t_oproc = 0; /* mark closed */
+ pti->pt_devc_open = 0;
+ pty_maybe_destroy_slave(pti);
return (0);
}
@@ -515,6 +587,10 @@
{
struct tty *tp = dev->si_tty;
struct ptsc *pt = dev->si_drv1;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ int ival;
+#endif
switch (cmd) {
@@ -553,12 +629,10 @@
return (EAGAIN);
switch (cmd) {
-#ifndef BURN_BRIDGES
-#ifdef COMPAT_43
+#ifdef COMPAT_43TTY
case TIOCSETP:
case TIOCSETN:
#endif
-#endif
case TIOCSETD:
case TIOCSETA:
case TIOCSETAW:
@@ -571,6 +645,13 @@
ndflush(&tp->t_outq, tp->t_outq.c_cc);
break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ case _IO('t', 95):
+ ival = IOCPARM_IVAL(data);
+ data = (caddr_t)&ival;
+ /* FALLTHROUGH */
+#endif
case TIOCSIG:
if (*(unsigned int *)data >= NSIG ||
*(unsigned int *)data == 0)
@@ -642,8 +723,7 @@
case TIOCSETA:
case TIOCSETAW:
case TIOCSETAF:
-#ifndef BURN_BRIDGES
-#ifdef COMPAT_43
+#ifdef COMPAT_43TTY
case TIOCSETP:
case TIOCSETN:
case TIOCSETC:
@@ -652,7 +732,6 @@
case TIOCLBIC:
case TIOCLSET:
#endif
-#endif
pt->pt_send |= TIOCPKT_IOCTL;
ptcwakeup(tp, FREAD);
break;
@@ -684,34 +763,27 @@
pty_clone(void *arg, struct ucred *cr, char *name, int namelen,
struct cdev **dev)
{
+ char *cp;
int u;
if (*dev != NULL)
return;
if (bcmp(name, "pty", 3) != 0)
return;
- if (name[5] != '\0')
+ if (name[5] != '\0' || name[3] == '\0')
return;
- switch (name[3]) {
- case 'p': u = 0; break;
- case 'q': u = 32; break;
- case 'r': u = 64; break;
- case 's': u = 96; break;
- case 'P': u = 128; break;
- case 'Q': u = 160; break;
- case 'R': u = 192; break;
- case 'S': u = 224; break;
- default: return;
- }
+ cp = index(names, name[3]);
+ if (cp == NULL)
+ return;
+ u = (cp - names) * 32;
if (name[4] >= '0' && name[4] <= '9')
u += name[4] - '0';
else if (name[4] >= 'a' && name[4] <= 'v')
u += name[4] - 'a' + 10;
else
return;
- *dev = make_dev_cred(&ptc_cdevsw, u, cr,
+ *dev = make_dev_credf(MAKEDEV_REF, &ptc_cdevsw, unit2minor(u), cr,
UID_ROOT, GID_WHEEL, 0666, "pty%c%r", names[u / 32], u % 32);
- dev_ref(*dev);
(*dev)->si_flags |= SI_CHEAPCLONE;
return;
}
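
The rewritten pty_clone() above replaces the hard-coded switch on the
letter with an index() lookup into the expanded names[] table: each of
the 16 letters selects a block of 32 units (the trailing character is
0-9 then a-v), giving 512 possible pty pairs instead of the former 256,
and the unit is then converted to a device minor with unit2minor().
A small standalone sketch of the same mapping (userland only, using
strchr() in place of the kernel's index(); pty_unit() is a made-up
helper name):

#include <stdio.h>
#include <string.h>

static const char names[] = "pqrsPQRSlmnoLMNO";

/* Map a clone name such as "ptyq3" to its pty unit number, or -1. */
static int
pty_unit(const char *name)
{
	const char *cp;
	int u;

	if (strncmp(name, "pty", 3) != 0 || strlen(name) != 5)
		return (-1);
	cp = strchr(names, name[3]);
	if (cp == NULL)
		return (-1);
	u = (int)(cp - names) * 32;
	if (name[4] >= '0' && name[4] <= '9')
		return (u + name[4] - '0');
	if (name[4] >= 'a' && name[4] <= 'v')
		return (u + name[4] - 'a' + 10);
	return (-1);
}

int
main(void)
{
	printf("ptyp0 -> %d\n", pty_unit("ptyp0"));	/* 0 */
	printf("ptyq3 -> %d\n", pty_unit("ptyq3"));	/* 35 */
	printf("ptyl0 -> %d\n", pty_unit("ptyl0"));	/* 256: first of the new ranges */
	return (0);
}
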
--- /dev/null
+++ sys/kern/systrace_args.c
@@ -0,0 +1,2878 @@
+/*
+ * System call argument to DTrace register array conversion.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD: src/sys/kern/systrace_args.c,v 1.14 2007/08/16 05:32:26 davidxu Exp $
+ * This file is part of the DTrace syscall provider.
+ */
+
+static void
+systrace_args(int sysnum, void *params, u_int64_t *uarg, int *n_args)
+{
+ int64_t *iarg = (int64_t *) uarg;
+ switch (sysnum) {
+ /* nosys */
+ case 0: {
+ *n_args = 0;
+ break;
+ }
+ /* sys_exit */
+ case 1: {
+ struct sys_exit_args *p = params;
+ iarg[0] = p->rval; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* fork */
+ case 2: {
+ *n_args = 0;
+ break;
+ }
+ /* read */
+ case 3: {
+ struct read_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* write */
+ case 4: {
+ struct write_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* open */
+ case 5: {
+ struct open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* close */
+ case 6: {
+ struct close_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* wait4 */
+ case 7: {
+ struct wait_args *p = params;
+ iarg[0] = p->pid; /* int */
+ uarg[1] = (intptr_t) p->status; /* int * */
+ iarg[2] = p->options; /* int */
+ uarg[3] = (intptr_t) p->rusage; /* struct rusage * */
+ *n_args = 4;
+ break;
+ }
+ /* link */
+ case 9: {
+ struct link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->link; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* unlink */
+ case 10: {
+ struct unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* chdir */
+ case 12: {
+ struct chdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* fchdir */
+ case 13: {
+ struct fchdir_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* mknod */
+ case 14: {
+ struct mknod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ iarg[2] = p->dev; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* chmod */
+ case 15: {
+ struct chmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* chown */
+ case 16: {
+ struct chown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* obreak */
+ case 17: {
+ struct obreak_args *p = params;
+ uarg[0] = (intptr_t) p->nsize; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* getpid */
+ case 20: {
+ *n_args = 0;
+ break;
+ }
+ /* mount */
+ case 21: {
+ struct mount_args *p = params;
+ uarg[0] = (intptr_t) p->type; /* char * */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flags; /* int */
+ uarg[3] = (intptr_t) p->data; /* caddr_t */
+ *n_args = 4;
+ break;
+ }
+ /* unmount */
+ case 22: {
+ struct unmount_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setuid */
+ case 23: {
+ struct setuid_args *p = params;
+ uarg[0] = p->uid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* getuid */
+ case 24: {
+ *n_args = 0;
+ break;
+ }
+ /* geteuid */
+ case 25: {
+ *n_args = 0;
+ break;
+ }
+ /* ptrace */
+ case 26: {
+ struct ptrace_args *p = params;
+ iarg[0] = p->req; /* int */
+ iarg[1] = p->pid; /* pid_t */
+ uarg[2] = (intptr_t) p->addr; /* caddr_t */
+ iarg[3] = p->data; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* recvmsg */
+ case 27: {
+ struct recvmsg_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* sendmsg */
+ case 28: {
+ struct sendmsg_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* recvfrom */
+ case 29: {
+ struct recvfrom_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->buf; /* caddr_t */
+ uarg[2] = p->len; /* size_t */
+ iarg[3] = p->flags; /* int */
+ uarg[4] = (intptr_t) p->from; /* struct sockaddr *__restrict */
+ uarg[5] = (intptr_t) p->fromlenaddr; /* __socklen_t *__restrict */
+ *n_args = 6;
+ break;
+ }
+ /* accept */
+ case 30: {
+ struct accept_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->anamelen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* getpeername */
+ case 31: {
+ struct getpeername_args *p = params;
+ iarg[0] = p->fdes; /* int */
+ uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* getsockname */
+ case 32: {
+ struct getsockname_args *p = params;
+ iarg[0] = p->fdes; /* int */
+ uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* access */
+ case 33: {
+ struct access_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* chflags */
+ case 34: {
+ struct chflags_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fchflags */
+ case 35: {
+ struct fchflags_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sync */
+ case 36: {
+ *n_args = 0;
+ break;
+ }
+ /* kill */
+ case 37: {
+ struct kill_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->signum; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* getppid */
+ case 39: {
+ *n_args = 0;
+ break;
+ }
+ /* dup */
+ case 41: {
+ struct dup_args *p = params;
+ uarg[0] = p->fd; /* u_int */
+ *n_args = 1;
+ break;
+ }
+ /* pipe */
+ case 42: {
+ *n_args = 0;
+ break;
+ }
+ /* getegid */
+ case 43: {
+ *n_args = 0;
+ break;
+ }
+ /* profil */
+ case 44: {
+ struct profil_args *p = params;
+ uarg[0] = (intptr_t) p->samples; /* caddr_t */
+ uarg[1] = p->size; /* size_t */
+ uarg[2] = p->offset; /* size_t */
+ uarg[3] = p->scale; /* u_int */
+ *n_args = 4;
+ break;
+ }
+ /* ktrace */
+ case 45: {
+ struct ktrace_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* const char * */
+ iarg[1] = p->ops; /* int */
+ iarg[2] = p->facs; /* int */
+ iarg[3] = p->pid; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* getgid */
+ case 47: {
+ *n_args = 0;
+ break;
+ }
+ /* getlogin */
+ case 49: {
+ struct getlogin_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ uarg[1] = p->namelen; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* setlogin */
+ case 50: {
+ struct setlogin_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* acct */
+ case 51: {
+ struct acct_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* sigaltstack */
+ case 53: {
+ struct sigaltstack_args *p = params;
+ uarg[0] = (intptr_t) p->ss; /* stack_t * */
+ uarg[1] = (intptr_t) p->oss; /* stack_t * */
+ *n_args = 2;
+ break;
+ }
+ /* ioctl */
+ case 54: {
+ struct ioctl_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->com; /* u_long */
+ uarg[2] = (intptr_t) p->data; /* caddr_t */
+ *n_args = 3;
+ break;
+ }
+ /* reboot */
+ case 55: {
+ struct reboot_args *p = params;
+ iarg[0] = p->opt; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* revoke */
+ case 56: {
+ struct revoke_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* symlink */
+ case 57: {
+ struct symlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->link; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* readlink */
+ case 58: {
+ struct readlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ iarg[2] = p->count; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* execve */
+ case 59: {
+ struct execve_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ *n_args = 3;
+ break;
+ }
+ /* umask */
+ case 60: {
+ struct umask_args *p = params;
+ iarg[0] = p->newmask; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* chroot */
+ case 61: {
+ struct chroot_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* msync */
+ case 65: {
+ struct msync_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* vfork */
+ case 66: {
+ *n_args = 0;
+ break;
+ }
+ /* sbrk */
+ case 69: {
+ struct sbrk_args *p = params;
+ iarg[0] = p->incr; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sstk */
+ case 70: {
+ struct sstk_args *p = params;
+ iarg[0] = p->incr; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ovadvise */
+ case 72: {
+ struct ovadvise_args *p = params;
+ iarg[0] = p->anom; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munmap */
+ case 73: {
+ struct munmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* mprotect */
+ case 74: {
+ struct mprotect_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* madvise */
+ case 75: {
+ struct madvise_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->behav; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* mincore */
+ case 78: {
+ struct mincore_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ uarg[2] = (intptr_t) p->vec; /* char * */
+ *n_args = 3;
+ break;
+ }
+ /* getgroups */
+ case 79: {
+ struct getgroups_args *p = params;
+ uarg[0] = p->gidsetsize; /* u_int */
+ uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* setgroups */
+ case 80: {
+ struct setgroups_args *p = params;
+ uarg[0] = p->gidsetsize; /* u_int */
+ uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* getpgrp */
+ case 81: {
+ *n_args = 0;
+ break;
+ }
+ /* setpgid */
+ case 82: {
+ struct setpgid_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->pgid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setitimer */
+ case 83: {
+ struct setitimer_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+ uarg[2] = (intptr_t) p->oitv; /* struct itimerval * */
+ *n_args = 3;
+ break;
+ }
+ /* swapon */
+ case 85: {
+ struct swapon_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* getitimer */
+ case 86: {
+ struct getitimer_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+ *n_args = 2;
+ break;
+ }
+ /* getdtablesize */
+ case 89: {
+ *n_args = 0;
+ break;
+ }
+ /* dup2 */
+ case 90: {
+ struct dup2_args *p = params;
+ uarg[0] = p->from; /* u_int */
+ uarg[1] = p->to; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* fcntl */
+ case 92: {
+ struct fcntl_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->arg; /* long */
+ *n_args = 3;
+ break;
+ }
+ /* select */
+ case 93: {
+ struct select_args *p = params;
+ iarg[0] = p->nd; /* int */
+ uarg[1] = (intptr_t) p->in; /* fd_set * */
+ uarg[2] = (intptr_t) p->ou; /* fd_set * */
+ uarg[3] = (intptr_t) p->ex; /* fd_set * */
+ uarg[4] = (intptr_t) p->tv; /* struct timeval * */
+ *n_args = 5;
+ break;
+ }
+ /* fsync */
+ case 95: {
+ struct fsync_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* setpriority */
+ case 96: {
+ struct setpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ iarg[2] = p->prio; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* socket */
+ case 97: {
+ struct socket_args *p = params;
+ iarg[0] = p->domain; /* int */
+ iarg[1] = p->type; /* int */
+ iarg[2] = p->protocol; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* connect */
+ case 98: {
+ struct connect_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* caddr_t */
+ iarg[2] = p->namelen; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* getpriority */
+ case 100: {
+ struct getpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* bind */
+ case 104: {
+ struct bind_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* caddr_t */
+ iarg[2] = p->namelen; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* setsockopt */
+ case 105: {
+ struct setsockopt_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->level; /* int */
+ iarg[2] = p->name; /* int */
+ uarg[3] = (intptr_t) p->val; /* caddr_t */
+ iarg[4] = p->valsize; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* listen */
+ case 106: {
+ struct listen_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->backlog; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* gettimeofday */
+ case 116: {
+ struct gettimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* getrusage */
+ case 117: {
+ struct getrusage_args *p = params;
+ iarg[0] = p->who; /* int */
+ uarg[1] = (intptr_t) p->rusage; /* struct rusage * */
+ *n_args = 2;
+ break;
+ }
+ /* getsockopt */
+ case 118: {
+ struct getsockopt_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->level; /* int */
+ iarg[2] = p->name; /* int */
+ uarg[3] = (intptr_t) p->val; /* caddr_t */
+ uarg[4] = (intptr_t) p->avalsize; /* int * */
+ *n_args = 5;
+ break;
+ }
+ /* readv */
+ case 120: {
+ struct readv_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* writev */
+ case 121: {
+ struct writev_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* settimeofday */
+ case 122: {
+ struct settimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tv; /* struct timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* fchown */
+ case 123: {
+ struct fchown_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* fchmod */
+ case 124: {
+ struct fchmod_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setreuid */
+ case 126: {
+ struct setreuid_args *p = params;
+ iarg[0] = p->ruid; /* int */
+ iarg[1] = p->euid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setregid */
+ case 127: {
+ struct setregid_args *p = params;
+ iarg[0] = p->rgid; /* int */
+ iarg[1] = p->egid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* rename */
+ case 128: {
+ struct rename_args *p = params;
+ uarg[0] = (intptr_t) p->from; /* char * */
+ uarg[1] = (intptr_t) p->to; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* flock */
+ case 131: {
+ struct flock_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* mkfifo */
+ case 132: {
+ struct mkfifo_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sendto */
+ case 133: {
+ struct sendto_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->buf; /* caddr_t */
+ uarg[2] = p->len; /* size_t */
+ iarg[3] = p->flags; /* int */
+ uarg[4] = (intptr_t) p->to; /* caddr_t */
+ iarg[5] = p->tolen; /* int */
+ *n_args = 6;
+ break;
+ }
+ /* shutdown */
+ case 134: {
+ struct shutdown_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* socketpair */
+ case 135: {
+ struct socketpair_args *p = params;
+ iarg[0] = p->domain; /* int */
+ iarg[1] = p->type; /* int */
+ iarg[2] = p->protocol; /* int */
+ uarg[3] = (intptr_t) p->rsv; /* int * */
+ *n_args = 4;
+ break;
+ }
+ /* mkdir */
+ case 136: {
+ struct mkdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* rmdir */
+ case 137: {
+ struct rmdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* utimes */
+ case 138: {
+ struct utimes_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* adjtime */
+ case 140: {
+ struct adjtime_args *p = params;
+ uarg[0] = (intptr_t) p->delta; /* struct timeval * */
+ uarg[1] = (intptr_t) p->olddelta; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* setsid */
+ case 147: {
+ *n_args = 0;
+ break;
+ }
+ /* quotactl */
+ case 148: {
+ struct quotactl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->uid; /* int */
+ uarg[3] = (intptr_t) p->arg; /* caddr_t */
+ *n_args = 4;
+ break;
+ }
+ /* nfssvc */
+ case 155: {
+ struct nfssvc_args *p = params;
+ iarg[0] = p->flag; /* int */
+ uarg[1] = (intptr_t) p->argp; /* caddr_t */
+ *n_args = 2;
+ break;
+ }
+ /* lgetfh */
+ case 160: {
+ struct lgetfh_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+ *n_args = 2;
+ break;
+ }
+ /* getfh */
+ case 161: {
+ struct getfh_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+ *n_args = 2;
+ break;
+ }
+ /* getdomainname */
+ case 162: {
+ struct getdomainname_args *p = params;
+ uarg[0] = (intptr_t) p->domainname; /* char * */
+ iarg[1] = p->len; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setdomainname */
+ case 163: {
+ struct setdomainname_args *p = params;
+ uarg[0] = (intptr_t) p->domainname; /* char * */
+ iarg[1] = p->len; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* uname */
+ case 164: {
+ struct uname_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* struct utsname * */
+ *n_args = 1;
+ break;
+ }
+ /* sysarch */
+ case 165: {
+ struct sysarch_args *p = params;
+ iarg[0] = p->op; /* int */
+ uarg[1] = (intptr_t) p->parms; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* rtprio */
+ case 166: {
+ struct rtprio_args *p = params;
+ iarg[0] = p->function; /* int */
+ iarg[1] = p->pid; /* pid_t */
+ uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+ *n_args = 3;
+ break;
+ }
+ /* semsys */
+ case 169: {
+ struct semsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ iarg[4] = p->a5; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* msgsys */
+ case 170: {
+ struct msgsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ iarg[4] = p->a5; /* int */
+ iarg[5] = p->a6; /* int */
+ *n_args = 6;
+ break;
+ }
+ /* shmsys */
+ case 171: {
+ struct shmsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_pread */
+ case 173: {
+ struct freebsd6_pread_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->pad; /* int */
+ iarg[4] = p->offset; /* off_t */
+ *n_args = 5;
+ break;
+ }
+ /* freebsd6_pwrite */
+ case 174: {
+ struct freebsd6_pwrite_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->pad; /* int */
+ iarg[4] = p->offset; /* off_t */
+ *n_args = 5;
+ break;
+ }
+ /* ntp_adjtime */
+ case 176: {
+ struct ntp_adjtime_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct timex * */
+ *n_args = 1;
+ break;
+ }
+ /* setgid */
+ case 181: {
+ struct setgid_args *p = params;
+ iarg[0] = p->gid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setegid */
+ case 182: {
+ struct setegid_args *p = params;
+ iarg[0] = p->egid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* seteuid */
+ case 183: {
+ struct seteuid_args *p = params;
+ uarg[0] = p->euid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* stat */
+ case 188: {
+ struct stat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* fstat */
+ case 189: {
+ struct fstat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->sb; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* lstat */
+ case 190: {
+ struct lstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* pathconf */
+ case 191: {
+ struct pathconf_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fpathconf */
+ case 192: {
+ struct fpathconf_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* getrlimit */
+ case 194: {
+ struct __getrlimit_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* setrlimit */
+ case 195: {
+ struct __setrlimit_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* getdirentries */
+ case 196: {
+ struct getdirentries_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* u_int */
+ uarg[3] = (intptr_t) p->basep; /* long * */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_mmap */
+ case 197: {
+ struct freebsd6_mmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ iarg[3] = p->flags; /* int */
+ iarg[4] = p->fd; /* int */
+ iarg[5] = p->pad; /* int */
+ iarg[6] = p->pos; /* off_t */
+ *n_args = 7;
+ break;
+ }
+ /* nosys */
+ case 198: {
+ *n_args = 0;
+ break;
+ }
+ /* freebsd6_lseek */
+ case 199: {
+ struct freebsd6_lseek_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->offset; /* off_t */
+ iarg[3] = p->whence; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_truncate */
+ case 200: {
+ struct freebsd6_truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->length; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* freebsd6_ftruncate */
+ case 201: {
+ struct freebsd6_ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->length; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* __sysctl */
+ case 202: {
+ struct sysctl_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* int * */
+ uarg[1] = p->namelen; /* u_int */
+ uarg[2] = (intptr_t) p->old; /* void * */
+ uarg[3] = (intptr_t) p->oldlenp; /* size_t * */
+ uarg[4] = (intptr_t) p->new; /* void * */
+ uarg[5] = p->newlen; /* size_t */
+ *n_args = 6;
+ break;
+ }
+ /* mlock */
+ case 203: {
+ struct mlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* munlock */
+ case 204: {
+ struct munlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* undelete */
+ case 205: {
+ struct undelete_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* futimes */
+ case 206: {
+ struct futimes_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* getpgid */
+ case 207: {
+ struct getpgid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* poll */
+ case 209: {
+ struct poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* u_int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* lkmnosys */
+ case 210: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 211: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 212: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 213: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 214: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 215: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 216: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 217: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 218: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 219: {
+ *n_args = 0;
+ break;
+ }
+ /* __semctl */
+ case 220: {
+ struct __semctl_args *p = params;
+ iarg[0] = p->semid; /* int */
+ iarg[1] = p->semnum; /* int */
+ iarg[2] = p->cmd; /* int */
+ uarg[3] = (intptr_t) p->arg; /* union semun * */
+ *n_args = 4;
+ break;
+ }
+ /* semget */
+ case 221: {
+ struct semget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ iarg[1] = p->nsems; /* int */
+ iarg[2] = p->semflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* semop */
+ case 222: {
+ struct semop_args *p = params;
+ iarg[0] = p->semid; /* int */
+ uarg[1] = (intptr_t) p->sops; /* struct sembuf * */
+ uarg[2] = p->nsops; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* msgctl */
+ case 224: {
+ struct msgctl_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->buf; /* struct msqid_ds * */
+ *n_args = 3;
+ break;
+ }
+ /* msgget */
+ case 225: {
+ struct msgget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ iarg[1] = p->msgflg; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* msgsnd */
+ case 226: {
+ struct msgsnd_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ uarg[1] = (intptr_t) p->msgp; /* const void * */
+ uarg[2] = p->msgsz; /* size_t */
+ iarg[3] = p->msgflg; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* msgrcv */
+ case 227: {
+ struct msgrcv_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ uarg[1] = (intptr_t) p->msgp; /* void * */
+ uarg[2] = p->msgsz; /* size_t */
+ iarg[3] = p->msgtyp; /* long */
+ iarg[4] = p->msgflg; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* shmat */
+ case 228: {
+ struct shmat_args *p = params;
+ iarg[0] = p->shmid; /* int */
+ uarg[1] = (intptr_t) p->shmaddr; /* const void * */
+ iarg[2] = p->shmflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* shmctl */
+ case 229: {
+ struct shmctl_args *p = params;
+ iarg[0] = p->shmid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->buf; /* struct shmid_ds * */
+ *n_args = 3;
+ break;
+ }
+ /* shmdt */
+ case 230: {
+ struct shmdt_args *p = params;
+ uarg[0] = (intptr_t) p->shmaddr; /* const void * */
+ *n_args = 1;
+ break;
+ }
+ /* shmget */
+ case 231: {
+ struct shmget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ uarg[1] = p->size; /* size_t */
+ iarg[2] = p->shmflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* clock_gettime */
+ case 232: {
+ struct clock_gettime_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* clock_settime */
+ case 233: {
+ struct clock_settime_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* const struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* clock_getres */
+ case 234: {
+ struct clock_getres_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* ktimer_create */
+ case 235: {
+ struct ktimer_create_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->evp; /* struct sigevent * */
+ uarg[2] = (intptr_t) p->timerid; /* int * */
+ *n_args = 3;
+ break;
+ }
+ /* ktimer_delete */
+ case 236: {
+ struct ktimer_delete_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ktimer_settime */
+ case 237: {
+ struct ktimer_settime_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ iarg[1] = p->flags; /* int */
+ uarg[2] = (intptr_t) p->value; /* const struct itimerspec * */
+ uarg[3] = (intptr_t) p->ovalue; /* struct itimerspec * */
+ *n_args = 4;
+ break;
+ }
+ /* ktimer_gettime */
+ case 238: {
+ struct ktimer_gettime_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ uarg[1] = (intptr_t) p->value; /* struct itimerspec * */
+ *n_args = 2;
+ break;
+ }
+ /* ktimer_getoverrun */
+ case 239: {
+ struct ktimer_getoverrun_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* nanosleep */
+ case 240: {
+ struct nanosleep_args *p = params;
+ uarg[0] = (intptr_t) p->rqtp; /* const struct timespec * */
+ uarg[1] = (intptr_t) p->rmtp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* ntp_gettime */
+ case 248: {
+ struct ntp_gettime_args *p = params;
+ uarg[0] = (intptr_t) p->ntvp; /* struct ntptimeval * */
+ *n_args = 1;
+ break;
+ }
+ /* minherit */
+ case 250: {
+ struct minherit_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->inherit; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* rfork */
+ case 251: {
+ struct rfork_args *p = params;
+ iarg[0] = p->flags; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* openbsd_poll */
+ case 252: {
+ struct openbsd_poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* u_int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* issetugid */
+ case 253: {
+ *n_args = 0;
+ break;
+ }
+ /* lchown */
+ case 254: {
+ struct lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* aio_read */
+ case 255: {
+ struct aio_read_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* aio_write */
+ case 256: {
+ struct aio_write_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* lio_listio */
+ case 257: {
+ struct lio_listio_args *p = params;
+ iarg[0] = p->mode; /* int */
+ uarg[1] = (intptr_t) p->acb_list; /* struct aiocb *const * */
+ iarg[2] = p->nent; /* int */
+ uarg[3] = (intptr_t) p->sig; /* struct sigevent * */
+ *n_args = 4;
+ break;
+ }
+ /* getdents */
+ case 272: {
+ struct getdents_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* lchmod */
+ case 274: {
+ struct lchmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* mode_t */
+ *n_args = 2;
+ break;
+ }
+ /* lchown */
+ case 275: {
+ struct lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = p->uid; /* uid_t */
+ iarg[2] = p->gid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* lutimes */
+ case 276: {
+ struct lutimes_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* msync */
+ case 277: {
+ struct msync_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* nstat */
+ case 278: {
+ struct nstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* nfstat */
+ case 279: {
+ struct nfstat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->sb; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* nlstat */
+ case 280: {
+ struct nlstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* preadv */
+ case 289: {
+ struct preadv_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* pwritev */
+ case 290: {
+ struct pwritev_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* fhopen */
+ case 298: {
+ struct fhopen_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fhstat */
+ case 299: {
+ struct fhstat_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ uarg[1] = (intptr_t) p->sb; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* modnext */
+ case 300: {
+ struct modnext_args *p = params;
+ iarg[0] = p->modid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* modstat */
+ case 301: {
+ struct modstat_args *p = params;
+ iarg[0] = p->modid; /* int */
+ uarg[1] = (intptr_t) p->stat; /* struct module_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* modfnext */
+ case 302: {
+ struct modfnext_args *p = params;
+ iarg[0] = p->modid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* modfind */
+ case 303: {
+ struct modfind_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldload */
+ case 304: {
+ struct kldload_args *p = params;
+ uarg[0] = (intptr_t) p->file; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldunload */
+ case 305: {
+ struct kldunload_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kldfind */
+ case 306: {
+ struct kldfind_args *p = params;
+ uarg[0] = (intptr_t) p->file; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldnext */
+ case 307: {
+ struct kldnext_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kldstat */
+ case 308: {
+ struct kldstat_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ uarg[1] = (intptr_t) p->stat; /* struct kld_file_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* kldfirstmod */
+ case 309: {
+ struct kldfirstmod_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* getsid */
+ case 310: {
+ struct getsid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setresuid */
+ case 311: {
+ struct setresuid_args *p = params;
+ uarg[0] = p->ruid; /* uid_t */
+ uarg[1] = p->euid; /* uid_t */
+ uarg[2] = p->suid; /* uid_t */
+ *n_args = 3;
+ break;
+ }
+ /* setresgid */
+ case 312: {
+ struct setresgid_args *p = params;
+ iarg[0] = p->rgid; /* gid_t */
+ iarg[1] = p->egid; /* gid_t */
+ iarg[2] = p->sgid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* aio_return */
+ case 314: {
+ struct aio_return_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* aio_suspend */
+ case 315: {
+ struct aio_suspend_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb *const * */
+ iarg[1] = p->nent; /* int */
+ uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 3;
+ break;
+ }
+ /* aio_cancel */
+ case 316: {
+ struct aio_cancel_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 2;
+ break;
+ }
+ /* aio_error */
+ case 317: {
+ struct aio_error_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* oaio_read */
+ case 318: {
+ struct oaio_read_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* oaio_write */
+ case 319: {
+ struct oaio_write_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* olio_listio */
+ case 320: {
+ struct olio_listio_args *p = params;
+ iarg[0] = p->mode; /* int */
+ uarg[1] = (intptr_t) p->acb_list; /* struct oaiocb *const * */
+ iarg[2] = p->nent; /* int */
+ uarg[3] = (intptr_t) p->sig; /* struct osigevent * */
+ *n_args = 4;
+ break;
+ }
+ /* yield */
+ case 321: {
+ *n_args = 0;
+ break;
+ }
+ /* mlockall */
+ case 324: {
+ struct mlockall_args *p = params;
+ iarg[0] = p->how; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munlockall */
+ case 325: {
+ *n_args = 0;
+ break;
+ }
+ /* __getcwd */
+ case 326: {
+ struct __getcwd_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* u_char * */
+ uarg[1] = p->buflen; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* sched_setparam */
+ case 327: {
+ struct sched_setparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_getparam */
+ case 328: {
+ struct sched_getparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_setscheduler */
+ case 329: {
+ struct sched_setscheduler_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->policy; /* int */
+ uarg[2] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 3;
+ break;
+ }
+ /* sched_getscheduler */
+ case 330: {
+ struct sched_getscheduler_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* sched_yield */
+ case 331: {
+ *n_args = 0;
+ break;
+ }
+ /* sched_get_priority_max */
+ case 332: {
+ struct sched_get_priority_max_args *p = params;
+ iarg[0] = p->policy; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sched_get_priority_min */
+ case 333: {
+ struct sched_get_priority_min_args *p = params;
+ iarg[0] = p->policy; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sched_rr_get_interval */
+ case 334: {
+ struct sched_rr_get_interval_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->interval; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* utrace */
+ case 335: {
+ struct utrace_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* kldsym */
+ case 337: {
+ struct kldsym_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* jail */
+ case 338: {
+ struct jail_args *p = params;
+ uarg[0] = (intptr_t) p->jail; /* struct jail * */
+ *n_args = 1;
+ break;
+ }
+ /* sigprocmask */
+ case 340: {
+ struct sigprocmask_args *p = params;
+ iarg[0] = p->how; /* int */
+ uarg[1] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[2] = (intptr_t) p->oset; /* sigset_t * */
+ *n_args = 3;
+ break;
+ }
+ /* sigsuspend */
+ case 341: {
+ struct sigsuspend_args *p = params;
+ uarg[0] = (intptr_t) p->sigmask; /* const sigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* sigpending */
+ case 343: {
+ struct sigpending_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* sigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* sigtimedwait */
+ case 345: {
+ struct sigtimedwait_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+ uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 3;
+ break;
+ }
+ /* sigwaitinfo */
+ case 346: {
+ struct sigwaitinfo_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_get_file */
+ case 347: {
+ struct __acl_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_file */
+ case 348: {
+ struct __acl_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_get_fd */
+ case 349: {
+ struct __acl_get_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_fd */
+ case 350: {
+ struct __acl_set_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_delete_file */
+ case 351: {
+ struct __acl_delete_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_delete_fd */
+ case 352: {
+ struct __acl_delete_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_aclcheck_file */
+ case 353: {
+ struct __acl_aclcheck_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_aclcheck_fd */
+ case 354: {
+ struct __acl_aclcheck_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* extattrctl */
+ case 355: {
+ struct extattrctl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->filename; /* const char * */
+ iarg[3] = p->attrnamespace; /* int */
+ uarg[4] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_set_file */
+ case 356: {
+ struct extattr_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_file */
+ case 357: {
+ struct extattr_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_file */
+ case 358: {
+ struct extattr_delete_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* aio_waitcomplete */
+ case 359: {
+ struct aio_waitcomplete_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb ** */
+ uarg[1] = (intptr_t) p->timeout; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* getresuid */
+ case 360: {
+ struct getresuid_args *p = params;
+ uarg[0] = (intptr_t) p->ruid; /* uid_t * */
+ uarg[1] = (intptr_t) p->euid; /* uid_t * */
+ uarg[2] = (intptr_t) p->suid; /* uid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* getresgid */
+ case 361: {
+ struct getresgid_args *p = params;
+ uarg[0] = (intptr_t) p->rgid; /* gid_t * */
+ uarg[1] = (intptr_t) p->egid; /* gid_t * */
+ uarg[2] = (intptr_t) p->sgid; /* gid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* kqueue */
+ case 362: {
+ *n_args = 0;
+ break;
+ }
+ /* kevent */
+ case 363: {
+ struct kevent_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->changelist; /* struct kevent * */
+ iarg[2] = p->nchanges; /* int */
+ uarg[3] = (intptr_t) p->eventlist; /* struct kevent * */
+ iarg[4] = p->nevents; /* int */
+ uarg[5] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 6;
+ break;
+ }
+ /* lkmressys */
+ case 370: {
+ *n_args = 0;
+ break;
+ }
+ /* extattr_set_fd */
+ case 371: {
+ struct extattr_set_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_fd */
+ case 372: {
+ struct extattr_get_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_fd */
+ case 373: {
+ struct extattr_delete_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* __setugid */
+ case 374: {
+ struct __setugid_args *p = params;
+ iarg[0] = p->flag; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* nfsclnt */
+ case 375: {
+ struct nfsclnt_args *p = params;
+ iarg[0] = p->flag; /* int */
+ uarg[1] = (intptr_t) p->argp; /* caddr_t */
+ *n_args = 2;
+ break;
+ }
+ /* eaccess */
+ case 376: {
+ struct eaccess_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* nmount */
+ case 378: {
+ struct nmount_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* kse_exit */
+ case 379: {
+ *n_args = 0;
+ break;
+ }
+ /* kse_wakeup */
+ case 380: {
+ struct kse_wakeup_args *p = params;
+ uarg[0] = (intptr_t) p->mbx; /* struct kse_mailbox * */
+ *n_args = 1;
+ break;
+ }
+ /* kse_create */
+ case 381: {
+ struct kse_create_args *p = params;
+ uarg[0] = (intptr_t) p->mbx; /* struct kse_mailbox * */
+ iarg[1] = p->newgroup; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* kse_thr_interrupt */
+ case 382: {
+ struct kse_thr_interrupt_args *p = params;
+ uarg[0] = (intptr_t) p->tmbx; /* struct kse_thr_mailbox * */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->data; /* long */
+ *n_args = 3;
+ break;
+ }
+ /* kse_release */
+ case 383: {
+ struct kse_release_args *p = params;
+ uarg[0] = (intptr_t) p->timeout; /* struct timespec * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_proc */
+ case 384: {
+ struct __mac_get_proc_args *p = params;
+ uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_set_proc */
+ case 385: {
+ struct __mac_set_proc_args *p = params;
+ uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_fd */
+ case 386: {
+ struct __mac_get_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_get_file */
+ case 387: {
+ struct __mac_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_fd */
+ case 388: {
+ struct __mac_set_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_file */
+ case 389: {
+ struct __mac_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* kenv */
+ case 390: {
+ struct kenv_args *p = params;
+ iarg[0] = p->what; /* int */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ uarg[2] = (intptr_t) p->value; /* char * */
+ iarg[3] = p->len; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* lchflags */
+ case 391: {
+ struct lchflags_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* uuidgen */
+ case 392: {
+ struct uuidgen_args *p = params;
+ uarg[0] = (intptr_t) p->store; /* struct uuid * */
+ iarg[1] = p->count; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sendfile */
+ case 393: {
+ struct sendfile_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ iarg[2] = p->offset; /* off_t */
+ uarg[3] = p->nbytes; /* size_t */
+ uarg[4] = (intptr_t) p->hdtr; /* struct sf_hdtr * */
+ uarg[5] = (intptr_t) p->sbytes; /* off_t * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* mac_syscall */
+ case 394: {
+ struct mac_syscall_args *p = params;
+ uarg[0] = (intptr_t) p->policy; /* const char * */
+ iarg[1] = p->call; /* int */
+ uarg[2] = (intptr_t) p->arg; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* getfsstat */
+ case 395: {
+ struct getfsstat_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* struct statfs * */
+ iarg[1] = p->bufsize; /* long */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* statfs */
+ case 396: {
+ struct statfs_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* fstatfs */
+ case 397: {
+ struct fstatfs_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* fhstatfs */
+ case 398: {
+ struct fhstatfs_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_close */
+ case 400: {
+ struct ksem_close_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_post */
+ case 401: {
+ struct ksem_post_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_wait */
+ case 402: {
+ struct ksem_wait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_trywait */
+ case 403: {
+ struct ksem_trywait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_init */
+ case 404: {
+ struct ksem_init_args *p = params;
+ uarg[0] = (intptr_t) p->idp; /* semid_t * */
+ uarg[1] = p->value; /* unsigned int */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_open */
+ case 405: {
+ struct ksem_open_args *p = params;
+ uarg[0] = (intptr_t) p->idp; /* semid_t * */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ iarg[2] = p->oflag; /* int */
+ iarg[3] = p->mode; /* mode_t */
+ uarg[4] = p->value; /* unsigned int */
+ *n_args = 5;
+ break;
+ }
+ /* ksem_unlink */
+ case 406: {
+ struct ksem_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_getvalue */
+ case 407: {
+ struct ksem_getvalue_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ uarg[1] = (intptr_t) p->val; /* int * */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_destroy */
+ case 408: {
+ struct ksem_destroy_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_pid */
+ case 409: {
+ struct __mac_get_pid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_get_link */
+ case 410: {
+ struct __mac_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_link */
+ case 411: {
+ struct __mac_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* extattr_set_link */
+ case 412: {
+ struct extattr_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_link */
+ case 413: {
+ struct extattr_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_link */
+ case 414: {
+ struct extattr_delete_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* __mac_execve */
+ case 415: {
+ struct __mac_execve_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ uarg[3] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 4;
+ break;
+ }
+ /* sigaction */
+ case 416: {
+ struct sigaction_args *p = params;
+ iarg[0] = p->sig; /* int */
+ uarg[1] = (intptr_t) p->act; /* const struct sigaction * */
+ uarg[2] = (intptr_t) p->oact; /* struct sigaction * */
+ *n_args = 3;
+ break;
+ }
+ /* sigreturn */
+ case 417: {
+ struct sigreturn_args *p = params;
+ uarg[0] = (intptr_t) p->sigcntxp; /* const struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* getcontext */
+ case 421: {
+ struct getcontext_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* setcontext */
+ case 422: {
+ struct setcontext_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* const struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* swapcontext */
+ case 423: {
+ struct swapcontext_args *p = params;
+ uarg[0] = (intptr_t) p->oucp; /* struct __ucontext * */
+ uarg[1] = (intptr_t) p->ucp; /* const struct __ucontext * */
+ *n_args = 2;
+ break;
+ }
+ /* swapoff */
+ case 424: {
+ struct swapoff_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* __acl_get_link */
+ case 425: {
+ struct __acl_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_link */
+ case 426: {
+ struct __acl_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_delete_link */
+ case 427: {
+ struct __acl_delete_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_aclcheck_link */
+ case 428: {
+ struct __acl_aclcheck_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* sigwait */
+ case 429: {
+ struct sigwait_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->sig; /* int * */
+ *n_args = 2;
+ break;
+ }
+ /* thr_create */
+ case 430: {
+ struct thr_create_args *p = params;
+ uarg[0] = (intptr_t) p->ctx; /* ucontext_t * */
+ uarg[1] = (intptr_t) p->id; /* long * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* thr_exit */
+ case 431: {
+ struct thr_exit_args *p = params;
+ uarg[0] = (intptr_t) p->state; /* long * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_self */
+ case 432: {
+ struct thr_self_args *p = params;
+ uarg[0] = (intptr_t) p->id; /* long * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_kill */
+ case 433: {
+ struct thr_kill_args *p = params;
+ iarg[0] = p->id; /* long */
+ iarg[1] = p->sig; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* _umtx_lock */
+ case 434: {
+ struct _umtx_lock_args *p = params;
+ uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+ *n_args = 1;
+ break;
+ }
+ /* _umtx_unlock */
+ case 435: {
+ struct _umtx_unlock_args *p = params;
+ uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+ *n_args = 1;
+ break;
+ }
+ /* jail_attach */
+ case 436: {
+ struct jail_attach_args *p = params;
+ iarg[0] = p->jid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* extattr_list_fd */
+ case 437: {
+ struct extattr_list_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* extattr_list_file */
+ case 438: {
+ struct extattr_list_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* extattr_list_link */
+ case 439: {
+ struct extattr_list_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* kse_switchin */
+ case 440: {
+ struct kse_switchin_args *p = params;
+ uarg[0] = (intptr_t) p->tmbx; /* struct kse_thr_mailbox * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_timedwait */
+ case 441: {
+ struct ksem_timedwait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ uarg[1] = (intptr_t) p->abstime; /* const struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* thr_suspend */
+ case 442: {
+ struct thr_suspend_args *p = params;
+ uarg[0] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_wake */
+ case 443: {
+ struct thr_wake_args *p = params;
+ iarg[0] = p->id; /* long */
+ *n_args = 1;
+ break;
+ }
+ /* kldunloadf */
+ case 444: {
+ struct kldunloadf_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* audit */
+ case 445: {
+ struct audit_args *p = params;
+ uarg[0] = (intptr_t) p->record; /* const void * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* auditon */
+ case 446: {
+ struct auditon_args *p = params;
+ iarg[0] = p->cmd; /* int */
+ uarg[1] = (intptr_t) p->data; /* void * */
+ uarg[2] = p->length; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* getauid */
+ case 447: {
+ struct getauid_args *p = params;
+ uarg[0] = (intptr_t) p->auid; /* uid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* setauid */
+ case 448: {
+ struct setauid_args *p = params;
+ uarg[0] = (intptr_t) p->auid; /* uid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* getaudit */
+ case 449: {
+ struct getaudit_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* setaudit */
+ case 450: {
+ struct setaudit_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* getaudit_addr */
+ case 451: {
+ struct getaudit_addr_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* setaudit_addr */
+ case 452: {
+ struct setaudit_addr_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* auditctl */
+ case 453: {
+ struct auditctl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* _umtx_op */
+ case 454: {
+ struct _umtx_op_args *p = params;
+ uarg[0] = (intptr_t) p->obj; /* void * */
+ iarg[1] = p->op; /* int */
+ uarg[2] = p->val; /* u_long */
+ uarg[3] = (intptr_t) p->uaddr1; /* void * */
+ uarg[4] = (intptr_t) p->uaddr2; /* void * */
+ *n_args = 5;
+ break;
+ }
+ /* thr_new */
+ case 455: {
+ struct thr_new_args *p = params;
+ uarg[0] = (intptr_t) p->param; /* struct thr_param * */
+ iarg[1] = p->param_size; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sigqueue */
+ case 456: {
+ struct sigqueue_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->signum; /* int */
+ uarg[2] = (intptr_t) p->value; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* kmq_open */
+ case 457: {
+ struct kmq_open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* mode_t */
+ uarg[3] = (intptr_t) p->attr; /* const struct mq_attr * */
+ *n_args = 4;
+ break;
+ }
+ /* kmq_setattr */
+ case 458: {
+ struct kmq_setattr_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->attr; /* const struct mq_attr * */
+ uarg[2] = (intptr_t) p->oattr; /* struct mq_attr * */
+ *n_args = 3;
+ break;
+ }
+ /* kmq_timedreceive */
+ case 459: {
+ struct kmq_timedreceive_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->msg_ptr; /* char * */
+ uarg[2] = p->msg_len; /* size_t */
+ uarg[3] = (intptr_t) p->msg_prio; /* unsigned * */
+ uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+ *n_args = 5;
+ break;
+ }
+ /* kmq_timedsend */
+ case 460: {
+ struct kmq_timedsend_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->msg_ptr; /* const char * */
+ uarg[2] = p->msg_len; /* size_t */
+ uarg[3] = p->msg_prio; /* unsigned */
+ uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+ *n_args = 5;
+ break;
+ }
+ /* kmq_notify */
+ case 461: {
+ struct kmq_notify_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->sigev; /* const struct sigevent * */
+ *n_args = 2;
+ break;
+ }
+ /* kmq_unlink */
+ case 462: {
+ struct kmq_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* abort2 */
+ case 463: {
+ struct abort2_args *p = params;
+ uarg[0] = (intptr_t) p->why; /* const char * */
+ iarg[1] = p->nargs; /* int */
+ uarg[2] = (intptr_t) p->args; /* void ** */
+ *n_args = 3;
+ break;
+ }
+ /* thr_set_name */
+ case 464: {
+ struct thr_set_name_args *p = params;
+ iarg[0] = p->id; /* long */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ *n_args = 2;
+ break;
+ }
+ /* aio_fsync */
+ case 465: {
+ struct aio_fsync_args *p = params;
+ iarg[0] = p->op; /* int */
+ uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 2;
+ break;
+ }
+ /* rtprio_thread */
+ case 466: {
+ struct rtprio_thread_args *p = params;
+ iarg[0] = p->function; /* int */
+ iarg[1] = p->lwpid; /* lwpid_t */
+ uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+ *n_args = 3;
+ break;
+ }
+ /* sctp_peeloff */
+ case 471: {
+ struct sctp_peeloff_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = p->name; /* uint32_t */
+ *n_args = 2;
+ break;
+ }
+ /* sctp_generic_sendmsg */
+ case 472: {
+ struct sctp_generic_sendmsg_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->msg; /* caddr_t */
+ iarg[2] = p->mlen; /* int */
+ uarg[3] = (intptr_t) p->to; /* caddr_t */
+ iarg[4] = p->tolen; /* __socklen_t */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* sctp_generic_sendmsg_iov */
+ case 473: {
+ struct sctp_generic_sendmsg_iov_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+ iarg[2] = p->iovlen; /* int */
+ uarg[3] = (intptr_t) p->to; /* caddr_t */
+ iarg[4] = p->tolen; /* __socklen_t */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* sctp_generic_recvmsg */
+ case 474: {
+ struct sctp_generic_recvmsg_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+ iarg[2] = p->iovlen; /* int */
+ uarg[3] = (intptr_t) p->from; /* struct sockaddr * */
+ uarg[4] = (intptr_t) p->fromlenaddr; /* __socklen_t * */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ uarg[6] = (intptr_t) p->msg_flags; /* int * */
+ *n_args = 7;
+ break;
+ }
+ /* pread */
+ case 475: {
+ struct pread_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* pwrite */
+ case 476: {
+ struct pwrite_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* mmap */
+ case 477: {
+ struct mmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ iarg[3] = p->flags; /* int */
+ iarg[4] = p->fd; /* int */
+ iarg[5] = p->pos; /* off_t */
+ *n_args = 6;
+ break;
+ }
+ /* lseek */
+ case 478: {
+ struct lseek_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->whence; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* truncate */
+ case 479: {
+ struct truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->length; /* off_t */
+ *n_args = 2;
+ break;
+ }
+ /* ftruncate */
+ case 480: {
+ struct ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->length; /* off_t */
+ *n_args = 2;
+ break;
+ }
+ /* thr_kill2 */
+ case 481: {
+ struct thr_kill2_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->id; /* long */
+ iarg[2] = p->sig; /* int */
+ *n_args = 3;
+ break;
+ }
+ default:
+ *n_args = 0;
+ break;
+ };
+}
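
The switch that closes above follows one mechanical pattern per syscall: cast the opaque params pointer to that syscall's argument structure, copy signed fields into iarg[], copy unsigned and pointer fields into uarg[] (pointers are squeezed through intptr_t), and report the argument count in *n_args. A minimal userspace sketch of the same pattern, modelled on the pread case (475) seen above, follows; the demo_ names and the standalone harness are illustrative assumptions and are not part of the kernel source.

/*
 * Standalone sketch of the argument-unpacking pattern used above: one
 * argument struct per syscall, signed fields into iarg[], unsigned and
 * pointer fields into uarg[] via intptr_t.  The demo_ names are invented
 * for illustration only.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_pread_args {
        int      fd;
        void    *buf;
        size_t   nbyte;
        int64_t  offset;        /* stands in for off_t */
};

static void
demo_decode_args(int sysnum, void *params, int64_t *iarg, uint64_t *uarg,
    int *n_args)
{
        switch (sysnum) {
        /* pread-like call: a mix of int, pointer, size_t and off_t. */
        case 475: {
                struct demo_pread_args *p = params;
                iarg[0] = p->fd;                /* int */
                uarg[1] = (intptr_t)p->buf;     /* void * */
                uarg[2] = p->nbyte;             /* size_t */
                iarg[3] = p->offset;            /* off_t */
                *n_args = 4;
                break;
        }
        default:
                *n_args = 0;
                break;
        }
}

int
main(void)
{
        char buf[16];
        struct demo_pread_args args = { 3, buf, sizeof(buf), 1024 };
        int64_t iarg[8] = { 0 };
        uint64_t uarg[8] = { 0 };
        int n;

        demo_decode_args(475, &args, iarg, uarg, &n);
        printf("decoded %d args: fd=%lld nbyte=%llu offset=%lld\n", n,
            (long long)iarg[0], (unsigned long long)uarg[2],
            (long long)iarg[3]);
        return (0);
}

Compiled and run, the sketch prints the decoded slots, which is essentially the flattened view a tracing consumer gets from the in-kernel decoder.
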
--- /dev/null
+++ sys/kern/subr_acl_posix1e.c
@@ -0,0 +1,637 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL support routines specific to POSIX.1e access control lists. These are
+ * utility routines for code common across file systems implementing POSIX.1e
+ * ACLs.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/subr_acl_posix1e.c,v 1.52 2007/06/12 00:11:59 rwatson Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics;
+ * the access ACL has already been prepared for evaluation by the file system
+ * and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an
+ * errno value.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
+{
+ struct acl_entry *acl_other, *acl_mask;
+ mode_t dac_granted;
+ mode_t priv_granted;
+ mode_t acl_mask_granted;
+ int group_matched, i;
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that. Otherwise, attempt to
+ * use privileges granted via priv_granted. In some cases, which
+ * privileges to use may be ambiguous due to "best match", in which
+ * case fall back on first match for the time being.
+ */
+ if (privused != NULL)
+ *privused = 0;
+
+ /*
+ * Determine privileges now, but don't apply until we've found a DAC
+ * entry that matches but has failed to allow access.
+ *
+ * XXXRW: Ideally, we'd determine the privileges required before
+ * asking for them.
+ */
+ priv_granted = 0;
+
+ if (type == VDIR) {
+ if ((acc_mode & VEXEC) && !priv_check_cred(cred,
+ PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ if ((acc_mode & VEXEC) && !priv_check_cred(cred,
+ PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((acc_mode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if (((acc_mode & VWRITE) || (acc_mode & VAPPEND)) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
+
+ if ((acc_mode & VADMIN) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
+
+ /*
+ * The owner matches if the effective uid associated with the
+ * credential matches that of the ACL_USER_OBJ entry. While we're
+ * doing the first scan, also cache the location of the ACL_MASK and
+ * ACL_OTHER entries, preventing some future iterations.
+ */
+ acl_mask = acl_other = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ dac_granted |= VADMIN;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((acc_mode & (dac_granted | priv_granted)) ==
+ acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ goto error;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * An ACL_OTHER entry should always exist in a valid access ACL. If
+ * it doesn't, then generate a serious failure. For now, this means
+ * a debugging message and EPERM, but in the future should probably
+ * be a panic.
+ */
+ if (acl_other == NULL) {
+ /*
+ * XXX This should never happen
+ */
+ printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+ return (EPERM);
+ }
+
+ /*
+ * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
+ * masked by an ACL_MASK entry, if any. As such, first identify the
+ * ACL_MASK field, then iterate through identifying potential user
+ * matches, then group matches. If there is no ACL_MASK, assume that
+ * the mask allows all requests to succeed.
+ */
+ if (acl_mask != NULL) {
+ acl_mask_granted = 0;
+ if (acl_mask->ae_perm & ACL_EXECUTE)
+ acl_mask_granted |= VEXEC;
+ if (acl_mask->ae_perm & ACL_READ)
+ acl_mask_granted |= VREAD;
+ if (acl_mask->ae_perm & ACL_WRITE)
+ acl_mask_granted |= (VWRITE | VAPPEND);
+ } else
+ acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
+
+ /*
+ * Check ACL_USER ACL entries. There will either be one or no
+ * matches; if there is one, we accept or reject based on the
+ * match; otherwise, we continue on to groups.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((acc_mode & (dac_granted | priv_granted)) !=
+ acc_mode)
+ goto error;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ }
+
+ /*
+ * Group match is best-match, not first-match, so find a "best"
+ * match. Iterate across, testing each potential group match. Make
+ * sure we keep track of whether we found a match or not, so that we
+ * know if we should try again with any available privilege, or if we
+ * should move on to ACL_OTHER.
+ */
+ group_matched = 0;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (group_matched == 1) {
+ /*
+ * There was a match, but it did not grant rights via pure
+ * DAC. Try again, this time with privilege.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((acc_mode & (dac_granted | priv_granted))
+ != acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id,
+ cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((acc_mode & (dac_granted | priv_granted))
+ != acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ /*
+ * Even with privilege, group membership was not sufficient.
+ * Return failure.
+ */
+ goto error;
+ }
+
+ /*
+ * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
+ */
+ dac_granted = 0;
+ if (acl_other->ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl_other->ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl_other->ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((acc_mode & (dac_granted | priv_granted)) == acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+error:
+ return ((acc_mode & VADMIN) ? EPERM : EACCES);
+}
+
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an inode
+ * with a mode_t field, this routine converts a mode_t entry to an
+ * acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+ acl_perm_t perm = 0;
+
+ switch(tag) {
+ case ACL_USER_OBJ:
+ if (mode & S_IXUSR)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRUSR)
+ perm |= ACL_READ;
+ if (mode & S_IWUSR)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_GROUP_OBJ:
+ if (mode & S_IXGRP)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRGRP)
+ perm |= ACL_READ;
+ if (mode & S_IWGRP)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_OTHER:
+ if (mode & S_IXOTH)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IROTH)
+ perm |= ACL_READ;
+ if (mode & S_IWOTH)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ default:
+ printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+ return (0);
+ }
+}
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct acl_entry acl_entry;
+
+ acl_entry.ae_tag = tag;
+ acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+ switch(tag) {
+ case ACL_USER_OBJ:
+ acl_entry.ae_id = uid;
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_entry.ae_id = gid;
+ break;
+
+ case ACL_OTHER:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ break;
+
+ default:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+ }
+
+ return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+ struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+ mode_t mode;
+
+ mode = 0;
+ if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWUSR;
+ if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWGRP;
+ if (acl_other_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXOTH;
+ if (acl_other_entry->ae_perm & ACL_READ)
+ mode |= S_IROTH;
+ if (acl_other_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWOTH;
+
+ return (mode);
+}
+
+/*
+ * Utility function to generate a file mode given a complete POSIX.1e access
+ * ACL. Note that if the ACL is improperly formed, this may result in a
+ * panic.
+ */
+mode_t
+acl_posix1e_acl_to_mode(struct acl *acl)
+{
+ struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other;
+ int i;
+
+ /*
+ * Find the ACL entries relevant to a POSIX permission mode.
+ */
+ acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl_user_obj = &acl->acl_entry[i];
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_group_obj = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ break;
+
+ default:
+ panic("acl_posix1e_acl_to_mode: bad ae_tag");
+ }
+ }
+
+ if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL)
+ panic("acl_posix1e_acl_to_mode: missing base ae_tags");
+
+ /*
+ * POSIX.1e specifies that if there is an ACL_MASK entry, we replace
+ * the mode "group" bits with its permissions. If there isn't, we
+ * use the ACL_GROUP_OBJ permissions.
+ */
+ if (acl_mask != NULL)
+ return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask,
+ acl_other));
+ else
+ return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj,
+ acl_other));
+}
+
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an implementing
+ * filesystem to determine if it should accept this and rely on the POSIX.1e
+ * ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+ int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+ int num_acl_mask, num_acl_other, i;
+
+ /*
+ * Verify that the number of entries does not exceed the maximum
+ * defined for acl_t.
+ *
+ * Verify that the correct number of various sorts of ae_tags are
+ * present:
+ * Exactly one ACL_USER_OBJ
+ * Exactly one ACL_GROUP_OBJ
+ * Exactly one ACL_OTHER
+ * If any ACL_USER or ACL_GROUP entries appear, then exactly one
+ * ACL_MASK entry must also appear.
+ *
+ * Verify that all ae_perm entries are in ACL_PERM_BITS.
+ *
+ * Verify all ae_tag entries are understood by this implementation.
+ *
+ * Note: Does not check for uniqueness of qualifier (ae_id) field.
+ */
+ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+ num_acl_mask = num_acl_other = 0;
+ if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
+ return (EINVAL);
+ for (i = 0; i < acl->acl_cnt; i++) {
+ /*
+ * Check for a valid tag.
+ */
+ switch(acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user_obj++;
+ break;
+ case ACL_GROUP_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group_obj++;
+ break;
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user++;
+ break;
+ case ACL_GROUP:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group++;
+ break;
+ case ACL_OTHER:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_other++;
+ break;
+ case ACL_MASK:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_mask++;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /*
+ * Check for valid perm entries.
+ */
+ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+ ACL_PERM_BITS)
+ return (EINVAL);
+ }
+ if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+ (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+ return (EINVAL);
+ if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+ (num_acl_mask != 1))
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Given a requested mode for a new object, and a default ACL, combine the
+ * two to produce a new mode. Be careful not to clear any bits that aren't
+ * intended to be affected by the POSIX.1e ACL. Eventually, this might also
+ * take the cmask as an argument, if we push that down into
+ * per-filesystem-code.
+ */
+mode_t
+acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl)
+{
+ mode_t mode;
+
+ mode = cmode;
+ /*
+ * The current composition policy is that a permission bit must be
+ * set in *both* the ACL and the requested creation mode for it to
+ * appear in the resulting mode/ACL. First clear any possibly
+ * affected bits, then reconstruct.
+ */
+ mode &= ACL_PRESERVE_MASK;
+ mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl));
+
+ return (mode);
+}
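
acl_posix1e_newfilemode() above composes the requested creation mode with the mode derived from a default ACL: a permission bit survives only when it is set in both, while bits outside the permission range are taken from the creation mode unchanged. The following standalone sketch restates that policy; the DEMO_* masks and demo_newfilemode() helper are local stand-ins for illustration, not the kernel's ACL_OVERRIDE_MASK/ACL_PRESERVE_MASK definitions.

/*
 * Standalone sketch of the mode-composition policy: a permission bit is
 * kept only if it appears in both the requested creation mode and the
 * mode derived from the default ACL; non-permission bits of the creation
 * mode are preserved.  DEMO_* values are illustrative placeholders.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

#define DEMO_OVERRIDE_MASK      (S_IRWXU | S_IRWXG | S_IRWXO) /* rwx bits */
#define DEMO_PRESERVE_MASK      (~DEMO_OVERRIDE_MASK)         /* setuid etc. */

static mode_t
demo_newfilemode(mode_t cmode, mode_t acl_mode)
{
        mode_t mode;

        /* Keep bits the ACL does not govern, then AND the permission bits. */
        mode = cmode & DEMO_PRESERVE_MASK;
        mode |= (DEMO_OVERRIDE_MASK & cmode & acl_mode);
        return (mode);
}

int
main(void)
{
        /* Request 0664 while the default ACL is equivalent to 0750. */
        mode_t result = demo_newfilemode(0664, 0750);

        printf("resulting mode: %04o\n", (unsigned int)result); /* 0640 */
        return (0);
}

With a requested mode of 0664 and a default ACL equivalent to 0750, the composed result is 0640: group write is dropped because the ACL does not grant it, and other read is dropped because the ACL grants no access to others.
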
Index: kern_sx.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_sx.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_sx.c -L sys/kern/kern_sx.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_sx.c
+++ sys/kern/kern_sx.c
@@ -1,12 +1,14 @@
/*-
- * Copyright (C) 2001 Jason Evans <jasone at freebsd.org>. All rights reserved.
+ * Copyright (c) 2007 Attilio Rao <attilio at freebsd.org>
+ * Copyright (c) 2001 Jason Evans <jasone at freebsd.org>
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice(s), this list of conditions and the following disclaimer as
- * the first lines of this file unmodified other than the possible
+ * the first lines of this file unmodified other than the possible
* addition of one or more copyright notices.
* 2. Redistributions in binary form must reproduce the above copyright
* notice(s), this list of conditions and the following disclaimer in the
@@ -26,40 +28,95 @@
*/
/*
- * Shared/exclusive locks. This implementation assures deterministic lock
- * granting behavior, so that slocks and xlocks are interleaved.
+ * Shared/exclusive locks. This implementation attempts to ensure
+ * deterministic lock granting behavior, so that slocks and xlocks are
+ * interleaved.
*
* Priority propagation will not generally raise the priority of lock holders,
* so should not be relied upon in combination with sx locks.
*/
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_sx.c,v 1.25.2.1 2005/12/20 19:28:23 jhb Exp $");
-
+#include "opt_adaptive_sx.h"
#include "opt_ddb.h"
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/kern_sx.c,v 1.55 2007/10/02 14:48:48 pjd Exp $");
+
#include <sys/param.h>
-#include <sys/systm.h>
#include <sys/ktr.h>
-#include <sys/linker_set.h>
-#include <sys/condvar.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/sleepqueue.h>
#include <sys/sx.h>
+#include <sys/systm.h>
+
+#ifdef ADAPTIVE_SX
+#include <machine/cpu.h>
+#endif
+#ifdef DDB
#include <ddb/ddb.h>
+#endif
+
+#if !defined(SMP) && defined(ADAPTIVE_SX)
+#error "You must have SMP to enable the ADAPTIVE_SX option"
+#endif
+
+CTASSERT(((SX_ADAPTIVESPIN | SX_RECURSE) & LO_CLASSFLAGS) ==
+ (SX_ADAPTIVESPIN | SX_RECURSE));
+
+/* Handy macros for sleep queues. */
+#define SQ_EXCLUSIVE_QUEUE 0
+#define SQ_SHARED_QUEUE 1
+
+/*
+ * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We
+ * drop Giant anytime we have to sleep or if we adaptively spin.
+ */
+#define GIANT_DECLARE \
+ int _giantcnt = 0; \
+ WITNESS_SAVE_DECL(Giant) \
+
+#define GIANT_SAVE() do { \
+ if (mtx_owned(&Giant)) { \
+ WITNESS_SAVE(&Giant.lock_object, Giant); \
+ while (mtx_owned(&Giant)) { \
+ _giantcnt++; \
+ mtx_unlock(&Giant); \
+ } \
+ } \
+} while (0)
+
+#define GIANT_RESTORE() do { \
+ if (_giantcnt > 0) { \
+ mtx_assert(&Giant, MA_NOTOWNED); \
+ while (_giantcnt--) \
+ mtx_lock(&Giant); \
+ WITNESS_RESTORE(&Giant.lock_object, Giant); \
+ } \
+} while (0)
+
+/*
+ * Returns true if an exclusive lock is recursed. It assumes
+ * curthread currently has an exclusive lock.
+ */
+#define sx_recursed(sx) ((sx)->sx_recurse != 0)
#ifdef DDB
static void db_show_sx(struct lock_object *lock);
#endif
+static void lock_sx(struct lock_object *lock, int how);
+static int unlock_sx(struct lock_object *lock);
struct lock_class lock_class_sx = {
- "sx",
- LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
+ .lc_name = "sx",
+ .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
#ifdef DDB
- db_show_sx
+ .lc_ddb_show = db_show_sx,
#endif
+ .lc_lock = lock_sx,
+ .lc_unlock = unlock_sx,
};
#ifndef INVARIANTS
@@ -67,6 +124,34 @@
#endif
void
+lock_sx(struct lock_object *lock, int how)
+{
+ struct sx *sx;
+
+ sx = (struct sx *)lock;
+ if (how)
+ sx_xlock(sx);
+ else
+ sx_slock(sx);
+}
+
+int
+unlock_sx(struct lock_object *lock)
+{
+ struct sx *sx;
+
+ sx = (struct sx *)lock;
+ sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
+ if (sx_xlocked(sx)) {
+ sx_xunlock(sx);
+ return (1);
+ } else {
+ sx_sunlock(sx);
+ return (0);
+ }
+}
+
+void
sx_sysinit(void *arg)
{
struct sx_args *sargs = arg;
@@ -75,250 +160,718 @@
}
void
-sx_init(struct sx *sx, const char *description)
+sx_init_flags(struct sx *sx, const char *description, int opts)
{
- struct lock_object *lock;
+ int flags;
- lock = &sx->sx_object;
- KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
- ("sx lock %s %p already initialized", description, sx));
- bzero(sx, sizeof(*sx));
- lock->lo_class = &lock_class_sx;
- lock->lo_type = lock->lo_name = description;
- lock->lo_flags = LO_WITNESS | LO_RECURSABLE | LO_SLEEPABLE |
- LO_UPGRADABLE;
- sx->sx_lock = mtx_pool_find(mtxpool_lockbuilder, sx);
- sx->sx_cnt = 0;
- cv_init(&sx->sx_shrd_cv, description);
- sx->sx_shrd_wcnt = 0;
- cv_init(&sx->sx_excl_cv, description);
- sx->sx_excl_wcnt = 0;
- sx->sx_xholder = NULL;
+ MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
+ SX_NOPROFILE | SX_ADAPTIVESPIN)) == 0);
- LOCK_LOG_INIT(lock, 0);
-
- WITNESS_INIT(lock);
+ flags = LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE;
+ if (opts & SX_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & SX_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ if (!(opts & SX_NOWITNESS))
+ flags |= LO_WITNESS;
+ if (opts & SX_QUIET)
+ flags |= LO_QUIET;
+
+ flags |= opts & (SX_ADAPTIVESPIN | SX_RECURSE);
+ sx->sx_lock = SX_LOCK_UNLOCKED;
+ sx->sx_recurse = 0;
+ lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags);
}
void
sx_destroy(struct sx *sx)
{
- LOCK_LOG_DESTROY(&sx->sx_object, 0);
+ KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
+ KASSERT(sx->sx_recurse == 0, ("sx lock still recursed"));
+ sx->sx_lock = SX_LOCK_DESTROYED;
+ lock_destroy(&sx->lock_object);
+}
- KASSERT((sx->sx_cnt == 0 && sx->sx_shrd_wcnt == 0 && sx->sx_excl_wcnt ==
- 0), ("%s (%s): holders or waiters\n", __func__,
- sx->sx_object.lo_name));
+int
+_sx_slock(struct sx *sx, int opts, const char *file, int line)
+{
+ int error = 0;
- sx->sx_lock = NULL;
- cv_destroy(&sx->sx_shrd_cv);
- cv_destroy(&sx->sx_excl_cv);
+ MPASS(curthread != NULL);
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_slock() of destroyed sx @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line);
+ error = __sx_slock(sx, opts, file, line);
+ if (!error) {
+ LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&sx->lock_object, 0, file, line);
+ curthread->td_locks++;
+ }
- WITNESS_DESTROY(&sx->sx_object);
+ return (error);
}
-void
-_sx_slock(struct sx *sx, const char *file, int line)
+int
+_sx_try_slock(struct sx *sx, const char *file, int line)
{
+ uintptr_t x;
- mtx_lock(sx->sx_lock);
- KASSERT(sx->sx_xholder != curthread,
- ("%s (%s): slock while xlock is held @ %s:%d\n", __func__,
- sx->sx_object.lo_name, file, line));
- WITNESS_CHECKORDER(&sx->sx_object, LOP_NEWORDER, file, line);
-
- /*
- * Loop in case we lose the race for lock acquisition.
- */
- while (sx->sx_cnt < 0) {
- sx->sx_shrd_wcnt++;
- cv_wait(&sx->sx_shrd_cv, sx->sx_lock);
- sx->sx_shrd_wcnt--;
+ for (;;) {
+ x = sx->sx_lock;
+ KASSERT(x != SX_LOCK_DESTROYED,
+ ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
+ if (!(x & SX_LOCK_SHARED))
+ break;
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) {
+ LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
+ WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
+ curthread->td_locks++;
+ return (1);
+ }
}
- /* Acquire a shared lock. */
- sx->sx_cnt++;
+ LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line);
+ return (0);
+}
- LOCK_LOG_LOCK("SLOCK", &sx->sx_object, 0, 0, file, line);
- WITNESS_LOCK(&sx->sx_object, 0, file, line);
+int
+_sx_xlock(struct sx *sx, int opts, const char *file, int line)
+{
+ int error = 0;
+
+ MPASS(curthread != NULL);
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_xlock() of destroyed sx @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+ line);
+ error = __sx_xlock(sx, curthread, opts, file, line);
+ if (!error) {
+ LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
+ file, line);
+ WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
+ }
- mtx_unlock(sx->sx_lock);
+ return (error);
}
int
-_sx_try_slock(struct sx *sx, const char *file, int line)
+_sx_try_xlock(struct sx *sx, const char *file, int line)
{
+ int rval;
- mtx_lock(sx->sx_lock);
- if (sx->sx_cnt >= 0) {
- sx->sx_cnt++;
- LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 1, file, line);
- WITNESS_LOCK(&sx->sx_object, LOP_TRYLOCK, file, line);
- mtx_unlock(sx->sx_lock);
- return (1);
- } else {
- LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 0, file, line);
- mtx_unlock(sx->sx_lock);
- return (0);
+ MPASS(curthread != NULL);
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
+
+ if (sx_xlocked(sx) && (sx->lock_object.lo_flags & SX_RECURSE) != 0) {
+ sx->sx_recurse++;
+ atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ rval = 1;
+ } else
+ rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED,
+ (uintptr_t)curthread);
+ LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
}
+
+ return (rval);
+}
+
+void
+_sx_sunlock(struct sx *sx, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_SLOCKED, file, line);
+ curthread->td_locks--;
+ WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
+#ifdef LOCK_PROFILING_SHARED
+ if (SX_SHARERS(sx->sx_lock) == 1)
+ lock_profile_release_lock(&sx->lock_object);
+#endif
+ __sx_sunlock(sx, file, line);
}
void
-_sx_xlock(struct sx *sx, const char *file, int line)
+_sx_xunlock(struct sx *sx, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_XLOCKED, file, line);
+ curthread->td_locks--;
+ WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
+ line);
+ if (!sx_recursed(sx))
+ lock_profile_release_lock(&sx->lock_object);
+ __sx_xunlock(sx, curthread, file, line);
+}
+
+/*
+ * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
+ * This will only succeed if this thread holds a single shared lock.
+ * Return 1 if the upgrade succeeded, 0 otherwise.
+ */
+int
+_sx_try_upgrade(struct sx *sx, const char *file, int line)
{
+ uintptr_t x;
+ int success;
- mtx_lock(sx->sx_lock);
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_SLOCKED, file, line);
/*
- * With sx locks, we're absolutely not permitted to recurse on
- * xlocks, as it is fatal (deadlock). Normally, recursion is handled
- * by WITNESS, but as it is not semantically correct to hold the
- * xlock while in here, we consider it API abuse and put it under
- * INVARIANTS.
+ * Try to switch from one shared lock to an exclusive lock. We need
+ * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
+ * we will wake up the exclusive waiters when we drop the lock.
*/
- KASSERT(sx->sx_xholder != curthread,
- ("%s (%s): xlock already held @ %s:%d", __func__,
- sx->sx_object.lo_name, file, line));
- WITNESS_CHECKORDER(&sx->sx_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
- line);
+ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS;
+ success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x,
+ (uintptr_t)curthread | x);
+ LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
+ if (success)
+ WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ return (success);
+}
- /* Loop in case we lose the race for lock acquisition. */
- while (sx->sx_cnt != 0) {
- sx->sx_excl_wcnt++;
- cv_wait(&sx->sx_excl_cv, sx->sx_lock);
- sx->sx_excl_wcnt--;
- }
+/*
+ * Downgrade an unrecursed exclusive lock into a single shared lock.
+ */
+void
+_sx_downgrade(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+ if (sx_recursed(sx))
+ panic("downgrade of a recursed lock");
+#endif
- MPASS(sx->sx_cnt == 0);
+ WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
+
+ /*
+ * Try to switch from an exclusive lock with no shared waiters
+ * to one sharer with no shared waiters. If there are
+ * exclusive waiters, we don't need to lock the sleep queue so
+ * long as we preserve the flag. We do one quick try and if
+ * that fails we grab the sleepq lock to keep the flags from
+ * changing and do it the slow way.
+ *
+ * We have to lock the sleep queue if there are shared waiters
+ * so we can wake them up.
+ */
+ x = sx->sx_lock;
+ if (!(x & SX_LOCK_SHARED_WAITERS) &&
+ atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
+ (x & SX_LOCK_EXCLUSIVE_WAITERS))) {
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
+ return;
+ }
- /* Acquire an exclusive lock. */
- sx->sx_cnt--;
- sx->sx_xholder = curthread;
+ /*
+ * Lock the sleep queue so we can read the waiters bits
+ * without any races and wakeup any shared waiters.
+ */
+ sleepq_lock(&sx->lock_object);
- LOCK_LOG_LOCK("XLOCK", &sx->sx_object, 0, 0, file, line);
- WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line);
+ /*
+ * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
+ * shared lock. If there are any shared waiters, wake them up.
+ */
+ x = sx->sx_lock;
+ atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
+ (x & SX_LOCK_EXCLUSIVE_WAITERS));
+ if (x & SX_LOCK_SHARED_WAITERS)
+ sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, -1,
+ SQ_SHARED_QUEUE);
+ else
+ sleepq_release(&sx->lock_object);
- mtx_unlock(sx->sx_lock);
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
}
+/*
+ * This function represents the so-called 'hard case' for sx_xlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
int
-_sx_try_xlock(struct sx *sx, const char *file, int line)
+_sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
+ int line)
{
-
- mtx_lock(sx->sx_lock);
- if (sx->sx_cnt == 0) {
- sx->sx_cnt--;
- sx->sx_xholder = curthread;
- LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 1, file, line);
- WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file,
- line);
- mtx_unlock(sx->sx_lock);
- return (1);
- } else {
- LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 0, file, line);
- mtx_unlock(sx->sx_lock);
+ GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+ volatile struct thread *owner;
+#endif
+ uint64_t waittime = 0;
+ uintptr_t x;
+ int contested = 0, error = 0;
+
+ /* If we already hold an exclusive lock, then recurse. */
+ if (sx_xlocked(sx)) {
+ KASSERT((sx->lock_object.lo_flags & SX_RECURSE) != 0,
+ ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
+ sx->lock_object.lo_name, file, line));
+ sx->sx_recurse++;
+ atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
return (0);
}
-}
-void
-_sx_sunlock(struct sx *sx, const char *file, int line)
-{
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+ sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
+
+ while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
+#ifdef ADAPTIVE_SX
+ /*
+ * If the lock is write locked and the owner is
+ * running on another CPU, spin until the owner stops
+ * running or the state of the lock changes.
+ */
+ x = sx->sx_lock;
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_ADAPTIVESPIN)) {
+ x = SX_OWNER(x);
+ owner = (struct thread *)x;
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, sx, owner);
+ GIANT_SAVE();
+ lock_profile_obtain_lock_failed(
+ &sx->lock_object, &contested, &waittime);
+ while (SX_OWNER(sx->sx_lock) == x &&
+ TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ continue;
+ }
+ }
+#endif
- _sx_assert(sx, SX_SLOCKED, file, line);
- mtx_lock(sx->sx_lock);
+ sleepq_lock(&sx->lock_object);
+ x = sx->sx_lock;
- WITNESS_UNLOCK(&sx->sx_object, 0, file, line);
+ /*
+ * If the lock was released while spinning on the
+ * sleep queue chain lock, try again.
+ */
+ if (x == SX_LOCK_UNLOCKED) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
- /* Release. */
- sx->sx_cnt--;
+#ifdef ADAPTIVE_SX
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the sleep queue
+ * chain lock. If so, drop the sleep queue lock and try
+ * again.
+ */
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_ADAPTIVESPIN)) {
+ owner = (struct thread *)SX_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ }
+#endif
- /*
- * If we just released the last shared lock, wake any waiters up, giving
- * exclusive lockers precedence. In order to make sure that exclusive
- * lockers won't be blocked forever, don't wake shared lock waiters if
- * there are exclusive lock waiters.
- */
- if (sx->sx_excl_wcnt > 0) {
- if (sx->sx_cnt == 0)
- cv_signal(&sx->sx_excl_cv);
- } else if (sx->sx_shrd_wcnt > 0)
- cv_broadcast(&sx->sx_shrd_cv);
+ /*
+ * If an exclusive lock was released with both shared
+ * and exclusive waiters and a shared waiter hasn't
+ * woken up and acquired the lock yet, sx_lock will be
+ * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
+ * If we see that value, try to acquire it once. Note
+ * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
+ * as there are other exclusive waiters still. If we
+ * fail, restart the loop.
+ */
+ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock,
+ SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS,
+ tid | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ CTR2(KTR_LOCK, "%s: %p claimed by new writer",
+ __func__, sx);
+ break;
+ }
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
- LOCK_LOG_LOCK("SUNLOCK", &sx->sx_object, 0, 0, file, line);
+ /*
+	 * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag.  If we fail,
+	 * then loop back and retry.
+ */
+ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+ if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+ x | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
+ __func__, sx);
+ }
+
+ /*
+ * Since we have been unable to acquire the exclusive
+ * lock and the exclusive waiters flag is set, we have
+ * to sleep.
+ */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+ __func__, sx);
+
+ GIANT_SAVE();
+ lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+ &waittime);
+ sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+ SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+ SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
+ if (!(opts & SX_INTERRUPTIBLE))
+ sleepq_wait(&sx->lock_object);
+ else
+ error = sleepq_wait_sig(&sx->lock_object);
+
+ if (error) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK,
+ "%s: interruptible sleep by %p suspended by signal",
+ __func__, sx);
+ break;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+ __func__, sx);
+ }
- mtx_unlock(sx->sx_lock);
+ GIANT_RESTORE();
+ if (!error)
+ lock_profile_obtain_lock_success(&sx->lock_object, contested,
+ waittime, file, line);
+ return (error);
}
+/*
+ * This function represents the so-called 'hard case' for sx_xunlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
void
-_sx_xunlock(struct sx *sx, const char *file, int line)
+_sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
{
+ uintptr_t x;
+ int queue;
- _sx_assert(sx, SX_XLOCKED, file, line);
- mtx_lock(sx->sx_lock);
- MPASS(sx->sx_cnt == -1);
-
- WITNESS_UNLOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line);
-
- /* Release. */
- sx->sx_cnt++;
- sx->sx_xholder = NULL;
+ MPASS(!(sx->sx_lock & SX_LOCK_SHARED));
- /*
- * Wake up waiters if there are any. Give precedence to slock waiters.
- */
- if (sx->sx_shrd_wcnt > 0)
- cv_broadcast(&sx->sx_shrd_cv);
- else if (sx->sx_excl_wcnt > 0)
- cv_signal(&sx->sx_excl_cv);
+ /* If the lock is recursed, then unrecurse one level. */
+ if (sx_xlocked(sx) && sx_recursed(sx)) {
+ if ((--sx->sx_recurse) == 0)
+ atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
+ return;
+ }
+ MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS |
+ SX_LOCK_EXCLUSIVE_WAITERS));
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
- LOCK_LOG_LOCK("XUNLOCK", &sx->sx_object, 0, 0, file, line);
+ sleepq_lock(&sx->lock_object);
+ x = SX_LOCK_UNLOCKED;
- mtx_unlock(sx->sx_lock);
+ /*
+ * The wake up algorithm here is quite simple and probably not
+ * ideal. It gives precedence to shared waiters if they are
+ * present. For this condition, we have to preserve the
+ * state of the exclusive waiters flag.
+ */
+ if (sx->sx_lock & SX_LOCK_SHARED_WAITERS) {
+ queue = SQ_SHARED_QUEUE;
+ x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
+ } else
+ queue = SQ_EXCLUSIVE_QUEUE;
+
+ /* Wake up all the waiters for the specific queue. */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
+ __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ atomic_store_rel_ptr(&sx->sx_lock, x);
+ sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, -1, queue);
}
+/*
+ * This function represents the so-called 'hard case' for sx_slock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
int
-_sx_try_upgrade(struct sx *sx, const char *file, int line)
+_sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
{
+ GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+ volatile struct thread *owner;
+#endif
+#ifdef LOCK_PROFILING_SHARED
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ uintptr_t x;
+ int error = 0;
- _sx_assert(sx, SX_SLOCKED, file, line);
- mtx_lock(sx->sx_lock);
+ /*
+ * As with rwlocks, we don't make any attempt to try to block
+ * shared locks once there is an exclusive waiter.
+ */
+ for (;;) {
+ x = sx->sx_lock;
- if (sx->sx_cnt == 1) {
- sx->sx_cnt = -1;
- sx->sx_xholder = curthread;
+ /*
+ * If no other thread has an exclusive lock then try to bump up
+ * the count of sharers. Since we have to preserve the state
+ * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
+ * shared lock loop back and retry.
+ */
+ if (x & SX_LOCK_SHARED) {
+ MPASS(!(x & SX_LOCK_SHARED_WAITERS));
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock, x,
+ x + SX_ONE_SHARER)) {
+#ifdef LOCK_PROFILING_SHARED
+ if (SX_SHARERS(x) == 0)
+ lock_profile_obtain_lock_success(
+ &sx->lock_object, contested,
+ waittime, file, line);
+#endif
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeed %p -> %p", __func__,
+ sx, (void *)x,
+ (void *)(x + SX_ONE_SHARER));
+ break;
+ }
+ continue;
+ }
- LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 1, file, line);
- WITNESS_UPGRADE(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
- file, line);
+#ifdef ADAPTIVE_SX
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ else if (sx->lock_object.lo_flags & SX_ADAPTIVESPIN) {
+ x = SX_OWNER(x);
+ owner = (struct thread *)x;
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, sx, owner);
+ GIANT_SAVE();
+#ifdef LOCK_PROFILING_SHARED
+ lock_profile_obtain_lock_failed(
+ &sx->lock_object, &contested, &waittime);
+#endif
+ while (SX_OWNER(sx->sx_lock) == x &&
+ TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ continue;
+ }
+ }
+#endif
- mtx_unlock(sx->sx_lock);
- return (1);
- } else {
- LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 0, file, line);
- mtx_unlock(sx->sx_lock);
- return (0);
+ /*
+ * Some other thread already has an exclusive lock, so
+ * start the process of blocking.
+ */
+ sleepq_lock(&sx->lock_object);
+ x = sx->sx_lock;
+
+ /*
+ * The lock could have been released while we spun.
+ * In this case loop back and retry.
+ */
+ if (x & SX_LOCK_SHARED) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_SX
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_ADAPTIVESPIN)) {
+ owner = (struct thread *)SX_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Try to set the SX_LOCK_SHARED_WAITERS flag. If we
+ * fail to set it drop the sleep queue lock and loop
+ * back.
+ */
+ if (!(x & SX_LOCK_SHARED_WAITERS)) {
+ if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+ x | SX_LOCK_SHARED_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
+ __func__, sx);
+ }
+
+ /*
+ * Since we have been unable to acquire the shared lock,
+ * we have to sleep.
+ */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+ __func__, sx);
+
+ GIANT_SAVE();
+#ifdef LOCK_PROFILING_SHARED
+ lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+ &waittime);
+#endif
+ sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+ SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+ SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
+ if (!(opts & SX_INTERRUPTIBLE))
+ sleepq_wait(&sx->lock_object);
+ else
+ error = sleepq_wait_sig(&sx->lock_object);
+
+ if (error) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK,
+ "%s: interruptible sleep by %p suspended by signal",
+ __func__, sx);
+ break;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+ __func__, sx);
}
+
+ GIANT_RESTORE();
+ return (error);
}
+/*
+ * This function represents the so-called 'hard case' for sx_sunlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
void
-_sx_downgrade(struct sx *sx, const char *file, int line)
+_sx_sunlock_hard(struct sx *sx, const char *file, int line)
{
+ uintptr_t x;
+
+ for (;;) {
+ x = sx->sx_lock;
- _sx_assert(sx, SX_XLOCKED, file, line);
- mtx_lock(sx->sx_lock);
- MPASS(sx->sx_cnt == -1);
+ /*
+	 * We should never have shared waiters while at least one thread
+ * holds a shared lock.
+ */
+ KASSERT(!(x & SX_LOCK_SHARED_WAITERS),
+ ("%s: waiting sharers", __func__));
- WITNESS_DOWNGRADE(&sx->sx_object, 0, file, line);
+ /*
+ * See if there is more than one shared lock held. If
+ * so, just drop one and return.
+ */
+ if (SX_SHARERS(x) > 1) {
+ if (atomic_cmpset_ptr(&sx->sx_lock, x,
+ x - SX_ONE_SHARER)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeeded %p -> %p",
+ __func__, sx, (void *)x,
+ (void *)(x - SX_ONE_SHARER));
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * If there aren't any waiters for an exclusive lock,
+ * then try to drop it quickly.
+ */
+ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+ MPASS(x == SX_SHARERS_LOCK(1));
+ if (atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1),
+ SX_LOCK_UNLOCKED)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded",
+ __func__, sx);
+ break;
+ }
+ continue;
+ }
- sx->sx_cnt = 1;
- sx->sx_xholder = NULL;
- if (sx->sx_shrd_wcnt > 0)
- cv_broadcast(&sx->sx_shrd_cv);
+ /*
+ * At this point, there should just be one sharer with
+ * exclusive waiters.
+ */
+ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS));
- LOCK_LOG_LOCK("XDOWNGRADE", &sx->sx_object, 0, 0, file, line);
+ sleepq_lock(&sx->lock_object);
- mtx_unlock(sx->sx_lock);
+ /*
+ * Wake up semantic here is quite simple:
+ * Just wake up all the exclusive waiters.
+ * Note that the state of the lock could have changed,
+ * so if it fails loop back and retry.
+ */
+ if (!atomic_cmpset_ptr(&sx->sx_lock,
+ SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS,
+ SX_LOCK_UNLOCKED)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+		CTR2(KTR_LOCK, "%s: %p waking up all threads on"
+		    " exclusive queue", __func__, sx);
+ sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, -1,
+ SQ_EXCLUSIVE_QUEUE);
+ break;
+ }
}
#ifdef INVARIANT_SUPPORT
@@ -334,44 +887,76 @@
void
_sx_assert(struct sx *sx, int what, const char *file, int line)
{
+#ifndef WITNESS
+ int slocked = 0;
+#endif
if (panicstr != NULL)
return;
switch (what) {
- case SX_LOCKED:
- case SX_SLOCKED:
+ case SA_SLOCKED:
+ case SA_SLOCKED | SA_NOTRECURSED:
+ case SA_SLOCKED | SA_RECURSED:
+#ifndef WITNESS
+ slocked = 1;
+ /* FALLTHROUGH */
+#endif
+ case SA_LOCKED:
+ case SA_LOCKED | SA_NOTRECURSED:
+ case SA_LOCKED | SA_RECURSED:
#ifdef WITNESS
- witness_assert(&sx->sx_object, what, file, line);
+ witness_assert(&sx->lock_object, what, file, line);
#else
- mtx_lock(sx->sx_lock);
- if (sx->sx_cnt <= 0 &&
- (what == SX_SLOCKED || sx->sx_xholder != curthread))
+ /*
+ * If some other thread has an exclusive lock or we
+ * have one and are asserting a shared lock, fail.
+ * Also, if no one has a lock at all, fail.
+ */
+ if (sx->sx_lock == SX_LOCK_UNLOCKED ||
+ (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
+ sx_xholder(sx) != curthread)))
panic("Lock %s not %slocked @ %s:%d\n",
- sx->sx_object.lo_name, (what == SX_SLOCKED) ?
- "share " : "", file, line);
- mtx_unlock(sx->sx_lock);
+ sx->lock_object.lo_name, slocked ? "share " : "",
+ file, line);
+
+ if (!(sx->sx_lock & SX_LOCK_SHARED)) {
+ if (sx_recursed(sx)) {
+ if (what & SA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file,
+ line);
+ } else if (what & SA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ }
#endif
break;
- case SX_XLOCKED:
- mtx_lock(sx->sx_lock);
- if (sx->sx_xholder != curthread)
+ case SA_XLOCKED:
+ case SA_XLOCKED | SA_NOTRECURSED:
+ case SA_XLOCKED | SA_RECURSED:
+ if (sx_xholder(sx) != curthread)
panic("Lock %s not exclusively locked @ %s:%d\n",
- sx->sx_object.lo_name, file, line);
- mtx_unlock(sx->sx_lock);
+ sx->lock_object.lo_name, file, line);
+ if (sx_recursed(sx)) {
+ if (what & SA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ } else if (what & SA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
break;
- case SX_UNLOCKED:
+ case SA_UNLOCKED:
#ifdef WITNESS
- witness_assert(&sx->sx_object, what, file, line);
+ witness_assert(&sx->lock_object, what, file, line);
#else
/*
- * We are able to check only exclusive lock here,
- * we cannot assert that *this* thread owns slock.
+		 * If we hold an exclusive lock, fail.  We can't
+ * reliably check to see if we hold a shared lock or
+ * not.
*/
- mtx_lock(sx->sx_lock);
- if (sx->sx_xholder == curthread)
+ if (sx_xholder(sx) == curthread)
panic("Lock %s exclusively locked @ %s:%d\n",
- sx->sx_object.lo_name, file, line);
- mtx_unlock(sx->sx_lock);
+ sx->lock_object.lo_name, file, line);
#endif
break;
default:
@@ -382,7 +967,7 @@
#endif /* INVARIANT_SUPPORT */
#ifdef DDB
-void
+static void
db_show_sx(struct lock_object *lock)
{
struct thread *td;
@@ -391,15 +976,66 @@
sx = (struct sx *)lock;
db_printf(" state: ");
- if (sx->sx_cnt < 0) {
- td = sx->sx_xholder;
+ if (sx->sx_lock == SX_LOCK_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (sx->sx_lock == SX_LOCK_DESTROYED) {
+ db_printf("DESTROYED\n");
+ return;
+ } else if (sx->sx_lock & SX_LOCK_SHARED)
+ db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
+ else {
+ td = sx_xholder(sx);
db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm);
- } else if (sx->sx_cnt > 0)
- db_printf("SLOCK: %d locks\n", sx->sx_cnt);
+ if (sx_recursed(sx))
+ db_printf(" recursed: %d\n", sx->sx_recurse);
+ }
+
+ db_printf(" waiters: ");
+ switch(sx->sx_lock &
+ (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ case SX_LOCK_SHARED_WAITERS:
+ db_printf("shared\n");
+ break;
+ case SX_LOCK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive\n");
+ break;
+ case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive and shared\n");
+ break;
+ default:
+ db_printf("none\n");
+ }
+}
+
+/*
+ * Check to see if a thread that is blocked on a sleep queue is actually
+ * blocked on an sx lock. If so, output some details and return true.
+ * If the lock has an exclusive owner, return that in *ownerp.
+ */
+int
+sx_chain(struct thread *td, struct thread **ownerp)
+{
+ struct sx *sx;
+
+ /*
+ * Check to see if this thread is blocked on an sx lock.
+ * First, we check the lock class. If that is ok, then we
+ * compare the lock name against the wait message.
+ */
+ sx = td->td_wchan;
+ if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
+ sx->lock_object.lo_name != td->td_wmesg)
+ return (0);
+
+ /* We think we have an sx lock, so output some details. */
+ db_printf("blocked on sx \"%s\" ", td->td_wmesg);
+ *ownerp = sx_xholder(sx);
+ if (sx->sx_lock & SX_LOCK_SHARED)
+ db_printf("SLOCK (count %ju)\n",
+ (uintmax_t)SX_SHARERS(sx->sx_lock));
else
- db_printf("UNLOCKED\n");
- db_printf(" waiters: %d shared, %d exclusive\n", sx->sx_shrd_wcnt,
- sx->sx_excl_wcnt);
+ db_printf("XLOCK\n");
+ return (1);
}
#endif
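For readers following the sx(9) rework above: the lock is now a single atomically
updated word plus a recursion counter instead of a pool mutex, a count, and two
condition variables, and consumers use the new sx_init_flags() options and SA_*
assertion names. The fragment below is a minimal, illustrative consumer sketch
only, not part of this commit; the "foo_softc" structure and its fields are
hypothetical.

	/* Hedged sx(9) usage sketch, assuming the API shown in the diff above. */
	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/sx.h>

	struct foo_softc {
		struct sx	foo_lock;	/* protects foo_count */
		int		foo_count;
	};

	static void
	foo_init(struct foo_softc *sc)
	{
		/* Allow recursion on the exclusive side via SX_RECURSE. */
		sx_init_flags(&sc->foo_lock, "foo lock", SX_RECURSE);
	}

	static void
	foo_bump(struct foo_softc *sc)
	{
		sx_xlock(&sc->foo_lock);		/* exclusive (writer) lock */
		sc->foo_count++;
		sx_assert(&sc->foo_lock, SA_XLOCKED);	/* new SA_* assertion names */
		sx_xunlock(&sc->foo_lock);
	}

	static int
	foo_read(struct foo_softc *sc)
	{
		int v;

		sx_slock(&sc->foo_lock);		/* shared (reader) lock */
		v = sc->foo_count;
		sx_sunlock(&sc->foo_lock);
		return (v);
	}

	static void
	foo_fini(struct foo_softc *sc)
	{
		sx_destroy(&sc->foo_lock);	/* must be unlocked, as asserted above */
	}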
Index: vfs_lookup.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_lookup.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_lookup.c -L sys/kern/vfs_lookup.c -u -r1.2 -r1.3
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_lookup.c,v 1.80.2.6.2.1 2006/04/30 03:58:12 kris Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_lookup.c,v 1.102 2007/09/21 10:16:56 pjd Exp $");
#include "opt_ktrace.h"
#include "opt_mac.h"
@@ -45,7 +45,6 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
@@ -58,6 +57,9 @@
#include <sys/ktrace.h>
#endif
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/uma.h>
#define NAMEI_DIAGNOSTIC 1
@@ -67,13 +69,22 @@
* Allocation zone for namei
*/
uma_zone_t namei_zone;
+/*
+ * Placeholder vnode for mp traversal
+ */
+static struct vnode *vp_crossmp;
static void
nameiinit(void *dummy __unused)
{
+ int error;
+
namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
-
+ error = getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
+ if (error != 0)
+ panic("nameiinit: getnewvnode");
+ vp_crossmp->v_vnlock->lk_flags &= ~LK_NOSHARE;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
@@ -86,7 +97,7 @@
"Enables/Disables shared locks for path name translation");
/*
- * Convert a pathname into a pointer to a locked inode.
+ * Convert a pathname into a pointer to a locked vnode.
*
* The FOLLOW flag is set when symbolic links are to be followed
* when they occur at the end of the name translation process.
@@ -106,12 +117,11 @@
* }
*/
int
-namei(ndp)
- register struct nameidata *ndp;
+namei(struct nameidata *ndp)
{
- register struct filedesc *fdp; /* pointer to file descriptor state */
- register char *cp; /* pointer into pathname argument */
- register struct vnode *dp; /* the directory we are searching */
+ struct filedesc *fdp; /* pointer to file descriptor state */
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
struct iovec aiov; /* uio for reading symbolic links */
struct uio auio;
int error, linklen;
@@ -145,6 +155,12 @@
error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ /* If we are auditing the kernel pathname, save the user pathname. */
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
+
/*
* Don't allow empty pathnames.
*/
@@ -172,14 +188,14 @@
/*
* Get starting point for the translation.
*/
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
ndp->ni_rootdir = fdp->fd_rdir;
ndp->ni_topdir = fdp->fd_jdir;
dp = fdp->fd_cdir;
vfslocked = VFS_LOCK_GIANT(dp->v_mount);
VREF(dp);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
for (;;) {
/*
* Check if root directory should replace current directory.
@@ -296,6 +312,17 @@
return (error);
}
+static int
+compute_cn_lkflags(struct mount *mp, int lkflags)
+{
+ if (mp == NULL ||
+ ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
+ lkflags &= ~LK_SHARED;
+ lkflags |= LK_EXCLUSIVE;
+ }
+ return lkflags;
+}
+
/*
* Search a pathname.
* This is a very central and rather complicated routine.
@@ -335,11 +362,10 @@
* if WANTPARENT set, return unlocked parent in ni_dvp
*/
int
-lookup(ndp)
- register struct nameidata *ndp;
+lookup(struct nameidata *ndp)
{
- register char *cp; /* pointer into pathname argument */
- register struct vnode *dp = 0; /* the directory we are searching */
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp = 0; /* the directory we are searching */
struct vnode *tdp; /* saved dp */
struct mount *mp; /* mount table entry */
int docache; /* == 0 do not cache last component */
@@ -353,7 +379,8 @@
int vfslocked; /* VFS Giant state for child */
int dvfslocked; /* VFS Giant state for parent */
int tvfslocked;
-
+ int lkflags_save;
+
/*
* Setup: break out flag bits into variables.
*/
@@ -381,7 +408,7 @@
cnp->cn_lkflags = LK_EXCLUSIVE;
dp = ndp->ni_startdir;
ndp->ni_startdir = NULLVP;
- vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
+ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
dirloop:
/*
@@ -460,6 +487,12 @@
VREF(dp);
}
ndp->ni_vp = dp;
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG(vnode, dp, ARG_VNODE1);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG(vnode, dp, ARG_VNODE2);
+
if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
VOP_UNLOCK(dp, 0, td);
/* XXX This should probably move to the top of function. */
@@ -491,15 +524,16 @@
for (;;) {
if (dp == ndp->ni_rootdir ||
dp == ndp->ni_topdir ||
- dp == rootvnode) {
+ dp == rootvnode ||
+ ((dp->v_vflag & VV_ROOT) != 0 &&
+ (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
ndp->ni_dvp = dp;
ndp->ni_vp = dp;
vfslocked = VFS_LOCK_GIANT(dp->v_mount);
VREF(dp);
goto nextname;
}
- if ((dp->v_vflag & VV_ROOT) == 0 ||
- (cnp->cn_flags & NOCROSSMOUNT))
+ if ((dp->v_vflag & VV_ROOT) == 0)
break;
if (dp->v_iflag & VI_DOOMED) { /* forced unmount */
error = EBADF;
@@ -512,7 +546,7 @@
VREF(dp);
vput(tdp);
VFS_UNLOCK_GIANT(tvfslocked);
- vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
+ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
}
}
@@ -535,7 +569,8 @@
* If we have a shared lock we may need to upgrade the lock for the
* last operation.
*/
- if (VOP_ISLOCKED(dp, td) == LK_SHARED &&
+ if (dp != vp_crossmp &&
+ VOP_ISLOCKED(dp, td) == LK_SHARED &&
(cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
/*
@@ -548,7 +583,10 @@
#ifdef NAMEI_DIAGNOSTIC
vprint("lookup in", dp);
#endif
+ lkflags_save = cnp->cn_lkflags;
+ cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
+ cnp->cn_lkflags = lkflags_save;
KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
#ifdef NAMEI_DIAGNOSTIC
printf("not found\n");
@@ -563,7 +601,7 @@
VREF(dp);
vput(tdp);
VFS_UNLOCK_GIANT(tvfslocked);
- vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
+ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
goto unionlookup;
}
@@ -593,14 +631,15 @@
/*
* We return with ni_vp NULL to indicate that the entry
* doesn't currently exist, leaving a pointer to the
- * (possibly locked) directory inode in ndp->ni_dvp.
+ * (possibly locked) directory vnode in ndp->ni_dvp.
*/
if (cnp->cn_flags & SAVESTART) {
ndp->ni_startdir = ndp->ni_dvp;
VREF(ndp->ni_startdir);
}
goto success;
- }
+ } else
+ cnp->cn_lkflags = lkflags_save;
#ifdef NAMEI_DIAGNOSTIC
printf("found\n");
#endif
@@ -630,10 +669,17 @@
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = VFS_LOCK_GIANT(mp);
if (dp != ndp->ni_dvp)
- VOP_UNLOCK(ndp->ni_dvp, 0, td);
- error = VFS_ROOT(mp, cnp->cn_lkflags, &tdp, td);
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ VFS_UNLOCK_GIANT(dvfslocked);
+ dvfslocked = 0;
+ vref(vp_crossmp);
+ ndp->ni_dvp = vp_crossmp;
+ error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
vfs_unbusy(mp, td);
- vn_lock(ndp->ni_dvp, cnp->cn_lkflags | LK_RETRY, td);
+ if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT, td))
+ panic("vp_crossmp exclusively locked or reclaimed");
if (error) {
dpunlocked = 1;
goto bad2;
@@ -718,9 +764,22 @@
} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
VOP_UNLOCK(ndp->ni_dvp, 0, td);
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG(vnode, dp, ARG_VNODE1);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG(vnode, dp, ARG_VNODE2);
+
if ((cnp->cn_flags & LOCKLEAF) == 0)
VOP_UNLOCK(dp, 0, td);
success:
+ /*
+ * Because of lookup_shared we may have the vnode shared locked, but
+ * the caller may want it to be exclusively locked.
+ */
+ if ((cnp->cn_flags & (ISLASTCN | LOCKSHARED | LOCKLEAF)) ==
+ (ISLASTCN | LOCKLEAF) && VOP_ISLOCKED(dp, td) != LK_EXCLUSIVE) {
+ vn_lock(dp, LK_UPGRADE | LK_RETRY, td);
+ }
if (vfslocked && dvfslocked)
VFS_UNLOCK_GIANT(dvfslocked); /* Only need one */
if (vfslocked || dvfslocked)
@@ -744,12 +803,10 @@
/*
* relookup - lookup a path name component
- * Used by lookup to re-aquire things.
+ * Used by lookup to re-acquire things.
*/
int
-relookup(dvp, vpp, cnp)
- struct vnode *dvp, **vpp;
- struct componentname *cnp;
+relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
struct thread *td = cnp->cn_thread;
struct vnode *dp = 0; /* the directory we are searching */
@@ -840,10 +897,11 @@
/*
* We return with ni_vp NULL to indicate that the entry
* doesn't currently exist, leaving a pointer to the
- * (possibly locked) directory inode in ndp->ni_dvp.
+ * (possibly locked) directory vnode in ndp->ni_dvp.
*/
return (0);
}
+
dp = *vpp;
/*
@@ -891,9 +949,7 @@
* Free data allocated by namei(); see namei(9) for details.
*/
void
-NDFREE(ndp, flags)
- struct nameidata *ndp;
- const u_int flags;
+NDFREE(struct nameidata *ndp, const u_int flags)
{
int unlock_dvp;
int unlock_vp;
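A note on the vfs_lookup.c change just above before the new tty_pts.c diff:
lookup() can now hand back the leaf vnode with only a shared lock when
LOCKSHARED is requested and the filesystem advertises MNTK_LOOKUP_SHARED;
compute_cn_lkflags() quietly upgrades to LK_EXCLUSIVE otherwise. The caller
sketch below is illustrative only (the function name is hypothetical) and is
not part of this commit.

	static int
	example_lookup(struct thread *td, char *path, struct vnode **vpp)
	{
		struct nameidata nd;
		int error;

		/*
		 * Ask for a shared leaf lock; lookup() may still return the
		 * vnode exclusively locked if the filesystem does not permit
		 * shared path lookups.
		 */
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | AUDITVNODE1,
		    UIO_USERSPACE, path, td);
		error = namei(&nd);
		if (error != 0)
			return (error);
		NDFREE(&nd, NDF_ONLY_PNBUF);	/* release the pathname buffer */
		*vpp = nd.ni_vp;		/* locked (possibly shared) vnode */
		return (0);
	}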
--- /dev/null
+++ sys/kern/tty_pts.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2003 Networks Associates Technology, Inc.
+ * Copyright (c) 2006 Robert N. M. Watson
+ * Copyright (c) 2006 Olivier Houchard
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/tty_pts.c,v 1.16 2007/07/05 05:54:47 peter Exp $");
+
+/*
+ * Pseudo-teletype Driver
+ * (Actually two drivers, requiring two entries in 'cdevsw')
+ */
+#include "opt_compat.h"
+#include "opt_tty.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#if defined(COMPAT_43TTY)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/tty.h>
+#include <sys/fcntl.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/filio.h>
+
+static MALLOC_DEFINE(M_PTY, "ptys", "pty data structures");
+
+static void ptsstart(struct tty *tp);
+static void ptsstop(struct tty *tp, int rw);
+static void ptcwakeup(struct tty *tp, int flag);
+
+static d_open_t ptsopen;
+static d_close_t ptsclose;
+static d_read_t ptsread;
+static d_write_t ptswrite;
+static d_ioctl_t ptsioctl;
+static d_ioctl_t ptcioctl;
+static d_open_t ptcopen;
+static d_close_t ptcclose;
+static d_read_t ptcread;
+static d_write_t ptcwrite;
+static d_poll_t ptcpoll;
+
+static struct cdevsw pts_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ptsopen,
+ .d_close = ptsclose,
+ .d_read = ptsread,
+ .d_write = ptswrite,
+ .d_ioctl = ptsioctl,
+ .d_poll = ttypoll,
+ .d_name = "pts",
+ .d_flags = D_TTY | D_NEEDGIANT,
+ .d_kqfilter = ttykqfilter,
+};
+
+static struct cdevsw ptc_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ptcopen,
+ .d_close = ptcclose,
+ .d_read = ptcread,
+ .d_write = ptcwrite,
+ .d_ioctl = ptcioctl,
+ .d_poll = ptcpoll,
+ .d_name = "ptc",
+ .d_flags = D_TTY | D_NEEDGIANT,
+ .d_kqfilter = ttykqfilter,
+};
+
+#define BUFSIZ 100 /* Chunk size iomoved to/from user */
+
+#define TSA_PTC_READ(tp) ((void *)&(tp)->t_outq.c_cf)
+#define TSA_PTC_WRITE(tp) ((void *)&(tp)->t_rawq.c_cl)
+#define TSA_PTS_READ(tp) ((void *)&(tp)->t_canq)
+
+#define NUM_TO_MINOR(c) ((c & 0xff) | ((c & ~0xff) << 16))
+/*-
+ * Once a tty is allocated, it cannot (currently) be freed. As such,
+ * we keep a global list of ptys that have been used so we can recycle
+ * them. Another list is provided for released pts, which are
+ * not currently allocated, permitting reuse. pt_flags holds state
+ * associated with a particular session, so isn't overloaded for this.
+ * When a pty descriptor is unused, its number is set to -1 giving
+ * more consistent and traditional allocation orders to pty numbers.
+ *
+ * Locking: (p) indicates that the field is locked by the global pt_mtx.
+ * (c) indicates the value is constant after allocation. Other fields
+ * await tty locking generally, and are protected by Giant.
+ */
+struct pt_desc {
+ int pt_num; /* (c) pty number */
+ LIST_ENTRY(pt_desc) pt_list; /* (p) global pty list */
+
+ int pt_flags;
+ struct selinfo pt_selr, pt_selw;
+ u_char pt_send;
+ u_char pt_ucntl;
+ struct tty *pt_tty;
+ struct cdev *pt_devs, *pt_devc;
+ int pt_pts_open, pt_ptc_open;
+ struct prison *pt_prison;
+};
+
+static struct mtx pt_mtx;
+static LIST_HEAD(,pt_desc) pt_list;
+static LIST_HEAD(,pt_desc) pt_free_list;
+
+#define PF_PKT 0x008 /* packet mode */
+#define PF_STOPPED 0x010 /* user told stopped */
+#define PF_NOSTOP 0x040
+#define PF_UCNTL 0x080 /* user control mode */
+
+static unsigned int next_avail_nb;
+
+static int use_pts = 0;
+
+static unsigned int max_pts = 1000;
+
+static unsigned int nb_allocated;
+
+TUNABLE_INT("kern.pts.enable", &use_pts);
+
+SYSCTL_NODE(_kern, OID_AUTO, pts, CTLFLAG_RD, 0, "pts");
+
+SYSCTL_INT(_kern_pts, OID_AUTO, enable, CTLFLAG_RW, &use_pts, 0,
+ "enable pts");
+
+SYSCTL_INT(_kern_pts, OID_AUTO, max, CTLFLAG_RW, &max_pts, 0, "max pts");
+
+/*
+ * If there's a free pty descriptor in the pty descriptor list, retrieve it.
+ * Otherwise, allocate a new one, initialize it, and hook it up. If there's
+ * not a tty number, reject.
+ */
+static struct pt_desc *
+pty_new(void)
+{
+ struct pt_desc *pt;
+ int nb;
+
+ mtx_lock(&pt_mtx);
+ if (nb_allocated >= max_pts || nb_allocated == 0xffffff) {
+ mtx_unlock(&pt_mtx);
+ return (NULL);
+ }
+ nb_allocated++;
+ pt = LIST_FIRST(&pt_free_list);
+ if (pt) {
+ LIST_REMOVE(pt, pt_list);
+ LIST_INSERT_HEAD(&pt_list, pt, pt_list);
+ mtx_unlock(&pt_mtx);
+ } else {
+ nb = next_avail_nb++;
+ mtx_unlock(&pt_mtx);
+ pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO);
+ mtx_lock(&pt_mtx);
+ pt->pt_num = nb;
+ LIST_INSERT_HEAD(&pt_list, pt, pt_list);
+ mtx_unlock(&pt_mtx);
+ pt->pt_tty = ttyalloc();
+ }
+ return (pt);
+}
+
+/*
+ * Release a pty descriptor back to the pool for reuse. The pty number
+ * remains allocated.
+ */
+static void
+pty_release(void *v)
+{
+ struct pt_desc *pt = (struct pt_desc *)v;
+
+ mtx_lock(&pt_mtx);
+ KASSERT(pt->pt_ptc_open == 0 && pt->pt_pts_open == 0,
+ ("pty_release: pts/%d freed while open\n", pt->pt_num));
+ KASSERT(pt->pt_devs == NULL && pt->pt_devc == NULL,
+ ("pty_release: pts/%d freed whith non-null struct cdev\n", pt->pt_num));
+ nb_allocated--;
+ LIST_REMOVE(pt, pt_list);
+ LIST_INSERT_HEAD(&pt_free_list, pt, pt_list);
+ mtx_unlock(&pt_mtx);
+}
+
+/*
+ * Given a pty descriptor, if both endpoints are closed, release all
+ * resources and destroy the device nodes to flush file system level
+ * state for the tty (owner, avoid races, etc).
+ */
+static void
+pty_maybecleanup(struct pt_desc *pt)
+{
+ struct cdev *pt_devs, *pt_devc;
+
+ if (pt->pt_ptc_open || pt->pt_pts_open)
+ return;
+
+ if (pt->pt_tty->t_refcnt > 1)
+ return;
+
+ if (bootverbose)
+ printf("destroying pty %d\n", pt->pt_num);
+
+ pt_devs = pt->pt_devs;
+ pt_devc = pt->pt_devc;
+ pt->pt_devs = pt->pt_devc = NULL;
+ pt->pt_tty->t_dev = NULL;
+ pt_devc->si_drv1 = NULL;
+ ttyrel(pt->pt_tty);
+ pt->pt_tty = NULL;
+ destroy_dev_sched(pt_devs);
+ destroy_dev_sched_cb(pt_devc, pty_release, pt);
+}
+
+/*ARGSUSED*/
+static int
+ptsopen(struct cdev *dev, int flag, int devtype, struct thread *td)
+{
+ struct tty *tp;
+ int error;
+ struct pt_desc *pt;
+
+ pt = dev->si_drv1;
+ tp = dev->si_tty;
+ if ((tp->t_state & TS_ISOPEN) == 0)
+ ttyinitmode(tp, 1, 0);
+ else if (tp->t_state & TS_XCLUDE && priv_check(td,
+ PRIV_TTY_EXCLUSIVE)) {
+ return (EBUSY);
+ } else if (pt->pt_prison != td->td_ucred->cr_prison &&
+ priv_check(td, PRIV_TTY_PRISON)) {
+ return (EBUSY);
+ }
+ if (tp->t_oproc) /* Ctrlr still around. */
+ ttyld_modem(tp, 1);
+ while ((tp->t_state & TS_CARR_ON) == 0) {
+ if (flag & FNONBLOCK)
+ break;
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ptsopn", 0);
+ if (error)
+ return (error);
+ }
+ error = ttyld_open(tp, dev);
+ if (error == 0) {
+ ptcwakeup(tp, FREAD|FWRITE);
+ pt->pt_pts_open = 1;
+ }
+ return (error);
+}
+
+static int
+ptsclose(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+ struct pt_desc *pt = dev->si_drv1;
+ struct tty *tp;
+ int err;
+
+ tp = dev->si_tty;
+ err = ttyld_close(tp, flag);
+ ptsstop(tp, FREAD|FWRITE);
+ (void) tty_close(tp);
+ pt->pt_pts_open = 0;
+ pty_maybecleanup(pt);
+ return (err);
+}
+
+static int
+ptsread(struct cdev *dev, struct uio *uio, int flag)
+{
+ struct tty *tp = dev->si_tty;
+ int error = 0;
+
+ if (tp->t_oproc)
+ error = ttyld_read(tp, uio, flag);
+ ptcwakeup(tp, FWRITE);
+ return (error);
+}
+
+/*
+ * Write to pseudo-tty.
+ * Wakeups of controlling tty will happen
+ * indirectly, when tty driver calls ptsstart.
+ */
+static int
+ptswrite(struct cdev *dev, struct uio *uio, int flag)
+{
+ struct tty *tp;
+
+ tp = dev->si_tty;
+ if (tp->t_oproc == 0)
+ return (EIO);
+ return (ttyld_write(tp, uio, flag));
+}
+
+/*
+ * Start output on pseudo-tty.
+ * Wake up process selecting or sleeping for input from controlling tty.
+ */
+static void
+ptsstart(struct tty *tp)
+{
+ struct pt_desc *pt = tp->t_dev->si_drv1;
+
+ if (tp->t_state & TS_TTSTOP)
+ return;
+ if (pt->pt_flags & PF_STOPPED) {
+ pt->pt_flags &= ~PF_STOPPED;
+ pt->pt_send = TIOCPKT_START;
+ }
+ ptcwakeup(tp, FREAD);
+}
+
+static void
+ptcwakeup(struct tty *tp, int flag)
+{
+ struct pt_desc *pt = tp->t_dev->si_drv1;
+
+ if (flag & FREAD) {
+ selwakeup(&pt->pt_selr);
+ wakeup(TSA_PTC_READ(tp));
+ }
+ if (flag & FWRITE) {
+ selwakeup(&pt->pt_selw);
+ wakeup(TSA_PTC_WRITE(tp));
+ }
+}
+
+/*
+ * ptcopen implements exclusive access to the master/control device
+ * as well as creating the slave device based on the credential of the
+ * process opening the master. By creating the slave here, we avoid
+ * a race to access the master in terms of having a process with access
+ * to an incorrectly owned slave, but it does create the possibility
+ * that a racing process can cause a ptmx user to get EIO if it gets
+ * there first. Consumers of ptmx must look for EIO and retry if it
+ * happens. VFS locking may actually prevent this from occurring due
+ * to the lookup into devfs holding the vnode lock through open, but
+ * it's better to be careful.
+ */
+static int
+ptcopen(struct cdev *dev, int flag, int devtype, struct thread *td)
+{
+ struct pt_desc *pt;
+ struct tty *tp;
+ struct cdev *devs;
+
+ pt = dev->si_drv1;
+ if (pt == NULL)
+ return (EIO);
+ /*
+ * In case we have destroyed the struct tty at the last connect time,
+ * we need to recreate it.
+ */
+ if (pt->pt_tty == NULL) {
+ pt->pt_tty = ttyalloc();
+ dev->si_tty = pt->pt_tty;
+ }
+ tp = dev->si_tty;
+ if (tp->t_oproc)
+ return (EIO);
+
+ /*
+ * XXX: Might want to make the ownership/permissions here more
+ * configurable.
+ */
+ if (pt->pt_devs)
+ devs = pt->pt_devs;
+ else
+ pt->pt_devs = devs = make_dev_cred(&pts_cdevsw,
+ NUM_TO_MINOR(pt->pt_num),
+ td->td_ucred, UID_ROOT, GID_WHEEL, 0666, "pts/%d",
+ pt->pt_num);
+ devs->si_drv1 = pt;
+ devs->si_tty = pt->pt_tty;
+ pt->pt_tty->t_dev = devs;
+
+ tp->t_timeout = -1;
+ tp->t_oproc = ptsstart;
+ tp->t_stop = ptsstop;
+ ttyld_modem(tp, 1);
+ tp->t_lflag &= ~EXTPROC;
+ pt = dev->si_drv1;
+ pt->pt_prison = td->td_ucred->cr_prison;
+ pt->pt_flags = 0;
+ pt->pt_send = 0;
+ pt->pt_ucntl = 0;
+ pt->pt_ptc_open = 1;
+ return (0);
+}
+
+static int
+ptcclose(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ struct pt_desc *pt = dev->si_drv1;
+ struct tty *tp;
+
+ tp = dev->si_tty;
+ ttyld_modem(tp, 0);
+
+ /*
+ * XXX MDMBUF makes no sense for ptys but would inhibit the above
+ * l_modem(). CLOCAL makes sense but isn't supported. Special
+ * l_modem()s that ignore carrier drop make no sense for ptys but
+ * may be in use because other parts of the line discipline make
+ * sense for ptys. Recover by doing everything that a normal
+ * ttymodem() would have done except for sending a SIGHUP.
+ */
+ if (tp->t_state & TS_ISOPEN) {
+ tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED);
+ tp->t_state |= TS_ZOMBIE;
+ ttyflush(tp, FREAD | FWRITE);
+ }
+
+ tp->t_oproc = 0; /* mark closed */
+ pt->pt_ptc_open = 0;
+ pty_maybecleanup(pt);
+ return (0);
+}
+
+static int
+ptcread(struct cdev *dev, struct uio *uio, int flag)
+{
+ struct tty *tp = dev->si_tty;
+ struct pt_desc *pt = dev->si_drv1;
+ char buf[BUFSIZ];
+ int error = 0, cc;
+
+ /*
+ * We want to block until the slave
+ * is open, and there's something to read;
+ * but if we lost the slave or we're NBIO,
+ * then return the appropriate error instead.
+ */
+ for (;;) {
+ if (tp->t_state&TS_ISOPEN) {
+ if (pt->pt_flags&PF_PKT && pt->pt_send) {
+ error = ureadc((int)pt->pt_send, uio);
+ if (error)
+ return (error);
+ if (pt->pt_send & TIOCPKT_IOCTL) {
+ cc = min(uio->uio_resid,
+ sizeof(tp->t_termios));
+ uiomove(&tp->t_termios, cc, uio);
+ }
+ pt->pt_send = 0;
+ return (0);
+ }
+ if (pt->pt_flags&PF_UCNTL && pt->pt_ucntl) {
+ error = ureadc((int)pt->pt_ucntl, uio);
+ if (error)
+ return (error);
+ pt->pt_ucntl = 0;
+ return (0);
+ }
+ if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0)
+ break;
+ }
+ if ((tp->t_state & TS_CONNECTED) == 0)
+ return (0); /* EOF */
+ if (flag & O_NONBLOCK)
+ return (EWOULDBLOCK);
+ error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0);
+ if (error)
+ return (error);
+ }
+ if (pt->pt_flags & (PF_PKT|PF_UCNTL))
+ error = ureadc(0, uio);
+ while (uio->uio_resid > 0 && error == 0) {
+ cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ));
+ if (cc <= 0)
+ break;
+ error = uiomove(buf, cc, uio);
+ }
+ ttwwakeup(tp);
+ return (error);
+}
+
+static void
+ptsstop(struct tty *tp, int flush)
+{
+ struct pt_desc *pt = tp->t_dev->si_drv1;
+ int flag;
+
+ /* note: FLUSHREAD and FLUSHWRITE already ok */
+ if (flush == 0) {
+ flush = TIOCPKT_STOP;
+ pt->pt_flags |= PF_STOPPED;
+ } else
+ pt->pt_flags &= ~PF_STOPPED;
+ pt->pt_send |= flush;
+ /* change of perspective */
+ flag = 0;
+ if (flush & FREAD)
+ flag |= FWRITE;
+ if (flush & FWRITE)
+ flag |= FREAD;
+ ptcwakeup(tp, flag);
+}
+
+static int
+ptcpoll(struct cdev *dev, int events, struct thread *td)
+{
+ struct tty *tp = dev->si_tty;
+ struct pt_desc *pt = dev->si_drv1;
+ int revents = 0;
+ int s;
+
+ if ((tp->t_state & TS_CONNECTED) == 0)
+ return (events &
+ (POLLHUP | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM));
+
+ /*
+ * Need to block timeouts (ttrstart).
+ */
+ s = spltty();
+
+ if (events & (POLLIN | POLLRDNORM))
+ if ((tp->t_state & TS_ISOPEN) &&
+ ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) ||
+ ((pt->pt_flags & PF_PKT) && pt->pt_send) ||
+ ((pt->pt_flags & PF_UCNTL) && pt->pt_ucntl)))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (tp->t_state & TS_ISOPEN &&
+ (((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) ||
+ (tp->t_canq.c_cc == 0 && (tp->t_lflag & ICANON)))))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & POLLHUP)
+ if ((tp->t_state & TS_CARR_ON) == 0)
+ revents |= POLLHUP;
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLRDNORM))
+ selrecord(td, &pt->pt_selr);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ selrecord(td, &pt->pt_selw);
+ }
+ splx(s);
+
+ return (revents);
+}
+
+static int
+ptcwrite(struct cdev *dev, struct uio *uio, int flag)
+{
+ struct tty *tp = dev->si_tty;
+ u_char *cp = 0;
+ int cc = 0;
+ u_char locbuf[BUFSIZ];
+ int cnt = 0;
+ int error = 0;
+
+again:
+ if ((tp->t_state&TS_ISOPEN) == 0)
+ goto block;
+ while (uio->uio_resid > 0 || cc > 0) {
+ if (cc == 0) {
+ cc = min(uio->uio_resid, BUFSIZ);
+ cp = locbuf;
+ error = uiomove(cp, cc, uio);
+ if (error)
+ return (error);
+ /* check again for safety */
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ }
+ while (cc > 0) {
+ if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 &&
+ (tp->t_canq.c_cc > 0 || !(tp->t_lflag&ICANON))) {
+ wakeup(TSA_HUP_OR_INPUT(tp));
+ goto block;
+ }
+ ttyld_rint(tp, *cp++);
+ cnt++;
+ cc--;
+ }
+ cc = 0;
+ }
+ return (0);
+block:
+ /*
+ * Come here to wait for slave to open, for space
+ * in outq, or space in rawq, or an empty canq.
+ */
+ if ((tp->t_state & TS_CONNECTED) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ if (flag & IO_NDELAY) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ if (cnt == 0)
+ return (EWOULDBLOCK);
+ return (0);
+ }
+ error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0);
+ if (error) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (error);
+ }
+ goto again;
+}
+
+static int
+ptcioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
+{
+ struct tty *tp = dev->si_tty;
+ struct pt_desc *pt = dev->si_drv1;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ int ival;
+#endif
+
+ switch (cmd) {
+
+ case TIOCGPGRP:
+ /*
+ * We avoid calling ttioctl on the controller since,
+ * in that case, tp must be the controlling terminal.
+ */
+ *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
+ return (0);
+
+ case TIOCPKT:
+ if (*(int *)data) {
+ if (pt->pt_flags & PF_UCNTL)
+ return (EINVAL);
+ pt->pt_flags |= PF_PKT;
+ } else
+ pt->pt_flags &= ~PF_PKT;
+ return (0);
+
+ case TIOCUCNTL:
+ if (*(int *)data) {
+ if (pt->pt_flags & PF_PKT)
+ return (EINVAL);
+ pt->pt_flags |= PF_UCNTL;
+ } else
+ pt->pt_flags &= ~PF_UCNTL;
+ return (0);
+ case TIOCGPTN:
+ *(unsigned int *)data = pt->pt_num;
+ return (0);
+ }
+
+ /*
+ * The rest of the ioctls shouldn't be called until
+ * the slave is open.
+ */
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ if (cmd == TIOCGETA) {
+ /*
+ * TIOCGETA is used by isatty() to make sure it's
+ * a tty. Linux openpty() calls isatty() very early,
+ * before the slave is opened, so don't actually
+ * fill the struct termios, but just let isatty()
+ * know it's a tty.
+ */
+ return (0);
+ }
+ if (cmd != FIONBIO && cmd != FIOASYNC)
+ return (EAGAIN);
+ }
+
+ switch (cmd) {
+#ifdef COMPAT_43TTY
+ case TIOCSETP:
+ case TIOCSETN:
+#endif
+ case TIOCSETD:
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF:
+ /*
+ * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG.
+ * ttywflush(tp) will hang if there are characters in
+ * the outq.
+ */
+ ndflush(&tp->t_outq, tp->t_outq.c_cc);
+ break;
+
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ case _IO('t', 95):
+ ival = IOCPARM_IVAL(data);
+ data = (caddr_t)&ival;
+ /* FALLTHROUGH */
+#endif
+ case TIOCSIG:
+ if (*(unsigned int *)data >= NSIG ||
+ *(unsigned int *)data == 0)
+ return(EINVAL);
+ if ((tp->t_lflag&NOFLSH) == 0)
+ ttyflush(tp, FREAD|FWRITE);
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, *(unsigned int *)data, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ if ((*(unsigned int *)data == SIGINFO) &&
+ ((tp->t_lflag&NOKERNINFO) == 0))
+ ttyinfo(tp);
+ return(0);
+ }
+ return (ptsioctl(dev, cmd, data, flag, td));
+}
+/*ARGSUSED*/
+static int
+ptsioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
+{
+ struct tty *tp = dev->si_tty;
+ struct pt_desc *pt = dev->si_drv1;
+ u_char *cc = tp->t_cc;
+ int stop, error;
+
+ if (cmd == TIOCEXT) {
+ /*
+ * When the EXTPROC bit is being toggled, we need
+ * to send an TIOCPKT_IOCTL if the packet driver
+ * is turned on.
+ */
+ if (*(int *)data) {
+ if (pt->pt_flags & PF_PKT) {
+ pt->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ }
+ tp->t_lflag |= EXTPROC;
+ } else {
+ if ((tp->t_lflag & EXTPROC) &&
+ (pt->pt_flags & PF_PKT)) {
+ pt->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ }
+ tp->t_lflag &= ~EXTPROC;
+ }
+ return(0);
+ }
+ error = ttioctl(tp, cmd, data, flag);
+ if (error == ENOTTY) {
+ if (pt->pt_flags & PF_UCNTL &&
+ (cmd & ~0xff) == UIOCCMD(0)) {
+ if (cmd & 0xff) {
+ pt->pt_ucntl = (u_char)cmd;
+ ptcwakeup(tp, FREAD);
+ }
+ return (0);
+ }
+ error = ENOTTY;
+ }
+ /*
+ * If external processing and packet mode send ioctl packet.
+ */
+ if ((tp->t_lflag&EXTPROC) && (pt->pt_flags & PF_PKT)) {
+ switch(cmd) {
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF:
+#ifdef COMPAT_43TTY
+ case TIOCSETP:
+ case TIOCSETN:
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+#endif
+ pt->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ break;
+ default:
+ break;
+ }
+ }
+ stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s'))
+ && CCEQ(cc[VSTART], CTRL('q'));
+ if (pt->pt_flags & PF_NOSTOP) {
+ if (stop) {
+ pt->pt_send &= ~TIOCPKT_NOSTOP;
+ pt->pt_send |= TIOCPKT_DOSTOP;
+ pt->pt_flags &= ~PF_NOSTOP;
+ ptcwakeup(tp, FREAD);
+ }
+ } else {
+ if (!stop) {
+ pt->pt_send &= ~TIOCPKT_DOSTOP;
+ pt->pt_send |= TIOCPKT_NOSTOP;
+ pt->pt_flags |= PF_NOSTOP;
+ ptcwakeup(tp, FREAD);
+ }
+ }
+ return (error);
+}
+
+/*
+ * Match lookups on /dev/ptmx, find the next free pty (if any), set up
+ * the pty descriptor, register it, and return a reference to the master.
+ *
+ * pts == /dev/pts/xxx (oldstyle: ttyp...)
+ * ptc == /dev/pty/xxx (oldstyle: ptyp...)
+ */
+static void
+pty_clone(void *arg, struct ucred *cred, char *name, int namelen,
+ struct cdev **dev)
+{
+ struct pt_desc *pt;
+ struct cdev *devc;
+
+ if (!use_pts)
+ return;
+
+ if (*dev != NULL)
+ return;
+
+ if (strcmp(name, "ptmx") != 0)
+ return;
+
+ mtx_lock(&Giant);
+ pt = pty_new();
+ if (pt == NULL) {
+ mtx_unlock(&Giant);
+ return;
+ }
+
+ /*
+ * XXX: Lack of locking here considered worrying. We expose the
+ * pts/pty device nodes before they are fully initialized, although
+ * Giant likely protects us (unless make_dev blocks...?).
+ *
+ * XXX: If a process performs a lookup on /dev/ptmx but never an
+ * open, we won't GC the device node. We should have a callout
+ * sometime later that GC's device instances that were never
+ * opened, or some way to tell devfs that "this had better be for
+ * an open() or we won't create a device".
+ */
+ pt->pt_devc = devc = make_dev_credf(MAKEDEV_REF, &ptc_cdevsw,
+ NUM_TO_MINOR(pt->pt_num), cred, UID_ROOT, GID_WHEEL, 0666,
+ "pty/%d", pt->pt_num);
+
+ devc->si_drv1 = pt;
+ devc->si_tty = pt->pt_tty;
+ *dev = devc;
+ mtx_unlock(&Giant);
+
+ if (bootverbose)
+ printf("pty_clone: allocated pty %d to uid %d\n", pt->pt_num,
+ cred->cr_ruid);
+
+ return;
+}
+
+static void
+pty_drvinit(void *unused)
+{
+
+ mtx_init(&pt_mtx, "pt_mtx", NULL, MTX_DEF);
+ LIST_INIT(&pt_list);
+ LIST_INIT(&pt_free_list);
+ EVENTHANDLER_REGISTER(dev_clone, pty_clone, 0, 1000);
+}
+
+SYSINIT(ptydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,pty_drvinit,NULL)
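The new ptmx cloning path is easiest to see from userland. Below is a minimal sketch (not part of this diff) that assumes the device naming from the pty_clone() comment above, i.e. the cloned master appears under /dev/pty/N and the slave under /dev/pts/N, and uses the TIOCGPTN ioctl handled in ptcioctl() to recover the unit number:

#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char slave[32];
	unsigned int unit;
	int mfd;

	/* Each open of /dev/ptmx clones a fresh master via pty_clone(). */
	mfd = open("/dev/ptmx", O_RDWR);
	if (mfd == -1)
		err(1, "open(/dev/ptmx)");

	/* TIOCGPTN (handled in ptcioctl() above) reports the unit number. */
	if (ioctl(mfd, TIOCGPTN, &unit) == -1)
		err(1, "TIOCGPTN");

	/* Assumed slave path, per the pts/ptc naming comment above. */
	snprintf(slave, sizeof(slave), "/dev/pts/%u", unit);
	printf("master fd %d, slave %s\n", mfd, slave);

	close(mfd);
	return (0);
}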
Index: kern_exec.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_exec.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_exec.c -L sys/kern/kern_exec.c -u -r1.2 -r1.3
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_exec.c,v 1.275.2.4 2006/03/13 03:05:42 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_exec.c,v 1.308.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
#include "opt_hwpmc_hooks.h"
#include "opt_ktrace.h"
@@ -39,7 +39,6 @@
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
-#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
@@ -49,6 +48,7 @@
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
@@ -79,6 +79,9 @@
#include <machine/reg.h>
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
@@ -86,6 +89,7 @@
static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
static int do_execve(struct thread *td, struct image_args *args,
struct mac *mac_p);
+static void exec_free_args(struct image_args *);
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
@@ -164,9 +168,6 @@
};
#endif
-/*
- * MPSAFE
- */
int
execve(td, uap)
struct thread *td;
@@ -181,12 +182,8 @@
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
-
if (error == 0)
error = kern_execve(td, &args, NULL);
-
- exec_free_args(&args);
-
return (error);
}
@@ -199,9 +196,6 @@
};
#endif
-/*
- * MPSAFE
- */
int
__mac_execve(td, uap)
struct thread *td;
@@ -218,12 +212,8 @@
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
-
if (error == 0)
error = kern_execve(td, &args, uap->mac_p);
-
- exec_free_args(&args);
-
return (error);
#else
return (ENOSYS);
@@ -231,11 +221,11 @@
}
/*
- * XXX: kern_execve has the astonishing property of not always
- * returning to the caller. If sufficiently bad things happen during
- * the call to do_execve(), it can end up calling exit1(); as a result,
- * callers must avoid doing anything which they might need to undo
- * (e.g., allocating memory).
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller. If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
*/
int
kern_execve(td, args, mac_p)
@@ -246,10 +236,15 @@
struct proc *p = td->td_proc;
int error;
+ AUDIT_ARG(argv, args->begin_argv, args->argc,
+ args->begin_envv - args->begin_argv);
+ AUDIT_ARG(envv, args->begin_envv, args->envc,
+ args->endp - args->begin_envv);
if (p->p_flag & P_HADTHREADS) {
PROC_LOCK(p);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p);
+ exec_free_args(args);
return (ERESTART); /* Try again later. */
}
PROC_UNLOCK(p);
@@ -276,8 +271,6 @@
/*
* In-kernel implementation of execve(). All arguments are assumed to be
* userspace pointers from the passed thread.
- *
- * MPSAFE
*/
static int
do_execve(td, args, mac_p)
@@ -357,10 +350,13 @@
/*
* Translate the file name. namei() returns a vnode pointer
 * in ni_vp among other things.
+ *
+ * XXXAUDIT: It would be desirable to also audit the name of the
+ * interpreter if this is an interpreted binary.
*/
ndp = &nd;
- NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
- UIO_SYSSPACE, args->fname, td);
+ NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
+ AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
interpret:
error = namei(ndp);
@@ -395,6 +391,7 @@
if (error)
goto exec_fail_dealloc;
+ imgp->proc->p_osrel = 0;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
@@ -493,7 +490,9 @@
}
/* close files on exec */
+ VOP_UNLOCK(imgp->vp, 0, td);
fdcloseexec(td);
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
/* Get a reference to the vnode prior to locking the proc */
VREF(ndp->ni_vp);
@@ -566,8 +565,10 @@
* we do not regain any tracing during a possible block.
*/
setsugid(p);
+
#ifdef KTRACE
- if (p->p_tracevp != NULL && suser_cred(oldcred, SUSER_ALLOWJAIL)) {
+ if (p->p_tracevp != NULL &&
+ priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) {
mtx_lock(&ktrace_mtx);
p->p_traceflag = 0;
tracevp = p->p_tracevp;
@@ -588,7 +589,9 @@
*/
PROC_UNLOCK(p);
setugidsafety(td);
+ VOP_UNLOCK(imgp->vp, 0, td);
error = fdcheckstd(td);
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
if (error != 0)
goto done1;
PROC_LOCK(p);
@@ -666,7 +669,7 @@
* single thread mode.
*/
if (p->p_flag & P_TRACED)
- tdsignal(td, SIGTRAP, SIGTARGET_TD);
+ tdsignal(p, td, SIGTRAP, NULL);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
@@ -720,6 +723,7 @@
crfree(oldcred);
else
crfree(newcred);
+ VOP_UNLOCK(imgp->vp, 0, td);
/*
* Handle deferred decrement of ref counts.
*/
@@ -733,11 +737,17 @@
if (ndp->ni_vp && error != 0)
vrele(ndp->ni_vp);
#ifdef KTRACE
- if (tracevp != NULL)
+ if (tracevp != NULL) {
+ int tvfslocked;
+
+ tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
+ VFS_UNLOCK_GIANT(tvfslocked);
+ }
if (tracecred != NULL)
crfree(tracecred);
#endif
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
if (oldargs != NULL)
pargs_drop(oldargs);
if (newargs != NULL)
@@ -776,19 +786,6 @@
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
- if (imgp->vmspace_destroyed) {
- /* sorry, no more process anymore. exit gracefully */
-#ifdef MAC
- mac_execve_exit(imgp);
- if (interplabel != NULL)
- mac_vnode_label_free(interplabel);
-#endif
- VFS_UNLOCK_GIANT(vfslocked);
- exec_free_args(args);
- exit1(td, W_EXITCODE(0, SIGABRT));
- /* NOT REACHED */
- error = 0;
- }
done2:
#ifdef MAC
mac_execve_exit(imgp);
@@ -796,6 +793,13 @@
mac_vnode_label_free(interplabel);
#endif
VFS_UNLOCK_GIANT(vfslocked);
+ exec_free_args(args);
+
+ if (error && imgp->vmspace_destroyed) {
+ /* sorry, no more process anymore. exit gracefully */
+ exit1(td, W_EXITCODE(0, SIGABRT));
+ /* NOT REACHED */
+ }
return (error);
}
@@ -824,16 +828,12 @@
if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
if (ma[i]->valid)
break;
- vm_page_lock_queues();
- if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) {
- vm_page_unlock_queues();
+ if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
break;
- }
vm_page_busy(ma[i]);
- vm_page_unlock_queues();
} else {
ma[i] = vm_page_alloc(object, i,
- VM_ALLOC_NORMAL);
+ VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
if (ma[i] == NULL)
break;
}
@@ -845,7 +845,6 @@
(ma[0]->valid == 0)) {
if (ma[0]) {
vm_page_lock_queues();
- pmap_remove_all(ma[0]);
vm_page_free(ma[0]);
vm_page_unlock_queues();
}
@@ -855,8 +854,8 @@
}
vm_page_lock_queues();
vm_page_hold(ma[0]);
- vm_page_wakeup(ma[0]);
vm_page_unlock_queues();
+ vm_page_wakeup(ma[0]);
VM_OBJECT_UNLOCK(object);
imgp->firstpage = sf_buf_alloc(ma[0], 0);
@@ -896,20 +895,13 @@
struct vmspace *vmspace = p->p_vmspace;
vm_offset_t stack_addr;
vm_map_t map;
+ u_long ssiz;
imgp->vmspace_destroyed = 1;
+ imgp->sysent = sv;
- /* Called with Giant held, do not depend on it! */
- EVENTHANDLER_INVOKE(process_exec, p);
-
- /*
- * Here is as good a place as any to do any resource limit cleanups.
- * This is needed if a 64 bit binary exec's a 32 bit binary - the
- * data size limit may need to be changed to a value that makes
- * sense for the 32 bit binary.
- */
- if (sv->sv_fixlimits != NULL)
- sv->sv_fixlimits(imgp);
+ /* May be called with Giant held */
+ EVENTHANDLER_INVOKE(process_exec, p, imgp);
/*
* Blow away entire process VM, if address space not shared,
@@ -920,18 +912,23 @@
if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
vm_map_max(map) == sv->sv_maxuser) {
shmexit(vmspace);
- pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
- vm_map_max(map));
+ pmap_remove_pages(vmspace_pmap(vmspace));
vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
- vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
+ error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
+ if (error)
+ return (error);
vmspace = p->p_vmspace;
map = &vmspace->vm_map;
}
/* Allocate a new stack */
- stack_addr = sv->sv_usrstack - maxssiz;
- error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
+ if (sv->sv_maxssiz != NULL)
+ ssiz = *sv->sv_maxssiz;
+ else
+ ssiz = maxssiz;
+ stack_addr = sv->sv_usrstack - ssiz;
+ error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
if (error)
return (error);
@@ -939,7 +936,7 @@
#ifdef __ia64__
/* Allocate a new register stack */
stack_addr = IA64_BACKINGSTORE;
- error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
+ error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
@@ -950,14 +947,14 @@
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
- vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
+ vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
return (0);
}
/*
- * Copy out argument and environment strings from the old process
- * address space into the temporary string buffer.
+ * Copy out argument and environment strings from the old process address
+ * space into the temporary string buffer.
*/
int
exec_copyin_args(struct image_args *args, char *fname,
@@ -996,19 +993,21 @@
copystr(fname, args->fname, PATH_MAX, &length) :
copyinstr(fname, args->fname, PATH_MAX, &length);
if (error != 0)
- return (error);
+ goto err_exit;
/*
* extract arguments first
*/
while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
- if (argp == (caddr_t) -1)
- return (EFAULT);
+ if (argp == (caddr_t) -1) {
+ error = EFAULT;
+ goto err_exit;
+ }
if ((error = copyinstr(argp, args->endp,
args->stringspace, &length))) {
- if (error == ENAMETOOLONG)
- return (E2BIG);
- return (error);
+ if (error == ENAMETOOLONG)
+ error = E2BIG;
+ goto err_exit;
}
args->stringspace -= length;
args->endp += length;
@@ -1022,13 +1021,15 @@
*/
if (envv) {
while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
- if (envp == (caddr_t)-1)
- return (EFAULT);
+ if (envp == (caddr_t)-1) {
+ error = EFAULT;
+ goto err_exit;
+ }
if ((error = copyinstr(envp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
- return (E2BIG);
- return (error);
+ error = E2BIG;
+ goto err_exit;
}
args->stringspace -= length;
args->endp += length;
@@ -1037,9 +1038,13 @@
}
return (0);
+
+err_exit:
+ exec_free_args(args);
+ return (error);
}
-void
+static void
exec_free_args(struct image_args *args)
{
@@ -1051,9 +1056,9 @@
}
/*
- * Copy strings out to the new process address space, constructing
- * new arg and env vector tables. Return a pointer to the base
- * so that it can be used as the initial stack pointer.
+ * Copy strings out to the new process address space, constructing new arg
+ * and env vector tables. Return a pointer to the base so that it can be used
+ * as the initial stack pointer.
*/
register_t *
exec_copyout_strings(imgp)
@@ -1231,7 +1236,7 @@
* Call filesystem specific open routine (which does nothing in the
* general case).
*/
- error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1);
+ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
return (error);
}
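One behavioural point worth noting in the kern_exec.c hunks: exec_free_args() is now static, and exec_copyin_args()/kern_execve()/do_execve() free the image_args themselves on every path, so execve() and __mac_execve() no longer call exec_free_args(). A sketch of the resulting calling convention follows; it is illustrative only, assumes the usual kern_exec.c includes, and example_exec_caller is an invented name:

/*
 * Hypothetical in-kernel caller (example_exec_caller is not a real
 * function); mirrors what execve() above now does.
 */
static int
example_exec_caller(struct thread *td, char *fname, char **argv, char **envv)
{
	struct image_args args;
	int error;

	error = exec_copyin_args(&args, fname, UIO_USERSPACE, argv, envv);
	if (error == 0)
		error = kern_execve(td, &args, NULL);	/* consumes args */
	/* No exec_free_args(&args) here: both paths above already freed it. */
	return (error);
}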
Index: kern_jail.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_jail.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_jail.c -L sys/kern/kern_jail.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_jail.c
+++ sys/kern/kern_jail.c
@@ -8,7 +8,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.50.2.1 2005/11/13 03:12:32 csjp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.70 2007/04/13 23:54:22 pjd Exp $");
#include "opt_mac.h"
@@ -18,13 +18,14 @@
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/sysproto.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/taskqueue.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/mutex.h>
+#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/queue.h>
@@ -35,9 +36,10 @@
#include <net/if.h>
#include <netinet/in.h>
+#include <security/mac/mac_framework.h>
+
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
-SYSCTL_DECL(_security);
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
"Jail rules");
@@ -71,30 +73,48 @@
&jail_chflags_allowed, 0,
"Processes in jail can alter system file flags");
-/* allprison, lastprid, and prisoncount are protected by allprison_mtx. */
+int jail_mount_allowed = 0;
+SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
+ &jail_mount_allowed, 0,
+ "Processes in jail can mount/unmount jail-friendly file systems");
+
+/* allprison, lastprid, and prisoncount are protected by allprison_lock. */
struct prisonlist allprison;
-struct mtx allprison_mtx;
+struct sx allprison_lock;
int lastprid = 0;
int prisoncount = 0;
+/*
+ * List of jail services. Protected by allprison_lock.
+ */
+TAILQ_HEAD(prison_services_head, prison_service);
+static struct prison_services_head prison_services =
+ TAILQ_HEAD_INITIALIZER(prison_services);
+static int prison_service_slots = 0;
+
+struct prison_service {
+ prison_create_t ps_create;
+ prison_destroy_t ps_destroy;
+ int ps_slotno;
+ TAILQ_ENTRY(prison_service) ps_next;
+ char ps_name[0];
+};
+
static void init_prison(void *);
static void prison_complete(void *context, int pending);
-static struct prison *prison_find(int);
static int sysctl_jail_list(SYSCTL_HANDLER_ARGS);
static void
init_prison(void *data __unused)
{
- mtx_init(&allprison_mtx, "allprison", NULL, MTX_DEF);
+ sx_init(&allprison_lock, "allprison");
LIST_INIT(&allprison);
}
SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
/*
- * MPSAFE
- *
* struct jail_args {
* struct jail *jail;
* };
@@ -104,6 +124,7 @@
{
struct nameidata nd;
struct prison *pr, *tpr;
+ struct prison_service *psrv;
struct jail j;
struct jail_attach_args jaa;
int vfslocked, error, tryprid;
@@ -136,9 +157,15 @@
pr->pr_ip = j.ip_number;
pr->pr_linux = NULL;
pr->pr_securelevel = securelevel;
+ if (prison_service_slots == 0)
+ pr->pr_slots = NULL;
+ else {
+ pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
+ M_PRISON, M_ZERO | M_WAITOK);
+ }
/* Determine next pr_id and add prison to allprison list. */
- mtx_lock(&allprison_mtx);
+ sx_xlock(&allprison_lock);
tryprid = lastprid + 1;
if (tryprid == JAIL_MAX)
tryprid = 1;
@@ -147,7 +174,7 @@
if (tpr->pr_id == tryprid) {
tryprid++;
if (tryprid == JAIL_MAX) {
- mtx_unlock(&allprison_mtx);
+ sx_xunlock(&allprison_lock);
error = EAGAIN;
goto e_dropvnref;
}
@@ -157,7 +184,11 @@
pr->pr_id = jaa.jid = lastprid = tryprid;
LIST_INSERT_HEAD(&allprison, pr, pr_list);
prisoncount++;
- mtx_unlock(&allprison_mtx);
+ sx_downgrade(&allprison_lock);
+ TAILQ_FOREACH(psrv, &prison_services, ps_next) {
+ psrv->ps_create(psrv, pr);
+ }
+ sx_sunlock(&allprison_lock);
error = jail_attach(td, &jaa);
if (error)
@@ -168,10 +199,14 @@
td->td_retval[0] = jaa.jid;
return (0);
e_dropprref:
- mtx_lock(&allprison_mtx);
+ sx_xlock(&allprison_lock);
LIST_REMOVE(pr, pr_list);
prisoncount--;
- mtx_unlock(&allprison_mtx);
+ sx_downgrade(&allprison_lock);
+ TAILQ_FOREACH(psrv, &prison_services, ps_next) {
+ psrv->ps_destroy(psrv, pr);
+ }
+ sx_sunlock(&allprison_lock);
e_dropvnref:
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vrele(pr->pr_root);
@@ -183,8 +218,6 @@
}
/*
- * MPSAFE
- *
* struct jail_attach_args {
* int jid;
* };
@@ -196,7 +229,7 @@
struct ucred *newcred, *oldcred;
struct prison *pr;
int vfslocked, error;
-
+
/*
* XXX: Note that there is a slight race here if two threads
* in the same privileged process attempt to attach to two
@@ -205,20 +238,20 @@
* a process root from one prison, but attached to the jail
* of another.
*/
- error = suser(td);
+ error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
p = td->td_proc;
- mtx_lock(&allprison_mtx);
+ sx_slock(&allprison_lock);
pr = prison_find(uap->jid);
if (pr == NULL) {
- mtx_unlock(&allprison_mtx);
+ sx_sunlock(&allprison_lock);
return (EINVAL);
}
pr->pr_ref++;
mtx_unlock(&pr->pr_mtx);
- mtx_unlock(&allprison_mtx);
+ sx_sunlock(&allprison_lock);
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td);
@@ -254,15 +287,19 @@
/*
* Returns a locked prison instance, or NULL on failure.
*/
-static struct prison *
+struct prison *
prison_find(int prid)
{
struct prison *pr;
- mtx_assert(&allprison_mtx, MA_OWNED);
+ sx_assert(&allprison_lock, SX_LOCKED);
LIST_FOREACH(pr, &allprison, pr_list) {
if (pr->pr_id == prid) {
mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref == 0) {
+ mtx_unlock(&pr->pr_mtx);
+ break;
+ }
return (pr);
}
}
@@ -273,31 +310,35 @@
prison_free(struct prison *pr)
{
- mtx_lock(&allprison_mtx);
mtx_lock(&pr->pr_mtx);
pr->pr_ref--;
if (pr->pr_ref == 0) {
- LIST_REMOVE(pr, pr_list);
mtx_unlock(&pr->pr_mtx);
- prisoncount--;
- mtx_unlock(&allprison_mtx);
-
TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
return;
}
mtx_unlock(&pr->pr_mtx);
- mtx_unlock(&allprison_mtx);
}
static void
prison_complete(void *context, int pending)
{
+ struct prison_service *psrv;
struct prison *pr;
int vfslocked;
pr = (struct prison *)context;
+ sx_xlock(&allprison_lock);
+ LIST_REMOVE(pr, pr_list);
+ prisoncount--;
+ sx_downgrade(&allprison_lock);
+ TAILQ_FOREACH(psrv, &prison_services, ps_next) {
+ psrv->ps_destroy(psrv, pr);
+ }
+ sx_sunlock(&allprison_lock);
+
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vrele(pr->pr_root);
VFS_UNLOCK_GIANT(vfslocked);
@@ -313,6 +354,8 @@
{
mtx_lock(&pr->pr_mtx);
+ KASSERT(pr->pr_ref > 0,
+ ("Trying to hold dead prison (id=%d).", pr->pr_id));
pr->pr_ref++;
mtx_unlock(&pr->pr_mtx);
}
@@ -331,12 +374,12 @@
if (!jailed(cred))
return (0);
- if (flag)
+ if (flag)
tmp = *ip;
else
tmp = ntohl(*ip);
if (tmp == INADDR_ANY) {
- if (flag)
+ if (flag)
*ip = cred->cr_prison->pr_ip;
else
*ip = htonl(cred->cr_prison->pr_ip);
@@ -523,6 +566,372 @@
}
}
+/*
+ * Check whether a specific privilege is granted within jail.  We have a
+ * specific list of accepted privileges; the rest are denied.
+ */
+int
+prison_priv_check(struct ucred *cred, int priv)
+{
+
+ if (!jailed(cred))
+ return (0);
+
+ switch (priv) {
+
+ /*
+ * Allow ktrace privileges for root in jail.
+ */
+ case PRIV_KTRACE:
+
+#if 0
+ /*
+ * Allow jailed processes to configure audit identity and
+ * submit audit records (login, etc). In the future we may
+ * want to further refine the relationship between audit and
+ * jail.
+ */
+ case PRIV_AUDIT_GETAUDIT:
+ case PRIV_AUDIT_SETAUDIT:
+ case PRIV_AUDIT_SUBMIT:
+#endif
+
+ /*
+ * Allow jailed processes to manipulate process UNIX
+ * credentials in any way they see fit.
+ */
+ case PRIV_CRED_SETUID:
+ case PRIV_CRED_SETEUID:
+ case PRIV_CRED_SETGID:
+ case PRIV_CRED_SETEGID:
+ case PRIV_CRED_SETGROUPS:
+ case PRIV_CRED_SETREUID:
+ case PRIV_CRED_SETREGID:
+ case PRIV_CRED_SETRESUID:
+ case PRIV_CRED_SETRESGID:
+
+ /*
+ * Jail implements visibility constraints already, so allow
+ * jailed root to override uid/gid-based constraints.
+ */
+ case PRIV_SEEOTHERGIDS:
+ case PRIV_SEEOTHERUIDS:
+
+ /*
+ * Jail implements inter-process debugging limits already, so
+ * allow jailed root various debugging privileges.
+ */
+ case PRIV_DEBUG_DIFFCRED:
+ case PRIV_DEBUG_SUGID:
+ case PRIV_DEBUG_UNPRIV:
+
+ /*
+ * Allow jail to set various resource limits and login
+ * properties, and for now, exceed process resource limits.
+ */
+ case PRIV_PROC_LIMIT:
+ case PRIV_PROC_SETLOGIN:
+ case PRIV_PROC_SETRLIMIT:
+
+ /*
+ * System V and POSIX IPC privileges are granted in jail.
+ */
+ case PRIV_IPC_READ:
+ case PRIV_IPC_WRITE:
+ case PRIV_IPC_ADMIN:
+ case PRIV_IPC_MSGSIZE:
+ case PRIV_MQ_ADMIN:
+
+ /*
+ * Jail implements its own inter-process limits, so allow
+ * root processes in jail to change scheduling on other
+ * processes in the same jail. Likewise for signalling.
+ */
+ case PRIV_SCHED_DIFFCRED:
+ case PRIV_SIGNAL_DIFFCRED:
+ case PRIV_SIGNAL_SUGID:
+
+ /*
+ * Allow jailed processes to write to sysctls marked as jail
+ * writable.
+ */
+ case PRIV_SYSCTL_WRITEJAIL:
+
+ /*
+ * Allow root in jail to manage a variety of quota
+ * properties. These should likely be conditional on a
+ * configuration option.
+ */
+ case PRIV_VFS_GETQUOTA:
+ case PRIV_VFS_SETQUOTA:
+
+ /*
+ * Since Jail relies on chroot() to implement file system
+ * protections, grant many VFS privileges to root in jail.
+ * Be careful to exclude mount-related and NFS-related
+ * privileges.
+ */
+ case PRIV_VFS_READ:
+ case PRIV_VFS_WRITE:
+ case PRIV_VFS_ADMIN:
+ case PRIV_VFS_EXEC:
+ case PRIV_VFS_LOOKUP:
+ case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
+ case PRIV_VFS_CHFLAGS_DEV:
+ case PRIV_VFS_CHOWN:
+ case PRIV_VFS_CHROOT:
+ case PRIV_VFS_RETAINSUGID:
+ case PRIV_VFS_FCHROOT:
+ case PRIV_VFS_LINK:
+ case PRIV_VFS_SETGID:
+ case PRIV_VFS_STICKYFILE:
+ return (0);
+
+ /*
+ * Depending on the global setting, allow privilege of
+ * setting system flags.
+ */
+ case PRIV_VFS_SYSFLAGS:
+ if (jail_chflags_allowed)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Depending on the global setting, allow privilege of
+ * mounting/unmounting file systems.
+ */
+ case PRIV_VFS_MOUNT:
+ case PRIV_VFS_UNMOUNT:
+ case PRIV_VFS_MOUNT_NONUSER:
+ case PRIV_VFS_MOUNT_OWNER:
+ if (jail_mount_allowed)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Allow jailed root to bind reserved ports and reuse in-use
+ * ports.
+ */
+ case PRIV_NETINET_RESERVEDPORT:
+ case PRIV_NETINET_REUSEPORT:
+ return (0);
+
+ /*
+ * Conditionally allow creating raw sockets in jail.
+ */
+ case PRIV_NETINET_RAW:
+ if (jail_allow_raw_sockets)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Since jail implements its own visibility limits on netstat
+ * sysctls, allow getcred. This allows identd to work in
+ * jail.
+ */
+ case PRIV_NETINET_GETCRED:
+ return (0);
+
+ default:
+ /*
+ * In all remaining cases, deny the privilege request. This
+ * includes almost all network privileges and many system
+ * configuration privileges.
+ */
+ return (EPERM);
+ }
+}
+
+/*
+ * Register jail service. Provides 'create' and 'destroy' methods.
+ * 'create' method will be called for every existing jail and all
+ * jails in the future as they beeing created.
+ * 'destroy' method will be called for every jail going away and
+ * for all existing jails at the time of service deregistration.
+ */
+struct prison_service *
+prison_service_register(const char *name, prison_create_t create,
+ prison_destroy_t destroy)
+{
+ struct prison_service *psrv, *psrv2;
+ struct prison *pr;
+ int reallocate = 1, slotno = 0;
+ void **slots, **oldslots;
+
+ psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
+ M_WAITOK | M_ZERO);
+ psrv->ps_create = create;
+ psrv->ps_destroy = destroy;
+ strcpy(psrv->ps_name, name);
+ /*
+ * Grab the allprison_lock here, so we won't miss any jail
+ * creation/destruction.
+ */
+ sx_xlock(&allprison_lock);
+#ifdef INVARIANTS
+ /*
+ * Verify that the service is not already registered.
+ */
+ TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
+ KASSERT(strcmp(psrv2->ps_name, name) != 0,
+ ("jail service %s already registered", name));
+ }
+#endif
+ /*
+ * Find a free slot.  When there is no existing free slot available,
+ * allocate one at the end.
+ */
+ TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
+ if (psrv2->ps_slotno != slotno) {
+ KASSERT(slotno < psrv2->ps_slotno,
+ ("Invalid slotno (slotno=%d >= ps_slotno=%d",
+ slotno, psrv2->ps_slotno));
+ /* We found a free slot. */
+ reallocate = 0;
+ break;
+ }
+ slotno++;
+ }
+ psrv->ps_slotno = slotno;
+ /*
+ * Keep the list sorted by slot number.
+ */
+ if (psrv2 != NULL) {
+ KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
+ TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
+ } else {
+ KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
+ TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
+ }
+ prison_service_slots++;
+ sx_downgrade(&allprison_lock);
+ /*
+ * Allocate memory for the new slot if we didn't find an empty one.
+ * Do not use realloc(9), because pr_slots is protected with a mutex,
+ * so we can't sleep.
+ */
+ LIST_FOREACH(pr, &allprison, pr_list) {
+ if (reallocate) {
+ /* First allocate memory with M_WAITOK. */
+ slots = malloc(sizeof(*slots) * prison_service_slots,
+ M_PRISON, M_WAITOK);
+ /* Now grab the mutex and replace pr_slots. */
+ mtx_lock(&pr->pr_mtx);
+ oldslots = pr->pr_slots;
+ if (psrv->ps_slotno > 0) {
+ bcopy(oldslots, slots,
+ sizeof(*slots) * (prison_service_slots - 1));
+ }
+ slots[psrv->ps_slotno] = NULL;
+ pr->pr_slots = slots;
+ mtx_unlock(&pr->pr_mtx);
+ if (oldslots != NULL)
+ free(oldslots, M_PRISON);
+ }
+ /*
+ * Call 'create' method for each existing jail.
+ */
+ psrv->ps_create(psrv, pr);
+ }
+ sx_sunlock(&allprison_lock);
+
+ return (psrv);
+}
+
+void
+prison_service_deregister(struct prison_service *psrv)
+{
+ struct prison *pr;
+ void **slots, **oldslots;
+ int last = 0;
+
+ sx_xlock(&allprison_lock);
+ if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
+ last = 1;
+ TAILQ_REMOVE(&prison_services, psrv, ps_next);
+ prison_service_slots--;
+ sx_downgrade(&allprison_lock);
+ LIST_FOREACH(pr, &allprison, pr_list) {
+ /*
+ * Call 'destroy' method for every currently existing jail.
+ */
+ psrv->ps_destroy(psrv, pr);
+ /*
+ * If this is the last slot, free the memory allocated for it.
+ */
+ if (last) {
+ if (prison_service_slots == 0)
+ slots = NULL;
+ else {
+ slots = malloc(sizeof(*slots) * prison_service_slots,
+ M_PRISON, M_WAITOK);
+ }
+ mtx_lock(&pr->pr_mtx);
+ oldslots = pr->pr_slots;
+ /*
+ * We require setting the slot to NULL after freeing it;
+ * this way we can check for memory leaks here.
+ */
+ KASSERT(oldslots[psrv->ps_slotno] == NULL,
+ ("Slot %d (service %s, jailid=%d) still contains data?",
+ psrv->ps_slotno, psrv->ps_name, pr->pr_id));
+ if (psrv->ps_slotno > 0) {
+ bcopy(oldslots, slots,
+ sizeof(*slots) * prison_service_slots);
+ }
+ pr->pr_slots = slots;
+ mtx_unlock(&pr->pr_mtx);
+ KASSERT(oldslots != NULL, ("oldslots == NULL"));
+ free(oldslots, M_PRISON);
+ }
+ }
+ sx_sunlock(&allprison_lock);
+ free(psrv, M_PRISON);
+}
+
+/*
+ * Set the data for the given jail in the slot assigned to the given
+ * jail service.
+ */
+void
+prison_service_data_set(struct prison_service *psrv, struct prison *pr,
+ void *data)
+{
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ pr->pr_slots[psrv->ps_slotno] = data;
+}
+
+/*
+ * Clear the slot assigned to the given jail service in the given
+ * prison structure and return the current slot data.
+ */
+void *
+prison_service_data_del(struct prison_service *psrv, struct prison *pr)
+{
+ void *data;
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ data = pr->pr_slots[psrv->ps_slotno];
+ pr->pr_slots[psrv->ps_slotno] = NULL;
+ return (data);
+}
+
+/*
+ * Return the current data from the slot assigned to the given jail
+ * service for the given jail.
+ */
+void *
+prison_service_data_get(struct prison_service *psrv, struct prison *pr)
+{
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ return (pr->pr_slots[psrv->ps_slotno]);
+}
+
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
@@ -532,39 +941,30 @@
if (jailed(req->td->td_ucred))
return (0);
-retry:
- mtx_lock(&allprison_mtx);
- count = prisoncount;
- mtx_unlock(&allprison_mtx);
- if (count == 0)
+ sx_slock(&allprison_lock);
+ if ((count = prisoncount) == 0) {
+ sx_sunlock(&allprison_lock);
return (0);
+ }
sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
- mtx_lock(&allprison_mtx);
- if (count != prisoncount) {
- mtx_unlock(&allprison_mtx);
- free(sxp, M_TEMP);
- goto retry;
- }
-
+
LIST_FOREACH(pr, &allprison, pr_list) {
- mtx_lock(&pr->pr_mtx);
xp->pr_version = XPRISON_VERSION;
xp->pr_id = pr->pr_id;
+ xp->pr_ip = pr->pr_ip;
strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
+ mtx_lock(&pr->pr_mtx);
strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
- xp->pr_ip = pr->pr_ip;
mtx_unlock(&pr->pr_mtx);
xp++;
}
- mtx_unlock(&allprison_mtx);
+ sx_sunlock(&allprison_lock);
error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
free(sxp, M_TEMP);
- if (error)
- return (error);
- return (0);
+ return (error);
}
SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
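The jail-service hooks added above (prison_service_register(), prison_service_deregister() and the prison_service_data_*() accessors) are meant to be consumed by other kernel code. A rough sketch of a hypothetical consumer follows; the example_* names and the M_TEMP malloc type are invented for illustration, the int return type of the callbacks is an assumption based on the ps_create(psrv, pr) call sites above, and per the mtx_assert()s the per-prison mutex must be held around the data accessors:

/*
 * Hypothetical jail service (example_* names invented).  Would live in
 * its own kernel file with the usual sys/param.h, sys/kernel.h,
 * sys/lock.h, sys/mutex.h, sys/malloc.h and sys/jail.h includes.
 */
struct example_jail_data {
	int	counter;
};

static struct prison_service *example_psrv;

static int
example_jail_create(struct prison_service *psrv, struct prison *pr)
{
	struct example_jail_data *ejd;

	/* Invoked for every existing jail and for each jail created later. */
	ejd = malloc(sizeof(*ejd), M_TEMP, M_WAITOK | M_ZERO);
	mtx_lock(&pr->pr_mtx);
	prison_service_data_set(psrv, pr, ejd);	/* pr_mtx must be held */
	mtx_unlock(&pr->pr_mtx);
	return (0);
}

static int
example_jail_destroy(struct prison_service *psrv, struct prison *pr)
{
	struct example_jail_data *ejd;

	mtx_lock(&pr->pr_mtx);
	ejd = prison_service_data_del(psrv, pr);	/* slot is now NULL */
	mtx_unlock(&pr->pr_mtx);
	free(ejd, M_TEMP);
	return (0);
}

static void
example_jail_svc_load(void *arg __unused)
{

	/* Typically called from a MOD_LOAD handler or a SYSINIT. */
	example_psrv = prison_service_register("example",
	    example_jail_create, example_jail_destroy);
}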
Index: tty_compat.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_compat.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/tty_compat.c -L sys/kern/tty_compat.c -u -r1.1.1.1 -r1.2
--- sys/kern/tty_compat.c
+++ sys/kern/tty_compat.c
@@ -30,15 +30,13 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_compat.c,v 1.37 2004/06/21 22:57:15 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_compat.c,v 1.39 2006/01/10 09:19:09 phk Exp $");
#include "opt_compat.h"
-#ifndef BURN_BRIDGES
/*
* mapping routines for old line discipline (yuck)
*/
-#if defined(COMPAT_43)
#include <sys/param.h>
#include <sys/systm.h>
@@ -93,7 +91,7 @@
return (1); /* 50, min and not hangup */
}
-int
+static int
ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term)
{
switch (*com) {
@@ -471,6 +469,3 @@
t->c_lflag = lflag;
t->c_cflag = cflag;
}
-#endif /* COMPAT_43 */
-
-#endif /* BURN_BRIDGES */
--- /dev/null
+++ sys/kern/posix4_mib.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/posix4_mib.c,v 1.12 2006/11/12 03:34:03 trhodes Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+
+static int facility[CTL_P1003_1B_MAXID - 1];
+static int facility_initialized[CTL_P1003_1B_MAXID - 1];
+
+/* OID_AUTO isn't working with sysconf(3). I guess I'd have to
+ * modify it to do a lookup by name from the index.
+ * For now I've left it a top-level sysctl.
+ */
+
+#if 1
+
+SYSCTL_DECL(_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+SYSCTL_INT(_p1003_1b, num, \
+ name, CTLFLAG_RD, facility + num - 1, 0, "");
+
+#else
+
+SYSCTL_DECL(_kern_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \
+ name, CTLFLAG_RD, facility + num - 1, 0, "");
+SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B");
+
+#endif
+
+SYSCTL_INT(_p1003_1b, CTL_P1003_1B_ASYNCHRONOUS_IO, \
+ asynchronous_io, CTLFLAG_RD, &async_io_version, 0, "");
+P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range);
+P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection);
+P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling);
+P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals);
+P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores);
+P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync);
+P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects);
+P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io);
+P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers);
+P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max);
+P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max);
+P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max);
+P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize);
+P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max);
+P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max);
+P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max);
+
+#define P31B_VALID(num) ((num) >= 1 && (num) < CTL_P1003_1B_MAXID)
+
+/* p31b_setcfg: Set the configuration
+ */
+void
+p31b_setcfg(int num, int value)
+{
+
+ if (P31B_VALID(num)) {
+ facility[num - 1] = value;
+ facility_initialized[num - 1] = 1;
+ }
+}
+
+int
+p31b_getcfg(int num)
+{
+
+ if (P31B_VALID(num))
+ return (facility[num - 1]);
+ return (0);
+}
+
+int
+p31b_iscfg(int num)
+{
+
+ if (P31B_VALID(num))
+ return (facility_initialized[num - 1]);
+ return (0);
+}
+
+/*
+ * Turn on indications for standard (non-configurable) kernel features.
+ */
+static void
+p31b_set_standard(void *dummy)
+{
+ /* ??? p31b_setcfg(CTL_P1003_1B_FSYNC, 1); */
+ p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 1);
+ p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 1);
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_LISTIO_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
+}
+
+SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard,
+ 0);
+
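Each P1B_SYSCTL() entry above publishes a read-only integer under the top-level p1003_1b sysctl node, so values set with p31b_setcfg() can be read back from userland. A small illustrative sketch, not part of this commit; the sysctl name string is inferred from the SYSCTL_DECL(_p1003_1b) node above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int pagesize;
	size_t len = sizeof(pagesize);

	/* Published at boot by p31b_set_standard() via p31b_setcfg(). */
	if (sysctlbyname("p1003_1b.pagesize", &pagesize, &len, NULL, 0) == -1) {
		perror("sysctlbyname(p1003_1b.pagesize)");
		return (1);
	}
	printf("p1003_1b.pagesize     = %d\n", pagesize);
	printf("sysconf(_SC_PAGESIZE) = %ld\n", sysconf(_SC_PAGESIZE));
	return (0);
}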
--- /dev/null
+++ sys/kern/ksched.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 1996, 1997
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ksched: Soft real time scheduling based on "rtprio".
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/ksched.c,v 1.36 2007/06/05 00:00:54 jeff Exp $");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/resource.h>
+#include <sys/sched.h>
+
+/* ksched: Real-time extension to support POSIX priority scheduling.
+ */
+
+struct ksched {
+ struct timespec rr_interval;
+};
+
+int
+ksched_attach(struct ksched **p)
+{
+ struct ksched *ksched= p31b_malloc(sizeof(*ksched));
+
+ ksched->rr_interval.tv_sec = 0;
+ ksched->rr_interval.tv_nsec = 1000000000L / sched_rr_interval();
+
+ *p = ksched;
+ return 0;
+}
+
+int
+ksched_detach(struct ksched *ks)
+{
+ p31b_free(ks);
+
+ return 0;
+}
+
+/*
+ * XXX About priorities
+ *
+ * POSIX 1003.1b requires that numerically higher priorities be of
+ * higher priority. It also permits sched_setparam to be
+ * implementation defined for SCHED_OTHER. I don't like
+ * the notion of inverted priorities for normal processes when
+ * you can use "setpriority" for that.
+ *
+ * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL.
+ */
+
+/* Macros to convert between the UNIX (lower numerically is higher priority)
+ * and POSIX 1003.1b (higher numerically is higher priority) conventions.
+ */
+
+#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P))
+#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P))
+
+/* These improve readability a bit for me:
+ */
+#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX)
+#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN)
+
+static __inline int
+getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+ struct rtprio rtp;
+ int e = 0;
+
+ pri_to_rtp(td, &rtp);
+ switch (rtp.type)
+ {
+ case RTP_PRIO_FIFO:
+ *policy = SCHED_FIFO;
+ break;
+
+ case RTP_PRIO_REALTIME:
+ *policy = SCHED_RR;
+ break;
+
+ default:
+ *policy = SCHED_OTHER;
+ break;
+ }
+
+ return e;
+}
+
+int
+ksched_setparam(struct ksched *ksched,
+ struct thread *td, const struct sched_param *param)
+{
+ int policy;
+ int e;
+
+ e = getscheduler(ksched, td, &policy);
+
+ if (e == 0)
+ {
+ if (policy == SCHED_OTHER)
+ e = EINVAL;
+ else
+ e = ksched_setscheduler(ksched, td, policy, param);
+ }
+
+ return e;
+}
+
+int
+ksched_getparam(struct ksched *ksched,
+ struct thread *td, struct sched_param *param)
+{
+ struct rtprio rtp;
+
+ pri_to_rtp(td, &rtp);
+ if (RTP_PRIO_IS_REALTIME(rtp.type))
+ param->sched_priority = rtpprio_to_p4prio(rtp.prio);
+
+ return 0;
+}
+
+/*
+ * XXX The priority and scheduler modifications should
+ * be moved into published interfaces in kern/kern_sync.
+ *
+ * The permissions to modify process p were checked in "p31b_proc()".
+ *
+ */
+int
+ksched_setscheduler(struct ksched *ksched,
+ struct thread *td, int policy, const struct sched_param *param)
+{
+ int e = 0;
+ struct rtprio rtp;
+
+ switch(policy)
+ {
+ case SCHED_RR:
+ case SCHED_FIFO:
+
+ if (param->sched_priority >= P1B_PRIO_MIN &&
+ param->sched_priority <= P1B_PRIO_MAX)
+ {
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ rtp.type = (policy == SCHED_FIFO)
+ ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
+
+ rtp_to_pri(&rtp, td);
+ }
+ else
+ e = EPERM;
+
+
+ break;
+
+ case SCHED_OTHER:
+ {
+ rtp.type = RTP_PRIO_NORMAL;
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ rtp_to_pri(&rtp, td);
+ }
+ break;
+
+ default:
+ e = EINVAL;
+ break;
+ }
+
+ return e;
+}
+
+int
+ksched_getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+ return getscheduler(ksched, td, policy);
+}
+
+/* ksched_yield: Yield the CPU.
+ */
+int
+ksched_yield(struct ksched *ksched)
+{
+ sched_relinquish(curthread);
+ return 0;
+}
+
+int
+ksched_get_priority_max(struct ksched *ksched, int policy, int *prio)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *prio = RTP_PRIO_MAX;
+ break;
+
+ case SCHED_OTHER:
+ *prio = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int
+ksched_get_priority_min(struct ksched *ksched, int policy, int *prio)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *prio = P1B_PRIO_MIN;
+ break;
+
+ case SCHED_OTHER:
+ *prio = 0;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int
+ksched_rr_get_interval(struct ksched *ksched,
+ struct thread *td, struct timespec *timespec)
+{
+ *timespec = ksched->rr_interval;
+
+ return 0;
+}
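ksched.c backs the POSIX 1003.1b scheduling syscalls, translating between the numerically-higher-is-better POSIX priorities and rtprio's inverted scale via the p4prio_to_rtpprio()/rtpprio_to_p4prio() macros above. A short userland sketch of the interface it serves (illustrative only; switching to SCHED_FIFO requires privilege):

#include <sched.h>
#include <stdio.h>

int
main(void)
{
	struct sched_param sp;
	int maxpri;

	/* Served by ksched_get_priority_max(): RTP_PRIO_MAX for SCHED_FIFO. */
	maxpri = sched_get_priority_max(SCHED_FIFO);
	printf("SCHED_FIFO max priority: %d\n", maxpri);

	/*
	 * Served by ksched_setscheduler(); the POSIX priority is mapped to
	 * an rtprio with p4prio_to_rtpprio().  Needs privilege to succeed.
	 */
	sp.sched_priority = maxpri;
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");
	return (0);
}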
Index: kern_prot.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_prot.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_prot.c -L sys/kern/kern_prot.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_prot.c
+++ sys/kern/kern_prot.c
@@ -1,12 +1,14 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
- * The Regents of the University of California. All rights reserved.
+ * The Regents of the University of California.
* (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2000-2001 Robert N. M. Watson.
+ * All rights reserved.
+ *
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
- * Copyright (c) 2000-2001 Robert N. M. Watson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_prot.c,v 1.200 2005/04/18 13:36:56 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_prot.c,v 1.211 2007/06/12 00:11:59 rwatson Exp $");
#include "opt_compat.h"
#include "opt_mac.h"
@@ -51,10 +53,11 @@
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/refcount.h>
#include <sys/sx.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
@@ -62,22 +65,21 @@
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
static MALLOC_DEFINE(M_CRED, "cred", "credentials");
-SYSCTL_DECL(_security);
-SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0,
- "BSD security policy");
+SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
#ifndef _SYS_SYSPROTO_H_
struct getpid_args {
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getpid(struct thread *td, struct getpid_args *uap)
@@ -98,9 +100,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getppid(struct thread *td, struct getppid_args *uap)
@@ -121,9 +120,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
int
getpgrp(struct thread *td, struct getpgrp_args *uap)
{
@@ -141,9 +137,6 @@
pid_t pid;
};
#endif
-/*
- * MPSAFE
- */
int
getpgid(struct thread *td, struct getpgid_args *uap)
{
@@ -176,9 +169,6 @@
pid_t pid;
};
#endif
-/*
- * MPSAFE
- */
int
getsid(struct thread *td, struct getsid_args *uap)
{
@@ -208,9 +198,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getuid(struct thread *td, struct getuid_args *uap)
@@ -228,9 +215,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
geteuid(struct thread *td, struct geteuid_args *uap)
@@ -245,9 +229,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getgid(struct thread *td, struct getgid_args *uap)
@@ -270,9 +251,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getegid(struct thread *td, struct getegid_args *uap)
@@ -288,28 +266,39 @@
gid_t *gidset;
};
#endif
-/*
- * MPSAFE
- */
int
getgroups(struct thread *td, register struct getgroups_args *uap)
{
- struct ucred *cred;
+ gid_t groups[NGROUPS];
u_int ngrp;
int error;
+ ngrp = MIN(uap->gidsetsize, NGROUPS);
+ error = kern_getgroups(td, &ngrp, groups);
+ if (error)
+ return (error);
+ if (uap->gidsetsize > 0)
+ error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
+ if (error == 0)
+ td->td_retval[0] = ngrp;
+ return (error);
+}
+
+int
+kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
+{
+ struct ucred *cred;
+
cred = td->td_ucred;
- if ((ngrp = uap->gidsetsize) == 0) {
- td->td_retval[0] = cred->cr_ngroups;
+ if (*ngrp == 0) {
+ *ngrp = cred->cr_ngroups;
return (0);
}
- if (ngrp < cred->cr_ngroups)
+ if (*ngrp < cred->cr_ngroups)
return (EINVAL);
- ngrp = cred->cr_ngroups;
- error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t));
- if (error == 0)
- td->td_retval[0] = ngrp;
- return (error);
+ *ngrp = cred->cr_ngroups;
+ bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
+ return (0);
}
#ifndef _SYS_SYSPROTO_H_
@@ -317,9 +306,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setsid(register struct thread *td, struct setsid_args *uap)
@@ -378,9 +364,6 @@
int pgid; /* target pgrp id */
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setpgid(struct thread *td, register struct setpgid_args *uap)
@@ -481,9 +464,6 @@
uid_t uid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setuid(struct thread *td, struct setuid_args *uap)
@@ -495,6 +475,7 @@
int error;
uid = uap->uid;
+ AUDIT_ARG(uid, uid);
newcred = crget();
uip = uifind(uid);
PROC_LOCK(p);
@@ -530,7 +511,7 @@
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
uid != oldcred->cr_uid && /* allow setuid(geteuid()) */
#endif
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0)
goto fail;
/*
@@ -546,7 +527,8 @@
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
uid == oldcred->cr_uid ||
#endif
- suser_cred(oldcred, SUSER_ALLOWJAIL) == 0) /* we are using privs */
+ /* We are using privs. */
+ priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0)
#endif
{
/*
@@ -594,9 +576,6 @@
uid_t euid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
seteuid(struct thread *td, struct seteuid_args *uap)
@@ -608,6 +587,7 @@
int error;
euid = uap->euid;
+ AUDIT_ARG(euid, euid);
newcred = crget();
euip = uifind(euid);
PROC_LOCK(p);
@@ -621,7 +601,7 @@
if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */
euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0)
goto fail;
/*
@@ -651,9 +631,6 @@
gid_t gid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setgid(struct thread *td, struct setgid_args *uap)
@@ -664,6 +641,7 @@
int error;
gid = uap->gid;
+ AUDIT_ARG(gid, gid);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
@@ -692,7 +670,7 @@
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
#endif
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0)
goto fail;
crcopy(newcred, oldcred);
@@ -705,7 +683,8 @@
#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
gid == oldcred->cr_groups[0] ||
#endif
- suser_cred(oldcred, SUSER_ALLOWJAIL) == 0) /* we are using privs */
+ /* We are using privs. */
+ priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0)
#endif
{
/*
@@ -751,9 +730,6 @@
gid_t egid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setegid(struct thread *td, struct setegid_args *uap)
@@ -764,6 +740,7 @@
int error;
egid = uap->egid;
+ AUDIT_ARG(egid, egid);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
@@ -776,7 +753,7 @@
if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */
egid != oldcred->cr_svgid && /* allow setegid(saved gid) */
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0)
goto fail;
crcopy(newcred, oldcred);
@@ -801,39 +778,42 @@
gid_t *gidset;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setgroups(struct thread *td, struct setgroups_args *uap)
{
+ gid_t groups[NGROUPS];
+ int error;
+
+ if (uap->gidsetsize > NGROUPS)
+ return (EINVAL);
+ error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
+ if (error)
+ return (error);
+ return (kern_setgroups(td, uap->gidsetsize, groups));
+}
+
+int
+kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
+{
struct proc *p = td->td_proc;
- struct ucred *newcred, *tempcred, *oldcred;
- u_int ngrp;
+ struct ucred *newcred, *oldcred;
int error;
- ngrp = uap->gidsetsize;
if (ngrp > NGROUPS)
return (EINVAL);
- tempcred = crget();
- error = copyin(uap->gidset, tempcred->cr_groups, ngrp * sizeof(gid_t));
- if (error != 0) {
- crfree(tempcred);
- return (error);
- }
+ AUDIT_ARG(groupset, groups, ngrp);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
#ifdef MAC
- error = mac_check_proc_setgroups(p, oldcred, ngrp,
- tempcred->cr_groups);
+ error = mac_check_proc_setgroups(p, oldcred, ngrp, groups);
if (error)
goto fail;
#endif
- error = suser_cred(oldcred, SUSER_ALLOWJAIL);
+ error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0);
if (error)
goto fail;
@@ -851,21 +831,18 @@
*/
newcred->cr_ngroups = 1;
} else {
- bcopy(tempcred->cr_groups, newcred->cr_groups,
- ngrp * sizeof(gid_t));
+ bcopy(groups, newcred->cr_groups, ngrp * sizeof(gid_t));
newcred->cr_ngroups = ngrp;
}
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
- crfree(tempcred);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
- crfree(tempcred);
return (error);
}
@@ -875,9 +852,6 @@
uid_t euid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setreuid(register struct thread *td, struct setreuid_args *uap)
@@ -890,6 +864,8 @@
euid = uap->euid;
ruid = uap->ruid;
+ AUDIT_ARG(euid, euid);
+ AUDIT_ARG(ruid, ruid);
newcred = crget();
euip = uifind(euid);
ruip = uifind(ruid);
@@ -906,7 +882,7 @@
ruid != oldcred->cr_svuid) ||
(euid != (uid_t)-1 && euid != oldcred->cr_uid &&
euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0)
goto fail;
crcopy(newcred, oldcred);
@@ -944,9 +920,6 @@
gid_t egid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setregid(register struct thread *td, struct setregid_args *uap)
@@ -958,6 +931,8 @@
egid = uap->egid;
rgid = uap->rgid;
+ AUDIT_ARG(egid, egid);
+ AUDIT_ARG(rgid, rgid);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
@@ -972,7 +947,7 @@
rgid != oldcred->cr_svgid) ||
(egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0)
goto fail;
crcopy(newcred, oldcred);
@@ -1001,10 +976,9 @@
}
/*
- * setresuid(ruid, euid, suid) is like setreuid except control over the
- * saved uid is explicit.
+ * setresuid(ruid, euid, suid) is like setreuid except control over the saved
+ * uid is explicit.
*/
-
#ifndef _SYS_SYSPROTO_H_
struct setresuid_args {
uid_t ruid;
@@ -1012,9 +986,6 @@
uid_t suid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setresuid(register struct thread *td, struct setresuid_args *uap)
@@ -1028,6 +999,9 @@
euid = uap->euid;
ruid = uap->ruid;
suid = uap->suid;
+ AUDIT_ARG(euid, euid);
+ AUDIT_ARG(ruid, ruid);
+ AUDIT_ARG(suid, suid);
newcred = crget();
euip = uifind(euid);
ruip = uifind(ruid);
@@ -1049,7 +1023,7 @@
(suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
suid != oldcred->cr_svuid &&
suid != oldcred->cr_uid)) &&
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0)
goto fail;
crcopy(newcred, oldcred);
@@ -1082,10 +1056,9 @@
}
/*
- * setresgid(rgid, egid, sgid) is like setregid except control over the
- * saved gid is explicit.
+ * setresgid(rgid, egid, sgid) is like setregid except control over the saved
+ * gid is explicit.
*/
-
#ifndef _SYS_SYSPROTO_H_
struct setresgid_args {
gid_t rgid;
@@ -1093,9 +1066,6 @@
gid_t sgid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setresgid(register struct thread *td, struct setresgid_args *uap)
@@ -1108,6 +1078,9 @@
egid = uap->egid;
rgid = uap->rgid;
sgid = uap->sgid;
+ AUDIT_ARG(egid, egid);
+ AUDIT_ARG(rgid, rgid);
+ AUDIT_ARG(sgid, sgid);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
@@ -1127,7 +1100,7 @@
(sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
sgid != oldcred->cr_svgid &&
sgid != oldcred->cr_groups[0])) &&
- (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0)
goto fail;
crcopy(newcred, oldcred);
@@ -1161,9 +1134,6 @@
uid_t *suid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getresuid(register struct thread *td, struct getresuid_args *uap)
@@ -1191,9 +1161,6 @@
gid_t *sgid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getresgid(register struct thread *td, struct getresgid_args *uap)
@@ -1219,9 +1186,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
issetugid(register struct thread *td, struct issetugid_args *uap)
@@ -1242,9 +1206,6 @@
return (0);
}
-/*
- * MPSAFE
- */
int
__setugid(struct thread *td, struct __setugid_args *uap)
{
@@ -1274,8 +1235,6 @@
/*
* Check if gid is a member of the group set.
- *
- * MPSAFE (cred must be held)
*/
int
groupmember(gid_t gid, struct ucred *cred)
@@ -1291,66 +1250,13 @@
}
/*
- * `suser_enabled' (which can be set by the security.suser_enabled
- * sysctl) determines whether the system 'super-user' policy is in effect.
- * If it is nonzero, an effective uid of 0 connotes special privilege,
- * overriding many mandatory and discretionary protections. If it is zero,
- * uid 0 is offered no special privilege in the kernel security policy.
- * Setting it to zero may seriously impact the functionality of many
- * existing userland programs, and should not be done without careful
- * consideration of the consequences.
- */
-int suser_enabled = 1;
-SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
- &suser_enabled, 0, "processes with uid 0 have privilege");
-TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
-
-/*
- * Test whether the specified credentials imply "super-user" privilege.
- * Return 0 or EPERM.
- */
-int
-suser_cred(struct ucred *cred, int flag)
-{
-
- if (!suser_enabled)
- return (EPERM);
- if (((flag & SUSER_RUID) ? cred->cr_ruid : cred->cr_uid) != 0)
- return (EPERM);
- if (jailed(cred) && !(flag & SUSER_ALLOWJAIL))
- return (EPERM);
- return (0);
-}
-
-/*
- * Shortcut to hide contents of struct td and struct proc from the
- * caller, promoting binary compatibility.
- */
-int
-suser(struct thread *td)
-{
-
-#ifdef INVARIANTS
- if (td != curthread) {
- printf("suser: thread %p (%d %s) != curthread %p (%d %s)\n",
- td, td->td_proc->p_pid, td->td_proc->p_comm,
- curthread, curthread->td_proc->p_pid,
- curthread->td_proc->p_comm);
-#ifdef KDB
- kdb_backtrace();
-#endif
- }
-#endif
- return (suser_cred(td->td_ucred, 0));
-}
-
-/*
* Test the active securelevel against a given level. securelevel_gt()
* implements (securelevel > level). securelevel_ge() implements
* (securelevel >= level). Note that the logic is inverted -- these
* functions return EPERM on "success" and 0 on "failure".
*
- * MPSAFE
+ * XXXRW: Possibly since this has to do with privilege, it should move to
+ * kern_priv.c.
*/
int
securelevel_gt(struct ucred *cr, int level)
@@ -1402,7 +1308,7 @@
{
if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
- if (suser_cred(u1, SUSER_ALLOWJAIL) != 0)
+ if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0)
return (ESRCH);
}
return (0);
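
This hunk, like the rest of the file, replaces the generic
suser_cred(..., SUSER_ALLOWJAIL) test with priv_check_cred() against a named
privilege, so the decision is made per privilege rather than simply on uid 0.
A sketch of the new call shape, assuming the usual priv(9) style; PRIV_EXAMPLE
is a placeholder, not a real privilege constant:

    static int
    example_privileged_op(struct thread *td)
    {
            int error;

            /* Thread form; priv_check_cred(cred, priv, 0) takes an explicit cred. */
            error = priv_check(td, PRIV_EXAMPLE);
            if (error)
                    return (error);
            /* ... perform the privileged operation ... */
            return (0);
    }
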
@@ -1441,7 +1347,7 @@
break;
}
if (!match) {
- if (suser_cred(u1, SUSER_ALLOWJAIL) != 0)
+ if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0)
return (ESRCH);
}
}
@@ -1558,7 +1464,7 @@
break;
default:
/* Not permitted without privilege. */
- error = suser_cred(cred, SUSER_ALLOWJAIL);
+ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0);
if (error)
return (error);
}
@@ -1572,8 +1478,7 @@
cred->cr_ruid != proc->p_ucred->cr_svuid &&
cred->cr_uid != proc->p_ucred->cr_ruid &&
cred->cr_uid != proc->p_ucred->cr_svuid) {
- /* Not permitted without privilege. */
- error = suser_cred(cred, SUSER_ALLOWJAIL);
+ error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0);
if (error)
return (error);
}
@@ -1581,7 +1486,6 @@
return (0);
}
-
/*-
* Determine whether td may deliver the specified signal to p.
* Returns: 0 for permitted, an errno value otherwise
@@ -1650,19 +1554,13 @@
return (error);
if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
return (error);
- if (td->td_ucred->cr_ruid == p->p_ucred->cr_ruid)
- return (0);
- if (td->td_ucred->cr_uid == p->p_ucred->cr_ruid)
- return (0);
- if (suser_cred(td->td_ucred, SUSER_ALLOWJAIL) == 0)
- return (0);
-
-#ifdef CAPABILITIES
- if (!cap_check(NULL, td, CAP_SYS_NICE, SUSER_ALLOWJAIL))
- return (0);
-#endif
-
- return (EPERM);
+ if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
+ td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
+ error = priv_check(td, PRIV_SCHED_DIFFCRED);
+ if (error)
+ return (error);
+ }
+ return (0);
}
/*
@@ -1697,7 +1595,7 @@
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!unprivileged_proc_debug) {
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_DEBUG_UNPRIV);
if (error)
return (error);
}
@@ -1745,11 +1643,16 @@
/*
* If p's gids aren't a subset, or the uids aren't a subset,
* or the credential has changed, require appropriate privilege
- * for td to debug p. For POSIX.1e capabilities, this will
- * require CAP_SYS_PTRACE.
+ * for td to debug p.
*/
- if (!grpsubset || !uidsubset || credentialchanged) {
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ if (!grpsubset || !uidsubset) {
+ error = priv_check(td, PRIV_DEBUG_DIFFCRED);
+ if (error)
+ return (error);
+ }
+
+ if (credentialchanged) {
+ error = priv_check(td, PRIV_DEBUG_SUGID);
if (error)
return (error);
}
@@ -1763,6 +1666,7 @@
/*
* Can't trace a process that's currently exec'ing.
+ *
* XXX: Note, this is not a security policy decision, it's a
* basic correctness/functionality decision. Therefore, this check
* should be moved to the caller's of p_candebug().
@@ -1833,7 +1737,6 @@
/*
* Allocate a zeroed cred structure.
- * MPSAFE
*/
struct ucred *
crget(void)
@@ -1841,8 +1744,10 @@
register struct ucred *cr;
MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
- cr->cr_ref = 1;
- cr->cr_mtxp = mtx_pool_find(mtxpool_sleep, cr);
+ refcount_init(&cr->cr_ref, 1);
+#ifdef AUDIT
+ audit_cred_init(cr);
+#endif
#ifdef MAC
mac_init_cred(cr);
#endif
@@ -1851,32 +1756,25 @@
/*
* Claim another reference to a ucred structure.
- * MPSAFE
*/
struct ucred *
crhold(struct ucred *cr)
{
- mtx_lock(cr->cr_mtxp);
- cr->cr_ref++;
- mtx_unlock(cr->cr_mtxp);
+ refcount_acquire(&cr->cr_ref);
return (cr);
}
/*
- * Free a cred structure.
- * Throws away space when ref count gets to 0.
- * MPSAFE
+ * Free a cred structure. Throws away space when ref count gets to 0.
*/
void
crfree(struct ucred *cr)
{
- struct mtx *mtxp = cr->cr_mtxp;
- mtx_lock(mtxp);
KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
- if (--cr->cr_ref == 0) {
- mtx_unlock(mtxp);
+ KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
+ if (refcount_release(&cr->cr_ref)) {
/*
* Some callers of crget(), such as nfs_statfs(),
* allocate a temporary credential, but don't
@@ -1891,33 +1789,28 @@
*/
if (jailed(cr))
prison_free(cr->cr_prison);
+#ifdef AUDIT
+ audit_cred_destroy(cr);
+#endif
#ifdef MAC
mac_destroy_cred(cr);
#endif
FREE(cr, M_CRED);
- } else {
- mtx_unlock(mtxp);
}
}
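
crhold() and crfree() now manipulate cr_ref with the atomic refcount(9)
primitives instead of a mutex-pool lock. A minimal sketch of the same pattern
on a made-up object; the type, field and malloc tag are illustrative only:

    struct example_ref {
            u_int   er_ref;         /* set up once with refcount_init(&er_ref, 1) */
    };

    static void
    example_hold(struct example_ref *er)
    {

            refcount_acquire(&er->er_ref);          /* atomic increment, no lock */
    }

    static void
    example_drop(struct example_ref *er)
    {

            if (refcount_release(&er->er_ref))      /* true when the last reference goes */
                    free(er, M_TEMP);
    }
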
/*
* Check to see if this ucred is shared.
- * MPSAFE
*/
int
crshared(struct ucred *cr)
{
- int shared;
- mtx_lock(cr->cr_mtxp);
- shared = (cr->cr_ref > 1);
- mtx_unlock(cr->cr_mtxp);
- return (shared);
+ return (cr->cr_ref > 1);
}
/*
* Copy a ucred's contents from a template. Does not block.
- * MPSAFE
*/
void
crcopy(struct ucred *dest, struct ucred *src)
@@ -1931,6 +1824,9 @@
uihold(dest->cr_ruidinfo);
if (jailed(dest))
prison_hold(dest->cr_prison);
+#ifdef AUDIT
+ audit_cred_copy(src, dest);
+#endif
#ifdef MAC
mac_copy_cred(src, dest);
#endif
@@ -1938,7 +1834,6 @@
/*
* Dup cred struct to a new held one.
- * MPSAFE
*/
struct ucred *
crdup(struct ucred *cr)
@@ -1952,7 +1847,6 @@
/*
* Fill in a struct xucred based on a struct ucred.
- * MPSAFE
*/
void
cru2x(struct ucred *cr, struct xucred *xcr)
@@ -1966,9 +1860,8 @@
}
/*
- * small routine to swap a thread's current ucred for the correct one
- * taken from the process.
- * MPSAFE
+ * small routine to swap a thread's current ucred for the correct one taken
+ * from the process.
*/
void
cred_update_thread(struct thread *td)
@@ -1994,9 +1887,6 @@
u_int namelen;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getlogin(struct thread *td, struct getlogin_args *uap)
@@ -2024,9 +1914,6 @@
char *namebuf;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setlogin(struct thread *td, struct setlogin_args *uap)
@@ -2035,7 +1922,7 @@
int error;
char logintmp[MAXLOGNAME];
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_PROC_SETLOGIN);
if (error)
return (error);
error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
Index: kern_context.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_context.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_context.c -L sys/kern/kern_context.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_context.c
+++ sys/kern/kern_context.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_context.c,v 1.7 2003/11/09 20:31:03 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_context.c,v 1.9 2007/03/05 13:10:57 rwatson Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -39,9 +39,9 @@
#include <sys/ucontext.h>
/*
- * The first two fields of a ucontext_t are the signal mask and
- * the machine context. The next field is uc_link; we want to
- * avoid destroying the link when copying out contexts.
+ * The first two fields of a ucontext_t are the signal mask and the machine
+ * context. The next field is uc_link; we want to avoid destroying the link
+ * when copying out contexts.
*/
#define UC_COPY_SIZE offsetof(ucontext_t, uc_link)
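
Only the fields that precede uc_link (the signal mask and the machine context)
are copied, per the comment above. A small sketch of how that bound would be
used; the helper is hypothetical:

    static int
    example_export_context(ucontext_t *kuc, ucontext_t *uuc)
    {

            /* Copies uc_sigmask and uc_mcontext, stops short of uc_link. */
            return (copyout(kuc, uuc, UC_COPY_SIZE));
    }
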
@@ -58,9 +58,6 @@
}
#endif
-/*
- * MPSAFE
- */
int
getcontext(struct thread *td, struct getcontext_args *uap)
{
@@ -79,9 +76,6 @@
return (ret);
}
-/*
- * MPSAFE
- */
int
setcontext(struct thread *td, struct setcontext_args *uap)
{
--- sys/kern/kern_mac.c
+++ /dev/null
@@ -1,1250 +0,0 @@
-/*-
- * Copyright (c) 1999-2002 Robert N. M. Watson
- * Copyright (c) 2001 Ilmar S. Habibulin
- * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
- * All rights reserved.
- *
- * This software was developed by Robert Watson and Ilmar Habibulin for the
- * TrustedBSD Project.
- *
- * This software was developed for the FreeBSD Project in part by Network
- * Associates Laboratories, the Security Research Division of Network
- * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
- * as part of the DARPA CHATS research program.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*-
- * Framework for extensible kernel access control. This file contains
- * Kernel and userland interface to the framework, policy registration
- * and composition. Per-object interfaces, controls, and labeling may be
- * found in src/sys/security/mac/. Sample policies may be found in
- * src/sys/security/mac_*.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mac.c,v 1.117.2.2 2006/03/22 17:34:39 tegge Exp $");
-
-#include "opt_mac.h"
-#include "opt_devfs.h"
-
-#include <sys/param.h>
-#include <sys/condvar.h>
-#include <sys/extattr.h>
-#include <sys/imgact.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#include <sys/mac.h>
-#include <sys/module.h>
-#include <sys/proc.h>
-#include <sys/sbuf.h>
-#include <sys/systm.h>
-#include <sys/sysproto.h>
-#include <sys/sysent.h>
-#include <sys/vnode.h>
-#include <sys/mount.h>
-#include <sys/file.h>
-#include <sys/namei.h>
-#include <sys/socket.h>
-#include <sys/pipe.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-#include <vm/vm_object.h>
-
-#include <sys/mac_policy.h>
-
-#include <fs/devfs/devfs.h>
-
-#include <net/bpfdesc.h>
-#include <net/if.h>
-#include <net/if_var.h>
-
-#include <netinet/in.h>
-#include <netinet/ip_var.h>
-
-#include <security/mac/mac_internal.h>
-
-#ifdef MAC
-
-/*
- * Declare that the kernel provides MAC support, version 1. This permits
- * modules to refuse to be loaded if the necessary support isn't present,
- * even if it's pre-boot.
- */
-MODULE_VERSION(kernel_mac_support, 2);
-
-SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW, 0,
- "TrustedBSD MAC policy controls");
-
-#if MAC_MAX_SLOTS > 32
-#error "MAC_MAX_SLOTS too large"
-#endif
-
-static unsigned int mac_max_slots = MAC_MAX_SLOTS;
-static unsigned int mac_slot_offsets_free = (1 << MAC_MAX_SLOTS) - 1;
-SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD,
- &mac_max_slots, 0, "");
-
-/*
- * Has the kernel started generating labeled objects yet? All read/write
- * access to this variable is serialized during the boot process. Following
- * the end of serialization, we don't update this flag; no locking.
- */
-int mac_late = 0;
-
-/*
- * Flag to indicate whether or not we should allocate label storage for
- * new mbufs. Since most dynamic policies we currently work with don't
- * rely on mbuf labeling, try to avoid paying the cost of mtag allocation
- * unless specifically notified of interest. One result of this is
- * that if a dynamically loaded policy requests mbuf labels, it must
- * be able to deal with a NULL label being returned on any mbufs that
- * were already in flight when the policy was loaded. Since the policy
- * already has to deal with uninitialized labels, this probably won't
- * be a problem. Note: currently no locking. Will this be a problem?
- */
-#ifndef MAC_ALWAYS_LABEL_MBUF
-int mac_labelmbufs = 0;
-#endif
-
-#ifdef MAC_DEBUG
-SYSCTL_NODE(_security_mac, OID_AUTO, debug, CTLFLAG_RW, 0,
- "TrustedBSD MAC debug info");
-SYSCTL_NODE(_security_mac_debug, OID_AUTO, counters, CTLFLAG_RW, 0,
- "TrustedBSD MAC object counters");
-
-static unsigned int nmactemp;
-SYSCTL_UINT(_security_mac_debug_counters, OID_AUTO, temp, CTLFLAG_RD,
- &nmactemp, 0, "number of temporary labels in use");
-#endif
-
-static int mac_policy_register(struct mac_policy_conf *mpc);
-static int mac_policy_unregister(struct mac_policy_conf *mpc);
-
-MALLOC_DEFINE(M_MACTEMP, "mactemp", "MAC temporary label storage");
-
-/*
- * mac_static_policy_list holds a list of policy modules that are not
- * loaded while the system is "live", and cannot be unloaded. These
- * policies can be invoked without holding the busy count.
- *
- * mac_policy_list stores the list of dynamic policies. A busy count is
- * maintained for the list, stored in mac_policy_busy. The busy count
- * is protected by mac_policy_mtx; the list may be modified only
- * while the busy count is 0, requiring that the lock be held to
- * prevent new references to the list from being acquired. For almost
- * all operations, incrementing the busy count is sufficient to
- * guarantee consistency, as the list cannot be modified while the
- * busy count is elevated. For a few special operations involving a
- * change to the list of active policies, the mtx itself must be held.
- * A condition variable, mac_policy_cv, is used to signal potential
- * exclusive consumers that they should try to acquire the lock if a
- * first attempt at exclusive access fails.
- */
-#ifndef MAC_STATIC
-static struct mtx mac_policy_mtx;
-static struct cv mac_policy_cv;
-static int mac_policy_count;
-#endif
-struct mac_policy_list_head mac_policy_list;
-struct mac_policy_list_head mac_static_policy_list;
-
-/*
- * We manually invoke WITNESS_WARN() to allow Witness to generate
- * warnings even if we don't end up ever triggering the wait at
- * run-time. The consumer of the exclusive interface must not hold
- * any locks (other than potentially Giant) since we may sleep for
- * long (potentially indefinite) periods of time waiting for the
- * framework to become quiescent so that a policy list change may
- * be made.
- */
-void
-mac_policy_grab_exclusive(void)
-{
-
-#ifndef MAC_STATIC
- if (!mac_late)
- return;
-
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "mac_policy_grab_exclusive() at %s:%d", __FILE__, __LINE__);
- mtx_lock(&mac_policy_mtx);
- while (mac_policy_count != 0)
- cv_wait(&mac_policy_cv, &mac_policy_mtx);
-#endif
-}
-
-void
-mac_policy_assert_exclusive(void)
-{
-
-#ifndef MAC_STATIC
- if (!mac_late)
- return;
-
- mtx_assert(&mac_policy_mtx, MA_OWNED);
- KASSERT(mac_policy_count == 0,
- ("mac_policy_assert_exclusive(): not exclusive"));
-#endif
-}
-
-void
-mac_policy_release_exclusive(void)
-{
-
-#ifndef MAC_STATIC
- if (!mac_late)
- return;
-
- KASSERT(mac_policy_count == 0,
- ("mac_policy_release_exclusive(): not exclusive"));
- mtx_unlock(&mac_policy_mtx);
- cv_signal(&mac_policy_cv);
-#endif
-}
-
-void
-mac_policy_list_busy(void)
-{
-
-#ifndef MAC_STATIC
- if (!mac_late)
- return;
-
- mtx_lock(&mac_policy_mtx);
- mac_policy_count++;
- mtx_unlock(&mac_policy_mtx);
-#endif
-}
-
-int
-mac_policy_list_conditional_busy(void)
-{
-#ifndef MAC_STATIC
- int ret;
-
- if (!mac_late)
- return (1);
-
- mtx_lock(&mac_policy_mtx);
- if (!LIST_EMPTY(&mac_policy_list)) {
- mac_policy_count++;
- ret = 1;
- } else
- ret = 0;
- mtx_unlock(&mac_policy_mtx);
- return (ret);
-#else
- if (!mac_late)
- return (1);
-
- return (1);
-#endif
-}
-
-void
-mac_policy_list_unbusy(void)
-{
-
-#ifndef MAC_STATIC
- if (!mac_late)
- return;
-
- mtx_lock(&mac_policy_mtx);
- mac_policy_count--;
- KASSERT(mac_policy_count >= 0, ("MAC_POLICY_LIST_LOCK"));
- if (mac_policy_count == 0)
- cv_signal(&mac_policy_cv);
- mtx_unlock(&mac_policy_mtx);
-#endif
-}
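
The functions above implement the busy-count scheme described in the earlier
comment: readers bump mac_policy_count, and an exclusive writer waits on the
condition variable until the count drains to zero. A sketch of the read-side
usage, mirroring how mac_syscall() below walks the dynamic list; the helper is
hypothetical:

    static int
    example_count_dynamic_policies(void)
    {
            struct mac_policy_conf *mpc;
            int count;

            count = 0;
            if (mac_policy_list_conditional_busy() != 0) {
                    /* The list cannot change while the busy count is held. */
                    LIST_FOREACH(mpc, &mac_policy_list, mpc_list)
                            count++;
                    mac_policy_list_unbusy();
            }
            return (count);
    }
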
-
-/*
- * Initialize the MAC subsystem, including appropriate SMP locks.
- */
-static void
-mac_init(void)
-{
-
- LIST_INIT(&mac_static_policy_list);
- LIST_INIT(&mac_policy_list);
- mac_labelzone_init();
-
-#ifndef MAC_STATIC
- mtx_init(&mac_policy_mtx, "mac_policy_mtx", NULL, MTX_DEF);
- cv_init(&mac_policy_cv, "mac_policy_cv");
-#endif
-}
-
-/*
- * For the purposes of modules that want to know if they were loaded
- * "early", set the mac_late flag once we've processed modules either
- * linked into the kernel, or loaded before the kernel startup.
- */
-static void
-mac_late_init(void)
-{
-
- mac_late = 1;
-}
-
-/*
- * After the policy list has changed, walk the list to update any global
- * flags. Currently, we support only one flag, and it's conditionally
- * defined; as a result, the entire function is conditional. Eventually,
- * the #else case might also iterate across the policies.
- */
-static void
-mac_policy_updateflags(void)
-{
-#ifndef MAC_ALWAYS_LABEL_MBUF
- struct mac_policy_conf *tmpc;
- int labelmbufs;
-
- mac_policy_assert_exclusive();
-
- labelmbufs = 0;
- LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) {
- if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS)
- labelmbufs++;
- }
- LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) {
- if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS)
- labelmbufs++;
- }
- mac_labelmbufs = (labelmbufs != 0);
-#endif
-}
-
-/*
- * Allow MAC policy modules to register during boot, etc.
- */
-int
-mac_policy_modevent(module_t mod, int type, void *data)
-{
- struct mac_policy_conf *mpc;
- int error;
-
- error = 0;
- mpc = (struct mac_policy_conf *) data;
-
-#ifdef MAC_STATIC
- if (mac_late) {
- printf("mac_policy_modevent: MAC_STATIC and late\n");
- return (EBUSY);
- }
-#endif
-
- switch (type) {
- case MOD_LOAD:
- if (mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_NOTLATE &&
- mac_late) {
- printf("mac_policy_modevent: can't load %s policy "
- "after booting\n", mpc->mpc_name);
- error = EBUSY;
- break;
- }
- error = mac_policy_register(mpc);
- break;
- case MOD_UNLOAD:
- /* Don't unregister the module if it was never registered. */
- if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED)
- != 0)
- error = mac_policy_unregister(mpc);
- else
- error = 0;
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
-
- return (error);
-}
-
-static int
-mac_policy_register(struct mac_policy_conf *mpc)
-{
- struct mac_policy_conf *tmpc;
- int error, slot, static_entry;
-
- error = 0;
-
- /*
- * We don't technically need exclusive access while !mac_late,
- * but hold it for assertion consistency.
- */
- mac_policy_grab_exclusive();
-
- /*
- * If the module can potentially be unloaded, or we're loading
- * late, we have to stick it in the non-static list and pay
- * an extra performance overhead. Otherwise, we can pay a
- * light locking cost and stick it in the static list.
- */
- static_entry = (!mac_late &&
- !(mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK));
-
- if (static_entry) {
- LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) {
- if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) {
- error = EEXIST;
- goto out;
- }
- }
- } else {
- LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) {
- if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) {
- error = EEXIST;
- goto out;
- }
- }
- }
- if (mpc->mpc_field_off != NULL) {
- slot = ffs(mac_slot_offsets_free);
- if (slot == 0) {
- error = ENOMEM;
- goto out;
- }
- slot--;
- mac_slot_offsets_free &= ~(1 << slot);
- *mpc->mpc_field_off = slot;
- }
- mpc->mpc_runtime_flags |= MPC_RUNTIME_FLAG_REGISTERED;
-
- /*
- * If we're loading a MAC module after the framework has
- * initialized, it has to go into the dynamic list. If
- * we're loading it before we've finished initializing,
- * it can go into the static list with weaker locking
- * requirements.
- */
- if (static_entry)
- LIST_INSERT_HEAD(&mac_static_policy_list, mpc, mpc_list);
- else
- LIST_INSERT_HEAD(&mac_policy_list, mpc, mpc_list);
-
- /* Per-policy initialization. */
- if (mpc->mpc_ops->mpo_init != NULL)
- (*(mpc->mpc_ops->mpo_init))(mpc);
- mac_policy_updateflags();
-
- printf("Security policy loaded: %s (%s)\n", mpc->mpc_fullname,
- mpc->mpc_name);
-
-out:
- mac_policy_release_exclusive();
- return (error);
-}
-
-static int
-mac_policy_unregister(struct mac_policy_conf *mpc)
-{
-
- /*
- * If we fail the load, we may get a request to unload. Check
- * to see if we did the run-time registration, and if not,
- * silently succeed.
- */
- mac_policy_grab_exclusive();
- if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) == 0) {
- mac_policy_release_exclusive();
- return (0);
- }
-#if 0
- /*
- * Don't allow unloading modules with private data.
- */
- if (mpc->mpc_field_off != NULL) {
- MAC_POLICY_LIST_UNLOCK();
- return (EBUSY);
- }
-#endif
- /*
- * Only allow the unload to proceed if the module is unloadable
- * by its own definition.
- */
- if ((mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK) == 0) {
- mac_policy_release_exclusive();
- return (EBUSY);
- }
- if (mpc->mpc_ops->mpo_destroy != NULL)
- (*(mpc->mpc_ops->mpo_destroy))(mpc);
-
- LIST_REMOVE(mpc, mpc_list);
- mpc->mpc_runtime_flags &= ~MPC_RUNTIME_FLAG_REGISTERED;
- mac_policy_updateflags();
-
- mac_policy_release_exclusive();
-
- printf("Security policy unload: %s (%s)\n", mpc->mpc_fullname,
- mpc->mpc_name);
-
- return (0);
-}
-
-/*
- * Define an error value precedence, and given two arguments, selects the
- * value with the higher precedence.
- */
-int
-mac_error_select(int error1, int error2)
-{
-
- /* Certain decision-making errors take top priority. */
- if (error1 == EDEADLK || error2 == EDEADLK)
- return (EDEADLK);
-
- /* Invalid arguments should be reported where possible. */
- if (error1 == EINVAL || error2 == EINVAL)
- return (EINVAL);
-
- /* Precedence goes to "visibility", with both process and file. */
- if (error1 == ESRCH || error2 == ESRCH)
- return (ESRCH);
-
- if (error1 == ENOENT || error2 == ENOENT)
- return (ENOENT);
-
- /* Precedence goes to DAC/MAC protections. */
- if (error1 == EACCES || error2 == EACCES)
- return (EACCES);
-
- /* Precedence goes to privilege. */
- if (error1 == EPERM || error2 == EPERM)
- return (EPERM);
-
- /* Precedence goes to error over success; otherwise, arbitrary. */
- if (error1 != 0)
- return (error1);
- return (error2);
-}
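
Under this precedence, for example, mac_error_select(EPERM, EACCES) yields
EACCES and mac_error_select(0, EPERM) yields EPERM. A brief illustration with
a hypothetical caller:

    static int
    example_compose_results(void)
    {
            int error1, error2;

            error1 = 0;             /* first policy allowed the operation */
            error2 = EACCES;        /* second policy denied it */
            return (mac_error_select(error1, error2));      /* EACCES wins over success */
    }
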
-
-void
-mac_init_label(struct label *label)
-{
-
- bzero(label, sizeof(*label));
- label->l_flags = MAC_FLAG_INITIALIZED;
-}
-
-void
-mac_destroy_label(struct label *label)
-{
-
- KASSERT(label->l_flags & MAC_FLAG_INITIALIZED,
- ("destroying uninitialized label"));
-
- bzero(label, sizeof(*label));
- /* implicit: label->l_flags &= ~MAC_FLAG_INITIALIZED; */
-}
-
-int
-mac_check_structmac_consistent(struct mac *mac)
-{
-
- if (mac->m_buflen < 0 ||
- mac->m_buflen > MAC_MAX_LABEL_BUF_LEN)
- return (EINVAL);
-
- return (0);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
-{
- char *elements, *buffer;
- struct mac mac;
- struct proc *tproc;
- struct ucred *tcred;
- int error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- tproc = pfind(uap->pid);
- if (tproc == NULL)
- return (ESRCH);
-
- tcred = NULL; /* Satisfy gcc. */
- error = p_cansee(td, tproc);
- if (error == 0)
- tcred = crhold(tproc->p_ucred);
- PROC_UNLOCK(tproc);
- if (error)
- return (error);
-
- elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
- if (error) {
- free(elements, M_MACTEMP);
- crfree(tcred);
- return (error);
- }
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
- error = mac_externalize_cred_label(tcred->cr_label, elements,
- buffer, mac.m_buflen);
- if (error == 0)
- error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
- free(buffer, M_MACTEMP);
- free(elements, M_MACTEMP);
- crfree(tcred);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
-{
- char *elements, *buffer;
- struct mac mac;
- int error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
- if (error) {
- free(elements, M_MACTEMP);
- return (error);
- }
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
- error = mac_externalize_cred_label(td->td_ucred->cr_label,
- elements, buffer, mac.m_buflen);
- if (error == 0)
- error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
- free(buffer, M_MACTEMP);
- free(elements, M_MACTEMP);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
-{
- struct ucred *newcred, *oldcred;
- struct label *intlabel;
- struct proc *p;
- struct mac mac;
- char *buffer;
- int error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
- if (error) {
- free(buffer, M_MACTEMP);
- return (error);
- }
-
- intlabel = mac_cred_label_alloc();
- error = mac_internalize_cred_label(intlabel, buffer);
- free(buffer, M_MACTEMP);
- if (error)
- goto out;
-
- newcred = crget();
-
- p = td->td_proc;
- PROC_LOCK(p);
- oldcred = p->p_ucred;
-
- error = mac_check_cred_relabel(oldcred, intlabel);
- if (error) {
- PROC_UNLOCK(p);
- crfree(newcred);
- goto out;
- }
-
- setsugid(p);
- crcopy(newcred, oldcred);
- mac_relabel_cred(newcred, intlabel);
- p->p_ucred = newcred;
-
- /*
- * Grab additional reference for use while revoking mmaps, prior
- * to releasing the proc lock and sharing the cred.
- */
- crhold(newcred);
- PROC_UNLOCK(p);
-
- if (mac_enforce_vm) {
- mac_cred_mmapped_drop_perms(td, newcred);
- }
-
- crfree(newcred); /* Free revocation reference. */
- crfree(oldcred);
-
-out:
- mac_cred_label_free(intlabel);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
-{
- char *elements, *buffer;
- struct label *intlabel;
- struct file *fp;
- struct mac mac;
- struct vnode *vp;
- struct pipe *pipe;
- struct socket *so;
- short label_type;
- int vfslocked, error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
- if (error) {
- free(elements, M_MACTEMP);
- return (error);
- }
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
- error = fget(td, uap->fd, &fp);
- if (error)
- goto out;
-
- label_type = fp->f_type;
- switch (fp->f_type) {
- case DTYPE_FIFO:
- case DTYPE_VNODE:
- vp = fp->f_vnode;
- intlabel = mac_vnode_label_alloc();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- mac_copy_vnode_label(vp->v_label, intlabel);
- VOP_UNLOCK(vp, 0, td);
- VFS_UNLOCK_GIANT(vfslocked);
- error = mac_externalize_vnode_label(intlabel, elements,
- buffer, mac.m_buflen);
- mac_vnode_label_free(intlabel);
- break;
-
- case DTYPE_PIPE:
- pipe = fp->f_data;
- intlabel = mac_pipe_label_alloc();
- PIPE_LOCK(pipe);
- mac_copy_pipe_label(pipe->pipe_pair->pp_label, intlabel);
- PIPE_UNLOCK(pipe);
- error = mac_externalize_pipe_label(intlabel, elements,
- buffer, mac.m_buflen);
- mac_pipe_label_free(intlabel);
- break;
-
- case DTYPE_SOCKET:
- so = fp->f_data;
- intlabel = mac_socket_label_alloc(M_WAITOK);
- NET_LOCK_GIANT();
- SOCK_LOCK(so);
- mac_copy_socket_label(so->so_label, intlabel);
- SOCK_UNLOCK(so);
- NET_UNLOCK_GIANT();
- error = mac_externalize_socket_label(intlabel, elements,
- buffer, mac.m_buflen);
- mac_socket_label_free(intlabel);
- break;
-
- default:
- error = EINVAL;
- }
- fdrop(fp, td);
- if (error == 0)
- error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-out:
- free(buffer, M_MACTEMP);
- free(elements, M_MACTEMP);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
-{
- char *elements, *buffer;
- struct nameidata nd;
- struct label *intlabel;
- struct mac mac;
- int vfslocked, error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
- if (error) {
- free(elements, M_MACTEMP);
- return (error);
- }
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
- NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
- uap->path_p, td);
- error = namei(&nd);
- if (error)
- goto out;
-
- intlabel = mac_vnode_label_alloc();
- vfslocked = NDHASGIANT(&nd);
- mac_copy_vnode_label(nd.ni_vp->v_label, intlabel);
- error = mac_externalize_vnode_label(intlabel, elements, buffer,
- mac.m_buflen);
-
- NDFREE(&nd, 0);
- VFS_UNLOCK_GIANT(vfslocked);
- mac_vnode_label_free(intlabel);
- if (error == 0)
- error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-out:
- free(buffer, M_MACTEMP);
- free(elements, M_MACTEMP);
-
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
-{
- char *elements, *buffer;
- struct nameidata nd;
- struct label *intlabel;
- struct mac mac;
- int vfslocked, error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
- if (error) {
- free(elements, M_MACTEMP);
- return (error);
- }
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
- NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
- uap->path_p, td);
- error = namei(&nd);
- if (error)
- goto out;
-
- intlabel = mac_vnode_label_alloc();
- vfslocked = NDHASGIANT(&nd);
- mac_copy_vnode_label(nd.ni_vp->v_label, intlabel);
- error = mac_externalize_vnode_label(intlabel, elements, buffer,
- mac.m_buflen);
- NDFREE(&nd, 0);
- VFS_UNLOCK_GIANT(vfslocked);
- mac_vnode_label_free(intlabel);
-
- if (error == 0)
- error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-out:
- free(buffer, M_MACTEMP);
- free(elements, M_MACTEMP);
-
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
-{
- struct label *intlabel;
- struct pipe *pipe;
- struct socket *so;
- struct file *fp;
- struct mount *mp;
- struct vnode *vp;
- struct mac mac;
- char *buffer;
- int error, vfslocked;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
- if (error) {
- free(buffer, M_MACTEMP);
- return (error);
- }
-
- error = fget(td, uap->fd, &fp);
- if (error)
- goto out;
-
- switch (fp->f_type) {
- case DTYPE_FIFO:
- case DTYPE_VNODE:
- intlabel = mac_vnode_label_alloc();
- error = mac_internalize_vnode_label(intlabel, buffer);
- if (error) {
- mac_vnode_label_free(intlabel);
- break;
- }
- vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error != 0) {
- VFS_UNLOCK_GIANT(vfslocked);
- mac_vnode_label_free(intlabel);
- break;
- }
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- error = vn_setlabel(vp, intlabel, td->td_ucred);
- VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
- mac_vnode_label_free(intlabel);
- break;
-
- case DTYPE_PIPE:
- intlabel = mac_pipe_label_alloc();
- error = mac_internalize_pipe_label(intlabel, buffer);
- if (error == 0) {
- pipe = fp->f_data;
- PIPE_LOCK(pipe);
- error = mac_pipe_label_set(td->td_ucred,
- pipe->pipe_pair, intlabel);
- PIPE_UNLOCK(pipe);
- }
- mac_pipe_label_free(intlabel);
- break;
-
- case DTYPE_SOCKET:
- intlabel = mac_socket_label_alloc(M_WAITOK);
- error = mac_internalize_socket_label(intlabel, buffer);
- if (error == 0) {
- so = fp->f_data;
- NET_LOCK_GIANT();
- error = mac_socket_label_set(td->td_ucred, so,
- intlabel);
- NET_UNLOCK_GIANT();
- }
- mac_socket_label_free(intlabel);
- break;
-
- default:
- error = EINVAL;
- }
- fdrop(fp, td);
-out:
- free(buffer, M_MACTEMP);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
-{
- struct label *intlabel;
- struct nameidata nd;
- struct mount *mp;
- struct mac mac;
- char *buffer;
- int vfslocked, error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
- if (error) {
- free(buffer, M_MACTEMP);
- return (error);
- }
-
- intlabel = mac_vnode_label_alloc();
- error = mac_internalize_vnode_label(intlabel, buffer);
- free(buffer, M_MACTEMP);
- if (error)
- goto out;
-
- NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
- uap->path_p, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
- if (error == 0) {
- error = vn_setlabel(nd.ni_vp, intlabel,
- td->td_ucred);
- vn_finished_write(mp);
- }
- }
-
- NDFREE(&nd, 0);
- VFS_UNLOCK_GIANT(vfslocked);
-out:
- mac_vnode_label_free(intlabel);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
-{
- struct label *intlabel;
- struct nameidata nd;
- struct mount *mp;
- struct mac mac;
- char *buffer;
- int vfslocked, error;
-
- error = copyin(uap->mac_p, &mac, sizeof(mac));
- if (error)
- return (error);
-
- error = mac_check_structmac_consistent(&mac);
- if (error)
- return (error);
-
- buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
- error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
- if (error) {
- free(buffer, M_MACTEMP);
- return (error);
- }
-
- intlabel = mac_vnode_label_alloc();
- error = mac_internalize_vnode_label(intlabel, buffer);
- free(buffer, M_MACTEMP);
- if (error)
- goto out;
-
- NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
- uap->path_p, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
- if (error == 0) {
- error = vn_setlabel(nd.ni_vp, intlabel,
- td->td_ucred);
- vn_finished_write(mp);
- }
- }
-
- NDFREE(&nd, 0);
- VFS_UNLOCK_GIANT(vfslocked);
-out:
- mac_vnode_label_free(intlabel);
- return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
-{
- struct mac_policy_conf *mpc;
- char target[MAC_MAX_POLICY_NAME];
- int entrycount, error;
-
- error = copyinstr(uap->policy, target, sizeof(target), NULL);
- if (error)
- return (error);
-
- error = ENOSYS;
- LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
- if (strcmp(mpc->mpc_name, target) == 0 &&
- mpc->mpc_ops->mpo_syscall != NULL) {
- error = mpc->mpc_ops->mpo_syscall(td,
- uap->call, uap->arg);
- goto out;
- }
- }
-
- if ((entrycount = mac_policy_list_conditional_busy()) != 0) {
- LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
- if (strcmp(mpc->mpc_name, target) == 0 &&
- mpc->mpc_ops->mpo_syscall != NULL) {
- error = mpc->mpc_ops->mpo_syscall(td,
- uap->call, uap->arg);
- break;
- }
- }
- mac_policy_list_unbusy();
- }
-out:
- return (error);
-}
-
-SYSINIT(mac, SI_SUB_MAC, SI_ORDER_FIRST, mac_init, NULL);
-SYSINIT(mac_late, SI_SUB_MAC_LATE, SI_ORDER_FIRST, mac_late_init, NULL);
-
-#else /* !MAC */
-
-int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
-{
-
- return (ENOSYS);
-}
-
-int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
-{
-
- return (ENOSYS);
-}
-
-#endif /* !MAC */
--- /dev/null
+++ sys/kern/uipc_sockbuf.c
@@ -0,0 +1,1041 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/uipc_sockbuf.c,v 1.171.2.1.2.1 2008/02/02 12:44:13 rwatson Exp $");
+
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/aio.h> /* for aio_swake proto */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+/*
+ * Function pointer set by the AIO routines so that the socket buffer code
+ * can call back into the AIO module if it is loaded.
+ */
+void (*aio_swake)(struct socket *, struct sockbuf *);
+
+/*
+ * Primitive routines for operating on socket buffers
+ */
+
+u_long sb_max = SB_MAX;
+u_long sb_max_adj =
+ SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
+
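
sb_max_adj scales the administrative byte limit down by the worst-case
mbuf-cluster overhead, so a buffer full of clusters (each costing
MSIZE + MCLBYTES of memory for MCLBYTES of payload) still fits within sb_max
of real memory. A worked example, assuming the common values MSIZE = 256,
MCLBYTES = 2048 and SB_MAX = 256*1024 (assumptions; the real values come from
sys/param.h and the sb_max sysctl handler defined below):

    sb_max_adj = SB_MAX * MCLBYTES / (MSIZE + MCLBYTES)
               = 262144 * 2048 / 2304
               = 233016 bytes (integer division)
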
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+static void sbdrop_internal(struct sockbuf *sb, int len);
+static void sbflush_internal(struct sockbuf *sb);
+static void sbrelease_internal(struct sockbuf *sb, struct socket *so);
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the socket; it
+ * would normally be applied to a socket by the protocol code (in the
+ * PRU_SHUTDOWN case) when the user informs the system that no more data is
+ * to be sent. Socantrcvmore indicates that no more data will be
+ * received, and will normally be applied to the socket by a protocol when it
+ * detects that the peer will send no more data. Data queued for reading in
+ * the socket may yet be read.
+ */
+void
+socantsendmore_locked(struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sowwakeup_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantsendmore(struct socket *so)
+{
+
+ SOCKBUF_LOCK(&so->so_snd);
+ socantsendmore_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantrcvmore_locked(struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+void
+socantrcvmore(struct socket *so)
+{
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ socantrcvmore_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sb->sb_flags |= SB_WAIT;
+ return (msleep(&sb->sb_cc, &sb->sb_mtx,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo));
+}
+
+int
+sblock(struct sockbuf *sb, int flags)
+{
+
+ KASSERT((flags & SBL_VALID) == flags,
+ ("sblock: flags invalid (0x%x)", flags));
+
+ if (flags & SBL_WAIT) {
+ if ((sb->sb_flags & SB_NOINTR) ||
+ (flags & SBL_NOINTR)) {
+ sx_xlock(&sb->sb_sx);
+ return (0);
+ }
+ return (sx_xlock_sig(&sb->sb_sx));
+ } else {
+ if (sx_try_xlock(&sb->sb_sx) == 0)
+ return (EWOULDBLOCK);
+ return (0);
+ }
+}
+
+void
+sbunlock(struct sockbuf *sb)
+{
+
+ sx_xunlock(&sb->sb_sx);
+}
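
sblock()/sbunlock() serialize whole I/O operations on a buffer through the
sb_sx lock, independently of the short-term sockbuf mutex. A usage sketch with
a hypothetical caller:

    static int
    example_serialized_receive(struct socket *so)
    {
            int error;

            /* May sleep; can fail on a signal unless SBL_NOINTR is passed. */
            error = sblock(&so->so_rcv, SBL_WAIT);
            if (error)
                    return (error);
            /* ... long-running receive path, safe from concurrent readers ... */
            sbunlock(&so->so_rcv);
            return (0);
    }
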
+
+/*
+ * Wakeup processes waiting on a socket buffer. Do asynchronous notification
+ * via SIGIO if the socket has the SS_ASYNC flag set.
+ *
+ * Called with the socket buffer lock held; will release the lock by the end
+ * of the function. This allows the caller to acquire the socket buffer lock
+ * while testing for the need for various sorts of wakeup and hold it through
+ * to the point where it's no longer required. We currently hold the lock
+ * through calls out to other subsystems (with the exception of kqueue), and
+ * then release it to avoid lock order issues. It's not clear that's
+ * correct.
+ */
+void
+sowakeup(struct socket *so, struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ selwakeuppri(&sb->sb_sel, PSOCK);
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup(&sb->sb_cc);
+ }
+ KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+ SOCKBUF_UNLOCK(sb);
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGIO, 0);
+ if (sb->sb_flags & SB_UPCALL)
+ (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
+ if (sb->sb_flags & SB_AIO)
+ aio_swake(so, sb);
+ mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and one for
+ * receiving data. Each buffer contains a queue of mbufs, information about
+ * the number of mbufs and amount of data in the queue, and other fields
+ * allowing select() statements and notification on data availability to be
+ * implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records. Each
+ * record is a list of mbufs chained together with the m_next field. Records
+ * are chained together with the m_nextpkt field. The upper level routine
+ * soreceive() expects the following conventions to be observed when placing
+ * information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's name,
+ * then a record containing that name must be present before any
+ * associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really just
+ * additional data associated with the message), and there are ``rights''
+ * to be received, then a record containing this data should be present
+ * (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by a data
+ * record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space should
+ * be released by calling sbrelease() when the socket is destroyed.
+ */
+int
+soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
+{
+ struct thread *td = curthread;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
+ goto bad;
+ if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+bad2:
+ sbrelease_locked(&so->so_snd, so);
+bad:
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (ENOBUFS);
+}
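
A sketch of how a protocol attach routine would typically call soreserve()
before the socket is used; the helper name and the buffer sizes are
illustrative only:

    static int
    example_pru_attach(struct socket *so)
    {
            int error;

            error = soreserve(so, 32 * 1024, 32 * 1024);    /* ENOBUFS if over limits */
            if (error)
                    return (error);
            /* ... allocate per-protocol state ... */
            return (0);
    }
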
+
+static int
+sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ u_long tmp_sb_max = sb_max;
+
+ error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
+ if (error || !req->newptr)
+ return (error);
+ if (tmp_sb_max < MSIZE + MCLBYTES)
+ return (EINVAL);
+ sb_max = tmp_sb_max;
+ sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+ return (0);
+}
+
+/*
+ * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't
+ * become limiting if buffering efficiency is near the normal case.
+ */
+int
+sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
+ struct thread *td)
+{
+ rlim_t sbsize_limit;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ /*
+ * td will only be NULL when we're in an interrupt (e.g. in
+ * tcp_input()).
+ *
+ * XXXRW: This comment needs updating, as might the code.
+ */
+ if (cc > sb_max_adj)
+ return (0);
+ if (td != NULL) {
+ PROC_LOCK(td->td_proc);
+ sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
+ PROC_UNLOCK(td->td_proc);
+ } else
+ sbsize_limit = RLIM_INFINITY;
+ if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
+ sbsize_limit))
+ return (0);
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
+
+int
+sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
+ struct thread *td)
+{
+ int error;
+
+ SOCKBUF_LOCK(sb);
+ error = sbreserve_locked(sb, cc, so, td);
+ SOCKBUF_UNLOCK(sb);
+ return (error);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+static void
+sbrelease_internal(struct sockbuf *sb, struct socket *so)
+{
+
+ sbflush_internal(sb);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+ RLIM_INFINITY);
+ sb->sb_mbmax = 0;
+}
+
+void
+sbrelease_locked(struct sockbuf *sb, struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sbrelease_internal(sb, so);
+}
+
+void
+sbrelease(struct sockbuf *sb, struct socket *so)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbrelease_locked(sb, so);
+ SOCKBUF_UNLOCK(sb);
+}
+
+void
+sbdestroy(struct sockbuf *sb, struct socket *so)
+{
+
+ sbrelease_internal(sb, so);
+}
+
+/*
+ * Routines to add and remove data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to append
+ * new mbufs to a socket buffer, after checking that adequate space is
+ * available, comparing the function sbspace() with the amount of data to be
+ * added. sbappendrecord() differs from sbappend() in that data supplied is
+ * treated as the beginning of a new record. To place a sender's address,
+ * optional access rights, and data in a socket receive buffer,
+ * sbappendaddr() should be used. To place access rights and data in a
+ * socket receive buffer, sbappendrights() should be used. In either case,
+ * the new data begins a new record. Note that unlike sbappend() and
+ * sbappendrecord(), these routines check for the caller that there will be
+ * enough space to store the data. Each fails if there is not enough space,
+ * or if it cannot find mbufs to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data awaiting
+ * acknowledgement. Data is normally copied from a socket send buffer in a
+ * protocol with m_copy for output to a peer, and then removing the data from
+ * the socket buffer with sbdrop() or sbdroprecord() when the data is
+ * acknowledged by the peer.
+ */
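
A sketch of the convention just described, using a hypothetical datagram-style
delivery path; sbappendaddr_locked() itself performs the space check, so the
caller only has to drop the data when it returns 0:

    static int
    example_deliver(struct socket *so, struct sockaddr *from, struct mbuf *m)
    {

            SOCKBUF_LOCK(&so->so_rcv);
            if (sbappendaddr_locked(&so->so_rcv, from, m, NULL) == 0) {
                    SOCKBUF_UNLOCK(&so->so_rcv);
                    m_freem(m);                     /* no space or no mbufs: drop */
                    return (ENOBUFS);
            }
            sorwakeup_locked(so);                   /* wakes readers, drops the sockbuf lock */
            return (0);
    }
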
+#ifdef SOCKBUF_DEBUG
+void
+sblastrecordchk(struct sockbuf *sb, const char *file, int line)
+{
+ struct mbuf *m = sb->sb_mb;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+
+ if (m != sb->sb_lastrecord) {
+ printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
+ __func__, sb->sb_mb, sb->sb_lastrecord, m);
+ printf("packet chain:\n");
+ for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
+ printf("\t%p\n", m);
+ panic("%s from %s:%u", __func__, file, line);
+ }
+}
+
+void
+sblastmbufchk(struct sockbuf *sb, const char *file, int line)
+{
+ struct mbuf *m = sb->sb_mb;
+ struct mbuf *n;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+
+ while (m && m->m_next)
+ m = m->m_next;
+
+ if (m != sb->sb_mbtail) {
+ printf("%s: sb_mb %p sb_mbtail %p last %p\n",
+ __func__, sb->sb_mb, sb->sb_mbtail, m);
+ printf("packet tree:\n");
+ for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
+ printf("\t");
+ for (n = m; n != NULL; n = n->m_next)
+ printf("%p ", n);
+ printf("\n");
+ }
+ panic("%s from %s:%u", __func__, file, line);
+ }
+}
+#endif /* SOCKBUF_DEBUG */
+
+#define SBLINKRECORD(sb, m0) do { \
+ SOCKBUF_LOCK_ASSERT(sb); \
+ if ((sb)->sb_lastrecord != NULL) \
+ (sb)->sb_lastrecord->m_nextpkt = (m0); \
+ else \
+ (sb)->sb_mb = (m0); \
+ (sb)->sb_lastrecord = (m0); \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb. The
+ * additional space associated with the mbuf chain is recorded in sb. Empty
+ * mbufs are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend_locked(struct sockbuf *sb, struct mbuf *m)
+{
+ struct mbuf *n;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m == 0)
+ return;
+
+ SBLASTRECORDCHK(sb);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ } else {
+ /*
+ * XXX Would like to simply use sb_mbtail here, but
+ * XXX I need to verify that I won't miss an EOR that
+ * XXX way.
+ */
+ if ((n = sb->sb_lastrecord) != NULL) {
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ } else {
+ /*
+ * If this is the first record in the socket buffer,
+ * it's also the last record.
+ */
+ sb->sb_lastrecord = m;
+ }
+ }
+ sbcompress(sb, m, n);
+ SBLASTRECORDCHK(sb);
+}
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb. The
+ * additional space associated with the mbuf chain is recorded in sb. Empty
+ * mbufs are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(struct sockbuf *sb, struct mbuf *m)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappend_locked(sb, m);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
+{
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
+ KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
+
+ SBLASTMBUFCHK(sb);
+
+ sbcompress(sb, m, sb->sb_mbtail);
+
+ sb->sb_lastrecord = sb->sb_mb;
+ SBLASTRECORDCHK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream(struct sockbuf *sb, struct mbuf *m)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappendstream_locked(sb, m);
+ SOCKBUF_UNLOCK(sb);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(struct sockbuf *sb)
+{
+ struct mbuf *m;
+ struct mbuf *n = 0;
+ u_long len = 0, mbcnt = 0;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
+{
+ struct mbuf *m;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m0 == 0)
+ return;
+ m = sb->sb_mb;
+ if (m)
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ /*
+ * Put the first mbuf on the queue. Note this permits zero length
+ * records.
+ */
+ sballoc(sb, m0);
+ SBLASTRECORDCHK(sb);
+ SBLINKRECORD(sb, m0);
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappendrecord_locked(sb, m0);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket. If present, m0 must include a packet header
+ * with total length. Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
+ struct mbuf *m0, struct mbuf *control)
+{
+ struct mbuf *m, *n, *nlast;
+ int space = asa->sa_len;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+ panic("sbappendaddr_locked");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ space += m_length(control, &n);
+
+ if (space > sbspace(sb))
+ return (0);
+#if MSIZE <= 256
+ if (asa->sa_len > MLEN)
+ return (0);
+#endif
+ MGET(m, M_DONTWAIT, MT_SONAME);
+ if (m == 0)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n->m_next != NULL; n = n->m_next)
+ sballoc(sb, n);
+ sballoc(sb, n);
+ nlast = n;
+ SBLINKRECORD(sb, m);
+
+ sb->sb_mbtail = nlast;
+ SBLASTMBUFCHK(sb);
+
+ SBLASTRECORDCHK(sb);
+ return (1);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket. If present, m0 must include a packet header
+ * with total length. Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
+ struct mbuf *m0, struct mbuf *control)
+{
+ int retval;
+
+ SOCKBUF_LOCK(sb);
+ retval = sbappendaddr_locked(sb, asa, m0, control);
+ SOCKBUF_UNLOCK(sb);
+ return (retval);
+}
+
+int
+sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
+ struct mbuf *control)
+{
+ struct mbuf *m, *n, *mlast;
+ int space;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (control == 0)
+ panic("sbappendcontrol_locked");
+ space = m_length(control, &n) + m_length(m0, NULL);
+
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+
+ SBLASTRECORDCHK(sb);
+
+ for (m = control; m->m_next; m = m->m_next)
+ sballoc(sb, m);
+ sballoc(sb, m);
+ mlast = m;
+ SBLINKRECORD(sb, control);
+
+ sb->sb_mbtail = mlast;
+ SBLASTMBUFCHK(sb);
+
+ SBLASTRECORDCHK(sb);
+ return (1);
+}
+
+int
+sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
+{
+ int retval;
+
+ SOCKBUF_LOCK(sb);
+ retval = sbappendcontrol_locked(sb, m0, control);
+ SOCKBUF_UNLOCK(sb);
+ return (retval);
+}
+
+/*
+ * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
+ * (n). If (n) is NULL, the buffer is presumed empty.
+ *
+ * When the data is compressed, mbufs in the chain may be handled in one of
+ * three ways:
+ *
+ * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
+ * record boundary, and no change in data type).
+ *
+ * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
+ * an mbuf already in the socket buffer. This can occur if an
+ * appropriate mbuf exists, there is room, and no merging of data types
+ * will occur.
+ *
+ * (3) The mbuf may be appended to the end of the existing mbuf chain.
+ *
+ * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
+ * end-of-record.
+ */
+void
+sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
+{
+ int eor = 0;
+ struct mbuf *o;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ if (sb->sb_lastrecord == m)
+ sb->sb_lastrecord = m->m_next;
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(n) &&
+ m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ m->m_len <= M_TRAILINGSPACE(n) &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+ /* XXX: Probably don't need.*/
+ sb->sb_ctl += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sb->sb_mbtail = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
+ n->m_flags |= eor;
+ }
+ SBLASTMBUFCHK(sb);
+}
+
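The coalesce-or-append decision implemented by sbcompress() above can be shown in miniature outside the kernel. The following is a hedged user-space sketch; struct chunk, CHUNKSZ and chunk_append() are invented for illustration and are not kernel names:

/*
 * Minimal user-space sketch of the coalesce-or-append decision used by
 * sbcompress().  Everything here is illustrative only.
 */
#include <stdlib.h>
#include <string.h>

#define CHUNKSZ	256

struct chunk {
	struct chunk	*next;
	int		 len;
	char		 data[CHUNKSZ];
};

/* Append n bytes after tail; coalesce into trailing space when possible. */
static struct chunk *
chunk_append(struct chunk *tail, const char *p, int n)
{
	struct chunk *c;

	if (n > CHUNKSZ)
		return (NULL);		/* oversized: caller must split */
	if (tail != NULL && n <= CHUNKSZ - tail->len) {
		/* case (2): copy into the space left in the last chunk */
		memcpy(tail->data + tail->len, p, n);
		tail->len += n;
		return (tail);
	}
	/* case (3): link a fresh chunk at the end of the chain */
	if ((c = calloc(1, sizeof(*c))) == NULL)
		return (NULL);
	memcpy(c->data, p, n);
	c->len = n;
	if (tail != NULL)
		tail->next = c;
	return (c);
}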
+/*
+ * Free all mbufs in a sockbuf. Check that all resources are reclaimed.
+ */
+static void
+sbflush_internal(struct sockbuf *sb)
+{
+
+ while (sb->sb_mbcnt) {
+ /*
+ * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
+ * we would loop forever. Panic instead.
+ */
+ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+ break;
+ sbdrop_internal(sb, (int)sb->sb_cc);
+ }
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
+ sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+void
+sbflush_locked(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ sbflush_internal(sb);
+}
+
+void
+sbflush(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbflush_locked(sb);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+static void
+sbdrop_internal(struct sockbuf *sb, int len)
+{
+ struct mbuf *m;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ if (sb->sb_sndptroff != 0)
+ sb->sb_sndptroff -= len;
+ if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+ sb->sb_ctl -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+ /*
+ * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure
+ * sb_lastrecord is up-to-date if we dropped part of the last record.
+ */
+ m = sb->sb_mb;
+ if (m == NULL) {
+ sb->sb_mbtail = NULL;
+ sb->sb_lastrecord = NULL;
+ } else if (m->m_nextpkt == NULL) {
+ sb->sb_lastrecord = m;
+ }
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop_locked(struct sockbuf *sb, int len)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sbdrop_internal(sb, len);
+}
+
+void
+sbdrop(struct sockbuf *sb, int len)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbdrop_locked(sb, len);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Maintain a pointer and offset pair into the socket buffer mbuf chain to
+ * avoid traversal of the entire socket buffer for larger offsets.
+ */
+struct mbuf *
+sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
+{
+ struct mbuf *m, *ret;
+
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+ KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
+ KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));
+
+ /*
+ * Is off below stored offset? Happens on retransmits.
+ * Just return, we can't help here.
+ */
+ if (sb->sb_sndptroff > off) {
+ *moff = off;
+ return (sb->sb_mb);
+ }
+
+ /* Return closest mbuf in chain for current offset. */
+ *moff = off - sb->sb_sndptroff;
+ m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
+
+ /* Advance by len to be as close as possible for the next transmit. */
+ for (off = off - sb->sb_sndptroff + len - 1;
+ off > 0 && off >= m->m_len;
+ m = m->m_next) {
+ sb->sb_sndptroff += m->m_len;
+ off -= m->m_len;
+ }
+ sb->sb_sndptr = m;
+
+ return (ret);
+}
+
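sbsndptr() above amortizes lookups by remembering the last mbuf reached and how many bytes precede it. A stand-alone sketch of the same cached pointer/offset idea follows; the node, cursor and cursor_seek names are assumptions made for this example only:

#include <stddef.h>

/* Illustrative node and cursor types, not the kernel's. */
struct node {
	struct node	*next;
	unsigned	 len;
};

struct cursor {
	struct node	*ptr;	/* last node reached */
	unsigned	 off;	/* bytes preceding ptr in the chain */
};

/*
 * Find the node containing byte 'off' and the offset within it,
 * starting from the cached position instead of the head whenever the
 * requested offset has not moved backwards (e.g. a retransmit).
 */
static struct node *
cursor_seek(struct cursor *cur, struct node *head, unsigned off,
    unsigned *moff)
{
	struct node *n;

	if (off < cur->off) {		/* behind the cache: start over */
		*moff = off;
		return (head);
	}
	n = (cur->ptr != NULL) ? cur->ptr : head;
	off -= cur->off;
	while (n != NULL && off >= n->len) {
		cur->off += n->len;
		off -= n->len;
		n = n->next;
	}
	cur->ptr = n;
	*moff = off;
	return (n);
}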
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord_locked(struct sockbuf *sb)
+{
+ struct mbuf *m;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ m = m_free(m);
+ } while (m);
+ }
+ SB_EMPTY_FIXUP(sb);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbdroprecord_locked(sb);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Create a "control" mbuf containing the specified data with the specified
+ * type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(caddr_t p, int size, int type, int level)
+{
+ struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if (CMSG_SPACE((u_int)size) > MCLBYTES)
+ return ((struct mbuf *) NULL);
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ m->m_len = 0;
+ KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
+ ("sbcreatecontrol: short mbuf"));
+ if (p != NULL)
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ m->m_len = CMSG_SPACE(size);
+ cp->cmsg_len = CMSG_LEN(size);
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
+
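For context, the ancillary data assembled by sbcreatecontrol() and appended by sbappendcontrol() is what a process eventually walks with the standard CMSG_*() macros after recvmsg(). A minimal sketch of that consuming side, assuming a connected local-domain socket carrying an SCM_RIGHTS message; error handling is mostly omitted:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <stdio.h>

/* Illustration only, not a complete program. */
static void
recv_one_fd(int sock)
{
	char data[64];
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { data, sizeof(data) };
	struct msghdr msg;
	struct cmsghdr *cm;
	int fd;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	if (recvmsg(sock, &msg, 0) < 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
			printf("received descriptor %d\n", fd);
		}
	}
}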
+/*
+ * This does the same for socket buffers that sotoxsocket does for sockets:
+ * generate a user-format data structure describing the socket buffer. Note
+ * that the xsockbuf structure, since it is always embedded in a socket, does
+ * not include a self pointer nor a length. We make this entry point public
+ * in case some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
+ &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
+SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
Index: kern_tc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_tc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_tc.c -L sys/kern/kern_tc.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -8,7 +8,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.164 2005/03/26 20:04:28 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.178 2007/06/04 18:25:07 dwmalone Exp $");
#include "opt_ntp.h"
@@ -61,7 +61,7 @@
struct timehands *th_next;
};
-extern struct timehands th0;
+static struct timehands th0;
static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
@@ -88,7 +88,7 @@
static struct timecounter *timecounters = &dummy_timecounter;
time_t time_second = 1;
-time_t time_uptime = 0;
+time_t time_uptime = 1;
static struct bintime boottimebin;
struct timeval boottime;
@@ -97,6 +97,7 @@
NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
static int timestepwarnings;
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
@@ -116,6 +117,7 @@
#undef TC_STATS
static void tc_windup(void);
+static void cpu_tick_calibrate(int);
static int
sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
@@ -131,6 +133,27 @@
#endif
return SYSCTL_OUT(req, &boottime, sizeof(boottime));
}
+
+static int
+sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
+{
+ u_int ncount;
+ struct timecounter *tc = arg1;
+
+ ncount = tc->tc_get_timecount(tc);
+ return sysctl_handle_int(oidp, &ncount, 0, req);
+}
+
+static int
+sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
+{
+ u_int64_t freq;
+ struct timecounter *tc = arg1;
+
+ freq = tc->tc_frequency;
+ return sysctl_handle_quad(oidp, &freq, 0, req);
+}
+
/*
* Return the difference between the timehands' counter value now and what
* was when we copied it to the timehands' offset_count.
@@ -307,6 +330,7 @@
tc_init(struct timecounter *tc)
{
u_int u;
+ struct sysctl_oid *tc_root;
u = tc->tc_frequency / tc->tc_counter_mask;
/* XXX: We need some margin here, 10% is a guess */
@@ -328,6 +352,24 @@
tc->tc_next = timecounters;
timecounters = tc;
/*
+ * Set up sysctl tree for this counter.
+ */
+ tc_root = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
+ CTLFLAG_RW, 0, "timecounter description");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
+ "mask for implemented bits");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_get, "IU", "current timecounter value");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "frequency", CTLTYPE_QUAD | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
+ "goodness of time counter");
+ /*
* Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
* worse since this timecounter may not be monotonous.
@@ -360,12 +402,14 @@
void
tc_setclock(struct timespec *ts)
{
- struct timespec ts2;
+ struct timespec tbef, taft;
struct bintime bt, bt2;
+ cpu_tick_calibrate(1);
nsetclock++;
- binuptime(&bt2);
+ nanotime(&tbef);
timespec2bintime(ts, &bt);
+ binuptime(&bt2);
bintime_sub(&bt, &bt2);
bintime_add(&bt2, &boottimebin);
boottimebin = bt;
@@ -373,12 +417,15 @@
/* XXX fiddle all the little crinkly bits around the fiords... */
tc_windup();
+ nanotime(&taft);
if (timestepwarnings) {
- bintime2timespec(&bt2, &ts2);
- log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld\n",
- (intmax_t)ts2.tv_sec, ts2.tv_nsec,
+ log(LOG_INFO,
+ "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
+ (intmax_t)tbef.tv_sec, tbef.tv_nsec,
+ (intmax_t)taft.tv_sec, taft.tv_nsec,
(intmax_t)ts->tv_sec, ts->tv_nsec);
}
+ cpu_tick_calibrate(1);
}
/*
@@ -475,8 +522,8 @@
* x = a * 2^32 / 10^9 = a * 4.294967296
*
* The range of th_adjustment is +/- 5000PPM so inside a 64bit int
- * we can only multiply by about 850 without overflowing, but that
- * leaves suitably precise fractions for multiply before divide.
+ * we can only multiply by about 850 without overflowing, that
+ * leaves no suitably precise fractions for multiply before divide.
*
* Divide before multiply with a fraction of 2199/512 results in a
* systematic undercompensation of 10PPM of th_adjustment. On a
@@ -749,11 +796,16 @@
tc_ticktock(void)
{
static int count;
+ static time_t last_calib;
if (++count < tc_tick)
return;
count = 0;
tc_windup();
+ if (time_uptime != last_calib && !(time_uptime & 0xf)) {
+ cpu_tick_calibrate(0);
+ last_calib = time_uptime;
+ }
}
static void
@@ -782,3 +834,147 @@
}
SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL)
+
+/* Cpu tick handling -------------------------------------------------*/
+
+static int cpu_tick_variable;
+static uint64_t cpu_tick_frequency;
+
+static uint64_t
+tc_cpu_ticks(void)
+{
+ static uint64_t base;
+ static unsigned last;
+ unsigned u;
+ struct timecounter *tc;
+
+ tc = timehands->th_counter;
+ u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+ if (u < last)
+ base += (uint64_t)tc->tc_counter_mask + 1;
+ last = u;
+ return (u + base);
+}
+
+/*
+ * This function gets called every 16 seconds on only one designated
+ * CPU in the system from hardclock() via tc_ticktock().
+ *
+ * Whenever the real time clock is stepped we get called with reset=1
+ * to make sure we handle suspend/resume and similar events correctly.
+ */
+
+static void
+cpu_tick_calibrate(int reset)
+{
+ static uint64_t c_last;
+ uint64_t c_this, c_delta;
+ static struct bintime t_last;
+ struct bintime t_this, t_delta;
+ uint32_t divi;
+
+ if (reset) {
+ /* The clock was stepped, abort & reset */
+ t_last.sec = 0;
+ return;
+ }
+
+ /* we don't calibrate fixed rate cputicks */
+ if (!cpu_tick_variable)
+ return;
+
+ getbinuptime(&t_this);
+ c_this = cpu_ticks();
+ if (t_last.sec != 0) {
+ c_delta = c_this - c_last;
+ t_delta = t_this;
+ bintime_sub(&t_delta, &t_last);
+ /*
+ * Validate that 16 +/- 1/256 seconds passed.
+ * After division by 16 this gives us a precision of
+ * roughly 250PPM which is sufficient
+ */
+ if (t_delta.sec > 16 || (
+ t_delta.sec == 16 && t_delta.frac >= (0x01LL << 56))) {
+ /* too long */
+ if (bootverbose)
+ printf("%ju.%016jx too long\n",
+ (uintmax_t)t_delta.sec,
+ (uintmax_t)t_delta.frac);
+ } else if (t_delta.sec < 15 ||
+ (t_delta.sec == 15 && t_delta.frac <= (0xffLL << 56))) {
+ /* too short */
+ if (bootverbose)
+ printf("%ju.%016jx too short\n",
+ (uintmax_t)t_delta.sec,
+ (uintmax_t)t_delta.frac);
+ } else {
+ /* just right */
+ /*
+ * Headroom:
+ * 2^(64-20) / 16[s] =
+ * 2^(44) / 16[s] =
+ * 17.592.186.044.416 / 16 =
+ * 1.099.511.627.776 [Hz]
+ */
+ divi = t_delta.sec << 20;
+ divi |= t_delta.frac >> (64 - 20);
+ c_delta <<= 20;
+ c_delta /= divi;
+ if (c_delta > cpu_tick_frequency) {
+ if (0 && bootverbose)
+ printf("cpu_tick increased to %ju Hz\n",
+ c_delta);
+ cpu_tick_frequency = c_delta;
+ }
+ }
+ }
+ c_last = c_this;
+ t_last = t_this;
+}
+
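The window check above is done in bintime binary fractions, where the frac member counts 1/2^64 of a second, so 1/256 s is 1 << 56. A small self-contained illustration of that arithmetic; frac_to_nsec() is local to this example and mirrors the usual fraction-to-nanoseconds conversion:

#include <stdint.h>
#include <stdio.h>

/* Convert a 64-bit binary fraction of a second to nanoseconds. */
static uint64_t
frac_to_nsec(uint64_t frac)
{
	/* (frac / 2^64) * 10^9, computed without 64-bit overflow */
	return (((frac >> 32) * 1000000000ULL) >> 32);
}

int
main(void)
{
	uint64_t one_256th = (uint64_t)1 << 56;	/* 1/256 of a second */

	printf("1/256 s = %ju ns\n", (uintmax_t)frac_to_nsec(one_256th));
	/* prints 3906250, i.e. ~3.9 ms of slack on the 16 s window */
	return (0);
}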
+void
+set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
+{
+
+ if (func == NULL) {
+ cpu_ticks = tc_cpu_ticks;
+ } else {
+ cpu_tick_frequency = freq;
+ cpu_tick_variable = var;
+ cpu_ticks = func;
+ }
+}
+
+uint64_t
+cpu_tickrate(void)
+{
+
+ if (cpu_ticks == tc_cpu_ticks)
+ return (tc_getfrequency());
+ return (cpu_tick_frequency);
+}
+
+/*
+ * We need to be slightly careful converting cputicks to microseconds.
+ * There is plenty of margin in 64 bits of microseconds (half a million
+ * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
+ * before divide conversion (to retain precision) we find that the
+ * margin shrinks to 1.5 hours (one millionth of 146y).
+ * With a three prong approach we never lose significant bits, no
+ * matter what the cputick rate and length of timeinterval is.
+ */
+
+uint64_t
+cputick2usec(uint64_t tick)
+{
+
+ if (tick > 18446744073709551LL) /* floor(2^64 / 1000) */
+ return (tick / (cpu_tickrate() / 1000000LL));
+ else if (tick > 18446744073709LL) /* floor(2^64 / 1000000) */
+ return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
+ else
+ return ((tick * 1000000LL) / cpu_tickrate());
+}
+
+cpu_tick_f *cpu_ticks = tc_cpu_ticks;
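To make the three-way split concrete: multiplying first preserves precision, but tick * 1000000 overflows 64 bits once tick exceeds floor(2^64 / 10^6), and tick * 1000 overflows past floor(2^64 / 10^3), hence the cascading thresholds. A quick stand-alone check of the same arithmetic, with an assumed 2.4 GHz tick rate:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t rate = 2400000000ULL;		/* assumed ticks per second */
	uint64_t tick = 60ULL * rate;		/* one minute of ticks */
	uint64_t usec;

	if (tick > 18446744073709551ULL)	/* floor(2^64 / 10^3) */
		usec = tick / (rate / 1000000ULL);
	else if (tick > 18446744073709ULL)	/* floor(2^64 / 10^6) */
		usec = (tick * 1000ULL) / (rate / 1000ULL);
	else
		usec = (tick * 1000000ULL) / rate;
	printf("%ju ticks ~ %ju us\n", (uintmax_t)tick, (uintmax_t)usec);
	return (0);
}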
Index: subr_clock.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_clock.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_clock.c -L sys/kern/subr_clock.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_clock.c
+++ sys/kern/subr_clock.c
@@ -38,23 +38,8 @@
* from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
*/
-/*
- * Helpers for time-of-day clocks. This is useful for architectures that need
- * support multiple models of such clocks, and generally serves to make the
- * code more machine-independent.
- * If the clock in question can also be used as a time counter, the driver
- * needs to initiate this.
- * This code is not yet used by all architectures.
- */
-
-/*
- * Generic routines to convert between a POSIX date
- * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
- * Derived from NetBSD arch/hp300/hp300/clock.c
- */
-
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_clock.c,v 1.6 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_clock.c,v 1.12 2007/07/23 09:42:31 dwmalone Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -64,43 +49,21 @@
#include <sys/sysctl.h>
#include <sys/timetc.h>
-/* XXX: for the CPU_* sysctl OID constants. */
-#include <machine/cpu.h>
-
-#include "clock_if.h"
-
-static __inline int leapyear(int year);
-static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS);
-
-#define FEBRUARY 2
-#define days_in_year(y) (leapyear(y) ? 366 : 365)
-#define days_in_month(y, m) \
- (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
-/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
-#define day_of_week(days) (((days) + 4) % 7)
-
-static const int month_days[12] = {
- 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
-};
-
-static device_t clock_dev = NULL;
-static long clock_res;
+static int adjkerntz; /* local offset from GMT in seconds */
+static int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */
+int disable_rtc_set; /* disable resettodr() if != 0 */
-int adjkerntz; /* local offset from GMT in seconds */
-int disable_rtc_set; /* disable resettodr() if != 0 */
-int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */
+int tz_minuteswest;
+int tz_dsttime;
/*
* These have traditionally been in machdep, but should probably be moved to
* kern.
*/
-SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
- &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
-
-SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
+SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set,
CTLFLAG_RW, &disable_rtc_set, 0, "");
-SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
+SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock,
CTLFLAG_RW, &wall_cmos_clock, 0, "");
static int
@@ -114,6 +77,28 @@
return (error);
}
+SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
+ &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
+
+/*--------------------------------------------------------------------*
+ * Generic routines to convert between a POSIX date
+ * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
+ * Derived from NetBSD arch/hp300/hp300/clock.c
+ */
+
+
+#define FEBRUARY 2
+#define days_in_year(y) (leapyear(y) ? 366 : 365)
+#define days_in_month(y, m) \
+ (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
+/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
+#define day_of_week(days) (((days) + 4) % 7)
+
+static const int month_days[12] = {
+ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+
/*
* This inline avoids some unnecessary modulo operations
* as compared with the usual macro:
@@ -166,7 +151,7 @@
days += days_in_month(year, i);
days += (ct->day - 1);
- /* Another sanity check. */
+ /* XXX Dow sanity check. Dow is not used, so should we check it? */
if (ct->dow != -1 && ct->dow != day_of_week(days))
return (EINVAL);
@@ -213,105 +198,9 @@
ct->nsec = ts->tv_nsec;
}
-void
-clock_register(device_t dev, long res)
-{
-
- if (clock_dev != NULL) {
- if (clock_res > res) {
- if (bootverbose) {
- device_printf(dev, "not installed as "
- "time-of-day clock: clock %s has higher "
- "resolution\n", device_get_name(clock_dev));
- }
- return;
- } else {
- if (bootverbose) {
- device_printf(clock_dev, "removed as "
- "time-of-day clock: clock %s has higher "
- "resolution\n", device_get_name(dev));
- }
- }
- }
- clock_dev = dev;
- clock_res = res;
- if (bootverbose) {
- device_printf(dev, "registered as a time-of-day clock "
- "(resolution %ldus)\n", res);
- }
-}
-
-/*
- * inittodr and settodr derived from the i386 versions written
- * by Christoph Robitschko <chmr at edvz.tu-graz.ac.at>, reintroduced and
- * updated by Chris Stenton <chris at gnome.co.uk> 8/10/94
- */
-
-/*
- * Initialize the time of day register, based on the time base which is, e.g.
- * from a filesystem.
- */
-void
-inittodr(time_t base)
-{
- struct timespec diff, ref, ts;
- int error;
-
- if (base) {
- ref.tv_sec = base;
- ref.tv_nsec = 0;
- tc_setclock(&ref);
- }
-
- if (clock_dev == NULL) {
- printf("warning: no time-of-day clock registered, system time "
- "will not be set accurately\n");
- return;
- }
- error = CLOCK_GETTIME(clock_dev, &ts);
- if (error != 0 && error != EINVAL) {
- printf("warning: clock_gettime failed (%d), the system time "
- "will not be set accurately\n", error);
- return;
- }
- if (error == EINVAL || ts.tv_sec < 0) {
- printf("Invalid time in real time clock.\n");
- printf("Check and reset the date immediately!\n");
- }
-
- ts.tv_sec += tz_minuteswest * 60 +
- (wall_cmos_clock ? adjkerntz : 0);
-
- if (timespeccmp(&ref, &ts, >)) {
- diff = ref;
- timespecsub(&ref, &ts);
- } else {
- diff = ts;
- timespecsub(&diff, &ref);
- }
- if (ts.tv_sec >= 2) {
- /* badly off, adjust it */
- tc_setclock(&ts);
- }
-}
-
-/*
- * Write system time back to RTC
- */
-void
-resettodr()
+int
+utc_offset(void)
{
- struct timespec ts;
- int error;
- if (disable_rtc_set || clock_dev == NULL)
- return;
-
- getnanotime(&ts);
- ts.tv_sec -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
- if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) {
- printf("warning: clock_settime failed (%d), time-of-day clock "
- "not adjusted to system time\n", error);
- return;
- }
+ return (tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0));
}
Index: vfs_default.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_default.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_default.c -L sys/kern/vfs_default.c -u -r1.2 -r1.3
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_default.c,v 1.127.2.2 2006/03/13 03:06:17 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_default.c,v 1.138 2007/05/18 13:02:13 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -86,7 +86,7 @@
.vop_kqfilter = vop_stdkqfilter,
.vop_islocked = vop_stdislocked,
.vop_lease = VOP_NULL,
- .vop_lock = vop_stdlock,
+ .vop_lock1 = vop_stdlock,
.vop_lookup = vop_nolookup,
.vop_open = VOP_NULL,
.vop_pathconf = VOP_EINVAL,
@@ -96,6 +96,7 @@
.vop_revoke = VOP_PANIC,
.vop_strategy = vop_nostrategy,
.vop_unlock = vop_stdunlock,
+ .vop_vptofh = vop_stdvptofh,
};
/*
@@ -217,6 +218,12 @@
{
switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PATH_MAX:
+ *ap->a_retval = PATH_MAX;
+ return (0);
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return (0);
@@ -246,15 +253,17 @@
*/
int
vop_stdlock(ap)
- struct vop_lock_args /* {
+ struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
struct thread *a_td;
+ char *file;
+ int line;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
- return (lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td));
+ return (_lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line));
}
/* See above. */
@@ -337,8 +346,24 @@
struct mount **a_mpp;
} */ *ap;
{
+ struct mount *mp;
- *(ap->a_mpp) = ap->a_vp->v_mount;
+ /*
+ * XXX Since this is called unlocked we may be recycled while
+ * attempting to ref the mount. If this is the case our mountpoint
+ * will be set to NULL. We only have to prevent this call from
+ * returning with a ref to an incorrect mountpoint. It is not
+ * harmful to return with a ref to our previous mountpoint.
+ */
+ mp = ap->a_vp->v_mount;
+ if (mp != NULL) {
+ vfs_ref(mp);
+ if (mp != ap->a_vp->v_mount) {
+ vfs_rel(mp);
+ mp = NULL;
+ }
+ }
+ *(ap->a_mpp) = mp;
return (0);
}
@@ -487,6 +512,12 @@
ap->a_sync, ap->a_rtvals);
}
+int
+vop_stdvptofh(struct vop_vptofh_args *ap)
+{
+ return (EOPNOTSUPP);
+}
+
/*
* vfs default ops
* used to fill the vfs function table to get reasonable default return values.
@@ -513,20 +544,11 @@
}
int
-vfs_stdvptofh (vp, fhp)
- struct vnode *vp;
- struct fid *fhp;
-{
-
- return (EOPNOTSUPP);
-}
-
-int
vfs_stdquotactl (mp, cmds, uid, arg, td)
struct mount *mp;
int cmds;
uid_t uid;
- caddr_t arg;
+ void *arg;
struct thread *td;
{
@@ -571,6 +593,7 @@
if (error)
allerror = error;
+ /* Do not turn this into vput. td is not always curthread. */
VOP_UNLOCK(vp, 0, td);
vrele(vp);
MNT_ILOCK(mp);
Index: tty_cons.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_cons.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/tty_cons.c -L sys/kern/tty_cons.c -u -r1.1.1.1 -r1.2
--- sys/kern/tty_cons.c
+++ sys/kern/tty_cons.c
@@ -35,12 +35,14 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_cons.c,v 1.131 2005/02/27 21:52:41 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_cons.c,v 1.139 2007/05/31 11:51:51 kib Exp $");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/fcntl.h>
@@ -49,6 +51,7 @@
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/reboot.h>
@@ -99,7 +102,7 @@
(cnd->cnd_vp->v_type == VBAD && !cn_devopen(cnd, td, 1)))
static dev_t cn_udev_t;
-SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD,
+SYSCTL_OPAQUE(_machdep, OID_AUTO, consdev, CTLFLAG_RD,
&cn_udev_t, sizeof cn_udev_t, "T,struct cdev *", "");
int cons_avail_mask = 0; /* Bit mask. Each registered low level console
@@ -117,10 +120,13 @@
static char *console_pausestr=
"<pause; press any key to proceed to next line or '.' to end pause mode>";
struct tty *constty; /* pointer to console "window" tty */
+static struct mtx cnputs_mtx; /* Mutex for cnputs(). */
+static int use_cnputs_mtx = 0; /* != 0 if cnputs_mtx locking reqd. */
static void constty_timeout(void *arg);
-CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+static struct consdev cons_consdev;
+DATA_SET(cons_set, cons_consdev);
SET_DECLARE(cons_set, struct consdev);
void
@@ -157,15 +163,15 @@
/*
* Initialize console, and attach to it.
*/
- cnadd(cn);
cn->cn_init(cn);
+ cnadd(cn);
}
}
if (best_cn == NULL)
return;
if ((boothowto & RB_MULTIPLE) == 0) {
- cnadd(best_cn);
best_cn->cn_init(best_cn);
+ cnadd(best_cn);
}
if (boothowto & RB_PAUSE)
console_pausing = 1;
@@ -401,7 +407,7 @@
}
snprintf(path, sizeof(path), "/dev/%s", cnd->cnd_cn->cn_name);
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
- error = vn_open(&nd, &openflag, 0, -1);
+ error = vn_open(&nd, &openflag, 0, NULL);
if (error == 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
VOP_UNLOCK(nd.ni_vp, 0, td);
@@ -505,7 +511,7 @@
* output from the "virtual" console.
*/
if (cmd == TIOCCONS && constty) {
- error = suser(td);
+ error = priv_check(td, PRIV_TTY_CONSOLE);
if (error)
return (error);
constty = NULL;
@@ -597,7 +603,10 @@
STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
cn = cnd->cnd_cn;
if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) {
- c = cn->cn_checkc(cn);
+ if (cn->cn_checkc != NULL)
+ c = cn->cn_checkc(cn);
+ else
+ c = cn->cn_getc(cn);
if (c != -1) {
return (c);
}
@@ -636,22 +645,21 @@
}
void
-cndbctl(int on)
+cnputs(char *p)
{
- struct cn_device *cnd;
- struct consdev *cn;
- static int refcount;
+ int c;
+ int unlock_reqd = 0;
- if (!on)
- refcount--;
- if (refcount == 0)
- STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
- cn = cnd->cnd_cn;
- if (cn->cn_dbctl != NULL)
- cn->cn_dbctl(cn, on);
- }
- if (on)
- refcount++;
+ if (use_cnputs_mtx) {
+ mtx_lock_spin(&cnputs_mtx);
+ unlock_reqd = 1;
+ }
+
+ while ((c = *p++) != '\0')
+ cnputc(c);
+
+ if (unlock_reqd)
+ mtx_unlock_spin(&cnputs_mtx);
}
static int consmsgbuf_size = 8192;
@@ -723,6 +731,9 @@
{
make_dev(&cn_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "console");
+
+ mtx_init(&cnputs_mtx, "cnputs_mtx", NULL, MTX_SPIN | MTX_NOWITNESS);
+ use_cnputs_mtx = 1;
}
SYSINIT(cndev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, cn_drvinit, NULL)
Index: sysv_ipc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_ipc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_ipc.c -L sys/kern/sysv_ipc.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_ipc.c
+++ sys/kern/sysv_ipc.c
@@ -1,8 +1,12 @@
/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */
/*-
* Copyright (c) 1994 Herb Peyerl <hpeyerl at novatel.ca>
+ * Copyright (c) 2006 nCircle Network Security, Inc.
* All rights reserved.
*
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -30,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_ipc.c,v 1.29 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_ipc.c,v 1.34 2007/06/12 00:11:59 rwatson Exp $");
#include "opt_sysvipc.h"
@@ -39,6 +43,7 @@
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/ipc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/ucred.h>
@@ -72,50 +77,73 @@
* Note: The MAC Framework does not require any modifications to the
* ipcperm() function, as access control checks are performed throughout the
* implementation of each primitive. Those entry point calls complement the
- * ipcperm() discertionary checks.
+ * ipcperm() discretionary checks. Unlike file system discretionary access
+ * control, the original create of an object is given the same rights as the
+ * current owner.
*/
int
-ipcperm(td, perm, mode)
- struct thread *td;
- struct ipc_perm *perm;
- int mode;
+ipcperm(struct thread *td, struct ipc_perm *perm, int acc_mode)
{
struct ucred *cred = td->td_ucred;
- int error;
+ int error, obj_mode, dac_granted, priv_granted;
- if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) {
- /*
- * For a non-create/owner, we require privilege to
- * modify the object protections. Note: some other
- * implementations permit IPC_M to be delegated to
- * unprivileged non-creator/owner uids/gids.
- */
- if (mode & IPC_M) {
- error = suser(td);
- if (error)
- return (error);
- }
- /*
- * Try to match against creator/owner group; if not, fall
- * back on other.
- */
- mode >>= 3;
- if (!groupmember(perm->gid, cred) &&
- !groupmember(perm->cgid, cred))
- mode >>= 3;
+ dac_granted = 0;
+ if (cred->cr_uid == perm->cuid || cred->cr_uid == perm->uid) {
+ obj_mode = perm->mode;
+ dac_granted |= IPC_M;
+ } else if (groupmember(perm->gid, cred) ||
+ groupmember(perm->cgid, cred)) {
+ obj_mode = perm->mode;
+ obj_mode <<= 3;
} else {
- /*
- * Always permit the creator/owner to update the object
- * protections regardless of whether the object mode
- * permits it.
- */
- if (mode & IPC_M)
- return (0);
+ obj_mode = perm->mode;
+ obj_mode <<= 6;
+ }
+
+ /*
+ * While the System V IPC permission model allows IPC_M to be
+ * granted, as part of the mode, our implementation requires
+ * privilege to administer the object if not the owner or creator.
+ */
+#if 0
+ if (obj_mode & IPC_M)
+ dac_granted |= IPC_M;
+#endif
+ if (obj_mode & IPC_R)
+ dac_granted |= IPC_R;
+ if (obj_mode & IPC_W)
+ dac_granted |= IPC_W;
+
+ /*
+ * Simple case: all required rights are granted by DAC.
+ */
+ if ((dac_granted & acc_mode) == acc_mode)
+ return (0);
+
+ /*
+ * Privilege is required to satisfy the request.
+ */
+ priv_granted = 0;
+ if ((acc_mode & IPC_M) && !(dac_granted & IPC_M)) {
+ error = priv_check(td, PRIV_IPC_ADMIN);
+ if (error == 0)
+ priv_granted |= IPC_M;
}
- if ((mode & perm->mode) != mode) {
- if (suser(td) != 0)
- return (EACCES);
+ if ((acc_mode & IPC_R) && !(dac_granted & IPC_R)) {
+ error = priv_check(td, PRIV_IPC_READ);
+ if (error == 0)
+ priv_granted |= IPC_R;
}
- return (0);
+
+ if ((acc_mode & IPC_W) && !(dac_granted & IPC_W)) {
+ error = priv_check(td, PRIV_IPC_WRITE);
+ if (error == 0)
+ priv_granted |= IPC_W;
+ }
+
+ if (((dac_granted | priv_granted) & acc_mode) == acc_mode)
+ return (0);
+ else
+ return (EACCES);
}
Index: Makefile
===================================================================
RCS file: /home/cvs/src/sys/kern/Makefile,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/Makefile -L sys/kern/Makefile -u -r1.1.1.1 -r1.2
--- sys/kern/Makefile
+++ sys/kern/Makefile
@@ -1,5 +1,5 @@
# @(#)Makefile 8.2 (Berkeley) 3/21/94
-# $FreeBSD: src/sys/kern/Makefile,v 1.11.12.1 2005/07/18 19:54:49 jhb Exp $
+# $FreeBSD: src/sys/kern/Makefile,v 1.14 2007/06/25 05:06:56 rafan Exp $
# Makefile for kernel tags files, init_sysent, etc.
@@ -11,10 +11,11 @@
sysent: init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall.mk \
../sys/sysproto.h
-init_sysent.c syscalls.c ../sys/syscall.h \
+init_sysent.c syscalls.c systrace_args.c ../sys/syscall.h \
../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master
-mv -f init_sysent.c init_sysent.c.bak
-mv -f syscalls.c syscalls.c.bak
+ -mv -f systrace_args.c systrace_args.c.bak
-mv -f ../sys/syscall.h ../sys/syscall.h.bak
-mv -f ../sys/syscall.mk ../sys/syscall.mk.bak
-mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
@@ -37,7 +38,7 @@
dev dev/scsi \
fs fs/deadfs fs/fdescfs fs/fifofs \
fs/lofs fs/nullfs fs/portalfs fs/procfs \
- fs/specfs fs/umapfs fs/unionfs \
+ fs/specfs fs/unionfs \
hp hp/dev hp/hpux \
kern libkern \
net netinet nfs scripts sys \
--- /dev/null
+++ sys/kern/uipc_mqueue.c
@@ -0,0 +1,2481 @@
+/*-
+ * Copyright (c) 2005 David Xu <davidxu at freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * POSIX message queue implementation.
+ *
+ * 1) A mqueue filesystem can be mounted, each message queue appears
+ * in mounted directory, user can change queue's permission and
+ * ownership, or remove a queue. Manually creating a file in the
+ * directory causes a message queue to be created in the kernel with
+ * default message queue attributes applied and same name used, this
+ * method is not advocated since mq_open syscall allows user to specify
+ * different attributes. Also the file system can be mounted multiple
+ * times at different mount points but shows same contents.
+ *
+ * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
+ * but directly operate on internal data structure, this allows user to
+ * use the IPC facility without having to mount mqueue file system.
+ */
+
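For reference, the userland interface this file backs is the standard mq_open()/mq_send()/mq_receive() family. A minimal, hedged usage sketch; the queue name, sizes and permissions are arbitrary and most error handling is omitted:

#include <mqueue.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 8, .mq_msgsize = 64 };
	char buf[64];
	unsigned prio;
	mqd_t mq;

	mq = mq_open("/demo", O_RDWR | O_CREAT, 0600, &attr);
	if (mq == (mqd_t)-1)
		return (1);
	mq_send(mq, "hello", strlen("hello") + 1, 0);
	if (mq_receive(mq, buf, sizeof(buf), &prio) > 0)
		printf("got \"%s\" at priority %u\n", buf, prio);
	mq_close(mq);
	mq_unlink("/demo");
	return (0);
}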
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mqueue.c,v 1.25 2007/06/12 00:11:59 rwatson Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/buf.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/mqueue.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/posix4.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sysproto.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <machine/atomic.h>
+
+/*
+ * Limits and constants
+ */
+#define MQFS_NAMELEN NAME_MAX
+#define MQFS_DELEN (8 + MQFS_NAMELEN)
+
+/* node types */
+typedef enum {
+ mqfstype_none = 0,
+ mqfstype_root,
+ mqfstype_dir,
+ mqfstype_this,
+ mqfstype_parent,
+ mqfstype_file,
+ mqfstype_symlink,
+} mqfs_type_t;
+
+struct mqfs_node;
+
+/*
+ * mqfs_info: describes a mqfs instance
+ */
+struct mqfs_info {
+ struct sx mi_lock;
+ struct mqfs_node *mi_root;
+ struct unrhdr *mi_unrhdr;
+};
+
+struct mqfs_vdata {
+ LIST_ENTRY(mqfs_vdata) mv_link;
+ struct mqfs_node *mv_node;
+ struct vnode *mv_vnode;
+ struct task mv_task;
+};
+
+/*
+ * mqfs_node: describes a node (file or directory) within a mqfs
+ */
+struct mqfs_node {
+ char mn_name[MQFS_NAMELEN+1];
+ struct mqfs_info *mn_info;
+ struct mqfs_node *mn_parent;
+ LIST_HEAD(,mqfs_node) mn_children;
+ LIST_ENTRY(mqfs_node) mn_sibling;
+ LIST_HEAD(,mqfs_vdata) mn_vnodes;
+ int mn_refcount;
+ mqfs_type_t mn_type;
+ int mn_deleted;
+ u_int32_t mn_fileno;
+ void *mn_data;
+ struct timespec mn_birth;
+ struct timespec mn_ctime;
+ struct timespec mn_atime;
+ struct timespec mn_mtime;
+ uid_t mn_uid;
+ gid_t mn_gid;
+ int mn_mode;
+};
+
+#define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node)
+#define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data))
+#define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data))
+#define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \
+ (fp)->f_data)->mn_data))
+
+TAILQ_HEAD(msgq, mqueue_msg);
+
+struct mqueue;
+
+struct mqueue_notifier {
+ LIST_ENTRY(mqueue_notifier) nt_link;
+ struct sigevent nt_sigev;
+ ksiginfo_t nt_ksi;
+ struct proc *nt_proc;
+};
+
+struct mqueue {
+ struct mtx mq_mutex;
+ int mq_flags;
+ long mq_maxmsg;
+ long mq_msgsize;
+ long mq_curmsgs;
+ long mq_totalbytes;
+ struct msgq mq_msgq;
+ int mq_receivers;
+ int mq_senders;
+ struct selinfo mq_rsel;
+ struct selinfo mq_wsel;
+ struct mqueue_notifier *mq_notifier;
+};
+
+#define MQ_RSEL 0x01
+#define MQ_WSEL 0x02
+
+struct mqueue_msg {
+ TAILQ_ENTRY(mqueue_msg) msg_link;
+ unsigned int msg_prio;
+ unsigned int msg_size;
+ /* following real data... */
+};
+
+SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
+ "POSIX real time message queue");
+
+static int default_maxmsg = 10;
+static int default_msgsize = 1024;
+
+static int maxmsg = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
+ &maxmsg, 0, "Default maximum messages in queue");
+static int maxmsgsize = 16384;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
+ &maxmsgsize, 0, "Default maximum message size");
+static int maxmq = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
+ &maxmq, 0, "maximum message queues");
+static int curmq = 0;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
+ &curmq, 0, "current message queue number");
+static int unloadable = 0;
+static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
+
+static eventhandler_tag exit_tag;
+
+/* Only one instance per-system */
+static struct mqfs_info mqfs_data;
+static uma_zone_t mqnode_zone;
+static uma_zone_t mqueue_zone;
+static uma_zone_t mvdata_zone;
+static uma_zone_t mqnoti_zone;
+static struct vop_vector mqfs_vnodeops;
+static struct fileops mqueueops;
+
+/*
+ * Directory structure construction and manipulation
+ */
+#ifdef notyet
+static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+#endif
+
+static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+static int mqfs_destroy(struct mqfs_node *mn);
+static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
+static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
+static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
+
+/*
+ * Message queue construction and manipulation
+ */
+static struct mqueue *mqueue_alloc(const struct mq_attr *attr);
+static void mqueue_free(struct mqueue *mq);
+static int mqueue_send(struct mqueue *mq, const char *msg_ptr,
+ size_t msg_len, unsigned msg_prio, int waitok,
+ const struct timespec *abs_timeout);
+static int mqueue_receive(struct mqueue *mq, char *msg_ptr,
+ size_t msg_len, unsigned *msg_prio, int waitok,
+ const struct timespec *abs_timeout);
+static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
+ int timo);
+static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
+ int timo);
+static void mqueue_send_notification(struct mqueue *mq);
+static void mqueue_fdclose(struct thread *td, int fd, struct file *fp);
+static void mq_proc_exit(void *arg, struct proc *p);
+
+/*
+ * kqueue filters
+ */
+static void filt_mqdetach(struct knote *kn);
+static int filt_mqread(struct knote *kn, long hint);
+static int filt_mqwrite(struct knote *kn, long hint);
+
+struct filterops mq_rfiltops =
+ { 1, NULL, filt_mqdetach, filt_mqread };
+struct filterops mq_wfiltops =
+ { 1, NULL, filt_mqdetach, filt_mqwrite };
+
+/*
+ * Initialize fileno bitmap
+ */
+static void
+mqfs_fileno_init(struct mqfs_info *mi)
+{
+ struct unrhdr *up;
+
+ up = new_unrhdr(1, INT_MAX, NULL);
+ mi->mi_unrhdr = up;
+}
+
+/*
+ * Tear down fileno bitmap
+ */
+static void
+mqfs_fileno_uninit(struct mqfs_info *mi)
+{
+ struct unrhdr *up;
+
+ up = mi->mi_unrhdr;
+ mi->mi_unrhdr = NULL;
+ delete_unrhdr(up);
+}
+
+/*
+ * Allocate a file number
+ */
+static void
+mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+ /* make sure our parent has a file number */
+ if (mn->mn_parent && !mn->mn_parent->mn_fileno)
+ mqfs_fileno_alloc(mi, mn->mn_parent);
+
+ switch (mn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_file:
+ case mqfstype_symlink:
+ mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
+ break;
+ case mqfstype_this:
+ KASSERT(mn->mn_parent != NULL,
+ ("mqfstype_this node has no parent"));
+ mn->mn_fileno = mn->mn_parent->mn_fileno;
+ break;
+ case mqfstype_parent:
+ KASSERT(mn->mn_parent != NULL,
+ ("mqfstype_parent node has no parent"));
+ if (mn->mn_parent == mi->mi_root) {
+ mn->mn_fileno = mn->mn_parent->mn_fileno;
+ break;
+ }
+ KASSERT(mn->mn_parent->mn_parent != NULL,
+ ("mqfstype_parent node has no grandparent"));
+ mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
+ break;
+ default:
+ KASSERT(0,
+ ("mqfs_fileno_alloc() called for unknown type node: %d",
+ mn->mn_type));
+ break;
+ }
+}
+
+/*
+ * Release a file number
+ */
+static void
+mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+ switch (mn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_file:
+ case mqfstype_symlink:
+ free_unr(mi->mi_unrhdr, mn->mn_fileno);
+ break;
+ case mqfstype_this:
+ case mqfstype_parent:
+ /* ignore these, as they don't "own" their file number */
+ break;
+ default:
+ KASSERT(0,
+ ("mqfs_fileno_free() called for unknown type node: %d",
+ mn->mn_type));
+ break;
+ }
+}
+
+static __inline struct mqfs_node *
+mqnode_alloc(void)
+{
+ return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
+}
+
+static __inline void
+mqnode_free(struct mqfs_node *node)
+{
+ uma_zfree(mqnode_zone, node);
+}
+
+static __inline void
+mqnode_addref(struct mqfs_node *node)
+{
+ atomic_fetchadd_int(&node->mn_refcount, 1);
+}
+
+static __inline void
+mqnode_release(struct mqfs_node *node)
+{
+ int old, exp;
+
+ old = atomic_fetchadd_int(&node->mn_refcount, -1);
+ if (node->mn_type == mqfstype_dir ||
+ node->mn_type == mqfstype_root)
+ exp = 3; /* include . and .. */
+ else
+ exp = 1;
+ if (old == exp)
+ mqfs_destroy(node);
+}
+
+/*
+ * Add a node to a directory
+ */
+static int
+mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
+{
+ KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
+ KASSERT(parent->mn_info != NULL,
+ ("%s(): parent has no mn_info", __func__));
+ KASSERT(parent->mn_type == mqfstype_dir ||
+ parent->mn_type == mqfstype_root,
+ ("%s(): parent is not a directory", __func__));
+
+ node->mn_info = parent->mn_info;
+ node->mn_parent = parent;
+ LIST_INIT(&node->mn_children);
+ LIST_INIT(&node->mn_vnodes);
+ LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
+ mqnode_addref(parent);
+ return (0);
+}
+
+static struct mqfs_node *
+mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
+ int nodetype)
+{
+ struct mqfs_node *node;
+
+ node = mqnode_alloc();
+ strncpy(node->mn_name, name, namelen);
+ node->mn_type = nodetype;
+ node->mn_refcount = 1;
+ getnanotime(&node->mn_birth);
+ node->mn_ctime = node->mn_atime = node->mn_mtime
+ = node->mn_birth;
+ node->mn_uid = cred->cr_uid;
+ node->mn_gid = cred->cr_gid;
+ node->mn_mode = mode;
+ return (node);
+}
+
+/*
+ * Create a file
+ */
+static struct mqfs_node *
+mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+/*
+ * Add . and .. to a directory
+ */
+static int
+mqfs_fixup_dir(struct mqfs_node *parent)
+{
+ struct mqfs_node *dir;
+
+ dir = mqnode_alloc();
+ dir->mn_name[0] = '.';
+ dir->mn_type = mqfstype_this;
+ dir->mn_refcount = 1;
+ if (mqfs_add_node(parent, dir) != 0) {
+ mqnode_free(dir);
+ return (-1);
+ }
+
+ dir = mqnode_alloc();
+ dir->mn_name[0] = dir->mn_name[1] = '.';
+ dir->mn_type = mqfstype_parent;
+ dir->mn_refcount = 1;
+
+ if (mqfs_add_node(parent, dir) != 0) {
+ mqnode_free(dir);
+ return (-1);
+ }
+
+ return (0);
+}
+
+#ifdef notyet
+
+/*
+ * Create a directory
+ */
+static struct mqfs_node *
+mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+
+ if (mqfs_fixup_dir(node) != 0) {
+ mqfs_destroy(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+/*
+ * Create a symlink
+ */
+static struct mqfs_node *
+mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+#endif
+
+/*
+ * Destroy a node or a tree of nodes
+ */
+static int
+mqfs_destroy(struct mqfs_node *node)
+{
+ struct mqfs_node *parent;
+
+ KASSERT(node != NULL,
+ ("%s(): node is NULL", __func__));
+ KASSERT(node->mn_info != NULL,
+ ("%s(): node has no mn_info", __func__));
+
+ /* destroy children */
+ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
+ while (! LIST_EMPTY(&node->mn_children))
+ mqfs_destroy(LIST_FIRST(&node->mn_children));
+
+ /* unlink from parent */
+ if ((parent = node->mn_parent) != NULL) {
+ KASSERT(parent->mn_info == node->mn_info,
+ ("%s(): parent has different mn_info", __func__));
+ LIST_REMOVE(node, mn_sibling);
+ }
+
+ if (node->mn_fileno != 0)
+ mqfs_fileno_free(node->mn_info, node);
+ if (node->mn_data != NULL)
+ mqueue_free(node->mn_data);
+ mqnode_free(node);
+ return (0);
+}
+
+/*
+ * Mount a mqfs instance
+ */
+static int
+mqfs_mount(struct mount *mp, struct thread *td)
+{
+ struct statfs *sbp;
+
+ if (mp->mnt_flag & MNT_UPDATE)
+ return (EOPNOTSUPP);
+
+ mp->mnt_data = &mqfs_data;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_LOCAL;
+ mp->mnt_kern_flag |= MNTK_MPSAFE;
+ MNT_IUNLOCK(mp);
+ vfs_getnewfsid(mp);
+
+ sbp = &mp->mnt_stat;
+ vfs_mountedfrom(mp, "mqueue");
+ sbp->f_bsize = PAGE_SIZE;
+ sbp->f_iosize = PAGE_SIZE;
+ sbp->f_blocks = 1;
+ sbp->f_bfree = 0;
+ sbp->f_bavail = 0;
+ sbp->f_files = 1;
+ sbp->f_ffree = 0;
+ return (0);
+}
+
+/*
+ * Unmount a mqfs instance
+ */
+static int
+mqfs_unmount(struct mount *mp, int mntflags, struct thread *td)
+{
+ int error;
+
+ error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td);
+ return (error);
+}
+
+/*
+ * Return a root vnode
+ */
+static int
+mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
+{
+ struct mqfs_info *mqfs;
+ int ret;
+
+ mqfs = VFSTOMQFS(mp);
+ sx_xlock(&mqfs->mi_lock);
+ ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
+ sx_xunlock(&mqfs->mi_lock);
+ return (ret);
+}
+
+/*
+ * Return filesystem stats
+ */
+static int
+mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
+{
+ /* XXX update statistics */
+ return (0);
+}
+
+/*
+ * Initialize a mqfs instance
+ */
+static int
+mqfs_init(struct vfsconf *vfc)
+{
+ struct mqfs_node *root;
+ struct mqfs_info *mi;
+
+ mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mvdata_zone = uma_zcreate("mvdata",
+ sizeof(struct mqfs_vdata), NULL, NULL, NULL,
+ NULL, UMA_ALIGN_PTR, 0);
+ mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mi = &mqfs_data;
+ sx_init(&mi->mi_lock, "mqfs lock");
+ /* set up the root directory */
+ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
+ mqfstype_root);
+ root->mn_info = mi;
+ LIST_INIT(&root->mn_children);
+ LIST_INIT(&root->mn_vnodes);
+ mi->mi_root = root;
+ mqfs_fileno_init(mi);
+ mqfs_fileno_alloc(mi, root);
+ mqfs_fixup_dir(root);
+ exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
+ EVENTHANDLER_PRI_ANY);
+ mq_fdclose = mqueue_fdclose;
+ p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
+ return (0);
+}
+
+/*
+ * Destroy a mqfs instance
+ */
+static int
+mqfs_uninit(struct vfsconf *vfc)
+{
+ struct mqfs_info *mi;
+
+ if (!unloadable)
+ return (EOPNOTSUPP);
+ EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
+ mi = &mqfs_data;
+ mqfs_destroy(mi->mi_root);
+ mi->mi_root = NULL;
+ mqfs_fileno_uninit(mi);
+ sx_destroy(&mi->mi_lock);
+ uma_zdestroy(mqnode_zone);
+ uma_zdestroy(mqueue_zone);
+ uma_zdestroy(mvdata_zone);
+ uma_zdestroy(mqnoti_zone);
+ return (0);
+}
+
+/*
+ * task routine
+ */
+static void
+do_recycle(void *context, int pending __unused)
+{
+ struct vnode *vp = (struct vnode *)context;
+
+ vrecycle(vp, curthread);
+ vdrop(vp);
+}
+
+/*
+ * Allocate a vnode
+ */
+static int
+mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
+{
+ struct mqfs_vdata *vd;
+ int error;
+
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ if (vd->mv_vnode->v_mount == mp)
+ break;
+ }
+
+ if (vd != NULL) {
+ if (vget(vd->mv_vnode, 0, curthread) == 0) {
+ *vpp = vd->mv_vnode;
+ vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE,
+ curthread);
+ return (0);
+ }
+ /* XXX if this can happen, we're in trouble */
+ }
+
+ error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp);
+ if (error)
+ return (error);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = insmntque(*vpp, mp);
+ if (error != 0) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ vd = uma_zalloc(mvdata_zone, M_WAITOK);
+ (*vpp)->v_data = vd;
+ vd->mv_vnode = *vpp;
+ vd->mv_node = pn;
+ TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
+ LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
+ mqnode_addref(pn);
+ switch (pn->mn_type) {
+ case mqfstype_root:
+ (*vpp)->v_vflag = VV_ROOT;
+ /* fall through */
+ case mqfstype_dir:
+ case mqfstype_this:
+ case mqfstype_parent:
+ (*vpp)->v_type = VDIR;
+ break;
+ case mqfstype_file:
+ (*vpp)->v_type = VREG;
+ break;
+ case mqfstype_symlink:
+ (*vpp)->v_type = VLNK;
+ break;
+ case mqfstype_none:
+ KASSERT(0, ("mqfs_allocf called for null node\n"));
+ default:
+ panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
+ }
+ return (0);
+}
+
+/*
+ * Search for a directory entry by name
+ */
+static struct mqfs_node *
+mqfs_search(struct mqfs_node *pd, const char *name, int len)
+{
+ struct mqfs_node *pn;
+
+ LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+ if (strncmp(pn->mn_name, name, len) == 0)
+ return (pn);
+ }
+ return (NULL);
+}
+
+/*
+ * Look up a file or directory.
+ */
+static int
+mqfs_lookupx(struct vop_cachedlookup_args *ap)
+{
+ struct componentname *cnp;
+ struct vnode *dvp, **vpp;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ int nameiop, flags, error, namelen;
+ char *pname;
+ struct thread *td;
+
+ cnp = ap->a_cnp;
+ vpp = ap->a_vpp;
+ dvp = ap->a_dvp;
+ pname = cnp->cn_nameptr;
+ namelen = cnp->cn_namelen;
+ td = cnp->cn_thread;
+ flags = cnp->cn_flags;
+ nameiop = cnp->cn_nameiop;
+ pd = VTON(dvp);
+ pn = NULL;
+ *vpp = NULLVP;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
+ if (error)
+ return (error);
+
+ /* shortcut: check if the name is too long */
+ if (cnp->cn_namelen >= MQFS_NAMELEN)
+ return (ENOENT);
+
+ /* self */
+ if (namelen == 1 && pname[0] == '.') {
+ if ((flags & ISLASTCN) && nameiop != LOOKUP)
+ return (EINVAL);
+ pn = pd;
+ *vpp = dvp;
+ VREF(dvp);
+ return (0);
+ }
+
+ /* parent */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if (dvp->v_vflag & VV_ROOT)
+ return (EIO);
+ if ((flags & ISLASTCN) && nameiop != LOOKUP)
+ return (EINVAL);
+ VOP_UNLOCK(dvp, 0, cnp->cn_thread);
+ KASSERT(pd->mn_parent, ("non-root directory has no parent"));
+ pn = pd->mn_parent;
+ error = mqfs_allocv(dvp->v_mount, vpp, pn);
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+ return (error);
+ }
+
+ /* named node */
+ pn = mqfs_search(pd, pname, namelen);
+
+ /* found */
+ if (pn != NULL) {
+ /* DELETE */
+ if (nameiop == DELETE && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+ if (error)
+ return (error);
+ if (*vpp == dvp) {
+ VREF(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+ }
+
+ /* allocate vnode */
+ error = mqfs_allocv(dvp->v_mount, vpp, pn);
+ if (error == 0 && cnp->cn_flags & MAKEENTRY)
+ cache_enter(dvp, *vpp, cnp);
+ return (error);
+ }
+
+ /* not found */
+
+ /* Will a new entry be created in this directory? */
+ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
+ && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+ if (error)
+ return (error);
+ cnp->cn_flags |= SAVENAME;
+ return (EJUSTRETURN);
+ }
+ return (ENOENT);
+}
+
+#if 0
+struct vop_lookup_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode lookup operation
+ */
+static int
+mqfs_lookup(struct vop_cachedlookup_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ int rc;
+
+ sx_xlock(&mqfs->mi_lock);
+ rc = mqfs_lookupx(ap);
+ sx_xunlock(&mqfs->mi_lock);
+ return (rc);
+}
+
+#if 0
+struct vop_create_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+/*
+ * vnode creation operation
+ */
+static int
+mqfs_create(struct vop_create_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct componentname *cnp = ap->a_cnp;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct mqueue *mq;
+ int error;
+
+ pd = VTON(ap->a_dvp);
+ if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+ mq = mqueue_alloc(NULL);
+ if (mq == NULL)
+ return (EAGAIN);
+ sx_xlock(&mqfs->mi_lock);
+#if 0
+ /* named node */
+ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
+ if (pn != NULL) {
+ mqueue_free(mq);
+ sx_xunlock(&mqfs->mi_lock);
+ return (EEXIST);
+ }
+#else
+ if ((cnp->cn_flags & HASBUF) == 0)
+ panic("%s: no name", __func__);
+#endif
+ pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
+ cnp->cn_cred, ap->a_vap->va_mode);
+ if (pn == NULL)
+ error = ENOSPC;
+ else {
+ error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+ if (error)
+ mqfs_destroy(pn);
+ else
+ pn->mn_data = mq;
+ }
+ sx_xunlock(&mqfs->mi_lock);
+ if (error)
+ mqueue_free(mq);
+ return (error);
+}
+
+/*
+ * Remove an entry
+ */
+static int
+do_unlink(struct mqfs_node *pn, struct ucred *ucred)
+{
+ struct mqfs_node *parent;
+ struct mqfs_vdata *vd;
+ int error = 0;
+
+ sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
+
+ if (ucred->cr_uid != pn->mn_uid &&
+ (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
+ error = EACCES;
+ else if (!pn->mn_deleted) {
+ parent = pn->mn_parent;
+ pn->mn_parent = NULL;
+ pn->mn_deleted = 1;
+ LIST_REMOVE(pn, mn_sibling);
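+ /*
+ * Purge each vnode referencing this node from the name cache and
+ * hand it to the taskqueue; recycling is deferred so that it runs
+ * without the mqfs sx lock held.
+ */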
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ cache_purge(vd->mv_vnode);
+ vhold(vd->mv_vnode);
+ taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
+ }
+ mqnode_release(pn);
+ mqnode_release(parent);
+ } else
+ error = ENOENT;
+ return (error);
+}
+
+#if 0
+struct vop_remove_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode removal operation
+ */
+static int
+mqfs_remove(struct vop_remove_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct mqfs_node *pn;
+ int error;
+
+ if (ap->a_vp->v_type == VDIR)
+ return (EPERM);
+ pn = VTON(ap->a_vp);
+ sx_xlock(&mqfs->mi_lock);
+ error = do_unlink(pn, ap->a_cnp->cn_cred);
+ sx_xunlock(&mqfs->mi_lock);
+ return (error);
+}
+
+#if 0
+struct vop_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_inactive(struct vop_inactive_args *ap)
+{
+ struct mqfs_node *pn = VTON(ap->a_vp);
+
+ if (pn->mn_deleted)
+ vrecycle(ap->a_vp, ap->a_td);
+ return (0);
+}
+
+#if 0
+struct vop_reclaim_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_reclaim(struct vop_reclaim_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
+ struct vnode *vp = ap->a_vp;
+ struct mqfs_node *pn;
+ struct mqfs_vdata *vd;
+
+ vd = vp->v_data;
+ pn = vd->mv_node;
+ sx_xlock(&mqfs->mi_lock);
+ vp->v_data = NULL;
+ LIST_REMOVE(vd, mv_link);
+ uma_zfree(mvdata_zone, vd);
+ mqnode_release(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ return (0);
+}
+
+#if 0
+struct vop_open_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ int a_fdidx;
+};
+#endif
+
+static int
+mqfs_open(struct vop_open_args *ap)
+{
+ return (0);
+}
+
+#if 0
+struct vop_close_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+#if 0
+struct vop_access_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+/*
+ * Verify permissions
+ */
+static int
+mqfs_access(struct vop_access_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr vattr;
+ int error;
+
+ error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
+ if (error)
+ return (error);
+ error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
+ vattr.va_gid, ap->a_mode, ap->a_cred, NULL);
+ return (error);
+}
+
+#if 0
+struct vop_getattr_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+/*
+ * Get file attributes
+ */
+static int
+mqfs_getattr(struct vop_getattr_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct mqfs_node *pn = VTON(vp);
+ struct vattr *vap = ap->a_vap;
+ int error = 0;
+
+ VATTR_NULL(vap);
+ vap->va_type = vp->v_type;
+ vap->va_mode = pn->mn_mode;
+ vap->va_nlink = 1;
+ vap->va_uid = pn->mn_uid;
+ vap->va_gid = pn->mn_gid;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_fileid = pn->mn_fileno;
+ vap->va_size = 0;
+ vap->va_blocksize = PAGE_SIZE;
+ vap->va_bytes = vap->va_size = 0;
+ vap->va_atime = pn->mn_atime;
+ vap->va_mtime = pn->mn_mtime;
+ vap->va_ctime = pn->mn_ctime;
+ vap->va_birthtime = pn->mn_birth;
+ vap->va_gen = 0;
+ vap->va_flags = 0;
+ vap->va_rdev = 0;
+ vap->va_bytes = 0;
+ vap->va_filerev = 0;
+ vap->va_vaflags = 0;
+ return (error);
+}
+
+#if 0
+struct vop_setattr_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+/*
+ * Set attributes
+ */
+static int
+mqfs_setattr(struct vop_setattr_args *ap)
+{
+ struct mqfs_node *pn;
+ struct vattr *vap;
+ struct vnode *vp;
+ int c, error;
+ uid_t uid;
+ gid_t gid;
+
+ vap = ap->a_vap;
+ vp = ap->a_vp;
+ if ((vap->va_type != VNON) ||
+ (vap->va_nlink != VNOVAL) ||
+ (vap->va_fsid != VNOVAL) ||
+ (vap->va_fileid != VNOVAL) ||
+ (vap->va_blocksize != VNOVAL) ||
+ (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
+ (vap->va_rdev != VNOVAL) ||
+ ((int)vap->va_bytes != VNOVAL) ||
+ (vap->va_gen != VNOVAL)) {
+ return (EINVAL);
+ }
+
+ pn = VTON(vp);
+
+ error = c = 0;
+ if (vap->va_uid == (uid_t)VNOVAL)
+ uid = pn->mn_uid;
+ else
+ uid = vap->va_uid;
+ if (vap->va_gid == (gid_t)VNOVAL)
+ gid = pn->mn_gid;
+ else
+ gid = vap->va_gid;
+
+ if (uid != pn->mn_uid || gid != pn->mn_gid) {
+ /*
+ * To modify the ownership of a file, must possess VADMIN
+ * for that file.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)))
+ return (error);
+
+ /*
+ * XXXRW: Why is there a privilege check here: shouldn't the
+ * check in VOP_ACCESS() be enough? Also, are the group bits
+ * below definitely right?
+ */
+ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
+ (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
+ (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)) != 0)
+ return (error);
+ pn->mn_uid = uid;
+ pn->mn_gid = gid;
+ c = 1;
+ }
+
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ if ((ap->a_cred->cr_uid != pn->mn_uid) &&
+ (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)))
+ return (error);
+ pn->mn_mode = vap->va_mode;
+ c = 1;
+ }
+
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ /* See the comment in ufs_vnops::ufs_setattr(). */
+ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) &&
+ ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
+ (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td))))
+ return (error);
+ if (vap->va_atime.tv_sec != VNOVAL) {
+ pn->mn_atime = vap->va_atime;
+ }
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ pn->mn_mtime = vap->va_mtime;
+ }
+ c = 1;
+ }
+ if (c) {
+ vfs_timestamp(&pn->mn_ctime);
+ }
+ return (0);
+}
+
+#if 0
+struct vop_read_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Read from a file
+ */
+static int
+mqfs_read(struct vop_read_args *ap)
+{
+ char buf[80];
+ struct vnode *vp = ap->a_vp;
+ struct uio *uio = ap->a_uio;
+ struct mqfs_node *pn;
+ struct mqueue *mq;
+ int len, error;
+
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ pn = VTON(vp);
+ mq = VTOMQ(vp);
+ snprintf(buf, sizeof(buf),
+ "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
+ mq->mq_totalbytes,
+ mq->mq_maxmsg,
+ mq->mq_curmsgs,
+ mq->mq_msgsize);
+ buf[sizeof(buf)-1] = '\0';
+ len = strlen(buf);
+ error = uiomove_frombuf(buf, len, uio);
+ return (error);
+}
+
+#if 0
+struct vop_readdir_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ u_long **a_cookies;
+};
+#endif
+
+/*
+ * Return directory entries.
+ */
+static int
+mqfs_readdir(struct vop_readdir_args *ap)
+{
+ struct vnode *vp;
+ struct mqfs_info *mi;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct dirent entry;
+ struct uio *uio;
+ int *tmp_ncookies = NULL;
+ off_t offset;
+ int error, i;
+
+ vp = ap->a_vp;
+ mi = VFSTOMQFS(vp->v_mount);
+ pd = VTON(vp);
+ uio = ap->a_uio;
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uio->uio_offset < 0)
+ return (EINVAL);
+
+ if (ap->a_ncookies != NULL) {
+ tmp_ncookies = ap->a_ncookies;
+ *ap->a_ncookies = 0;
+ ap->a_ncookies = NULL;
+ }
+
+ error = 0;
+ offset = 0;
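+ /*
+ * Walk the children, synthesizing a fixed-size dirent for each one
+ * and skipping entries that fall below the requested uio offset.
+ */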
+
+ sx_xlock(&mi->mi_lock);
+
+ LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+ entry.d_reclen = sizeof(entry);
+ if (!pn->mn_fileno)
+ mqfs_fileno_alloc(mi, pn);
+ entry.d_fileno = pn->mn_fileno;
+ for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
+ entry.d_name[i] = pn->mn_name[i];
+ entry.d_name[i] = 0;
+ entry.d_namlen = i;
+ switch (pn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_this:
+ case mqfstype_parent:
+ entry.d_type = DT_DIR;
+ break;
+ case mqfstype_file:
+ entry.d_type = DT_REG;
+ break;
+ case mqfstype_symlink:
+ entry.d_type = DT_LNK;
+ break;
+ default:
+ panic("%s has unexpected node type: %d", pn->mn_name,
+ pn->mn_type);
+ }
+ if (entry.d_reclen > uio->uio_resid)
+ break;
+ if (offset >= uio->uio_offset) {
+ error = vfs_read_dirent(ap, &entry, offset);
+ if (error)
+ break;
+ }
+ offset += entry.d_reclen;
+ }
+ sx_xunlock(&mi->mi_lock);
+
+ uio->uio_offset = offset;
+
+ if (tmp_ncookies != NULL)
+ ap->a_ncookies = tmp_ncookies;
+
+ return (error);
+}
+
+#ifdef notyet
+
+#if 0
+struct vop_mkdir_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+/*
+ * Create a directory.
+ */
+static int
+mqfs_mkdir(struct vop_mkdir_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct componentname *cnp = ap->a_cnp;
+ struct mqfs_node *pd = VTON(ap->a_dvp);
+ struct mqfs_node *pn;
+ int error;
+
+ if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+ sx_xlock(&mqfs->mi_lock);
+#if 0
+ /* named node */
+ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
+ if (pn != NULL) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (EEXIST);
+ }
+#else
+ if ((cnp->cn_flags & HASBUF) == 0)
+ panic("%s: no name", __func__);
+#endif
+ pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
+ cnp->cn_cred, ap->a_vap->va_mode);
+ if (pn == NULL)
+ error = ENOSPC;
+ else
+ error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+ sx_xunlock(&mqfs->mi_lock);
+ return (error);
+}
+
+#if 0
+struct vop_rmdir_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * Remove a directory.
+ */
+static int
+mqfs_rmdir(struct vop_rmdir_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct mqfs_node *pn = VTON(ap->a_vp);
+ struct mqfs_node *pt;
+
+ if (pn->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+
+ sx_xlock(&mqfs->mi_lock);
+ if (pn->mn_deleted) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (ENOENT);
+ }
+
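+ /* Every directory has "." and ".." children; a third child means it is not empty. */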
+ pt = LIST_FIRST(&pn->mn_children);
+ pt = LIST_NEXT(pt, mn_sibling);
+ pt = LIST_NEXT(pt, mn_sibling);
+ if (pt != NULL) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (ENOTEMPTY);
+ }
+ pt = pn->mn_parent;
+ pn->mn_parent = NULL;
+ pn->mn_deleted = 1;
+ LIST_REMOVE(pn, mn_sibling);
+ mqnode_release(pn);
+ mqnode_release(pt);
+ sx_xunlock(&mqfs->mi_lock);
+ cache_purge(ap->a_vp);
+ return (0);
+}
+
+#endif /* notyet */
+
+/*
+ * Allocate a message queue
+ */
+static struct mqueue *
+mqueue_alloc(const struct mq_attr *attr)
+{
+ struct mqueue *mq;
+
+ if (curmq >= maxmq)
+ return (NULL);
+ mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mq->mq_msgq);
+ if (attr != NULL) {
+ mq->mq_maxmsg = attr->mq_maxmsg;
+ mq->mq_msgsize = attr->mq_msgsize;
+ } else {
+ mq->mq_maxmsg = default_maxmsg;
+ mq->mq_msgsize = default_msgsize;
+ }
+ mtx_init(&mq->mq_mutex, "mqueue", NULL, MTX_DEF);
+ knlist_init(&mq->mq_rsel.si_note, &mq->mq_mutex, NULL, NULL, NULL);
+ knlist_init(&mq->mq_wsel.si_note, &mq->mq_mutex, NULL, NULL, NULL);
+ atomic_add_int(&curmq, 1);
+ return (mq);
+}
+
+/*
+ * Destroy a message queue
+ */
+static void
+mqueue_free(struct mqueue *mq)
+{
+ struct mqueue_msg *msg;
+
+ while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
+ TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
+ FREE(msg, M_MQUEUEDATA);
+ }
+
+ mtx_destroy(&mq->mq_mutex);
+ knlist_destroy(&mq->mq_rsel.si_note);
+ knlist_destroy(&mq->mq_wsel.si_note);
+ uma_zfree(mqueue_zone, mq);
+ atomic_add_int(&curmq, -1);
+}
+
+/*
+ * Load a message from user space
+ */
+static struct mqueue_msg *
+mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
+{
+ struct mqueue_msg *msg;
+ size_t len;
+ int error;
+
+ len = sizeof(struct mqueue_msg) + msg_size;
+ MALLOC(msg, struct mqueue_msg *, len, M_MQUEUEDATA, M_WAITOK);
+ error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
+ msg_size);
+ if (error) {
+ FREE(msg, M_MQUEUEDATA);
+ msg = NULL;
+ } else {
+ msg->msg_size = msg_size;
+ msg->msg_prio = msg_prio;
+ }
+ return (msg);
+}
+
+/*
+ * Save a message to user space
+ */
+static int
+mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
+{
+ int error;
+
+ error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
+ msg->msg_size);
+ if (error == 0 && msg_prio != NULL)
+ error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
+ return (error);
+}
+
+/*
+ * Free a message's memory
+ */
+static __inline void
+mqueue_freemsg(struct mqueue_msg *msg)
+{
+ FREE(msg, M_MQUEUEDATA);
+}
+
+/*
+ * Send a message. If waitok is false, the thread will not be
+ * blocked when the queue is full; otherwise the absolute
+ * timeout is honored.
+ */
+int
+mqueue_send(struct mqueue *mq, const char *msg_ptr,
+ size_t msg_len, unsigned msg_prio, int waitok,
+ const struct timespec *abs_timeout)
+{
+ struct mqueue_msg *msg;
+ struct timespec ets, ts, ts2;
+ struct timeval tv;
+ int error;
+
+ if (msg_prio >= MQ_PRIO_MAX)
+ return (EINVAL);
+ if (msg_len > mq->mq_msgsize)
+ return (EMSGSIZE);
+ msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
+ if (msg == NULL)
+ return (EFAULT);
+
+ /* O_NONBLOCK case */
+ if (!waitok) {
+ error = _mqueue_send(mq, msg, -1);
+ if (error)
+ goto bad;
+ return (0);
+ }
+
+ /* we allow a null timeout (wait forever) */
+ if (abs_timeout == NULL) {
+ error = _mqueue_send(mq, msg, 0);
+ if (error)
+ goto bad;
+ return (0);
+ }
+
+ /* send it before checking time */
+ error = _mqueue_send(mq, msg, -1);
+ if (error == 0)
+ return (0);
+
+ if (error != EAGAIN)
+ goto bad;
+
+ error = copyin(abs_timeout, &ets, sizeof(ets));
+ if (error != 0)
+ goto bad;
+ if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
+ error = EINVAL;
+ goto bad;
+ }
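+ /*
+ * Convert the absolute timeout to a relative tick count on each
+ * iteration, so a wakeup before the deadline simply retries the send.
+ */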
+ for (;;) {
+ ts2 = ets;
+ getnanotime(&ts);
+ timespecsub(&ts2, &ts);
+ if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+ error = ETIMEDOUT;
+ break;
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+ error = _mqueue_send(mq, msg, tvtohz(&tv));
+ if (error != ETIMEDOUT)
+ break;
+ }
+ if (error == 0)
+ return (0);
+bad:
+ mqueue_freemsg(msg);
+ return (error);
+}
+
+/*
+ * Common routine to send a message
+ */
+static int
+_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
+{
+ struct mqueue_msg *msg2;
+ int error = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
+ if (timo < 0) {
+ mtx_unlock(&mq->mq_mutex);
+ return (EAGAIN);
+ }
+ mq->mq_senders++;
+ error = msleep(&mq->mq_senders, &mq->mq_mutex,
+ PCATCH, "mqsend", timo);
+ mq->mq_senders--;
+ if (error == EAGAIN)
+ error = ETIMEDOUT;
+ }
+ if (mq->mq_curmsgs >= mq->mq_maxmsg) {
+ mtx_unlock(&mq->mq_mutex);
+ return (error);
+ }
+ error = 0;
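+ /*
+ * The queue is kept sorted by descending priority; messages of equal
+ * priority stay in FIFO order, so appending at the tail is the
+ * common case.
+ */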
+ if (TAILQ_EMPTY(&mq->mq_msgq)) {
+ TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
+ } else {
+ if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
+ TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
+ } else {
+ TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
+ if (msg2->msg_prio < msg->msg_prio)
+ break;
+ }
+ TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
+ }
+ }
+ mq->mq_curmsgs++;
+ mq->mq_totalbytes += msg->msg_size;
+ if (mq->mq_receivers)
+ wakeup_one(&mq->mq_receivers);
+ else if (mq->mq_notifier != NULL)
+ mqueue_send_notification(mq);
+ if (mq->mq_flags & MQ_RSEL) {
+ mq->mq_flags &= ~MQ_RSEL;
+ selwakeup(&mq->mq_rsel);
+ }
+ KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
+ mtx_unlock(&mq->mq_mutex);
+ return (0);
+}
+
+/*
+ * Send a realtime signal to the process that successfully
+ * registered itself via mq_notify.
+ */
+static void
+mqueue_send_notification(struct mqueue *mq)
+{
+ struct mqueue_notifier *nt;
+ struct proc *p;
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ nt = mq->mq_notifier;
+ if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
+ p = nt->nt_proc;
+ PROC_LOCK(p);
+ if (!KSI_ONQ(&nt->nt_ksi))
+ psignal_event(p, &nt->nt_sigev, &nt->nt_ksi);
+ PROC_UNLOCK(p);
+ }
+ mq->mq_notifier = NULL;
+}
+
+/*
+ * Receive a message. If waitok is false, the thread will not be
+ * blocked when the queue is empty; otherwise the absolute
+ * timeout is honored.
+ */
+int
+mqueue_receive(struct mqueue *mq, char *msg_ptr,
+ size_t msg_len, unsigned *msg_prio, int waitok,
+ const struct timespec *abs_timeout)
+{
+ struct mqueue_msg *msg;
+ struct timespec ets, ts, ts2;
+ struct timeval tv;
+ int error;
+
+ if (msg_len < mq->mq_msgsize)
+ return (EMSGSIZE);
+
+ /* O_NONBLOCK case */
+ if (!waitok) {
+ error = _mqueue_recv(mq, &msg, -1);
+ if (error)
+ return (error);
+ goto received;
+ }
+
+ /* we allow a null timeout (wait forever). */
+ if (abs_timeout == NULL) {
+ error = _mqueue_recv(mq, &msg, 0);
+ if (error)
+ return (error);
+ goto received;
+ }
+
+ /* try to get a message before checking time */
+ error = _mqueue_recv(mq, &msg, -1);
+ if (error == 0)
+ goto received;
+
+ if (error != EAGAIN)
+ return (error);
+
+ error = copyin(abs_timeout, &ets, sizeof(ets));
+ if (error != 0)
+ return (error);
+ if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
+ error = EINVAL;
+ return (error);
+ }
+
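+ /* As in mqueue_send(), re-derive the relative timeout on every pass. */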
+ for (;;) {
+ ts2 = ets;
+ getnanotime(&ts);
+ timespecsub(&ts2, &ts);
+ if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+ error = ETIMEDOUT;
+ return (error);
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+ error = _mqueue_recv(mq, &msg, tvtohz(&tv));
+ if (error == 0)
+ break;
+ if (error != ETIMEDOUT)
+ return (error);
+ }
+
+received:
+ error = mqueue_savemsg(msg, msg_ptr, msg_prio);
+ if (error == 0) {
+ curthread->td_retval[0] = msg->msg_size;
+ curthread->td_retval[1] = 0;
+ }
+ mqueue_freemsg(msg);
+ return (error);
+}
+
+/*
+ * Common routine to receive a message
+ */
+static int
+_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
+{
+ int error = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
+ if (timo < 0) {
+ mtx_unlock(&mq->mq_mutex);
+ return (EAGAIN);
+ }
+ mq->mq_receivers++;
+ error = msleep(&mq->mq_receivers, &mq->mq_mutex,
+ PCATCH, "mqrecv", timo);
+ mq->mq_receivers--;
+ if (error == EAGAIN)
+ error = ETIMEDOUT;
+ }
+ if (*msg != NULL) {
+ error = 0;
+ TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
+ mq->mq_curmsgs--;
+ mq->mq_totalbytes -= (*msg)->msg_size;
+ if (mq->mq_senders)
+ wakeup_one(&mq->mq_senders);
+ if (mq->mq_flags & MQ_WSEL) {
+ mq->mq_flags &= ~MQ_WSEL;
+ selwakeup(&mq->mq_wsel);
+ }
+ KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
+ }
+ if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
+ !TAILQ_EMPTY(&mq->mq_msgq)) {
+ mqueue_send_notification(mq);
+ }
+ mtx_unlock(&mq->mq_mutex);
+ return (error);
+}
+
+static __inline struct mqueue_notifier *
+notifier_alloc(void)
+{
+ return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
+}
+
+static __inline void
+notifier_free(struct mqueue_notifier *p)
+{
+ uma_zfree(mqnoti_zone, p);
+}
+
+static struct mqueue_notifier *
+notifier_search(struct proc *p, int fd)
+{
+ struct mqueue_notifier *nt;
+
+ LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
+ if (nt->nt_ksi.ksi_mqd == fd)
+ break;
+ }
+ return (nt);
+}
+
+static __inline void
+notifier_insert(struct proc *p, struct mqueue_notifier *nt)
+{
+ LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
+}
+
+static __inline void
+notifier_delete(struct proc *p, struct mqueue_notifier *nt)
+{
+ LIST_REMOVE(nt, nt_link);
+ notifier_free(nt);
+}
+
+static void
+notifier_remove(struct proc *p, struct mqueue *mq, int fd)
+{
+ struct mqueue_notifier *nt;
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ PROC_LOCK(p);
+ nt = notifier_search(p, fd);
+ if (nt != NULL) {
+ if (mq->mq_notifier == nt)
+ mq->mq_notifier = NULL;
+ sigqueue_take(&nt->nt_ksi);
+ notifier_delete(p, nt);
+ }
+ PROC_UNLOCK(p);
+}
+
+/*
+ * Syscall to open a message queue.
+ */
+int
+kmq_open(struct thread *td, struct kmq_open_args *uap)
+{
+ char path[MQFS_NAMELEN + 1];
+ struct mq_attr attr, *pattr;
+ struct mqfs_node *pn;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct mqueue *mq;
+ int fd, error, len, flags, cmode;
+
+ if ((uap->flags & O_ACCMODE) == O_ACCMODE)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ flags = FFLAGS(uap->flags);
+ cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
+ mq = NULL;
+ if ((flags & O_CREAT) && (uap->attr != NULL)) {
+ error = copyin(uap->attr, &attr, sizeof(attr));
+ if (error)
+ return (error);
+ if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg)
+ return (EINVAL);
+ if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize)
+ return (EINVAL);
+ pattr = &attr;
+ } else
+ pattr = NULL;
+
+ error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
+ if (error)
+ return (error);
+
+ /*
+ * The first character of name must be a slash (/) character
+ * and the remaining characters of name cannot include any slash
+ * characters.
+ */
+ len = strlen(path);
+ if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL)
+ return (EINVAL);
+
+ error = falloc(td, &fp, &fd);
+ if (error)
+ return (error);
+
+ sx_xlock(&mqfs_data.mi_lock);
+ pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+ if (pn == NULL) {
+ if (!(flags & O_CREAT)) {
+ error = ENOENT;
+ } else {
+ mq = mqueue_alloc(pattr);
+ if (mq == NULL) {
+ error = ENFILE;
+ } else {
+ pn = mqfs_create_file(mqfs_data.mi_root,
+ path + 1, len - 1, td->td_ucred,
+ cmode);
+ if (pn == NULL) {
+ error = ENOSPC;
+ mqueue_free(mq);
+ }
+ }
+ }
+
+ if (error == 0) {
+ pn->mn_data = mq;
+ }
+ } else {
+ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
+ error = EEXIST;
+ } else {
+ int acc_mode = 0;
+
+ if (flags & FREAD)
+ acc_mode |= VREAD;
+ if (flags & FWRITE)
+ acc_mode |= VWRITE;
+ error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
+ pn->mn_gid, acc_mode, td->td_ucred, NULL);
+ }
+ }
+
+ if (error) {
+ sx_xunlock(&mqfs_data.mi_lock);
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs_data.mi_lock);
+
+ FILE_LOCK(fp);
+ fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK));
+ fp->f_type = DTYPE_MQUEUE;
+ fp->f_data = pn;
+ fp->f_ops = &mqueueops;
+ FILE_UNLOCK(fp);
+
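+ /*
+ * Mark the descriptor close-on-exec; re-check that the slot still
+ * refers to our file in case it was closed concurrently.
+ */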
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_ofiles[fd] == fp)
+ fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+ FILEDESC_XUNLOCK(fdp);
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+ return (0);
+}
+
+/*
+ * Syscall to unlink a message queue.
+ */
+int
+kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
+{
+ char path[MQFS_NAMELEN+1];
+ struct mqfs_node *pn;
+ int error, len;
+
+ error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
+ if (error)
+ return (error);
+
+ len = strlen(path);
+ if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL)
+ return (EINVAL);
+
+ sx_xlock(&mqfs_data.mi_lock);
+ pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+ if (pn != NULL)
+ error = do_unlink(pn, td->td_ucred);
+ else
+ error = ENOENT;
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+typedef int (*_fgetf)(struct thread *, int, struct file **);
+
+/*
+ * Get the message queue referenced by the given file descriptor
+ */
+static int
+_getmq(struct thread *td, int fd, _fgetf func,
+ struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = func(td, fd, fpp);
+ if (error)
+ return (error);
+ if (&mqueueops != (*fpp)->f_ops) {
+ fdrop(*fpp, td);
+ return (EBADF);
+ }
+ pn = (*fpp)->f_data;
+ if (ppn)
+ *ppn = pn;
+ if (pmq)
+ *pmq = pn->mn_data;
+ return (0);
+}
+
+static __inline int
+getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
+ struct mqueue **pmq)
+{
+ return _getmq(td, fd, fget, fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_read(struct thread *td, int fd, struct file **fpp,
+ struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ return _getmq(td, fd, fget_read, fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_write(struct thread *td, int fd, struct file **fpp,
+ struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ return _getmq(td, fd, fget_write, fpp, ppn, pmq);
+}
+
+int
+kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct mq_attr attr, oattr;
+ int error;
+
+ if (uap->attr) {
+ error = copyin(uap->attr, &attr, sizeof(attr));
+ if (error)
+ return (error);
+ if (attr.mq_flags & ~O_NONBLOCK)
+ return (EINVAL);
+ }
+ error = getmq(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ oattr.mq_maxmsg = mq->mq_maxmsg;
+ oattr.mq_msgsize = mq->mq_msgsize;
+ oattr.mq_curmsgs = mq->mq_curmsgs;
+ FILE_LOCK(fp);
+ oattr.mq_flags = (O_NONBLOCK & fp->f_flag);
+ if (uap->attr) {
+ fp->f_flag &= ~O_NONBLOCK;
+ fp->f_flag |= (attr.mq_flags & O_NONBLOCK);
+ }
+ FILE_UNLOCK(fp);
+ fdrop(fp, td);
+ if (uap->oattr)
+ error = copyout(&oattr, uap->oattr, sizeof(oattr));
+ return (error);
+}
+
+int
+kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ int error;
+ int waitok;
+
+ error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, uap->abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ int error, waitok;
+
+ error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, uap->abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+{
+ struct sigevent ev;
+ struct filedesc *fdp;
+ struct proc *p;
+ struct mqueue *mq;
+ struct file *fp;
+ struct mqueue_notifier *nt, *newnt = NULL;
+ int error;
+
+ p = td->td_proc;
+ fdp = td->td_proc->p_fd;
+ if (uap->sigev) {
+ error = copyin(uap->sigev, &ev, sizeof(ev));
+ if (error)
+ return (error);
+ if (ev.sigev_notify != SIGEV_SIGNAL &&
+ ev.sigev_notify != SIGEV_THREAD_ID &&
+ ev.sigev_notify != SIGEV_NONE)
+ return (EINVAL);
+ if ((ev.sigev_notify == SIGEV_SIGNAL ||
+ ev.sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(ev.sigev_signo))
+ return (EINVAL);
+ }
+ error = getmq(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
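+ /*
+ * A new notifier may have to be allocated with all locks dropped;
+ * after allocating, jump back here and re-validate the descriptor.
+ */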
+again:
+ FILEDESC_SLOCK(fdp);
+ if (fget_locked(fdp, uap->mqd) != fp) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ goto out;
+ }
+ mtx_lock(&mq->mq_mutex);
+ FILEDESC_SUNLOCK(fdp);
+ if (uap->sigev != NULL) {
+ if (mq->mq_notifier != NULL) {
+ error = EBUSY;
+ } else {
+ PROC_LOCK(p);
+ nt = notifier_search(p, uap->mqd);
+ if (nt == NULL) {
+ if (newnt == NULL) {
+ PROC_UNLOCK(p);
+ mtx_unlock(&mq->mq_mutex);
+ newnt = notifier_alloc();
+ goto again;
+ }
+ }
+
+ if (nt != NULL) {
+ sigqueue_take(&nt->nt_ksi);
+ if (newnt != NULL) {
+ notifier_free(newnt);
+ newnt = NULL;
+ }
+ } else {
+ nt = newnt;
+ newnt = NULL;
+ ksiginfo_init(&nt->nt_ksi);
+ nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+ nt->nt_ksi.ksi_code = SI_MESGQ;
+ nt->nt_proc = p;
+ nt->nt_ksi.ksi_mqd = uap->mqd;
+ notifier_insert(p, nt);
+ }
+ nt->nt_sigev = ev;
+ mq->mq_notifier = nt;
+ PROC_UNLOCK(p);
+ /*
+ * If there are no receivers and the message queue is
+ * not empty, send the notification as soon as
+ * possible.
+ */
+ if (mq->mq_receivers == 0 &&
+ !TAILQ_EMPTY(&mq->mq_msgq))
+ mqueue_send_notification(mq);
+ }
+ } else {
+ notifier_remove(p, mq, uap->mqd);
+ }
+ mtx_unlock(&mq->mq_mutex);
+
+out:
+ fdrop(fp, td);
+ if (newnt != NULL)
+ notifier_free(newnt);
+ return (error);
+}
+
+static void
+mqueue_fdclose(struct thread *td, int fd, struct file *fp)
+{
+ struct filedesc *fdp;
+ struct mqueue *mq;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ if (fp->f_ops == &mqueueops) {
+ mq = FPTOMQ(fp);
+ mtx_lock(&mq->mq_mutex);
+ notifier_remove(td->td_proc, mq, fd);
+
+ /* have to wake up threads in the same process */
+ if (mq->mq_flags & MQ_RSEL) {
+ mq->mq_flags &= ~MQ_RSEL;
+ selwakeup(&mq->mq_rsel);
+ }
+ if (mq->mq_flags & MQ_WSEL) {
+ mq->mq_flags &= ~MQ_WSEL;
+ selwakeup(&mq->mq_wsel);
+ }
+ mtx_unlock(&mq->mq_mutex);
+ }
+}
+
+static void
+mq_proc_exit(void *arg __unused, struct proc *p)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct mqueue *mq;
+ int i;
+
+ fdp = p->p_fd;
+ FILEDESC_SLOCK(fdp);
+ for (i = 0; i < fdp->fd_nfiles; ++i) {
+ fp = fget_locked(fdp, i);
+ if (fp != NULL && fp->f_ops == &mqueueops) {
+ mq = FPTOMQ(fp);
+ mtx_lock(&mq->mq_mutex);
+ notifier_remove(p, FPTOMQ(fp), i);
+ mtx_unlock(&mq->mq_mutex);
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
+}
+
+static int
+mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static int
+mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static int
+mqf_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ return (ENOTTY);
+}
+
+static int
+mqf_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqueue *mq = FPTOMQ(fp);
+ int revents = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (mq->mq_curmsgs) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ } else {
+ mq->mq_flags |= MQ_RSEL;
+ selrecord(td, &mq->mq_rsel);
+ }
+ }
+ if (events & POLLOUT) {
+ if (mq->mq_curmsgs < mq->mq_maxmsg)
+ revents |= POLLOUT;
+ else {
+ mq->mq_flags |= MQ_WSEL;
+ selrecord(td, &mq->mq_wsel);
+ }
+ }
+ mtx_unlock(&mq->mq_mutex);
+ return (revents);
+}
+
+static int
+mqf_close(struct file *fp, struct thread *td)
+{
+ struct mqfs_node *pn;
+
+ fp->f_ops = &badfileops;
+ pn = fp->f_data;
+ fp->f_data = NULL;
+ sx_xlock(&mqfs_data.mi_lock);
+ mqnode_release(pn);
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (0);
+}
+
+static int
+mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn = fp->f_data;
+
+ bzero(st, sizeof *st);
+ st->st_atimespec = pn->mn_atime;
+ st->st_mtimespec = pn->mn_mtime;
+ st->st_ctimespec = pn->mn_ctime;
+ st->st_birthtimespec = pn->mn_birth;
+ st->st_uid = pn->mn_uid;
+ st->st_gid = pn->mn_gid;
+ st->st_mode = S_IFIFO | pn->mn_mode;
+ return (0);
+}
+
+static int
+mqf_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct mqueue *mq = FPTOMQ(fp);
+ int error = 0;
+
+ if (kn->kn_filter == EVFILT_READ) {
+ kn->kn_fop = &mq_rfiltops;
+ knlist_add(&mq->mq_rsel.si_note, kn, 0);
+ } else if (kn->kn_filter == EVFILT_WRITE) {
+ kn->kn_fop = &mq_wfiltops;
+ knlist_add(&mq->mq_wsel.si_note, kn, 0);
+ } else
+ error = EINVAL;
+ return (error);
+}
+
+static void
+filt_mqdetach(struct knote *kn)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ if (kn->kn_filter == EVFILT_READ)
+ knlist_remove(&mq->mq_rsel.si_note, kn, 0);
+ else if (kn->kn_filter == EVFILT_WRITE)
+ knlist_remove(&mq->mq_wsel.si_note, kn, 0);
+ else
+ panic("filt_mqdetach");
+}
+
+static int
+filt_mqread(struct knote *kn, long hint)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ return (mq->mq_curmsgs != 0);
+}
+
+static int
+filt_mqwrite(struct knote *kn, long hint)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ return (mq->mq_curmsgs < mq->mq_maxmsg);
+}
+
+static struct fileops mqueueops = {
+ .fo_read = mqf_read,
+ .fo_write = mqf_write,
+ .fo_ioctl = mqf_ioctl,
+ .fo_poll = mqf_poll,
+ .fo_kqfilter = mqf_kqfilter,
+ .fo_stat = mqf_stat,
+ .fo_close = mqf_close
+};
+
+static struct vop_vector mqfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_access = mqfs_access,
+ .vop_cachedlookup = mqfs_lookup,
+ .vop_lookup = vfs_cache_lookup,
+ .vop_reclaim = mqfs_reclaim,
+ .vop_create = mqfs_create,
+ .vop_remove = mqfs_remove,
+ .vop_inactive = mqfs_inactive,
+ .vop_open = mqfs_open,
+ .vop_close = mqfs_close,
+ .vop_getattr = mqfs_getattr,
+ .vop_setattr = mqfs_setattr,
+ .vop_read = mqfs_read,
+ .vop_write = VOP_EOPNOTSUPP,
+ .vop_readdir = mqfs_readdir,
+ .vop_mkdir = VOP_EOPNOTSUPP,
+ .vop_rmdir = VOP_EOPNOTSUPP
+};
+
+static struct vfsops mqfs_vfsops = {
+ .vfs_init = mqfs_init,
+ .vfs_uninit = mqfs_uninit,
+ .vfs_mount = mqfs_mount,
+ .vfs_unmount = mqfs_unmount,
+ .vfs_root = mqfs_root,
+ .vfs_statfs = mqfs_statfs,
+};
+
+SYSCALL_MODULE_HELPER(kmq_open);
+SYSCALL_MODULE_HELPER(kmq_setattr);
+SYSCALL_MODULE_HELPER(kmq_timedsend);
+SYSCALL_MODULE_HELPER(kmq_timedreceive);
+SYSCALL_MODULE_HELPER(kmq_notify);
+SYSCALL_MODULE_HELPER(kmq_unlink);
+
+VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC);
+MODULE_VERSION(mqueuefs, 1);
--- /dev/null
+++ sys/kern/subr_lock.c
@@ -0,0 +1,356 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables and functions used to maintain
+ * lock_object structures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/subr_lock.c,v 1.17 2007/09/14 01:12:39 attilio Exp $");
+
+#include "opt_ddb.h"
+#include "opt_mprof.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+#include <sys/linker_set.h>
+#include <sys/lock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/lock_profile.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+CTASSERT(LOCK_CLASS_MAX == 15);
+
+struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = {
+ &lock_class_mtx_spin,
+ &lock_class_mtx_sleep,
+ &lock_class_sx,
+ &lock_class_rw,
+ &lock_class_lockmgr,
+};
+
+#ifdef LOCK_PROFILING
+#include <machine/cpufunc.h>
+
+SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging");
+SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling");
+int lock_prof_enable = 0;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, enable, CTLFLAG_RW,
+ &lock_prof_enable, 0, "Enable lock profiling");
+
+/*
+ * lprof_buf is a static pool of profiling records to avoid possible
+ * reentrance of the memory allocation functions.
+ *
+ * Note: NUM_LPROF_BUFFERS must be smaller than LPROF_HASH_SIZE.
+ */
+struct lock_prof lprof_buf[LPROF_HASH_SIZE];
+static int allocated_lprof_buf;
+struct mtx lprof_locks[LPROF_LOCK_SIZE];
+
+
+/* SWAG: sbuf size = avg stat. line size * number of locks */
+#define LPROF_SBUF_SIZE 256 * 400
+
+static int lock_prof_acquisitions;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
+ &lock_prof_acquisitions, 0, "Number of lock acquisitions recorded");
+static int lock_prof_records;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, records, CTLFLAG_RD,
+ &lock_prof_records, 0, "Number of profiling records");
+static int lock_prof_maxrecords = LPROF_HASH_SIZE;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
+ &lock_prof_maxrecords, 0, "Maximum number of profiling records");
+static int lock_prof_rejected;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD,
+ &lock_prof_rejected, 0, "Number of rejected profiling records");
+static int lock_prof_hashsize = LPROF_HASH_SIZE;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, hashsize, CTLFLAG_RD,
+ &lock_prof_hashsize, 0, "Hash size");
+static int lock_prof_collisions = 0;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, collisions, CTLFLAG_RD,
+ &lock_prof_collisions, 0, "Number of hash collisions");
+
+#ifndef USE_CPU_NANOSECONDS
+u_int64_t
+nanoseconds(void)
+{
+ struct timespec tv;
+
+ nanotime(&tv);
+ return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
+}
+#endif
+
+static int
+dump_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ int error, i;
+ static int multiplier = 1;
+ const char *p;
+
+ if (allocated_lprof_buf == 0)
+ return (SYSCTL_OUT(req, "No locking recorded",
+ sizeof("No locking recorded")));
+
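+ /* If the fixed-size sbuf overflows, grow it and rebuild the report. */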
+retry_sbufops:
+ sb = sbuf_new(NULL, NULL, LPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN);
+ sbuf_printf(sb, "\n%6s %12s %12s %11s %5s %5s %12s %12s %s\n",
+ "max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name");
+ for (i = 0; i < LPROF_HASH_SIZE; ++i) {
+ if (lprof_buf[i].name == NULL)
+ continue;
+ for (p = lprof_buf[i].file;
+ p != NULL && strncmp(p, "../", 3) == 0; p += 3)
+ /* nothing */ ;
+ sbuf_printf(sb, "%6ju %12ju %12ju %11ju %5ju %5ju %12ju %12ju %s:%d (%s:%s)\n",
+ lprof_buf[i].cnt_max / 1000,
+ lprof_buf[i].cnt_tot / 1000,
+ lprof_buf[i].cnt_wait / 1000,
+ lprof_buf[i].cnt_cur,
+ lprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
+ lprof_buf[i].cnt_tot / (lprof_buf[i].cnt_cur * 1000),
+ lprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
+ lprof_buf[i].cnt_wait / (lprof_buf[i].cnt_cur * 1000),
+ lprof_buf[i].cnt_contest_holding,
+ lprof_buf[i].cnt_contest_locking,
+ p, lprof_buf[i].line,
+ lprof_buf[i].type,
+ lprof_buf[i].name);
+ if (sbuf_overflowed(sb)) {
+ sbuf_delete(sb);
+ multiplier++;
+ goto retry_sbufops;
+ }
+ }
+
+ sbuf_finish(sb);
+ error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+ sbuf_delete(sb);
+ return (error);
+}
+static int
+reset_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ if (allocated_lprof_buf == 0)
+ return (0);
+
+ v = 0;
+ error = sysctl_handle_int(oidp, &v, 0, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == 0)
+ return (0);
+
+ bzero(lprof_buf, LPROF_HASH_SIZE*sizeof(*lprof_buf));
+ allocated_lprof_buf = 0;
+ return (0);
+}
+
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics");
+
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics");
+#endif
+
+void
+lock_init(struct lock_object *lock, struct lock_class *class, const char *name,
+ const char *type, int flags)
+{
+ int i;
+
+ /* Check for double-init and zero object. */
+ KASSERT(!lock_initalized(lock), ("lock \"%s\" %p already initialized",
+ name, lock));
+
+ /* Look up lock class to find its index. */
+ for (i = 0; i < LOCK_CLASS_MAX; i++)
+ if (lock_classes[i] == class) {
+ lock->lo_flags = i << LO_CLASSSHIFT;
+ break;
+ }
+ KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class));
+
+ /* Initialize the lock object. */
+ lock->lo_name = name;
+ lock->lo_type = type != NULL ? type : name;
+ lock->lo_flags |= flags | LO_INITIALIZED;
+ LOCK_LOG_INIT(lock, 0);
+ WITNESS_INIT(lock);
+ lock_profile_object_init(lock, class, name);
+}
+
+void
+lock_destroy(struct lock_object *lock)
+{
+
+ KASSERT(lock_initalized(lock), ("lock %p is not initialized", lock));
+ lock_profile_object_destroy(lock);
+ WITNESS_DESTROY(lock);
+ LOCK_LOG_DESTROY(lock, 0);
+ lock->lo_flags &= ~LO_INITIALIZED;
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(lock, db_show_lock)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+
+ if (!have_addr)
+ return;
+ lock = (struct lock_object *)addr;
+ if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) {
+ db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock));
+ return;
+ }
+ class = LOCK_CLASS(lock);
+ db_printf(" class: %s\n", class->lc_name);
+ db_printf(" name: %s\n", lock->lo_name);
+ if (lock->lo_type && lock->lo_type != lock->lo_name)
+ db_printf(" type: %s\n", lock->lo_type);
+ class->lc_ddb_show(lock);
+}
+#endif
+
+#ifdef LOCK_PROFILING
+void _lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line)
+{
+ struct lock_profile_object *l = &lo->lo_profile_obj;
+
+ lo->lo_profile_obj.lpo_contest_holding = 0;
+
+ if (contested)
+ lo->lo_profile_obj.lpo_contest_locking++;
+
+ l->lpo_filename = file;
+ l->lpo_lineno = line;
+ l->lpo_acqtime = nanoseconds();
+ if (waittime && (l->lpo_acqtime > waittime))
+ l->lpo_waittime = l->lpo_acqtime - waittime;
+ else
+ l->lpo_waittime = 0;
+}
+
+void _lock_profile_release_lock(struct lock_object *lo)
+{
+ struct lock_profile_object *l = &lo->lo_profile_obj;
+
+ if (l->lpo_acqtime) {
+ const char *unknown = "(unknown)";
+ u_int64_t acqtime, now, waittime;
+ struct lock_prof *mpp;
+ u_int hash;
+ const char *p = l->lpo_filename;
+ int collision = 0;
+
+ now = nanoseconds();
+ acqtime = l->lpo_acqtime;
+ waittime = l->lpo_waittime;
+ if (now <= acqtime)
+ return;
+ if (p == NULL || *p == '\0')
+ p = unknown;
+ hash = (l->lpo_namehash * 31 * 31 + (uintptr_t)p * 31 + l->lpo_lineno) & LPROF_HASH_MASK;
+ mpp = &lprof_buf[hash];
+ while (mpp->name != NULL) {
+ if (mpp->line == l->lpo_lineno &&
+ mpp->file == p &&
+ mpp->namehash == l->lpo_namehash)
+ break;
+ /* If the lprof_hash entry is allocated to someone
+ * else, try the next one
+ */
+ collision = 1;
+ hash = (hash + 1) & LPROF_HASH_MASK;
+ mpp = &lprof_buf[hash];
+ }
+ if (mpp->name == NULL) {
+ int buf;
+
+ buf = atomic_fetchadd_int(&allocated_lprof_buf, 1);
+ /* Just exit if we cannot get a trace buffer */
+ if (buf >= LPROF_HASH_SIZE) {
+ ++lock_prof_rejected;
+ return;
+ }
+ mpp->file = p;
+ mpp->line = l->lpo_lineno;
+ mpp->namehash = l->lpo_namehash;
+ mpp->type = l->lpo_type;
+ mpp->name = lo->lo_name;
+
+ if (collision)
+ ++lock_prof_collisions;
+
+ /*
+ * We might have raced someone else but who cares,
+ * they'll try again next time
+ */
+ ++lock_prof_records;
+ }
+ LPROF_LOCK(hash);
+ /*
+ * Record if the lock has been held longer now than ever
+ * before.
+ */
+ if (now - acqtime > mpp->cnt_max)
+ mpp->cnt_max = now - acqtime;
+ mpp->cnt_tot += now - acqtime;
+ mpp->cnt_wait += waittime;
+ mpp->cnt_cur++;
+ /*
+ * There's a small race, really we should cmpxchg
+ * 0 with the current value, but that would bill
+ * the contention to the wrong lock instance if
+ * it followed this also.
+ */
+ mpp->cnt_contest_holding += l->lpo_contest_holding;
+ mpp->cnt_contest_locking += l->lpo_contest_locking;
+ LPROF_UNLOCK(hash);
+
+ }
+ l->lpo_acqtime = 0;
+ l->lpo_waittime = 0;
+ l->lpo_contest_locking = 0;
+ l->lpo_contest_holding = 0;
+}
+#endif
Index: kern_fork.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_fork.c -L sys/kern/kern_fork.c -u -r1.2 -r1.3
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_fork.c,v 1.252 2005/07/01 16:28:30 ssouhlal Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_fork.c,v 1.282.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
#include "opt_ktrace.h"
#include "opt_mac.h"
@@ -51,6 +51,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/resourcevar.h>
@@ -59,13 +60,15 @@
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/acct.h>
-#include <sys/mac.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/sx.h>
#include <sys/signalvar.h>
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
@@ -79,11 +82,6 @@
};
#endif
-static int forksleep; /* Place for fork1() to sleep on. */
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
fork(td, uap)
@@ -101,9 +99,6 @@
return (error);
}
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
vfork(td, uap)
@@ -121,9 +116,6 @@
return (error);
}
-/*
- * MPSAFE
- */
int
rfork(td, uap)
struct thread *td;
@@ -136,6 +128,7 @@
if ((uap->flags & RFKERNELONLY) != 0)
return (EINVAL);
+ AUDIT_ARG(fflags, uap->flags);
error = fork1(td, uap->flags, 0, &p2);
if (error == 0) {
td->td_retval[0] = p2 ? p2->p_pid : 0;
@@ -201,8 +194,8 @@
struct filedesc *fd;
struct filedesc_to_leader *fdtol;
struct thread *td2;
- struct ksegrp *kg2;
struct sigacts *newsigacts;
+ struct vmspace *vm2;
int error;
/* Can't copy and clear. */
@@ -217,8 +210,8 @@
*/
if ((flags & RFPROC) == 0) {
if ((p1->p_flag & P_HADTHREADS) &&
- (flags & (RFCFDG | RFFDG))) {
- PROC_LOCK(p1);
+ (flags & (RFCFDG | RFFDG))) {
+ PROC_LOCK(p1);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p1);
return (ERESTART);
@@ -226,7 +219,10 @@
PROC_UNLOCK(p1);
}
- vm_forkproc(td, NULL, NULL, flags);
+ error = vm_forkproc(td, NULL, NULL, NULL, flags);
+ if (error)
+ goto norfproc_fail;
+
/*
* Close all file descriptors.
*/
@@ -243,50 +239,50 @@
if (flags & RFFDG)
fdunshare(p1, td);
- if((p1->p_flag & P_HADTHREADS) &&
- (flags & (RFCFDG|RFFDG))) {
+norfproc_fail:
+ if ((p1->p_flag & P_HADTHREADS) &&
+ (flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
thread_single_end();
PROC_UNLOCK(p1);
}
*procp = NULL;
- return (0);
- }
-
- /*
- * Note 1:1 allows for forking with one thread coming out on the
- * other side with the expectation that the process is about to
- * exec.
- */
- if (p1->p_flag & P_HADTHREADS) {
- /*
- * Idle the other threads for a second.
- * Since the user space is copied, it must remain stable.
- * In addition, all threads (from the user perspective)
- * need to either be suspended or in the kernel,
- * where they will try restart in the parent and will
- * be aborted in the child.
- */
- PROC_LOCK(p1);
- if (thread_single(SINGLE_NO_EXIT)) {
- /* Abort. Someone else is single threading before us. */
- PROC_UNLOCK(p1);
- return (ERESTART);
- }
- PROC_UNLOCK(p1);
- /*
- * All other activity in this process
- * is now suspended at the user boundary,
- * (or other safe places if we think of any).
- */
+ return (error);
}
/* Allocate new proc. */
newproc = uma_zalloc(proc_zone, M_WAITOK);
+ if (TAILQ_EMPTY(&newproc->p_threads)) {
+ td2 = thread_alloc();
+ if (td2 == NULL) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ proc_linkup(newproc, td2);
+ sched_newproc(newproc, td2);
+ } else
+ td2 = FIRST_THREAD_IN_PROC(newproc);
+
+ /* Allocate and switch to an alternate kstack if specified. */
+ if (pages != 0) {
+ if (!vm_thread_new_altkstack(td2, pages)) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ }
+ if ((flags & RFMEM) == 0) {
+ vm2 = vmspace_fork(p1->p_vmspace);
+ if (vm2 == NULL) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ } else
+ vm2 = NULL;
#ifdef MAC
mac_init_proc(newproc);
#endif
knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL);
+ STAILQ_INIT(&newproc->p_ktr);
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
@@ -299,9 +295,8 @@
* processes, maxproc is the limit.
*/
sx_xlock(&allproc_lock);
- if ((nprocs >= maxproc - 10 &&
- suser_cred(td->td_ucred, SUSER_RUID) != 0) ||
- nprocs >= maxproc) {
+ if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
+ PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
error = EAGAIN;
goto fail;
}
@@ -309,14 +304,16 @@
/*
* Increment the count of procs running with this uid. Don't allow
* a nonprivileged user to exceed their current limit.
+ *
+ * XXXRW: Can we avoid privilege here if it's not needed?
*/
- error = suser_cred(td->td_ucred, SUSER_RUID|SUSER_ALLOWJAIL);
- if (error==0)
+ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
+ if (error == 0)
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
else {
PROC_LOCK(p1);
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
- lim_cur(p1, RLIMIT_NPROC));
+ lim_cur(p1, RLIMIT_NPROC));
PROC_UNLOCK(p1);
}
if (!ok) {
@@ -369,16 +366,14 @@
p2 = LIST_FIRST(&allproc);
again:
for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
-
while (p2->p_pid == trypid ||
(p2->p_pgrp != NULL &&
(p2->p_pgrp->pg_id == trypid ||
(p2->p_session != NULL &&
p2->p_session->s_sid == trypid)))) {
trypid++;
- if (trypid >= pidchecked)
+ if (trypid >= pidchecked)
goto retry;
-
}
if (p2->p_pid > trypid && pidchecked > p2->p_pid)
pidchecked = p2->p_pid;
@@ -411,10 +406,31 @@
p2 = newproc;
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
+ /*
+ * Allow the scheduler to initialize the child.
+ */
+ thread_lock(td);
+ sched_fork(td, td2);
+ thread_unlock(td);
+ AUDIT_ARG(pid, p2->p_pid);
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
sx_xunlock(&allproc_lock);
+ bcopy(&p1->p_startcopy, &p2->p_startcopy,
+ __rangeof(struct proc, p_startcopy, p_endcopy));
+ PROC_UNLOCK(p1);
+
+ bzero(&p2->p_startzero,
+ __rangeof(struct proc, p_startzero, p_endzero));
+
+ p2->p_ucred = crhold(td->td_ucred);
+ PROC_UNLOCK(p2);
+
/*
* Malloc things while we don't hold any locks.
*/
@@ -445,9 +461,9 @@
* shared process leaders.
*/
fdtol = p1->p_fdtol;
- FILEDESC_LOCK_FAST(p1->p_fd);
+ FILEDESC_XLOCK(p1->p_fd);
fdtol->fdl_refcount++;
- FILEDESC_UNLOCK_FAST(p1->p_fd);
+ FILEDESC_XUNLOCK(p1->p_fd);
} else {
/*
* Shared file descriptor table, and
@@ -463,52 +479,29 @@
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
- td2 = FIRST_THREAD_IN_PROC(p2);
- kg2 = FIRST_KSEGRP_IN_PROC(p2);
-
- /* Allocate and switch to an alternate kstack if specified. */
- if (pages != 0)
- vm_thread_new_altkstack(td2, pages);
PROC_LOCK(p2);
PROC_LOCK(p1);
- bzero(&p2->p_startzero,
- __rangeof(struct proc, p_startzero, p_endzero));
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
- bzero(&kg2->kg_startzero,
- __rangeof(struct ksegrp, kg_startzero, kg_endzero));
- bcopy(&p1->p_startcopy, &p2->p_startcopy,
- __rangeof(struct proc, p_startcopy, p_endcopy));
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
- bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy,
- __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
td2->td_sigstk = td->td_sigstk;
td2->td_sigmask = td->td_sigmask;
+ td2->td_flags = TDF_INMEM;
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
*/
- p2->p_flag = 0;
+ p2->p_flag = P_INMEM;
+ p2->p_swtick = ticks;
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
- mtx_lock_spin(&sched_lock);
- p2->p_sflag = PS_INMEM;
- /*
- * Allow the scheduler to adjust the priority of the child and
- * parent while we hold the sched_lock.
- */
- sched_fork(td, td2);
-
- mtx_unlock_spin(&sched_lock);
- p2->p_ucred = crhold(td->td_ucred);
- td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */
-
+ td2->td_ucred = crhold(p2->p_ucred);
pargs_hold(p2->p_args);
if (flags & RFSIGSHARE) {
@@ -529,7 +522,7 @@
/*
* p_limit is copy-on-write. Bump its refcount.
*/
- p2->p_limit = lim_hold(p1->p_limit);
+ lim_fork(p1, p2);
pstats_fork(p1->p_stats, p2->p_stats);
@@ -657,23 +650,23 @@
* Finish creating the child process. It will return via a different
* execution path later. (ie: directly into user mode)
*/
- vm_forkproc(td, p2, td2, flags);
+ vm_forkproc(td, p2, td2, vm2, flags);
if (flags == (RFFDG | RFPROC)) {
- atomic_add_int(&cnt.v_forks, 1);
- atomic_add_int(&cnt.v_forkpages, p2->p_vmspace->vm_dsize +
+ PCPU_INC(cnt.v_forks);
+ PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
- atomic_add_int(&cnt.v_vforks, 1);
- atomic_add_int(&cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
+ PCPU_INC(cnt.v_vforks);
+ PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (p1 == &proc0) {
- atomic_add_int(&cnt.v_kthreads, 1);
- atomic_add_int(&cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
+ PCPU_INC(cnt.v_kthreads);
+ PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else {
- atomic_add_int(&cnt.v_rforks, 1);
- atomic_add_int(&cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
+ PCPU_INC(cnt.v_rforks);
+ PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
}
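
The counter updates in this hunk move from bus-locked atomic_add_int() on the global vmmeter fields to the PCPU_INC()/PCPU_ADD() macros, which bump only the current CPU's private copy and leave the summing to whoever reads the statistics. A rough sketch of the same pattern, using a hypothetical helper that is not part of the diff:

#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/vmmeter.h>

/*
 * Hypothetical helper: charge one fork plus its page count against the
 * per-CPU vmmeter copy, as the hunk above does for v_forks/v_forkpages.
 * No atomic operation is needed because only the local CPU's counters
 * are touched; readers sum the copies from every CPU.
 */
static __inline void
fork_account(int npages)
{

	PCPU_INC(cnt.v_forks);
	PCPU_ADD(cnt.v_forkpages, npages);
}
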
@@ -688,18 +681,20 @@
* Set the child start time and mark the process as being complete.
*/
microuptime(&p2->p_stats->p_start);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p2);
p2->p_state = PRS_NORMAL;
+ PROC_SUNLOCK(p2);
/*
* If RFSTOPPED not requested, make child runnable and add to
* run queue.
*/
if ((flags & RFSTOPPED) == 0) {
+ thread_lock(td2);
TD_SET_CAN_RUN(td2);
- setrunqueue(td2, SRQ_BORING);
+ sched_add(td2, SRQ_BORING);
+ thread_unlock(td2);
}
- mtx_unlock_spin(&sched_lock);
/*
* Now can be swapped.
@@ -725,15 +720,6 @@
PROC_UNLOCK(p2);
/*
- * If other threads are waiting, let them continue now.
- */
- if (p1->p_flag & P_HADTHREADS) {
- PROC_LOCK(p1);
- thread_single_end();
- PROC_UNLOCK(p1);
- }
-
- /*
* Return child proc pointer to parent.
*/
*procp = p2;
@@ -742,18 +728,14 @@
sx_sunlock(&proctree_lock);
if (ppsratecheck(&lastfail, &curfail, 1))
printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
- td->td_ucred->cr_ruid);
+ td->td_ucred->cr_ruid);
sx_xunlock(&allproc_lock);
#ifdef MAC
mac_destroy_proc(newproc);
#endif
+fail1:
uma_zfree(proc_zone, newproc);
- if (p1->p_flag & P_HADTHREADS) {
- PROC_LOCK(p1);
- thread_single_end();
- PROC_UNLOCK(p1);
- }
- tsleep(&forksleep, PUSER, "fork", hz / 2);
+ pause("fork", hz / 2);
return (error);
}
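
The failure path above also swaps tsleep() on the dummy &forksleep address for pause(9), which sleeps on a private channel that nothing ever wakes, so the dummy wait-channel variable can be dropped. The general shape of such a retry-with-backoff loop, sketched with made-up names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/errno.h>

/*
 * Illustrative only: retry a transient failure a few times, sleeping
 * half a second between attempts.  pause() needs no wait channel and
 * cannot be woken early, unlike tsleep() on a shared dummy address.
 */
static int
retry_with_backoff(int (*attempt)(void *), void *arg)
{
	int error, tries;

	for (tries = 0; tries < 10; tries++) {
		error = (*attempt)(arg);
		if (error != EAGAIN)
			return (error);
		pause("retry", hz / 2);
	}
	return (EAGAIN);
}
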
@@ -769,33 +751,26 @@
{
struct proc *p;
struct thread *td;
+ struct thread *dtd;
- /*
- * Finish setting up thread glue so that it begins execution in a
- * non-nested critical section with sched_lock held but not recursed.
- */
td = curthread;
p = td->td_proc;
- td->td_oncpu = PCPU_GET(cpuid);
KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
- sched_lock.mtx_lock = (uintptr_t)td;
- mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)",
td, td->td_sched, p->p_pid, p->p_comm);
+ sched_fork_exit(td);
/*
- * Processes normally resume in mi_switch() after being
- * cpu_switch()'ed to, but when children start up they arrive here
- * instead, so we must do much the same things as mi_switch() would.
- */
-
- if ((td = PCPU_GET(deadthread))) {
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((dtd = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
- thread_stash(td);
+ thread_stash(dtd);
}
- td = curthread;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
/*
* cpu_set_fork_handler intercepts this function call to
@@ -809,15 +784,14 @@
* Check if a kernel thread misbehaved and returned from its main
* function.
*/
- PROC_LOCK(p);
if (p->p_flag & P_KTHREAD) {
- PROC_UNLOCK(p);
printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
p->p_comm, p->p_pid);
kthread_exit(0);
}
- PROC_UNLOCK(p);
mtx_assert(&Giant, MA_NOTOWNED);
+
+ EVENTHANDLER_INVOKE(schedtail, p);
}
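
fork_exit() now posts the schedtail event through eventhandler(9) instead of hard-wiring a hook. For anyone unfamiliar with that KPI, the declare/register/invoke pattern looks roughly like the following; the gizmo event and its handler are invented for illustration:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>

/*
 * Hypothetical event of our own, showing the three steps behind
 * EVENTHANDLER_INVOKE(schedtail, p) above: declare the event type,
 * register a handler, invoke the event.
 */
typedef void (*gizmo_added_fn)(void *arg, int id);
EVENTHANDLER_DECLARE(gizmo_added, gizmo_added_fn);

static eventhandler_tag gizmo_tag;

static void
gizmo_added_handler(void *arg, int id)
{

	printf("gizmo %d added\n", id);
}

static void
gizmo_subscribe(void)
{

	gizmo_tag = EVENTHANDLER_REGISTER(gizmo_added,
	    gizmo_added_handler, NULL, EVENTHANDLER_PRI_ANY);
}

static void
gizmo_publish(int id)
{

	/* Calls every registered handler, in priority order. */
	EVENTHANDLER_INVOKE(gizmo_added, id);
}
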
/*
@@ -832,7 +806,7 @@
struct trapframe *frame;
{
- userret(td, frame, 0);
+ userret(td, frame);
#ifdef KTRACE
if (KTRPOINT(td, KTR_SYSRET))
ktrsysret(SYS_fork, 0, 0);
--- /dev/null
+++ sys/kern/kern_rwlock.c
@@ -0,0 +1,948 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Machine independent bits of reader/writer lock implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/kern_rwlock.c,v 1.28.4.2 2007/12/01 11:28:37 attilio Exp $");
+
+#include "opt_ddb.h"
+#include "opt_no_adaptive_rwlocks.h"
+
+#include <sys/param.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+
+#include <machine/cpu.h>
+
+CTASSERT((RW_RECURSE & LO_CLASSFLAGS) == RW_RECURSE);
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
+#define ADAPTIVE_RWLOCKS
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void db_show_rwlock(struct lock_object *lock);
+#endif
+static void lock_rw(struct lock_object *lock, int how);
+static int unlock_rw(struct lock_object *lock);
+
+struct lock_class lock_class_rw = {
+ .lc_name = "rw",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
+#ifdef DDB
+ .lc_ddb_show = db_show_rwlock,
+#endif
+ .lc_lock = lock_rw,
+ .lc_unlock = unlock_rw,
+};
+
+/*
+ * Return a pointer to the owning thread if the lock is write-locked or
+ * NULL if the lock is unlocked or read-locked.
+ */
+#define rw_wowner(rw) \
+ ((rw)->rw_lock & RW_LOCK_READ ? NULL : \
+ (struct thread *)RW_OWNER((rw)->rw_lock))
+
+/*
+ * Returns whether the write owner is recursed.  Write ownership is not
+ * assured here and should be checked beforehand.
+ */
+#define rw_recursed(rw) ((rw)->rw_recurse != 0)
+
+/*
+ * Return true if curthread holds the lock.
+ */
+#define rw_wlocked(rw) (rw_wowner((rw)) == curthread)
+
+/*
+ * Return a pointer to the owning thread for this lock who should receive
+ * any priority lent by threads that block on this lock. Currently this
+ * is identical to rw_wowner().
+ */
+#define rw_owner(rw) rw_wowner(rw)
+
+#ifndef INVARIANTS
+#define _rw_assert(rw, what, file, line)
+#endif
+
+void
+lock_rw(struct lock_object *lock, int how)
+{
+ struct rwlock *rw;
+
+ rw = (struct rwlock *)lock;
+ if (how)
+ rw_wlock(rw);
+ else
+ rw_rlock(rw);
+}
+
+int
+unlock_rw(struct lock_object *lock)
+{
+ struct rwlock *rw;
+
+ rw = (struct rwlock *)lock;
+ rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
+ if (rw->rw_lock & RW_LOCK_READ) {
+ rw_runlock(rw);
+ return (0);
+ } else {
+ rw_wunlock(rw);
+ return (1);
+ }
+}
+
+void
+rw_init_flags(struct rwlock *rw, const char *name, int opts)
+{
+ int flags;
+
+ MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
+ RW_RECURSE)) == 0);
+
+ flags = LO_UPGRADABLE | LO_RECURSABLE;
+ if (opts & RW_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & RW_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ if (!(opts & RW_NOWITNESS))
+ flags |= LO_WITNESS;
+ if (opts & RW_QUIET)
+ flags |= LO_QUIET;
+ flags |= opts & RW_RECURSE;
+
+ rw->rw_lock = RW_UNLOCKED;
+ rw->rw_recurse = 0;
+ lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
+}
+
+void
+rw_destroy(struct rwlock *rw)
+{
+
+ KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock not unlocked"));
+ KASSERT(rw->rw_recurse == 0, ("rw lock still recursed"));
+ rw->rw_lock = RW_DESTROYED;
+ lock_destroy(&rw->lock_object);
+}
+
+void
+rw_sysinit(void *arg)
+{
+ struct rw_args *args = arg;
+
+ rw_init(args->ra_rw, args->ra_desc);
+}
+
+int
+rw_wowned(struct rwlock *rw)
+{
+
+ return (rw_wowner(rw) == curthread);
+}
+
+void
+_rw_wlock(struct rwlock *rw, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+ line);
+ __rw_wlock(rw, curthread, file, line);
+ LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
+ WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
+}
+
+void
+_rw_wunlock(struct rwlock *rw, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
+ _rw_assert(rw, RA_WLOCKED, file, line);
+ curthread->td_locks--;
+ WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
+ line);
+ if (!rw_recursed(rw))
+ lock_profile_release_lock(&rw->lock_object);
+ __rw_wunlock(rw, curthread, file, line);
+}
+
+void
+_rw_rlock(struct rwlock *rw, const char *file, int line)
+{
+ struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+ volatile struct thread *owner;
+#endif
+#ifdef LOCK_PROFILING_SHARED
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ uintptr_t x;
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
+ KASSERT(rw_wowner(rw) != curthread,
+ ("%s (%s): wlock already held @ %s:%d", __func__,
+ rw->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line);
+
+ /*
+ * Note that we don't make any attempt to try to block read
+ * locks once a writer has blocked on the lock. The reason is
+ * that we currently allow for read locks to recurse and we
+ * don't keep track of all the holders of read locks. Thus, if
+ * we were to block readers once a writer blocked and a reader
+ * tried to recurse on their reader lock after a writer had
+ * blocked we would end up in a deadlock since the reader would
+ * be blocked on the writer, and the writer would be blocked
+ * waiting for the reader to release its original read lock.
+ */
+ for (;;) {
+ /*
+ * Handle the easy case. If no other thread has a write
+ * lock, then try to bump up the count of read locks. Note
+ * that we have to preserve the current state of the
+ * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a
+ * read lock, then rw_lock must have changed, so restart
+ * the loop. Note that this handles the case of a
+ * completely unlocked rwlock since such a lock is encoded
+ * as a read lock with no waiters.
+ */
+ x = rw->rw_lock;
+ if (x & RW_LOCK_READ) {
+
+ /*
+ * The RW_LOCK_READ_WAITERS flag should only be set
+ * if another thread currently holds a write lock,
+ * and in that case RW_LOCK_READ should be clear.
+ */
+ MPASS((x & RW_LOCK_READ_WAITERS) == 0);
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, x,
+ x + RW_ONE_READER)) {
+#ifdef LOCK_PROFILING_SHARED
+ if (RW_READERS(x) == 0)
+ lock_profile_obtain_lock_success(
+ &rw->lock_object, contested,
+ waittime, file, line);
+#endif
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeed %p -> %p", __func__,
+ rw, (void *)x,
+ (void *)(x + RW_ONE_READER));
+ break;
+ }
+ cpu_spinwait();
+ continue;
+ }
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ owner = (struct thread *)RW_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
+ __func__, rw, owner);
+#ifdef LOCK_PROFILING_SHARED
+ lock_profile_obtain_lock_failed(&rw->lock_object,
+ &contested, &waittime);
+#endif
+ while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
+ TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ continue;
+ }
+#endif
+
+ /*
+ * Okay, now it's the hard case. Some other thread already
+ * has a write lock, so acquire the turnstile lock so we can
+ * begin the process of blocking.
+ */
+ ts = turnstile_trywait(&rw->lock_object);
+
+ /*
+ * The lock might have been released while we spun, so
+ * recheck its state and restart the loop if there is no
+ * longer a write lock.
+ */
+ x = rw->rw_lock;
+ if (x & RW_LOCK_READ) {
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the current owner of the lock is executing on another
+ * CPU quit the hard path and try to spin.
+ */
+ owner = (struct thread *)RW_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+#endif
+
+ /*
+ * Ok, it's still a write lock. If the RW_LOCK_READ_WAITERS
+ * flag is already set, then we can go ahead and block. If
+ * it is not set then try to set it. If we fail to set it
+ * drop the turnstile lock and restart the loop.
+ */
+ if (!(x & RW_LOCK_READ_WAITERS)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, x,
+ x | RW_LOCK_READ_WAITERS)) {
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set read waiters flag",
+ __func__, rw);
+ }
+
+ /*
+ * We were unable to acquire the lock and the read waiters
+ * flag is set, so we must block on the turnstile.
+ */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+ rw);
+#ifdef LOCK_PROFILING_SHARED
+ lock_profile_obtain_lock_failed(&rw->lock_object, &contested,
+ &waittime);
+#endif
+ turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+ __func__, rw);
+ }
+
+ /*
+ * TODO: acquire "owner of record" here. Here be turnstile dragons
+ * however. turnstiles don't like owners changing between calls to
+ * turnstile_wait() currently.
+ */
+
+ LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&rw->lock_object, 0, file, line);
+ curthread->td_locks++;
+}
+
+void
+_rw_runlock(struct rwlock *rw, const char *file, int line)
+{
+ struct turnstile *ts;
+ uintptr_t x;
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
+ _rw_assert(rw, RA_RLOCKED, file, line);
+ curthread->td_locks--;
+ WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
+
+ /* TODO: drop "owner of record" here. */
+
+ for (;;) {
+ /*
+ * See if there is more than one read lock held. If so,
+ * just drop one and return.
+ */
+ x = rw->rw_lock;
+ if (RW_READERS(x) > 1) {
+ if (atomic_cmpset_ptr(&rw->rw_lock, x,
+ x - RW_ONE_READER)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeeded %p -> %p",
+ __func__, rw, (void *)x,
+ (void *)(x - RW_ONE_READER));
+ break;
+ }
+ continue;
+ }
+
+
+ /*
+ * We should never have read waiters while at least one
+ * thread holds a read lock. (See note above)
+ */
+ KASSERT(!(x & RW_LOCK_READ_WAITERS),
+ ("%s: waiting readers", __func__));
+#ifdef LOCK_PROFILING_SHARED
+ lock_profile_release_lock(&rw->lock_object);
+#endif
+
+ /*
+ * If there aren't any waiters for a write lock, then try
+ * to drop it quickly.
+ */
+ if (!(x & RW_LOCK_WRITE_WAITERS)) {
+
+ /*
+ * There shouldn't be any flags set and we should
+ * be the only read lock. If we fail to release
+ * the single read lock, then another thread might
+ * have just acquired a read lock, so go back up
+ * to the multiple read locks case.
+ */
+ MPASS(x == RW_READERS_LOCK(1));
+ if (atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1),
+ RW_UNLOCKED)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded",
+ __func__, rw);
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * There should just be one reader with one or more
+ * writers waiting.
+ */
+ MPASS(x == (RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS));
+
+ /*
+ * Ok, we know we have a waiting writer and we think we
+ * are the last reader, so grab the turnstile lock.
+ */
+ turnstile_chain_lock(&rw->lock_object);
+
+ /*
+ * Try to drop our lock, leaving the lock in an unlocked
+ * state.
+ *
+ * If you wanted to do explicit lock handoff you'd have to
+ * do it here. You'd also want to use turnstile_signal()
+ * and you'd have to handle the race where a higher
+ * priority thread blocks on the write lock before the
+ * thread you wakeup actually runs and have the new thread
+ * "steal" the lock. For now it's a lot simpler to just
+ * wakeup all of the waiters.
+ *
+ * As above, if we fail, then another thread might have
+ * acquired a read lock, so drop the turnstile lock and
+ * restart.
+ */
+ if (!atomic_cmpset_ptr(&rw->rw_lock,
+ RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS, RW_UNLOCKED)) {
+ turnstile_chain_unlock(&rw->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
+ __func__, rw);
+
+ /*
+ * Ok. The lock is released and all that's left is to
+ * wake up the waiters. Note that the lock might not be
+ * free anymore, but in that case the writers will just
+ * block again if they run before the new lock holder(s)
+ * release the lock.
+ */
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+ turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+ turnstile_unpend(ts, TS_SHARED_LOCK);
+ turnstile_chain_unlock(&rw->lock_object);
+ break;
+ }
+}
+
+/*
+ * This function is called when we are unable to obtain a write lock on the
+ * first try. This means that at least one other thread holds either a
+ * read or write lock.
+ */
+void
+_rw_wlock_hard(struct rwlock *rw, uintptr_t tid, const char *file, int line)
+{
+ struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+ volatile struct thread *owner;
+#endif
+ uint64_t waittime = 0;
+ uintptr_t v;
+ int contested = 0;
+
+ if (rw_wlocked(rw)) {
+ KASSERT(rw->lock_object.lo_flags & RW_RECURSE,
+ ("%s: recursing but non-recursive rw %s @ %s:%d\n",
+ __func__, rw->lock_object.lo_name, file, line));
+ rw->rw_recurse++;
+ atomic_set_ptr(&rw->rw_lock, RW_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
+ return;
+ }
+
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+ rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
+
+ while (!_rw_write_lock(rw, tid)) {
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the lock is write locked and the owner is
+ * running on another CPU, spin until the owner stops
+ * running or the state of the lock changes.
+ */
+ v = rw->rw_lock;
+ owner = (struct thread *)RW_OWNER(v);
+ if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
+ __func__, rw, owner);
+ lock_profile_obtain_lock_failed(&rw->lock_object,
+ &contested, &waittime);
+ while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
+ TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ continue;
+ }
+#endif
+
+ ts = turnstile_trywait(&rw->lock_object);
+ v = rw->rw_lock;
+
+ /*
+ * If the lock was released while spinning on the
+ * turnstile chain lock, try again.
+ */
+ if (v == RW_UNLOCKED) {
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the current owner of the lock is executing on another
+ * CPU quit the hard path and try to spin.
+ */
+ if (!(v & RW_LOCK_READ)) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * If the lock was released by a writer with both readers
+ * and writers waiting and a reader hasn't woken up and
+ * acquired the lock yet, rw_lock will be set to the
+ * value RW_UNLOCKED | RW_LOCK_WRITE_WAITERS. If we see
+ * that value, try to acquire it once. Note that we have
+ * to preserve the RW_LOCK_WRITE_WAITERS flag as there are
+ * other writers waiting still. If we fail, restart the
+ * loop.
+ */
+ if (v == (RW_UNLOCKED | RW_LOCK_WRITE_WAITERS)) {
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock,
+ RW_UNLOCKED | RW_LOCK_WRITE_WAITERS,
+ tid | RW_LOCK_WRITE_WAITERS)) {
+ turnstile_claim(ts);
+ CTR2(KTR_LOCK, "%s: %p claimed by new writer",
+ __func__, rw);
+ break;
+ }
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+
+ /*
+ * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
+ * set it. If we fail to set it, then loop back and try
+ * again.
+ */
+ if (!(v & RW_LOCK_WRITE_WAITERS)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_WRITE_WAITERS)) {
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set write waiters flag",
+ __func__, rw);
+ }
+
+ /*
+ * We were unable to acquire the lock and the write waiters
+ * flag is set, so we must block on the turnstile.
+ */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+ rw);
+ lock_profile_obtain_lock_failed(&rw->lock_object, &contested,
+ &waittime);
+ turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+ __func__, rw);
+ }
+ lock_profile_obtain_lock_success(&rw->lock_object, contested, waittime,
+ file, line);
+}
+
+/*
+ * This function is called if the first try at releasing a write lock failed.
+ * This means that one of the 2 waiter bits must be set indicating that at
+ * least one thread is waiting on this lock.
+ */
+void
+_rw_wunlock_hard(struct rwlock *rw, uintptr_t tid, const char *file, int line)
+{
+ struct turnstile *ts;
+ uintptr_t v;
+ int queue;
+
+ if (rw_wlocked(rw) && rw_recursed(rw)) {
+ if ((--rw->rw_recurse) == 0)
+ atomic_clear_ptr(&rw->rw_lock, RW_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
+ return;
+ }
+
+ KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
+ ("%s: neither of the waiter flags are set", __func__));
+
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
+
+ turnstile_chain_lock(&rw->lock_object);
+ ts = turnstile_lookup(&rw->lock_object);
+
+ MPASS(ts != NULL);
+
+ /*
+ * Use the same algorithm as sx locks for now.  Prefer waking up shared
+ * waiters over writers if we have any.  This is probably not ideal.
+ *
+ * 'v' is the value we are going to write back to rw_lock. If we
+ * have waiters on both queues, we need to preserve the state of
+ * the waiter flag for the queue we don't wake up. For now this is
+ * hardcoded for the algorithm mentioned above.
+ *
+ * In the case of both readers and writers waiting we wakeup the
+ * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a
+ * new writer comes in before a reader it will claim the lock up
+ * above. There is probably a potential priority inversion in
+ * there that could be worked around either by waking both queues
+ * of waiters or doing some complicated lock handoff gymnastics.
+ */
+ v = RW_UNLOCKED;
+ if (rw->rw_lock & RW_LOCK_READ_WAITERS) {
+ queue = TS_SHARED_QUEUE;
+ v |= (rw->rw_lock & RW_LOCK_WRITE_WAITERS);
+ } else
+ queue = TS_EXCLUSIVE_QUEUE;
+
+ /* Wake up all waiters for the specific queue. */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
+ queue == TS_SHARED_QUEUE ? "read" : "write");
+ turnstile_broadcast(ts, queue);
+ atomic_store_rel_ptr(&rw->rw_lock, v);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&rw->lock_object);
+}
+
+/*
+ * Attempt to do a non-blocking upgrade from a read lock to a write
+ * lock. This will only succeed if this thread holds a single read
+ * lock. Returns true if the upgrade succeeded and false otherwise.
+ */
+int
+_rw_try_upgrade(struct rwlock *rw, const char *file, int line)
+{
+ uintptr_t v, tid;
+ struct turnstile *ts;
+ int success;
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
+ _rw_assert(rw, RA_RLOCKED, file, line);
+
+ /*
+ * Attempt to switch from one reader to a writer. If there
+ * are any write waiters, then we will have to lock the
+ * turnstile first to prevent races with another writer
+ * calling turnstile_wait() before we have claimed this
+ * turnstile. So, do the simple case of no waiters first.
+ */
+ tid = (uintptr_t)curthread;
+ if (!(rw->rw_lock & RW_LOCK_WRITE_WAITERS)) {
+ success = atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1),
+ tid);
+ goto out;
+ }
+
+ /*
+ * Ok, we think we have write waiters, so lock the
+ * turnstile.
+ */
+ ts = turnstile_trywait(&rw->lock_object);
+
+ /*
+ * Try to switch from one reader to a writer again. This time
+ * we honor the current state of the RW_LOCK_WRITE_WAITERS
+ * flag. If we obtain the lock with the flag set, then claim
+ * ownership of the turnstile.
+ */
+ v = rw->rw_lock & RW_LOCK_WRITE_WAITERS;
+ success = atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
+ tid | v);
+ if (success && v)
+ turnstile_claim(ts);
+ else
+ turnstile_cancel(ts);
+out:
+ LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
+ if (success)
+ WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ return (success);
+}
+
+/*
+ * Downgrade a write lock into a single read lock.
+ */
+void
+_rw_downgrade(struct rwlock *rw, const char *file, int line)
+{
+ struct turnstile *ts;
+ uintptr_t tid, v;
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
+ _rw_assert(rw, RA_WLOCKED | RA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+ if (rw_recursed(rw))
+ panic("downgrade of a recursed lock");
+#endif
+
+ WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
+
+ /*
+ * Convert from a writer to a single reader. First we handle
+ * the easy case with no waiters. If there are any waiters, we
+ * lock the turnstile, "disown" the lock, and awaken any read
+ * waiters.
+ */
+ tid = (uintptr_t)curthread;
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
+ goto out;
+
+ /*
+ * Ok, we think we have waiters, so lock the turnstile so we can
+ * read the waiter flags without any races.
+ */
+ turnstile_chain_lock(&rw->lock_object);
+ v = rw->rw_lock;
+ MPASS(v & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS));
+
+ /*
+ * Downgrade from a write lock while preserving
+ * RW_LOCK_WRITE_WAITERS and give up ownership of the
+ * turnstile. If there are any read waiters, wake them up.
+ */
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+ if (v & RW_LOCK_READ_WAITERS)
+ turnstile_broadcast(ts, TS_SHARED_QUEUE);
+ atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) |
+ (v & RW_LOCK_WRITE_WAITERS));
+ if (v & RW_LOCK_READ_WAITERS)
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ else if (ts)
+ turnstile_disown(ts);
+ turnstile_chain_unlock(&rw->lock_object);
+out:
+ LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
+}
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _rw_assert
+#endif
+
+/*
+ * In the non-WITNESS case, rw_assert() can only detect that at least
+ * *some* thread owns an rlock, but it cannot guarantee that *this*
+ * thread owns an rlock.
+ */
+void
+_rw_assert(struct rwlock *rw, int what, const char *file, int line)
+{
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case RA_LOCKED:
+ case RA_LOCKED | RA_RECURSED:
+ case RA_LOCKED | RA_NOTRECURSED:
+ case RA_RLOCKED:
+#ifdef WITNESS
+ witness_assert(&rw->lock_object, what, file, line);
+#else
+ /*
+ * If some other thread has a write lock or we have one
+ * and are asserting a read lock, fail. Also, if no one
+ * has a lock at all, fail.
+ */
+ if (rw->rw_lock == RW_UNLOCKED ||
+ (!(rw->rw_lock & RW_LOCK_READ) && (what == RA_RLOCKED ||
+ rw_wowner(rw) != curthread)))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ rw->lock_object.lo_name, (what == RA_RLOCKED) ?
+ "read " : "", file, line);
+
+ if (!(rw->rw_lock & RW_LOCK_READ)) {
+ if (rw_recursed(rw)) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file,
+ line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ }
+#endif
+ break;
+ case RA_WLOCKED:
+ case RA_WLOCKED | RA_RECURSED:
+ case RA_WLOCKED | RA_NOTRECURSED:
+ if (rw_wowner(rw) != curthread)
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ if (rw_recursed(rw)) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ break;
+ case RA_UNLOCKED:
+#ifdef WITNESS
+ witness_assert(&rw->lock_object, what, file, line);
+#else
+ /*
+ * If we hold a write lock fail. We can't reliably check
+ * to see if we hold a read lock or not.
+ */
+ if (rw_wowner(rw) == curthread)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+#endif
+ break;
+ default:
+ panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+void
+db_show_rwlock(struct lock_object *lock)
+{
+ struct rwlock *rw;
+ struct thread *td;
+
+ rw = (struct rwlock *)lock;
+
+ db_printf(" state: ");
+ if (rw->rw_lock == RW_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (rw->rw_lock == RW_DESTROYED) {
+ db_printf("DESTROYED\n");
+ return;
+ } else if (rw->rw_lock & RW_LOCK_READ)
+ db_printf("RLOCK: %ju locks\n",
+ (uintmax_t)(RW_READERS(rw->rw_lock)));
+ else {
+ td = rw_wowner(rw);
+ db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm);
+ if (rw_recursed(rw))
+ db_printf(" recursed: %u\n", rw->rw_recurse);
+ }
+ db_printf(" waiters: ");
+ switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
+ case RW_LOCK_READ_WAITERS:
+ db_printf("readers\n");
+ break;
+ case RW_LOCK_WRITE_WAITERS:
+ db_printf("writers\n");
+ break;
+ case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
+ db_printf("readers and writers\n");
+ break;
+ default:
+ db_printf("none\n");
+ break;
+ }
+}
+
+#endif
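
That is the entire machine-independent half of the new rwlock(9) primitive; consumers reach it through the rw_*() macros in <sys/rwlock.h>. As a reminder of how the pieces above (rw_init_flags(), rw_rlock()/rw_wlock(), rw_try_upgrade(), rw_downgrade()) fit together, here is a minimal illustrative consumer; the foo_* names and the list are invented and not part of the commit. Note that rwlocks are not sleepable, so nothing that may sleep (such as an M_WAITOK allocation) can be done while one is held.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/queue.h>

static MALLOC_DEFINE(M_FOO, "foo", "rwlock example entries");

struct foo {
	LIST_ENTRY(foo)	f_link;
	int		f_key;
};

static LIST_HEAD(, foo) foo_list = LIST_HEAD_INITIALIZER(foo_list);
static struct rwlock foo_lock;

static void
foo_init(void)
{

	/* Must run once before first use, e.g. from a SYSINIT. */
	rw_init_flags(&foo_lock, "foo list", 0);
}

static struct foo *
foo_find_locked(int key)
{
	struct foo *fp;

	rw_assert(&foo_lock, RA_LOCKED);
	LIST_FOREACH(fp, &foo_list, f_link)
		if (fp->f_key == key)
			return (fp);
	return (NULL);
}

/* Look an entry up under the shared lock, creating it on a miss. */
static struct foo *
foo_lookup_or_create(int key)
{
	struct foo *fp;

	rw_rlock(&foo_lock);
	fp = foo_find_locked(key);
	if (fp != NULL) {
		rw_runlock(&foo_lock);
		return (fp);
	}
	/*
	 * Miss: try to upgrade in place.  This succeeds only if we are
	 * the sole reader; otherwise drop the lock, take it exclusively
	 * and repeat the lookup, since another thread may have inserted
	 * the entry while the lock was released.
	 */
	if (!rw_try_upgrade(&foo_lock)) {
		rw_runlock(&foo_lock);
		rw_wlock(&foo_lock);
		fp = foo_find_locked(key);
	}
	if (fp == NULL) {
		/* M_NOWAIT: we may not sleep while holding the rwlock. */
		fp = malloc(sizeof(*fp), M_FOO, M_NOWAIT | M_ZERO);
		if (fp == NULL) {
			rw_wunlock(&foo_lock);
			return (NULL);
		}
		fp->f_key = key;
		LIST_INSERT_HEAD(&foo_list, fp, f_link);
	}
	/* Keep using the entry without excluding other readers. */
	rw_downgrade(&foo_lock);
	/* ... read fields of fp under the shared lock ... */
	rw_runlock(&foo_lock);
	return (fp);
}
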
Index: sys_socket.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sys_socket.c -L sys/kern/sys_socket.c -u -r1.1.1.1 -r1.2
--- sys/kern/sys_socket.c
+++ sys/kern/sys_socket.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_socket.c,v 1.69 2005/04/16 18:46:28 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_socket.c,v 1.73 2007/08/06 14:26:00 rwatson Exp $");
#include "opt_mac.h"
@@ -38,7 +38,6 @@
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
-#include <sys/mac.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sigio.h>
@@ -55,7 +54,9 @@
#include <net/if.h>
#include <net/route.h>
-struct fileops socketops = {
+#include <security/mac/mac_framework.h>
+
+struct fileops socketops = {
.fo_read = soo_read,
.fo_write = soo_write,
.fo_ioctl = soo_ioctl,
@@ -68,78 +69,54 @@
/* ARGSUSED */
int
-soo_read(fp, uio, active_cred, flags, td)
- struct file *fp;
- struct uio *uio;
- struct ucred *active_cred;
- struct thread *td;
- int flags;
+soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
{
struct socket *so = fp->f_data;
+#ifdef MAC
int error;
- NET_LOCK_GIANT();
-#ifdef MAC
SOCK_LOCK(so);
error = mac_check_socket_receive(active_cred, so);
SOCK_UNLOCK(so);
- if (error) {
- NET_UNLOCK_GIANT();
+ if (error)
return (error);
- }
#endif
- error = so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0);
- NET_UNLOCK_GIANT();
- return (error);
+ return (soreceive(so, 0, uio, 0, 0, 0));
}
/* ARGSUSED */
int
-soo_write(fp, uio, active_cred, flags, td)
- struct file *fp;
- struct uio *uio;
- struct ucred *active_cred;
- struct thread *td;
- int flags;
+soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
{
struct socket *so = fp->f_data;
int error;
- NET_LOCK_GIANT();
#ifdef MAC
SOCK_LOCK(so);
error = mac_check_socket_send(active_cred, so);
SOCK_UNLOCK(so);
- if (error) {
- NET_UNLOCK_GIANT();
+ if (error)
return (error);
- }
#endif
- error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0,
- uio->uio_td);
+ error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
PROC_LOCK(uio->uio_td->td_proc);
psignal(uio->uio_td->td_proc, SIGPIPE);
PROC_UNLOCK(uio->uio_td->td_proc);
}
- NET_UNLOCK_GIANT();
return (error);
}
int
-soo_ioctl(fp, cmd, data, active_cred, td)
- struct file *fp;
- u_long cmd;
- void *data;
- struct ucred *active_cred;
- struct thread *td;
+soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
+ struct thread *td)
{
struct socket *so = fp->f_data;
int error = 0;
- NET_LOCK_GIANT();
switch (cmd) {
-
case FIONBIO:
SOCK_LOCK(so);
if (*(int *)data)
@@ -151,10 +128,10 @@
case FIOASYNC:
/*
- * XXXRW: This code separately acquires SOCK_LOCK(so)
- * and SOCKBUF_LOCK(&so->so_rcv) even though they are
- * the same mutex to avoid introducing the assumption
- * that they are the same.
+ * XXXRW: This code separately acquires SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
+ * mutex to avoid introducing the assumption that they are
+ * the same.
*/
if (*(int *)data) {
SOCK_LOCK(so);
@@ -206,9 +183,9 @@
break;
default:
/*
- * Interface/routing/protocol specific ioctls:
- * interface and routing ioctls should have a
- * different entry since a socket's unnecessary
+ * Interface/routing/protocol specific ioctls: interface and
+ * routing ioctls should have a different entry since a
+ * socket is unnecessary.
*/
if (IOCGROUP(cmd) == 'i')
error = ifioctl(so, cmd, data, td);
@@ -219,65 +196,50 @@
(so, cmd, data, 0, td));
break;
}
- NET_UNLOCK_GIANT();
- return(error);
+ return (error);
}
int
-soo_poll(fp, events, active_cred, td)
- struct file *fp;
- int events;
- struct ucred *active_cred;
- struct thread *td;
+soo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
{
struct socket *so = fp->f_data;
+#ifdef MAC
int error;
- NET_LOCK_GIANT();
-#ifdef MAC
SOCK_LOCK(so);
error = mac_check_socket_poll(active_cred, so);
SOCK_UNLOCK(so);
- if (error) {
- NET_UNLOCK_GIANT();
+ if (error)
return (error);
- }
#endif
- error = (so->so_proto->pr_usrreqs->pru_sopoll)
- (so, events, fp->f_cred, td);
- NET_UNLOCK_GIANT();
-
- return (error);
+ return (sopoll(so, events, fp->f_cred, td));
}
int
-soo_stat(fp, ub, active_cred, td)
- struct file *fp;
- struct stat *ub;
- struct ucred *active_cred;
- struct thread *td;
+soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
+ struct thread *td)
{
struct socket *so = fp->f_data;
+#ifdef MAC
int error;
+#endif
bzero((caddr_t)ub, sizeof (*ub));
ub->st_mode = S_IFSOCK;
- NET_LOCK_GIANT();
#ifdef MAC
SOCK_LOCK(so);
error = mac_check_socket_stat(active_cred, so);
SOCK_UNLOCK(so);
- if (error) {
- NET_UNLOCK_GIANT();
+ if (error)
return (error);
- }
#endif
/*
* If SBS_CANTRCVMORE is set, but there's still data left in the
* receive buffer, the socket is still readable.
*
- * XXXRW: perhaps should lock socket buffer so st_size result
- * is consistent.
+ * XXXRW: perhaps should lock socket buffer so st_size result is
+ * consistent.
*/
/* Unlocked read. */
if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 ||
@@ -288,33 +250,27 @@
ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
ub->st_uid = so->so_cred->cr_uid;
ub->st_gid = so->so_cred->cr_gid;
- error = (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
- NET_UNLOCK_GIANT();
- return (error);
+ return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
}
/*
- * API socket close on file pointer. We call soclose() to close the
- * socket (including initiating closing protocols). soclose() will
- * sorele() the file reference but the actual socket will not go away
- * until the socket's ref count hits 0.
+ * API socket close on file pointer. We call soclose() to close the socket
+ * (including initiating closing protocols). soclose() will sorele() the
+ * file reference but the actual socket will not go away until the socket's
+ * ref count hits 0.
*/
/* ARGSUSED */
int
-soo_close(fp, td)
- struct file *fp;
- struct thread *td;
+soo_close(struct file *fp, struct thread *td)
{
int error = 0;
struct socket *so;
- NET_LOCK_GIANT();
so = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
if (so)
error = soclose(so);
- NET_UNLOCK_GIANT();
return (error);
}
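
Much of the sys_socket.c change is mechanical: every old-style (K&R) definition becomes an ANSI prototype-style definition, and the NET_LOCK_GIANT()/NET_UNLOCK_GIANT() pairs disappear now that the socket layer runs without Giant. The conversion pattern, shown on a made-up function rather than the real soo_*() entry points:

struct file;
struct uio;
struct thread;

/* Before: K&R-style definition, as removed in the hunks above. */
static int
example_rw_old(fp, uio, td)
	struct file *fp;
	struct uio *uio;
	struct thread *td;
{

	return (0);
}

/* After: the equivalent ANSI prototype-style definition, as added. */
static int
example_rw(struct file *fp, struct uio *uio, struct thread *td)
{

	return (0);
}
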
Index: uipc_cow.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_cow.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/uipc_cow.c -L sys/kern/uipc_cow.c -u -r1.1.1.1 -r1.2
--- sys/kern/uipc_cow.c
+++ sys/kern/uipc_cow.c
@@ -36,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_cow.c,v 1.23.2.1 2005/10/26 20:21:23 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_cow.c,v 1.26 2005/10/23 07:41:56 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
Index: vfs_cluster.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_cluster.c -L sys/kern/vfs_cluster.c -u -r1.2 -r1.3
--- sys/kern/vfs_cluster.c
+++ sys/kern/vfs_cluster.c
@@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_cluster.c,v 1.166.2.3 2006/03/22 17:54:50 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_cluster.c,v 1.176 2007/06/01 01:12:44 jeff Exp $");
#include "opt_debug_cluster.h"
@@ -58,7 +58,7 @@
"Debug VFS clustering code");
#endif
-static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");
+static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
@@ -228,7 +228,7 @@
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
- curproc->p_stats->p_ru.ru_inblock++;
+ curthread->td_ru.ru_inblock++;
}
/*
@@ -281,7 +281,7 @@
BUF_KERNPROC(rbp);
rbp->b_iooffset = dbtob(rbp->b_blkno);
bstrategy(rbp);
- curproc->p_stats->p_ru.ru_inblock++;
+ curthread->td_ru.ru_inblock++;
}
if (reqbp)
@@ -595,7 +595,7 @@
int async;
if (vp->v_type == VREG) {
- async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ async = vp->v_mount->mnt_kern_flag & MNTK_ASYNC;
lblocksize = vp->v_mount->mnt_stat.f_iosize;
} else {
async = 0;
@@ -770,6 +770,12 @@
--len;
continue;
}
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
bremfree(tbp);
tbp->b_flags &= ~B_DONE;
@@ -873,6 +879,15 @@
BUF_UNLOCK(tbp);
break;
}
+
+ /*
+ * Do not pull in pinned buffers.
+ */
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
/*
* Ok, it's passed all the tests,
* so remove it from the free list
@@ -896,7 +911,7 @@
if (i != 0) { /* if not first buffer */
for (j = 0; j < tbp->b_npages; j += 1) {
m = tbp->b_pages[j];
- if (m->flags & PG_BUSY) {
+ if (m->oflags & VPO_BUSY) {
VM_OBJECT_UNLOCK(
tbp->b_object);
bqrelse(tbp);
Index: kern_proc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_proc.c -L sys/kern/kern_proc.c -u -r1.2 -r1.3
--- sys/kern/kern_proc.c
+++ sys/kern/kern_proc.c
@@ -27,11 +27,10 @@
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
- * $FreeBSD: src/sys/kern/kern_proc.c,v 1.230.2.3 2006/01/05 20:23:10 truckman Exp $
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_proc.c,v 1.230.2.3 2006/01/05 20:23:10 truckman Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_proc.c,v 1.252.2.2.2.1 2008/01/19 18:15:05 kib Exp $");
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
@@ -43,6 +42,7 @@
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/refcount.h>
#include <sys/sysent.h>
#include <sys/sched.h>
#include <sys/smp.h>
@@ -54,6 +54,7 @@
#include <sys/user.h>
#include <sys/jail.h>
#include <sys/vnode.h>
+#include <sys/eventhandler.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
@@ -92,7 +93,6 @@
struct proclist zombproc;
struct sx allproc_lock;
struct sx proctree_lock;
-struct mtx pargs_ref_lock;
struct mtx ppeers_lock;
uma_zone_t proc_zone;
uma_zone_t ithread_zone;
@@ -111,7 +111,6 @@
sx_init(&allproc_lock, "allproc");
sx_init(&proctree_lock, "proctree");
- mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF);
mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
LIST_INIT(&allproc);
LIST_INIT(&zombproc);
@@ -132,6 +131,7 @@
struct proc *p;
p = (struct proc *)mem;
+ EVENTHANDLER_INVOKE(process_ctor, p);
return (0);
}
@@ -143,29 +143,28 @@
{
struct proc *p;
struct thread *td;
-#ifdef INVARIANTS
- struct ksegrp *kg;
-#endif
/* INVARIANTS checks go here */
p = (struct proc *)mem;
td = FIRST_THREAD_IN_PROC(p);
+ if (td != NULL) {
#ifdef INVARIANTS
- KASSERT((p->p_numthreads == 1),
- ("bad number of threads in exiting process"));
- KASSERT((p->p_numksegrps == 1), ("free proc with > 1 ksegrp"));
- KASSERT((td != NULL), ("proc_dtor: bad thread pointer"));
- kg = FIRST_KSEGRP_IN_PROC(p);
- KASSERT((kg != NULL), ("proc_dtor: bad kg pointer"));
+ KASSERT((p->p_numthreads == 1),
+ ("bad number of threads in exiting process"));
+ KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
#endif
- /* Dispose of an alternate kstack, if it exists.
- * XXX What if there are more than one thread in the proc?
- * The first thread in the proc is special and not
- * freed, so you gotta do this here.
- */
- if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
- vm_thread_dispose_altkstack(td);
+ /* Dispose of an alternate kstack, if it exists.
+ * XXX What if there is more than one thread in the proc?
+ * The first thread in the proc is special and not
+ * freed, so you have to do this here.
+ */
+ if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
+ vm_thread_dispose_altkstack(td);
+ }
+ EVENTHANDLER_INVOKE(process_dtor, p);
+ if (p->p_ksi != NULL)
+ KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
}
/*
@@ -175,18 +174,15 @@
proc_init(void *mem, int size, int flags)
{
struct proc *p;
- struct thread *td;
- struct ksegrp *kg;
p = (struct proc *)mem;
p->p_sched = (struct p_sched *)&p[1];
- td = thread_alloc();
- kg = ksegrp_alloc();
bzero(&p->p_mtx, sizeof(struct mtx));
mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+ TAILQ_INIT(&p->p_threads); /* all threads in proc */
+ EVENTHANDLER_INVOKE(process_init, p);
p->p_stats = pstats_alloc();
- proc_linkup(p, kg, td);
- sched_newproc(p, kg, td);
return (0);
}
@@ -197,8 +193,19 @@
static void
proc_fini(void *mem, int size)
{
+#ifdef notnow
+ struct proc *p;
+ p = (struct proc *)mem;
+ EVENTHANDLER_INVOKE(process_fini, p);
+ pstats_free(p->p_stats);
+ thread_free(FIRST_THREAD_IN_PROC(p));
+ mtx_destroy(&p->p_mtx);
+ if (p->p_ksi != NULL)
+ ksiginfo_free(p->p_ksi);
+#else
panic("proc reclaimed");
+#endif
}
/*
@@ -297,7 +304,7 @@
* new session
*/
mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
- mtx_lock(&Giant); /* XXX TTY */
+ mtx_lock(&Giant); /* XXX TTY */
PROC_LOCK(p);
p->p_flag &= ~P_CONTROLT;
PROC_UNLOCK(p);
@@ -313,7 +320,7 @@
KASSERT(p == curproc,
("enterpgrp: mksession and p != curproc"));
} else {
- mtx_lock(&Giant); /* XXX TTY */
+ mtx_lock(&Giant); /* XXX TTY */
pgrp->pg_session = p->p_session;
SESS_LOCK(pgrp->pg_session);
pgrp->pg_session->s_count++;
@@ -331,7 +338,7 @@
pgrp->pg_jobc = 0;
SLIST_INIT(&pgrp->pg_sigiolst);
PGRP_UNLOCK(pgrp);
- mtx_unlock(&Giant); /* XXX TTY */
+ mtx_unlock(&Giant); /* XXX TTY */
doenterpgrp(p, pgrp);
@@ -391,7 +398,7 @@
fixjobc(p, pgrp, 1);
fixjobc(p, p->p_pgrp, 0);
- mtx_lock(&Giant); /* XXX TTY */
+ mtx_lock(&Giant); /* XXX TTY */
PGRP_LOCK(pgrp);
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
@@ -401,7 +408,7 @@
LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
PGRP_UNLOCK(savepgrp);
PGRP_UNLOCK(pgrp);
- mtx_unlock(&Giant); /* XXX TTY */
+ mtx_unlock(&Giant); /* XXX TTY */
if (LIST_EMPTY(&savepgrp->pg_members))
pgdelete(savepgrp);
}
@@ -449,7 +456,7 @@
*/
funsetownlst(&pgrp->pg_sigiolst);
- mtx_lock(&Giant); /* XXX TTY */
+ mtx_lock(&Giant); /* XXX TTY */
PGRP_LOCK(pgrp);
if (pgrp->pg_session->s_ttyp != NULL &&
pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
@@ -460,7 +467,7 @@
PGRP_UNLOCK(pgrp);
mtx_destroy(&pgrp->pg_mtx);
FREE(pgrp, M_PGRP);
- mtx_unlock(&Giant); /* XXX TTY */
+ mtx_unlock(&Giant); /* XXX TTY */
}
static void
@@ -620,7 +627,6 @@
struct thread *td0;
struct tty *tp;
struct session *sp;
- struct timeval tv;
struct ucred *cred;
struct sigacts *ps;
@@ -667,7 +673,7 @@
kp->ki_sigcatch = ps->ps_sigcatch;
mtx_unlock(&ps->ps_mtx);
}
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
if (p->p_state != PRS_NEW &&
p->p_state != PRS_ZOMBIE &&
p->p_vmspace != NULL) {
@@ -687,18 +693,23 @@
kp->ki_ssize = vm->vm_ssize;
} else if (p->p_state == PRS_ZOMBIE)
kp->ki_stat = SZOMB;
- kp->ki_sflag = p->p_sflag;
- kp->ki_swtime = p->p_swtime;
+ if (kp->ki_flag & P_INMEM)
+ kp->ki_sflag = PS_INMEM;
+ else
+ kp->ki_sflag = 0;
+ /* Calculate legacy swtime as seconds since 'swtick'. */
+ kp->ki_swtime = (ticks - p->p_swtick) / hz;
kp->ki_pid = p->p_pid;
kp->ki_nice = p->p_nice;
- bintime2timeval(&p->p_rux.rux_runtime, &tv);
- kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec;
- mtx_unlock_spin(&sched_lock);
- if ((p->p_sflag & PS_INMEM) && p->p_stats != NULL) {
+ rufetch(p, &kp->ki_rusage);
+ kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
+ PROC_SUNLOCK(p);
+ if ((p->p_flag & P_INMEM) && p->p_stats != NULL) {
kp->ki_start = p->p_stats->p_start;
timevaladd(&kp->ki_start, &boottime);
- kp->ki_rusage = p->p_stats->p_ru;
+ PROC_SLOCK(p);
calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
+ PROC_SUNLOCK(p);
calccru(p, &kp->ki_childutime, &kp->ki_childstime);
/* Some callers want child-times in a single value */
@@ -731,10 +742,8 @@
kp->ki_tsid = tp->t_session->s_sid;
} else
kp->ki_tdev = NODEV;
- if (p->p_comm[0] != '\0') {
+ if (p->p_comm[0] != '\0')
strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
- strlcpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm));
- }
if (p->p_sysent && p->p_sysent->sv_name != NULL &&
p->p_sysent->sv_name[0] != '\0')
strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
@@ -748,20 +757,23 @@
/*
* Fill in information that is thread specific.
- * Must be called with sched_lock locked.
+ * Must be called with p_slock locked.
*/
static void
fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
{
- struct ksegrp *kg;
struct proc *p;
p = td->td_proc;
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ thread_lock(td);
if (td->td_wmesg != NULL)
strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
else
bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
+ if (td->td_name[0] != '\0')
+ strlcpy(kp->ki_ocomm, td->td_name, sizeof(kp->ki_ocomm));
if (TD_ON_LOCK(td)) {
kp->ki_kiflag |= KI_LOCKBLOCK;
strlcpy(kp->ki_lockname, td->td_lockname,
@@ -791,14 +803,6 @@
kp->ki_stat = SIDL;
}
- kg = td->td_ksegrp;
-
- /* things in the KSE GROUP */
- kp->ki_estcpu = kg->kg_estcpu;
- kp->ki_slptime = kg->kg_slptime;
- kp->ki_pri.pri_user = kg->kg_user_pri;
- kp->ki_pri.pri_class = kg->kg_pri_class;
-
/* Things in the thread */
kp->ki_wchan = td->td_wchan;
kp->ki_pri.pri_level = td->td_priority;
@@ -811,12 +815,17 @@
kp->ki_pcb = td->td_pcb;
kp->ki_kstack = (void *)td->td_kstack;
kp->ki_pctcpu = sched_pctcpu(td);
+ kp->ki_estcpu = td->td_estcpu;
+ kp->ki_slptime = (ticks - td->td_slptick) / hz;
+ kp->ki_pri.pri_class = td->td_pri_class;
+ kp->ki_pri.pri_user = td->td_user_pri;
/* We can't get this anymore but ps etc never used it anyway. */
kp->ki_rqindex = 0;
SIGSETOR(kp->ki_siglist, td->td_siglist);
kp->ki_sigmask = td->td_sigmask;
+ thread_unlock(td);
}
/*
@@ -828,10 +837,10 @@
{
fill_kinfo_proc_only(p, kp);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
if (FIRST_THREAD_IN_PROC(p) != NULL)
fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
struct pstats *
@@ -898,26 +907,26 @@
fill_kinfo_proc_only(p, &kinfo_proc);
if (flags & KERN_PROC_NOTHREADS) {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
if (FIRST_THREAD_IN_PROC(p) != NULL)
fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
sizeof(kinfo_proc));
} else {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
if (FIRST_THREAD_IN_PROC(p) != NULL)
FOREACH_THREAD_IN_PROC(p, td) {
fill_kinfo_thread(td, &kinfo_proc);
error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
- sizeof(kinfo_proc));
+ sizeof(kinfo_proc));
if (error)
break;
}
else
error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
- sizeof(kinfo_proc));
- mtx_unlock_spin(&sched_lock);
+ sizeof(kinfo_proc));
+ PROC_SUNLOCK(p);
}
PROC_UNLOCK(p);
if (error)
@@ -1007,13 +1016,15 @@
/*
* Skip embryonic processes.
*/
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
if (p->p_state == PRS_NEW) {
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
continue;
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_LOCK(p);
+ KASSERT(p->p_ucred != NULL,
+ ("process credential is NULL for non-NEW proc"));
/*
* Show a user only appropriate processes.
*/
@@ -1028,8 +1039,7 @@
switch (oid_number) {
case KERN_PROC_GID:
- if (p->p_ucred == NULL ||
- p->p_ucred->cr_gid != (gid_t)name[0]) {
+ if (p->p_ucred->cr_gid != (gid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
@@ -1037,7 +1047,7 @@
case KERN_PROC_PGRP:
/* could do this by traversing pgrp */
- if (p->p_pgrp == NULL ||
+ if (p->p_pgrp == NULL ||
p->p_pgrp->pg_id != (pid_t)name[0]) {
PROC_UNLOCK(p);
continue;
@@ -1045,8 +1055,7 @@
break;
case KERN_PROC_RGID:
- if (p->p_ucred == NULL ||
- p->p_ucred->cr_rgid != (gid_t)name[0]) {
+ if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
@@ -1078,16 +1087,14 @@
break;
case KERN_PROC_UID:
- if (p->p_ucred == NULL ||
- p->p_ucred->cr_uid != (uid_t)name[0]) {
+ if (p->p_ucred->cr_uid != (uid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_RUID:
- if (p->p_ucred == NULL ||
- p->p_ucred->cr_ruid != (uid_t)name[0]) {
+ if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
@@ -1119,7 +1126,7 @@
MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS,
M_WAITOK);
- pa->ar_ref = 1;
+ refcount_init(&pa->ar_ref, 1);
pa->ar_length = len;
return (pa);
}
@@ -1137,9 +1144,7 @@
if (pa == NULL)
return;
- PARGS_LOCK(pa);
- pa->ar_ref++;
- PARGS_UNLOCK(pa);
+ refcount_acquire(&pa->ar_ref);
}
void
@@ -1148,12 +1153,8 @@
if (pa == NULL)
return;
- PARGS_LOCK(pa);
- if (--pa->ar_ref == 0) {
- PARGS_UNLOCK(pa);
+ if (refcount_release(&pa->ar_ref))
pargs_free(pa);
- } else
- PARGS_UNLOCK(pa);
}
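
The pargs changes above drop the global pargs_ref_lock mutex in favour of the refcount(9) primitives, which manipulate the counter with atomic operations. The same pattern on a hypothetical object (the struct, the helpers and the malloc type are invented):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/refcount.h>

static MALLOC_DEFINE(M_BAR, "bar", "example refcounted object");

struct bar {
	u_int	b_ref;
	/* ... payload ... */
};

static struct bar *
bar_alloc(void)
{
	struct bar *b;

	b = malloc(sizeof(*b), M_BAR, M_WAITOK | M_ZERO);
	refcount_init(&b->b_ref, 1);
	return (b);
}

static void
bar_hold(struct bar *b)
{

	refcount_acquire(&b->b_ref);
}

static void
bar_drop(struct bar *b)
{

	/* refcount_release() returns non-zero when the last ref is gone. */
	if (refcount_release(&b->b_ref))
		free(b, M_BAR);
}
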
/*
@@ -1242,6 +1243,11 @@
}
vp = p->p_textvp;
+ if (vp == NULL) {
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+ return (0);
+ }
vref(vp);
if (*pidp != -1)
PROC_UNLOCK(p);
Index: link_elf_obj.c
===================================================================
RCS file: /home/cvs/src/sys/kern/link_elf_obj.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/link_elf_obj.c -L sys/kern/link_elf_obj.c -u -r1.1.1.2 -r1.2
--- sys/kern/link_elf_obj.c
+++ sys/kern/link_elf_obj.c
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/link_elf_obj.c,v 1.87.2.3 2005/12/30 22:13:58 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/link_elf_obj.c,v 1.95 2007/05/31 11:51:51 kib Exp $");
#include "opt_ddb.h"
#include "opt_mac.h"
@@ -35,9 +35,9 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/fcntl.h>
@@ -46,6 +46,8 @@
#include <machine/elf.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
@@ -393,19 +395,19 @@
int nsym;
int pb, rl, ra;
int alignmask;
-
- GIANT_REQUIRED;
+ int vfslocked;
shdr = NULL;
lf = NULL;
mapsize = 0;
hdr = NULL;
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td);
flags = FREAD;
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error)
return error;
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
error = mac_check_kld_load(td->td_ucred, nd.ni_vp);
@@ -788,6 +790,7 @@
free(hdr, M_LINKER);
VOP_UNLOCK(nd.ni_vp, 0, td);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
return error;
}
@@ -1112,6 +1115,51 @@
}
static void
+link_elf_fix_link_set(elf_file_t ef)
+{
+ static const char startn[] = "__start_";
+ static const char stopn[] = "__stop_";
+ Elf_Sym *sym;
+ const char *sym_name, *linkset_name;
+ Elf_Addr startp, stopp;
+ Elf_Size symidx;
+ int start, i;
+
+ startp = stopp = 0;
+ for (symidx = 1 /* zero entry is special */;
+ symidx < ef->ddbsymcnt; symidx++) {
+ sym = ef->ddbsymtab + symidx;
+ if (sym->st_shndx != SHN_UNDEF)
+ continue;
+
+ sym_name = ef->ddbstrtab + sym->st_name;
+ if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) {
+ start = 1;
+ linkset_name = sym_name + sizeof(startn) - 1;
+ }
+ else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) {
+ start = 0;
+ linkset_name = sym_name + sizeof(stopn) - 1;
+ }
+ else
+ continue;
+
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (strcmp(ef->progtab[i].name, linkset_name) == 0) {
+ startp = (Elf_Addr)ef->progtab[i].addr;
+ stopp = (Elf_Addr)(startp + ef->progtab[i].size);
+ break;
+ }
+ }
+ if (i == ef->nprogtab)
+ continue;
+
+ sym->st_value = start ? startp : stopp;
+ sym->st_shndx = i;
+ }
+}
+
+static void
link_elf_reloc_local(linker_file_t lf)
{
elf_file_t ef = (elf_file_t)lf;
@@ -1124,6 +1172,8 @@
int i;
Elf_Size symidx;
+ link_elf_fix_link_set(ef);
+
/* Perform relocations without addend if there are any: */
for (i = 0; i < ef->nrel; i++) {
rel = ef->reltab[i].rel;
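
The bulk of this hunk is the new link_elf_fix_link_set(), which rewrites the undefined __start_<set>/__stop_<set> symbols of an object-format module to point at the module's own section, so linker sets declared inside a kld resolve to the module's entries rather than the kernel's. For context, the consumer side of a linker set looks roughly like this sketch; the set name and entry type are invented:

#include <sys/param.h>
#include <sys/linker_set.h>

struct widget {
	const char *w_name;
};

static struct widget widget_a = { "a" };
static struct widget widget_b = { "b" };

/* Each translation unit may contribute entries to the set. */
DATA_SET(widget_set, widget_a);
DATA_SET(widget_set, widget_b);

SET_DECLARE(widget_set, struct widget);

static int
widget_count(void)
{
	struct widget **wpp;
	int n;

	n = 0;
	/*
	 * SET_FOREACH walks from __start_set_widget_set to
	 * __stop_set_widget_set, i.e. the very symbols that
	 * link_elf_fix_link_set() patches up for object modules.
	 */
	SET_FOREACH(wpp, widget_set) {
		n++;		/* visits widget_a, then widget_b */
	}
	return (n);
}
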
Index: subr_power.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_power.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_power.c -L sys/kern/subr_power.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_power.c
+++ sys/kern/subr_power.c
@@ -25,17 +25,27 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_power.c,v 1.5 2004/01/02 18:24:13 njl Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_power.c,v 1.8 2005/11/09 16:22:56 imp Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/power.h>
+#include <sys/taskqueue.h>
static u_int power_pm_type = POWER_PM_TYPE_NONE;
static power_pm_fn_t power_pm_fn = NULL;
static void *power_pm_arg = NULL;
+static struct task power_pm_task;
+
+static void
+power_pm_deferred_fn(void *arg, int pending)
+{
+ int state = (intptr_t)arg;
+
+ power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+}
int
power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
@@ -48,6 +58,7 @@
power_pm_fn = pm_fn;
power_pm_arg = pm_arg;
error = 0;
+ TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL);
} else {
error = ENXIO;
}
@@ -72,8 +83,8 @@
state != POWER_SLEEP_STATE_SUSPEND &&
state != POWER_SLEEP_STATE_HIBERNATE)
return;
-
- power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+ power_pm_task.ta_context = (void *)(intptr_t)state;
+ taskqueue_enqueue(taskqueue_thread, &power_pm_task);
}
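
power_pm_suspend() now only records the requested state and queues a task; the actual suspend call runs later from the taskqueue_thread kernel thread, so the caller never blocks inside the PM handler. The basic deferral pattern, with invented names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>

static struct task example_task;

/* Runs later, in the sleepable context of the taskqueue_thread thread. */
static void
example_task_fn(void *arg, int pending)
{

	printf("deferred work ran, %d enqueue(s) coalesced\n", pending);
}

static void
example_task_setup(void)
{

	TASK_INIT(&example_task, 0, example_task_fn, NULL);
}

static void
example_trigger(void)
{

	/* Cheap and non-blocking; safe from contexts that must not sleep. */
	taskqueue_enqueue(taskqueue_thread, &example_task);
}
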
/*
Index: vfs_subr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_subr.c -L sys/kern/vfs_subr.c -u -r1.2 -r1.3
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_subr.c,v 1.635.2.16.2.1 2006/05/04 07:42:10 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_subr.c,v 1.707.2.1 2007/12/13 11:58:00 kib Exp $");
#include "opt_ddb.h"
#include "opt_mac.h"
@@ -55,13 +55,14 @@
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
+#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/sleepqueue.h>
#include <sys/stat.h>
@@ -72,6 +73,8 @@
#include <machine/stdarg.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
@@ -81,16 +84,18 @@
#include <vm/vm_kern.h>
#include <vm/uma.h>
-static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
static void delmntque(struct vnode *vp);
-static void insmntque(struct vnode *vp, struct mount *mp);
static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
int slpflag, int slptimeo);
static void syncer_shutdown(void *arg, int howto);
static int vtryrecycle(struct vnode *vp);
static void vbusy(struct vnode *vp);
-static void vdropl(struct vnode *vp);
static void vinactive(struct vnode *, struct thread *);
static void v_incr_usecount(struct vnode *);
static void v_decr_usecount(struct vnode *);
@@ -109,19 +114,15 @@
* Enable Giant pushdown based on whether or not the vm is mpsafe in this
* build. Without mpsafevm the buffer cache can not run Giant free.
*/
-#if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \
- defined(__sparc64__)
int mpsafe_vfs = 1;
-#else
-int mpsafe_vfs;
-#endif
TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
"MPSAFE VFS");
/*
* Number of vnodes in existence. Increased whenever getnewvnode()
- * allocates a new vnode, never decreased.
+ * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed
+ * vnode.
*/
static unsigned long numvnodes;
@@ -304,14 +305,14 @@
desiredvnodes, MAXVNODES_MAX);
desiredvnodes = MAXVNODES_MAX;
}
- wantfreevnodes = desiredvnodes / 4;
+ wantfreevnodes = desiredvnodes / 4;
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
TAILQ_INIT(&vnode_free_list);
mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
/*
* Initialize the filesystem syncer.
*/
@@ -328,11 +329,8 @@
* unmounting. Interlock is not released on failure.
*/
int
-vfs_busy(mp, flags, interlkp, td)
- struct mount *mp;
- int flags;
- struct mtx *interlkp;
- struct thread *td;
+vfs_busy(struct mount *mp, int flags, struct mtx *interlkp,
+ struct thread *td)
{
int lkflags;
@@ -365,7 +363,6 @@
lkflags = LK_SHARED | LK_INTERLOCK;
if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
panic("vfs_busy: unexpected lock failure");
- vfs_rel(mp);
return (0);
}
@@ -373,20 +370,18 @@
* Free a busy filesystem.
*/
void
-vfs_unbusy(mp, td)
- struct mount *mp;
- struct thread *td;
+vfs_unbusy(struct mount *mp, struct thread *td)
{
lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
+ vfs_rel(mp);
}
/*
* Lookup a mount point by filesystem identifier.
*/
struct mount *
-vfs_getvfs(fsid)
- fsid_t *fsid;
+vfs_getvfs(fsid_t *fsid)
{
struct mount *mp;
@@ -394,6 +389,7 @@
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ vfs_ref(mp);
mtx_unlock(&mountlist_mtx);
return (mp);
}
@@ -403,16 +399,39 @@
}
/*
- * Check if a user can access priveledged mount options.
+ * Check if a user can access privileged mount options.
*/
int
vfs_suser(struct mount *mp, struct thread *td)
{
int error;
+ /*
+ * If the thread is jailed, but this is not a jail-friendly file
+ * system, deny immediately.
+ */
+ if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL))
+ return (EPERM);
+
+ /*
+ * If the file system was mounted outside a jail and a jailed thread
+ * tries to access it, deny immediately.
+ */
+ if (!jailed(mp->mnt_cred) && jailed(td->td_ucred))
+ return (EPERM);
+
+ /*
+	 * If the file system was mounted inside a different jail than the jail of
+ * the calling thread, deny immediately.
+ */
+ if (jailed(mp->mnt_cred) && jailed(td->td_ucred) &&
+ mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) {
+ return (EPERM);
+ }
+
if ((mp->mnt_flag & MNT_USER) == 0 ||
mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
- if ((error = suser(td)) != 0)
+ if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
return (error);
}
return (0);
@@ -431,10 +450,10 @@
* different mounts.
*/
void
-vfs_getnewfsid(mp)
- struct mount *mp;
+vfs_getnewfsid(struct mount *mp)
{
static u_int16_t mntid_base;
+ struct mount *nmp;
fsid_t tfsid;
int mtype;
@@ -446,8 +465,9 @@
tfsid.val[0] = makedev(255,
mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
mntid_base++;
- if (vfs_getvfs(&tfsid) == NULL)
+ if ((nmp = vfs_getvfs(&tfsid)) == NULL)
break;
+ vfs_rel(nmp);
}
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
@@ -472,8 +492,7 @@
* Get a current timestamp.
*/
void
-vfs_timestamp(tsp)
- struct timespec *tsp;
+vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
@@ -500,8 +519,7 @@
* Set vnode attributes to VNOVAL
*/
void
-vattr_null(vap)
- struct vattr *vap;
+vattr_null(struct vattr *vap)
{
vap->va_type = VNON;
@@ -610,7 +628,7 @@
* vnode lock before our VOP_LOCK() call fails.
*/
if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
- (vp->v_object != NULL &&
+ (vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp, LK_INTERLOCK, td);
goto next_iter_mntunlocked;
@@ -700,13 +718,13 @@
struct mount *mp, *nmp;
int done;
struct proc *p = vnlruproc;
- struct thread *td = FIRST_THREAD_IN_PROC(p);
-
- mtx_lock(&Giant);
+ struct thread *td = curthread;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
SHUTDOWN_PRI_FIRST);
+ mtx_lock(&Giant);
+
for (;;) {
kthread_suspend_check(p);
mtx_lock(&vnode_free_list_mtx);
@@ -742,6 +760,7 @@
}
mtx_unlock(&mountlist_mtx);
if (done == 0) {
+ EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
#if 0
/* These messages are temporary debugging aids */
if (vnlru_nowhere < 5)
@@ -751,7 +770,7 @@
#endif
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
- } else
+ } else
uio_yield();
}
}
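The vnlru loop above now raises a vfs_lowvnodes event when a full pass fails to reclaim anything, so other subsystems can be asked to release cached vnode references. A rough sketch of a consumer follows; the handler signature (void *, int) is inferred from the single integer passed to EVENTHANDLER_INVOKE() above, and every examplefs_* name is hypothetical.

/* Hypothetical consumer of the vfs_lowvnodes event; illustrative only. */
static eventhandler_tag examplefs_lowvnodes_tag;

static void
examplefs_lowvnodes(void *arg, int target)
{
	/* Try to drop roughly "target" cached vnode references. */
	printf("examplefs: low-vnode event, target %d\n", target);
}

static void
examplefs_vfs_init(void *dummy __unused)
{
	examplefs_lowvnodes_tag = EVENTHANDLER_REGISTER(vfs_lowvnodes,
	    examplefs_lowvnodes, NULL, EVENTHANDLER_PRI_ANY);
}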
@@ -790,6 +809,7 @@
VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
+ VI_UNLOCK(vp);
#ifdef MAC
mac_destroy_vnode(vp);
#endif
@@ -859,11 +879,8 @@
* Return the next vnode from the free list.
*/
int
-getnewvnode(tag, mp, vops, vpp)
- const char *tag;
- struct mount *mp;
- struct vop_vector *vops;
- struct vnode **vpp;
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+ struct vnode **vpp)
{
struct vnode *vp = NULL;
struct bufobj *bo;
@@ -878,8 +895,17 @@
* Wait for available vnodes.
*/
if (numvnodes > desiredvnodes) {
+ if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
+ /*
+			 * File system is being suspended; we cannot risk a
+			 * deadlock here, so allocate a new vnode anyway.
+ */
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ goto alloc;
+ }
if (vnlruproc_sig == 0) {
- vnlruproc_sig = 1; /* avoid unnecessary wakeups */
+ vnlruproc_sig = 1; /* avoid unnecessary wakeups */
wakeup(vnlruproc);
}
msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
@@ -891,6 +917,7 @@
}
#endif
}
+alloc:
numvnodes++;
mtx_unlock(&vnode_free_list_mtx);
vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
@@ -935,7 +962,6 @@
printf("NULL mp in getnewvnode()\n");
#endif
if (mp != NULL) {
- insmntque(vp, mp);
bo->bo_bsize = mp->mnt_stat.f_iosize;
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
vp->v_vflag |= VV_NOKNOTE;
@@ -967,22 +993,56 @@
MNT_IUNLOCK(mp);
}
+static void
+insmntque_stddtr(struct vnode *vp, void *dtr_arg)
+{
+ struct thread *td;
+
+ td = curthread; /* XXX ? */
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ /* XXX non mp-safe fs may still call insmntque with vnode
+ unlocked */
+ if (!VOP_ISLOCKED(vp, td))
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ vgone(vp);
+ vput(vp);
+}
+
/*
* Insert into list of vnodes for the new mount point, if available.
*/
-static void
-insmntque(struct vnode *vp, struct mount *mp)
+int
+insmntque1(struct vnode *vp, struct mount *mp,
+ void (*dtr)(struct vnode *, void *), void *dtr_arg)
{
- vp->v_mount = mp;
+ KASSERT(vp->v_mount == NULL,
+ ("insmntque: vnode already on per mount vnode list"));
VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
+ mp->mnt_nvnodelistsize == 0) {
+ MNT_IUNLOCK(mp);
+ if (dtr != NULL)
+ dtr(vp, dtr_arg);
+ return (EBUSY);
+ }
+ vp->v_mount = mp;
MNT_REF(mp);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
("neg mount point vnode list size"));
mp->mnt_nvnodelistsize++;
MNT_IUNLOCK(mp);
+ return (0);
+}
+
+int
+insmntque(struct vnode *vp, struct mount *mp)
+{
+
+ return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}
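insmntque() is now a public routine that can fail: if the mount is already draining its vnode list (MNTK_NOINSMNTQ), insmntque1() runs the supplied destructor and returns EBUSY instead of letting the new vnode race the unmount. A rough sketch of the calling pattern a filesystem might use follows; everything prefixed examplefs_ is hypothetical.

/* Hypothetical vnode-allocation path using the new insmntque(). */
extern struct vop_vector examplefs_vnodeops;	/* hypothetical */

static int
examplefs_newvnode(struct mount *mp, struct thread *td, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("examplefs", mp, &examplefs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	/*
	 * On failure the standard destructor has already reclaimed and
	 * released vp, so it must not be touched again.
	 */
	error = insmntque(vp, mp);
	if (error != 0)
		return (error);
	*vpp = vp;
	return (0);
}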
/*
@@ -990,7 +1050,8 @@
* Called with the underlying object locked.
*/
int
-bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
+bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag,
+ int slptimeo)
{
int error;
@@ -1073,7 +1134,8 @@
* Called with the underlying object locked.
*/
int
-vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
+vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag,
+ int slptimeo)
{
CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
@@ -1086,11 +1148,8 @@
*
*/
static int
-flushbuflist(bufv, flags, bo, slpflag, slptimeo)
- struct bufv *bufv;
- int flags;
- struct bufobj *bo;
- int slpflag, slptimeo;
+flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+ int slptimeo)
{
struct buf *bp, *nbp;
int retval, error;
@@ -1121,7 +1180,7 @@
return (error != ENOLCK ? error : EAGAIN);
}
KASSERT(bp->b_bufobj == bo,
- ("bp %p wrong b_bufobj %p should be %p",
+ ("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
if (bp->b_bufobj != bo) { /* XXX: necessary ? */
BUF_UNLOCK(bp);
@@ -1143,12 +1202,12 @@
return (EAGAIN); /* XXX: why not loop ? */
}
bremfree(bp);
- bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
BO_LOCK(bo);
if (nbp != NULL &&
- (nbp->b_bufobj != bo ||
+ (nbp->b_bufobj != bo ||
nbp->b_lblkno != lblkno ||
(nbp->b_xflags &
(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
@@ -1163,7 +1222,8 @@
* sync activity.
*/
int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
+ off_t length, int blksize)
{
struct buf *bp, *nbp;
int anyfreed;
@@ -1339,7 +1399,7 @@
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
(BX_VNDIRTY|BX_VNCLEAN),
("buf_vlist_remove: Buf %p is on two lists", bp));
- if (bp->b_xflags & BX_VNDIRTY)
+ if (bp->b_xflags & BX_VNDIRTY)
bv = &bp->b_bufobj->bo_dirty;
else
bv = &bp->b_bufobj->bo_clean;
@@ -1454,6 +1514,9 @@
ASSERT_VI_LOCKED(vp, "bgetvp");
vholdl(vp);
+ if (VFS_NEEDSGIANT(vp->v_mount) ||
+ vp->v_bufobj.bo_flag & BO_NEEDSGIANT)
+ bp->b_flags |= B_NEEDSGIANT;
bp->b_vp = vp;
bp->b_bufobj = &vp->v_bufobj;
/*
@@ -1488,9 +1551,10 @@
bo->bo_flag &= ~BO_ONWORKLST;
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
- syncer_worklist_len--;
+ syncer_worklist_len--;
mtx_unlock(&sync_mtx);
}
+ bp->b_flags &= ~B_NEEDSGIANT;
bp->b_vp = NULL;
bp->b_bufobj = NULL;
vdropl(vp);
@@ -1511,7 +1575,7 @@
LIST_REMOVE(bo, bo_synclist);
else {
bo->bo_flag |= BO_ONWORKLST;
- syncer_worklist_len++;
+ syncer_worklist_len++;
}
if (delay > syncer_maxdelay - 2)
@@ -1547,16 +1611,42 @@
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
static int
-sync_vnode(struct bufobj *bo, struct thread *td)
+sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
struct vnode *vp;
struct mount *mp;
+ int vfslocked;
- vp = bo->__bo_vnode; /* XXX */
- if (VOP_ISLOCKED(vp, NULL) != 0)
+ vfslocked = 0;
+restart:
+ *bo = LIST_FIRST(slp);
+ if (*bo == NULL) {
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (0);
+ }
+ vp = (*bo)->__bo_vnode; /* XXX */
+ if (VFS_NEEDSGIANT(vp->v_mount)) {
+ if (!vfslocked) {
+ vfslocked = 1;
+ if (mtx_trylock(&Giant) == 0) {
+ mtx_unlock(&sync_mtx);
+ mtx_lock(&Giant);
+ mtx_lock(&sync_mtx);
+ goto restart;
+ }
+ }
+ } else {
+ VFS_UNLOCK_GIANT(vfslocked);
+ vfslocked = 0;
+ }
+ if (VOP_ISLOCKED(vp, NULL) != 0) {
+ VFS_UNLOCK_GIANT(vfslocked);
return (1);
- if (VI_TRYLOCK(vp) == 0)
+ }
+ if (VI_TRYLOCK(vp) == 0) {
+ VFS_UNLOCK_GIANT(vfslocked);
return (1);
+ }
/*
* We use vhold in case the vnode does not
* successfully sync. vhold prevents the vnode from
@@ -1568,6 +1658,7 @@
VI_UNLOCK(vp);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
vdrop(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&sync_mtx);
return (1);
}
@@ -1576,16 +1667,17 @@
VOP_UNLOCK(vp, 0, td);
vn_finished_write(mp);
VI_LOCK(vp);
- if ((bo->bo_flag & BO_ONWORKLST) != 0) {
+ if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*/
- vn_syncer_add_to_worklist(bo, syncdelay);
+ vn_syncer_add_to_worklist(*bo, syncdelay);
}
vdropl(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&sync_mtx);
return (0);
}
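sync_vnode() now has to take Giant for filesystems that still need it, but Giant ranks ahead of sync_mtx, which the caller already holds; the trylock/back-off sequence above resolves that by dropping sync_mtx, acquiring the locks in the documented order, and restarting the scan. The same pattern in isolation, with made-up lock names, looks like this:

/* Illustrative lock back-off; outer_mtx must be taken before inner_mtx. */
static struct mtx outer_mtx;
static struct mtx inner_mtx;

static void
take_outer_while_holding_inner(void)
{
	mtx_assert(&inner_mtx, MA_OWNED);
	if (mtx_trylock(&outer_mtx) == 0) {
		/*
		 * Acquiring out of order would risk deadlock, so drop the
		 * inner lock, take both in order, and remember that any
		 * state the inner lock protected must be re-validated.
		 */
		mtx_unlock(&inner_mtx);
		mtx_lock(&outer_mtx);
		mtx_lock(&inner_mtx);
	}
	/* Both locks are held here; the caller re-checks its loop state. */
}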
@@ -1600,7 +1692,7 @@
struct synclist *slp;
struct bufobj *bo;
long starttime;
- struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
+ struct thread *td = curthread;
static int dummychan;
int last_work_seen;
int net_worklist_len;
@@ -1608,19 +1700,18 @@
int first_printf;
int error;
- mtx_lock(&Giant);
last_work_seen = 0;
syncer_final_iter = 0;
first_printf = 1;
syncer_state = SYNCER_RUNNING;
- starttime = time_second;
+ starttime = time_uptime;
td->td_pflags |= TDP_NORUNNINGBUF;
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
SHUTDOWN_PRI_LAST);
+ mtx_lock(&sync_mtx);
for (;;) {
- mtx_lock(&sync_mtx);
if (syncer_state == SYNCER_FINAL_DELAY &&
syncer_final_iter == 0) {
mtx_unlock(&sync_mtx);
@@ -1629,14 +1720,14 @@
}
net_worklist_len = syncer_worklist_len - sync_vnode_count;
if (syncer_state != SYNCER_RUNNING &&
- starttime != time_second) {
+ starttime != time_uptime) {
if (first_printf) {
printf("\nSyncing disks, vnodes remaining...");
first_printf = 0;
}
printf("%d ", net_worklist_len);
}
- starttime = time_second;
+ starttime = time_uptime;
/*
* Push files whose dirty time has expired. Be careful
@@ -1652,7 +1743,7 @@
next = &syncer_workitem_pending[syncer_delayno];
/*
* If the worklist has wrapped since the
- * it was emptied of all but syncer vnodes,
+ * it was emptied of all but syncer vnodes,
* switch to the FINAL_DELAY state and run
* for one more second.
*/
@@ -1675,8 +1766,8 @@
last_work_seen = syncer_delayno;
if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
syncer_state = SYNCER_SHUTTING_DOWN;
- while ((bo = LIST_FIRST(slp)) != NULL) {
- error = sync_vnode(bo, td);
+ while (!LIST_EMPTY(slp)) {
+ error = sync_vnode(slp, &bo, td);
if (error == 1) {
LIST_REMOVE(bo, bo_synclist);
LIST_INSERT_HEAD(next, bo, bo_synclist);
@@ -1685,7 +1776,6 @@
}
if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
syncer_final_iter--;
- mtx_unlock(&sync_mtx);
/*
* The variable rushjob allows the kernel to speed up the
* processing of the filesystem syncer process. A rushjob
@@ -1696,15 +1786,12 @@
* ahead of the disk that the kernel memory pool is being
* threatened with exhaustion.
*/
- mtx_lock(&sync_mtx);
if (rushjob > 0) {
rushjob -= 1;
- mtx_unlock(&sync_mtx);
continue;
}
- mtx_unlock(&sync_mtx);
/*
- * Just sleep for a short period if time between
+ * Just sleep for a short period of time between
* iterations when shutting down to allow some I/O
* to happen.
*
@@ -1716,10 +1803,10 @@
* filesystem activity.
*/
if (syncer_state != SYNCER_RUNNING)
- tsleep(&dummychan, PPAUSE, "syncfnl",
+ msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl",
hz / SYNCER_SHUTDOWN_SPEEDUP);
- else if (time_second == starttime)
- tsleep(&lbolt, PPAUSE, "syncer", 0);
+ else if (time_uptime == starttime)
+ msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0);
}
}
@@ -1729,13 +1816,12 @@
* normal turn time, otherwise it could take over the cpu.
*/
int
-speedup_syncer()
+speedup_syncer(void)
{
struct thread *td;
int ret = 0;
td = FIRST_THREAD_IN_PROC(updateproc);
- sleepq_remove(td, &lbolt);
mtx_lock(&sync_mtx);
if (rushjob < syncdelay / 2) {
rushjob += 1;
@@ -1743,6 +1829,7 @@
ret = 1;
}
mtx_unlock(&sync_mtx);
+ sleepq_remove(td, &lbolt);
return (ret);
}
@@ -1758,11 +1845,11 @@
if (howto & RB_NOSYNC)
return;
td = FIRST_THREAD_IN_PROC(updateproc);
- sleepq_remove(td, &lbolt);
mtx_lock(&sync_mtx);
syncer_state = SYNCER_SHUTTING_DOWN;
rushjob = 0;
mtx_unlock(&sync_mtx);
+ sleepq_remove(td, &lbolt);
kproc_shutdown(arg, howto);
}
@@ -1827,7 +1914,7 @@
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
- syncer_worklist_len--;
+ syncer_worklist_len--;
mtx_unlock(&sync_mtx);
bo->bo_flag &= ~BO_ONWORKLST;
}
@@ -2031,8 +2118,7 @@
* If count drops to zero, call inactive routine and return to freelist.
*/
void
-vrele(vp)
- struct vnode *vp;
+vrele(struct vnode *vp)
{
struct thread *td = curthread; /* XXX */
@@ -2086,11 +2172,10 @@
/*
 * Release an already locked vnode. This gives the same effects as
* unlock+vrele(), but takes less time and avoids releasing and
- * re-aquiring the lock (as vrele() aquires the lock internally.)
+ * re-acquiring the lock (as vrele() acquires the lock internally.)
*/
void
-vput(vp)
- struct vnode *vp;
+vput(struct vnode *vp)
{
struct thread *td = curthread; /* XXX */
int error;
@@ -2180,10 +2265,11 @@
* the vnode we will free it if it has been vgone'd otherwise it is
* placed on the free list.
*/
-static void
+void
vdropl(struct vnode *vp)
{
+ ASSERT_VI_LOCKED(vp, "vdropl");
if (vp->v_holdcnt <= 0)
panic("vdrop: holdcnt %d", vp->v_holdcnt);
vp->v_holdcnt--;
@@ -2247,11 +2333,7 @@
#endif
int
-vflush(mp, rootrefs, flags, td)
- struct mount *mp;
- int rootrefs;
- int flags;
- struct thread *td;
+vflush( struct mount *mp, int rootrefs, int flags, struct thread *td)
{
struct vnode *vp, *mvp, *rootvp = NULL;
struct vattr vattr;
@@ -2405,11 +2487,8 @@
CTR1(KTR_VFS, "vgonel: vp %p", vp);
ASSERT_VOP_LOCKED(vp, "vgonel");
ASSERT_VI_LOCKED(vp, "vgonel");
-#if 0
- /* XXX Need to fix ttyvp before I enable this. */
VNASSERT(vp->v_holdcnt, vp,
("vgonel: vp %p has no reference.", vp));
-#endif
td = curthread;
/*
@@ -2476,8 +2555,7 @@
* Calculate the total number of references to a special device.
*/
int
-vcount(vp)
- struct vnode *vp;
+vcount(struct vnode *vp)
{
int count;
@@ -2491,8 +2569,7 @@
* Same as above, but using the struct cdev *as argument
*/
int
-count_dev(dev)
- struct cdev *dev;
+count_dev(struct cdev *dev)
{
int count;
@@ -2513,7 +2590,8 @@
vn_printf(struct vnode *vp, const char *fmt, ...)
{
va_list ap;
- char buf[96];
+ char buf[256], buf2[16];
+ u_long flags;
va_start(ap, fmt);
vprintf(fmt, ap);
@@ -2525,15 +2603,54 @@
buf[0] = '\0';
buf[1] = '\0';
if (vp->v_vflag & VV_ROOT)
- strcat(buf, "|VV_ROOT");
+ strlcat(buf, "|VV_ROOT", sizeof(buf));
+ if (vp->v_vflag & VV_ISTTY)
+ strlcat(buf, "|VV_ISTTY", sizeof(buf));
+ if (vp->v_vflag & VV_NOSYNC)
+ strlcat(buf, "|VV_NOSYNC", sizeof(buf));
+ if (vp->v_vflag & VV_CACHEDLABEL)
+ strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
if (vp->v_vflag & VV_TEXT)
- strcat(buf, "|VV_TEXT");
+ strlcat(buf, "|VV_TEXT", sizeof(buf));
+ if (vp->v_vflag & VV_COPYONWRITE)
+ strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
if (vp->v_vflag & VV_SYSTEM)
- strcat(buf, "|VV_SYSTEM");
+ strlcat(buf, "|VV_SYSTEM", sizeof(buf));
+ if (vp->v_vflag & VV_PROCDEP)
+ strlcat(buf, "|VV_PROCDEP", sizeof(buf));
+ if (vp->v_vflag & VV_NOKNOTE)
+ strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
+ if (vp->v_vflag & VV_DELETED)
+ strlcat(buf, "|VV_DELETED", sizeof(buf));
+ if (vp->v_vflag & VV_MD)
+ strlcat(buf, "|VV_MD", sizeof(buf));
+ flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
+ VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
+ VV_NOKNOTE | VV_DELETED | VV_MD);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ if (vp->v_iflag & VI_MOUNT)
+ strlcat(buf, "|VI_MOUNT", sizeof(buf));
+ if (vp->v_iflag & VI_AGE)
+ strlcat(buf, "|VI_AGE", sizeof(buf));
if (vp->v_iflag & VI_DOOMED)
- strcat(buf, "|VI_DOOMED");
+ strlcat(buf, "|VI_DOOMED", sizeof(buf));
if (vp->v_iflag & VI_FREE)
- strcat(buf, "|VI_FREE");
+ strlcat(buf, "|VI_FREE", sizeof(buf));
+ if (vp->v_iflag & VI_OBJDIRTY)
+ strlcat(buf, "|VI_OBJDIRTY", sizeof(buf));
+ if (vp->v_iflag & VI_DOINGINACT)
+ strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
+ if (vp->v_iflag & VI_OWEINACT)
+ strlcat(buf, "|VI_OWEINACT", sizeof(buf));
+ flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+ VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
printf(" flags (%s)\n", buf + 1);
if (mtx_owned(VI_MTX(vp)))
printf(" VI_LOCKed");
@@ -2549,7 +2666,6 @@
}
#ifdef DDB
-#include <ddb/ddb.h>
/*
* List all of the locked vnodes in the system.
* Called when debugging the kernel.
@@ -2575,7 +2691,20 @@
nmp = TAILQ_NEXT(mp, mnt_list);
}
}
-#endif
+
+/*
+ * Show details about the given vnode.
+ */
+DB_SHOW_COMMAND(vnode, db_show_vnode)
+{
+ struct vnode *vp;
+
+ if (!have_addr)
+ return;
+ vp = (struct vnode *)addr;
+ vn_printf(vp, "vnode ");
+}
+#endif /* DDB */
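The new "show vnode" command makes vn_printf() reachable from DDB: given a vnode address (for instance one printed by "show lockedvnods"), "show vnode 0xc4a1b000" dumps its type, counts and the decoded flag strings built above; the address here is of course made up.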
/*
* Fill in a struct xvfsconf based on a struct vfsconf.
@@ -2791,7 +2920,7 @@
* of mounting to avoid dependencies.
*/
void
-vfs_unmountall()
+vfs_unmountall(void)
{
struct mount *mp;
struct thread *td;
@@ -2839,7 +2968,6 @@
struct vnode *vp, *mvp;
struct vm_object *obj;
- (void) vn_start_write(NULL, &mp, V_WAIT);
MNT_ILOCK(mp);
MNT_VNODE_FOREACH(vp, mp, mvp) {
VI_LOCK(vp);
@@ -2870,7 +2998,6 @@
VI_UNLOCK(vp);
}
MNT_IUNLOCK(mp);
- vn_finished_write(mp);
}
/*
@@ -2945,10 +3072,7 @@
* to avoid race conditions.)
*/
int
-vn_pollrecord(vp, td, events)
- struct vnode *vp;
- struct thread *td;
- short events;
+vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
if (vp->v_pollinfo == NULL)
@@ -2988,7 +3112,7 @@
.vop_fsync = sync_fsync, /* fsync */
.vop_inactive = sync_inactive, /* inactive */
.vop_reclaim = sync_reclaim, /* reclaim */
- .vop_lock = vop_stdlock, /* lock */
+ .vop_lock1 = vop_stdlock, /* lock */
.vop_unlock = vop_stdunlock, /* unlock */
.vop_islocked = vop_stdislocked, /* islocked */
};
@@ -2997,8 +3121,7 @@
* Create a new filesystem syncer vnode for the specified mount point.
*/
int
-vfs_allocate_syncvnode(mp)
- struct mount *mp;
+vfs_allocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
static long start, incr, next;
@@ -3010,6 +3133,9 @@
return (error);
}
vp->v_type = VNON;
+ error = insmntque(vp, mp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: insmntque failed");
/*
* Place the vnode onto the syncer worklist. We attempt to
* scatter them about on the list so that they will go off
@@ -3042,18 +3168,12 @@
* Do a lazy sync of the filesystem.
*/
static int
-sync_fsync(ap)
- struct vop_fsync_args /* {
- struct vnode *a_vp;
- struct ucred *a_cred;
- int a_waitfor;
- struct thread *a_td;
- } */ *ap;
+sync_fsync(struct vop_fsync_args *ap)
{
struct vnode *syncvp = ap->a_vp;
struct mount *mp = syncvp->v_mount;
struct thread *td = ap->a_td;
- int error, asyncflag;
+ int error;
struct bufobj *bo;
/*
@@ -3083,12 +3203,17 @@
vfs_unbusy(mp, td);
return (0);
}
- asyncflag = mp->mnt_flag & MNT_ASYNC;
- mp->mnt_flag &= ~MNT_ASYNC;
+ MNT_ILOCK(mp);
+ mp->mnt_noasync++;
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
vfs_msync(mp, MNT_NOWAIT);
error = VFS_SYNC(mp, MNT_LAZY, td);
- if (asyncflag)
- mp->mnt_flag |= MNT_ASYNC;
+ MNT_ILOCK(mp);
+ mp->mnt_noasync--;
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
vn_finished_write(mp);
vfs_unbusy(mp, td);
return (error);
@@ -3098,11 +3223,7 @@
 * The syncer vnode is no longer referenced.
*/
static int
-sync_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
+sync_inactive(struct vop_inactive_args *ap)
{
vgone(ap->a_vp);
@@ -3115,10 +3236,7 @@
* Modifications to the worklist must be protected by sync_mtx.
*/
static int
-sync_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- } */ *ap;
+sync_reclaim(struct vop_reclaim_args *ap)
{
struct vnode *vp = ap->a_vp;
struct bufobj *bo;
@@ -3129,7 +3247,7 @@
if (bo->bo_flag & BO_ONWORKLST) {
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
- syncer_worklist_len--;
+ syncer_worklist_len--;
sync_vnode_count--;
mtx_unlock(&sync_mtx);
bo->bo_flag &= ~BO_ONWORKLST;
@@ -3143,9 +3261,7 @@
* Check if vnode represents a disk device
*/
int
-vn_isdisk(vp, errp)
- struct vnode *vp;
- int *errp;
+vn_isdisk(struct vnode *vp, int *errp)
{
int error;
@@ -3171,21 +3287,16 @@
* and optional call-by-reference privused argument allowing vaccess()
* to indicate to the caller whether privilege was used to satisfy the
* request (obsoleted). Returns 0 on success, or an errno on failure.
+ *
+ * The ifdef'd CAPABILITIES version is here for reference, but is not
+ * actually used.
*/
int
-vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
- enum vtype type;
- mode_t file_mode;
- uid_t file_uid;
- gid_t file_gid;
- mode_t acc_mode;
- struct ucred *cred;
- int *privused;
+vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ mode_t acc_mode, struct ucred *cred, int *privused)
{
mode_t dac_granted;
-#ifdef CAPABILITIES
- mode_t cap_granted;
-#endif
+ mode_t priv_granted;
/*
* Look for a normal, non-privileged way to access the file/directory
@@ -3239,56 +3350,46 @@
return (0);
privcheck:
- if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
- /* XXX audit: privilege used */
- if (privused != NULL)
- *privused = 1;
- return (0);
- }
-
-#ifdef CAPABILITIES
/*
- * Build a capability mask to determine if the set of capabilities
+ * Build a privilege mask to determine if the set of privileges
* satisfies the requirements when combined with the granted mask
- * from above.
- * For each capability, if the capability is required, bitwise
- * or the request type onto the cap_granted mask.
+ * from above. For each privilege, if the privilege is required,
+ * bitwise or the request type onto the priv_granted mask.
*/
- cap_granted = 0;
+ priv_granted = 0;
if (type == VDIR) {
/*
- * For directories, use CAP_DAC_READ_SEARCH to satisfy
- * VEXEC requests, instead of CAP_DAC_EXECUTE.
+ * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
+ * requests, instead of PRIV_VFS_EXEC.
*/
if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
- !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
- cap_granted |= VEXEC;
+ !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
} else {
if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
- !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
- cap_granted |= VEXEC;
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
}
if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
- !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
- cap_granted |= VREAD;
+ !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
- !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
- cap_granted |= (VWRITE | VAPPEND);
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
- !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
- cap_granted |= VADMIN;
+ !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
- if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
+ if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) {
/* XXX audit: privilege used */
if (privused != NULL)
*privused = 1;
return (0);
}
-#endif
return ((acc_mode & VADMIN) ? EPERM : EACCES);
}
@@ -3298,8 +3399,8 @@
* permissions.
*/
int
-extattr_check_cred(struct vnode *vp, int attrnamespace,
- struct ucred *cred, struct thread *td, int access)
+extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
+ struct thread *td, int access)
{
/*
@@ -3309,16 +3410,13 @@
return (0);
/*
- * Do not allow privileged processes in jail to directly
- * manipulate system attributes.
- *
- * XXX What capability should apply here?
- * Probably CAP_SYS_SETFFLAG.
+ * Do not allow privileged processes in jail to directly manipulate
+ * system attributes.
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
/* Potentially should be: return (EPERM); */
- return (suser_cred(cred, 0));
+ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
case EXTATTR_NAMESPACE_USER:
return (VOP_ACCESS(vp, access, cred, td));
default:
@@ -3438,10 +3536,10 @@
ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
/* Check the source (from). */
- if (a->a_tdvp != a->a_fdvp)
+ if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp)
ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
if (a->a_tvp != a->a_fvp)
- ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked");
+ ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
/* Check the target. */
if (a->a_tvp)
@@ -3521,7 +3619,7 @@
vop_lock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
- struct vop_lock_args *a = ap;
+ struct vop_lock1_args *a = ap;
if ((a->a_flags & LK_INTERLOCK) == 0)
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
@@ -3534,7 +3632,7 @@
vop_lock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
- struct vop_lock_args *a = ap;
+ struct vop_lock1_args *a = ap;
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
if (rc == 0)
@@ -3571,16 +3669,16 @@
struct vop_create_args *a = ap;
if (!rc)
- VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_link_post(void *ap, int rc)
{
struct vop_link_args *a = ap;
-
+
if (!rc) {
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
}
}
@@ -3659,7 +3757,7 @@
vop_symlink_post(void *ap, int rc)
{
struct vop_symlink_args *a = ap;
-
+
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
@@ -3730,14 +3828,17 @@
/* ensure that a specific sysctl goes to the right filesystem. */
if (strcmp(vc.vc_fstypename, "*") != 0 &&
strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
+ vfs_rel(mp);
return (EINVAL);
}
VCTLTOREQ(&vc, req);
- return (VFS_SYSCTL(mp, vc.vc_op, req));
+ error = VFS_SYSCTL(mp, vc.vc_op, req);
+ vfs_rel(mp);
+ return (error);
}
-SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
- NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
+SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "",
+ "Sysctl by fsid");
/*
* Function to initialize a va_filerev field sensibly.
@@ -3792,7 +3893,7 @@
{
struct vnode *vp = ap->a_vp;
struct knote *kn = ap->a_kn;
- struct knlist *knl;
+ struct knlist *knl;
switch (kn->kn_filter) {
case EVFILT_READ:
@@ -3848,7 +3949,7 @@
return (1);
}
- if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
+ if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
return (0);
kn->kn_data = va.va_size - kn->kn_fp->f_offset;
--- sys/kern/kern_acl.c
+++ /dev/null
@@ -1,1036 +0,0 @@
-/*-
- * Copyright (c) 1999-2003 Robert N. M. Watson
- * All rights reserved.
- *
- * This software was developed by Robert Watson for the TrustedBSD Project.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-/*
- * Developed by the TrustedBSD Project.
- * Support for POSIX.1e access control lists.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_acl.c,v 1.45.8.2 2005/11/13 03:14:00 csjp Exp $");
-
-#include "opt_mac.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysproto.h>
-#include <sys/kernel.h>
-#include <sys/mac.h>
-#include <sys/malloc.h>
-#include <sys/mount.h>
-#include <sys/vnode.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/namei.h>
-#include <sys/file.h>
-#include <sys/filedesc.h>
-#include <sys/proc.h>
-#include <sys/sysent.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-
-#include <vm/uma.h>
-
-uma_zone_t acl_zone;
-static int vacl_set_acl(struct thread *td, struct vnode *vp,
- acl_type_t type, struct acl *aclp);
-static int vacl_get_acl(struct thread *td, struct vnode *vp,
- acl_type_t type, struct acl *aclp);
-static int vacl_aclcheck(struct thread *td, struct vnode *vp,
- acl_type_t type, struct acl *aclp);
-
-/*
- * Implement a version of vaccess() that understands POSIX.1e ACL semantics.
- * Return 0 on success, else an errno value. Should be merged into
- * vaccess() eventually.
- */
-int
-vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
- struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
-{
- struct acl_entry *acl_other, *acl_mask;
- mode_t dac_granted;
- mode_t cap_granted;
- mode_t acl_mask_granted;
- int group_matched, i;
-
- /*
- * Look for a normal, non-privileged way to access the file/directory
- * as requested. If it exists, go with that. Otherwise, attempt
- * to use privileges granted via cap_granted. In some cases,
- * which privileges to use may be ambiguous due to "best match",
- * in which case fall back on first match for the time being.
- */
- if (privused != NULL)
- *privused = 0;
-
- /*
- * Determine privileges now, but don't apply until we've found
- * a DAC entry that matches but has failed to allow access.
- */
-#ifndef CAPABILITIES
- if (suser_cred(cred, SUSER_ALLOWJAIL) == 0)
- cap_granted = VALLPERM;
- else
- cap_granted = 0;
-#else
- cap_granted = 0;
-
- if (type == VDIR) {
- if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
- CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
- cap_granted |= VEXEC;
- } else {
- if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
- CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
- cap_granted |= VEXEC;
- }
-
- if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH,
- SUSER_ALLOWJAIL))
- cap_granted |= VREAD;
-
- if (((acc_mode & VWRITE) || (acc_mode & VAPPEND)) &&
- !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
- cap_granted |= (VWRITE | VAPPEND);
-
- if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER,
- SUSER_ALLOWJAIL))
- cap_granted |= VADMIN;
-#endif /* CAPABILITIES */
-
- /*
- * The owner matches if the effective uid associated with the
- * credential matches that of the ACL_USER_OBJ entry. While we're
- * doing the first scan, also cache the location of the ACL_MASK
- * and ACL_OTHER entries, preventing some future iterations.
- */
- acl_mask = acl_other = NULL;
- for (i = 0; i < acl->acl_cnt; i++) {
- switch (acl->acl_entry[i].ae_tag) {
- case ACL_USER_OBJ:
- if (file_uid != cred->cr_uid)
- break;
- dac_granted = 0;
- dac_granted |= VADMIN;
- if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl->acl_entry[i].ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl->acl_entry[i].ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
- if ((acc_mode & dac_granted) == acc_mode)
- return (0);
- if ((acc_mode & (dac_granted | cap_granted)) ==
- acc_mode) {
- if (privused != NULL)
- *privused = 1;
- return (0);
- }
- goto error;
-
- case ACL_MASK:
- acl_mask = &acl->acl_entry[i];
- break;
-
- case ACL_OTHER:
- acl_other = &acl->acl_entry[i];
- break;
-
- default:
- break;
- }
- }
-
- /*
- * An ACL_OTHER entry should always exist in a valid access
- * ACL. If it doesn't, then generate a serious failure. For now,
- * this means a debugging message and EPERM, but in the future
- * should probably be a panic.
- */
- if (acl_other == NULL) {
- /*
- * XXX This should never happen
- */
- printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
- return (EPERM);
- }
-
- /*
- * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields
- * are masked by an ACL_MASK entry, if any. As such, first identify
- * the ACL_MASK field, then iterate through identifying potential
- * user matches, then group matches. If there is no ACL_MASK,
- * assume that the mask allows all requests to succeed.
- */
- if (acl_mask != NULL) {
- acl_mask_granted = 0;
- if (acl_mask->ae_perm & ACL_EXECUTE)
- acl_mask_granted |= VEXEC;
- if (acl_mask->ae_perm & ACL_READ)
- acl_mask_granted |= VREAD;
- if (acl_mask->ae_perm & ACL_WRITE)
- acl_mask_granted |= (VWRITE | VAPPEND);
- } else
- acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
-
- /*
- * Iterate through user ACL entries. Do checks twice, first
- * without privilege, and then if a match is found but failed,
- * a second time with privilege.
- */
-
- /*
- * Check ACL_USER ACL entries.
- */
- for (i = 0; i < acl->acl_cnt; i++) {
- switch (acl->acl_entry[i].ae_tag) {
- case ACL_USER:
- if (acl->acl_entry[i].ae_id != cred->cr_uid)
- break;
- dac_granted = 0;
- if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl->acl_entry[i].ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl->acl_entry[i].ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
- dac_granted &= acl_mask_granted;
- if ((acc_mode & dac_granted) == acc_mode)
- return (0);
- if ((acc_mode & (dac_granted | cap_granted)) !=
- acc_mode)
- goto error;
-
- if (privused != NULL)
- *privused = 1;
- return (0);
- }
- }
-
- /*
- * Group match is best-match, not first-match, so find a
- * "best" match. Iterate across, testing each potential group
- * match. Make sure we keep track of whether we found a match
- * or not, so that we know if we should try again with any
- * available privilege, or if we should move on to ACL_OTHER.
- */
- group_matched = 0;
- for (i = 0; i < acl->acl_cnt; i++) {
- switch (acl->acl_entry[i].ae_tag) {
- case ACL_GROUP_OBJ:
- if (!groupmember(file_gid, cred))
- break;
- dac_granted = 0;
- if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl->acl_entry[i].ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl->acl_entry[i].ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
- dac_granted &= acl_mask_granted;
-
- if ((acc_mode & dac_granted) == acc_mode)
- return (0);
-
- group_matched = 1;
- break;
-
- case ACL_GROUP:
- if (!groupmember(acl->acl_entry[i].ae_id, cred))
- break;
- dac_granted = 0;
- if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl->acl_entry[i].ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl->acl_entry[i].ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
- dac_granted &= acl_mask_granted;
-
- if ((acc_mode & dac_granted) == acc_mode)
- return (0);
-
- group_matched = 1;
- break;
-
- default:
- break;
- }
- }
-
- if (group_matched == 1) {
- /*
- * There was a match, but it did not grant rights via
- * pure DAC. Try again, this time with privilege.
- */
- for (i = 0; i < acl->acl_cnt; i++) {
- switch (acl->acl_entry[i].ae_tag) {
- case ACL_GROUP_OBJ:
- if (!groupmember(file_gid, cred))
- break;
- dac_granted = 0;
- if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl->acl_entry[i].ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl->acl_entry[i].ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
- dac_granted &= acl_mask_granted;
-
- if ((acc_mode & (dac_granted | cap_granted)) !=
- acc_mode)
- break;
-
- if (privused != NULL)
- *privused = 1;
- return (0);
-
- case ACL_GROUP:
- if (!groupmember(acl->acl_entry[i].ae_id,
- cred))
- break;
- dac_granted = 0;
- if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl->acl_entry[i].ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl->acl_entry[i].ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
- dac_granted &= acl_mask_granted;
-
- if ((acc_mode & (dac_granted | cap_granted)) !=
- acc_mode)
- break;
-
- if (privused != NULL)
- *privused = 1;
- return (0);
-
- default:
- break;
- }
- }
- /*
- * Even with privilege, group membership was not sufficient.
- * Return failure.
- */
- goto error;
- }
-
- /*
- * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
- */
- dac_granted = 0;
- if (acl_other->ae_perm & ACL_EXECUTE)
- dac_granted |= VEXEC;
- if (acl_other->ae_perm & ACL_READ)
- dac_granted |= VREAD;
- if (acl_other->ae_perm & ACL_WRITE)
- dac_granted |= (VWRITE | VAPPEND);
-
- if ((acc_mode & dac_granted) == acc_mode)
- return (0);
- if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) {
- if (privused != NULL)
- *privused = 1;
- return (0);
- }
-
-error:
- return ((acc_mode & VADMIN) ? EPERM : EACCES);
-}
-
-/*
- * For the purposes of filesystems maintaining the _OBJ entries in an
- * inode with a mode_t field, this routine converts a mode_t entry
- * to an acl_perm_t.
- */
-acl_perm_t
-acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
-{
- acl_perm_t perm = 0;
-
- switch(tag) {
- case ACL_USER_OBJ:
- if (mode & S_IXUSR)
- perm |= ACL_EXECUTE;
- if (mode & S_IRUSR)
- perm |= ACL_READ;
- if (mode & S_IWUSR)
- perm |= ACL_WRITE;
- return (perm);
-
- case ACL_GROUP_OBJ:
- if (mode & S_IXGRP)
- perm |= ACL_EXECUTE;
- if (mode & S_IRGRP)
- perm |= ACL_READ;
- if (mode & S_IWGRP)
- perm |= ACL_WRITE;
- return (perm);
-
- case ACL_OTHER:
- if (mode & S_IXOTH)
- perm |= ACL_EXECUTE;
- if (mode & S_IROTH)
- perm |= ACL_READ;
- if (mode & S_IWOTH)
- perm |= ACL_WRITE;
- return (perm);
-
- default:
- printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
- return (0);
- }
-}
-
-/*
- * Given inode information (uid, gid, mode), return an acl entry of the
- * appropriate type.
- */
-struct acl_entry
-acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
-{
- struct acl_entry acl_entry;
-
- acl_entry.ae_tag = tag;
- acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
- switch(tag) {
- case ACL_USER_OBJ:
- acl_entry.ae_id = uid;
- break;
-
- case ACL_GROUP_OBJ:
- acl_entry.ae_id = gid;
- break;
-
- case ACL_OTHER:
- acl_entry.ae_id = ACL_UNDEFINED_ID;
- break;
-
- default:
- acl_entry.ae_id = ACL_UNDEFINED_ID;
- printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
- }
-
- return (acl_entry);
-}
-
-/*
- * Utility function to generate a file mode given appropriate ACL entries.
- */
-mode_t
-acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
- struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
-{
- mode_t mode;
-
- mode = 0;
- if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
- mode |= S_IXUSR;
- if (acl_user_obj_entry->ae_perm & ACL_READ)
- mode |= S_IRUSR;
- if (acl_user_obj_entry->ae_perm & ACL_WRITE)
- mode |= S_IWUSR;
- if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
- mode |= S_IXGRP;
- if (acl_group_obj_entry->ae_perm & ACL_READ)
- mode |= S_IRGRP;
- if (acl_group_obj_entry->ae_perm & ACL_WRITE)
- mode |= S_IWGRP;
- if (acl_other_entry->ae_perm & ACL_EXECUTE)
- mode |= S_IXOTH;
- if (acl_other_entry->ae_perm & ACL_READ)
- mode |= S_IROTH;
- if (acl_other_entry->ae_perm & ACL_WRITE)
- mode |= S_IWOTH;
-
- return (mode);
-}
-
-/*
- * Utility function to generate a file mode given a complete POSIX.1e
- * access ACL. Note that if the ACL is improperly formed, this may
- * result in a panic.
- */
-mode_t
-acl_posix1e_acl_to_mode(struct acl *acl)
-{
- struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other;
- int i;
-
- /*
- * Find the ACL entries relevant to a POSIX permission mode.
- */
- acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL;
- for (i = 0; i < acl->acl_cnt; i++) {
- switch (acl->acl_entry[i].ae_tag) {
- case ACL_USER_OBJ:
- acl_user_obj = &acl->acl_entry[i];
- break;
-
- case ACL_GROUP_OBJ:
- acl_group_obj = &acl->acl_entry[i];
- break;
-
- case ACL_OTHER:
- acl_other = &acl->acl_entry[i];
- break;
-
- case ACL_MASK:
- acl_mask = &acl->acl_entry[i];
- break;
-
- case ACL_USER:
- case ACL_GROUP:
- break;
-
- default:
- panic("acl_posix1e_acl_to_mode: bad ae_tag");
- }
- }
-
- if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL)
- panic("acl_posix1e_acl_to_mode: missing base ae_tags");
-
- /*
- * POSIX.1e specifies that if there is an ACL_MASK entry, we replace
- * the mode "group" bits with its permissions. If there isn't, we
- * use the ACL_GROUP_OBJ permissions.
- */
- if (acl_mask != NULL)
- return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask,
- acl_other));
- else
- return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj,
- acl_other));
-}
-
-/*
- * Perform a syntactic check of the ACL, sufficient to allow an
- * implementing filesystem to determine if it should accept this and
- * rely on the POSIX.1e ACL properties.
- */
-int
-acl_posix1e_check(struct acl *acl)
-{
- int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
- int num_acl_mask, num_acl_other, i;
-
- /*
- * Verify that the number of entries does not exceed the maximum
- * defined for acl_t.
- * Verify that the correct number of various sorts of ae_tags are
- * present:
- * Exactly one ACL_USER_OBJ
- * Exactly one ACL_GROUP_OBJ
- * Exactly one ACL_OTHER
- * If any ACL_USER or ACL_GROUP entries appear, then exactly one
- * ACL_MASK entry must also appear.
- * Verify that all ae_perm entries are in ACL_PERM_BITS.
- * Verify all ae_tag entries are understood by this implementation.
- * Note: Does not check for uniqueness of qualifier (ae_id) field.
- */
- num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
- num_acl_mask = num_acl_other = 0;
- if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
- return (EINVAL);
- for (i = 0; i < acl->acl_cnt; i++) {
- /*
- * Check for a valid tag.
- */
- switch(acl->acl_entry[i].ae_tag) {
- case ACL_USER_OBJ:
- acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
- if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
- return (EINVAL);
- num_acl_user_obj++;
- break;
- case ACL_GROUP_OBJ:
- acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
- if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
- return (EINVAL);
- num_acl_group_obj++;
- break;
- case ACL_USER:
- if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
- return (EINVAL);
- num_acl_user++;
- break;
- case ACL_GROUP:
- if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
- return (EINVAL);
- num_acl_group++;
- break;
- case ACL_OTHER:
- acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
- if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
- return (EINVAL);
- num_acl_other++;
- break;
- case ACL_MASK:
- acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
- if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
- return (EINVAL);
- num_acl_mask++;
- break;
- default:
- return (EINVAL);
- }
- /*
- * Check for valid perm entries.
- */
- if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
- ACL_PERM_BITS)
- return (EINVAL);
- }
- if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
- (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
- return (EINVAL);
- if (((num_acl_group != 0) || (num_acl_user != 0)) &&
- (num_acl_mask != 1))
- return (EINVAL);
- return (0);
-}
-
-/*
- * Given a requested mode for a new object, and a default ACL, combine
- * the two to produce a new mode. Be careful not to clear any bits that
- * aren't intended to be affected by the POSIX.1e ACL. Eventually,
- * this might also take the cmask as an argument, if we push that down
- * into per-filesystem-code.
- */
-mode_t
-acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl)
-{
- mode_t mode;
-
- mode = cmode;
- /*
- * The current composition policy is that a permission bit must
- * be set in *both* the ACL and the requested creation mode for
- * it to appear in the resulting mode/ACL. First clear any
- * possibly effected bits, then reconstruct.
- */
- mode &= ACL_PRESERVE_MASK;
- mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl));
-
- return (mode);
-}
-
-/*
- * These calls wrap the real vnode operations, and are called by the
- * syscall code once the syscall has converted the path or file
- * descriptor to a vnode (unlocked). The aclp pointer is assumed
- * still to point to userland, so this should not be consumed within
- * the kernel except by syscall code. Other code should directly
- * invoke VOP_{SET,GET}ACL.
- */
-
-/*
- * Given a vnode, set its ACL.
- */
-static int
-vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
- struct acl *aclp)
-{
- struct acl inkernacl;
- struct mount *mp;
- int error;
-
- error = copyin(aclp, &inkernacl, sizeof(struct acl));
- if (error)
- return(error);
- error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error != 0)
- return (error);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-#ifdef MAC
- error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl);
- if (error != 0)
- goto out;
-#endif
- error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
-#ifdef MAC
-out:
-#endif
- VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
- return(error);
-}
-
-/*
- * Given a vnode, get its ACL.
- */
-static int
-vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
- struct acl *aclp)
-{
- struct acl inkernelacl;
- int error;
-
- VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-#ifdef MAC
- error = mac_check_vnode_getacl(td->td_ucred, vp, type);
- if (error != 0)
- goto out;
-#endif
- error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
-#ifdef MAC
-out:
-#endif
- VOP_UNLOCK(vp, 0, td);
- if (error == 0)
- error = copyout(&inkernelacl, aclp, sizeof(struct acl));
- return (error);
-}
-
-/*
- * Given a vnode, delete its ACL.
- */
-static int
-vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
-{
- struct mount *mp;
- int error;
-
- error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error)
- return (error);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-#ifdef MAC
- error = mac_check_vnode_deleteacl(td->td_ucred, vp, type);
- if (error)
- goto out;
-#endif
- error = VOP_SETACL(vp, type, 0, td->td_ucred, td);
-#ifdef MAC
-out:
-#endif
- VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
- return (error);
-}
-
-/*
- * Given a vnode, check whether an ACL is appropriate for it
- */
-static int
-vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
- struct acl *aclp)
-{
- struct acl inkernelacl;
- int error;
-
- error = copyin(aclp, &inkernelacl, sizeof(struct acl));
- if (error)
- return(error);
- error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
- return (error);
-}
-
-/*
- * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.
- * Don't need to lock, as the vacl_ code will get/release any locks
- * required.
- */
-
-/*
- * Given a file path, get an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file path, get an ACL for it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file path, set an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file path, set an ACL for it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file descriptor, get an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
-{
- struct file *fp;
- int vfslocked, error;
-
- error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
- if (error == 0) {
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- }
- return (error);
-}
-
-/*
- * Given a file descriptor, set an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
-{
- struct file *fp;
- int vfslocked, error;
-
- error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
- if (error == 0) {
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- }
- return (error);
-}
-
-/*
- * Given a file path, delete an ACL from it.
- *
- * MPSAFE
- */
-int
-__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_delete(td, nd.ni_vp, uap->type);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file path, delete an ACL from it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_delete(td, nd.ni_vp, uap->type);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file path, delete an ACL from it.
- *
- * MPSAFE
- */
-int
-__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
-{
- struct file *fp;
- int vfslocked, error;
-
- error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
- if (error == 0) {
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = vacl_delete(td, fp->f_vnode, uap->type);
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- }
- return (error);
-}
-
-/*
- * Given a file path, check an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file path, check an ACL for it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
-{
- struct nameidata nd;
- int vfslocked, error;
-
- NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
- error = namei(&nd);
- vfslocked = NDHASGIANT(&nd);
- if (error == 0) {
- error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
- NDFREE(&nd, 0);
- }
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
-}
-
-/*
- * Given a file descriptor, check an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
-{
- struct file *fp;
- int vfslocked, error;
-
- error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
- if (error == 0) {
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
- error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
- fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- }
- return (error);
-}
-
-/* ARGUSED */
-
-static void
-aclinit(void *dummy __unused)
-{
-
- acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-}
-SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL)
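
For reference, the path-based wrappers removed above all share one shape: they
differ only in whether the lookup follows symlinks (FOLLOW for the *_file
variants, NOFOLLOW for the *_link ones) and in which vacl_*() helper receives
the looked-up vnode. A condensed sketch of that shape follows; acl_path_op()
is hypothetical and does not exist in the tree, but the nameidata and
Giant-handling calls are exactly the ones used above (vacl_delete() takes no
ACL pointer, so it is left out of the sketch).

static int
acl_path_op(struct thread *td, char *path, int follow, acl_type_t type,
    struct acl *aclp,
    int (*op)(struct thread *, struct vnode *, acl_type_t, struct acl *))
{
        struct nameidata nd;
        int vfslocked, error;

        /* Translate the user path to a vnode; note whether Giant was taken. */
        NDINIT(&nd, LOOKUP, MPSAFE | follow, UIO_USERSPACE, path, td);
        error = namei(&nd);
        vfslocked = NDHASGIANT(&nd);
        if (error == 0) {
                /* op is vacl_get_acl, vacl_set_acl or vacl_aclcheck. */
                error = op(td, nd.ni_vp, type, aclp);
                NDFREE(&nd, 0);
        }
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
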
Index: syscalls.master
===================================================================
RCS file: /home/cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/syscalls.master -L sys/kern/syscalls.master -u -r1.2 -r1.3
--- sys/kern/syscalls.master
+++ sys/kern/syscalls.master
@@ -1,19 +1,18 @@
- $FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp $
+ $FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp $
; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
;
; System call name/number master file.
; Processed to create init_sysent.c, syscalls.c and syscall.h.
-; Columns: number [M]type nargs name alt{name,tag,rtyp}/comments
+; Columns: number audit type name alt{name,tag,rtyp}/comments
; number system call number, must be in order
; audit the audit event associated with the system call
; A value of AUE_NULL means no auditing, but it also means that
; there is no audit event for the call at this time. For the
; case where the event exists, but we don't want auditing, the
; event should be #defined to AUE_NULL in audit_kevents.h.
-; type one of [M]STD, [M]OBSOL, [M]UNIMPL, [M]COMPAT, [M]CPT_NOA,
-; [M]LIBCOMPAT, [M]NODEF, [M]NOARGS, [M]NOPROTO, [M]NOIMPL,
-; [M]NOSTD, [M]COMPAT4
+; type one of STD, OBSOL, UNIMPL, COMPAT, CPT_NOA, LIBCOMPAT,
+; NODEF, NOARGS, NOPROTO, NOIMPL, NOSTD, COMPAT4
; name pseudo-prototype of syscall routine
; If one of the following alts is different, then all appear:
; altname name of system call if different
@@ -22,9 +21,6 @@
; for UNIMPL/OBSOL, name continues with comments
; types:
-; [M] e.g. like MSTD -- means the system call is MP-safe. If no
-; M prefix is used, the syscall wrapper will obtain the Giant
-; lock for the syscall.
; STD always included
; COMPAT included on COMPAT #ifdef
; COMPAT4 included on COMPAT4 #ifdef (FreeBSD 4 compat)
@@ -32,8 +28,8 @@
; OBSOL obsolete, not included in system, only specifies name
; UNIMPL not implemented, placeholder only
; NOSTD implemented but as a lkm that can be statically
-; compiled in sysent entry will be filled with lkmsys
-; so the SYSCALL_MODULE macro works
+; compiled in; sysent entry will be filled with lkmsys
+; so the SYSCALL_MODULE macro works
;
; Please copy any additions and changes to the following compatibility tables:
; sys/compat/freebsd32/syscalls.master
@@ -50,247 +46,247 @@
; redistributions should be placed in the reserved range at the end
; of the current calls.
-0 AUE_NULL MSTD { int nosys(void); } syscall nosys_args int
-1 AUE_NULL MSTD { void sys_exit(int rval); } exit \
+0 AUE_NULL STD { int nosys(void); } syscall nosys_args int
+1 AUE_EXIT STD { void sys_exit(int rval); } exit \
sys_exit_args void
-2 AUE_NULL MSTD { int fork(void); }
-3 AUE_NULL MSTD { ssize_t read(int fd, void *buf, \
+2 AUE_FORK STD { int fork(void); }
+3 AUE_NULL STD { ssize_t read(int fd, void *buf, \
size_t nbyte); }
-4 AUE_NULL MSTD { ssize_t write(int fd, const void *buf, \
+4 AUE_NULL STD { ssize_t write(int fd, const void *buf, \
size_t nbyte); }
-5 AUE_NULL MSTD { int open(char *path, int flags, int mode); }
+5 AUE_OPEN_RWTC STD { int open(char *path, int flags, int mode); }
; XXX should be { int open(const char *path, int flags, ...); }
; but we're not ready for `const' or varargs.
; XXX man page says `mode_t mode'.
-6 AUE_NULL MSTD { int close(int fd); }
-7 AUE_NULL MSTD { int wait4(int pid, int *status, \
+6 AUE_CLOSE STD { int close(int fd); }
+7 AUE_WAIT4 STD { int wait4(int pid, int *status, \
int options, struct rusage *rusage); } \
wait4 wait_args int
-8 AUE_NULL MCOMPAT { int creat(char *path, int mode); }
-9 AUE_NULL MSTD { int link(char *path, char *link); }
-10 AUE_NULL MSTD { int unlink(char *path); }
+8 AUE_CREAT COMPAT { int creat(char *path, int mode); }
+9 AUE_LINK STD { int link(char *path, char *link); }
+10 AUE_UNLINK STD { int unlink(char *path); }
11 AUE_NULL OBSOL execv
-12 AUE_NULL MSTD { int chdir(char *path); }
-13 AUE_NULL MSTD { int fchdir(int fd); }
-14 AUE_NULL MSTD { int mknod(char *path, int mode, int dev); }
-15 AUE_NULL MSTD { int chmod(char *path, int mode); }
-16 AUE_NULL MSTD { int chown(char *path, int uid, int gid); }
-17 AUE_NULL MSTD { int obreak(char *nsize); } break \
+12 AUE_CHDIR STD { int chdir(char *path); }
+13 AUE_FCHDIR STD { int fchdir(int fd); }
+14 AUE_MKNOD STD { int mknod(char *path, int mode, int dev); }
+15 AUE_CHMOD STD { int chmod(char *path, int mode); }
+16 AUE_CHOWN STD { int chown(char *path, int uid, int gid); }
+17 AUE_NULL STD { int obreak(char *nsize); } break \
obreak_args int
-18 AUE_NULL MCOMPAT4 { int getfsstat(struct ostatfs *buf, \
+18 AUE_GETFSSTAT COMPAT4 { int getfsstat(struct ostatfs *buf, \
long bufsize, int flags); }
-19 AUE_NULL MCOMPAT { long lseek(int fd, long offset, \
+19 AUE_LSEEK COMPAT { long lseek(int fd, long offset, \
int whence); }
-20 AUE_NULL MSTD { pid_t getpid(void); }
-21 AUE_NULL STD { int mount(char *type, char *path, \
+20 AUE_GETPID STD { pid_t getpid(void); }
+21 AUE_MOUNT STD { int mount(char *type, char *path, \
int flags, caddr_t data); }
; XXX `path' should have type `const char *' but we're not ready for that.
-22 AUE_NULL STD { int unmount(char *path, int flags); }
-23 AUE_NULL MSTD { int setuid(uid_t uid); }
-24 AUE_NULL MSTD { uid_t getuid(void); }
-25 AUE_NULL MSTD { uid_t geteuid(void); }
-26 AUE_NULL MSTD { int ptrace(int req, pid_t pid, \
+22 AUE_UMOUNT STD { int unmount(char *path, int flags); }
+23 AUE_SETUID STD { int setuid(uid_t uid); }
+24 AUE_GETUID STD { uid_t getuid(void); }
+25 AUE_GETEUID STD { uid_t geteuid(void); }
+26 AUE_PTRACE STD { int ptrace(int req, pid_t pid, \
caddr_t addr, int data); }
-27 AUE_NULL MSTD { int recvmsg(int s, struct msghdr *msg, \
+27 AUE_RECVMSG STD { int recvmsg(int s, struct msghdr *msg, \
int flags); }
-28 AUE_NULL MSTD { int sendmsg(int s, struct msghdr *msg, \
+28 AUE_SENDMSG STD { int sendmsg(int s, struct msghdr *msg, \
int flags); }
-29 AUE_NULL MSTD { int recvfrom(int s, caddr_t buf, \
+29 AUE_RECVFROM STD { int recvfrom(int s, caddr_t buf, \
size_t len, int flags, \
struct sockaddr * __restrict from, \
__socklen_t * __restrict fromlenaddr); }
-30 AUE_NULL MSTD { int accept(int s, \
+30 AUE_ACCEPT STD { int accept(int s, \
struct sockaddr * __restrict name, \
__socklen_t * __restrict anamelen); }
-31 AUE_NULL MSTD { int getpeername(int fdes, \
+31 AUE_GETPEERNAME STD { int getpeername(int fdes, \
struct sockaddr * __restrict asa, \
__socklen_t * __restrict alen); }
-32 AUE_NULL MSTD { int getsockname(int fdes, \
+32 AUE_GETSOCKNAME STD { int getsockname(int fdes, \
struct sockaddr * __restrict asa, \
__socklen_t * __restrict alen); }
-33 AUE_NULL MSTD { int access(char *path, int flags); }
-34 AUE_NULL MSTD { int chflags(char *path, int flags); }
-35 AUE_NULL MSTD { int fchflags(int fd, int flags); }
-36 AUE_NULL MSTD { int sync(void); }
-37 AUE_NULL MSTD { int kill(int pid, int signum); }
-38 AUE_NULL MCOMPAT { int stat(char *path, struct ostat *ub); }
-39 AUE_NULL MSTD { pid_t getppid(void); }
-40 AUE_NULL MCOMPAT { int lstat(char *path, struct ostat *ub); }
-41 AUE_NULL MSTD { int dup(u_int fd); }
-42 AUE_NULL MSTD { int pipe(void); }
-43 AUE_NULL MSTD { gid_t getegid(void); }
-44 AUE_NULL MSTD { int profil(caddr_t samples, size_t size, \
+33 AUE_ACCESS STD { int access(char *path, int flags); }
+34 AUE_CHFLAGS STD { int chflags(char *path, int flags); }
+35 AUE_FCHFLAGS STD { int fchflags(int fd, int flags); }
+36 AUE_SYNC STD { int sync(void); }
+37 AUE_KILL STD { int kill(int pid, int signum); }
+38 AUE_STAT COMPAT { int stat(char *path, struct ostat *ub); }
+39 AUE_GETPPID STD { pid_t getppid(void); }
+40 AUE_LSTAT COMPAT { int lstat(char *path, struct ostat *ub); }
+41 AUE_DUP STD { int dup(u_int fd); }
+42 AUE_PIPE STD { int pipe(void); }
+43 AUE_GETEGID STD { gid_t getegid(void); }
+44 AUE_PROFILE STD { int profil(caddr_t samples, size_t size, \
size_t offset, u_int scale); }
-45 AUE_NULL MSTD { int ktrace(const char *fname, int ops, \
+45 AUE_KTRACE STD { int ktrace(const char *fname, int ops, \
int facs, int pid); }
-46 AUE_NULL MCOMPAT { int sigaction(int signum, \
+46 AUE_SIGACTION COMPAT { int sigaction(int signum, \
struct osigaction *nsa, \
struct osigaction *osa); }
-47 AUE_NULL MSTD { gid_t getgid(void); }
-48 AUE_NULL MCOMPAT { int sigprocmask(int how, osigset_t mask); }
+47 AUE_GETGID STD { gid_t getgid(void); }
+48 AUE_SIGPROCMASK COMPAT { int sigprocmask(int how, osigset_t mask); }
; XXX note nonstandard (bogus) calling convention - the libc stub passes
; us the mask, not a pointer to it, and we return the old mask as the
; (int) return value.
-49 AUE_NULL MSTD { int getlogin(char *namebuf, u_int \
+49 AUE_GETLOGIN STD { int getlogin(char *namebuf, u_int \
namelen); }
-50 AUE_NULL MSTD { int setlogin(char *namebuf); }
-51 AUE_NULL MSTD { int acct(char *path); }
-52 AUE_NULL MCOMPAT { int sigpending(void); }
-53 AUE_NULL MSTD { int sigaltstack(stack_t *ss, \
+50 AUE_SETLOGIN STD { int setlogin(char *namebuf); }
+51 AUE_ACCT STD { int acct(char *path); }
+52 AUE_SIGPENDING COMPAT { int sigpending(void); }
+53 AUE_SIGALTSTACK STD { int sigaltstack(stack_t *ss, \
stack_t *oss); }
-54 AUE_NULL MSTD { int ioctl(int fd, u_long com, \
+54 AUE_IOCTL STD { int ioctl(int fd, u_long com, \
caddr_t data); }
-55 AUE_NULL MSTD { int reboot(int opt); }
-56 AUE_NULL MSTD { int revoke(char *path); }
-57 AUE_NULL MSTD { int symlink(char *path, char *link); }
-58 AUE_NULL MSTD { int readlink(char *path, char *buf, \
+55 AUE_REBOOT STD { int reboot(int opt); }
+56 AUE_REVOKE STD { int revoke(char *path); }
+57 AUE_SYMLINK STD { int symlink(char *path, char *link); }
+58 AUE_READLINK STD { int readlink(char *path, char *buf, \
int count); }
-59 AUE_NULL MSTD { int execve(char *fname, char **argv, \
+59 AUE_EXECVE STD { int execve(char *fname, char **argv, \
char **envv); }
-60 AUE_NULL MSTD { int umask(int newmask); } umask umask_args \
+60 AUE_UMASK STD { int umask(int newmask); } umask umask_args \
int
-61 AUE_NULL MSTD { int chroot(char *path); }
-62 AUE_NULL MCOMPAT { int fstat(int fd, struct ostat *sb); }
-63 AUE_NULL MCOMPAT { int getkerninfo(int op, char *where, \
+61 AUE_CHROOT STD { int chroot(char *path); }
+62 AUE_FSTAT COMPAT { int fstat(int fd, struct ostat *sb); }
+63 AUE_NULL COMPAT { int getkerninfo(int op, char *where, \
size_t *size, int arg); } getkerninfo \
getkerninfo_args int
-64 AUE_NULL MCOMPAT { int getpagesize(void); } getpagesize \
+64 AUE_NULL COMPAT { int getpagesize(void); } getpagesize \
getpagesize_args int
-65 AUE_NULL MSTD { int msync(void *addr, size_t len, \
+65 AUE_MSYNC STD { int msync(void *addr, size_t len, \
int flags); }
-66 AUE_NULL MSTD { int vfork(void); }
+66 AUE_VFORK STD { int vfork(void); }
67 AUE_NULL OBSOL vread
68 AUE_NULL OBSOL vwrite
-69 AUE_NULL MSTD { int sbrk(int incr); }
-70 AUE_NULL MSTD { int sstk(int incr); }
-71 AUE_NULL MCOMPAT { int mmap(void *addr, int len, int prot, \
+69 AUE_SBRK STD { int sbrk(int incr); }
+70 AUE_SSTK STD { int sstk(int incr); }
+71 AUE_MMAP COMPAT { int mmap(void *addr, int len, int prot, \
int flags, int fd, long pos); }
-72 AUE_NULL MSTD { int ovadvise(int anom); } vadvise \
+72 AUE_O_VADVISE STD { int ovadvise(int anom); } vadvise \
ovadvise_args int
-73 AUE_NULL MSTD { int munmap(void *addr, size_t len); }
-74 AUE_NULL MSTD { int mprotect(const void *addr, size_t len, \
+73 AUE_MUNMAP STD { int munmap(void *addr, size_t len); }
+74 AUE_MPROTECT STD { int mprotect(const void *addr, size_t len, \
int prot); }
-75 AUE_NULL MSTD { int madvise(void *addr, size_t len, \
+75 AUE_MADVISE STD { int madvise(void *addr, size_t len, \
int behav); }
76 AUE_NULL OBSOL vhangup
77 AUE_NULL OBSOL vlimit
-78 AUE_NULL MSTD { int mincore(const void *addr, size_t len, \
+78 AUE_MINCORE STD { int mincore(const void *addr, size_t len, \
char *vec); }
-79 AUE_NULL MSTD { int getgroups(u_int gidsetsize, \
+79 AUE_GETGROUPS STD { int getgroups(u_int gidsetsize, \
gid_t *gidset); }
-80 AUE_NULL MSTD { int setgroups(u_int gidsetsize, \
+80 AUE_SETGROUPS STD { int setgroups(u_int gidsetsize, \
gid_t *gidset); }
-81 AUE_NULL MSTD { int getpgrp(void); }
-82 AUE_NULL MSTD { int setpgid(int pid, int pgid); }
-83 AUE_NULL MSTD { int setitimer(u_int which, struct \
+81 AUE_GETPGRP STD { int getpgrp(void); }
+82 AUE_SETPGRP STD { int setpgid(int pid, int pgid); }
+83 AUE_SETITIMER STD { int setitimer(u_int which, struct \
itimerval *itv, struct itimerval *oitv); }
-84 AUE_NULL MCOMPAT { int wait(void); }
-85 AUE_NULL MSTD { int swapon(char *name); }
-86 AUE_NULL MSTD { int getitimer(u_int which, \
+84 AUE_WAIT4 COMPAT { int wait(void); }
+85 AUE_SWAPON STD { int swapon(char *name); }
+86 AUE_GETITIMER STD { int getitimer(u_int which, \
struct itimerval *itv); }
-87 AUE_NULL MCOMPAT { int gethostname(char *hostname, \
+87 AUE_SYSCTL COMPAT { int gethostname(char *hostname, \
u_int len); } gethostname \
gethostname_args int
-88 AUE_NULL MCOMPAT { int sethostname(char *hostname, \
+88 AUE_SYSCTL COMPAT { int sethostname(char *hostname, \
u_int len); } sethostname \
sethostname_args int
-89 AUE_NULL MSTD { int getdtablesize(void); }
-90 AUE_NULL MSTD { int dup2(u_int from, u_int to); }
+89 AUE_GETDTABLESIZE STD { int getdtablesize(void); }
+90 AUE_DUP2 STD { int dup2(u_int from, u_int to); }
91 AUE_NULL UNIMPL getdopt
-92 AUE_NULL MSTD { int fcntl(int fd, int cmd, long arg); }
+92 AUE_FCNTL STD { int fcntl(int fd, int cmd, long arg); }
; XXX should be { int fcntl(int fd, int cmd, ...); }
; but we're not ready for varargs.
-93 AUE_NULL MSTD { int select(int nd, fd_set *in, fd_set *ou, \
+93 AUE_SELECT STD { int select(int nd, fd_set *in, fd_set *ou, \
fd_set *ex, struct timeval *tv); }
94 AUE_NULL UNIMPL setdopt
-95 AUE_NULL MSTD { int fsync(int fd); }
-96 AUE_NULL MSTD { int setpriority(int which, int who, \
+95 AUE_FSYNC STD { int fsync(int fd); }
+96 AUE_SETPRIORITY STD { int setpriority(int which, int who, \
int prio); }
-97 AUE_NULL MSTD { int socket(int domain, int type, \
+97 AUE_SOCKET STD { int socket(int domain, int type, \
int protocol); }
-98 AUE_NULL MSTD { int connect(int s, caddr_t name, \
+98 AUE_CONNECT STD { int connect(int s, caddr_t name, \
int namelen); }
-99 AUE_NULL MCPT_NOA { int accept(int s, caddr_t name, \
+99 AUE_ACCEPT CPT_NOA { int accept(int s, caddr_t name, \
int *anamelen); } accept accept_args int
-100 AUE_NULL MSTD { int getpriority(int which, int who); }
-101 AUE_NULL MCOMPAT { int send(int s, caddr_t buf, int len, \
+100 AUE_GETPRIORITY STD { int getpriority(int which, int who); }
+101 AUE_SEND COMPAT { int send(int s, caddr_t buf, int len, \
int flags); }
-102 AUE_NULL MCOMPAT { int recv(int s, caddr_t buf, int len, \
+102 AUE_RECV COMPAT { int recv(int s, caddr_t buf, int len, \
int flags); }
-103 AUE_NULL MCOMPAT { int sigreturn( \
+103 AUE_SIGRETURN COMPAT { int sigreturn( \
struct osigcontext *sigcntxp); }
-104 AUE_NULL MSTD { int bind(int s, caddr_t name, \
+104 AUE_BIND STD { int bind(int s, caddr_t name, \
int namelen); }
-105 AUE_NULL MSTD { int setsockopt(int s, int level, int name, \
+105 AUE_SETSOCKOPT STD { int setsockopt(int s, int level, int name, \
caddr_t val, int valsize); }
-106 AUE_NULL MSTD { int listen(int s, int backlog); }
+106 AUE_LISTEN STD { int listen(int s, int backlog); }
107 AUE_NULL OBSOL vtimes
-108 AUE_NULL MCOMPAT { int sigvec(int signum, struct sigvec *nsv, \
+108 AUE_NULL COMPAT { int sigvec(int signum, struct sigvec *nsv, \
struct sigvec *osv); }
-109 AUE_NULL MCOMPAT { int sigblock(int mask); }
-110 AUE_NULL MCOMPAT { int sigsetmask(int mask); }
-111 AUE_NULL MCOMPAT { int sigsuspend(osigset_t mask); }
+109 AUE_NULL COMPAT { int sigblock(int mask); }
+110 AUE_NULL COMPAT { int sigsetmask(int mask); }
+111 AUE_NULL COMPAT { int sigsuspend(osigset_t mask); }
; XXX note nonstandard (bogus) calling convention - the libc stub passes
; us the mask, not a pointer to it.
-112 AUE_NULL MCOMPAT { int sigstack(struct sigstack *nss, \
+112 AUE_NULL COMPAT { int sigstack(struct sigstack *nss, \
struct sigstack *oss); }
-113 AUE_NULL MCOMPAT { int recvmsg(int s, struct omsghdr *msg, \
+113 AUE_RECVMSG COMPAT { int recvmsg(int s, struct omsghdr *msg, \
int flags); }
-114 AUE_NULL MCOMPAT { int sendmsg(int s, caddr_t msg, \
+114 AUE_SENDMSG COMPAT { int sendmsg(int s, caddr_t msg, \
int flags); }
115 AUE_NULL OBSOL vtrace
-116 AUE_NULL MSTD { int gettimeofday(struct timeval *tp, \
+116 AUE_GETTIMEOFDAY STD { int gettimeofday(struct timeval *tp, \
struct timezone *tzp); }
-117 AUE_NULL MSTD { int getrusage(int who, \
+117 AUE_GETRUSAGE STD { int getrusage(int who, \
struct rusage *rusage); }
-118 AUE_NULL MSTD { int getsockopt(int s, int level, int name, \
+118 AUE_GETSOCKOPT STD { int getsockopt(int s, int level, int name, \
caddr_t val, int *avalsize); }
119 AUE_NULL UNIMPL resuba (BSD/OS 2.x)
-120 AUE_NULL MSTD { int readv(int fd, struct iovec *iovp, \
+120 AUE_READV STD { int readv(int fd, struct iovec *iovp, \
u_int iovcnt); }
-121 AUE_NULL MSTD { int writev(int fd, struct iovec *iovp, \
+121 AUE_WRITEV STD { int writev(int fd, struct iovec *iovp, \
u_int iovcnt); }
-122 AUE_NULL MSTD { int settimeofday(struct timeval *tv, \
+122 AUE_SETTIMEOFDAY STD { int settimeofday(struct timeval *tv, \
struct timezone *tzp); }
-123 AUE_NULL MSTD { int fchown(int fd, int uid, int gid); }
-124 AUE_NULL MSTD { int fchmod(int fd, int mode); }
-125 AUE_NULL MCPT_NOA { int recvfrom(int s, caddr_t buf, \
+123 AUE_FCHOWN STD { int fchown(int fd, int uid, int gid); }
+124 AUE_FCHMOD STD { int fchmod(int fd, int mode); }
+125 AUE_RECVFROM CPT_NOA { int recvfrom(int s, caddr_t buf, \
size_t len, int flags, caddr_t from, int \
*fromlenaddr); } recvfrom recvfrom_args \
int
-126 AUE_NULL MSTD { int setreuid(int ruid, int euid); }
-127 AUE_NULL MSTD { int setregid(int rgid, int egid); }
-128 AUE_NULL MSTD { int rename(char *from, char *to); }
-129 AUE_NULL MCOMPAT { int truncate(char *path, long length); }
-130 AUE_NULL MCOMPAT { int ftruncate(int fd, long length); }
-131 AUE_NULL MSTD { int flock(int fd, int how); }
-132 AUE_NULL MSTD { int mkfifo(char *path, int mode); }
-133 AUE_NULL MSTD { int sendto(int s, caddr_t buf, size_t len, \
+126 AUE_SETREUID STD { int setreuid(int ruid, int euid); }
+127 AUE_SETREGID STD { int setregid(int rgid, int egid); }
+128 AUE_RENAME STD { int rename(char *from, char *to); }
+129 AUE_TRUNCATE COMPAT { int truncate(char *path, long length); }
+130 AUE_FTRUNCATE COMPAT { int ftruncate(int fd, long length); }
+131 AUE_FLOCK STD { int flock(int fd, int how); }
+132 AUE_MKFIFO STD { int mkfifo(char *path, int mode); }
+133 AUE_SENDTO STD { int sendto(int s, caddr_t buf, size_t len, \
int flags, caddr_t to, int tolen); }
-134 AUE_NULL MSTD { int shutdown(int s, int how); }
-135 AUE_NULL MSTD { int socketpair(int domain, int type, \
+134 AUE_SHUTDOWN STD { int shutdown(int s, int how); }
+135 AUE_SOCKETPAIR STD { int socketpair(int domain, int type, \
int protocol, int *rsv); }
-136 AUE_NULL MSTD { int mkdir(char *path, int mode); }
-137 AUE_NULL MSTD { int rmdir(char *path); }
-138 AUE_NULL MSTD { int utimes(char *path, \
+136 AUE_MKDIR STD { int mkdir(char *path, int mode); }
+137 AUE_RMDIR STD { int rmdir(char *path); }
+138 AUE_UTIMES STD { int utimes(char *path, \
struct timeval *tptr); }
139 AUE_NULL OBSOL 4.2 sigreturn
-140 AUE_NULL MSTD { int adjtime(struct timeval *delta, \
+140 AUE_ADJTIME STD { int adjtime(struct timeval *delta, \
struct timeval *olddelta); }
-141 AUE_NULL MCOMPAT { int getpeername(int fdes, caddr_t asa, \
+141 AUE_GETPEERNAME COMPAT { int getpeername(int fdes, caddr_t asa, \
int *alen); }
-142 AUE_NULL MCOMPAT { long gethostid(void); }
-143 AUE_NULL MCOMPAT { int sethostid(long hostid); }
-144 AUE_NULL MCOMPAT { int getrlimit(u_int which, struct \
+142 AUE_SYSCTL COMPAT { long gethostid(void); }
+143 AUE_SYSCTL COMPAT { int sethostid(long hostid); }
+144 AUE_GETRLIMIT COMPAT { int getrlimit(u_int which, struct \
orlimit *rlp); }
-145 AUE_NULL MCOMPAT { int setrlimit(u_int which, \
+145 AUE_SETRLIMIT COMPAT { int setrlimit(u_int which, \
struct orlimit *rlp); }
-146 AUE_NULL MCOMPAT { int killpg(int pgid, int signum); }
-147 AUE_NULL MSTD { int setsid(void); }
-148 AUE_NULL MSTD { int quotactl(char *path, int cmd, int uid, \
+146 AUE_KILLPG COMPAT { int killpg(int pgid, int signum); }
+147 AUE_SETSID STD { int setsid(void); }
+148 AUE_QUOTACTL STD { int quotactl(char *path, int cmd, int uid, \
caddr_t arg); }
-149 AUE_NULL MCOMPAT { int quota(void); }
-150 AUE_NULL MCPT_NOA { int getsockname(int fdec, \
+149 AUE_O_QUOTA COMPAT { int quota(void); }
+150 AUE_GETSOCKNAME CPT_NOA { int getsockname(int fdec, \
caddr_t asa, int *alen); } getsockname \
getsockname_args int
@@ -303,95 +299,96 @@
153 AUE_NULL UNIMPL asyncdaemon (BSD/OS 2.x)
154 AUE_NULL UNIMPL nosys
; 155 is initialized by the NFS code, if present.
-155 AUE_NULL MNOIMPL { int nfssvc(int flag, caddr_t argp); }
-156 AUE_NULL COMPAT { int getdirentries(int fd, char *buf, \
+155 AUE_NFS_SVC NOSTD { int nfssvc(int flag, caddr_t argp); }
+156 AUE_GETDIRENTRIES COMPAT { int getdirentries(int fd, char *buf, \
u_int count, long *basep); }
-157 AUE_NULL MCOMPAT4 { int statfs(char *path, \
+157 AUE_STATFS COMPAT4 { int statfs(char *path, \
struct ostatfs *buf); }
-158 AUE_NULL MCOMPAT4 { int fstatfs(int fd, \
+158 AUE_FSTATFS COMPAT4 { int fstatfs(int fd, \
struct ostatfs *buf); }
159 AUE_NULL UNIMPL nosys
-160 AUE_NULL MSTD { int lgetfh(char *fname, \
+160 AUE_LGETFH STD { int lgetfh(char *fname, \
struct fhandle *fhp); }
-161 AUE_NULL MSTD { int getfh(char *fname, \
+161 AUE_NFS_GETFH STD { int getfh(char *fname, \
struct fhandle *fhp); }
-162 AUE_NULL MSTD { int getdomainname(char *domainname, \
+162 AUE_SYSCTL STD { int getdomainname(char *domainname, \
int len); }
-163 AUE_NULL MSTD { int setdomainname(char *domainname, \
+163 AUE_SYSCTL STD { int setdomainname(char *domainname, \
int len); }
-164 AUE_NULL MSTD { int uname(struct utsname *name); }
-165 AUE_NULL MSTD { int sysarch(int op, char *parms); }
-166 AUE_NULL MSTD { int rtprio(int function, pid_t pid, \
+164 AUE_NULL STD { int uname(struct utsname *name); }
+165 AUE_SYSARCH STD { int sysarch(int op, char *parms); }
+166 AUE_RTPRIO STD { int rtprio(int function, pid_t pid, \
struct rtprio *rtp); }
167 AUE_NULL UNIMPL nosys
168 AUE_NULL UNIMPL nosys
; 169 is initialized by the SYSVSEM code if present or loaded
-169 AUE_NULL MNOSTD { int semsys(int which, int a2, int a3, \
+169 AUE_SEMSYS NOSTD { int semsys(int which, int a2, int a3, \
int a4, int a5); }
; 169 is initialized by the SYSVMSG code if present or loaded
; XXX should be { int semsys(int which, ...); }
-170 AUE_NULL MNOSTD { int msgsys(int which, int a2, int a3, \
+170 AUE_MSGSYS NOSTD { int msgsys(int which, int a2, int a3, \
int a4, int a5, int a6); }
; 169 is initialized by the SYSVSHM code if present or loaded
; XXX should be { int msgsys(int which, ...); }
-171 AUE_NULL MNOSTD { int shmsys(int which, int a2, int a3, \
+171 AUE_SHMSYS NOSTD { int shmsys(int which, int a2, int a3, \
int a4); }
; XXX should be { int shmsys(int which, ...); }
172 AUE_NULL UNIMPL nosys
-173 AUE_NULL MSTD { ssize_t pread(int fd, void *buf, \
+173 AUE_PREAD STD { ssize_t freebsd6_pread(int fd, void *buf, \
size_t nbyte, int pad, off_t offset); }
-174 AUE_NULL MSTD { ssize_t pwrite(int fd, const void *buf, \
+174 AUE_PWRITE STD { ssize_t freebsd6_pwrite(int fd, \
+ const void *buf, \
size_t nbyte, int pad, off_t offset); }
175 AUE_NULL UNIMPL nosys
-176 AUE_NULL MSTD { int ntp_adjtime(struct timex *tp); }
+176 AUE_NTP_ADJTIME STD { int ntp_adjtime(struct timex *tp); }
177 AUE_NULL UNIMPL sfork (BSD/OS 2.x)
178 AUE_NULL UNIMPL getdescriptor (BSD/OS 2.x)
179 AUE_NULL UNIMPL setdescriptor (BSD/OS 2.x)
180 AUE_NULL UNIMPL nosys
; Syscalls 181-199 are used by/reserved for BSD
-181 AUE_NULL MSTD { int setgid(gid_t gid); }
-182 AUE_NULL MSTD { int setegid(gid_t egid); }
-183 AUE_NULL MSTD { int seteuid(uid_t euid); }
+181 AUE_SETGID STD { int setgid(gid_t gid); }
+182 AUE_SETEGID STD { int setegid(gid_t egid); }
+183 AUE_SETEUID STD { int seteuid(uid_t euid); }
184 AUE_NULL UNIMPL lfs_bmapv
185 AUE_NULL UNIMPL lfs_markv
186 AUE_NULL UNIMPL lfs_segclean
187 AUE_NULL UNIMPL lfs_segwait
-188 AUE_NULL MSTD { int stat(char *path, struct stat *ub); }
-189 AUE_NULL MSTD { int fstat(int fd, struct stat *sb); }
-190 AUE_NULL MSTD { int lstat(char *path, struct stat *ub); }
-191 AUE_NULL MSTD { int pathconf(char *path, int name); }
-192 AUE_NULL MSTD { int fpathconf(int fd, int name); }
+188 AUE_STAT STD { int stat(char *path, struct stat *ub); }
+189 AUE_FSTAT STD { int fstat(int fd, struct stat *sb); }
+190 AUE_LSTAT STD { int lstat(char *path, struct stat *ub); }
+191 AUE_PATHCONF STD { int pathconf(char *path, int name); }
+192 AUE_FPATHCONF STD { int fpathconf(int fd, int name); }
193 AUE_NULL UNIMPL nosys
-194 AUE_NULL MSTD { int getrlimit(u_int which, \
+194 AUE_GETRLIMIT STD { int getrlimit(u_int which, \
struct rlimit *rlp); } getrlimit \
__getrlimit_args int
-195 AUE_NULL MSTD { int setrlimit(u_int which, \
+195 AUE_SETRLIMIT STD { int setrlimit(u_int which, \
struct rlimit *rlp); } setrlimit \
__setrlimit_args int
-196 AUE_NULL MSTD { int getdirentries(int fd, char *buf, \
+196 AUE_GETDIRENTRIES STD { int getdirentries(int fd, char *buf, \
u_int count, long *basep); }
-197 AUE_NULL MSTD { caddr_t mmap(caddr_t addr, size_t len, \
- int prot, int flags, int fd, int pad, \
- off_t pos); }
-198 AUE_NULL MSTD { int nosys(void); } __syscall \
+197 AUE_MMAP STD { caddr_t freebsd6_mmap(caddr_t addr, \
+ size_t len, int prot, int flags, int fd, \
+ int pad, off_t pos); }
+198 AUE_NULL STD { int nosys(void); } __syscall \
__syscall_args int
-199 AUE_NULL MSTD { off_t lseek(int fd, int pad, off_t offset, \
- int whence); }
-200 AUE_NULL MSTD { int truncate(char *path, int pad, \
+199 AUE_LSEEK STD { off_t freebsd6_lseek(int fd, int pad, \
+ off_t offset, int whence); }
+200 AUE_TRUNCATE STD { int freebsd6_truncate(char *path, int pad, \
off_t length); }
-201 AUE_NULL MSTD { int ftruncate(int fd, int pad, \
+201 AUE_FTRUNCATE STD { int freebsd6_ftruncate(int fd, int pad, \
off_t length); }
-202 AUE_NULL MSTD { int __sysctl(int *name, u_int namelen, \
+202 AUE_SYSCTL STD { int __sysctl(int *name, u_int namelen, \
void *old, size_t *oldlenp, void *new, \
size_t newlen); } __sysctl sysctl_args int
-203 AUE_NULL MSTD { int mlock(const void *addr, size_t len); }
-204 AUE_NULL MSTD { int munlock(const void *addr, size_t len); }
-205 AUE_NULL MSTD { int undelete(char *path); }
-206 AUE_NULL MSTD { int futimes(int fd, struct timeval *tptr); }
-207 AUE_NULL MSTD { int getpgid(pid_t pid); }
+203 AUE_MLOCK STD { int mlock(const void *addr, size_t len); }
+204 AUE_MUNLOCK STD { int munlock(const void *addr, size_t len); }
+205 AUE_UNDELETE STD { int undelete(char *path); }
+206 AUE_FUTIMES STD { int futimes(int fd, struct timeval *tptr); }
+207 AUE_GETPGID STD { int getpgid(pid_t pid); }
208 AUE_NULL UNIMPL newreboot (NetBSD)
-209 AUE_NULL MSTD { int poll(struct pollfd *fds, u_int nfds, \
+209 AUE_POLL STD { int poll(struct pollfd *fds, u_int nfds, \
int timeout); }
;
@@ -410,41 +407,45 @@
;
; The following were introduced with NetBSD/4.4Lite-2
-; They are initialized by their respective modules/sysinits
-220 AUE_NULL MNOSTD { int __semctl(int semid, int semnum, \
+220 AUE_SEMCTL NOSTD { int __semctl(int semid, int semnum, \
int cmd, union semun *arg); }
-221 AUE_NULL MNOSTD { int semget(key_t key, int nsems, \
+221 AUE_SEMGET NOSTD { int semget(key_t key, int nsems, \
int semflg); }
-222 AUE_NULL MNOSTD { int semop(int semid, struct sembuf *sops, \
+222 AUE_SEMOP NOSTD { int semop(int semid, struct sembuf *sops, \
size_t nsops); }
223 AUE_NULL UNIMPL semconfig
-224 AUE_NULL MNOSTD { int msgctl(int msqid, int cmd, \
+224 AUE_MSGCTL NOSTD { int msgctl(int msqid, int cmd, \
struct msqid_ds *buf); }
-225 AUE_NULL MNOSTD { int msgget(key_t key, int msgflg); }
-226 AUE_NULL MNOSTD { int msgsnd(int msqid, const void *msgp, \
+225 AUE_MSGGET NOSTD { int msgget(key_t key, int msgflg); }
+226 AUE_MSGSND NOSTD { int msgsnd(int msqid, const void *msgp, \
size_t msgsz, int msgflg); }
-227 AUE_NULL MNOSTD { int msgrcv(int msqid, void *msgp, \
+227 AUE_MSGRCV NOSTD { int msgrcv(int msqid, void *msgp, \
size_t msgsz, long msgtyp, int msgflg); }
-228 AUE_NULL MNOSTD { int shmat(int shmid, const void *shmaddr, \
+228 AUE_SHMAT NOSTD { int shmat(int shmid, const void *shmaddr, \
int shmflg); }
-229 AUE_NULL MNOSTD { int shmctl(int shmid, int cmd, \
+229 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \
struct shmid_ds *buf); }
-230 AUE_NULL MNOSTD { int shmdt(const void *shmaddr); }
-231 AUE_NULL MNOSTD { int shmget(key_t key, size_t size, \
+230 AUE_SHMDT NOSTD { int shmdt(const void *shmaddr); }
+231 AUE_SHMGET NOSTD { int shmget(key_t key, size_t size, \
int shmflg); }
;
-232 AUE_NULL MSTD { int clock_gettime(clockid_t clock_id, \
+232 AUE_NULL STD { int clock_gettime(clockid_t clock_id, \
struct timespec *tp); }
-233 AUE_NULL MSTD { int clock_settime(clockid_t clock_id, \
+233 AUE_CLOCK_SETTIME STD { int clock_settime( \
+ clockid_t clock_id, \
const struct timespec *tp); }
-234 AUE_NULL MSTD { int clock_getres(clockid_t clock_id, \
+234 AUE_NULL STD { int clock_getres(clockid_t clock_id, \
struct timespec *tp); }
-235 AUE_NULL UNIMPL timer_create
-236 AUE_NULL UNIMPL timer_delete
-237 AUE_NULL UNIMPL timer_settime
-238 AUE_NULL UNIMPL timer_gettime
-239 AUE_NULL UNIMPL timer_getoverrun
-240 AUE_NULL MSTD { int nanosleep(const struct timespec *rqtp, \
+235 AUE_NULL STD { int ktimer_create(clockid_t clock_id, \
+ struct sigevent *evp, int *timerid); }
+236 AUE_NULL STD { int ktimer_delete(int timerid); }
+237 AUE_NULL STD { int ktimer_settime(int timerid, int flags, \
+ const struct itimerspec *value, \
+ struct itimerspec *ovalue); }
+238 AUE_NULL STD { int ktimer_gettime(int timerid, struct \
+ itimerspec *value); }
+239 AUE_NULL STD { int ktimer_getoverrun(int timerid); }
+240 AUE_NULL STD { int nanosleep(const struct timespec *rqtp, \
struct timespec *rmtp); }
241 AUE_NULL UNIMPL nosys
242 AUE_NULL UNIMPL nosys
@@ -453,19 +454,21 @@
245 AUE_NULL UNIMPL nosys
246 AUE_NULL UNIMPL nosys
247 AUE_NULL UNIMPL nosys
-248 AUE_NULL MSTD { int ntp_gettime(struct ntptimeval *ntvp); }
+248 AUE_NULL STD { int ntp_gettime(struct ntptimeval *ntvp); }
249 AUE_NULL UNIMPL nosys
; syscall numbers initially used in OpenBSD
-250 AUE_NULL MSTD { int minherit(void *addr, size_t len, \
+250 AUE_MINHERIT STD { int minherit(void *addr, size_t len, \
int inherit); }
-251 AUE_NULL MSTD { int rfork(int flags); }
-252 AUE_NULL MSTD { int openbsd_poll(struct pollfd *fds, \
+251 AUE_RFORK STD { int rfork(int flags); }
+252 AUE_POLL STD { int openbsd_poll(struct pollfd *fds, \
u_int nfds, int timeout); }
-253 AUE_NULL MSTD { int issetugid(void); }
-254 AUE_NULL MSTD { int lchown(char *path, int uid, int gid); }
-255 AUE_NULL UNIMPL nosys
-256 AUE_NULL UNIMPL nosys
-257 AUE_NULL UNIMPL nosys
+253 AUE_ISSETUGID STD { int issetugid(void); }
+254 AUE_LCHOWN STD { int lchown(char *path, int uid, int gid); }
+255 AUE_NULL NOSTD { int aio_read(struct aiocb *aiocbp); }
+256 AUE_NULL NOSTD { int aio_write(struct aiocb *aiocbp); }
+257 AUE_NULL NOSTD { int lio_listio(int mode, \
+ struct aiocb * const *acb_list, \
+ int nent, struct sigevent *sig); }
258 AUE_NULL UNIMPL nosys
259 AUE_NULL UNIMPL nosys
260 AUE_NULL UNIMPL nosys
@@ -480,20 +483,20 @@
269 AUE_NULL UNIMPL nosys
270 AUE_NULL UNIMPL nosys
271 AUE_NULL UNIMPL nosys
-272 AUE_NULL MSTD { int getdents(int fd, char *buf, \
+272 AUE_O_GETDENTS STD { int getdents(int fd, char *buf, \
size_t count); }
273 AUE_NULL UNIMPL nosys
-274 AUE_NULL MSTD { int lchmod(char *path, mode_t mode); }
-275 AUE_NULL MNOPROTO { int lchown(char *path, uid_t uid, \
+274 AUE_LCHMOD STD { int lchmod(char *path, mode_t mode); }
+275 AUE_LCHOWN NOPROTO { int lchown(char *path, uid_t uid, \
gid_t gid); } netbsd_lchown lchown_args \
int
-276 AUE_NULL MSTD { int lutimes(char *path, \
+276 AUE_LUTIMES STD { int lutimes(char *path, \
struct timeval *tptr); }
-277 AUE_NULL MNOPROTO { int msync(void *addr, size_t len, \
+277 AUE_MSYNC NOPROTO { int msync(void *addr, size_t len, \
int flags); } netbsd_msync msync_args int
-278 AUE_NULL MSTD { int nstat(char *path, struct nstat *ub); }
-279 AUE_NULL MSTD { int nfstat(int fd, struct nstat *sb); }
-280 AUE_NULL MSTD { int nlstat(char *path, struct nstat *ub); }
+278 AUE_STAT STD { int nstat(char *path, struct nstat *ub); }
+279 AUE_FSTAT STD { int nfstat(int fd, struct nstat *sb); }
+280 AUE_LSTAT STD { int nlstat(char *path, struct nstat *ub); }
281 AUE_NULL UNIMPL nosys
282 AUE_NULL UNIMPL nosys
283 AUE_NULL UNIMPL nosys
@@ -503,9 +506,9 @@
287 AUE_NULL UNIMPL nosys
288 AUE_NULL UNIMPL nosys
; 289 and 290 from NetBSD (OpenBSD: 267 and 268)
-289 AUE_NULL MSTD { ssize_t preadv(int fd, struct iovec *iovp, \
+289 AUE_PREADV STD { ssize_t preadv(int fd, struct iovec *iovp, \
u_int iovcnt, off_t offset); }
-290 AUE_NULL MSTD { ssize_t pwritev(int fd, struct iovec *iovp, \
+290 AUE_PWRITEV STD { ssize_t pwritev(int fd, struct iovec *iovp, \
u_int iovcnt, off_t offset); }
291 AUE_NULL UNIMPL nosys
292 AUE_NULL UNIMPL nosys
@@ -514,30 +517,30 @@
295 AUE_NULL UNIMPL nosys
296 AUE_NULL UNIMPL nosys
; XXX 297 is 300 in NetBSD
-297 AUE_NULL MCOMPAT4 { int fhstatfs( \
+297 AUE_FHSTATFS COMPAT4 { int fhstatfs( \
const struct fhandle *u_fhp, \
struct ostatfs *buf); }
-298 AUE_NULL MSTD { int fhopen(const struct fhandle *u_fhp, \
+298 AUE_FHOPEN STD { int fhopen(const struct fhandle *u_fhp, \
int flags); }
-299 AUE_NULL MSTD { int fhstat(const struct fhandle *u_fhp, \
+299 AUE_FHSTAT STD { int fhstat(const struct fhandle *u_fhp, \
struct stat *sb); }
; syscall numbers for FreeBSD
-300 AUE_NULL MSTD { int modnext(int modid); }
-301 AUE_NULL MSTD { int modstat(int modid, \
+300 AUE_NULL STD { int modnext(int modid); }
+301 AUE_NULL STD { int modstat(int modid, \
struct module_stat *stat); }
-302 AUE_NULL MSTD { int modfnext(int modid); }
-303 AUE_NULL MSTD { int modfind(const char *name); }
-304 AUE_NULL MSTD { int kldload(const char *file); }
-305 AUE_NULL MSTD { int kldunload(int fileid); }
-306 AUE_NULL MSTD { int kldfind(const char *file); }
-307 AUE_NULL MSTD { int kldnext(int fileid); }
-308 AUE_NULL MSTD { int kldstat(int fileid, struct \
+302 AUE_NULL STD { int modfnext(int modid); }
+303 AUE_NULL STD { int modfind(const char *name); }
+304 AUE_MODLOAD STD { int kldload(const char *file); }
+305 AUE_MODUNLOAD STD { int kldunload(int fileid); }
+306 AUE_NULL STD { int kldfind(const char *file); }
+307 AUE_NULL STD { int kldnext(int fileid); }
+308 AUE_NULL STD { int kldstat(int fileid, struct \
kld_file_stat* stat); }
-309 AUE_NULL MSTD { int kldfirstmod(int fileid); }
-310 AUE_NULL MSTD { int getsid(pid_t pid); }
-311 AUE_NULL MSTD { int setresuid(uid_t ruid, uid_t euid, \
+309 AUE_NULL STD { int kldfirstmod(int fileid); }
+310 AUE_GETSID STD { int getsid(pid_t pid); }
+311 AUE_SETRESUID STD { int setresuid(uid_t ruid, uid_t euid, \
uid_t suid); }
-312 AUE_NULL MSTD { int setresgid(gid_t rgid, gid_t egid, \
+312 AUE_SETRESGID STD { int setresgid(gid_t rgid, gid_t egid, \
gid_t sgid); }
313 AUE_NULL OBSOL signanosleep
314 AUE_NULL NOSTD { int aio_return(struct aiocb *aiocbp); }
@@ -547,93 +550,95 @@
316 AUE_NULL NOSTD { int aio_cancel(int fd, \
struct aiocb *aiocbp); }
317 AUE_NULL NOSTD { int aio_error(struct aiocb *aiocbp); }
-318 AUE_NULL NOSTD { int aio_read(struct aiocb *aiocbp); }
-319 AUE_NULL NOSTD { int aio_write(struct aiocb *aiocbp); }
-320 AUE_NULL NOSTD { int lio_listio(int mode, \
- struct aiocb * const *acb_list, \
- int nent, struct sigevent *sig); }
-321 AUE_NULL MSTD { int yield(void); }
+318 AUE_NULL NOSTD { int oaio_read(struct oaiocb *aiocbp); }
+319 AUE_NULL NOSTD { int oaio_write(struct oaiocb *aiocbp); }
+320 AUE_NULL NOSTD { int olio_listio(int mode, \
+ struct oaiocb * const *acb_list, \
+ int nent, struct osigevent *sig); }
+321 AUE_NULL STD { int yield(void); }
322 AUE_NULL OBSOL thr_sleep
323 AUE_NULL OBSOL thr_wakeup
-324 AUE_NULL MSTD { int mlockall(int how); }
-325 AUE_NULL MSTD { int munlockall(void); }
-326 AUE_NULL MSTD { int __getcwd(u_char *buf, u_int buflen); }
+324 AUE_MLOCKALL STD { int mlockall(int how); }
+325 AUE_MUNLOCKALL STD { int munlockall(void); }
+326 AUE_GETCWD STD { int __getcwd(u_char *buf, u_int buflen); }
-327 AUE_NULL MSTD { int sched_setparam (pid_t pid, \
+327 AUE_NULL STD { int sched_setparam (pid_t pid, \
const struct sched_param *param); }
-328 AUE_NULL MSTD { int sched_getparam (pid_t pid, struct \
+328 AUE_NULL STD { int sched_getparam (pid_t pid, struct \
sched_param *param); }
-329 AUE_NULL MSTD { int sched_setscheduler (pid_t pid, int \
+329 AUE_NULL STD { int sched_setscheduler (pid_t pid, int \
policy, const struct sched_param \
*param); }
-330 AUE_NULL MSTD { int sched_getscheduler (pid_t pid); }
+330 AUE_NULL STD { int sched_getscheduler (pid_t pid); }
-331 AUE_NULL MSTD { int sched_yield (void); }
-332 AUE_NULL MSTD { int sched_get_priority_max (int policy); }
-333 AUE_NULL MSTD { int sched_get_priority_min (int policy); }
-334 AUE_NULL MSTD { int sched_rr_get_interval (pid_t pid, \
+331 AUE_NULL STD { int sched_yield (void); }
+332 AUE_NULL STD { int sched_get_priority_max (int policy); }
+333 AUE_NULL STD { int sched_get_priority_min (int policy); }
+334 AUE_NULL STD { int sched_rr_get_interval (pid_t pid, \
struct timespec *interval); }
-335 AUE_NULL MSTD { int utrace(const void *addr, size_t len); }
-336 AUE_NULL MCOMPAT4 { int sendfile(int fd, int s, \
+335 AUE_NULL STD { int utrace(const void *addr, size_t len); }
+336 AUE_SENDFILE COMPAT4 { int sendfile(int fd, int s, \
off_t offset, size_t nbytes, \
struct sf_hdtr *hdtr, off_t *sbytes, \
int flags); }
-337 AUE_NULL MSTD { int kldsym(int fileid, int cmd, \
+337 AUE_NULL STD { int kldsym(int fileid, int cmd, \
void *data); }
-338 AUE_NULL MSTD { int jail(struct jail *jail); }
+338 AUE_JAIL STD { int jail(struct jail *jail); }
339 AUE_NULL UNIMPL pioctl
-340 AUE_NULL MSTD { int sigprocmask(int how, \
+340 AUE_SIGPROCMASK STD { int sigprocmask(int how, \
const sigset_t *set, sigset_t *oset); }
-341 AUE_NULL MSTD { int sigsuspend(const sigset_t *sigmask); }
-342 AUE_NULL MCOMPAT4 { int sigaction(int sig, const \
+341 AUE_SIGSUSPEND STD { int sigsuspend(const sigset_t *sigmask); }
+342 AUE_SIGACTION COMPAT4 { int sigaction(int sig, const \
struct sigaction *act, \
struct sigaction *oact); }
-343 AUE_NULL MSTD { int sigpending(sigset_t *set); }
-344 AUE_NULL MCOMPAT4 { int sigreturn( \
+343 AUE_SIGPENDING STD { int sigpending(sigset_t *set); }
+344 AUE_SIGRETURN COMPAT4 { int sigreturn( \
const struct ucontext4 *sigcntxp); }
-345 AUE_NULL MSTD { int sigtimedwait(const sigset_t *set, \
+345 AUE_SIGWAIT STD { int sigtimedwait(const sigset_t *set, \
siginfo_t *info, \
const struct timespec *timeout); }
-346 AUE_NULL MSTD { int sigwaitinfo(const sigset_t *set, \
+346 AUE_NULL STD { int sigwaitinfo(const sigset_t *set, \
siginfo_t *info); }
-347 AUE_NULL MSTD { int __acl_get_file(const char *path, \
+347 AUE_NULL STD { int __acl_get_file(const char *path, \
acl_type_t type, struct acl *aclp); }
-348 AUE_NULL MSTD { int __acl_set_file(const char *path, \
+348 AUE_NULL STD { int __acl_set_file(const char *path, \
acl_type_t type, struct acl *aclp); }
-349 AUE_NULL MSTD { int __acl_get_fd(int filedes, \
+349 AUE_NULL STD { int __acl_get_fd(int filedes, \
acl_type_t type, struct acl *aclp); }
-350 AUE_NULL MSTD { int __acl_set_fd(int filedes, \
+350 AUE_NULL STD { int __acl_set_fd(int filedes, \
acl_type_t type, struct acl *aclp); }
-351 AUE_NULL MSTD { int __acl_delete_file(const char *path, \
+351 AUE_NULL STD { int __acl_delete_file(const char *path, \
acl_type_t type); }
-352 AUE_NULL MSTD { int __acl_delete_fd(int filedes, \
+352 AUE_NULL STD { int __acl_delete_fd(int filedes, \
acl_type_t type); }
-353 AUE_NULL MSTD { int __acl_aclcheck_file(const char *path, \
+353 AUE_NULL STD { int __acl_aclcheck_file(const char *path, \
acl_type_t type, struct acl *aclp); }
-354 AUE_NULL MSTD { int __acl_aclcheck_fd(int filedes, \
+354 AUE_NULL STD { int __acl_aclcheck_fd(int filedes, \
acl_type_t type, struct acl *aclp); }
-355 AUE_NULL MSTD { int extattrctl(const char *path, int cmd, \
+355 AUE_EXTATTRCTL STD { int extattrctl(const char *path, int cmd, \
const char *filename, int attrnamespace, \
const char *attrname); }
-356 AUE_NULL MSTD { int extattr_set_file(const char *path, \
- int attrnamespace, const char *attrname, \
- void *data, size_t nbytes); }
-357 AUE_NULL MSTD { ssize_t extattr_get_file(const char *path, \
- int attrnamespace, const char *attrname, \
- void *data, size_t nbytes); }
-358 AUE_NULL MSTD { int extattr_delete_file(const char *path, \
+356 AUE_EXTATTR_SET_FILE STD { int extattr_set_file( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+357 AUE_EXTATTR_GET_FILE STD { ssize_t extattr_get_file( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+358 AUE_EXTATTR_DELETE_FILE STD { int extattr_delete_file(const char *path, \
int attrnamespace, \
const char *attrname); }
359 AUE_NULL NOSTD { int aio_waitcomplete( \
struct aiocb **aiocbp, \
struct timespec *timeout); }
-360 AUE_NULL MSTD { int getresuid(uid_t *ruid, uid_t *euid, \
+360 AUE_GETRESUID STD { int getresuid(uid_t *ruid, uid_t *euid, \
uid_t *suid); }
-361 AUE_NULL MSTD { int getresgid(gid_t *rgid, gid_t *egid, \
+361 AUE_GETRESGID STD { int getresgid(gid_t *rgid, gid_t *egid, \
gid_t *sgid); }
-362 AUE_NULL MSTD { int kqueue(void); }
-363 AUE_NULL MSTD { int kevent(int fd, \
+362 AUE_KQUEUE STD { int kqueue(void); }
+363 AUE_NULL STD { int kevent(int fd, \
struct kevent *changelist, int nchanges, \
struct kevent *eventlist, int nevents, \
const struct timespec *timeout); }
@@ -644,155 +649,203 @@
368 AUE_NULL UNIMPL __cap_set_fd
369 AUE_NULL UNIMPL __cap_set_file
370 AUE_NULL NODEF lkmressys lkmressys nosys_args int
-371 AUE_NULL MSTD { int extattr_set_fd(int fd, \
+371 AUE_EXTATTR_SET_FD STD { int extattr_set_fd(int fd, \
int attrnamespace, const char *attrname, \
void *data, size_t nbytes); }
-372 AUE_NULL MSTD { ssize_t extattr_get_fd(int fd, \
+372 AUE_EXTATTR_GET_FD STD { ssize_t extattr_get_fd(int fd, \
int attrnamespace, const char *attrname, \
void *data, size_t nbytes); }
-373 AUE_NULL MSTD { int extattr_delete_fd(int fd, \
+373 AUE_EXTATTR_DELETE_FD STD { int extattr_delete_fd(int fd, \
int attrnamespace, \
const char *attrname); }
-374 AUE_NULL MSTD { int __setugid(int flag); }
+374 AUE_NULL STD { int __setugid(int flag); }
375 AUE_NULL NOIMPL { int nfsclnt(int flag, caddr_t argp); }
-376 AUE_NULL MSTD { int eaccess(char *path, int flags); }
+376 AUE_EACCESS STD { int eaccess(char *path, int flags); }
377 AUE_NULL UNIMPL afs_syscall
-378 AUE_NULL STD { int nmount(struct iovec *iovp, \
+378 AUE_NMOUNT STD { int nmount(struct iovec *iovp, \
unsigned int iovcnt, int flags); }
-379 AUE_NULL MSTD { int kse_exit(void); }
-380 AUE_NULL MSTD { int kse_wakeup(struct kse_mailbox *mbx); }
-381 AUE_NULL MSTD { int kse_create(struct kse_mailbox *mbx, \
+379 AUE_NULL STD { int kse_exit(void); }
+380 AUE_NULL STD { int kse_wakeup(struct kse_mailbox *mbx); }
+381 AUE_NULL STD { int kse_create(struct kse_mailbox *mbx, \
int newgroup); }
-382 AUE_NULL MSTD { int kse_thr_interrupt( \
+382 AUE_NULL STD { int kse_thr_interrupt( \
struct kse_thr_mailbox *tmbx, int cmd, \
long data); }
-383 AUE_NULL MSTD { int kse_release(struct timespec *timeout); }
-384 AUE_NULL MSTD { int __mac_get_proc(struct mac *mac_p); }
-385 AUE_NULL MSTD { int __mac_set_proc(struct mac *mac_p); }
-386 AUE_NULL MSTD { int __mac_get_fd(int fd, \
+383 AUE_NULL STD { int kse_release(struct timespec *timeout); }
+384 AUE_NULL STD { int __mac_get_proc(struct mac *mac_p); }
+385 AUE_NULL STD { int __mac_set_proc(struct mac *mac_p); }
+386 AUE_NULL STD { int __mac_get_fd(int fd, \
struct mac *mac_p); }
-387 AUE_NULL MSTD { int __mac_get_file(const char *path_p, \
+387 AUE_NULL STD { int __mac_get_file(const char *path_p, \
struct mac *mac_p); }
-388 AUE_NULL MSTD { int __mac_set_fd(int fd, \
+388 AUE_NULL STD { int __mac_set_fd(int fd, \
struct mac *mac_p); }
-389 AUE_NULL MSTD { int __mac_set_file(const char *path_p, \
+389 AUE_NULL STD { int __mac_set_file(const char *path_p, \
struct mac *mac_p); }
-390 AUE_NULL MSTD { int kenv(int what, const char *name, \
+390 AUE_NULL STD { int kenv(int what, const char *name, \
char *value, int len); }
-391 AUE_NULL MSTD { int lchflags(const char *path, int flags); }
-392 AUE_NULL MSTD { int uuidgen(struct uuid *store, \
+391 AUE_LCHFLAGS STD { int lchflags(const char *path, int flags); }
+392 AUE_NULL STD { int uuidgen(struct uuid *store, \
int count); }
-393 AUE_NULL MSTD { int sendfile(int fd, int s, off_t offset, \
+393 AUE_SENDFILE STD { int sendfile(int fd, int s, off_t offset, \
size_t nbytes, struct sf_hdtr *hdtr, \
off_t *sbytes, int flags); }
-394 AUE_NULL MSTD { int mac_syscall(const char *policy, \
+394 AUE_NULL STD { int mac_syscall(const char *policy, \
int call, void *arg); }
-395 AUE_NULL MSTD { int getfsstat(struct statfs *buf, \
+395 AUE_GETFSSTAT STD { int getfsstat(struct statfs *buf, \
long bufsize, int flags); }
-396 AUE_NULL MSTD { int statfs(char *path, \
+396 AUE_STATFS STD { int statfs(char *path, \
struct statfs *buf); }
-397 AUE_NULL MSTD { int fstatfs(int fd, struct statfs *buf); }
-398 AUE_NULL MSTD { int fhstatfs(const struct fhandle *u_fhp, \
+397 AUE_FSTATFS STD { int fstatfs(int fd, struct statfs *buf); }
+398 AUE_FHSTATFS STD { int fhstatfs(const struct fhandle *u_fhp, \
struct statfs *buf); }
399 AUE_NULL UNIMPL nosys
-400 AUE_NULL MNOSTD { int ksem_close(semid_t id); }
-401 AUE_NULL MNOSTD { int ksem_post(semid_t id); }
-402 AUE_NULL MNOSTD { int ksem_wait(semid_t id); }
-403 AUE_NULL MNOSTD { int ksem_trywait(semid_t id); }
-404 AUE_NULL MNOSTD { int ksem_init(semid_t *idp, \
+400 AUE_NULL NOSTD { int ksem_close(semid_t id); }
+401 AUE_NULL NOSTD { int ksem_post(semid_t id); }
+402 AUE_NULL NOSTD { int ksem_wait(semid_t id); }
+403 AUE_NULL NOSTD { int ksem_trywait(semid_t id); }
+404 AUE_NULL NOSTD { int ksem_init(semid_t *idp, \
unsigned int value); }
-405 AUE_NULL MNOSTD { int ksem_open(semid_t *idp, \
+405 AUE_NULL NOSTD { int ksem_open(semid_t *idp, \
const char *name, int oflag, \
mode_t mode, unsigned int value); }
-406 AUE_NULL MNOSTD { int ksem_unlink(const char *name); }
-407 AUE_NULL MNOSTD { int ksem_getvalue(semid_t id, int *val); }
-408 AUE_NULL MNOSTD { int ksem_destroy(semid_t id); }
-409 AUE_NULL MSTD { int __mac_get_pid(pid_t pid, \
+406 AUE_NULL NOSTD { int ksem_unlink(const char *name); }
+407 AUE_NULL NOSTD { int ksem_getvalue(semid_t id, int *val); }
+408 AUE_NULL NOSTD { int ksem_destroy(semid_t id); }
+409 AUE_NULL STD { int __mac_get_pid(pid_t pid, \
struct mac *mac_p); }
-410 AUE_NULL MSTD { int __mac_get_link(const char *path_p, \
+410 AUE_NULL STD { int __mac_get_link(const char *path_p, \
struct mac *mac_p); }
-411 AUE_NULL MSTD { int __mac_set_link(const char *path_p, \
+411 AUE_NULL STD { int __mac_set_link(const char *path_p, \
struct mac *mac_p); }
-412 AUE_NULL MSTD { int extattr_set_link(const char *path, \
- int attrnamespace, const char *attrname, \
- void *data, size_t nbytes); }
-413 AUE_NULL MSTD { ssize_t extattr_get_link(const char *path, \
- int attrnamespace, const char *attrname, \
- void *data, size_t nbytes); }
-414 AUE_NULL MSTD { int extattr_delete_link(const char *path, \
- int attrnamespace, \
+412 AUE_EXTATTR_SET_LINK STD { int extattr_set_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+413 AUE_EXTATTR_GET_LINK STD { ssize_t extattr_get_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+414 AUE_EXTATTR_DELETE_LINK STD { int extattr_delete_link( \
+ const char *path, int attrnamespace, \
const char *attrname); }
-415 AUE_NULL MSTD { int __mac_execve(char *fname, char **argv, \
+415 AUE_NULL STD { int __mac_execve(char *fname, char **argv, \
char **envv, struct mac *mac_p); }
-416 AUE_NULL MSTD { int sigaction(int sig, \
+416 AUE_SIGACTION STD { int sigaction(int sig, \
const struct sigaction *act, \
struct sigaction *oact); }
-417 AUE_NULL MSTD { int sigreturn( \
+417 AUE_SIGRETURN STD { int sigreturn( \
const struct __ucontext *sigcntxp); }
418 AUE_NULL UNIMPL __xstat
419 AUE_NULL UNIMPL __xfstat
420 AUE_NULL UNIMPL __xlstat
-421 AUE_NULL MSTD { int getcontext(struct __ucontext *ucp); }
-422 AUE_NULL MSTD { int setcontext( \
+421 AUE_NULL STD { int getcontext(struct __ucontext *ucp); }
+422 AUE_NULL STD { int setcontext( \
const struct __ucontext *ucp); }
-423 AUE_NULL MSTD { int swapcontext(struct __ucontext *oucp, \
+423 AUE_NULL STD { int swapcontext(struct __ucontext *oucp, \
const struct __ucontext *ucp); }
-424 AUE_NULL MSTD { int swapoff(const char *name); }
-425 AUE_NULL MSTD { int __acl_get_link(const char *path, \
+424 AUE_SWAPOFF STD { int swapoff(const char *name); }
+425 AUE_NULL STD { int __acl_get_link(const char *path, \
acl_type_t type, struct acl *aclp); }
-426 AUE_NULL MSTD { int __acl_set_link(const char *path, \
+426 AUE_NULL STD { int __acl_set_link(const char *path, \
acl_type_t type, struct acl *aclp); }
-427 AUE_NULL MSTD { int __acl_delete_link(const char *path, \
+427 AUE_NULL STD { int __acl_delete_link(const char *path, \
acl_type_t type); }
-428 AUE_NULL MSTD { int __acl_aclcheck_link(const char *path, \
+428 AUE_NULL STD { int __acl_aclcheck_link(const char *path, \
acl_type_t type, struct acl *aclp); }
-429 AUE_NULL MSTD { int sigwait(const sigset_t *set, \
+429 AUE_SIGWAIT STD { int sigwait(const sigset_t *set, \
int *sig); }
-430 AUE_NULL MSTD { int thr_create(ucontext_t *ctx, long *id, \
+430 AUE_NULL STD { int thr_create(ucontext_t *ctx, long *id, \
int flags); }
-431 AUE_NULL MSTD { void thr_exit(long *state); }
-432 AUE_NULL MSTD { int thr_self(long *id); }
-433 AUE_NULL MSTD { int thr_kill(long id, int sig); }
-434 AUE_NULL MSTD { int _umtx_lock(struct umtx *umtx); }
-435 AUE_NULL MSTD { int _umtx_unlock(struct umtx *umtx); }
-436 AUE_NULL MSTD { int jail_attach(int jid); }
-437 AUE_NULL MSTD { ssize_t extattr_list_fd(int fd, \
+431 AUE_NULL STD { void thr_exit(long *state); }
+432 AUE_NULL STD { int thr_self(long *id); }
+433 AUE_NULL STD { int thr_kill(long id, int sig); }
+434 AUE_NULL STD { int _umtx_lock(struct umtx *umtx); }
+435 AUE_NULL STD { int _umtx_unlock(struct umtx *umtx); }
+436 AUE_NULL STD { int jail_attach(int jid); }
+437 AUE_EXTATTR_LIST_FD STD { ssize_t extattr_list_fd(int fd, \
int attrnamespace, void *data, \
size_t nbytes); }
-438 AUE_NULL MSTD { ssize_t extattr_list_file( \
+438 AUE_EXTATTR_LIST_FILE STD { ssize_t extattr_list_file( \
const char *path, int attrnamespace, \
void *data, size_t nbytes); }
-439 AUE_NULL MSTD { ssize_t extattr_list_link( \
+439 AUE_EXTATTR_LIST_LINK STD { ssize_t extattr_list_link( \
const char *path, int attrnamespace, \
void *data, size_t nbytes); }
-440 AUE_NULL MSTD { int kse_switchin( \
+440 AUE_NULL STD { int kse_switchin( \
struct kse_thr_mailbox *tmbx, \
int flags); }
-441 AUE_NULL MNOSTD { int ksem_timedwait(semid_t id, \
- struct timespec *abstime); }
-442 AUE_NULL MSTD { int thr_suspend( \
+441 AUE_NULL NOSTD { int ksem_timedwait(semid_t id, \
+ const struct timespec *abstime); }
+442 AUE_NULL STD { int thr_suspend( \
const struct timespec *timeout); }
-443 AUE_NULL MSTD { int thr_wake(long id); }
-444 AUE_NULL MSTD { int kldunloadf(int fileid, int flags); }
-445 AUE_NULL MSTD { int audit(const void *record, \
+443 AUE_NULL STD { int thr_wake(long id); }
+444 AUE_MODUNLOAD STD { int kldunloadf(int fileid, int flags); }
+445 AUE_AUDIT STD { int audit(const void *record, \
u_int length); }
-446 AUE_NULL MSTD { int auditon(int cmd, void *data, \
+446 AUE_AUDITON STD { int auditon(int cmd, void *data, \
u_int length); }
-447 AUE_NULL MSTD { int getauid(uid_t *auid); }
-448 AUE_NULL MSTD { int setauid(uid_t *auid); }
-449 AUE_NULL MSTD { int getaudit(struct auditinfo *auditinfo); }
-450 AUE_NULL MSTD { int setaudit(struct auditinfo *auditinfo); }
-451 AUE_NULL MSTD { int getaudit_addr( \
+447 AUE_GETAUID STD { int getauid(uid_t *auid); }
+448 AUE_SETAUID STD { int setauid(uid_t *auid); }
+449 AUE_GETAUDIT STD { int getaudit(struct auditinfo *auditinfo); }
+450 AUE_SETAUDIT STD { int setaudit(struct auditinfo *auditinfo); }
+451 AUE_GETAUDIT_ADDR STD { int getaudit_addr( \
struct auditinfo_addr *auditinfo_addr, \
u_int length); }
-452 AUE_NULL MSTD { int setaudit_addr( \
+452 AUE_SETAUDIT_ADDR STD { int setaudit_addr( \
struct auditinfo_addr *auditinfo_addr, \
u_int length); }
-453 AUE_NULL MSTD { int auditctl(int cmd, char *path); }
-454 AUE_NULL MSTD { int _umtx_op(struct umtx *umtx, int op, \
- long id, void *uaddr, void *uaddr2); }
-455 AUE_NULL MSTD { int thr_new(struct thr_param *param, \
+453 AUE_AUDITCTL STD { int auditctl(char *path); }
+454 AUE_NULL STD { int _umtx_op(void *obj, int op, \
+ u_long val, void *uaddr1, void *uaddr2); }
+455 AUE_NULL STD { int thr_new(struct thr_param *param, \
int param_size); }
-
+456 AUE_NULL STD { int sigqueue(pid_t pid, int signum, void *value); }
+457 AUE_NULL NOSTD { int kmq_open(const char *path, int flags, \
+ mode_t mode, const struct mq_attr *attr); }
+458 AUE_NULL NOSTD { int kmq_setattr(int mqd, \
+ const struct mq_attr *attr, \
+ struct mq_attr *oattr); }
+459 AUE_NULL NOSTD { int kmq_timedreceive(int mqd, \
+ char *msg_ptr, size_t msg_len, \
+ unsigned *msg_prio, \
+ const struct timespec *abs_timeout); }
+460 AUE_NULL NOSTD { int kmq_timedsend(int mqd, \
+ const char *msg_ptr, size_t msg_len,\
+ unsigned msg_prio, \
+ const struct timespec *abs_timeout);}
+461 AUE_NULL NOSTD { int kmq_notify(int mqd, \
+ const struct sigevent *sigev); }
+462 AUE_NULL NOSTD { int kmq_unlink(const char *path); }
+463 AUE_NULL STD { int abort2(const char *why, int nargs, void **args); }
+464 AUE_NULL STD { int thr_set_name(long id, const char *name); }
+465 AUE_NULL NOSTD { int aio_fsync(int op, struct aiocb *aiocbp); }
+466 AUE_RTPRIO STD { int rtprio_thread(int function, \
+ lwpid_t lwpid, struct rtprio *rtp); }
+467 AUE_NULL UNIMPL nosys
+468 AUE_NULL UNIMPL nosys
+469 AUE_NULL UNIMPL __getpath_fromfd
+470 AUE_NULL UNIMPL __getpath_fromaddr
+471 AUE_NULL STD { int sctp_peeloff(int sd, uint32_t name); }
+472 AUE_NULL STD { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, \
+ caddr_t to, __socklen_t tolen, \
+ struct sctp_sndrcvinfo *sinfo, int flags); }
+473 AUE_NULL STD { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, \
+ caddr_t to, __socklen_t tolen, \
+ struct sctp_sndrcvinfo *sinfo, int flags); }
+474 AUE_NULL STD { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, \
+ struct sockaddr * from, __socklen_t *fromlenaddr, \
+ struct sctp_sndrcvinfo *sinfo, int *msg_flags); }
+475 AUE_PREAD STD { ssize_t pread(int fd, void *buf, \
+ size_t nbyte, off_t offset); }
+476 AUE_PWRITE STD { ssize_t pwrite(int fd, const void *buf, \
+ size_t nbyte, off_t offset); }
+477 AUE_MMAP STD { caddr_t mmap(caddr_t addr, size_t len, \
+ int prot, int flags, int fd, off_t pos); }
+478 AUE_LSEEK STD { off_t lseek(int fd, off_t offset, \
+ int whence); }
+479 AUE_TRUNCATE STD { int truncate(char *path, off_t length); }
+480 AUE_FTRUNCATE STD { int ftruncate(int fd, off_t length); }
+481 AUE_KILL STD { int thr_kill2(pid_t pid, long id, int sig); }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
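
For reference, every syscalls.master entry above follows the same column layout: syscall number, audit event (AUE_*), entry type (STD, NOSTD, UNIMPL, ...), and the C prototype in braces, with "\" continuing long prototypes. A purely hypothetical new entry (number and names invented for illustration) would be written as:

    482	AUE_NULL	STD	{ int example_call(int fd, size_t len); }

makesyscalls.sh consumes these entries to regenerate init_sysent.c, syscalls.c, syscall.h and sysproto.h, which is why the MSTD->STD and AUE_* changes above ripple into those generated files.
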
Index: subr_sleepqueue.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_sleepqueue.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_sleepqueue.c -L sys/kern/subr_sleepqueue.c -u -r1.2 -r1.3
--- sys/kern/subr_sleepqueue.c
+++ sys/kern/subr_sleepqueue.c
@@ -59,17 +59,18 @@
* variables.
*/
-#include "opt_sleepqueue_profiling.h"
-
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/subr_sleepqueue.c,v 1.18.2.2 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_sleepqueue.c,v 1.39.4.1 2008/01/29 16:37:04 jhb Exp $");
+
+#include "opt_sleepqueue_profiling.h"
+#include "opt_ddb.h"
+#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
-#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
@@ -77,6 +78,12 @@
#include <sys/sleepqueue.h>
#include <sys/sysctl.h>
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
/*
* Constants for the hash table of sleep queue chains. These constants are
* the same ones that 4BSD (and possibly earlier versions of BSD) used.
@@ -89,7 +96,7 @@
#define SC_SHIFT 8
#define SC_HASH(wc) (((uintptr_t)(wc) >> SC_SHIFT) & SC_MASK)
#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
-
+#define NR_SLEEPQS 2
/*
* There two different lists of sleep queues. Both lists are connected
* via the sq_hash entries. The first list is the sleep queue chain list
@@ -109,13 +116,13 @@
* c - sleep queue chain lock
*/
struct sleepqueue {
- TAILQ_HEAD(, thread) sq_blocked; /* (c) Blocked threads. */
+ TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */
LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */
LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */
void *sq_wchan; /* (c) Wait channel. */
#ifdef INVARIANTS
int sq_type; /* (c) Queue type. */
- struct mtx *sq_lock; /* (c) Associated lock. */
+ struct lock_object *sq_lock; /* (c) Associated lock. */
#endif
};
@@ -137,16 +144,22 @@
0, "maxmimum depth achieved of a single chain");
#endif
static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
-
-static MALLOC_DEFINE(M_SLEEPQUEUE, "sleep queues", "sleep queues");
+static uma_zone_t sleepq_zone;
/*
* Prototypes for non-exported routines.
*/
+static int sleepq_catch_signals(void *wchan);
+static int sleepq_check_signals(void);
static int sleepq_check_timeout(void);
+#ifdef INVARIANTS
+static void sleepq_dtor(void *mem, int size, void *arg);
+#endif
+static int sleepq_init(void *mem, int size, int flags);
+static void sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
+ int pri);
static void sleepq_switch(void *wchan);
static void sleepq_timeout(void *arg);
-static void sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri);
/*
* Early initialization of sleep queues that is called from the sleepinit()
@@ -177,21 +190,24 @@
NULL);
#endif
}
+ sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
+#ifdef INVARIANTS
+ NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#else
+ NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#endif
+
thread0.td_sleepqueue = sleepq_alloc();
}
/*
- * Malloc and initialize a new sleep queue for a new thread.
+ * Get a sleep queue for a new thread.
*/
struct sleepqueue *
sleepq_alloc(void)
{
- struct sleepqueue *sq;
- sq = malloc(sizeof(struct sleepqueue), M_SLEEPQUEUE, M_WAITOK | M_ZERO);
- TAILQ_INIT(&sq->sq_blocked);
- LIST_INIT(&sq->sq_free);
- return (sq);
+ return (uma_zalloc(sleepq_zone, M_WAITOK));
}
/*
@@ -201,9 +217,7 @@
sleepq_free(struct sleepqueue *sq)
{
- MPASS(sq != NULL);
- MPASS(TAILQ_EMPTY(&sq->sq_blocked));
- free(sq, M_SLEEPQUEUE);
+ uma_zfree(sleepq_zone, sq);
}
/*
@@ -257,7 +271,8 @@
* woken up.
*/
void
-sleepq_add(void *wchan, struct mtx *lock, const char *wmesg, int flags)
+sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
+ int queue)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
@@ -268,10 +283,11 @@
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(td->td_sleepqueue != NULL);
MPASS(wchan != NULL);
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
/* If this thread is not allowed to sleep, die a horrible death. */
KASSERT(!(td->td_pflags & TDP_NOSLEEPING),
- ("trying to sleep while sleeping is prohibited"));
+ ("Trying sleep, but thread marked as sleeping prohibited"));
/* Look up the sleep queue associated with the wait channel 'wchan'. */
sq = sleepq_lookup(wchan);
@@ -282,6 +298,19 @@
* into the sleep queue already in use by this wait channel.
*/
if (sq == NULL) {
+#ifdef INVARIANTS
+ int i;
+
+ sq = td->td_sleepqueue;
+ for (i = 0; i < NR_SLEEPQS; i++)
+ KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
+ ("thread's sleep queue %d is not empty", i));
+ KASSERT(LIST_EMPTY(&sq->sq_free),
+ ("thread's sleep queue has a non-empty free list"));
+ KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
+ sq->sq_lock = lock;
+ sq->sq_type = flags & SLEEPQ_TYPE;
+#endif
#ifdef SLEEPQUEUE_PROFILING
sc->sc_depth++;
if (sc->sc_depth > sc->sc_max_depth) {
@@ -292,32 +321,24 @@
#endif
sq = td->td_sleepqueue;
LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
- KASSERT(TAILQ_EMPTY(&sq->sq_blocked),
- ("thread's sleep queue has a non-empty queue"));
- KASSERT(LIST_EMPTY(&sq->sq_free),
- ("thread's sleep queue has a non-empty free list"));
- KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
sq->sq_wchan = wchan;
-#ifdef INVARIANTS
- sq->sq_lock = lock;
- sq->sq_type = flags & SLEEPQ_TYPE;
-#endif
} else {
MPASS(wchan == sq->sq_wchan);
MPASS(lock == sq->sq_lock);
MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
}
- TAILQ_INSERT_TAIL(&sq->sq_blocked, td, td_slpq);
+ thread_lock(td);
+ TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
td->td_sleepqueue = NULL;
- mtx_lock_spin(&sched_lock);
+ td->td_sqqueue = queue;
td->td_wchan = wchan;
td->td_wmesg = wmesg;
if (flags & SLEEPQ_INTERRUPTIBLE) {
td->td_flags |= TDF_SINTR;
td->td_flags &= ~TDF_SLEEPABORT;
}
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
/*
@@ -342,7 +363,8 @@
/*
* Marks the pending sleep of the current thread as interruptible and
* makes an initial check for pending signals before putting a thread
- * to sleep. Return with sleep queue and scheduler lock held.
+ * to sleep. Enters and exits with the thread lock held. Thread lock
+ * may have transitioned from the sleepq lock to a run lock.
*/
static int
sleepq_catch_signals(void *wchan)
@@ -362,7 +384,6 @@
CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
(void *)td, (long)p->p_pid, p->p_comm);
- MPASS(td->td_flags & TDF_SINTR);
mtx_unlock_spin(&sc->sc_lock);
/* See if there are any pending signals for this thread. */
@@ -381,68 +402,79 @@
ret = ERESTART;
mtx_unlock(&ps->ps_mtx);
}
-
+ /*
+ * Lock sleepq chain before unlocking proc
+ * without this, we could lose a race.
+ */
+ mtx_lock_spin(&sc->sc_lock);
+ PROC_UNLOCK(p);
+ thread_lock(td);
if (ret == 0) {
- mtx_lock_spin(&sc->sc_lock);
- /*
- * Lock sched_lock before unlocking proc lock,
- * without this, we could lose a race.
- */
- mtx_lock_spin(&sched_lock);
- PROC_UNLOCK(p);
- if (!(td->td_flags & TDF_INTERRUPT))
+ if (!(td->td_flags & TDF_INTERRUPT)) {
+ sleepq_switch(wchan);
return (0);
+ }
/* KSE threads tried unblocking us. */
ret = td->td_intrval;
- mtx_unlock_spin(&sched_lock);
- MPASS(ret == EINTR || ret == ERESTART);
- } else {
- PROC_UNLOCK(p);
- mtx_lock_spin(&sc->sc_lock);
+ MPASS(ret == EINTR || ret == ERESTART || ret == EWOULDBLOCK);
}
/*
* There were pending signals and this thread is still
* on the sleep queue, remove it from the sleep queue.
*/
- sq = sleepq_lookup(wchan);
- mtx_lock_spin(&sched_lock);
- if (TD_ON_SLEEPQ(td))
+ if (TD_ON_SLEEPQ(td)) {
+ sq = sleepq_lookup(wchan);
sleepq_resume_thread(sq, td, -1);
- td->td_flags &= ~TDF_SINTR;
+ }
+ mtx_unlock_spin(&sc->sc_lock);
+ MPASS(td->td_lock != &sc->sc_lock);
return (ret);
}
/*
- * Switches to another thread if we are still asleep on a sleep queue and
- * drop the lock on the sleep queue chain. Returns with sched_lock held.
+ * Switches to another thread if we are still asleep on a sleep queue.
+ * Returns with thread lock.
*/
static void
sleepq_switch(void *wchan)
{
struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
struct thread *td;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If we have a sleep queue, then we've already been woken up, so
* just return.
*/
if (td->td_sleepqueue != NULL) {
- MPASS(!TD_ON_SLEEPQ(td));
mtx_unlock_spin(&sc->sc_lock);
return;
}
/*
- * Otherwise, actually go to sleep.
+ * If TDF_TIMEOUT is set, then our sleep has been timed out
+ * already but we are still on the sleep queue, so dequeue the
+ * thread and return.
*/
- mtx_unlock_spin(&sc->sc_lock);
+ if (td->td_flags & TDF_TIMEOUT) {
+ MPASS(TD_ON_SLEEPQ(td));
+ sq = sleepq_lookup(wchan);
+ sleepq_resume_thread(sq, td, -1);
+ mtx_unlock_spin(&sc->sc_lock);
+ return;
+ }
+
+ thread_lock_set(td, &sc->sc_lock);
+
+ MPASS(td->td_sleepqueue == NULL);
sched_sleep(td);
TD_SET_SLEEPING(td);
+ SCHED_STAT_INC(switch_sleepq);
mi_switch(SW_VOL, NULL);
KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
@@ -457,8 +489,8 @@
{
struct thread *td;
- mtx_assert(&sched_lock, MA_OWNED);
td = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If TDF_TIMEOUT is set, we timed out.
@@ -483,6 +515,7 @@
else if (callout_stop(&td->td_slpcallout) == 0) {
td->td_flags |= TDF_TIMEOUT;
TD_SET_SLEEPING(td);
+ SCHED_STAT_INC(switch_sleepqtimo);
mi_switch(SW_INVOL, NULL);
}
return (0);
@@ -496,8 +529,8 @@
{
struct thread *td;
- mtx_assert(&sched_lock, MA_OWNED);
td = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
/* We are no longer in an interruptible sleep. */
if (td->td_flags & TDF_SINTR)
@@ -520,11 +553,13 @@
void
sleepq_wait(void *wchan)
{
+ struct thread *td;
- MPASS(!(curthread->td_flags & TDF_SINTR));
- mtx_lock_spin(&sched_lock);
+ td = curthread;
+ MPASS(!(td->td_flags & TDF_SINTR));
+ thread_lock(td);
sleepq_switch(wchan);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
/*
@@ -538,12 +573,8 @@
int rval;
rcatch = sleepq_catch_signals(wchan);
- if (rcatch == 0)
- sleepq_switch(wchan);
- else
- sleepq_release(wchan);
rval = sleepq_check_signals();
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(curthread);
if (rcatch)
return (rcatch);
return (rval);
@@ -556,13 +587,16 @@
int
sleepq_timedwait(void *wchan)
{
+ struct thread *td;
int rval;
- MPASS(!(curthread->td_flags & TDF_SINTR));
- mtx_lock_spin(&sched_lock);
+ td = curthread;
+ MPASS(!(td->td_flags & TDF_SINTR));
+ thread_lock(td);
sleepq_switch(wchan);
rval = sleepq_check_timeout();
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
+
return (rval);
}
@@ -576,13 +610,9 @@
int rcatch, rvalt, rvals;
rcatch = sleepq_catch_signals(wchan);
- if (rcatch == 0)
- sleepq_switch(wchan);
- else
- sleepq_release(wchan);
rvalt = sleepq_check_timeout();
rvals = sleepq_check_signals();
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(curthread);
if (rcatch)
return (rcatch);
if (rvals)
@@ -602,12 +632,13 @@
MPASS(td != NULL);
MPASS(sq->sq_wchan != NULL);
MPASS(td->td_wchan == sq->sq_wchan);
+ MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
sc = SC_LOOKUP(sq->sq_wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
/* Remove the thread from the queue. */
- TAILQ_REMOVE(&sq->sq_blocked, td, td_slpq);
+ TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
/*
* Get a sleep queue for this thread. If this is the last waiter,
@@ -628,6 +659,7 @@
td->td_wmesg = NULL;
td->td_wchan = NULL;
+ td->td_flags &= ~TDF_SINTR;
/*
* Note that thread td might not be sleeping if it is running
@@ -647,22 +679,54 @@
setrunnable(td);
}
+#ifdef INVARIANTS
+/*
+ * UMA zone item deallocator.
+ */
+static void
+sleepq_dtor(void *mem, int size, void *arg)
+{
+ struct sleepqueue *sq;
+ int i;
+
+ sq = mem;
+ for (i = 0; i < NR_SLEEPQS; i++)
+ MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+sleepq_init(void *mem, int size, int flags)
+{
+ struct sleepqueue *sq;
+ int i;
+
+ bzero(mem, size);
+ sq = mem;
+ for (i = 0; i < NR_SLEEPQS; i++)
+ TAILQ_INIT(&sq->sq_blocked[i]);
+ LIST_INIT(&sq->sq_free);
+ return (0);
+}
+
/*
* Find the highest priority thread sleeping on a wait channel and resume it.
*/
void
-sleepq_signal(void *wchan, int flags, int pri)
+sleepq_signal(void *wchan, int flags, int pri, int queue)
{
struct sleepqueue *sq;
struct thread *td, *besttd;
CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
- if (sq == NULL) {
- sleepq_release(wchan);
+ if (sq == NULL)
return;
- }
KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
("%s: mismatch between sleep/wakeup and cv_*", __func__));
@@ -673,27 +737,28 @@
* the tail of sleep queues.
*/
besttd = NULL;
- TAILQ_FOREACH(td, &sq->sq_blocked, td_slpq) {
+ TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
if (besttd == NULL || td->td_priority < besttd->td_priority)
besttd = td;
}
MPASS(besttd != NULL);
- mtx_lock_spin(&sched_lock);
+ thread_lock(besttd);
sleepq_resume_thread(sq, besttd, pri);
- mtx_unlock_spin(&sched_lock);
- sleepq_release(wchan);
+ thread_unlock(besttd);
}
/*
* Resume all threads sleeping on a specified wait channel.
*/
void
-sleepq_broadcast(void *wchan, int flags, int pri)
+sleepq_broadcast(void *wchan, int flags, int pri, int queue)
{
struct sleepqueue *sq;
+ struct thread *td;
CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL) {
sleepq_release(wchan);
@@ -703,10 +768,12 @@
("%s: mismatch between sleep/wakeup and cv_*", __func__));
/* Resume all blocked threads on the sleep queue. */
- mtx_lock_spin(&sched_lock);
- while (!TAILQ_EMPTY(&sq->sq_blocked))
- sleepq_resume_thread(sq, TAILQ_FIRST(&sq->sq_blocked), pri);
- mtx_unlock_spin(&sched_lock);
+ while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) {
+ td = TAILQ_FIRST(&sq->sq_blocked[queue]);
+ thread_lock(td);
+ sleepq_resume_thread(sq, td, pri);
+ thread_unlock(td);
+ }
sleepq_release(wchan);
}
@@ -717,6 +784,7 @@
static void
sleepq_timeout(void *arg)
{
+ struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
void *wchan;
@@ -729,38 +797,30 @@
* First, see if the thread is asleep and get the wait channel if
* it is.
*/
- mtx_lock_spin(&sched_lock);
- if (TD_ON_SLEEPQ(td)) {
+ thread_lock(td);
+ if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
wchan = td->td_wchan;
- mtx_unlock_spin(&sched_lock);
- sleepq_lock(wchan);
+ sc = SC_LOOKUP(wchan);
+ MPASS(td->td_lock == &sc->sc_lock);
sq = sleepq_lookup(wchan);
- mtx_lock_spin(&sched_lock);
- } else {
- wchan = NULL;
- sq = NULL;
+ MPASS(sq != NULL);
+ td->td_flags |= TDF_TIMEOUT;
+ sleepq_resume_thread(sq, td, -1);
+ thread_unlock(td);
+ return;
}
/*
- * At this point, if the thread is still on the sleep queue,
- * we have that sleep queue locked as it cannot migrate sleep
- * queues while we dropped sched_lock. If it had resumed and
- * was on another CPU while the lock was dropped, it would have
- * seen that TDF_TIMEOUT and TDF_TIMOFAIL are clear and the
- * call to callout_stop() to stop this routine would have failed
- * meaning that it would have already set TDF_TIMEOUT to
- * synchronize with this function.
+ * If the thread is on the SLEEPQ but isn't sleeping yet, it
+ * can either be on another CPU in between sleepq_add() and
+ * one of the sleepq_*wait*() routines or it can be in
+ * sleepq_catch_signals().
*/
if (TD_ON_SLEEPQ(td)) {
- MPASS(td->td_wchan == wchan);
- MPASS(sq != NULL);
td->td_flags |= TDF_TIMEOUT;
- sleepq_resume_thread(sq, td, -1);
- mtx_unlock_spin(&sched_lock);
- sleepq_release(wchan);
+ thread_unlock(td);
return;
- } else if (wchan != NULL)
- sleepq_release(wchan);
+ }
/*
* Now check for the edge cases. First, if TDF_TIMEOUT is set,
@@ -778,7 +838,7 @@
setrunnable(td);
} else
td->td_flags |= TDF_TIMOFAIL;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
/*
@@ -798,33 +858,36 @@
MPASS(wchan != NULL);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
- mtx_lock_spin(&sched_lock);
+ /*
+ * We can not lock the thread here as it may be sleeping on a
+ * different sleepq. However, holding the sleepq lock for this
+ * wchan can guarantee that we do not miss a wakeup for this
+ * channel. The asserts below will catch any false positives.
+ */
if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
- mtx_unlock_spin(&sched_lock);
sleepq_release(wchan);
return;
}
- MPASS(sq != NULL);
-
/* Thread is asleep on sleep queue sq, so wake it up. */
+ thread_lock(td);
+ MPASS(sq != NULL);
+ MPASS(td->td_wchan == wchan);
sleepq_resume_thread(sq, td, -1);
+ thread_unlock(td);
sleepq_release(wchan);
- mtx_unlock_spin(&sched_lock);
}
/*
* Abort a thread as if an interrupt had occurred. Only abort
* interruptible waits (unfortunately it isn't safe to abort others).
- *
- * XXX: What in the world does the comment below mean?
- * Also, whatever the signal code does...
*/
void
sleepq_abort(struct thread *td, int intrval)
{
+ struct sleepqueue *sq;
void *wchan;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_flags & TDF_SINTR);
MPASS(intrval == EINTR || intrval == ERESTART);
@@ -838,12 +901,87 @@
CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_proc->p_comm);
+ td->td_intrval = intrval;
+ td->td_flags |= TDF_SLEEPABORT;
+ /*
+ * If the thread has not slept yet it will find the signal in
+ * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise
+ * we have to do it here.
+ */
+ if (!TD_IS_SLEEPING(td))
+ return;
wchan = td->td_wchan;
- if (wchan != NULL) {
- td->td_intrval = intrval;
- td->td_flags |= TDF_SLEEPABORT;
- }
- mtx_unlock_spin(&sched_lock);
- sleepq_remove(td, wchan);
- mtx_lock_spin(&sched_lock);
+ MPASS(wchan != NULL);
+ sq = sleepq_lookup(wchan);
+ MPASS(sq != NULL);
+
+ /* Thread is asleep on sleep queue sq, so wake it up. */
+ sleepq_resume_thread(sq, td, -1);
}
+
+#ifdef DDB
+DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+#ifdef INVARIANTS
+ struct lock_object *lock;
+#endif
+ struct thread *td;
+ void *wchan;
+ int i;
+
+ if (!have_addr)
+ return;
+
+ /*
+ * First, see if there is an active sleep queue for the wait channel
+ * indicated by the address.
+ */
+ wchan = (void *)addr;
+ sc = SC_LOOKUP(wchan);
+ LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+ if (sq->sq_wchan == wchan)
+ goto found;
+
+ /*
+ * Second, see if there is an active sleep queue at the address
+ * indicated.
+ */
+ for (i = 0; i < SC_TABLESIZE; i++)
+ LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
+ if (sq == (struct sleepqueue *)addr)
+ goto found;
+ }
+
+ db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
+ return;
+found:
+ db_printf("Wait channel: %p\n", sq->sq_wchan);
+#ifdef INVARIANTS
+ db_printf("Queue type: %d\n", sq->sq_type);
+ if (sq->sq_lock) {
+ lock = sq->sq_lock;
+ db_printf("Associated Interlock: %p - (%s) %s\n", lock,
+ LOCK_CLASS(lock)->lc_name, lock->lo_name);
+ }
+#endif
+ db_printf("Blocked threads:\n");
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ db_printf("\nQueue[%d]:\n", i);
+ if (TAILQ_EMPTY(&sq->sq_blocked[i]))
+ db_printf("\tempty\n");
+ else
+ TAILQ_FOREACH(td, &sq->sq_blocked[0],
+ td_slpq) {
+ db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid,
+ td->td_name[i] != '\0' ? td->td_name :
+ td->td_proc->p_comm);
+ }
+ }
+}
+
+/* Alias 'show sleepqueue' to 'show sleepq'. */
+DB_SET(sleepqueue, db_show_sleepqueue, db_show_cmd_set, 0, NULL);
+#endif
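
As a rough illustration of the reworked interface (not part of this diff): a sleeper and a waker under the new sleepq_add()/sleepq_signal() signatures might look like the sketch below. The wait channel, mutex and wmesg names are invented, queue index 0 is one of the NR_SLEEPQS queues, and the point at which the interlock is dropped is only one valid arrangement.

    /* Sleeper side (hypothetical names). */
    mtx_lock(&example_mtx);
    while (!condition_met) {
            sleepq_lock(&example_chan);
            mtx_unlock(&example_mtx);       /* safe: chain lock prevents lost wakeups */
            sleepq_add(&example_chan, &example_mtx.lock_object,
                "examplewt", SLEEPQ_SLEEP, 0);
            sleepq_wait(&example_chan);
            mtx_lock(&example_mtx);
    }
    mtx_unlock(&example_mtx);

    /* Waker side: sleepq_signal() no longer releases the chain lock itself. */
    sleepq_lock(&example_chan);
    sleepq_signal(&example_chan, SLEEPQ_SLEEP, -1, 0);
    sleepq_release(&example_chan);
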
Index: kern_sysctl.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_sysctl.c -L sys/kern/kern_sysctl.c -u -r1.2 -r1.3
--- sys/kern/kern_sysctl.c
+++ sys/kern/kern_sysctl.c
@@ -36,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_sysctl.c,v 1.165.2.3 2006/03/01 21:08:53 andre Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_sysctl.c,v 1.177 2007/09/02 09:59:33 rwatson Exp $");
#include "opt_compat.h"
#include "opt_mac.h"
@@ -45,13 +45,16 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
+
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -510,7 +513,7 @@
{
int error;
- error = suser(req->td);
+ error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
if (error)
return (error);
sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
@@ -889,6 +892,31 @@
}
/*
+ * Handle a 64 bit int, signed or unsigned. arg1 points to it.
+ */
+
+int
+sysctl_handle_quad(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ uint64_t tmpout;
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (!arg1)
+ return (EINVAL);
+ tmpout = *(uint64_t *)arg1;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
+
+ if (error || !req->newptr)
+ return (error);
+
+ error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
+ return (error);
+}
+
+/*
* Handle our generic '\0' terminated 'C' string.
* Two cases:
* a variable string: point arg1 at it, arg2 is max length.
@@ -1135,10 +1163,6 @@
/*
* Wire the user space destination buffer. If set to a value greater than
* zero, the len parameter limits the maximum amount of wired memory.
- *
- * XXX - The len parameter is currently ignored due to the lack of
- * a place to save it in the sysctl_req structure so that the matching
- * amount of memory can be unwired in the sysctl exit code.
*/
int
sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
@@ -1255,13 +1279,10 @@
/* Is this sysctl writable by only privileged users? */
if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
- int flags;
-
if (oid->oid_kind & CTLFLAG_PRISON)
- flags = SUSER_ALLOWJAIL;
+ error = priv_check(req->td, PRIV_SYSCTL_WRITEJAIL);
else
- flags = 0;
- error = suser_cred(req->td->td_ucred, flags);
+ error = priv_check(req->td, PRIV_SYSCTL_WRITE);
if (error)
return (error);
}
@@ -1297,10 +1318,6 @@
size_t newlen;
};
#endif
-
-/*
- * MPSAFE
- */
int
__sysctl(struct thread *td, struct sysctl_args *uap)
{
@@ -1366,7 +1383,7 @@
}
if (new != NULL) {
- if (!useracc(new, req.newlen, VM_PROT_READ))
+ if (!useracc(new, newlen, VM_PROT_READ))
return (EFAULT);
req.newlen = newlen;
req.newptr = new;
@@ -1452,6 +1469,7 @@
/* the actual string data is appended here */
} bsdi_si;
+
/*
* this data is appended to the end of the bsdi_si structure during copyout.
* The "char *" offsets are relative to the base of the bsdi_si struct.
@@ -1468,10 +1486,6 @@
int arg;
};
#endif
-
-/*
- * MPSAFE
- */
int
ogetkerninfo(struct thread *td, struct getkerninfo_args *uap)
{
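
The new sysctl_handle_quad() exports a 64-bit value through a CTLTYPE_QUAD node. A minimal, hypothetical registration (variable name and description are made up) might look like:

    static uint64_t example_bytes;      /* hypothetical 64-bit counter */

    SYSCTL_PROC(_kern, OID_AUTO, example_bytes,
        CTLTYPE_QUAD | CTLFLAG_RW, &example_bytes, 0,
        sysctl_handle_quad, "QU", "Hypothetical 64-bit counter");
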
Index: sys_generic.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sys_generic.c -L sys/kern/sys_generic.c -u -r1.2 -r1.3
--- sys/kern/sys_generic.c
+++ sys/kern/sys_generic.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_generic.c,v 1.146 2005/07/07 18:17:55 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_generic.c,v 1.158 2007/07/04 22:57:21 peter Exp $");
#include "opt_compat.h"
#include "opt_ktrace.h"
@@ -68,8 +68,6 @@
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
-#include <vm/vm.h>
-#include <vm/vm_page.h>
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
@@ -83,9 +81,6 @@
off_t, int);
static void doselwakeup(struct selinfo *, int);
-/*
- * Read system call.
- */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
int fd;
@@ -93,9 +88,6 @@
size_t nbyte;
};
#endif
-/*
- * MPSAFE
- */
int
read(td, uap)
struct thread *td;
@@ -129,9 +121,6 @@
off_t offset;
};
#endif
-/*
- * MPSAFE
- */
int
pread(td, uap)
struct thread *td;
@@ -153,6 +142,20 @@
return(error);
}
+int
+freebsd6_pread(td, uap)
+ struct thread *td;
+ struct freebsd6_pread_args *uap;
+{
+ struct pread_args oargs;
+
+ oargs.fd = uap->fd;
+ oargs.buf = uap->buf;
+ oargs.nbyte = uap->nbyte;
+ oargs.offset = uap->offset;
+ return (pread(td, &oargs));
+}
+
/*
* Scatter read system call.
*/
@@ -163,9 +166,6 @@
u_int iovcnt;
};
#endif
-/*
- * MPSAFE
- */
int
readv(struct thread *td, struct readv_args *uap)
{
@@ -205,9 +205,6 @@
off_t offset;
};
#endif
-/*
- * MPSAFE
- */
int
preadv(struct thread *td, struct preadv_args *uap)
{
@@ -293,9 +290,6 @@
return (error);
}
-/*
- * Write system call
- */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
int fd;
@@ -303,9 +297,6 @@
size_t nbyte;
};
#endif
-/*
- * MPSAFE
- */
int
write(td, uap)
struct thread *td;
@@ -328,7 +319,7 @@
}
/*
- * Positioned write system call
+ * Positioned write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
@@ -339,9 +330,6 @@
off_t offset;
};
#endif
-/*
- * MPSAFE
- */
int
pwrite(td, uap)
struct thread *td;
@@ -363,8 +351,22 @@
return(error);
}
+int
+freebsd6_pwrite(td, uap)
+ struct thread *td;
+ struct freebsd6_pwrite_args *uap;
+{
+ struct pwrite_args oargs;
+
+ oargs.fd = uap->fd;
+ oargs.buf = uap->buf;
+ oargs.nbyte = uap->nbyte;
+ oargs.offset = uap->offset;
+ return (pwrite(td, &oargs));
+}
+
/*
- * Gather write system call
+ * Gather write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
@@ -373,9 +375,6 @@
u_int iovcnt;
};
#endif
-/*
- * MPSAFE
- */
int
writev(struct thread *td, struct writev_args *uap)
{
@@ -398,14 +397,14 @@
error = fget_write(td, fd, &fp);
if (error)
- return (EBADF); /* XXX this can't be right */
+ return (error);
error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
fdrop(fp, td);
return (error);
}
/*
- * Gather positioned write system call
+ * Gather positioned write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
@@ -415,9 +414,6 @@
off_t offset;
};
#endif
-/*
- * MPSAFE
- */
int
pwritev(struct thread *td, struct pwritev_args *uap)
{
@@ -444,7 +440,7 @@
error = fget_write(td, fd, &fp);
if (error)
- return (EBADF); /* XXX this can't be right */
+ return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (offset < 0 && fp->f_vnode->v_type != VCHR)
@@ -506,9 +502,6 @@
return (error);
}
-/*
- * Ioctl system call
- */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
int fd;
@@ -516,20 +509,14 @@
caddr_t data;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
- struct file *fp;
- struct filedesc *fdp;
u_long com;
- int error = 0;
+ int arg, error;
u_int size;
- caddr_t data, memp;
- int tmp;
+ caddr_t data;
if (uap->com > 0xffffffff) {
printf(
@@ -537,27 +524,7 @@
td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
uap->com &= 0xffffffff;
}
- if ((error = fget(td, uap->fd, &fp)) != 0)
- return (error);
- if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
- fdrop(fp, td);
- return (EBADF);
- }
- fdp = td->td_proc->p_fd;
- switch (com = uap->com) {
- case FIONCLEX:
- FILEDESC_LOCK_FAST(fdp);
- fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
- FILEDESC_UNLOCK_FAST(fdp);
- fdrop(fp, td);
- return (0);
- case FIOCLEX:
- FILEDESC_LOCK_FAST(fdp);
- fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
- FILEDESC_UNLOCK_FAST(fdp);
- fdrop(fp, td);
- return (0);
- }
+ com = uap->com;
/*
* Interpret high order word to find amount of data to be
@@ -571,23 +538,25 @@
#else
((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
- ((com & IOC_VOID) && size > 0)) {
- fdrop(fp, td);
+ ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
return (ENOTTY);
- }
if (size > 0) {
- memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
- data = memp;
- } else {
- memp = NULL;
+ if (!(com & IOC_VOID))
+ data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+ else {
+ /* Integer argument. */
+ arg = (intptr_t)uap->data;
+ data = (void *)&arg;
+ size = 0;
+ }
+ } else
data = (void *)&uap->data;
- }
if (com & IOC_IN) {
error = copyin(uap->data, data, (u_int)size);
if (error) {
- free(memp, M_IOCTLOPS);
- fdrop(fp, td);
+ if (size > 0)
+ free(data, M_IOCTLOPS);
return (error);
}
} else if (com & IOC_OUT) {
@@ -598,7 +567,43 @@
bzero(data, size);
}
- if (com == FIONBIO) {
+ error = kern_ioctl(td, uap->fd, com, data);
+
+ if (error == 0 && (com & IOC_OUT))
+ error = copyout(data, uap->data, (u_int)size);
+
+ if (size > 0)
+ free(data, M_IOCTLOPS);
+ return (error);
+}
+
+int
+kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
+{
+ struct file *fp;
+ struct filedesc *fdp;
+ int error;
+ int tmp;
+
+ if ((error = fget(td, fd, &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ fdp = td->td_proc->p_fd;
+ switch (com) {
+ case FIONCLEX:
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+ FILEDESC_XUNLOCK(fdp);
+ goto out;
+ case FIOCLEX:
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+ FILEDESC_XUNLOCK(fdp);
+ goto out;
+ case FIONBIO:
FILE_LOCK(fp);
if ((tmp = *(int *)data))
fp->f_flag |= FNONBLOCK;
@@ -606,7 +611,8 @@
fp->f_flag &= ~FNONBLOCK;
FILE_UNLOCK(fp);
data = (void *)&tmp;
- } else if (com == FIOASYNC) {
+ break;
+ case FIOASYNC:
FILE_LOCK(fp);
if ((tmp = *(int *)data))
fp->f_flag |= FASYNC;
@@ -614,15 +620,11 @@
fp->f_flag &= ~FASYNC;
FILE_UNLOCK(fp);
data = (void *)&tmp;
+ break;
}
error = fo_ioctl(fp, com, data, td->td_ucred, td);
-
- if (error == 0 && (com & IOC_OUT))
- error = copyout(data, uap->data, (u_int)size);
-
- if (memp != NULL)
- free(memp, M_IOCTLOPS);
+out:
fdrop(fp, td);
return (error);
}
@@ -635,9 +637,6 @@
u_int nselcoll; /* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
-/*
- * Select system call.
- */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
int nd;
@@ -645,9 +644,6 @@
struct timeval *tv;
};
#endif
-/*
- * MPSAFE
- */
int
select(td, uap)
register struct thread *td;
@@ -688,11 +684,10 @@
return (EINVAL);
fdp = td->td_proc->p_fd;
- FILEDESC_LOCK_FAST(fdp);
-
+ FILEDESC_SLOCK(fdp);
if (nd > td->td_proc->p_fd->fd_nfiles)
nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_SUNLOCK(fdp);
/*
* Allocate just enough bits for the non-null fd_sets. Use the
@@ -755,9 +750,9 @@
mtx_lock(&sellock);
retry:
ncoll = nselcoll;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags |= TDF_SELECT;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
mtx_unlock(&sellock);
error = selscan(td, ibits, obits, nd);
@@ -780,12 +775,12 @@
* collisions and rescan the file descriptors if
* necessary.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
goto retry;
}
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
if (timo > 0)
error = cv_timedwait_sig(&selwait, &sellock, timo);
@@ -797,9 +792,9 @@
done:
clear_selinfo_list(td);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags &= ~TDF_SELECT;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
mtx_unlock(&sellock);
done_nosellock:
@@ -839,7 +834,7 @@
static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
struct filedesc *fdp = td->td_proc->p_fd;
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
for (msk = 0; msk < 3; msk++) {
if (ibits[msk] == NULL)
continue;
@@ -850,7 +845,7 @@
if (!(bits & 1))
continue;
if ((fp = fget_locked(fdp, fd)) == NULL) {
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
return (EBADF);
}
if (fo_poll(fp, flag[msk], td->td_ucred,
@@ -862,14 +857,11 @@
}
}
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
td->td_retval[0] = n;
return (0);
}
-/*
- * Poll system call.
- */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
struct pollfd *fds;
@@ -877,9 +869,6 @@
int timeout;
};
#endif
-/*
- * MPSAFE
- */
int
poll(td, uap)
struct thread *td;
@@ -935,9 +924,9 @@
mtx_lock(&sellock);
retry:
ncoll = nselcoll;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags |= TDF_SELECT;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
mtx_unlock(&sellock);
error = pollscan(td, bits, nfds);
@@ -958,12 +947,12 @@
* sellock, so check TDF_SELECT and the number of collisions
* and rescan the file descriptors if necessary.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
goto retry;
}
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
if (timo > 0)
error = cv_timedwait_sig(&selwait, &sellock, timo);
@@ -975,9 +964,9 @@
done:
clear_selinfo_list(td);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags &= ~TDF_SELECT;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
mtx_unlock(&sellock);
done_nosellock:
@@ -1009,7 +998,7 @@
struct file *fp;
int n = 0;
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
for (i = 0; i < nfd; i++, fds++) {
if (fds->fd >= fdp->fd_nfiles) {
fds->revents = POLLNVAL;
@@ -1033,13 +1022,14 @@
}
}
}
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
td->td_retval[0] = n;
return (0);
}
/*
* OpenBSD poll system call.
+ *
* XXX this isn't quite a true representation.. OpenBSD uses select ops.
*/
#ifndef _SYS_SYSPROTO_H_
@@ -1049,9 +1039,6 @@
int timeout;
};
#endif
-/*
- * MPSAFE
- */
int
openbsd_poll(td, uap)
register struct thread *td;
@@ -1061,12 +1048,12 @@
}
/*
- * Remove the references to the thread from all of the objects
- * we were polling.
+ * Remove the references to the thread from all of the objects we were
+ * polling.
*
- * This code assumes that the underlying owner of the selinfo
- * structure will hold sellock before it changes it, and that
- * it will unlink itself from our list if it goes away.
+ * This code assumes that the underlying owner of the selinfo structure will
+ * hold sellock before it changes it, and that it will unlink itself from our
+ * list if it goes away.
*/
void
clear_selinfo_list(td)
@@ -1150,9 +1137,9 @@
}
TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
sip->si_thread = NULL;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags &= ~TDF_SELECT;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
sleepq_remove(td, &selwait);
mtx_unlock(&sellock);
}
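
With ioctl() split into a thin copyin/copyout wrapper around kern_ioctl(), in-kernel callers can issue an ioctl on a file descriptor directly. A hedged sketch (descriptor and error handling are illustrative; kern_ioctl() expects 'data' to point at kernel memory):

    int on = 1;
    int error;

    error = kern_ioctl(td, fd, FIONBIO, (caddr_t)&on);
    if (error != 0)
            return (error);
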
--- /dev/null
+++ sys/kern/p1003_1b.c
@@ -0,0 +1,321 @@
+/*-
+ * Copyright (c) 1996, 1997, 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* p1003_1b: Real Time common code.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/p1003_1b.c,v 1.36 2007/10/08 23:45:23 jeff Exp $");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
+
+/* The system calls return ENOSYS if an entry is called that is not run-time
+ * supported. I am also logging since some programs start to use this when
+ * they shouldn't. That will be removed if annoying.
+ */
+int
+syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ td->td_proc->p_comm, td->td_proc->p_pid, s);
+
+ /* a " return nosys(p, uap); " here causes a core dump.
+ */
+
+ return ENOSYS;
+}
+
+#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
+
+/* Not configured but loadable via a module:
+ */
+
+static int
+sched_attach(void)
+{
+ return 0;
+}
+
+SYSCALL_NOT_PRESENT_GEN(sched_setparam)
+SYSCALL_NOT_PRESENT_GEN(sched_getparam)
+SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_yield)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
+SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
+#else
+
+/* Configured in kernel version:
+ */
+static struct ksched *ksched;
+
+static int
+sched_attach(void)
+{
+ int ret = ksched_attach(&ksched);
+
+ if (ret == 0)
+ p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1);
+
+ return ret;
+}
+
+int
+sched_setparam(struct thread *td, struct sched_setparam_args *uap)
+{
+ struct thread *targettd;
+ struct proc *targetp;
+ int e;
+ struct sched_param sched_param;
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansched(td, targetp);
+ if (e == 0) {
+ e = ksched_setparam(ksched, targettd,
+ (const struct sched_param *)&sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+int
+sched_getparam(struct thread *td, struct sched_getparam_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ return (ESRCH);
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0) {
+ e = ksched_getparam(ksched, targettd, &sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ if (e == 0)
+ e = copyout(&sched_param, uap->param, sizeof(sched_param));
+ return (e);
+}
+
+int
+sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ /* Don't allow non root user to set a scheduler policy. */
+ e = priv_check(td, PRIV_SCHED_SET);
+ if (e)
+ return (e);
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansched(td, targetp);
+ if (e == 0) {
+ e = ksched_setscheduler(ksched, targettd,
+ uap->policy, (const struct sched_param *)&sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+int
+sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
+{
+ int e, policy;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ e = ESRCH;
+ goto done2;
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0) {
+ e = ksched_getscheduler(ksched, targettd, &policy);
+ td->td_retval[0] = policy;
+ }
+ PROC_UNLOCK(targetp);
+
+done2:
+ return (e);
+}
+
+int
+sched_yield(struct thread *td, struct sched_yield_args *uap)
+{
+
+ sched_relinquish(curthread);
+ return 0;
+}
+
+int
+sched_get_priority_max(struct thread *td,
+ struct sched_get_priority_max_args *uap)
+{
+ int error, prio;
+
+ error = ksched_get_priority_max(ksched, uap->policy, &prio);
+ td->td_retval[0] = prio;
+ return (error);
+}
+
+int
+sched_get_priority_min(struct thread *td,
+ struct sched_get_priority_min_args *uap)
+{
+ int error, prio;
+
+ error = ksched_get_priority_min(ksched, uap->policy, &prio);
+ td->td_retval[0] = prio;
+ return (error);
+}
+
+int
+sched_rr_get_interval(struct thread *td,
+ struct sched_rr_get_interval_args *uap)
+{
+ struct timespec timespec;
+ int error;
+
+ error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
+ if (error == 0)
+ error = copyout(&timespec, uap->interval, sizeof(timespec));
+ return (error);
+}
+
+int
+kern_sched_rr_get_interval(struct thread *td, pid_t pid,
+ struct timespec *ts)
+{
+ int e;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (pid == 0) {
+ targettd = td;
+ targetp = td->td_proc;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = td->td_proc;
+ PROC_LOCK(targetp);
+ targettd = thread_find(targetp, pid);
+ if (targettd == NULL) {
+ PROC_UNLOCK(targetp);
+ return (ESRCH);
+ }
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0)
+ e = ksched_rr_get_interval(ksched, targettd, ts);
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+#endif
+
+static void
+p31binit(void *notused)
+{
+ (void) sched_attach();
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
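
From userland these entry points are reached through the standard POSIX scheduling interfaces; a minimal example (plain POSIX usage, not part of this commit, and setting SCHED_RR requires privilege per the PRIV_SCHED_SET check above):

    #include <sched.h>
    #include <time.h>
    #include <stdio.h>
    #include <err.h>

    int
    main(void)
    {
            struct sched_param sp;
            struct timespec quantum;

            sp.sched_priority = sched_get_priority_min(SCHED_RR);
            if (sched_setscheduler(0, SCHED_RR, &sp) == -1) /* pid 0: calling process */
                    err(1, "sched_setscheduler");
            if (sched_rr_get_interval(0, &quantum) == -1)
                    err(1, "sched_rr_get_interval");
            printf("RR quantum: %ld.%09ld s\n",
                (long)quantum.tv_sec, quantum.tv_nsec);
            return (0);
    }
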
Index: tty_tty.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_tty.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/tty_tty.c -L sys/kern/tty_tty.c -u -r1.1.1.1 -r1.2
--- sys/kern/tty_tty.c
+++ sys/kern/tty_tty.c
@@ -24,15 +24,19 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_tty.c,v 1.56.2.1 2005/08/13 21:24:16 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_tty.c,v 1.60 2007/07/03 17:46:37 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/proc.h>
+#include <sys/sx.h>
#include <sys/vnode.h>
+#include <fs/devfs/devfs.h>
+#include <fs/devfs/devfs_int.h>
+
static d_open_t cttyopen;
static struct cdevsw ctty_cdevsw = {
@@ -60,13 +64,25 @@
return;
if (strcmp(name, "tty"))
return;
+ sx_sunlock(&clone_drain_lock);
+ mtx_lock(&Giant);
+ sx_slock(&proctree_lock);
+ sx_slock(&clone_drain_lock);
+ dev_lock();
if (!(curthread->td_proc->p_flag & P_CONTROLT))
*dev = ctty;
else if (curthread->td_proc->p_session->s_ttyvp == NULL)
*dev = ctty;
- else
+ else if (curthread->td_proc->p_session->s_ttyvp->v_type == VBAD ||
+ curthread->td_proc->p_session->s_ttyvp->v_rdev == NULL) {
+ /* e.g. s_ttyvp was revoked */
+ *dev = ctty;
+ } else
*dev = curthread->td_proc->p_session->s_ttyvp->v_rdev;
- dev_ref(*dev);
+ dev_refl(*dev);
+ dev_unlock();
+ sx_sunlock(&proctree_lock);
+ mtx_unlock(&Giant);
}
static void
Index: inflate.c
===================================================================
RCS file: /home/cvs/src/sys/kern/inflate.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/inflate.c -L sys/kern/inflate.c -u -r1.1.1.1 -r1.2
--- sys/kern/inflate.c
+++ sys/kern/inflate.c
@@ -9,7 +9,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/inflate.c,v 1.19 2003/06/11 00:56:54 obrien Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/inflate.c,v 1.20 2005/10/31 15:41:25 rwatson Exp $");
#include <sys/param.h>
#include <sys/inflate.h>
@@ -20,7 +20,7 @@
#include <sys/malloc.h>
#ifdef _KERNEL
-static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees");
+static MALLOC_DEFINE(M_GZIP, "gzip_trees", "Gzip trees");
#endif
/* needed to make inflate() work */
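
The malloc type rename only changes the short description reported by vmstat -m; allocations against the type are written the same way, e.g. (illustrative only):

    void *p;

    p = malloc(len, M_GZIP, M_WAITOK);  /* kernel malloc(9), not libc */
    free(p, M_GZIP);
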
Index: vfs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_vnops.c -L sys/kern/vfs_vnops.c -u -r1.2 -r1.3
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_vnops.c,v 1.233.2.1 2006/03/13 03:06:44 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_vnops.c,v 1.252 2007/07/26 16:58:09 pjd Exp $");
#include "opt_mac.h"
@@ -45,10 +45,10 @@
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
@@ -62,6 +62,8 @@
#include <sys/syslog.h>
#include <sys/unistd.h>
+#include <security/mac/mac_framework.h>
+
static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_ioctl_t vn_ioctl;
@@ -82,13 +84,14 @@
};
int
-vn_open(ndp, flagp, cmode, fdidx)
+vn_open(ndp, flagp, cmode, fp)
struct nameidata *ndp;
- int *flagp, cmode, fdidx;
+ int *flagp, cmode;
+ struct file *fp;
{
struct thread *td = ndp->ni_cnd.cn_thread;
- return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
+ return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp));
}
/*
@@ -99,11 +102,11 @@
* due to the NDINIT being done elsewhere.
*/
int
-vn_open_cred(ndp, flagp, cmode, cred, fdidx)
+vn_open_cred(ndp, flagp, cmode, cred, fp)
struct nameidata *ndp;
int *flagp, cmode;
struct ucred *cred;
- int fdidx;
+ struct file *fp;
{
struct vnode *vp;
struct mount *mp;
@@ -111,21 +114,24 @@
struct vattr vat;
struct vattr *vap = &vat;
int mode, fmode, error;
- int vfslocked;
+ int vfslocked, mpsafe;
+ mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
vfslocked = 0;
fmode = *flagp;
if (fmode & O_CREAT) {
ndp->ni_cnd.cn_nameiop = CREATE;
- ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE;
+ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
+ MPSAFE | AUDITVNODE1;
if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
ndp->ni_cnd.cn_flags |= FOLLOW;
bwillwrite();
if ((error = namei(ndp)) != 0)
return (error);
- vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
- ndp->ni_cnd.cn_flags &= ~MPSAFE;
+ vfslocked = NDHASGIANT(ndp);
+ if (!mpsafe)
+ ndp->ni_cnd.cn_flags &= ~MPSAFE;
if (ndp->ni_vp == NULL) {
VATTR_NULL(vap);
vap->va_type = VREG;
@@ -178,11 +184,12 @@
ndp->ni_cnd.cn_nameiop = LOOKUP;
ndp->ni_cnd.cn_flags = ISOPEN |
((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
- LOCKSHARED | LOCKLEAF | MPSAFE;
+ LOCKLEAF | MPSAFE | AUDITVNODE1;
if ((error = namei(ndp)) != 0)
return (error);
- ndp->ni_cnd.cn_flags &= ~MPSAFE;
- vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
+ if (!mpsafe)
+ ndp->ni_cnd.cn_flags &= ~MPSAFE;
+ vfslocked = NDHASGIANT(ndp);
vp = ndp->ni_vp;
}
if (vp->v_type == VLNK) {
@@ -222,14 +229,14 @@
goto bad;
}
}
- if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
+ if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
goto bad;
if (fmode & FWRITE)
vp->v_writecount++;
*flagp = fmode;
- ASSERT_VOP_LOCKED(vp, "vn_open_cred");
- if (fdidx == -1)
+ ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
+ if (!mpsafe)
VFS_UNLOCK_GIANT(vfslocked);
return (0);
bad:
@@ -279,8 +286,11 @@
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- if (flags & FWRITE)
+ if (flags & FWRITE) {
+ VNASSERT(vp->v_writecount > 0, vp,
+ ("vn_close: negative writecount"));
vp->v_writecount--;
+ }
error = VOP_CLOSE(vp, flags, file_cred, td);
vput(vp);
vn_finished_write(mp);
@@ -327,7 +337,7 @@
aresid, td)
enum uio_rw rw;
struct vnode *vp;
- caddr_t base;
+ void *base;
int len;
off_t offset;
enum uio_seg segflg;
@@ -400,7 +410,7 @@
if (auio.uio_resid && error == 0)
error = EIO;
if ((ioflg & IO_NODELOCKED) == 0) {
- if (rw == UIO_WRITE)
+ if (rw == UIO_WRITE && vp->v_type != VCHR)
vn_finished_write(mp);
VOP_UNLOCK(vp, 0, td);
}
@@ -420,7 +430,7 @@
file_cred, aresid, td)
enum uio_rw rw;
struct vnode *vp;
- caddr_t base;
+ void *base;
size_t len;
off_t offset;
enum uio_seg segflg;
@@ -457,7 +467,7 @@
if (error)
break;
offset += chunk;
- base += chunk;
+ base = (char *)base + chunk;
uio_yield();
} while (len);
if (aresid)
@@ -491,11 +501,18 @@
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
/*
- * According to McKusick the vn lock is protecting f_offset here.
- * Once this field has it's own lock we can acquire this shared.
+ * According to McKusick the vn lock was protecting f_offset here.
+ * It is now protected by the FOFFSET_LOCKED flag.
*/
if ((flags & FOF_OFFSET) == 0) {
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ FILE_LOCK(fp);
+ while(fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ FILE_UNLOCK(fp);
+ vn_lock(vp, LK_SHARED | LK_RETRY, td);
uio->uio_offset = fp->f_offset;
} else
vn_lock(vp, LK_SHARED | LK_RETRY, td);
@@ -507,8 +524,14 @@
if (error == 0)
#endif
error = VOP_READ(vp, uio, ioflag, fp->f_cred);
- if ((flags & FOF_OFFSET) == 0)
+ if ((flags & FOF_OFFSET) == 0) {
fp->f_offset = uio->uio_offset;
+ FILE_LOCK(fp);
+ if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+ wakeup(&fp->f_vnread_flags);
+ fp->f_vnread_flags = 0;
+ FILE_UNLOCK(fp);
+ }
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0, td);
VFS_UNLOCK_GIANT(vfslocked);
@@ -565,7 +588,8 @@
fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
+ if (vp->v_type != VCHR)
+ vn_finished_write(mp);
unlock:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
@@ -690,17 +714,12 @@
sb->st_blksize = PAGE_SIZE;
sb->st_flags = vap->va_flags;
- if (suser(td))
+ if (priv_check(td, PRIV_VFS_GENERATION))
sb->st_gen = 0;
else
sb->st_gen = vap->va_gen;
-#if (S_BLKSIZE == 512)
- /* Optimize this case */
- sb->st_blocks = vap->va_bytes >> 9;
-#else
sb->st_blocks = vap->va_bytes / S_BLKSIZE;
-#endif
return (0);
}
@@ -757,11 +776,11 @@
struct thread *td;
{
struct vnode *vp;
+ int vfslocked;
int error;
- mtx_lock(&Giant);
-
vp = fp->f_vnode;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
@@ -770,7 +789,7 @@
#endif
error = VOP_POLL(vp, events, fp->f_cred, td);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -779,10 +798,7 @@
* acquire requested lock.
*/
int
-vn_lock(vp, flags, td)
- struct vnode *vp;
- int flags;
- struct thread *td;
+_vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line)
{
int error;
@@ -805,7 +821,7 @@
* lockmgr drops interlock before it will return for
* any reason. So force the code above to relock it.
*/
- error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
+ error = VOP_LOCK1(vp, flags | LK_INTERLOCK, td, file, line);
flags &= ~LK_INTERLOCK;
KASSERT((flags & LK_RETRY) == 0 || error == 0,
("LK_RETRY set with incompatible flags %d\n", flags));
@@ -844,7 +860,7 @@
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
- (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
}
fp->f_ops = &badfileops;
@@ -885,6 +901,8 @@
if ((mp = *mpp) == NULL)
return (0);
MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
/*
* Check on status of suspension.
*/
@@ -902,6 +920,7 @@
goto unlock;
mp->mnt_writeopcount++;
unlock:
+ MNT_REL(mp);
MNT_IUNLOCK(mp);
return (error);
}
@@ -935,19 +954,25 @@
if (mp == NULL)
return (0);
MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
+ MNT_REL(mp);
MNT_IUNLOCK(mp);
return (0);
}
if (flags & V_NOWAIT) {
+ MNT_REL(mp);
MNT_IUNLOCK(mp);
return (EWOULDBLOCK);
}
/*
* Wait for the suspension to finish.
*/
- return (msleep(&mp->mnt_flag, MNT_MTX(mp),
- (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0));
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp),
+ (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
+ vfs_rel(mp);
+ return (error);
}
/*
@@ -982,13 +1007,17 @@
if ((mp = *mpp) == NULL)
return (0);
MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
mp->mnt_secondary_writes++;
mp->mnt_secondary_accwrites++;
+ MNT_REL(mp);
MNT_IUNLOCK(mp);
return (0);
}
if (flags & V_NOWAIT) {
+ MNT_REL(mp);
MNT_IUNLOCK(mp);
return (EWOULDBLOCK);
}
@@ -997,6 +1026,7 @@
*/
error = msleep(&mp->mnt_flag, MNT_MTX(mp),
(PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
+ vfs_rel(mp);
if (error == 0)
goto retry;
return (error);
@@ -1057,23 +1087,19 @@
struct thread *td = curthread;
int error;
- error = 0;
MNT_ILOCK(mp);
- if (mp->mnt_kern_flag & MNTK_SUSPEND)
- goto unlock;
+ if (mp->mnt_kern_flag & MNTK_SUSPEND) {
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
mp->mnt_kern_flag |= MNTK_SUSPEND;
if (mp->mnt_writeopcount > 0)
(void) msleep(&mp->mnt_writeopcount,
MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
else
MNT_IUNLOCK(mp);
- if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0) {
+ if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
vfs_write_resume(mp);
- return (error);
- }
- MNT_ILOCK(mp);
-unlock:
- MNT_IUNLOCK(mp);
return (error);
}
@@ -1101,11 +1127,12 @@
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
+ int vfslocked;
int error;
- mtx_lock(&Giant);
+ vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = VOP_KQFILTER(fp->f_vnode, kn);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
return error;
}
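
vn_open()/vn_open_cred() now take a struct file pointer (or NULL when no descriptor backs the open) instead of an fd index. A rough sketch of a descriptor-less in-kernel open under the new signature; the path is a placeholder and Giant/MPSAFE handling is elided for brevity:

    struct nameidata nd;
    int error, flags = FREAD;

    NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/etc/example", td);
    error = vn_open(&nd, &flags, 0, NULL);  /* NULL: no struct file for this open */
    if (error != 0)
            return (error);
    NDFREE(&nd, NDF_ONLY_PNBUF);
    /* nd.ni_vp comes back locked; use it, then unlock and close. */
    VOP_UNLOCK(nd.ni_vp, 0, td);
    error = vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
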
--- sys/kern/uipc_socket2.c
+++ /dev/null
@@ -1,1458 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1988, 1990, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_socket2.c,v 1.147.2.2 2005/11/26 19:30:40 jdp Exp $");
-
-#include "opt_mac.h"
-#include "opt_param.h"
-
-#include <sys/param.h>
-#include <sys/aio.h> /* for aio_swake proto */
-#include <sys/domain.h>
-#include <sys/event.h>
-#include <sys/file.h> /* for maxfiles */
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/mac.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/protosw.h>
-#include <sys/resourcevar.h>
-#include <sys/signalvar.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/stat.h>
-#include <sys/sysctl.h>
-#include <sys/systm.h>
-
-int maxsockets;
-
-void (*aio_swake)(struct socket *, struct sockbuf *);
-
-/*
- * Primitive routines for operating on sockets and socket buffers
- */
-
-u_long sb_max = SB_MAX;
-static u_long sb_max_adj =
- SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
-
-static u_long sb_efficiency = 8; /* parameter for sbreserve() */
-
-/*
- * Procedures to manipulate state flags of socket
- * and do appropriate wakeups. Normal sequence from the
- * active (originating) side is that soisconnecting() is
- * called during processing of connect() call,
- * resulting in an eventual call to soisconnected() if/when the
- * connection is established. When the connection is torn down
- * soisdisconnecting() is called during processing of disconnect() call,
- * and soisdisconnected() is called when the connection to the peer
- * is totally severed. The semantics of these routines are such that
- * connectionless protocols can call soisconnected() and soisdisconnected()
- * only, bypassing the in-progress calls when setting up a ``connection''
- * takes no time.
- *
- * From the passive side, a socket is created with
- * two queues of sockets: so_incomp for connections in progress
- * and so_comp for connections already made and awaiting user acceptance.
- * As a protocol is preparing incoming connections, it creates a socket
- * structure queued on so_incomp by calling sonewconn(). When the connection
- * is established, soisconnected() is called, and transfers the
- * socket structure to so_comp, making it available to accept().
- *
- * If a socket is closed with sockets on either
- * so_incomp or so_comp, these sockets are dropped.
- *
- * If higher level protocols are implemented in
- * the kernel, the wakeups done here will sometimes
- * cause software-interrupt process scheduling.
- */
-
-void
-soisconnecting(so)
- register struct socket *so;
-{
-
- SOCK_LOCK(so);
- so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
- so->so_state |= SS_ISCONNECTING;
- SOCK_UNLOCK(so);
-}
-
-void
-soisconnected(so)
- struct socket *so;
-{
- struct socket *head;
-
- ACCEPT_LOCK();
- SOCK_LOCK(so);
- so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
- so->so_state |= SS_ISCONNECTED;
- head = so->so_head;
- if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
- if ((so->so_options & SO_ACCEPTFILTER) == 0) {
- SOCK_UNLOCK(so);
- TAILQ_REMOVE(&head->so_incomp, so, so_list);
- head->so_incqlen--;
- so->so_qstate &= ~SQ_INCOMP;
- TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
- head->so_qlen++;
- so->so_qstate |= SQ_COMP;
- ACCEPT_UNLOCK();
- sorwakeup(head);
- wakeup_one(&head->so_timeo);
- } else {
- ACCEPT_UNLOCK();
- so->so_upcall =
- head->so_accf->so_accept_filter->accf_callback;
- so->so_upcallarg = head->so_accf->so_accept_filter_arg;
- so->so_rcv.sb_flags |= SB_UPCALL;
- so->so_options &= ~SO_ACCEPTFILTER;
- SOCK_UNLOCK(so);
- so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
- }
- return;
- }
- SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
- wakeup(&so->so_timeo);
- sorwakeup(so);
- sowwakeup(so);
-}
-
-void
-soisdisconnecting(so)
- register struct socket *so;
-{
-
- /*
- * XXXRW: This code assumes that SOCK_LOCK(so) and
- * SOCKBUF_LOCK(&so->so_rcv) are the same.
- */
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_state &= ~SS_ISCONNECTING;
- so->so_state |= SS_ISDISCONNECTING;
- so->so_rcv.sb_state |= SBS_CANTRCVMORE;
- sorwakeup_locked(so);
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_state |= SBS_CANTSENDMORE;
- sowwakeup_locked(so);
- wakeup(&so->so_timeo);
-}
-
-void
-soisdisconnected(so)
- register struct socket *so;
-{
-
- /*
- * XXXRW: This code assumes that SOCK_LOCK(so) and
- * SOCKBUF_LOCK(&so->so_rcv) are the same.
- */
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
- so->so_state |= SS_ISDISCONNECTED;
- so->so_rcv.sb_state |= SBS_CANTRCVMORE;
- sorwakeup_locked(so);
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_state |= SBS_CANTSENDMORE;
- sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
- sowwakeup_locked(so);
- wakeup(&so->so_timeo);
-}
-
-/*
- * When an attempt at a new connection is noted on a socket
- * which accepts connections, sonewconn is called. If the
- * connection is possible (subject to space constraints, etc.)
- * then we allocate a new structure, properly linked into the
- * data structure of the original socket, and return this.
- * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
- *
- * note: the ref count on the socket is 0 on return
- */
-struct socket *
-sonewconn(head, connstatus)
- register struct socket *head;
- int connstatus;
-{
- register struct socket *so;
- int over;
-
- ACCEPT_LOCK();
- over = (head->so_qlen > 3 * head->so_qlimit / 2);
- ACCEPT_UNLOCK();
- if (over)
- return (NULL);
- so = soalloc(M_NOWAIT);
- if (so == NULL)
- return (NULL);
- if ((head->so_options & SO_ACCEPTFILTER) != 0)
- connstatus = 0;
- so->so_head = head;
- so->so_type = head->so_type;
- so->so_options = head->so_options &~ SO_ACCEPTCONN;
- so->so_linger = head->so_linger;
- so->so_state = head->so_state | SS_NOFDREF;
- so->so_proto = head->so_proto;
- so->so_cred = crhold(head->so_cred);
-
-#ifdef MAC
- SOCK_LOCK(head);
- mac_create_socket_from_socket(head, so);
- SOCK_UNLOCK(head);
-#endif
- knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
- NULL, NULL, NULL);
- knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
- NULL, NULL, NULL);
- if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
- (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
- sodealloc(so);
- return (NULL);
- }
- so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
- so->so_snd.sb_lowat = head->so_snd.sb_lowat;
- so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
- so->so_snd.sb_timeo = head->so_snd.sb_timeo;
- so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
- so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
- so->so_state |= connstatus;
- ACCEPT_LOCK();
- if (connstatus) {
- TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
- so->so_qstate |= SQ_COMP;
- head->so_qlen++;
- } else {
- /*
- * Keep removing sockets from the head until there's room for
- * us to insert on the tail. In pre-locking revisions, this
- * was a simple if(), but as we could be racing with other
- * threads and soabort() requires dropping locks, we must
- * loop waiting for the condition to be true.
- */
- while (head->so_incqlen > head->so_qlimit) {
- struct socket *sp;
- sp = TAILQ_FIRST(&head->so_incomp);
- TAILQ_REMOVE(&head->so_incomp, sp, so_list);
- head->so_incqlen--;
- sp->so_qstate &= ~SQ_INCOMP;
- sp->so_head = NULL;
- ACCEPT_UNLOCK();
- (void) soabort(sp);
- ACCEPT_LOCK();
- }
- TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
- so->so_qstate |= SQ_INCOMP;
- head->so_incqlen++;
- }
- ACCEPT_UNLOCK();
- if (connstatus) {
- sorwakeup(head);
- wakeup_one(&head->so_timeo);
- }
- return (so);
-}
-
-/*
- * Socantsendmore indicates that no more data will be sent on the
- * socket; it would normally be applied to a socket when the user
- * informs the system that no more data is to be sent, by the protocol
- * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
- * will be received, and will normally be applied to the socket by a
- * protocol when it detects that the peer will send no more data.
- * Data queued for reading in the socket may yet be read.
- */
-void
-socantsendmore_locked(so)
- struct socket *so;
-{
-
- SOCKBUF_LOCK_ASSERT(&so->so_snd);
-
- so->so_snd.sb_state |= SBS_CANTSENDMORE;
- sowwakeup_locked(so);
- mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
-}
-
-void
-socantsendmore(so)
- struct socket *so;
-{
-
- SOCKBUF_LOCK(&so->so_snd);
- socantsendmore_locked(so);
- mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
-}
-
-void
-socantrcvmore_locked(so)
- struct socket *so;
-{
-
- SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-
- so->so_rcv.sb_state |= SBS_CANTRCVMORE;
- sorwakeup_locked(so);
- mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
-}
-
-void
-socantrcvmore(so)
- struct socket *so;
-{
-
- SOCKBUF_LOCK(&so->so_rcv);
- socantrcvmore_locked(so);
- mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
-}
-
-/*
- * Wait for data to arrive at/drain from a socket buffer.
- */
-int
-sbwait(sb)
- struct sockbuf *sb;
-{
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- sb->sb_flags |= SB_WAIT;
- return (msleep(&sb->sb_cc, &sb->sb_mtx,
- (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
- sb->sb_timeo));
-}
-
-/*
- * Lock a sockbuf already known to be locked;
- * return any error returned from sleep (EINTR).
- */
-int
-sb_lock(sb)
- register struct sockbuf *sb;
-{
- int error;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- while (sb->sb_flags & SB_LOCK) {
- sb->sb_flags |= SB_WANT;
- error = msleep(&sb->sb_flags, &sb->sb_mtx,
- (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
- "sblock", 0);
- if (error)
- return (error);
- }
- sb->sb_flags |= SB_LOCK;
- return (0);
-}
-
-/*
- * Wakeup processes waiting on a socket buffer. Do asynchronous
- * notification via SIGIO if the socket has the SS_ASYNC flag set.
- *
- * Called with the socket buffer lock held; will release the lock by the end
- * of the function. This allows the caller to acquire the socket buffer lock
- * while testing for the need for various sorts of wakeup and hold it through
- * to the point where it's no longer required. We currently hold the lock
- * through calls out to other subsystems (with the exception of kqueue), and
- * then release it to avoid lock order issues. It's not clear that's
- * correct.
- */
-void
-sowakeup(so, sb)
- register struct socket *so;
- register struct sockbuf *sb;
-{
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- selwakeuppri(&sb->sb_sel, PSOCK);
- sb->sb_flags &= ~SB_SEL;
- if (sb->sb_flags & SB_WAIT) {
- sb->sb_flags &= ~SB_WAIT;
- wakeup(&sb->sb_cc);
- }
- KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
- SOCKBUF_UNLOCK(sb);
- if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
- pgsigio(&so->so_sigio, SIGIO, 0);
- if (sb->sb_flags & SB_UPCALL)
- (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
- if (sb->sb_flags & SB_AIO)
- aio_swake(so, sb);
- mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
-}
-
-/*
- * Socket buffer (struct sockbuf) utility routines.
- *
- * Each socket contains two socket buffers: one for sending data and
- * one for receiving data. Each buffer contains a queue of mbufs,
- * information about the number of mbufs and amount of data in the
- * queue, and other fields allowing select() statements and notification
- * on data availability to be implemented.
- *
- * Data stored in a socket buffer is maintained as a list of records.
- * Each record is a list of mbufs chained together with the m_next
- * field. Records are chained together with the m_nextpkt field. The upper
- * level routine soreceive() expects the following conventions to be
- * observed when placing information in the receive buffer:
- *
- * 1. If the protocol requires each message be preceded by the sender's
- * name, then a record containing that name must be present before
- * any associated data (mbuf's must be of type MT_SONAME).
- * 2. If the protocol supports the exchange of ``access rights'' (really
- * just additional data associated with the message), and there are
- * ``rights'' to be received, then a record containing this data
- * should be present (mbuf's must be of type MT_RIGHTS).
- * 3. If a name or rights record exists, then it must be followed by
- * a data record, perhaps of zero length.
- *
- * Before using a new socket structure it is first necessary to reserve
- * buffer space to the socket, by calling sbreserve(). This should commit
- * some of the available buffer space in the system buffer pool for the
- * socket (currently, it does nothing but enforce limits). The space
- * should be released by calling sbrelease() when the socket is destroyed.
- */
-
-int
-soreserve(so, sndcc, rcvcc)
- register struct socket *so;
- u_long sndcc, rcvcc;
-{
- struct thread *td = curthread;
-
- SOCKBUF_LOCK(&so->so_snd);
- SOCKBUF_LOCK(&so->so_rcv);
- if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
- goto bad;
- if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
- goto bad2;
- if (so->so_rcv.sb_lowat == 0)
- so->so_rcv.sb_lowat = 1;
- if (so->so_snd.sb_lowat == 0)
- so->so_snd.sb_lowat = MCLBYTES;
- if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
- so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_UNLOCK(&so->so_snd);
- return (0);
-bad2:
- sbrelease_locked(&so->so_snd, so);
-bad:
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_UNLOCK(&so->so_snd);
- return (ENOBUFS);
-}
-
-static int
-sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
-{
- int error = 0;
- u_long old_sb_max = sb_max;
-
- error = SYSCTL_OUT(req, arg1, sizeof(u_long));
- if (error || !req->newptr)
- return (error);
- error = SYSCTL_IN(req, arg1, sizeof(u_long));
- if (error)
- return (error);
- if (sb_max < MSIZE + MCLBYTES) {
- sb_max = old_sb_max;
- return (EINVAL);
- }
- sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
- return (0);
-}
-
-/*
- * Allot mbufs to a sockbuf.
- * Attempt to scale mbmax so that mbcnt doesn't become limiting
- * if buffering efficiency is near the normal case.
- */
-int
-sbreserve_locked(sb, cc, so, td)
- struct sockbuf *sb;
- u_long cc;
- struct socket *so;
- struct thread *td;
-{
- rlim_t sbsize_limit;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- /*
- * td will only be NULL when we're in an interrupt
- * (e.g. in tcp_input())
- */
- if (cc > sb_max_adj)
- return (0);
- if (td != NULL) {
- PROC_LOCK(td->td_proc);
- sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
- PROC_UNLOCK(td->td_proc);
- } else
- sbsize_limit = RLIM_INFINITY;
- if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
- sbsize_limit))
- return (0);
- sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
- if (sb->sb_lowat > sb->sb_hiwat)
- sb->sb_lowat = sb->sb_hiwat;
- return (1);
-}
-
-int
-sbreserve(sb, cc, so, td)
- struct sockbuf *sb;
- u_long cc;
- struct socket *so;
- struct thread *td;
-{
- int error;
-
- SOCKBUF_LOCK(sb);
- error = sbreserve_locked(sb, cc, so, td);
- SOCKBUF_UNLOCK(sb);
- return (error);
-}
-
-/*
- * Free mbufs held by a socket, and reserved mbuf space.
- */
-void
-sbrelease_locked(sb, so)
- struct sockbuf *sb;
- struct socket *so;
-{
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- sbflush_locked(sb);
- (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
- RLIM_INFINITY);
- sb->sb_mbmax = 0;
-}
-
-void
-sbrelease(sb, so)
- struct sockbuf *sb;
- struct socket *so;
-{
-
- SOCKBUF_LOCK(sb);
- sbrelease_locked(sb, so);
- SOCKBUF_UNLOCK(sb);
-}
-/*
- * Routines to add and remove
- * data from an mbuf queue.
- *
- * The routines sbappend() or sbappendrecord() are normally called to
- * append new mbufs to a socket buffer, after checking that adequate
- * space is available, comparing the function sbspace() with the amount
- * of data to be added. sbappendrecord() differs from sbappend() in
- * that data supplied is treated as the beginning of a new record.
- * To place a sender's address, optional access rights, and data in a
- * socket receive buffer, sbappendaddr() should be used. To place
- * access rights and data in a socket receive buffer, sbappendrights()
- * should be used. In either case, the new data begins a new record.
- * Note that unlike sbappend() and sbappendrecord(), these routines check
- * for the caller that there will be enough space to store the data.
- * Each fails if there is not enough space, or if it cannot find mbufs
- * to store additional information in.
- *
- * Reliable protocols may use the socket send buffer to hold data
- * awaiting acknowledgement. Data is normally copied from a socket
- * send buffer in a protocol with m_copy for output to a peer,
- * and then removing the data from the socket buffer with sbdrop()
- * or sbdroprecord() when the data is acknowledged by the peer.
- */
-
-#ifdef SOCKBUF_DEBUG
-void
-sblastrecordchk(struct sockbuf *sb, const char *file, int line)
-{
- struct mbuf *m = sb->sb_mb;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- while (m && m->m_nextpkt)
- m = m->m_nextpkt;
-
- if (m != sb->sb_lastrecord) {
- printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
- __func__, sb->sb_mb, sb->sb_lastrecord, m);
- printf("packet chain:\n");
- for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
- printf("\t%p\n", m);
- panic("%s from %s:%u", __func__, file, line);
- }
-}
-
-void
-sblastmbufchk(struct sockbuf *sb, const char *file, int line)
-{
- struct mbuf *m = sb->sb_mb;
- struct mbuf *n;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- while (m && m->m_nextpkt)
- m = m->m_nextpkt;
-
- while (m && m->m_next)
- m = m->m_next;
-
- if (m != sb->sb_mbtail) {
- printf("%s: sb_mb %p sb_mbtail %p last %p\n",
- __func__, sb->sb_mb, sb->sb_mbtail, m);
- printf("packet tree:\n");
- for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
- printf("\t");
- for (n = m; n != NULL; n = n->m_next)
- printf("%p ", n);
- printf("\n");
- }
- panic("%s from %s:%u", __func__, file, line);
- }
-}
-#endif /* SOCKBUF_DEBUG */
-
-#define SBLINKRECORD(sb, m0) do { \
- SOCKBUF_LOCK_ASSERT(sb); \
- if ((sb)->sb_lastrecord != NULL) \
- (sb)->sb_lastrecord->m_nextpkt = (m0); \
- else \
- (sb)->sb_mb = (m0); \
- (sb)->sb_lastrecord = (m0); \
-} while (/*CONSTCOND*/0)
-
-/*
- * Append mbuf chain m to the last record in the
- * socket buffer sb. The additional space associated
- * the mbuf chain is recorded in sb. Empty mbufs are
- * discarded and mbufs are compacted where possible.
- */
-void
-sbappend_locked(sb, m)
- struct sockbuf *sb;
- struct mbuf *m;
-{
- register struct mbuf *n;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- if (m == 0)
- return;
-
- SBLASTRECORDCHK(sb);
- n = sb->sb_mb;
- if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
- do {
- if (n->m_flags & M_EOR) {
- sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
- return;
- }
- } while (n->m_next && (n = n->m_next));
- } else {
- /*
- * XXX Would like to simply use sb_mbtail here, but
- * XXX I need to verify that I won't miss an EOR that
- * XXX way.
- */
- if ((n = sb->sb_lastrecord) != NULL) {
- do {
- if (n->m_flags & M_EOR) {
- sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
- return;
- }
- } while (n->m_next && (n = n->m_next));
- } else {
- /*
- * If this is the first record in the socket buffer,
- * it's also the last record.
- */
- sb->sb_lastrecord = m;
- }
- }
- sbcompress(sb, m, n);
- SBLASTRECORDCHK(sb);
-}
-
-/*
- * Append mbuf chain m to the last record in the
- * socket buffer sb. The additional space associated
- * the mbuf chain is recorded in sb. Empty mbufs are
- * discarded and mbufs are compacted where possible.
- */
-void
-sbappend(sb, m)
- struct sockbuf *sb;
- struct mbuf *m;
-{
-
- SOCKBUF_LOCK(sb);
- sbappend_locked(sb, m);
- SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * This version of sbappend() should only be used when the caller
- * absolutely knows that there will never be more than one record
- * in the socket buffer, that is, a stream protocol (such as TCP).
- */
-void
-sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
-{
- SOCKBUF_LOCK_ASSERT(sb);
-
- KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
- KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
-
- SBLASTMBUFCHK(sb);
-
- sbcompress(sb, m, sb->sb_mbtail);
-
- sb->sb_lastrecord = sb->sb_mb;
- SBLASTRECORDCHK(sb);
-}
-
-/*
- * This version of sbappend() should only be used when the caller
- * absolutely knows that there will never be more than one record
- * in the socket buffer, that is, a stream protocol (such as TCP).
- */
-void
-sbappendstream(struct sockbuf *sb, struct mbuf *m)
-{
-
- SOCKBUF_LOCK(sb);
- sbappendstream_locked(sb, m);
- SOCKBUF_UNLOCK(sb);
-}
-
-#ifdef SOCKBUF_DEBUG
-void
-sbcheck(sb)
- struct sockbuf *sb;
-{
- struct mbuf *m;
- struct mbuf *n = 0;
- u_long len = 0, mbcnt = 0;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- for (m = sb->sb_mb; m; m = n) {
- n = m->m_nextpkt;
- for (; m; m = m->m_next) {
- len += m->m_len;
- mbcnt += MSIZE;
- if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
- mbcnt += m->m_ext.ext_size;
- }
- }
- if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
- printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
- mbcnt, sb->sb_mbcnt);
- panic("sbcheck");
- }
-}
-#endif
-
-/*
- * As above, except the mbuf chain
- * begins a new record.
- */
-void
-sbappendrecord_locked(sb, m0)
- register struct sockbuf *sb;
- register struct mbuf *m0;
-{
- register struct mbuf *m;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- if (m0 == 0)
- return;
- m = sb->sb_mb;
- if (m)
- while (m->m_nextpkt)
- m = m->m_nextpkt;
- /*
- * Put the first mbuf on the queue.
- * Note this permits zero length records.
- */
- sballoc(sb, m0);
- SBLASTRECORDCHK(sb);
- SBLINKRECORD(sb, m0);
- if (m)
- m->m_nextpkt = m0;
- else
- sb->sb_mb = m0;
- m = m0->m_next;
- m0->m_next = 0;
- if (m && (m0->m_flags & M_EOR)) {
- m0->m_flags &= ~M_EOR;
- m->m_flags |= M_EOR;
- }
- sbcompress(sb, m, m0);
-}
-
-/*
- * As above, except the mbuf chain
- * begins a new record.
- */
-void
-sbappendrecord(sb, m0)
- register struct sockbuf *sb;
- register struct mbuf *m0;
-{
-
- SOCKBUF_LOCK(sb);
- sbappendrecord_locked(sb, m0);
- SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Append address and data, and optionally, control (ancillary) data
- * to the receive queue of a socket. If present,
- * m0 must include a packet header with total length.
- * Returns 0 if no space in sockbuf or insufficient mbufs.
- */
-int
-sbappendaddr_locked(sb, asa, m0, control)
- struct sockbuf *sb;
- const struct sockaddr *asa;
- struct mbuf *m0, *control;
-{
- struct mbuf *m, *n, *nlast;
- int space = asa->sa_len;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- if (m0 && (m0->m_flags & M_PKTHDR) == 0)
- panic("sbappendaddr_locked");
- if (m0)
- space += m0->m_pkthdr.len;
- space += m_length(control, &n);
-
- if (space > sbspace(sb))
- return (0);
-#if MSIZE <= 256
- if (asa->sa_len > MLEN)
- return (0);
-#endif
- MGET(m, M_DONTWAIT, MT_SONAME);
- if (m == 0)
- return (0);
- m->m_len = asa->sa_len;
- bcopy(asa, mtod(m, caddr_t), asa->sa_len);
- if (n)
- n->m_next = m0; /* concatenate data to control */
- else
- control = m0;
- m->m_next = control;
- for (n = m; n->m_next != NULL; n = n->m_next)
- sballoc(sb, n);
- sballoc(sb, n);
- nlast = n;
- SBLINKRECORD(sb, m);
-
- sb->sb_mbtail = nlast;
- SBLASTMBUFCHK(sb);
-
- SBLASTRECORDCHK(sb);
- return (1);
-}
-
-/*
- * Append address and data, and optionally, control (ancillary) data
- * to the receive queue of a socket. If present,
- * m0 must include a packet header with total length.
- * Returns 0 if no space in sockbuf or insufficient mbufs.
- */
-int
-sbappendaddr(sb, asa, m0, control)
- struct sockbuf *sb;
- const struct sockaddr *asa;
- struct mbuf *m0, *control;
-{
- int retval;
-
- SOCKBUF_LOCK(sb);
- retval = sbappendaddr_locked(sb, asa, m0, control);
- SOCKBUF_UNLOCK(sb);
- return (retval);
-}
-
-int
-sbappendcontrol_locked(sb, m0, control)
- struct sockbuf *sb;
- struct mbuf *control, *m0;
-{
- struct mbuf *m, *n, *mlast;
- int space;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- if (control == 0)
- panic("sbappendcontrol_locked");
- space = m_length(control, &n) + m_length(m0, NULL);
-
- if (space > sbspace(sb))
- return (0);
- n->m_next = m0; /* concatenate data to control */
-
- SBLASTRECORDCHK(sb);
-
- for (m = control; m->m_next; m = m->m_next)
- sballoc(sb, m);
- sballoc(sb, m);
- mlast = m;
- SBLINKRECORD(sb, control);
-
- sb->sb_mbtail = mlast;
- SBLASTMBUFCHK(sb);
-
- SBLASTRECORDCHK(sb);
- return (1);
-}
-
-int
-sbappendcontrol(sb, m0, control)
- struct sockbuf *sb;
- struct mbuf *control, *m0;
-{
- int retval;
-
- SOCKBUF_LOCK(sb);
- retval = sbappendcontrol_locked(sb, m0, control);
- SOCKBUF_UNLOCK(sb);
- return (retval);
-}
-
-/*
- * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
- * (n). If (n) is NULL, the buffer is presumed empty.
- *
- * When the data is compressed, mbufs in the chain may be handled in one of
- * three ways:
- *
- * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
- * record boundary, and no change in data type).
- *
- * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
- * an mbuf already in the socket buffer. This can occur if an
- * appropriate mbuf exists, there is room, and no merging of data types
- * will occur.
- *
- * (3) The mbuf may be appended to the end of the existing mbuf chain.
- *
- * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
- * end-of-record.
- */
-void
-sbcompress(sb, m, n)
- register struct sockbuf *sb;
- register struct mbuf *m, *n;
-{
- register int eor = 0;
- register struct mbuf *o;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- while (m) {
- eor |= m->m_flags & M_EOR;
- if (m->m_len == 0 &&
- (eor == 0 ||
- (((o = m->m_next) || (o = n)) &&
- o->m_type == m->m_type))) {
- if (sb->sb_lastrecord == m)
- sb->sb_lastrecord = m->m_next;
- m = m_free(m);
- continue;
- }
- if (n && (n->m_flags & M_EOR) == 0 &&
- M_WRITABLE(n) &&
- m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
- m->m_len <= M_TRAILINGSPACE(n) &&
- n->m_type == m->m_type) {
- bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
- (unsigned)m->m_len);
- n->m_len += m->m_len;
- sb->sb_cc += m->m_len;
- if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
- m->m_type != MT_OOBDATA)
- /* XXX: Probably don't need.*/
- sb->sb_ctl += m->m_len;
- m = m_free(m);
- continue;
- }
- if (n)
- n->m_next = m;
- else
- sb->sb_mb = m;
- sb->sb_mbtail = m;
- sballoc(sb, m);
- n = m;
- m->m_flags &= ~M_EOR;
- m = m->m_next;
- n->m_next = 0;
- }
- if (eor) {
- KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
- n->m_flags |= eor;
- }
- SBLASTMBUFCHK(sb);
-}
-
-/*
- * Free all mbufs in a sockbuf.
- * Check that all resources are reclaimed.
- */
-void
-sbflush_locked(sb)
- register struct sockbuf *sb;
-{
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- if (sb->sb_flags & SB_LOCK)
- panic("sbflush_locked: locked");
- while (sb->sb_mbcnt) {
- /*
- * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
- * we would loop forever. Panic instead.
- */
- if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
- break;
- sbdrop_locked(sb, (int)sb->sb_cc);
- }
- if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
- panic("sbflush_locked: cc %u || mb %p || mbcnt %u", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
-}
-
-void
-sbflush(sb)
- register struct sockbuf *sb;
-{
-
- SOCKBUF_LOCK(sb);
- sbflush_locked(sb);
- SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Drop data from (the front of) a sockbuf.
- */
-void
-sbdrop_locked(sb, len)
- register struct sockbuf *sb;
- register int len;
-{
- register struct mbuf *m;
- struct mbuf *next;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
- while (len > 0) {
- if (m == 0) {
- if (next == 0)
- panic("sbdrop");
- m = next;
- next = m->m_nextpkt;
- continue;
- }
- if (m->m_len > len) {
- m->m_len -= len;
- m->m_data += len;
- sb->sb_cc -= len;
- if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
- m->m_type != MT_OOBDATA)
- sb->sb_ctl -= len;
- break;
- }
- len -= m->m_len;
- sbfree(sb, m);
- m = m_free(m);
- }
- while (m && m->m_len == 0) {
- sbfree(sb, m);
- m = m_free(m);
- }
- if (m) {
- sb->sb_mb = m;
- m->m_nextpkt = next;
- } else
- sb->sb_mb = next;
- /*
- * First part is an inline SB_EMPTY_FIXUP(). Second part
- * makes sure sb_lastrecord is up-to-date if we dropped
- * part of the last record.
- */
- m = sb->sb_mb;
- if (m == NULL) {
- sb->sb_mbtail = NULL;
- sb->sb_lastrecord = NULL;
- } else if (m->m_nextpkt == NULL) {
- sb->sb_lastrecord = m;
- }
-}
-
-/*
- * Drop data from (the front of) a sockbuf.
- */
-void
-sbdrop(sb, len)
- register struct sockbuf *sb;
- register int len;
-{
-
- SOCKBUF_LOCK(sb);
- sbdrop_locked(sb, len);
- SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Drop a record off the front of a sockbuf
- * and move the next record to the front.
- */
-void
-sbdroprecord_locked(sb)
- register struct sockbuf *sb;
-{
- register struct mbuf *m;
-
- SOCKBUF_LOCK_ASSERT(sb);
-
- m = sb->sb_mb;
- if (m) {
- sb->sb_mb = m->m_nextpkt;
- do {
- sbfree(sb, m);
- m = m_free(m);
- } while (m);
- }
- SB_EMPTY_FIXUP(sb);
-}
-
-/*
- * Drop a record off the front of a sockbuf
- * and move the next record to the front.
- */
-void
-sbdroprecord(sb)
- register struct sockbuf *sb;
-{
-
- SOCKBUF_LOCK(sb);
- sbdroprecord_locked(sb);
- SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Create a "control" mbuf containing the specified data
- * with the specified type for presentation on a socket buffer.
- */
-struct mbuf *
-sbcreatecontrol(p, size, type, level)
- caddr_t p;
- register int size;
- int type, level;
-{
- register struct cmsghdr *cp;
- struct mbuf *m;
-
- if (CMSG_SPACE((u_int)size) > MCLBYTES)
- return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN)
- m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
- else
- m = m_get(M_DONTWAIT, MT_CONTROL);
- if (m == NULL)
- return ((struct mbuf *) NULL);
- cp = mtod(m, struct cmsghdr *);
- m->m_len = 0;
- KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
- ("sbcreatecontrol: short mbuf"));
- if (p != NULL)
- (void)memcpy(CMSG_DATA(cp), p, size);
- m->m_len = CMSG_SPACE(size);
- cp->cmsg_len = CMSG_LEN(size);
- cp->cmsg_level = level;
- cp->cmsg_type = type;
- return (m);
-}
-
-/*
- * Some routines that return EOPNOTSUPP for entry points that are not
- * supported by a protocol. Fill in as needed.
- */
-int
-pru_abort_notsupp(struct socket *so)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_connect2_notsupp(struct socket *so1, struct socket *so2)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
- struct ifnet *ifp, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_detach_notsupp(struct socket *so)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_disconnect_notsupp(struct socket *so)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_listen_notsupp(struct socket *so, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_rcvd_notsupp(struct socket *so, int flags)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
- struct sockaddr *addr, struct mbuf *control, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-/*
- * This isn't really a ``null'' operation, but it's the default one
- * and doesn't do anything destructive.
- */
-int
-pru_sense_null(struct socket *so, struct stat *sb)
-{
- sb->st_blksize = so->so_snd.sb_hiwat;
- return 0;
-}
-
-int
-pru_shutdown_notsupp(struct socket *so)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
- struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
- struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
- int *flagsp)
-{
- return EOPNOTSUPP;
-}
-
-int
-pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
- struct thread *td)
-{
- return EOPNOTSUPP;
-}
-
-/*
- * For protocol types that don't keep cached copies of labels in their
- * pcbs, provide a null sosetlabel that does a NOOP.
- */
-void
-pru_sosetlabel_null(struct socket *so)
-{
-
-}
-
-/*
- * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
- */
-struct sockaddr *
-sodupsockaddr(const struct sockaddr *sa, int mflags)
-{
- struct sockaddr *sa2;
-
- sa2 = malloc(sa->sa_len, M_SONAME, mflags);
- if (sa2)
- bcopy(sa, sa2, sa->sa_len);
- return sa2;
-}
-
-/*
- * Create an external-format (``xsocket'') structure using the information
- * in the kernel-format socket structure pointed to by so. This is done
- * to reduce the spew of irrelevant information over this interface,
- * to isolate user code from changes in the kernel structure, and
- * potentially to provide information-hiding if we decide that
- * some of this information should be hidden from users.
- */
-void
-sotoxsocket(struct socket *so, struct xsocket *xso)
-{
- xso->xso_len = sizeof *xso;
- xso->xso_so = so;
- xso->so_type = so->so_type;
- xso->so_options = so->so_options;
- xso->so_linger = so->so_linger;
- xso->so_state = so->so_state;
- xso->so_pcb = so->so_pcb;
- xso->xso_protocol = so->so_proto->pr_protocol;
- xso->xso_family = so->so_proto->pr_domain->dom_family;
- xso->so_qlen = so->so_qlen;
- xso->so_incqlen = so->so_incqlen;
- xso->so_qlimit = so->so_qlimit;
- xso->so_timeo = so->so_timeo;
- xso->so_error = so->so_error;
- xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
- xso->so_oobmark = so->so_oobmark;
- sbtoxsockbuf(&so->so_snd, &xso->so_snd);
- sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
- xso->so_uid = so->so_cred->cr_uid;
-}
-
-/*
- * This does the same for sockbufs. Note that the xsockbuf structure,
- * since it is always embedded in a socket, does not include a self
- * pointer nor a length. We make this entry point public in case
- * some other mechanism needs it.
- */
-void
-sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
-{
- xsb->sb_cc = sb->sb_cc;
- xsb->sb_hiwat = sb->sb_hiwat;
- xsb->sb_mbcnt = sb->sb_mbcnt;
- xsb->sb_mbmax = sb->sb_mbmax;
- xsb->sb_lowat = sb->sb_lowat;
- xsb->sb_flags = sb->sb_flags;
- xsb->sb_timeo = sb->sb_timeo;
-}
-
-/*
- * Here is the definition of some of the basic objects in the kern.ipc
- * branch of the MIB.
- */
-SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
-
-/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
-static int dummy;
-SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
-SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
- &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
-SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RDTUN,
- &maxsockets, 0, "Maximum number of sockets available");
-SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
- &sb_efficiency, 0, "");
-
-/*
- * Initialise maxsockets
- */
-static void init_maxsockets(void *ignored)
-{
- TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
- maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
-}
-SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
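
The removal above takes the socket-buffer primitives (sbappend*, sbreserve, sbflush, the sb_max sysctl handler, and the pru_*_notsupp stubs) out of uipc_socket2.c; in FreeBSD of this vintage these routines presumably carry on in other kern/ files (uipc_sockbuf.c and uipc_socket.c), so the functionality is relocated rather than dropped. The deleted sysctl_handle_sb_max() is what backs kern.ipc.maxsockbuf: it rejects values below MSIZE + MCLBYTES and recomputes sb_max_adj. A small userland sketch for reading that knob, as a hypothetical standalone program and not part of this commit:

	#include <sys/types.h>
	#include <sys/sysctl.h>

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long sbmax;
		size_t len = sizeof(sbmax);

		/* kern.ipc.maxsockbuf is the u_long served by sysctl_handle_sb_max(). */
		if (sysctlbyname("kern.ipc.maxsockbuf", &sbmax, &len, NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("kern.ipc.maxsockbuf = %lu\n", sbmax);
		return (0);
	}
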
Index: kern_mtxpool.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mtxpool.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_mtxpool.c -L sys/kern/kern_mtxpool.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_mtxpool.c
+++ sys/kern/kern_mtxpool.c
@@ -24,7 +24,7 @@
*/
/* Mutex pool routines. These routines are designed to be used as short
- * term leaf mutexes (e.g. the last mutex you might aquire other then
+ * term leaf mutexes (e.g. the last mutex you might acquire other then
* calling msleep()). They operate using a shared pool. A mutex is chosen
* from the pool based on the supplied pointer (which may or may not be
* valid).
@@ -44,7 +44,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mtxpool.c,v 1.11 2005/02/10 12:02:37 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mtxpool.c,v 1.12 2007/05/27 20:50:23 rwatson Exp $");
#include <sys/param.h>
#include <sys/proc.h>
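
The kern_mtxpool.c change itself is cosmetic (a spelling fix in the comment plus an updated $FreeBSD$ id). For context, the pool that comment describes is used roughly as below; a minimal sketch assuming the stock mtx_pool_lock()/mtx_pool_unlock() interface and the shared mtxpool_sleep pool, with a made-up function name:

	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>

	static void
	update_object_locked(void *obj)
	{
		/* Hash the pointer to one of the pooled leaf mutexes. */
		mtx_pool_lock(mtxpool_sleep, obj);
		/* ... short critical section on obj; do not sleep while held ... */
		mtx_pool_unlock(mtxpool_sleep, obj);
	}
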
Index: kern_synch.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_synch.c -L sys/kern/kern_synch.c -u -r1.2 -r1.3
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_synch.c,v 1.270.2.6 2006/07/06 08:32:50 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_synch.c,v 1.302 2007/10/08 23:40:40 jeff Exp $");
#include "opt_ktrace.h"
@@ -69,6 +69,7 @@
int hogticks;
int lbolt;
+static int pause_wchan;
static struct callout loadav_callout;
static struct callout lbolt_callout;
@@ -101,8 +102,8 @@
}
/*
- * General sleep call. Suspends the current process until a wakeup is
- * performed on the specified identifier. The process will then be made
+ * General sleep call. Suspends the current thread until a wakeup is
+ * performed on the specified identifier. The thread will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds
* (0 means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
@@ -111,21 +112,22 @@
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal (return EINTR).
*
- * The mutex argument is exited before the caller is suspended, and
- * entered before msleep returns. If priority includes the PDROP
- * flag the mutex is not entered before returning.
+ * The lock argument is unlocked before the caller is suspended, and
+ * re-locked before _sleep() returns. If priority includes the PDROP
+ * flag the lock is not re-locked before returning.
*/
int
-msleep(ident, mtx, priority, wmesg, timo)
+_sleep(ident, lock, priority, wmesg, timo)
void *ident;
- struct mtx *mtx;
+ struct lock_object *lock;
int priority, timo;
const char *wmesg;
{
struct thread *td;
struct proc *p;
- int catch, rval, flags;
- WITNESS_SAVE_DECL(mtx);
+ struct lock_class *class;
+ int catch, flags, lock_state, pri, rval;
+ WITNESS_SAVE_DECL(lock_witness);
td = curthread;
p = td->td_proc;
@@ -133,12 +135,16 @@
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, mtx == NULL ? NULL :
- &mtx->mtx_object, "Sleeping on \"%s\"", wmesg);
- KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
- ("sleeping without a mutex"));
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Sleeping on \"%s\"", wmesg);
+ KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL ||
+ ident == &lbolt, ("sleeping without a lock"));
KASSERT(p != NULL, ("msleep1"));
KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
+ if (lock != NULL)
+ class = LOCK_CLASS(lock);
+ else
+ class = NULL;
if (cold) {
/*
@@ -149,8 +155,8 @@
* splx(s);" to give interrupts a chance, but there is
* no way to give interrupts a chance now.
*/
- if (mtx != NULL && priority & PDROP)
- mtx_unlock(mtx);
+ if (lock != NULL && priority & PDROP)
+ class->lc_unlock(lock);
return (0);
}
catch = priority & PCATCH;
@@ -164,20 +170,24 @@
if (TD_ON_SLEEPQ(td))
sleepq_remove(td, td->td_wchan);
- flags = SLEEPQ_MSLEEP;
+ if (ident == &pause_wchan)
+ flags = SLEEPQ_PAUSE;
+ else
+ flags = SLEEPQ_SLEEP;
if (catch)
flags |= SLEEPQ_INTERRUPTIBLE;
sleepq_lock(ident);
- CTR5(KTR_PROC, "msleep: thread %p (pid %ld, %s) on %s (%p)",
- (void *)td, (long)p->p_pid, p->p_comm, wmesg, ident);
+ CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
+ td->td_tid, p->p_pid, p->p_comm, wmesg, ident);
DROP_GIANT();
- if (mtx != NULL) {
- mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
- WITNESS_SAVE(&mtx->mtx_object, mtx);
- mtx_unlock(mtx);
- }
+ if (lock != NULL && !(class->lc_flags & LC_SLEEPABLE)) {
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ } else
+ /* GCC needs to follow the Yellow Brick Road */
+ lock_state = -1;
/*
* We put ourselves on the sleep queue and start our timeout
@@ -188,17 +198,24 @@
* stopped, then td will no longer be on a sleep queue upon
* return from cursig().
*/
- sleepq_add(ident, mtx, wmesg, flags);
+ sleepq_add(ident, ident == &lbolt ? NULL : lock, wmesg, flags, 0);
if (timo)
sleepq_set_timeout(ident, timo);
+ if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
+ sleepq_release(ident);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ sleepq_lock(ident);
+ }
/*
- * Adjust this thread's priority.
+ * Adjust this thread's priority, if necessary.
*/
- if ((priority & PRIMASK) != 0) {
- mtx_lock_spin(&sched_lock);
- sched_prio(td, priority & PRIMASK);
- mtx_unlock_spin(&sched_lock);
+ pri = priority & PRIMASK;
+ if (pri != 0 && pri != td->td_priority) {
+ thread_lock(td);
+ sched_prio(td, pri);
+ thread_unlock(td);
}
if (timo && catch)
@@ -216,9 +233,9 @@
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
- if (mtx != NULL && !(priority & PDROP)) {
- mtx_lock(mtx);
- WITNESS_RESTORE(&mtx->mtx_object, mtx);
+ if (lock != NULL && !(priority & PDROP)) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
}
return (rval);
}
@@ -254,18 +271,18 @@
}
sleepq_lock(ident);
- CTR5(KTR_PROC, "msleep_spin: thread %p (pid %ld, %s) on %s (%p)",
- (void *)td, (long)p->p_pid, p->p_comm, wmesg, ident);
+ CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
+ td->td_tid, p->p_pid, p->p_comm, wmesg, ident);
DROP_GIANT();
mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
- WITNESS_SAVE(&mtx->mtx_object, mtx);
+ WITNESS_SAVE(&mtx->lock_object, mtx);
mtx_unlock_spin(mtx);
/*
* We put ourselves on the sleep queue and start our timeout.
*/
- sleepq_add(ident, mtx, wmesg, SLEEPQ_MSLEEP);
+ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
if (timo)
sleepq_set_timeout(ident, timo);
@@ -301,11 +318,27 @@
#endif
PICKUP_GIANT();
mtx_lock_spin(mtx);
- WITNESS_RESTORE(&mtx->mtx_object, mtx);
+ WITNESS_RESTORE(&mtx->lock_object, mtx);
return (rval);
}
/*
+ * pause() is like tsleep() except that the intention is to not be
+ * explicitly woken up by another thread. Instead, the current thread
+ * simply wishes to sleep until the timeout expires. It is
+ * implemented using a dummy wait channel.
+ */
+int
+pause(wmesg, timo)
+ const char *wmesg;
+ int timo;
+{
+
+ KASSERT(timo != 0, ("pause: timeout required"));
+ return (tsleep(&pause_wchan, 0, wmesg, timo));
+}
+
+/*
* Make all threads sleeping on the specified identifier runnable.
*/
void
@@ -314,7 +347,7 @@
{
sleepq_lock(ident);
- sleepq_broadcast(ident, SLEEPQ_MSLEEP, -1);
+ sleepq_broadcast(ident, SLEEPQ_SLEEP, -1, 0);
}
/*
@@ -328,7 +361,8 @@
{
sleepq_lock(ident);
- sleepq_signal(ident, SLEEPQ_MSLEEP, -1);
+ sleepq_signal(ident, SLEEPQ_SLEEP, -1, 0);
+ sleepq_release(ident);
}
/*
@@ -337,12 +371,12 @@
void
mi_switch(int flags, struct thread *newtd)
{
- struct bintime new_switchtime;
+ uint64_t new_switchtime;
struct thread *td;
struct proc *p;
- mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
td = curthread; /* XXX */
+ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
p = td->td_proc; /* XXX */
KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
@@ -357,53 +391,33 @@
("mi_switch: switch must be voluntary or involuntary"));
KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
- if (flags & SW_VOL)
- p->p_stats->p_ru.ru_nvcsw++;
- else
- p->p_stats->p_ru.ru_nivcsw++;
-
- /*
- * Compute the amount of time during which the current
- * process was running, and add that to its total so far.
- */
- binuptime(&new_switchtime);
- bintime_add(&p->p_rux.rux_runtime, &new_switchtime);
- bintime_sub(&p->p_rux.rux_runtime, PCPU_PTR(switchtime));
-
- td->td_generation++; /* bump preempt-detect counter */
-
/*
* Don't perform context switches from the debugger.
*/
if (kdb_active) {
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
kdb_backtrace();
kdb_reenter();
panic("%s: did not reenter debugger", __func__);
}
-
- /*
- * Check if the process exceeds its cpu resource allocation. If
- * it reaches the max, arrange to kill the process in ast().
- */
- if (p->p_cpulimit != RLIM_INFINITY &&
- p->p_rux.rux_runtime.sec >= p->p_cpulimit) {
- p->p_sflag |= PS_XCPU;
- td->td_flags |= TDF_ASTPENDING;
- }
-
+ if (flags & SW_VOL)
+ td->td_ru.ru_nvcsw++;
+ else
+ td->td_ru.ru_nivcsw++;
/*
- * Finish up stats for outgoing thread.
+ * Compute the amount of time during which the current
+ * thread was running, and add that to its total so far.
*/
- cnt.v_swtch++;
+ new_switchtime = cpu_ticks();
+ td->td_runtime += new_switchtime - PCPU_GET(switchtime);
PCPU_SET(switchtime, new_switchtime);
+ td->td_generation++; /* bump preempt-detect counter */
+ PCPU_INC(cnt.v_swtch);
PCPU_SET(switchticks, ticks);
- CTR4(KTR_PROC, "mi_switch: old thread %p (kse %p, pid %ld, %s)",
- (void *)td, td->td_sched, (long)p->p_pid, p->p_comm);
- if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA))
- newtd = thread_switchout(td, flags, newtd);
+ CTR4(KTR_PROC, "mi_switch: old thread %ld (kse %p, pid %ld, %s)",
+ td->td_tid, td->td_sched, p->p_pid, p->p_comm);
#if (KTR_COMPILE & KTR_SCHED) != 0
- if (td == PCPU_GET(idlethread))
+ if (TD_IS_IDLETHREAD(td))
CTR3(KTR_SCHED, "mi_switch: %p(%s) prio %d idle",
td, td->td_proc->p_comm, td->td_priority);
else if (newtd != NULL)
@@ -417,12 +431,20 @@
td, td->td_proc->p_comm, td->td_priority,
td->td_inhibitors, td->td_wmesg, td->td_lockname);
#endif
+ /*
+ * We call thread_switchout after the KTR_SCHED prints above so kse
+ * selecting a new thread to run does not show up as a preemption.
+ */
+#ifdef KSE
+ if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA))
+ newtd = thread_switchout(td, flags, newtd);
+#endif
sched_switch(td, newtd, flags);
CTR3(KTR_SCHED, "mi_switch: running %p(%s) prio %d",
td, td->td_proc->p_comm, td->td_priority);
- CTR4(KTR_PROC, "mi_switch: new thread %p (kse %p, pid %ld, %s)",
- (void *)td, td->td_sched, (long)p->p_pid, p->p_comm);
+ CTR4(KTR_PROC, "mi_switch: new thread %ld (kse %p, pid %ld, %s)",
+ td->td_tid, td->td_sched, p->p_pid, p->p_comm);
/*
* If the last thread was exiting, finish cleaning it up.
@@ -441,16 +463,10 @@
void
setrunnable(struct thread *td)
{
- struct proc *p;
- p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
- switch (p->p_state) {
- case PRS_ZOMBIE:
- panic("setrunnable(1)");
- default:
- break;
- }
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
+ ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
switch (td->td_state) {
case TDS_RUNNING:
case TDS_RUNQ:
@@ -469,11 +485,11 @@
printf("state is 0x%x", td->td_state);
panic("setrunnable(2)");
}
- if ((p->p_sflag & PS_INMEM) == 0) {
- if ((p->p_sflag & PS_SWAPPINGIN) == 0) {
- p->p_sflag |= PS_SWAPINREQ;
+ if ((td->td_flags & TDF_INMEM) == 0) {
+ if ((td->td_flags & TDF_SWAPINREQ) == 0) {
+ td->td_flags |= TDF_SWAPINREQ;
/*
- * due to a LOR between sched_lock and
+ * due to a LOR between the thread lock and
* the sleepqueue chain locks, use
* lower level scheduling functions.
*/
@@ -532,19 +548,16 @@
}
/*
- * General purpose yield system call
+ * General purpose yield system call.
*/
int
yield(struct thread *td, struct yield_args *uap)
{
- struct ksegrp *kg;
- kg = td->td_ksegrp;
- mtx_assert(&Giant, MA_NOTOWNED);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
sched_prio(td, PRI_MAX_TIMESHARE);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
td->td_retval[0] = 0;
return (0);
}
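
The kern_synch.c hunks generalize msleep() into _sleep() over lock classes, move context-switch and run-time accounting onto the thread, and add pause(), a timeout-only sleep on a private wait channel that never expects a wakeup(). A minimal usage sketch for the new primitive, with a made-up function name:

	/* Back off for roughly 100 ms; nothing will wakeup() this channel. */
	static void
	retry_backoff(void)
	{
		pause("rtyoff", hz / 10);	/* timo must be non-zero */
	}
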
Index: sysv_msg.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_msg.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_msg.c -L sys/kern/sysv_msg.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_msg.c
+++ sys/kern/sysv_msg.c
@@ -48,7 +48,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_msg.c,v 1.60 2005/02/12 01:22:39 csjp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_msg.c,v 1.70 2007/06/12 00:11:59 rwatson Exp $");
#include "opt_sysvipc.h"
#include "opt_mac.h"
@@ -57,9 +57,9 @@
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/msg.h>
@@ -70,6 +70,8 @@
#include <sys/malloc.h>
#include <sys/jail.h>
+#include <security/mac/mac_framework.h>
+
static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
static void msginit(void);
@@ -81,11 +83,6 @@
#else
#define DPRINTF(a)
#endif
-#ifdef MAC_DEBUG
-#define MPRINTF(a) printf a
-#else
-#define MPRINTF(a)
-#endif
static void msg_freehdr(struct msg *msghdr);
@@ -323,9 +320,7 @@
MODULE_VERSION(sysvmsg, 1);
/*
- * Entry point for all MSG calls
- *
- * MPSAFE
+ * Entry point for all MSG calls.
*/
int
msgsys(td, uap)
@@ -385,10 +380,6 @@
struct msqid_ds *buf;
};
#endif
-
-/*
- * MPSAFE
- */
int
msgctl(td, uap)
struct thread *td;
@@ -399,7 +390,7 @@
struct msqid_ds msqbuf;
int error;
- DPRINTF(("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, uap->buf));
+ DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
if (cmd == IPC_SET &&
(error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
return (error);
@@ -445,10 +436,8 @@
}
#ifdef MAC
error = mac_check_sysv_msqctl(td->td_ucred, msqkptr, cmd);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_msqctl returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
error = 0;
@@ -475,11 +464,8 @@
for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
msghdr = msghdr->msg_next) {
error = mac_check_sysv_msgrmid(td->td_ucred, msghdr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_msgrmid returned %d\n",
- error));
+ if (error != 0)
goto done2;
- }
}
#endif
@@ -516,7 +502,7 @@
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
goto done2;
if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
- error = suser(td);
+ error = priv_check(td, PRIV_IPC_MSGSIZE);
if (error)
goto done2;
}
@@ -565,10 +551,6 @@
int msgflg;
};
#endif
-
-/*
- * MPSAFE
- */
int
msgget(td, uap)
struct thread *td;
@@ -608,11 +590,8 @@
}
#ifdef MAC
error = mac_check_sysv_msqget(cred, msqkptr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_msqget returned %d\n",
- error));
+ if (error != 0)
goto done2;
- }
#endif
goto found;
}
@@ -681,46 +660,40 @@
int msgflg;
};
#endif
-
-/*
- * MPSAFE
- */
int
-msgsnd(td, uap)
+kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
struct thread *td;
- register struct msgsnd_args *uap;
+ int msqid;
+ const void *msgp; /* XXX msgp is actually mtext. */
+ size_t msgsz;
+ int msgflg;
+ long mtype;
{
- int msqid = uap->msqid;
- const void *user_msgp = uap->msgp;
- size_t msgsz = uap->msgsz;
- int msgflg = uap->msgflg;
- int segs_needed, error = 0;
+ int msqix, segs_needed, error = 0;
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
short next;
- DPRINTF(("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz,
- msgflg));
if (!jail_sysvipc_allowed && jailed(td->td_ucred))
return (ENOSYS);
mtx_lock(&msq_mtx);
- msqid = IPCID_TO_IX(msqid);
+ msqix = IPCID_TO_IX(msqid);
- if (msqid < 0 || msqid >= msginfo.msgmni) {
- DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
error = EINVAL;
goto done2;
}
- msqkptr = &msqids[msqid];
+ msqkptr = &msqids[msqix];
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such message queue id\n"));
error = EINVAL;
goto done2;
}
- if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
@@ -733,15 +706,13 @@
#ifdef MAC
error = mac_check_sysv_msqsnd(td->td_ucred, msqkptr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_msqsnd returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
- DPRINTF(("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz,
- segs_needed));
+ DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
+ msginfo.msgssz, segs_needed));
for (;;) {
int need_more_resources = 0;
@@ -793,12 +764,16 @@
msqkptr->u.msg_perm.mode |= MSG_LOCKED;
we_own_it = 1;
}
- DPRINTF(("goodnight\n"));
+ DPRINTF(("msgsnd: goodnight\n"));
error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
- "msgwait", 0);
- DPRINTF(("good morning, error=%d\n", error));
+ "msgsnd", hz);
+ DPRINTF(("msgsnd: good morning, error=%d\n", error));
if (we_own_it)
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ if (error == EWOULDBLOCK) {
+ DPRINTF(("msgsnd: timed out\n"));
+ continue;
+ }
if (error != 0) {
DPRINTF(("msgsnd: interrupted system call\n"));
error = EINTR;
@@ -852,6 +827,7 @@
free_msghdrs = msghdr->msg_next;
msghdr->msg_spot = -1;
msghdr->msg_ts = msgsz;
+ msghdr->msg_type = mtype;
#ifdef MAC
/*
* XXXMAC: Should the mac_check_sysv_msgmsq check follow here
@@ -884,23 +860,6 @@
}
/*
- * Copy in the message type
- */
-
- mtx_unlock(&msq_mtx);
- if ((error = copyin(user_msgp, &msghdr->msg_type,
- sizeof(msghdr->msg_type))) != 0) {
- mtx_lock(&msq_mtx);
- DPRINTF(("error %d copying the message type\n", error));
- msg_freehdr(msghdr);
- msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
- wakeup(msqkptr);
- goto done2;
- }
- mtx_lock(&msq_mtx);
- user_msgp = (const char *)user_msgp + sizeof(msghdr->msg_type);
-
- /*
* Validate the message type
*/
@@ -908,7 +867,7 @@
msg_freehdr(msghdr);
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
wakeup(msqkptr);
- DPRINTF(("mtype (%d) < 1\n", msghdr->msg_type));
+ DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
error = EINVAL;
goto done2;
}
@@ -929,7 +888,7 @@
if (next >= msginfo.msgseg)
panic("next out of range #2");
mtx_unlock(&msq_mtx);
- if ((error = copyin(user_msgp, &msgpool[next * msginfo.msgssz],
+ if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
tlen)) != 0) {
mtx_lock(&msq_mtx);
DPRINTF(("error %d copying in message segment\n",
@@ -941,7 +900,7 @@
}
mtx_lock(&msq_mtx);
msgsz -= tlen;
- user_msgp = (const char *)user_msgp + tlen;
+ msgp = (const char *)msgp + tlen;
next = msgmaps[next].next;
}
if (next != -1)
@@ -978,7 +937,6 @@
*/
error = mac_check_sysv_msgmsq(td->td_ucred, msghdr, msqkptr);
if (error != 0) {
- MPRINTF(("mac_check_sysv_msqmsq returned %d\n", error));
msg_freehdr(msghdr);
wakeup(msqkptr);
goto done2;
@@ -1009,6 +967,26 @@
return (error);
}
+int
+msgsnd(td, uap)
+ struct thread *td;
+ register struct msgsnd_args *uap;
+{
+ int error;
+ long mtype;
+
+ DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
+ uap->msgsz, uap->msgflg));
+
+ if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
+ DPRINTF(("error %d copying the message type\n", error));
+ return (error);
+ }
+ return (kern_msgsnd(td, uap->msqid,
+ (const char *)uap->msgp + sizeof(mtype),
+ uap->msgsz, uap->msgflg, mtype));
+}
+
#ifndef _SYS_SYSPROTO_H_
struct msgrcv_args {
int msqid;
@@ -1018,48 +996,41 @@
int msgflg;
};
#endif
-
-/*
- * MPSAFE
- */
int
-msgrcv(td, uap)
+kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype)
struct thread *td;
- register struct msgrcv_args *uap;
+ int msqid;
+ void *msgp; /* XXX msgp is actually mtext. */
+ size_t msgsz;
+ long msgtyp;
+ int msgflg;
+ long *mtype;
{
- int msqid = uap->msqid;
- void *user_msgp = uap->msgp;
- size_t msgsz = uap->msgsz;
- long msgtyp = uap->msgtyp;
- int msgflg = uap->msgflg;
size_t len;
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
- int error = 0;
+ int msqix, error = 0;
short next;
- DPRINTF(("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp,
- msgsz, msgtyp, msgflg));
-
if (!jail_sysvipc_allowed && jailed(td->td_ucred))
return (ENOSYS);
- msqid = IPCID_TO_IX(msqid);
+ msqix = IPCID_TO_IX(msqid);
- if (msqid < 0 || msqid >= msginfo.msgmni) {
- DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
return (EINVAL);
}
- msqkptr = &msqids[msqid];
+ msqkptr = &msqids[msqix];
mtx_lock(&msq_mtx);
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such message queue id\n"));
error = EINVAL;
goto done2;
}
- if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
@@ -1072,10 +1043,8 @@
#ifdef MAC
error = mac_check_sysv_msqrcv(td->td_ucred, msqkptr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_msqrcv returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
msghdr = NULL;
@@ -1086,7 +1055,7 @@
if (msgsz < msghdr->msg_ts &&
(msgflg & MSG_NOERROR) == 0) {
DPRINTF(("first message on the queue "
- "is too big (want %d, got %d)\n",
+ "is too big (want %zu, got %d)\n",
msgsz, msghdr->msg_ts));
error = E2BIG;
goto done2;
@@ -1094,11 +1063,8 @@
#ifdef MAC
error = mac_check_sysv_msgrcv(td->td_ucred,
msghdr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_msgrcv "
- "returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
msqkptr->u.msg_first = NULL;
@@ -1127,14 +1093,14 @@
if (msgtyp == msghdr->msg_type ||
msghdr->msg_type <= -msgtyp) {
- DPRINTF(("found message type %d, "
- "requested %d\n",
+ DPRINTF(("found message type %ld, "
+ "requested %ld\n",
msghdr->msg_type, msgtyp));
if (msgsz < msghdr->msg_ts &&
(msgflg & MSG_NOERROR) == 0) {
DPRINTF(("requested message "
"on the queue is too big "
- "(want %d, got %d)\n",
+ "(want %zu, got %hu)\n",
msgsz, msghdr->msg_ts));
error = E2BIG;
goto done2;
@@ -1142,12 +1108,8 @@
#ifdef MAC
error = mac_check_sysv_msgrcv(
td->td_ucred, msghdr);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_"
- "msgrcv returned %d\n",
- error));
+ if (error != 0)
goto done2;
- }
#endif
*prev = msghdr->msg_next;
if (msghdr == msqkptr->u.msg_last) {
@@ -1188,7 +1150,7 @@
*/
if ((msgflg & IPC_NOWAIT) != 0) {
- DPRINTF(("no appropriate message found (msgtyp=%d)\n",
+ DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
msgtyp));
/* The SVID says to return ENOMSG. */
error = ENOMSG;
@@ -1201,11 +1163,11 @@
DPRINTF(("msgrcv: goodnight\n"));
error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
- "msgwait", 0);
+ "msgrcv", 0);
DPRINTF(("msgrcv: good morning (error=%d)\n", error));
if (error != 0) {
- DPRINTF(("msgsnd: interrupted system call\n"));
+ DPRINTF(("msgrcv: interrupted system call\n"));
error = EINTR;
goto done2;
}
@@ -1215,7 +1177,7 @@
*/
if (msqkptr->u.msg_qbytes == 0 ||
- msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+ msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("msqid deleted\n"));
error = EIDRM;
goto done2;
@@ -1239,26 +1201,11 @@
* (since msgsz is never increased).
*/
- DPRINTF(("found a message, msgsz=%d, msg_ts=%d\n", msgsz,
+ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
msghdr->msg_ts));
if (msgsz > msghdr->msg_ts)
msgsz = msghdr->msg_ts;
-
- /*
- * Return the type to the user.
- */
-
- mtx_unlock(&msq_mtx);
- error = copyout(&(msghdr->msg_type), user_msgp,
- sizeof(msghdr->msg_type));
- mtx_lock(&msq_mtx);
- if (error != 0) {
- DPRINTF(("error (%d) copying out message type\n", error));
- msg_freehdr(msghdr);
- wakeup(msqkptr);
- goto done2;
- }
- user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+ *mtype = msghdr->msg_type;
/*
* Return the segments to the user
@@ -1277,8 +1224,7 @@
if (next >= msginfo.msgseg)
panic("next out of range #3");
mtx_unlock(&msq_mtx);
- error = copyout(&msgpool[next * msginfo.msgssz],
- user_msgp, tlen);
+ error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
mtx_lock(&msq_mtx);
if (error != 0) {
DPRINTF(("error (%d) copying out message segment\n",
@@ -1287,7 +1233,7 @@
wakeup(msqkptr);
goto done2;
}
- user_msgp = (char *)user_msgp + tlen;
+ msgp = (char *)msgp + tlen;
next = msgmaps[next].next;
}
@@ -1303,6 +1249,26 @@
return (error);
}
+int
+msgrcv(td, uap)
+ struct thread *td;
+ register struct msgrcv_args *uap;
+{
+ int error;
+ long mtype;
+
+ DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
+ uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
+
+ if ((error = kern_msgrcv(td, uap->msqid,
+ (char *)uap->msgp + sizeof(mtype), uap->msgsz,
+ uap->msgtyp, uap->msgflg, &mtype)) != 0)
+ return (error);
+ if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
+ DPRINTF(("error %d copying the message type\n", error));
+ return (error);
+}
+
static int
sysctl_msqids(SYSCTL_HANDLER_ARGS)
{
@@ -1311,7 +1277,6 @@
sizeof(struct msqid_kernel) * msginfo.msgmni));
}
-SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
"Maximum message size");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
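For reference (a standalone sketch, not part of the patch): the msgsnd()/kern_msgsnd() split above assumes the traditional System V layout where the caller's buffer begins with a long mtype followed by the payload; the syscall wrapper copies in the type and kern_msgsnd() copies the remaining bytes. A minimal userland-style sender matching that layout, with illustrative struct and function names:

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <string.h>

struct my_msg {
	long	mtype;		/* copied in by the msgsnd() wrapper above */
	char	mtext[128];	/* copied in segment by segment by kern_msgsnd() */
};

static int
send_example(int msqid)
{
	struct my_msg m;

	m.mtype = 1;		/* must be >= 1 or the kernel returns EINVAL */
	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	/* msgsz counts only mtext; the leading mtype is not included. */
	return (msgsnd(msqid, &m, strlen(m.mtext) + 1, IPC_NOWAIT));
}
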
Index: subr_kobj.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_kobj.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_kobj.c -L sys/kern/subr_kobj.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_kobj.c
+++ sys/kern/subr_kobj.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_kobj.c,v 1.8 2003/10/16 09:16:28 dfr Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_kobj.c,v 1.10 2005/12/29 18:00:42 jhb Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -57,6 +57,7 @@
#endif
static struct mtx kobj_mtx;
+static int kobj_mutex_inited;
static int kobj_next_id = 1;
SYSCTL_UINT(_kern, OID_AUTO, kobj_methodcount, CTLFLAG_RD,
@@ -65,12 +66,20 @@
static void
kobj_init_mutex(void *arg)
{
-
- mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF);
+ if (!kobj_mutex_inited) {
+ mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF);
+ kobj_mutex_inited = 1;
+ }
}
SYSINIT(kobj, SI_SUB_LOCK, SI_ORDER_ANY, kobj_init_mutex, NULL);
+void
+kobj_machdep_init(void)
+{
+ kobj_init_mutex(NULL);
+}
+
/*
* This method structure is used to initialise new caches. Since the
* desc pointer is NULL, it is guaranteed never to match any read
@@ -228,7 +237,7 @@
* a 'miss'.
*/
kobj_lookup_hits--;
- kobj_lookup_misses--;
+ kobj_lookup_misses++;
#endif
ce = kobj_lookup_method_mi(cls, desc);
Index: kern_alq.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_alq.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_alq.c -L sys/kern/kern_alq.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_alq.c
+++ sys/kern/kern_alq.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_alq.c,v 1.12 2005/04/16 12:12:27 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_alq.c,v 1.19 2007/06/01 14:33:11 kib Exp $");
#include "opt_mac.h"
@@ -34,7 +34,7 @@
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
-#include <sys/mac.h>
+#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
@@ -45,6 +45,8 @@
#include <sys/fcntl.h>
#include <sys/eventhandler.h>
+#include <security/mac/mac_framework.h>
+
/* Async. Logging Queue */
struct alq {
int aq_entmax; /* Max entries */
@@ -172,8 +174,6 @@
int needwakeup;
struct alq *alq;
- mtx_lock(&Giant);
-
ald_thread = FIRST_THREAD_IN_PROC(ald_proc);
EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL,
@@ -250,6 +250,7 @@
struct ale *alstart;
int totlen;
int iov;
+ int vfslocked;
vp = alq->aq_vp;
td = curthread;
@@ -291,6 +292,7 @@
/*
* Do all of the junk required to write now.
*/
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
VOP_LEASE(vp, td, alq->aq_cred, LEASE_WRITE);
@@ -303,6 +305,7 @@
VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred);
VOP_UNLOCK(vp, 0, td);
vn_finished_write(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
ALQ_LOCK(alq);
alq->aq_flags &= ~AQ_FLUSHING;
@@ -345,21 +348,23 @@
char *bufp;
int flags;
int error;
- int i;
+ int i, vfslocked;
*alqp = NULL;
td = curthread;
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, file, td);
flags = FWRITE | O_NOFOLLOW | O_CREAT;
- error = vn_open_cred(&nd, &flags, cmode, cred, -1);
+ error = vn_open_cred(&nd, &flags, cmode, cred, NULL);
if (error)
return (error);
-
+
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
/* We just unlock so we hold a reference */
VOP_UNLOCK(nd.ni_vp, 0, td);
+ VFS_UNLOCK_GIANT(vfslocked);
alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO);
alq->aq_entbuf = malloc(count * size, M_ALD, M_WAITOK|M_ZERO);
Index: kern_syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_syscalls.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_syscalls.c -L sys/kern/kern_syscalls.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_syscalls.c
+++ sys/kern/kern_syscalls.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_syscalls.c,v 1.11 2004/07/15 08:26:05 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_syscalls.c,v 1.12 2006/08/01 16:32:20 jhb Exp $");
#include <sys/param.h>
#include <sys/sysproto.h>
@@ -97,8 +97,11 @@
case MOD_LOAD :
error = syscall_register(data->offset, data->new_sysent,
&data->old_sysent);
- if (error)
+ if (error) {
+ /* Leave a mark so we know to safely unload below. */
+ data->offset = NULL;
return error;
+ }
ms.intval = *data->offset;
MOD_XLOCK;
module_setspecific(mod, &ms);
@@ -108,6 +111,13 @@
return error;
case MOD_UNLOAD :
+ /*
+ * MOD_LOAD failed, so just return without calling the
+ * chained handler since we didn't pass along the MOD_LOAD
+ * event.
+ */
+ if (data->offset == NULL)
+ return (0);
if (data->chainevh) {
error = data->chainevh(mod, what, data->chainarg);
if (error)
Index: subr_unit.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_unit.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_unit.c -L sys/kern/subr_unit.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_unit.c
+++ sys/kern/subr_unit.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: src/sys/kern/subr_unit.c,v 1.7 2005/03/14 06:51:29 phk Exp $
+ * $FreeBSD: src/sys/kern/subr_unit.c,v 1.9 2007/07/04 06:56:57 kib Exp $
*
*
* Unit number allocation functions.
@@ -197,6 +197,8 @@
u_int first; /* items in allocated from start */
u_int last; /* items free at end */
struct mtx *mtx;
+ TAILQ_HEAD(unrfr,unr) ppfree; /* Items to be freed after mtx
+ lock dropped */
};
@@ -281,9 +283,35 @@
static __inline void
delete_unr(struct unrhdr *uh, void *ptr)
{
+ struct unr *up;
uh->alloc--;
- Free(ptr);
+ up = ptr;
+ TAILQ_INSERT_TAIL(&uh->ppfree, up, list);
+}
+
+void
+clean_unrhdrl(struct unrhdr *uh)
+{
+ struct unr *up;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+ while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) {
+ TAILQ_REMOVE(&uh->ppfree, up, list);
+ mtx_unlock(uh->mtx);
+ Free(up);
+ mtx_lock(uh->mtx);
+ }
+
+}
+
+void
+clean_unrhdr(struct unrhdr *uh)
+{
+
+ mtx_lock(uh->mtx);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
}
/*
@@ -305,6 +333,7 @@
else
uh->mtx = &unitmtx;
TAILQ_INIT(&uh->head);
+ TAILQ_INIT(&uh->ppfree);
uh->low = low;
uh->high = high;
uh->first = 0;
@@ -320,6 +349,8 @@
check_unrhdr(uh, __LINE__);
KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy));
KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr"));
+ KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL,
+ ("unrhdr has postponed item for free"));
Free(uh);
}
@@ -591,6 +622,7 @@
mtx_lock(uh->mtx);
i = alloc_unrl(uh);
+ clean_unrhdrl(uh);
mtx_unlock(uh->mtx);
return (i);
}
@@ -714,10 +746,12 @@
{
void *p1, *p2;
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr");
p1 = Malloc(sizeof(struct unr));
p2 = Malloc(sizeof(struct unr));
mtx_lock(uh->mtx);
free_unrl(uh, item, &p1, &p2);
+ clean_unrhdrl(uh);
mtx_unlock(uh->mtx);
if (p1 != NULL)
Free(p1);
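The new ppfree list and clean_unrhdrl() above postpone Free() until the unrhdr mutex can be dropped, so the allocator is never entered with the lock held. A standalone sketch of that pattern (generic names, pthread mutex instead of the kernel mtx API; not the kernel code itself):

#include <sys/queue.h>
#include <pthread.h>
#include <stdlib.h>

struct item {
	TAILQ_ENTRY(item) link;
};

static TAILQ_HEAD(, item) pending = TAILQ_HEAD_INITIALIZER(pending);
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with the lock held: only queue the item, do not free it yet. */
static void
release_locked(struct item *it)
{
	TAILQ_INSERT_TAIL(&pending, it, link);
}

/* Called with the lock held: drain the queue, dropping the lock around
 * each free so nothing expensive runs while the lock is owned. */
static void
drain_locked(void)
{
	struct item *it;

	while ((it = TAILQ_FIRST(&pending)) != NULL) {
		TAILQ_REMOVE(&pending, it, link);
		pthread_mutex_unlock(&lock);
		free(it);
		pthread_mutex_lock(&lock);
	}
}
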
Index: subr_hints.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_hints.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_hints.c -L sys/kern/subr_hints.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_hints.c
+++ sys/kern/subr_hints.c
@@ -25,11 +25,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_hints.c,v 1.11.2.1 2005/10/06 18:29:30 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_hints.c,v 1.13 2006/07/09 21:42:58 scottl Exp $");
#include <sys/param.h>
#include <sys/lock.h>
-#include <sys/sx.h>
+#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/bus.h>
@@ -72,7 +72,7 @@
break;
case 2: /* fallback mode */
if (dynamic_kenv) {
- sx_slock(&kenv_lock);
+ mtx_lock(&kenv_lock);
cp = kenvp[0];
for (i = 0; cp != NULL; cp = kenvp[++i]) {
if (!strncmp(cp, "hint.", 5)) {
@@ -81,7 +81,7 @@
break;
}
}
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
} else {
cp = kern_envp;
while (cp) {
@@ -114,11 +114,11 @@
}
if (use_kenv) {
- sx_slock(&kenv_lock);
+ mtx_lock(&kenv_lock);
i = 0;
cp = kenvp[0];
if (cp == NULL) {
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
return (ENOENT);
}
} else
@@ -165,7 +165,7 @@
}
}
if (use_kenv)
- sx_sunlock(&kenv_lock);
+ mtx_unlock(&kenv_lock);
if (cp == NULL)
return ENOENT;
Index: link_elf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/link_elf.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/link_elf.c -L sys/kern/link_elf.c -u -r1.1.1.2 -r1.2
--- sys/kern/link_elf.c
+++ sys/kern/link_elf.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/link_elf.c,v 1.81.8.5 2005/12/30 22:13:58 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/link_elf.c,v 1.93 2007/05/31 11:51:51 kib Exp $");
#include "opt_gdb.h"
#include "opt_mac.h"
@@ -37,9 +37,9 @@
#endif
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/fcntl.h>
@@ -48,6 +48,8 @@
#include <machine/elf.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_param.h>
#ifdef SPARSE_MAPPING
@@ -62,6 +64,8 @@
#include "linker_if.h"
+#define MAXSEGS 4
+
typedef struct elf_file {
struct linker_file lf; /* Common fields */
int preloaded; /* Was file pre-loaded */
@@ -302,9 +306,10 @@
#endif
(void)link_elf_link_common_finish(linker_kernel_file);
+ linker_kernel_file->flags |= LINKER_FILE_LINKED;
}
-SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0);
static int
link_elf_preload_parse_symbols(elf_file_t ef)
@@ -536,7 +541,7 @@
int nbytes, i;
Elf_Phdr *phdr;
Elf_Phdr *phlimit;
- Elf_Phdr *segs[2];
+ Elf_Phdr *segs[MAXSEGS];
int nsegs;
Elf_Phdr *phdyn;
Elf_Phdr *phphdr;
@@ -554,17 +559,17 @@
int symstrindex;
int symcnt;
int strcnt;
-
- GIANT_REQUIRED;
+ int vfslocked;
shdr = NULL;
lf = NULL;
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td);
flags = FREAD;
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error)
return error;
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
error = mac_check_kld_load(curthread->td_ucred, nd.ni_vp);
@@ -643,7 +648,7 @@
switch (phdr->p_type) {
case PT_LOAD:
- if (nsegs == 2) {
+ if (nsegs == MAXSEGS) {
link_elf_error("Too many sections");
error = ENOEXEC;
goto out;
@@ -676,8 +681,8 @@
error = ENOEXEC;
goto out;
}
- if (nsegs != 2) {
- link_elf_error("Too few sections");
+ if (nsegs == 0) {
+ link_elf_error("No sections");
error = ENOEXEC;
goto out;
}
@@ -688,7 +693,8 @@
*/
base_offset = trunc_page(segs[0]->p_offset);
base_vaddr = trunc_page(segs[0]->p_vaddr);
- base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
+ base_vlimit = round_page(segs[nsegs - 1]->p_vaddr +
+ segs[nsegs - 1]->p_memsz);
mapsize = base_vlimit - base_vaddr;
lf = linker_make_file(filename, &link_elf_class);
@@ -726,7 +732,7 @@
/*
* Read the text and data sections and zero the bss.
*/
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < nsegs; i++) {
caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
error = vn_rdwr(UIO_READ, nd.ni_vp,
segbase, segs[i]->p_filesz, segs[i]->p_offset,
@@ -755,8 +761,10 @@
#ifdef GPROF
/* Update profiling information with the new text segment. */
+ mtx_lock(&Giant);
kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
segs[0]->p_memsz));
+ mtx_unlock(&Giant);
#endif
ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
@@ -856,6 +864,7 @@
free(firstpage, M_LINKER);
VOP_UNLOCK(nd.ni_vp, 0, td);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
return error;
}
--- sys/kern/uipc_proto.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_proto.c,v 1.24.8.1 2005/11/16 10:31:21 ru Exp $");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-#include <sys/socket.h>
-#include <sys/queue.h>
-#include <sys/sysctl.h>
-#include <sys/un.h>
-
-#include <net/raw_cb.h>
-
-/*
- * Definitions of protocols supported in the LOCAL domain.
- */
-
-static struct protosw localsw[] = {
-{
- .pr_type = SOCK_STREAM,
- .pr_domain = &localdomain,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
- .pr_ctloutput = &uipc_ctloutput,
- .pr_usrreqs = &uipc_usrreqs
-},
-{
- .pr_type = SOCK_DGRAM,
- .pr_domain = &localdomain,
- .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS,
- .pr_usrreqs = &uipc_usrreqs
-},
-{
- .pr_ctlinput = raw_ctlinput,
- .pr_init = raw_init,
- .pr_usrreqs = &raw_usrreqs
-}
-};
-
-struct domain localdomain = {
- .dom_family = AF_LOCAL,
- .dom_name = "local",
- .dom_init = unp_init,
- .dom_externalize = unp_externalize,
- .dom_dispose = unp_dispose,
- .dom_protosw = localsw,
- .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])]
-};
-DOMAIN_SET(local);
-
-SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
-SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
-SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
Index: md5c.c
===================================================================
RCS file: /home/cvs/src/sys/kern/md5c.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/md5c.c -L sys/kern/md5c.c -u -r1.1.1.1 -r1.2
--- sys/kern/md5c.c
+++ sys/kern/md5c.c
@@ -30,7 +30,7 @@
* This file should be kept in sync with src/lib/libmd/md5c.c
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/md5c.c,v 1.25 2005/02/10 12:20:42 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/md5c.c,v 1.27 2006/03/30 18:45:50 pjd Exp $");
#include <sys/types.h>
@@ -60,10 +60,15 @@
Encode (unsigned char *output, u_int32_t *input, unsigned int len)
{
unsigned int i;
- u_int32_t *op = (u_int32_t *)output;
+ uint32_t ip;
- for (i = 0; i < len / 4; i++)
- op[i] = htole32(input[i]);
+ for (i = 0; i < len / 4; i++) {
+ ip = input[i];
+ *output++ = ip;
+ *output++ = ip >> 8;
+ *output++ = ip >> 16;
+ *output++ = ip >> 24;
+ }
}
/*
@@ -75,10 +80,11 @@
Decode (u_int32_t *output, const unsigned char *input, unsigned int len)
{
unsigned int i;
- const u_int32_t *ip = (const u_int32_t *)input;
- for (i = 0; i < len / 4; i++)
- output[i] = le32dec(&ip[i]);
+ for (i = 0; i < len; i += 4) {
+ *output++ = input[i] | (input[i+1] << 8) | (input[i+2] << 16) |
+ (input[i+3] << 24);
+ }
}
#endif
@@ -145,12 +151,13 @@
*/
void
-MD5Update (context, input, inputLen)
+MD5Update (context, in, inputLen)
MD5_CTX *context;
- const unsigned char *input;
+ const void *in;
unsigned int inputLen;
{
unsigned int i, index, partLen;
+ const unsigned char *input = in;
/* Compute number of bytes mod 64 */
index = (unsigned int)((context->count[0] >> 3) & 0x3F);
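The Encode()/Decode() rewrite above replaces the u_int32_t pointer casts with byte-at-a-time stores and loads, which stays correct on strict-alignment and big-endian machines alike. A small self-contained sketch of the same idea, with hypothetical names (not the kernel routines):

#include <stdint.h>
#include <stddef.h>

static void
le32enc_buf(unsigned char *out, const uint32_t *in, size_t len)
{
	size_t i;
	uint32_t v;

	for (i = 0; i < len / 4; i++) {
		v = in[i];
		*out++ = v & 0xff;		/* least significant byte first */
		*out++ = (v >> 8) & 0xff;
		*out++ = (v >> 16) & 0xff;
		*out++ = (v >> 24) & 0xff;
	}
}

static void
le32dec_buf(uint32_t *out, const unsigned char *in, size_t len)
{
	size_t i;

	for (i = 0; i < len; i += 4)
		*out++ = (uint32_t)in[i] | ((uint32_t)in[i + 1] << 8) |
		    ((uint32_t)in[i + 2] << 16) | ((uint32_t)in[i + 3] << 24);
}
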
Index: kern_ntptime.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_ntptime.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_ntptime.c -L sys/kern/kern_ntptime.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_ntptime.c
+++ sys/kern/kern_ntptime.c
@@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.59 2005/05/28 14:34:41 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.64 2007/06/14 18:37:58 rwatson Exp $");
#include "opt_ntp.h"
@@ -39,6 +39,7 @@
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -248,9 +249,8 @@
/*
* ntp_gettime() - NTP user application interface
*
- * See the timex.h header file for synopsis and API description. Note
- * that the TAI offset is returned in the ntvtimeval.tai structure
- * member.
+ * See the timex.h header file for synopsis and API description. Note that
+ * the TAI offset is returned in the ntvtimeval.tai structure member.
*/
#ifndef _SYS_SYSPROTO_H_
struct ntp_gettime_args {
@@ -267,6 +267,7 @@
ntp_gettime1(&ntv);
mtx_unlock(&Giant);
+ td->td_retval[0] = ntv.time_state;
return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
}
@@ -292,12 +293,13 @@
SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", "");
SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", "");
#endif
+
/*
* ntp_adjtime() - NTP daemon application interface
*
- * See the timex.h header file for synopsis and API description. Note
- * that the timex.constant structure member has a dual purpose to set
- * the time constant and to set the TAI offset.
+ * See the timex.h header file for synopsis and API description. Note that
+ * the timex.constant structure member has a dual purpose to set the time
+ * constant and to set the TAI offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct ntp_adjtime_args {
@@ -305,9 +307,6 @@
};
#endif
-/*
- * MPSAFE
- */
int
ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
{
@@ -333,7 +332,7 @@
mtx_lock(&Giant);
modes = ntv.modes;
if (modes)
- error = suser(td);
+ error = priv_check(td, PRIV_NTP_ADJTIME);
if (error)
goto done2;
s = splclock();
@@ -925,9 +924,6 @@
struct timeval *olddelta;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
adjtime(struct thread *td, struct adjtime_args *uap)
@@ -954,9 +950,6 @@
struct timeval atv;
int error;
- if ((error = suser(td)))
- return (error);
-
mtx_lock(&Giant);
if (olddelta) {
atv.tv_sec = time_adjtime / 1000000;
@@ -967,10 +960,15 @@
}
*olddelta = atv;
}
- if (delta)
+ if (delta) {
+ if ((error = priv_check(td, PRIV_ADJTIME))) {
+ mtx_unlock(&Giant);
+ return (error);
+ }
time_adjtime = (int64_t)delta->tv_sec * 1000000 +
delta->tv_usec;
+ }
mtx_unlock(&Giant);
- return (error);
+ return (0);
}
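With the change above, adjtime() performs the PRIV_ADJTIME check only when a new delta is supplied, so querying the outstanding correction with a NULL delta no longer needs privilege. A short userland sketch of both uses (standalone, error handling kept minimal):

#include <sys/time.h>
#include <stdio.h>

static void
adjtime_example(void)
{
	struct timeval slew, old;

	/* Query only: reads the outstanding correction. */
	if (adjtime(NULL, &old) == 0)
		printf("outstanding: %ld.%06ld s\n",
		    (long)old.tv_sec, (long)old.tv_usec);

	/* Setting a slew still hits the privilege check added above. */
	slew.tv_sec = 0;
	slew.tv_usec = 50000;	/* ask the kernel to slew by 50 ms */
	if (adjtime(&slew, NULL) != 0)
		perror("adjtime");
}
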
Index: imgact_aout.c
===================================================================
RCS file: /home/cvs/src/sys/kern/imgact_aout.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/imgact_aout.c -L sys/kern/imgact_aout.c -u -r1.2 -r1.3
--- sys/kern/imgact_aout.c
+++ sys/kern/imgact_aout.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/imgact_aout.c,v 1.99.2.1 2006/03/16 00:25:31 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/imgact_aout.c,v 1.101.4.1 2008/01/19 18:15:05 kib Exp $");
#include <sys/param.h>
#include <sys/exec.h>
@@ -198,9 +198,11 @@
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, &aout_sysvec);
+ error = exec_new_vmspace(imgp, &aout_sysvec);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (error)
+ return (error);
/*
* The vm space can be changed by exec_new_vmspace
@@ -220,6 +222,7 @@
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
vm_map_unlock(map);
+ vm_object_deallocate(object);
return (error);
}
data_end = text_end + a_out->a_data;
@@ -232,6 +235,7 @@
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
vm_map_unlock(map);
+ vm_object_deallocate(object);
return (error);
}
}
Index: subr_prof.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_prof.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_prof.c -L sys/kern/subr_prof.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_prof.c
+++ sys/kern/subr_prof.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_prof.c,v 1.75 2005/03/02 21:33:27 joerg Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_prof.c,v 1.79 2007/06/05 00:00:54 jeff Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -402,9 +402,6 @@
u_int scale;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
profil(td, uap)
@@ -426,12 +423,12 @@
}
PROC_LOCK(p);
upp = &td->td_proc->p_stats->p_prof;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
upp->pr_off = uap->offset;
upp->pr_scale = uap->scale;
upp->pr_base = uap->samples;
upp->pr_size = uap->size;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
startprofclock(p);
PROC_UNLOCK(p);
@@ -461,7 +458,7 @@
* inaccurate.
*/
void
-addupc_intr(struct thread *td, uintptr_t pc, u_int ticks)
+addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks)
{
struct uprof *prof;
caddr_t addr;
@@ -471,22 +468,22 @@
if (ticks == 0)
return;
prof = &td->td_proc->p_stats->p_prof;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(td->td_proc);
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(td->td_proc);
return; /* out of range; ignore */
}
addr = prof->pr_base + i;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(td->td_proc);
if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
td->td_profil_addr = pc;
td->td_profil_ticks = ticks;
td->td_pflags |= TDP_OWEUPC;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags |= TDF_ASTPENDING;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
@@ -495,7 +492,7 @@
* update fails, we simply turn off profiling.
*/
void
-addupc_task(struct thread *td, uintptr_t pc, u_int ticks)
+addupc_task(struct thread *td, uintfptr_t pc, u_int ticks)
{
struct proc *p = td->td_proc;
struct uprof *prof;
@@ -514,12 +511,15 @@
}
p->p_profthreads++;
prof = &p->p_stats->p_prof;
+ PROC_SLOCK(p);
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
+ PROC_SUNLOCK(p);
goto out;
}
addr = prof->pr_base + i;
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (copyin(addr, &v, sizeof(v)) == 0) {
v += ticks;
Index: kern_thread.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_thread.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -L sys/kern/kern_thread.c -L sys/kern/kern_thread.c -u -r1.4 -r1.5
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.216.2.2 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.255.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -44,41 +44,39 @@
#include <sys/ktr.h>
#include <sys/umtx.h>
+#include <security/audit/audit.h>
+
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
+#include <sys/eventhandler.h>
/*
- * KSEGRP related storage.
+ * thread related storage.
*/
-static uma_zone_t ksegrp_zone;
static uma_zone_t thread_zone;
-/* DEBUG ONLY */
SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
-static int thread_debug = 0;
-SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
- &thread_debug, 0, "thread debug");
int max_threads_per_proc = 1500;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
&max_threads_per_proc, 0, "Limit on threads per proc");
-int max_groups_per_proc = 1500;
-SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
- &max_groups_per_proc, 0, "Limit on thread groups per proc");
-
int max_threads_hits;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
&max_threads_hits, 0, "");
+#ifdef KSE
int virtual_cpu;
+#endif
TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
-TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
-struct mtx kse_zombie_lock;
-MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
+static struct mtx zombie_lock;
+MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
+
+static void thread_zombie(struct thread *);
+#ifdef KSE
static int
sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
{
@@ -103,6 +101,7 @@
SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
"debug virtual cpus");
+#endif
struct mtx tid_lock;
static struct unrhdr *tid_unrhdr;
@@ -120,20 +119,19 @@
td->td_oncpu = NOCPU;
td->td_tid = alloc_unr(tid_unrhdr);
+ td->td_syscalls = 0;
/*
* Note that td_critnest begins life as 1 because the thread is not
* running and is thereby implicitly waiting to be on the receiving
- * end of a context switch. A context switch must occur inside a
- * critical section, and in fact, includes hand-off of the sched_lock.
- * After a context switch to a newly created thread, it will release
- * sched_lock for the first time, and its td_critnest will hit 0 for
- * the first time. This happens on the far end of a context switch,
- * and when it context switches away from itself, it will in fact go
- * back into a critical section, and hand off the sched lock to the
- * next thread.
+ * end of a context switch.
*/
td->td_critnest = 1;
+ EVENTHANDLER_INVOKE(thread_ctor, td);
+#ifdef AUDIT
+ audit_thread_alloc(td);
+#endif
+ umtx_thread_alloc(td);
return (0);
}
@@ -167,7 +165,10 @@
/* NOTREACHED */
}
#endif
-
+#ifdef AUDIT
+ audit_thread_free(td);
+#endif
+ EVENTHANDLER_INVOKE(thread_dtor, td);
free_unr(tid_unrhdr, td->td_tid);
sched_newthread(td);
}
@@ -182,13 +183,13 @@
td = (struct thread *)mem;
- vm_thread_new(td, 0);
- cpu_thread_setup(td);
td->td_sleepqueue = sleepq_alloc();
td->td_turnstile = turnstile_alloc();
- td->td_umtxq = umtxq_alloc();
+ EVENTHANDLER_INVOKE(thread_init, td);
td->td_sched = (struct td_sched *)&td[1];
sched_newthread(td);
+ umtx_thread_init(td);
+ td->td_kstack = 0;
return (0);
}
@@ -201,66 +202,10 @@
struct thread *td;
td = (struct thread *)mem;
+ EVENTHANDLER_INVOKE(thread_fini, td);
turnstile_free(td->td_turnstile);
sleepq_free(td->td_sleepqueue);
- umtxq_free(td->td_umtxq);
- vm_thread_dispose(td);
-}
-
-/*
- * Initialize type-stable parts of a ksegrp (when newly created).
- */
-static int
-ksegrp_ctor(void *mem, int size, void *arg, int flags)
-{
- struct ksegrp *kg;
-
- kg = (struct ksegrp *)mem;
- bzero(mem, size);
- kg->kg_sched = (struct kg_sched *)&kg[1];
- return (0);
-}
-
-void
-ksegrp_link(struct ksegrp *kg, struct proc *p)
-{
-
- TAILQ_INIT(&kg->kg_threads);
- TAILQ_INIT(&kg->kg_runq); /* links with td_runq */
- TAILQ_INIT(&kg->kg_upcalls); /* all upcall structure in ksegrp */
- kg->kg_proc = p;
- /*
- * the following counters are in the -zero- section
- * and may not need clearing
- */
- kg->kg_numthreads = 0;
- kg->kg_numupcalls = 0;
- /* link it in now that it's consistent */
- p->p_numksegrps++;
- TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
-}
-
-/*
- * Called from:
- * thread-exit()
- */
-void
-ksegrp_unlink(struct ksegrp *kg)
-{
- struct proc *p;
-
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
- KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
-
- p = kg->kg_proc;
- TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
- p->p_numksegrps--;
- /*
- * Aggregate stats from the KSE
- */
- if (p->p_procscopegrp == kg)
- p->p_procscopegrp = NULL;
+ umtx_thread_fini(td);
}
/*
@@ -272,17 +217,28 @@
* proc_init()
*/
void
-proc_linkup(struct proc *p, struct ksegrp *kg, struct thread *td)
+proc_linkup0(struct proc *p, struct thread *td)
{
-
- TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */
TAILQ_INIT(&p->p_threads); /* all threads in proc */
- TAILQ_INIT(&p->p_suspended); /* Threads suspended */
- p->p_numksegrps = 0;
- p->p_numthreads = 0;
+ proc_linkup(p, td);
+}
- ksegrp_link(kg, p);
- thread_link(td, kg);
+void
+proc_linkup(struct proc *p, struct thread *td)
+{
+
+#ifdef KSE
+ TAILQ_INIT(&p->p_upcalls); /* upcall list */
+#endif
+ sigqueue_init(&p->p_sigqueue, p);
+ p->p_ksi = ksiginfo_alloc(1);
+ if (p->p_ksi != NULL) {
+ /* XXX p_ksi may be null if ksiginfo zone is not ready */
+ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
+ }
+ LIST_INIT(&p->p_mqnotifier);
+ p->p_numthreads = 0;
+ thread_link(td, p);
}
/*
@@ -297,33 +253,32 @@
thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
thread_ctor, thread_dtor, thread_init, thread_fini,
- UMA_ALIGN_CACHE, 0);
- ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
- ksegrp_ctor, NULL, NULL, NULL,
- UMA_ALIGN_CACHE, 0);
+ 16 - 1, 0);
+#ifdef KSE
kseinit(); /* set up kse specific stuff e.g. upcall zone*/
+#endif
}
/*
- * Stash an embarasingly extra thread into the zombie thread queue.
+ * Place an unused thread on the zombie list.
+ * Use the slpq as that must be unused by now.
*/
void
-thread_stash(struct thread *td)
+thread_zombie(struct thread *td)
{
- mtx_lock_spin(&kse_zombie_lock);
- TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
- mtx_unlock_spin(&kse_zombie_lock);
+ mtx_lock_spin(&zombie_lock);
+ TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
+ mtx_unlock_spin(&zombie_lock);
}
/*
- * Stash an embarasingly extra ksegrp into the zombie ksegrp queue.
+ * Release a thread that has exited after cpu_throw().
*/
void
-ksegrp_stash(struct ksegrp *kg)
+thread_stash(struct thread *td)
{
- mtx_lock_spin(&kse_zombie_lock);
- TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
- mtx_unlock_spin(&kse_zombie_lock);
+ atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
+ thread_zombie(td);
}
/*
@@ -333,49 +288,28 @@
thread_reap(void)
{
struct thread *td_first, *td_next;
- struct ksegrp *kg_first, * kg_next;
/*
* Don't even bother to lock if none at this instant,
* we really don't care about the next instant..
*/
- if ((!TAILQ_EMPTY(&zombie_threads))
- || (!TAILQ_EMPTY(&zombie_ksegrps))) {
- mtx_lock_spin(&kse_zombie_lock);
+ if (!TAILQ_EMPTY(&zombie_threads)) {
+ mtx_lock_spin(&zombie_lock);
td_first = TAILQ_FIRST(&zombie_threads);
- kg_first = TAILQ_FIRST(&zombie_ksegrps);
if (td_first)
TAILQ_INIT(&zombie_threads);
- if (kg_first)
- TAILQ_INIT(&zombie_ksegrps);
- mtx_unlock_spin(&kse_zombie_lock);
+ mtx_unlock_spin(&zombie_lock);
while (td_first) {
- td_next = TAILQ_NEXT(td_first, td_runq);
+ td_next = TAILQ_NEXT(td_first, td_slpq);
if (td_first->td_ucred)
crfree(td_first->td_ucred);
thread_free(td_first);
td_first = td_next;
}
- while (kg_first) {
- kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
- ksegrp_free(kg_first);
- kg_first = kg_next;
- }
- /*
- * there will always be a thread on the list if one of these
- * is there.
- */
- kse_GC();
}
-}
-
-/*
- * Allocate a ksegrp.
- */
-struct ksegrp *
-ksegrp_alloc(void)
-{
- return (uma_zalloc(ksegrp_zone, M_WAITOK));
+#ifdef KSE
+ upcall_reap();
+#endif
}
/*
@@ -384,19 +318,21 @@
struct thread *
thread_alloc(void)
{
+ struct thread *td;
+
thread_reap(); /* check if any zombies to get */
- return (uma_zalloc(thread_zone, M_WAITOK));
-}
-/*
- * Deallocate a ksegrp.
- */
-void
-ksegrp_free(struct ksegrp *td)
-{
- uma_zfree(ksegrp_zone, td);
+ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
+ KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
+ if (!vm_thread_new(td, 0)) {
+ uma_zfree(thread_zone, td);
+ return (NULL);
+ }
+ cpu_thread_setup(td);
+ return (td);
}
+
/*
* Deallocate a thread.
*/
@@ -405,6 +341,10 @@
{
cpu_thread_clean(td);
+ if (td->td_altkstack != 0)
+ vm_thread_dispose_altkstack(td);
+ if (td->td_kstack != 0)
+ vm_thread_dispose(td);
uma_zfree(thread_zone, td);
}
@@ -433,38 +373,48 @@
* exit1()
* kse_exit()
* thr_exit()
+ * ifdef KSE
* thread_user_enter()
* thread_userret()
+ * endif
* thread_suspend_check()
*/
void
thread_exit(void)
{
- struct bintime new_switchtime;
+ uint64_t new_switchtime;
struct thread *td;
+ struct thread *td2;
struct proc *p;
- struct ksegrp *kg;
td = curthread;
- kg = td->td_ksegrp;
p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
mtx_assert(&Giant, MA_NOTOWNED);
+
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(p != NULL, ("thread exiting without a process"));
- KASSERT(kg != NULL, ("thread exiting without a kse group"));
CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
(long)p->p_pid, p->p_comm);
+ KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
+#ifdef AUDIT
+ AUDIT_SYSCALL_EXIT(0, td);
+#endif
+
+#ifdef KSE
if (td->td_standin != NULL) {
/*
* Note that we don't need to free the cred here as it
* is done in thread_reap().
*/
- thread_stash(td->td_standin);
+ thread_zombie(td->td_standin);
td->td_standin = NULL;
}
+#endif
+
+ umtx_thread_exit(td);
/*
* drop FPU & debug register state storage, or any other
@@ -473,24 +423,15 @@
*/
cpu_thread_exit(td); /* XXXSMP */
- /*
- * The thread is exiting. scheduler can release its stuff
- * and collect stats etc.
- */
- sched_thread_exit(td);
-
/* Do the same timestamp bookkeeping that mi_switch() would do. */
- binuptime(&new_switchtime);
- bintime_add(&p->p_rux.rux_runtime, &new_switchtime);
- bintime_sub(&p->p_rux.rux_runtime, PCPU_PTR(switchtime));
+ new_switchtime = cpu_ticks();
+ p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime));
PCPU_SET(switchtime, new_switchtime);
PCPU_SET(switchticks, ticks);
- cnt.v_swtch++;
-
- /* Add our usage into the usage of all our children. */
- if (p->p_numthreads == 1)
- ruadd(p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
-
+ PCPU_INC(cnt.v_swtch);
+ /* Save our resource usage in our process. */
+ td->td_ru.ru_nvcsw++;
+ rucollect(&p->p_ru, &td->td_ru);
/*
* The last thread is left attached to the process
* So that the whole bundle gets recycled. Skip
@@ -501,10 +442,15 @@
*/
if (p->p_flag & P_HADTHREADS) {
if (p->p_numthreads > 1) {
+ thread_lock(td);
+#ifdef KSE
+ kse_unlink(td);
+#else
thread_unlink(td);
-
- /* XXX first arg not used in 4BSD or ULE */
- sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
+#endif
+ thread_unlock(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td2, td);
/*
* The test below is NOT true if we are the
@@ -513,50 +459,13 @@
*/
if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
if (p->p_numthreads == p->p_suspcount) {
+ thread_lock(p->p_singlethread);
thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
}
}
- /*
- * Because each upcall structure has an owner thread,
- * owner thread exits only when process is in exiting
- * state, so upcall to userland is no longer needed,
- * deleting upcall structure is safe here.
- * So when all threads in a group is exited, all upcalls
- * in the group should be automatically freed.
- * XXXKSE This is a KSE thing and should be exported
- * there somehow.
- */
- upcall_remove(td);
-
- /*
- * If the thread we unlinked above was the last one,
- * then this ksegrp should go away too.
- */
- if (kg->kg_numthreads == 0) {
- /*
- * let the scheduler know about this in case
- * it needs to recover stats or resources.
- * Theoretically we could let
- * sched_exit_ksegrp() do the equivalent of
- * setting the concurrency to 0
- * but don't do it yet to avoid changing
- * the existing scheduler code until we
- * are ready.
- * We supply a random other ksegrp
- * as the recipient of any built up
- * cpu usage etc. (If the scheduler wants it).
- * XXXKSE
- * This is probably not fair so think of
- * a better answer.
- */
- sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
- sched_set_concurrency(kg, 0); /* XXX TEMP */
- ksegrp_unlink(kg);
- ksegrp_stash(kg);
- }
- PROC_UNLOCK(p);
- td->td_ksegrp = NULL;
+ atomic_add_int(&td->td_proc->p_exitthreads, 1);
PCPU_SET(deadthread, td);
} else {
/*
@@ -566,23 +475,23 @@
* exit1() - clears threading flags before coming here
* kse_exit() - treats last thread specially
* thr_exit() - treats last thread specially
+ * ifdef KSE
* thread_user_enter() - only if more exist
* thread_userret() - only if more exist
+ * endif
* thread_suspend_check() - only if more exist
*/
panic ("thread_exit: Last thread exiting on its own");
}
- } else {
- /*
- * non threaded process comes here.
- * This includes an EX threaded process that is coming
- * here via exit1(). (exit1 dethreads the proc first).
- */
- PROC_UNLOCK(p);
- }
+ }
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ /* Save our tick information with both the thread and proc locked */
+ ruxagg(&p->p_rux, td);
+ PROC_SUNLOCK(p);
td->td_state = TDS_INACTIVE;
CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
- cpu_throw(td, choosethread());
+ sched_throw(td);
panic("I'm a teapot!");
/* NOTREACHED */
}
@@ -598,19 +507,25 @@
mtx_assert(&Giant, MA_NOTOWNED);
KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
- KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
- FOREACH_THREAD_IN_PROC(p, td) {
- if (td->td_standin != NULL) {
- if (td->td_standin->td_ucred != NULL) {
- crfree(td->td_standin->td_ucred);
- td->td_standin->td_ucred = NULL;
- }
- thread_free(td->td_standin);
- td->td_standin = NULL;
+ td = FIRST_THREAD_IN_PROC(p);
+#ifdef KSE
+ if (td->td_standin != NULL) {
+ if (td->td_standin->td_ucred != NULL) {
+ crfree(td->td_standin->td_ucred);
+ td->td_standin->td_ucred = NULL;
}
- cpu_thread_clean(td);
- crfree(td->td_ucred);
+ thread_free(td->td_standin);
+ td->td_standin = NULL;
}
+#endif
+ /* Lock the last thread so we spin until it exits cpu_throw(). */
+ thread_lock(td);
+ thread_unlock(td);
+ /* Wait for any remaining threads to exit cpu_throw(). */
+ while (p->p_exitthreads)
+ sched_relinquish(curthread);
+ cpu_thread_clean(td);
+ crfree(td->td_ucred);
thread_reap(); /* check for zombie threads etc. */
}
@@ -627,23 +542,23 @@
* thr_create()
*/
void
-thread_link(struct thread *td, struct ksegrp *kg)
+thread_link(struct thread *td, struct proc *p)
{
- struct proc *p;
- p = kg->kg_proc;
+ /*
+ * XXX This can't be enabled because it's called for proc0 before
+ * it's spinlock has been created.
+ * PROC_SLOCK_ASSERT(p, MA_OWNED);
+ */
td->td_state = TDS_INACTIVE;
td->td_proc = p;
- td->td_ksegrp = kg;
- td->td_flags = 0;
- td->td_kflags = 0;
+ td->td_flags = TDF_INMEM;
LIST_INIT(&td->td_contested);
+ sigqueue_init(&td->td_sigqueue, p);
callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
- TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
p->p_numthreads++;
- kg->kg_numthreads++;
}
/*
@@ -658,15 +573,20 @@
struct proc *p = td->td_proc;
KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads"));
+#ifdef KSE
+ thread_lock(td);
upcall_remove(td);
+ thread_unlock(td);
p->p_flag &= ~(P_SA|P_HADTHREADS);
td->td_mailbox = NULL;
td->td_pflags &= ~(TDP_SA | TDP_CAN_UNBIND);
if (td->td_standin != NULL) {
- thread_stash(td->td_standin);
+ thread_zombie(td->td_standin);
td->td_standin = NULL;
}
- sched_set_concurrency(td->td_ksegrp, 1);
+#else
+ p->p_flag &= ~P_HADTHREADS;
+#endif
}
/*
@@ -677,15 +597,12 @@
thread_unlink(struct thread *td)
{
struct proc *p = td->td_proc;
- struct ksegrp *kg = td->td_ksegrp;
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
TAILQ_REMOVE(&p->p_threads, td, td_plist);
p->p_numthreads--;
- TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
- kg->kg_numthreads--;
/* could clear a few other things here */
- /* Must NOT clear links to proc and ksegrp! */
+ /* Must NOT clear links to proc! */
}
/*
@@ -698,7 +615,7 @@
* There are no threads in user mode. Threads in the kernel must be
* allowed to continue until they get to the user boundary. They may even
* copy out their return values and data before suspending. They may however be
- * accellerated in reaching the user boundary as we will wake up
+ * accelerated in reaching the user boundary as we will wake up
* any sleeping threads that are interruptable. (PCATCH).
*/
int
@@ -733,7 +650,7 @@
p->p_flag &= ~P_SINGLE_BOUNDARY;
}
p->p_flag |= P_STOPPED_SINGLE;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
p->p_singlethread = td;
if (mode == SINGLE_EXIT)
remaining = p->p_numthreads;
@@ -747,6 +664,7 @@
FOREACH_THREAD_IN_PROC(p, td2) {
if (td2 == td)
continue;
+ thread_lock(td2);
td2->td_flags |= TDF_ASTPENDING;
if (TD_IS_INHIBITED(td2)) {
switch (mode) {
@@ -768,10 +686,12 @@
sleepq_abort(td2, ERESTART);
break;
default:
- if (TD_IS_SUSPENDED(td2))
+ if (TD_IS_SUSPENDED(td2)) {
+ thread_unlock(td2);
continue;
+ }
/*
- * maybe other inhibitted states too?
+ * maybe other inhibited states too?
*/
if ((td2->td_flags & TDF_SINTR) &&
(td2->td_inhibitors &
@@ -785,6 +705,7 @@
forward_signal(td2);
}
#endif
+ thread_unlock(td2);
}
if (mode == SINGLE_EXIT)
remaining = p->p_numthreads;
@@ -804,13 +725,7 @@
* Wake us up when everyone else has suspended.
* In the mean time we suspend as well.
*/
- thread_stopped(p);
- thread_suspend_one(td);
- PROC_UNLOCK(p);
- mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
- PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ thread_suspend_switch(td);
if (mode == SINGLE_EXIT)
remaining = p->p_numthreads;
else if (mode == SINGLE_BOUNDARY)
@@ -829,7 +744,7 @@
p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
thread_unthread(td);
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
return (0);
}
@@ -898,7 +813,11 @@
(p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
return (ERESTART);
- mtx_lock_spin(&sched_lock);
+ /* If thread will exit, flush its pending signals */
+ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td))
+ sigqueue_flush(&td->td_sigqueue);
+
+ PROC_SLOCK(p);
thread_stopped(p);
/*
* If the process is waiting for us to exit,
@@ -907,44 +826,75 @@
*/
if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td))
thread_exit();
-
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ if (p->p_numthreads == p->p_suspcount + 1) {
+ thread_lock(p->p_singlethread);
+ thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ }
+ }
+ PROC_UNLOCK(p);
+ thread_lock(td);
/*
* When a thread suspends, it just
- * moves to the processes's suspend queue
- * and stays there.
+ * gets taken off all queues.
*/
thread_suspend_one(td);
if (return_instead == 0) {
p->p_boundary_count++;
td->td_flags |= TDF_BOUNDARY;
}
- if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
- if (p->p_numthreads == p->p_suspcount)
- thread_unsuspend_one(p->p_singlethread);
- }
- PROC_UNLOCK(p);
+ PROC_SUNLOCK(p);
mi_switch(SW_INVOL, NULL);
- if (return_instead == 0) {
- p->p_boundary_count--;
+ if (return_instead == 0)
td->td_flags &= ~TDF_BOUNDARY;
- }
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
PROC_LOCK(p);
+ if (return_instead == 0)
+ p->p_boundary_count--;
}
return (0);
}
void
+thread_suspend_switch(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+ KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ /*
+ * We implement thread_suspend_one in stages here to avoid
+ * dropping the proc lock while the thread lock is owned.
+ */
+ thread_stopped(p);
+ p->p_suspcount++;
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ sched_sleep(td);
+ TD_SET_SUSPENDED(td);
+ PROC_SUNLOCK(p);
+ DROP_GIANT();
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(td);
+ PICKUP_GIANT();
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+}
+
+void
thread_suspend_one(struct thread *td)
{
struct proc *p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
- PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
p->p_suspcount++;
+ sched_sleep(td);
TD_SET_SUSPENDED(td);
- TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
}
void
@@ -952,9 +902,9 @@
{
struct proc *p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
- PROC_LOCK_ASSERT(p, MA_OWNED);
- TAILQ_REMOVE(&p->p_suspended, td, td_runq);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
TD_CLR_SUSPENDED(td);
p->p_suspcount--;
setrunnable(td);
@@ -968,11 +918,15 @@
{
struct thread *td;
- mtx_assert(&sched_lock, MA_OWNED);
PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
if (!P_SHOULDSTOP(p)) {
- while ((td = TAILQ_FIRST(&p->p_suspended))) {
- thread_unsuspend_one(td);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_IS_SUSPENDED(td)) {
+ thread_unsuspend_one(td);
+ }
+ thread_unlock(td);
}
} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
(p->p_numthreads == p->p_suspcount)) {
@@ -981,7 +935,9 @@
* threading request. Now we've downgraded to single-threaded,
* let it continue.
*/
+ thread_lock(p->p_singlethread);
thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
}
}
@@ -998,9 +954,8 @@
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
p->p_singlethread = NULL;
- p->p_procscopegrp = NULL;
/*
* If there are other threads they mey now run,
* unless of course there is a blanket 'stop order'
@@ -1008,33 +963,28 @@
* to continue however as this is a bad place to stop.
*/
if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
- while ((td = TAILQ_FIRST(&p->p_suspended))) {
- thread_unsuspend_one(td);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_IS_SUSPENDED(td)) {
+ thread_unsuspend_one(td);
+ }
+ thread_unlock(td);
}
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
-/*
- * Called before going into an interruptible sleep to see if we have been
- * interrupted or requested to exit.
- */
-int
-thread_sleep_check(struct thread *td)
+struct thread *
+thread_find(struct proc *p, lwpid_t tid)
{
- struct proc *p;
+ struct thread *td;
- p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
- if (p->p_flag & P_HADTHREADS) {
- if (p->p_singlethread != td) {
- if (p->p_flag & P_SINGLE_EXIT)
- return (EINTR);
- if (p->p_flag & P_SINGLE_BOUNDARY)
- return (ERESTART);
- }
- if (td->td_flags & TDF_INTERRUPT)
- return (td->td_intrval);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_tid == tid)
+ break;
}
- return (0);
+ PROC_SUNLOCK(p);
+ return (td);
}
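The thread_zombie()/thread_reap() rework above keeps the existing shape: exiting threads are stashed on a spin-locked list and freed later in a batch, with the lock held only long enough to detach the list. A generic sketch of that batch-reap shape (illustrative names, pthread mutex standing in for the spin lock):

#include <sys/queue.h>
#include <pthread.h>
#include <stdlib.h>

struct zitem {
	TAILQ_ENTRY(zitem) link;
};

static TAILQ_HEAD(, zitem) zombies = TAILQ_HEAD_INITIALIZER(zombies);
static pthread_mutex_t zlock = PTHREAD_MUTEX_INITIALIZER;

static void
reap(void)
{
	struct zitem *it, *next;

	/* Cheap unlocked emptiness check, as thread_reap() does. */
	if (TAILQ_EMPTY(&zombies))
		return;
	pthread_mutex_lock(&zlock);
	it = TAILQ_FIRST(&zombies);
	if (it != NULL)
		TAILQ_INIT(&zombies);	/* detach the whole batch */
	pthread_mutex_unlock(&zlock);
	while (it != NULL) {		/* free with no lock held */
		next = TAILQ_NEXT(it, link);
		free(it);
		it = next;
	}
}
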
Index: kern_module.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_module.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_module.c -L sys/kern/kern_module.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_module.c
+++ sys/kern/kern_module.c
@@ -27,7 +27,7 @@
#include "opt_compat.h"
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_module.c,v 1.48 2005/02/18 22:14:40 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_module.c,v 1.52.2.1 2007/12/19 20:37:53 jhb Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -99,10 +99,12 @@
if (arg2 & RB_NOSYNC)
return;
+ mtx_lock(&Giant);
MOD_SLOCK;
TAILQ_FOREACH(mod, &modules, link)
MOD_EVENT(mod, MOD_SHUTDOWN);
MOD_SUNLOCK;
+ mtx_unlock(&Giant);
}
void
@@ -112,6 +114,7 @@
int error;
module_t mod;
+ mtx_lock(&Giant);
MOD_SLOCK;
mod = module_lookupbyname(data->name);
if (mod == NULL)
@@ -128,6 +131,7 @@
" %d\n", data->name, (void *)data->evhand, data->priv,
error);
}
+ mtx_unlock(&Giant);
}
int
@@ -136,20 +140,20 @@
size_t namelen;
module_t newmod;
- MOD_SLOCK;
+ MOD_XLOCK;
newmod = module_lookupbyname(data->name);
if (newmod != NULL) {
- MOD_SUNLOCK;
+ MOD_XUNLOCK;
printf("module_register: module %s already exists!\n",
data->name);
return (EEXIST);
}
- MOD_SUNLOCK;
namelen = strlen(data->name) + 1;
newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
- if (newmod == NULL)
+ if (newmod == NULL) {
+ MOD_XUNLOCK;
return (ENOMEM);
- MOD_XLOCK;
+ }
newmod->refs = 1;
newmod->id = nextid++;
newmod->name = (char *)(newmod + 1);
@@ -232,12 +236,14 @@
{
int error;
+ mtx_lock(&Giant);
error = MOD_EVENT(mod, MOD_QUIESCE);
if (error == EOPNOTSUPP || error == EINVAL)
error = 0;
- if (flags == LINKER_UNLOAD_NORMAL && error != 0)
- return (error);
- return (MOD_EVENT(mod, MOD_UNLOAD));
+ if (error == 0 || flags == LINKER_UNLOAD_FORCE)
+ error = MOD_EVENT(mod, MOD_UNLOAD);
+ mtx_unlock(&Giant);
+ return (error);
}
int
@@ -264,12 +270,16 @@
mod->data = *datap;
}
+linker_file_t
+module_file(module_t mod)
+{
+
+ return (mod->file);
+}
+
/*
* Syscalls.
*/
-/*
- * MPSAFE
- */
int
modnext(struct thread *td, struct modnext_args *uap)
{
@@ -301,9 +311,6 @@
return (error);
}
-/*
- * MPSAFE
- */
int
modfnext(struct thread *td, struct modfnext_args *uap)
{
@@ -334,9 +341,6 @@
int id;
};
-/*
- * MPSAFE
- */
int
modstat(struct thread *td, struct modstat_args *uap)
{
@@ -390,9 +394,6 @@
return (error);
}
-/*
- * MPSAFE
- */
int
modfind(struct thread *td, struct modfind_args *uap)
{
@@ -415,6 +416,7 @@
#ifdef COMPAT_IA32
#include <sys/mount.h>
+#include <sys/socket.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
@@ -434,9 +436,6 @@
modspecific32_t data;
};
-/*
- * MPSAFE
- */
int
freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap)
{
Index: subr_kdb.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_kdb.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_kdb.c -L sys/kern/subr_kdb.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_kdb.c
+++ sys/kern/subr_kdb.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_kdb.c,v 1.12.2.1 2005/10/02 10:06:15 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_kdb.c,v 1.24 2007/09/17 05:27:20 jeff Exp $");
#include "opt_kdb.h"
@@ -42,18 +42,10 @@
#include <machine/kdb.h>
#include <machine/pcb.h>
-#ifdef KDB_STOP_NMI
+#ifdef SMP
#include <machine/smp.h>
#endif
-/*
- * KDB_STOP_NMI requires SMP to pick up the right dependencies
- * (And isn't useful on UP anyway)
- */
-#if defined(KDB_STOP_NMI) && !defined(SMP)
-#error "options KDB_STOP_NMI" requires "options SMP"
-#endif
-
int kdb_active = 0;
void *kdb_jmpbufp = NULL;
struct kdb_dbbe *kdb_dbbe = NULL;
@@ -68,6 +60,9 @@
static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS);
static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS);
static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS);
SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW, NULL, "KDB nodes");
@@ -80,6 +75,15 @@
SYSCTL_PROC(_debug_kdb, OID_AUTO, enter, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
kdb_sysctl_enter, "I", "set to enter the debugger");
+SYSCTL_PROC(_debug_kdb, OID_AUTO, panic, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
+ kdb_sysctl_panic, "I", "set to panic the kernel");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
+ kdb_sysctl_trap, "I", "set to cause a page fault via data access");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
+ kdb_sysctl_trap_code, "I", "set to cause a page fault via code access");
+
/*
* Flag indicating whether or not to IPI the other CPUs to stop them on
* entering the debugger. Sometimes, this will result in a deadlock as
@@ -89,21 +93,8 @@
#ifdef SMP
static int kdb_stop_cpus = 1;
SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus, CTLTYPE_INT | CTLFLAG_RW,
- &kdb_stop_cpus, 0, "");
+ &kdb_stop_cpus, 0, "stop other CPUs when entering the debugger");
TUNABLE_INT("debug.kdb.stop_cpus", &kdb_stop_cpus);
-
-#ifdef KDB_STOP_NMI
-/*
- * Provide an alternate method of stopping other CPUs. If another CPU has
- * disabled interrupts the conventional STOP IPI will be blocked. This
- * NMI-based stop should get through in that case.
- */
-static int kdb_stop_cpus_with_nmi = 1;
-SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
- &kdb_stop_cpus_with_nmi, 0, "");
-TUNABLE_INT("debug.kdb.stop_cpus_with_nmi", &kdb_stop_cpus_with_nmi);
-#endif /* KDB_STOP_NMI */
-
#endif
static int
@@ -176,6 +167,55 @@
return (0);
}
+static int
+kdb_sysctl_panic(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ panic("kdb_sysctl_panic");
+ return (0);
+}
+
+static int
+kdb_sysctl_trap(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ int *addr = (int *)0x10;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ return (*addr);
+}
+
+static int
+kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ (*fp)(0x11111111, 0x22222222, 0x33333333);
+ return (0);
+}
+
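The three new handlers above deliberately panic or fault the kernel for debugger and crash-dump testing; they follow the same write-triggers-action pattern as the existing debug.kdb.enter node (the written value is ignored, only the write matters). A minimal userland sketch of poking such a node through sysctlbyname(3); which node you choose and the error handling are illustrative only:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>

int
main(void)
{
    int one = 1;

    /*
     * Any successful write triggers the action; debug.kdb.panic and
     * debug.kdb.trap work the same way but will take the machine down.
     * Only useful with a debugger backend or crash dumps configured.
     */
    if (sysctlbyname("debug.kdb.enter", NULL, NULL, &one, sizeof(one)) == -1)
        err(1, "sysctlbyname");
    return (0);
}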
/*
* Solaris implements a new BREAK which is initiated by a character sequence
* CR ~ ^b which is similar to a familiar pattern used on Sun servers by the
@@ -335,27 +375,22 @@
struct pcb *
kdb_thr_ctx(struct thread *thr)
-#ifdef KDB_STOP_NMI
{
- u_int cpuid;
- struct pcpu *pc;
-
- if (thr == curthread)
- return &kdb_pcb;
-
- SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
- cpuid = pc->pc_cpuid;
- if (pc->pc_curthread == thr && (atomic_load_acq_int(&stopped_cpus) & (1 << cpuid)))
- return &stoppcbs[cpuid];
- }
-
- return thr->td_pcb;
-}
-#else
-{
- return ((thr == curthread) ? &kdb_pcb : thr->td_pcb);
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+ struct pcpu *pc;
+#endif
+
+ if (thr == curthread)
+ return (&kdb_pcb);
+
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+ SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+ if (pc->pc_curthread == thr && (stopped_cpus & pc->pc_cpumask))
+ return (KDB_STOPPEDPCB(pc));
+ }
+#endif
+ return (thr->td_pcb);
}
-#endif /* KDB_STOP_NMI */
struct thread *
kdb_thr_first(void)
@@ -365,7 +400,7 @@
p = LIST_FIRST(&allproc);
while (p != NULL) {
- if (p->p_sflag & PS_INMEM) {
+ if (p->p_flag & P_INMEM) {
thr = FIRST_THREAD_IN_PROC(p);
if (thr != NULL)
return (thr);
@@ -382,7 +417,7 @@
p = LIST_FIRST(&allproc);
while (p != NULL) {
- if (p->p_sflag & PS_INMEM && p->p_pid == pid)
+ if (p->p_flag & P_INMEM && p->p_pid == pid)
return (FIRST_THREAD_IN_PROC(p));
p = LIST_NEXT(p, p_list);
}
@@ -411,7 +446,7 @@
if (thr != NULL)
return (thr);
p = LIST_NEXT(p, p_list);
- if (p != NULL && (p->p_sflag & PS_INMEM))
+ if (p != NULL && (p->p_flag & P_INMEM))
thr = FIRST_THREAD_IN_PROC(p);
} while (p != NULL);
return (NULL);
@@ -434,6 +469,7 @@
int
kdb_trap(int type, int code, struct trapframe *tf)
{
+ register_t intr;
#ifdef SMP
int did_stop_cpus;
#endif
@@ -446,22 +482,15 @@
if (kdb_active)
return (0);
- critical_enter();
-
- kdb_active++;
+ intr = intr_disable();
#ifdef SMP
if ((did_stop_cpus = kdb_stop_cpus) != 0)
- {
-#ifdef KDB_STOP_NMI
- if(kdb_stop_cpus_with_nmi)
- stop_cpus_nmi(PCPU_GET(other_cpus));
- else
-#endif /* KDB_STOP_NMI */
stop_cpus(PCPU_GET(other_cpus));
- }
#endif
+ kdb_active++;
+
kdb_frame = tf;
/* Let MD code do its thing first... */
@@ -472,14 +501,14 @@
handled = kdb_dbbe->dbbe_trap(type, code);
+ kdb_active--;
+
#ifdef SMP
if (did_stop_cpus)
restart_cpus(stopped_cpus);
#endif
- kdb_active--;
-
- critical_exit();
+ intr_restore(intr);
return (handled);
}
Index: kern_mib.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mib.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_mib.c -L sys/kern/kern_mib.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_mib.c
+++ sys/kern/kern_mib.c
@@ -36,12 +36,14 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mib.c,v 1.74.2.2 2005/10/08 07:06:49 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mib.c,v 1.84.2.1 2007/12/06 14:19:42 kib Exp $");
#include "opt_posix.h"
+#include "opt_config.h"
#include <sys/param.h>
#include <sys/kernel.h>
+#include <sys/sbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
@@ -102,7 +104,6 @@
* NOTICE: The *userland* release date is available in
* /usr/include/osreldate.h
*/
-extern int osreldate;
SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD,
&osreldate, 0, "Kernel release date");
@@ -150,6 +151,18 @@
0, PAGE_SIZE, "System memory page size");
static int
+sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+
+ arc4rand(&val, sizeof(val), 0);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+
+SYSCTL_PROC(_kern, KERN_ARND, arandom, CTLFLAG_RD,
+ 0, 0, sysctl_kern_arnd, "L", "arc4rand");
+
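kern.arandom (KERN_ARND) simply hands back arc4rand() output, one long at a time. A small userland sketch of reading it with the numeric MIB; error handling is kept minimal:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
    int mib[2] = { CTL_KERN, KERN_ARND };
    unsigned long rnd;
    size_t len = sizeof(rnd);

    /* The handler formats a u_long ("L"), so ask for exactly that much. */
    if (sysctl(mib, 2, &rnd, &len, NULL, 0) == -1)
        err(1, "sysctl kern.arandom");
    printf("%lu\n", rnd);
    return (0);
}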
+static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
u_long val;
@@ -295,12 +308,30 @@
CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl,
"I", "Current secure level");
+#ifdef INCLUDE_CONFIG_FILE
+/* Actual kernel configuration options. */
+extern char kernconfstring[];
+
+static int
+sysctl_kern_config(SYSCTL_HANDLER_ARGS)
+{
+ return (sysctl_handle_string(oidp, kernconfstring,
+ strlen(kernconfstring), req));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RW,
+ 0, 0, sysctl_kern_config, "", "Kernel configuration file");
+#endif
+
char domainname[MAXHOSTNAMELEN];
SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW,
&domainname, sizeof(domainname), "Name of the current YP/NIS domain");
u_long hostid;
SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID");
+char hostuuid[64] = "00000000-0000-0000-0000-000000000000";
+SYSCTL_STRING(_kern, KERN_HOSTUUID, hostuuid, CTLFLAG_RW, hostuuid,
+ sizeof(hostuuid), "Host UUID");
/*
* This is really cheating. These actually live in the libc, something
Index: subr_prf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_prf.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_prf.c -L sys/kern/subr_prf.c -u -r1.2 -r1.3
--- sys/kern/subr_prf.c
+++ sys/kern/subr_prf.c
@@ -35,9 +35,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_prf.c,v 1.116.2.3 2005/10/07 12:40:51 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_prf.c,v 1.130 2007/03/08 06:44:34 julian Exp $");
#include "opt_ddb.h"
+#include "opt_printf.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,6 +49,7 @@
#include <sys/kernel.h>
#include <sys/msgbuf.h>
#include <sys/malloc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stddef.h>
#include <sys/sysctl.h>
@@ -55,6 +57,7 @@
#include <sys/syslog.h>
#include <sys/cons.h>
#include <sys/uio.h>
+#include <sys/ctype.h>
#ifdef DDB
#include <ddb/ddb.h>
@@ -77,6 +80,10 @@
int flags;
int pri;
struct tty *tty;
+ char *p_bufr;
+ size_t n_bufr;
+ char *p_next;
+ size_t remain;
};
struct snprintf_arg {
@@ -88,10 +95,9 @@
static void msglogchar(int c, int pri);
static void putchar(int ch, void *arg);
-static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len);
+static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper);
static void snprintf_func(int ch, void *arg);
-static int consintr = 1; /* Ok to handle console interrupts? */
static int msgbufmapped; /* Set when safe to use msgbuf */
int msgbuftrigger;
@@ -127,7 +133,7 @@
struct putchar_arg pca;
int retval;
- if (td == NULL || td == PCPU_GET(idlethread))
+ if (td == NULL || TD_IS_IDLETHREAD(td))
return (0);
mtx_lock(&Giant);
@@ -233,6 +239,7 @@
pca.tty = NULL;
pca.pri = level;
pca.flags = log_open ? TOLOG : TOCONS;
+ pca.p_bufr = NULL;
va_start(ap, fmt);
kvprintf(fmt, putchar, &pca, 10, ap);
@@ -283,43 +290,108 @@
printf(const char *fmt, ...)
{
va_list ap;
- int savintr;
struct putchar_arg pca;
int retval;
+#ifdef PRINTF_BUFR_SIZE
+ char bufr[PRINTF_BUFR_SIZE];
+#endif
- savintr = consintr; /* disable interrupts */
- consintr = 0;
va_start(ap, fmt);
pca.tty = NULL;
pca.flags = TOCONS | TOLOG;
pca.pri = -1;
+#ifdef PRINTF_BUFR_SIZE
+ pca.p_bufr = bufr;
+ pca.p_next = pca.p_bufr;
+ pca.n_bufr = sizeof(bufr);
+ pca.remain = sizeof(bufr);
+ *pca.p_next = '\0';
+#else
+ /* Don't buffer console output. */
+ pca.p_bufr = NULL;
+#endif
+
retval = kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
+
+#ifdef PRINTF_BUFR_SIZE
+ /* Write any buffered console output: */
+ if (*pca.p_bufr != '\0')
+ cnputs(pca.p_bufr);
+#endif
+
if (!panicstr)
msgbuftrigger = 1;
- consintr = savintr; /* reenable interrupts */
+
return (retval);
}
int
vprintf(const char *fmt, va_list ap)
{
- int savintr;
struct putchar_arg pca;
int retval;
+#ifdef PRINTF_BUFR_SIZE
+ char bufr[PRINTF_BUFR_SIZE];
+#endif
- savintr = consintr; /* disable interrupts */
- consintr = 0;
pca.tty = NULL;
pca.flags = TOCONS | TOLOG;
pca.pri = -1;
+#ifdef PRINTF_BUFR_SIZE
+ pca.p_bufr = bufr;
+ pca.p_next = pca.p_bufr;
+ pca.n_bufr = sizeof(bufr);
+ pca.remain = sizeof(bufr);
+ *pca.p_next = '\0';
+#else
+ /* Don't buffer console output. */
+ pca.p_bufr = NULL;
+#endif
+
retval = kvprintf(fmt, putchar, &pca, 10, ap);
+
+#ifdef PRINTF_BUFR_SIZE
+ /* Write any buffered console output: */
+ if (*pca.p_bufr != '\0')
+ cnputs(pca.p_bufr);
+#endif
+
if (!panicstr)
msgbuftrigger = 1;
- consintr = savintr; /* reenable interrupts */
+
return (retval);
}
+static void
+putcons(int c, struct putchar_arg *ap)
+{
+ /* Check if no console output buffer was provided. */
+ if (ap->p_bufr == NULL)
+ /* Output direct to the console. */
+ cnputc(c);
+ else {
+ /* Buffer the character: */
+ if (c == '\n') {
+ *ap->p_next++ = '\r';
+ ap->remain--;
+ }
+ *ap->p_next++ = c;
+ ap->remain--;
+
+ /* Always leave the buffer zero terminated. */
+ *ap->p_next = '\0';
+
+ /* Check if the buffer needs to be flushed. */
+ if (ap->remain < 3 || c == '\n') {
+ cnputs(ap->p_bufr);
+ ap->p_next = ap->p_bufr;
+ ap->remain = ap->n_bufr;
+ *ap->p_next = '\0';
+ }
+ }
+}
+
/*
* Print a character on console or users terminal. If destination is
* the console then the last bunch of characters are saved in msgbuf for
@@ -330,17 +402,15 @@
{
struct putchar_arg *ap = (struct putchar_arg*) arg;
struct tty *tp = ap->tty;
- int consdirect, flags = ap->flags;
+ int flags = ap->flags;
- consdirect = ((flags & TOCONS) && constty == NULL);
/* Don't use the tty code after a panic or while in ddb. */
- if (panicstr)
- consdirect = 1;
- if (kdb_active)
- consdirect = 1;
- if (consdirect) {
+ if (kdb_active) {
if (c != '\0')
cnputc(c);
+ } else if (panicstr || ((flags & TOCONS) && constty == NULL)) {
+ if (c != '\0')
+ putcons(c, ap);
} else {
if ((flags & TOTTY) && tp != NULL)
tputchar(c, tp);
@@ -348,7 +418,7 @@
if (constty != NULL)
msgbuf_addchar(&consmsgbuf, c);
if (always_console_output && c != '\0')
- cnputc(c);
+ putcons(c, ap);
}
}
if ((flags & TOLOG))
@@ -451,14 +521,15 @@
* The buffer pointed to by `nbuf' must have length >= MAXNBUF.
*/
static char *
-ksprintn(char *nbuf, uintmax_t num, int base, int *lenp)
+ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
{
- char *p;
+ char *p, c;
p = nbuf;
*p = '\0';
do {
- *++p = hex2ascii(num % base);
+ c = hex2ascii(num % base);
+ *++p = upper ? toupper(c) : c;
} while (num /= base);
if (lenp)
*lenp = p - nbuf;
@@ -503,7 +574,7 @@
uintmax_t num;
int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
int cflag, hflag, jflag, tflag, zflag;
- int dwidth;
+ int dwidth, upper;
char padc;
int stop = 0, retval = 0;
@@ -529,7 +600,7 @@
}
percent = fmt - 1;
qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
- sign = 0; dot = 0; dwidth = 0;
+ sign = 0; dot = 0; dwidth = 0; upper = 0;
cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
reswitch: switch (ch = (u_char)*fmt++) {
case '.':
@@ -579,7 +650,7 @@
case 'b':
num = (u_int)va_arg(ap, int);
p = va_arg(ap, char *);
- for (q = ksprintn(nbuf, num, *p++, NULL); *q;)
+ for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
PCHAR(*q--);
if (num == 0)
@@ -698,8 +769,9 @@
case 'u':
base = 10;
goto handle_nosign;
- case 'x':
case 'X':
+ upper = 1;
+ case 'x':
base = 16;
goto handle_nosign;
case 'y':
@@ -750,7 +822,7 @@
neg = 1;
num = -(intmax_t)num;
}
- p = ksprintn(nbuf, num, base, &tmp);
+ p = ksprintn(nbuf, num, base, &tmp, upper);
if (sharpflag && num != 0) {
if (base == 8)
tmp++;
@@ -823,7 +895,7 @@
dangling = 0;
}
msgbuf_addchar(msgbufp, '<');
- for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL); *p;)
+ for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;)
msgbuf_addchar(msgbufp, *p--);
msgbuf_addchar(msgbufp, '>');
lastpri = pri;
@@ -853,8 +925,6 @@
oldp = msgbufp;
}
-SYSCTL_DECL(_security_bsd);
-
static int unprivileged_read_msgbuf = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf,
CTLFLAG_RW, &unprivileged_read_msgbuf, 0,
@@ -869,7 +939,7 @@
int error, len;
if (!unprivileged_read_msgbuf) {
- error = suser(req->td);
+ error = priv_check(req->td, PRIV_MSGBUF);
if (error)
return (error);
}
@@ -909,20 +979,17 @@
DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
{
- int i, j, quit;
-
- quit = 0;
+ int i, j;
if (!msgbufmapped) {
db_printf("msgbuf not mapped yet\n");
return;
}
- db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
db_printf("msgbufp = %p\n", msgbufp);
db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n",
msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq,
msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum);
- for (i = 0; i < msgbufp->msg_size && !quit; i++) {
+ for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) {
j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq);
db_printf("%c", msgbufp->msg_ptr[j]);
}
Index: kern_time.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_time.c -L sys/kern/kern_time.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_time.c
+++ sys/kern/kern_time.c
@@ -30,31 +30,37 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_time.c,v 1.116.2.1 2005/12/28 19:30:41 ps Exp $");
-
-#include "opt_mac.h"
+__FBSDID("$FreeBSD: src/sys/kern/kern_time.c,v 1.142 2007/06/09 21:48:44 attilio Exp $");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/clock.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
-#include <sys/mac.h>
#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
#include <sys/sysent.h>
+#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/posix4.h>
#include <sys/time.h>
+#include <sys/timers.h>
#include <sys/timetc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
-int tz_minuteswest;
-int tz_dsttime;
+#define MAX_CLOCKS (CLOCK_MONOTONIC+1)
+
+static struct kclock posix_clocks[MAX_CLOCKS];
+static uma_zone_t itimer_zone = NULL;
/*
* Time of day and interval timer support.
@@ -70,6 +76,36 @@
static void timevalfix(struct timeval *);
static void no_lease_updatetime(int);
+static void itimer_start(void);
+static int itimer_init(void *, int, int);
+static void itimer_fini(void *, int);
+static void itimer_enter(struct itimer *);
+static void itimer_leave(struct itimer *);
+static struct itimer *itimer_find(struct proc *, int);
+static void itimers_alloc(struct proc *);
+static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
+static void itimers_event_hook_exit(void *arg, struct proc *p);
+static int realtimer_create(struct itimer *);
+static int realtimer_gettime(struct itimer *, struct itimerspec *);
+static int realtimer_settime(struct itimer *, int,
+ struct itimerspec *, struct itimerspec *);
+static int realtimer_delete(struct itimer *);
+static void realtimer_clocktime(clockid_t, struct timespec *);
+static void realtimer_expire(void *);
+static int kern_timer_create(struct thread *, clockid_t,
+ struct sigevent *, int *, int);
+static int kern_timer_delete(struct thread *, int);
+
+int register_posix_clock(int, struct kclock *);
+void itimer_fire(struct itimer *it);
+int itimespecfix(struct timespec *ts);
+
+#define CLOCK_CALL(clock, call, arglist) \
+ ((*posix_clocks[clock].call) arglist)
+
+SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
+
+
static void
no_lease_updatetime(deltat)
int deltat;
@@ -146,10 +182,6 @@
struct timespec *tp;
};
#endif
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
clock_gettime(struct thread *td, struct clock_gettime_args *uap)
@@ -172,25 +204,44 @@
p = td->td_proc;
switch (clock_id) {
- case CLOCK_REALTIME:
+ case CLOCK_REALTIME: /* Default to precise. */
+ case CLOCK_REALTIME_PRECISE:
nanotime(ats);
break;
+ case CLOCK_REALTIME_FAST:
+ getnanotime(ats);
+ break;
case CLOCK_VIRTUAL:
PROC_LOCK(p);
+ PROC_SLOCK(p);
calcru(p, &user, &sys);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
TIMEVAL_TO_TIMESPEC(&user, ats);
break;
case CLOCK_PROF:
PROC_LOCK(p);
+ PROC_SLOCK(p);
calcru(p, &user, &sys);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
timevaladd(&user, &sys);
TIMEVAL_TO_TIMESPEC(&user, ats);
break;
- case CLOCK_MONOTONIC:
+ case CLOCK_MONOTONIC: /* Default to precise. */
+ case CLOCK_MONOTONIC_PRECISE:
+ case CLOCK_UPTIME:
+ case CLOCK_UPTIME_PRECISE:
nanouptime(ats);
break;
+ case CLOCK_UPTIME_FAST:
+ case CLOCK_MONOTONIC_FAST:
+ getnanouptime(ats);
+ break;
+ case CLOCK_SECOND:
+ ats->tv_sec = time_second;
+ ats->tv_nsec = 0;
+ break;
default:
return (EINVAL);
}
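The new clock ids map onto the precise versus cheap timecounter reads chosen above (nanotime/nanouptime against the getnano* variants). A short userland sketch, assuming the FreeBSD-specific CLOCK_*_FAST/_PRECISE constants are exposed through <time.h> on the target system:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
    struct timespec precise, fast;

    /* Full-precision monotonic time: reads the timecounter hardware. */
    clock_gettime(CLOCK_MONOTONIC_PRECISE, &precise);
    /* Cheaper, tick-granularity variant backed by getnanouptime(). */
    clock_gettime(CLOCK_MONOTONIC_FAST, &fast);

    printf("precise: %jd.%09ld\n", (intmax_t)precise.tv_sec, precise.tv_nsec);
    printf("fast:    %jd.%09ld\n", (intmax_t)fast.tv_sec, fast.tv_nsec);
    return (0);
}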
@@ -203,10 +254,6 @@
const struct timespec *tp;
};
#endif
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
clock_settime(struct thread *td, struct clock_settime_args *uap)
@@ -225,12 +272,7 @@
struct timeval atv;
int error;
-#ifdef MAC
- error = mac_check_system_settime(td->td_ucred);
- if (error)
- return (error);
-#endif
- if ((error = suser(td)) != 0)
+ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
return (error);
if (clock_id != CLOCK_REALTIME)
return (EINVAL);
@@ -248,7 +290,6 @@
struct timespec *tp;
};
#endif
-
int
clock_getres(struct thread *td, struct clock_getres_args *uap)
{
@@ -271,7 +312,14 @@
ts->tv_sec = 0;
switch (clock_id) {
case CLOCK_REALTIME:
+ case CLOCK_REALTIME_FAST:
+ case CLOCK_REALTIME_PRECISE:
case CLOCK_MONOTONIC:
+ case CLOCK_MONOTONIC_FAST:
+ case CLOCK_MONOTONIC_PRECISE:
+ case CLOCK_UPTIME:
+ case CLOCK_UPTIME_FAST:
+ case CLOCK_UPTIME_PRECISE:
/*
* Round up the result of the division cheaply by adding 1.
* Rounding up is especially important if rounding down
@@ -284,6 +332,10 @@
/* Accurately round up here because we can do so cheaply. */
ts->tv_nsec = (1000000000 + hz - 1) / hz;
break;
+ case CLOCK_SECOND:
+ ts->tv_sec = 1;
+ ts->tv_nsec = 0;
+ break;
default:
return (EINVAL);
}
@@ -335,10 +387,6 @@
struct timespec *rmtp;
};
#endif
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
nanosleep(struct thread *td, struct nanosleep_args *uap)
@@ -370,9 +418,6 @@
struct timezone *tzp;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
gettimeofday(struct thread *td, struct gettimeofday_args *uap)
@@ -399,9 +444,6 @@
struct timezone *tzp;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
settimeofday(struct thread *td, struct settimeofday_args *uap)
@@ -432,12 +474,7 @@
{
int error;
-#ifdef MAC
- error = mac_check_system_settime(td->td_ucred);
- if (error)
- return (error);
-#endif
- error = suser(td);
+ error = priv_check(td, PRIV_SETTIMEOFDAY);
if (error)
return (error);
/* Verify all parameters before changing time. */
@@ -454,25 +491,25 @@
}
/*
- * Get value of an interval timer. The process virtual and
- * profiling virtual time timers are kept in the p_stats area, since
- * they can be swapped out. These are kept internally in the
- * way they are specified externally: in time until they expire.
+ * Get value of an interval timer. The process virtual and profiling virtual
+ * time timers are kept in the p_stats area, since they can be swapped out.
+ * These are kept internally in the way they are specified externally: in
+ * time until they expire.
*
- * The real time interval timer is kept in the process table slot
- * for the process, and its value (it_value) is kept as an
- * absolute time rather than as a delta, so that it is easy to keep
- * periodic real-time signals from drifting.
+ * The real time interval timer is kept in the process table slot for the
+ * process, and its value (it_value) is kept as an absolute time rather than
+ * as a delta, so that it is easy to keep periodic real-time signals from
+ * drifting.
*
* Virtual time timers are processed in the hardclock() routine of
- * kern_clock.c. The real time timer is processed by a timeout
- * routine, called from the softclock() routine. Since a callout
- * may be delayed in real time due to interrupt processing in the system,
- * it is possible for the real time timeout routine (realitexpire, given below),
- * to be delayed in real time past when it is supposed to occur. It
- * does not suffice, therefore, to reload the real timer .it_value from the
- * real time timers .it_interval. Rather, we compute the next time in
- * absolute time the timer should go off.
+ * kern_clock.c. The real time timer is processed by a timeout routine,
+ * called from the softclock() routine. Since a callout may be delayed in
+ * real time due to interrupt processing in the system, it is possible for
+ * the real time timeout routine (realitexpire, given below), to be delayed
+ * in real time past when it is supposed to occur. It does not suffice,
+ * therefore, to reload the real timer .it_value from the real time timers
+ * .it_interval. Rather, we compute the next time in absolute time the timer
+ * should go off.
*/
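The point of the comment is that a periodic timer has to be re-armed from its previous absolute expiry, not from the current time, otherwise callout latency accumulates as drift. A minimal standalone sketch of that idea (helper names are made up; this is not the kernel code, which does the equivalent in realtimer_expire further down):

#include <time.h>

static void
timespec_add(struct timespec *t, const struct timespec *u)
{
    t->tv_sec += u->tv_sec;
    t->tv_nsec += u->tv_nsec;
    if (t->tv_nsec >= 1000000000L) {
        t->tv_sec++;
        t->tv_nsec -= 1000000000L;
    }
}

static int
timespec_geq(const struct timespec *a, const struct timespec *b)
{
    return (a->tv_sec > b->tv_sec ||
        (a->tv_sec == b->tv_sec && a->tv_nsec >= b->tv_nsec));
}

static void
rearm(struct timespec *next, const struct timespec *interval,
    const struct timespec *now, int *overrun)
{
    /* Step from the previous absolute expiry, not from "now"... */
    timespec_add(next, interval);
    /* ...and count any whole periods that were already missed. */
    while (timespec_geq(now, next)) {
        (*overrun)++;
        timespec_add(next, interval);
    }
}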
#ifndef _SYS_SYSPROTO_H_
struct getitimer_args {
@@ -480,9 +517,6 @@
struct itimerval *itv;
};
#endif
-/*
- * MPSAFE
- */
int
getitimer(struct thread *td, struct getitimer_args *uap)
{
@@ -522,9 +556,9 @@
timevalsub(&aitv->it_value, &ctv);
}
} else {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
*aitv = p->p_stats->p_timer[which];
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
return (0);
}
@@ -535,10 +569,6 @@
struct itimerval *itv, *oitv;
};
#endif
-
-/*
- * MPSAFE
- */
int
setitimer(struct thread *td, struct setitimer_args *uap)
{
@@ -597,10 +627,10 @@
timevalsub(&oitv->it_value, &ctv);
}
} else {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
*oitv = p->p_stats->p_timer[which];
p->p_stats->p_timer[which] = *aitv;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
return (0);
}
@@ -659,8 +689,7 @@
itimerfix(struct timeval *tv)
{
- if (tv->tv_sec < 0 || tv->tv_sec > 100000000 ||
- tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return (EINVAL);
if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
tv->tv_usec = tick;
@@ -807,3 +836,655 @@
return (maxpps < 0 || *curpps < maxpps);
}
}
+
+static void
+itimer_start(void)
+{
+ struct kclock rt_clock = {
+ .timer_create = realtimer_create,
+ .timer_delete = realtimer_delete,
+ .timer_settime = realtimer_settime,
+ .timer_gettime = realtimer_gettime,
+ .event_hook = NULL
+ };
+
+ itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
+ NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
+ register_posix_clock(CLOCK_REALTIME, &rt_clock);
+ register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
+ p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
+ p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
+ p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
+ EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
+ (void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
+ (void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
+}
+
+int
+register_posix_clock(int clockid, struct kclock *clk)
+{
+ if ((unsigned)clockid >= MAX_CLOCKS) {
+ printf("%s: invalid clockid\n", __func__);
+ return (0);
+ }
+ posix_clocks[clockid] = *clk;
+ return (1);
+}
+
+static int
+itimer_init(void *mem, int size, int flags)
+{
+ struct itimer *it;
+
+ it = (struct itimer *)mem;
+ mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
+ return (0);
+}
+
+static void
+itimer_fini(void *mem, int size)
+{
+ struct itimer *it;
+
+ it = (struct itimer *)mem;
+ mtx_destroy(&it->it_mtx);
+}
+
+static void
+itimer_enter(struct itimer *it)
+{
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+ it->it_usecount++;
+}
+
+static void
+itimer_leave(struct itimer *it)
+{
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+ KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
+
+ if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
+ wakeup(it);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_create_args {
+ clockid_t clock_id;
+ struct sigevent * evp;
+ int * timerid;
+};
+#endif
+int
+ktimer_create(struct thread *td, struct ktimer_create_args *uap)
+{
+ struct sigevent *evp1, ev;
+ int id;
+ int error;
+
+ if (uap->evp != NULL) {
+ error = copyin(uap->evp, &ev, sizeof(ev));
+ if (error != 0)
+ return (error);
+ evp1 = &ev;
+ } else
+ evp1 = NULL;
+
+ error = kern_timer_create(td, uap->clock_id, evp1, &id, -1);
+
+ if (error == 0) {
+ error = copyout(&id, uap->timerid, sizeof(int));
+ if (error != 0)
+ kern_timer_delete(td, id);
+ }
+ return (error);
+}
+
+static int
+kern_timer_create(struct thread *td, clockid_t clock_id,
+ struct sigevent *evp, int *timerid, int preset_id)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ int id;
+ int error;
+
+ if (clock_id < 0 || clock_id >= MAX_CLOCKS)
+ return (EINVAL);
+
+ if (posix_clocks[clock_id].timer_create == NULL)
+ return (EINVAL);
+
+ if (evp != NULL) {
+ if (evp->sigev_notify != SIGEV_NONE &&
+ evp->sigev_notify != SIGEV_SIGNAL &&
+ evp->sigev_notify != SIGEV_THREAD_ID)
+ return (EINVAL);
+ if ((evp->sigev_notify == SIGEV_SIGNAL ||
+ evp->sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(evp->sigev_signo))
+ return (EINVAL);
+ }
+
+ if (p->p_itimers == NULL)
+ itimers_alloc(p);
+
+ it = uma_zalloc(itimer_zone, M_WAITOK);
+ it->it_flags = 0;
+ it->it_usecount = 0;
+ it->it_active = 0;
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ it->it_overrun = 0;
+ it->it_overrun_last = 0;
+ it->it_clockid = clock_id;
+ it->it_timerid = -1;
+ it->it_proc = p;
+ ksiginfo_init(&it->it_ksi);
+ it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+ error = CLOCK_CALL(clock_id, timer_create, (it));
+ if (error != 0)
+ goto out;
+
+ PROC_LOCK(p);
+ if (preset_id != -1) {
+ KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
+ id = preset_id;
+ if (p->p_itimers->its_timers[id] != NULL) {
+ PROC_UNLOCK(p);
+ error = 0;
+ goto out;
+ }
+ } else {
+ /*
+ * Find a free timer slot, skipping those reserved
+ * for setitimer().
+ */
+ for (id = 3; id < TIMER_MAX; id++)
+ if (p->p_itimers->its_timers[id] == NULL)
+ break;
+ if (id == TIMER_MAX) {
+ PROC_UNLOCK(p);
+ error = EAGAIN;
+ goto out;
+ }
+ }
+ it->it_timerid = id;
+ p->p_itimers->its_timers[id] = it;
+ if (evp != NULL)
+ it->it_sigev = *evp;
+ else {
+ it->it_sigev.sigev_notify = SIGEV_SIGNAL;
+ switch (clock_id) {
+ default:
+ case CLOCK_REALTIME:
+ it->it_sigev.sigev_signo = SIGALRM;
+ break;
+ case CLOCK_VIRTUAL:
+ it->it_sigev.sigev_signo = SIGVTALRM;
+ break;
+ case CLOCK_PROF:
+ it->it_sigev.sigev_signo = SIGPROF;
+ break;
+ }
+ it->it_sigev.sigev_value.sival_int = id;
+ }
+
+ if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+ it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+ it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
+ it->it_ksi.ksi_code = SI_TIMER;
+ it->it_ksi.ksi_value = it->it_sigev.sigev_value;
+ it->it_ksi.ksi_timerid = id;
+ }
+ PROC_UNLOCK(p);
+ *timerid = id;
+ return (0);
+
+out:
+ ITIMER_LOCK(it);
+ CLOCK_CALL(it->it_clockid, timer_delete, (it));
+ ITIMER_UNLOCK(it);
+ uma_zfree(itimer_zone, it);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_delete_args {
+ int timerid;
+};
+#endif
+int
+ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
+{
+ return (kern_timer_delete(td, uap->timerid));
+}
+
+static struct itimer *
+itimer_find(struct proc *p, int timerid)
+{
+ struct itimer *it;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((p->p_itimers == NULL) || (timerid >= TIMER_MAX) ||
+ (it = p->p_itimers->its_timers[timerid]) == NULL) {
+ return (NULL);
+ }
+ ITIMER_LOCK(it);
+ if ((it->it_flags & ITF_DELETING) != 0) {
+ ITIMER_UNLOCK(it);
+ it = NULL;
+ }
+ return (it);
+}
+
+static int
+kern_timer_delete(struct thread *td, int timerid)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+
+ PROC_LOCK(p);
+ it = itimer_find(p, timerid);
+ if (it == NULL) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ PROC_UNLOCK(p);
+
+ it->it_flags |= ITF_DELETING;
+ while (it->it_usecount > 0) {
+ it->it_flags |= ITF_WANTED;
+ msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
+ }
+ it->it_flags &= ~ITF_WANTED;
+ CLOCK_CALL(it->it_clockid, timer_delete, (it));
+ ITIMER_UNLOCK(it);
+
+ PROC_LOCK(p);
+ if (KSI_ONQ(&it->it_ksi))
+ sigqueue_take(&it->it_ksi);
+ p->p_itimers->its_timers[timerid] = NULL;
+ PROC_UNLOCK(p);
+ uma_zfree(itimer_zone, it);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_settime_args {
+ int timerid;
+ int flags;
+ const struct itimerspec * value;
+ struct itimerspec * ovalue;
+};
+#endif
+int
+ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ struct itimerspec val, oval, *ovalp;
+ int error;
+
+ error = copyin(uap->value, &val, sizeof(val));
+ if (error != 0)
+ return (error);
+
+ if (uap->ovalue != NULL)
+ ovalp = &oval;
+ else
+ ovalp = NULL;
+
+ PROC_LOCK(p);
+ if (uap->timerid < 3 ||
+ (it = itimer_find(p, uap->timerid)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ PROC_UNLOCK(p);
+ itimer_enter(it);
+ error = CLOCK_CALL(it->it_clockid, timer_settime,
+ (it, uap->flags, &val, ovalp));
+ itimer_leave(it);
+ ITIMER_UNLOCK(it);
+ }
+ if (error == 0 && uap->ovalue != NULL)
+ error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_gettime_args {
+ int timerid;
+ struct itimerspec * value;
+};
+#endif
+int
+ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ struct itimerspec val;
+ int error;
+
+ PROC_LOCK(p);
+ if (uap->timerid < 3 ||
+ (it = itimer_find(p, uap->timerid)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ PROC_UNLOCK(p);
+ itimer_enter(it);
+ error = CLOCK_CALL(it->it_clockid, timer_gettime,
+ (it, &val));
+ itimer_leave(it);
+ ITIMER_UNLOCK(it);
+ }
+ if (error == 0)
+ error = copyout(&val, uap->value, sizeof(val));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct timer_getoverrun_args {
+ int timerid;
+};
+#endif
+int
+ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ int error ;
+
+ PROC_LOCK(p);
+ if (uap->timerid < 3 ||
+ (it = itimer_find(p, uap->timerid)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ td->td_retval[0] = it->it_overrun_last;
+ ITIMER_UNLOCK(it);
+ PROC_UNLOCK(p);
+ error = 0;
+ }
+ return (error);
+}
+
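These ktimer_* entry points are what the POSIX timer_create(2)/timer_settime(2)/timer_getoverrun(2) calls land on. A hedged userland usage sketch (link against librt where the wrappers live there); the interval and signal choice are illustrative only:

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void
handler(int sig)
{
    (void)sig;    /* Just wake pause(); the work happens in main(). */
}

int
main(void)
{
    struct sigevent ev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGALRM };
    struct itimerspec its = {
        .it_value    = { .tv_sec = 0, .tv_nsec = 100000000 }, /* first fire: 100 ms */
        .it_interval = { .tv_sec = 0, .tv_nsec = 100000000 }, /* then every 100 ms */
    };
    timer_t tid;

    signal(SIGALRM, handler);
    if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1 ||
        timer_settime(tid, 0, &its, NULL) == -1)
        return (1);
    pause();
    /* Expirations that could not be signalled individually show up here. */
    printf("overruns: %d\n", timer_getoverrun(tid));
    timer_delete(tid);
    return (0);
}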
+static int
+realtimer_create(struct itimer *it)
+{
+ callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
+ return (0);
+}
+
+static int
+realtimer_delete(struct itimer *it)
+{
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ ITIMER_UNLOCK(it);
+ callout_drain(&it->it_callout);
+ ITIMER_LOCK(it);
+ return (0);
+}
+
+static int
+realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
+{
+ struct timespec cts;
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ realtimer_clocktime(it->it_clockid, &cts);
+ *ovalue = it->it_time;
+ if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
+ timespecsub(&ovalue->it_value, &cts);
+ if (ovalue->it_value.tv_sec < 0 ||
+ (ovalue->it_value.tv_sec == 0 &&
+ ovalue->it_value.tv_nsec == 0)) {
+ ovalue->it_value.tv_sec = 0;
+ ovalue->it_value.tv_nsec = 1;
+ }
+ }
+ return (0);
+}
+
+static int
+realtimer_settime(struct itimer *it, int flags,
+ struct itimerspec *value, struct itimerspec *ovalue)
+{
+ struct timespec cts, ts;
+ struct timeval tv;
+ struct itimerspec val;
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ val = *value;
+ if (itimespecfix(&val.it_value))
+ return (EINVAL);
+
+ if (timespecisset(&val.it_value)) {
+ if (itimespecfix(&val.it_interval))
+ return (EINVAL);
+ } else {
+ timespecclear(&val.it_interval);
+ }
+
+ if (ovalue != NULL)
+ realtimer_gettime(it, ovalue);
+
+ it->it_time = val;
+ if (timespecisset(&val.it_value)) {
+ realtimer_clocktime(it->it_clockid, &cts);
+ ts = val.it_value;
+ if ((flags & TIMER_ABSTIME) == 0) {
+ /* Convert to absolute time. */
+ timespecadd(&it->it_time.it_value, &cts);
+ } else {
+ timespecsub(&ts, &cts);
+ /*
+ * We don't care if ts is negative, tztohz will
+ * fix it.
+ */
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv),
+ realtimer_expire, it);
+ } else {
+ callout_stop(&it->it_callout);
+ }
+
+ return (0);
+}
+
+static void
+realtimer_clocktime(clockid_t id, struct timespec *ts)
+{
+ if (id == CLOCK_REALTIME)
+ getnanotime(ts);
+ else /* CLOCK_MONOTONIC */
+ getnanouptime(ts);
+}
+
+int
+itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
+{
+ struct itimer *it;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ it = itimer_find(p, timerid);
+ if (it != NULL) {
+ ksi->ksi_overrun = it->it_overrun;
+ it->it_overrun_last = it->it_overrun;
+ it->it_overrun = 0;
+ ITIMER_UNLOCK(it);
+ return (0);
+ }
+ return (EINVAL);
+}
+
+int
+itimespecfix(struct timespec *ts)
+{
+
+ if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+ return (EINVAL);
+ if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
+ ts->tv_nsec = tick * 1000;
+ return (0);
+}
+
+/* Timeout callback for realtime timer */
+static void
+realtimer_expire(void *arg)
+{
+ struct timespec cts, ts;
+ struct timeval tv;
+ struct itimer *it;
+ struct proc *p;
+
+ it = (struct itimer *)arg;
+ p = it->it_proc;
+
+ realtimer_clocktime(it->it_clockid, &cts);
+ /* Only fire if time is reached. */
+ if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+ if (timespecisset(&it->it_time.it_interval)) {
+ timespecadd(&it->it_time.it_value,
+ &it->it_time.it_interval);
+ while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+ if (it->it_overrun < INT_MAX)
+ it->it_overrun++;
+ else
+ it->it_ksi.ksi_errno = ERANGE;
+ timespecadd(&it->it_time.it_value,
+ &it->it_time.it_interval);
+ }
+ } else {
+ /* single shot timer ? */
+ timespecclear(&it->it_time.it_value);
+ }
+ if (timespecisset(&it->it_time.it_value)) {
+ ts = it->it_time.it_value;
+ timespecsub(&ts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv),
+ realtimer_expire, it);
+ }
+ ITIMER_UNLOCK(it);
+ itimer_fire(it);
+ ITIMER_LOCK(it);
+ } else if (timespecisset(&it->it_time.it_value)) {
+ ts = it->it_time.it_value;
+ timespecsub(&ts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
+ it);
+ }
+}
+
+void
+itimer_fire(struct itimer *it)
+{
+ struct proc *p = it->it_proc;
+ int ret;
+
+ if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+ it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+ PROC_LOCK(p);
+ if (!KSI_ONQ(&it->it_ksi)) {
+ it->it_ksi.ksi_errno = 0;
+ ret = psignal_event(p, &it->it_sigev, &it->it_ksi);
+ if (__predict_false(ret != 0)) {
+ it->it_overrun++;
+ /*
+ * Broken userland code, thread went
+ * away, disarm the timer.
+ */
+ if (ret == ESRCH) {
+ ITIMER_LOCK(it);
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ callout_stop(&it->it_callout);
+ ITIMER_UNLOCK(it);
+ }
+ }
+ } else {
+ if (it->it_overrun < INT_MAX)
+ it->it_overrun++;
+ else
+ it->it_ksi.ksi_errno = ERANGE;
+ }
+ PROC_UNLOCK(p);
+ }
+}
+
+static void
+itimers_alloc(struct proc *p)
+{
+ struct itimers *its;
+ int i;
+
+ its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
+ LIST_INIT(&its->its_virtual);
+ LIST_INIT(&its->its_prof);
+ TAILQ_INIT(&its->its_worklist);
+ for (i = 0; i < TIMER_MAX; i++)
+ its->its_timers[i] = NULL;
+ PROC_LOCK(p);
+ if (p->p_itimers == NULL) {
+ p->p_itimers = its;
+ PROC_UNLOCK(p);
+ }
+ else {
+ PROC_UNLOCK(p);
+ free(its, M_SUBPROC);
+ }
+}
+
+static void
+itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ itimers_event_hook_exit(arg, p);
+}
+
+/* Clean up timers when some process events are being triggered. */
+static void
+itimers_event_hook_exit(void *arg, struct proc *p)
+{
+ struct itimers *its;
+ struct itimer *it;
+ int event = (int)(intptr_t)arg;
+ int i;
+
+ if (p->p_itimers != NULL) {
+ its = p->p_itimers;
+ for (i = 0; i < MAX_CLOCKS; ++i) {
+ if (posix_clocks[i].event_hook != NULL)
+ CLOCK_CALL(i, event_hook, (p, i, event));
+ }
+ /*
+ * According to susv3, XSI interval timers should be inherited
+ * by new image.
+ */
+ if (event == ITIMER_EV_EXEC)
+ i = 3;
+ else if (event == ITIMER_EV_EXIT)
+ i = 0;
+ else
+ panic("unhandled event");
+ for (; i < TIMER_MAX; ++i) {
+ if ((it = its->its_timers[i]) != NULL)
+ kern_timer_delete(curthread, i);
+ }
+ if (its->its_timers[0] == NULL &&
+ its->its_timers[1] == NULL &&
+ its->its_timers[2] == NULL) {
+ free(its, M_SUBPROC);
+ p->p_itimers = NULL;
+ }
+ }
+}
Index: kern_kthread.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_kthread.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_kthread.c -L sys/kern/kern_kthread.c -u -r1.2 -r1.3
--- sys/kern/kern_kthread.c
+++ sys/kern/kern_kthread.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_kthread.c,v 1.34 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_kthread.c,v 1.38 2007/06/05 00:00:54 jeff Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -38,6 +38,7 @@
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/wait.h>
+#include <sys/sched.h>
#include <machine/stdarg.h>
@@ -112,9 +113,9 @@
/* Delay putting it on the run queue until now. */
if (!(flags & RFSTOPPED)) {
- mtx_lock_spin(&sched_lock);
- setrunqueue(td, SRQ_BORING);
- mtx_unlock_spin(&sched_lock);
+ thread_lock(td);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
}
return 0;
@@ -128,11 +129,23 @@
td = curthread;
p = td->td_proc;
+
+ /*
+ * Reparent curthread from proc0 to init so that the zombie
+ * is harvested.
+ */
sx_xlock(&proctree_lock);
PROC_LOCK(p);
proc_reparent(p, initproc);
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
+
+ /*
+ * Wakeup anyone waiting for us to exit.
+ */
+ wakeup(p);
+
+ /* Buh-bye! */
exit1(td, W_EXITCODE(ecode, 0));
}
Index: kern_thr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_thr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_thr.c -L sys/kern/kern_thr.c -u -r1.2 -r1.3
--- sys/kern/kern_thr.c
+++ sys/kern/kern_thr.c
@@ -25,45 +25,64 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_thr.c,v 1.34.2.2 2006/01/16 06:25:32 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_thr.c,v 1.62.4.1 2008/01/19 18:15:05 kib Exp $");
+#include "opt_compat.h"
+#include "opt_posix.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/posix4.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
+#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/ucontext.h>
#include <sys/thr.h>
+#include <sys/rtprio.h>
#include <sys/umtx.h>
+#include <sys/limits.h>
#include <machine/frame.h>
-extern int max_threads_per_proc;
-extern int max_groups_per_proc;
+#include <security/audit/audit.h>
+
+#ifdef COMPAT_IA32
-SYSCTL_DECL(_kern_threads);
-static int thr_scope = 0;
-SYSCTL_INT(_kern_threads, OID_AUTO, thr_scope, CTLFLAG_RW,
- &thr_scope, 0, "sys or proc scope scheduling");
-
-static int thr_concurrency = 0;
-SYSCTL_INT(_kern_threads, OID_AUTO, thr_concurrency, CTLFLAG_RW,
- &thr_concurrency, 0, "a concurrency value if not default");
+extern struct sysentvec ia32_freebsd_sysvec;
+
+static inline int
+suword_lwpid(void *addr, lwpid_t lwpid)
+{
+ int error;
+
+ if (curproc->p_sysent != &ia32_freebsd_sysvec)
+ error = suword(addr, lwpid);
+ else
+ error = suword32(addr, lwpid);
+ return (error);
+}
+
+#else
+#define suword_lwpid suword
+#endif
+
+extern int max_threads_per_proc;
static int create_thread(struct thread *td, mcontext_t *ctx,
void (*start_func)(void *), void *arg,
char *stack_base, size_t stack_size,
char *tls_base,
long *child_tid, long *parent_tid,
- int flags);
+ int flags, struct rtprio *rtp);
/*
* System call interface.
@@ -79,7 +98,7 @@
return (error);
error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
- NULL, 0, NULL, uap->id, NULL, uap->flags);
+ NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
return (error);
}
@@ -90,13 +109,29 @@
struct thr_param param;
int error;
- if (uap->param_size < sizeof(param))
+ if (uap->param_size < 0 || uap->param_size > sizeof(param))
return (EINVAL);
- if ((error = copyin(uap->param, &param, sizeof(param))))
+ bzero(&param, sizeof(param));
+ if ((error = copyin(uap->param, &param, uap->param_size)))
- if ((error = copyin(uap->param, &param, sizeof(param))))
+ bzero(&param, sizeof(param));
+ if ((error = copyin(uap->param, &param, uap->param_size)))
return (error);
- error = create_thread(td, NULL, param.start_func, param.arg,
- param.stack_base, param.stack_size, param.tls_base,
- param.child_tid, param.parent_tid, param.flags);
+ return (kern_thr_new(td, &param));
+}
+
+int
+kern_thr_new(struct thread *td, struct thr_param *param)
+{
+ struct rtprio rtp, *rtpp;
+ int error;
+
+ rtpp = NULL;
+ if (param->rtp != 0) {
+ error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
+ rtpp = &rtp;
+ }
+ error = create_thread(td, NULL, param->start_func, param->arg,
+ param->stack_base, param->stack_size, param->tls_base,
+ param->child_tid, param->parent_tid, param->flags,
+ rtpp);
return (error);
}
@@ -106,36 +141,42 @@
char *stack_base, size_t stack_size,
char *tls_base,
long *child_tid, long *parent_tid,
- int flags)
+ int flags, struct rtprio *rtp)
{
stack_t stack;
struct thread *newtd;
- struct ksegrp *kg, *newkg;
struct proc *p;
- long id;
- int error, scope_sys, linkkg;
+ int error;
error = 0;
p = td->td_proc;
- kg = td->td_ksegrp;
/* Have race condition but it is cheap. */
- if ((p->p_numksegrps >= max_groups_per_proc) ||
- (p->p_numthreads >= max_threads_per_proc)) {
+ if (p->p_numthreads >= max_threads_per_proc)
return (EPROCLIM);
- }
- /* Check PTHREAD_SCOPE_SYSTEM */
- scope_sys = (flags & THR_SYSTEM_SCOPE) != 0;
-
- /* sysctl overrides user's flag */
- if (thr_scope == 1)
- scope_sys = 0;
- else if (thr_scope == 2)
- scope_sys = 1;
+ if (rtp != NULL) {
+ switch(rtp->type) {
+ case RTP_PRIO_REALTIME:
+ case RTP_PRIO_FIFO:
+ /* Only root can set scheduler policy */
+ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
+ return (EPERM);
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ break;
+ case RTP_PRIO_NORMAL:
+ rtp->prio = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
- /* Initialize our td and new ksegrp.. */
+ /* Initialize our td */
newtd = thread_alloc();
+ if (newtd == NULL)
+ return (ENOMEM);
/*
* Try the copyout as soon as we allocate the td so we don't
@@ -146,14 +187,14 @@
* its storage, because child thread may exit quickly and
* memory is freed before parent thread can access it.
*/
- id = newtd->td_tid;
if ((child_tid != NULL &&
- (error = copyout(&id, child_tid, sizeof(long)))) ||
+ suword_lwpid(child_tid, newtd->td_tid)) ||
(parent_tid != NULL &&
- (error = copyout(&id, parent_tid, sizeof(long))))) {
- thread_free(newtd);
- return (error);
+ suword_lwpid(parent_tid, newtd->td_tid))) {
+ thread_free(newtd);
+ return (EFAULT);
}
+
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &newtd->td_startcopy,
@@ -185,70 +226,29 @@
}
}
- if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
- /* Treat initial thread as it has PTHREAD_SCOPE_PROCESS. */
- p->p_procscopegrp = kg;
- mtx_lock_spin(&sched_lock);
- sched_set_concurrency(kg,
- thr_concurrency ? thr_concurrency : (2*mp_ncpus));
- mtx_unlock_spin(&sched_lock);
- }
-
- linkkg = 0;
- if (scope_sys) {
- linkkg = 1;
- newkg = ksegrp_alloc();
- bzero(&newkg->kg_startzero,
- __rangeof(struct ksegrp, kg_startzero, kg_endzero));
- bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
- __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
- sched_init_concurrency(newkg);
- PROC_LOCK(td->td_proc);
- } else {
- /*
- * Try to create a KSE group which will be shared
- * by all PTHREAD_SCOPE_PROCESS threads.
- */
-retry:
- PROC_LOCK(td->td_proc);
- if ((newkg = p->p_procscopegrp) == NULL) {
- PROC_UNLOCK(p);
- newkg = ksegrp_alloc();
- bzero(&newkg->kg_startzero,
- __rangeof(struct ksegrp, kg_startzero, kg_endzero));
- bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
- __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
- PROC_LOCK(p);
- if (p->p_procscopegrp == NULL) {
- p->p_procscopegrp = newkg;
- sched_init_concurrency(newkg);
- sched_set_concurrency(newkg,
- thr_concurrency ? thr_concurrency : (2*mp_ncpus));
- linkkg = 1;
- } else {
- PROC_UNLOCK(p);
- ksegrp_free(newkg);
- goto retry;
- }
- }
- }
-
+ PROC_LOCK(td->td_proc);
td->td_proc->p_flag |= P_HADTHREADS;
newtd->td_sigmask = td->td_sigmask;
- mtx_lock_spin(&sched_lock);
- if (linkkg)
- ksegrp_link(newkg, p);
- thread_link(newtd, newkg);
- PROC_UNLOCK(p);
-
+ PROC_SLOCK(p);
+ thread_link(newtd, p);
+ thread_lock(td);
/* let the scheduler know about these things. */
- if (linkkg)
- sched_fork_ksegrp(td, newkg);
sched_fork_thread(td, newtd);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ thread_lock(newtd);
+ if (rtp != NULL) {
+ if (!(td->td_pri_class == PRI_TIMESHARE &&
+ rtp->type == RTP_PRIO_NORMAL)) {
+ rtp_to_pri(rtp, newtd);
+ sched_prio(newtd, newtd->td_user_pri);
+ } /* ignore timesharing class */
+ }
TD_SET_CAN_RUN(newtd);
/* if ((flags & THR_SUSPENDED) == 0) */
- setrunqueue(newtd, SRQ_BORING);
- mtx_unlock_spin(&sched_lock);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
return (error);
}
@@ -257,13 +257,11 @@
thr_self(struct thread *td, struct thr_self_args *uap)
/* long *id */
{
- long id;
int error;
- id = td->td_tid;
- if ((error = copyout(&id, uap->id, sizeof(long))))
- return (error);
-
+ error = suword_lwpid(uap->id, (unsigned)td->td_tid);
+ if (error == -1)
+ return (EFAULT);
return (0);
}
@@ -277,12 +275,13 @@
/* Signal userland that it can free the stack. */
if ((void *)uap->state != NULL) {
- suword((void *)uap->state, 1);
+ suword_lwpid(uap->state, 1);
kern_umtx_wake(td, uap->state, INT_MAX);
}
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ sigqueue_flush(&td->td_sigqueue);
+ PROC_SLOCK(p);
/*
* Shutting down last thread in the proc. This will actually
@@ -293,7 +292,7 @@
thread_exit();
/* NOTREACHED */
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (0);
}
@@ -319,17 +318,14 @@
error = 0;
if (uap->sig == 0)
break;
- tdsignal(ttd, uap->sig, SIGTARGET_TD);
+ tdsignal(p, ttd, uap->sig, NULL);
}
}
}
} else {
- if (uap->id != td->td_tid) {
- FOREACH_THREAD_IN_PROC(p, ttd) {
- if (ttd->td_tid == uap->id)
- break;
- }
- } else
+ if (uap->id != td->td_tid)
+ ttd = thread_find(p, uap->id);
+ else
ttd = td;
if (ttd == NULL)
error = ESRCH;
@@ -338,7 +334,60 @@
else if (!_SIG_VALID(uap->sig))
error = EINVAL;
else
- tdsignal(ttd, uap->sig, SIGTARGET_TD);
+ tdsignal(p, ttd, uap->sig, NULL);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+int
+thr_kill2(struct thread *td, struct thr_kill2_args *uap)
+ /* pid_t pid, long id, int sig */
+{
+ struct thread *ttd;
+ struct proc *p;
+ int error;
+
+ AUDIT_ARG(signum, uap->sig);
+
+ if (uap->pid == td->td_proc->p_pid) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else if ((p = pfind(uap->pid)) == NULL) {
+ return (ESRCH);
+ }
+ AUDIT_ARG(process, p);
+
+ error = p_cansignal(td, p, uap->sig);
+ if (error == 0) {
+ if (uap->id == -1) {
+ if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
+ error = EINVAL;
+ } else {
+ error = ESRCH;
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ if (ttd != td) {
+ error = 0;
+ if (uap->sig == 0)
+ break;
+ tdsignal(p, ttd, uap->sig, NULL);
+ }
+ }
+ }
+ } else {
+ if (uap->id != td->td_tid)
+ ttd = thread_find(p, uap->id);
+ else
+ ttd = td;
+ if (ttd == NULL)
+ error = ESRCH;
+ else if (uap->sig == 0)
+ ;
+ else if (!_SIG_VALID(uap->sig))
+ error = EINVAL;
+ else
+ tdsignal(p, ttd, uap->sig, NULL);
+ }
}
PROC_UNLOCK(p);
return (error);
@@ -348,33 +397,50 @@
thr_suspend(struct thread *td, struct thr_suspend_args *uap)
/* const struct timespec *timeout */
{
- struct timespec ts;
- struct timeval tv;
+ struct timespec ts, *tsp;
int error;
- int hz;
- hz = 0;
error = 0;
+ tsp = NULL;
if (uap->timeout != NULL) {
error = copyin((const void *)uap->timeout, (void *)&ts,
sizeof(struct timespec));
if (error != 0)
return (error);
- if (ts.tv_nsec < 0 || ts.tv_nsec > 1000000000)
+ tsp = &ts;
+ }
+
+ return (kern_thr_suspend(td, tsp));
+}
+
+int
+kern_thr_suspend(struct thread *td, struct timespec *tsp)
+{
+ struct timeval tv;
+ int error = 0, hz = 0;
+
+ if (tsp != NULL) {
+ if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
return (EINVAL);
- if (ts.tv_sec == 0 && ts.tv_nsec == 0)
+ if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
return (ETIMEDOUT);
- TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ TIMESPEC_TO_TIMEVAL(&tv, tsp);
hz = tvtohz(&tv);
}
+
+ if (td->td_pflags & TDP_WAKEUP) {
+ td->td_pflags &= ~TDP_WAKEUP;
+ return (0);
+ }
+
PROC_LOCK(td->td_proc);
if ((td->td_flags & TDF_THRWAKEUP) == 0)
- error = msleep((void *)td, &td->td_proc->p_mtx,
- PCATCH, "lthr", hz);
+ error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr",
+ hz);
if (td->td_flags & TDF_THRWAKEUP) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags &= ~TDF_THRWAKEUP;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
PROC_UNLOCK(td->td_proc);
return (0);
}
@@ -392,21 +458,54 @@
thr_wake(struct thread *td, struct thr_wake_args *uap)
/* long id */
{
+ struct proc *p;
struct thread *ttd;
- PROC_LOCK(td->td_proc);
- FOREACH_THREAD_IN_PROC(td->td_proc, ttd) {
- if (ttd->td_tid == uap->id)
- break;
- }
+ if (uap->id == td->td_tid) {
+ td->td_pflags |= TDP_WAKEUP;
+ return (0);
+ }
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ ttd = thread_find(p, uap->id);
if (ttd == NULL) {
- PROC_UNLOCK(td->td_proc);
+ PROC_UNLOCK(p);
return (ESRCH);
}
- mtx_lock_spin(&sched_lock);
+ thread_lock(ttd);
ttd->td_flags |= TDF_THRWAKEUP;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(ttd);
wakeup((void *)ttd);
- PROC_UNLOCK(td->td_proc);
+ PROC_UNLOCK(p);
return (0);
}
+
+int
+thr_set_name(struct thread *td, struct thr_set_name_args *uap)
+{
+ struct proc *p = td->td_proc;
+ char name[MAXCOMLEN + 1];
+ struct thread *ttd;
+ int error;
+
+ error = 0;
+ name[0] = '\0';
+ if (uap->name != NULL) {
+ error = copyinstr(uap->name, name, sizeof(name),
+ NULL);
+ if (error)
+ return (error);
+ }
+ PROC_LOCK(p);
+ if (uap->id == td->td_tid)
+ ttd = td;
+ else
+ ttd = thread_find(p, uap->id);
+ if (ttd != NULL)
+ strcpy(ttd->td_name, name);
+ else
+ error = ESRCH;
+ PROC_UNLOCK(p);
+ return (error);
+}
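thr_set_name() gives each thread a visible name for debuggers and thread-aware tools; from userland this is normally reached through libthr's pthread_set_name_np(3). A small usage sketch (compile with -pthread); the thread name itself is illustrative:

#include <pthread.h>
#include <pthread_np.h>

static void *
worker(void *arg)
{
    /* Label this thread so it can be told apart from its siblings. */
    pthread_set_name_np(pthread_self(), "worker");
    return (arg);
}

int
main(void)
{
    pthread_t td;

    pthread_create(&td, NULL, worker, NULL);
    pthread_join(td, NULL);
    return (0);
}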
Index: kern_event.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/kern_event.c -L sys/kern/kern_event.c -u -r1.3 -r1.4
--- sys/kern/kern_event.c
+++ sys/kern/kern_event.c
@@ -26,7 +26,9 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_event.c,v 1.93.2.3.2.1 2006/04/19 16:00:31 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_event.c,v 1.113 2007/07/14 21:23:30 rodrigc Exp $");
+
+#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -57,6 +59,9 @@
#include <sys/syscallsubr.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
#include <vm/uma.h>
@@ -83,7 +88,9 @@
static int kevent_copyout(void *arg, struct kevent *kevp, int count);
static int kevent_copyin(void *arg, struct kevent *kevp, int count);
-static int kqueue_aquire(struct file *fp, struct kqueue **kqp);
+static int kqueue_register(struct kqueue *kq, struct kevent *kev,
+ struct thread *td, int waitok);
+static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void kqueue_release(struct kqueue *kq, int locked);
static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
uintptr_t ident, int waitok);
@@ -247,6 +254,7 @@
{ &timer_filtops }, /* EVFILT_TIMER */
{ &file_filtops }, /* EVFILT_NETDEV */
{ &fs_filtops }, /* EVFILT_FS */
+ { &null_filtops }, /* EVFILT_LIO */
};
/*
@@ -388,6 +396,7 @@
if (!(kn->kn_status & KN_DETACHED))
knlist_remove_inevent(&p->p_klist, kn);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ kn->kn_data = p->p_xstat;
kn->kn_ptr.p_proc = NULL;
return (1);
}
@@ -497,9 +506,6 @@
return (kn->kn_data != 0);
}
-/*
- * MPSAFE
- */
int
kqueue(struct thread *td, struct kqueue_args *uap)
{
@@ -521,15 +527,15 @@
knlist_init(&kq->kq_sel.si_note, &kq->kq_lock, NULL, NULL, NULL);
TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
FILE_LOCK(fp);
fp->f_flag = FREAD | FWRITE;
fp->f_type = DTYPE_KQUEUE;
- fp->f_ops = &kqueueops;
fp->f_data = kq;
+ fp->f_ops = &kqueueops;
FILE_UNLOCK(fp);
fdrop(fp, td);
@@ -548,9 +554,6 @@
const struct timespec *timeout;
};
#endif
-/*
- * MPSAFE
- */
int
kevent(struct thread *td, struct kevent_args *uap)
{
@@ -559,6 +562,12 @@
kevent_copyout,
kevent_copyin};
int error;
+#ifdef KTRACE
+ struct uio ktruio;
+ struct iovec ktriov;
+ struct uio *ktruioin = NULL;
+ struct uio *ktruioout = NULL;
+#endif
if (uap->timeout != NULL) {
error = copyin(uap->timeout, &ts, sizeof(ts));
@@ -568,8 +577,33 @@
} else
tsp = NULL;
- return (kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
- &k_ops, tsp));
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov.iov_base = uap->changelist;
+ ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
+ ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
+ .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
+ .uio_td = td };
+ ktruioin = cloneuio(&ktruio);
+ ktriov.iov_base = uap->eventlist;
+ ktriov.iov_len = uap->nevents * sizeof(struct kevent);
+ ktruioout = cloneuio(&ktruio);
+ }
+#endif
+
+ error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
+ &k_ops, tsp);
+
+#ifdef KTRACE
+ if (ktruioin != NULL) {
+ ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
+ ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
+ ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
+ ktrgenio(uap->fd, UIO_READ, ktruioout, error);
+ }
+#endif
+
+ return (error);
}
/*
@@ -620,7 +654,7 @@
if ((error = fget(td, fd, &fp)) != 0)
return (error);
- if ((error = kqueue_aquire(fp, &kq)) != 0)
+ if ((error = kqueue_acquire(fp, &kq)) != 0)
goto done_norel;
nerrors = 0;
@@ -633,6 +667,8 @@
changes = keva;
for (i = 0; i < n; i++) {
kevp = &changes[i];
+ if (!kevp->filter)
+ continue;
kevp->flags &= ~EV_SYSFLAGS;
error = kqueue_register(kq, kevp, td, 1);
if (error) {
@@ -660,8 +696,7 @@
done:
kqueue_release(kq, 0);
done_norel:
- if (fp != NULL)
- fdrop(fp, td);
+ fdrop(fp, td);
return (error);
}
@@ -744,22 +779,19 @@
}
/*
- * A ref to kq (obtained via kqueue_aquire) should be held. waitok will
+ * A ref to kq (obtained via kqueue_acquire) must be held. waitok will
* influence if memory allocation should wait. Make sure it is 0 if you
* hold any mutexes.
*/
-int
+static int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
- struct filedesc *fdp;
struct filterops *fops;
struct file *fp;
struct knote *kn, *tkn;
int error, filt, event;
int haskqglobal;
- int fd;
- fdp = NULL;
fp = NULL;
kn = NULL;
error = 0;
@@ -775,22 +807,13 @@
findkn:
if (fops->f_isfd) {
KASSERT(td != NULL, ("td is NULL"));
- fdp = td->td_proc->p_fd;
- FILEDESC_LOCK(fdp);
- /* validate descriptor */
- fd = kev->ident;
- if (fd < 0 || fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL) {
- FILEDESC_UNLOCK(fdp);
- error = EBADF;
+ error = fget(td, kev->ident, &fp);
+ if (error)
goto done;
- }
- fhold(fp);
if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
kev->ident, 0) != 0) {
- /* unlock and try again */
- FILEDESC_UNLOCK(fdp);
+ /* try again */
fdrop(fp, td);
fp = NULL;
error = kqueue_expand(kq, fops, kev->ident, waitok);
@@ -808,15 +831,13 @@
* they are the same thing.
*/
if (fp->f_data == kq) {
- FILEDESC_UNLOCK(fdp);
error = EINVAL;
- goto done_noglobal;
+ goto done;
}
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
}
- FILEDESC_UNLOCK(fdp);
KQ_LOCK(kq);
if (kev->ident < kq->kq_knlistsize) {
SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
@@ -866,6 +887,7 @@
kn = tkn;
tkn = NULL;
if (kn == NULL) {
+ KQ_UNLOCK(kq);
error = ENOMEM;
goto done;
}
@@ -951,7 +973,6 @@
done:
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
-done_noglobal:
if (fp != NULL)
fdrop(fp, td);
if (tkn != NULL)
@@ -962,7 +983,7 @@
}
static int
-kqueue_aquire(struct file *fp, struct kqueue **kqp)
+kqueue_acquire(struct file *fp, struct kqueue **kqp)
{
int error;
struct kqueue *kq;
@@ -1370,7 +1391,7 @@
int revents = 0;
int error;
- if ((error = kqueue_aquire(fp, &kq)))
+ if ((error = kqueue_acquire(fp, &kq)))
return POLLERR;
KQ_LOCK(kq);
@@ -1415,7 +1436,7 @@
int i;
int error;
- if ((error = kqueue_aquire(fp, &kq)))
+ if ((error = kqueue_acquire(fp, &kq)))
return error;
KQ_LOCK(kq);
@@ -1471,9 +1492,9 @@
KQ_UNLOCK(kq);
- FILEDESC_LOCK_FAST(fdp);
+ FILEDESC_XLOCK(fdp);
SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
- FILEDESC_UNLOCK_FAST(fdp);
+ FILEDESC_XUNLOCK(fdp);
knlist_destroy(&kq->kq_sel.si_note);
mtx_destroy(&kq->kq_lock);
@@ -1669,7 +1690,7 @@
knl->kl_lock = knlist_mtx_lock;
else
knl->kl_lock = kl_lock;
- if (kl_lock == NULL)
+ if (kl_unlock == NULL)
knl->kl_unlock = knlist_mtx_unlock;
else
knl->kl_unlock = kl_unlock;
@@ -1705,18 +1726,18 @@
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
- struct knote *kn;
+ struct knote *kn, *kn2;
struct kqueue *kq;
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {
KNL_ASSERT_UNLOCKED(knl);
-again: /* need to reaquire lock since we have dropped it */
+again: /* need to reacquire lock since we have dropped it */
knl->kl_lock(knl->kl_lockarg);
}
- SLIST_FOREACH(kn, &knl->kl_list, kn_selnext) {
+ SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX)) {
@@ -1759,9 +1780,9 @@
}
/*
- * remove all knotes referencing a specified fd
- * must be called with FILEDESC lock. This prevents a race where a new fd
- * comes along and occupies the entry and we attach a knote to the fd.
+ * Remove all knotes referencing a specified fd; must be called with the
+ * FILEDESC lock held. This prevents a race where a new fd comes along and
+ * occupies the entry and we attach a knote to the fd.
*/
void
knote_fdclose(struct thread *td, int fd)
@@ -1771,7 +1792,7 @@
struct knote *kn;
int influx;
- FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+ FILEDESC_XLOCK_ASSERT(fdp);
/*
* We shouldn't have to worry about new kevents appearing on fd
@@ -1828,7 +1849,7 @@
}
/*
- * knote must already have been detatched using the f_detach method.
+ * knote must already have been detached using the f_detach method.
* no lock need to be held, it is assumed that the KN_INFLUX flag is set
* to prevent other removal.
*/
@@ -1850,7 +1871,8 @@
else
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
- SLIST_REMOVE(list, kn, knote, kn_link);
+ if (!SLIST_EMPTY(list))
+ SLIST_REMOVE(list, kn, knote, kn_link);
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
KQ_UNLOCK_FLUX(kq);
@@ -1913,3 +1935,28 @@
if (kn != NULL)
uma_zfree(knote_zone, kn);
}
+
+/*
+ * Register the kev w/ the kq specified by fd.
+ */
+int
+kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
+{
+ struct kqueue *kq;
+ struct file *fp;
+ int error;
+
+ if ((error = fget(td, fd, &fp)) != 0)
+ return (error);
+ if ((error = kqueue_acquire(fp, &kq)) != 0)
+ goto noacquire;
+
+ error = kqueue_register(kq, kev, td, waitok);
+
+ kqueue_release(kq, 0);
+
+noacquire:
+ fdrop(fp, td);
+
+ return error;
+}
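
kqfd_register() gives other kernel subsystems a way to post a kevent on a kqueue that a process has identified only by file descriptor; the AIO code behind the new EVFILT_LIO table entry is the expected consumer. A hypothetical in-kernel caller might look like the sketch below ("job" is an illustrative placeholder, not an existing kernel structure):

/*
 * Hypothetical helper: post an LIO completion event on the kqueue
 * the process passed in by descriptor.  Sketch only.
 */
static int
example_lio_notify(int kqfd, void *job, struct thread *td)
{
	struct kevent kev;

	EV_SET(&kev, (uintptr_t)job, EVFILT_LIO, EV_ADD | EV_FLAG1,
	    0, 0, job);
	/* waitok == 1: no mutexes are held, so allocation may sleep. */
	return (kqfd_register(kqfd, &kev, td, 1));
}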
Index: uipc_usrreq.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/uipc_usrreq.c -L sys/kern/uipc_usrreq.c -u -r1.2 -r1.3
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -1,7 +1,7 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California.
- * Copyright 2004-2005 Robert N. M. Watson
+ * Copyright (c) 2004-2007 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -31,21 +31,47 @@
* From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
*/
+/*
+ * UNIX Domain (Local) Sockets
+ *
+ * This is an implementation of UNIX (local) domain sockets. Each socket has
+ * an associated struct unpcb (UNIX protocol control block). Stream sockets
+ * may be connected to 0 or 1 other socket. Datagram sockets may be
+ * connected to 0, 1, or many other sockets. Sockets may be created and
+ * connected in pairs (socketpair(2)), or bound/connected to using the file
+ * system name space. For most purposes, only the receive socket buffer is
+ * used, as sending on one socket delivers directly to the receive socket
+ * buffer of a second socket.
+ *
+ * The implementation is substantially complicated by the fact that
+ * "ancillary data", such as file descriptors or credentials, may be passed
+ * across UNIX domain sockets. The potential for passing UNIX domain sockets
+ * over other UNIX domain sockets requires the implementation of a simple
+ * garbage collector to find and tear down cycles of disconnected sockets.
+ *
+ * TODO:
+ * SEQPACKET, RDM
+ * rethink name space problems
+ * need a proper out-of-band
+ * lock pushdown
+ */
+
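
As a concrete illustration of the model described above, a pair of connected stream sockets can be created directly with socketpair(2); data written on one end lands in the other end's receive buffer (userland sketch, error handling abbreviated):

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	int sv[2];
	char buf[32];
	ssize_t n;

	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv) == -1)
		err(1, "socketpair");

	/* A write on sv[0] is delivered straight into sv[1]'s receive buffer. */
	if (write(sv[0], "hello", 5) != 5)
		err(1, "write");
	n = read(sv[1], buf, sizeof(buf));
	if (n > 0)
		write(STDOUT_FILENO, buf, (size_t)n);

	close(sv[0]);
	close(sv[1]);
	return (0);
}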
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.155.2.3 2006/03/13 03:06:03 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.206.4.1 2008/01/23 12:08:12 rwatson Exp $");
+#include "opt_ddb.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/fcntl.h>
#include <sys/malloc.h> /* XXX must be before <sys/file.h> */
+#include <sys/eventhandler.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/mutex.h>
@@ -53,6 +79,7 @@
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
@@ -65,315 +92,653 @@
#include <sys/unpcb.h>
#include <sys/vnode.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
#include <vm/uma.h>
-static uma_zone_t unp_zone;
-static unp_gen_t unp_gencnt;
-static u_int unp_count;
+static uma_zone_t unp_zone;
+static unp_gen_t unp_gencnt;
+static u_int unp_count; /* Count of local sockets. */
+static ino_t unp_ino; /* Prototype for fake inode numbers. */
+static int unp_rights; /* File descriptors in flight. */
+static struct unp_head unp_shead; /* List of local stream sockets. */
+static struct unp_head unp_dhead; /* List of local datagram sockets. */
-static struct unp_head unp_shead, unp_dhead;
+static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
/*
- * Unix communications domain.
- *
- * TODO:
- * SEQPACKET, RDM
- * rethink name space problems
- * need a proper out-of-band
- * lock pushdown
+ * Garbage collection of cyclic file descriptor/socket references occurs
+ * asynchronously in a taskqueue context in order to avoid recursion and
+ * reentrance in the UNIX domain socket, file descriptor, and socket layer
+ * code. See unp_gc() for a full description.
*/
-static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
-static ino_t unp_ino; /* prototype for fake inode numbers */
-struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
+static struct task unp_gc_task;
/*
- * Currently, UNIX domain sockets are protected by a single subsystem lock,
- * which covers global data structures and variables, the contents of each
- * per-socket unpcb structure, and the so_pcb field in sockets attached to
- * the UNIX domain. This provides for a moderate degree of paralellism, as
- * receive operations on UNIX domain sockets do not need to acquire the
- * subsystem lock. Finer grained locking to permit send() without acquiring
- * a global lock would be a logical next step.
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
+ * stream sockets, although the total for sender and receiver is actually
+ * only PIPSIZ.
*
- * The UNIX domain socket lock preceds all socket layer locks, including the
- * socket lock and socket buffer lock, permitting UNIX domain socket code to
- * call into socket support routines without releasing its locks.
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace. Their recvspace should be
+ * large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define PIPSIZ 8192
+#endif
+static u_long unpst_sendspace = PIPSIZ;
+static u_long unpst_recvspace = PIPSIZ;
+static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
+static u_long unpdg_recvspace = 4*1024;
+
+SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
+SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
+
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+ &unpst_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpst_recvspace, 0, "");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &unpdg_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpdg_recvspace, 0, "");
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
+
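
The tunables declared above are exported under the net.local sysctl tree; a small userland sketch that inspects two of them through sysctlbyname(3) (the names are taken from the SYSCTL declarations above):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	unsigned long sendspace;
	int inflight;
	size_t len;

	len = sizeof(sendspace);
	if (sysctlbyname("net.local.stream.sendspace", &sendspace, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname");

	len = sizeof(inflight);
	if (sysctlbyname("net.local.inflight", &inflight, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname");

	printf("stream sendspace: %lu bytes, descriptors in flight: %d\n",
	    sendspace, inflight);
	return (0);
}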
+/*-
+ * Locking and synchronization:
+ *
+ * The global UNIX domain socket rwlock (unp_global_rwlock) protects all
+ * global variables, including the linked lists tracking the set of allocated
+ * UNIX domain sockets. The global rwlock also serves to prevent deadlock
+ * when more than one PCB lock is acquired at a time (i.e., during
+ * connect()). Finally, the global rwlock protects uncounted references from
+ * vnodes to sockets bound to those vnodes: to safely dereference the
+ * v_socket pointer, the global rwlock must be held while a full reference is
+ * acquired.
*
- * Some caution is required in areas where the UNIX domain socket code enters
- * VFS in order to create or find rendezvous points. This results in
- * dropping of the UNIX domain socket subsystem lock, acquisition of the
- * Giant lock, and potential sleeping. This increases the chances of races,
- * and exposes weaknesses in the socket->protocol API by offering poor
- * failure modes.
+ * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
+ * allocated in pru_attach() and freed in pru_detach(). The validity of that
+ * pointer is an invariant, so no lock is required to dereference the so_pcb
+ * pointer if a valid socket reference is held by the caller. In practice,
+ * this is always true during operations performed on a socket. Each unpcb
+ * has a back-pointer to its socket, unp_socket, which will be stable under
+ * the same circumstances.
+ *
+ * This pointer may only be safely dereferenced as long as a valid reference
+ * to the unpcb is held. Typically, this reference will be from the socket,
+ * or from another unpcb when the referring unpcb's lock is held (in order
+ * that the reference not be invalidated during use). For example, to follow
+ * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
+ * as unp_socket remains valid as long as the reference to unp_conn is valid.
+ *
+ * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx. Individual
+ * atomic reads without the lock may be performed "lockless", but more
+ * complex reads and read-modify-writes require the mutex to be held. No
+ * lock order is defined between unpcb locks -- multiple unpcb locks may be
+ * acquired at the same time only when holding the global UNIX domain socket
+ * rwlock exclusively, which prevents deadlocks.
+ *
+ * Blocking with UNIX domain sockets is a tricky issue: unlike most network
+ * protocols, bind() is a non-atomic operation, and connect() requires
+ * potential sleeping in the protocol, due to potentially waiting on local or
+ * distributed file systems. We try to separate "lookup" operations, which
+ * may sleep, and the IPC operations themselves, which typically can occur
+ * with relative atomicity as locks can be held over the entire operation.
+ *
+ * Another tricky issue is simultaneous multi-threaded or multi-process
+ * access to a single UNIX domain socket. These are handled by the flags
+ * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
+ * binding, both of which involve dropping UNIX domain socket locks in order
+ * to perform namei() and other file system operations.
*/
-static struct mtx unp_mtx;
-#define UNP_LOCK_INIT() \
- mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
-#define UNP_LOCK() mtx_lock(&unp_mtx)
-#define UNP_UNLOCK() mtx_unlock(&unp_mtx)
-#define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED)
-#define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED)
+static struct rwlock unp_global_rwlock;
+
+#define UNP_GLOBAL_LOCK_INIT() rw_init(&unp_global_rwlock, \
+ "unp_global_rwlock")
+
+#define UNP_GLOBAL_LOCK_ASSERT() rw_assert(&unp_global_rwlock, \
+ RA_LOCKED)
+#define UNP_GLOBAL_UNLOCK_ASSERT() rw_assert(&unp_global_rwlock, \
+ RA_UNLOCKED)
+
+#define UNP_GLOBAL_WLOCK() rw_wlock(&unp_global_rwlock)
+#define UNP_GLOBAL_WUNLOCK() rw_wunlock(&unp_global_rwlock)
+#define UNP_GLOBAL_WLOCK_ASSERT() rw_assert(&unp_global_rwlock, \
+ RA_WLOCKED)
+#define UNP_GLOBAL_WOWNED() rw_wowned(&unp_global_rwlock)
+
+#define UNP_GLOBAL_RLOCK() rw_rlock(&unp_global_rwlock)
+#define UNP_GLOBAL_RUNLOCK() rw_runlock(&unp_global_rwlock)
+#define UNP_GLOBAL_RLOCK_ASSERT() rw_assert(&unp_global_rwlock, \
+ RA_RLOCKED)
+
+#define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
+ "unp_mtx", "unp_mtx", \
+ MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+#define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
+#define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
+#define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
+#define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
+
+static int unp_connect(struct socket *, struct sockaddr *,
+ struct thread *);
+static int unp_connect2(struct socket *so, struct socket *so2, int);
+static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
+static void unp_shutdown(struct unpcb *);
+static void unp_drop(struct unpcb *, int);
+static void unp_gc(__unused void *, int);
+static void unp_scan(struct mbuf *, void (*)(struct file *));
+static void unp_mark(struct file *);
+static void unp_discard(struct file *);
+static void unp_freerights(struct file **, int);
+static int unp_internalize(struct mbuf **, struct thread *);
+static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
/*
- * Garbage collection of cyclic file descriptor/socket references occurs
- * asynchronously in a taskqueue context in order to avoid recursion and
- * reentrance in the UNIX domain socket, file descriptor, and socket layer
- * code. See unp_gc() for a full description.
+ * Definitions of protocols supported in the LOCAL domain.
*/
-static struct task unp_gc_task;
+static struct domain localdomain;
+static struct protosw localsw[] = {
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &localdomain,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+ .pr_ctloutput = &uipc_ctloutput,
+ .pr_usrreqs = &uipc_usrreqs
+},
+{
+ .pr_type = SOCK_DGRAM,
+ .pr_domain = &localdomain,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+ .pr_usrreqs = &uipc_usrreqs
+},
+};
-static int unp_attach(struct socket *);
-static void unp_detach(struct unpcb *);
-static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
-static int unp_connect(struct socket *,struct sockaddr *, struct thread *);
-static int unp_connect2(struct socket *so, struct socket *so2, int);
-static void unp_disconnect(struct unpcb *);
-static void unp_shutdown(struct unpcb *);
-static void unp_drop(struct unpcb *, int);
-static void unp_gc(__unused void *, int);
-static void unp_scan(struct mbuf *, void (*)(struct file *));
-static void unp_mark(struct file *);
-static void unp_discard(struct file *);
-static void unp_freerights(struct file **, int);
-static int unp_internalize(struct mbuf **, struct thread *);
-static int unp_listen(struct socket *, struct unpcb *, struct thread *);
+static struct domain localdomain = {
+ .dom_family = AF_LOCAL,
+ .dom_name = "local",
+ .dom_init = unp_init,
+ .dom_externalize = unp_externalize,
+ .dom_dispose = unp_dispose,
+ .dom_protosw = localsw,
+ .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])]
+};
+DOMAIN_SET(local);
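
The two protosw entries above are what a socket(2) call in the local domain resolves to: SOCK_STREAM traffic goes through the connection-oriented entry, SOCK_DGRAM through the atomic, addressed one. A minimal datagram sender against a bound path (userland sketch; the path is hypothetical and the receiver must already have bound it):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_un sun;
	int s;

	if ((s = socket(PF_LOCAL, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, "/var/run/example-dgram.sock",
	    sizeof(sun.sun_path));

	/* One atomic, addressed datagram (PR_ATOMIC|PR_ADDR). */
	if (sendto(s, "ping", 4, 0, (struct sockaddr *)&sun,
	    sizeof(sun)) == -1)
		err(1, "sendto");

	close(s);
	return (0);
}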
-static int
+static void
uipc_abort(struct socket *so)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
+ KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
+
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_drop(unp2, ECONNABORTED);
+ UNP_PCB_UNLOCK(unp2);
}
- unp_drop(unp, ECONNABORTED);
- unp_detach(unp);
- UNP_UNLOCK_ASSERT();
- ACCEPT_LOCK();
- SOCK_LOCK(so);
- sotryfree(so);
- return (0);
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_WUNLOCK();
}
static int
uipc_accept(struct socket *so, struct sockaddr **nam)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
const struct sockaddr *sa;
/*
- * Pass back name of connected socket,
- * if it was bound and we are still connected
- * (our peer may have closed already!).
+ * Pass back name of connected socket, if it was bound and we are
+ * still connected (our peer may have closed already!).
*/
- *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- free(*nam, M_SONAME);
- *nam = NULL;
- return (EINVAL);
- }
- if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
- sa = (struct sockaddr *) unp->unp_conn->unp_addr;
- else
+ KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_GLOBAL_RLOCK();
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL && unp2->unp_addr != NULL) {
+ UNP_PCB_LOCK(unp2);
+ sa = (struct sockaddr *) unp2->unp_addr;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp2);
+ } else {
sa = &sun_noname;
- bcopy(sa, *nam, sa->sa_len);
- UNP_UNLOCK();
+ bcopy(sa, *nam, sa->sa_len);
+ }
+ UNP_GLOBAL_RUNLOCK();
return (0);
}
static int
uipc_attach(struct socket *so, int proto, struct thread *td)
{
- struct unpcb *unp = sotounpcb(so);
+ u_long sendspace, recvspace;
+ struct unpcb *unp;
+ int error, locked;
+
+ KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ sendspace = unpst_sendspace;
+ recvspace = unpst_recvspace;
+ break;
+
+ case SOCK_DGRAM:
+ sendspace = unpdg_sendspace;
+ recvspace = unpdg_recvspace;
+ break;
+
+ default:
+ panic("uipc_attach");
+ }
+ error = soreserve(so, sendspace, recvspace);
+ if (error)
+ return (error);
+ }
+ unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
+ if (unp == NULL)
+ return (ENOBUFS);
+ LIST_INIT(&unp->unp_refs);
+ UNP_PCB_LOCK_INIT(unp);
+ unp->unp_socket = so;
+ so->so_pcb = unp;
+ unp->unp_refcount = 1;
+
+ /*
+ * uipc_attach() may be called indirectly from within the UNIX domain
+ * socket code via sonewconn() in unp_connect(). Since rwlocks can
+ * not be recursed, we do the closest thing.
+ */
+ locked = 0;
+ if (!UNP_GLOBAL_WOWNED()) {
+ UNP_GLOBAL_WLOCK();
+ locked = 1;
+ }
+ unp->unp_gencnt = ++unp_gencnt;
+ unp_count++;
+ LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead,
+ unp, unp_link);
+ if (locked)
+ UNP_GLOBAL_WUNLOCK();
- if (unp != NULL)
- return (EISCONN);
- return (unp_attach(so));
+ return (0);
}
static int
uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ struct vattr vattr;
+ int error, namelen, vfslocked;
+ struct nameidata nd;
struct unpcb *unp;
- int error;
+ struct vnode *vp;
+ struct mount *mp;
+ char *buf;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
+ KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
+
+ namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+ if (namelen <= 0)
return (EINVAL);
+
+ /*
+ * We don't allow simultaneous bind() calls on a single UNIX domain
+ * socket, so flag in-progress operations, and return an error if an
+ * operation is already in progress.
+ *
+ * Historically, we have not allowed a socket to be rebound, so this
+ * also returns an error. Not allowing re-binding simplifies the
+ * implementation and avoids a great many possible failure modes.
+ */
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode != NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (EINVAL);
+ }
+ if (unp->unp_flags & UNP_BINDING) {
+ UNP_PCB_UNLOCK(unp);
+ return (EALREADY);
+ }
+ unp->unp_flags |= UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+
+ buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
+ strlcpy(buf, soun->sun_path, namelen + 1);
+
+restart:
+ vfslocked = 0;
+ NDINIT(&nd, CREATE, MPSAFE | NOFOLLOW | LOCKPARENT | SAVENAME,
+ UIO_SYSSPACE, buf, td);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+ error = namei(&nd);
+ if (error)
+ goto error;
+ vp = nd.ni_vp;
+ vfslocked = NDHASGIANT(&nd);
+ if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULL) {
+ vrele(vp);
+ error = EADDRINUSE;
+ goto error;
+ }
+ error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+ if (error)
+ goto error;
+ VFS_UNLOCK_GIANT(vfslocked);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VSOCK;
+ vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
+#ifdef MAC
+ error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+#endif
+ if (error == 0) {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error) {
+ vn_finished_write(mp);
+ goto error;
}
- error = unp_bind(unp, nam, td);
- UNP_UNLOCK();
+ vp = nd.ni_vp;
+ ASSERT_VOP_ELOCKED(vp, "uipc_bind");
+ soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
+
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
+ vp->v_socket = unp->unp_socket;
+ unp->unp_vnode = vp;
+ unp->unp_addr = soun;
+ unp->unp_flags &= ~UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_WUNLOCK();
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ free(buf, M_TEMP);
+ return (0);
+
+error:
+ VFS_UNLOCK_GIANT(vfslocked);
+ UNP_PCB_LOCK(unp);
+ unp->unp_flags &= ~UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+ free(buf, M_TEMP);
return (error);
}
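
From userland, the rules enforced above show up as errno values: binding to a path that already has a file system node fails with EADDRINUSE, and rebinding an already-bound socket fails with EINVAL. A typical server therefore unlinks the path before binding (sketch, error handling abbreviated):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_un sun;
	int s;

	if ((s = socket(PF_LOCAL, SOCK_STREAM, 0)) == -1)
		err(1, "socket");

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));

	/* Remove any stale node first; otherwise bind() returns EADDRINUSE. */
	(void)unlink(sun.sun_path);

	if (bind(s, (struct sockaddr *)&sun, sizeof(sun)) == -1)
		err(1, "bind");
	if (listen(s, 5) == -1)
		err(1, "listen");

	close(s);
	return (0);
}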
static int
uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
- struct unpcb *unp;
int error;
KASSERT(td == curthread, ("uipc_connect: td != curthread"));
+ UNP_GLOBAL_WLOCK();
+ error = unp_connect(so, nam, td);
+ UNP_GLOBAL_WUNLOCK();
+ return (error);
+}
+
+static void
+uipc_close(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
+ KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
+
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
}
- error = unp_connect(so, nam, td);
- UNP_UNLOCK();
- return (error);
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_WUNLOCK();
}
int
uipc_connect2(struct socket *so1, struct socket *so2)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
int error;
- UNP_LOCK();
- unp = sotounpcb(so1);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
- }
+ UNP_GLOBAL_WLOCK();
+ unp = so1->so_pcb;
+ KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
+ UNP_PCB_LOCK(unp);
+ unp2 = so2->so_pcb;
+ KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
+ UNP_PCB_LOCK(unp2);
error = unp_connect2(so1, so2, PRU_CONNECT2);
- UNP_UNLOCK();
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_WUNLOCK();
return (error);
}
/* control is EOPNOTSUPP */
-static int
+static void
uipc_detach(struct socket *so)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
+ struct sockaddr_un *saved_unp_addr;
+ struct vnode *vp;
+ int freeunp, local_unp_rights;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
+ KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
+
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
+
+ LIST_REMOVE(unp, unp_link);
+ unp->unp_gencnt = ++unp_gencnt;
+ --unp_count;
+
+ /*
+ * XXXRW: Should assert vp->v_socket == so.
+ */
+ if ((vp = unp->unp_vnode) != NULL) {
+ unp->unp_vnode->v_socket = NULL;
+ unp->unp_vnode = NULL;
}
- unp_detach(unp);
- UNP_UNLOCK_ASSERT();
- return (0);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+
+ /*
+ * We hold the global lock, so it's OK to acquire multiple pcb locks
+ * at a time.
+ */
+ while (!LIST_EMPTY(&unp->unp_refs)) {
+ struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
+
+ UNP_PCB_LOCK(ref);
+ unp_drop(ref, ECONNRESET);
+ UNP_PCB_UNLOCK(ref);
+ }
+ UNP_GLOBAL_WUNLOCK();
+ unp->unp_socket->so_pcb = NULL;
+ local_unp_rights = unp_rights;
+ saved_unp_addr = unp->unp_addr;
+ unp->unp_addr = NULL;
+ unp->unp_refcount--;
+ freeunp = (unp->unp_refcount == 0);
+ if (saved_unp_addr != NULL)
+ FREE(saved_unp_addr, M_SONAME);
+ if (freeunp) {
+ UNP_PCB_LOCK_DESTROY(unp);
+ uma_zfree(unp_zone, unp);
+ } else
+ UNP_PCB_UNLOCK(unp);
+ if (vp) {
+ int vfslocked;
+
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ vrele(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ if (local_unp_rights)
+ taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
}
static int
uipc_disconnect(struct socket *so)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
+ KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
+
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
}
- unp_disconnect(unp);
- UNP_UNLOCK();
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_WUNLOCK();
return (0);
}
static int
-uipc_listen(struct socket *so, struct thread *td)
+uipc_listen(struct socket *so, int backlog, struct thread *td)
{
struct unpcb *unp;
int error;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL || unp->unp_vnode == NULL) {
- UNP_UNLOCK();
+ KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
+
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode == NULL) {
+ UNP_PCB_UNLOCK(unp);
return (EINVAL);
}
- error = unp_listen(so, unp, td);
- UNP_UNLOCK();
+
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0) {
+ cru2x(td->td_ucred, &unp->unp_peercred);
+ unp->unp_flags |= UNP_HAVEPCCACHED;
+ solisten_proto(so, backlog);
+ }
+ SOCK_UNLOCK(so);
+ UNP_PCB_UNLOCK(unp);
return (error);
}
static int
uipc_peeraddr(struct socket *so, struct sockaddr **nam)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
const struct sockaddr *sa;
- *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- free(*nam, M_SONAME);
- *nam = NULL;
- return (EINVAL);
- }
- if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
- sa = (struct sockaddr *) unp->unp_conn->unp_addr;
- else {
- /*
- * XXX: It seems that this test always fails even when
- * connection is established. So, this else clause is
- * added as workaround to return PF_LOCAL sockaddr.
- */
+ KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_PCB_LOCK(unp);
+ /*
+ * XXX: It seems that this test always fails even when the connection is
+ * established, so this else clause is added as a workaround to return a
+ * PF_LOCAL sockaddr.
+ */
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ if (unp2->unp_addr != NULL)
+ sa = (struct sockaddr *) unp->unp_conn->unp_addr;
+ else
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp2);
+ } else {
sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
}
- bcopy(sa, *nam, sa->sa_len);
- UNP_UNLOCK();
+ UNP_PCB_UNLOCK(unp);
return (0);
}
static int
uipc_rcvd(struct socket *so, int flags)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
struct socket *so2;
+ u_int mbcnt, sbcc;
u_long newhiwat;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
- }
- switch (so->so_type) {
- case SOCK_DGRAM:
- panic("uipc_rcvd DGRAM?");
- /*NOTREACHED*/
+ KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
- case SOCK_STREAM:
- if (unp->unp_conn == NULL)
- break;
- so2 = unp->unp_conn->unp_socket;
- SOCKBUF_LOCK(&so2->so_snd);
- SOCKBUF_LOCK(&so->so_rcv);
- /*
- * Adjust backpressure on sender
- * and wakeup any waiting to write.
- */
- so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
- unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
- newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
- so->so_rcv.sb_cc;
- (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
- newhiwat, RLIM_INFINITY);
- unp->unp_cc = so->so_rcv.sb_cc;
- SOCKBUF_UNLOCK(&so->so_rcv);
- sowwakeup_locked(so2);
- break;
+ if (so->so_type == SOCK_DGRAM)
+ panic("uipc_rcvd DGRAM?");
- default:
+ if (so->so_type != SOCK_STREAM)
panic("uipc_rcvd unknown socktype");
+
+ /*
+ * Adjust backpressure on sender and wakeup any waiting to write.
+ *
+ * The unp lock is acquired to maintain the validity of the unp_conn
+ * pointer; no lock on unp2 is required as unp2->unp_socket will be
+ * static as long as we don't permit unp2 to disconnect from unp,
+ * which is prevented by the lock on unp. We cache values from
+ * so_rcv to avoid holding the so_rcv lock over the entire
+ * transaction on the remote so_snd.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ mbcnt = so->so_rcv.sb_mbcnt;
+ sbcc = so->so_rcv.sb_cc;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (0);
}
- UNP_UNLOCK();
+ so2 = unp2->unp_socket;
+ SOCKBUF_LOCK(&so2->so_snd);
+ so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
+ newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
+ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ sowwakeup_locked(so2);
+ unp->unp_mbcnt = mbcnt;
+ unp->unp_cc = sbcc;
+ UNP_PCB_UNLOCK(unp);
return (0);
}
@@ -383,16 +748,15 @@
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
- int error = 0;
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
struct socket *so2;
+ u_int mbcnt, sbcc;
u_long newhiwat;
+ int error = 0;
unp = sotounpcb(so);
- if (unp == NULL) {
- error = EINVAL;
- goto release;
- }
+ KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
+
if (flags & PRUS_OOB) {
error = EOPNOTSUPP;
goto release;
@@ -401,40 +765,48 @@
if (control != NULL && (error = unp_internalize(&control, td)))
goto release;
- UNP_LOCK();
- unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- error = EINVAL;
- goto dispose_release;
- }
+ if ((nam != NULL) || (flags & PRUS_EOF))
+ UNP_GLOBAL_WLOCK();
+ else
+ UNP_GLOBAL_RLOCK();
switch (so->so_type) {
case SOCK_DGRAM:
{
const struct sockaddr *from;
+ unp2 = unp->unp_conn;
if (nam != NULL) {
- if (unp->unp_conn != NULL) {
+ UNP_GLOBAL_WLOCK_ASSERT();
+ if (unp2 != NULL) {
error = EISCONN;
break;
}
error = unp_connect(so, nam, td);
if (error)
break;
- } else {
- if (unp->unp_conn == NULL) {
- error = ENOTCONN;
- break;
- }
+ unp2 = unp->unp_conn;
}
- so2 = unp->unp_conn->unp_socket;
+ /*
+ * Because connect() and send() are non-atomic in a sendto()
+ * with a target address, it's possible that the socket will
+ * have disconnected before the send() can run. In that case
+ * return the slightly counter-intuitive but otherwise
+ * correct error that the socket is not connected.
+ */
+ if (unp2 == NULL) {
+ error = ENOTCONN;
+ break;
+ }
+ /* Lockless read. */
+ if (unp2->unp_flags & UNP_WANTCRED)
+ control = unp_addsockcred(td, control);
+ UNP_PCB_LOCK(unp);
if (unp->unp_addr != NULL)
from = (struct sockaddr *)unp->unp_addr;
else
from = &sun_noname;
- if (unp->unp_conn->unp_flags & UNP_WANTCRED)
- control = unp_addsockcred(td, control);
+ so2 = unp2->unp_socket;
SOCKBUF_LOCK(&so2->so_rcv);
if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
sorwakeup_locked(so2);
@@ -444,19 +816,26 @@
SOCKBUF_UNLOCK(&so2->so_rcv);
error = ENOBUFS;
}
- if (nam != NULL)
- unp_disconnect(unp);
+ if (nam != NULL) {
+ UNP_GLOBAL_WLOCK_ASSERT();
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
break;
}
case SOCK_STREAM:
- /* Connect if not connected yet. */
/*
- * Note: A better implementation would complain
- * if not equal to the peer's address.
+ * Connect if not connected yet.
+ *
+ * Note: A better implementation would complain if not equal
+ * to the peer's address.
*/
if ((so->so_state & SS_ISCONNECTED) == 0) {
if (nam != NULL) {
+ UNP_GLOBAL_WLOCK_ASSERT();
error = unp_connect(so, nam, td);
if (error)
break; /* XXX */
@@ -466,45 +845,61 @@
}
}
- SOCKBUF_LOCK(&so->so_snd);
+ /* Lockless read. */
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
- SOCKBUF_UNLOCK(&so->so_snd);
error = EPIPE;
break;
}
- if (unp->unp_conn == NULL)
- panic("uipc_send connected but no connection?");
- so2 = unp->unp_conn->unp_socket;
+ /*
+ * Because connect() and send() are non-atomic in a sendto()
+ * with a target address, it's possible that the socket will
+ * have disconnected before the send() can run. In that case
+ * return the slightly counter-intuitive but otherwise
+ * correct error that the socket is not connected.
+ *
+ * Locking here must be done carefully: the global lock
+ * prevents interconnections between unpcbs from changing, so
+ * we can traverse from unp to unp2 without acquiring unp's
+ * lock. Socket buffer locks follow unpcb locks, so we can
+ * acquire both remote and local socket buffer locks.
+ */
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ error = ENOTCONN;
+ break;
+ }
+ so2 = unp2->unp_socket;
+ UNP_PCB_LOCK(unp2);
SOCKBUF_LOCK(&so2->so_rcv);
- if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
+ if (unp2->unp_flags & UNP_WANTCRED) {
/*
- * Credentials are passed only once on
- * SOCK_STREAM.
+ * Credentials are passed only once on SOCK_STREAM.
*/
- unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
+ unp2->unp_flags &= ~UNP_WANTCRED;
control = unp_addsockcred(td, control);
}
/*
- * Send to paired receive port, and then reduce
- * send buffer hiwater marks to maintain backpressure.
- * Wake up readers.
+ * Send to paired receive port, and then reduce send buffer
+ * hiwater marks to maintain backpressure. Wake up readers.
*/
if (control != NULL) {
if (sbappendcontrol_locked(&so2->so_rcv, m, control))
control = NULL;
- } else {
+ } else
sbappend_locked(&so2->so_rcv, m);
- }
- so->so_snd.sb_mbmax -=
- so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
- unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
- newhiwat = so->so_snd.sb_hiwat -
- (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
+ mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
+ unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+ sbcc = so2->so_rcv.sb_cc;
+ sorwakeup_locked(so2);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
newhiwat, RLIM_INFINITY);
+ so->so_snd.sb_mbmax -= mbcnt;
SOCKBUF_UNLOCK(&so->so_snd);
- unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
- sorwakeup_locked(so2);
+ unp2->unp_cc = sbcc;
+ UNP_PCB_UNLOCK(unp2);
m = NULL;
break;
@@ -513,16 +908,20 @@
}
/*
- * SEND_EOF is equivalent to a SEND followed by
- * a SHUTDOWN.
+ * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN.
*/
if (flags & PRUS_EOF) {
+ UNP_PCB_LOCK(unp);
socantsendmore(so);
unp_shutdown(unp);
+ UNP_PCB_UNLOCK(unp);
}
- UNP_UNLOCK();
-dispose_release:
+ if ((nam != NULL) || (flags & PRUS_EOF))
+ UNP_GLOBAL_WUNLOCK();
+ else
+ UNP_GLOBAL_RUNLOCK();
+
if (control != NULL && error != 0)
unp_dispose(control);
@@ -537,25 +936,26 @@
static int
uipc_sense(struct socket *so, struct stat *sb)
{
- struct unpcb *unp;
+ struct unpcb *unp, *unp2;
struct socket *so2;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
- }
+ KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
+
sb->st_blksize = so->so_snd.sb_hiwat;
- if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
- so2 = unp->unp_conn->unp_socket;
+ UNP_GLOBAL_RLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (so->so_type == SOCK_STREAM && unp2 != NULL) {
+ so2 = unp2->unp_socket;
sb->st_blksize += so2->so_rcv.sb_cc;
}
sb->st_dev = NODEV;
if (unp->unp_ino == 0)
unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
sb->st_ino = unp->unp_ino;
- UNP_UNLOCK();
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_RUNLOCK();
return (0);
}
@@ -564,15 +964,15 @@
{
struct unpcb *unp;
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
- }
+ KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
+
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
socantsendmore(so);
unp_shutdown(unp);
- UNP_UNLOCK();
+ UNP_PCB_UNLOCK(unp);
+ UNP_GLOBAL_WUNLOCK();
return (0);
}
@@ -582,21 +982,17 @@
struct unpcb *unp;
const struct sockaddr *sa;
- *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- free(*nam, M_SONAME);
- *nam = NULL;
- return (EINVAL);
- }
+ KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_PCB_LOCK(unp);
if (unp->unp_addr != NULL)
sa = (struct sockaddr *) unp->unp_addr;
else
sa = &sun_noname;
bcopy(sa, *nam, sa->sa_len);
- UNP_UNLOCK();
+ UNP_PCB_UNLOCK(unp);
return (0);
}
@@ -616,9 +1012,7 @@
.pru_sense = uipc_sense,
.pru_shutdown = uipc_shutdown,
.pru_sockaddr = uipc_sockaddr,
- .pru_sosend = sosend,
- .pru_soreceive = soreceive,
- .pru_sopoll = sopoll,
+ .pru_close = uipc_close,
};
int
@@ -631,284 +1025,87 @@
if (sopt->sopt_level != 0)
return (EINVAL);
- UNP_LOCK();
unp = sotounpcb(so);
- if (unp == NULL) {
- UNP_UNLOCK();
- return (EINVAL);
- }
- error = 0;
-
- switch (sopt->sopt_dir) {
- case SOPT_GET:
- switch (sopt->sopt_name) {
- case LOCAL_PEERCRED:
- if (unp->unp_flags & UNP_HAVEPC)
- xu = unp->unp_peercred;
- else {
- if (so->so_type == SOCK_STREAM)
- error = ENOTCONN;
- else
- error = EINVAL;
- }
- if (error == 0)
- error = sooptcopyout(sopt, &xu, sizeof(xu));
- break;
- case LOCAL_CREDS:
- optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
- error = sooptcopyout(sopt, &optval, sizeof(optval));
- break;
- case LOCAL_CONNWAIT:
- optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
- error = sooptcopyout(sopt, &optval, sizeof(optval));
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
- break;
- case SOPT_SET:
- switch (sopt->sopt_name) {
- case LOCAL_CREDS:
- case LOCAL_CONNWAIT:
- error = sooptcopyin(sopt, &optval, sizeof(optval),
- sizeof(optval));
- if (error)
- break;
-
-#define OPTSET(bit) \
- if (optval) \
- unp->unp_flags |= bit; \
- else \
- unp->unp_flags &= ~bit;
-
- switch (sopt->sopt_name) {
- case LOCAL_CREDS:
- OPTSET(UNP_WANTCRED);
- break;
- case LOCAL_CONNWAIT:
- OPTSET(UNP_CONNWAIT);
- break;
- default:
- break;
- }
- break;
-#undef OPTSET
- default:
- error = ENOPROTOOPT;
- break;
- }
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
- UNP_UNLOCK();
- return (error);
-}
-
-/*
- * Both send and receive buffers are allocated PIPSIZ bytes of buffering
- * for stream sockets, although the total for sender and receiver is
- * actually only PIPSIZ.
- * Datagram sockets really use the sendspace as the maximum datagram size,
- * and don't really want to reserve the sendspace. Their recvspace should
- * be large enough for at least one max-size datagram plus address.
- */
-#ifndef PIPSIZ
-#define PIPSIZ 8192
-#endif
-static u_long unpst_sendspace = PIPSIZ;
-static u_long unpst_recvspace = PIPSIZ;
-static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
-static u_long unpdg_recvspace = 4*1024;
-
-static int unp_rights; /* file descriptors in flight */
-
-SYSCTL_DECL(_net_local_stream);
-SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
- &unpst_sendspace, 0, "");
-SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
- &unpst_recvspace, 0, "");
-SYSCTL_DECL(_net_local_dgram);
-SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
- &unpdg_sendspace, 0, "");
-SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
- &unpdg_recvspace, 0, "");
-SYSCTL_DECL(_net_local);
-SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
-
-static int
-unp_attach(struct socket *so)
-{
- struct unpcb *unp;
- int error;
-
- if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
- switch (so->so_type) {
-
- case SOCK_STREAM:
- error = soreserve(so, unpst_sendspace, unpst_recvspace);
- break;
-
- case SOCK_DGRAM:
- error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
- break;
-
- default:
- panic("unp_attach");
- }
- if (error)
- return (error);
- }
- unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
- if (unp == NULL)
- return (ENOBUFS);
- LIST_INIT(&unp->unp_refs);
- unp->unp_socket = so;
- so->so_pcb = unp;
-
- UNP_LOCK();
- unp->unp_gencnt = ++unp_gencnt;
- unp_count++;
- LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
- : &unp_shead, unp, unp_link);
- UNP_UNLOCK();
-
- return (0);
-}
-
-static void
-unp_detach(struct unpcb *unp)
-{
- struct vnode *vp;
- int local_unp_rights;
-
- UNP_LOCK_ASSERT();
-
- LIST_REMOVE(unp, unp_link);
- unp->unp_gencnt = ++unp_gencnt;
- --unp_count;
- if ((vp = unp->unp_vnode) != NULL) {
- /*
- * XXXRW: should v_socket be frobbed only while holding
- * Giant?
- */
- unp->unp_vnode->v_socket = NULL;
- unp->unp_vnode = NULL;
- }
- if (unp->unp_conn != NULL)
- unp_disconnect(unp);
- while (!LIST_EMPTY(&unp->unp_refs)) {
- struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
- unp_drop(ref, ECONNRESET);
- }
- soisdisconnected(unp->unp_socket);
- unp->unp_socket->so_pcb = NULL;
- local_unp_rights = unp_rights;
- UNP_UNLOCK();
- if (unp->unp_addr != NULL)
- FREE(unp->unp_addr, M_SONAME);
- uma_zfree(unp_zone, unp);
- if (vp) {
- int vfslocked;
+ KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
+ error = 0;
+ switch (sopt->sopt_dir) {
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case LOCAL_PEERCRED:
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_flags & UNP_HAVEPC)
+ xu = unp->unp_peercred;
+ else {
+ if (so->so_type == SOCK_STREAM)
+ error = ENOTCONN;
+ else
+ error = EINVAL;
+ }
+ UNP_PCB_UNLOCK(unp);
+ if (error == 0)
+ error = sooptcopyout(sopt, &xu, sizeof(xu));
+ break;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- }
- if (local_unp_rights)
- taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
-}
+ case LOCAL_CREDS:
+ /* Unlocked read. */
+ optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
-static int
-unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
-{
- struct sockaddr_un *soun = (struct sockaddr_un *)nam;
- struct vnode *vp;
- struct mount *mp;
- struct vattr vattr;
- int error, namelen;
- struct nameidata nd;
- char *buf;
+ case LOCAL_CONNWAIT:
+ /* Unlocked read. */
+ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
- UNP_LOCK_ASSERT();
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ break;
- /*
- * XXXRW: This test-and-set of unp_vnode is non-atomic; the
- * unlocked read here is fine, but the value of unp_vnode needs
- * to be tested again after we do all the lookups to see if the
- * pcb is still unbound?
- */
- if (unp->unp_vnode != NULL)
- return (EINVAL);
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case LOCAL_CREDS:
+ case LOCAL_CONNWAIT:
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ if (error)
+ break;
- namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
- if (namelen <= 0)
- return (EINVAL);
+#define OPTSET(bit) do { \
+ UNP_PCB_LOCK(unp); \
+ if (optval) \
+ unp->unp_flags |= bit; \
+ else \
+ unp->unp_flags &= ~bit; \
+ UNP_PCB_UNLOCK(unp); \
+} while (0)
- UNP_UNLOCK();
+ switch (sopt->sopt_name) {
+ case LOCAL_CREDS:
+ OPTSET(UNP_WANTCRED);
+ break;
- buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
- strlcpy(buf, soun->sun_path, namelen + 1);
+ case LOCAL_CONNWAIT:
+ OPTSET(UNP_CONNWAIT);
+ break;
- mtx_lock(&Giant);
-restart:
- mtx_assert(&Giant, MA_OWNED);
- NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
- buf, td);
-/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
- error = namei(&nd);
- if (error)
- goto done;
- vp = nd.ni_vp;
- if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
- NDFREE(&nd, NDF_ONLY_PNBUF);
- if (nd.ni_dvp == vp)
- vrele(nd.ni_dvp);
- else
- vput(nd.ni_dvp);
- if (vp != NULL) {
- vrele(vp);
- error = EADDRINUSE;
- goto done;
+ default:
+ break;
+ }
+ break;
+#undef OPTSET
+ default:
+ error = ENOPROTOOPT;
+ break;
}
- error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
- if (error)
- goto done;
- goto restart;
- }
- VATTR_NULL(&vattr);
- vattr.va_type = VSOCK;
- vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
-#ifdef MAC
- error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
- &vattr);
-#endif
- if (error == 0) {
- VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
- error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
- }
- NDFREE(&nd, NDF_ONLY_PNBUF);
- vput(nd.ni_dvp);
- if (error) {
- vn_finished_write(mp);
- goto done;
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
}
- vp = nd.ni_vp;
- ASSERT_VOP_LOCKED(vp, "unp_bind");
- soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
- UNP_LOCK();
- vp->v_socket = unp->unp_socket;
- unp->unp_vnode = vp;
- unp->unp_addr = soun;
- UNP_UNLOCK();
- VOP_UNLOCK(vp, 0, td);
- vn_finished_write(mp);
-done:
- mtx_unlock(&Giant);
- free(buf, M_TEMP);
- UNP_LOCK();
return (error);
}
@@ -919,28 +1116,40 @@
struct vnode *vp;
struct socket *so2, *so3;
struct unpcb *unp, *unp2, *unp3;
- int error, len;
+ int error, len, vfslocked;
struct nameidata nd;
char buf[SOCK_MAXADDRLEN];
struct sockaddr *sa;
- UNP_LOCK_ASSERT();
+ UNP_GLOBAL_WLOCK_ASSERT();
+
unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
if (len <= 0)
return (EINVAL);
strlcpy(buf, soun->sun_path, len + 1);
- UNP_UNLOCK();
+
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_flags & UNP_CONNECTING) {
+ UNP_PCB_UNLOCK(unp);
+ return (EALREADY);
+ }
+ UNP_GLOBAL_WUNLOCK();
+ unp->unp_flags |= UNP_CONNECTING;
+ UNP_PCB_UNLOCK(unp);
+
sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
- mtx_lock(&Giant);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
+ NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf,
+ td);
error = namei(&nd);
if (error)
vp = NULL;
else
vp = nd.ni_vp;
ASSERT_VOP_LOCKED(vp, "unp_connect");
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error)
goto bad;
@@ -949,16 +1158,24 @@
error = ENOTSOCK;
goto bad;
}
+#ifdef MAC
+ error = mac_check_vnode_open(td->td_ucred, vp, VWRITE | VREAD);
+ if (error)
+ goto bad;
+#endif
error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
if (error)
goto bad;
- mtx_unlock(&Giant);
- UNP_LOCK();
+ VFS_UNLOCK_GIANT(vfslocked);
+
unp = sotounpcb(so);
- if (unp == NULL) {
- error = EINVAL;
- goto bad2;
- }
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+ /*
+ * Lock global lock for two reasons: make sure v_socket is stable,
+ * and to protect simultaneous locking of multiple pcbs.
+ */
+ UNP_GLOBAL_WLOCK();
so2 = vp->v_socket;
if (so2 == NULL) {
error = ECONNREFUSED;
@@ -971,14 +1188,11 @@
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if (so2->so_options & SO_ACCEPTCONN) {
/*
- * NB: drop locks here so unp_attach is entered
- * w/o locks; this avoids a recursive lock
- * of the head and holding sleep locks across
- * a (potentially) blocking malloc.
+ * We can't drop the global lock here or 'so2' may
+ * become invalid. As a result, we need to handle
+ * possibly lock recursion in uipc_attach.
*/
- UNP_UNLOCK();
so3 = sonewconn(so2, 0);
- UNP_LOCK();
} else
so3 = NULL;
if (so3 == NULL) {
@@ -988,6 +1202,9 @@
unp = sotounpcb(so);
unp2 = sotounpcb(so2);
unp3 = sotounpcb(so3);
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
+ UNP_PCB_LOCK(unp3);
if (unp2->unp_addr != NULL) {
bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
unp3->unp_addr = (struct sockaddr_un *) sa;
@@ -996,24 +1213,27 @@
/*
* unp_peercred management:
*
- * The connecter's (client's) credentials are copied
- * from its process structure at the time of connect()
- * (which is now).
+ * The connecter's (client's) credentials are copied from its
+ * process structure at the time of connect() (which is now).
*/
cru2x(td->td_ucred, &unp3->unp_peercred);
unp3->unp_flags |= UNP_HAVEPC;
/*
- * The receiver's (server's) credentials are copied
- * from the unp_peercred member of socket on which the
- * former called listen(); unp_listen() cached that
- * process's credentials at that time so we can use
- * them now.
+ * The receiver's (server's) credentials are copied from the
+ * unp_peercred member of socket on which the former called
+ * listen(); uipc_listen() cached that process's credentials
+ * at that time so we can use them now.
*/
KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
("unp_connect: listener without cached peercred"));
memcpy(&unp->unp_peercred, &unp2->unp_peercred,
sizeof(unp->unp_peercred));
unp->unp_flags |= UNP_HAVEPC;
+ if (unp2->unp_flags & UNP_WANTCRED)
+ unp3->unp_flags |= UNP_WANTCRED;
+ UNP_PCB_UNLOCK(unp3);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
#ifdef MAC
SOCK_LOCK(so);
mac_set_socket_peer_from_socket(so, so3);
@@ -1023,34 +1243,55 @@
so2 = so3;
}
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+ unp2 = sotounpcb(so2);
+ KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
error = unp_connect2(so, so2, PRU_CONNECT);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
bad2:
- UNP_UNLOCK();
- mtx_lock(&Giant);
+ UNP_GLOBAL_WUNLOCK();
+ if (vfslocked)
+ /*
+ * Giant has been previously acquired. This means the filesystem
+ * isn't MPSAFE, so acquire Giant once again.
+ */
+ mtx_lock(&Giant);
bad:
- mtx_assert(&Giant, MA_OWNED);
if (vp != NULL)
vput(vp);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
free(sa, M_SONAME);
- UNP_LOCK();
+ UNP_GLOBAL_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp->unp_flags &= ~UNP_CONNECTING;
+ UNP_PCB_UNLOCK(unp);
return (error);
}
static int
unp_connect2(struct socket *so, struct socket *so2, int req)
{
- struct unpcb *unp = sotounpcb(so);
+ struct unpcb *unp;
struct unpcb *unp2;
- UNP_LOCK_ASSERT();
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
+ unp2 = sotounpcb(so2);
+ KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
+
+ UNP_GLOBAL_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+ UNP_PCB_LOCK_ASSERT(unp2);
if (so2->so_type != so->so_type)
return (EPROTOTYPE);
- unp2 = sotounpcb(so2);
unp->unp_conn = unp2;
- switch (so->so_type) {
+ switch (so->so_type) {
case SOCK_DGRAM:
LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
soisconnected(so);
@@ -1073,18 +1314,18 @@
}
static void
-unp_disconnect(struct unpcb *unp)
+unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
{
- struct unpcb *unp2 = unp->unp_conn;
struct socket *so;
- UNP_LOCK_ASSERT();
+ KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
+
+ UNP_GLOBAL_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+ UNP_PCB_LOCK_ASSERT(unp2);
- if (unp2 == NULL)
- return;
unp->unp_conn = NULL;
switch (unp->unp_socket->so_type) {
-
case SOCK_DGRAM:
LIST_REMOVE(unp, unp_reflink);
so = unp->unp_socket;
@@ -1101,28 +1342,18 @@
}
}
-#ifdef notdef
-void
-unp_abort(struct unpcb *unp)
-{
-
- unp_detach(unp);
- UNP_UNLOCK_ASSERT();
-}
-#endif
-
/*
- * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
- * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
- * are safe to reference. It first scans the list of struct unpcb's to
- * generate a pointer list, then it rescans its list one entry at a time to
- * externalize and copyout. It checks the generation number to see if a
- * struct unpcb has been reused, and will skip it if so.
+ * unp_pcblist() walks the global list of struct unpcb's to generate a
+ * pointer list, bumping the refcount on each unpcb. It then copies them out
+ * sequentially, validating the generation number on each to see if it has
+ * been detached. All of this is necessary because copyout() may sleep on
+ * disk I/O.
*/
static int
unp_pcblist(SYSCTL_HANDLER_ARGS)
{
int error, i, n;
+ int freeunp;
struct unpcb *unp, **unp_list;
unp_gen_t gencnt;
struct xunpgen *xug;
@@ -1149,10 +1380,10 @@
* OK, now we're committed to doing something.
*/
xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
- UNP_LOCK();
+ UNP_GLOBAL_RLOCK();
gencnt = unp_gencnt;
n = unp_count;
- UNP_UNLOCK();
+ UNP_GLOBAL_RUNLOCK();
xug->xug_len = sizeof *xug;
xug->xug_count = n;
@@ -1166,24 +1397,31 @@
unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
- UNP_LOCK();
+ UNP_GLOBAL_RLOCK();
for (unp = LIST_FIRST(head), i = 0; unp && i < n;
unp = LIST_NEXT(unp, unp_link)) {
+ UNP_PCB_LOCK(unp);
if (unp->unp_gencnt <= gencnt) {
if (cr_cansee(req->td->td_ucred,
- unp->unp_socket->so_cred))
+ unp->unp_socket->so_cred)) {
+ UNP_PCB_UNLOCK(unp);
continue;
+ }
unp_list[i++] = unp;
+ unp->unp_refcount++;
}
+ UNP_PCB_UNLOCK(unp);
}
- UNP_UNLOCK();
- n = i; /* in case we lost some during malloc */
+ UNP_GLOBAL_RUNLOCK();
+ n = i; /* In case we lost some during malloc. */
error = 0;
xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
for (i = 0; i < n; i++) {
unp = unp_list[i];
- if (unp->unp_gencnt <= gencnt) {
+ UNP_PCB_LOCK(unp);
+ unp->unp_refcount--;
+ if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
xu->xu_len = sizeof *xu;
xu->xu_unpp = unp;
/*
@@ -1200,17 +1438,24 @@
unp->unp_conn->unp_addr->sun_len);
bcopy(unp, &xu->xu_unp, sizeof *unp);
sotoxsocket(unp->unp_socket, &xu->xu_socket);
+ UNP_PCB_UNLOCK(unp);
error = SYSCTL_OUT(req, xu, sizeof *xu);
+ } else {
+ freeunp = (unp->unp_refcount == 0);
+ UNP_PCB_UNLOCK(unp);
+ if (freeunp) {
+ UNP_PCB_LOCK_DESTROY(unp);
+ uma_zfree(unp_zone, unp);
+ }
}
}
free(xu, M_TEMP);
if (!error) {
/*
- * Give the user an updated idea of our state.
- * If the generation differs from what we told
- * her before, she knows that something happened
- * while we were processing this request, and it
- * might be necessary to retry.
+ * Give the user an updated idea of our state. If the
+ * generation differs from what we told her before, she knows
+ * that something happened while we were processing this
+ * request, and it might be necessary to retry.
*/
xug->xug_gen = unp_gencnt;
xug->xug_sogen = so_gencnt;
@@ -1232,33 +1477,38 @@
static void
unp_shutdown(struct unpcb *unp)
{
+ struct unpcb *unp2;
struct socket *so;
- UNP_LOCK_ASSERT();
+ UNP_GLOBAL_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
- if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
- (so = unp->unp_conn->unp_socket))
- socantrcvmore(so);
+ unp2 = unp->unp_conn;
+ if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) {
+ so = unp2->unp_socket;
+ if (so != NULL)
+ socantrcvmore(so);
+ }
}
static void
unp_drop(struct unpcb *unp, int errno)
{
struct socket *so = unp->unp_socket;
+ struct unpcb *unp2;
- UNP_LOCK_ASSERT();
+ UNP_GLOBAL_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
so->so_error = errno;
- unp_disconnect(unp);
-}
-
-#ifdef notdef
-void
-unp_drain(void)
-{
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL)
+ return;
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
}
-#endif
static void
unp_freerights(struct file **rp, int fdcount)
@@ -1267,13 +1517,14 @@
struct file *fp;
for (i = 0; i < fdcount; i++) {
- fp = *rp;
/*
- * zero the pointer before calling
- * unp_discard since it may end up
- * in unp_gc()..
+ * Zero the pointer before calling unp_discard since it may
+ * end up in unp_gc()..
+ *
+ * XXXRW: This is less true than it used to be.
*/
- *rp++ = 0;
+ fp = *rp;
+ *rp++ = NULL;
unp_discard(fp);
}
}
@@ -1293,7 +1544,7 @@
int f;
u_int newlen;
- UNP_UNLOCK_ASSERT();
+ UNP_GLOBAL_UNLOCK_ASSERT();
error = 0;
if (controlp != NULL) /* controlp == NULL => free control messages */
@@ -1318,25 +1569,25 @@
unp_freerights(rp, newfds);
goto next;
}
- FILEDESC_LOCK(td->td_proc->p_fd);
+ FILEDESC_XLOCK(td->td_proc->p_fd);
/* if the new FD's will not fit free them. */
if (!fdavail(td, newfds)) {
- FILEDESC_UNLOCK(td->td_proc->p_fd);
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
error = EMSGSIZE;
unp_freerights(rp, newfds);
goto next;
}
/*
- * now change each pointer to an fd in the global
- * table to an integer that is the index to the
- * local fd table entry that we set up to point
- * to the global one we are transferring.
+ * Now change each pointer to an fd in the global
+ * table to an integer that is the index to the local
+ * fd table entry that we set up to point to the
+ * global one we are transferring.
*/
newlen = newfds * sizeof(int);
*controlp = sbcreatecontrol(NULL, newlen,
SCM_RIGHTS, SOL_SOCKET);
if (*controlp == NULL) {
- FILEDESC_UNLOCK(td->td_proc->p_fd);
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
error = E2BIG;
unp_freerights(rp, newfds);
goto next;
@@ -1355,8 +1606,9 @@
unp_rights--;
*fdp++ = f;
}
- FILEDESC_UNLOCK(td->td_proc->p_fd);
- } else { /* We can just copy anything else across */
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
+ } else {
+ /* We can just copy anything else across. */
if (error || controlp == NULL)
goto next;
*controlp = sbcreatecontrol(NULL, datalen,
@@ -1388,18 +1640,28 @@
return (error);
}
+static void
+unp_zone_change(void *tag)
+{
+
+ uma_zone_set_max(unp_zone, maxsockets);
+}
+
void
unp_init(void)
{
+
unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, UMA_ALIGN_PTR, 0);
if (unp_zone == NULL)
panic("unp_init");
- uma_zone_set_max(unp_zone, nmbclusters);
+ uma_zone_set_max(unp_zone, maxsockets);
+ EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
+ NULL, EVENTHANDLER_PRI_ANY);
LIST_INIT(&unp_dhead);
LIST_INIT(&unp_shead);
TASK_INIT(&unp_gc_task, 0, unp_gc, NULL);
- UNP_LOCK_INIT();
+ UNP_GLOBAL_LOCK_INIT();
}
static int
@@ -1419,7 +1681,7 @@
int error, oldfds;
u_int newlen;
- UNP_UNLOCK_ASSERT();
+ UNP_GLOBAL_UNLOCK_ASSERT();
error = 0;
*controlp = NULL;
@@ -1462,27 +1724,28 @@
case SCM_RIGHTS:
oldfds = datalen / sizeof (int);
/*
- * check that all the FDs passed in refer to legal files
- * If not, reject the entire operation.
+ * Check that all the FDs passed in refer to legal
+ * files. If not, reject the entire operation.
*/
fdp = data;
- FILEDESC_LOCK(fdescp);
+ FILEDESC_SLOCK(fdescp);
for (i = 0; i < oldfds; i++) {
fd = *fdp++;
if ((unsigned)fd >= fdescp->fd_nfiles ||
fdescp->fd_ofiles[fd] == NULL) {
- FILEDESC_UNLOCK(fdescp);
+ FILEDESC_SUNLOCK(fdescp);
error = EBADF;
goto out;
}
fp = fdescp->fd_ofiles[fd];
if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
- FILEDESC_UNLOCK(fdescp);
+ FILEDESC_SUNLOCK(fdescp);
error = EOPNOTSUPP;
goto out;
}
}
+
/*
* Now replace the integer FDs with pointers to
* the associated global file table entry..
@@ -1491,7 +1754,7 @@
*controlp = sbcreatecontrol(NULL, newlen,
SCM_RIGHTS, SOL_SOCKET);
if (*controlp == NULL) {
- FILEDESC_UNLOCK(fdescp);
+ FILEDESC_SUNLOCK(fdescp);
error = E2BIG;
goto out;
}
@@ -1508,7 +1771,7 @@
FILE_UNLOCK(fp);
unp_rights++;
}
- FILEDESC_UNLOCK(fdescp);
+ FILEDESC_SUNLOCK(fdescp);
break;
case SCM_TIMESTAMP:
@@ -1546,11 +1809,12 @@
return (error);
}
-struct mbuf *
+static struct mbuf *
unp_addsockcred(struct thread *td, struct mbuf *control)
{
- struct mbuf *m, *n;
+ struct mbuf *m, *n, *n_prev;
struct sockcred *sc;
+ const struct cmsghdr *cm;
int ngroups;
int i;
@@ -1559,7 +1823,6 @@
m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
if (m == NULL)
return (control);
- m->m_next = NULL;
sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
sc->sc_uid = td->td_ucred->cr_ruid;
@@ -1571,16 +1834,30 @@
sc->sc_groups[i] = td->td_ucred->cr_groups[i];
/*
- * If a control message already exists, append us to the end.
+ * Unlink SCM_CREDS control messages (struct cmsgcred), since just
+ * created SCM_CREDS control message (struct sockcred) has another
+ * format.
*/
- if (control != NULL) {
- for (n = control; n->m_next != NULL; n = n->m_next)
- ;
- n->m_next = m;
- } else
- control = m;
+ if (control != NULL)
+ for (n = control, n_prev = NULL; n != NULL;) {
+ cm = mtod(n, struct cmsghdr *);
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_CREDS) {
+ if (n_prev == NULL)
+ control = n->m_next;
+ else
+ n_prev->m_next = n->m_next;
+ n = m_free(n);
+ } else {
+ n_prev = n;
+ n = n->m_next;
+ }
+ }
+
+ /* Prepend it to the head. */
+ m->m_next = control;
- return (control);
+ return (m);
}
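
[Editor's note] The unp_addsockcred() change above unlinks any SCM_CREDS control messages already in the chain (they carry a struct cmsgcred, which has a different layout) and prepends the freshly built struct sockcred message. Stripped of mbuf details, that is a "filter out matching nodes, then push the replacement on the front" pass over a singly linked list. A minimal sketch, with invented node and type names:

#include <stdlib.h>

struct node {
	struct node	*next;
	int		 type;		/* stands in for cmsg_level/cmsg_type */
	int		 data;
};

#define TYPE_CREDS	1

/*
 * Remove every TYPE_CREDS node from the chain, then prepend the
 * replacement node.  Returns the new head of the list.
 */
static struct node *
replace_creds(struct node *head, struct node *repl)
{
	struct node *n, *prev, *next;

	for (prev = NULL, n = head; n != NULL; n = next) {
		next = n->next;
		if (n->type == TYPE_CREDS) {
			if (prev == NULL)
				head = next;	/* unlink the current head */
			else
				prev->next = next;
			free(n);
		} else
			prev = n;
	}
	repl->next = head;			/* prepend the new message */
	return (repl);
}
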
/*
@@ -1609,13 +1886,14 @@
unp_taskcount++;
unp_defer = 0;
/*
- * before going through all this, set all FDs to
- * be NOT defered and NOT externally accessible
+ * Before going through all this, set all FDs to be NOT deferred and
+ * NOT externally accessible.
*/
sx_slock(&filelist_lock);
LIST_FOREACH(fp, &filehead, f_list)
fp->f_gcflag &= ~(FMARK|FDEFER);
do {
+ KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer));
LIST_FOREACH(fp, &filehead, f_list) {
FILE_LOCK(fp);
/*
@@ -1633,16 +1911,16 @@
continue;
}
/*
- * If we already marked it as 'defer' in a
- * previous pass, then try process it this time
- * and un-mark it
+ * If we already marked it as 'defer' in a
+ * previous pass, then try to process it this
+ * time and un-mark it.
*/
if (fp->f_gcflag & FDEFER) {
fp->f_gcflag &= ~FDEFER;
unp_defer--;
} else {
/*
- * if it's not defered, then check if it's
+ * If it's not deferred, then check if it's
* already marked.. if so skip it
*/
if (fp->f_gcflag & FMARK) {
@@ -1650,9 +1928,9 @@
continue;
}
/*
- * If all references are from messages
- * in transit, then skip it. it's not
- * externally accessible.
+ * If all references are from messages in
+ * transit, then skip it. it's not externally
+ * accessible.
*/
if (fp->f_count == fp->f_msgcount) {
FILE_UNLOCK(fp);
@@ -1665,29 +1943,47 @@
fp->f_gcflag |= FMARK;
}
/*
- * either it was defered, or it is externally
- * accessible and not already marked so.
- * Now check if it is possibly one of OUR sockets.
+ * Either it was deferred, or it is externally
+ * accessible and not already marked so. Now check
+ * if it is possibly one of OUR sockets.
*/
if (fp->f_type != DTYPE_SOCKET ||
(so = fp->f_data) == NULL) {
FILE_UNLOCK(fp);
continue;
}
- FILE_UNLOCK(fp);
if (so->so_proto->pr_domain != &localdomain ||
- (so->so_proto->pr_flags&PR_RIGHTS) == 0)
+ (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
+ FILE_UNLOCK(fp);
continue;
+ }
+
+ /*
+ * Tell any other threads that do a subsequent
+ * fdrop() that we are scanning the message
+ * buffers.
+ */
+ fp->f_gcflag |= FWAIT;
+ FILE_UNLOCK(fp);
+
/*
- * So, Ok, it's one of our sockets and it IS externally
- * accessible (or was defered). Now we look
- * to see if we hold any file descriptors in its
+ * So, Ok, it's one of our sockets and it IS
+ * externally accessible (or was deferred). Now we
+ * look to see if we hold any file descriptors in its
* message buffers. Follow those links and mark them
* as accessible too.
*/
SOCKBUF_LOCK(&so->so_rcv);
unp_scan(so->so_rcv.sb_mb, unp_mark);
SOCKBUF_UNLOCK(&so->so_rcv);
+
+ /*
+ * Wake up any threads waiting in fdrop().
+ */
+ FILE_LOCK(fp);
+ fp->f_gcflag &= ~FWAIT;
+ wakeup(&fp->f_gcflag);
+ FILE_UNLOCK(fp);
}
} while (unp_defer);
sx_sunlock(&filelist_lock);
@@ -1695,9 +1991,9 @@
* XXXRW: The following comments need updating for a post-SMPng and
* deferred unp_gc() world, but are still generally accurate.
*
- * We grab an extra reference to each of the file table entries
- * that are not otherwise accessible and then free the rights
- * that are stored in messages on them.
+ * We grab an extra reference to each of the file table entries that
+ * are not otherwise accessible and then free the rights that are
+ * stored in messages on them.
*
 * The bug in the original code is a little tricky, so I'll describe
* what's wrong with it here.
@@ -1711,12 +2007,12 @@
* results in the following chain. Closef calls soo_close, which
* calls soclose. Soclose calls first (through the switch
* uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
- * returns because the previous instance had set unp_gcing, and
- * we return all the way back to soclose, which marks the socket
- * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
- * to free up the rights that are queued in messages on the socket A,
- * i.e., the reference on B. The sorflush calls via the dom_dispose
- * switch unp_dispose, which unp_scans with unp_discard. This second
+ * returns because the previous instance had set unp_gcing, and we
+ * return all the way back to soclose, which marks the socket with
+ * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free
+ * up the rights that are queued in messages on the socket A, i.e.,
+ * the reference on B. The sorflush calls via the dom_dispose switch
+ * unp_dispose, which unp_scans with unp_discard. This second
* instance of unp_discard just calls closef on B.
*
* Well, a similar chain occurs on B, resulting in a sorflush on B,
@@ -1725,11 +2021,11 @@
* SS_NOFDREF, and soclose panics at this point.
*
* Here, we first take an extra reference to each inaccessible
- * descriptor. Then, we call sorflush ourself, since we know
- * it is a Unix domain socket anyhow. After we destroy all the
- * rights carried in messages, we do a last closef to get rid
- * of our extra reference. This is the last close, and the
- * unp_detach etc will shut down the socket.
+ * descriptor. Then, we call sorflush ourself, since we know it is a
+ * Unix domain socket anyhow. After we destroy all the rights
+ * carried in messages, we do a last closef to get rid of our extra
+ * reference. This is the last close, and the unp_detach etc will
+ * shut down the socket.
*
* 91/09/19, bsy at cs.cmu.edu
*/
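
[Editor's note] The long comment above describes unp_gc()'s mark phase: files with references that are not all in-flight messages are roots; anything reachable through a marked socket's receive buffer is marked too; whatever is left unmarked with f_count == f_msgcount can only be kept alive by an unreachable cycle of passed descriptors. The kernel does this iteratively over the global file list with a deferral counter. The sketch below shows the same reachability idea as a plain worklist traversal over a toy file table; the struct, array sizes, and function name are all invented for the example.

#include <stdbool.h>

#define MAXFILES	64

struct gfile {
	int	 f_count;		/* total references */
	int	 f_msgcount;		/* references held by in-flight messages */
	bool	 marked;
	int	 queued[MAXFILES];	/* descriptors sitting in this socket's buffers */
	int	 nqueued;
};

/*
 * Mark every file reachable from user space and report the indices of the
 * unreachable ones (at most maxdead of them).  Returns how many were found.
 */
static int
collect(struct gfile *files, int nfiles, int *dead, int maxdead)
{
	int stack[MAXFILES];
	int top = 0, i, j, ndead = 0;

	for (i = 0; i < nfiles; i++) {
		files[i].marked = false;
		if (files[i].f_count > files[i].f_msgcount) {
			files[i].marked = true;	/* externally accessible root */
			stack[top++] = i;
		}
	}
	while (top > 0) {
		i = stack[--top];
		for (j = 0; j < files[i].nqueued; j++) {
			int t = files[i].queued[j];

			if (!files[t].marked) {
				files[t].marked = true;
				stack[top++] = t;
			}
		}
	}
	for (i = 0; i < nfiles; i++)
		if (!files[i].marked &&
		    files[i].f_count == files[i].f_msgcount && ndead < maxdead)
			dead[ndead++] = i;
	return (ndead);
}
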
@@ -1757,9 +2053,9 @@
}
/*
* If all refs are from msgs, and it's not marked accessible
- * then it must be referenced from some unreachable cycle
- * of (shut-down) FDs, so include it in our
- * list of FDs to remove
+ * then it must be referenced from some unreachable cycle of
+ * (shut-down) FDs, so include it in our list of FDs to
+ * remove.
*/
if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
*fpp++ = fp;
@@ -1770,7 +2066,7 @@
}
sx_sunlock(&filelist_lock);
/*
- * for each FD on our hit list, do the following two things
+ * For each FD on our hit list, do the following two things:
*/
for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
struct file *tfp = *fpp;
@@ -1798,24 +2094,6 @@
unp_scan(m, unp_discard);
}
-static int
-unp_listen(struct socket *so, struct unpcb *unp, struct thread *td)
-{
- int error;
-
- UNP_LOCK_ASSERT();
-
- SOCK_LOCK(so);
- error = solisten_proto_check(so);
- if (error == 0) {
- cru2x(td->td_ucred, &unp->unp_peercred);
- unp->unp_flags |= UNP_HAVEPCCACHED;
- solisten_proto(so);
- }
- SOCK_UNLOCK(so);
- return (error);
-}
-
static void
unp_scan(struct mbuf *m0, void (*op)(struct file *))
{
@@ -1868,6 +2146,9 @@
static void
unp_mark(struct file *fp)
{
+
+ /* XXXRW: Should probably assert file list lock here. */
+
if (fp->f_gcflag & FMARK)
return;
unp_defer++;
@@ -1877,11 +2158,128 @@
static void
unp_discard(struct file *fp)
{
- UNP_LOCK();
+
+ UNP_GLOBAL_WLOCK();
FILE_LOCK(fp);
fp->f_msgcount--;
unp_rights--;
FILE_UNLOCK(fp);
- UNP_UNLOCK();
+ UNP_GLOBAL_WUNLOCK();
(void) closef(fp, (struct thread *)NULL);
}
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_unpflags(int unp_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (unp_flags & UNP_HAVEPC) {
+ db_printf("%sUNP_HAVEPC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_HAVEPCCACHED) {
+ db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_WANTCRED) {
+ db_printf("%sUNP_WANTCRED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_CONNWAIT) {
+ db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_CONNECTING) {
+ db_printf("%sUNP_CONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_BINDING) {
+ db_printf("%sUNP_BINDING", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_xucred(int indent, struct xucred *xu)
+{
+ int comma, i;
+
+ db_print_indent(indent);
+ db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n",
+ xu->cr_version, xu->cr_uid, xu->cr_ngroups);
+ db_print_indent(indent);
+ db_printf("cr_groups: ");
+ comma = 0;
+ for (i = 0; i < xu->cr_ngroups; i++) {
+ db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
+ comma = 1;
+ }
+ db_printf("\n");
+}
+
+static void
+db_print_unprefs(int indent, struct unp_head *uh)
+{
+ struct unpcb *unp;
+ int counter;
+
+ counter = 0;
+ LIST_FOREACH(unp, uh, unp_reflink) {
+ if (counter % 4 == 0)
+ db_print_indent(indent);
+ db_printf("%p ", unp);
+ if (counter % 4 == 3)
+ db_printf("\n");
+ counter++;
+ }
+ if (counter != 0 && counter % 4 != 0)
+ db_printf("\n");
+}
+
+DB_SHOW_COMMAND(unpcb, db_show_unpcb)
+{
+ struct unpcb *unp;
+
+ if (!have_addr) {
+ db_printf("usage: show unpcb <addr>\n");
+ return;
+ }
+ unp = (struct unpcb *)addr;
+
+ db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket,
+ unp->unp_vnode);
+
+ db_printf("unp_ino: %d unp_conn: %p\n", unp->unp_ino,
+ unp->unp_conn);
+
+ db_printf("unp_refs:\n");
+ db_print_unprefs(2, &unp->unp_refs);
+
+ /* XXXRW: Would be nice to print the full address, if any. */
+ db_printf("unp_addr: %p\n", unp->unp_addr);
+
+ db_printf("unp_cc: %d unp_mbcnt: %d unp_gencnt: %llu\n",
+ unp->unp_cc, unp->unp_mbcnt,
+ (unsigned long long)unp->unp_gencnt);
+
+ db_printf("unp_flags: %x (", unp->unp_flags);
+ db_print_unpflags(unp->unp_flags);
+ db_printf(")\n");
+
+ db_printf("unp_peercred:\n");
+ db_print_xucred(2, &unp->unp_peercred);
+
+ db_printf("unp_refcount: %u\n", unp->unp_refcount);
+}
+#endif
Index: kern_poll.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_poll.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_poll.c -L sys/kern/kern_poll.c -u -r1.2 -r1.3
--- sys/kern/kern_poll.c
+++ sys/kern/kern_poll.c
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_poll.c,v 1.19.2.2 2005/10/07 14:00:05 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_poll.c,v 1.31 2007/08/06 14:26:00 rwatson Exp $");
#include "opt_device_polling.h"
@@ -113,7 +113,7 @@
uint32_t val = poll_burst_max;
int error;
- error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+ error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr )
return (error);
if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX)
@@ -137,7 +137,7 @@
uint32_t val = poll_each_burst;
int error;
- error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+ error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr )
return (error);
if (val < 1)
@@ -167,7 +167,7 @@
uint32_t val = user_frac;
int error;
- error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+ error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr )
return (error);
if (val < 0 || val > 99)
@@ -190,7 +190,7 @@
uint32_t val = reg_frac;
int error;
- error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+ error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr )
return (error);
if (val < 1 || val > hz)
@@ -329,7 +329,6 @@
{
int i;
- NET_LOCK_GIANT();
mtx_lock(&poll_mtx);
if (count > poll_each_burst)
@@ -339,7 +338,6 @@
pr[i].handler(pr[i].ifp, POLL_ONLY, count);
mtx_unlock(&poll_mtx);
- NET_UNLOCK_GIANT();
}
/*
@@ -366,8 +364,6 @@
struct timeval t;
int kern_load;
- NET_ASSERT_GIANT();
-
mtx_lock(&poll_mtx);
phase = 5;
if (residual_burst > 0) {
@@ -417,8 +413,6 @@
int i, cycles;
enum poll_cmd arg = POLL_ONLY;
- NET_ASSERT_GIANT();
-
mtx_lock(&poll_mtx);
phase = 3;
if (residual_burst == 0) { /* first call in this tick */
@@ -456,8 +450,6 @@
KASSERT(h != NULL, ("%s: handler is NULL", __func__));
KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
- NET_ASSERT_GIANT();
-
mtx_lock(&poll_mtx);
if (poll_handlers >= POLL_LIST_LEN) {
/*
@@ -504,7 +496,6 @@
KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
- NET_ASSERT_GIANT();
mtx_lock(&poll_mtx);
for (i = 0 ; i < poll_handlers ; i++)
@@ -535,7 +526,7 @@
int error;
int val = polling;
- error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+ error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr )
return (error);
@@ -547,7 +538,6 @@
polling = val;
- NET_LOCK_GIANT();
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &ifnet, if_link) {
if (ifp->if_capabilities & IFCAP_POLLING) {
@@ -565,7 +555,6 @@
}
}
IFNET_RUNLOCK();
- NET_UNLOCK_GIANT();
log(LOG_ERR, "kern.polling.enable is deprecated. Use ifconfig(8)");
@@ -580,17 +569,17 @@
rtp.prio = RTP_PRIO_MAX; /* lowest priority */
rtp.type = RTP_PRIO_IDLE;
- mtx_lock_spin(&sched_lock);
- rtp_to_pri(&rtp, td->td_ksegrp);
- mtx_unlock_spin(&sched_lock);
+ PROC_SLOCK(td->td_proc);
+ rtp_to_pri(&rtp, td);
+ PROC_SUNLOCK(td->td_proc);
for (;;) {
if (poll_in_idle_loop && poll_handlers > 0) {
idlepoll_sleeping = 0;
ether_poll(poll_each_burst);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
} else {
idlepoll_sleeping = 1;
tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3);
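
[Editor's note] The kern_poll.c hunks above all make the same mechanical change: sysctl_handle_int() is now called with 0 for arg2, since the handler supplies the value through the pointer argument and the helper already knows an int's size. For reference, this is roughly the shape a converted validate-then-commit handler takes; the oid name, variable, and bounds below are invented, and this is a fragment in kernel style rather than a standalone program.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int max_widgets = 8;

static int
sysctl_max_widgets(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = max_widgets;
	/* Passing 0 for arg2: sysctl_handle_int() knows an int's size. */
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);		/* read-only access or copy error */
	if (val < 1 || val > 64)
		return (EINVAL);	/* reject out-of-range writes */
	max_widgets = val;		/* commit only after validation */
	return (0);
}
SYSCTL_PROC(_kern, OID_AUTO, max_widgets, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_max_widgets, "I", "Maximum number of widgets (example)");

Working on a local copy and committing it only after the range check is what keeps a bad "sysctl -w" from ever being visible in the live variable.
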
Index: subr_taskqueue.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_taskqueue.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_taskqueue.c -L sys/kern/subr_taskqueue.c -u -r1.2 -r1.3
--- sys/kern/subr_taskqueue.c
+++ sys/kern/subr_taskqueue.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/subr_taskqueue.c,v 1.27.2.4 2006/07/06 08:32:50 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_taskqueue.c,v 1.39 2007/06/05 00:00:54 jeff Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -131,10 +131,8 @@
struct taskqueue *
taskqueue_create(const char *name, int mflags,
- taskqueue_enqueue_fn enqueue, void *context,
- struct proc **pp)
+ taskqueue_enqueue_fn enqueue, void *context)
{
- (void) pp;
return _taskqueue_create(name, mflags, enqueue, context,
MTX_DEF, "taskqueue");
}
@@ -317,32 +315,48 @@
{
va_list ap;
struct taskqueue *tq;
+ struct thread *td;
char ktname[MAXCOMLEN];
- int i;
+ int i, error;
if (count <= 0)
return (EINVAL);
tq = *tqp;
- if ((tq->tq_pproc = malloc(sizeof(struct proc *) * count, M_TASKQUEUE,
- M_NOWAIT | M_ZERO)) == NULL)
- return (ENOMEM);
-
va_start(ap, name);
vsnprintf(ktname, MAXCOMLEN, name, ap);
va_end(ap);
+ tq->tq_pproc = malloc(sizeof(struct proc *) * count, M_TASKQUEUE,
+ M_NOWAIT | M_ZERO);
+ if (tq->tq_pproc == NULL) {
+ printf("%s: no memory for %s threads\n", __func__, ktname);
+ return (ENOMEM);
+ }
+
for (i = 0; i < count; i++) {
if (count == 1)
- kthread_create(taskqueue_thread_loop, tqp,
- &tq->tq_pproc[i], 0, 0, ktname);
+ error = kthread_create(taskqueue_thread_loop, tqp,
+ &tq->tq_pproc[i], RFSTOPPED, 0, ktname);
else
- kthread_create(taskqueue_thread_loop, tqp,
- &tq->tq_pproc[i], 0, 0, "%s_%d", ktname, i);
- mtx_lock_spin(&sched_lock);
- sched_prio(FIRST_THREAD_IN_PROC(tq->tq_pproc[i]), pri);
- mtx_unlock_spin(&sched_lock);
- tq->tq_pcount++;
+ error = kthread_create(taskqueue_thread_loop, tqp,
+ &tq->tq_pproc[i], RFSTOPPED, 0, "%s_%d", ktname, i);
+ if (error) {
+ /* should be ok to continue, taskqueue_free will dtrt */
+ printf("%s: kthread_create(%s): error %d",
+ __func__, ktname, error);
+ tq->tq_pproc[i] = NULL; /* paranoid */
+ } else
+ tq->tq_pcount++;
+ }
+ for (i = 0; i < count; i++) {
+ if (tq->tq_pproc[i] == NULL)
+ continue;
+ td = FIRST_THREAD_IN_PROC(tq->tq_pproc[i]);
+ thread_lock(td);
+ sched_prio(td, pri);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
}
return (0);
@@ -358,7 +372,7 @@
TQ_LOCK(tq);
do {
taskqueue_run(tq);
- TQ_SLEEP(tq, tq, &tq->tq_mutex, curthread->td_priority, "-", 0);
+ TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
} while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0);
/* rendezvous with thread that asked us to terminate */
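
[Editor's note] The taskqueue_start_threads() rewrite above creates the kernel threads RFSTOPPED, tolerates per-thread creation failures, and only in a second pass sets the priority and sched_add()s each thread that actually exists. A loose userland analogue of that two-pass "create parked, then release" pattern is sketched below with pthreads; the pool structure and names are invented for the example.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pool {
	pthread_mutex_t	 lock;
	pthread_cond_t	 go;
	int		 started;	/* workers may run once this is set */
	pthread_t	*threads;
	int		 nthreads;	/* how many were actually created */
};

static void *
worker(void *arg)
{
	struct pool *p = arg;

	/* Parked until the creator has finished setting everything up. */
	pthread_mutex_lock(&p->lock);
	while (!p->started)
		pthread_cond_wait(&p->go, &p->lock);
	pthread_mutex_unlock(&p->lock);
	/* ... run tasks ... */
	return (NULL);
}

/* Create up to 'count' workers; tolerate partial failure like the kernel. */
static int
pool_start(struct pool *p, int count)
{
	int i, error;

	pthread_mutex_init(&p->lock, NULL);
	pthread_cond_init(&p->go, NULL);
	p->started = 0;
	p->nthreads = 0;
	p->threads = calloc(count, sizeof(*p->threads));
	if (p->threads == NULL)
		return (-1);

	/* Pass 1: create the threads parked; log failures and keep going. */
	for (i = 0; i < count; i++) {
		error = pthread_create(&p->threads[p->nthreads], NULL,
		    worker, p);
		if (error != 0)
			fprintf(stderr, "pool_start: worker %d: error %d\n",
			    i, error);
		else
			p->nthreads++;
	}

	/* Pass 2: everything that was created is now released to run. */
	pthread_mutex_lock(&p->lock);
	p->started = 1;
	pthread_cond_broadcast(&p->go);
	pthread_mutex_unlock(&p->lock);
	return (p->nthreads > 0 ? 0 : -1);
}
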
Index: vfs_cache.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_cache.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_cache.c -L sys/kern/vfs_cache.c -u -r1.2 -r1.3
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_cache.c,v 1.103.2.1 2006/03/13 03:06:14 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_cache.c,v 1.114 2007/09/21 10:16:56 pjd Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -141,8 +141,8 @@
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
/* Export size information to userland */
-SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
-SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
+ sizeof(struct namecache), "");
/*
* The new name cache statistics
@@ -293,37 +293,6 @@
}
/*
- * cache_leaf_test()
- *
- * Test whether this (directory) vnode's namei cache entry contains
- * subdirectories or not. Used to determine whether the directory is
- * a leaf in the namei cache or not. Note: the directory may still
- * contain files in the namei cache.
- *
- * Returns 0 if the directory is a leaf, -1 if it isn't.
- */
-int
-cache_leaf_test(struct vnode *vp)
-{
- struct namecache *ncpc;
- int leaf;
-
- leaf = 0;
- CACHE_LOCK();
- for (ncpc = LIST_FIRST(&vp->v_cache_src);
- ncpc != NULL;
- ncpc = LIST_NEXT(ncpc, nc_src)
- ) {
- if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) {
- leaf = -1;
- break;
- }
- }
- CACHE_UNLOCK();
- return (leaf);
-}
-
-/*
* Lookup an entry in the cache
*
* Lookup is called with dvp pointing to the directory to search,
@@ -345,13 +314,15 @@
struct componentname *cnp;
{
struct namecache *ncp;
+ struct thread *td;
u_int32_t hash;
- int error;
+ int error, ltype;
if (!doingcache) {
cnp->cn_flags &= ~MAKEENTRY;
return (0);
}
+ td = cnp->cn_thread;
retry:
CACHE_LOCK();
numcalls++;
@@ -450,15 +421,29 @@
if (dvp == *vpp) { /* lookup on "." */
VREF(*vpp);
CACHE_UNLOCK();
+ /*
+ * When we lookup "." we still can be asked to lock it
+ * differently...
+ */
+ ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
+ if (ltype == VOP_ISLOCKED(*vpp, td))
+ return (-1);
+ else if (ltype == LK_EXCLUSIVE)
+ vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
return (-1);
}
- if (cnp->cn_flags & ISDOTDOT)
- VOP_UNLOCK(dvp, 0, cnp->cn_thread);
+ ltype = 0; /* silence gcc warning */
+ if (cnp->cn_flags & ISDOTDOT) {
+ ltype = VOP_ISLOCKED(dvp, td);
+ VOP_UNLOCK(dvp, 0, td);
+ }
VI_LOCK(*vpp);
CACHE_UNLOCK();
- error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
+ error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
if (cnp->cn_flags & ISDOTDOT)
- vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread);
+ vn_lock(dvp, ltype | LK_RETRY, td);
+ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE))
+ ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
if (error) {
*vpp = NULL;
goto retry;
@@ -601,9 +586,6 @@
/*
* Flush all entries referencing a particular filesystem.
- *
- * Since we need to check it anyway, we will flush all the invalid
- * entries at the same time.
*/
void
cache_purgevfs(mp)
@@ -611,24 +593,15 @@
{
struct nchashhead *ncpp;
struct namecache *ncp, *nnp;
- struct nchashhead mplist;
-
- LIST_INIT(&mplist);
- ncp = NULL;
/* Scan hash tables for applicable entries */
CACHE_LOCK();
for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
- for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
- nnp = LIST_NEXT(ncp, nc_hash);
- if (ncp->nc_dvp->v_mount == mp) {
- LIST_REMOVE(ncp, nc_hash);
- LIST_INSERT_HEAD(&mplist, ncp, nc_hash);
- }
+ LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
+ if (ncp->nc_dvp->v_mount == mp)
+ cache_zap(ncp);
}
}
- while (!LIST_EMPTY(&mplist))
- cache_zap(LIST_FIRST(&mplist));
CACHE_UNLOCK();
}
@@ -690,7 +663,7 @@
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
"Disable the getcwd syscall");
-/* Implementation of the getcwd syscall */
+/* Implementation of the getcwd syscall. */
int
__getcwd(td, uap)
struct thread *td;
@@ -717,10 +690,10 @@
tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
mtx_lock(&Giant);
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
error = vn_fullpath1(td, fdp->fd_cdir, fdp->fd_rdir, tmpbuf,
&bp, buflen);
- FILEDESC_UNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
mtx_unlock(&Giant);
if (!error) {
@@ -771,11 +744,9 @@
buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
- mtx_lock(&Giant);
- FILEDESC_LOCK(fdp);
+ FILEDESC_SLOCK(fdp);
error = vn_fullpath1(td, vn, fdp->fd_rdir, buf, retbuf, MAXPATHLEN);
- FILEDESC_UNLOCK(fdp);
- mtx_unlock(&Giant);
+ FILEDESC_SUNLOCK(fdp);
if (!error)
*freebuf = buf;
@@ -795,8 +766,6 @@
int error, i, slash_prefixed;
struct namecache *ncp;
- mtx_assert(&Giant, MA_OWNED);
-
bp = buf + buflen - 1;
*bp = '\0';
error = 0;
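
[Editor's note] The cache_lookup() changes above record how the directory vnode was locked (via VOP_ISLOCKED) before dropping it for a ".." lookup, and relock it with the same type afterwards, instead of unconditionally taking LK_EXCLUSIVE; the "." case similarly upgrades only when the caller asked for an exclusive lock. The sketch below illustrates the underlying idea of preserving the caller's lock mode across a blocking step. Since pthread rwlocks cannot be queried for how they are held, the mode is tracked by hand here; the types and names are invented.

#include <pthread.h>

enum lktype { LK_NONE, LK_SH, LK_EX };

struct dirnode {
	pthread_rwlock_t	 lock;
	enum lktype		 howheld;	/* tracked manually in this sketch */
};

static void
node_lock(struct dirnode *dn, enum lktype how)
{
	if (how == LK_EX)
		pthread_rwlock_wrlock(&dn->lock);
	else
		pthread_rwlock_rdlock(&dn->lock);
	dn->howheld = how;
}

static void
node_unlock(struct dirnode *dn)
{
	dn->howheld = LK_NONE;
	pthread_rwlock_unlock(&dn->lock);
}

/*
 * Drop the parent's lock around a step that might block (the child vget()
 * in the real code), then take it back in exactly the mode the caller had,
 * so the caller's locking assumptions still hold on return.
 */
static void
lookup_dotdot(struct dirnode *parent, void (*blocking_step)(void))
{
	enum lktype saved;

	saved = parent->howheld;	/* remember shared vs. exclusive */
	node_unlock(parent);
	blocking_step();
	node_lock(parent, saved);	/* reacquire with the original type */
}
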
Index: kern_pmc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_pmc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_pmc.c -L sys/kern/kern_pmc.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_pmc.c
+++ sys/kern/kern_pmc.c
@@ -24,7 +24,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_pmc.c,v 1.4.2.1 2005/08/15 18:46:12 jkoshy Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_pmc.c,v 1.6 2005/12/04 02:12:43 ru Exp $");
#include "opt_hwpmc_hooks.h"
@@ -33,7 +33,7 @@
#include <sys/pmckern.h>
#include <sys/smp.h>
-#if HWPMC_HOOKS
+#ifdef HWPMC_HOOKS
#define PMC_KERNEL_VERSION PMC_VERSION
#else
#define PMC_KERNEL_VERSION 0
Index: sys_process.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_process.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sys_process.c -L sys/kern/sys_process.c -u -r1.2 -r1.3
--- sys/kern/sys_process.c
+++ sys/kern/sys_process.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_process.c,v 1.131.2.3 2006/03/07 18:08:09 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_process.c,v 1.145 2007/10/09 00:03:39 jeff Exp $");
#include "opt_compat.h"
@@ -49,6 +49,8 @@
#include <machine/reg.h>
+#include <security/audit/audit.h>
+
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
@@ -102,7 +104,7 @@
int error; \
\
PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \
- if ((td->td_proc->p_sflag & PS_INMEM) == 0) \
+ if ((td->td_proc->p_flag & P_INMEM) == 0) \
error = EIO; \
else \
error = (action); \
@@ -366,9 +368,6 @@
#define COPYIN(u, k, s) copyin(u, k, s)
#define COPYOUT(k, u, s) copyout(k, u, s)
#endif
-/*
- * MPSAFE
- */
int
ptrace(struct thread *td, struct ptrace_args *uap)
{
@@ -397,6 +396,10 @@
if (td->td_proc->p_sysent == &ia32_freebsd_sysvec)
wrap32 = 1;
#endif
+ AUDIT_ARG(pid, uap->pid);
+ AUDIT_ARG(cmd, uap->req);
+ AUDIT_ARG(addr, uap->addr);
+ AUDIT_ARG(value, uap->data);
addr = &r;
switch (uap->req) {
case PT_GETREGS:
@@ -524,12 +527,12 @@
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td2) {
if (td2->td_tid == pid)
break;
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
if (td2 != NULL)
break; /* proc lock held */
PROC_UNLOCK(p);
@@ -544,6 +547,7 @@
pid = p->p_pid;
}
}
+ AUDIT_ARG(process, p);
if ((p->p_flag & P_WEXIT) != 0) {
error = ESRCH;
@@ -697,15 +701,15 @@
break;
case PT_SUSPEND:
- mtx_lock_spin(&sched_lock);
+ thread_lock(td2);
td2->td_flags |= TDF_DBSUSPEND;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td2);
break;
case PT_RESUME:
- mtx_lock_spin(&sched_lock);
+ thread_lock(td2);
td2->td_flags &= ~TDF_DBSUSPEND;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td2);
break;
case PT_STEP:
@@ -748,6 +752,10 @@
if (p->p_oppid != p->p_pptr->p_pid) {
struct proc *pp;
+ PROC_LOCK(p->p_pptr);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(p->p_pptr);
+
PROC_UNLOCK(p);
pp = pfind(p->p_oppid);
if (pp == NULL)
@@ -763,6 +771,7 @@
p->p_oppid = 0;
/* should we send SIGCHLD? */
+ /* childproc_continued(p); */
}
sendsig:
@@ -770,36 +779,41 @@
sx_xunlock(&proctree_lock);
proctree_locked = 0;
}
- /* deliver or queue signal */
- mtx_lock_spin(&sched_lock);
- td2->td_flags &= ~TDF_XSIG;
- mtx_unlock_spin(&sched_lock);
- td2->td_xsig = data;
p->p_xstat = data;
p->p_xthread = NULL;
if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) {
- mtx_lock_spin(&sched_lock);
+ /* deliver or queue signal */
+ thread_lock(td2);
+ td2->td_flags &= ~TDF_XSIG;
+ thread_unlock(td2);
+ td2->td_xsig = data;
+
+ PROC_SLOCK(p);
if (req == PT_DETACH) {
struct thread *td3;
- FOREACH_THREAD_IN_PROC(p, td3)
+ FOREACH_THREAD_IN_PROC(p, td3) {
+ thread_lock(td3);
td3->td_flags &= ~TDF_DBSUSPEND;
+ thread_unlock(td3);
+ }
}
/*
* unsuspend all threads, to not let a thread run,
* you should use PT_SUSPEND to suspend it before
* continuing process.
*/
- mtx_unlock_spin(&sched_lock);
+#ifdef KSE
+ PROC_SUNLOCK(p);
thread_continued(p);
+ PROC_SLOCK(p);
+#endif
p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
- mtx_lock_spin(&sched_lock);
thread_unsuspend(p);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
+ } else {
+ if (data)
+ psignal(p, data);
}
-
- if (data)
- psignal(p, data);
-
break;
case PT_WRITE_I:
@@ -918,7 +932,7 @@
break;
case PT_LWPINFO:
- if (data == 0 || data > sizeof(*pl)) {
+ if (data <= 0 || data > sizeof(*pl)) {
error = EINVAL;
break;
}
@@ -928,6 +942,7 @@
pl->pl_event = PL_EVENT_SIGNAL;
else
pl->pl_event = 0;
+#ifdef KSE
if (td2->td_pflags & TDP_SA) {
pl->pl_flags = PL_FLAG_SA;
if (td2->td_upcall && !TD_CAN_UNBIND(td2))
@@ -935,6 +950,11 @@
} else {
pl->pl_flags = 0;
}
+#else
+ pl->pl_flags = 0;
+#endif
+ pl->pl_sigmask = td2->td_sigmask;
+ pl->pl_siglist = td2->td_siglist;
break;
case PT_GETNUMLWPS:
@@ -951,18 +971,18 @@
buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
tmp = 0;
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td2) {
if (tmp >= num)
break;
buf[tmp++] = td2->td_tid;
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
error = copyout(buf, addr, tmp * sizeof(lwpid_t));
free(buf, M_TEMP);
if (!error)
- td->td_retval[0] = num;
+ td->td_retval[0] = tmp;
PROC_LOCK(p);
break;
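
[Editor's note] The PT_GETLWPLIST fix above returns tmp, the number of thread IDs actually copied, rather than num, the count the caller sized its buffer for, because threads can exit between the PT_GETNUMLWPS query and the list fetch. The contract that implies for any "query count, allocate, fetch" interface is sketched below with invented names:

#include <stddef.h>

/*
 * Copy at most 'max' IDs out of a list whose length may have shrunk since
 * the caller sized its buffer.  The caller must trust the return value --
 * the count actually written -- not the size it originally requested.
 */
static size_t
id_snapshot(const int *live, size_t nlive, int *out, size_t max)
{
	size_t n;

	for (n = 0; n < nlive && n < max; n++)
		out[n] = live[n];
	return (n);
}

A caller should therefore iterate only over the returned count, exactly as a ptrace(2) consumer must use the PT_GETLWPLIST return value rather than the earlier PT_GETNUMLWPS result.
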
Index: sysv_sem.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_sem.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_sem.c -L sys/kern/sysv_sem.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_sem.c
+++ sys/kern/sysv_sem.c
@@ -37,7 +37,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_sem.c,v 1.78 2005/06/07 05:03:27 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_sem.c,v 1.89 2007/07/03 15:58:47 kib Exp $");
#include "opt_sysvipc.h"
#include "opt_mac.h"
@@ -53,11 +53,14 @@
#include <sys/mutex.h>
#include <sys/sem.h>
#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
+#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/jail.h>
-#include <sys/mac.h>
+
+#include <security/mac/mac_framework.h>
static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
@@ -66,11 +69,6 @@
#else
#define DPRINTF(a)
#endif
-#ifdef MAC_DEBUG
-#define MPRINTF(a) printf a
-#else
-#define MPRINTF(a)
-#endif
static void seminit(void);
static int sysvsem_modload(struct module *, int, void *);
@@ -196,7 +194,6 @@
SEMAEM /* adjust on exit max value */
};
-SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0,
"Number of entries in the semaphore map");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
@@ -322,9 +319,7 @@
MODULE_VERSION(sysvsem, 1);
/*
- * Entry point for all SEM calls
- *
- * MPSAFE
+ * Entry point for all SEM calls.
*/
int
semsys(td, uap)
@@ -536,7 +531,7 @@
}
/*
- * Note that the user-mode half of this passes a union, not a pointer
+ * Note that the user-mode half of this passes a union, not a pointer.
*/
#ifndef _SYS_SYSPROTO_H_
struct __semctl_args {
@@ -546,29 +541,80 @@
union semun *arg;
};
#endif
-
-/*
- * MPSAFE
- */
int
__semctl(td, uap)
struct thread *td;
struct __semctl_args *uap;
{
- int semid = uap->semid;
- int semnum = uap->semnum;
- int cmd = uap->cmd;
+ struct semid_ds dsbuf;
+ union semun arg, semun;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
+ if (error)
+ return (error);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = arg.array;
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
+int
+kern_semctl(struct thread *td, int semid, int semnum, int cmd,
+ union semun *arg, register_t *rval)
+{
u_short *array;
- union semun *arg = uap->arg;
- union semun real_arg;
struct ucred *cred = td->td_ucred;
- int i, rval, error;
- struct semid_ds sbuf;
+ int i, error;
+ struct semid_ds *sbuf;
struct semid_kernel *semakptr;
struct mtx *sema_mtxp;
u_short usval, count;
+ int semidx;
- DPRINTF(("call to semctl(%d, %d, %d, 0x%x)\n",
+ DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
semid, semnum, cmd, arg));
if (!jail_sysvipc_allowed && jailed(td->td_ucred))
return (ENOSYS);
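
[Editor's note] The hunk above splits __semctl() into a thin syscall wrapper that does every copyin()/copyout() of the userland union semun and semid_ds, and a kern_semctl() that works only on kernel-space buffers (which is what makes it reusable by compat ABIs). A small userland sketch of that wrapper/kernel split follows; copy_in()/copy_out() stand in for copyin()/copyout(), and all other names are invented.

#include <errno.h>
#include <string.h>

struct settings { int a, b; };

/* Stand-ins for copyin()/copyout(); plain memcpy is enough in userland. */
static int
copy_in(const void *uaddr, void *kaddr, size_t len)
{
	memcpy(kaddr, uaddr, len);
	return (0);
}

static int
copy_out(const void *kaddr, void *uaddr, size_t len)
{
	memcpy(uaddr, kaddr, len);
	return (0);
}

/* The "kern_" half: operates only on kernel memory, no user access at all. */
static int
kern_setthing(int id, int cmd, struct settings *ks, int *rval)
{
	if (id < 0)
		return (EINVAL);
	/* ... perform the operation under the object's lock ... */
	*rval = id;
	return (0);
}

/* The syscall-facing half: all user-space access is confined to here. */
static int
sys_setthing(int id, int cmd, struct settings *user_buf)
{
	struct settings ks;
	int error, rval;

	if ((error = copy_in(user_buf, &ks, sizeof(ks))) != 0)
		return (error);
	if ((error = kern_setthing(id, cmd, &ks, &rval)) != 0)
		return (error);
	return (copy_out(&ks, user_buf, sizeof(ks)));
}
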
@@ -577,10 +623,12 @@
switch(cmd) {
case SEM_STAT:
+ /*
+ * For this command we assume semid is an array index
+ * rather than an IPC id.
+ */
if (semid < 0 || semid >= seminfo.semmni)
return (EINVAL);
- if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
- return (error);
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
@@ -592,45 +640,34 @@
goto done2;
#ifdef MAC
error = mac_check_sysv_semctl(cred, semakptr, cmd);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_semctl returned %d\n",
- error));
+ if (error != 0)
goto done2;
- }
#endif
+ bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+ *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
mtx_unlock(sema_mtxp);
- error = copyout(&semakptr->u, real_arg.buf,
- sizeof(struct semid_ds));
- rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
- if (error == 0)
- td->td_retval[0] = rval;
- return (error);
+ return (0);
}
- semid = IPCID_TO_IX(semid);
- if (semid < 0 || semid >= seminfo.semmni)
+ semidx = IPCID_TO_IX(semid);
+ if (semidx < 0 || semidx >= seminfo.semmni)
return (EINVAL);
- semakptr = &sema[semid];
- sema_mtxp = &sema_mtx[semid];
-#ifdef MAC
+ semakptr = &sema[semidx];
+ sema_mtxp = &sema_mtx[semidx];
mtx_lock(sema_mtxp);
+#ifdef MAC
error = mac_check_sysv_semctl(cred, semakptr, cmd);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_semctl returned %d\n", error));
- mtx_unlock(sema_mtxp);
- return (error);
- }
- mtx_unlock(sema_mtxp);
+ if (error != 0)
+ goto done2;
#endif
error = 0;
- rval = 0;
+ *rval = 0;
switch (cmd) {
case IPC_RMID:
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
goto done2;
@@ -649,45 +686,34 @@
mac_cleanup_sysv_sem(semakptr);
#endif
SEMUNDO_LOCK();
- semundo_clear(semid, -1);
+ semundo_clear(semidx, -1);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
case IPC_SET:
- if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
- goto done2;
- if ((error = copyin(real_arg.buf, &sbuf, sizeof(sbuf))) != 0)
- goto done2;
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
goto done2;
- semakptr->u.sem_perm.uid = sbuf.sem_perm.uid;
- semakptr->u.sem_perm.gid = sbuf.sem_perm.gid;
+ sbuf = arg->buf;
+ semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
+ semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
- ~0777) | (sbuf.sem_perm.mode & 0777);
+ ~0777) | (sbuf->sem_perm.mode & 0777);
semakptr->u.sem_ctime = time_second;
break;
case IPC_STAT:
- if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
- goto done2;
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
- sbuf = semakptr->u;
- mtx_unlock(sema_mtxp);
- error = copyout(&semakptr->u, real_arg.buf,
- sizeof(struct semid_ds));
+ bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
break;
case GETNCNT:
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
@@ -695,12 +721,11 @@
error = EINVAL;
goto done2;
}
- rval = semakptr->u.sem_base[semnum].semncnt;
+ *rval = semakptr->u.sem_base[semnum].semncnt;
break;
case GETPID:
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
@@ -708,12 +733,11 @@
error = EINVAL;
goto done2;
}
- rval = semakptr->u.sem_base[semnum].sempid;
+ *rval = semakptr->u.sem_base[semnum].sempid;
break;
case GETVAL:
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
@@ -721,29 +745,48 @@
error = EINVAL;
goto done2;
}
- rval = semakptr->u.sem_base[semnum].semval;
+ *rval = semakptr->u.sem_base[semnum].semval;
break;
case GETALL:
- if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
- goto done2;
- array = malloc(sizeof(*array) * semakptr->u.sem_nsems, M_TEMP,
- M_WAITOK);
+ /*
+ * Unfortunately, callers of this function don't know
+ * in advance how many semaphores are in this set.
+ * While we could just allocate the maximum size array
+ * and pass the actual size back to the caller, that
+ * won't work for SETALL since we can't copyin() more
+ * data than the user specified as we may return a
+ * spurious EFAULT.
+ *
+ * Note that the number of semaphores in a set is
+ * fixed for the life of that set. The only way that
+	 * the 'count' could change while we are blocked in
+ * malloc() is if this semaphore set were destroyed
+ * and a new one created with the same index.
+ * However, semvalid() will catch that due to the
+ * sequence number unless exactly 0x8000 (or a
+ * multiple thereof) semaphore sets for the same index
+ * are created and destroyed while we are in malloc!
+ *
+ */
+ count = semakptr->u.sem_nsems;
+ mtx_unlock(sema_mtxp);
+ array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
+ KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
for (i = 0; i < semakptr->u.sem_nsems; i++)
array[i] = semakptr->u.sem_base[i].semval;
mtx_unlock(sema_mtxp);
- error = copyout(array, real_arg.array,
- i * sizeof(real_arg.array[0]));
+ error = copyout(array, arg->array, count * sizeof(*array));
+ mtx_lock(sema_mtxp);
break;
case GETZCNT:
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
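
[Editor's note] The GETALL/SETALL changes above snapshot sem_nsems, drop the set's mutex around the blocking malloc()/copyin(), retake the mutex, and then re-run semvalid() (with a KASSERT that the count did not change in place) instead of looping on a "raced" label. The sketch below shows that drop-lock, allocate, relock, revalidate pattern with a sequence number standing in for the IPC sequence check; the structure and names are invented.

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct semset {
	pthread_mutex_t	 lock;
	unsigned	 seq;		/* bumped when the set is recycled */
	int		 nsems;		/* fixed for the lifetime of the set */
	unsigned short	*vals;
};

/*
 * Copy all semaphore values out of 'ss'.  The allocation can block, so the
 * lock is dropped around it and the set is re-validated afterwards by
 * comparing the sequence number snapshotted while the lock was held.
 */
static int
get_all(struct semset *ss, unsigned short **out, int *countp)
{
	unsigned seq;
	unsigned short *array;
	int count;

	pthread_mutex_lock(&ss->lock);
	seq = ss->seq;
	count = ss->nsems;
	pthread_mutex_unlock(&ss->lock);

	array = malloc(count * sizeof(*array));		/* may block */
	if (array == NULL)
		return (-1);

	pthread_mutex_lock(&ss->lock);
	if (ss->seq != seq) {			/* set was destroyed/recycled */
		pthread_mutex_unlock(&ss->lock);
		free(array);
		return (-1);
	}
	assert(count == ss->nsems);		/* nsems never changes in place */
	memcpy(array, ss->vals, count * sizeof(*array));
	pthread_mutex_unlock(&ss->lock);

	*out = array;
	*countp = count;
	return (0);
}
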
@@ -751,14 +794,11 @@
error = EINVAL;
goto done2;
}
- rval = semakptr->u.sem_base[semnum].semzcnt;
+ *rval = semakptr->u.sem_base[semnum].semzcnt;
break;
case SETVAL:
- if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
- goto done2;
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
goto done2;
@@ -766,39 +806,32 @@
error = EINVAL;
goto done2;
}
- if (real_arg.val < 0 || real_arg.val > seminfo.semvmx) {
+ if (arg->val < 0 || arg->val > seminfo.semvmx) {
error = ERANGE;
goto done2;
}
- semakptr->u.sem_base[semnum].semval = real_arg.val;
+ semakptr->u.sem_base[semnum].semval = arg->val;
SEMUNDO_LOCK();
- semundo_clear(semid, semnum);
+ semundo_clear(semidx, semnum);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
case SETALL:
- mtx_lock(sema_mtxp);
-raced:
- if ((error = semvalid(uap->semid, semakptr)) != 0)
- goto done2;
+ /*
+ * See comment on GETALL for why 'count' shouldn't change
+ * and why we require a userland buffer.
+ */
count = semakptr->u.sem_nsems;
- mtx_unlock(sema_mtxp);
- if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
- goto done2;
+ mtx_unlock(sema_mtxp);
array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
- error = copyin(real_arg.array, array, count * sizeof(*array));
+ error = copyin(arg->array, array, count * sizeof(*array));
+ mtx_lock(sema_mtxp);
if (error)
break;
- mtx_lock(sema_mtxp);
- if ((error = semvalid(uap->semid, semakptr)) != 0)
+ if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
- /* we could have raced? */
- if (count != semakptr->u.sem_nsems) {
- free(array, M_TEMP);
- array = NULL;
- goto raced;
- }
+ KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
goto done2;
for (i = 0; i < semakptr->u.sem_nsems; i++) {
@@ -810,7 +843,7 @@
semakptr->u.sem_base[i].semval = usval;
}
SEMUNDO_LOCK();
- semundo_clear(semid, -1);
+ semundo_clear(semidx, -1);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
@@ -820,11 +853,8 @@
break;
}
- if (error == 0)
- td->td_retval[0] = rval;
done2:
- if (mtx_owned(sema_mtxp))
- mtx_unlock(sema_mtxp);
+ mtx_unlock(sema_mtxp);
if (array != NULL)
free(array, M_TEMP);
return(error);
@@ -837,10 +867,6 @@
int semflg;
};
#endif
-
-/*
- * MPSAFE
- */
int
semget(td, uap)
struct thread *td;
@@ -881,11 +907,8 @@
}
#ifdef MAC
error = mac_check_sysv_semget(cred, &sema[semid]);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_semget returned %d\n",
- error));
+ if (error != 0)
goto done2;
- }
#endif
goto found;
}
@@ -934,7 +957,7 @@
#ifdef MAC
mac_create_sysv_sem(cred, &sema[semid]);
#endif
- DPRINTF(("sembase = 0x%x, next = 0x%x\n",
+ DPRINTF(("sembase = %p, next = %p\n",
sema[semid].u.sem_base, &sem[semtot]));
} else {
DPRINTF(("didn't find it and wasn't asked to create it\n"));
@@ -956,10 +979,6 @@
size_t nsops;
};
#endif
-
-/*
- * MPSAFE
- */
int
semop(td, uap)
struct thread *td;
@@ -979,7 +998,10 @@
int error;
int do_wakeup, do_undos;
- DPRINTF(("call to semop(%d, 0x%x, %u)\n", semid, sops, nsops));
+#ifdef SEM_DEBUG
+ sops = NULL;
+#endif
+ DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops));
if (!jail_sysvipc_allowed && jailed(td->td_ucred))
return (ENOSYS);
@@ -1000,7 +1022,7 @@
return (E2BIG);
}
if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
- DPRINTF(("error = %d from copyin(%08x, %08x, %d)\n", error,
+ DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error,
uap->sops, sops, nsops * sizeof(sops[0])));
if (sops != small_sops)
free(sops, M_SEM);
@@ -1042,10 +1064,8 @@
}
#ifdef MAC
error = mac_check_sysv_semop(td->td_ucred, semakptr, j);
- if (error != 0) {
- MPRINTF(("mac_check_sysv_semop returned %d\n", error));
+ if (error != 0)
goto done2;
- }
#endif
/*
@@ -1066,8 +1086,8 @@
semptr = &semakptr->u.sem_base[sopptr->sem_num];
DPRINTF((
- "semop: semakptr=%x, sem_base=%x, "
- "semptr=%x, sem[%d]=%d : op=%d, flag=%s\n",
+ "semop: semakptr=%p, sem_base=%p, "
+ "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
semakptr, semakptr->u.sem_base, semptr,
sopptr->sem_num, semptr->semval, sopptr->sem_op,
(sopptr->sem_flg & IPC_NOWAIT) ?
@@ -1267,15 +1287,17 @@
*/
SEMUNDO_LOCK();
SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) {
- if (suptr->un_proc == p)
+ if (suptr->un_proc == p) {
+ *supptr = SLIST_NEXT(suptr, un_next);
break;
+ }
}
SEMUNDO_UNLOCK();
if (suptr == NULL)
return;
- DPRINTF(("proc @%08x has undo structure with %d entries\n", p,
+ DPRINTF(("proc @%p has undo structure with %d entries\n", p,
suptr->un_cnt));
/*
@@ -1301,7 +1323,7 @@
panic("semexit - semnum out of range");
DPRINTF((
- "semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n",
+ "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n",
suptr->un_proc, suptr->un_ent[ix].un_id,
suptr->un_ent[ix].un_num,
suptr->un_ent[ix].un_adjval,
@@ -1328,8 +1350,9 @@
* Deallocate the undo vector.
*/
DPRINTF(("removing vector\n"));
+ SEMUNDO_LOCK();
suptr->un_proc = NULL;
- *supptr = SLIST_NEXT(suptr, un_next);
+ SEMUNDO_UNLOCK();
}
static int
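
[Editor's note] The semexit() change just above unlinks the process's undo record from semu_list at the moment it is found during the locked SLIST_FOREACH_PREVPTR scan, rather than clearing a pointer later outside the lock, so no other thread can see a half-detached entry. The sketch below shows the same single-pass unlink-under-the-lock idea with a plain pointer-to-pointer walk instead of the queue(3) macros; the structure and names are invented.

#include <pthread.h>
#include <stddef.h>

struct undo {
	struct undo	*next;
	const void	*owner;		/* the process this record belongs to */
};

static struct undo *undo_list;
static pthread_mutex_t undo_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Find and unlink the caller's record in one pass while the list lock is
 * held.  Returns the detached record, or NULL if the caller had none.
 */
static struct undo *
undo_detach(const void *owner)
{
	struct undo **prevp, *u;

	pthread_mutex_lock(&undo_lock);
	for (prevp = &undo_list; (u = *prevp) != NULL; prevp = &u->next) {
		if (u->owner == owner) {
			*prevp = u->next;	/* unlink under the lock */
			break;
		}
	}
	pthread_mutex_unlock(&undo_lock);
	return (u);
}
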
Index: subr_witness.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_witness.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/subr_witness.c -L sys/kern/subr_witness.c -u -r1.3 -r1.4
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -82,9 +82,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_witness.c,v 1.195.2.7 2006/01/04 19:27:22 truckman Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_witness.c,v 1.236.2.1 2007/11/27 13:18:54 attilio Exp $");
#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
#include "opt_witness.h"
#include <sys/param.h>
@@ -95,6 +96,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
@@ -103,6 +105,17 @@
#include <machine/stdarg.h>
+/* Note that these traces do not work with KTR_ALQ. */
+#if 0
+#define KTR_WITNESS KTR_SUBSYS
+#else
+#define KTR_WITNESS 0
+#endif
+
+/* Easier to stay with the old names. */
+#define lo_list lo_witness_data.lod_list
+#define lo_witness lo_witness_data.lod_witness
+
/* Define this to check for blessed mutexes */
#undef BLESSING
@@ -167,11 +180,7 @@
static int itismychild(struct witness *parent, struct witness *child);
static void removechild(struct witness *parent, struct witness *child);
static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
-static void witness_displaydescendants(void(*)(const char *fmt, ...),
- struct witness *, int indent);
static const char *fixup_filename(const char *file);
-static void witness_leveldescendents(struct witness *parent, int level);
-static void witness_levelall(void);
static struct witness *witness_get(void);
static void witness_free(struct witness *m);
static struct witness_child_list_entry *witness_child_get(void);
@@ -182,10 +191,14 @@
struct lock_object *lock);
static void witness_list_lock(struct lock_instance *instance);
#ifdef DDB
-static void witness_list(struct thread *td);
+static void witness_leveldescendents(struct witness *parent, int level);
+static void witness_levelall(void);
+static void witness_displaydescendants(void(*)(const char *fmt, ...),
+ struct witness *, int indent);
static void witness_display_list(void(*prnt)(const char *fmt, ...),
struct witness_list *list);
static void witness_display(void(*)(const char *fmt, ...));
+static void witness_list(struct thread *td);
#endif
SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, 0, "Witness Locking");
@@ -205,7 +218,7 @@
/*
* When KDB is enabled and witness_kdb is set to 1, it will cause the system
* to drop into kdebug() when:
- * - a lock heirarchy violation occurs
+ * - a lock hierarchy violation occurs
* - locks are held when going to sleep.
*/
#ifdef WITNESS_KDB
@@ -219,7 +232,7 @@
/*
* When KDB is enabled and witness_trace is set to 1, it will cause the system
* to print a stack trace:
- * - a lock heirarchy violation occurs
+ * - a lock hierarchy violation occurs
* - locks are held when going to sleep.
*/
int witness_trace = 1;
@@ -264,12 +277,12 @@
*/
{ "proctree", &lock_class_sx },
{ "allproc", &lock_class_sx },
+ { "allprison", &lock_class_sx },
{ NULL, NULL },
/*
* Various mutexes
*/
{ "Giant", &lock_class_mtx_sleep },
- { "filedesc structure", &lock_class_mtx_sleep },
{ "pipe mutex", &lock_class_mtx_sleep },
{ "sigio lock", &lock_class_mtx_sleep },
{ "process group", &lock_class_mtx_sleep },
@@ -277,12 +290,13 @@
{ "session", &lock_class_mtx_sleep },
{ "uidinfo hash", &lock_class_mtx_sleep },
{ "uidinfo struct", &lock_class_mtx_sleep },
- { "allprison", &lock_class_mtx_sleep },
+#ifdef HWPMC_HOOKS
+ { "pmc-sleep", &lock_class_mtx_sleep },
+#endif
{ NULL, NULL },
/*
* Sockets
*/
- { "filedesc structure", &lock_class_mtx_sleep },
{ "accept", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ "so_rcv", &lock_class_mtx_sleep },
@@ -297,8 +311,9 @@
{ "ifaddr", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
- * Multicast - protocol locks before interface locks.
+ * Multicast - protocol locks before interface locks, after UDP locks.
*/
+ { "udpinp", &lock_class_mtx_sleep },
{ "in_multi_mtx", &lock_class_mtx_sleep },
{ "igmp_mtx", &lock_class_mtx_sleep },
{ "if_addr_mtx", &lock_class_mtx_sleep },
@@ -348,6 +363,24 @@
{ "nfsd_mtx", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
+
+ /*
+ * IEEE 802.11
+ */
+ { "802.11 com lock", &lock_class_mtx_sleep},
+ { NULL, NULL },
+ /*
+ * Network drivers
+ */
+ { "network driver", &lock_class_mtx_sleep},
+ { NULL, NULL },
+
+ /*
+ * Netgraph
+ */
+ { "ng_node", &lock_class_mtx_sleep },
+ { "ng_worklist", &lock_class_mtx_sleep },
+ { NULL, NULL },
/*
* CDEV
*/
@@ -357,6 +390,13 @@
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
+ * kqueue/VFS interaction
+ */
+ { "kqueue", &lock_class_mtx_sleep },
+ { "struct mount mtx", &lock_class_mtx_sleep },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
* spin locks
*/
#ifdef SMP
@@ -364,42 +404,50 @@
#endif
{ "rm.mutex_mtx", &lock_class_mtx_spin },
{ "sio", &lock_class_mtx_spin },
+ { "scrlock", &lock_class_mtx_spin },
#ifdef __i386__
{ "cy", &lock_class_mtx_spin },
#endif
+#ifdef __sparc64__
+ { "pcib_mtx", &lock_class_mtx_spin },
+ { "rtc_mtx", &lock_class_mtx_spin },
+#endif
+ { "scc_hwmtx", &lock_class_mtx_spin },
{ "uart_hwmtx", &lock_class_mtx_spin },
- { "sabtty", &lock_class_mtx_spin },
- { "zstty", &lock_class_mtx_spin },
- { "ng_node", &lock_class_mtx_spin },
- { "ng_worklist", &lock_class_mtx_spin },
- { "taskqueue_fast", &lock_class_mtx_spin },
+ { "fast_taskqueue", &lock_class_mtx_spin },
{ "intr table", &lock_class_mtx_spin },
+#ifdef HWPMC_HOOKS
+ { "pmc-per-proc", &lock_class_mtx_spin },
+#endif
+ { "process slock", &lock_class_mtx_spin },
{ "sleepq chain", &lock_class_mtx_spin },
- { "sched lock", &lock_class_mtx_spin },
+ { "umtx lock", &lock_class_mtx_spin },
{ "turnstile chain", &lock_class_mtx_spin },
+ { "turnstile lock", &lock_class_mtx_spin },
+ { "sched lock", &lock_class_mtx_spin },
{ "td_contested", &lock_class_mtx_spin },
{ "callout", &lock_class_mtx_spin },
{ "entropy harvest mutex", &lock_class_mtx_spin },
{ "syscons video lock", &lock_class_mtx_spin },
+ { "time lock", &lock_class_mtx_spin },
+#ifdef SMP
+ { "smp rendezvous", &lock_class_mtx_spin },
+#endif
/*
* leaf locks
*/
- { "allpmaps", &lock_class_mtx_spin },
- { "vm page queue free mutex", &lock_class_mtx_spin },
{ "icu", &lock_class_mtx_spin },
-#ifdef SMP
- { "smp rendezvous", &lock_class_mtx_spin },
-#if defined(__i386__) || defined(__amd64__)
- { "tlb", &lock_class_mtx_spin },
-#endif
-#ifdef __sparc64__
+#if defined(SMP) && defined(__sparc64__)
{ "ipi", &lock_class_mtx_spin },
- { "rtc_mtx", &lock_class_mtx_spin },
#endif
+#ifdef __i386__
+ { "allpmaps", &lock_class_mtx_spin },
+ { "descriptor tables", &lock_class_mtx_spin },
#endif
{ "clk", &lock_class_mtx_spin },
- { "mutex profiling lock", &lock_class_mtx_spin },
- { "kse zombie lock", &lock_class_mtx_spin },
+ { "mprof lock", &lock_class_mtx_spin },
+ { "kse lock", &lock_class_mtx_spin },
+ { "zombie lock", &lock_class_mtx_spin },
{ "ALD Queue", &lock_class_mtx_spin },
#ifdef __ia64__
{ "MCA spin lock", &lock_class_mtx_spin },
@@ -413,6 +461,10 @@
{ "tw_cl_io_lock", &lock_class_mtx_spin },
{ "tw_cl_intr_lock", &lock_class_mtx_spin },
{ "tw_cl_gen_lock", &lock_class_mtx_spin },
+#ifdef HWPMC_HOOKS
+ { "pmc-leaf", &lock_class_mtx_spin },
+#endif
+ { "blocked lock", &lock_class_mtx_spin },
{ NULL, NULL },
{ NULL, NULL }
};
@@ -429,19 +481,11 @@
#endif
/*
- * List of all locks in the system.
+ * List of locks initialized prior to witness being initialized whose
+ * enrollment is currently deferred.
*/
-TAILQ_HEAD(, lock_object) all_locks = TAILQ_HEAD_INITIALIZER(all_locks);
-
-static struct mtx all_mtx = {
- { &lock_class_mtx_sleep, /* mtx_object.lo_class */
- "All locks list", /* mtx_object.lo_name */
- "All locks list", /* mtx_object.lo_type */
- LO_INITIALIZED, /* mtx_object.lo_flags */
- { NULL, NULL }, /* mtx_object.lo_list */
- NULL }, /* mtx_object.lo_witness */
- MTX_UNOWNED, 0 /* mtx_lock, mtx_recurse */
-};
+STAILQ_HEAD(, lock_object) pending_locks =
+ STAILQ_HEAD_INITIALIZER(pending_locks);
/*
* This global is set to 0 once it becomes safe to use the witness code.
@@ -455,13 +499,9 @@
static int witness_spin_warn = 0;
/*
- * Global variables for book keeping.
- */
-static int lock_cur_cnt;
-static int lock_max_cnt;
-
-/*
- * The WITNESS-enabled diagnostic code.
+ * The WITNESS-enabled diagnostic code. Note that the witness code does
+ * assume that the early boot is single-threaded at least until after this
+ * routine is completed.
*/
static void
witness_initialize(void *dummy __unused)
@@ -479,9 +519,8 @@
mtx_assert(&Giant, MA_NOTOWNED);
CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
- TAILQ_INSERT_HEAD(&all_locks, &all_mtx.mtx_object, lo_list);
mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
- MTX_NOWITNESS);
+ MTX_NOWITNESS | MTX_NOPROFILE);
for (i = 0; i < WITNESS_COUNT; i++)
witness_free(&w_data[i]);
for (i = 0; i < WITNESS_CHILDCOUNT; i++)
@@ -508,15 +547,14 @@
witness_spin_warn = 1;
/* Iterate through all locks and add them to witness. */
- mtx_lock(&all_mtx);
- TAILQ_FOREACH(lock, &all_locks, lo_list) {
- if (lock->lo_flags & LO_WITNESS)
- lock->lo_witness = enroll(lock->lo_type,
- lock->lo_class);
- else
- lock->lo_witness = NULL;
+ while (!STAILQ_EMPTY(&pending_locks)) {
+ lock = STAILQ_FIRST(&pending_locks);
+ STAILQ_REMOVE_HEAD(&pending_locks, lo_list);
+ KASSERT(lock->lo_flags & LO_WITNESS,
+ ("%s: lock %s is on pending list but not LO_WITNESS",
+ __func__, lock->lo_name));
+ lock->lo_witness = enroll(lock->lo_type, LOCK_CLASS(lock));
}
- mtx_unlock(&all_mtx);
/* Mark the witness code as being ready for use. */
witness_cold = 0;
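
[Editor's note] The witness changes above replace the "all locks" list with a pending_locks queue: locks initialized while witness_cold is set are simply queued, and witness_initialize() drains the queue and enrolls each one once it is safe to do so. That deferred-registration pattern, relying (as the diff's comment notes) on early boot being single-threaded, is sketched below with a hand-rolled tail queue; every name here is invented for the example.

#include <stdbool.h>
#include <stddef.h>

struct tracked {
	struct tracked	*next;		/* used only while on the pending list */
	const char	*name;
	bool		 enrolled;
};

static struct tracked *pending_head, **pending_tailp = &pending_head;
static bool subsys_cold = true;		/* early boot assumed single-threaded */

static void
enroll(struct tracked *t)
{
	/* ... hand the object to the fully initialized subsystem ... */
	t->enrolled = true;
}

/* Called from object constructors, possibly before the subsystem exists. */
static void
tracked_init(struct tracked *t, const char *name)
{
	t->name = name;
	t->enrolled = false;
	t->next = NULL;
	if (subsys_cold) {
		*pending_tailp = t;		/* defer: queue at the tail */
		pending_tailp = &t->next;
	} else
		enroll(t);			/* subsystem ready: enroll now */
}

/* Runs once, late enough that enrolling is safe; drains the backlog. */
static void
subsys_initialize(void)
{
	struct tracked *t;

	while ((t = pending_head) != NULL) {
		pending_head = t->next;
		t->next = NULL;
		enroll(t);
	}
	pending_tailp = &pending_head;
	subsys_cold = false;
}
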
@@ -534,9 +572,6 @@
error = sysctl_handle_int(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
- error = suser(req->td);
- if (error != 0)
- return (error);
if (value == witness_watch)
return (0);
if (value != 0)
@@ -550,10 +585,8 @@
{
struct lock_class *class;
- class = lock->lo_class;
- if (lock->lo_flags & LO_INITIALIZED)
- panic("%s: lock (%s) %s is already initialized", __func__,
- class->lc_name, lock->lo_name);
+ /* Various sanity checks. */
+ class = LOCK_CLASS(lock);
if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
(class->lc_flags & LC_RECURSABLE) == 0)
panic("%s: lock (%s) %s can not be recursable", __func__,
@@ -567,35 +600,38 @@
panic("%s: lock (%s) %s can not be upgradable", __func__,
class->lc_name, lock->lo_name);
- mtx_lock(&all_mtx);
- TAILQ_INSERT_TAIL(&all_locks, lock, lo_list);
- lock->lo_flags |= LO_INITIALIZED;
- lock_cur_cnt++;
- if (lock_cur_cnt > lock_max_cnt)
- lock_max_cnt = lock_cur_cnt;
- mtx_unlock(&all_mtx);
- if (!witness_cold && witness_watch != 0 && panicstr == NULL &&
- (lock->lo_flags & LO_WITNESS) != 0)
- lock->lo_witness = enroll(lock->lo_type, class);
- else
+ /*
+ * If we shouldn't watch this lock, then just clear lo_witness.
+ * Otherwise, if witness_cold is set, then it is too early to
+ * enroll this lock, so defer it to witness_initialize() by adding
+ * it to the pending_locks list. If it is not too early, then enroll
+ * the lock now.
+ */
+ if (witness_watch == 0 || panicstr != NULL ||
+ (lock->lo_flags & LO_WITNESS) == 0)
lock->lo_witness = NULL;
+ else if (witness_cold) {
+ STAILQ_INSERT_TAIL(&pending_locks, lock, lo_list);
+ lock->lo_flags |= LO_ENROLLPEND;
+ } else
+ lock->lo_witness = enroll(lock->lo_type, class);
}
void
witness_destroy(struct lock_object *lock)
{
+ struct lock_class *class;
struct witness *w;
+ class = LOCK_CLASS(lock);
if (witness_cold)
panic("lock (%s) %s destroyed while witness_cold",
- lock->lo_class->lc_name, lock->lo_name);
- if ((lock->lo_flags & LO_INITIALIZED) == 0)
- panic("%s: lock (%s) %s is not initialized", __func__,
- lock->lo_class->lc_name, lock->lo_name);
+ class->lc_name, lock->lo_name);
/* XXX: need to verify that no one holds the lock */
- w = lock->lo_witness;
- if (w != NULL) {
+ if ((lock->lo_flags & (LO_WITNESS | LO_ENROLLPEND)) == LO_WITNESS &&
+ lock->lo_witness != NULL) {
+ w = lock->lo_witness;
mtx_lock_spin(&w_mtx);
MPASS(w->w_refcount > 0);
w->w_refcount--;
@@ -608,15 +644,99 @@
mtx_unlock_spin(&w_mtx);
}
- mtx_lock(&all_mtx);
- lock_cur_cnt--;
- TAILQ_REMOVE(&all_locks, lock, lo_list);
- lock->lo_flags &= ~LO_INITIALIZED;
- mtx_unlock(&all_mtx);
+ /*
+ * If this lock is destroyed before witness is up and running,
+ * remove it from the pending list.
+ */
+ if (lock->lo_flags & LO_ENROLLPEND) {
+ STAILQ_REMOVE(&pending_locks, lock, lock_object, lo_list);
+ lock->lo_flags &= ~LO_ENROLLPEND;
+ }
}
#ifdef DDB
static void
+witness_levelall (void)
+{
+ struct witness_list *list;
+ struct witness *w, *w1;
+
+ /*
+ * First clear all levels.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ w->w_level = 0;
+ }
+
+ /*
+ * Look for locks with no parent and level all their descendants.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ /*
+ * This is just an optimization, technically we could get
+ * away just walking the all list each time.
+ */
+ if (w->w_class->lc_flags & LC_SLEEPLOCK)
+ list = &w_sleep;
+ else
+ list = &w_spin;
+ STAILQ_FOREACH(w1, list, w_typelist) {
+ if (isitmychild(w1, w))
+ goto skip;
+ }
+ witness_leveldescendents(w, 0);
+ skip:
+ ; /* silence GCC 3.x */
+ }
+}
+
+static void
+witness_leveldescendents(struct witness *parent, int level)
+{
+ struct witness_child_list_entry *wcl;
+ int i;
+
+ if (parent->w_level < level)
+ parent->w_level = level;
+ level++;
+ for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
+ for (i = 0; i < wcl->wcl_count; i++)
+ witness_leveldescendents(wcl->wcl_children[i], level);
+}
+
+static void
+witness_displaydescendants(void(*prnt)(const char *fmt, ...),
+ struct witness *parent, int indent)
+{
+ struct witness_child_list_entry *wcl;
+ int i, level;
+
+ level = parent->w_level;
+ prnt("%-2d", level);
+ for (i = 0; i < indent; i++)
+ prnt(" ");
+ if (parent->w_refcount > 0)
+ prnt("%s", parent->w_name);
+ else
+ prnt("(dead)");
+ if (parent->w_displayed) {
+ prnt(" -- (already displayed)\n");
+ return;
+ }
+ parent->w_displayed = 1;
+ if (parent->w_refcount > 0) {
+ if (parent->w_file != NULL)
+ prnt(" -- last acquired @ %s:%d", parent->w_file,
+ parent->w_line);
+ }
+ prnt("\n");
+ for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
+ for (i = 0; i < wcl->wcl_count; i++)
+ witness_displaydescendants(prnt,
+ wcl->wcl_children[i], indent + 1);
+}
+
+static void
witness_display_list(void(*prnt)(const char *fmt, ...),
struct witness_list *list)
{
@@ -742,7 +862,7 @@
__func__);
w = lock->lo_witness;
- class = lock->lo_class;
+ class = LOCK_CLASS(lock);
td = curthread;
file = fixup_filename(file);
@@ -867,14 +987,14 @@
* lock, then skip it.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
- lock == &Giant.mtx_object)
+ lock == &Giant.lock_object)
continue;
/*
* If we are locking a sleepable lock and this lock
* is Giant, then skip it.
*/
if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
- lock1->li_lock == &Giant.mtx_object)
+ lock1->li_lock == &Giant.lock_object)
continue;
/*
* If we are locking a sleepable lock and this lock
@@ -890,7 +1010,7 @@
* lock, then treat it as a reversal.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
- lock == &Giant.mtx_object)
+ lock == &Giant.lock_object)
goto reversal;
/*
			 * Check the lock order hierarchy for a reversal.
@@ -912,7 +1032,7 @@
if (blessed(w, w1))
return;
#endif
- if (lock1->li_lock == &Giant.mtx_object) {
+ if (lock1->li_lock == &Giant.lock_object) {
if (w1->w_Giant_squawked)
return;
else
@@ -931,7 +1051,7 @@
printf(
"lock order reversal: (sleepable after non-sleepable)\n");
else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
- && lock == &Giant.mtx_object)
+ && lock == &Giant.lock_object)
printf(
"lock order reversal: (Giant after non-sleepable)\n");
else
@@ -986,7 +1106,7 @@
* always come before Giant.
*/
if (flags & LOP_NEWORDER &&
- !(lock1->li_lock == &Giant.mtx_object &&
+ !(lock1->li_lock == &Giant.lock_object &&
(lock->lo_flags & LO_SLEEPABLE) != 0)) {
CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
lock->lo_type, lock1->li_lock->lo_type);
@@ -1022,7 +1142,7 @@
file = fixup_filename(file);
/* Determine lock list for this lock. */
- if (lock->lo_class->lc_flags & LC_SLEEPLOCK)
+ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
lock_list = &td->td_sleeplocks;
else
lock_list = PCPU_PTR(spinlocks);
@@ -1075,7 +1195,7 @@
KASSERT(!witness_cold, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
return;
- class = lock->lo_class;
+ class = LOCK_CLASS(lock);
file = fixup_filename(file);
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
panic("upgrade of non-upgradable lock (%s) %s @ %s:%d",
@@ -1083,7 +1203,7 @@
if ((flags & LOP_TRYLOCK) == 0)
panic("non-try upgrade of lock (%s) %s @ %s:%d", class->lc_name,
lock->lo_name, file, line);
- if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+ if ((class->lc_flags & LC_SLEEPLOCK) == 0)
panic("upgrade of non-sleep lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name, file, line);
instance = find_instance(curthread->td_sleeplocks, lock);
@@ -1110,12 +1230,12 @@
KASSERT(!witness_cold, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
return;
- class = lock->lo_class;
+ class = LOCK_CLASS(lock);
file = fixup_filename(file);
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
panic("downgrade of non-upgradable lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name, file, line);
- if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+ if ((class->lc_flags & LC_SLEEPLOCK) == 0)
panic("downgrade of non-sleep lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name, file, line);
instance = find_instance(curthread->td_sleeplocks, lock);
@@ -1146,7 +1266,7 @@
panicstr != NULL)
return;
td = curthread;
- class = lock->lo_class;
+ class = LOCK_CLASS(lock);
file = fixup_filename(file);
/* Find lock instance associated with this lock. */
@@ -1238,7 +1358,7 @@
if (lock1->li_lock == lock)
continue;
if (flags & WARN_GIANTOK &&
- lock1->li_lock == &Giant.mtx_object)
+ lock1->li_lock == &Giant.lock_object)
continue;
if (flags & WARN_SLEEPOK &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
@@ -1520,87 +1640,6 @@
return (0);
}
-static void
-witness_levelall (void)
-{
- struct witness_list *list;
- struct witness *w, *w1;
-
- /*
- * First clear all levels.
- */
- STAILQ_FOREACH(w, &w_all, w_list) {
- w->w_level = 0;
- }
-
- /*
- * Look for locks with no parent and level all their descendants.
- */
- STAILQ_FOREACH(w, &w_all, w_list) {
- /*
- * This is just an optimization, technically we could get
- * away just walking the all list each time.
- */
- if (w->w_class->lc_flags & LC_SLEEPLOCK)
- list = &w_sleep;
- else
- list = &w_spin;
- STAILQ_FOREACH(w1, list, w_typelist) {
- if (isitmychild(w1, w))
- goto skip;
- }
- witness_leveldescendents(w, 0);
- skip:
- ; /* silence GCC 3.x */
- }
-}
-
-static void
-witness_leveldescendents(struct witness *parent, int level)
-{
- struct witness_child_list_entry *wcl;
- int i;
-
- if (parent->w_level < level)
- parent->w_level = level;
- level++;
- for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
- for (i = 0; i < wcl->wcl_count; i++)
- witness_leveldescendents(wcl->wcl_children[i], level);
-}
-
-static void
-witness_displaydescendants(void(*prnt)(const char *fmt, ...),
- struct witness *parent, int indent)
-{
- struct witness_child_list_entry *wcl;
- int i, level;
-
- level = parent->w_level;
- prnt("%-2d", level);
- for (i = 0; i < indent; i++)
- prnt(" ");
- if (parent->w_refcount > 0)
- prnt("%s", parent->w_name);
- else
- prnt("(dead)");
- if (parent->w_displayed) {
- prnt(" -- (already displayed)\n");
- return;
- }
- parent->w_displayed = 1;
- if (parent->w_refcount > 0) {
- if (parent->w_file != NULL)
- prnt(" -- last acquired @ %s:%d", parent->w_file,
- parent->w_line);
- }
- prnt("\n");
- for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
- for (i = 0; i < wcl->wcl_count; i++)
- witness_displaydescendants(prnt,
- wcl->wcl_children[i], indent + 1);
-}
-
#ifdef BLESSING
static int
blessed(struct witness *w1, struct witness *w2)
@@ -1738,7 +1777,7 @@
lock = instance->li_lock;
printf("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
- "exclusive" : "shared", lock->lo_class->lc_name, lock->lo_name);
+ "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
if (lock->lo_type != lock->lo_name)
printf(" (%s)", lock->lo_type);
printf(" r = %d (%p) locked @ %s:%d\n",
@@ -1806,18 +1845,25 @@
void
witness_save(struct lock_object *lock, const char **filep, int *linep)
{
+ struct lock_list_entry *lock_list;
struct lock_instance *instance;
+ struct lock_class *class;
KASSERT(!witness_cold, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
return;
- if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
- panic("%s: lock (%s) %s is not a sleep lock", __func__,
- lock->lo_class->lc_name, lock->lo_name);
- instance = find_instance(curthread->td_sleeplocks, lock);
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
if (instance == NULL)
panic("%s: lock (%s) %s not locked", __func__,
- lock->lo_class->lc_name, lock->lo_name);
+ class->lc_name, lock->lo_name);
*filep = instance->li_file;
*linep = instance->li_line;
}
@@ -1825,18 +1871,25 @@
void
witness_restore(struct lock_object *lock, const char *file, int line)
{
+ struct lock_list_entry *lock_list;
struct lock_instance *instance;
+ struct lock_class *class;
KASSERT(!witness_cold, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
return;
- if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
- panic("%s: lock (%s) %s is not a sleep lock", __func__,
- lock->lo_class->lc_name, lock->lo_name);
- instance = find_instance(curthread->td_sleeplocks, lock);
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
if (instance == NULL)
panic("%s: lock (%s) %s not locked", __func__,
- lock->lo_class->lc_name, lock->lo_name);
+ class->lc_name, lock->lo_name);
lock->lo_witness->w_file = file;
lock->lo_witness->w_line = line;
instance->li_file = file;
@@ -1848,23 +1901,25 @@
{
#ifdef INVARIANT_SUPPORT
struct lock_instance *instance;
+ struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
return;
- if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) != 0)
+ class = LOCK_CLASS(lock);
+ if ((class->lc_flags & LC_SLEEPLOCK) != 0)
instance = find_instance(curthread->td_sleeplocks, lock);
- else if ((lock->lo_class->lc_flags & LC_SPINLOCK) != 0)
+ else if ((class->lc_flags & LC_SPINLOCK) != 0)
instance = find_instance(PCPU_GET(spinlocks), lock);
else {
panic("Lock (%s) %s is not sleep or spin!",
- lock->lo_class->lc_name, lock->lo_name);
+ class->lc_name, lock->lo_name);
}
file = fixup_filename(file);
switch (flags) {
case LA_UNLOCKED:
if (instance != NULL)
panic("Lock (%s) %s locked @ %s:%d.",
- lock->lo_class->lc_name, lock->lo_name, file, line);
+ class->lc_name, lock->lo_name, file, line);
break;
case LA_LOCKED:
case LA_LOCKED | LA_RECURSED:
@@ -1877,25 +1932,25 @@
case LA_XLOCKED | LA_NOTRECURSED:
if (instance == NULL) {
panic("Lock (%s) %s not locked @ %s:%d.",
- lock->lo_class->lc_name, lock->lo_name, file, line);
+ class->lc_name, lock->lo_name, file, line);
break;
}
if ((flags & LA_XLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) == 0)
panic("Lock (%s) %s not exclusively locked @ %s:%d.",
- lock->lo_class->lc_name, lock->lo_name, file, line);
+ class->lc_name, lock->lo_name, file, line);
if ((flags & LA_SLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) != 0)
panic("Lock (%s) %s exclusively locked @ %s:%d.",
- lock->lo_class->lc_name, lock->lo_name, file, line);
+ class->lc_name, lock->lo_name, file, line);
if ((flags & LA_RECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) == 0)
panic("Lock (%s) %s not recursed @ %s:%d.",
- lock->lo_class->lc_name, lock->lo_name, file, line);
+ class->lc_name, lock->lo_name, file, line);
if ((flags & LA_NOTRECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) != 0)
panic("Lock (%s) %s recursed @ %s:%d.",
- lock->lo_class->lc_name, lock->lo_name, file, line);
+ class->lc_name, lock->lo_name, file, line);
break;
default:
panic("Invalid lock assertion at %s:%d.", file, line);
@@ -1925,10 +1980,10 @@
* td->td_oncpu to get the list of spinlocks for this thread
* and "fix" this.
*
- * That still wouldn't really fix this unless we locked sched_lock
- * or stopped the other CPU to make sure it wasn't changing the list
- * out from under us. It is probably best to just not try to handle
- * threads on other CPU's for now.
+ * That still wouldn't really fix this unless we locked the scheduler
+ * lock or stopped the other CPU to make sure it wasn't changing the
+ * list out from under us. It is probably best to just not try to
+ * handle threads on other CPUs for now.
*/
if (td == curthread && PCPU_GET(spinlocks) != NULL)
witness_list_locks(PCPU_PTR(spinlocks));
@@ -1937,30 +1992,12 @@
DB_SHOW_COMMAND(locks, db_witness_list)
{
struct thread *td;
- pid_t pid;
- struct proc *p;
- if (have_addr) {
- pid = (addr % 16) + ((addr >> 4) % 16) * 10 +
- ((addr >> 8) % 16) * 100 + ((addr >> 12) % 16) * 1000 +
- ((addr >> 16) % 16) * 10000;
- /* sx_slock(&allproc_lock); */
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_pid == pid)
- break;
- }
- /* sx_sunlock(&allproc_lock); */
- if (p == NULL) {
- db_printf("pid %d not found\n", pid);
- return;
- }
- FOREACH_THREAD_IN_PROC(p, td) {
- witness_list(td);
- }
- } else {
- td = curthread;
- witness_list(td);
- }
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+ witness_list(td);
}
DB_SHOW_COMMAND(alllocks, db_witness_list_all)
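
[Editor's note on the subr_witness.c change above: locks created before witness_initialize() runs can no longer be enrolled immediately, so witness_init() now parks them on the pending_locks STAILQ and witness_initialize() drains that queue once the subsystem is ready. Below is a minimal, self-contained sketch of that defer-then-drain pattern using BSD's <sys/queue.h> STAILQ macros; the struct fields, enroll() stub, and subsystem_cold flag are simplified illustrations for this note, not the kernel's real definitions.]

/*
 * Sketch of the deferred-enrollment pattern: objects registered while
 * the subsystem is still "cold" are queued and enrolled later in one
 * pass, mirroring the pending_locks handling in the diff above.
 */
#include <sys/queue.h>
#include <stdio.h>

struct lock_object {
	const char *lo_name;
	int lo_enrolled;
	STAILQ_ENTRY(lock_object) lo_list;
};

static STAILQ_HEAD(, lock_object) pending_locks =
    STAILQ_HEAD_INITIALIZER(pending_locks);
static int subsystem_cold = 1;		/* analogous to witness_cold */

static void
enroll(struct lock_object *lock)
{
	lock->lo_enrolled = 1;
	printf("enrolled %s\n", lock->lo_name);
}

static void
register_lock(struct lock_object *lock)
{
	if (subsystem_cold)
		/* Too early: defer to the initialization pass. */
		STAILQ_INSERT_TAIL(&pending_locks, lock, lo_list);
	else
		enroll(lock);
}

static void
subsystem_initialize(void)
{
	struct lock_object *lock;

	/* Drain everything registered while we were cold. */
	while (!STAILQ_EMPTY(&pending_locks)) {
		lock = STAILQ_FIRST(&pending_locks);
		STAILQ_REMOVE_HEAD(&pending_locks, lo_list);
		enroll(lock);
	}
	subsystem_cold = 0;
}

int
main(void)
{
	struct lock_object a = { "early lock", 0, { NULL } };

	register_lock(&a);		/* queued, not yet enrolled */
	subsystem_initialize();		/* queue drained, lock enrolled */
	return (0);
}

[End of editor's note; the next file diff follows.]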
Index: init_sysent.c
===================================================================
RCS file: /home/cvs/src/sys/kern/init_sysent.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/init_sysent.c -L sys/kern/init_sysent.c -u -r1.2 -r1.3
--- sys/kern/init_sysent.c
+++ sys/kern/init_sysent.c
@@ -2,8 +2,8 @@
* System call switch table.
*
* DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/init_sysent.c,v 1.195.2.2 2006/03/17 01:47:32 rwatson Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp
+ * $FreeBSD: src/sys/kern/init_sysent.c,v 1.230 2007/08/16 05:32:25 davidxu Exp $
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp
*/
#include "opt_compat.h"
@@ -29,460 +29,486 @@
/* The casts are bogus but will do for now. */
struct sysent sysent[] = {
- { SYF_MPSAFE | 0, (sy_call_t *)nosys, AUE_NULL }, /* 0 = syscall */
- { SYF_MPSAFE | AS(sys_exit_args), (sy_call_t *)sys_exit, AUE_NULL }, /* 1 = exit */
- { SYF_MPSAFE | 0, (sy_call_t *)fork, AUE_NULL }, /* 2 = fork */
- { SYF_MPSAFE | AS(read_args), (sy_call_t *)read, AUE_NULL }, /* 3 = read */
- { SYF_MPSAFE | AS(write_args), (sy_call_t *)write, AUE_NULL }, /* 4 = write */
- { SYF_MPSAFE | AS(open_args), (sy_call_t *)open, AUE_NULL }, /* 5 = open */
- { SYF_MPSAFE | AS(close_args), (sy_call_t *)close, AUE_NULL }, /* 6 = close */
- { SYF_MPSAFE | AS(wait_args), (sy_call_t *)wait4, AUE_NULL }, /* 7 = wait4 */
- { compat(SYF_MPSAFE | AS(ocreat_args),creat), AUE_NULL }, /* 8 = old creat */
- { SYF_MPSAFE | AS(link_args), (sy_call_t *)link, AUE_NULL }, /* 9 = link */
- { SYF_MPSAFE | AS(unlink_args), (sy_call_t *)unlink, AUE_NULL }, /* 10 = unlink */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 11 = obsolete execv */
- { SYF_MPSAFE | AS(chdir_args), (sy_call_t *)chdir, AUE_NULL }, /* 12 = chdir */
- { SYF_MPSAFE | AS(fchdir_args), (sy_call_t *)fchdir, AUE_NULL }, /* 13 = fchdir */
- { SYF_MPSAFE | AS(mknod_args), (sy_call_t *)mknod, AUE_NULL }, /* 14 = mknod */
- { SYF_MPSAFE | AS(chmod_args), (sy_call_t *)chmod, AUE_NULL }, /* 15 = chmod */
- { SYF_MPSAFE | AS(chown_args), (sy_call_t *)chown, AUE_NULL }, /* 16 = chown */
- { SYF_MPSAFE | AS(obreak_args), (sy_call_t *)obreak, AUE_NULL }, /* 17 = break */
- { compat4(SYF_MPSAFE | AS(freebsd4_getfsstat_args),getfsstat), AUE_NULL }, /* 18 = old getfsstat */
- { compat(SYF_MPSAFE | AS(olseek_args),lseek), AUE_NULL }, /* 19 = old lseek */
- { SYF_MPSAFE | 0, (sy_call_t *)getpid, AUE_NULL }, /* 20 = getpid */
- { AS(mount_args), (sy_call_t *)mount, AUE_NULL }, /* 21 = mount */
- { AS(unmount_args), (sy_call_t *)unmount, AUE_NULL }, /* 22 = unmount */
- { SYF_MPSAFE | AS(setuid_args), (sy_call_t *)setuid, AUE_NULL }, /* 23 = setuid */
- { SYF_MPSAFE | 0, (sy_call_t *)getuid, AUE_NULL }, /* 24 = getuid */
- { SYF_MPSAFE | 0, (sy_call_t *)geteuid, AUE_NULL }, /* 25 = geteuid */
- { SYF_MPSAFE | AS(ptrace_args), (sy_call_t *)ptrace, AUE_NULL }, /* 26 = ptrace */
- { SYF_MPSAFE | AS(recvmsg_args), (sy_call_t *)recvmsg, AUE_NULL }, /* 27 = recvmsg */
- { SYF_MPSAFE | AS(sendmsg_args), (sy_call_t *)sendmsg, AUE_NULL }, /* 28 = sendmsg */
- { SYF_MPSAFE | AS(recvfrom_args), (sy_call_t *)recvfrom, AUE_NULL }, /* 29 = recvfrom */
- { SYF_MPSAFE | AS(accept_args), (sy_call_t *)accept, AUE_NULL }, /* 30 = accept */
- { SYF_MPSAFE | AS(getpeername_args), (sy_call_t *)getpeername, AUE_NULL }, /* 31 = getpeername */
- { SYF_MPSAFE | AS(getsockname_args), (sy_call_t *)getsockname, AUE_NULL }, /* 32 = getsockname */
- { SYF_MPSAFE | AS(access_args), (sy_call_t *)access, AUE_NULL }, /* 33 = access */
- { SYF_MPSAFE | AS(chflags_args), (sy_call_t *)chflags, AUE_NULL }, /* 34 = chflags */
- { SYF_MPSAFE | AS(fchflags_args), (sy_call_t *)fchflags, AUE_NULL }, /* 35 = fchflags */
- { SYF_MPSAFE | 0, (sy_call_t *)sync, AUE_NULL }, /* 36 = sync */
- { SYF_MPSAFE | AS(kill_args), (sy_call_t *)kill, AUE_NULL }, /* 37 = kill */
- { compat(SYF_MPSAFE | AS(ostat_args),stat), AUE_NULL }, /* 38 = old stat */
- { SYF_MPSAFE | 0, (sy_call_t *)getppid, AUE_NULL }, /* 39 = getppid */
- { compat(SYF_MPSAFE | AS(olstat_args),lstat), AUE_NULL }, /* 40 = old lstat */
- { SYF_MPSAFE | AS(dup_args), (sy_call_t *)dup, AUE_NULL }, /* 41 = dup */
- { SYF_MPSAFE | 0, (sy_call_t *)pipe, AUE_NULL }, /* 42 = pipe */
- { SYF_MPSAFE | 0, (sy_call_t *)getegid, AUE_NULL }, /* 43 = getegid */
- { SYF_MPSAFE | AS(profil_args), (sy_call_t *)profil, AUE_NULL }, /* 44 = profil */
- { SYF_MPSAFE | AS(ktrace_args), (sy_call_t *)ktrace, AUE_NULL }, /* 45 = ktrace */
- { compat(SYF_MPSAFE | AS(osigaction_args),sigaction), AUE_NULL }, /* 46 = old sigaction */
- { SYF_MPSAFE | 0, (sy_call_t *)getgid, AUE_NULL }, /* 47 = getgid */
- { compat(SYF_MPSAFE | AS(osigprocmask_args),sigprocmask), AUE_NULL }, /* 48 = old sigprocmask */
- { SYF_MPSAFE | AS(getlogin_args), (sy_call_t *)getlogin, AUE_NULL }, /* 49 = getlogin */
- { SYF_MPSAFE | AS(setlogin_args), (sy_call_t *)setlogin, AUE_NULL }, /* 50 = setlogin */
- { SYF_MPSAFE | AS(acct_args), (sy_call_t *)acct, AUE_NULL }, /* 51 = acct */
- { compat(SYF_MPSAFE | 0,sigpending), AUE_NULL }, /* 52 = old sigpending */
- { SYF_MPSAFE | AS(sigaltstack_args), (sy_call_t *)sigaltstack, AUE_NULL }, /* 53 = sigaltstack */
- { SYF_MPSAFE | AS(ioctl_args), (sy_call_t *)ioctl, AUE_NULL }, /* 54 = ioctl */
- { SYF_MPSAFE | AS(reboot_args), (sy_call_t *)reboot, AUE_NULL }, /* 55 = reboot */
- { SYF_MPSAFE | AS(revoke_args), (sy_call_t *)revoke, AUE_NULL }, /* 56 = revoke */
- { SYF_MPSAFE | AS(symlink_args), (sy_call_t *)symlink, AUE_NULL }, /* 57 = symlink */
- { SYF_MPSAFE | AS(readlink_args), (sy_call_t *)readlink, AUE_NULL }, /* 58 = readlink */
- { SYF_MPSAFE | AS(execve_args), (sy_call_t *)execve, AUE_NULL }, /* 59 = execve */
- { SYF_MPSAFE | AS(umask_args), (sy_call_t *)umask, AUE_NULL }, /* 60 = umask */
- { SYF_MPSAFE | AS(chroot_args), (sy_call_t *)chroot, AUE_NULL }, /* 61 = chroot */
- { compat(SYF_MPSAFE | AS(ofstat_args),fstat), AUE_NULL }, /* 62 = old fstat */
- { compat(SYF_MPSAFE | AS(getkerninfo_args),getkerninfo), AUE_NULL }, /* 63 = old getkerninfo */
- { compat(SYF_MPSAFE | 0,getpagesize), AUE_NULL }, /* 64 = old getpagesize */
- { SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync, AUE_NULL }, /* 65 = msync */
- { SYF_MPSAFE | 0, (sy_call_t *)vfork, AUE_NULL }, /* 66 = vfork */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 67 = obsolete vread */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 68 = obsolete vwrite */
- { SYF_MPSAFE | AS(sbrk_args), (sy_call_t *)sbrk, AUE_NULL }, /* 69 = sbrk */
- { SYF_MPSAFE | AS(sstk_args), (sy_call_t *)sstk, AUE_NULL }, /* 70 = sstk */
- { compat(SYF_MPSAFE | AS(ommap_args),mmap), AUE_NULL }, /* 71 = old mmap */
- { SYF_MPSAFE | AS(ovadvise_args), (sy_call_t *)ovadvise, AUE_NULL }, /* 72 = vadvise */
- { SYF_MPSAFE | AS(munmap_args), (sy_call_t *)munmap, AUE_NULL }, /* 73 = munmap */
- { SYF_MPSAFE | AS(mprotect_args), (sy_call_t *)mprotect, AUE_NULL }, /* 74 = mprotect */
- { SYF_MPSAFE | AS(madvise_args), (sy_call_t *)madvise, AUE_NULL }, /* 75 = madvise */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 76 = obsolete vhangup */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 77 = obsolete vlimit */
- { SYF_MPSAFE | AS(mincore_args), (sy_call_t *)mincore, AUE_NULL }, /* 78 = mincore */
- { SYF_MPSAFE | AS(getgroups_args), (sy_call_t *)getgroups, AUE_NULL }, /* 79 = getgroups */
- { SYF_MPSAFE | AS(setgroups_args), (sy_call_t *)setgroups, AUE_NULL }, /* 80 = setgroups */
- { SYF_MPSAFE | 0, (sy_call_t *)getpgrp, AUE_NULL }, /* 81 = getpgrp */
- { SYF_MPSAFE | AS(setpgid_args), (sy_call_t *)setpgid, AUE_NULL }, /* 82 = setpgid */
- { SYF_MPSAFE | AS(setitimer_args), (sy_call_t *)setitimer, AUE_NULL }, /* 83 = setitimer */
- { compat(SYF_MPSAFE | 0,wait), AUE_NULL }, /* 84 = old wait */
- { SYF_MPSAFE | AS(swapon_args), (sy_call_t *)swapon, AUE_NULL }, /* 85 = swapon */
- { SYF_MPSAFE | AS(getitimer_args), (sy_call_t *)getitimer, AUE_NULL }, /* 86 = getitimer */
- { compat(SYF_MPSAFE | AS(gethostname_args),gethostname), AUE_NULL }, /* 87 = old gethostname */
- { compat(SYF_MPSAFE | AS(sethostname_args),sethostname), AUE_NULL }, /* 88 = old sethostname */
- { SYF_MPSAFE | 0, (sy_call_t *)getdtablesize, AUE_NULL }, /* 89 = getdtablesize */
- { SYF_MPSAFE | AS(dup2_args), (sy_call_t *)dup2, AUE_NULL }, /* 90 = dup2 */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 91 = getdopt */
- { SYF_MPSAFE | AS(fcntl_args), (sy_call_t *)fcntl, AUE_NULL }, /* 92 = fcntl */
- { SYF_MPSAFE | AS(select_args), (sy_call_t *)select, AUE_NULL }, /* 93 = select */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 94 = setdopt */
- { SYF_MPSAFE | AS(fsync_args), (sy_call_t *)fsync, AUE_NULL }, /* 95 = fsync */
- { SYF_MPSAFE | AS(setpriority_args), (sy_call_t *)setpriority, AUE_NULL }, /* 96 = setpriority */
- { SYF_MPSAFE | AS(socket_args), (sy_call_t *)socket, AUE_NULL }, /* 97 = socket */
- { SYF_MPSAFE | AS(connect_args), (sy_call_t *)connect, AUE_NULL }, /* 98 = connect */
- { compat(SYF_MPSAFE | AS(accept_args),accept), AUE_NULL }, /* 99 = old accept */
- { SYF_MPSAFE | AS(getpriority_args), (sy_call_t *)getpriority, AUE_NULL }, /* 100 = getpriority */
- { compat(SYF_MPSAFE | AS(osend_args),send), AUE_NULL }, /* 101 = old send */
- { compat(SYF_MPSAFE | AS(orecv_args),recv), AUE_NULL }, /* 102 = old recv */
- { compat(SYF_MPSAFE | AS(osigreturn_args),sigreturn), AUE_NULL }, /* 103 = old sigreturn */
- { SYF_MPSAFE | AS(bind_args), (sy_call_t *)bind, AUE_NULL }, /* 104 = bind */
- { SYF_MPSAFE | AS(setsockopt_args), (sy_call_t *)setsockopt, AUE_NULL }, /* 105 = setsockopt */
- { SYF_MPSAFE | AS(listen_args), (sy_call_t *)listen, AUE_NULL }, /* 106 = listen */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 107 = obsolete vtimes */
- { compat(SYF_MPSAFE | AS(osigvec_args),sigvec), AUE_NULL }, /* 108 = old sigvec */
- { compat(SYF_MPSAFE | AS(osigblock_args),sigblock), AUE_NULL }, /* 109 = old sigblock */
- { compat(SYF_MPSAFE | AS(osigsetmask_args),sigsetmask), AUE_NULL }, /* 110 = old sigsetmask */
- { compat(SYF_MPSAFE | AS(osigsuspend_args),sigsuspend), AUE_NULL }, /* 111 = old sigsuspend */
- { compat(SYF_MPSAFE | AS(osigstack_args),sigstack), AUE_NULL }, /* 112 = old sigstack */
- { compat(SYF_MPSAFE | AS(orecvmsg_args),recvmsg), AUE_NULL }, /* 113 = old recvmsg */
- { compat(SYF_MPSAFE | AS(osendmsg_args),sendmsg), AUE_NULL }, /* 114 = old sendmsg */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 115 = obsolete vtrace */
- { SYF_MPSAFE | AS(gettimeofday_args), (sy_call_t *)gettimeofday, AUE_NULL }, /* 116 = gettimeofday */
- { SYF_MPSAFE | AS(getrusage_args), (sy_call_t *)getrusage, AUE_NULL }, /* 117 = getrusage */
- { SYF_MPSAFE | AS(getsockopt_args), (sy_call_t *)getsockopt, AUE_NULL }, /* 118 = getsockopt */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 119 = resuba */
- { SYF_MPSAFE | AS(readv_args), (sy_call_t *)readv, AUE_NULL }, /* 120 = readv */
- { SYF_MPSAFE | AS(writev_args), (sy_call_t *)writev, AUE_NULL }, /* 121 = writev */
- { SYF_MPSAFE | AS(settimeofday_args), (sy_call_t *)settimeofday, AUE_NULL }, /* 122 = settimeofday */
- { SYF_MPSAFE | AS(fchown_args), (sy_call_t *)fchown, AUE_NULL }, /* 123 = fchown */
- { SYF_MPSAFE | AS(fchmod_args), (sy_call_t *)fchmod, AUE_NULL }, /* 124 = fchmod */
- { compat(SYF_MPSAFE | AS(recvfrom_args),recvfrom), AUE_NULL }, /* 125 = old recvfrom */
- { SYF_MPSAFE | AS(setreuid_args), (sy_call_t *)setreuid, AUE_NULL }, /* 126 = setreuid */
- { SYF_MPSAFE | AS(setregid_args), (sy_call_t *)setregid, AUE_NULL }, /* 127 = setregid */
- { SYF_MPSAFE | AS(rename_args), (sy_call_t *)rename, AUE_NULL }, /* 128 = rename */
- { compat(SYF_MPSAFE | AS(otruncate_args),truncate), AUE_NULL }, /* 129 = old truncate */
- { compat(SYF_MPSAFE | AS(oftruncate_args),ftruncate), AUE_NULL }, /* 130 = old ftruncate */
- { SYF_MPSAFE | AS(flock_args), (sy_call_t *)flock, AUE_NULL }, /* 131 = flock */
- { SYF_MPSAFE | AS(mkfifo_args), (sy_call_t *)mkfifo, AUE_NULL }, /* 132 = mkfifo */
- { SYF_MPSAFE | AS(sendto_args), (sy_call_t *)sendto, AUE_NULL }, /* 133 = sendto */
- { SYF_MPSAFE | AS(shutdown_args), (sy_call_t *)shutdown, AUE_NULL }, /* 134 = shutdown */
- { SYF_MPSAFE | AS(socketpair_args), (sy_call_t *)socketpair, AUE_NULL }, /* 135 = socketpair */
- { SYF_MPSAFE | AS(mkdir_args), (sy_call_t *)mkdir, AUE_NULL }, /* 136 = mkdir */
- { SYF_MPSAFE | AS(rmdir_args), (sy_call_t *)rmdir, AUE_NULL }, /* 137 = rmdir */
- { SYF_MPSAFE | AS(utimes_args), (sy_call_t *)utimes, AUE_NULL }, /* 138 = utimes */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 139 = obsolete 4.2 sigreturn */
- { SYF_MPSAFE | AS(adjtime_args), (sy_call_t *)adjtime, AUE_NULL }, /* 140 = adjtime */
- { compat(SYF_MPSAFE | AS(ogetpeername_args),getpeername), AUE_NULL }, /* 141 = old getpeername */
- { compat(SYF_MPSAFE | 0,gethostid), AUE_NULL }, /* 142 = old gethostid */
- { compat(SYF_MPSAFE | AS(osethostid_args),sethostid), AUE_NULL }, /* 143 = old sethostid */
- { compat(SYF_MPSAFE | AS(ogetrlimit_args),getrlimit), AUE_NULL }, /* 144 = old getrlimit */
- { compat(SYF_MPSAFE | AS(osetrlimit_args),setrlimit), AUE_NULL }, /* 145 = old setrlimit */
- { compat(SYF_MPSAFE | AS(okillpg_args),killpg), AUE_NULL }, /* 146 = old killpg */
- { SYF_MPSAFE | 0, (sy_call_t *)setsid, AUE_NULL }, /* 147 = setsid */
- { SYF_MPSAFE | AS(quotactl_args), (sy_call_t *)quotactl, AUE_NULL }, /* 148 = quotactl */
- { compat(SYF_MPSAFE | 0,quota), AUE_NULL }, /* 149 = old quota */
- { compat(SYF_MPSAFE | AS(getsockname_args),getsockname), AUE_NULL }, /* 150 = old getsockname */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 151 = sem_lock */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 152 = sem_wakeup */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 153 = asyncdaemon */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 154 = nosys */
- { SYF_MPSAFE | AS(nfssvc_args), (sy_call_t *)nosys, AUE_NULL }, /* 155 = nfssvc */
- { compat(AS(ogetdirentries_args),getdirentries), AUE_NULL }, /* 156 = old getdirentries */
- { compat4(SYF_MPSAFE | AS(freebsd4_statfs_args),statfs), AUE_NULL }, /* 157 = old statfs */
- { compat4(SYF_MPSAFE | AS(freebsd4_fstatfs_args),fstatfs), AUE_NULL }, /* 158 = old fstatfs */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 159 = nosys */
- { SYF_MPSAFE | AS(lgetfh_args), (sy_call_t *)lgetfh, AUE_NULL }, /* 160 = lgetfh */
- { SYF_MPSAFE | AS(getfh_args), (sy_call_t *)getfh, AUE_NULL }, /* 161 = getfh */
- { SYF_MPSAFE | AS(getdomainname_args), (sy_call_t *)getdomainname, AUE_NULL }, /* 162 = getdomainname */
- { SYF_MPSAFE | AS(setdomainname_args), (sy_call_t *)setdomainname, AUE_NULL }, /* 163 = setdomainname */
- { SYF_MPSAFE | AS(uname_args), (sy_call_t *)uname, AUE_NULL }, /* 164 = uname */
- { SYF_MPSAFE | AS(sysarch_args), (sy_call_t *)sysarch, AUE_NULL }, /* 165 = sysarch */
- { SYF_MPSAFE | AS(rtprio_args), (sy_call_t *)rtprio, AUE_NULL }, /* 166 = rtprio */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 167 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 168 = nosys */
- { SYF_MPSAFE | AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 169 = semsys */
- { SYF_MPSAFE | AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 170 = msgsys */
- { SYF_MPSAFE | AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 171 = shmsys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 172 = nosys */
- { SYF_MPSAFE | AS(pread_args), (sy_call_t *)pread, AUE_NULL }, /* 173 = pread */
- { SYF_MPSAFE | AS(pwrite_args), (sy_call_t *)pwrite, AUE_NULL }, /* 174 = pwrite */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 175 = nosys */
- { SYF_MPSAFE | AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime, AUE_NULL }, /* 176 = ntp_adjtime */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 177 = sfork */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 178 = getdescriptor */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 179 = setdescriptor */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 180 = nosys */
- { SYF_MPSAFE | AS(setgid_args), (sy_call_t *)setgid, AUE_NULL }, /* 181 = setgid */
- { SYF_MPSAFE | AS(setegid_args), (sy_call_t *)setegid, AUE_NULL }, /* 182 = setegid */
- { SYF_MPSAFE | AS(seteuid_args), (sy_call_t *)seteuid, AUE_NULL }, /* 183 = seteuid */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 184 = lfs_bmapv */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 185 = lfs_markv */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 186 = lfs_segclean */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 187 = lfs_segwait */
- { SYF_MPSAFE | AS(stat_args), (sy_call_t *)stat, AUE_NULL }, /* 188 = stat */
- { SYF_MPSAFE | AS(fstat_args), (sy_call_t *)fstat, AUE_NULL }, /* 189 = fstat */
- { SYF_MPSAFE | AS(lstat_args), (sy_call_t *)lstat, AUE_NULL }, /* 190 = lstat */
- { SYF_MPSAFE | AS(pathconf_args), (sy_call_t *)pathconf, AUE_NULL }, /* 191 = pathconf */
- { SYF_MPSAFE | AS(fpathconf_args), (sy_call_t *)fpathconf, AUE_NULL }, /* 192 = fpathconf */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 193 = nosys */
- { SYF_MPSAFE | AS(__getrlimit_args), (sy_call_t *)getrlimit, AUE_NULL }, /* 194 = getrlimit */
- { SYF_MPSAFE | AS(__setrlimit_args), (sy_call_t *)setrlimit, AUE_NULL }, /* 195 = setrlimit */
- { SYF_MPSAFE | AS(getdirentries_args), (sy_call_t *)getdirentries, AUE_NULL }, /* 196 = getdirentries */
- { SYF_MPSAFE | AS(mmap_args), (sy_call_t *)mmap, AUE_NULL }, /* 197 = mmap */
- { SYF_MPSAFE | 0, (sy_call_t *)nosys, AUE_NULL }, /* 198 = __syscall */
- { SYF_MPSAFE | AS(lseek_args), (sy_call_t *)lseek, AUE_NULL }, /* 199 = lseek */
- { SYF_MPSAFE | AS(truncate_args), (sy_call_t *)truncate, AUE_NULL }, /* 200 = truncate */
- { SYF_MPSAFE | AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_NULL }, /* 201 = ftruncate */
- { SYF_MPSAFE | AS(sysctl_args), (sy_call_t *)__sysctl, AUE_NULL }, /* 202 = __sysctl */
- { SYF_MPSAFE | AS(mlock_args), (sy_call_t *)mlock, AUE_NULL }, /* 203 = mlock */
- { SYF_MPSAFE | AS(munlock_args), (sy_call_t *)munlock, AUE_NULL }, /* 204 = munlock */
- { SYF_MPSAFE | AS(undelete_args), (sy_call_t *)undelete, AUE_NULL }, /* 205 = undelete */
- { SYF_MPSAFE | AS(futimes_args), (sy_call_t *)futimes, AUE_NULL }, /* 206 = futimes */
- { SYF_MPSAFE | AS(getpgid_args), (sy_call_t *)getpgid, AUE_NULL }, /* 207 = getpgid */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 208 = newreboot */
- { SYF_MPSAFE | AS(poll_args), (sy_call_t *)poll, AUE_NULL }, /* 209 = poll */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 210 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 211 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 212 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 213 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 214 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 215 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 216 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 217 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 218 = lkmnosys */
- { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL }, /* 219 = lkmnosys */
- { SYF_MPSAFE | AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 220 = __semctl */
- { SYF_MPSAFE | AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 221 = semget */
- { SYF_MPSAFE | AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 222 = semop */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 223 = semconfig */
- { SYF_MPSAFE | AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 224 = msgctl */
- { SYF_MPSAFE | AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 225 = msgget */
- { SYF_MPSAFE | AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 226 = msgsnd */
- { SYF_MPSAFE | AS(msgrcv_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 227 = msgrcv */
- { SYF_MPSAFE | AS(shmat_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 228 = shmat */
- { SYF_MPSAFE | AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 229 = shmctl */
- { SYF_MPSAFE | AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 230 = shmdt */
- { SYF_MPSAFE | AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 231 = shmget */
- { SYF_MPSAFE | AS(clock_gettime_args), (sy_call_t *)clock_gettime, AUE_NULL }, /* 232 = clock_gettime */
- { SYF_MPSAFE | AS(clock_settime_args), (sy_call_t *)clock_settime, AUE_NULL }, /* 233 = clock_settime */
- { SYF_MPSAFE | AS(clock_getres_args), (sy_call_t *)clock_getres, AUE_NULL }, /* 234 = clock_getres */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 235 = timer_create */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 236 = timer_delete */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 237 = timer_settime */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 238 = timer_gettime */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 239 = timer_getoverrun */
- { SYF_MPSAFE | AS(nanosleep_args), (sy_call_t *)nanosleep, AUE_NULL }, /* 240 = nanosleep */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 241 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 242 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 243 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 244 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 245 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 246 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 247 = nosys */
- { SYF_MPSAFE | AS(ntp_gettime_args), (sy_call_t *)ntp_gettime, AUE_NULL }, /* 248 = ntp_gettime */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 249 = nosys */
- { SYF_MPSAFE | AS(minherit_args), (sy_call_t *)minherit, AUE_NULL }, /* 250 = minherit */
- { SYF_MPSAFE | AS(rfork_args), (sy_call_t *)rfork, AUE_NULL }, /* 251 = rfork */
- { SYF_MPSAFE | AS(openbsd_poll_args), (sy_call_t *)openbsd_poll, AUE_NULL }, /* 252 = openbsd_poll */
- { SYF_MPSAFE | 0, (sy_call_t *)issetugid, AUE_NULL }, /* 253 = issetugid */
- { SYF_MPSAFE | AS(lchown_args), (sy_call_t *)lchown, AUE_NULL }, /* 254 = lchown */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 255 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 256 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 257 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 258 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 259 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 260 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 261 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 262 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 263 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 264 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 265 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 266 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 267 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 268 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 269 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 270 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 271 = nosys */
- { SYF_MPSAFE | AS(getdents_args), (sy_call_t *)getdents, AUE_NULL }, /* 272 = getdents */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 273 = nosys */
- { SYF_MPSAFE | AS(lchmod_args), (sy_call_t *)lchmod, AUE_NULL }, /* 274 = lchmod */
- { SYF_MPSAFE | AS(lchown_args), (sy_call_t *)lchown, AUE_NULL }, /* 275 = netbsd_lchown */
- { SYF_MPSAFE | AS(lutimes_args), (sy_call_t *)lutimes, AUE_NULL }, /* 276 = lutimes */
- { SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync, AUE_NULL }, /* 277 = netbsd_msync */
- { SYF_MPSAFE | AS(nstat_args), (sy_call_t *)nstat, AUE_NULL }, /* 278 = nstat */
- { SYF_MPSAFE | AS(nfstat_args), (sy_call_t *)nfstat, AUE_NULL }, /* 279 = nfstat */
- { SYF_MPSAFE | AS(nlstat_args), (sy_call_t *)nlstat, AUE_NULL }, /* 280 = nlstat */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 281 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 282 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 283 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 284 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 285 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 286 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 287 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 288 = nosys */
- { SYF_MPSAFE | AS(preadv_args), (sy_call_t *)preadv, AUE_NULL }, /* 289 = preadv */
- { SYF_MPSAFE | AS(pwritev_args), (sy_call_t *)pwritev, AUE_NULL }, /* 290 = pwritev */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 291 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 292 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 293 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 294 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 295 = nosys */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 296 = nosys */
- { compat4(SYF_MPSAFE | AS(freebsd4_fhstatfs_args),fhstatfs), AUE_NULL }, /* 297 = old fhstatfs */
- { SYF_MPSAFE | AS(fhopen_args), (sy_call_t *)fhopen, AUE_NULL }, /* 298 = fhopen */
- { SYF_MPSAFE | AS(fhstat_args), (sy_call_t *)fhstat, AUE_NULL }, /* 299 = fhstat */
- { SYF_MPSAFE | AS(modnext_args), (sy_call_t *)modnext, AUE_NULL }, /* 300 = modnext */
- { SYF_MPSAFE | AS(modstat_args), (sy_call_t *)modstat, AUE_NULL }, /* 301 = modstat */
- { SYF_MPSAFE | AS(modfnext_args), (sy_call_t *)modfnext, AUE_NULL }, /* 302 = modfnext */
- { SYF_MPSAFE | AS(modfind_args), (sy_call_t *)modfind, AUE_NULL }, /* 303 = modfind */
- { SYF_MPSAFE | AS(kldload_args), (sy_call_t *)kldload, AUE_NULL }, /* 304 = kldload */
- { SYF_MPSAFE | AS(kldunload_args), (sy_call_t *)kldunload, AUE_NULL }, /* 305 = kldunload */
- { SYF_MPSAFE | AS(kldfind_args), (sy_call_t *)kldfind, AUE_NULL }, /* 306 = kldfind */
- { SYF_MPSAFE | AS(kldnext_args), (sy_call_t *)kldnext, AUE_NULL }, /* 307 = kldnext */
- { SYF_MPSAFE | AS(kldstat_args), (sy_call_t *)kldstat, AUE_NULL }, /* 308 = kldstat */
- { SYF_MPSAFE | AS(kldfirstmod_args), (sy_call_t *)kldfirstmod, AUE_NULL }, /* 309 = kldfirstmod */
- { SYF_MPSAFE | AS(getsid_args), (sy_call_t *)getsid, AUE_NULL }, /* 310 = getsid */
- { SYF_MPSAFE | AS(setresuid_args), (sy_call_t *)setresuid, AUE_NULL }, /* 311 = setresuid */
- { SYF_MPSAFE | AS(setresgid_args), (sy_call_t *)setresgid, AUE_NULL }, /* 312 = setresgid */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 313 = obsolete signanosleep */
- { AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 314 = aio_return */
- { AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 315 = aio_suspend */
- { AS(aio_cancel_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 316 = aio_cancel */
- { AS(aio_error_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 317 = aio_error */
- { AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 318 = aio_read */
- { AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 319 = aio_write */
- { AS(lio_listio_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 320 = lio_listio */
- { SYF_MPSAFE | 0, (sy_call_t *)yield, AUE_NULL }, /* 321 = yield */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 322 = obsolete thr_sleep */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 323 = obsolete thr_wakeup */
- { SYF_MPSAFE | AS(mlockall_args), (sy_call_t *)mlockall, AUE_NULL }, /* 324 = mlockall */
- { SYF_MPSAFE | 0, (sy_call_t *)munlockall, AUE_NULL }, /* 325 = munlockall */
- { SYF_MPSAFE | AS(__getcwd_args), (sy_call_t *)__getcwd, AUE_NULL }, /* 326 = __getcwd */
- { SYF_MPSAFE | AS(sched_setparam_args), (sy_call_t *)sched_setparam, AUE_NULL }, /* 327 = sched_setparam */
- { SYF_MPSAFE | AS(sched_getparam_args), (sy_call_t *)sched_getparam, AUE_NULL }, /* 328 = sched_getparam */
- { SYF_MPSAFE | AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler, AUE_NULL }, /* 329 = sched_setscheduler */
- { SYF_MPSAFE | AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler, AUE_NULL }, /* 330 = sched_getscheduler */
- { SYF_MPSAFE | 0, (sy_call_t *)sched_yield, AUE_NULL }, /* 331 = sched_yield */
- { SYF_MPSAFE | AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max, AUE_NULL }, /* 332 = sched_get_priority_max */
- { SYF_MPSAFE | AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min, AUE_NULL }, /* 333 = sched_get_priority_min */
- { SYF_MPSAFE | AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval, AUE_NULL }, /* 334 = sched_rr_get_interval */
- { SYF_MPSAFE | AS(utrace_args), (sy_call_t *)utrace, AUE_NULL }, /* 335 = utrace */
- { compat4(SYF_MPSAFE | AS(freebsd4_sendfile_args),sendfile), AUE_NULL }, /* 336 = old sendfile */
- { SYF_MPSAFE | AS(kldsym_args), (sy_call_t *)kldsym, AUE_NULL }, /* 337 = kldsym */
- { SYF_MPSAFE | AS(jail_args), (sy_call_t *)jail, AUE_NULL }, /* 338 = jail */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 339 = pioctl */
- { SYF_MPSAFE | AS(sigprocmask_args), (sy_call_t *)sigprocmask, AUE_NULL }, /* 340 = sigprocmask */
- { SYF_MPSAFE | AS(sigsuspend_args), (sy_call_t *)sigsuspend, AUE_NULL }, /* 341 = sigsuspend */
- { compat4(SYF_MPSAFE | AS(freebsd4_sigaction_args),sigaction), AUE_NULL }, /* 342 = old sigaction */
- { SYF_MPSAFE | AS(sigpending_args), (sy_call_t *)sigpending, AUE_NULL }, /* 343 = sigpending */
- { compat4(SYF_MPSAFE | AS(freebsd4_sigreturn_args),sigreturn), AUE_NULL }, /* 344 = old sigreturn */
- { SYF_MPSAFE | AS(sigtimedwait_args), (sy_call_t *)sigtimedwait, AUE_NULL }, /* 345 = sigtimedwait */
- { SYF_MPSAFE | AS(sigwaitinfo_args), (sy_call_t *)sigwaitinfo, AUE_NULL }, /* 346 = sigwaitinfo */
- { SYF_MPSAFE | AS(__acl_get_file_args), (sy_call_t *)__acl_get_file, AUE_NULL }, /* 347 = __acl_get_file */
- { SYF_MPSAFE | AS(__acl_set_file_args), (sy_call_t *)__acl_set_file, AUE_NULL }, /* 348 = __acl_set_file */
- { SYF_MPSAFE | AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd, AUE_NULL }, /* 349 = __acl_get_fd */
- { SYF_MPSAFE | AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd, AUE_NULL }, /* 350 = __acl_set_fd */
- { SYF_MPSAFE | AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file, AUE_NULL }, /* 351 = __acl_delete_file */
- { SYF_MPSAFE | AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd, AUE_NULL }, /* 352 = __acl_delete_fd */
- { SYF_MPSAFE | AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file, AUE_NULL }, /* 353 = __acl_aclcheck_file */
- { SYF_MPSAFE | AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd, AUE_NULL }, /* 354 = __acl_aclcheck_fd */
- { SYF_MPSAFE | AS(extattrctl_args), (sy_call_t *)extattrctl, AUE_NULL }, /* 355 = extattrctl */
- { SYF_MPSAFE | AS(extattr_set_file_args), (sy_call_t *)extattr_set_file, AUE_NULL }, /* 356 = extattr_set_file */
- { SYF_MPSAFE | AS(extattr_get_file_args), (sy_call_t *)extattr_get_file, AUE_NULL }, /* 357 = extattr_get_file */
- { SYF_MPSAFE | AS(extattr_delete_file_args), (sy_call_t *)extattr_delete_file, AUE_NULL }, /* 358 = extattr_delete_file */
- { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 359 = aio_waitcomplete */
- { SYF_MPSAFE | AS(getresuid_args), (sy_call_t *)getresuid, AUE_NULL }, /* 360 = getresuid */
- { SYF_MPSAFE | AS(getresgid_args), (sy_call_t *)getresgid, AUE_NULL }, /* 361 = getresgid */
- { SYF_MPSAFE | 0, (sy_call_t *)kqueue, AUE_NULL }, /* 362 = kqueue */
- { SYF_MPSAFE | AS(kevent_args), (sy_call_t *)kevent, AUE_NULL }, /* 363 = kevent */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 364 = __cap_get_proc */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 365 = __cap_set_proc */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 366 = __cap_get_fd */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 367 = __cap_get_file */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 368 = __cap_set_fd */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 369 = __cap_set_file */
- { AS(nosys_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 370 = lkmressys */
- { SYF_MPSAFE | AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd, AUE_NULL }, /* 371 = extattr_set_fd */
- { SYF_MPSAFE | AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd, AUE_NULL }, /* 372 = extattr_get_fd */
- { SYF_MPSAFE | AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd, AUE_NULL }, /* 373 = extattr_delete_fd */
- { SYF_MPSAFE | AS(__setugid_args), (sy_call_t *)__setugid, AUE_NULL }, /* 374 = __setugid */
- { AS(nfsclnt_args), (sy_call_t *)nosys, AUE_NULL }, /* 375 = nfsclnt */
- { SYF_MPSAFE | AS(eaccess_args), (sy_call_t *)eaccess, AUE_NULL }, /* 376 = eaccess */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 377 = afs_syscall */
- { AS(nmount_args), (sy_call_t *)nmount, AUE_NULL }, /* 378 = nmount */
- { SYF_MPSAFE | 0, (sy_call_t *)kse_exit, AUE_NULL }, /* 379 = kse_exit */
- { SYF_MPSAFE | AS(kse_wakeup_args), (sy_call_t *)kse_wakeup, AUE_NULL }, /* 380 = kse_wakeup */
- { SYF_MPSAFE | AS(kse_create_args), (sy_call_t *)kse_create, AUE_NULL }, /* 381 = kse_create */
- { SYF_MPSAFE | AS(kse_thr_interrupt_args), (sy_call_t *)kse_thr_interrupt, AUE_NULL }, /* 382 = kse_thr_interrupt */
- { SYF_MPSAFE | AS(kse_release_args), (sy_call_t *)kse_release, AUE_NULL }, /* 383 = kse_release */
- { SYF_MPSAFE | AS(__mac_get_proc_args), (sy_call_t *)__mac_get_proc, AUE_NULL }, /* 384 = __mac_get_proc */
- { SYF_MPSAFE | AS(__mac_set_proc_args), (sy_call_t *)__mac_set_proc, AUE_NULL }, /* 385 = __mac_set_proc */
- { SYF_MPSAFE | AS(__mac_get_fd_args), (sy_call_t *)__mac_get_fd, AUE_NULL }, /* 386 = __mac_get_fd */
- { SYF_MPSAFE | AS(__mac_get_file_args), (sy_call_t *)__mac_get_file, AUE_NULL }, /* 387 = __mac_get_file */
- { SYF_MPSAFE | AS(__mac_set_fd_args), (sy_call_t *)__mac_set_fd, AUE_NULL }, /* 388 = __mac_set_fd */
- { SYF_MPSAFE | AS(__mac_set_file_args), (sy_call_t *)__mac_set_file, AUE_NULL }, /* 389 = __mac_set_file */
- { SYF_MPSAFE | AS(kenv_args), (sy_call_t *)kenv, AUE_NULL }, /* 390 = kenv */
- { SYF_MPSAFE | AS(lchflags_args), (sy_call_t *)lchflags, AUE_NULL }, /* 391 = lchflags */
- { SYF_MPSAFE | AS(uuidgen_args), (sy_call_t *)uuidgen, AUE_NULL }, /* 392 = uuidgen */
- { SYF_MPSAFE | AS(sendfile_args), (sy_call_t *)sendfile, AUE_NULL }, /* 393 = sendfile */
- { SYF_MPSAFE | AS(mac_syscall_args), (sy_call_t *)mac_syscall, AUE_NULL }, /* 394 = mac_syscall */
- { SYF_MPSAFE | AS(getfsstat_args), (sy_call_t *)getfsstat, AUE_NULL }, /* 395 = getfsstat */
- { SYF_MPSAFE | AS(statfs_args), (sy_call_t *)statfs, AUE_NULL }, /* 396 = statfs */
- { SYF_MPSAFE | AS(fstatfs_args), (sy_call_t *)fstatfs, AUE_NULL }, /* 397 = fstatfs */
- { SYF_MPSAFE | AS(fhstatfs_args), (sy_call_t *)fhstatfs, AUE_NULL }, /* 398 = fhstatfs */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 399 = nosys */
- { SYF_MPSAFE | AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 400 = ksem_close */
- { SYF_MPSAFE | AS(ksem_post_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 401 = ksem_post */
- { SYF_MPSAFE | AS(ksem_wait_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 402 = ksem_wait */
- { SYF_MPSAFE | AS(ksem_trywait_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 403 = ksem_trywait */
- { SYF_MPSAFE | AS(ksem_init_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 404 = ksem_init */
- { SYF_MPSAFE | AS(ksem_open_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 405 = ksem_open */
- { SYF_MPSAFE | AS(ksem_unlink_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 406 = ksem_unlink */
- { SYF_MPSAFE | AS(ksem_getvalue_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 407 = ksem_getvalue */
- { SYF_MPSAFE | AS(ksem_destroy_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 408 = ksem_destroy */
- { SYF_MPSAFE | AS(__mac_get_pid_args), (sy_call_t *)__mac_get_pid, AUE_NULL }, /* 409 = __mac_get_pid */
- { SYF_MPSAFE | AS(__mac_get_link_args), (sy_call_t *)__mac_get_link, AUE_NULL }, /* 410 = __mac_get_link */
- { SYF_MPSAFE | AS(__mac_set_link_args), (sy_call_t *)__mac_set_link, AUE_NULL }, /* 411 = __mac_set_link */
- { SYF_MPSAFE | AS(extattr_set_link_args), (sy_call_t *)extattr_set_link, AUE_NULL }, /* 412 = extattr_set_link */
- { SYF_MPSAFE | AS(extattr_get_link_args), (sy_call_t *)extattr_get_link, AUE_NULL }, /* 413 = extattr_get_link */
- { SYF_MPSAFE | AS(extattr_delete_link_args), (sy_call_t *)extattr_delete_link, AUE_NULL }, /* 414 = extattr_delete_link */
- { SYF_MPSAFE | AS(__mac_execve_args), (sy_call_t *)__mac_execve, AUE_NULL }, /* 415 = __mac_execve */
- { SYF_MPSAFE | AS(sigaction_args), (sy_call_t *)sigaction, AUE_NULL }, /* 416 = sigaction */
- { SYF_MPSAFE | AS(sigreturn_args), (sy_call_t *)sigreturn, AUE_NULL }, /* 417 = sigreturn */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 418 = __xstat */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 419 = __xfstat */
- { 0, (sy_call_t *)nosys, AUE_NULL }, /* 420 = __xlstat */
- { SYF_MPSAFE | AS(getcontext_args), (sy_call_t *)getcontext, AUE_NULL }, /* 421 = getcontext */
- { SYF_MPSAFE | AS(setcontext_args), (sy_call_t *)setcontext, AUE_NULL }, /* 422 = setcontext */
- { SYF_MPSAFE | AS(swapcontext_args), (sy_call_t *)swapcontext, AUE_NULL }, /* 423 = swapcontext */
- { SYF_MPSAFE | AS(swapoff_args), (sy_call_t *)swapoff, AUE_NULL }, /* 424 = swapoff */
- { SYF_MPSAFE | AS(__acl_get_link_args), (sy_call_t *)__acl_get_link, AUE_NULL }, /* 425 = __acl_get_link */
- { SYF_MPSAFE | AS(__acl_set_link_args), (sy_call_t *)__acl_set_link, AUE_NULL }, /* 426 = __acl_set_link */
- { SYF_MPSAFE | AS(__acl_delete_link_args), (sy_call_t *)__acl_delete_link, AUE_NULL }, /* 427 = __acl_delete_link */
- { SYF_MPSAFE | AS(__acl_aclcheck_link_args), (sy_call_t *)__acl_aclcheck_link, AUE_NULL }, /* 428 = __acl_aclcheck_link */
- { SYF_MPSAFE | AS(sigwait_args), (sy_call_t *)sigwait, AUE_NULL }, /* 429 = sigwait */
- { SYF_MPSAFE | AS(thr_create_args), (sy_call_t *)thr_create, AUE_NULL }, /* 430 = thr_create */
- { SYF_MPSAFE | AS(thr_exit_args), (sy_call_t *)thr_exit, AUE_NULL }, /* 431 = thr_exit */
- { SYF_MPSAFE | AS(thr_self_args), (sy_call_t *)thr_self, AUE_NULL }, /* 432 = thr_self */
- { SYF_MPSAFE | AS(thr_kill_args), (sy_call_t *)thr_kill, AUE_NULL }, /* 433 = thr_kill */
- { SYF_MPSAFE | AS(_umtx_lock_args), (sy_call_t *)_umtx_lock, AUE_NULL }, /* 434 = _umtx_lock */
- { SYF_MPSAFE | AS(_umtx_unlock_args), (sy_call_t *)_umtx_unlock, AUE_NULL }, /* 435 = _umtx_unlock */
- { SYF_MPSAFE | AS(jail_attach_args), (sy_call_t *)jail_attach, AUE_NULL }, /* 436 = jail_attach */
- { SYF_MPSAFE | AS(extattr_list_fd_args), (sy_call_t *)extattr_list_fd, AUE_NULL }, /* 437 = extattr_list_fd */
- { SYF_MPSAFE | AS(extattr_list_file_args), (sy_call_t *)extattr_list_file, AUE_NULL }, /* 438 = extattr_list_file */
- { SYF_MPSAFE | AS(extattr_list_link_args), (sy_call_t *)extattr_list_link, AUE_NULL }, /* 439 = extattr_list_link */
- { SYF_MPSAFE | AS(kse_switchin_args), (sy_call_t *)kse_switchin, AUE_NULL }, /* 440 = kse_switchin */
- { SYF_MPSAFE | AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL }, /* 441 = ksem_timedwait */
- { SYF_MPSAFE | AS(thr_suspend_args), (sy_call_t *)thr_suspend, AUE_NULL }, /* 442 = thr_suspend */
- { SYF_MPSAFE | AS(thr_wake_args), (sy_call_t *)thr_wake, AUE_NULL }, /* 443 = thr_wake */
- { SYF_MPSAFE | AS(kldunloadf_args), (sy_call_t *)kldunloadf, AUE_NULL }, /* 444 = kldunloadf */
- { SYF_MPSAFE | AS(audit_args), (sy_call_t *)audit, AUE_NULL }, /* 445 = audit */
- { SYF_MPSAFE | AS(auditon_args), (sy_call_t *)auditon, AUE_NULL }, /* 446 = auditon */
- { SYF_MPSAFE | AS(getauid_args), (sy_call_t *)getauid, AUE_NULL }, /* 447 = getauid */
- { SYF_MPSAFE | AS(setauid_args), (sy_call_t *)setauid, AUE_NULL }, /* 448 = setauid */
- { SYF_MPSAFE | AS(getaudit_args), (sy_call_t *)getaudit, AUE_NULL }, /* 449 = getaudit */
- { SYF_MPSAFE | AS(setaudit_args), (sy_call_t *)setaudit, AUE_NULL }, /* 450 = setaudit */
- { SYF_MPSAFE | AS(getaudit_addr_args), (sy_call_t *)getaudit_addr, AUE_NULL }, /* 451 = getaudit_addr */
- { SYF_MPSAFE | AS(setaudit_addr_args), (sy_call_t *)setaudit_addr, AUE_NULL }, /* 452 = setaudit_addr */
- { SYF_MPSAFE | AS(auditctl_args), (sy_call_t *)auditctl, AUE_NULL }, /* 453 = auditctl */
- { SYF_MPSAFE | AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL }, /* 454 = _umtx_op */
- { SYF_MPSAFE | AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL }, /* 455 = thr_new */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 0 = syscall */
+ { AS(sys_exit_args), (sy_call_t *)sys_exit, AUE_EXIT, NULL, 0, 0 }, /* 1 = exit */
+ { 0, (sy_call_t *)fork, AUE_FORK, NULL, 0, 0 }, /* 2 = fork */
+ { AS(read_args), (sy_call_t *)read, AUE_NULL, NULL, 0, 0 }, /* 3 = read */
+ { AS(write_args), (sy_call_t *)write, AUE_NULL, NULL, 0, 0 }, /* 4 = write */
+ { AS(open_args), (sy_call_t *)open, AUE_OPEN_RWTC, NULL, 0, 0 }, /* 5 = open */
+ { AS(close_args), (sy_call_t *)close, AUE_CLOSE, NULL, 0, 0 }, /* 6 = close */
+ { AS(wait_args), (sy_call_t *)wait4, AUE_WAIT4, NULL, 0, 0 }, /* 7 = wait4 */
+ { compat(AS(ocreat_args),creat), AUE_CREAT, NULL, 0, 0 }, /* 8 = old creat */
+ { AS(link_args), (sy_call_t *)link, AUE_LINK, NULL, 0, 0 }, /* 9 = link */
+ { AS(unlink_args), (sy_call_t *)unlink, AUE_UNLINK, NULL, 0, 0 }, /* 10 = unlink */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 11 = obsolete execv */
+ { AS(chdir_args), (sy_call_t *)chdir, AUE_CHDIR, NULL, 0, 0 }, /* 12 = chdir */
+ { AS(fchdir_args), (sy_call_t *)fchdir, AUE_FCHDIR, NULL, 0, 0 }, /* 13 = fchdir */
+ { AS(mknod_args), (sy_call_t *)mknod, AUE_MKNOD, NULL, 0, 0 }, /* 14 = mknod */
+ { AS(chmod_args), (sy_call_t *)chmod, AUE_CHMOD, NULL, 0, 0 }, /* 15 = chmod */
+ { AS(chown_args), (sy_call_t *)chown, AUE_CHOWN, NULL, 0, 0 }, /* 16 = chown */
+ { AS(obreak_args), (sy_call_t *)obreak, AUE_NULL, NULL, 0, 0 }, /* 17 = break */
+ { compat4(AS(freebsd4_getfsstat_args),getfsstat), AUE_GETFSSTAT, NULL, 0, 0 }, /* 18 = old getfsstat */
+ { compat(AS(olseek_args),lseek), AUE_LSEEK, NULL, 0, 0 }, /* 19 = old lseek */
+ { 0, (sy_call_t *)getpid, AUE_GETPID, NULL, 0, 0 }, /* 20 = getpid */
+ { AS(mount_args), (sy_call_t *)mount, AUE_MOUNT, NULL, 0, 0 }, /* 21 = mount */
+ { AS(unmount_args), (sy_call_t *)unmount, AUE_UMOUNT, NULL, 0, 0 }, /* 22 = unmount */
+ { AS(setuid_args), (sy_call_t *)setuid, AUE_SETUID, NULL, 0, 0 }, /* 23 = setuid */
+ { 0, (sy_call_t *)getuid, AUE_GETUID, NULL, 0, 0 }, /* 24 = getuid */
+ { 0, (sy_call_t *)geteuid, AUE_GETEUID, NULL, 0, 0 }, /* 25 = geteuid */
+ { AS(ptrace_args), (sy_call_t *)ptrace, AUE_PTRACE, NULL, 0, 0 }, /* 26 = ptrace */
+ { AS(recvmsg_args), (sy_call_t *)recvmsg, AUE_RECVMSG, NULL, 0, 0 }, /* 27 = recvmsg */
+ { AS(sendmsg_args), (sy_call_t *)sendmsg, AUE_SENDMSG, NULL, 0, 0 }, /* 28 = sendmsg */
+ { AS(recvfrom_args), (sy_call_t *)recvfrom, AUE_RECVFROM, NULL, 0, 0 }, /* 29 = recvfrom */
+ { AS(accept_args), (sy_call_t *)accept, AUE_ACCEPT, NULL, 0, 0 }, /* 30 = accept */
+ { AS(getpeername_args), (sy_call_t *)getpeername, AUE_GETPEERNAME, NULL, 0, 0 }, /* 31 = getpeername */
+ { AS(getsockname_args), (sy_call_t *)getsockname, AUE_GETSOCKNAME, NULL, 0, 0 }, /* 32 = getsockname */
+ { AS(access_args), (sy_call_t *)access, AUE_ACCESS, NULL, 0, 0 }, /* 33 = access */
+ { AS(chflags_args), (sy_call_t *)chflags, AUE_CHFLAGS, NULL, 0, 0 }, /* 34 = chflags */
+ { AS(fchflags_args), (sy_call_t *)fchflags, AUE_FCHFLAGS, NULL, 0, 0 }, /* 35 = fchflags */
+ { 0, (sy_call_t *)sync, AUE_SYNC, NULL, 0, 0 }, /* 36 = sync */
+ { AS(kill_args), (sy_call_t *)kill, AUE_KILL, NULL, 0, 0 }, /* 37 = kill */
+ { compat(AS(ostat_args),stat), AUE_STAT, NULL, 0, 0 }, /* 38 = old stat */
+ { 0, (sy_call_t *)getppid, AUE_GETPPID, NULL, 0, 0 }, /* 39 = getppid */
+ { compat(AS(olstat_args),lstat), AUE_LSTAT, NULL, 0, 0 }, /* 40 = old lstat */
+ { AS(dup_args), (sy_call_t *)dup, AUE_DUP, NULL, 0, 0 }, /* 41 = dup */
+ { 0, (sy_call_t *)pipe, AUE_PIPE, NULL, 0, 0 }, /* 42 = pipe */
+ { 0, (sy_call_t *)getegid, AUE_GETEGID, NULL, 0, 0 }, /* 43 = getegid */
+ { AS(profil_args), (sy_call_t *)profil, AUE_PROFILE, NULL, 0, 0 }, /* 44 = profil */
+ { AS(ktrace_args), (sy_call_t *)ktrace, AUE_KTRACE, NULL, 0, 0 }, /* 45 = ktrace */
+ { compat(AS(osigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0 }, /* 46 = old sigaction */
+ { 0, (sy_call_t *)getgid, AUE_GETGID, NULL, 0, 0 }, /* 47 = getgid */
+ { compat(AS(osigprocmask_args),sigprocmask), AUE_SIGPROCMASK, NULL, 0, 0 }, /* 48 = old sigprocmask */
+ { AS(getlogin_args), (sy_call_t *)getlogin, AUE_GETLOGIN, NULL, 0, 0 }, /* 49 = getlogin */
+ { AS(setlogin_args), (sy_call_t *)setlogin, AUE_SETLOGIN, NULL, 0, 0 }, /* 50 = setlogin */
+ { AS(acct_args), (sy_call_t *)acct, AUE_ACCT, NULL, 0, 0 }, /* 51 = acct */
+ { compat(0,sigpending), AUE_SIGPENDING, NULL, 0, 0 }, /* 52 = old sigpending */
+ { AS(sigaltstack_args), (sy_call_t *)sigaltstack, AUE_SIGALTSTACK, NULL, 0, 0 }, /* 53 = sigaltstack */
+ { AS(ioctl_args), (sy_call_t *)ioctl, AUE_IOCTL, NULL, 0, 0 }, /* 54 = ioctl */
+ { AS(reboot_args), (sy_call_t *)reboot, AUE_REBOOT, NULL, 0, 0 }, /* 55 = reboot */
+ { AS(revoke_args), (sy_call_t *)revoke, AUE_REVOKE, NULL, 0, 0 }, /* 56 = revoke */
+ { AS(symlink_args), (sy_call_t *)symlink, AUE_SYMLINK, NULL, 0, 0 }, /* 57 = symlink */
+ { AS(readlink_args), (sy_call_t *)readlink, AUE_READLINK, NULL, 0, 0 }, /* 58 = readlink */
+ { AS(execve_args), (sy_call_t *)execve, AUE_EXECVE, NULL, 0, 0 }, /* 59 = execve */
+ { AS(umask_args), (sy_call_t *)umask, AUE_UMASK, NULL, 0, 0 }, /* 60 = umask */
+ { AS(chroot_args), (sy_call_t *)chroot, AUE_CHROOT, NULL, 0, 0 }, /* 61 = chroot */
+ { compat(AS(ofstat_args),fstat), AUE_FSTAT, NULL, 0, 0 }, /* 62 = old fstat */
+ { compat(AS(getkerninfo_args),getkerninfo), AUE_NULL, NULL, 0, 0 }, /* 63 = old getkerninfo */
+ { compat(0,getpagesize), AUE_NULL, NULL, 0, 0 }, /* 64 = old getpagesize */
+ { AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0 }, /* 65 = msync */
+ { 0, (sy_call_t *)vfork, AUE_VFORK, NULL, 0, 0 }, /* 66 = vfork */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 67 = obsolete vread */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 68 = obsolete vwrite */
+ { AS(sbrk_args), (sy_call_t *)sbrk, AUE_SBRK, NULL, 0, 0 }, /* 69 = sbrk */
+ { AS(sstk_args), (sy_call_t *)sstk, AUE_SSTK, NULL, 0, 0 }, /* 70 = sstk */
+ { compat(AS(ommap_args),mmap), AUE_MMAP, NULL, 0, 0 }, /* 71 = old mmap */
+ { AS(ovadvise_args), (sy_call_t *)ovadvise, AUE_O_VADVISE, NULL, 0, 0 }, /* 72 = vadvise */
+ { AS(munmap_args), (sy_call_t *)munmap, AUE_MUNMAP, NULL, 0, 0 }, /* 73 = munmap */
+ { AS(mprotect_args), (sy_call_t *)mprotect, AUE_MPROTECT, NULL, 0, 0 }, /* 74 = mprotect */
+ { AS(madvise_args), (sy_call_t *)madvise, AUE_MADVISE, NULL, 0, 0 }, /* 75 = madvise */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 76 = obsolete vhangup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 77 = obsolete vlimit */
+ { AS(mincore_args), (sy_call_t *)mincore, AUE_MINCORE, NULL, 0, 0 }, /* 78 = mincore */
+ { AS(getgroups_args), (sy_call_t *)getgroups, AUE_GETGROUPS, NULL, 0, 0 }, /* 79 = getgroups */
+ { AS(setgroups_args), (sy_call_t *)setgroups, AUE_SETGROUPS, NULL, 0, 0 }, /* 80 = setgroups */
+ { 0, (sy_call_t *)getpgrp, AUE_GETPGRP, NULL, 0, 0 }, /* 81 = getpgrp */
+ { AS(setpgid_args), (sy_call_t *)setpgid, AUE_SETPGRP, NULL, 0, 0 }, /* 82 = setpgid */
+ { AS(setitimer_args), (sy_call_t *)setitimer, AUE_SETITIMER, NULL, 0, 0 }, /* 83 = setitimer */
+ { compat(0,wait), AUE_WAIT4, NULL, 0, 0 }, /* 84 = old wait */
+ { AS(swapon_args), (sy_call_t *)swapon, AUE_SWAPON, NULL, 0, 0 }, /* 85 = swapon */
+ { AS(getitimer_args), (sy_call_t *)getitimer, AUE_GETITIMER, NULL, 0, 0 }, /* 86 = getitimer */
+ { compat(AS(gethostname_args),gethostname), AUE_SYSCTL, NULL, 0, 0 }, /* 87 = old gethostname */
+ { compat(AS(sethostname_args),sethostname), AUE_SYSCTL, NULL, 0, 0 }, /* 88 = old sethostname */
+ { 0, (sy_call_t *)getdtablesize, AUE_GETDTABLESIZE, NULL, 0, 0 }, /* 89 = getdtablesize */
+ { AS(dup2_args), (sy_call_t *)dup2, AUE_DUP2, NULL, 0, 0 }, /* 90 = dup2 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 91 = getdopt */
+ { AS(fcntl_args), (sy_call_t *)fcntl, AUE_FCNTL, NULL, 0, 0 }, /* 92 = fcntl */
+ { AS(select_args), (sy_call_t *)select, AUE_SELECT, NULL, 0, 0 }, /* 93 = select */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 94 = setdopt */
+ { AS(fsync_args), (sy_call_t *)fsync, AUE_FSYNC, NULL, 0, 0 }, /* 95 = fsync */
+ { AS(setpriority_args), (sy_call_t *)setpriority, AUE_SETPRIORITY, NULL, 0, 0 }, /* 96 = setpriority */
+ { AS(socket_args), (sy_call_t *)socket, AUE_SOCKET, NULL, 0, 0 }, /* 97 = socket */
+ { AS(connect_args), (sy_call_t *)connect, AUE_CONNECT, NULL, 0, 0 }, /* 98 = connect */
+ { compat(AS(accept_args),accept), AUE_ACCEPT, NULL, 0, 0 }, /* 99 = old accept */
+ { AS(getpriority_args), (sy_call_t *)getpriority, AUE_GETPRIORITY, NULL, 0, 0 }, /* 100 = getpriority */
+ { compat(AS(osend_args),send), AUE_SEND, NULL, 0, 0 }, /* 101 = old send */
+ { compat(AS(orecv_args),recv), AUE_RECV, NULL, 0, 0 }, /* 102 = old recv */
+ { compat(AS(osigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0 }, /* 103 = old sigreturn */
+ { AS(bind_args), (sy_call_t *)bind, AUE_BIND, NULL, 0, 0 }, /* 104 = bind */
+ { AS(setsockopt_args), (sy_call_t *)setsockopt, AUE_SETSOCKOPT, NULL, 0, 0 }, /* 105 = setsockopt */
+ { AS(listen_args), (sy_call_t *)listen, AUE_LISTEN, NULL, 0, 0 }, /* 106 = listen */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 107 = obsolete vtimes */
+ { compat(AS(osigvec_args),sigvec), AUE_NULL, NULL, 0, 0 }, /* 108 = old sigvec */
+ { compat(AS(osigblock_args),sigblock), AUE_NULL, NULL, 0, 0 }, /* 109 = old sigblock */
+ { compat(AS(osigsetmask_args),sigsetmask), AUE_NULL, NULL, 0, 0 }, /* 110 = old sigsetmask */
+ { compat(AS(osigsuspend_args),sigsuspend), AUE_NULL, NULL, 0, 0 }, /* 111 = old sigsuspend */
+ { compat(AS(osigstack_args),sigstack), AUE_NULL, NULL, 0, 0 }, /* 112 = old sigstack */
+ { compat(AS(orecvmsg_args),recvmsg), AUE_RECVMSG, NULL, 0, 0 }, /* 113 = old recvmsg */
+ { compat(AS(osendmsg_args),sendmsg), AUE_SENDMSG, NULL, 0, 0 }, /* 114 = old sendmsg */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 115 = obsolete vtrace */
+ { AS(gettimeofday_args), (sy_call_t *)gettimeofday, AUE_GETTIMEOFDAY, NULL, 0, 0 }, /* 116 = gettimeofday */
+ { AS(getrusage_args), (sy_call_t *)getrusage, AUE_GETRUSAGE, NULL, 0, 0 }, /* 117 = getrusage */
+ { AS(getsockopt_args), (sy_call_t *)getsockopt, AUE_GETSOCKOPT, NULL, 0, 0 }, /* 118 = getsockopt */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 119 = resuba */
+ { AS(readv_args), (sy_call_t *)readv, AUE_READV, NULL, 0, 0 }, /* 120 = readv */
+ { AS(writev_args), (sy_call_t *)writev, AUE_WRITEV, NULL, 0, 0 }, /* 121 = writev */
+ { AS(settimeofday_args), (sy_call_t *)settimeofday, AUE_SETTIMEOFDAY, NULL, 0, 0 }, /* 122 = settimeofday */
+ { AS(fchown_args), (sy_call_t *)fchown, AUE_FCHOWN, NULL, 0, 0 }, /* 123 = fchown */
+ { AS(fchmod_args), (sy_call_t *)fchmod, AUE_FCHMOD, NULL, 0, 0 }, /* 124 = fchmod */
+ { compat(AS(recvfrom_args),recvfrom), AUE_RECVFROM, NULL, 0, 0 }, /* 125 = old recvfrom */
+ { AS(setreuid_args), (sy_call_t *)setreuid, AUE_SETREUID, NULL, 0, 0 }, /* 126 = setreuid */
+ { AS(setregid_args), (sy_call_t *)setregid, AUE_SETREGID, NULL, 0, 0 }, /* 127 = setregid */
+ { AS(rename_args), (sy_call_t *)rename, AUE_RENAME, NULL, 0, 0 }, /* 128 = rename */
+ { compat(AS(otruncate_args),truncate), AUE_TRUNCATE, NULL, 0, 0 }, /* 129 = old truncate */
+ { compat(AS(oftruncate_args),ftruncate), AUE_FTRUNCATE, NULL, 0, 0 }, /* 130 = old ftruncate */
+ { AS(flock_args), (sy_call_t *)flock, AUE_FLOCK, NULL, 0, 0 }, /* 131 = flock */
+ { AS(mkfifo_args), (sy_call_t *)mkfifo, AUE_MKFIFO, NULL, 0, 0 }, /* 132 = mkfifo */
+ { AS(sendto_args), (sy_call_t *)sendto, AUE_SENDTO, NULL, 0, 0 }, /* 133 = sendto */
+ { AS(shutdown_args), (sy_call_t *)shutdown, AUE_SHUTDOWN, NULL, 0, 0 }, /* 134 = shutdown */
+ { AS(socketpair_args), (sy_call_t *)socketpair, AUE_SOCKETPAIR, NULL, 0, 0 }, /* 135 = socketpair */
+ { AS(mkdir_args), (sy_call_t *)mkdir, AUE_MKDIR, NULL, 0, 0 }, /* 136 = mkdir */
+ { AS(rmdir_args), (sy_call_t *)rmdir, AUE_RMDIR, NULL, 0, 0 }, /* 137 = rmdir */
+ { AS(utimes_args), (sy_call_t *)utimes, AUE_UTIMES, NULL, 0, 0 }, /* 138 = utimes */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 139 = obsolete 4.2 sigreturn */
+ { AS(adjtime_args), (sy_call_t *)adjtime, AUE_ADJTIME, NULL, 0, 0 }, /* 140 = adjtime */
+ { compat(AS(ogetpeername_args),getpeername), AUE_GETPEERNAME, NULL, 0, 0 }, /* 141 = old getpeername */
+ { compat(0,gethostid), AUE_SYSCTL, NULL, 0, 0 }, /* 142 = old gethostid */
+ { compat(AS(osethostid_args),sethostid), AUE_SYSCTL, NULL, 0, 0 }, /* 143 = old sethostid */
+ { compat(AS(ogetrlimit_args),getrlimit), AUE_GETRLIMIT, NULL, 0, 0 }, /* 144 = old getrlimit */
+ { compat(AS(osetrlimit_args),setrlimit), AUE_SETRLIMIT, NULL, 0, 0 }, /* 145 = old setrlimit */
+ { compat(AS(okillpg_args),killpg), AUE_KILLPG, NULL, 0, 0 }, /* 146 = old killpg */
+ { 0, (sy_call_t *)setsid, AUE_SETSID, NULL, 0, 0 }, /* 147 = setsid */
+ { AS(quotactl_args), (sy_call_t *)quotactl, AUE_QUOTACTL, NULL, 0, 0 }, /* 148 = quotactl */
+ { compat(0,quota), AUE_O_QUOTA, NULL, 0, 0 }, /* 149 = old quota */
+ { compat(AS(getsockname_args),getsockname), AUE_GETSOCKNAME, NULL, 0, 0 }, /* 150 = old getsockname */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 151 = sem_lock */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 152 = sem_wakeup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 153 = asyncdaemon */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 154 = nosys */
+ { AS(nfssvc_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 155 = nfssvc */
+ { compat(AS(ogetdirentries_args),getdirentries), AUE_GETDIRENTRIES, NULL, 0, 0 }, /* 156 = old getdirentries */
+ { compat4(AS(freebsd4_statfs_args),statfs), AUE_STATFS, NULL, 0, 0 }, /* 157 = old statfs */
+ { compat4(AS(freebsd4_fstatfs_args),fstatfs), AUE_FSTATFS, NULL, 0, 0 }, /* 158 = old fstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 159 = nosys */
+ { AS(lgetfh_args), (sy_call_t *)lgetfh, AUE_LGETFH, NULL, 0, 0 }, /* 160 = lgetfh */
+ { AS(getfh_args), (sy_call_t *)getfh, AUE_NFS_GETFH, NULL, 0, 0 }, /* 161 = getfh */
+ { AS(getdomainname_args), (sy_call_t *)getdomainname, AUE_SYSCTL, NULL, 0, 0 }, /* 162 = getdomainname */
+ { AS(setdomainname_args), (sy_call_t *)setdomainname, AUE_SYSCTL, NULL, 0, 0 }, /* 163 = setdomainname */
+ { AS(uname_args), (sy_call_t *)uname, AUE_NULL, NULL, 0, 0 }, /* 164 = uname */
+ { AS(sysarch_args), (sy_call_t *)sysarch, AUE_SYSARCH, NULL, 0, 0 }, /* 165 = sysarch */
+ { AS(rtprio_args), (sy_call_t *)rtprio, AUE_RTPRIO, NULL, 0, 0 }, /* 166 = rtprio */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 167 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 168 = nosys */
+ { AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 169 = semsys */
+ { AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 170 = msgsys */
+ { AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 171 = shmsys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 172 = nosys */
+ { AS(freebsd6_pread_args), (sy_call_t *)freebsd6_pread, AUE_PREAD, NULL, 0, 0 }, /* 173 = freebsd6_pread */
+ { AS(freebsd6_pwrite_args), (sy_call_t *)freebsd6_pwrite, AUE_PWRITE, NULL, 0, 0 }, /* 174 = freebsd6_pwrite */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 175 = nosys */
+ { AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime, AUE_NTP_ADJTIME, NULL, 0, 0 }, /* 176 = ntp_adjtime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 177 = sfork */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 178 = getdescriptor */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 179 = setdescriptor */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 180 = nosys */
+ { AS(setgid_args), (sy_call_t *)setgid, AUE_SETGID, NULL, 0, 0 }, /* 181 = setgid */
+ { AS(setegid_args), (sy_call_t *)setegid, AUE_SETEGID, NULL, 0, 0 }, /* 182 = setegid */
+ { AS(seteuid_args), (sy_call_t *)seteuid, AUE_SETEUID, NULL, 0, 0 }, /* 183 = seteuid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 184 = lfs_bmapv */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 185 = lfs_markv */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 186 = lfs_segclean */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 187 = lfs_segwait */
+ { AS(stat_args), (sy_call_t *)stat, AUE_STAT, NULL, 0, 0 }, /* 188 = stat */
+ { AS(fstat_args), (sy_call_t *)fstat, AUE_FSTAT, NULL, 0, 0 }, /* 189 = fstat */
+ { AS(lstat_args), (sy_call_t *)lstat, AUE_LSTAT, NULL, 0, 0 }, /* 190 = lstat */
+ { AS(pathconf_args), (sy_call_t *)pathconf, AUE_PATHCONF, NULL, 0, 0 }, /* 191 = pathconf */
+ { AS(fpathconf_args), (sy_call_t *)fpathconf, AUE_FPATHCONF, NULL, 0, 0 }, /* 192 = fpathconf */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 193 = nosys */
+ { AS(__getrlimit_args), (sy_call_t *)getrlimit, AUE_GETRLIMIT, NULL, 0, 0 }, /* 194 = getrlimit */
+ { AS(__setrlimit_args), (sy_call_t *)setrlimit, AUE_SETRLIMIT, NULL, 0, 0 }, /* 195 = setrlimit */
+ { AS(getdirentries_args), (sy_call_t *)getdirentries, AUE_GETDIRENTRIES, NULL, 0, 0 }, /* 196 = getdirentries */
+ { AS(freebsd6_mmap_args), (sy_call_t *)freebsd6_mmap, AUE_MMAP, NULL, 0, 0 }, /* 197 = freebsd6_mmap */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 198 = __syscall */
+ { AS(freebsd6_lseek_args), (sy_call_t *)freebsd6_lseek, AUE_LSEEK, NULL, 0, 0 }, /* 199 = freebsd6_lseek */
+ { AS(freebsd6_truncate_args), (sy_call_t *)freebsd6_truncate, AUE_TRUNCATE, NULL, 0, 0 }, /* 200 = freebsd6_truncate */
+ { AS(freebsd6_ftruncate_args), (sy_call_t *)freebsd6_ftruncate, AUE_FTRUNCATE, NULL, 0, 0 }, /* 201 = freebsd6_ftruncate */
+ { AS(sysctl_args), (sy_call_t *)__sysctl, AUE_SYSCTL, NULL, 0, 0 }, /* 202 = __sysctl */
+ { AS(mlock_args), (sy_call_t *)mlock, AUE_MLOCK, NULL, 0, 0 }, /* 203 = mlock */
+ { AS(munlock_args), (sy_call_t *)munlock, AUE_MUNLOCK, NULL, 0, 0 }, /* 204 = munlock */
+ { AS(undelete_args), (sy_call_t *)undelete, AUE_UNDELETE, NULL, 0, 0 }, /* 205 = undelete */
+ { AS(futimes_args), (sy_call_t *)futimes, AUE_FUTIMES, NULL, 0, 0 }, /* 206 = futimes */
+ { AS(getpgid_args), (sy_call_t *)getpgid, AUE_GETPGID, NULL, 0, 0 }, /* 207 = getpgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 208 = newreboot */
+ { AS(poll_args), (sy_call_t *)poll, AUE_POLL, NULL, 0, 0 }, /* 209 = poll */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 210 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 211 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 212 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 213 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 214 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 215 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 216 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 217 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 218 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 }, /* 219 = lkmnosys */
+ { AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 220 = __semctl */
+ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 221 = semget */
+ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 222 = semop */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 223 = semconfig */
+ { AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 224 = msgctl */
+ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 225 = msgget */
+ { AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 226 = msgsnd */
+ { AS(msgrcv_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 227 = msgrcv */
+ { AS(shmat_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 228 = shmat */
+ { AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 229 = shmctl */
+ { AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 230 = shmdt */
+ { AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 231 = shmget */
+ { AS(clock_gettime_args), (sy_call_t *)clock_gettime, AUE_NULL, NULL, 0, 0 }, /* 232 = clock_gettime */
+ { AS(clock_settime_args), (sy_call_t *)clock_settime, AUE_CLOCK_SETTIME, NULL, 0, 0 }, /* 233 = clock_settime */
+ { AS(clock_getres_args), (sy_call_t *)clock_getres, AUE_NULL, NULL, 0, 0 }, /* 234 = clock_getres */
+ { AS(ktimer_create_args), (sy_call_t *)ktimer_create, AUE_NULL, NULL, 0, 0 }, /* 235 = ktimer_create */
+ { AS(ktimer_delete_args), (sy_call_t *)ktimer_delete, AUE_NULL, NULL, 0, 0 }, /* 236 = ktimer_delete */
+ { AS(ktimer_settime_args), (sy_call_t *)ktimer_settime, AUE_NULL, NULL, 0, 0 }, /* 237 = ktimer_settime */
+ { AS(ktimer_gettime_args), (sy_call_t *)ktimer_gettime, AUE_NULL, NULL, 0, 0 }, /* 238 = ktimer_gettime */
+ { AS(ktimer_getoverrun_args), (sy_call_t *)ktimer_getoverrun, AUE_NULL, NULL, 0, 0 }, /* 239 = ktimer_getoverrun */
+ { AS(nanosleep_args), (sy_call_t *)nanosleep, AUE_NULL, NULL, 0, 0 }, /* 240 = nanosleep */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 241 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 242 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 243 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 244 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 245 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 246 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 247 = nosys */
+ { AS(ntp_gettime_args), (sy_call_t *)ntp_gettime, AUE_NULL, NULL, 0, 0 }, /* 248 = ntp_gettime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 249 = nosys */
+ { AS(minherit_args), (sy_call_t *)minherit, AUE_MINHERIT, NULL, 0, 0 }, /* 250 = minherit */
+ { AS(rfork_args), (sy_call_t *)rfork, AUE_RFORK, NULL, 0, 0 }, /* 251 = rfork */
+ { AS(openbsd_poll_args), (sy_call_t *)openbsd_poll, AUE_POLL, NULL, 0, 0 }, /* 252 = openbsd_poll */
+ { 0, (sy_call_t *)issetugid, AUE_ISSETUGID, NULL, 0, 0 }, /* 253 = issetugid */
+ { AS(lchown_args), (sy_call_t *)lchown, AUE_LCHOWN, NULL, 0, 0 }, /* 254 = lchown */
+ { AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 255 = aio_read */
+ { AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 256 = aio_write */
+ { AS(lio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 257 = lio_listio */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 258 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 259 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 260 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 261 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 262 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 263 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 264 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 265 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 266 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 267 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 268 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 269 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 270 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 271 = nosys */
+ { AS(getdents_args), (sy_call_t *)getdents, AUE_O_GETDENTS, NULL, 0, 0 }, /* 272 = getdents */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 273 = nosys */
+ { AS(lchmod_args), (sy_call_t *)lchmod, AUE_LCHMOD, NULL, 0, 0 }, /* 274 = lchmod */
+ { AS(lchown_args), (sy_call_t *)lchown, AUE_LCHOWN, NULL, 0, 0 }, /* 275 = netbsd_lchown */
+ { AS(lutimes_args), (sy_call_t *)lutimes, AUE_LUTIMES, NULL, 0, 0 }, /* 276 = lutimes */
+ { AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0 }, /* 277 = netbsd_msync */
+ { AS(nstat_args), (sy_call_t *)nstat, AUE_STAT, NULL, 0, 0 }, /* 278 = nstat */
+ { AS(nfstat_args), (sy_call_t *)nfstat, AUE_FSTAT, NULL, 0, 0 }, /* 279 = nfstat */
+ { AS(nlstat_args), (sy_call_t *)nlstat, AUE_LSTAT, NULL, 0, 0 }, /* 280 = nlstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 281 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 282 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 283 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 284 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 285 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 286 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 287 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 288 = nosys */
+ { AS(preadv_args), (sy_call_t *)preadv, AUE_PREADV, NULL, 0, 0 }, /* 289 = preadv */
+ { AS(pwritev_args), (sy_call_t *)pwritev, AUE_PWRITEV, NULL, 0, 0 }, /* 290 = pwritev */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 291 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 292 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 293 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 294 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 295 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 296 = nosys */
+ { compat4(AS(freebsd4_fhstatfs_args),fhstatfs), AUE_FHSTATFS, NULL, 0, 0 }, /* 297 = old fhstatfs */
+ { AS(fhopen_args), (sy_call_t *)fhopen, AUE_FHOPEN, NULL, 0, 0 }, /* 298 = fhopen */
+ { AS(fhstat_args), (sy_call_t *)fhstat, AUE_FHSTAT, NULL, 0, 0 }, /* 299 = fhstat */
+ { AS(modnext_args), (sy_call_t *)modnext, AUE_NULL, NULL, 0, 0 }, /* 300 = modnext */
+ { AS(modstat_args), (sy_call_t *)modstat, AUE_NULL, NULL, 0, 0 }, /* 301 = modstat */
+ { AS(modfnext_args), (sy_call_t *)modfnext, AUE_NULL, NULL, 0, 0 }, /* 302 = modfnext */
+ { AS(modfind_args), (sy_call_t *)modfind, AUE_NULL, NULL, 0, 0 }, /* 303 = modfind */
+ { AS(kldload_args), (sy_call_t *)kldload, AUE_MODLOAD, NULL, 0, 0 }, /* 304 = kldload */
+ { AS(kldunload_args), (sy_call_t *)kldunload, AUE_MODUNLOAD, NULL, 0, 0 }, /* 305 = kldunload */
+ { AS(kldfind_args), (sy_call_t *)kldfind, AUE_NULL, NULL, 0, 0 }, /* 306 = kldfind */
+ { AS(kldnext_args), (sy_call_t *)kldnext, AUE_NULL, NULL, 0, 0 }, /* 307 = kldnext */
+ { AS(kldstat_args), (sy_call_t *)kldstat, AUE_NULL, NULL, 0, 0 }, /* 308 = kldstat */
+ { AS(kldfirstmod_args), (sy_call_t *)kldfirstmod, AUE_NULL, NULL, 0, 0 }, /* 309 = kldfirstmod */
+ { AS(getsid_args), (sy_call_t *)getsid, AUE_GETSID, NULL, 0, 0 }, /* 310 = getsid */
+ { AS(setresuid_args), (sy_call_t *)setresuid, AUE_SETRESUID, NULL, 0, 0 }, /* 311 = setresuid */
+ { AS(setresgid_args), (sy_call_t *)setresgid, AUE_SETRESGID, NULL, 0, 0 }, /* 312 = setresgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 313 = obsolete signanosleep */
+ { AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 314 = aio_return */
+ { AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 315 = aio_suspend */
+ { AS(aio_cancel_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 316 = aio_cancel */
+ { AS(aio_error_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 317 = aio_error */
+ { AS(oaio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 318 = oaio_read */
+ { AS(oaio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 319 = oaio_write */
+ { AS(olio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 320 = olio_listio */
+ { 0, (sy_call_t *)yield, AUE_NULL, NULL, 0, 0 }, /* 321 = yield */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 322 = obsolete thr_sleep */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 323 = obsolete thr_wakeup */
+ { AS(mlockall_args), (sy_call_t *)mlockall, AUE_MLOCKALL, NULL, 0, 0 }, /* 324 = mlockall */
+ { 0, (sy_call_t *)munlockall, AUE_MUNLOCKALL, NULL, 0, 0 }, /* 325 = munlockall */
+ { AS(__getcwd_args), (sy_call_t *)__getcwd, AUE_GETCWD, NULL, 0, 0 }, /* 326 = __getcwd */
+ { AS(sched_setparam_args), (sy_call_t *)sched_setparam, AUE_NULL, NULL, 0, 0 }, /* 327 = sched_setparam */
+ { AS(sched_getparam_args), (sy_call_t *)sched_getparam, AUE_NULL, NULL, 0, 0 }, /* 328 = sched_getparam */
+ { AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler, AUE_NULL, NULL, 0, 0 }, /* 329 = sched_setscheduler */
+ { AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler, AUE_NULL, NULL, 0, 0 }, /* 330 = sched_getscheduler */
+ { 0, (sy_call_t *)sched_yield, AUE_NULL, NULL, 0, 0 }, /* 331 = sched_yield */
+ { AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max, AUE_NULL, NULL, 0, 0 }, /* 332 = sched_get_priority_max */
+ { AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min, AUE_NULL, NULL, 0, 0 }, /* 333 = sched_get_priority_min */
+ { AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval, AUE_NULL, NULL, 0, 0 }, /* 334 = sched_rr_get_interval */
+ { AS(utrace_args), (sy_call_t *)utrace, AUE_NULL, NULL, 0, 0 }, /* 335 = utrace */
+ { compat4(AS(freebsd4_sendfile_args),sendfile), AUE_SENDFILE, NULL, 0, 0 }, /* 336 = old sendfile */
+ { AS(kldsym_args), (sy_call_t *)kldsym, AUE_NULL, NULL, 0, 0 }, /* 337 = kldsym */
+ { AS(jail_args), (sy_call_t *)jail, AUE_JAIL, NULL, 0, 0 }, /* 338 = jail */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 339 = pioctl */
+ { AS(sigprocmask_args), (sy_call_t *)sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0 }, /* 340 = sigprocmask */
+ { AS(sigsuspend_args), (sy_call_t *)sigsuspend, AUE_SIGSUSPEND, NULL, 0, 0 }, /* 341 = sigsuspend */
+ { compat4(AS(freebsd4_sigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0 }, /* 342 = old sigaction */
+ { AS(sigpending_args), (sy_call_t *)sigpending, AUE_SIGPENDING, NULL, 0, 0 }, /* 343 = sigpending */
+ { compat4(AS(freebsd4_sigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0 }, /* 344 = old sigreturn */
+ { AS(sigtimedwait_args), (sy_call_t *)sigtimedwait, AUE_SIGWAIT, NULL, 0, 0 }, /* 345 = sigtimedwait */
+ { AS(sigwaitinfo_args), (sy_call_t *)sigwaitinfo, AUE_NULL, NULL, 0, 0 }, /* 346 = sigwaitinfo */
+ { AS(__acl_get_file_args), (sy_call_t *)__acl_get_file, AUE_NULL, NULL, 0, 0 }, /* 347 = __acl_get_file */
+ { AS(__acl_set_file_args), (sy_call_t *)__acl_set_file, AUE_NULL, NULL, 0, 0 }, /* 348 = __acl_set_file */
+ { AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd, AUE_NULL, NULL, 0, 0 }, /* 349 = __acl_get_fd */
+ { AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd, AUE_NULL, NULL, 0, 0 }, /* 350 = __acl_set_fd */
+ { AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file, AUE_NULL, NULL, 0, 0 }, /* 351 = __acl_delete_file */
+ { AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd, AUE_NULL, NULL, 0, 0 }, /* 352 = __acl_delete_fd */
+ { AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file, AUE_NULL, NULL, 0, 0 }, /* 353 = __acl_aclcheck_file */
+ { AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd, AUE_NULL, NULL, 0, 0 }, /* 354 = __acl_aclcheck_fd */
+ { AS(extattrctl_args), (sy_call_t *)extattrctl, AUE_EXTATTRCTL, NULL, 0, 0 }, /* 355 = extattrctl */
+ { AS(extattr_set_file_args), (sy_call_t *)extattr_set_file, AUE_EXTATTR_SET_FILE, NULL, 0, 0 }, /* 356 = extattr_set_file */
+ { AS(extattr_get_file_args), (sy_call_t *)extattr_get_file, AUE_EXTATTR_GET_FILE, NULL, 0, 0 }, /* 357 = extattr_get_file */
+ { AS(extattr_delete_file_args), (sy_call_t *)extattr_delete_file, AUE_EXTATTR_DELETE_FILE, NULL, 0, 0 }, /* 358 = extattr_delete_file */
+ { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 359 = aio_waitcomplete */
+ { AS(getresuid_args), (sy_call_t *)getresuid, AUE_GETRESUID, NULL, 0, 0 }, /* 360 = getresuid */
+ { AS(getresgid_args), (sy_call_t *)getresgid, AUE_GETRESGID, NULL, 0, 0 }, /* 361 = getresgid */
+ { 0, (sy_call_t *)kqueue, AUE_KQUEUE, NULL, 0, 0 }, /* 362 = kqueue */
+ { AS(kevent_args), (sy_call_t *)kevent, AUE_NULL, NULL, 0, 0 }, /* 363 = kevent */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 364 = __cap_get_proc */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 365 = __cap_set_proc */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 366 = __cap_get_fd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 367 = __cap_get_file */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 368 = __cap_set_fd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 369 = __cap_set_file */
+ { AS(nosys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 370 = lkmressys */
+ { AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0 }, /* 371 = extattr_set_fd */
+ { AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0 }, /* 372 = extattr_get_fd */
+ { AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0 }, /* 373 = extattr_delete_fd */
+ { AS(__setugid_args), (sy_call_t *)__setugid, AUE_NULL, NULL, 0, 0 }, /* 374 = __setugid */
+ { AS(nfsclnt_args), (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 375 = nfsclnt */
+ { AS(eaccess_args), (sy_call_t *)eaccess, AUE_EACCESS, NULL, 0, 0 }, /* 376 = eaccess */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 377 = afs_syscall */
+ { AS(nmount_args), (sy_call_t *)nmount, AUE_NMOUNT, NULL, 0, 0 }, /* 378 = nmount */
+ { 0, (sy_call_t *)kse_exit, AUE_NULL, NULL, 0, 0 }, /* 379 = kse_exit */
+ { AS(kse_wakeup_args), (sy_call_t *)kse_wakeup, AUE_NULL, NULL, 0, 0 }, /* 380 = kse_wakeup */
+ { AS(kse_create_args), (sy_call_t *)kse_create, AUE_NULL, NULL, 0, 0 }, /* 381 = kse_create */
+ { AS(kse_thr_interrupt_args), (sy_call_t *)kse_thr_interrupt, AUE_NULL, NULL, 0, 0 }, /* 382 = kse_thr_interrupt */
+ { AS(kse_release_args), (sy_call_t *)kse_release, AUE_NULL, NULL, 0, 0 }, /* 383 = kse_release */
+ { AS(__mac_get_proc_args), (sy_call_t *)__mac_get_proc, AUE_NULL, NULL, 0, 0 }, /* 384 = __mac_get_proc */
+ { AS(__mac_set_proc_args), (sy_call_t *)__mac_set_proc, AUE_NULL, NULL, 0, 0 }, /* 385 = __mac_set_proc */
+ { AS(__mac_get_fd_args), (sy_call_t *)__mac_get_fd, AUE_NULL, NULL, 0, 0 }, /* 386 = __mac_get_fd */
+ { AS(__mac_get_file_args), (sy_call_t *)__mac_get_file, AUE_NULL, NULL, 0, 0 }, /* 387 = __mac_get_file */
+ { AS(__mac_set_fd_args), (sy_call_t *)__mac_set_fd, AUE_NULL, NULL, 0, 0 }, /* 388 = __mac_set_fd */
+ { AS(__mac_set_file_args), (sy_call_t *)__mac_set_file, AUE_NULL, NULL, 0, 0 }, /* 389 = __mac_set_file */
+ { AS(kenv_args), (sy_call_t *)kenv, AUE_NULL, NULL, 0, 0 }, /* 390 = kenv */
+ { AS(lchflags_args), (sy_call_t *)lchflags, AUE_LCHFLAGS, NULL, 0, 0 }, /* 391 = lchflags */
+ { AS(uuidgen_args), (sy_call_t *)uuidgen, AUE_NULL, NULL, 0, 0 }, /* 392 = uuidgen */
+ { AS(sendfile_args), (sy_call_t *)sendfile, AUE_SENDFILE, NULL, 0, 0 }, /* 393 = sendfile */
+ { AS(mac_syscall_args), (sy_call_t *)mac_syscall, AUE_NULL, NULL, 0, 0 }, /* 394 = mac_syscall */
+ { AS(getfsstat_args), (sy_call_t *)getfsstat, AUE_GETFSSTAT, NULL, 0, 0 }, /* 395 = getfsstat */
+ { AS(statfs_args), (sy_call_t *)statfs, AUE_STATFS, NULL, 0, 0 }, /* 396 = statfs */
+ { AS(fstatfs_args), (sy_call_t *)fstatfs, AUE_FSTATFS, NULL, 0, 0 }, /* 397 = fstatfs */
+ { AS(fhstatfs_args), (sy_call_t *)fhstatfs, AUE_FHSTATFS, NULL, 0, 0 }, /* 398 = fhstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 399 = nosys */
+ { AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 400 = ksem_close */
+ { AS(ksem_post_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 401 = ksem_post */
+ { AS(ksem_wait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 402 = ksem_wait */
+ { AS(ksem_trywait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 403 = ksem_trywait */
+ { AS(ksem_init_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 404 = ksem_init */
+ { AS(ksem_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 405 = ksem_open */
+ { AS(ksem_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 406 = ksem_unlink */
+ { AS(ksem_getvalue_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 407 = ksem_getvalue */
+ { AS(ksem_destroy_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 408 = ksem_destroy */
+ { AS(__mac_get_pid_args), (sy_call_t *)__mac_get_pid, AUE_NULL, NULL, 0, 0 }, /* 409 = __mac_get_pid */
+ { AS(__mac_get_link_args), (sy_call_t *)__mac_get_link, AUE_NULL, NULL, 0, 0 }, /* 410 = __mac_get_link */
+ { AS(__mac_set_link_args), (sy_call_t *)__mac_set_link, AUE_NULL, NULL, 0, 0 }, /* 411 = __mac_set_link */
+ { AS(extattr_set_link_args), (sy_call_t *)extattr_set_link, AUE_EXTATTR_SET_LINK, NULL, 0, 0 }, /* 412 = extattr_set_link */
+ { AS(extattr_get_link_args), (sy_call_t *)extattr_get_link, AUE_EXTATTR_GET_LINK, NULL, 0, 0 }, /* 413 = extattr_get_link */
+ { AS(extattr_delete_link_args), (sy_call_t *)extattr_delete_link, AUE_EXTATTR_DELETE_LINK, NULL, 0, 0 }, /* 414 = extattr_delete_link */
+ { AS(__mac_execve_args), (sy_call_t *)__mac_execve, AUE_NULL, NULL, 0, 0 }, /* 415 = __mac_execve */
+ { AS(sigaction_args), (sy_call_t *)sigaction, AUE_SIGACTION, NULL, 0, 0 }, /* 416 = sigaction */
+ { AS(sigreturn_args), (sy_call_t *)sigreturn, AUE_SIGRETURN, NULL, 0, 0 }, /* 417 = sigreturn */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 418 = __xstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 419 = __xfstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 420 = __xlstat */
+ { AS(getcontext_args), (sy_call_t *)getcontext, AUE_NULL, NULL, 0, 0 }, /* 421 = getcontext */
+ { AS(setcontext_args), (sy_call_t *)setcontext, AUE_NULL, NULL, 0, 0 }, /* 422 = setcontext */
+ { AS(swapcontext_args), (sy_call_t *)swapcontext, AUE_NULL, NULL, 0, 0 }, /* 423 = swapcontext */
+ { AS(swapoff_args), (sy_call_t *)swapoff, AUE_SWAPOFF, NULL, 0, 0 }, /* 424 = swapoff */
+ { AS(__acl_get_link_args), (sy_call_t *)__acl_get_link, AUE_NULL, NULL, 0, 0 }, /* 425 = __acl_get_link */
+ { AS(__acl_set_link_args), (sy_call_t *)__acl_set_link, AUE_NULL, NULL, 0, 0 }, /* 426 = __acl_set_link */
+ { AS(__acl_delete_link_args), (sy_call_t *)__acl_delete_link, AUE_NULL, NULL, 0, 0 }, /* 427 = __acl_delete_link */
+ { AS(__acl_aclcheck_link_args), (sy_call_t *)__acl_aclcheck_link, AUE_NULL, NULL, 0, 0 }, /* 428 = __acl_aclcheck_link */
+ { AS(sigwait_args), (sy_call_t *)sigwait, AUE_SIGWAIT, NULL, 0, 0 }, /* 429 = sigwait */
+ { AS(thr_create_args), (sy_call_t *)thr_create, AUE_NULL, NULL, 0, 0 }, /* 430 = thr_create */
+ { AS(thr_exit_args), (sy_call_t *)thr_exit, AUE_NULL, NULL, 0, 0 }, /* 431 = thr_exit */
+ { AS(thr_self_args), (sy_call_t *)thr_self, AUE_NULL, NULL, 0, 0 }, /* 432 = thr_self */
+ { AS(thr_kill_args), (sy_call_t *)thr_kill, AUE_NULL, NULL, 0, 0 }, /* 433 = thr_kill */
+ { AS(_umtx_lock_args), (sy_call_t *)_umtx_lock, AUE_NULL, NULL, 0, 0 }, /* 434 = _umtx_lock */
+ { AS(_umtx_unlock_args), (sy_call_t *)_umtx_unlock, AUE_NULL, NULL, 0, 0 }, /* 435 = _umtx_unlock */
+ { AS(jail_attach_args), (sy_call_t *)jail_attach, AUE_NULL, NULL, 0, 0 }, /* 436 = jail_attach */
+ { AS(extattr_list_fd_args), (sy_call_t *)extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0 }, /* 437 = extattr_list_fd */
+ { AS(extattr_list_file_args), (sy_call_t *)extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0 }, /* 438 = extattr_list_file */
+ { AS(extattr_list_link_args), (sy_call_t *)extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0 }, /* 439 = extattr_list_link */
+ { AS(kse_switchin_args), (sy_call_t *)kse_switchin, AUE_NULL, NULL, 0, 0 }, /* 440 = kse_switchin */
+ { AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 441 = ksem_timedwait */
+ { AS(thr_suspend_args), (sy_call_t *)thr_suspend, AUE_NULL, NULL, 0, 0 }, /* 442 = thr_suspend */
+ { AS(thr_wake_args), (sy_call_t *)thr_wake, AUE_NULL, NULL, 0, 0 }, /* 443 = thr_wake */
+ { AS(kldunloadf_args), (sy_call_t *)kldunloadf, AUE_MODUNLOAD, NULL, 0, 0 }, /* 444 = kldunloadf */
+ { AS(audit_args), (sy_call_t *)audit, AUE_AUDIT, NULL, 0, 0 }, /* 445 = audit */
+ { AS(auditon_args), (sy_call_t *)auditon, AUE_AUDITON, NULL, 0, 0 }, /* 446 = auditon */
+ { AS(getauid_args), (sy_call_t *)getauid, AUE_GETAUID, NULL, 0, 0 }, /* 447 = getauid */
+ { AS(setauid_args), (sy_call_t *)setauid, AUE_SETAUID, NULL, 0, 0 }, /* 448 = setauid */
+ { AS(getaudit_args), (sy_call_t *)getaudit, AUE_GETAUDIT, NULL, 0, 0 }, /* 449 = getaudit */
+ { AS(setaudit_args), (sy_call_t *)setaudit, AUE_SETAUDIT, NULL, 0, 0 }, /* 450 = setaudit */
+ { AS(getaudit_addr_args), (sy_call_t *)getaudit_addr, AUE_GETAUDIT_ADDR, NULL, 0, 0 }, /* 451 = getaudit_addr */
+ { AS(setaudit_addr_args), (sy_call_t *)setaudit_addr, AUE_SETAUDIT_ADDR, NULL, 0, 0 }, /* 452 = setaudit_addr */
+ { AS(auditctl_args), (sy_call_t *)auditctl, AUE_AUDITCTL, NULL, 0, 0 }, /* 453 = auditctl */
+ { AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL, NULL, 0, 0 }, /* 454 = _umtx_op */
+ { AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL, NULL, 0, 0 }, /* 455 = thr_new */
+ { AS(sigqueue_args), (sy_call_t *)sigqueue, AUE_NULL, NULL, 0, 0 }, /* 456 = sigqueue */
+ { AS(kmq_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 457 = kmq_open */
+ { AS(kmq_setattr_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 458 = kmq_setattr */
+ { AS(kmq_timedreceive_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 459 = kmq_timedreceive */
+ { AS(kmq_timedsend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 460 = kmq_timedsend */
+ { AS(kmq_notify_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 461 = kmq_notify */
+ { AS(kmq_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 462 = kmq_unlink */
+ { AS(abort2_args), (sy_call_t *)abort2, AUE_NULL, NULL, 0, 0 }, /* 463 = abort2 */
+ { AS(thr_set_name_args), (sy_call_t *)thr_set_name, AUE_NULL, NULL, 0, 0 }, /* 464 = thr_set_name */
+ { AS(aio_fsync_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 }, /* 465 = aio_fsync */
+ { AS(rtprio_thread_args), (sy_call_t *)rtprio_thread, AUE_RTPRIO, NULL, 0, 0 }, /* 466 = rtprio_thread */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 467 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 468 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 469 = __getpath_fromfd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 470 = __getpath_fromaddr */
+ { AS(sctp_peeloff_args), (sy_call_t *)sctp_peeloff, AUE_NULL, NULL, 0, 0 }, /* 471 = sctp_peeloff */
+ { AS(sctp_generic_sendmsg_args), (sy_call_t *)sctp_generic_sendmsg, AUE_NULL, NULL, 0, 0 }, /* 472 = sctp_generic_sendmsg */
+ { AS(sctp_generic_sendmsg_iov_args), (sy_call_t *)sctp_generic_sendmsg_iov, AUE_NULL, NULL, 0, 0 }, /* 473 = sctp_generic_sendmsg_iov */
+ { AS(sctp_generic_recvmsg_args), (sy_call_t *)sctp_generic_recvmsg, AUE_NULL, NULL, 0, 0 }, /* 474 = sctp_generic_recvmsg */
+ { AS(pread_args), (sy_call_t *)pread, AUE_PREAD, NULL, 0, 0 }, /* 475 = pread */
+ { AS(pwrite_args), (sy_call_t *)pwrite, AUE_PWRITE, NULL, 0, 0 }, /* 476 = pwrite */
+ { AS(mmap_args), (sy_call_t *)mmap, AUE_MMAP, NULL, 0, 0 }, /* 477 = mmap */
+ { AS(lseek_args), (sy_call_t *)lseek, AUE_LSEEK, NULL, 0, 0 }, /* 478 = lseek */
+ { AS(truncate_args), (sy_call_t *)truncate, AUE_TRUNCATE, NULL, 0, 0 }, /* 479 = truncate */
+ { AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_FTRUNCATE, NULL, 0, 0 }, /* 480 = ftruncate */
+ { AS(thr_kill2_args), (sy_call_t *)thr_kill2, AUE_KILL, NULL, 0, 0 }, /* 481 = thr_kill2 */
};
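
[Editor's note] The replacement entries above fill six fields where the old ones filled three: the SYF_MPSAFE bit disappears from the argument-count word (visible in the removed lines, apparently because syscalls are now assumed MPSAFE by default), the audit event column is kept, and three trailing fields for the DTrace systrace provider are added (all NULL/0 in this table). Below is a rough, self-contained sketch of the FreeBSD 7-era struct sysent these initializers populate; the field names, types and typedefs are reconstructed from memory of <sys/sysent.h> and OpenBSM, not quoted from this commit, so treat them as assumptions.

#include <sys/types.h>

struct thread;				/* opaque in this sketch */
typedef u_int16_t au_event_t;		/* assumed width, from OpenBSM */
typedef int	sy_call_t(struct thread *, void *);
typedef void	(*systrace_args_func_t)(int, void *, u_int64_t *, int *);

struct sysent {				/* one system call table entry */
	int	sy_narg;		/* argument count, the AS(...) term */
	sy_call_t *sy_call;		/* implementing function */
	au_event_t sy_auevent;		/* audit event, the AUE_* term */
	systrace_args_func_t sy_systrace_args_func;
					/* optional DTrace argument conversion (NULL above) */
	u_int32_t sy_entry;		/* DTrace systrace entry probe ID (0 above) */
	u_int32_t sy_return;		/* DTrace systrace return probe ID (0 above) */
};

The compat()/compat4() wrappers seen in entries such as "old creat" or "old getfsstat" are, in the makesyscalls.sh-generated preamble of syscalls.c (outside this excerpt), macros that expand to the argument count and an o*/freebsd4_*-prefixed handler when COMPAT_43 or COMPAT_FREEBSD4 is configured, and to a nosys stub otherwise; that description is an assumption about the generated code, not part of this diff.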
Index: sched_ule.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sched_ule.c -L sys/kern/sched_ule.c -u -r1.1.1.1 -r1.2
--- sys/kern/sched_ule.c
+++ sys/kern/sched_ule.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2002-2005, Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2002-2007, Jeffrey Roberson <jeff at freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -24,14 +24,23 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/*
+ * This file implements the ULE scheduler. ULE supports independent CPU
+ * run queues and fine grain locking. It has superior interactive
+ * performance under load even on uni-processor systems.
+ *
+ * etymology:
+ * ULE is the last three letters in schedule. It owes its name to a
+ * generic user created for a scheduling system by Paul Mikesell at
+ * Isilon Systems and a general lack of creativity on the part of the author.
+ */
+
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.153.2.3 2005/09/27 12:00:31 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.214.2.2 2007/12/20 07:15:40 davidxu Exp $");
#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"
-#define kse td_sched
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
@@ -48,6 +57,7 @@
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
+#include <sys/umtx.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
@@ -61,111 +71,84 @@
#include <machine/cpu.h>
#include <machine/smp.h>
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-/* XXX This is bogus compatability crap for ps */
-static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
-SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
-
-static void sched_setup(void *dummy);
-SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
-
-static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
-
-SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
- "Scheduler name");
-
-static int slice_min = 1;
-SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
-
-static int slice_max = 10;
-SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
+#if !defined(__i386__) && !defined(__amd64__) && !defined(__arm__)
+#error "This architecture is not currently compatible with ULE"
+#endif
-int realstathz;
-int tickincr = 1;
+#define KTR_ULE 0
/*
- * The following datastructures are allocated within their parent structure
- * but are scheduler specific.
- */
-/*
- * The schedulable entity that can be given a context to run. A process may
- * have several of these.
- */
-struct kse {
- TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
- int ke_flags; /* (j) KEF_* flags. */
- struct thread *ke_thread; /* (*) Active associated thread. */
- fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
- char ke_rqindex; /* (j) Run queue index. */
- enum {
- KES_THREAD = 0x0, /* slaved to thread state */
- KES_ONRUNQ
- } ke_state; /* (j) thread sched specific status. */
- int ke_slptime;
- int ke_slice;
- struct runq *ke_runq;
- u_char ke_cpu; /* CPU that we have affinity for. */
+ * Thread scheduler specific section. All fields are protected
+ * by the thread lock.
+ */
+struct td_sched {
+ TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */
+ struct thread *ts_thread; /* Active associated thread. */
+ struct runq *ts_runq; /* Run-queue we're queued on. */
+ short ts_flags; /* TSF_* flags. */
+ u_char ts_rqindex; /* Run queue index. */
+ u_char ts_cpu; /* CPU that we have affinity for. */
+ int ts_slice; /* Ticks of slice remaining. */
+ u_int ts_slptime; /* Number of ticks we vol. slept */
+ u_int ts_runtime; /* Number of ticks we were running */
/* The following variables are only used for pctcpu calculation */
- int ke_ltick; /* Last tick that we were running on */
- int ke_ftick; /* First tick that we were running on */
- int ke_ticks; /* Tick count */
-
-};
-#define td_kse td_sched
-#define td_slptime td_kse->ke_slptime
-#define ke_proc ke_thread->td_proc
-#define ke_ksegrp ke_thread->td_ksegrp
-#define ke_assign ke_procq.tqe_next
-/* flags kept in ke_flags */
-#define KEF_ASSIGNED 0x0001 /* Thread is being migrated. */
-#define KEF_BOUND 0x0002 /* Thread can not migrate. */
-#define KEF_XFERABLE 0x0004 /* Thread was added as transferable. */
-#define KEF_HOLD 0x0008 /* Thread is temporarily bound. */
-#define KEF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */
-#define KEF_INTERNAL 0x0020 /* Thread added due to migration. */
-#define KEF_PREEMPTED 0x0040 /* Thread was preempted */
-#define KEF_DIDRUN 0x02000 /* Thread actually ran. */
-#define KEF_EXIT 0x04000 /* Thread is being killed. */
-
-struct kg_sched {
- struct thread *skg_last_assigned; /* (j) Last thread assigned to */
- /* the system scheduler */
- int skg_slptime; /* Number of ticks we vol. slept */
- int skg_runtime; /* Number of ticks we were running */
- int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
- int skg_concurrency; /* (j) Num threads requested in group.*/
+ int ts_ltick; /* Last tick that we were running on */
+ int ts_ftick; /* First tick that we were running on */
+ int ts_ticks; /* Tick count */
+#ifdef SMP
+ int ts_rltick; /* Real last tick, for affinity. */
+#endif
};
-#define kg_last_assigned kg_sched->skg_last_assigned
-#define kg_avail_opennings kg_sched->skg_avail_opennings
-#define kg_concurrency kg_sched->skg_concurrency
-#define kg_runtime kg_sched->skg_runtime
-#define kg_slptime kg_sched->skg_slptime
-
-#define SLOT_RELEASE(kg) (kg)->kg_avail_opennings++
-#define SLOT_USE(kg) (kg)->kg_avail_opennings--
-
-static struct kse kse0;
-static struct kg_sched kg_sched0;
-
-/*
- * The priority is primarily determined by the interactivity score. Thus, we
- * give lower(better) priorities to kse groups that use less CPU. The nice
- * value is then directly added to this to allow nice to have some effect
- * on latency.
+/* flags kept in ts_flags */
+#define TSF_BOUND 0x0001 /* Thread can not migrate. */
+#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
+
+static struct td_sched td_sched0;
+
+/*
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
+ * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
+ * SCHED_TICK_MAX: Maximum number of ticks before scaling back.
+ * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
+ * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
+ * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
+ */
+#define SCHED_TICK_SECS 10
+#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
+#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
+#define SCHED_TICK_SHIFT 10
+#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
+#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
+
+/*
+ * These macros determine priorities for non-interactive threads. They are
+ * assigned a priority based on their recent cpu utilization as expressed
+ * by the ratio of ticks to the tick total. NHALF priorities at the start
+ * and end of the MIN to MAX timeshare range are only reachable with negative
+ * or positive nice respectively.
*
- * PRI_RANGE: Total priority range for timeshare threads.
+ * PRI_RANGE: Priority range for utilization dependent priorities.
* PRI_NRESV: Number of nice values.
- * PRI_BASE: The start of the dynamic range.
+ * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total.
+ * PRI_NICE: Determines the part of the priority inherited from nice.
*/
-#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
-#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1)
+#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN)
#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
-#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE)
-#define SCHED_PRI_INTERACT(score) \
- ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
+#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
+#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
+#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN)
+#define SCHED_PRI_TICKS(ts) \
+ (SCHED_TICK_HZ((ts)) / \
+ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
+#define SCHED_PRI_NICE(nice) (nice)
/*
- * These determine the interactivity of a process.
+ * These determine the interactivity of a process. Interactivity differs from
+ * cpu utilization in that it expresses the voluntary time slept vs time ran
+ * while cpu utilization includes all time not running. This more accurately
+ * models the intent of the thread.
*
* SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
* before throttling back.
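
[Editor's note] To see how the SCHED_TICK_* and SCHED_PRI_* macros added in the hunk above fit together, the standalone user-space sketch below mirrors their arithmetic for the non-interactive timeshare case. The hz value and the PRI_MIN_TIMESHARE/PRI_MAX_TIMESHARE/PRIO_MIN/PRIO_MAX constants are assumed values from FreeBSD headers of this era, and the final formula only imitates the timeshare branch of sched_priority() later in this file; it is an illustration, not the kernel code.

/* Standalone illustration of the utilization-to-priority mapping. */
#include <stdio.h>

#define	hz			1000	/* assumed kern.hz */
#define	PRI_MIN_TIMESHARE	160	/* assumed <sys/priority.h> values */
#define	PRI_MAX_TIMESHARE	223
#define	PRIO_MIN		(-20)
#define	PRIO_MAX		20
#define	roundup(x, y)		((((x) + ((y) - 1)) / (y)) * (y))

#define	SCHED_TICK_SHIFT	10
#define	SCHED_TICK_HZ(t)	((t) >> SCHED_TICK_SHIFT)
#define	SCHED_TICK_TOTAL(lt, ft) ((lt) - (ft) > hz ? (lt) - (ft) : hz)

#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
#define	SCHED_PRI_TICKS(t, lt, ft)					\
	(SCHED_TICK_HZ(t) /						\
	 (roundup(SCHED_TICK_TOTAL(lt, ft), SCHED_PRI_RANGE) /		\
	  SCHED_PRI_RANGE))
#define	SCHED_PRI_NICE(nice)	(nice)

int
main(void)
{
	/* Pretend the thread ran 4 of the last 10 seconds (~40% CPU). */
	int ts_ticks = (4 * hz) << SCHED_TICK_SHIFT;	/* scaled tick count */
	int ts_ftick = 0, ts_ltick = 10 * hz;		/* pctcpu window */
	int nice = 0;
	int pri;

	pri = SCHED_PRI_MIN +
	    SCHED_PRI_TICKS(ts_ticks, ts_ltick, ts_ftick) +
	    SCHED_PRI_NICE(nice);
	printf("timeshare priority = %d (band %d..%d)\n",
	    pri, SCHED_PRI_MIN, SCHED_PRI_MAX);
	return (0);
}

With these assumed constants the band is 180..203; ~40% CPU over the 10-second window lands about 9 slots into the 23-slot range, i.e. priority 189, and nice shifts the result directly via SCHED_PRI_NICE.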
@@ -173,280 +156,351 @@
* INTERACT_MAX: Maximum interactivity value. Smaller is better.
* INTERACT_THRESH: Threshhold for placement on the current runq.
*/
-#define SCHED_SLP_RUN_MAX ((hz * 5) << 10)
-#define SCHED_SLP_RUN_FORK ((hz / 2) << 10)
+#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
+#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
#define SCHED_INTERACT_MAX (100)
#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH (30)
/*
- * These parameters and macros determine the size of the time slice that is
- * granted to each thread.
- *
- * SLICE_MIN: Minimum time slice granted, in units of ticks.
- * SLICE_MAX: Maximum time slice granted.
- * SLICE_RANGE: Range of available time slices scaled by hz.
- * SLICE_SCALE: The number slices granted per val in the range of [0, max].
- * SLICE_NICE: Determine the amount of slice granted to a scaled nice.
- * SLICE_NTHRESH: The nice cutoff point for slice assignment.
- */
-#define SCHED_SLICE_MIN (slice_min)
-#define SCHED_SLICE_MAX (slice_max)
-#define SCHED_SLICE_INTERACTIVE (slice_max)
-#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1)
-#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
-#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
-#define SCHED_SLICE_NICE(nice) \
- (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
-
-/*
- * This macro determines whether or not the thread belongs on the current or
- * next run queue.
- */
-#define SCHED_INTERACTIVE(kg) \
- (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
-#define SCHED_CURR(kg, ke) \
- ((ke->ke_thread->td_flags & TDF_BORROWING) || \
- (ke->ke_flags & KEF_PREEMPTED) || SCHED_INTERACTIVE(kg))
-
-/*
- * Cpu percentage computation macros and defines.
- *
- * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
- * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
- */
-
-#define SCHED_CPU_TIME 10
-#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
+ * tickincr: Converts a stathz tick into a hz domain scaled by
+ * the shift factor. Without the shift the error rate
+ * due to rounding would be unacceptably high.
+ * realstathz: stathz is sometimes 0 and run off of hz.
+ * sched_slice: Runtime of each thread before rescheduling.
+ * preempt_thresh: Priority threshold for preemption and remote IPIs.
+ */
+static int sched_interact = SCHED_INTERACT_THRESH;
+static int realstathz;
+static int tickincr;
+static int sched_slice;
+#ifdef PREEMPTION
+#ifdef FULL_PREEMPTION
+static int preempt_thresh = PRI_MAX_IDLE;
+#else
+static int preempt_thresh = PRI_MIN_KERN;
+#endif
+#else
+static int preempt_thresh = 0;
+#endif
/*
- * kseq - per processor runqs and statistics.
- */
-struct kseq {
- struct runq ksq_idle; /* Queue of IDLE threads. */
- struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */
- struct runq *ksq_next; /* Next timeshare queue. */
- struct runq *ksq_curr; /* Current queue. */
- int ksq_load_timeshare; /* Load for timeshare. */
- int ksq_load; /* Aggregate load. */
- short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
- short ksq_nicemin; /* Least nice. */
-#ifdef SMP
- int ksq_transferable;
- LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */
- struct kseq_group *ksq_group; /* Our processor group. */
- volatile struct kse *ksq_assigned; /* assigned by another CPU. */
+ * tdq - per processor runqs and statistics. All fields are protected by the
+ * tdq_lock. The load and lowpri may be accessed without to avoid excess
+ * locking in sched_pickcpu();
+ */
+struct tdq {
+ struct mtx *tdq_lock; /* Pointer to group lock. */
+ struct runq tdq_realtime; /* real-time run queue. */
+ struct runq tdq_timeshare; /* timeshare run queue. */
+ struct runq tdq_idle; /* Queue of IDLE threads. */
+ int tdq_load; /* Aggregate load. */
+ u_char tdq_idx; /* Current insert index. */
+ u_char tdq_ridx; /* Current removal index. */
+#ifdef SMP
+ u_char tdq_lowpri; /* Lowest priority thread. */
+ int tdq_transferable; /* Transferable thread count. */
+ LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
+ struct tdq_group *tdq_group; /* Our processor group. */
#else
- int ksq_sysload; /* For loadavg, !ITHD load. */
+ int tdq_sysload; /* For loadavg, !ITHD load. */
#endif
-};
+} __aligned(64);
+
#ifdef SMP
/*
- * kseq groups are groups of processors which can cheaply share threads. When
+ * tdq groups are groups of processors which can cheaply share threads. When
* one processor in the group goes idle it will check the runqs of the other
* processors in its group prior to halting and waiting for an interrupt.
* These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA.
* In a numa environment we'd want an idle bitmap per group and a two tiered
* load balancer.
*/
-struct kseq_group {
- int ksg_cpus; /* Count of CPUs in this kseq group. */
- cpumask_t ksg_cpumask; /* Mask of cpus in this group. */
- cpumask_t ksg_idlemask; /* Idle cpus in this group. */
- cpumask_t ksg_mask; /* Bit mask for first cpu. */
- int ksg_load; /* Total load of this group. */
- int ksg_transferable; /* Transferable load of this group. */
- LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */
-};
-#endif
-
-/*
- * One kse queue per processor.
- */
-#ifdef SMP
-static cpumask_t kseq_idle;
-static int ksg_maxid;
-static struct kseq kseq_cpu[MAXCPU];
-static struct kseq_group kseq_groups[MAXCPU];
-static int bal_tick;
-static int gbal_tick;
-static int balance_groups;
-
-#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
-#define KSEQ_CPU(x) (&kseq_cpu[(x)])
-#define KSEQ_ID(x) ((x) - kseq_cpu)
-#define KSEQ_GROUP(x) (&kseq_groups[(x)])
+struct tdq_group {
+ struct mtx tdg_lock; /* Protects all fields below. */
+ int tdg_cpus; /* Count of CPUs in this tdq group. */
+ cpumask_t tdg_cpumask; /* Mask of cpus in this group. */
+ cpumask_t tdg_idlemask; /* Idle cpus in this group. */
+ cpumask_t tdg_mask; /* Bit mask for first cpu. */
+ int tdg_load; /* Total load of this group. */
+ int tdg_transferable; /* Transferable load of this group. */
+ LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */
+ char tdg_name[16]; /* lock name. */
+} __aligned(64);
+
+#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300))
+#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)
+
+/*
+ * Run-time tunables.
+ */
+static int rebalance = 1;
+static int balance_interval = 128; /* Default set in sched_initticks(). */
+static int pick_pri = 1;
+static int affinity;
+static int tryself = 1;
+static int steal_htt = 1;
+static int steal_idle = 1;
+static int steal_thresh = 2;
+static int topology = 0;
+
+/*
+ * One thread queue per processor.
+ */
+static volatile cpumask_t tdq_idle;
+static int tdg_maxid;
+static struct tdq tdq_cpu[MAXCPU];
+static struct tdq_group tdq_groups[MAXCPU];
+static struct tdq *balance_tdq;
+static int balance_group_ticks;
+static int balance_ticks;
+
+#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
+#define TDQ_CPU(x) (&tdq_cpu[(x)])
+#define TDQ_ID(x) ((int)((x) - tdq_cpu))
+#define TDQ_GROUP(x) (&tdq_groups[(x)])
+#define TDG_ID(x) ((int)((x) - tdq_groups))
#else /* !SMP */
-static struct kseq kseq_cpu;
+static struct tdq tdq_cpu;
+static struct mtx tdq_lock;
-#define KSEQ_SELF() (&kseq_cpu)
-#define KSEQ_CPU(x) (&kseq_cpu)
+#define TDQ_ID(x) (0)
+#define TDQ_SELF() (&tdq_cpu)
+#define TDQ_CPU(x) (&tdq_cpu)
#endif
-static void slot_fill(struct ksegrp *);
-static struct kse *sched_choose(void); /* XXX Should be thread * */
-static void sched_slice(struct kse *);
-static void sched_priority(struct ksegrp *);
+#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
+#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
+#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCKPTR(t) ((t)->tdq_lock)
+
+static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
-static int sched_interact_score(struct ksegrp *);
-static void sched_interact_update(struct ksegrp *);
-static void sched_interact_fork(struct ksegrp *);
-static void sched_pctcpu_update(struct kse *);
+static int sched_interact_score(struct thread *);
+static void sched_interact_update(struct thread *);
+static void sched_interact_fork(struct thread *);
+static void sched_pctcpu_update(struct td_sched *);
/* Operations on per processor queues */
-static struct kse * kseq_choose(struct kseq *);
-static void kseq_setup(struct kseq *);
-static void kseq_load_add(struct kseq *, struct kse *);
-static void kseq_load_rem(struct kseq *, struct kse *);
-static __inline void kseq_runq_add(struct kseq *, struct kse *, int);
-static __inline void kseq_runq_rem(struct kseq *, struct kse *);
-static void kseq_nice_add(struct kseq *, int);
-static void kseq_nice_rem(struct kseq *, int);
-void kseq_print(int cpu);
-#ifdef SMP
-static int kseq_transfer(struct kseq *, struct kse *, int);
-static struct kse *runq_steal(struct runq *);
+static struct td_sched * tdq_choose(struct tdq *);
+static void tdq_setup(struct tdq *);
+static void tdq_load_add(struct tdq *, struct td_sched *);
+static void tdq_load_rem(struct tdq *, struct td_sched *);
+static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
+static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
+void tdq_print(int cpu);
+static void runq_print(struct runq *rq);
+static void tdq_add(struct tdq *, struct thread *, int);
+#ifdef SMP
+static void tdq_move(struct tdq *, struct tdq *);
+static int tdq_idled(struct tdq *);
+static void tdq_notify(struct td_sched *);
+static struct td_sched *tdq_steal(struct tdq *);
+static struct td_sched *runq_steal(struct runq *);
+static int sched_pickcpu(struct td_sched *, int);
static void sched_balance(void);
static void sched_balance_groups(void);
-static void sched_balance_group(struct kseq_group *);
-static void sched_balance_pair(struct kseq *, struct kseq *);
-static void kseq_move(struct kseq *, int);
-static int kseq_idled(struct kseq *);
-static void kseq_notify(struct kse *, int);
-static void kseq_assign(struct kseq *);
-static struct kse *kseq_steal(struct kseq *, int);
-#define KSE_CAN_MIGRATE(ke) \
- ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
+static void sched_balance_group(struct tdq_group *);
+static void sched_balance_pair(struct tdq *, struct tdq *);
+static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
+static inline struct mtx *thread_block_switch(struct thread *);
+static inline void thread_unblock_switch(struct thread *, struct mtx *);
+static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
+
+#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
#endif
-void
-kseq_print(int cpu)
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
+
+/*
+ * Print the threads waiting on a run-queue.
+ */
+static void
+runq_print(struct runq *rq)
{
- struct kseq *kseq;
+ struct rqhead *rqh;
+ struct td_sched *ts;
+ int pri;
+ int j;
int i;
- kseq = KSEQ_CPU(cpu);
+ for (i = 0; i < RQB_LEN; i++) {
+ printf("\t\trunq bits %d 0x%zx\n",
+ i, rq->rq_status.rqb_bits[i]);
+ for (j = 0; j < RQB_BPW; j++)
+ if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
+ pri = j + (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(ts, rqh, ts_procq) {
+ printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
+ ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
+ }
+ }
+ }
+}
+
+/*
+ * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
+ */
+void
+tdq_print(int cpu)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_CPU(cpu);
- printf("kseq:\n");
- printf("\tload: %d\n", kseq->ksq_load);
- printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
-#ifdef SMP
- printf("\tload transferable: %d\n", kseq->ksq_transferable);
-#endif
- printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
- printf("\tnice counts:\n");
- for (i = 0; i < SCHED_PRI_NRESV; i++)
- if (kseq->ksq_nice[i])
- printf("\t\t%d = %d\n",
- i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
+ printf("tdq %d:\n", TDQ_ID(tdq));
+ printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tload: %d\n", tdq->tdq_load);
+ printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
+ printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
+ printf("\trealtime runq:\n");
+ runq_print(&tdq->tdq_realtime);
+ printf("\ttimeshare runq:\n");
+ runq_print(&tdq->tdq_timeshare);
+ printf("\tidle runq:\n");
+ runq_print(&tdq->tdq_idle);
+#ifdef SMP
+ printf("\tload transferable: %d\n", tdq->tdq_transferable);
+ printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
+ printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group));
+ printf("\tLock name: %s\n", tdq->tdq_group->tdg_name);
+#endif
}
+#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
+/*
+ * Add a thread to the actual run-queue. Keeps transferable counts up to
+ * date with what is actually on the run-queue. Selects the correct
+ * queue position for timeshare threads.
+ */
static __inline void
-kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
+tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
#ifdef SMP
- if (KSE_CAN_MIGRATE(ke)) {
- kseq->ksq_transferable++;
- kseq->ksq_group->ksg_transferable++;
- ke->ke_flags |= KEF_XFERABLE;
+ if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
+ tdq->tdq_transferable++;
+ tdq->tdq_group->tdg_transferable++;
+ ts->ts_flags |= TSF_XFERABLE;
}
#endif
- if (ke->ke_flags & KEF_PREEMPTED)
- flags |= SRQ_PREEMPTED;
- runq_add(ke->ke_runq, ke, flags);
+ if (ts->ts_runq == &tdq->tdq_timeshare) {
+ u_char pri;
+
+ pri = ts->ts_thread->td_priority;
+ KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
+ ("Invalid priority %d on timeshare runq", pri));
+ /*
+ * This queue contains only priorities between MIN and MAX
+ * timeshare. Use the whole queue to represent these values.
+ */
+ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
+ pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
+ pri = (pri + tdq->tdq_idx) % RQ_NQS;
+ /*
+ * This effectively shortens the queue by one so we
+ * can have a one slot difference between idx and
+ * ridx while we wait for threads to drain.
+ */
+ if (tdq->tdq_ridx != tdq->tdq_idx &&
+ pri == tdq->tdq_ridx)
+ pri = (unsigned char)(pri - 1) % RQ_NQS;
+ } else
+ pri = tdq->tdq_ridx;
+ runq_add_pri(ts->ts_runq, ts, pri, flags);
+ } else
+ runq_add(ts->ts_runq, ts, flags);
}
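
A minimal user-space sketch of the circular insert-index arithmetic tdq_runq_add() applies to timeshare threads above. The priority-range and queue-count constants are illustrative stand-ins for the kernel's definitions (sys/priority.h, sys/runq.h) and may not match a given kernel.

#include <stdio.h>

/* Illustrative stand-in values only. */
#define PRI_MIN_TIMESHARE 160
#define PRI_MAX_TIMESHARE 223
#define RQ_NQS            64
#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)

/*
 * Mimics the non-preempted path of tdq_runq_add(): scale the thread's
 * priority into the queue range, rotate it by the current insert index,
 * and step around the removal index while old threads drain from it.
 */
static unsigned char
timeshare_slot(unsigned char td_pri, unsigned char idx, unsigned char ridx)
{
        unsigned char pri;

        pri = (td_pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
        pri = (pri + idx) % RQ_NQS;
        if (ridx != idx && pri == ridx)
                pri = (unsigned char)(pri - 1) % RQ_NQS;
        return (pri);
}

int
main(void)
{
        /* Priority 200 with tdq_idx 10 and tdq_ridx 5 lands in slot 50. */
        printf("slot = %d\n", timeshare_slot(200, 10, 5));
        return (0);
}
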
+/*
+ * Remove a thread from a run-queue. This typically happens when a thread
+ * is selected to run. Running threads are not on the queue and the
+ * transferable count does not reflect them.
+ */
static __inline void
-kseq_runq_rem(struct kseq *kseq, struct kse *ke)
+tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
{
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(ts->ts_runq != NULL,
+ ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
#ifdef SMP
- if (ke->ke_flags & KEF_XFERABLE) {
- kseq->ksq_transferable--;
- kseq->ksq_group->ksg_transferable--;
- ke->ke_flags &= ~KEF_XFERABLE;
+ if (ts->ts_flags & TSF_XFERABLE) {
+ tdq->tdq_transferable--;
+ tdq->tdq_group->tdg_transferable--;
+ ts->ts_flags &= ~TSF_XFERABLE;
}
#endif
- runq_remove(ke->ke_runq, ke);
+ if (ts->ts_runq == &tdq->tdq_timeshare) {
+ if (tdq->tdq_idx != tdq->tdq_ridx)
+ runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
+ else
+ runq_remove_idx(ts->ts_runq, ts, NULL);
+ /*
+ * For timeshare threads we update the priority here so
+ * the priority reflects the time we've been sleeping.
+ */
+ ts->ts_ltick = ticks;
+ sched_pctcpu_update(ts);
+ sched_priority(ts->ts_thread);
+ } else
+ runq_remove(ts->ts_runq, ts);
}
+/*
+ * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load
+ * for this thread to the referenced thread queue.
+ */
static void
-kseq_load_add(struct kseq *kseq, struct kse *ke)
+tdq_load_add(struct tdq *tdq, struct td_sched *ts)
{
int class;
- mtx_assert(&sched_lock, MA_OWNED);
- class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
- if (class == PRI_TIMESHARE)
- kseq->ksq_load_timeshare++;
- kseq->ksq_load++;
- CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
- if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+ class = PRI_BASE(ts->ts_thread->td_pri_class);
+ tdq->tdq_load++;
+ CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
+ if (class != PRI_ITHD &&
+ (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
- kseq->ksq_group->ksg_load++;
+ tdq->tdq_group->tdg_load++;
#else
- kseq->ksq_sysload++;
+ tdq->tdq_sysload++;
#endif
- if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
- kseq_nice_add(kseq, ke->ke_proc->p_nice);
}
+/*
+ * Remove the load from a thread that is transitioning to a sleep state or
+ * exiting.
+ */
static void
-kseq_load_rem(struct kseq *kseq, struct kse *ke)
+tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
{
int class;
- mtx_assert(&sched_lock, MA_OWNED);
- class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
- if (class == PRI_TIMESHARE)
- kseq->ksq_load_timeshare--;
- if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
+
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ class = PRI_BASE(ts->ts_thread->td_pri_class);
+ if (class != PRI_ITHD &&
+ (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
- kseq->ksq_group->ksg_load--;
+ tdq->tdq_group->tdg_load--;
#else
- kseq->ksq_sysload--;
+ tdq->tdq_sysload--;
#endif
- kseq->ksq_load--;
- CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
- ke->ke_runq = NULL;
- if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
- kseq_nice_rem(kseq, ke->ke_proc->p_nice);
-}
-
-static void
-kseq_nice_add(struct kseq *kseq, int nice)
-{
- mtx_assert(&sched_lock, MA_OWNED);
- /* Normalize to zero. */
- kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
- if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
- kseq->ksq_nicemin = nice;
-}
-
-static void
-kseq_nice_rem(struct kseq *kseq, int nice)
-{
- int n;
-
- mtx_assert(&sched_lock, MA_OWNED);
- /* Normalize to zero. */
- n = nice + SCHED_PRI_NHALF;
- kseq->ksq_nice[n]--;
- KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
-
- /*
- * If this wasn't the smallest nice value or there are more in
- * this bucket we can just return. Otherwise we have to recalculate
- * the smallest nice.
- */
- if (nice != kseq->ksq_nicemin ||
- kseq->ksq_nice[n] != 0 ||
- kseq->ksq_load_timeshare == 0)
- return;
-
- for (; n < SCHED_PRI_NRESV; n++)
- if (kseq->ksq_nice[n]) {
- kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
- return;
- }
+ KASSERT(tdq->tdq_load != 0,
+ ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+ tdq->tdq_load--;
+ CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
+ ts->ts_runq = NULL;
}
#ifdef SMP
@@ -459,83 +513,131 @@
* installations will only have 2 cpus. Secondly, load balancing too much at
* once can have an unpleasant effect on the system. The scheduler rarely has
* enough information to make perfect decisions. So this algorithm chooses
- * algorithm simplicity and more gradual effects on load in larger systems.
- *
- * It could be improved by considering the priorities and slices assigned to
- * each task prior to balancing them. There are many pathological cases with
- * any approach and so the semi random algorithm below may work as well as any.
+ * simplicity and more gradual effects on load in larger systems.
*
*/
static void
-sched_balance(void)
+sched_balance()
{
- struct kseq_group *high;
- struct kseq_group *low;
- struct kseq_group *ksg;
+ struct tdq_group *high;
+ struct tdq_group *low;
+ struct tdq_group *tdg;
+ struct tdq *tdq;
int cnt;
int i;
- bal_tick = ticks + (random() % (hz * 2));
- if (smp_started == 0)
+ /*
+ * Select a random time between .5 * balance_interval and
+ * 1.5 * balance_interval.
+ */
+ balance_ticks = max(balance_interval / 2, 1);
+ balance_ticks += random() % balance_interval;
+ if (smp_started == 0 || rebalance == 0)
return;
+ tdq = TDQ_SELF();
+ TDQ_UNLOCK(tdq);
low = high = NULL;
- i = random() % (ksg_maxid + 1);
- for (cnt = 0; cnt <= ksg_maxid; cnt++) {
- ksg = KSEQ_GROUP(i);
+ i = random() % (tdg_maxid + 1);
+ for (cnt = 0; cnt <= tdg_maxid; cnt++) {
+ tdg = TDQ_GROUP(i);
/*
* Find the CPU with the highest load that has some
* threads to transfer.
*/
- if ((high == NULL || ksg->ksg_load > high->ksg_load)
- && ksg->ksg_transferable)
- high = ksg;
- if (low == NULL || ksg->ksg_load < low->ksg_load)
- low = ksg;
- if (++i > ksg_maxid)
+ if ((high == NULL || tdg->tdg_load > high->tdg_load)
+ && tdg->tdg_transferable)
+ high = tdg;
+ if (low == NULL || tdg->tdg_load < low->tdg_load)
+ low = tdg;
+ if (++i > tdg_maxid)
i = 0;
}
if (low != NULL && high != NULL && high != low)
- sched_balance_pair(LIST_FIRST(&high->ksg_members),
- LIST_FIRST(&low->ksg_members));
+ sched_balance_pair(LIST_FIRST(&high->tdg_members),
+ LIST_FIRST(&low->tdg_members));
+ TDQ_LOCK(tdq);
}
+/*
+ * Balance load between CPUs in a group. Will only migrate within the group.
+ */
static void
-sched_balance_groups(void)
+sched_balance_groups()
{
+ struct tdq *tdq;
int i;
- gbal_tick = ticks + (random() % (hz * 2));
- mtx_assert(&sched_lock, MA_OWNED);
- if (smp_started)
- for (i = 0; i <= ksg_maxid; i++)
- sched_balance_group(KSEQ_GROUP(i));
+ /*
+ * Select a random time between .5 * balance_interval and
+ * 1.5 * balance_interval.
+ */
+ balance_group_ticks = max(balance_interval / 2, 1);
+ balance_group_ticks += random() % balance_interval;
+ if (smp_started == 0 || rebalance == 0)
+ return;
+ tdq = TDQ_SELF();
+ TDQ_UNLOCK(tdq);
+ for (i = 0; i <= tdg_maxid; i++)
+ sched_balance_group(TDQ_GROUP(i));
+ TDQ_LOCK(tdq);
}
+/*
+ * Finds the greatest imbalance between two tdqs in a group.
+ */
static void
-sched_balance_group(struct kseq_group *ksg)
+sched_balance_group(struct tdq_group *tdg)
{
- struct kseq *kseq;
- struct kseq *high;
- struct kseq *low;
+ struct tdq *tdq;
+ struct tdq *high;
+ struct tdq *low;
int load;
- if (ksg->ksg_transferable == 0)
+ if (tdg->tdg_transferable == 0)
return;
low = NULL;
high = NULL;
- LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
- load = kseq->ksq_load;
- if (high == NULL || load > high->ksq_load)
- high = kseq;
- if (low == NULL || load < low->ksq_load)
- low = kseq;
+ LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
+ load = tdq->tdq_load;
+ if (high == NULL || load > high->tdq_load)
+ high = tdq;
+ if (low == NULL || load < low->tdq_load)
+ low = tdq;
}
if (high != NULL && low != NULL && high != low)
sched_balance_pair(high, low);
}
+/*
+ * Lock two thread queues using their address to maintain lock order.
+ */
+static void
+tdq_lock_pair(struct tdq *one, struct tdq *two)
+{
+ if (one < two) {
+ TDQ_LOCK(one);
+ TDQ_LOCK_FLAGS(two, MTX_DUPOK);
+ } else {
+ TDQ_LOCK(two);
+ TDQ_LOCK_FLAGS(one, MTX_DUPOK);
+ }
+}
+
+/*
+ * Unlock two thread queues. Order is not important here.
+ */
+static void
+tdq_unlock_pair(struct tdq *one, struct tdq *two)
+{
+ TDQ_UNLOCK(one);
+ TDQ_UNLOCK(two);
+}
+
+/*
+ * Transfer load between two imbalanced thread queues.
+ */
static void
-sched_balance_pair(struct kseq *high, struct kseq *low)
+sched_balance_pair(struct tdq *high, struct tdq *low)
{
int transferable;
int high_load;
@@ -544,186 +646,268 @@
int diff;
int i;
+ tdq_lock_pair(high, low);
/*
* If we're transfering within a group we have to use this specific
- * kseq's transferable count, otherwise we can steal from other members
+ * tdq's transferable count, otherwise we can steal from other members
* of the group.
*/
- if (high->ksq_group == low->ksq_group) {
- transferable = high->ksq_transferable;
- high_load = high->ksq_load;
- low_load = low->ksq_load;
+ if (high->tdq_group == low->tdq_group) {
+ transferable = high->tdq_transferable;
+ high_load = high->tdq_load;
+ low_load = low->tdq_load;
} else {
- transferable = high->ksq_group->ksg_transferable;
- high_load = high->ksq_group->ksg_load;
- low_load = low->ksq_group->ksg_load;
+ transferable = high->tdq_group->tdg_transferable;
+ high_load = high->tdq_group->tdg_load;
+ low_load = low->tdq_group->tdg_load;
}
- if (transferable == 0)
- return;
/*
* Determine what the imbalance is and then adjust that to how many
- * kses we actually have to give up (transferable).
+ * threads we actually have to give up (transferable).
*/
- diff = high_load - low_load;
- move = diff / 2;
- if (diff & 0x1)
- move++;
- move = min(move, transferable);
- for (i = 0; i < move; i++)
- kseq_move(high, KSEQ_ID(low));
+ if (transferable != 0) {
+ diff = high_load - low_load;
+ move = diff / 2;
+ if (diff & 0x1)
+ move++;
+ move = min(move, transferable);
+ for (i = 0; i < move; i++)
+ tdq_move(high, low);
+ /*
+ * IPI the target cpu to force it to reschedule with the new
+ * workload.
+ */
+ ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
+ }
+ tdq_unlock_pair(high, low);
return;
}
+/*
+ * Move a thread from one thread queue to another.
+ */
static void
-kseq_move(struct kseq *from, int cpu)
+tdq_move(struct tdq *from, struct tdq *to)
{
- struct kseq *kseq;
- struct kseq *to;
- struct kse *ke;
-
- kseq = from;
- to = KSEQ_CPU(cpu);
- ke = kseq_steal(kseq, 1);
- if (ke == NULL) {
- struct kseq_group *ksg;
-
- ksg = kseq->ksq_group;
- LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
- if (kseq == from || kseq->ksq_transferable == 0)
+ struct td_sched *ts;
+ struct thread *td;
+ struct tdq *tdq;
+ int cpu;
+
+ TDQ_LOCK_ASSERT(from, MA_OWNED);
+ TDQ_LOCK_ASSERT(to, MA_OWNED);
+
+ tdq = from;
+ cpu = TDQ_ID(to);
+ ts = tdq_steal(tdq);
+ if (ts == NULL) {
+ struct tdq_group *tdg;
+
+ tdg = tdq->tdq_group;
+ LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
+ if (tdq == from || tdq->tdq_transferable == 0)
continue;
- ke = kseq_steal(kseq, 1);
+ ts = tdq_steal(tdq);
break;
}
- if (ke == NULL)
- panic("kseq_move: No KSEs available with a "
- "transferable count of %d\n",
- ksg->ksg_transferable);
- }
- if (kseq == to)
- return;
- ke->ke_state = KES_THREAD;
- kseq_runq_rem(kseq, ke);
- kseq_load_rem(kseq, ke);
- kseq_notify(ke, cpu);
+ if (ts == NULL)
+ return;
+ }
+ if (tdq == to)
+ return;
+ td = ts->ts_thread;
+ /*
+ * Although the run queue is locked the thread may be blocked. Lock
+ * it to clear this and acquire the run-queue lock.
+ */
+ thread_lock(td);
+ /* Drop recursive lock on from acquired via thread_lock(). */
+ TDQ_UNLOCK(from);
+ sched_rem(td);
+ ts->ts_cpu = cpu;
+ td->td_lock = TDQ_LOCKPTR(to);
+ tdq_add(to, td, SRQ_YIELDING);
}
+/*
+ * This tdq has idled. Try to steal a thread from another cpu and switch
+ * to it.
+ */
static int
-kseq_idled(struct kseq *kseq)
+tdq_idled(struct tdq *tdq)
{
- struct kseq_group *ksg;
- struct kseq *steal;
- struct kse *ke;
+ struct tdq_group *tdg;
+ struct tdq *steal;
+ int highload;
+ int highcpu;
+ int cpu;
- ksg = kseq->ksq_group;
+ if (smp_started == 0 || steal_idle == 0)
+ return (1);
+ /* We don't want to be preempted while we're iterating over tdqs */
+ spinlock_enter();
+ tdg = tdq->tdq_group;
+ /*
+ * If we're in a cpu group, try and steal threads from another cpu in
+ * the group before idling. In a HTT group all cpus share the same
+ * run-queue lock, however, we still need a recursive lock to
+ * call tdq_move().
+ */
+ if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
+ TDQ_LOCK(tdq);
+ LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
+ if (steal == tdq || steal->tdq_transferable == 0)
+ continue;
+ TDQ_LOCK(steal);
+ goto steal;
+ }
+ TDQ_UNLOCK(tdq);
+ }
/*
- * If we're in a cpu group, try and steal kses from another cpu in
- * the group before idling.
+ * Find the least loaded CPU with a transferable thread and attempt
+ * to steal it. We make a lockless pass and then verify that the
+ * thread is still available after locking.
*/
- if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
- LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
- if (steal == kseq || steal->ksq_transferable == 0)
+ for (;;) {
+ highcpu = 0;
+ highload = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu))
continue;
- ke = kseq_steal(steal, 0);
- if (ke == NULL)
+ steal = TDQ_CPU(cpu);
+ if (steal->tdq_transferable == 0)
continue;
- ke->ke_state = KES_THREAD;
- kseq_runq_rem(steal, ke);
- kseq_load_rem(steal, ke);
- ke->ke_cpu = PCPU_GET(cpuid);
- ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
- sched_add(ke->ke_thread, SRQ_YIELDING);
- return (0);
+ if (steal->tdq_load < highload)
+ continue;
+ highload = steal->tdq_load;
+ highcpu = cpu;
}
+ if (highload < steal_thresh)
+ break;
+ steal = TDQ_CPU(highcpu);
+ if (steal == tdq)
+ break;
+ tdq_lock_pair(tdq, steal);
+ if (steal->tdq_load >= steal_thresh && steal->tdq_transferable)
+ goto steal;
+ tdq_unlock_pair(tdq, steal);
}
- /*
- * We only set the idled bit when all of the cpus in the group are
- * idle. Otherwise we could get into a situation where a KSE bounces
- * back and forth between two idle cores on separate physical CPUs.
- */
- ksg->ksg_idlemask |= PCPU_GET(cpumask);
- if (ksg->ksg_idlemask != ksg->ksg_cpumask)
- return (1);
- atomic_set_int(&kseq_idle, ksg->ksg_mask);
+ spinlock_exit();
return (1);
+steal:
+ spinlock_exit();
+ tdq_move(steal, tdq);
+ TDQ_UNLOCK(steal);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
+
+ return (0);
}
+/*
+ * Notify a remote cpu of new work. Sends an IPI if criteria are met.
+ */
static void
-kseq_assign(struct kseq *kseq)
+tdq_notify(struct td_sched *ts)
{
- struct kse *nke;
- struct kse *ke;
+ struct thread *ctd;
+ struct pcpu *pcpu;
+ int cpri;
+ int pri;
+ int cpu;
- do {
- *(volatile struct kse **)&ke = kseq->ksq_assigned;
- } while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
- (uintptr_t)ke, (uintptr_t)NULL));
- for (; ke != NULL; ke = nke) {
- nke = ke->ke_assign;
- kseq->ksq_group->ksg_load--;
- kseq->ksq_load--;
- ke->ke_flags &= ~KEF_ASSIGNED;
- if (ke->ke_flags & KEF_REMOVED) {
- ke->ke_flags &= ~KEF_REMOVED;
- continue;
- }
- ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
- sched_add(ke->ke_thread, SRQ_YIELDING);
- }
+ cpu = ts->ts_cpu;
+ pri = ts->ts_thread->td_priority;
+ pcpu = pcpu_find(cpu);
+ ctd = pcpu->pc_curthread;
+ cpri = ctd->td_priority;
+
+ /*
+ * If our priority is not better than the current priority there is
+ * nothing to do.
+ */
+ if (pri > cpri)
+ return;
+ /*
+ * Always IPI idle.
+ */
+ if (cpri > PRI_MIN_IDLE)
+ goto sendipi;
+ /*
+ * If we're realtime or better and there is timeshare or worse running
+ * send an IPI.
+ */
+ if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
+ goto sendipi;
+ /*
+ * Otherwise only IPI if we exceed the threshold.
+ */
+ if (pri > preempt_thresh)
+ return;
+sendipi:
+ ctd->td_flags |= TDF_NEEDRESCHED;
+ ipi_selected(1 << cpu, IPI_PREEMPT);
}
-static void
-kseq_notify(struct kse *ke, int cpu)
+/*
+ * Steals load from a timeshare queue. Honors the rotating queue head
+ * index.
+ */
+static struct td_sched *
+runq_steal_from(struct runq *rq, u_char start)
{
- struct kseq *kseq;
- struct thread *td;
- struct pcpu *pcpu;
- int class;
- int prio;
+ struct td_sched *ts;
+ struct rqbits *rqb;
+ struct rqhead *rqh;
+ int first;
+ int bit;
+ int pri;
+ int i;
- kseq = KSEQ_CPU(cpu);
- /* XXX */
- class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
- if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
- (kseq_idle & kseq->ksq_group->ksg_mask))
- atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
- kseq->ksq_group->ksg_load++;
- kseq->ksq_load++;
- ke->ke_cpu = cpu;
- ke->ke_flags |= KEF_ASSIGNED;
- prio = ke->ke_thread->td_priority;
-
- /*
- * Place a KSE on another cpu's queue and force a resched.
- */
- do {
- *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
- } while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
- (uintptr_t)ke->ke_assign, (uintptr_t)ke));
- /*
- * Without sched_lock we could lose a race where we set NEEDRESCHED
- * on a thread that is switched out before the IPI is delivered. This
- * would lead us to miss the resched. This will be a problem once
- * sched_lock is pushed down.
- */
- pcpu = pcpu_find(cpu);
- td = pcpu->pc_curthread;
- if (ke->ke_thread->td_priority < td->td_priority ||
- td == pcpu->pc_idlethread) {
- td->td_flags |= TDF_NEEDRESCHED;
- ipi_selected(1 << cpu, IPI_AST);
+ rqb = &rq->rq_status;
+ bit = start & (RQB_BPW -1);
+ pri = 0;
+ first = 0;
+again:
+ for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
+ if (rqb->rqb_bits[i] == 0)
+ continue;
+ if (bit != 0) {
+ for (pri = bit; pri < RQB_BPW; pri++)
+ if (rqb->rqb_bits[i] & (1ul << pri))
+ break;
+ if (pri >= RQB_BPW)
+ continue;
+ } else
+ pri = RQB_FFS(rqb->rqb_bits[i]);
+ pri += (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(ts, rqh, ts_procq) {
+ if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
+ return (ts);
+ first = 1;
+ }
}
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+
+ return (NULL);
}
-static struct kse *
+/*
+ * Steals load from a standard linear queue.
+ */
+static struct td_sched *
runq_steal(struct runq *rq)
{
struct rqhead *rqh;
struct rqbits *rqb;
- struct kse *ke;
+ struct td_sched *ts;
int word;
int bit;
- mtx_assert(&sched_lock, MA_OWNED);
rqb = &rq->rq_status;
for (word = 0; word < RQB_LEN; word++) {
if (rqb->rqb_bits[word] == 0)
@@ -732,523 +916,717 @@
if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
continue;
rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
- TAILQ_FOREACH(ke, rqh, ke_procq) {
- if (KSE_CAN_MIGRATE(ke))
- return (ke);
- }
+ TAILQ_FOREACH(ts, rqh, ts_procq)
+ if (THREAD_CAN_MIGRATE(ts->ts_thread))
+ return (ts);
}
}
return (NULL);
}
-static struct kse *
-kseq_steal(struct kseq *kseq, int stealidle)
+/*
+ * Attempt to steal a thread in priority order from a thread queue.
+ */
+static struct td_sched *
+tdq_steal(struct tdq *tdq)
{
- struct kse *ke;
+ struct td_sched *ts;
- /*
- * Steal from next first to try to get a non-interactive task that
- * may not have run for a while.
- */
- if ((ke = runq_steal(kseq->ksq_next)) != NULL)
- return (ke);
- if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
- return (ke);
- if (stealidle)
- return (runq_steal(&kseq->ksq_idle));
- return (NULL);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
+ return (ts);
+ if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL)
+ return (ts);
+ return (runq_steal(&tdq->tdq_idle));
}
-int
-kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
+/*
+ * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the
+ * current lock and returns with the assigned queue locked.
+ */
+static inline struct tdq *
+sched_setcpu(struct td_sched *ts, int cpu, int flags)
{
- struct kseq_group *nksg;
- struct kseq_group *ksg;
- struct kseq *old;
- int cpu;
- int idx;
+ struct thread *td;
+ struct tdq *tdq;
- if (smp_started == 0)
- return (0);
- cpu = 0;
- /*
- * If our load exceeds a certain threshold we should attempt to
- * reassign this thread. The first candidate is the cpu that
- * originally ran the thread. If it is idle, assign it there,
- * otherwise, pick an idle cpu.
- *
- * The threshold at which we start to reassign kses has a large impact
- * on the overall performance of the system. Tuned too high and
- * some CPUs may idle. Too low and there will be excess migration
- * and context switches.
- */
- old = KSEQ_CPU(ke->ke_cpu);
- nksg = old->ksq_group;
- ksg = kseq->ksq_group;
- if (kseq_idle) {
- if (kseq_idle & nksg->ksg_mask) {
- cpu = ffs(nksg->ksg_idlemask);
- if (cpu) {
- CTR2(KTR_SCHED,
- "kseq_transfer: %p found old cpu %X "
- "in idlemask.", ke, cpu);
- goto migrate;
- }
- }
- /*
- * Multiple cpus could find this bit simultaneously
- * but the race shouldn't be terrible.
- */
- cpu = ffs(kseq_idle);
- if (cpu) {
- CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
- "in idlemask.", ke, cpu);
- goto migrate;
- }
- }
- idx = 0;
-#if 0
- if (old->ksq_load < kseq->ksq_load) {
- cpu = ke->ke_cpu + 1;
- CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
- "load less than ours.", ke, cpu);
- goto migrate;
- }
- /*
- * No new CPU was found, look for one with less load.
- */
- for (idx = 0; idx <= ksg_maxid; idx++) {
- nksg = KSEQ_GROUP(idx);
- if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) {
- cpu = ffs(nksg->ksg_cpumask);
- CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
- "than ours.", ke, cpu);
- goto migrate;
- }
- }
-#endif
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+
+ tdq = TDQ_CPU(cpu);
+ td = ts->ts_thread;
+ ts->ts_cpu = cpu;
+
+ /* If the lock matches just return the queue. */
+ if (td->td_lock == TDQ_LOCKPTR(tdq))
+ return (tdq);
+#ifdef notyet
/*
- * If another cpu in this group has idled, assign a thread over
- * to them after checking to see if there are idled groups.
+ * If the thread isn't running its lockptr is a
+ * turnstile or a sleepqueue. We can just lock_set without
+ * blocking.
*/
- if (ksg->ksg_idlemask) {
- cpu = ffs(ksg->ksg_idlemask);
- if (cpu) {
- CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
- "group.", ke, cpu);
- goto migrate;
- }
+ if (TD_CAN_RUN(td)) {
+ TDQ_LOCK(tdq);
+ thread_lock_set(td, TDQ_LOCKPTR(tdq));
+ return (tdq);
}
- return (0);
-migrate:
+#endif
/*
- * Now that we've found an idle CPU, migrate the thread.
+ * The hard case, migration, we need to block the thread first to
+ * prevent order reversals with other cpus locks.
*/
- cpu--;
- ke->ke_runq = NULL;
- kseq_notify(ke, cpu);
-
- return (1);
+ thread_lock_block(td);
+ TDQ_LOCK(tdq);
+ thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
+ return (tdq);
}
-#endif /* SMP */
-
/*
- * Pick the highest priority task we have and return it.
+ * Find the thread queue running the lowest priority thread.
*/
-
-static struct kse *
-kseq_choose(struct kseq *kseq)
+static int
+tdq_lowestpri(void)
{
- struct runq *swap;
- struct kse *ke;
- int nice;
-
- mtx_assert(&sched_lock, MA_OWNED);
- swap = NULL;
+ struct tdq *tdq;
+ int lowpri;
+ int lowcpu;
+ int lowload;
+ int load;
+ int cpu;
+ int pri;
- for (;;) {
- ke = runq_choose(kseq->ksq_curr);
- if (ke == NULL) {
- /*
- * We already swapped once and didn't get anywhere.
- */
- if (swap)
- break;
- swap = kseq->ksq_curr;
- kseq->ksq_curr = kseq->ksq_next;
- kseq->ksq_next = swap;
+ lowload = 0;
+ lowpri = lowcpu = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu))
continue;
- }
- /*
- * If we encounter a slice of 0 the kse is in a
- * TIMESHARE kse group and its nice was too far out
- * of the range that receives slices.
- */
- nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
-#if 0
- if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
- ke->ke_proc->p_nice != 0)) {
- runq_remove(ke->ke_runq, ke);
- sched_slice(ke);
- ke->ke_runq = kseq->ksq_next;
- runq_add(ke->ke_runq, ke, 0);
+ tdq = TDQ_CPU(cpu);
+ pri = tdq->tdq_lowpri;
+ load = TDQ_CPU(cpu)->tdq_load;
+ CTR4(KTR_ULE,
+ "cpu %d pri %d lowcpu %d lowpri %d",
+ cpu, pri, lowcpu, lowpri);
+ if (pri < lowpri)
continue;
- }
-#endif
- return (ke);
+ if (lowpri && lowpri == pri && load > lowload)
+ continue;
+ lowpri = pri;
+ lowcpu = cpu;
+ lowload = load;
}
- return (runq_choose(&kseq->ksq_idle));
+ return (lowcpu);
}
-static void
-kseq_setup(struct kseq *kseq)
-{
- runq_init(&kseq->ksq_timeshare[0]);
- runq_init(&kseq->ksq_timeshare[1]);
- runq_init(&kseq->ksq_idle);
- kseq->ksq_curr = &kseq->ksq_timeshare[0];
- kseq->ksq_next = &kseq->ksq_timeshare[1];
- kseq->ksq_load = 0;
- kseq->ksq_load_timeshare = 0;
+/*
+ * Find the thread queue with the least load.
+ */
+static int
+tdq_lowestload(void)
+{
+ struct tdq *tdq;
+ int lowload;
+ int lowpri;
+ int lowcpu;
+ int load;
+ int cpu;
+ int pri;
+
+ lowcpu = 0;
+ lowload = TDQ_CPU(0)->tdq_load;
+ lowpri = TDQ_CPU(0)->tdq_lowpri;
+ for (cpu = 1; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu))
+ continue;
+ tdq = TDQ_CPU(cpu);
+ load = tdq->tdq_load;
+ pri = tdq->tdq_lowpri;
+ CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d",
+ cpu, load, lowcpu, lowload);
+ if (load > lowload)
+ continue;
+ if (load == lowload && pri < lowpri)
+ continue;
+ lowcpu = cpu;
+ lowload = load;
+ lowpri = pri;
+ }
+
+ return (lowcpu);
+}
+
+/*
+ * Pick the destination cpu for sched_add(). Respects affinity and makes
+ * a determination based on load or priority of available processors.
+ */
+static int
+sched_pickcpu(struct td_sched *ts, int flags)
+{
+ struct tdq *tdq;
+ int self;
+ int pri;
+ int cpu;
+
+ cpu = self = PCPU_GET(cpuid);
+ if (smp_started == 0)
+ return (self);
+ /*
+ * Don't migrate a running thread from sched_switch().
+ */
+ if (flags & SRQ_OURSELF) {
+ CTR1(KTR_ULE, "YIELDING %d",
+ curthread->td_priority);
+ return (self);
+ }
+ pri = ts->ts_thread->td_priority;
+ cpu = ts->ts_cpu;
+ /*
+ * Regardless of affinity, if the last cpu is idle send it there.
+ */
+ tdq = TDQ_CPU(cpu);
+ if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
+ CTR5(KTR_ULE,
+ "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
+ ts->ts_cpu, ts->ts_rltick, ticks, pri,
+ tdq->tdq_lowpri);
+ return (ts->ts_cpu);
+ }
+ /*
+ * If we have affinity, try to place it on the cpu we last ran on.
+ */
+ if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
+ CTR5(KTR_ULE,
+ "affinity for %d, ltick %d ticks %d pri %d curthread %d",
+ ts->ts_cpu, ts->ts_rltick, ticks, pri,
+ tdq->tdq_lowpri);
+ return (ts->ts_cpu);
+ }
+ /*
+ * Look for an idle group.
+ */
+ CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
+ cpu = ffs(tdq_idle);
+ if (cpu)
+ return (--cpu);
+ /*
+ * If there are no idle cores see if we can run the thread locally.
+ * This may improve locality among sleepers and wakers when there
+ * is shared data.
+ */
+ if (tryself && pri < curthread->td_priority) {
+ CTR1(KTR_ULE, "tryself %d",
+ curthread->td_priority);
+ return (self);
+ }
+ /*
+ * Now search for the cpu running the lowest priority thread with
+ * the least load.
+ */
+ if (pick_pri)
+ cpu = tdq_lowestpri();
+ else
+ cpu = tdq_lowestload();
+ return (cpu);
}
+#endif /* SMP */
+
+/*
+ * Pick the highest priority task we have and return it.
+ */
+static struct td_sched *
+tdq_choose(struct tdq *tdq)
+{
+ struct td_sched *ts;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ ts = runq_choose(&tdq->tdq_realtime);
+ if (ts != NULL)
+ return (ts);
+ ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
+ if (ts != NULL) {
+ KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
+ ("tdq_choose: Invalid priority on timeshare queue %d",
+ ts->ts_thread->td_priority));
+ return (ts);
+ }
+
+ ts = runq_choose(&tdq->tdq_idle);
+ if (ts != NULL) {
+ KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
+ ("tdq_choose: Invalid priority on idle queue %d",
+ ts->ts_thread->td_priority));
+ return (ts);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Initialize a thread queue.
+ */
static void
-sched_setup(void *dummy)
+tdq_setup(struct tdq *tdq)
{
-#ifdef SMP
- int i;
-#endif
- slice_min = (hz/100); /* 10ms */
- slice_max = (hz/7); /* ~140ms */
+ if (bootverbose)
+ printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
+ runq_init(&tdq->tdq_realtime);
+ runq_init(&tdq->tdq_timeshare);
+ runq_init(&tdq->tdq_idle);
+ tdq->tdq_load = 0;
+}
#ifdef SMP
- balance_groups = 0;
- /*
- * Initialize the kseqs.
- */
- for (i = 0; i < MAXCPU; i++) {
- struct kseq *ksq;
+static void
+tdg_setup(struct tdq_group *tdg)
+{
+ if (bootverbose)
+ printf("ULE: setup cpu group %d\n", TDG_ID(tdg));
+ snprintf(tdg->tdg_name, sizeof(tdg->tdg_name),
+ "sched lock %d", (int)TDG_ID(tdg));
+ mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock",
+ MTX_SPIN | MTX_RECURSE);
+ LIST_INIT(&tdg->tdg_members);
+ tdg->tdg_load = 0;
+ tdg->tdg_transferable = 0;
+ tdg->tdg_cpus = 0;
+ tdg->tdg_mask = 0;
+ tdg->tdg_cpumask = 0;
+ tdg->tdg_idlemask = 0;
+}
- ksq = &kseq_cpu[i];
- ksq->ksq_assigned = NULL;
- kseq_setup(&kseq_cpu[i]);
- }
- if (smp_topology == NULL) {
- struct kseq_group *ksg;
- struct kseq *ksq;
- int cpus;
+static void
+tdg_add(struct tdq_group *tdg, struct tdq *tdq)
+{
+ if (tdg->tdg_mask == 0)
+ tdg->tdg_mask |= 1 << TDQ_ID(tdq);
+ tdg->tdg_cpumask |= 1 << TDQ_ID(tdq);
+ tdg->tdg_cpus++;
+ tdq->tdq_group = tdg;
+ tdq->tdq_lock = &tdg->tdg_lock;
+ LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
+ if (bootverbose)
+ printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n",
+ TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask);
+}
- for (cpus = 0, i = 0; i < MAXCPU; i++) {
- if (CPU_ABSENT(i))
- continue;
- ksq = &kseq_cpu[cpus];
- ksg = &kseq_groups[cpus];
- /*
- * Setup a kseq group with one member.
- */
- ksq->ksq_transferable = 0;
- ksq->ksq_group = ksg;
- ksg->ksg_cpus = 1;
- ksg->ksg_idlemask = 0;
- ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
- ksg->ksg_load = 0;
- ksg->ksg_transferable = 0;
- LIST_INIT(&ksg->ksg_members);
- LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
- cpus++;
- }
- ksg_maxid = cpus - 1;
- } else {
- struct kseq_group *ksg;
- struct cpu_group *cg;
- int j;
-
- for (i = 0; i < smp_topology->ct_count; i++) {
- cg = &smp_topology->ct_group[i];
- ksg = &kseq_groups[i];
- /*
- * Initialize the group.
- */
- ksg->ksg_idlemask = 0;
- ksg->ksg_load = 0;
- ksg->ksg_transferable = 0;
- ksg->ksg_cpus = cg->cg_count;
- ksg->ksg_cpumask = cg->cg_mask;
- LIST_INIT(&ksg->ksg_members);
- /*
- * Find all of the group members and add them.
- */
- for (j = 0; j < MAXCPU; j++) {
- if ((cg->cg_mask & (1 << j)) != 0) {
- if (ksg->ksg_mask == 0)
- ksg->ksg_mask = 1 << j;
- kseq_cpu[j].ksq_transferable = 0;
- kseq_cpu[j].ksq_group = ksg;
- LIST_INSERT_HEAD(&ksg->ksg_members,
- &kseq_cpu[j], ksq_siblings);
- }
+static void
+sched_setup_topology(void)
+{
+ struct tdq_group *tdg;
+ struct cpu_group *cg;
+ int balance_groups;
+ struct tdq *tdq;
+ int i;
+ int j;
+
+ topology = 1;
+ balance_groups = 0;
+ for (i = 0; i < smp_topology->ct_count; i++) {
+ cg = &smp_topology->ct_group[i];
+ tdg = &tdq_groups[i];
+ /*
+ * Initialize the group.
+ */
+ tdg_setup(tdg);
+ /*
+ * Find all of the group members and add them.
+ */
+ for (j = 0; j < MAXCPU; j++) {
+ if ((cg->cg_mask & (1 << j)) != 0) {
+ tdq = TDQ_CPU(j);
+ tdq_setup(tdq);
+ tdg_add(tdg, tdq);
}
- if (ksg->ksg_cpus > 1)
- balance_groups = 1;
}
- ksg_maxid = smp_topology->ct_count - 1;
+ if (tdg->tdg_cpus > 1)
+ balance_groups = 1;
}
+ tdg_maxid = smp_topology->ct_count - 1;
+ if (balance_groups)
+ sched_balance_groups();
+}
+
+static void
+sched_setup_smp(void)
+{
+ struct tdq_group *tdg;
+ struct tdq *tdq;
+ int cpus;
+ int i;
+
+ for (cpus = 0, i = 0; i < MAXCPU; i++) {
+ if (CPU_ABSENT(i))
+ continue;
+ tdq = &tdq_cpu[i];
+ tdg = &tdq_groups[i];
+ /*
+ * Setup a tdq group with one member.
+ */
+ tdg_setup(tdg);
+ tdq_setup(tdq);
+ tdg_add(tdg, tdq);
+ cpus++;
+ }
+ tdg_maxid = cpus - 1;
+}
+
+/*
+ * Fake a topology with one group containing all CPUs.
+ */
+static void
+sched_fake_topo(void)
+{
+#ifdef SCHED_FAKE_TOPOLOGY
+ static struct cpu_top top;
+ static struct cpu_group group;
+
+ top.ct_count = 1;
+ top.ct_group = &group;
+ group.cg_mask = all_cpus;
+ group.cg_count = mp_ncpus;
+ group.cg_children = 0;
+ smp_topology = &top;
+#endif
+}
+#endif
+
+/*
+ * Setup the thread queues and initialize the topology based on MD
+ * information.
+ */
+static void
+sched_setup(void *dummy)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+#ifdef SMP
+ sched_fake_topo();
/*
- * Stagger the group and global load balancer so they do not
- * interfere with each other.
+ * Setup tdqs based on a topology configuration or vanilla SMP based
+ * on mp_maxid.
*/
- bal_tick = ticks + hz;
- if (balance_groups)
- gbal_tick = ticks + (hz / 2);
+ if (smp_topology == NULL)
+ sched_setup_smp();
+ else
+ sched_setup_topology();
+ balance_tdq = tdq;
+ sched_balance();
#else
- kseq_setup(KSEQ_SELF());
+ tdq_setup(tdq);
+ mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE);
+ tdq->tdq_lock = &tdq_lock;
#endif
- mtx_lock_spin(&sched_lock);
- kseq_load_add(KSEQ_SELF(), &kse0);
- mtx_unlock_spin(&sched_lock);
+ /*
+ * To avoid divide-by-zero, we set realstathz to a dummy value in case
+ * sched_clock() is called before sched_initticks().
+ */
+ realstathz = hz;
+ sched_slice = (realstathz/10); /* ~100ms */
+ tickincr = 1 << SCHED_TICK_SHIFT;
+
+ /* Add thread0's load since it's running. */
+ TDQ_LOCK(tdq);
+ thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+ tdq_load_add(tdq, &td_sched0);
+ TDQ_UNLOCK(tdq);
}
/*
- * Scale the scheduling priority according to the "interactivity" of this
- * process.
+ * This routine determines the tickincr after stathz and hz are setup.
*/
+/* ARGSUSED */
static void
-sched_priority(struct ksegrp *kg)
+sched_initticks(void *dummy)
{
- int pri;
+ int incr;
+
+ realstathz = stathz ? stathz : hz;
+ sched_slice = (realstathz/10); /* ~100ms */
+
+ /*
+ * tickincr is shifted out by 10 to avoid rounding errors due to
+ * hz not being evenly divisible by stathz on all platforms.
+ */
+ incr = (hz << SCHED_TICK_SHIFT) / realstathz;
+ /*
+ * This does not work for values of stathz that are more than
+ * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen.
+ */
+ if (incr == 0)
+ incr = 1;
+ tickincr = incr;
+#ifdef SMP
+ /*
+ * Set the default balance interval now that we know
+ * what realstathz is.
+ */
+ balance_interval = realstathz;
+ /*
+ * Set steal thresh to log2(mp_ncpus) but no greater than 4. This
+ * prevents excess thrashing on large machines and excess idle on
+ * smaller machines.
+ */
+ steal_thresh = min(ffs(mp_ncpus) - 1, 4);
+ affinity = SCHED_AFFINITY_DEFAULT;
+#endif
+}
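
As a worked example of the tickincr computation above, assume hz = 1000 and stathz = 128 (both are machine-dependent tunables); SCHED_TICK_SHIFT is taken to be the shift of 10 mentioned in the comment. The fixed-point result preserves the fractional number of hz ticks per stathz tick:

#include <stdio.h>

#define SCHED_TICK_SHIFT 10     /* the shift referred to above */

int
main(void)
{
        int hz = 1000;          /* assumed timer frequency */
        int stathz = 128;       /* assumed statistics clock frequency */
        int realstathz = stathz ? stathz : hz;
        int incr = (hz << SCHED_TICK_SHIFT) / realstathz;

        if (incr == 0)
                incr = 1;
        /* (1000 << 10) / 128 = 8000, i.e. 7.812 hz ticks per stat tick. */
        printf("tickincr = %d (%d.%03d hz ticks per stathz tick)\n",
            incr, incr >> SCHED_TICK_SHIFT,
            ((incr & ((1 << SCHED_TICK_SHIFT) - 1)) * 1000) >> SCHED_TICK_SHIFT);
        return (0);
}
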
- if (kg->kg_pri_class != PRI_TIMESHARE)
- return;
- pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
- pri += SCHED_PRI_BASE;
- pri += kg->kg_proc->p_nice;
+/*
+ * This is the core of the interactivity algorithm. Determines a score based
+ * on past behavior. It is the ratio of sleep time to run time scaled to
+ * a [0, 100] integer. This is the voluntary sleep time of a process, which
+ * differs from the cpu usage because it does not account for time spent
+ * waiting on a run-queue. Would be prettier if we had floating point.
+ */
+static int
+sched_interact_score(struct thread *td)
+{
+ struct td_sched *ts;
+ int div;
- if (pri > PRI_MAX_TIMESHARE)
- pri = PRI_MAX_TIMESHARE;
- else if (pri < PRI_MIN_TIMESHARE)
- pri = PRI_MIN_TIMESHARE;
+ ts = td->td_sched;
+ /*
+ * The score is only needed if this is likely to be an interactive
+ * task. Don't go through the expense of computing it if there's
+ * no chance.
+ */
+ if (sched_interact <= SCHED_INTERACT_HALF &&
+ ts->ts_runtime >= ts->ts_slptime)
+ return (SCHED_INTERACT_HALF);
- kg->kg_user_pri = pri;
+ if (ts->ts_runtime > ts->ts_slptime) {
+ div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
+ return (SCHED_INTERACT_HALF +
+ (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
+ }
+ if (ts->ts_slptime > ts->ts_runtime) {
+ div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
+ return (ts->ts_runtime / div);
+ }
+ /* runtime == slptime */
+ if (ts->ts_runtime)
+ return (SCHED_INTERACT_HALF);
+
+ /*
+ * This can happen if slptime and runtime are 0.
+ */
+ return (0);
- return;
}
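
A small stand-alone sketch of the score arithmetic above, assuming SCHED_INTERACT_HALF is 50, i.e. half of the [0, 100] range the comment describes (the kernel's actual constant is defined earlier in this file):

#include <stdio.h>

#define SCHED_INTERACT_HALF 50  /* assumed: half of the 0-100 score range */

/* Mirrors the ratio arithmetic of sched_interact_score(). */
static int
interact_score(unsigned int runtime, unsigned int slptime)
{
        unsigned int div;

        if (runtime > slptime) {
                div = runtime / SCHED_INTERACT_HALF;
                if (div < 1)
                        div = 1;
                return (SCHED_INTERACT_HALF +
                    (SCHED_INTERACT_HALF - (slptime / div)));
        }
        if (slptime > runtime) {
                div = slptime / SCHED_INTERACT_HALF;
                if (div < 1)
                        div = 1;
                return (runtime / div);
        }
        return (runtime ? SCHED_INTERACT_HALF : 0);
}

int
main(void)
{
        /* A mostly sleeping thread scores low (interactive), a cpu hog high. */
        printf("sleeper: %d\n", interact_score(100, 900));      /* 5  */
        printf("cpu hog: %d\n", interact_score(900, 100));      /* 95 */
        return (0);
}
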
/*
- * Calculate a time slice based on the properties of the kseg and the runq
- * that we're on. This is only for PRI_TIMESHARE ksegrps.
+ * Scale the scheduling priority according to the "interactivity" of this
+ * process.
*/
static void
-sched_slice(struct kse *ke)
+sched_priority(struct thread *td)
{
- struct kseq *kseq;
- struct ksegrp *kg;
-
- kg = ke->ke_ksegrp;
- kseq = KSEQ_CPU(ke->ke_cpu);
+ int score;
+ int pri;
- if (ke->ke_thread->td_flags & TDF_BORROWING) {
- ke->ke_slice = SCHED_SLICE_MIN;
+ if (td->td_pri_class != PRI_TIMESHARE)
return;
- }
-
/*
- * Rationale:
- * KSEs in interactive ksegs get a minimal slice so that we
- * quickly notice if it abuses its advantage.
- *
- * KSEs in non-interactive ksegs are assigned a slice that is
- * based on the ksegs nice value relative to the least nice kseg
- * on the run queue for this cpu.
- *
- * If the KSE is less nice than all others it gets the maximum
- * slice and other KSEs will adjust their slice relative to
- * this when they first expire.
+ * If the score is interactive we place the thread in the realtime
+ * queue with a priority that is less than kernel and interrupt
+ * priorities. These threads are not subject to nice restrictions.
*
- * There is 20 point window that starts relative to the least
- * nice kse on the run queue. Slice size is determined by
- * the kse distance from the last nice ksegrp.
+ * Scores greater than this are placed on the normal timeshare queue
+ * where the priority is partially decided by the most recent cpu
+ * utilization and the rest is decided by nice value.
*
- * If the kse is outside of the window it will get no slice
- * and will be reevaluated each time it is selected on the
- * run queue. The exception to this is nice 0 ksegs when
- * a nice -20 is running. They are always granted a minimum
- * slice.
- */
- if (!SCHED_INTERACTIVE(kg)) {
- int nice;
-
- nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
- if (kseq->ksq_load_timeshare == 0 ||
- kg->kg_proc->p_nice < kseq->ksq_nicemin)
- ke->ke_slice = SCHED_SLICE_MAX;
- else if (nice <= SCHED_SLICE_NTHRESH)
- ke->ke_slice = SCHED_SLICE_NICE(nice);
- else if (kg->kg_proc->p_nice == 0)
- ke->ke_slice = SCHED_SLICE_MIN;
- else
- ke->ke_slice = SCHED_SLICE_MIN; /* 0 */
- } else
- ke->ke_slice = SCHED_SLICE_INTERACTIVE;
+ * The nice value of the process has a linear effect on the calculated
+ * score. Negative nice values make it easier for a thread to be
+ * considered interactive.
+ */
+ score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
+ if (score < sched_interact) {
+ pri = PRI_MIN_REALTIME;
+ pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
+ * score;
+ KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
+ ("sched_priority: invalid interactive priority %d score %d",
+ pri, score));
+ } else {
+ pri = SCHED_PRI_MIN;
+ if (td->td_sched->ts_ticks)
+ pri += SCHED_PRI_TICKS(td->td_sched);
+ pri += SCHED_PRI_NICE(td->td_proc->p_nice);
+ KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
+ ("sched_priority: invalid priority %d: nice %d, "
+ "ticks %d ftick %d ltick %d tick pri %d",
+ pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
+ td->td_sched->ts_ftick, td->td_sched->ts_ltick,
+ SCHED_PRI_TICKS(td->td_sched)));
+ }
+ sched_user_prio(td, pri);
return;
}
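
To see how the interactive branch above maps a score onto the realtime range, here is a hedged sketch using illustrative stand-ins for the priority bounds and the sched_interact threshold (the real values come from sys/priority.h and this file's tunables):

#include <stdio.h>

/* Illustrative stand-ins only. */
#define PRI_MIN_REALTIME 64
#define PRI_MAX_REALTIME 127
static int sched_interact = 30;

int
main(void)
{
        int score;

        /* Scores below sched_interact map linearly onto the realtime range. */
        for (score = 0; score < sched_interact; score += 10) {
                int pri;

                pri = PRI_MIN_REALTIME +
                    ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact) *
                    score;
                printf("score %2d -> pri %d\n", score, pri);
        }
        return (0);
}

Scores at or above the threshold instead fall through to the timeshare calculation in the else branch, where recent cpu usage and nice value decide the priority.
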
/*
* This routine enforces a maximum limit on the amount of scheduling history
- * kept. It is called after either the slptime or runtime is adjusted.
- * This routine will not operate correctly when slp or run times have been
- * adjusted to more than double their maximum.
+ * kept. It is called after either the slptime or runtime is adjusted. This
+ * function is ugly due to integer math.
*/
static void
-sched_interact_update(struct ksegrp *kg)
+sched_interact_update(struct thread *td)
{
- int sum;
+ struct td_sched *ts;
+ u_int sum;
- sum = kg->kg_runtime + kg->kg_slptime;
+ ts = td->td_sched;
+ sum = ts->ts_runtime + ts->ts_slptime;
if (sum < SCHED_SLP_RUN_MAX)
return;
/*
+ * This only happens from two places:
+ * 1) We have added an unusual amount of run time from fork_exit.
+ * 2) We have added an unusual amount of sleep time from sched_sleep().
+ */
+ if (sum > SCHED_SLP_RUN_MAX * 2) {
+ if (ts->ts_runtime > ts->ts_slptime) {
+ ts->ts_runtime = SCHED_SLP_RUN_MAX;
+ ts->ts_slptime = 1;
+ } else {
+ ts->ts_slptime = SCHED_SLP_RUN_MAX;
+ ts->ts_runtime = 1;
+ }
+ return;
+ }
+ /*
* If we have exceeded by more than 1/5th then the algorithm below
* will not bring us back into range. Dividing by two here forces
* us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
*/
if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
- kg->kg_runtime /= 2;
- kg->kg_slptime /= 2;
+ ts->ts_runtime /= 2;
+ ts->ts_slptime /= 2;
return;
}
- kg->kg_runtime = (kg->kg_runtime / 5) * 4;
- kg->kg_slptime = (kg->kg_slptime / 5) * 4;
+ ts->ts_runtime = (ts->ts_runtime / 5) * 4;
+ ts->ts_slptime = (ts->ts_slptime / 5) * 4;
}
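
The clamping above can be exercised in isolation; a sketch with an illustrative SCHED_SLP_RUN_MAX (the kernel derives the real limit from hz elsewhere in this file):

#include <stdio.h>

#define SCHED_SLP_RUN_MAX 1000u /* illustrative history limit */

/* Mirrors the history clamping of sched_interact_update(). */
static void
interact_update(unsigned int *runtime, unsigned int *slptime)
{
        unsigned int sum;

        sum = *runtime + *slptime;
        if (sum < SCHED_SLP_RUN_MAX)
                return;
        if (sum > SCHED_SLP_RUN_MAX * 2) {
                /* Grossly over: keep only the dominant component. */
                if (*runtime > *slptime) {
                        *runtime = SCHED_SLP_RUN_MAX;
                        *slptime = 1;
                } else {
                        *slptime = SCHED_SLP_RUN_MAX;
                        *runtime = 1;
                }
                return;
        }
        if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
                /* More than 1/5th over: halving brings us back in range. */
                *runtime /= 2;
                *slptime /= 2;
                return;
        }
        /* Mildly over: shave 1/5th off both, preserving the ratio. */
        *runtime = (*runtime / 5) * 4;
        *slptime = (*slptime / 5) * 4;
}

int
main(void)
{
        unsigned int run = 300, slp = 800;      /* sum 1100: mildly over */

        interact_update(&run, &slp);
        printf("run %u slp %u\n", run, slp);    /* 240 / 640, ratio kept */
        return (0);
}
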
+/*
+ * Scale back the interactivity history when a child thread is created. The
+ * history is inherited from the parent but the thread may behave totally
+ * differently. For example, a shell spawning a compiler process. We want
+ * to learn that the compiler is behaving badly very quickly.
+ */
static void
-sched_interact_fork(struct ksegrp *kg)
+sched_interact_fork(struct thread *td)
{
int ratio;
int sum;
- sum = kg->kg_runtime + kg->kg_slptime;
+ sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
if (sum > SCHED_SLP_RUN_FORK) {
ratio = sum / SCHED_SLP_RUN_FORK;
- kg->kg_runtime /= ratio;
- kg->kg_slptime /= ratio;
+ td->td_sched->ts_runtime /= ratio;
+ td->td_sched->ts_slptime /= ratio;
}
}
-static int
-sched_interact_score(struct ksegrp *kg)
-{
- int div;
-
- if (kg->kg_runtime > kg->kg_slptime) {
- div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
- return (SCHED_INTERACT_HALF +
- (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
- } if (kg->kg_slptime > kg->kg_runtime) {
- div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
- return (kg->kg_runtime / div);
- }
-
- /*
- * This can happen if slptime and runtime are 0.
- */
- return (0);
-
-}
-
/*
- * Very early in the boot some setup of scheduler-specific
- * parts of proc0 and of some scheduler resources needs to be done.
- * Called from:
- * proc0_init()
+ * Called from proc0_init() to setup the scheduler fields.
*/
void
schedinit(void)
{
+
/*
* Set up the scheduler specific parts of proc0.
*/
proc0.p_sched = NULL; /* XXX */
- ksegrp0.kg_sched = &kg_sched0;
- thread0.td_sched = &kse0;
- kse0.ke_thread = &thread0;
- kse0.ke_state = KES_THREAD;
- kg_sched0.skg_concurrency = 1;
- kg_sched0.skg_avail_opennings = 0; /* we are already running */
+ thread0.td_sched = &td_sched0;
+ td_sched0.ts_ltick = ticks;
+ td_sched0.ts_ftick = ticks;
+ td_sched0.ts_thread = &thread0;
}
/*
* This is only somewhat accurate since given many processes of the same
* priority they will switch when their slices run out, which will be
- * at most SCHED_SLICE_MAX.
+ * at most sched_slice stathz ticks.
*/
int
sched_rr_interval(void)
{
- return (SCHED_SLICE_MAX);
+
+ /* Convert sched_slice to hz */
+ return (hz/(realstathz/sched_slice));
}
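
A quick worked example of that conversion, using illustrative hz and stathz values (the slice of realstathz/10 matches the default set in sched_setup() above):

#include <stdio.h>

int
main(void)
{
        int hz = 1000;                          /* assumed */
        int realstathz = 128;                   /* assumed */
        int sched_slice = realstathz / 10;      /* 12, the ~100ms default */

        /* sched_rr_interval(): 1000 / (128 / 12) = 100 hz ticks ~= 100ms. */
        printf("rr interval = %d hz ticks\n",
            hz / (realstathz / sched_slice));
        return (0);
}
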
+/*
+ * Update the percent cpu tracking information when it is requested or
+ * the total history exceeds the maximum. We keep a sliding history of
+ * tick counts that slowly decays. This is less precise than the 4BSD
+ * mechanism since it happens with less regular and frequent events.
+ */
static void
-sched_pctcpu_update(struct kse *ke)
+sched_pctcpu_update(struct td_sched *ts)
{
+
+ if (ts->ts_ticks == 0)
+ return;
+ if (ticks - (hz / 10) < ts->ts_ltick &&
+ SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
+ return;
/*
* Adjust counters and watermark for pctcpu calc.
*/
- if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
- /*
- * Shift the tick count out so that the divide doesn't
- * round away our results.
- */
- ke->ke_ticks <<= 10;
- ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
- SCHED_CPU_TICKS;
- ke->ke_ticks >>= 10;
- } else
- ke->ke_ticks = 0;
- ke->ke_ltick = ticks;
- ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
+ if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
+ ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
+ SCHED_TICK_TARG;
+ else
+ ts->ts_ticks = 0;
+ ts->ts_ltick = ticks;
+ ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}
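
The rescale above keeps the ratio of charged ticks to elapsed ticks while shrinking the window back to the target size. A sketch with assumed values (a 10-second target at hz = 1000; ts_ticks is treated as already scaled by the tick increment, so integer division does not round it away):

#include <stdio.h>

int
main(void)
{
        int hz = 1000;                  /* assumed timer frequency */
        int tick_targ = hz * 10;        /* assumed 10 second history target */
        int ticks = 50000;              /* current tick counter */
        int ts_ftick = ticks - 15000;   /* window start, 15000 ticks ago */
        int ts_ticks = 6000 << 10;      /* ~6000 hz ticks charged, pre-scaled */

        /* Same rescale as sched_pctcpu_update(): 6000/15000 = 40% cpu over
         * the old window becomes ~4000/10000 over the target window. */
        ts_ticks = (ts_ticks / (ticks - ts_ftick)) * tick_targ;
        printf("rescaled ts_ticks ~= %d hz ticks\n", ts_ticks >> 10);
        return (0);
}
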
-void
+/*
+ * Adjust the priority of a thread. Move it to the appropriate run-queue
+ * if necessary. This is the back-end for several priority related
+ * functions.
+ */
+static void
sched_thread_priority(struct thread *td, u_char prio)
{
- struct kse *ke;
+ struct td_sched *ts;
CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
- ke = td->td_kse;
- mtx_assert(&sched_lock, MA_OWNED);
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority == prio)
return;
- if (TD_ON_RUNQ(td)) {
+
+ if (TD_ON_RUNQ(td) && prio < td->td_priority) {
/*
* If the priority has been elevated due to priority
* propagation, we may have to move ourselves to a new
- * queue. We still call adjustrunqueue below in case kse
- * needs to fix things up.
+ * queue. This could be optimized to not re-add in some
+ * cases.
*/
- if (prio < td->td_priority && ke->ke_runq != NULL &&
- (ke->ke_flags & KEF_ASSIGNED) == 0 &&
- ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
- runq_remove(ke->ke_runq, ke);
- ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
- runq_add(ke->ke_runq, ke, 0);
- }
- /*
- * Hold this kse on this cpu so that sched_prio() doesn't
- * cause excessive migration. We only want migration to
- * happen as the result of a wakeup.
- */
- ke->ke_flags |= KEF_HOLD;
- adjustrunqueue(td, prio);
- ke->ke_flags &= ~KEF_HOLD;
- } else
+ sched_rem(td);
+ td->td_priority = prio;
+ sched_add(td, SRQ_BORROWING);
+ } else {
+#ifdef SMP
+ struct tdq *tdq;
+
+ tdq = TDQ_CPU(ts->ts_cpu);
+ if (prio < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = prio;
+#endif
td->td_priority = prio;
+ }
}
/*
@@ -1278,7 +1656,7 @@
if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
td->td_base_pri <= PRI_MAX_TIMESHARE)
- base_pri = td->td_ksegrp->kg_user_pri;
+ base_pri = td->td_user_pri;
else
base_pri = td->td_base_pri;
if (prio >= base_pri) {
@@ -1288,6 +1666,9 @@
sched_lend_prio(td, prio);
}
+/*
+ * Standard entry for setting the priority to an absolute value.
+ */
void
sched_prio(struct thread *td, u_char prio)
{
@@ -1315,157 +1696,293 @@
turnstile_adjust(td, oldprio);
}
+/*
+ * Set the base user priority; does not affect the current running priority.
+ */
void
-sched_switch(struct thread *td, struct thread *newtd, int flags)
+sched_user_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ td->td_base_user_pri = prio;
+ if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
+ return;
+ oldprio = td->td_user_pri;
+ td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
{
- struct kseq *ksq;
- struct kse *ke;
+ u_char oldprio;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_flags |= TDF_UBORROWING;
+ oldprio = td->td_user_pri;
+ td->td_user_pri = prio;
+}
- ke = td->td_kse;
- ksq = KSEQ_SELF();
+void
+sched_unlend_user_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ base_pri = td->td_base_user_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_UBORROWING;
+ sched_user_prio(td, base_pri);
+ } else {
+ sched_lend_user_prio(td, prio);
+ }
+}
+
+/*
+ * Add the thread passed as 'newtd' to the run queue before selecting
+ * the next thread to run. This is only used for KSE.
+ */
+static void
+sched_switchin(struct tdq *tdq, struct thread *td)
+{
+#ifdef SMP
+ spinlock_enter();
+ TDQ_UNLOCK(tdq);
+ thread_lock(td);
+ spinlock_exit();
+ sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
+#else
+ td->td_lock = TDQ_LOCKPTR(tdq);
+#endif
+ tdq_add(tdq, td, SRQ_YIELDING);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+}
+
+/*
+ * Handle migration from sched_switch(). This happens only for
+ * cpu binding.
+ */
+static struct mtx *
+sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
+{
+ struct tdq *tdn;
+
+ tdn = TDQ_CPU(td->td_sched->ts_cpu);
+#ifdef SMP
+ /*
+ * Do the lock dance required to avoid LOR. We grab an extra
+ * spinlock nesting to prevent preemption while we're
+ * not holding either run-queue lock.
+ */
+ spinlock_enter();
+ thread_block_switch(td); /* This releases the lock on tdq. */
+ TDQ_LOCK(tdn);
+ tdq_add(tdn, td, flags);
+ tdq_notify(td->td_sched);
+ /*
+ * After we unlock tdn the new cpu still can't switch into this
+ * thread until we've unblocked it in cpu_switch(). The lock
+ * pointers may match in the case of HTT cores. Don't unlock here
+ * or we can deadlock when the other CPU runs the IPI handler.
+ */
+ if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
+ TDQ_UNLOCK(tdn);
+ TDQ_LOCK(tdq);
+ }
+ spinlock_exit();
+#endif
+ return (TDQ_LOCKPTR(tdn));
+}
+
+/*
+ * Block a thread for switching. Similar to thread_block() but does not
+ * bump the spin count.
+ */
+static inline struct mtx *
+thread_block_switch(struct thread *td)
+{
+ struct mtx *lock;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = &blocked_lock;
+ mtx_unlock_spin(lock);
+
+ return (lock);
+}
+
+/*
+ * Release a thread that was blocked with thread_block_switch().
+ */
+static inline void
+thread_unblock_switch(struct thread *td, struct mtx *mtx)
+{
+ atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
+ (uintptr_t)mtx);
+}
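
The td_lock handoff that thread_block_switch() and thread_unblock_switch() implement is easier to see outside the kernel. Below is a minimal userland sketch of the same pattern, assuming a sentinel "blocked" lock and C11 atomics in place of the kernel's blocked_lock and atomic_store_rel_ptr(); it is illustrative only and not part of the diff.

#include <stdatomic.h>
#include <stdio.h>

struct model_lock { const char *name; };

static struct model_lock blocked_lock = { "blocked" };
static struct model_lock runq_lock = { "runq" };

struct model_thread {
    _Atomic(struct model_lock *) td_lock;
};

static struct model_lock *
model_block_switch(struct model_thread *td)
{
    struct model_lock *lock;

    lock = atomic_load(&td->td_lock);
    /* Park the thread on the sentinel while it is between queues. */
    atomic_store(&td->td_lock, &blocked_lock);
    return (lock);
}

static void
model_unblock_switch(struct model_thread *td, struct model_lock *mtx)
{
    /* Release store so a spinning observer sees a fully set-up thread. */
    atomic_store_explicit(&td->td_lock, mtx, memory_order_release);
}

int
main(void)
{
    struct model_thread td;
    struct model_lock *old;

    atomic_init(&td.td_lock, &runq_lock);
    old = model_block_switch(&td);
    printf("parked; previous lock was %s\n", old->name);
    model_unblock_switch(&td, old);
    printf("restored to %s\n", atomic_load(&td.td_lock)->name);
    return (0);
}
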
+
+/*
+ * Switch threads. This function has to handle threads coming in while
+ * blocked for some reason, running, or idle. It also must deal with
+ * migrating a thread from one queue to another as running threads may
+ * be assigned elsewhere via binding.
+ */
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+ struct tdq *tdq;
+ struct td_sched *ts;
+ struct mtx *mtx;
+ int srqflag;
+ int cpuid;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ ts = td->td_sched;
+ mtx = td->td_lock;
+#ifdef SMP
+ ts->ts_rltick = ticks;
+ if (newtd && newtd->td_priority < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = newtd->td_priority;
+#endif
td->td_lastcpu = td->td_oncpu;
td->td_oncpu = NOCPU;
td->td_flags &= ~TDF_NEEDRESCHED;
td->td_owepreempt = 0;
-
/*
- * If the KSE has been assigned it may be in the process of switching
- * to the new cpu. This is the case in sched_bind().
+ * The lock pointer in an idle thread should never change. Reset it
+ * to CAN_RUN as well.
*/
- if (td == PCPU_GET(idlethread)) {
+ if (TD_IS_IDLETHREAD(td)) {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
TD_SET_CAN_RUN(td);
- } else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
- /* We are ending our run so make our slot available again */
- SLOT_RELEASE(td->td_ksegrp);
- kseq_load_rem(ksq, ke);
- if (TD_IS_RUNNING(td)) {
- /*
- * Don't allow the thread to migrate
- * from a preemption.
- */
- ke->ke_flags |= KEF_HOLD;
- setrunqueue(td, (flags & SW_PREEMPT) ?
- SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
- SRQ_OURSELF|SRQ_YIELDING);
- ke->ke_flags &= ~KEF_HOLD;
- } else if ((td->td_proc->p_flag & P_HADTHREADS) &&
- (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
- /*
- * We will not be on the run queue.
- * So we must be sleeping or similar.
- * Don't use the slot if we will need it
- * for newtd.
- */
- slot_fill(td->td_ksegrp);
+ } else if (TD_IS_RUNNING(td)) {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ tdq_load_rem(tdq, ts);
+ srqflag = (flags & SW_PREEMPT) ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING;
+ if (ts->ts_cpu == cpuid)
+ tdq_add(tdq, td, srqflag);
+ else
+ mtx = sched_switch_migrate(tdq, td, srqflag);
+ } else {
+ /* This thread must be going to sleep. */
+ TDQ_LOCK(tdq);
+ mtx = thread_block_switch(td);
+ tdq_load_rem(tdq, ts);
}
- if (newtd != NULL) {
- /*
- * If we bring in a thread account for it as if it had been
- * added to the run queue and then chosen.
- */
- newtd->td_kse->ke_flags |= KEF_DIDRUN;
- newtd->td_kse->ke_runq = ksq->ksq_curr;
- TD_SET_RUNNING(newtd);
- kseq_load_add(KSEQ_SELF(), newtd->td_kse);
- /*
- * XXX When we preempt, we've already consumed a slot because
- * we got here through sched_add(). However, newtd can come
- * from thread_switchout() which can't SLOT_USE() because
- * the SLOT code is scheduler dependent. We must use the
- * slot here otherwise.
- */
- if ((flags & SW_PREEMPT) == 0)
- SLOT_USE(newtd->td_ksegrp);
- } else
- newtd = choosethread();
+ /*
+ * We enter here with the thread blocked and assigned to the
+ * appropriate cpu run-queue or sleep-queue and with the current
+ * thread-queue locked.
+ */
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * If KSE assigned a new thread just add it here and let choosethread
+ * select the best one.
+ */
+ if (newtd != NULL)
+ sched_switchin(tdq, newtd);
+ newtd = choosethread();
+ /*
+ * Call the MD code to switch contexts if necessary.
+ */
if (td != newtd) {
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
- cpu_switch(td, newtd);
+ TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+ cpu_switch(td, newtd, mtx);
+ /*
+ * We may return from cpu_switch on a different cpu. However,
+ * we always return with td_lock pointing to the current cpu's
+ * run queue lock.
+ */
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
- }
-
- sched_lock.mtx_lock = (uintptr_t)td;
-
- td->td_oncpu = PCPU_GET(cpuid);
+ } else
+ thread_unblock_switch(td, mtx);
+ /*
+ * Assert that all went well and return.
+ */
+#ifdef SMP
+ /* We should always get here with the lowest priority td possible */
+ tdq->tdq_lowpri = td->td_priority;
+#endif
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ td->td_oncpu = cpuid;
}
+/*
+ * Adjust thread priorities as a result of a nice request.
+ */
void
sched_nice(struct proc *p, int nice)
{
- struct ksegrp *kg;
- struct kse *ke;
struct thread *td;
- struct kseq *kseq;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
- /*
- * We need to adjust the nice counts for running KSEs.
- */
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- if (kg->kg_pri_class == PRI_TIMESHARE) {
- FOREACH_THREAD_IN_GROUP(kg, td) {
- ke = td->td_kse;
- if (ke->ke_runq == NULL)
- continue;
- kseq = KSEQ_CPU(ke->ke_cpu);
- kseq_nice_rem(kseq, p->p_nice);
- kseq_nice_add(kseq, nice);
- }
- }
- }
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
p->p_nice = nice;
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- sched_priority(kg);
- FOREACH_THREAD_IN_GROUP(kg, td)
- td->td_flags |= TDF_NEEDRESCHED;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ sched_priority(td);
+ sched_prio(td, td->td_base_user_pri);
+ thread_unlock(td);
}
}
+/*
+ * Record the sleep time for the interactivity scorer.
+ */
void
sched_sleep(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
- td->td_slptime = ticks;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ td->td_slptick = ticks;
}
+/*
+ * Schedule a thread to resume execution and record how long it voluntarily
+ * slept. We also update the pctcpu, interactivity, and priority.
+ */
void
sched_wakeup(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ struct td_sched *ts;
+ int slptick;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
/*
- * Let the kseg know how long we slept for. This is because process
- * interactivity behavior is modeled in the kseg.
+ * If we slept for more than a tick update our interactivity and
+ * priority.
*/
- if (td->td_slptime) {
- struct ksegrp *kg;
- int hzticks;
-
- kg = td->td_ksegrp;
- hzticks = (ticks - td->td_slptime) << 10;
- if (hzticks >= SCHED_SLP_RUN_MAX) {
- kg->kg_slptime = SCHED_SLP_RUN_MAX;
- kg->kg_runtime = 1;
- } else {
- kg->kg_slptime += hzticks;
- sched_interact_update(kg);
- }
- sched_priority(kg);
- sched_slice(td->td_kse);
- td->td_slptime = 0;
- }
- setrunqueue(td, SRQ_BORING);
+ slptick = td->td_slptick;
+ td->td_slptick = 0;
+ if (slptick && slptick != ticks) {
+ u_int hzticks;
+
+ hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
+ ts->ts_slptime += hzticks;
+ sched_interact_update(td);
+ sched_pctcpu_update(ts);
+ sched_priority(td);
+ }
+ /* Reset the slice value after we sleep. */
+ ts->ts_slice = sched_slice;
+ sched_add(td, SRQ_BORING);
}
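
The sleep-time bookkeeping in sched_wakeup() boils down to a shift-scaled difference of tick counters. The sketch below is a standalone userland model of that calculation; the shift value of 10 is an assumption standing in for SCHED_TICK_SHIFT, and only the shape of the arithmetic is meant to carry over.

#include <stdio.h>

#define MODEL_TICK_SHIFT 10    /* assumption in place of SCHED_TICK_SHIFT */

int
main(void)
{
    int ticks = 5000;          /* current tick counter, like 'ticks' */
    int slptick = 4900;        /* tick recorded by sched_sleep() */
    unsigned int slptime = 0;  /* accumulated scaled sleep time */

    if (slptick != 0 && slptick != ticks) {
        /* Sleep time is kept shifted left to preserve precision. */
        unsigned int hzticks = (ticks - slptick) << MODEL_TICK_SHIFT;

        slptime += hzticks;
    }
    printf("slept %d ticks -> %u scaled units of sleep time\n",
        ticks - slptick, slptime);
    return (0);
}
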
/*
@@ -1473,495 +1990,566 @@
* priority.
*/
void
-sched_fork(struct thread *td, struct thread *childtd)
-{
-
- mtx_assert(&sched_lock, MA_OWNED);
-
- sched_fork_ksegrp(td, childtd->td_ksegrp);
- sched_fork_thread(td, childtd);
-}
-
-void
-sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
+sched_fork(struct thread *td, struct thread *child)
{
- struct ksegrp *kg = td->td_ksegrp;
- mtx_assert(&sched_lock, MA_OWNED);
-
- child->kg_slptime = kg->kg_slptime;
- child->kg_runtime = kg->kg_runtime;
- child->kg_user_pri = kg->kg_user_pri;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sched_fork_thread(td, child);
+ /*
+ * Penalize the parent and child for forking.
+ */
sched_interact_fork(child);
- kg->kg_runtime += tickincr << 10;
- sched_interact_update(kg);
+ sched_priority(child);
+ td->td_sched->ts_runtime += tickincr;
+ sched_interact_update(td);
+ sched_priority(td);
}
+/*
+ * Fork a new thread, may be within the same process.
+ */
void
sched_fork_thread(struct thread *td, struct thread *child)
{
- struct kse *ke;
- struct kse *ke2;
+ struct td_sched *ts;
+ struct td_sched *ts2;
+ /*
+ * Initialize child.
+ */
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
sched_newthread(child);
- ke = td->td_kse;
- ke2 = child->td_kse;
- ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */
- ke2->ke_cpu = ke->ke_cpu;
- ke2->ke_runq = NULL;
-
- /* Grab our parents cpu estimation information. */
- ke2->ke_ticks = ke->ke_ticks;
- ke2->ke_ltick = ke->ke_ltick;
- ke2->ke_ftick = ke->ke_ftick;
+ child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
+ ts = td->td_sched;
+ ts2 = child->td_sched;
+ ts2->ts_cpu = ts->ts_cpu;
+ ts2->ts_runq = NULL;
+ /*
+ * Grab our parent's cpu estimation information and priority.
+ */
+ ts2->ts_ticks = ts->ts_ticks;
+ ts2->ts_ltick = ts->ts_ltick;
+ ts2->ts_ftick = ts->ts_ftick;
+ child->td_user_pri = td->td_user_pri;
+ child->td_base_user_pri = td->td_base_user_pri;
+ /*
+ * And update interactivity score.
+ */
+ ts2->ts_slptime = ts->ts_slptime;
+ ts2->ts_runtime = ts->ts_runtime;
+ ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */
}
+/*
+ * Adjust the priority class of a thread.
+ */
void
-sched_class(struct ksegrp *kg, int class)
+sched_class(struct thread *td, int class)
{
- struct kseq *kseq;
- struct kse *ke;
- struct thread *td;
- int nclass;
- int oclass;
- mtx_assert(&sched_lock, MA_OWNED);
- if (kg->kg_pri_class == class)
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_pri_class == class)
return;
- nclass = PRI_BASE(class);
- oclass = PRI_BASE(kg->kg_pri_class);
- FOREACH_THREAD_IN_GROUP(kg, td) {
- ke = td->td_kse;
- if ((ke->ke_state != KES_ONRUNQ &&
- ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
- continue;
- kseq = KSEQ_CPU(ke->ke_cpu);
-
#ifdef SMP
- /*
- * On SMP if we're on the RUNQ we must adjust the transferable
- * count because could be changing to or from an interrupt
- * class.
- */
- if (ke->ke_state == KES_ONRUNQ) {
- if (KSE_CAN_MIGRATE(ke)) {
- kseq->ksq_transferable--;
- kseq->ksq_group->ksg_transferable--;
- }
- if (KSE_CAN_MIGRATE(ke)) {
- kseq->ksq_transferable++;
- kseq->ksq_group->ksg_transferable++;
- }
- }
-#endif
- if (oclass == PRI_TIMESHARE) {
- kseq->ksq_load_timeshare--;
- kseq_nice_rem(kseq, kg->kg_proc->p_nice);
+ /*
+ * On SMP if we're on the RUNQ we must adjust the transferable
+ * count because we could be changing to or from an interrupt
+ * class.
+ */
+ if (TD_ON_RUNQ(td)) {
+ struct tdq *tdq;
+
+ tdq = TDQ_CPU(td->td_sched->ts_cpu);
+ if (THREAD_CAN_MIGRATE(td)) {
+ tdq->tdq_transferable--;
+ tdq->tdq_group->tdg_transferable--;
}
- if (nclass == PRI_TIMESHARE) {
- kseq->ksq_load_timeshare++;
- kseq_nice_add(kseq, kg->kg_proc->p_nice);
+ td->td_pri_class = class;
+ if (THREAD_CAN_MIGRATE(td)) {
+ tdq->tdq_transferable++;
+ tdq->tdq_group->tdg_transferable++;
}
}
-
- kg->kg_pri_class = class;
+#endif
+ td->td_pri_class = class;
}
/*
* Return some of the child's priority and interactivity to the parent.
*/
void
-sched_exit(struct proc *p, struct thread *childtd)
+sched_exit(struct proc *p, struct thread *child)
{
- mtx_assert(&sched_lock, MA_OWNED);
- sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
- sched_exit_thread(NULL, childtd);
+ struct thread *td;
+
+ CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
+ child, child->td_proc->p_comm, child->td_priority);
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ td = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td, child);
}
+/*
+ * Penalize another thread for the time spent on this one. This helps to
+ * worsen the priority and interactivity of processes which schedule batch
+ * jobs such as make. This has little effect on the make process itself but
+ * causes new processes spawned by it to receive worse scores immediately.
+ */
void
-sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
+sched_exit_thread(struct thread *td, struct thread *child)
{
- /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
- kg->kg_runtime += td->td_ksegrp->kg_runtime;
- sched_interact_update(kg);
+
+ CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
+ child, child->td_proc->p_comm, child->td_priority);
+
+#ifdef KSE
+ /*
+ * KSE forks and exits so often that this penalty causes short-lived
+ * threads to always be non-interactive. This causes mozilla to
+ * crawl under load.
+ */
+ if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
+ return;
+#endif
+ /*
+ * Give the child's runtime to the parent without returning the
+ * sleep time as a penalty to the parent. This causes shells that
+ * launch expensive things to mark their children as expensive.
+ */
+ thread_lock(td);
+ td->td_sched->ts_runtime += child->td_sched->ts_runtime;
+ sched_interact_update(td);
+ sched_priority(td);
+ thread_unlock(td);
}
+/*
+ * Fix priorities on return to user-space. Priorities may be elevated due
+ * to static priorities in msleep() or similar.
+ */
void
-sched_exit_thread(struct thread *td, struct thread *childtd)
+sched_userret(struct thread *td)
{
- CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
- childtd, childtd->td_proc->p_comm, childtd->td_priority);
- kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
+ /*
+ * XXX we cheat slightly on the locking here to avoid locking in
+ * the usual case. Setting td_priority here is essentially an
+ * incomplete workaround for not setting it properly elsewhere.
+ * Now that some interrupt handlers are threads, not setting it
+ * properly elsewhere can clobber it in the window between setting
+ * it here and returning to user mode, so don't waste time setting
+ * it perfectly here.
+ */
+ KASSERT((td->td_flags & TDF_BORROWING) == 0,
+ ("thread with borrowed priority returning to userland"));
+ if (td->td_priority != td->td_user_pri) {
+ thread_lock(td);
+ td->td_priority = td->td_user_pri;
+ td->td_base_pri = td->td_user_pri;
+ thread_unlock(td);
+ }
}
+/*
+ * Handle a stathz tick. This is really only relevant for timeshare
+ * threads.
+ */
void
sched_clock(struct thread *td)
{
- struct kseq *kseq;
- struct ksegrp *kg;
- struct kse *ke;
-
- mtx_assert(&sched_lock, MA_OWNED);
- kseq = KSEQ_SELF();
-#ifdef SMP
- if (ticks >= bal_tick)
- sched_balance();
- if (ticks >= gbal_tick && balance_groups)
- sched_balance_groups();
+ struct tdq *tdq;
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ tdq = TDQ_SELF();
+#ifdef SMP
/*
- * We could have been assigned a non real-time thread without an
- * IPI.
+ * We run the long term load balancer infrequently on the first cpu.
*/
- if (kseq->ksq_assigned)
- kseq_assign(kseq); /* Potentially sets NEEDRESCHED */
+ if (balance_tdq == tdq) {
+ if (balance_ticks && --balance_ticks == 0)
+ sched_balance();
+ if (balance_group_ticks && --balance_group_ticks == 0)
+ sched_balance_groups();
+ }
#endif
/*
- * sched_setup() apparently happens prior to stathz being set. We
- * need to resolve the timers earlier in the boot so we can avoid
- * calculating this here.
+ * Advance the insert index once for each tick to ensure that all
+ * threads get a chance to run.
*/
- if (realstathz == 0) {
- realstathz = stathz ? stathz : hz;
- tickincr = hz / realstathz;
- /*
- * XXX This does not work for values of stathz that are much
- * larger than hz.
- */
- if (tickincr == 0)
- tickincr = 1;
+ if (tdq->tdq_idx == tdq->tdq_ridx) {
+ tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
+ if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
+ tdq->tdq_ridx = tdq->tdq_idx;
}
-
- ke = td->td_kse;
- kg = ke->ke_ksegrp;
-
- /* Adjust ticks for pctcpu */
- ke->ke_ticks++;
- ke->ke_ltick = ticks;
-
- /* Go up to one second beyond our max and then trim back down */
- if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
- sched_pctcpu_update(ke);
-
- if (td->td_flags & TDF_IDLETD)
- return;
+ ts = td->td_sched;
/*
- * We only do slicing code for TIMESHARE ksegrps.
+ * We only do slicing code for TIMESHARE threads.
*/
- if (kg->kg_pri_class != PRI_TIMESHARE)
+ if (td->td_pri_class != PRI_TIMESHARE)
return;
/*
- * We used a tick charge it to the ksegrp so that we can compute our
+ * We used a tick; charge it to the thread so that we can compute our
* interactivity.
*/
- kg->kg_runtime += tickincr << 10;
- sched_interact_update(kg);
-
+ td->td_sched->ts_runtime += tickincr;
+ sched_interact_update(td);
/*
* We used up one time slice.
*/
- if (--ke->ke_slice > 0)
+ if (--ts->ts_slice > 0)
return;
/*
* We're out of time, recompute priorities and requeue.
*/
- kseq_load_rem(kseq, ke);
- sched_priority(kg);
- sched_slice(ke);
- if (SCHED_CURR(kg, ke))
- ke->ke_runq = kseq->ksq_curr;
- else
- ke->ke_runq = kseq->ksq_next;
- kseq_load_add(kseq, ke);
+ sched_priority(td);
td->td_flags |= TDF_NEEDRESCHED;
}
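
The insert/removal index pair advanced at the end of sched_clock() implements a small calendar queue: tdq_idx moves once per tick, and tdq_ridx follows only when its queue has drained, so every queued thread eventually reaches the removal index. Below is a minimal userland model of that arithmetic, with a simple per-queue count standing in for the real TAILQs; it is illustrative only.

#include <stdio.h>

#define RQ_NQS 64                  /* number of run queues, as in <sys/runq.h> */

struct model_tdq {
    int idx;                       /* insert index, like tdq_idx */
    int ridx;                      /* removal index, like tdq_ridx */
    int qlen[RQ_NQS];              /* threads queued at each index */
};

static void
model_clock_tick(struct model_tdq *tdq)
{
    /* Advance the insert index once per tick... */
    if (tdq->idx == tdq->ridx) {
        tdq->idx = (tdq->idx + 1) % RQ_NQS;
        /* ...and drag the removal index along only once its queue drains. */
        if (tdq->qlen[tdq->ridx] == 0)
            tdq->ridx = tdq->idx;
    }
}

int
main(void)
{
    struct model_tdq tdq = { 0 };

    /* With nothing queued both indices advance together each tick. */
    for (int tick = 0; tick < 2; tick++) {
        model_clock_tick(&tdq);
        printf("tick %d: idx=%d ridx=%d\n", tick, tdq.idx, tdq.ridx);
    }
    /* Once something sits at ridx, the removal index waits for it. */
    tdq.qlen[tdq.ridx] = 1;
    model_clock_tick(&tdq);
    printf("queued: idx=%d ridx=%d\n", tdq.idx, tdq.ridx);
    return (0);
}
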
+/*
+ * Called once per hz tick. Used for cpu utilization information. This
+ * is easier than trying to scale based on stathz.
+ */
+void
+sched_tick(void)
+{
+ struct td_sched *ts;
+
+ ts = curthread->td_sched;
+ /* Adjust ticks for pctcpu */
+ ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
+ ts->ts_ltick = ticks;
+ /*
+ * Update if we've exceeded our desired tick threshold by over one
+ * second.
+ */
+ if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
+ sched_pctcpu_update(ts);
+}
+
+/*
+ * Return whether the current CPU has runnable tasks. Used for in-kernel
+ * cooperative idle threads.
+ */
int
sched_runnable(void)
{
- struct kseq *kseq;
+ struct tdq *tdq;
int load;
load = 1;
- kseq = KSEQ_SELF();
-#ifdef SMP
- if (kseq->ksq_assigned) {
- mtx_lock_spin(&sched_lock);
- kseq_assign(kseq);
- mtx_unlock_spin(&sched_lock);
- }
-#endif
+ tdq = TDQ_SELF();
if ((curthread->td_flags & TDF_IDLETD) != 0) {
- if (kseq->ksq_load > 0)
+ if (tdq->tdq_load > 0)
goto out;
} else
- if (kseq->ksq_load - 1 > 0)
+ if (tdq->tdq_load - 1 > 0)
goto out;
load = 0;
out:
return (load);
}
-void
-sched_userret(struct thread *td)
+/*
+ * Choose the highest priority thread to run. The thread is removed from
+ * the run-queue while running however the load remains. For SMP we set
+ * the tdq in the global idle bitmask if it idles here.
+ */
+struct thread *
+sched_choose(void)
{
- struct ksegrp *kg;
+#ifdef SMP
+ struct tdq_group *tdg;
+#endif
+ struct td_sched *ts;
+ struct tdq *tdq;
- KASSERT((td->td_flags & TDF_BORROWING) == 0,
- ("thread with borrowed priority returning to userland"));
- kg = td->td_ksegrp;
- if (td->td_priority != kg->kg_user_pri) {
- mtx_lock_spin(&sched_lock);
- td->td_priority = kg->kg_user_pri;
- td->td_base_pri = kg->kg_user_pri;
- mtx_unlock_spin(&sched_lock);
+ tdq = TDQ_SELF();
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ ts = tdq_choose(tdq);
+ if (ts) {
+ tdq_runq_rem(tdq, ts);
+ return (ts->ts_thread);
}
+#ifdef SMP
+ /*
+ * We only set the idled bit when all of the cpus in the group are
+ * idle. Otherwise we could get into a situation where a thread bounces
+ * back and forth between two idle cores on separate physical CPUs.
+ */
+ tdg = tdq->tdq_group;
+ tdg->tdg_idlemask |= PCPU_GET(cpumask);
+ if (tdg->tdg_idlemask == tdg->tdg_cpumask)
+ atomic_set_int(&tdq_idle, tdg->tdg_mask);
+ tdq->tdq_lowpri = PRI_MAX_IDLE;
+#endif
+ return (PCPU_GET(idlethread));
}
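
The group and global idle masks touched here (and cleared again in tdq_add() below) follow a simple rule: a cpu sets its bit in its group's idle mask, and the group is published in the global mask only when every cpu in the group is idle. The following userland sketch models just that bookkeeping; the two-cpu group is an arbitrary example and the mask layout is a stand-in for the real tdq_group fields.

#include <stdio.h>

struct model_group {
    unsigned int cpumask;          /* all cpus in this group */
    unsigned int idlemask;         /* cpus of the group currently idle */
};

static unsigned int global_idle;   /* one bit per group, like tdq_idle */

static void
model_cpu_goes_idle(struct model_group *g, int cpu, int groupbit)
{
    g->idlemask |= 1u << cpu;
    if (g->idlemask == g->cpumask)
        global_idle |= 1u << groupbit;     /* whole group is idle now */
}

static void
model_cpu_gets_work(struct model_group *g, int cpu, int groupbit)
{
    if (g->idlemask == g->cpumask)
        global_idle &= ~(1u << groupbit);  /* group no longer fully idle */
    g->idlemask &= ~(1u << cpu);
}

int
main(void)
{
    struct model_group g = { 0x3, 0 };     /* cpus 0 and 1, e.g. an HTT pair */

    model_cpu_goes_idle(&g, 0, 0);
    printf("cpu0 idle: group=%#x global=%#x\n", g.idlemask, global_idle);
    model_cpu_goes_idle(&g, 1, 0);
    printf("cpu1 idle: group=%#x global=%#x\n", g.idlemask, global_idle);
    model_cpu_gets_work(&g, 0, 0);
    printf("cpu0 busy: group=%#x global=%#x\n", g.idlemask, global_idle);
    return (0);
}
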
-struct kse *
-sched_choose(void)
+/*
+ * Set owepreempt if necessary. Preemption never happens directly in ULE,
+ * we always request it once we exit a critical section.
+ */
+static inline void
+sched_setpreempt(struct thread *td)
{
- struct kseq *kseq;
- struct kse *ke;
+ struct thread *ctd;
+ int cpri;
+ int pri;
+
+ ctd = curthread;
+ pri = td->td_priority;
+ cpri = ctd->td_priority;
+ if (td->td_priority < ctd->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
+ if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
+ return;
+ /*
+ * Always preempt IDLE threads. Otherwise, preempt only if the preempting
+ * thread is an ithread.
+ */
+ if (pri > preempt_thresh && cpri < PRI_MIN_IDLE)
+ return;
+ ctd->td_owepreempt = 1;
+ return;
+}
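
The decision sched_setpreempt() makes can be read as two tests: the new thread must be more important than what is running, and either the running thread is idle-class or the new thread's priority is at least as good as the preemption threshold (numerically at or below it). The sketch below models that logic in userland; MODEL_PREEMPT_THRESH and MODEL_PRI_MIN_IDLE are stand-in values, not the kernel's constants, and lower numbers mean higher priority as in the kernel.

#include <stdio.h>

#define MODEL_PREEMPT_THRESH  80   /* assumption, roughly "interrupt threads" */
#define MODEL_PRI_MIN_IDLE   224   /* assumption, start of the idle range */

static int
model_should_preempt(int pri, int cpri)
{
    if (pri >= cpri)
        return (0);            /* not more important than what's running */
    if (pri > MODEL_PREEMPT_THRESH && cpri < MODEL_PRI_MIN_IDLE)
        return (0);            /* not urgent enough, and the cpu isn't idle */
    return (1);                /* request preemption on critical_exit() */
}

int
main(void)
{
    printf("ithread vs timeshare:   %d\n", model_should_preempt(40, 180));
    printf("timeshare vs timeshare: %d\n", model_should_preempt(150, 180));
    printf("timeshare vs idle:      %d\n", model_should_preempt(150, 240));
    return (0);
}
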
- mtx_assert(&sched_lock, MA_OWNED);
- kseq = KSEQ_SELF();
+/*
+ * Add a thread to a thread queue. Initializes priority, slice, runq, and
+ * add it to the appropriate queue. This is the internal function called
+ * when the tdq is predetermined.
+ */
+void
+tdq_add(struct tdq *tdq, struct thread *td, int flags)
+{
+ struct td_sched *ts;
+ int class;
#ifdef SMP
-restart:
- if (kseq->ksq_assigned)
- kseq_assign(kseq);
+ int cpumask;
#endif
- ke = kseq_choose(kseq);
- if (ke) {
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+
+ ts = td->td_sched;
+ class = PRI_BASE(td->td_pri_class);
+ TD_SET_RUNQ(td);
+ if (ts->ts_slice == 0)
+ ts->ts_slice = sched_slice;
+ /*
+ * Pick the run queue based on priority.
+ */
+ if (td->td_priority <= PRI_MAX_REALTIME)
+ ts->ts_runq = &tdq->tdq_realtime;
+ else if (td->td_priority <= PRI_MAX_TIMESHARE)
+ ts->ts_runq = &tdq->tdq_timeshare;
+ else
+ ts->ts_runq = &tdq->tdq_idle;
#ifdef SMP
- if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
- if (kseq_idled(kseq) == 0)
- goto restart;
-#endif
- kseq_runq_rem(kseq, ke);
- ke->ke_state = KES_THREAD;
- ke->ke_flags &= ~KEF_PREEMPTED;
- return (ke);
+ cpumask = 1 << ts->ts_cpu;
+ /*
+ * If we had been idle, clear our bit in the group and potentially
+ * the global bitmap.
+ */
+ if ((class != PRI_IDLE && class != PRI_ITHD) &&
+ (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
+ /*
+ * Check to see if our group is unidling, and if so, remove it
+ * from the global idle mask.
+ */
+ if (tdq->tdq_group->tdg_idlemask ==
+ tdq->tdq_group->tdg_cpumask)
+ atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
+ /*
+ * Now remove ourselves from the group specific idle mask.
+ */
+ tdq->tdq_group->tdg_idlemask &= ~cpumask;
}
-#ifdef SMP
- if (kseq_idled(kseq) == 0)
- goto restart;
+ if (td->td_priority < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = td->td_priority;
#endif
- return (NULL);
+ tdq_runq_add(tdq, ts, flags);
+ tdq_load_add(tdq, ts);
}
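
The run-queue choice in tdq_add() is a straight priority-band lookup. Below is a userland sketch of the same selection; the two boundary values are illustrative stand-ins for PRI_MAX_REALTIME and PRI_MAX_TIMESHARE rather than the real constants from <sys/priority.h>.

#include <stdio.h>

#define MODEL_PRI_MAX_REALTIME   155   /* assumption, illustrative only */
#define MODEL_PRI_MAX_TIMESHARE  223   /* assumption, illustrative only */

static const char *
pick_runq(int priority)
{
    if (priority <= MODEL_PRI_MAX_REALTIME)
        return ("tdq_realtime");
    else if (priority <= MODEL_PRI_MAX_TIMESHARE)
        return ("tdq_timeshare");
    return ("tdq_idle");
}

int
main(void)
{
    int prios[] = { 100, 180, 250 };

    for (int i = 0; i < 3; i++)
        printf("priority %d -> %s\n", prios[i], pick_runq(prios[i]));
    return (0);
}
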
+/*
+ * Select the target thread queue and add a thread to it. Request
+ * preemption or IPI a remote processor if required.
+ */
void
sched_add(struct thread *td, int flags)
{
- struct kseq *kseq;
- struct ksegrp *kg;
- struct kse *ke;
- int preemptive;
- int canmigrate;
- int class;
-
+ struct td_sched *ts;
+ struct tdq *tdq;
+#ifdef SMP
+ int cpuid;
+ int cpu;
+#endif
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
- ke = td->td_kse;
- kg = td->td_ksegrp;
- canmigrate = 1;
- preemptive = !(flags & SRQ_YIELDING);
- class = PRI_BASE(kg->kg_pri_class);
- kseq = KSEQ_SELF();
- if ((ke->ke_flags & KEF_INTERNAL) == 0)
- SLOT_USE(td->td_ksegrp);
- ke->ke_flags &= ~KEF_INTERNAL;
-#ifdef SMP
- if (ke->ke_flags & KEF_ASSIGNED) {
- if (ke->ke_flags & KEF_REMOVED)
- ke->ke_flags &= ~KEF_REMOVED;
- return;
- }
- canmigrate = KSE_CAN_MIGRATE(ke);
- /*
- * Don't migrate running threads here. Force the long term balancer
- * to do it.
- */
- if (ke->ke_flags & KEF_HOLD) {
- ke->ke_flags &= ~KEF_HOLD;
- canmigrate = 0;
- }
-#endif
- KASSERT(ke->ke_state != KES_ONRUNQ,
- ("sched_add: kse %p (%s) already in run queue", ke,
- ke->ke_proc->p_comm));
- KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
- ("sched_add: process swapped out"));
- KASSERT(ke->ke_runq == NULL,
- ("sched_add: KSE %p is still assigned to a run queue", ke));
- if (flags & SRQ_PREEMPTED)
- ke->ke_flags |= KEF_PREEMPTED;
- switch (class) {
- case PRI_ITHD:
- case PRI_REALTIME:
- ke->ke_runq = kseq->ksq_curr;
- ke->ke_slice = SCHED_SLICE_MAX;
- if (canmigrate)
- ke->ke_cpu = PCPU_GET(cpuid);
- break;
- case PRI_TIMESHARE:
- if (SCHED_CURR(kg, ke))
- ke->ke_runq = kseq->ksq_curr;
- else
- ke->ke_runq = kseq->ksq_next;
- break;
- case PRI_IDLE:
- /*
- * This is for priority prop.
- */
- if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
- ke->ke_runq = kseq->ksq_curr;
- else
- ke->ke_runq = &kseq->ksq_idle;
- ke->ke_slice = SCHED_SLICE_MIN;
- break;
- default:
- panic("Unknown pri class.");
- break;
- }
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ /*
+ * Recalculate the priority before we select the target cpu or
+ * run-queue.
+ */
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_priority(td);
#ifdef SMP
+ cpuid = PCPU_GET(cpuid);
/*
- * If this thread is pinned or bound, notify the target cpu.
+ * Pick the destination cpu and if it isn't ours transfer to the
+ * target cpu.
*/
- if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
- ke->ke_runq = NULL;
- kseq_notify(ke, ke->ke_cpu);
+ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td))
+ cpu = cpuid;
+ else if (!THREAD_CAN_MIGRATE(td))
+ cpu = ts->ts_cpu;
+ else
+ cpu = sched_pickcpu(ts, flags);
+ tdq = sched_setcpu(ts, cpu, flags);
+ tdq_add(tdq, td, flags);
+ if (cpu != cpuid) {
+ tdq_notify(ts);
return;
}
+#else
+ tdq = TDQ_SELF();
+ TDQ_LOCK(tdq);
/*
- * If we had been idle, clear our bit in the group and potentially
- * the global bitmap. If not, see if we should transfer this thread.
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
*/
- if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
- (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
- /*
- * Check to see if our group is unidling, and if so, remove it
- * from the global idle mask.
- */
- if (kseq->ksq_group->ksg_idlemask ==
- kseq->ksq_group->ksg_cpumask)
- atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
- /*
- * Now remove ourselves from the group specific idle mask.
- */
- kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
- } else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
- if (kseq_transfer(kseq, ke, class))
- return;
- ke->ke_cpu = PCPU_GET(cpuid);
+ thread_lock_set(td, TDQ_LOCKPTR(tdq));
+ tdq_add(tdq, td, flags);
#endif
- if (td->td_priority < curthread->td_priority &&
- ke->ke_runq == kseq->ksq_curr)
- curthread->td_flags |= TDF_NEEDRESCHED;
- if (preemptive && maybe_preempt(td))
- return;
- ke->ke_state = KES_ONRUNQ;
-
- kseq_runq_add(kseq, ke, flags);
- kseq_load_add(kseq, ke);
+ if (!(flags & SRQ_YIELDING))
+ sched_setpreempt(td);
}
+/*
+ * Remove a thread from a run-queue without running it. This is used
+ * when we're stealing a thread from a remote queue. Otherwise all threads
+ * exit by calling sched_exit_thread() and sched_throw() themselves.
+ */
void
sched_rem(struct thread *td)
{
- struct kseq *kseq;
- struct kse *ke;
+ struct tdq *tdq;
+ struct td_sched *ts;
CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
- ke = td->td_kse;
- SLOT_RELEASE(td->td_ksegrp);
- ke->ke_flags &= ~KEF_PREEMPTED;
- if (ke->ke_flags & KEF_ASSIGNED) {
- ke->ke_flags |= KEF_REMOVED;
- return;
- }
- KASSERT((ke->ke_state == KES_ONRUNQ),
- ("sched_rem: KSE not on run queue"));
-
- ke->ke_state = KES_THREAD;
- kseq = KSEQ_CPU(ke->ke_cpu);
- kseq_runq_rem(kseq, ke);
- kseq_load_rem(kseq, ke);
+ ts = td->td_sched;
+ tdq = TDQ_CPU(ts->ts_cpu);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ KASSERT(TD_ON_RUNQ(td),
+ ("sched_rem: thread not on run queue"));
+ tdq_runq_rem(tdq, ts);
+ tdq_load_rem(tdq, ts);
+ TD_SET_CAN_RUN(td);
}
+/*
+ * Fetch cpu utilization information. Updates on demand.
+ */
fixpt_t
sched_pctcpu(struct thread *td)
{
fixpt_t pctcpu;
- struct kse *ke;
+ struct td_sched *ts;
pctcpu = 0;
- ke = td->td_kse;
- if (ke == NULL)
+ ts = td->td_sched;
+ if (ts == NULL)
return (0);
- mtx_lock_spin(&sched_lock);
- if (ke->ke_ticks) {
+ thread_lock(td);
+ if (ts->ts_ticks) {
int rtick;
- /*
- * Don't update more frequently than twice a second. Allowing
- * this causes the cpu usage to decay away too quickly due to
- * rounding errors.
- */
- if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
- ke->ke_ltick < (ticks - (hz / 2)))
- sched_pctcpu_update(ke);
+ sched_pctcpu_update(ts);
/* How many rtick per second ? */
- rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
- pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
+ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
+ pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
}
-
- ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
return (pctcpu);
}
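
The %CPU figure returned by sched_pctcpu() is a fixed-point fraction rtick/hz. The standalone sketch below reproduces the same expression with FSHIFT assumed to be 11 bits (the usual fixpt_t scaling); the worked example of 250 run ticks out of 1000 comes out to 25%.

#include <stdio.h>

#define FSHIFT  11                 /* assumed fixpt_t scaling */
#define FSCALE  (1 << FSHIFT)

int
main(void)
{
    int hz = 1000;                 /* clock ticks per second */
    int rtick = 250;               /* ticks this thread ran in the last second */
    unsigned int pctcpu;

    /* Same shape as the kernel expression: the fraction rtick/hz in fixpt. */
    pctcpu = (FSCALE * ((FSCALE * rtick) / hz)) >> FSHIFT;

    printf("pctcpu = %u/%d = %.1f%%\n", pctcpu, FSCALE,
        100.0 * pctcpu / FSCALE);
    return (0);
}
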
+/*
+ * Bind a thread to a target cpu.
+ */
void
sched_bind(struct thread *td, int cpu)
{
- struct kse *ke;
+ struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
- ke = td->td_kse;
- ke->ke_flags |= KEF_BOUND;
+ THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+ ts = td->td_sched;
+ if (ts->ts_flags & TSF_BOUND)
+ sched_unbind(td);
+ ts->ts_flags |= TSF_BOUND;
#ifdef SMP
+ sched_pin();
if (PCPU_GET(cpuid) == cpu)
return;
- /* sched_rem without the runq_remove */
- ke->ke_state = KES_THREAD;
- kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
- kseq_notify(ke, cpu);
+ ts->ts_cpu = cpu;
/* When we return from mi_switch we'll be on the correct cpu. */
mi_switch(SW_VOL, NULL);
#endif
}
+/*
+ * Release a bound thread.
+ */
void
sched_unbind(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
- td->td_kse->ke_flags &= ~KEF_BOUND;
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ if ((ts->ts_flags & TSF_BOUND) == 0)
+ return;
+ ts->ts_flags &= ~TSF_BOUND;
+#ifdef SMP
+ sched_unpin();
+#endif
}
int
sched_is_bound(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
- return (td->td_kse->ke_flags & KEF_BOUND);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ return (td->td_sched->ts_flags & TSF_BOUND);
+}
+
+/*
+ * Basic yield call.
+ */
+void
+sched_relinquish(struct thread *td)
+{
+ thread_lock(td);
+ SCHED_STAT_INC(switch_relinquish);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(td);
}
+/*
+ * Return the total system load.
+ */
int
sched_load(void)
{
@@ -1970,21 +2558,15 @@
int i;
total = 0;
- for (i = 0; i <= ksg_maxid; i++)
- total += KSEQ_GROUP(i)->ksg_load;
+ for (i = 0; i <= tdg_maxid; i++)
+ total += TDQ_GROUP(i)->tdg_load;
return (total);
#else
- return (KSEQ_SELF()->ksq_sysload);
+ return (TDQ_SELF()->tdq_sysload);
#endif
}
int
-sched_sizeof_ksegrp(void)
-{
- return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
-}
-
-int
sched_sizeof_proc(void)
{
return (sizeof(struct proc));
@@ -1995,5 +2577,116 @@
{
return (sizeof(struct thread) + sizeof(struct td_sched));
}
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+ struct thread *td;
+ struct tdq *tdq;
+
+ td = curthread;
+ tdq = TDQ_SELF();
+ mtx_assert(&Giant, MA_NOTOWNED);
+ /* ULE relies on preemption for idle interruption. */
+ for (;;) {
+#ifdef SMP
+ if (tdq_idled(tdq))
+ cpu_idle();
+#else
+ cpu_idle();
+#endif
+ }
+}
+
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ struct thread *newtd;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ if (td == NULL) {
+ /* Correct spinlock nesting and acquire the correct lock. */
+ TDQ_LOCK(tdq);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ tdq_load_rem(tdq, td->td_sched);
+ }
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ newtd = choosethread();
+ TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, newtd); /* doesn't return */
+}
+
+/*
+ * This is called from fork_exit(). Just acquire the correct locks and
+ * let fork do the rest of the work.
+ */
+void
+sched_fork_exit(struct thread *td)
+{
+ struct td_sched *ts;
+ struct tdq *tdq;
+ int cpuid;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with the scheduler lock held.
+ */
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ ts = td->td_sched;
+ if (TD_IS_IDLETHREAD(td))
+ td->td_lock = TDQ_LOCKPTR(tdq);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ td->td_oncpu = cpuid;
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+}
+
+static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
+ "Scheduler");
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
+ "Scheduler name");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+ "Slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
+ "Interactivity score threshold");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
+ 0,"Min priority for preemption, lower priorities have greater precedence");
+#ifdef SMP
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
+ "Pick the target cpu based on priority rather than load.");
+SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
+ "Number of hz ticks to keep thread affinity for");
+SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
+ "Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
+ &balance_interval, 0,
+ "Average frequency in stathz ticks to run the long-term balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
+ "Steals work from another hyper-threaded core on idle");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
+ "Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+ "Minimum load on remote cpu before we'll steal");
+SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
+ "True when a topology has been specified by the MD code.");
+#endif
+
+/* ps compat. All cpu percentages from ULE are weighted. */
+static int ccpu = 0;
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
Index: uipc_mbuf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -L sys/kern/uipc_mbuf.c -L sys/kern/uipc_mbuf.c -u -r1.5 -r1.6
--- sys/kern/uipc_mbuf.c
+++ sys/kern/uipc_mbuf.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.148.2.6 2006/03/23 23:24:32 sam Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.174 2007/10/06 21:42:39 kmacy Exp $");
#include "opt_mac.h"
#include "opt_param.h"
@@ -41,7 +41,6 @@
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
@@ -49,6 +48,8 @@
#include <sys/protosw.h>
#include <sys/uio.h>
+#include <security/mac/mac_framework.h>
+
int max_linkhdr;
int max_protohdr;
int max_hdr;
@@ -64,7 +65,6 @@
/*
* sysctl(8) exported objects
*/
-SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
&max_linkhdr, 0, "Size of largest link layer header");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
@@ -87,11 +87,6 @@
#endif
/*
- * Malloc-type for external ext_buf ref counts.
- */
-static MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
-
-/*
* Allocate a given length worth of mbufs and/or clusters (whatever fits
* best) and return a pointer to the top of the allocated chain. If an
* existing mbuf chain is provided, then we will append the new chain
@@ -99,61 +94,61 @@
* chain.
*/
struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
+m_getm2(struct mbuf *m, int len, int how, short type, int flags)
{
- struct mbuf *mb, *top, *cur, *mtail;
- int num, rem;
- int i;
+ struct mbuf *mb, *nm = NULL, *mtail = NULL;
- KASSERT(len >= 0, ("m_getm(): len is < 0"));
+ KASSERT(len >= 0, ("%s: len is < 0", __func__));
- /* If m != NULL, we will append to the end of that chain. */
- if (m != NULL)
- for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
- else
- mtail = NULL;
+ /* Validate flags. */
+ flags &= (M_PKTHDR | M_EOR);
- /*
- * Calculate how many mbufs+clusters ("packets") we need and how much
- * leftover there is after that and allocate the first mbuf+cluster
- * if required.
- */
- num = len / MCLBYTES;
- rem = len % MCLBYTES;
- top = cur = NULL;
- if (num > 0) {
- if ((top = cur = m_getcl(how, type, 0)) == NULL)
- goto failed;
- top->m_len = 0;
- }
- num--;
-
- for (i = 0; i < num; i++) {
- mb = m_getcl(how, type, 0);
- if (mb == NULL)
- goto failed;
- mb->m_len = 0;
- cur = (cur->m_next = mb);
- }
- if (rem > 0) {
- mb = (rem > MINCLSIZE) ?
- m_getcl(how, type, 0) : m_get(how, type);
- if (mb == NULL)
- goto failed;
- mb->m_len = 0;
- if (cur == NULL)
- top = mb;
+ /* Packet header mbuf must be first in chain. */
+ if ((flags & M_PKTHDR) && m != NULL)
+ flags &= ~M_PKTHDR;
+
+ /* Loop and append maximum sized mbufs to the chain tail. */
+ while (len > 0) {
+ if (len > MCLBYTES)
+ mb = m_getjcl(how, type, (flags & M_PKTHDR),
+ MJUMPAGESIZE);
+ else if (len >= MINCLSIZE)
+ mb = m_getcl(how, type, (flags & M_PKTHDR));
+ else if (flags & M_PKTHDR)
+ mb = m_gethdr(how, type);
else
- cur->m_next = mb;
- }
+ mb = m_get(how, type);
- if (mtail != NULL)
- mtail->m_next = top;
- return top;
-failed:
- if (top != NULL)
- m_freem(top);
- return NULL;
+ /* Fail the whole operation if one mbuf can't be allocated. */
+ if (mb == NULL) {
+ if (nm != NULL)
+ m_freem(nm);
+ return (NULL);
+ }
+
+ /* Bookkeeping. */
+ len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
+ ((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+ if (mtail != NULL)
+ mtail->m_next = mb;
+ else
+ nm = mb;
+ mtail = mb;
+ flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */
+ }
+ if (flags & M_EOR)
+ mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */
+
+ /* If mbuf was supplied, append new chain to the end of it. */
+ if (m != NULL) {
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
+ ;
+ mtail->m_next = nm;
+ mtail->m_flags &= ~M_EOR;
+ } else
+ m = nm;
+
+ return (m);
}
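
The loop in m_getm2() can be read as a size ladder: page-sized jumbo clusters while more than a cluster's worth remains, then a regular cluster, then a plain (or packet-header) mbuf for the tail. The userland sketch below walks the same ladder for a 9000-byte request; every MODEL_* size is an assumption standing in for MJUMPAGESIZE, MCLBYTES, MINCLSIZE, MHLEN and MLEN, and only the control flow is meant to match.

#include <stdio.h>

#define MODEL_MJUMPAGESIZE 4096    /* assumption: page-sized jumbo cluster */
#define MODEL_MCLBYTES     2048
#define MODEL_MINCLSIZE     209    /* assumption: threshold to use a cluster */
#define MODEL_MHLEN         168    /* assumption: data bytes in a pkthdr mbuf */
#define MODEL_MLEN          200    /* assumption: data bytes in a plain mbuf */

int
main(void)
{
    int len = 9000;                /* bytes requested */
    int pkthdr = 1;                /* first mbuf carries a packet header */

    while (len > 0) {
        int got;

        if (len > MODEL_MCLBYTES)
            got = MODEL_MJUMPAGESIZE;          /* m_getjcl() */
        else if (len >= MODEL_MINCLSIZE)
            got = MODEL_MCLBYTES;              /* m_getcl() */
        else if (pkthdr)
            got = MODEL_MHLEN;                 /* m_gethdr() */
        else
            got = MODEL_MLEN;                  /* m_get() */
        printf("%5d bytes left -> allocate a %d byte buffer\n", len, got);
        len -= got;
        pkthdr = 0;                /* only valid on the first mbuf */
    }
    return (0);
}
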
/*
@@ -193,16 +188,10 @@
m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
void (*freef)(void *, void *), void *args, int flags, int type)
{
- u_int *ref_cnt = NULL;
+ KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
- /* XXX Shouldn't be adding EXT_CLUSTER with this API */
- if (type == EXT_CLUSTER)
- ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
- mb->m_ext.ext_buf);
- else if (type == EXT_EXTREF)
- ref_cnt = __DEVOLATILE(u_int *, mb->m_ext.ref_cnt);
- mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
- malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
+ if (type != EXT_EXTREF)
+ mb->m_ext.ref_cnt = (u_int *)uma_zalloc(zone_ext_refcnt, M_NOWAIT);
if (mb->m_ext.ref_cnt != NULL) {
*(mb->m_ext.ref_cnt) = 1;
mb->m_flags |= (M_EXT | flags);
@@ -217,45 +206,33 @@
/*
* Non-directly-exported function to clean up after mbufs with M_EXT
- * storage attached to them if the reference count hits 0.
+ * storage attached to them if the reference count hits 1.
*/
void
mb_free_ext(struct mbuf *m)
{
- u_int cnt;
- int dofree;
+ int skipmbuf;
+
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
- /* Account for lazy ref count assign. */
- if (m->m_ext.ref_cnt == NULL)
- dofree = 1;
- else
- dofree = 0;
/*
- * This is tricky. We need to make sure to decrement the
- * refcount in a safe way but to also clean up if we're the
- * last reference. This method seems to do it without race.
- */
- while (dofree == 0) {
- cnt = *(m->m_ext.ref_cnt);
- if (atomic_cmpset_int(m->m_ext.ref_cnt, cnt, cnt - 1)) {
- if (cnt == 1)
- dofree = 1;
- break;
- }
- }
-
- if (dofree) {
- /*
- * Do the free, should be safe.
- */
+ * check if the header is embedded in the cluster
+ */
+ skipmbuf = (m->m_flags & M_NOFREE);
+
+ /* Free attached storage if this mbuf is the only reference to it. */
+ if (*(m->m_ext.ref_cnt) == 1 ||
+ atomic_fetchadd_int(m->m_ext.ref_cnt, -1) == 1) {
switch (m->m_ext.ext_type) {
- case EXT_PACKET:
+ case EXT_PACKET: /* The packet zone is special. */
+ if (*(m->m_ext.ref_cnt) == 0)
+ *(m->m_ext.ref_cnt) = 1;
uma_zfree(zone_pack, m);
- return;
+ return; /* Job done. */
case EXT_CLUSTER:
uma_zfree(zone_clust, m->m_ext.ext_buf);
- m->m_ext.ext_buf = NULL;
break;
case EXT_JUMBOP:
uma_zfree(zone_jumbop, m->m_ext.ext_buf);
@@ -266,24 +243,180 @@
case EXT_JUMBO16:
uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
break;
- default:
+ case EXT_SFBUF:
+ case EXT_NET_DRV:
+ case EXT_MOD_TYPE:
+ case EXT_DISPOSABLE:
+ *(m->m_ext.ref_cnt) = 0;
+ uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
+ m->m_ext.ref_cnt));
+ /* FALLTHROUGH */
+ case EXT_EXTREF:
KASSERT(m->m_ext.ext_free != NULL,
- ("%s: external free pointer not set", __func__));
+ ("%s: ext_free not set", __func__));
(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
m->m_ext.ext_args);
- if (m->m_ext.ext_type != EXT_EXTREF) {
- if (m->m_ext.ref_cnt != NULL)
- free(__DEVOLATILE(u_int *,
- m->m_ext.ref_cnt), M_MBUF);
- m->m_ext.ref_cnt = NULL;
- }
- m->m_ext.ext_buf = NULL;
+ break;
+ default:
+ KASSERT(m->m_ext.ext_type == 0,
+ ("%s: unknown ext_type", __func__));
}
}
+ if (skipmbuf)
+ return;
+
+ /*
+ * Free this mbuf back to the mbuf zone with all m_ext
+ * information purged.
+ */
+ m->m_ext.ext_buf = NULL;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ref_cnt = NULL;
+ m->m_ext.ext_size = 0;
+ m->m_ext.ext_type = 0;
+ m->m_flags &= ~M_EXT;
uma_zfree(zone_mbuf, m);
}
/*
+ * Attach the cluster from *m to *n, set up m_ext in *n
+ * and bump the refcount of the cluster.
+ */
+static void
+mb_dupcl(struct mbuf *n, struct mbuf *m)
+{
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
+ KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+
+ if (*(m->m_ext.ref_cnt) == 1)
+ *(m->m_ext.ref_cnt) += 1;
+ else
+ atomic_add_int(m->m_ext.ref_cnt, 1);
+ n->m_ext.ext_buf = m->m_ext.ext_buf;
+ n->m_ext.ext_free = m->m_ext.ext_free;
+ n->m_ext.ext_args = m->m_ext.ext_args;
+ n->m_ext.ext_size = m->m_ext.ext_size;
+ n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+ n->m_ext.ext_type = m->m_ext.ext_type;
+ n->m_flags |= M_EXT;
+}
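
mb_dupcl() and mb_free_ext() share one reference-counting idiom: the single-owner case is handled with a plain store or test, and only the genuinely shared case pays for an atomic operation. The sketch below models that idiom in userland with C11 atomics; it is illustrative only and not the mbuf code itself.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct model_cluster {
    atomic_uint ref_cnt;
    char        data[2048];
};

static void
model_dup(struct model_cluster *c)
{
    if (atomic_load(&c->ref_cnt) == 1)
        atomic_store(&c->ref_cnt, 2);          /* sole owner, no race possible */
    else
        atomic_fetch_add(&c->ref_cnt, 1);
}

static void
model_free(struct model_cluster *c)
{
    if (atomic_load(&c->ref_cnt) == 1 ||
        atomic_fetch_sub(&c->ref_cnt, 1) == 1) {
        printf("last reference gone, freeing cluster\n");
        free(c);
        return;
    }
    printf("still shared\n");
}

int
main(void)
{
    struct model_cluster *c = malloc(sizeof(*c));

    atomic_init(&c->ref_cnt, 1);
    model_dup(c);                  /* a second mbuf now points at the data */
    model_free(c);                 /* still shared */
    model_free(c);                 /* last reference, freed */
    return (0);
}
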
+
+/*
+ * Clean up mbuf (chain) from any tags and packet headers.
+ * If "all" is set then the first mbuf in the chain will be
+ * cleaned too.
+ */
+void
+m_demote(struct mbuf *m0, int all)
+{
+ struct mbuf *m;
+
+ for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
+ if (m->m_flags & M_PKTHDR) {
+ m_tag_delete_chain(m, NULL);
+ m->m_flags &= ~M_PKTHDR;
+ bzero(&m->m_pkthdr, sizeof(struct pkthdr));
+ }
+ if (m->m_type == MT_HEADER)
+ m->m_type = MT_DATA;
+ if (m != m0 && m->m_nextpkt != NULL)
+ m->m_nextpkt = NULL;
+ m->m_flags = m->m_flags & (M_EXT|M_EOR|M_RDONLY|M_FREELIST);
+ }
+}
+
+/*
+ * Sanity checks on mbuf (chain) for use in KASSERT() and general
+ * debugging.
+ * Returns 0 or panics when bad and 1 on all tests passed.
+ * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
+ * blow up later.
+ */
+int
+m_sanity(struct mbuf *m0, int sanitize)
+{
+ struct mbuf *m;
+ caddr_t a, b;
+ int pktlen = 0;
+
+#ifdef INVARIANTS
+#define M_SANITY_ACTION(s) panic("mbuf %p: " s, m)
+#else
+#define M_SANITY_ACTION(s) printf("mbuf %p: " s, m)
+#endif
+
+ for (m = m0; m != NULL; m = m->m_next) {
+ /*
+ * Basic pointer checks. If any of these fails then some
+ * unrelated kernel memory before or after us is trashed.
+ * No way to recover from that.
+ */
+ a = ((m->m_flags & M_EXT) ? m->m_ext.ext_buf :
+ ((m->m_flags & M_PKTHDR) ? (caddr_t)(&m->m_pktdat) :
+ (caddr_t)(&m->m_dat)) );
+ b = (caddr_t)(a + (m->m_flags & M_EXT ? m->m_ext.ext_size :
+ ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN)));
+ if ((caddr_t)m->m_data < a)
+ M_SANITY_ACTION("m_data outside mbuf data range left");
+ if ((caddr_t)m->m_data > b)
+ M_SANITY_ACTION("m_data outside mbuf data range right");
+ if ((caddr_t)m->m_data + m->m_len > b)
+ M_SANITY_ACTION("m_data + m_len exceeds mbuf space");
+ if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.header) {
+ if ((caddr_t)m->m_pkthdr.header < a ||
+ (caddr_t)m->m_pkthdr.header > b)
+ M_SANITY_ACTION("m_pkthdr.header outside mbuf data range");
+ }
+
+ /* m->m_nextpkt may only be set on first mbuf in chain. */
+ if (m != m0 && m->m_nextpkt != NULL) {
+ if (sanitize) {
+ m_freem(m->m_nextpkt);
+ m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
+ } else
+ M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
+ }
+
+ /* packet length (not mbuf length!) calculation */
+ if (m0->m_flags & M_PKTHDR)
+ pktlen += m->m_len;
+
+ /* m_tags may only be attached to first mbuf in chain. */
+ if (m != m0 && m->m_flags & M_PKTHDR &&
+ !SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ if (sanitize) {
+ m_tag_delete_chain(m, NULL);
+ /* put in 0xDEADC0DE perhaps? */
+ } else
+ M_SANITY_ACTION("m_tags on in-chain mbuf");
+ }
+
+ /* M_PKTHDR may only be set on first mbuf in chain */
+ if (m != m0 && m->m_flags & M_PKTHDR) {
+ if (sanitize) {
+ bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
+ m->m_flags &= ~M_PKTHDR;
+ /* put in 0xDEADCODE and leave hdr flag in */
+ } else
+ M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
+ }
+ }
+ m = m0;
+ if (pktlen && pktlen != m->m_pkthdr.len) {
+ if (sanitize)
+ m->m_pkthdr.len = 0;
+ else
+ M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
+ }
+ return 1;
+
+#undef M_SANITY_ACTION
+}
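
The core of m_sanity() is a pair of range checks: the data pointer must lie inside the buffer backing the mbuf, and the data length must not run past its end. The following userland sketch shows those checks against a fixed-size buffer standing in for m_dat or a cluster; it is illustrative only.

#include <stdio.h>

struct model_mbuf {
    char  buf[256];                /* stands in for m_dat or a cluster */
    char *m_data;                  /* start of valid data */
    int   m_len;                   /* bytes of valid data */
};

static int
model_sanity(const struct model_mbuf *m)
{
    const char *a = m->buf;                    /* lowest legal address */
    const char *b = m->buf + sizeof(m->buf);   /* one past the end */

    if (m->m_data < a)
        return (0);                /* data starts before the buffer */
    if (m->m_data > b)
        return (0);                /* data starts after the buffer */
    if (m->m_data + m->m_len > b)
        return (0);                /* data runs off the end */
    return (1);
}

int
main(void)
{
    struct model_mbuf m;

    m.m_data = m.buf + 16;
    m.m_len = 100;
    printf("in range: %d\n", model_sanity(&m));
    m.m_len = 300;
    printf("overrun:  %d\n", model_sanity(&m));
    return (0);
}
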
+
+
+/*
* "Move" mbuf pkthdr from "from" to "to".
* "from" must have M_PKTHDR set, and "to" must be empty.
*/
@@ -369,8 +502,13 @@
M_MOVE_PKTHDR(mn, m);
mn->m_next = m;
m = mn;
- if (len < MHLEN)
- MH_ALIGN(m, len);
+ if(m->m_flags & M_PKTHDR) {
+ if (len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ if (len < MLEN)
+ M_ALIGN(m, len);
+ }
m->m_len = len;
return (m);
}
@@ -429,10 +567,7 @@
n->m_len = min(len, m->m_len - off);
if (m->m_flags & M_EXT) {
n->m_data = m->m_data + off;
- n->m_ext = m->m_ext;
- n->m_flags |= M_EXT;
- MEXT_ADD_REF(m);
- n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+ mb_dupcl(n, m);
} else
bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
(u_int)n->m_len);
@@ -453,6 +588,154 @@
}
/*
+ * Copy data from mbuf (chain) n, starting at off for len bytes, to mbuf
+ * (chain) m, either prepending or appending it.  Returns the mbuf chain
+ * with the new head for the prepending case.  The resulting mbuf (chain)
+ * m is fully writeable.
+ * m is the destination (made writeable)
+ * n is the source, off is the offset into the source, len is the length
+ * to copy from that offset
+ * prep: 0 to append, 1 to prepend
+ * how: wait or nowait
+ */
+
+static int
+m_bcopyxxx(void *s, void *t, u_int len)
+{
+ bcopy(s, t, (size_t)len);
+ return 0;
+}
+
+struct mbuf *
+m_copymdata(struct mbuf *m, struct mbuf *n, int off, int len,
+ int prep, int how)
+{
+ struct mbuf *mm, *x, *z, *prev = NULL;
+ caddr_t p;
+ int i, nlen = 0;
+ caddr_t buf[MLEN];
+
+ KASSERT(m != NULL && n != NULL, ("m_copymdata, no target or source"));
+ KASSERT(off >= 0, ("m_copymdata, negative off %d", off));
+ KASSERT(len >= 0, ("m_copymdata, negative len %d", len));
+ KASSERT(prep == 0 || prep == 1, ("m_copymdata, unknown direction %d", prep));
+
+ mm = m;
+ if (!prep) {
+ while(mm->m_next) {
+ prev = mm;
+ mm = mm->m_next;
+ }
+ }
+ for (z = n; z != NULL; z = z->m_next)
+ nlen += z->m_len;
+ if (len == M_COPYALL)
+ len = nlen - off;
+ if (off + len > nlen || len < 1)
+ return NULL;
+
+ if (!M_WRITABLE(mm)) {
+ /* XXX: Use proper m_xxx function instead. */
+ x = m_getcl(how, MT_DATA, mm->m_flags);
+ if (x == NULL)
+ return NULL;
+ bcopy(mm->m_ext.ext_buf, x->m_ext.ext_buf, x->m_ext.ext_size);
+ p = x->m_ext.ext_buf + (mm->m_data - mm->m_ext.ext_buf);
+ x->m_data = p;
+ mm->m_next = NULL;
+ if (mm != m)
+ prev->m_next = x;
+ m_free(mm);
+ mm = x;
+ }
+
+ /*
+ * Append/prepend the data. Allocating mbufs as necessary.
+ */
+ /* Shortcut if enough free space in first/last mbuf. */
+ if (!prep && M_TRAILINGSPACE(mm) >= len) {
+ m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t) +
+ mm->m_len);
+ mm->m_len += len;
+ mm->m_pkthdr.len += len;
+ return m;
+ }
+ if (prep && M_LEADINGSPACE(mm) >= len) {
+ mm->m_data = mtod(mm, caddr_t) - len;
+ m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t));
+ mm->m_len += len;
+ mm->m_pkthdr.len += len;
+ return mm;
+ }
+
+ /* Expand first/last mbuf to cluster if possible. */
+ if (!prep && !(mm->m_flags & M_EXT) && len > M_TRAILINGSPACE(mm)) {
+ bcopy(mm->m_data, &buf, mm->m_len);
+ m_clget(mm, how);
+ if (!(mm->m_flags & M_EXT))
+ return NULL;
+ bcopy(&buf, mm->m_ext.ext_buf, mm->m_len);
+ mm->m_data = mm->m_ext.ext_buf;
+ mm->m_pkthdr.header = NULL;
+ }
+ if (prep && !(mm->m_flags & M_EXT) && len > M_LEADINGSPACE(mm)) {
+ bcopy(mm->m_data, &buf, mm->m_len);
+ m_clget(mm, how);
+ if (!(mm->m_flags & M_EXT))
+ return NULL;
+ bcopy(&buf, (caddr_t *)mm->m_ext.ext_buf +
+ mm->m_ext.ext_size - mm->m_len, mm->m_len);
+ mm->m_data = (caddr_t)mm->m_ext.ext_buf +
+ mm->m_ext.ext_size - mm->m_len;
+ mm->m_pkthdr.header = NULL;
+ }
+
+ /* Append/prepend as many mbuf (clusters) as necessary to fit len. */
+ if (!prep && len > M_TRAILINGSPACE(mm)) {
+ if (!m_getm(mm, len - M_TRAILINGSPACE(mm), how, MT_DATA))
+ return NULL;
+ }
+ if (prep && len > M_LEADINGSPACE(mm)) {
+ if (!(z = m_getm(NULL, len - M_LEADINGSPACE(mm), how, MT_DATA)))
+ return NULL;
+ i = 0;
+ for (x = z; x != NULL; x = x->m_next) {
+ i += x->m_flags & M_EXT ? x->m_ext.ext_size :
+ (x->m_flags & M_PKTHDR ? MHLEN : MLEN);
+ if (!x->m_next)
+ break;
+ }
+ z->m_data += i - len;
+ m_move_pkthdr(mm, z);
+ x->m_next = mm;
+ mm = z;
+ }
+
+ /* Seek to start position in source mbuf. Optimization for long chains. */
+ while (off > 0) {
+ if (off < n->m_len)
+ break;
+ off -= n->m_len;
+ n = n->m_next;
+ }
+
+ /* Copy data into target mbuf. */
+ z = mm;
+ while (len > 0) {
+ KASSERT(z != NULL, ("m_copymdata, falling off target edge"));
+ i = M_TRAILINGSPACE(z);
+ m_apply(n, off, i, m_bcopyxxx, mtod(z, caddr_t) + z->m_len);
+ z->m_len += i;
+ /* fixup pkthdr.len if necessary */
+ if ((prep ? mm : m)->m_flags & M_PKTHDR)
+ (prep ? mm : m)->m_pkthdr.len += i;
+ off += i;
+ len -= i;
+ z = z->m_next;
+ }
+ return (prep ? mm : m);
+}
+
+/*
* Copy an entire packet, including header (which must be present).
* An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
* Note that the copy is read-only, because clusters are not copied,
@@ -477,10 +760,7 @@
n->m_len = m->m_len;
if (m->m_flags & M_EXT) {
n->m_data = m->m_data;
- n->m_ext = m->m_ext;
- n->m_flags |= M_EXT;
- MEXT_ADD_REF(m);
- n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+ mb_dupcl(n, m);
} else {
n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
@@ -498,10 +778,7 @@
n->m_len = m->m_len;
if (m->m_flags & M_EXT) {
n->m_data = m->m_data;
- n->m_ext = m->m_ext;
- n->m_flags |= M_EXT;
- MEXT_ADD_REF(m);
- n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+ mb_dupcl(n, m);
} else {
bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
}
@@ -885,11 +1162,8 @@
}
extpacket:
if (m->m_flags & M_EXT) {
- n->m_flags |= M_EXT;
- n->m_ext = m->m_ext;
- MEXT_ADD_REF(m);
- n->m_ext.ref_cnt = m->m_ext.ref_cnt;
n->m_data = m->m_data + len;
+ mb_dupcl(n, m);
} else {
bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
}
@@ -906,7 +1180,7 @@
*/
struct mbuf *
m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
- void (*copy)(char *from, caddr_t to, u_int len))
+ void (*copy)(char *from, caddr_t to, u_int len))
{
struct mbuf *m;
struct mbuf *top = NULL, **mp = ⊤
@@ -1143,7 +1417,7 @@
"\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
"\3eor\2pkthdr\1ext", pdata ? "" : "\n");
if (pdata)
- printf(", %*D\n", m2->m_len, (u_char *)m2->m_data, "-");
+ printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
if (len != -1)
len -= m2->m_len;
m2 = m2->m_next;
@@ -1347,55 +1621,61 @@
#endif
+/*
+ * Copy the contents of uio into a properly sized mbuf chain.
+ */
struct mbuf *
-m_uiotombuf(struct uio *uio, int how, int len, int align)
+m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
{
- struct mbuf *m_new = NULL, *m_final = NULL;
- int progress = 0, error = 0, length, total;
+ struct mbuf *m, *mb;
+ int error, length, total;
+ int progress = 0;
+ /*
+ * len can be zero or an arbitrarily large value bounded by
+ * the total data supplied by the uio.
+ */
if (len > 0)
total = min(uio->uio_resid, len);
else
total = uio->uio_resid;
+
+ /*
+ * The smallest unit returned by m_getm2() is a single mbuf
+ * with pkthdr. We can't align past it. Align the align value itself.
+ */
+ if (align)
+ align &= ~(sizeof(long) - 1);
if (align >= MHLEN)
- goto nospace;
- if (total + align > MHLEN)
- m_final = m_getcl(how, MT_DATA, M_PKTHDR);
- else
- m_final = m_gethdr(how, MT_DATA);
- if (m_final == NULL)
- goto nospace;
- m_final->m_data += align;
- m_new = m_final;
- while (progress < total) {
- length = total - progress;
- if (length > MCLBYTES)
- length = MCLBYTES;
- if (m_new == NULL) {
- if (length > MLEN)
- m_new = m_getcl(how, MT_DATA, 0);
- else
- m_new = m_get(how, MT_DATA);
- if (m_new == NULL)
- goto nospace;
+ return (NULL);
+
+ /*
+ * Give us the full allocation or nothing.
+ * If len is zero return the smallest empty mbuf.
+ */
+ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
+ if (m == NULL)
+ return (NULL);
+ m->m_data += align;
+
+ /* Fill all mbufs with uio data and update header information. */
+ for (mb = m; mb != NULL; mb = mb->m_next) {
+ length = min(M_TRAILINGSPACE(mb), total - progress);
+
+ error = uiomove(mtod(mb, void *), length, uio);
+ if (error) {
+ m_freem(m);
+ return (NULL);
}
- error = uiomove(mtod(m_new, void *), length, uio);
- if (error)
- goto nospace;
+
+ mb->m_len = length;
progress += length;
- m_new->m_len = length;
- if (m_new != m_final)
- m_cat(m_final, m_new);
- m_new = NULL;
+ if (flags & M_PKTHDR)
+ m->m_pkthdr.len += length;
}
- m_fixhdr(m_final);
- return (m_final);
-nospace:
- if (m_new)
- m_free(m_new);
- if (m_final)
- m_freem(m_final);
- return (NULL);
+ KASSERT(progress == total, ("%s: progress != total", __func__));
+
+ return (m);
}
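
The rewritten m_uiotombuf() first clamps the request to what the uio actually holds, truncates the alignment down to a word multiple, and then lets each mbuf take whatever fits in its trailing space. The userland sketch below traces that arithmetic for a 5000-byte uio capped at 4096 bytes with a 14-byte alignment request; the per-buffer sizes are arbitrary stand-ins and nothing is actually copied.

#include <stdio.h>

int
main(void)
{
    int uio_resid = 5000;          /* bytes the caller has to give */
    int len = 4096;                /* caller's cap, 0 would mean "no cap" */
    int align = 14;                /* e.g. leave room for an ethernet header */
    int space[] = { 2048, 2048, 2048 };  /* trailing space per buffer */
    int total, progress = 0;

    total = len > 0 ? (uio_resid < len ? uio_resid : len) : uio_resid;
    align &= ~(int)(sizeof(long) - 1);   /* can't align past a word boundary */

    for (int i = 0; progress < total && i < 3; i++) {
        int avail = space[i] - (i == 0 ? align : 0);
        int chunk = total - progress < avail ? total - progress : avail;

        printf("buffer %d: copy %d bytes\n", i, chunk);
        progress += chunk;
    }
    printf("copied %d of %d requested\n", progress, total);
    return (0);
}
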
/*
Index: kern_ktr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_ktr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_ktr.c -L sys/kern/kern_ktr.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_ktr.c
+++ sys/kern/kern_ktr.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_ktr.c,v 1.48 2005/06/10 23:21:29 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_ktr.c,v 1.53 2006/09/09 16:09:01 rwatson Exp $");
#include "opt_ddb.h"
#include "opt_ktr.h"
@@ -55,8 +55,10 @@
#include <machine/ktr.h>
#endif
-
+#ifdef DDB
#include <ddb/ddb.h>
+#include <ddb/db_output.h>
+#endif
#ifndef KTR_ENTRIES
#define KTR_ENTRIES 1024
@@ -100,6 +102,26 @@
volatile int ktr_idx = 0;
struct ktr_entry ktr_buf[KTR_ENTRIES];
+static int
+sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS)
+{
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (clear) {
+ bzero(ktr_buf, sizeof(ktr_buf));
+ ktr_idx = 0;
+ }
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_clear, "I", "Clear KTR Buffer");
+
#ifdef KTR_VERBOSE
int ktr_verbose = KTR_VERBOSE;
TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
@@ -134,21 +156,17 @@
enable = ktr_alq_enabled;
- error = sysctl_handle_int(oidp, &enable, 0, req);
- if (error || !req->newptr)
- return (error);
+ error = sysctl_handle_int(oidp, &enable, 0, req);
+ if (error || !req->newptr)
+ return (error);
if (enable) {
if (ktr_alq_enabled)
return (0);
- error = suser(curthread);
- if (error)
- return (error);
error = alq_open(&ktr_alq, (const char *)ktr_alq_file,
req->td->td_ucred, ALQ_DEFAULT_CMODE,
sizeof(struct ktr_entry), ktr_alq_depth);
if (error == 0) {
- ktr_mask &= ~KTR_ALQ_MASK;
ktr_alq_cnt = 0;
ktr_alq_failed = 0;
ktr_alq_enabled = 1;
@@ -269,22 +287,17 @@
DB_SHOW_COMMAND(ktr, db_ktr_all)
{
- int quit;
- quit = 0;
tstate.cur = (ktr_idx - 1) & (KTR_ENTRIES - 1);
tstate.first = -1;
- if (strcmp(modif, "v") == 0)
- db_ktr_verbose = 1;
- else
- db_ktr_verbose = 0;
- if (strcmp(modif, "a") == 0) {
+ db_ktr_verbose = index(modif, 'v') != NULL;
+ if (index(modif, 'a') != NULL) {
+ db_disable_pager();
while (cncheckc() != -1)
if (db_mach_vtrace() == 0)
break;
} else {
- db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
- while (!quit)
+ while (!db_pager_quit)
if (db_mach_vtrace() == 0)
break;
}
Index: kern_cpu.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_cpu.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_cpu.c -L sys/kern/kern_cpu.c -u -r1.2 -r1.3
--- sys/kern/kern_cpu.c
+++ sys/kern/kern_cpu.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2004-2005 Nate Lawson (SDG)
+ * Copyright (c) 2004-2007 Nate Lawson (SDG)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_cpu.c,v 1.14.2.4 2006/03/05 00:03:29 mnag Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_cpu.c,v 1.27.4.1 2008/01/19 20:30:59 njl Exp $");
#include <sys/param.h>
#include <sys/bus.h>
@@ -37,12 +37,14 @@
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
+#include <sys/sbuf.h>
#include <sys/sched.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
-#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/timetc.h>
+#include <sys/taskqueue.h>
#include "cpufreq_if.h"
@@ -73,6 +75,7 @@
int max_mhz;
device_t dev;
struct sysctl_ctx_list sysctl_ctx;
+ struct task startup_task;
};
struct cf_setting_array {
@@ -94,8 +97,8 @@
} while (0)
static int cpufreq_attach(device_t dev);
+static void cpufreq_startup_task(void *ctx, int pending);
static int cpufreq_detach(device_t dev);
-static void cpufreq_evaluate(void *arg);
static int cf_set_method(device_t dev, const struct cf_level *level,
int priority);
static int cf_get_method(device_t dev, struct cf_level *level);
@@ -127,8 +130,6 @@
static devclass_t cpufreq_dc;
DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
-static eventhandler_tag cf_ev_tag;
-
static int cf_lowest_freq;
static int cf_verbose;
TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
@@ -176,12 +177,25 @@
SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
cpufreq_levels_sysctl, "A", "CPU frequency levels");
- cf_ev_tag = EVENTHANDLER_REGISTER(cpufreq_changed, cpufreq_evaluate,
- NULL, EVENTHANDLER_PRI_ANY);
+
+ /*
+ * Queue a one-shot broadcast that levels have changed.
+ * It will run once the system has completed booting.
+ */
+ TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
+ taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
return (0);
}
+/* Handle any work to be done for all drivers that attached during boot. */
+static void
+cpufreq_startup_task(void *ctx, int pending)
+{
+
+ cpufreq_settings_changed((device_t)ctx);
+}
+
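cpufreq_attach() now defers its levels-changed broadcast with a one-shot task on taskqueue_thread instead of an event handler, so the notification runs after attach (and boot) has completed. A minimal sketch of that idiom with placeholder names (example_softc and example_startup are assumptions, not code from this file):

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/taskqueue.h>

struct example_softc {
        struct task     startup_task;   /* embedded, so no extra allocation */
};

static void
example_startup(void *ctx, int pending)
{
        /* Runs once in the taskqueue_thread kthread, after attach returns. */
}

static void
example_attach(struct example_softc *sc)
{
        TASK_INIT(&sc->startup_task, 0, example_startup, sc);
        taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
}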
static int
cpufreq_detach(device_t dev)
{
@@ -202,18 +216,11 @@
numdevs = devclass_get_count(cpufreq_dc);
if (numdevs == 1) {
CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
- EVENTHANDLER_DEREGISTER(cpufreq_changed, cf_ev_tag);
}
return (0);
}
-static void
-cpufreq_evaluate(void *arg)
-{
- /* TODO: Re-evaluate when notified of changes to drivers. */
-}
-
static int
cf_set_method(device_t dev, const struct cf_level *level, int priority)
{
@@ -221,30 +228,37 @@
const struct cf_setting *set;
struct cf_saved_freq *saved_freq, *curr_freq;
struct pcpu *pc;
- int cpu_id, error, i;
- static int once;
+ int error, i;
sc = device_get_softc(dev);
error = 0;
set = NULL;
saved_freq = NULL;
- /*
- * Check that the TSC isn't being used as a timecounter.
- * If it is, then return EBUSY and refuse to change the
- * clock speed.
- */
- if (strcmp(timecounter->tc_name, "TSC") == 0) {
- if (!once) {
- printf("cpufreq: frequency change with timecounter"
- " TSC not allowed, see cpufreq(4)\n");
- once = 1;
- }
- return (EBUSY);
+ /* We are going to change levels so notify the pre-change handler. */
+ EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
+ if (error != 0) {
+ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
+ return (error);
}
CF_MTX_LOCK(&sc->lock);
+#ifdef SMP
+ /*
+ * If still booting and secondary CPUs not started yet, don't allow
+ * changing the frequency until they're online. This is because we
+ * can't switch to them using sched_bind() and thus we'd only be
+ * switching the main CPU. XXXTODO: Need to think more about how to
+ * handle having different CPUs at different frequencies.
+ */
+ if (mp_ncpus > 1 && !smp_active) {
+ device_printf(dev, "rejecting change, SMP not started yet\n");
+ error = ENXIO;
+ goto out;
+ }
+#endif /* SMP */
+
/*
* If the requested level has a lower priority, don't allow
* the new level right now.
@@ -296,22 +310,17 @@
goto out;
}
- /* Bind to the target CPU before switching, if necessary. */
- cpu_id = PCPU_GET(cpuid);
+ /* Bind to the target CPU before switching. */
pc = cpu_get_pcpu(set->dev);
- if (cpu_id != pc->pc_cpuid) {
- mtx_lock_spin(&sched_lock);
- sched_bind(curthread, pc->pc_cpuid);
- mtx_unlock_spin(&sched_lock);
- }
+ thread_lock(curthread);
+ sched_bind(curthread, pc->pc_cpuid);
+ thread_unlock(curthread);
CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
device_get_nameunit(set->dev), PCPU_GET(cpuid));
error = CPUFREQ_DRV_SET(set->dev, set);
- if (cpu_id != pc->pc_cpuid) {
- mtx_lock_spin(&sched_lock);
- sched_unbind(curthread);
- mtx_unlock_spin(&sched_lock);
- }
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
if (error) {
goto out;
}
@@ -325,22 +334,17 @@
goto out;
}
- /* Bind to the target CPU before switching, if necessary. */
- cpu_id = PCPU_GET(cpuid);
+ /* Bind to the target CPU before switching. */
pc = cpu_get_pcpu(set->dev);
- if (cpu_id != pc->pc_cpuid) {
- mtx_lock_spin(&sched_lock);
- sched_bind(curthread, pc->pc_cpuid);
- mtx_unlock_spin(&sched_lock);
- }
+ thread_lock(curthread);
+ sched_bind(curthread, pc->pc_cpuid);
+ thread_unlock(curthread);
CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
device_get_nameunit(set->dev), PCPU_GET(cpuid));
error = CPUFREQ_DRV_SET(set->dev, set);
- if (cpu_id != pc->pc_cpuid) {
- mtx_lock_spin(&sched_lock);
- sched_unbind(curthread);
- mtx_unlock_spin(&sched_lock);
- }
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
if (error) {
/* XXX Back out any successful setting? */
goto out;
@@ -378,8 +382,15 @@
out:
CF_MTX_UNLOCK(&sc->lock);
+
+ /*
+ * We changed levels (or attempted to) so notify the post-change
+ * handler of new frequency or error.
+ */
+ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
if (error && set)
device_printf(set->dev, "set freq failed, err %d\n", error);
+
return (error);
}
@@ -391,7 +402,7 @@
struct cf_setting *curr_set, set;
struct pcpu *pc;
device_t *devs;
- int count, error, i, numdevs;
+ int count, error, i, n, numdevs;
uint64_t rate;
sc = device_get_softc(dev);
@@ -438,10 +449,10 @@
* The estimation code below catches this case though.
*/
CF_MTX_LOCK(&sc->lock);
- for (i = 0; i < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; i++) {
- if (!device_is_attached(devs[i]))
+ for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) {
+ if (!device_is_attached(devs[n]))
continue;
- error = CPUFREQ_DRV_GET(devs[i], &set);
+ error = CPUFREQ_DRV_GET(devs[n], &set);
if (error)
continue;
for (i = 0; i < count; i++) {
@@ -595,6 +606,17 @@
/* Finally, output the list of levels. */
i = 0;
TAILQ_FOREACH(lev, &sc->all_levels, link) {
+ /*
+ * Skip levels that are too close in frequency to the
+ * previous levels. Some systems report bogus duplicate
+ * settings (i.e., for acpi_perf).
+ */
+ if (i > 0 && CPUFREQ_CMP(lev->total_set.freq,
+ levels[i - 1].total_set.freq)) {
+ sc->all_count--;
+ continue;
+ }
+
/* Skip levels that have a frequency that is too low. */
if (lev->total_set.freq < cf_lowest_freq) {
sc->all_count--;
@@ -1021,3 +1043,12 @@
return (0);
}
+
+int
+cpufreq_settings_changed(device_t dev)
+{
+
+ EVENTHANDLER_INVOKE(cpufreq_levels_changed,
+ device_get_unit(device_get_parent(dev)));
+ return (0);
+}
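
The sched_lock/cpu_id dance in cf_set_method() is replaced above by unconditional binding under the per-thread lock. The idiom in isolation, as a sketch (do_percpu_work() is a placeholder for the CPUFREQ_DRV_SET() call that must run on the target CPU):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

static int do_percpu_work(void);        /* placeholder */

static int
run_on_cpu(int cpu)
{
        int error;

        thread_lock(curthread);
        sched_bind(curthread, cpu);     /* migrate to and pin on 'cpu' */
        thread_unlock(curthread);

        error = do_percpu_work();       /* executes on the bound CPU */

        thread_lock(curthread);
        sched_unbind(curthread);        /* allow normal migration again */
        thread_unlock(curthread);

        return (error);
}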
Index: subr_pcpu.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_pcpu.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_pcpu.c -L sys/kern/subr_pcpu.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_pcpu.c
+++ sys/kern/subr_pcpu.c
@@ -43,7 +43,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_pcpu.c,v 1.6.2.2 2005/11/11 18:50:45 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_pcpu.c,v 1.8 2005/11/03 21:06:29 jhb Exp $");
#include "opt_ddb.h"
Index: syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/syscalls.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/syscalls.c -L sys/kern/syscalls.c -u -r1.2 -r1.3
--- sys/kern/syscalls.c
+++ sys/kern/syscalls.c
@@ -2,8 +2,8 @@
* System call names.
*
* DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/syscalls.c,v 1.181.2.2 2006/03/17 01:47:32 rwatson Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp
+ * $FreeBSD: src/sys/kern/syscalls.c,v 1.214 2007/08/16 05:32:26 davidxu Exp $
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp
*/
const char *syscallnames[] = {
@@ -15,7 +15,7 @@
"open", /* 5 = open */
"close", /* 6 = close */
"wait4", /* 7 = wait4 */
- "old.creat", /* 8 = old creat */
+ "compat.creat", /* 8 = old creat */
"link", /* 9 = link */
"unlink", /* 10 = unlink */
"obs_execv", /* 11 = obsolete execv */
@@ -25,8 +25,8 @@
"chmod", /* 15 = chmod */
"chown", /* 16 = chown */
"break", /* 17 = break */
- "old.getfsstat", /* 18 = old getfsstat */
- "old.lseek", /* 19 = old lseek */
+ "compat4.getfsstat", /* 18 = old getfsstat */
+ "compat.lseek", /* 19 = old lseek */
"getpid", /* 20 = getpid */
"mount", /* 21 = mount */
"unmount", /* 22 = unmount */
@@ -45,21 +45,21 @@
"fchflags", /* 35 = fchflags */
"sync", /* 36 = sync */
"kill", /* 37 = kill */
- "old.stat", /* 38 = old stat */
+ "compat.stat", /* 38 = old stat */
"getppid", /* 39 = getppid */
- "old.lstat", /* 40 = old lstat */
+ "compat.lstat", /* 40 = old lstat */
"dup", /* 41 = dup */
"pipe", /* 42 = pipe */
"getegid", /* 43 = getegid */
"profil", /* 44 = profil */
"ktrace", /* 45 = ktrace */
- "old.sigaction", /* 46 = old sigaction */
+ "compat.sigaction", /* 46 = old sigaction */
"getgid", /* 47 = getgid */
- "old.sigprocmask", /* 48 = old sigprocmask */
+ "compat.sigprocmask", /* 48 = old sigprocmask */
"getlogin", /* 49 = getlogin */
"setlogin", /* 50 = setlogin */
"acct", /* 51 = acct */
- "old.sigpending", /* 52 = old sigpending */
+ "compat.sigpending", /* 52 = old sigpending */
"sigaltstack", /* 53 = sigaltstack */
"ioctl", /* 54 = ioctl */
"reboot", /* 55 = reboot */
@@ -69,16 +69,16 @@
"execve", /* 59 = execve */
"umask", /* 60 = umask */
"chroot", /* 61 = chroot */
- "old.fstat", /* 62 = old fstat */
- "old.getkerninfo", /* 63 = old getkerninfo */
- "old.getpagesize", /* 64 = old getpagesize */
+ "compat.fstat", /* 62 = old fstat */
+ "compat.getkerninfo", /* 63 = old getkerninfo */
+ "compat.getpagesize", /* 64 = old getpagesize */
"msync", /* 65 = msync */
"vfork", /* 66 = vfork */
"obs_vread", /* 67 = obsolete vread */
"obs_vwrite", /* 68 = obsolete vwrite */
"sbrk", /* 69 = sbrk */
"sstk", /* 70 = sstk */
- "old.mmap", /* 71 = old mmap */
+ "compat.mmap", /* 71 = old mmap */
"vadvise", /* 72 = vadvise */
"munmap", /* 73 = munmap */
"mprotect", /* 74 = mprotect */
@@ -91,11 +91,11 @@
"getpgrp", /* 81 = getpgrp */
"setpgid", /* 82 = setpgid */
"setitimer", /* 83 = setitimer */
- "old.wait", /* 84 = old wait */
+ "compat.wait", /* 84 = old wait */
"swapon", /* 85 = swapon */
"getitimer", /* 86 = getitimer */
- "old.gethostname", /* 87 = old gethostname */
- "old.sethostname", /* 88 = old sethostname */
+ "compat.gethostname", /* 87 = old gethostname */
+ "compat.sethostname", /* 88 = old sethostname */
"getdtablesize", /* 89 = getdtablesize */
"dup2", /* 90 = dup2 */
"#91", /* 91 = getdopt */
@@ -106,22 +106,22 @@
"setpriority", /* 96 = setpriority */
"socket", /* 97 = socket */
"connect", /* 98 = connect */
- "old.accept", /* 99 = old accept */
+ "compat.accept", /* 99 = old accept */
"getpriority", /* 100 = getpriority */
- "old.send", /* 101 = old send */
- "old.recv", /* 102 = old recv */
- "old.sigreturn", /* 103 = old sigreturn */
+ "compat.send", /* 101 = old send */
+ "compat.recv", /* 102 = old recv */
+ "compat.sigreturn", /* 103 = old sigreturn */
"bind", /* 104 = bind */
"setsockopt", /* 105 = setsockopt */
"listen", /* 106 = listen */
"obs_vtimes", /* 107 = obsolete vtimes */
- "old.sigvec", /* 108 = old sigvec */
- "old.sigblock", /* 109 = old sigblock */
- "old.sigsetmask", /* 110 = old sigsetmask */
- "old.sigsuspend", /* 111 = old sigsuspend */
- "old.sigstack", /* 112 = old sigstack */
- "old.recvmsg", /* 113 = old recvmsg */
- "old.sendmsg", /* 114 = old sendmsg */
+ "compat.sigvec", /* 108 = old sigvec */
+ "compat.sigblock", /* 109 = old sigblock */
+ "compat.sigsetmask", /* 110 = old sigsetmask */
+ "compat.sigsuspend", /* 111 = old sigsuspend */
+ "compat.sigstack", /* 112 = old sigstack */
+ "compat.recvmsg", /* 113 = old recvmsg */
+ "compat.sendmsg", /* 114 = old sendmsg */
"obs_vtrace", /* 115 = obsolete vtrace */
"gettimeofday", /* 116 = gettimeofday */
"getrusage", /* 117 = getrusage */
@@ -132,12 +132,12 @@
"settimeofday", /* 122 = settimeofday */
"fchown", /* 123 = fchown */
"fchmod", /* 124 = fchmod */
- "old.recvfrom", /* 125 = old recvfrom */
+ "compat.recvfrom", /* 125 = old recvfrom */
"setreuid", /* 126 = setreuid */
"setregid", /* 127 = setregid */
"rename", /* 128 = rename */
- "old.truncate", /* 129 = old truncate */
- "old.ftruncate", /* 130 = old ftruncate */
+ "compat.truncate", /* 129 = old truncate */
+ "compat.ftruncate", /* 130 = old ftruncate */
"flock", /* 131 = flock */
"mkfifo", /* 132 = mkfifo */
"sendto", /* 133 = sendto */
@@ -148,24 +148,24 @@
"utimes", /* 138 = utimes */
"obs_4.2", /* 139 = obsolete 4.2 sigreturn */
"adjtime", /* 140 = adjtime */
- "old.getpeername", /* 141 = old getpeername */
- "old.gethostid", /* 142 = old gethostid */
- "old.sethostid", /* 143 = old sethostid */
- "old.getrlimit", /* 144 = old getrlimit */
- "old.setrlimit", /* 145 = old setrlimit */
- "old.killpg", /* 146 = old killpg */
+ "compat.getpeername", /* 141 = old getpeername */
+ "compat.gethostid", /* 142 = old gethostid */
+ "compat.sethostid", /* 143 = old sethostid */
+ "compat.getrlimit", /* 144 = old getrlimit */
+ "compat.setrlimit", /* 145 = old setrlimit */
+ "compat.killpg", /* 146 = old killpg */
"setsid", /* 147 = setsid */
"quotactl", /* 148 = quotactl */
- "old.quota", /* 149 = old quota */
- "old.getsockname", /* 150 = old getsockname */
+ "compat.quota", /* 149 = old quota */
+ "compat.getsockname", /* 150 = old getsockname */
"#151", /* 151 = sem_lock */
"#152", /* 152 = sem_wakeup */
"#153", /* 153 = asyncdaemon */
"#154", /* 154 = nosys */
"nfssvc", /* 155 = nfssvc */
- "old.getdirentries", /* 156 = old getdirentries */
- "old.statfs", /* 157 = old statfs */
- "old.fstatfs", /* 158 = old fstatfs */
+ "compat.getdirentries", /* 156 = old getdirentries */
+ "compat4.statfs", /* 157 = old statfs */
+ "compat4.fstatfs", /* 158 = old fstatfs */
"#159", /* 159 = nosys */
"lgetfh", /* 160 = lgetfh */
"getfh", /* 161 = getfh */
@@ -180,8 +180,8 @@
"msgsys", /* 170 = msgsys */
"shmsys", /* 171 = shmsys */
"#172", /* 172 = nosys */
- "pread", /* 173 = pread */
- "pwrite", /* 174 = pwrite */
+ "freebsd6_pread", /* 173 = freebsd6_pread */
+ "freebsd6_pwrite", /* 174 = freebsd6_pwrite */
"#175", /* 175 = nosys */
"ntp_adjtime", /* 176 = ntp_adjtime */
"#177", /* 177 = sfork */
@@ -204,11 +204,11 @@
"getrlimit", /* 194 = getrlimit */
"setrlimit", /* 195 = setrlimit */
"getdirentries", /* 196 = getdirentries */
- "mmap", /* 197 = mmap */
+ "freebsd6_mmap", /* 197 = freebsd6_mmap */
"__syscall", /* 198 = __syscall */
- "lseek", /* 199 = lseek */
- "truncate", /* 200 = truncate */
- "ftruncate", /* 201 = ftruncate */
+ "freebsd6_lseek", /* 199 = freebsd6_lseek */
+ "freebsd6_truncate", /* 200 = freebsd6_truncate */
+ "freebsd6_ftruncate", /* 201 = freebsd6_ftruncate */
"__sysctl", /* 202 = __sysctl */
"mlock", /* 203 = mlock */
"munlock", /* 204 = munlock */
@@ -242,11 +242,11 @@
"clock_gettime", /* 232 = clock_gettime */
"clock_settime", /* 233 = clock_settime */
"clock_getres", /* 234 = clock_getres */
- "#235", /* 235 = timer_create */
- "#236", /* 236 = timer_delete */
- "#237", /* 237 = timer_settime */
- "#238", /* 238 = timer_gettime */
- "#239", /* 239 = timer_getoverrun */
+ "ktimer_create", /* 235 = ktimer_create */
+ "ktimer_delete", /* 236 = ktimer_delete */
+ "ktimer_settime", /* 237 = ktimer_settime */
+ "ktimer_gettime", /* 238 = ktimer_gettime */
+ "ktimer_getoverrun", /* 239 = ktimer_getoverrun */
"nanosleep", /* 240 = nanosleep */
"#241", /* 241 = nosys */
"#242", /* 242 = nosys */
@@ -262,9 +262,9 @@
"openbsd_poll", /* 252 = openbsd_poll */
"issetugid", /* 253 = issetugid */
"lchown", /* 254 = lchown */
- "#255", /* 255 = nosys */
- "#256", /* 256 = nosys */
- "#257", /* 257 = nosys */
+ "aio_read", /* 255 = aio_read */
+ "aio_write", /* 256 = aio_write */
+ "lio_listio", /* 257 = lio_listio */
"#258", /* 258 = nosys */
"#259", /* 259 = nosys */
"#260", /* 260 = nosys */
@@ -304,7 +304,7 @@
"#294", /* 294 = nosys */
"#295", /* 295 = nosys */
"#296", /* 296 = nosys */
- "old.fhstatfs", /* 297 = old fhstatfs */
+ "compat4.fhstatfs", /* 297 = old fhstatfs */
"fhopen", /* 298 = fhopen */
"fhstat", /* 299 = fhstat */
"modnext", /* 300 = modnext */
@@ -325,9 +325,9 @@
"aio_suspend", /* 315 = aio_suspend */
"aio_cancel", /* 316 = aio_cancel */
"aio_error", /* 317 = aio_error */
- "aio_read", /* 318 = aio_read */
- "aio_write", /* 319 = aio_write */
- "lio_listio", /* 320 = lio_listio */
+ "oaio_read", /* 318 = oaio_read */
+ "oaio_write", /* 319 = oaio_write */
+ "olio_listio", /* 320 = olio_listio */
"yield", /* 321 = yield */
"obs_thr_sleep", /* 322 = obsolete thr_sleep */
"obs_thr_wakeup", /* 323 = obsolete thr_wakeup */
@@ -343,15 +343,15 @@
"sched_get_priority_min", /* 333 = sched_get_priority_min */
"sched_rr_get_interval", /* 334 = sched_rr_get_interval */
"utrace", /* 335 = utrace */
- "old.sendfile", /* 336 = old sendfile */
+ "compat4.sendfile", /* 336 = old sendfile */
"kldsym", /* 337 = kldsym */
"jail", /* 338 = jail */
"#339", /* 339 = pioctl */
"sigprocmask", /* 340 = sigprocmask */
"sigsuspend", /* 341 = sigsuspend */
- "old.sigaction", /* 342 = old sigaction */
+ "compat4.sigaction", /* 342 = old sigaction */
"sigpending", /* 343 = sigpending */
- "old.sigreturn", /* 344 = old sigreturn */
+ "compat4.sigreturn", /* 344 = old sigreturn */
"sigtimedwait", /* 345 = sigtimedwait */
"sigwaitinfo", /* 346 = sigwaitinfo */
"__acl_get_file", /* 347 = __acl_get_file */
@@ -463,4 +463,30 @@
"auditctl", /* 453 = auditctl */
"_umtx_op", /* 454 = _umtx_op */
"thr_new", /* 455 = thr_new */
+ "sigqueue", /* 456 = sigqueue */
+ "kmq_open", /* 457 = kmq_open */
+ "kmq_setattr", /* 458 = kmq_setattr */
+ "kmq_timedreceive", /* 459 = kmq_timedreceive */
+ "kmq_timedsend", /* 460 = kmq_timedsend */
+ "kmq_notify", /* 461 = kmq_notify */
+ "kmq_unlink", /* 462 = kmq_unlink */
+ "abort2", /* 463 = abort2 */
+ "thr_set_name", /* 464 = thr_set_name */
+ "aio_fsync", /* 465 = aio_fsync */
+ "rtprio_thread", /* 466 = rtprio_thread */
+ "#467", /* 467 = nosys */
+ "#468", /* 468 = nosys */
+ "#469", /* 469 = __getpath_fromfd */
+ "#470", /* 470 = __getpath_fromaddr */
+ "sctp_peeloff", /* 471 = sctp_peeloff */
+ "sctp_generic_sendmsg", /* 472 = sctp_generic_sendmsg */
+ "sctp_generic_sendmsg_iov", /* 473 = sctp_generic_sendmsg_iov */
+ "sctp_generic_recvmsg", /* 474 = sctp_generic_recvmsg */
+ "pread", /* 475 = pread */
+ "pwrite", /* 476 = pwrite */
+ "mmap", /* 477 = mmap */
+ "lseek", /* 478 = lseek */
+ "truncate", /* 479 = truncate */
+ "ftruncate", /* 480 = ftruncate */
+ "thr_kill2", /* 481 = thr_kill2 */
};
Index: kern_timeout.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_timeout.c -L sys/kern/kern_timeout.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_timeout.c
+++ sys/kern/kern_timeout.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_timeout.c,v 1.97.2.2 2005/09/26 19:49:12 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_timeout.c,v 1.106 2007/09/15 12:33:23 rwatson Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -46,6 +46,7 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/sleepqueue.h>
#include <sys/sysctl.h>
static int avg_depth;
@@ -78,37 +79,22 @@
/**
* Locked by callout_lock:
* curr_callout - If a callout is in progress, it is curr_callout.
- * If curr_callout is non-NULL, threads waiting on
- * callout_wait will be woken up as soon as the
+ * If curr_callout is non-NULL, threads waiting in
+ * callout_drain() will be woken up as soon as the
* relevant callout completes.
* curr_cancelled - Changing to 1 with both callout_lock and c_mtx held
* guarantees that the current callout will not run.
* The softclock() function sets this to 0 before it
* drops callout_lock to acquire c_mtx, and it calls
- * the handler only if curr_cancelled still 0 when
+ * the handler only if curr_cancelled is still 0 after
* c_mtx is successfully acquired.
- * wakeup_ctr - Incremented every time a thread wants to wait
- * for a callout to complete. Modified only when
+ * callout_wait - If a thread is waiting in callout_drain(), then
+ * callout_wait is nonzero. Set only when
* curr_callout is non-NULL.
- * wakeup_needed - If a thread is waiting on callout_wait, then
- * wakeup_needed is nonzero. Increased only when
- * cutt_callout is non-NULL.
*/
static struct callout *curr_callout;
static int curr_cancelled;
-static int wakeup_ctr;
-static int wakeup_needed;
-
-/**
- * Locked by callout_wait_lock:
- * callout_wait - If wakeup_needed is set, callout_wait will be
- * triggered after the current callout finishes.
- * wakeup_done_ctr - Set to the current value of wakeup_ctr after
- * callout_wait is triggered.
- */
-static struct mtx callout_wait_lock;
-static struct cv callout_wait;
-static int wakeup_done_ctr;
+static int callout_wait;
/*
* kern_timeout_callwheel_alloc() - kernel low level callwheel initialization
@@ -157,8 +143,6 @@
TAILQ_INIT(&callwheel[i]);
}
mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
- mtx_init(&callout_wait_lock, "callout_wait_lock", NULL, MTX_DEF);
- cv_init(&callout_wait, "callout_wait");
}
/*
@@ -188,7 +172,6 @@
int mpcalls;
int mtxcalls;
int gcalls;
- int wakeup_cookie;
#ifdef DIAGNOSTIC
struct bintime bt1, bt2;
struct timespec ts2;
@@ -262,26 +245,27 @@
*/
if (curr_cancelled) {
mtx_unlock(c_mtx);
- mtx_lock_spin(&callout_lock);
- goto done_locked;
+ goto skip;
}
/* The callout cannot be stopped now. */
curr_cancelled = 1;
if (c_mtx == &Giant) {
gcalls++;
- CTR1(KTR_CALLOUT, "callout %p",
- c_func);
+ CTR3(KTR_CALLOUT,
+ "callout %p func %p arg %p",
+ c, c_func, c_arg);
} else {
mtxcalls++;
- CTR1(KTR_CALLOUT,
- "callout mtx %p",
- c_func);
+ CTR3(KTR_CALLOUT, "callout mtx"
+ " %p func %p arg %p",
+ c, c_func, c_arg);
}
} else {
mpcalls++;
- CTR1(KTR_CALLOUT, "callout mpsafe %p",
- c_func);
+ CTR3(KTR_CALLOUT,
+ "callout mpsafe %p func %p arg %p",
+ c, c_func, c_arg);
}
#ifdef DIAGNOSTIC
binuptime(&bt1);
@@ -308,22 +292,18 @@
#endif
if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
mtx_unlock(c_mtx);
+ skip:
mtx_lock_spin(&callout_lock);
-done_locked:
curr_callout = NULL;
- if (wakeup_needed) {
+ if (callout_wait) {
/*
- * There might be someone waiting
+ * There is someone waiting
* for the callout to complete.
*/
- wakeup_cookie = wakeup_ctr;
+ callout_wait = 0;
mtx_unlock_spin(&callout_lock);
- mtx_lock(&callout_wait_lock);
- cv_broadcast(&callout_wait);
- wakeup_done_ctr = wakeup_cookie;
- mtx_unlock(&callout_wait_lock);
+ wakeup(&callout_wait);
mtx_lock_spin(&callout_lock);
- wakeup_needed = 0;
}
steps = 0;
c = nextsoftcheck;
@@ -445,11 +425,14 @@
*/
if (c->c_mtx != NULL && !curr_cancelled)
cancelled = curr_cancelled = 1;
- if (wakeup_needed) {
+ if (callout_wait) {
/*
* Someone has called callout_drain to kill this
* callout. Don't reschedule.
*/
+ CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+ cancelled ? "cancelled" : "failed to cancel",
+ c, c->c_func, c->c_arg);
mtx_unlock_spin(&callout_lock);
return (cancelled);
}
@@ -487,6 +470,8 @@
c->c_time = ticks + to_ticks;
TAILQ_INSERT_TAIL(&callwheel[c->c_time & callwheelmask],
c, c_links.tqe);
+ CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
+ cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
mtx_unlock_spin(&callout_lock);
return (cancelled);
@@ -497,7 +482,7 @@
struct callout *c;
int safe;
{
- int use_mtx, wakeup_cookie;
+ int use_mtx, sq_locked;
if (!safe && c->c_mtx != NULL) {
#ifdef notyet /* Some callers do not hold Giant for Giant-locked callouts. */
@@ -510,41 +495,100 @@
use_mtx = 0;
}
+ sq_locked = 0;
+again:
mtx_lock_spin(&callout_lock);
/*
- * Don't attempt to delete a callout that's not on the queue.
+ * If the callout isn't pending, it's not on the queue, so
+ * don't attempt to remove it from the queue. We can try to
+ * stop it by other means however.
*/
if (!(c->c_flags & CALLOUT_PENDING)) {
c->c_flags &= ~CALLOUT_ACTIVE;
+
+ /*
+ * If it wasn't on the queue and it isn't the current
+ * callout, then we can't stop it, so just bail.
+ */
if (c != curr_callout) {
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
mtx_unlock_spin(&callout_lock);
+ if (sq_locked)
+ sleepq_release(&callout_wait);
return (0);
}
- if (safe) {
- /* We need to wait until the callout is finished. */
- wakeup_needed = 1;
- wakeup_cookie = wakeup_ctr++;
- mtx_unlock_spin(&callout_lock);
- mtx_lock(&callout_wait_lock);
+ if (safe) {
/*
- * Check to make sure that softclock() didn't
- * do the wakeup in between our dropping
- * callout_lock and picking up callout_wait_lock
+ * The current callout is running (or just
+ * about to run) and blocking is allowed, so
+ * just wait for the current invocation to
+ * finish.
*/
- if (wakeup_cookie - wakeup_done_ctr > 0)
- cv_wait(&callout_wait, &callout_wait_lock);
+ while (c == curr_callout) {
+
+ /*
+ * Use direct calls to sleepqueue interface
+ * instead of cv/msleep in order to avoid
+ * a LOR between callout_lock and sleepqueue
+ * chain spinlocks. This piece of code
+ * effectively emulates an msleep_spin() call.
+ *
+ * If we already have the sleepqueue chain
+ * locked, then we can safely block. If we
+ * don't already have it locked, however,
+ * we have to drop the callout_lock to lock
+ * it. This opens several races, so we
+ * restart at the beginning once we have
+ * both locks. If nothing has changed, then
+ * we will end up back here with sq_locked
+ * set.
+ */
+ if (!sq_locked) {
+ mtx_unlock_spin(&callout_lock);
+ sleepq_lock(&callout_wait);
+ sq_locked = 1;
+ goto again;
+ }
+
+ callout_wait = 1;
+ DROP_GIANT();
+ mtx_unlock_spin(&callout_lock);
+ sleepq_add(&callout_wait,
+ &callout_lock.lock_object, "codrain",
+ SLEEPQ_SLEEP, 0);
+ sleepq_wait(&callout_wait);
+ sq_locked = 0;
- mtx_unlock(&callout_wait_lock);
+ /* Reacquire locks previously released. */
+ PICKUP_GIANT();
+ mtx_lock_spin(&callout_lock);
+ }
} else if (use_mtx && !curr_cancelled) {
- /* We can stop the callout before it runs. */
+ /*
+ * The current callout is waiting for its
+ * mutex which we hold. Cancel the callout
+ * and return. After our caller drops the
+ * mutex, the callout will be skipped in
+ * softclock().
+ */
curr_cancelled = 1;
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
mtx_unlock_spin(&callout_lock);
+ KASSERT(!sq_locked, ("sleepqueue chain locked"));
return (1);
- } else
- mtx_unlock_spin(&callout_lock);
+ }
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ mtx_unlock_spin(&callout_lock);
+ KASSERT(!sq_locked, ("sleepqueue chain still locked"));
return (0);
}
+ if (sq_locked)
+ sleepq_release(&callout_wait);
+
c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
if (nextsoftcheck == c) {
@@ -552,6 +596,9 @@
}
TAILQ_REMOVE(&callwheel[c->c_time & callwheelmask], c, c_links.tqe);
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+
if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
c->c_func = NULL;
SLIST_INSERT_HEAD(&callfree, c, c_links.sle);
@@ -583,12 +630,12 @@
{
bzero(c, sizeof *c);
c->c_mtx = mtx;
- KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0,
+ KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED)) == 0,
("callout_init_mtx: bad flags %d", flags));
/* CALLOUT_RETURNUNLOCKED makes no sense without a mutex. */
KASSERT(mtx != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
("callout_init_mtx: CALLOUT_RETURNUNLOCKED with no mutex"));
- c->c_flags = flags & CALLOUT_RETURNUNLOCKED;
+ c->c_flags = flags & (CALLOUT_RETURNUNLOCKED);
}
#ifdef APM_FIXUP_CALLTODO
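
callout_drain() now waits with raw sleepqueue(9) primitives rather than a condition variable, because callout_lock is a spin mutex and msleep()/cv_wait() on a regular mutex would create a lock order reversal. A simplified sketch of the same idiom (example_lock and example_flag are placeholders; the Giant handling and restart-after-relock step of the real code are omitted):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sleepqueue.h>

static struct mtx example_lock;         /* initialized elsewhere as MTX_SPIN */
static int example_flag;

static void
example_wait(void)
{
        mtx_lock_spin(&example_lock);
        while (example_flag == 0) {
                /* Lock the sleepqueue chain before giving up the spin lock. */
                mtx_unlock_spin(&example_lock);
                sleepq_lock(&example_flag);
                mtx_lock_spin(&example_lock);
                if (example_flag != 0) {
                        mtx_unlock_spin(&example_lock);
                        sleepq_release(&example_flag);
                        return;
                }
                mtx_unlock_spin(&example_lock);
                /* Chain lock is still held, so the wakeup cannot be lost. */
                sleepq_add(&example_flag, &example_lock.lock_object,
                    "exwait", SLEEPQ_SLEEP, 0);
                sleepq_wait(&example_flag);
                mtx_lock_spin(&example_lock);
        }
        mtx_unlock_spin(&example_lock);
}

static void
example_wakeup(void)
{
        mtx_lock_spin(&example_lock);
        example_flag = 1;
        mtx_unlock_spin(&example_lock);
        wakeup(&example_flag);
}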
Index: subr_param.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_param.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_param.c -L sys/kern/subr_param.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_param.c
+++ sys/kern/subr_param.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_param.c,v 1.71.2.1 2005/10/17 00:16:54 kris Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_param.c,v 1.73 2005/10/16 03:58:10 kris Exp $");
#include "opt_param.h"
#include "opt_maxusers.h"
Index: kern_kse.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_kse.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/kern_kse.c -L sys/kern/kern_kse.c -u -r1.3 -r1.4
--- sys/kern/kern_kse.c
+++ sys/kern/kern_kse.c
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_kse.c,v 1.214.2.6 2006/03/07 18:08:09 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_kse.c,v 1.235.4.1 2008/01/19 18:15:05 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -43,13 +43,12 @@
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
+#include <sys/syslog.h>
#include <sys/kse.h>
#include <sys/ktr.h>
#include <vm/uma.h>
-/*
- * KSEGRP related storage.
- */
+#ifdef KSE
static uma_zone_t upcall_zone;
/* DEBUG ONLY */
@@ -59,14 +58,20 @@
extern int max_threads_per_proc;
extern int max_groups_per_proc;
extern int max_threads_hits;
-extern struct mtx kse_zombie_lock;
+extern struct mtx kse_lock;
TAILQ_HEAD(, kse_upcall) zombie_upcalls =
TAILQ_HEAD_INITIALIZER(zombie_upcalls);
static int thread_update_usr_ticks(struct thread *td);
-static void thread_alloc_spare(struct thread *td);
+static int thread_alloc_spare(struct thread *td);
+static struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku);
+static struct kse_upcall *upcall_alloc(void);
+
+
+struct mtx kse_lock;
+MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN);
struct kse_upcall *
upcall_alloc(void)
@@ -78,45 +83,45 @@
}
void
-upcall_free(struct kse_upcall *ku)
-{
-
- uma_zfree(upcall_zone, ku);
-}
-
-void
-upcall_link(struct kse_upcall *ku, struct ksegrp *kg)
-{
-
- mtx_assert(&sched_lock, MA_OWNED);
- TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link);
- ku->ku_ksegrp = kg;
- kg->kg_numupcalls++;
-}
-
-void
-upcall_unlink(struct kse_upcall *ku)
+upcall_reap(void)
{
- struct ksegrp *kg = ku->ku_ksegrp;
+ TAILQ_HEAD(, kse_upcall) zupcalls;
+ struct kse_upcall *ku_item, *ku_tmp;
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__));
- TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link);
- kg->kg_numupcalls--;
- upcall_stash(ku);
+ TAILQ_INIT(&zupcalls);
+ mtx_lock_spin(&kse_lock);
+ if (!TAILQ_EMPTY(&zombie_upcalls)) {
+ TAILQ_CONCAT(&zupcalls, &zombie_upcalls, ku_link);
+ TAILQ_INIT(&zombie_upcalls);
+ }
+ mtx_unlock_spin(&kse_lock);
+ TAILQ_FOREACH_SAFE(ku_item, &zupcalls, ku_link, ku_tmp)
+ uma_zfree(upcall_zone, ku_item);
}
void
upcall_remove(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_upcall != NULL) {
+ /*
+ * If we are not a bound thread then decrement the count of
+ * possible upcall sources
+ */
+ if (td->td_pflags & TDP_SA)
+ td->td_proc->p_numupcalls--;
+ mtx_lock_spin(&kse_lock);
td->td_upcall->ku_owner = NULL;
- upcall_unlink(td->td_upcall);
+ TAILQ_REMOVE(&td->td_upcall->ku_proc->p_upcalls, td->td_upcall,
+ ku_link);
+ TAILQ_INSERT_HEAD(&zombie_upcalls, td->td_upcall, ku_link);
+ mtx_unlock_spin(&kse_lock);
td->td_upcall = NULL;
}
}
+#endif
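
upcall_reap() above drains the zombie list by detaching it whole under kse_lock with TAILQ_CONCAT() and only then freeing the entries, keeping the spin-locked section short. A generic sketch of that pattern (struct zombie, zombie_lock, and the M_TEMP malloc type are placeholders, not code from this commit):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>

struct zombie {
        TAILQ_ENTRY(zombie) link;
};
static TAILQ_HEAD(, zombie) zombies = TAILQ_HEAD_INITIALIZER(zombies);
static struct mtx zombie_lock;          /* initialized elsewhere as MTX_SPIN */

static void
zombie_reap(void)
{
        TAILQ_HEAD(, zombie) tmp;
        struct zombie *z, *zn;

        TAILQ_INIT(&tmp);
        mtx_lock_spin(&zombie_lock);
        if (!TAILQ_EMPTY(&zombies)) {
                /* O(1) move; TAILQ_CONCAT reinitializes the source head. */
                TAILQ_CONCAT(&tmp, &zombies, link);
        }
        mtx_unlock_spin(&zombie_lock);

        /* Free with no lock held; free() may not be called under a spin lock. */
        TAILQ_FOREACH_SAFE(z, &tmp, link, zn)
                free(z, M_TEMP);
}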
#ifndef _SYS_SYSPROTO_H_
struct kse_switchin_args {
@@ -125,15 +130,31 @@
};
#endif
+#ifdef KSE
+void
+kse_unlink(struct thread *td)
+{
+ mtx_lock_spin(&kse_lock);
+ thread_unlink(td);
+ mtx_unlock_spin(&kse_lock);
+ upcall_remove(td);
+}
+#endif
+
int
kse_switchin(struct thread *td, struct kse_switchin_args *uap)
{
+#ifdef KSE
struct kse_thr_mailbox tmbx;
struct kse_upcall *ku;
int error;
- if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td))
+ thread_lock(td);
+ if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
+ thread_unlock(td);
return (EINVAL);
+ }
+ thread_unlock(td);
error = (uap->tmbx == NULL) ? EINVAL : 0;
if (!error)
error = copyin(uap->tmbx, &tmbx, sizeof(tmbx));
@@ -156,17 +177,20 @@
else
ptrace_clear_single_step(td);
if (tmbx.tm_dflags & TMDF_SUSPEND) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
/* fuword can block, check again */
if (td->td_upcall)
ku->ku_flags |= KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
_PRELE(td->td_proc);
}
PROC_UNLOCK(td->td_proc);
}
return ((error == 0) ? EJUSTRETURN : error);
+#else /* !KSE */
+ return (EOPNOTSUPP);
+#endif
}
/*
@@ -179,6 +203,7 @@
int
kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
{
+#ifdef KSE
struct kse_execve_args args;
struct image_args iargs;
struct proc *p;
@@ -190,8 +215,12 @@
p = td->td_proc;
- if (!(p->p_flag & P_SA))
+ PROC_LOCK(p);
+ if (!(p->p_flag & P_SA)) {
+ PROC_UNLOCK(p);
return (EINVAL);
+ }
+ PROC_UNLOCK(p);
switch (uap->cmd) {
case KSE_INTR_SENDSIG:
@@ -200,23 +229,25 @@
case KSE_INTR_INTERRUPT:
case KSE_INTR_RESTART:
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td2) {
if (td2->td_mailbox == uap->tmbx)
break;
}
if (td2 == NULL) {
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (ESRCH);
}
+ thread_lock(td2);
+ PROC_SUNLOCK(p);
if (uap->cmd == KSE_INTR_SENDSIG) {
if (uap->data > 0) {
td2->td_flags &= ~TDF_INTERRUPT;
- mtx_unlock_spin(&sched_lock);
- tdsignal(td2, (int)uap->data, SIGTARGET_TD);
+ thread_unlock(td2);
+ tdsignal(p, td2, (int)uap->data, NULL);
} else {
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td2);
}
} else {
td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING;
@@ -228,7 +259,7 @@
td2->td_intrval = ERESTART;
if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR))
sleepq_abort(td2, td2->td_intrval);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td2);
}
PROC_UNLOCK(p);
break;
@@ -243,23 +274,29 @@
/* this sub-function is only for bound thread */
if (td->td_pflags & TDP_SA)
return (EINVAL);
+ thread_lock(td);
ku = td->td_upcall;
+ thread_unlock(td);
tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
if (tmbx == NULL || tmbx == (void *)-1)
return (EINVAL);
flags = 0;
+ PROC_LOCK(p);
while ((p->p_flag & P_TRACED) && !(p->p_flag & P_SINGLE_EXIT)) {
flags = fuword32(&tmbx->tm_dflags);
if (!(flags & TMDF_SUSPEND))
break;
- PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
thread_stopped(p);
- thread_suspend_one(td);
PROC_UNLOCK(p);
+ thread_lock(td);
+ thread_suspend_one(td);
+ PROC_SUNLOCK(p);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
+ PROC_LOCK(p);
}
+ PROC_UNLOCK(p);
return (0);
case KSE_INTR_EXECVE:
@@ -270,7 +307,6 @@
args.argv, args.envp);
if (error == 0)
error = kern_execve(td, &iargs, NULL);
- exec_free_args(&iargs);
if (error == 0) {
PROC_LOCK(p);
SIGSETOR(td->td_siglist, args.sigpend);
@@ -284,6 +320,9 @@
return (EINVAL);
}
return (0);
+#else /* !KSE */
+ return (EOPNOTSUPP);
+#endif
}
/*
@@ -294,8 +333,8 @@
int
kse_exit(struct thread *td, struct kse_exit_args *uap)
{
+#ifdef KSE
struct proc *p;
- struct ksegrp *kg;
struct kse_upcall *ku, *ku2;
int error, count;
@@ -303,35 +342,39 @@
/*
* Ensure that this is only called from the UTS
*/
- if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td))
+ thread_lock(td);
+ if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
+ thread_unlock(td);
return (EINVAL);
-
- kg = td->td_ksegrp;
- count = 0;
+ }
+ thread_unlock(td);
/*
- * Calculate the existing non-exiting upcalls in this ksegroup.
+ * Calculate the existing non-exiting upcalls in this process.
* If we are the last upcall but there are still other threads,
* then do not exit. We need the other threads to be able to
* complete whatever they are doing.
* XXX This relies on the userland knowing what to do if we return.
* It may be a better choice to convert ourselves into a kse_release
* ( or similar) and wait in the kernel to be needed.
+ * XXX Where are those other threads? I suppose they are waiting in
+ * the kernel. We should wait for them all at the user boundary after
+ * turning into an exit.
*/
+ count = 0;
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
- FOREACH_UPCALL_IN_GROUP(kg, ku2) {
- if (ku2->ku_flags & KUF_EXITING)
+ PROC_SLOCK(p);
+ FOREACH_UPCALL_IN_PROC(p, ku2) {
+ if ((ku2->ku_flags & KUF_EXITING) == 0)
count++;
}
- if ((kg->kg_numupcalls - count) == 1 &&
- (kg->kg_numthreads > 1)) {
- mtx_unlock_spin(&sched_lock);
+ if (count == 1 && (p->p_numthreads > 1)) {
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (EDEADLK);
}
ku->ku_flags |= KUF_EXITING;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
/*
@@ -346,23 +389,18 @@
PROC_LOCK(p);
if (error)
psignal(p, SIGSEGV);
- mtx_lock_spin(&sched_lock);
+ sigqueue_flush(&td->td_sigqueue);
+ PROC_SLOCK(p);
+ thread_lock(td);
upcall_remove(td);
+ thread_unlock(td);
if (p->p_numthreads != 1) {
- /*
- * If we are not the last thread, but we are the last
- * thread in this ksegrp, then by definition this is not
- * the last group and we need to clean it up as well.
- * thread_exit will clean up the kseg as needed.
- */
thread_stopped(p);
thread_exit();
/* NOTREACHED */
}
/*
* This is the last thread. Just return to the user.
- * We know that there is only one ksegrp too, as any others
- * would have been discarded in previous calls to thread_exit().
* Effectively we have left threading mode..
* The only real thing left to do is ensure that the
* scheduler sets out concurrency back to 1 as that may be a
@@ -372,13 +410,17 @@
* The other possibility would be to let the process exit.
*/
thread_unthread(td);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
-#if 1
+#if 0
return (0);
#else
+ printf("kse_exit: called on last thread. Calling exit1()");
exit1(td, 0);
#endif
+#else /* !KSE */
+ return (EOPNOTSUPP);
+#endif
}
/*
@@ -393,8 +435,8 @@
int
kse_release(struct thread *td, struct kse_release_args *uap)
{
+#ifdef KSE
struct proc *p;
- struct ksegrp *kg;
struct kse_upcall *ku;
struct timespec timeout;
struct timeval tv;
@@ -402,9 +444,13 @@
int error;
p = td->td_proc;
- kg = td->td_ksegrp;
- if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td))
- return (EINVAL);
+ thread_lock(td);
+ if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
+ thread_unlock(td);
+ printf("kse_release: called outside of threading. exiting");
+ exit1(td, 0);
+ }
+ thread_unlock(td);
if (uap->timeout != NULL) {
if ((error = copyin(uap->timeout, &timeout, sizeof(timeout))))
return (error);
@@ -437,23 +483,26 @@
} else {
if ((ku->ku_flags & KUF_DOUPCALL) == 0 &&
((ku->ku_mflags & KMF_NOCOMPLETED) ||
- (kg->kg_completed == NULL))) {
- kg->kg_upsleeps++;
+ (p->p_completed == NULL))) {
+ p->p_upsleeps++;
td->td_kflags |= TDK_KSEREL;
- error = msleep(&kg->kg_completed, &p->p_mtx,
+ error = msleep(&p->p_completed, &p->p_mtx,
PPAUSE|PCATCH, "kserel",
(uap->timeout ? tvtohz(&tv) : 0));
td->td_kflags &= ~(TDK_KSEREL | TDK_WAKEUP);
- kg->kg_upsleeps--;
+ p->p_upsleeps--;
}
PROC_UNLOCK(p);
}
if (ku->ku_flags & KUF_DOUPCALL) {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
ku->ku_flags &= ~KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
return (0);
+#else /* !KSE */
+ return (EOPNOTSUPP);
+#endif
}
/* struct kse_wakeup_args {
@@ -462,8 +511,8 @@
int
kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
{
+#ifdef KSE
struct proc *p;
- struct ksegrp *kg;
struct kse_upcall *ku;
struct thread *td2;
@@ -471,60 +520,64 @@
td2 = NULL;
ku = NULL;
/* KSE-enabled processes only, please. */
- if (!(p->p_flag & P_SA))
- return (EINVAL);
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ if (!(p->p_flag & P_SA)) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ PROC_SLOCK(p);
if (uap->mbx) {
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- FOREACH_UPCALL_IN_GROUP(kg, ku) {
- if (ku->ku_mailbox == uap->mbx)
- break;
- }
- if (ku)
+ FOREACH_UPCALL_IN_PROC(p, ku) {
+ if (ku->ku_mailbox == uap->mbx)
break;
}
} else {
- kg = td->td_ksegrp;
- if (kg->kg_upsleeps) {
- mtx_unlock_spin(&sched_lock);
- wakeup(&kg->kg_completed);
+ if (p->p_upsleeps) {
+ PROC_SUNLOCK(p);
+ wakeup(&p->p_completed);
PROC_UNLOCK(p);
return (0);
}
- ku = TAILQ_FIRST(&kg->kg_upcalls);
+ ku = TAILQ_FIRST(&p->p_upcalls);
}
if (ku == NULL) {
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (ESRCH);
}
+ mtx_lock_spin(&kse_lock);
if ((td2 = ku->ku_owner) == NULL) {
- mtx_unlock_spin(&sched_lock);
+ mtx_unlock_spin(&kse_lock);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
panic("%s: no owner", __func__);
} else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) {
- mtx_unlock_spin(&sched_lock);
+ mtx_unlock_spin(&kse_lock);
if (!(td2->td_kflags & TDK_WAKEUP)) {
td2->td_kflags |= TDK_WAKEUP;
if (td2->td_kflags & TDK_KSEREL)
- sleepq_remove(td2, &kg->kg_completed);
+ sleepq_remove(td2, &p->p_completed);
else
sleepq_remove(td2, &p->p_siglist);
}
} else {
ku->ku_flags |= KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
+ mtx_unlock_spin(&kse_lock);
}
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (0);
+#else /* !KSE */
+ return (EOPNOTSUPP);
+#endif
}
/*
- * No new KSEG: first call: use current KSE, don't schedule an upcall
+ * newgroup == 0: first call: use current KSE, don't schedule an upcall
* All other situations, do allocate max new KSEs and schedule an upcall.
*
* XXX should be changed so that 'first' behaviour lasts for as long
- * as you have not made a kse in this ksegrp. i.e. as long as we do not have
+ * as you have not made a thread in this proc. i.e. as long as we do not have
* a mailbox..
*/
/* struct kse_create_args {
@@ -534,8 +587,7 @@
int
kse_create(struct thread *td, struct kse_create_args *uap)
{
- struct ksegrp *newkg;
- struct ksegrp *kg;
+#ifdef KSE
struct proc *p;
struct kse_mailbox mbx;
struct kse_upcall *newku;
@@ -543,187 +595,117 @@
struct thread *newtd;
p = td->td_proc;
- kg = td->td_ksegrp;
- if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
- return (err);
-
- ncpus = mp_ncpus;
- if (virtual_cpu != 0)
- ncpus = virtual_cpu;
- /*
- * If the new UTS mailbox says that this
- * will be a BOUND lwp, then it had better
- * have its thread mailbox already there.
- * In addition, this ksegrp will be limited to
- * a concurrency of 1. There is more on this later.
- */
- if (mbx.km_flags & KMF_BOUND) {
- if (mbx.km_curthread == NULL)
- return (EINVAL);
- ncpus = 1;
- } else {
- sa = TDP_SA;
- }
- PROC_LOCK(p);
/*
* Processes using the other threading model can't
* suddenly start calling this one
+ * XXX maybe...
*/
+ PROC_LOCK(p);
if ((p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS) {
PROC_UNLOCK(p);
return (EINVAL);
}
-
- /*
- * Limit it to NCPU upcall contexts per ksegrp in any case.
- * There is a small race here as we don't hold proclock
- * until we inc the ksegrp count, but it's not really a big problem
- * if we get one too many, but we save a proc lock.
- */
- if ((!uap->newgroup) && (kg->kg_numupcalls >= ncpus)) {
- PROC_UNLOCK(p);
- return (EPROCLIM);
- }
-
if (!(p->p_flag & P_SA)) {
first = 1;
p->p_flag |= P_SA|P_HADTHREADS;
}
-
PROC_UNLOCK(p);
+
+ if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
+ return (err);
+
+ ncpus = mp_ncpus;
+ if (virtual_cpu != 0)
+ ncpus = virtual_cpu;
/*
- * Now pay attention!
- * If we are going to be bound, then we need to be either
- * a new group, or the first call ever. In either
- * case we will be creating (or be) the only thread in a group.
- * and the concurrency will be set to 1.
- * This is not quite right, as we may still make ourself
- * bound after making other ksegrps but it will do for now.
- * The library will only try do this much.
+ * If the new UTS mailbox says that this
+ * will be a BOUND lwp, then it had better
+ * have its thread mailbox already there.
*/
- if (!sa && !(uap->newgroup || first))
- return (EINVAL);
-
- if (uap->newgroup) {
- newkg = ksegrp_alloc();
- bzero(&newkg->kg_startzero,
- __rangeof(struct ksegrp, kg_startzero, kg_endzero));
- bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
- __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
- sched_init_concurrency(newkg);
+ if ((mbx.km_flags & KMF_BOUND) || uap->newgroup) {
+ /* It's a bound thread (1:1) */
+ if (mbx.km_curthread == NULL)
+ return (EINVAL);
+ ncpus = 1;
+ if (!(uap->newgroup || first))
+ return (EINVAL);
+ } else {
+ /* It's an upcall capable thread */
+ sa = TDP_SA;
PROC_LOCK(p);
- if (p->p_numksegrps >= max_groups_per_proc) {
+ /*
+ * Limit it to NCPU upcall contexts per proc in any case.
+ * numupcalls will soon be numkse or something
+ * as it will represent the number of
+ * non-bound upcalls available. (i.e. ones that can
+ * actually call up).
+ */
+ if (p->p_numupcalls >= ncpus) {
PROC_UNLOCK(p);
- ksegrp_free(newkg);
return (EPROCLIM);
}
- ksegrp_link(newkg, p);
- mtx_lock_spin(&sched_lock);
- sched_fork_ksegrp(td, newkg);
- mtx_unlock_spin(&sched_lock);
+ p->p_numupcalls++;
PROC_UNLOCK(p);
- } else {
- /*
- * We want to make a thread in our own ksegrp.
- * If we are just the first call, either kind
- * is ok, but if not then either we must be
- * already an upcallable thread to make another,
- * or a bound thread to make one of those.
- * Once again, not quite right but good enough for now.. XXXKSE
- */
- if (!first && ((td->td_pflags & TDP_SA) != sa))
- return (EINVAL);
-
- newkg = kg;
}
- /*
- * This test is a bit "indirect".
- * It might simplify things if we made a direct way of testing
- * if a ksegrp has been worked on before.
- * In the case of a bound request and the concurrency being set to
- * one, the concurrency will already be 1 so it's just inefficient
- * but not dangerous to call this again. XXX
+ /*
+ * For the first call this may not have been set.
+ * Of course nor may it actually be needed.
+ * thread_schedule_upcall() will look for it.
*/
- if (newkg->kg_numupcalls == 0) {
- /*
- * Initialize KSE group with the appropriate
- * concurrency.
- *
- * For a multiplexed group, create as as much concurrency
- * as the number of physical cpus.
- * This increases concurrency in the kernel even if the
- * userland is not MP safe and can only run on a single CPU.
- * In an ideal world, every physical cpu should execute a
- * thread. If there is enough concurrency, threads in the
- * kernel can be executed parallel on different cpus at
- * full speed without being restricted by the number of
- * upcalls the userland provides.
- * Adding more upcall structures only increases concurrency
- * in userland.
- *
- * For a bound thread group, because there is only one thread
- * in the group, we only set the concurrency for the group
- * to 1. A thread in this kind of group will never schedule
- * an upcall when blocked. This simulates pthread system
- * scope thread behaviour.
- */
- sched_set_concurrency(newkg, ncpus);
+ if (td->td_standin == NULL) {
+ if (!thread_alloc_spare(td))
+ return (ENOMEM);
}
+
/*
* Even bound LWPs get a mailbox and an upcall to hold it.
+ * XXX This should change.
*/
newku = upcall_alloc();
newku->ku_mailbox = uap->mbx;
newku->ku_func = mbx.km_func;
bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t));
- /*
- * For the first call this may not have been set.
- * Of course nor may it actually be needed.
- */
- if (td->td_standin == NULL)
- thread_alloc_spare(td);
-
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
- if (newkg->kg_numupcalls >= ncpus) {
- mtx_unlock_spin(&sched_lock);
- PROC_UNLOCK(p);
- upcall_free(newku);
- return (EPROCLIM);
- }
-
+ PROC_SLOCK(p);
/*
* If we are the first time, and a normal thread,
* then transfer all the signals back to the 'process'.
* SA threading will make a special thread to handle them.
*/
- if (first && sa) {
- SIGSETOR(p->p_siglist, td->td_siglist);
- SIGEMPTYSET(td->td_siglist);
+ if (first) {
+ sigqueue_move_set(&td->td_sigqueue, &p->p_sigqueue,
+ &td->td_sigqueue.sq_signals);
SIGFILLSET(td->td_sigmask);
SIG_CANTMASK(td->td_sigmask);
}
/*
- * Make the new upcall available to the ksegrp.
+ * Make the new upcall available to the process.
* It may or may not use it, but it's available.
*/
- upcall_link(newku, newkg);
+ TAILQ_INSERT_TAIL(&p->p_upcalls, newku, ku_link);
+ newku->ku_proc = p;
PROC_UNLOCK(p);
if (mbx.km_quantum)
- newkg->kg_upquantum = max(1, mbx.km_quantum / tick);
+/* XXX should this be in the thread? */
+ p->p_upquantum = max(1, mbx.km_quantum / tick);
/*
* Each upcall structure has an owner thread, find which
* one owns it.
*/
+ thread_lock(td);
+ mtx_lock_spin(&kse_lock);
if (uap->newgroup) {
/*
- * Because the new ksegrp hasn't a thread,
- * create an initial upcall thread to own it.
+ * The newgroup parameter now means
+ * "bound, non SA, system scope"
+ * It is only used for the interrupt thread at the
+ * moment I think.. (or system scope threads dopey).
+ * We'll rename it later.
*/
newtd = thread_schedule_upcall(td, newku);
} else {
@@ -743,11 +725,14 @@
newtd = thread_schedule_upcall(td, newku);
}
}
- mtx_unlock_spin(&sched_lock);
+ mtx_unlock_spin(&kse_lock);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
/*
* Let the UTS instance know its LWPID.
* It doesn't really care. But the debugger will.
+ * XXX warning.. remember that this moves.
*/
suword32(&newku->ku_mailbox->km_lwp, newtd->td_tid);
@@ -755,13 +740,22 @@
* In the same manner, if the UTS has a current user thread,
* then it is also running on this LWP so set it as well.
* The library could do that of course.. but why not..
+ * XXX I'm not sure this can ever happen but ...
+ * XXX does the UTS ever set this in the mailbox before calling this?
*/
if (mbx.km_curthread)
suword32(&mbx.km_curthread->tm_lwp, newtd->td_tid);
-
if (sa) {
newtd->td_pflags |= TDP_SA;
+ /*
+ * If we are starting a new thread, kick it off.
+ */
+ if (newtd != td) {
+ thread_lock(newtd);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
+ }
} else {
newtd->td_pflags &= ~TDP_SA;
@@ -793,20 +787,18 @@
_PRELE(p);
}
PROC_UNLOCK(p);
+ thread_lock(newtd);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
}
}
-
- /*
- * If we are starting a new thread, kick it off.
- */
- if (newtd != td) {
- mtx_lock_spin(&sched_lock);
- setrunqueue(newtd, SRQ_BORING);
- mtx_unlock_spin(&sched_lock);
- }
return (0);
+#else /* !KSE */
+ return (EOPNOTSUPP);
+#endif
}
+#ifdef KSE
/*
* Initialize global thread allocation resources.
*/
@@ -819,60 +811,20 @@
}
/*
- * Stash an embarasingly extra upcall into the zombie upcall queue.
- */
-
-void
-upcall_stash(struct kse_upcall *ku)
-{
- mtx_lock_spin(&kse_zombie_lock);
- TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link);
- mtx_unlock_spin(&kse_zombie_lock);
-}
-
-/*
- * Reap zombie kse resource.
- */
-void
-kse_GC(void)
-{
- struct kse_upcall *ku_first, *ku_next;
-
- /*
- * Don't even bother to lock if none at this instant,
- * we really don't care about the next instant..
- */
- if (!TAILQ_EMPTY(&zombie_upcalls)) {
- mtx_lock_spin(&kse_zombie_lock);
- ku_first = TAILQ_FIRST(&zombie_upcalls);
- if (ku_first)
- TAILQ_INIT(&zombie_upcalls);
- mtx_unlock_spin(&kse_zombie_lock);
- while (ku_first) {
- ku_next = TAILQ_NEXT(ku_first, ku_link);
- upcall_free(ku_first);
- ku_first = ku_next;
- }
- }
-}
-
-/*
* Store the thread context in the UTS's mailbox.
* then add the mailbox at the head of a list we are building in user space.
- * The list is anchored in the ksegrp structure.
+ * The list is anchored in the proc structure.
*/
int
thread_export_context(struct thread *td, int willexit)
{
struct proc *p;
- struct ksegrp *kg;
uintptr_t mbx;
void *addr;
int error = 0, sig;
mcontext_t mc;
p = td->td_proc;
- kg = td->td_ksegrp;
/*
* Post sync signal, or process SIGKILL and SIGSTOP.
@@ -881,9 +833,9 @@
*/
PROC_LOCK(p);
if (td->td_flags & TDF_NEEDSIGCHK) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags &= ~TDF_NEEDSIGCHK;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td)) != 0)
postsig(sig);
@@ -913,14 +865,15 @@
* entry into this one
*/
for (;;) {
- mbx = (uintptr_t)kg->kg_completed;
+ mbx = (uintptr_t)p->p_completed;
if (suword(addr, mbx)) {
error = EFAULT;
goto bad;
}
PROC_LOCK(p);
- if (mbx == (uintptr_t)kg->kg_completed) {
- kg->kg_completed = td->td_mailbox;
+ if (mbx == (uintptr_t)p->p_completed) {
+ thread_lock(td);
+ p->p_completed = td->td_mailbox;
/*
* The thread context may be taken away by
* other upcall threads when we unlock
@@ -928,6 +881,7 @@
* use it again in any other places.
*/
td->td_mailbox = NULL;
+ thread_unlock(td);
PROC_UNLOCK(p);
break;
}
@@ -943,19 +897,18 @@
}
/*
- * Take the list of completed mailboxes for this KSEGRP and put them on this
+ * Take the list of completed mailboxes for this Process and put them on this
* upcall's mailbox as it's the next one going up.
*/
static int
-thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku)
+thread_link_mboxes(struct proc *p, struct kse_upcall *ku)
{
- struct proc *p = kg->kg_proc;
void *addr;
uintptr_t mbx;
addr = (void *)(&ku->ku_mailbox->km_completed);
for (;;) {
- mbx = (uintptr_t)kg->kg_completed;
+ mbx = (uintptr_t)p->p_completed;
if (suword(addr, mbx)) {
PROC_LOCK(p);
psignal(p, SIGSEGV);
@@ -963,8 +916,8 @@
return (EFAULT);
}
PROC_LOCK(p);
- if (mbx == (uintptr_t)kg->kg_completed) {
- kg->kg_completed = NULL;
+ if (mbx == (uintptr_t)p->p_completed) {
+ p->p_completed = NULL;
PROC_UNLOCK(p);
break;
}
@@ -985,9 +938,9 @@
return (0);
if (user) {
/* Current always do via ast() */
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_flags |= TDF_ASTPENDING;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
td->td_uuticks++;
} else if (td->td_mailbox != NULL)
td->td_usticks++;
@@ -1004,8 +957,12 @@
caddr_t addr;
u_int uticks;
- if (td->td_mailbox == NULL)
+ thread_lock(td);
+ if (td->td_mailbox == NULL) {
+ thread_unlock(td);
return (-1);
+ }
+ thread_unlock(td);
if ((uticks = td->td_uuticks) != 0) {
td->td_uuticks = 0;
@@ -1030,25 +987,29 @@
/*
* This function is intended to be used to initialize a spare thread
- * for upcall. Initialize thread's large data area outside sched_lock
+ * for upcall. Initialize thread's large data area outside the thread lock
* for thread_schedule_upcall(). The crhold is also here to get it out
* from the schedlock as it has a mutex op itself.
* XXX BUG.. we need to get the cr ref after the thread has
* checked and changed its own, not 6 months before...
*/
-void
+int
thread_alloc_spare(struct thread *td)
{
struct thread *spare;
if (td->td_standin)
- return;
+ return (1);
spare = thread_alloc();
+ if (spare == NULL)
+ return (0);
td->td_standin = spare;
bzero(&spare->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
spare->td_proc = td->td_proc;
spare->td_ucred = crhold(td->td_ucred);
+ spare->td_flags = TDF_INMEM;
+ return (1);
}
/*
@@ -1060,8 +1021,8 @@
{
struct thread *td2;
- mtx_assert(&sched_lock, MA_OWNED);
-
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ mtx_assert(&kse_lock, MA_OWNED);
/*
* Schedule an upcall thread on specified kse_upcall,
* the kse_upcall must be free.
@@ -1082,19 +1043,18 @@
*/
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
- thread_link(td2, ku->ku_ksegrp);
+ sched_fork_thread(td, td2);
+ thread_link(td2, ku->ku_proc);
/* inherit parts of blocked thread's context as a good template */
cpu_set_upcall(td2, td);
/* Let the new thread become owner of the upcall */
ku->ku_owner = td2;
td2->td_upcall = ku;
- td2->td_flags = 0;
td2->td_pflags = TDP_SA|TDP_UPCALLING;
td2->td_state = TDS_CAN_RUN;
td2->td_inhibitors = 0;
SIGFILLSET(td2->td_sigmask);
SIG_CANTMASK(td2->td_sigmask);
- sched_fork_thread(td, td2);
return (td2); /* bogus.. should be a void function */
}
@@ -1103,10 +1063,9 @@
* debugged.
*/
void
-thread_signal_add(struct thread *td, int sig)
+thread_signal_add(struct thread *td, ksiginfo_t *ksi)
{
struct proc *p;
- siginfo_t siginfo;
struct sigacts *ps;
int error;
@@ -1115,11 +1074,11 @@
ps = p->p_sigacts;
mtx_assert(&ps->ps_mtx, MA_OWNED);
- cpu_thread_siginfo(sig, 0, &siginfo);
mtx_unlock(&ps->ps_mtx);
- SIGADDSET(td->td_sigmask, sig);
+ SIGADDSET(td->td_sigmask, ksi->ksi_signo);
PROC_UNLOCK(p);
- error = copyout(&siginfo, &td->td_mailbox->tm_syncsig, sizeof(siginfo));
+ error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
+ sizeof(siginfo_t));
if (error) {
PROC_LOCK(p);
sigexit(td, SIGSEGV);
@@ -1134,7 +1093,7 @@
struct kse_upcall *ku;
struct thread *td2;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If the outgoing thread is in threaded group and has never
@@ -1160,13 +1119,17 @@
 * start up immediately, or at least before us if
* we release our slot.
*/
+ mtx_lock_spin(&kse_lock);
ku = td->td_upcall;
ku->ku_owner = NULL;
td->td_upcall = NULL;
td->td_pflags &= ~TDP_CAN_UNBIND;
td2 = thread_schedule_upcall(td, ku);
+ mtx_unlock_spin(&kse_lock);
if (flags & SW_INVOL || nextthread) {
- setrunqueue(td2, SRQ_YIELDING);
+ thread_lock(td2);
+ sched_add(td2, SRQ_YIELDING);
+ thread_unlock(td2);
} else {
/* Keep up with reality.. we have one extra thread
* in the picture.. and it's 'running'.
@@ -1184,7 +1147,6 @@
thread_user_enter(struct thread *td)
{
struct proc *p = td->td_proc;
- struct ksegrp *kg;
struct kse_upcall *ku;
struct kse_thr_mailbox *tmbx;
uint32_t flags;
@@ -1207,15 +1169,26 @@
* note where our mailbox is.
*/
- kg = td->td_ksegrp;
+ thread_lock(td);
ku = td->td_upcall;
+ thread_unlock(td);
KASSERT(ku != NULL, ("no upcall owned"));
KASSERT(ku->ku_owner == td, ("wrong owner"));
KASSERT(!TD_CAN_UNBIND(td), ("can unbind"));
- if (td->td_standin == NULL)
- thread_alloc_spare(td);
+ if (td->td_standin == NULL) {
+ if (!thread_alloc_spare(td)) {
+ PROC_LOCK(p);
+ if (kern_logsigexit)
+ log(LOG_INFO,
+ "pid %d (%s), uid %d: thread_alloc_spare failed\n",
+ p->p_pid, p->p_comm,
+ td->td_ucred ? td->td_ucred->cr_uid : -1);
+ sigexit(td, SIGSEGV); /* XXX ? */
+ /* panic("thread_user_enter: thread_alloc_spare failed"); */
+ }
+ }
ku->ku_mflags = fuword32((void *)&ku->ku_mailbox->km_flags);
tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
if ((tmbx == NULL) || (tmbx == (void *)-1L) ||
@@ -1235,16 +1208,18 @@
} else {
td->td_mailbox = tmbx;
td->td_pflags |= TDP_CAN_UNBIND;
+ PROC_LOCK(p);
if (__predict_false(p->p_flag & P_TRACED)) {
flags = fuword32(&tmbx->tm_dflags);
if (flags & TMDF_SUSPEND) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
/* fuword can block, check again */
if (td->td_upcall)
ku->ku_flags |= KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
+ PROC_UNLOCK(p);
}
}
}
@@ -1265,10 +1240,9 @@
thread_userret(struct thread *td, struct trapframe *frame)
{
struct kse_upcall *ku;
- struct ksegrp *kg, *kg2;
struct proc *p;
struct timespec ts;
- int error = 0, upcalls, uts_crit;
+ int error = 0, uts_crit;
/* Nothing to do with bound thread */
if (!(td->td_pflags & TDP_SA))
@@ -1285,7 +1259,7 @@
}
p = td->td_proc;
- kg = td->td_ksegrp;
+ thread_lock(td);
ku = td->td_upcall;
/*
@@ -1295,11 +1269,12 @@
* then it can return direct to userland.
*/
if (TD_CAN_UNBIND(td)) {
+ thread_unlock(td);
td->td_pflags &= ~TDP_CAN_UNBIND;
if ((td->td_flags & TDF_NEEDSIGCHK) == 0 &&
- (kg->kg_completed == NULL) &&
+ (p->p_completed == NULL) &&
(ku->ku_flags & KUF_DOUPCALL) == 0 &&
- (kg->kg_upquantum && ticks < kg->kg_nextupcall)) {
+ (p->p_upquantum && ticks < p->p_nextupcall)) {
nanotime(&ts);
error = copyout(&ts,
(caddr_t)&ku->ku_mailbox->km_timeofday,
@@ -1318,53 +1293,46 @@
*/
td->td_pflags |= TDP_UPCALLING;
} else if (td->td_mailbox && (ku == NULL)) {
+ thread_unlock(td);
thread_export_context(td, 1);
PROC_LOCK(p);
- if (kg->kg_upsleeps)
- wakeup(&kg->kg_completed);
- WITNESS_WARN(WARN_PANIC, &p->p_mtx.mtx_object,
+ if (p->p_upsleeps)
+ wakeup(&p->p_completed);
+ WITNESS_WARN(WARN_PANIC, &p->p_mtx.lock_object,
"thread exiting in userret");
- mtx_lock_spin(&sched_lock);
+ sigqueue_flush(&td->td_sigqueue);
+ PROC_SLOCK(p);
thread_stopped(p);
thread_exit();
/* NOTREACHED */
- }
+ } else
+ thread_unlock(td);
KASSERT(ku != NULL, ("upcall is NULL"));
KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind"));
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
if (p->p_numthreads > max_threads_per_proc) {
max_threads_hits++;
- PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
- p->p_maxthrwaits++;
while (p->p_numthreads > max_threads_per_proc) {
- upcalls = 0;
- FOREACH_KSEGRP_IN_PROC(p, kg2) {
- if (kg2->kg_numupcalls == 0)
- upcalls++;
- else
- upcalls += kg2->kg_numupcalls;
- }
- if (upcalls >= max_threads_per_proc)
+ if (p->p_numupcalls >= max_threads_per_proc)
break;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH,
"maxthreads", hz/10) != EWOULDBLOCK) {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
break;
- } else {
- mtx_lock_spin(&sched_lock);
- }
+ } else
+ PROC_SLOCK(p);
}
- p->p_maxthrwaits--;
- mtx_unlock_spin(&sched_lock);
- PROC_UNLOCK(p);
}
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
if (td->td_pflags & TDP_UPCALLING) {
uts_crit = 0;
- kg->kg_nextupcall = ticks + kg->kg_upquantum;
+ p->p_nextupcall = ticks + p->p_upquantum;
/*
* There is no more work to do and we are going to ride
* this thread up to userland as an upcall.
@@ -1375,9 +1343,9 @@
td->td_pflags &= ~TDP_UPCALLING;
if (ku->ku_flags & KUF_DOUPCALL) {
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
ku->ku_flags &= ~KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
/*
* Set user context to the UTS
@@ -1409,7 +1377,7 @@
* this KSE's mailbox.
*/
if (!(ku->ku_mflags & KMF_NOCOMPLETED) &&
- (error = thread_link_mboxes(kg, ku)) != 0)
+ (error = thread_link_mboxes(p, ku)) != 0)
goto out;
}
if (!uts_crit) {
@@ -1434,7 +1402,7 @@
* for when we re-enter the kernel.
*/
if (td->td_standin == NULL)
- thread_alloc_spare(td);
+ thread_alloc_spare(td); /* XXX care of failure ? */
}
ku->ku_mflags = 0;
@@ -1452,7 +1420,6 @@
void
thread_continued(struct proc *p)
{
- struct ksegrp *kg;
struct kse_upcall *ku;
struct thread *td;
@@ -1463,19 +1430,15 @@
return;
if (p->p_flag & P_TRACED) {
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- td = TAILQ_FIRST(&kg->kg_threads);
- if (td == NULL)
- continue;
- /* not a SA group, nothing to do */
- if (!(td->td_pflags & TDP_SA))
- continue;
- FOREACH_UPCALL_IN_GROUP(kg, ku) {
- mtx_lock_spin(&sched_lock);
+ td = TAILQ_FIRST(&p->p_threads);
+ if (td && (td->td_pflags & TDP_SA)) {
+ FOREACH_UPCALL_IN_PROC(p, ku) {
+ PROC_SLOCK(p);
ku->ku_flags |= KUF_DOUPCALL;
- mtx_unlock_spin(&sched_lock);
- wakeup(&kg->kg_completed);
+ PROC_SUNLOCK(p);
+ wakeup(&p->p_completed);
}
}
}
}
+#endif
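
The kern_kse.c hunks above keep using one optimistic pattern to publish a completed
mailbox to user space: read the current p_completed head without the proc lock, store
it with suword(), then re-check under PROC_LOCK() and retry if another thread moved
the head in the meantime. Below is a minimal userland sketch of that retry shape,
assuming a pthread mutex in place of the proc lock and plain pointers in place of the
user-space stores; none of the names here are kernel API.

#include <pthread.h>
#include <stdio.h>

struct mailbox {
	struct mailbox *next;		/* stands in for the mailbox link word */
	int id;
};

static pthread_mutex_t head_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mailbox *completed_head;	/* stands in for p->p_completed */

/* Push mb onto the completed list, mimicking the suword()/recheck loop. */
static void
publish_mailbox(struct mailbox *mb)
{
	struct mailbox *snapshot;

	for (;;) {
		snapshot = completed_head;	/* unlocked read of the head */
		mb->next = snapshot;		/* link to the old head first */
		pthread_mutex_lock(&head_lock);
		if (snapshot == completed_head) {
			completed_head = mb;	/* head unchanged: commit */
			pthread_mutex_unlock(&head_lock);
			return;
		}
		pthread_mutex_unlock(&head_lock);
		/* Someone changed the head while we were unlocked; retry. */
	}
}

int
main(void)
{
	struct mailbox a = { NULL, 1 }, b = { NULL, 2 };
	struct mailbox *m;

	publish_mailbox(&a);
	publish_mailbox(&b);
	for (m = completed_head; m != NULL; m = m->next)
		printf("mailbox %d\n", m->id);
	return (0);
}

The cheap unlocked read covers the common case; correctness comes from re-checking the
head under the lock before replacing it, which is exactly the shape of the loops in
thread_export_context() and thread_link_mboxes().
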
Index: subr_rman.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_rman.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/subr_rman.c -L sys/kern/subr_rman.c -u -r1.1.1.2 -r1.2
--- sys/kern/subr_rman.c
+++ sys/kern/subr_rman.c
@@ -55,13 +55,15 @@
* permitted.
*/
+#include "opt_ddb.h"
+
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_rman.c,v 1.43.2.1 2006/01/20 07:38:01 yongari Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_rman.c,v 1.57 2007/04/28 07:37:49 jmg Exp $");
-#define __RMAN_RESOURCE_VISIBLE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
@@ -70,6 +72,33 @@
#include <sys/rman.h>
#include <sys/sysctl.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * We use a linked list rather than a bitmap because we need to be able to
+ * represent potentially huge objects (like all of a processor's physical
+ * address space). That is also why the indices are defined to have type
+ * `unsigned long' -- that being the largest integral type in ISO C (1990).
+ * The 1999 version of C allows `long long'; we may need to switch to that
+ * at some point in the future, particularly if we want to support 36-bit
+ * addresses on IA32 hardware.
+ */
+struct resource_i {
+ struct resource r_r;
+ TAILQ_ENTRY(resource_i) r_link;
+ LIST_ENTRY(resource_i) r_sharelink;
+ LIST_HEAD(, resource_i) *r_sharehead;
+ u_long r_start; /* index of the first entry in this resource */
+ u_long r_end; /* index of the last entry (inclusive) */
+ u_int r_flags;
+ void *r_virtual; /* virtual address of this resource */
+ struct device *r_dev; /* device which has allocated this resource */
+ struct rman *r_rm; /* resource manager from whence this came */
+ int r_rid; /* optional rid for this resource. */
+};
+
int rman_debug = 0;
TUNABLE_INT("debug.rman_debug", &rman_debug);
SYSCTL_INT(_debug, OID_AUTO, rman_debug, CTLFLAG_RW,
@@ -81,10 +110,22 @@
struct rman_head rman_head;
static struct mtx rman_mtx; /* mutex to protect rman_head */
-static int int_rman_activate_resource(struct rman *rm, struct resource *r,
- struct resource **whohas);
-static int int_rman_deactivate_resource(struct resource *r);
-static int int_rman_release_resource(struct rman *rm, struct resource *r);
+static int int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+ struct resource_i **whohas);
+static int int_rman_deactivate_resource(struct resource_i *r);
+static int int_rman_release_resource(struct rman *rm, struct resource_i *r);
+
+static __inline struct resource_i *
+int_alloc_resource(int malloc_flag)
+{
+ struct resource_i *r;
+
+ r = malloc(sizeof *r, M_RMAN, malloc_flag | M_ZERO);
+ if (r != NULL) {
+ r->r_r.__r_i = r;
+ }
+ return (r);
+}
int
rman_init(struct rman *rm)
@@ -114,18 +155,14 @@
return 0;
}
-/*
- * NB: this interface is not robust against programming errors which
- * add multiple copies of the same region.
- */
int
rman_manage_region(struct rman *rm, u_long start, u_long end)
{
- struct resource *r, *s;
+ struct resource_i *r, *s, *t;
DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
rm->rm_descr, start, end));
- r = malloc(sizeof *r, M_RMAN, M_NOWAIT | M_ZERO);
+ r = int_alloc_resource(M_NOWAIT);
if (r == NULL)
return ENOMEM;
r->r_start = start;
@@ -133,15 +170,56 @@
r->r_rm = rm;
mtx_lock(rm->rm_mtx);
- for (s = TAILQ_FIRST(&rm->rm_list);
- s && s->r_end < r->r_start;
- s = TAILQ_NEXT(s, r_link))
- ;
+ /* Skip entries before us. */
+ TAILQ_FOREACH(s, &rm->rm_list, r_link) {
+ if (s->r_end == ULONG_MAX)
+ break;
+ if (s->r_end + 1 >= r->r_start)
+ break;
+ }
+
+ /* If we ran off the end of the list, insert at the tail. */
if (s == NULL) {
TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
} else {
- TAILQ_INSERT_BEFORE(s, r, r_link);
+ /* Check for any overlap with the current region. */
+ if (r->r_start <= s->r_end && r->r_end >= s->r_start)
+ return EBUSY;
+
+ /* Check for any overlap with the next region. */
+ t = TAILQ_NEXT(s, r_link);
+ if (t && r->r_start <= t->r_end && r->r_end >= t->r_start)
+ return EBUSY;
+
+ /*
+ * See if this region can be merged with the next region. If
+ * not, clear the pointer.
+ */
+ if (t && (r->r_end + 1 != t->r_start || t->r_flags != 0))
+ t = NULL;
+
+ /* See if we can merge with the current region. */
+ if (s->r_end + 1 == r->r_start && s->r_flags == 0) {
+ /* Can we merge all 3 regions? */
+ if (t != NULL) {
+ s->r_end = t->r_end;
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(r, M_RMAN);
+ free(t, M_RMAN);
+ } else {
+ s->r_end = r->r_end;
+ free(r, M_RMAN);
+ }
+ } else if (t != NULL) {
+ /* Can we merge with just the next region? */
+ t->r_start = r->r_start;
+ free(r, M_RMAN);
+ } else if (s->r_end < r->r_start) {
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, r, r_link);
+ } else {
+ TAILQ_INSERT_BEFORE(s, r, r_link);
+ }
}
mtx_unlock(rm->rm_mtx);
@@ -149,9 +227,19 @@
}
int
+rman_init_from_resource(struct rman *rm, struct resource *r)
+{
+ int rv;
+
+ if ((rv = rman_init(rm)) != 0)
+ return (rv);
+ return (rman_manage_region(rm, r->__r_i->r_start, r->__r_i->r_end));
+}
+
+int
rman_fini(struct rman *rm)
{
- struct resource *r;
+ struct resource_i *r;
mtx_lock(rm->rm_mtx);
TAILQ_FOREACH(r, &rm->rm_list, r_link) {
@@ -186,14 +274,15 @@
struct device *dev)
{
u_int want_activate;
- struct resource *r, *s, *rv;
+ struct resource_i *r, *s, *rv;
u_long rstart, rend, amask, bmask;
rv = NULL;
- DPRINTF(("rman_reserve_resource: <%s> request: [%#lx, %#lx], length "
- "%#lx, flags %u, device %s\n", rm->rm_descr, start, end, count,
- flags, dev == NULL ? "<null>" : device_get_nameunit(dev)));
+ DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#lx, %#lx], "
+ "length %#lx, flags %u, device %s\n", rm->rm_descr, start, end,
+ count, flags,
+ dev == NULL ? "<null>" : device_get_nameunit(dev)));
want_activate = (flags & RF_ACTIVE);
flags &= ~RF_ACTIVE;
@@ -267,7 +356,7 @@
* split it in two. The first case requires
* two new allocations; the second requires but one.
*/
- rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
+ rv = int_alloc_resource(M_NOWAIT);
if (rv == NULL)
goto out;
rv->r_start = rstart;
@@ -285,7 +374,7 @@
/*
* We are allocating in the middle.
*/
- r = malloc(sizeof *r, M_RMAN, M_NOWAIT|M_ZERO);
+ r = int_alloc_resource(M_NOWAIT);
if (r == NULL) {
free(rv, M_RMAN);
rv = NULL;
@@ -343,7 +432,7 @@
&& (s->r_end - s->r_start + 1) == count &&
(s->r_start & amask) == 0 &&
((s->r_start ^ s->r_end) & bmask) == 0) {
- rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
+ rv = int_alloc_resource(M_NOWAIT);
if (rv == NULL)
goto out;
rv->r_start = s->r_start;
@@ -383,7 +472,7 @@
* make sense for RF_TIMESHARE-type resources.)
*/
if (rv && want_activate) {
- struct resource *whohas;
+ struct resource_i *whohas;
if (int_rman_activate_resource(rm, rv, &whohas)) {
int_rman_release_resource(rm, rv);
rv = NULL;
@@ -391,7 +480,7 @@
}
mtx_unlock(rm->rm_mtx);
- return (rv);
+ return (rv == NULL ? NULL : &rv->r_r);
}
struct resource *
@@ -404,10 +493,10 @@
}
static int
-int_rman_activate_resource(struct rman *rm, struct resource *r,
- struct resource **whohas)
+int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+ struct resource_i **whohas)
{
- struct resource *s;
+ struct resource_i *s;
int ok;
/*
@@ -439,12 +528,13 @@
}
int
-rman_activate_resource(struct resource *r)
+rman_activate_resource(struct resource *re)
{
int rv;
- struct resource *whohas;
+ struct resource_i *r, *whohas;
struct rman *rm;
+ r = re->__r_i;
rm = r->r_rm;
mtx_lock(rm->rm_mtx);
rv = int_rman_activate_resource(rm, r, &whohas);
@@ -453,12 +543,13 @@
}
int
-rman_await_resource(struct resource *r, int pri, int timo)
+rman_await_resource(struct resource *re, int pri, int timo)
{
int rv;
- struct resource *whohas;
+ struct resource_i *r, *whohas;
struct rman *rm;
+ r = re->__r_i;
rm = r->r_rm;
mtx_lock(rm->rm_mtx);
for (;;) {
@@ -478,7 +569,7 @@
}
static int
-int_rman_deactivate_resource(struct resource *r)
+int_rman_deactivate_resource(struct resource_i *r)
{
r->r_flags &= ~RF_ACTIVE;
@@ -494,17 +585,17 @@
{
struct rman *rm;
- rm = r->r_rm;
+ rm = r->__r_i->r_rm;
mtx_lock(rm->rm_mtx);
- int_rman_deactivate_resource(r);
+ int_rman_deactivate_resource(r->__r_i);
mtx_unlock(rm->rm_mtx);
return 0;
}
static int
-int_rman_release_resource(struct rman *rm, struct resource *r)
+int_rman_release_resource(struct rman *rm, struct resource_i *r)
{
- struct resource *s, *t;
+ struct resource_i *s, *t;
if (r->r_flags & RF_ACTIVE)
int_rman_deactivate_resource(r);
@@ -595,11 +686,14 @@
}
int
-rman_release_resource(struct resource *r)
+rman_release_resource(struct resource *re)
{
int rv;
- struct rman *rm = r->r_rm;
+ struct resource_i *r;
+ struct rman *rm;
+ r = re->__r_i;
+ rm = r->r_rm;
mtx_lock(rm->rm_mtx);
rv = int_rman_release_resource(rm, r);
mtx_unlock(rm->rm_mtx);
@@ -627,37 +721,37 @@
u_long
rman_get_start(struct resource *r)
{
- return (r->r_start);
+ return (r->__r_i->r_start);
}
u_long
rman_get_end(struct resource *r)
{
- return (r->r_end);
+ return (r->__r_i->r_end);
}
u_long
rman_get_size(struct resource *r)
{
- return (r->r_end - r->r_start + 1);
+ return (r->__r_i->r_end - r->__r_i->r_start + 1);
}
u_int
rman_get_flags(struct resource *r)
{
- return (r->r_flags);
+ return (r->__r_i->r_flags);
}
void
rman_set_virtual(struct resource *r, void *v)
{
- r->r_virtual = v;
+ r->__r_i->r_virtual = v;
}
void *
rman_get_virtual(struct resource *r)
{
- return (r->r_virtual);
+ return (r->__r_i->r_virtual);
}
void
@@ -687,37 +781,44 @@
void
rman_set_rid(struct resource *r, int rid)
{
- r->r_rid = rid;
+ r->__r_i->r_rid = rid;
}
void
rman_set_start(struct resource *r, u_long start)
{
- r->r_start = start;
+ r->__r_i->r_start = start;
}
void
rman_set_end(struct resource *r, u_long end)
{
- r->r_end = end;
+ r->__r_i->r_end = end;
}
int
rman_get_rid(struct resource *r)
{
- return (r->r_rid);
+ return (r->__r_i->r_rid);
}
struct device *
rman_get_device(struct resource *r)
{
- return (r->r_dev);
+ return (r->__r_i->r_dev);
}
void
rman_set_device(struct resource *r, struct device *dev)
{
- r->r_dev = dev;
+ r->__r_i->r_dev = dev;
+}
+
+int
+rman_is_region_manager(struct resource *r, struct rman *rm)
+{
+
+ return (r->__r_i->r_rm == rm);
}
/*
@@ -733,7 +834,7 @@
u_int namelen = arg2;
int rman_idx, res_idx;
struct rman *rm;
- struct resource *res;
+ struct resource_i *res;
struct u_rman urm;
struct u_resource ures;
int error;
@@ -777,7 +878,7 @@
/*
* Find the indexed resource and return it.
*/
- mtx_lock(&rman_mtx);
+ mtx_lock(rm->rm_mtx);
TAILQ_FOREACH(res, &rm->rm_list, r_link) {
if (res_idx-- == 0) {
bzero(&ures, sizeof(ures));
@@ -801,14 +902,58 @@
ures.r_size = res->r_end - res->r_start + 1;
ures.r_flags = res->r_flags;
- mtx_unlock(&rman_mtx);
+ mtx_unlock(rm->rm_mtx);
error = SYSCTL_OUT(req, &ures, sizeof(ures));
return (error);
}
}
- mtx_unlock(&rman_mtx);
+ mtx_unlock(rm->rm_mtx);
return (ENOENT);
}
SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
"kernel resource manager");
+
+#ifdef DDB
+static void
+dump_rman(struct rman *rm)
+{
+ struct resource_i *r;
+ const char *devname;
+
+ if (db_pager_quit)
+ return;
+ db_printf("rman: %s\n", rm->rm_descr);
+ db_printf(" 0x%lx-0x%lx (full range)\n", rm->rm_start, rm->rm_end);
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (r->r_dev != NULL) {
+ devname = device_get_nameunit(r->r_dev);
+ if (devname == NULL)
+ devname = "nomatch";
+ } else
+ devname = NULL;
+ db_printf(" 0x%lx-0x%lx ", r->r_start, r->r_end);
+ if (devname != NULL)
+ db_printf("(%s)\n", devname);
+ else
+ db_printf("----\n");
+ if (db_pager_quit)
+ return;
+ }
+}
+
+DB_SHOW_COMMAND(rman, db_show_rman)
+{
+
+ if (have_addr)
+ dump_rman((struct rman *)addr);
+}
+
+DB_SHOW_COMMAND(allrman, db_show_all_rman)
+{
+ struct rman *rm;
+
+ TAILQ_FOREACH(rm, &rman_head, rm_link)
+ dump_rman(rm);
+}
+#endif
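
The subr_rman.c rework splits the old struct resource into a small public handle and a
private struct resource_i, tied together by the back pointer that int_alloc_resource()
sets (r->r_r.__r_i = r), so accessors like rman_get_start() now go through r->__r_i.
Here is a standalone sketch of that handle/container split with simplified stand-in
names, not the real rman types:

#include <stdio.h>
#include <stdlib.h>

struct priv;				/* private record, defined below */

struct handle {				/* public part handed to consumers */
	struct priv *priv;		/* plays the role of resource.__r_i */
};

struct priv {
	struct handle h;		/* embedded public handle */
	unsigned long start, end;	/* private bookkeeping */
};

static struct priv *
priv_alloc(unsigned long start, unsigned long end)
{
	struct priv *p = calloc(1, sizeof(*p));

	if (p != NULL) {
		p->h.priv = p;		/* mirror r->r_r.__r_i = r */
		p->start = start;
		p->end = end;
	}
	return (p);
}

/* Accessors only ever see the handle, like rman_get_start()/rman_get_end(). */
static unsigned long
handle_get_start(struct handle *h)
{
	return (h->priv->start);
}

int
main(void)
{
	struct priv *p = priv_alloc(0x1000, 0x1fff);

	if (p == NULL)
		return (1);
	printf("start %#lx size %lu\n", handle_get_start(&p->h),
	    p->end - p->start + 1);
	free(p);
	return (0);
}

Keeping the public structure tiny lets the private layout change later without touching
consumers that only ever hold the handle, which is the point of hiding r_start, r_end
and friends behind __r_i.
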
Index: subr_firmware.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_firmware.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_firmware.c -L sys/kern/subr_firmware.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_firmware.c
+++ sys/kern/subr_firmware.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_firmware.c,v 1.1.2.1 2006/02/23 02:13:31 mlaier Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_firmware.c,v 1.9 2007/02/15 17:21:31 luigi Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -38,44 +38,148 @@
#include <sys/errno.h>
#include <sys/linker.h>
#include <sys/firmware.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/module.h>
+/*
+ * Loadable firmware support. See sys/sys/firmware.h and firmware(9)
+ * for more details on the subsystem.
+ *
+ * 'struct firmware' is the user-visible part of the firmware table.
+ * Additional internal information is stored in a 'struct priv_fw'
+ * (currently a static array). A slot is in use if FW_INUSE is true:
+ */
+
+#define FW_INUSE(p) ((p)->file != NULL || (p)->fw.name != NULL)
+
+/*
+ * fw.name != NULL when an image is registered; file != NULL for
+ * autoloaded images whose handling has not been completed.
+ *
+ * The state of a slot evolves as follows:
+ * firmware_register --> fw.name = image_name
+ * (autoloaded image) --> file = module reference
+ * firmware_unregister --> fw.name = NULL
+ * (unloadentry complete) --> file = NULL
+ *
+ * In order for the above to work, the 'file' field must remain
+ * unchanged in firmware_unregister().
+ *
+ * Images residing in the same module are linked to each other
+ * through the 'parent' argument of firmware_register().
+ * One image (typically, one with the same name as the module to let
+ * the autoloading mechanism work) is considered the parent image for
+ * all other images in the same module. Children affect the refcount
+ * on the parent image preventing improper unloading of the image itself.
+ */
+
+struct priv_fw {
+ int refcnt; /* reference count */
+
+ /*
+ * parent entry, see above. Set on firmware_register(),
+ * cleared on firmware_unregister().
+ */
+ struct priv_fw *parent;
+
+ int flags; /* record FIRMWARE_UNLOAD requests */
+#define FW_UNLOAD 0x100
+
+ /*
+ * 'file' is private info managed by the autoload/unload code.
+ * Set at the end of firmware_get(), cleared only in the
+ * firmware_task, so the latter can depend on its value even
+ * while the lock is not held.
+ */
+ linker_file_t file; /* module file, if autoloaded */
+
+ /*
+ * 'fw' is the externally visible image information.
+ * We do not make it the first field in priv_fw, to avoid the
+ * temptation of casting pointers to each other.
+ * Use PRIV_FW(fw) to get a pointer to the container of fw.
+ * Beware, PRIV_FW does not work for a NULL pointer.
+ */
+ struct firmware fw; /* externally visible information */
+};
+
+/*
+ * PRIV_FW returns the pointer to the container of struct firmware *x.
+ * Cast to intptr_t to override the 'const' attribute of x
+ */
+#define PRIV_FW(x) ((struct priv_fw *) \
+ ((intptr_t)(x) - offsetof(struct priv_fw, fw)) )
+
+/*
+ * At the moment we use a static array as backing store for the registry.
+ * Should we move to a dynamic structure, keep in mind that we cannot
+ * reallocate the array because pointers are held externally.
+ * A list may work, though.
+ */
#define FIRMWARE_MAX 30
-static char *name_unload = "UNLOADING";
-static struct firmware firmware_table[FIRMWARE_MAX];
+static struct priv_fw firmware_table[FIRMWARE_MAX];
+
+/*
+ * Module releases are handled in a separate task as they might sleep.
+ */
struct task firmware_task;
+
+/*
+ * This mutex protects accesses to the firmware table.
+ */
struct mtx firmware_mtx;
MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF);
/*
+ * Helper function to lookup a name.
+ * As a side effect, it sets the pointer to a free slot, if any.
+ * This way we can concentrate most of the registry scanning in
+ * this function, which makes it easier to replace the registry
+ * with some other data structure.
+ */
+static struct priv_fw *
+lookup(const char *name, struct priv_fw **empty_slot)
+{
+ struct priv_fw *fp = NULL;
+ struct priv_fw *dummy;
+ int i;
+
+ if (empty_slot == NULL)
+ empty_slot = &dummy;
+ *empty_slot = NULL;
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0)
+ break;
+ else if (!FW_INUSE(fp))
+ *empty_slot = fp;
+ }
+ return (i < FIRMWARE_MAX ) ? fp : NULL;
+}
+
+/*
* Register a firmware image with the specified name. The
* image name must not already be registered. If this is a
* subimage then parent refers to a previously registered
* image that this should be associated with.
*/
-struct firmware *
+const struct firmware *
firmware_register(const char *imagename, const void *data, size_t datasize,
- unsigned int version, struct firmware *parent)
+ unsigned int version, const struct firmware *parent)
{
- struct firmware *frp = NULL;
- int i;
+ struct priv_fw *match, *frp;
mtx_lock(&firmware_mtx);
- for (i = 0; i < FIRMWARE_MAX; i++) {
- struct firmware *fp = &firmware_table[i];
-
- if (fp->name == NULL) {
- if (frp == NULL)
- frp = fp;
- continue;
- }
- if (strcasecmp(imagename, fp->name) == 0) {
- mtx_unlock(&firmware_mtx);
- printf("%s: image %s already registered!\n",
- __func__, imagename);
- return NULL;
- }
+ /*
+ * Do a lookup to make sure the name is unique or find a free slot.
+ */
+ match = lookup(imagename, &frp);
+ if (match != NULL) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: image %s already registered!\n",
+ __func__, imagename);
+ return NULL;
}
if (frp == NULL) {
mtx_unlock(&firmware_mtx);
@@ -83,49 +187,20 @@
__func__, imagename);
return NULL;
}
- frp->name = imagename;
- frp->data = data;
- frp->datasize = datasize;
- frp->version = version;
- frp->refcnt = 0;
- if (parent != NULL)
- parent->refcnt++;
- frp->parent = parent;
- frp->file = NULL;
- mtx_unlock(&firmware_mtx);
- return frp;
-}
-
-static void
-clearentry(struct firmware *fp, int keep_file)
-{
- KASSERT(fp->refcnt == 0, ("image %s refcnt %u", fp->name, fp->refcnt));
- if (keep_file && (fp->file != NULL))
- fp->name = name_unload;
- else {
- fp->name = NULL;
- fp->file = NULL;
- }
- fp->data = NULL;
- fp->datasize = 0;
- fp->version = 0;
- if (fp->parent != NULL) { /* release parent reference */
- fp->parent->refcnt--;
- fp->parent = NULL;
+ bzero(frp, sizeof(frp)); /* start from a clean record */
+ frp->fw.name = imagename;
+ frp->fw.data = data;
+ frp->fw.datasize = datasize;
+ frp->fw.version = version;
+ if (parent != NULL) {
+ frp->parent = PRIV_FW(parent);
+ frp->parent->refcnt++;
}
-}
-
-static struct firmware *
-lookup(const char *name)
-{
- int i;
-
- for (i = 0; i < FIRMWARE_MAX; i++) {
- struct firmware * fp = &firmware_table[i];
- if (fp->name != NULL && strcasecmp(name, fp->name) == 0)
- return fp;
- }
- return NULL;
+ mtx_unlock(&firmware_mtx);
+ if (bootverbose)
+ printf("firmware: '%s' version %u: %zu bytes loaded at %p\n",
+ imagename, version, datasize, data);
+ return &frp->fw;
}
/*
@@ -136,111 +211,168 @@
int
firmware_unregister(const char *imagename)
{
- struct firmware *fp;
- int refcnt = 0;
+ struct priv_fw *fp;
+ int err;
mtx_lock(&firmware_mtx);
- /*
- * NB: it is ok for the lookup to fail; this can happen
- * when a module is unloaded on last reference and the
- * module unload handler unregister's each of it's
- * firmware images.
- */
- fp = lookup(imagename);
- if (fp != NULL) {
- refcnt = fp->refcnt;
- if (refcnt == 0)
- clearentry(fp, 0);
+ fp = lookup(imagename, NULL);
+ if (fp == NULL) {
+ /*
+ * It is ok for the lookup to fail; this can happen
+ * when a module is unloaded on last reference and the
+ * module unload handler unregisters each of its
+ * firmware images.
+ */
+ err = 0;
+ } else if (fp->refcnt != 0) { /* cannot unregister */
+ err = EBUSY;
+ } else {
+ linker_file_t x = fp->file; /* save value */
+
+ if (fp->parent != NULL) /* release parent reference */
+ fp->parent->refcnt--;
+ /*
+ * Clear the whole entry with bzero to make sure we
+ * do not forget anything. Then restore 'file' which is
+ * non-null for autoloaded images.
+ */
+ bzero(fp, sizeof(struct priv_fw));
+ fp->file = x;
+ err = 0;
}
mtx_unlock(&firmware_mtx);
- return (refcnt != 0 ? EBUSY : 0);
+ return err;
}
/*
* Lookup and potentially load the specified firmware image.
- * If the firmware is not found in the registry attempt to
- * load a kernel module with the image name. If the firmware
- * is located a reference is returned. The caller must release
- * this reference for the image to be eligible for removal/unload.
+ * If the firmware is not found in the registry, try to load a kernel
+ * module named as the image name.
+ * If the firmware is located, a reference is returned. The caller must
+ * release this reference for the image to be eligible for removal/unload.
*/
-struct firmware *
+const struct firmware *
firmware_get(const char *imagename)
{
struct thread *td;
- struct firmware *fp;
+ struct priv_fw *fp;
linker_file_t result;
- int requested_load = 0;
-again:
mtx_lock(&firmware_mtx);
- fp = lookup(imagename);
- if (fp != NULL) {
- if (requested_load)
- fp->file = result;
- fp->refcnt++;
- mtx_unlock(&firmware_mtx);
- return fp;
- }
+ fp = lookup(imagename, NULL);
+ if (fp != NULL)
+ goto found;
/*
- * Image not present, try to load the module holding it
- * or if we already tried give up.
+ * Image not present, try to load the module holding it.
*/
mtx_unlock(&firmware_mtx);
- if (requested_load) {
- printf("%s: failed to load firmware image %s\n",
- __func__, imagename);
- return NULL;
- }
td = curthread;
- if (suser(td) != 0 || securelevel_gt(td->td_ucred, 0) != 0) {
+ if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 ||
+ securelevel_gt(td->td_ucred, 0) != 0) {
printf("%s: insufficient privileges to "
"load firmware image %s\n", __func__, imagename);
return NULL;
}
- mtx_lock(&Giant); /* XXX */
(void) linker_reference_module(imagename, NULL, &result);
- mtx_unlock(&Giant); /* XXX */
- requested_load = 1;
- goto again; /* sort of an Algol-style for loop */
+ /*
+ * After loading the module, see if the image is registered now.
+ */
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp == NULL) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: failed to load firmware image %s\n",
+ __func__, imagename);
+ (void) linker_release_module(imagename, NULL, NULL);
+ return NULL;
+ }
+ fp->file = result; /* record the module identity */
+
+found: /* common exit point on success */
+ fp->refcnt++;
+ mtx_unlock(&firmware_mtx);
+ return &fp->fw;
}
-static void
-unloadentry(void *unused1, int unused2)
+/*
+ * Release a reference to a firmware image returned by firmware_get.
+ * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire
+ * to release the resource, but the flag is only advisory.
+ *
+ * If this is the last reference to the firmware image, and this is an
+ * autoloaded module, wake up the firmware_task to figure out what to do
+ * with the associated module.
+ */
+void
+firmware_put(const struct firmware *p, int flags)
{
- struct firmware *fp;
+ struct priv_fw *fp = PRIV_FW(p);
mtx_lock(&firmware_mtx);
- while ((fp = lookup(name_unload))) {
- /*
- * XXX: ugly, we should be able to lookup unlocked here if
- * we properly lock around clearentry below to avoid double
- * unload. Play it safe for now.
- */
- mtx_unlock(&firmware_mtx);
-
- linker_file_unload(fp->file, LINKER_UNLOAD_NORMAL);
-
- mtx_lock(&firmware_mtx);
- clearentry(fp, 0);
+ fp->refcnt--;
+ if (fp->refcnt == 0) {
+ if (flags & FIRMWARE_UNLOAD)
+ fp->flags |= FW_UNLOAD;
+ if (fp->file)
+ taskqueue_enqueue(taskqueue_thread, &firmware_task);
}
mtx_unlock(&firmware_mtx);
}
/*
- * Release a reference to a firmware image returned by
- * firmware_get. The reference is released and if this is
- * the last reference to the firmware image the associated
- * module may be released/unloaded.
+ * The body of the task in charge of unloading autoloaded modules
+ * that are not needed anymore.
+ * Images can be cross-linked so we may need to make multiple passes,
+ * but the time we spend in the loop is bounded because we clear entries
+ * as we touch them.
*/
-void
-firmware_put(struct firmware *fp, int flags)
+static void
+unloadentry(void *unused1, int unused2)
{
+ int limit = FIRMWARE_MAX;
+ int i; /* current cycle */
+
mtx_lock(&firmware_mtx);
- fp->refcnt--;
- if (fp->refcnt == 0 && (flags & FIRMWARE_UNLOAD))
- clearentry(fp, 1);
- if (fp->file)
- taskqueue_enqueue(taskqueue_thread, &firmware_task);
+ /*
+ * Scan the table. limit is set to make sure we make another
+ * full sweep after matching an entry that requires unloading.
+ */
+ for (i = 0; i < limit; i++) {
+ struct priv_fw *fp;
+ int err;
+
+ fp = &firmware_table[i % FIRMWARE_MAX];
+ if (fp->fw.name == NULL || fp->file == NULL ||
+ fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0)
+ continue;
+
+ /*
+ * Found an entry. Now:
+ * 1. bump up limit to make sure we make another full round;
+ * 2. clear FW_UNLOAD so we don't try this entry again.
+ * 3. release the lock while trying to unload the module.
+ * 'file' remains set so that the entry cannot be reused
+ * in the meantime (it also means that fp->file will
+ * not change while we release the lock).
+ */
+ limit = i + FIRMWARE_MAX; /* make another full round */
+ fp->flags &= ~FW_UNLOAD; /* do not try again */
+
+ mtx_unlock(&firmware_mtx);
+ err = linker_release_module(NULL, NULL, fp->file);
+ mtx_lock(&firmware_mtx);
+
+ /*
+ * We rely on the module to call firmware_unregister()
+ * on unload to actually release the entry.
+ * If err = 0 we can drop our reference as the system
+ * accepted it. Otherwise unloading failed (e.g. the
+ * module itself gave an error) so our reference is
+ * still valid.
+ */
+ if (err == 0)
+ fp->file = NULL;
+ }
mtx_unlock(&firmware_mtx);
}
@@ -250,13 +382,34 @@
static int
firmware_modevent(module_t mod, int type, void *unused)
{
+ struct priv_fw *fp;
+ int i, err = EINVAL;
+
switch (type) {
case MOD_LOAD:
TASK_INIT(&firmware_task, 0, unloadentry, NULL);
return 0;
+
case MOD_UNLOAD:
+ /* request all autoloaded modules to be released */
+ mtx_lock(&firmware_mtx);
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ fp->flags |= FW_UNLOAD;
+ }
+ mtx_unlock(&firmware_mtx);
+ taskqueue_enqueue(taskqueue_thread, &firmware_task);
taskqueue_drain(taskqueue_thread, &firmware_task);
- return 0;
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ if (fp->fw.name != NULL) {
+ printf("%s: image %p ref %d still active slot %d\n",
+ __func__, fp->fw.name,
+ fp->refcnt, i);
+ err = EINVAL;
+ }
+ }
+ return err;
}
return EINVAL;
}
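
The new PRIV_FW() macro above recovers the enclosing struct priv_fw from a pointer to
its embedded, externally visible struct firmware, using offsetof() plus an intptr_t
cast that also drops the const qualifier. A small userland sketch of the same
container-of idea follows; the structure and macro names are hypothetical, not the
firmware(9) API:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct fw_pub {				/* externally visible part */
	const char *name;
	size_t datasize;
};

struct priv_rec {			/* private container */
	int refcnt;
	struct fw_pub fw;		/* deliberately not the first field */
};

/* Walk back from a (possibly const) pointer to the embedded member. */
#define	PRIV_REC(x)	((struct priv_rec *)			\
	((intptr_t)(x) - offsetof(struct priv_rec, fw)))

int
main(void)
{
	struct priv_rec rec = { 1, { "sample_fw", 4096 } };
	const struct fw_pub *handle = &rec.fw;	/* what callers would hold */
	struct priv_rec *back = PRIV_REC(handle);

	printf("name=%s refcnt=%d\n", back->fw.name, back->refcnt);
	return (back == &rec ? 0 : 1);
}

As the comment in the diff warns, the arithmetic only works on a pointer that really
does point at the embedded member; handing it NULL or an unrelated pointer produces
garbage.
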
Index: kern_xxx.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_xxx.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_xxx.c -L sys/kern/kern_xxx.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_xxx.c
+++ sys/kern/kern_xxx.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_xxx.c,v 1.46 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_xxx.c,v 1.49 2007/03/05 13:10:57 rwatson Exp $");
#include "opt_compat.h"
@@ -38,6 +38,7 @@
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -53,9 +54,6 @@
u_int len;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
ogethostname(td, uap)
@@ -81,9 +79,6 @@
u_int len;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
osethostname(td, uap)
@@ -107,9 +102,6 @@
int dummy;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
ogethostid(td, uap)
@@ -128,9 +120,6 @@
long hostid;
};
#endif
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
osethostid(td, uap)
@@ -139,7 +128,8 @@
{
int error;
- if ((error = suser(td)))
+ error = priv_check(td, PRIV_SETHOSTID);
+ if (error)
return (error);
mtx_lock(&Giant);
hostid = uap->hostid;
@@ -147,22 +137,20 @@
return (0);
}
-/*
- * MPSAFE
- */
int
oquota(td, uap)
struct thread *td;
struct oquota_args *uap;
{
+
return (ENOSYS);
}
#endif /* COMPAT_43 */
/*
- * This is the FreeBSD-1.1 compatable uname(2) interface. These
- * days it is done in libc as a wrapper around a bunch of sysctl's.
- * This must maintain the old 1.1 binary ABI.
+ * This is the FreeBSD-1.1 compatable uname(2) interface. These days it is
+ * done in libc as a wrapper around a bunch of sysctl's. This must maintain
+ * the old 1.1 binary ABI.
*/
#if SYS_NMLN != 32
#error "FreeBSD-1.1 uname syscall has been broken"
@@ -172,10 +160,6 @@
struct utsname *name;
};
#endif
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
uname(td, uap)
@@ -255,10 +239,6 @@
int len;
};
#endif
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
getdomainname(td, uap)
@@ -283,10 +263,6 @@
int len;
};
#endif
-
-/*
- * MPSAFE
- */
/* ARGSUSED */
int
setdomainname(td, uap)
@@ -295,9 +271,10 @@
{
int error, domainnamelen;
+ error = priv_check(td, PRIV_SETDOMAINNAME);
+ if (error)
+ return (error);
mtx_lock(&Giant);
- if ((error = suser(td)))
- goto done2;
if ((u_int)uap->len > sizeof (domainname) - 1) {
error = EINVAL;
goto done2;
@@ -309,4 +286,3 @@
mtx_unlock(&Giant);
return (error);
}
-
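
Several kern_xxx.c changes above swap suser() for priv_check() and, in osethostid()
and setdomainname(), perform the privilege check before taking Giant so the failure
path never acquires the lock. A rough userland sketch of that check-then-lock
ordering, assuming a placeholder priv_ok() policy and a pthread mutex in place of
Giant:

#include <errno.h>
#include <pthread.h>
#include <string.h>

static pthread_mutex_t giant = PTHREAD_MUTEX_INITIALIZER;
static char domainname[64];

static int
priv_ok(void)
{
	return (1);			/* placeholder policy decision */
}

static int
set_domainname(const char *name, size_t len)
{
	if (!priv_ok())
		return (EPERM);		/* reject before locking anything */
	if (len > sizeof(domainname) - 1)
		return (EINVAL);
	pthread_mutex_lock(&giant);
	memcpy(domainname, name, len);
	domainname[len] = '\0';
	pthread_mutex_unlock(&giant);
	return (0);
}

int
main(void)
{
	return (set_domainname("example", 7));
}

Checking first also removes the unwinding through the done2 label that the old code
needed when the suser() call failed while Giant was already held.
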
Index: tty.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/tty.c -L sys/kern/tty.c -u -r1.3 -r1.4
--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -71,7 +71,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty.c,v 1.250.2.1 2005/11/06 16:09:32 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty.c,v 1.273.4.1 2008/01/12 00:20:06 jhb Exp $");
#include "opt_compat.h"
#include "opt_tty.h"
@@ -83,11 +83,10 @@
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/sx.h>
-#ifndef BURN_BRIDGES
-#if defined(COMPAT_43)
+#if defined(COMPAT_43TTY)
#include <sys/ioctl_compat.h>
#endif
-#endif
+#include <sys/priv.h>
#include <sys/proc.h>
#define TTYDEFCHARS
#include <sys/tty.h>
@@ -148,7 +147,9 @@
.d_flags = D_TTY | D_NEEDGIANT,
};
-static int proc_compare(struct proc *p1, struct proc *p2);
+static int proc_sum(struct proc *, int *);
+static int proc_compare(struct proc *, struct proc *);
+static int thread_compare(struct thread *, struct thread *);
static int ttnread(struct tty *tp);
static void ttyecho(int c, struct tty *tp);
static int ttyoutput(int c, struct tty *tp);
@@ -253,6 +254,7 @@
*/
static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
static struct mtx tty_list_mutex;
+MTX_SYSINIT(tty_list, &tty_list_mutex, "ttylist", MTX_DEF);
static struct unrhdr *tty_unit;
@@ -331,7 +333,7 @@
tp->t_hotchar = 0;
tp->t_pgrp = NULL;
tp->t_session = NULL;
- ostate= tp->t_state;
+ ostate = tp->t_state;
tp->t_state = 0;
knlist_clear(&tp->t_rsel.si_note, 0);
knlist_clear(&tp->t_wsel.si_note, 0);
@@ -517,7 +519,7 @@
if (CCEQ(cc[VSTOP], c)) {
if (!ISSET(tp->t_state, TS_TTSTOP)) {
SET(tp->t_state, TS_TTSTOP);
- (*tp->t_stop)(tp, 0);
+ tt_stop(tp, 0);
return (0);
}
if (!CCEQ(cc[VSTART], c))
@@ -834,8 +836,7 @@
case TIOCSTI:
case TIOCSTOP:
case TIOCSWINSZ:
-#ifndef BURN_BRIDGES
-#if defined(COMPAT_43)
+#if defined(COMPAT_43TTY)
case TIOCLBIC:
case TIOCLBIS:
case TIOCLSET:
@@ -845,7 +846,6 @@
case TIOCSETP:
case TIOCSLTC:
#endif
-#endif
sx_slock(&proctree_lock);
PROC_LOCK(p);
while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) &&
@@ -873,45 +873,33 @@
break;
}
- if (tp->t_break != NULL) {
- switch (cmd) {
- case TIOCSBRK:
- tp->t_break(tp, 1);
- return (0);
- case TIOCCBRK:
- tp->t_break(tp, 0);
- return (0);
- default:
- break;
- }
- }
if (tp->t_modem != NULL) {
switch (cmd) {
case TIOCSDTR:
- tp->t_modem(tp, SER_DTR, 0);
+ tt_modem(tp, SER_DTR, 0);
return (0);
case TIOCCDTR:
- tp->t_modem(tp, 0, SER_DTR);
+ tt_modem(tp, 0, SER_DTR);
return (0);
case TIOCMSET:
bits = *(int *)data;
sig = (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1;
sig2 = ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1;
- tp->t_modem(tp, sig, sig2);
+ tt_modem(tp, sig, sig2);
return (0);
case TIOCMBIS:
bits = *(int *)data;
sig = (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1;
- tp->t_modem(tp, sig, 0);
+ tt_modem(tp, sig, 0);
return (0);
case TIOCMBIC:
bits = *(int *)data;
sig = (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1;
- tp->t_modem(tp, 0, sig);
+ tt_modem(tp, 0, sig);
return (0);
case TIOCMGET:
- sig = tp->t_modem(tp, 0, 0);
+ sig = tt_modem(tp, 0, 0);
 /* See <sys/serial.h> for the "<< 1" stuff */
bits = TIOCM_LE + (sig << 1);
*(int *)data = bits;
@@ -1034,7 +1022,7 @@
break;
case TIOCMSDTRWAIT:
/* must be root since the wait applies to following logins */
- error = suser(td);
+ error = priv_check(td, PRIV_TTY_DTRWAIT);
if (error)
return (error);
tp->t_dtr_wait = *(int *)data * hz / 100;
@@ -1072,7 +1060,8 @@
/*
* Set device hardware.
*/
- if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
+ error = tt_param(tp, t);
+ if (error) {
splx(s);
return (error);
}
@@ -1182,9 +1171,9 @@
splx(s);
break;
case TIOCSTI: /* simulate terminal input */
- if ((flag & FREAD) == 0 && suser(td))
+ if ((flag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
return (EPERM);
- if (!isctty(p, tp) && suser(td))
+ if (!isctty(p, tp) && priv_check(td, PRIV_TTY_STI))
return (EACCES);
s = spltty();
ttyld_rint(tp, *(u_char *)data);
@@ -1194,7 +1183,7 @@
s = spltty();
if (!ISSET(tp->t_state, TS_TTSTOP)) {
SET(tp->t_state, TS_TTSTOP);
- (*tp->t_stop)(tp, 0);
+ tt_stop(tp, 0);
}
splx(s);
break;
@@ -1257,7 +1246,7 @@
}
break;
case TIOCSDRAINWAIT:
- error = suser(td);
+ error = priv_check(td, PRIV_TTY_DRAINWAIT);
if (error)
return (error);
tp->t_timeout = *(int *)data * hz;
@@ -1267,16 +1256,16 @@
case TIOCGDRAINWAIT:
*(int *)data = tp->t_timeout / hz;
break;
+ case TIOCSBRK:
+ return (tt_break(tp, 1));
+ case TIOCCBRK:
+ return (tt_break(tp, 0));
default:
-#if defined(COMPAT_43)
-#ifndef BURN_BRIDGES
+#if defined(COMPAT_43TTY)
return (ttcompat(tp, cmd, data, flag));
#else
return (ENOIOCTL);
#endif
-#else
- return (ENOIOCTL);
-#endif
}
return (0);
}
@@ -1330,6 +1319,8 @@
int s;
tp = tty_gettp(dev);
+ if (tp->t_state & TS_GONE)
+ return (ENODEV);
switch (kn->kn_filter) {
case EVFILT_READ:
@@ -1344,7 +1335,7 @@
return (EINVAL);
}
- kn->kn_hook = (caddr_t)dev;
+ kn->kn_hook = (caddr_t)tp;
s = spltty();
knlist_add(klist, kn, 0);
@@ -1356,7 +1347,7 @@
static void
filt_ttyrdetach(struct knote *kn)
{
- struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+ struct tty *tp = (struct tty *)kn->kn_hook;
int s = spltty();
knlist_remove(&tp->t_rsel.si_note, kn, 0);
@@ -1366,10 +1357,10 @@
static int
filt_ttyread(struct knote *kn, long hint)
{
- struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+ struct tty *tp = (struct tty *)kn->kn_hook;
kn->kn_data = ttnread(tp);
- if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ if ((tp->t_state & TS_GONE) || ISSET(tp->t_state, TS_ZOMBIE)) {
kn->kn_flags |= EV_EOF;
return (1);
}
@@ -1379,7 +1370,7 @@
static void
filt_ttywdetach(struct knote *kn)
{
- struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+ struct tty *tp = (struct tty *)kn->kn_hook;
int s = spltty();
knlist_remove(&tp->t_wsel.si_note, kn, 0);
@@ -1389,10 +1380,10 @@
static int
filt_ttywrite(struct knote *kn, long hint)
{
- struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+ struct tty *tp = (struct tty *)kn->kn_hook;
kn->kn_data = tp->t_outq.c_cc;
- if (ISSET(tp->t_state, TS_ZOMBIE))
+ if ((tp->t_state & TS_GONE) || ISSET(tp->t_state, TS_ZOMBIE))
return (1);
return (kn->kn_data <= tp->t_olowat &&
ISSET(tp->t_state, TS_CONNECTED));
@@ -1429,7 +1420,7 @@
s = spltty();
while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) {
- (*tp->t_oproc)(tp);
+ tt_oproc(tp);
if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
ISSET(tp->t_state, TS_CONNECTED)) {
SET(tp->t_state, TS_SO_OCOMPLETE);
@@ -1479,7 +1470,7 @@
FLUSHQ(&tp->t_outq);
CLR(tp->t_state, TS_TTSTOP);
}
- (*tp->t_stop)(tp, rw);
+ tt_stop(tp, rw);
if (rw & FREAD) {
FLUSHQ(&tp->t_canq);
FLUSHQ(&tp->t_rawq);
@@ -1611,8 +1602,7 @@
ttstart(struct tty *tp)
{
- if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */
- (*tp->t_oproc)(tp);
+ tt_oproc(tp);
return (0);
}
@@ -1650,7 +1640,7 @@
} else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) {
SET(tp->t_state, TS_CAR_OFLOW);
SET(tp->t_state, TS_TTSTOP);
- (*tp->t_stop)(tp, 0);
+ tt_stop(tp, 0);
}
} else if (flag == 0) {
/*
@@ -1732,7 +1722,7 @@
int s, first, error = 0;
int has_stime = 0, last_cc = 0;
long slp = 0; /* XXX this should be renamed `timo'. */
- struct timeval stime;
+ struct timeval stime = { 0, 0 };
struct pgrp *pg;
td = curthread;
@@ -2542,12 +2532,13 @@
{
struct timeval utime, stime;
struct proc *p, *pick;
- struct thread *td;
+ struct thread *td, *picktd;
const char *stateprefix, *state;
long rss;
int load, pctcpu;
pid_t pid;
char comm[MAXCOMLEN + 1];
+ struct rusage ru;
if (ttycheckoutq(tp,0) == 0)
return;
@@ -2580,31 +2571,25 @@
/*
* Pick the most interesting process and copy some of its
- * state for printing later. sched_lock must be held for
- * most parts of this. Holding it throughout is simplest
- * and prevents even unimportant inconsistencies in the
- * copy of the state, but may increase interrupt latency
- * too much.
+ * state for printing later. This operation could rely on stale
+ * data as we can't hold the proc slock or thread locks over the
+ * whole list. However, we're guaranteed not to reference an exited
+ * thread or proc since we hold the tty locked.
*/
pick = NULL;
- mtx_lock_spin(&sched_lock);
LIST_FOREACH(p, &tp->t_pgrp->pg_members, p_pglist)
if (proc_compare(pick, p))
pick = p;
- td = FIRST_THREAD_IN_PROC(pick); /* XXXKSE */
-#if 0
- KASSERT(td != NULL, ("ttyinfo: no thread"));
-#else
- if (td == NULL) {
- mtx_unlock_spin(&sched_lock);
- PGRP_UNLOCK(tp->t_pgrp);
- ttyprintf(tp, "foreground process without thread\n");
- tp->t_rocount = 0;
- return;
- }
-#endif
+ PROC_SLOCK(pick);
+ picktd = NULL;
+ td = FIRST_THREAD_IN_PROC(pick);
+ FOREACH_THREAD_IN_PROC(pick, td)
+ if (thread_compare(picktd, td))
+ picktd = td;
+ td = picktd;
stateprefix = "";
+ thread_lock(td);
if (TD_IS_RUNNING(td))
state = "running";
else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td))
@@ -2625,14 +2610,15 @@
else
state = "unknown";
pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT;
+ thread_unlock(td);
if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE)
rss = 0;
else
rss = pgtok(vmspace_resident_count(pick->p_vmspace));
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(pick);
PROC_LOCK(pick);
PGRP_UNLOCK(tp->t_pgrp);
- calcru(pick, &utime, &stime);
+ rufetchcalc(pick, &ru, &utime, &stime);
pid = pick->p_pid;
bcopy(pick->p_comm, comm, sizeof(comm));
PROC_UNLOCK(pick);
@@ -2660,18 +2646,6 @@
* we pick out just "short-term" sleepers (P_SINTR == 0).
* 4) Further ties are broken by picking the highest pid.
*/
-#define ISRUN(p, val) \
-do { \
- struct thread *td; \
- val = 0; \
- FOREACH_THREAD_IN_PROC(p, td) { \
- if (TD_ON_RUNQ(td) || \
- TD_IS_RUNNING(td)) { \
- val = 1; \
- break; \
- } \
- } \
-} while (0)
#define TESTAB(a, b) ((a)<<1 | (b))
#define ONLYA 2
@@ -2679,71 +2653,134 @@
#define BOTH 3
static int
-proc_compare(struct proc *p1, struct proc *p2)
+proc_sum(struct proc *p, int *estcpup)
{
+ struct thread *td;
+ int estcpu;
+ int val;
- int esta, estb;
- struct ksegrp *kg;
- mtx_assert(&sched_lock, MA_OWNED);
- if (p1 == NULL)
+ val = 0;
+ estcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_ON_RUNQ(td) ||
+ TD_IS_RUNNING(td))
+ val = 1;
+ estcpu += sched_pctcpu(td);
+ thread_unlock(td);
+ }
+ *estcpup = estcpu;
+
+ return (val);
+}
+
+static int
+thread_compare(struct thread *td, struct thread *td2)
+{
+ int runa, runb;
+ int slpa, slpb;
+ fixpt_t esta, estb;
+
+ if (td == NULL)
return (1);
- ISRUN(p1, esta);
- ISRUN(p2, estb);
-
+ /*
+ * Fetch running stats, pctcpu usage, and interruptable flag.
+ */
+ thread_lock(td);
+ runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td);
+ slpa = td->td_flags & TDF_SINTR;
+ esta = sched_pctcpu(td);
+ thread_unlock(td);
+ thread_lock(td2);
+ runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2);
+ estb = sched_pctcpu(td2);
+ slpb = td2->td_flags & TDF_SINTR;
+ thread_unlock(td2);
/*
* see if at least one of them is runnable
*/
- switch (TESTAB(esta, estb)) {
+ switch (TESTAB(runa, runb)) {
case ONLYA:
return (0);
case ONLYB:
return (1);
case BOTH:
- /*
- * tie - favor one with highest recent cpu utilization
- */
- esta = estb = 0;
- FOREACH_KSEGRP_IN_PROC(p1,kg) {
- esta += kg->kg_estcpu;
- }
- FOREACH_KSEGRP_IN_PROC(p2,kg) {
- estb += kg->kg_estcpu;
- }
- if (estb > esta)
- return (1);
- if (esta > estb)
- return (0);
- return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+ break;
}
/*
- * weed out zombies
+ * favor one with highest recent cpu utilization
*/
- switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
- case ONLYA:
+ if (estb > esta)
return (1);
- case ONLYB:
+ if (esta > estb)
return (0);
+ /*
+ * favor one sleeping in a non-interruptible sleep
+ */
+ switch (TESTAB(slpa, slpb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
case BOTH:
- return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+ break;
}
-#if 0 /* XXXKSE */
+ return (td < td2);
+}
+
+static int
+proc_compare(struct proc *p1, struct proc *p2)
+{
+
+ int runa, runb;
+ fixpt_t esta, estb;
+
+ if (p1 == NULL)
+ return (1);
+
/*
- * pick the one with the smallest sleep time
+ * Fetch various stats about these processes. After we drop the
+ * lock the information could be stale but the race is unimportant.
+ */
+ PROC_SLOCK(p1);
+ runa = proc_sum(p1, &esta);
+ PROC_SUNLOCK(p1);
+ PROC_SLOCK(p2);
+ runb = proc_sum(p2, &estb);
+ PROC_SUNLOCK(p2);
+
+ /*
+ * see if at least one of them is runnable
*/
- if (p2->p_slptime > p1->p_slptime)
+ switch (TESTAB(runa, runb)) {
+ case ONLYA:
return (0);
- if (p1->p_slptime > p2->p_slptime)
+ case ONLYB:
return (1);
+ case BOTH:
+ break;
+ }
/*
- * favor one sleeping in a non-interruptible sleep
+ * favor one with highest recent cpu utilization
*/
- if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0)
+ if (estb > esta)
return (1);
- if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0)
+ if (esta > estb)
return (0);
-#endif
+ /*
+ * weed out zombies
+ */
+ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
+ case ONLYA:
+ return (1);
+ case ONLYB:
+ return (0);
+ case BOTH:
+ break;
+ }
+
return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
}
@@ -2841,23 +2878,10 @@
* tty_open().
*/
struct tty *
-ttymalloc(struct tty *tp)
+ttyalloc()
{
- static int once;
-
- if (!once) {
- mtx_init(&tty_list_mutex, "ttylist", NULL, MTX_DEF);
- once++;
- }
+ struct tty *tp;
- if (tp) {
- /*
- * XXX: Either this argument should go away, or we should
- * XXX: require it and do a ttyrel(tp) here and allocate
- * XXX: a new tty. For now do nothing.
- */
- return(tp);
- }
tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO);
mtx_init(&tp->t_mtx, "tty", NULL, MTX_DEF);
@@ -2882,13 +2906,6 @@
return (tp);
}
-struct tty *
-ttyalloc()
-{
-
- return (ttymalloc(NULL));
-}
-
static void
ttypurge(struct cdev *dev)
{
@@ -2912,9 +2929,11 @@
*/
int
-ttycreate(struct tty *tp, struct cdevsw *csw, int unit, int flags, const char *fmt, ...)
+ttycreate(struct tty *tp, int flags, const char *fmt, ...)
{
char namebuf[SPECNAMELEN - 3]; /* XXX space for "tty" */
+ struct cdevsw *csw = NULL;
+ int unit = 0;
va_list ap;
struct cdev *cp;
int i, minor, sminor, sunit;
@@ -2964,7 +2983,7 @@
cp->si_drv2 = &tp->t_lock_in;
cp->si_tty = tp;
- if (flags & MINOR_CALLOUT) {
+ if (flags & TS_CALLOUT) {
cp = make_dev(csw, minor | MINOR_CALLOUT,
UID_UUCP, GID_DIALER, 0660, "cua%s", namebuf);
dev_depends(tp->t_dev, cp);
@@ -2998,13 +3017,20 @@
{
tp->t_state |= TS_GONE;
+ if (SEL_WAITING(&tp->t_rsel))
+ selwakeuppri(&tp->t_rsel, TTIPRI);
+ if (SEL_WAITING(&tp->t_wsel))
+ selwakeuppri(&tp->t_wsel, TTOPRI);
+ if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL)
+ pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
wakeup(&tp->t_dtr_wait);
wakeup(TSA_CARR_ON(tp));
wakeup(TSA_HUP_OR_INPUT(tp));
wakeup(TSA_OCOMPLETE(tp));
wakeup(TSA_OLOWAT(tp));
- if (tp->t_purge != NULL)
- tp->t_purge(tp);
+ KNOTE_UNLOCKED(&tp->t_rsel.si_note, 0);
+ KNOTE_UNLOCKED(&tp->t_wsel.si_note, 0);
+ tt_purge(tp);
}
/*
@@ -3014,16 +3040,19 @@
*
* XXX: This shall sleep until all threads have left the driver.
*/
-
void
ttyfree(struct tty *tp)
{
+ struct cdev *dev;
u_int unit;
mtx_assert(&Giant, MA_OWNED);
ttygone(tp);
unit = tp->t_devunit;
- destroy_dev(tp->t_mdev);
+ dev = tp->t_mdev;
+ tp->t_dev = NULL;
+ ttyrel(tp);
+ destroy_dev(dev);
free_unr(tty_unit, unit);
}
@@ -3039,7 +3068,6 @@
tp = TAILQ_FIRST(&tty_list);
if (tp != NULL)
ttyref(tp);
- mtx_unlock(&tty_list_mutex);
while (tp != NULL) {
bzero(&xt, sizeof xt);
xt.xt_size = sizeof xt;
@@ -3048,6 +3076,18 @@
xt.xt_cancc = tp->t_canq.c_cc;
xt.xt_outcc = tp->t_outq.c_cc;
XT_COPY(line);
+
+ /*
+ * XXX: We hold the tty list lock while doing this to
+ * work around a race with pty/pts tty destruction.
+ * They set t_dev to NULL and then call ttyrel() to
+ * free the structure which will block on the list
+ * lock before they call destroy_dev() on the cdev
+ * backing t_dev.
+ *
+ * XXX: ttyfree() now does the same since it has been
+ * fixed to not leak ttys.
+ */
if (tp->t_dev != NULL)
xt.xt_dev = dev2udev(tp->t_dev);
XT_COPY(state);
@@ -3070,6 +3110,7 @@
XT_COPY(olowat);
XT_COPY(ospeedwat);
#undef XT_COPY
+ mtx_unlock(&tty_list_mutex);
error = SYSCTL_OUT(req, &xt, sizeof xt);
if (error != 0) {
ttyrel(tp);
@@ -3082,7 +3123,9 @@
mtx_unlock(&tty_list_mutex);
ttyrel(tp);
tp = tp2;
+ mtx_lock(&tty_list_mutex);
}
+ mtx_unlock(&tty_list_mutex);
return (0);
}
@@ -3108,6 +3151,7 @@
struct tty *tp;
tp = dev->si_tty;
+
s = spltty();
/*
* We jump to this label after all non-interrupted sleeps to pick
@@ -3135,7 +3179,8 @@
goto out;
goto open_top;
}
- if (tp->t_state & TS_XCLUDE && suser(td))
+ if (tp->t_state & TS_XCLUDE && priv_check(td,
+ PRIV_TTY_EXCLUSIVE))
return (EBUSY);
} else {
/*
@@ -3147,16 +3192,15 @@
tp->t_termios = ISCALLOUT(dev) ? tp->t_init_out : tp->t_init_in;
tp->t_cflag = tp->t_termios.c_cflag;
if (tp->t_modem != NULL)
- tp->t_modem(tp, SER_DTR | SER_RTS, 0);
+ tt_modem(tp, SER_DTR | SER_RTS, 0);
++tp->t_wopeners;
- error = tp->t_param(tp, &tp->t_termios);
+ error = tt_param(tp, &tp->t_termios);
--tp->t_wopeners;
- if (error == 0 && tp->t_open != NULL)
- error = tp->t_open(tp, dev);
+ if (error == 0)
+ error = tt_open(tp, dev);
if (error != 0)
goto out;
- if (ISCALLOUT(dev) || (tp->t_modem != NULL &&
- (tp->t_modem(tp, 0, 0) & SER_DCD)))
+ if (ISCALLOUT(dev) || (tt_modem(tp, 0, 0) & SER_DCD))
ttyld_modem(tp, 1);
}
/*
@@ -3177,9 +3221,8 @@
tp->t_actout = TRUE;
out:
splx(s);
- if (!(tp->t_state & TS_ISOPEN) && tp->t_wopeners == 0 &&
- tp->t_close != NULL)
- tp->t_close(tp);
+ if (!(tp->t_state & TS_ISOPEN) && tp->t_wopeners == 0)
+ tt_close(tp);
return (error);
}
@@ -3191,8 +3234,7 @@
tp = dev->si_tty;
ttyld_close(tp, flag);
ttyldoptim(tp);
- if (tp->t_close != NULL)
- tp->t_close(tp);
+ tt_close(tp);
tp->t_do_timestamp = 0;
if (tp->t_pps != NULL)
tp->t_pps->ppsparam.mode = 0;
@@ -3364,7 +3406,7 @@
ct = dev->si_drv2;
switch (cmd) {
case TIOCSETA:
- error = suser(td);
+ error = priv_check(td, PRIV_TTY_SETA);
if (error != 0)
return (error);
*ct = *(struct termios *)data;
@@ -3424,6 +3466,7 @@
tp->t_lock_in.c_ispeed = tp->t_lock_in.c_ospeed = speed;
tp->t_init_out = tp->t_init_in;
tp->t_termios = tp->t_init_in;
+ ttsetwater(tp);
}
/*
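
The reworked thread_compare()/proc_compare() in tty.c keep the old
TESTAB()/ONLYA/ONLYB/BOTH idiom: two boolean criteria are packed into two bits so a
switch can distinguish "only the first holds", "only the second holds", and "both",
with later criteria breaking ties. A self-contained sketch of that tie-breaking shape
on plain structs rather than threads or processes:

#include <stdio.h>

#define	TESTAB(a, b)	((a) << 1 | (b))
#define	ONLYA	2
#define	ONLYB	1
#define	BOTH	3

struct cand {
	int runnable;		/* stands in for TD_IS_RUNNING/TD_ON_RUNQ */
	int recent_cpu;		/* stands in for sched_pctcpu() */
	int id;			/* stands in for the pid tie-break */
};

/* Return nonzero if b is "more interesting" than a, mirroring the code above. */
static int
cand_compare(const struct cand *a, const struct cand *b)
{
	if (a == NULL)
		return (1);
	switch (TESTAB(a->runnable != 0, b->runnable != 0)) {
	case ONLYA:
		return (0);
	case ONLYB:
		return (1);
	case BOTH:
		break;
	default:		/* neither runnable: fall through to CPU use */
		break;
	}
	if (b->recent_cpu > a->recent_cpu)
		return (1);
	if (a->recent_cpu > b->recent_cpu)
		return (0);
	return (b->id > a->id);	/* final tie-break: highest id wins */
}

int
main(void)
{
	struct cand a = { 1, 10, 100 }, b = { 1, 10, 200 };

	printf("%s\n", cand_compare(&a, &b) ? "b picked" : "a picked");
	return (0);
}

ttyinfo() only needs "the most interesting" candidate, so the comparison can tolerate
the stale, unlocked snapshots the new code takes; a wrong pick just means a slightly
less interesting thread or process gets printed.
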
Index: uipc_sem.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_sem.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/uipc_sem.c -L sys/kern/uipc_sem.c -u -r1.1.1.2 -r1.2
--- sys/kern/uipc_sem.c
+++ sys/kern/uipc_sem.c
@@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_sem.c,v 1.20.2.1 2006/02/13 23:51:19 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_sem.c,v 1.28.4.1 2008/01/17 19:52:01 rwatson Exp $");
#include "opt_mac.h"
#include "opt_posix.h"
@@ -42,26 +42,27 @@
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
+#include <sys/ksem.h>
+#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/posix4.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/condvar.h>
#include <sys/sem.h>
#include <sys/uio.h>
+#include <sys/semaphore.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/time.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
+#include <sys/_semaphore.h>
-#include <posix4/ksem.h>
-#include <posix4/posix4.h>
-#include <posix4/semaphore.h>
-#include <posix4/_semaphore.h>
+#include <security/mac/mac_framework.h>
static int sem_count_proc(struct proc *p);
static struct ksem *sem_lookup_byname(const char *name);
@@ -71,6 +72,7 @@
static int sem_perm(struct thread *td, struct ksem *ks);
static void sem_enter(struct proc *p, struct ksem *ks);
static int sem_leave(struct proc *p, struct ksem *ks);
+static void sem_exechook(void *arg, struct proc *p, struct image_params *imgp);
static void sem_exithook(void *arg, struct proc *p);
static void sem_forkhook(void *arg, struct proc *p1, struct proc *p2,
int flags);
@@ -417,21 +419,32 @@
{
struct ucred *uc;
+ /*
+ * XXXRW: This permission routine appears to be incorrect. If the
+ * user matches, we shouldn't go on to the group if the user
+ * permissions don't allow the action? Not changed for now. To fix,
+ * change from a series of if (); if (); to if () else if () else...
+ */
uc = td->td_ucred;
DP(("sem_perm: uc(%d,%d) ks(%d,%d,%o)\n",
uc->cr_uid, uc->cr_gid,
ks->ks_uid, ks->ks_gid, ks->ks_mode));
- if ((uc->cr_uid == ks->ks_uid && (ks->ks_mode & S_IWUSR) != 0) ||
- (uc->cr_gid == ks->ks_gid && (ks->ks_mode & S_IWGRP) != 0) ||
- (ks->ks_mode & S_IWOTH) != 0 || suser(td) == 0)
+ if ((uc->cr_uid == ks->ks_uid) && (ks->ks_mode & S_IWUSR) != 0)
+ return (0);
+ if ((uc->cr_gid == ks->ks_gid) && (ks->ks_mode & S_IWGRP) != 0)
+ return (0);
+ if ((ks->ks_mode & S_IWOTH) != 0)
return (0);
- return (EPERM);
+ return (priv_check(td, PRIV_SEM_WRITE));
}
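
The XXXRW comment above notes that the checks should short-circuit: once the uid matches, a missing owner bit should deny rather than fall through to the group and other bits. A sketch of the stricter variant the comment describes (not the code that was committed):

static int
sem_perm_strict(struct thread *td, struct ksem *ks)
{
	struct ucred *uc = td->td_ucred;

	if (uc->cr_uid == ks->ks_uid) {
		if (ks->ks_mode & S_IWUSR)
			return (0);
	} else if (uc->cr_gid == ks->ks_gid) {
		if (ks->ks_mode & S_IWGRP)
			return (0);
	} else if (ks->ks_mode & S_IWOTH)
		return (0);
	/* No matching class bit: defer to the privilege framework. */
	return (priv_check(td, PRIV_SEM_WRITE));
}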
static void
sem_free(struct ksem *ks)
{
+#ifdef MAC
+ mac_destroy_posix_sem(ks);
+#endif
nsems--;
if (ks->ks_onlist)
LIST_REMOVE(ks, ks_entry);
@@ -508,7 +521,6 @@
};
int ksem_unlink(struct thread *td, struct ksem_unlink_args *uap);
#endif
-
int
ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
{
@@ -556,7 +568,6 @@
};
int ksem_close(struct thread *td, struct ksem_close_args *uap);
#endif
-
int
ksem_close(struct thread *td, struct ksem_close_args *uap)
{
@@ -629,7 +640,6 @@
};
int ksem_wait(struct thread *td, struct ksem_wait_args *uap);
#endif
-
int
ksem_wait(struct thread *td, struct ksem_wait_args *uap)
{
@@ -640,7 +650,7 @@
#ifndef _SYS_SYSPROTO_H_
struct ksem_timedwait_args {
semid_t id;
- struct timespec *abstime;
+ const struct timespec *abstime;
};
int ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap);
#endif
@@ -919,6 +929,12 @@
}
static void
+sem_exechook(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ sem_exithook(arg, p);
+}
+
+static void
sem_exithook(void *arg, struct proc *p)
{
struct ksem *ks, *ksnext;
@@ -951,7 +967,7 @@
p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
sem_exit_tag = EVENTHANDLER_REGISTER(process_exit, sem_exithook,
NULL, EVENTHANDLER_PRI_ANY);
- sem_exec_tag = EVENTHANDLER_REGISTER(process_exec, sem_exithook,
+ sem_exec_tag = EVENTHANDLER_REGISTER(process_exec, sem_exechook,
NULL, EVENTHANDLER_PRI_ANY);
sem_fork_tag = EVENTHANDLER_REGISTER(process_fork, sem_forkhook, NULL, EVENTHANDLER_PRI_ANY);
break;
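
The new sem_exechook() exists only because process_exec handlers take an extra image_params argument; it forwards to sem_exithook() so exec and exit share the same cleanup. A hedged sketch of that adapter pattern (the mymod_* names and tags are placeholders; EVENTHANDLER_REGISTER() and the events are real):

static void
mymod_exithook(void *arg, struct proc *p)
{
	/* common per-process cleanup for both exec and exit */
}

static void
mymod_exechook(void *arg, struct proc *p, struct image_params *imgp __unused)
{
	mymod_exithook(arg, p);		/* exec tears down state like exit */
}

/* at module load time */
exec_tag = EVENTHANDLER_REGISTER(process_exec, mymod_exechook,
    NULL, EVENTHANDLER_PRI_ANY);
exit_tag = EVENTHANDLER_REGISTER(process_exit, mymod_exithook,
    NULL, EVENTHANDLER_PRI_ANY);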
Index: bus_if.m
===================================================================
RCS file: /home/cvs/src/sys/kern/bus_if.m,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/bus_if.m -L sys/kern/bus_if.m -u -r1.1.1.1 -r1.2
--- sys/kern/bus_if.m
+++ sys/kern/bus_if.m
@@ -23,7 +23,7 @@
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
-# $FreeBSD: src/sys/kern/bus_if.m,v 1.29 2005/01/06 23:35:38 imp Exp $
+# $FreeBSD: src/sys/kern/bus_if.m,v 1.34 2007/02/23 12:19:01 piso Exp $
#
#include <sys/bus.h>
@@ -326,6 +326,7 @@
device_t _child;
struct resource *_irq;
int _flags;
+ driver_filter_t *_filter;
driver_intr_t *_intr;
void *_arg;
void **_cookiep;
@@ -507,3 +508,36 @@
enum intr_trigger _trig;
enum intr_polarity _pol;
} DEFAULT bus_generic_config_intr;
+
+/**
+ * @brief Notify a (bus) driver about a child that the hints mechanism
+ * believes it has discovered.
+ *
+ * The bus is responsible for then adding the child in the right order
+ * and discovering other things about the child. The bus driver is
+ * free to ignore this hint, to do special things, etc. It is all up
+ * to the bus driver to interpret.
+ *
+ * This method is only called in response to the parent bus asking for
+ * hinted devices to be enumerated.
+ *
+ * @param _dev the bus device
+ * @param _dname the name of the device w/o unit numbers
+ * @param _dunit the unit number of the device
+ */
+METHOD void hinted_child {
+ device_t _dev;
+ const char * _dname;
+ int _dunit;
+};
+
+/**
+ * @brief Returns bus_dma_tag_t for use w/ devices on the bus.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device to which the tag will belong
+ */
+METHOD bus_dma_tag_t get_dma_tag {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_get_dma_tag;
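
The new HINTED_CHILD method lets a bus react to devices listed in device.hints. A hedged sketch of how a bus driver might implement it; the mybus_ name is fictitious, while BUS_ADD_CHILD(), resource_*_value() and bus_set_resource() are existing newbus helpers:

static void
mybus_hinted_child(device_t bus, const char *dname, int dunit)
{
	device_t child;
	long maddr;
	int irq;

	/* Create the child the hints mechanism told us about. */
	child = BUS_ADD_CHILD(bus, 0, dname, dunit);
	if (child == NULL)
		return;

	/* Pull optional resources out of the hints and attach them. */
	if (resource_long_value(dname, dunit, "maddr", &maddr) == 0)
		bus_set_resource(child, SYS_RES_MEMORY, 0, maddr, 1);
	if (resource_int_value(dname, dunit, "irq", &irq) == 0)
		bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1);
}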
Index: uipc_mbuf2.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_mbuf2.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/uipc_mbuf2.c -L sys/kern/uipc_mbuf2.c -u -r1.1.1.1 -r1.2
--- sys/kern/uipc_mbuf2.c
+++ sys/kern/uipc_mbuf2.c
@@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf2.c,v 1.31.2.1 2005/07/25 00:08:12 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf2.c,v 1.33 2006/10/22 11:52:13 rwatson Exp $");
/*#define PULLDOWN_DEBUG*/
@@ -71,11 +71,12 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
+#include <security/mac/mac_framework.h>
+
static MALLOC_DEFINE(M_PACKET_TAGS, MBUF_TAG_MEM_NAME,
"packet-attached information");
Index: imgact_elf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/imgact_elf.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/imgact_elf.c -L sys/kern/imgact_elf.c -u -r1.2 -r1.3
--- sys/kern/imgact_elf.c
+++ sys/kern/imgact_elf.c
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/imgact_elf.c,v 1.162.2.3 2006/03/16 00:25:31 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/imgact_elf.c,v 1.178.2.2.2.1 2008/01/19 18:15:05 kib Exp $");
#include "opt_compat.h"
@@ -106,6 +106,10 @@
static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
+#define trunc_page_ps(va, ps) ((va) & ~(ps - 1))
+#define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1))
+#define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))
+
int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
@@ -145,7 +149,7 @@
int rval = FALSE;
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_sysent == entry->sysvec) {
rval = TRUE;
break;
@@ -360,9 +364,6 @@
return (ENOEXEC);
}
-#define trunc_page_ps(va, ps) ((va) & ~(ps - 1))
-#define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1))
-
map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
file_addr = trunc_page_ps(offset, pagesize);
@@ -549,6 +550,10 @@
}
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+ if (!aligned(phdr, Elf_Addr)) {
+ error = ENOEXEC;
+ goto fail;
+ }
for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */
@@ -592,11 +597,13 @@
return (error);
}
+static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
+
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
- const Elf_Phdr *phdr;
+ const Elf_Phdr *phdr, *pnote = NULL;
Elf_Auxargs *elf_auxargs;
struct vmspace *vmspace;
vm_prot_t prot;
@@ -607,7 +614,9 @@
int error = 0, i;
const char *interp = NULL;
Elf_Brandinfo *brand_info;
+ const Elf_Note *note, *note_end;
char *path;
+ const char *note_name;
struct thread *td = curthread;
struct sysentvec *sv;
@@ -632,6 +641,8 @@
return (ENOEXEC);
}
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+ if (!aligned(phdr, Elf_Addr))
+ return (ENOEXEC);
for (i = 0; i < hdr->e_phnum; i++) {
if (phdr[i].p_type == PT_INTERP) {
/* Path to interpreter */
@@ -649,7 +660,8 @@
hdr->e_ident[EI_OSABI]);
return (ENOEXEC);
}
- if (hdr->e_type == ET_DYN && brand_info->brand != ELFOSABI_LINUX)
+ if (hdr->e_type == ET_DYN &&
+ (brand_info->flags & BI_CAN_EXEC_DYN) == 0)
return (ENOEXEC);
sv = brand_info->sysvec;
if (interp != NULL && brand_info->interp_newpath != NULL)
@@ -665,9 +677,12 @@
*/
VOP_UNLOCK(imgp->vp, 0, td);
- exec_new_vmspace(imgp, sv);
+ error = exec_new_vmspace(imgp, sv);
+ imgp->proc->p_sysent = sv;
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (error)
+ return (error);
vmspace = imgp->proc->p_vmspace;
@@ -743,6 +758,9 @@
case PT_PHDR: /* Program header table info */
proghdr = phdr[i].p_vaddr;
break;
+ case PT_NOTE:
+ pnote = &phdr[i];
+ break;
default:
break;
}
@@ -783,7 +801,6 @@
imgp->entry_addr = entry;
- imgp->proc->p_sysent = sv;
if (interp != NULL) {
VOP_UNLOCK(imgp->vp, 0, td);
if (brand_info->emul_path != NULL &&
@@ -825,6 +842,41 @@
imgp->auxargs = elf_auxargs;
imgp->interpreted = 0;
+ /*
+ * Try to fetch the osreldate for FreeBSD binary from the ELF
+ * OSABI-note. Only the first page of the image is searched,
+ * the same as for headers.
+ */
+ if (pnote != NULL && pnote->p_offset < PAGE_SIZE &&
+ pnote->p_offset + pnote->p_filesz < PAGE_SIZE ) {
+ note = (const Elf_Note *)(imgp->image_header + pnote->p_offset);
+ if (!aligned(note, Elf32_Addr)) {
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+ return (ENOEXEC);
+ }
+ note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset +
+ pnote->p_filesz);
+ while (note < note_end) {
+ if (note->n_namesz == sizeof(FREEBSD_ABI_VENDOR) &&
+ note->n_descsz == sizeof(int32_t) &&
+ note->n_type == 1 /* ABI_NOTETYPE */) {
+ note_name = (const char *)(note + 1);
+ if (strncmp(FREEBSD_ABI_VENDOR, note_name,
+ sizeof(FREEBSD_ABI_VENDOR)) == 0) {
+ imgp->proc->p_osrel = *(const int32_t *)
+ (note_name +
+ round_page_ps(sizeof(FREEBSD_ABI_VENDOR),
+ sizeof(Elf32_Addr)));
+ break;
+ }
+ }
+ note = (const Elf_Note *)((const char *)(note + 1) +
+ round_page_ps(note->n_namesz, sizeof(Elf32_Addr)) +
+ round_page_ps(note->n_descsz, sizeof(Elf32_Addr)));
+ }
+ }
+
return (error);
}
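
The loop above scans the PT_NOTE segment for the FreeBSD vendor note and records the descriptor value in p_osrel. A stand-alone hedged sketch of the same note walk, using plain userland types and a simplified 4-byte alignment macro:

#include <stdint.h>
#include <string.h>

#define NOTE_ALIGN(x)	(((x) + 3) & ~3u)	/* round name/desc sizes */

struct note_hdr {
	uint32_t n_namesz;
	uint32_t n_descsz;
	uint32_t n_type;
};

/* Returns 0 and fills *osrel if a FreeBSD ABI note (type 1) is found. */
int
find_osrel(const char *p, const char *end, int32_t *osrel)
{
	const struct note_hdr *n;
	const char *name, *desc;

	while (p + sizeof(*n) <= end) {
		n = (const struct note_hdr *)p;
		name = p + sizeof(*n);
		desc = name + NOTE_ALIGN(n->n_namesz);
		if (n->n_type == 1 &&
		    n->n_namesz == sizeof("FreeBSD") &&
		    n->n_descsz == sizeof(int32_t) &&
		    strncmp(name, "FreeBSD", sizeof("FreeBSD")) == 0) {
			memcpy(osrel, desc, sizeof(*osrel));
			return (0);
		}
		p = desc + NOTE_ALIGN(n->n_descsz);
	}
	return (-1);
}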
@@ -891,8 +943,6 @@
static void __elfN(putnote)(void *, size_t *, const char *, int,
const void *, size_t);
-extern int osreldate;
-
int
__elfN(coredump)(td, vp, limit)
struct thread *td;
@@ -1017,11 +1067,12 @@
struct proc *p = td->td_proc;
vm_map_t map = &p->p_vmspace->vm_map;
vm_map_entry_t entry;
+ vm_object_t backing_object, object;
+ boolean_t ignore_entry;
+ vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
- vm_object_t obj;
-
/*
* Don't dump inaccessible mappings, deal with legacy
* coredump mode.
@@ -1047,21 +1098,25 @@
if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
continue;
- if ((obj = entry->object.vm_object) == NULL)
+ if ((object = entry->object.vm_object) == NULL)
continue;
- /* Find the deepest backing object. */
- while (obj->backing_object != NULL)
- obj = obj->backing_object;
-
/* Ignore memory-mapped devices and such things. */
- if (obj->type != OBJT_DEFAULT &&
- obj->type != OBJT_SWAP &&
- obj->type != OBJT_VNODE)
+ VM_OBJECT_LOCK(object);
+ while ((backing_object = object->backing_object) != NULL) {
+ VM_OBJECT_LOCK(backing_object);
+ VM_OBJECT_UNLOCK(object);
+ object = backing_object;
+ }
+ ignore_entry = object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP && object->type != OBJT_VNODE;
+ VM_OBJECT_UNLOCK(object);
+ if (ignore_entry)
continue;
(*func)(entry, closure);
}
+ vm_map_unlock_read(map);
}
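
The rewritten loop above takes VM_OBJECT_LOCK on each object and hands the lock down the backing-object chain instead of walking it unlocked. A hedged sketch of that hand-over-hand (lock coupling) pattern with placeholder node and lock helpers:

/* Sketch only: node, lock(), unlock() and examine() are placeholders. */
lock(node);
while ((next = node->parent) != NULL) {
	lock(next);		/* acquire the next lock ...            */
	unlock(node);		/* ... before releasing the current one */
	node = next;
}
/* node is now the deepest ancestor and is still locked */
examine(node);
unlock(node);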
/*
Index: vfs_hash.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_hash.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vfs_hash.c -L sys/kern/vfs_hash.c -u -r1.1.1.1 -r1.2
--- sys/kern/vfs_hash.c
+++ sys/kern/vfs_hash.c
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_hash.c,v 1.9.2.1 2005/09/12 15:53:58 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_hash.c,v 1.13 2007/03/13 01:50:26 tegge Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -35,7 +35,7 @@
#include <sys/mount.h>
#include <sys/vnode.h>
-static MALLOC_DEFINE(M_VFS_HASH, "VFS hash", "VFS hash table");
+static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table");
static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl;
static LIST_HEAD(,vnode) vfs_hash_side;
@@ -55,14 +55,14 @@
SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL)
static struct vfs_hash_head *
-vfs_hash_index(struct mount *mp, u_int hash)
+vfs_hash_index(const struct mount *mp, u_int hash)
{
return(&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]);
}
int
-vfs_hash_get(struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
{
struct vnode *vp;
int error;
@@ -109,7 +109,6 @@
struct vnode *vp2;
int error;
- lockmgr(vp->v_vnlock, flags & LK_TYPE_MASK, NULL, td);
*vpp = NULL;
while (1) {
mtx_lock(&vfs_hash_mtx);
Index: sched_4bsd.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sched_4bsd.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sched_4bsd.c -L sys/kern/sched_4bsd.c -u -r1.2 -r1.3
--- sys/kern/sched_4bsd.c
+++ sys/kern/sched_4bsd.c
@@ -33,12 +33,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.77 2005/06/24 00:16:57 peter Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.106.2.1 2007/12/20 07:15:40 davidxu Exp $");
#include "opt_hwpmc_hooks.h"
-#define kse td_sched
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -53,6 +51,8 @@
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
+#include <sys/umtx.h>
+#include <machine/pcb.h>
#include <machine/smp.h>
#ifdef HWPMC_HOOKS
@@ -74,84 +74,35 @@
#define NICE_WEIGHT 1 /* Priorities per nice level. */
/*
- * The schedulable entity that can be given a context to run.
- * A process may have several of these. Probably one per processor
- * but posibly a few more. In this universe they are grouped
- * with a KSEG that contains the priority and niceness
- * for the group.
- */
-struct kse {
- TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
- struct thread *ke_thread; /* (*) Active associated thread. */
- fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
- char ke_rqindex; /* (j) Run queue index. */
- enum {
- KES_THREAD = 0x0, /* slaved to thread state */
- KES_ONRUNQ
- } ke_state; /* (j) KSE status. */
- int ke_cpticks; /* (j) Ticks of cpu time. */
- struct runq *ke_runq; /* runq the kse is currently on */
+ * The schedulable entity that runs a context.
+ * This is an extension to the thread structure and is tailored to
+ * the requirements of this scheduler
+ */
+struct td_sched {
+ TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */
+ struct thread *ts_thread; /* (*) Active associated thread. */
+ fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */
+ u_char ts_rqindex; /* (j) Run queue index. */
+ int ts_cpticks; /* (j) Ticks of cpu time. */
+ int ts_slptime; /* (j) Seconds !RUNNING. */
+ struct runq *ts_runq; /* runq the thread is currently on */
};
-#define ke_proc ke_thread->td_proc
-#define ke_ksegrp ke_thread->td_ksegrp
-
-#define td_kse td_sched
-
/* flags kept in td_flags */
-#define TDF_DIDRUN TDF_SCHED0 /* KSE actually ran. */
-#define TDF_EXIT TDF_SCHED1 /* KSE is being killed. */
+#define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */
+#define TDF_EXIT TDF_SCHED1 /* thread is being killed. */
#define TDF_BOUND TDF_SCHED2
-#define ke_flags ke_thread->td_flags
-#define KEF_DIDRUN TDF_DIDRUN /* KSE actually ran. */
-#define KEF_EXIT TDF_EXIT /* KSE is being killed. */
-#define KEF_BOUND TDF_BOUND /* stuck to one CPU */
-
-#define SKE_RUNQ_PCPU(ke) \
- ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
-
-struct kg_sched {
- struct thread *skg_last_assigned; /* (j) Last thread assigned to */
- /* the system scheduler. */
- int skg_avail_opennings; /* (j) Num KSEs requested in group. */
- int skg_concurrency; /* (j) Num KSEs requested in group. */
-};
-#define kg_last_assigned kg_sched->skg_last_assigned
-#define kg_avail_opennings kg_sched->skg_avail_opennings
-#define kg_concurrency kg_sched->skg_concurrency
-
-#define SLOT_RELEASE(kg) \
-do { \
- kg->kg_avail_opennings++; \
- CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \
- kg, \
- kg->kg_concurrency, \
- kg->kg_avail_opennings); \
-/* KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \
- ("slots out of whack"));*/ \
-} while (0)
-
-#define SLOT_USE(kg) \
-do { \
- kg->kg_avail_opennings--; \
- CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \
- kg, \
- kg->kg_concurrency, \
- kg->kg_avail_opennings); \
-/* KASSERT((kg->kg_avail_opennings >= 0), \
- ("slots out of whack"));*/ \
-} while (0)
+#define ts_flags ts_thread->td_flags
+#define TSF_DIDRUN TDF_DIDRUN /* thread actually ran. */
+#define TSF_EXIT TDF_EXIT /* thread is being killed. */
+#define TSF_BOUND TDF_BOUND /* stuck to one CPU */
-/*
- * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
- * cpus.
- */
-#define KSE_CAN_MIGRATE(ke) \
- ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
+#define SKE_RUNQ_PCPU(ts) \
+ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
-static struct kse kse0;
-static struct kg_sched kg_sched0;
+static struct td_sched td_sched0;
+struct mtx sched_lock;
static int sched_tdcnt; /* Total runnable threads in the system. */
static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
@@ -159,9 +110,6 @@
static struct callout roundrobin_callout;
-static void slot_fill(struct ksegrp *kg);
-static struct kse *sched_choose(void); /* XXX Should be thread * */
-
static void setup_runqs(void);
static void roundrobin(void *arg);
static void schedcpu(void);
@@ -169,9 +117,9 @@
static void sched_priority(struct thread *td, u_char prio);
static void sched_setup(void *dummy);
static void maybe_resched(struct thread *td);
-static void updatepri(struct ksegrp *kg);
-static void resetpriority(struct ksegrp *kg);
-static void resetpriority_thread(struct thread *td, struct ksegrp *kg);
+static void updatepri(struct thread *td);
+static void resetpriority(struct thread *td);
+static void resetpriority_thread(struct thread *td);
#ifdef SMP
static int forward_wakeup(int cpunum);
#endif
@@ -274,20 +222,12 @@
"account for htt");
#endif
+#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
&sched_followon, 0,
"allow threads to share a quantum");
-
-static int sched_pfollowons = 0;
-SYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
- &sched_pfollowons, 0,
- "number of followons done to a different ksegrp");
-
-static int sched_kgfollowons = 0;
-SYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
- &sched_kgfollowons, 0,
- "number of followons done in a ksegrp");
+#endif
static __inline void
sched_load_add(void)
@@ -310,7 +250,7 @@
maybe_resched(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority < curthread->td_priority)
curthread->td_flags |= TDF_NEEDRESCHED;
}
@@ -338,20 +278,20 @@
/*
* Constants for digital decay and forget:
- * 90% of (kg_estcpu) usage in 5 * loadav time
- * 95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
+ * 90% of (td_estcpu) usage in 5 * loadav time
+ * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
* Note that, as ps(1) mentions, this can let percentages
* total over 100% (I've seen 137.9% for 3 processes).
*
- * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
+ * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
*
- * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
+ * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
* That is, the system wants to compute a value of decay such
* that the following for loop:
* for (i = 0; i < (5 * loadavg); i++)
- * kg_estcpu *= decay;
+ * td_estcpu *= decay;
* will compute
- * kg_estcpu *= 0.1;
+ * td_estcpu *= 0.1;
* for all values of loadavg:
*
* Mathematically this loop can be expressed by saying:
@@ -404,7 +344,7 @@
#define loadfactor(loadav) (2 * (loadav))
#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
-/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
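
Per the comment block above, the decay factor is chosen so that roughly 90% of td_estcpu disappears over 5 * loadavg seconds. A stand-alone arithmetic check of that claim using the decay_cpu() form shown (FSCALE assumed to be 2048, i.e. an FSHIFT of 11):

#include <stdio.h>

#define FSCALE			2048	/* assumed 1 << FSHIFT */
#define loadfactor(loadav)	(2 * (loadav))
#define decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

int
main(void)
{
	long loadav = 1 * FSCALE;		/* load average of 1.0 */
	long loadfac = loadfactor(loadav);	/* decay factor of 2/3 per step */
	long estcpu = 255;			/* the ESTCPULIM maximum */
	int i;

	for (i = 0; i < 5 * 1; i++)		/* 5 * loadavg iterations */
		estcpu = decay_cpu(loadfac, estcpu);
	/* (2/3)^5 is about 0.13, close to the documented 90% decay */
	printf("remaining estcpu: %ld of 255\n", estcpu);
	return (0);
}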
@@ -433,79 +373,70 @@
register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
struct thread *td;
struct proc *p;
- struct kse *ke;
- struct ksegrp *kg;
+ struct td_sched *ts;
int awake, realstathz;
realstathz = stathz ? stathz : hz;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * Prevent state changes and protect run queue.
- */
- mtx_lock_spin(&sched_lock);
- /*
- * Increment time in/out of memory. We ignore overflow; with
- * 16-bit int's (remember them?) overflow takes 45 days.
- */
- p->p_swtime++;
- FOREACH_KSEGRP_IN_PROC(p, kg) {
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
awake = 0;
- FOREACH_THREAD_IN_GROUP(kg, td) {
- ke = td->td_kse;
- /*
- * Increment sleep time (if sleeping). We
- * ignore overflow, as above.
- */
- /*
- * The kse slptimes are not touched in wakeup
- * because the thread may not HAVE a KSE.
- */
- if (ke->ke_state == KES_ONRUNQ) {
- awake = 1;
- ke->ke_flags &= ~KEF_DIDRUN;
- } else if ((ke->ke_state == KES_THREAD) &&
- (TD_IS_RUNNING(td))) {
- awake = 1;
- /* Do not clear KEF_DIDRUN */
- } else if (ke->ke_flags & KEF_DIDRUN) {
- awake = 1;
- ke->ke_flags &= ~KEF_DIDRUN;
- }
+ thread_lock(td);
+ ts = td->td_sched;
+ /*
+ * Increment sleep time (if sleeping). We
+ * ignore overflow, as above.
+ */
+ /*
+ * The td_sched slptimes are not touched in wakeup
+ * because the thread may not HAVE everything in
+ * memory? XXX I think this is out of date.
+ */
+ if (TD_ON_RUNQ(td)) {
+ awake = 1;
+ ts->ts_flags &= ~TSF_DIDRUN;
+ } else if (TD_IS_RUNNING(td)) {
+ awake = 1;
+ /* Do not clear TSF_DIDRUN */
+ } else if (ts->ts_flags & TSF_DIDRUN) {
+ awake = 1;
+ ts->ts_flags &= ~TSF_DIDRUN;
+ }
- /*
- * ke_pctcpu is only for ps and ttyinfo().
- * Do it per kse, and add them up at the end?
- * XXXKSE
- */
- ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
- FSHIFT;
- /*
- * If the kse has been idle the entire second,
- * stop recalculating its priority until
- * it wakes up.
- */
- if (ke->ke_cpticks == 0)
- continue;
+ /*
+ * ts_pctcpu is only for ps and ttyinfo().
+ * Do it per td_sched, and add them up at the end?
+ * XXXKSE
+ */
+ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
+ /*
+ * If the td_sched has been idle the entire second,
+ * stop recalculating its priority until
+ * it wakes up.
+ */
+ if (ts->ts_cpticks != 0) {
#if (FSHIFT >= CCPU_SHIFT)
- ke->ke_pctcpu += (realstathz == 100)
- ? ((fixpt_t) ke->ke_cpticks) <<
+ ts->ts_pctcpu += (realstathz == 100)
+ ? ((fixpt_t) ts->ts_cpticks) <<
(FSHIFT - CCPU_SHIFT) :
- 100 * (((fixpt_t) ke->ke_cpticks)
+ 100 * (((fixpt_t) ts->ts_cpticks)
<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
- ke->ke_pctcpu += ((FSCALE - ccpu) *
- (ke->ke_cpticks *
+ ts->ts_pctcpu += ((FSCALE - ccpu) *
+ (ts->ts_cpticks *
FSCALE / realstathz)) >> FSHIFT;
#endif
- ke->ke_cpticks = 0;
- } /* end of kse loop */
+ ts->ts_cpticks = 0;
+ }
/*
- * If there are ANY running threads in this KSEGRP,
+ * If there are ANY running threads in this process,
* then don't count it as sleeping.
+XXX this is broken
+
*/
if (awake) {
- if (kg->kg_slptime > 1) {
+ if (ts->ts_slptime > 1) {
/*
* In an ideal world, this should not
* happen, because whoever woke us
@@ -515,20 +446,21 @@
* priority. Should KASSERT at some
* point when all the cases are fixed.
*/
- updatepri(kg);
+ updatepri(td);
}
- kg->kg_slptime = 0;
+ ts->ts_slptime = 0;
} else
- kg->kg_slptime++;
- if (kg->kg_slptime > 1)
+ ts->ts_slptime++;
+ if (ts->ts_slptime > 1) {
+ thread_unlock(td);
continue;
- kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
- resetpriority(kg);
- FOREACH_THREAD_IN_GROUP(kg, td) {
- resetpriority_thread(td, kg);
}
- } /* end of ksegrp loop */
- mtx_unlock_spin(&sched_lock);
+ td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
+ resetpriority(td);
+ resetpriority_thread(td);
+ thread_unlock(td);
+ } /* end of thread loop */
+ PROC_SUNLOCK(p);
} /* end of process loop */
sx_sunlock(&allproc_lock);
}
@@ -539,34 +471,35 @@
static void
schedcpu_thread(void)
{
- int nowake;
for (;;) {
schedcpu();
- tsleep(&nowake, 0, "-", hz);
+ pause("-", hz);
}
}
/*
* Recalculate the priority of a process after it has slept for a while.
- * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
- * least six times the loadfactor will decay kg_estcpu to zero.
+ * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay td_estcpu to zero.
*/
static void
-updatepri(struct ksegrp *kg)
+updatepri(struct thread *td)
{
- register fixpt_t loadfac;
- register unsigned int newcpu;
+ struct td_sched *ts;
+ fixpt_t loadfac;
+ unsigned int newcpu;
+ ts = td->td_sched;
loadfac = loadfactor(averunnable.ldavg[0]);
- if (kg->kg_slptime > 5 * loadfac)
- kg->kg_estcpu = 0;
+ if (ts->ts_slptime > 5 * loadfac)
+ td->td_estcpu = 0;
else {
- newcpu = kg->kg_estcpu;
- kg->kg_slptime--; /* was incremented in schedcpu() */
- while (newcpu && --kg->kg_slptime)
+ newcpu = td->td_estcpu;
+ ts->ts_slptime--; /* was incremented in schedcpu() */
+ while (newcpu && --ts->ts_slptime)
newcpu = decay_cpu(loadfac, newcpu);
- kg->kg_estcpu = newcpu;
+ td->td_estcpu = newcpu;
}
}
@@ -576,25 +509,25 @@
* than that of the current process.
*/
static void
-resetpriority(struct ksegrp *kg)
+resetpriority(struct thread *td)
{
register unsigned int newpriority;
- if (kg->kg_pri_class == PRI_TIMESHARE) {
- newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
- NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
+ if (td->td_pri_class == PRI_TIMESHARE) {
+ newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
+ NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
PRI_MAX_TIMESHARE);
- kg->kg_user_pri = newpriority;
+ sched_user_prio(td, newpriority);
}
}
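
The timeshare branch above folds accumulated estcpu and nice into a user priority. A hedged numeric sketch of that formula; NICE_WEIGHT (1) comes from this file, while the PUSER and INVERSE_ESTCPU_WEIGHT values are assumed for illustration only:

#include <stdio.h>

#define NICE_WEIGHT		1	/* from sched_4bsd.c */
#define INVERSE_ESTCPU_WEIGHT	8	/* assumed value for illustration */
#define PUSER			160	/* assumed PRI_MIN_TIMESHARE */
#define PRIO_MIN		(-20)

int
main(void)
{
	unsigned int estcpu = 40;	/* hypothetical accumulated estcpu */
	int nice = 0;			/* default nice level */
	unsigned int newpriority;

	newpriority = PUSER + estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (nice - PRIO_MIN);
	/* 160 + 5 + 20 = 185; the real code then clamps to the timeshare range */
	printf("newpriority = %u\n", newpriority);
	return (0);
}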
/*
- * Update the thread's priority when the associated ksegroup's user
+ * Update the thread's priority when the associated process's user
* priority changes.
*/
static void
-resetpriority_thread(struct thread *td, struct ksegrp *kg)
+resetpriority_thread(struct thread *td)
{
/* Only change threads with a time sharing user priority. */
@@ -605,7 +538,7 @@
/* XXX the whole needresched thing is broken, but not silly. */
maybe_resched(td);
- sched_prio(td, kg->kg_user_pri);
+ sched_prio(td, td->td_user_pri);
}
/* ARGSUSED */
@@ -641,12 +574,10 @@
* Set up the scheduler specific parts of proc0.
*/
proc0.p_sched = NULL; /* XXX */
- ksegrp0.kg_sched = &kg_sched0;
- thread0.td_sched = &kse0;
- kse0.ke_thread = &thread0;
- kse0.ke_state = KES_THREAD;
- kg_sched0.skg_concurrency = 1;
- kg_sched0.skg_avail_opennings = 0; /* we are already running */
+ thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
+ td_sched0.ts_thread = &thread0;
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}
int
@@ -670,8 +601,8 @@
/*
* We adjust the priority of the current process. The priority of
* a process gets worse as it accumulates CPU time. The cpu usage
- * estimator (kg_estcpu) is increased here. resetpriority() will
- * compute a different priority each time kg_estcpu increases by
+ * estimator (td_estcpu) is increased here. resetpriority() will
+ * compute a different priority each time td_estcpu increases by
* INVERSE_ESTCPU_WEIGHT
* (until MAXPRI is reached). The cpu usage estimator ramps up
* quite quickly when the process is running (linearly), and decays
@@ -684,102 +615,86 @@
void
sched_clock(struct thread *td)
{
- struct ksegrp *kg;
- struct kse *ke;
+ struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
- kg = td->td_ksegrp;
- ke = td->td_kse;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
- ke->ke_cpticks++;
- kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
- if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
- resetpriority(kg);
- resetpriority_thread(td, kg);
+ ts->ts_cpticks++;
+ td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
+ if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+ resetpriority(td);
+ resetpriority_thread(td);
}
}
/*
* charge childs scheduling cpu usage to parent.
- *
- * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
- * Charge it to the ksegrp that did the wait since process estcpu is sum of
- * all ksegrps, this is strictly as expected. Assume that the child process
- * aggregated all the estcpu into the 'built-in' ksegrp.
*/
void
sched_exit(struct proc *p, struct thread *td)
{
- sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
- sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
-}
-void
-sched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
-{
-
- mtx_assert(&sched_lock, MA_OWNED);
- kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
+ CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
+ td, td->td_proc->p_comm, td->td_priority);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}
void
sched_exit_thread(struct thread *td, struct thread *child)
{
+
CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
child, child->td_proc->p_comm, child->td_priority);
+ thread_lock(td);
+ td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
+ thread_unlock(td);
+ mtx_lock_spin(&sched_lock);
if ((child->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_rem();
+ mtx_unlock_spin(&sched_lock);
}
void
sched_fork(struct thread *td, struct thread *childtd)
{
- sched_fork_ksegrp(td, childtd->td_ksegrp);
sched_fork_thread(td, childtd);
}
void
-sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
-{
- mtx_assert(&sched_lock, MA_OWNED);
- child->kg_estcpu = td->td_ksegrp->kg_estcpu;
-}
-
-void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
+ childtd->td_estcpu = td->td_estcpu;
+ childtd->td_lock = &sched_lock;
sched_newthread(childtd);
}
void
sched_nice(struct proc *p, int nice)
{
- struct ksegrp *kg;
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
p->p_nice = nice;
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- resetpriority(kg);
- FOREACH_THREAD_IN_GROUP(kg, td) {
- resetpriority_thread(td, kg);
- }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ resetpriority(td);
+ resetpriority_thread(td);
+ thread_unlock(td);
}
}
void
-sched_class(struct ksegrp *kg, int class)
+sched_class(struct thread *td, int class)
{
- mtx_assert(&sched_lock, MA_OWNED);
- kg->kg_pri_class = class;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_pri_class = class;
}
/*
* Adjust the priority of a thread.
- * This may include moving the thread within the KSEGRP,
- * changing the assignment of a kse to the thread,
- * and moving a KSE in the system run queue.
*/
static void
sched_priority(struct thread *td, u_char prio)
@@ -788,13 +703,14 @@
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority == prio)
return;
- if (TD_ON_RUNQ(td)) {
- adjustrunqueue(td, prio);
- } else {
- td->td_priority = prio;
+ td->td_priority = prio;
+ if (TD_ON_RUNQ(td) &&
+ td->td_sched->ts_rqindex != (prio / RQ_PPQ)) {
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
}
}
@@ -825,7 +741,7 @@
if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
td->td_base_pri <= PRI_MAX_TIMESHARE)
- base_pri = td->td_ksegrp->kg_user_pri;
+ base_pri = td->td_user_pri;
else
base_pri = td->td_base_pri;
if (prio >= base_pri) {
@@ -863,54 +779,75 @@
}
void
-sched_sleep(struct thread *td)
+sched_user_prio(struct thread *td, u_char prio)
{
+ u_char oldprio;
- mtx_assert(&sched_lock, MA_OWNED);
- td->td_ksegrp->kg_slptime = 0;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_base_user_pri = prio;
+ if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
+ return;
+ oldprio = td->td_user_pri;
+ td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_flags |= TDF_UBORROWING;
+
+ oldprio = td->td_user_pri;
+ td->td_user_pri = prio;
+}
+
+void
+sched_unlend_user_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ base_pri = td->td_base_user_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_UBORROWING;
+ sched_user_prio(td, base_pri);
+ } else {
+ sched_lend_user_prio(td, prio);
+ }
}
-static void remrunqueue(struct thread *td);
+void
+sched_sleep(struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_slptick = ticks;
+ td->td_sched->ts_slptime = 0;
+}
void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
- struct kse *ke;
- struct ksegrp *kg;
+ struct td_sched *ts;
struct proc *p;
- ke = td->td_kse;
+ ts = td->td_sched;
p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_unlock(td);
+ }
if ((p->p_flag & P_NOLOAD) == 0)
sched_load_rem();
- /*
- * We are volunteering to switch out so we get to nominate
- * a successor for the rest of our quantum
- * First try another thread in our ksegrp, and then look for
- * other ksegrps in our process.
- */
- if (sched_followon &&
- (p->p_flag & P_HADTHREADS) &&
- (flags & SW_VOL) &&
- newtd == NULL) {
- /* lets schedule another thread from this process */
- kg = td->td_ksegrp;
- if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
- remrunqueue(newtd);
- sched_kgfollowons++;
- } else {
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
- sched_pfollowons++;
- remrunqueue(newtd);
- break;
- }
- }
- }
- }
if (newtd)
newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);
@@ -925,24 +862,17 @@
* or stopped or any thing else similar. We never put the idle
* threads on the run queue, however.
*/
- if (td == PCPU_GET(idlethread))
+ if (td->td_flags & TDF_IDLETD) {
TD_SET_CAN_RUN(td);
- else {
- SLOT_RELEASE(td->td_ksegrp);
+#ifdef SMP
+ idle_cpus_mask &= ~PCPU_GET(cpumask);
+#endif
+ } else {
if (TD_IS_RUNNING(td)) {
- /* Put us back on the run queue (kse and all). */
- setrunqueue(td, (flags & SW_PREEMPT) ?
+ /* Put us back on the run queue. */
+ sched_add(td, (flags & SW_PREEMPT) ?
SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
SRQ_OURSELF|SRQ_YIELDING);
- } else if (p->p_flag & P_HADTHREADS) {
- /*
- * We will not be on the run queue. So we must be
- * sleeping or similar. As it's available,
- * someone else can use the KSE if they need it.
- * It's NOT available if we are about to need it
- */
- if (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp)
- slot_fill(td->td_ksegrp);
}
}
if (newtd) {
@@ -955,45 +885,68 @@
* * A followon
*/
KASSERT((newtd->td_inhibitors == 0),
- ("trying to run inhibitted thread"));
- SLOT_USE(newtd->td_ksegrp);
- newtd->td_kse->ke_flags |= KEF_DIDRUN;
+ ("trying to run inhibited thread"));
+ newtd->td_sched->ts_flags |= TSF_DIDRUN;
TD_SET_RUNNING(newtd);
if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_add();
} else {
newtd = choosethread();
}
+ MPASS(newtd->td_lock == &sched_lock);
if (td != newtd) {
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
- cpu_switch(td, newtd);
+
+ /* I feel sleepy */
+ cpu_switch(td, newtd, td->td_lock);
+ /*
+ * Where am I? What year is it?
+ * We are in the same thread that went to sleep above,
+ * but any amount of time may have passed. All our context
+ * will still be available as will local variables.
+ * PCPU values however may have changed as we may have
+ * changed CPU so don't trust cached values of them.
+ * New threads will go to fork_exit() instead of here
+ * so if you change things here you may need to change
+ * things there too.
+ * If the thread above was exiting it will never wake
+ * up again here, so either it has saved everything it
+ * needed to, or the thread_wait() or wait() will
+ * need to reap it.
+ */
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
}
+#ifdef SMP
+ if (td->td_flags & TDF_IDLETD)
+ idle_cpus_mask |= PCPU_GET(cpumask);
+#endif
sched_lock.mtx_lock = (uintptr_t)td;
td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
}
void
sched_wakeup(struct thread *td)
{
- struct ksegrp *kg;
+ struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
- kg = td->td_ksegrp;
- if (kg->kg_slptime > 1) {
- updatepri(kg);
- resetpriority(kg);
- }
- kg->kg_slptime = 0;
- setrunqueue(td, SRQ_BORING);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ if (ts->ts_slptime > 1) {
+ updatepri(td);
+ resetpriority(td);
+ }
+ td->td_slptick = ticks;
+ ts->ts_slptime = 0;
+ sched_add(td, SRQ_BORING);
}
#ifdef SMP
@@ -1123,41 +1076,50 @@
sched_add(struct thread *td, int flags)
#ifdef SMP
{
- struct kse *ke;
+ struct td_sched *ts;
int forwarded = 0;
int cpu;
int single_cpu = 0;
- ke = td->td_kse;
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT(ke->ke_state != KES_ONRUNQ,
- ("sched_add: kse %p (%s) already in run queue", ke,
- ke->ke_proc->p_comm));
- KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
- ("sched_add: process swapped out"));
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
-
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ TD_SET_RUNQ(td);
if (td->td_pinned != 0) {
cpu = td->td_lastcpu;
- ke->ke_runq = &runq_pcpu[cpu];
+ ts->ts_runq = &runq_pcpu[cpu];
single_cpu = 1;
CTR3(KTR_RUNQ,
- "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
- } else if ((ke)->ke_flags & KEF_BOUND) {
+ "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
+ } else if ((ts)->ts_flags & TSF_BOUND) {
/* Find CPU from bound runq */
- KASSERT(SKE_RUNQ_PCPU(ke),("sched_add: bound kse not on cpu runq"));
- cpu = ke->ke_runq - &runq_pcpu[0];
+ KASSERT(SKE_RUNQ_PCPU(ts),("sched_add: bound td_sched not on cpu runq"));
+ cpu = ts->ts_runq - &runq_pcpu[0];
single_cpu = 1;
CTR3(KTR_RUNQ,
- "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
+ "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
} else {
CTR2(KTR_RUNQ,
- "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
+ "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td);
cpu = NOCPU;
- ke->ke_runq = &runq;
+ ts->ts_runq = &runq;
}
if (single_cpu && (cpu != PCPU_GET(cpuid))) {
@@ -1183,25 +1145,33 @@
if ((td->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_add();
- SLOT_USE(td->td_ksegrp);
- runq_add(ke->ke_runq, ke, flags);
- ke->ke_state = KES_ONRUNQ;
+ runq_add(ts->ts_runq, ts, flags);
}
#else /* SMP */
{
- struct kse *ke;
- ke = td->td_kse;
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT(ke->ke_state != KES_ONRUNQ,
- ("sched_add: kse %p (%s) already in run queue", ke,
- ke->ke_proc->p_comm));
- KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
- ("sched_add: process swapped out"));
+ struct td_sched *ts;
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
- CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
- ke->ke_runq = &runq;
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ TD_SET_RUNQ(td);
+ CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
+ ts->ts_runq = &runq;
/*
* If we are yielding (on the way out anyhow)
@@ -1220,9 +1190,7 @@
}
if ((td->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_add();
- SLOT_USE(td->td_ksegrp);
- runq_add(ke->ke_runq, ke, flags);
- ke->ke_state = KES_ONRUNQ;
+ runq_add(ts->ts_runq, ts, flags);
maybe_resched(td);
}
#endif /* SMP */
@@ -1230,13 +1198,13 @@
void
sched_rem(struct thread *td)
{
- struct kse *ke;
+ struct td_sched *ts;
- ke = td->td_kse;
- KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
- ("sched_rem: process swapped out"));
- KASSERT((ke->ke_state == KES_ONRUNQ),
- ("sched_rem: KSE not on run queue"));
+ ts = td->td_sched;
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_rem: thread swapped out"));
+ KASSERT(TD_ON_RUNQ(td),
+ ("sched_rem: thread not on run queue"));
mtx_assert(&sched_lock, MA_OWNED);
CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
@@ -1244,59 +1212,58 @@
if ((td->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_rem();
- SLOT_RELEASE(td->td_ksegrp);
- runq_remove(ke->ke_runq, ke);
-
- ke->ke_state = KES_THREAD;
+ runq_remove(ts->ts_runq, ts);
+ TD_SET_CAN_RUN(td);
}
/*
* Select threads to run.
* Notice that the running threads still consume a slot.
*/
-struct kse *
+struct thread *
sched_choose(void)
{
- struct kse *ke;
+ struct td_sched *ts;
struct runq *rq;
+ mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
- struct kse *kecpu;
+ struct td_sched *kecpu;
rq = &runq;
- ke = runq_choose(&runq);
+ ts = runq_choose(&runq);
kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
- if (ke == NULL ||
+ if (ts == NULL ||
(kecpu != NULL &&
- kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
- CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
+ kecpu->ts_thread->td_priority < ts->ts_thread->td_priority)) {
+ CTR2(KTR_RUNQ, "choosing td_sched %p from pcpu runq %d", kecpu,
PCPU_GET(cpuid));
- ke = kecpu;
+ ts = kecpu;
rq = &runq_pcpu[PCPU_GET(cpuid)];
} else {
- CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
+ CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", ts);
}
#else
rq = &runq;
- ke = runq_choose(&runq);
+ ts = runq_choose(&runq);
#endif
- if (ke != NULL) {
- runq_remove(rq, ke);
- ke->ke_state = KES_THREAD;
-
- KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
- ("sched_choose: process swapped out"));
- }
- return (ke);
+ if (ts) {
+ runq_remove(rq, ts);
+ ts->ts_flags |= TSF_DIDRUN;
+
+ KASSERT(ts->ts_thread->td_flags & TDF_INMEM,
+ ("sched_choose: thread swapped out"));
+ return (ts->ts_thread);
+ }
+ return (PCPU_GET(idlethread));
}
void
sched_userret(struct thread *td)
{
- struct ksegrp *kg;
/*
* XXX we cheat slightly on the locking here to avoid locking in
* the usual case. Setting td_priority here is essentially an
@@ -1308,34 +1275,31 @@
*/
KASSERT((td->td_flags & TDF_BORROWING) == 0,
("thread with borrowed priority returning to userland"));
- kg = td->td_ksegrp;
- if (td->td_priority != kg->kg_user_pri) {
- mtx_lock_spin(&sched_lock);
- td->td_priority = kg->kg_user_pri;
- td->td_base_pri = kg->kg_user_pri;
- mtx_unlock_spin(&sched_lock);
+ if (td->td_priority != td->td_user_pri) {
+ thread_lock(td);
+ td->td_priority = td->td_user_pri;
+ td->td_base_pri = td->td_user_pri;
+ thread_unlock(td);
}
}
void
sched_bind(struct thread *td, int cpu)
{
- struct kse *ke;
+ struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(TD_IS_RUNNING(td),
("sched_bind: cannot bind non-running thread"));
- ke = td->td_kse;
+ ts = td->td_sched;
- ke->ke_flags |= KEF_BOUND;
+ ts->ts_flags |= TSF_BOUND;
#ifdef SMP
- ke->ke_runq = &runq_pcpu[cpu];
+ ts->ts_runq = &runq_pcpu[cpu];
if (PCPU_GET(cpuid) == cpu)
return;
- ke->ke_state = KES_THREAD;
-
mi_switch(SW_VOL, NULL);
#endif
}
@@ -1343,48 +1307,121 @@
void
sched_unbind(struct thread* td)
{
- mtx_assert(&sched_lock, MA_OWNED);
- td->td_kse->ke_flags &= ~KEF_BOUND;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_sched->ts_flags &= ~TSF_BOUND;
}
int
sched_is_bound(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
- return (td->td_kse->ke_flags & KEF_BOUND);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ return (td->td_sched->ts_flags & TSF_BOUND);
}
-int
-sched_load(void)
+void
+sched_relinquish(struct thread *td)
{
- return (sched_tdcnt);
+ thread_lock(td);
+ SCHED_STAT_INC(switch_relinquish);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(td);
}
int
-sched_sizeof_ksegrp(void)
+sched_load(void)
{
- return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
+ return (sched_tdcnt);
}
+
int
sched_sizeof_proc(void)
{
return (sizeof(struct proc));
}
+
int
sched_sizeof_thread(void)
{
- return (sizeof(struct thread) + sizeof(struct kse));
+ return (sizeof(struct thread) + sizeof(struct td_sched));
}
fixpt_t
sched_pctcpu(struct thread *td)
{
- struct kse *ke;
+ struct td_sched *ts;
- ke = td->td_kse;
- return (ke->ke_pctcpu);
+ ts = td->td_sched;
+ return (ts->ts_pctcpu);
+}
- return (0);
+void
+sched_tick(void)
+{
+}
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+ struct proc *p;
+ struct thread *td;
+
+ td = curthread;
+ p = td->td_proc;
+ for (;;) {
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ while (sched_runnable() == 0)
+ cpu_idle();
+
+ mtx_lock_spin(&sched_lock);
+ mi_switch(SW_VOL, NULL);
+ mtx_unlock_spin(&sched_lock);
+ }
+}
+
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
}
+
+void
+sched_fork_exit(struct thread *td)
+{
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ td->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)td;
+ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+}
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
Index: imgact_gzip.c
===================================================================
RCS file: /home/cvs/src/sys/kern/imgact_gzip.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/imgact_gzip.c -L sys/kern/imgact_gzip.c -u -r1.2 -r1.3
--- sys/kern/imgact_gzip.c
+++ sys/kern/imgact_gzip.c
@@ -22,7 +22,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/imgact_gzip.c,v 1.54.2.1 2006/03/16 00:25:32 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/imgact_gzip.c,v 1.55.4.1 2008/01/19 18:15:05 kib Exp $");
#include <sys/param.h>
#include <sys/exec.h>
@@ -239,9 +239,13 @@
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(gz->ip, &aout_sysvec);
+ error = exec_new_vmspace(gz->ip, &aout_sysvec);
vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
vmspace = gz->ip->proc->p_vmspace;
Index: makesyscalls.sh
===================================================================
RCS file: /home/cvs/src/sys/kern/makesyscalls.sh,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/makesyscalls.sh -L sys/kern/makesyscalls.sh -u -r1.1.1.1 -r1.2
--- sys/kern/makesyscalls.sh
+++ sys/kern/makesyscalls.sh
@@ -1,12 +1,13 @@
#! /bin/sh -
# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
-# $FreeBSD: src/sys/kern/makesyscalls.sh,v 1.62 2005/05/30 15:09:15 rwatson Exp $
+# $FreeBSD: src/sys/kern/makesyscalls.sh,v 1.68 2007/07/04 22:38:28 peter Exp $
set -e
# name of compat options:
compat=COMPAT_43
compat4=COMPAT_FREEBSD4
+compat6=COMPAT_FREEBSD6
# output files:
sysnames="syscalls.c"
@@ -18,21 +19,25 @@
syscallprefix="SYS_"
switchname="sysent"
namesname="syscallnames"
+systrace="systrace_args.c"
# tmp files:
+sysaue="sysent.aue.$$"
sysdcl="sysent.dcl.$$"
syscompat="sysent.compat.$$"
syscompatdcl="sysent.compatdcl.$$"
syscompat4="sysent.compat4.$$"
syscompat4dcl="sysent.compat4dcl.$$"
+syscompat6="sysent.compat6.$$"
+syscompat6dcl="sysent.compat6dcl.$$"
sysent="sysent.switch.$$"
sysinc="sysinc.switch.$$"
sysarg="sysarg.switch.$$"
sysprotoend="sysprotoend.$$"
-trap "rm $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $sysent $sysinc $sysarg $sysprotoend" 0
+trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $sysent $sysinc $sysarg $sysprotoend" 0
-touch $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $sysent $sysinc $sysarg $sysprotoend
+touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $sysent $sysinc $sysarg $sysprotoend
case $# in
0) echo "usage: $0 input-file <config-file>" 1>&2
@@ -58,6 +63,7 @@
}
' < $1 | awk "
BEGIN {
+ sysaue = \"$sysaue\"
sysdcl = \"$sysdcl\"
sysproto = \"$sysproto\"
sysprotoend = \"$sysprotoend\"
@@ -66,6 +72,8 @@
syscompatdcl = \"$syscompatdcl\"
syscompat4 = \"$syscompat4\"
syscompat4dcl = \"$syscompat4dcl\"
+ syscompat6 = \"$syscompat6\"
+ syscompat6dcl = \"$syscompat6dcl\"
sysent = \"$sysent\"
syssw = \"$syssw\"
sysinc = \"$sysinc\"
@@ -73,8 +81,10 @@
sysnames = \"$sysnames\"
syshdr = \"$syshdr\"
sysmk = \"$sysmk\"
+ systrace = \"$systrace\"
compat = \"$compat\"
compat4 = \"$compat4\"
+ compat6 = \"$compat6\"
syscallprefix = \"$syscallprefix\"
switchname = \"$switchname\"
namesname = \"$namesname\"
@@ -91,6 +101,7 @@
printf "\n#ifdef %s\n\n", compat > syscompat
printf "\n#ifdef %s\n\n", compat4 > syscompat4
+ printf "\n#ifdef %s\n\n", compat6 > syscompat6
printf "/*\n * System call names.\n *\n" > sysnames
printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
@@ -102,6 +113,10 @@
printf "# FreeBSD system call names.\n" > sysmk
printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
printf "# $%s$\n", "FreeBSD" > sysmk
+
+ printf "/*\n * System call argument to DTrace register array conversion.\n *\n" > systrace
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > systrace
+ printf " * $%s$\n", "FreeBSD" > systrace
}
NR == 1 {
gsub("[$]FreeBSD: ", "", $0)
@@ -117,10 +132,9 @@
printf "#define\t%s\n\n", sysproto_h > sysarg
printf "#include <sys/signal.h>\n" > sysarg
printf "#include <sys/acl.h>\n" > sysarg
- printf "#include <sys/thr.h>\n" > sysarg
- printf "#include <sys/umtx.h>\n" > sysarg
- printf "#include <posix4/_semaphore.h>\n\n" > sysarg
+ printf "#include <sys/_semaphore.h>\n" > sysarg
printf "#include <sys/ucontext.h>\n\n" > sysarg
+ printf "#include <bsm/audit_kevents.h>\n\n" > sysarg
printf "struct proc;\n\n" > sysarg
printf "struct thread;\n\n" > sysarg
printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
@@ -140,6 +154,11 @@
printf "# created from%s\nMIASM = ", $0 > sysmk
+ printf " * This file is part of the DTrace syscall provider.\n */\n\n" > systrace
+ printf "static void\nsystrace_args(int sysnum, void *params, u_int64_t *uarg, int *n_args)\n{\n" > systrace
+ printf "\tint64_t *iarg = (int64_t *) uarg;\n" > systrace
+ printf "\tswitch (sysnum) {\n" > systrace
+
next
}
NF == 0 || $1 ~ /^;/ {
@@ -155,6 +174,7 @@
print > sysarg
print > syscompat
print > syscompat4
+ print > syscompat6
print > sysnames
savesyscall = syscall
next
@@ -165,6 +185,7 @@
print > sysarg
print > syscompat
print > syscompat4
+ print > syscompat6
print > sysnames
syscall = savesyscall
next
@@ -175,6 +196,7 @@
print > sysarg
print > syscompat
print > syscompat4
+ print > syscompat6
print > sysnames
next
}
@@ -243,6 +265,8 @@
argalias = "o" argalias
if ($3 == "COMPAT4")
argalias = "freebsd4_" argalias
+ if ($3 == "COMPAT6")
+ argalias = "freebsd6_" argalias
}
f++
@@ -288,41 +312,28 @@
auditev = $2;
}
- # The 'M' type prefix
- #
- {
- mpsafe = "SYF_MPSAFE | ";
- if ($3 == "MSTD") {
- $3 = "STD";
- } else if ($3 == "MNODEF") {
- $3 = "NODEF";
- } else if ($3 == "MNOARGS") {
- $3 = "NOARGS";
- } else if ($3 == "MNOPROTO") {
- $3 = "NOPROTO";
- } else if ($3 == "MNOIMPL") {
- $3 = "NOIMPL";
- } else if ($3 == "MNOSTD") {
- $3 = "NOSTD";
- } else if ($3 == "MCOMPAT") {
- $3 = "COMPAT";
- } else if ($3 == "MCOMPAT4") {
- $3 = "COMPAT4";
- } else if ($3 == "MCPT_NOA") {
- $3 = "CPT_NOA";
- } else if ($3 == "MLIBCOMPAT") {
- $3 = "LIBCOMPAT";
- } else if ($3 == "MOBSOL") {
- $3 = "OBSOL";
- } else if ($3 == "MUNIMPL") {
- $3 = "UNIMPL";
- } else {
- mpsafe = "";
- }
- }
$3 == "STD" || $3 == "NODEF" || $3 == "NOARGS" || $3 == "NOPROTO" \
|| $3 == "NOIMPL" || $3 == "NOSTD" {
parseline()
+ printf("\t/* %s */\n\tcase %d: {\n", funcname, syscall) > systrace
+ if (argc > 0) {
+ printf("\t\tstruct %s *p = params;\n", argalias) > systrace
+ for (i = 1; i <= argc; i++) {
+ if (index(argtype[i], "*") > 0 || argtype[i] == "caddr_t")
+ printf("\t\tuarg[%d] = (intptr_t) p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ else if (substr(argtype[i], 1, 1) == "u" || argtype[i] == "size_t")
+ printf("\t\tuarg[%d] = p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ else
+ printf("\t\tiarg[%d] = p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ }
+ }
+ printf("\t\t*n_args = %d;\n\t\tbreak;\n\t}\n", argc) > systrace
if ((!nosys || funcname != "nosys") && \
(funcname != "lkmnosys") && (funcname != "lkmressys")) {
if (argc != 0 && $3 != "NOARGS" && $3 != "NOPROTO") {
@@ -347,21 +358,23 @@
printf("%s\t%s(struct thread *, struct %s *)",
rettype, funcname, argalias) > sysdcl
printf(";\n") > sysdcl
+ printf("#define\t%sAUE_%s\t%s\n", syscallprefix,
+ funcalias, auditev) > sysaue
}
if (funcname == "nosys")
nosys = 1
if (funcname == "lkmnosys")
lkmnosys = 1
- printf("\t{ %s%s, (sy_call_t *)", mpsafe, argssize) > sysent
- column = 8 + 2 + length(mpsafe) + length(argssize) + 15
+ printf("\t{ %s, (sy_call_t *)", argssize) > sysent
+ column = 8 + 2 + length(argssize) + 15
if ($3 == "NOIMPL") {
- printf("%s },", "nosys, AUE_NULL") > sysent
+ printf("%s },", "nosys, AUE_NULL, NULL, 0, 0") > sysent
column = column + length("nosys") + 3
} else if ($3 == "NOSTD") {
- printf("%s },", "lkmressys, AUE_NULL") > sysent
+ printf("%s },", "lkmressys, AUE_NULL, NULL, 0, 0") > sysent
column = column + length("lkmressys") + 3
} else {
- printf("%s, %s },", funcname, auditev) > sysent
+ printf("%s, %s, NULL, 0, 0 },", funcname, auditev) > sysent
column = column + length(funcname) + length(auditev) + 3
}
align_sysent_comment(column)
@@ -376,7 +389,7 @@
syscall++
next
}
- $3 == "COMPAT" || $3 == "COMPAT4" || $3 == "CPT_NOA" {
+ $3 == "COMPAT" || $3 == "COMPAT4" || $3 == "COMPAT6" || $3 == "CPT_NOA" {
if ($3 == "COMPAT" || $3 == "CPT_NOA") {
ncompat++
out = syscompat
@@ -389,6 +402,12 @@
outdcl = syscompat4dcl
wrap = "compat4"
prefix = "freebsd4_"
+ } else if ($3 == "COMPAT6") {
+ ncompat6++
+ out = syscompat6
+ outdcl = syscompat6dcl
+ wrap = "compat6"
+ prefix = "freebsd6_"
}
parseline()
if (argc != 0 && $3 != "CPT_NOA") {
@@ -406,15 +425,21 @@
argalias) > sysarg
printf("%s\t%s%s(struct thread *, struct %s *);\n",
rettype, prefix, funcname, argalias) > outdcl
- printf("\t{ %s(%s%s,%s), %s },",
- wrap, mpsafe, argssize, funcname, auditev) > sysent
- align_sysent_comment(8 + 9 + length(mpsafe) + \
+ printf("\t{ %s(%s,%s), %s, NULL, 0, 0 },",
+ wrap, argssize, funcname, auditev) > sysent
+ align_sysent_comment(8 + 9 + \
length(argssize) + 1 + length(funcname) + length(auditev) + 4)
printf("/* %d = old %s */\n", syscall, funcalias) > sysent
- printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
- funcalias, syscall, funcalias) > sysnames
- printf("\t\t\t\t/* %d is old %s */\n",
- syscall, funcalias) > syshdr
+ printf("\t\"%s.%s\",\t\t/* %d = old %s */\n",
+ wrap, funcalias, syscall, funcalias) > sysnames
+ if ($3 == "COMPAT" || $3 == "CPT_NOA") {
+ printf("\t\t\t\t/* %d is old %s */\n",
+ syscall, funcalias) > syshdr
+ } else {
+ printf("#define\t%s%s%s\t%d\n", syscallprefix,
+ prefix, funcalias, syscall) > syshdr
+ printf(" \\\n\t%s%s.o", prefix, funcalias) > sysmk
+ }
syscall++
next
}
@@ -422,9 +447,9 @@
ncompat++
parseline()
printf("%s\to%s();\n", rettype, funcname) > syscompatdcl
- printf("\t{ compat(%s%s,%s), %s },",
- mpsafe, argssize, funcname, auditev) > sysent
- align_sysent_comment(8 + 9 + length(mpsafe) + \
+ printf("\t{ compat(%s,%s), %s, NULL, 0, 0 },",
+ argssize, funcname, auditev) > sysent
+ align_sysent_comment(8 + 9 + \
length(argssize) + 1 + length(funcname) + length(auditev) + 4)
printf("/* %d = old %s */\n", syscall, funcalias) > sysent
printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
@@ -436,7 +461,7 @@
next
}
$3 == "OBSOL" {
- printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL },") > sysent
+ printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },") > sysent
align_sysent_comment(34)
printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
@@ -447,7 +472,7 @@
next
}
$3 == "UNIMPL" {
- printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL },\t\t\t/* %d = %s */\n",
+ printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },\t\t\t/* %d = %s */\n",
syscall, comment) > sysent
printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
syscall, syscall, comment) > sysnames
@@ -461,7 +486,7 @@
END {
printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
- if (ncompat != 0 || ncompat4 != 0)
+ if (ncompat != 0 || ncompat4 != 0 || ncompat6 != 0)
printf "#include \"opt_compat.h\"\n\n" > syssw
printf "#include \<bsm/audit_kevents.h\>\n" > syssw
@@ -481,11 +506,19 @@
printf "#endif\n" > sysinc
}
- printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
+ if (ncompat6 != 0) {
+ printf "\n#ifdef %s\n", compat6 > sysinc
+ printf "#define compat6(n, name) n, (sy_call_t *)__CONCAT(freebsd6_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat6(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+ printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl
+ printf("\n#endif /* %s */\n\n", compat6) > syscompat6dcl
- printf("#undef PAD_\n") > sysprotoend
+ printf("\n#undef PAD_\n") > sysprotoend
printf("#undef PADL_\n") > sysprotoend
printf("#undef PADR_\n") > sysprotoend
printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
@@ -495,11 +528,13 @@
printf("};\n") > sysnames
printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
> syshdr
+ printf "\tdefault:\n\t\t*n_args = 0;\n\t\tbreak;\n\t};\n}\n" > systrace
} '
cat $sysinc $sysent >> $syssw
cat $sysarg $sysdcl \
$syscompat $syscompatdcl \
$syscompat4 $syscompat4dcl \
- $sysprotoend > $sysproto
+ $syscompat6 $syscompat6dcl \
+ $sysaue $sysprotoend > $sysproto
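
The net effect of the makesyscalls.sh changes above: every generated sysent entry loses the old mpsafe prefix and gains three trailing fields after the audit event, a per-syscall AUE_* define is emitted through the new $sysaue section of sysproto.h, and a systrace argument-marshalling switch is produced from the same parse. A minimal sketch of the generated output for one hypothetical entry, reconstructed from the printf formats above (the syscall number, name, AUE_READ event and the usual SYS_ prefix are illustrative assumptions, not read from syscalls.master):

	/* sysent[] entry, new format with three extra fields: */
	{ AS(read_args), (sy_call_t *)read, AUE_READ, NULL, 0, 0 },	/* 3 = read */

	/* audit-event define emitted through $sysaue into sysproto.h: */
	#define	SYS_AUE_read	AUE_READ

	/* argument marshalling emitted into the $systrace output file: */
	/* read */
	case 3: {
		struct read_args *p = params;
		iarg[0] = p->fd; /* int */
		uarg[1] = (intptr_t) p->buf; /* void * */
		uarg[2] = p->nbyte; /* size_t */
		*n_args = 3;
		break;
	}
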
--- /dev/null
+++ sys/kern/serdev_if.m
@@ -0,0 +1,94 @@
+#-
+# Copyright (c) 2006 Marcel Moolenaar
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: src/sys/kern/serdev_if.m,v 1.2 2006/04/23 22:12:39 marcel Exp $
+#
+
+#include <sys/bus.h>
+#include <sys/serial.h>
+
+# The serdev interface is used by umbrella drivers and children thereof to
+# establish a more intimate relationship, necessary for efficient handling
+# of multiple (concurrent) serial communication channels. Examples include
+# serial communications controller (SCC) drivers, multi-I/O adapter drivers
+# and intelligent multi-port serial drivers. Methods specifically deal
+# with interrupt handling and configuration. Conceptually, the umbrella
+# driver is responsible for the overall operation of the hardware and uses
+# child drivers to handle each individual channel.
+# The serdev interface is intended to inherit the device interface.
+
+INTERFACE serdev;
+
+# Default implementations of some methods.
+CODE {
+ static serdev_intr_t *
+ default_ihand(device_t dev, int ipend)
+ {
+ return (NULL);
+ }
+
+ static int
+ default_ipend(device_t dev)
+ {
+ return (-1);
+ }
+
+ static int
+ default_sysdev(device_t dev)
+ {
+ return (0);
+ }
+};
+
+# ihand() - Query serial device interrupt handler.
+# This method is called by the umbrella driver to obtain function pointers
+# to interrupt handlers for each individual interrupt source. This allows
+# the umbrella driver to control the servicing of interrupts between the
+# different channels in the most flexible way.
+METHOD serdev_intr_t* ihand {
+ device_t dev;
+ int ipend;
+} DEFAULT default_ihand;
+
+# ipend() - Query pending interrupt status.
+# This method is called by the umbrella driver to obtain interrupt status
+# for the UART in question. This allows the umbrella driver to build a
+# matrix and service the interrupts in the most flexible way by calling
+# interrupt handlers collected with the ihand() method.
+METHOD int ipend {
+ device_t dev;
+} DEFAULT default_ipend;
+
+# sysdev() - Query system device status
+# This method may be called by the umbrella driver for each child driver
+# to establish whether a particular channel and mode is currently in use
+# for a system-specific purpose. If so, the hardware is not reset and
+# the channel will not change its operation mode.
+# The return value is !0 if the channel and mode are used for a system
+# device and 0 otherwise.
+METHOD int sysdev {
+ device_t dev;
+} DEFAULT default_sysdev;
+
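
As a rough illustration of how the new interface is meant to be consumed, here is a hypothetical umbrella-driver fragment (not part of this commit). The uppercase SERDEV_*() wrappers are the macros the kobj .m compiler generates for the methods declared above; MAX_SRC, the bit layout of the ipend argument and the surrounding driver are assumptions, and dispatching the collected handlers is elided because serdev_intr_t's exact signature is defined in <sys/serial.h>, not here.

	/* Assumes <sys/bus.h>, <sys/serial.h> and "serdev_if.h" are included. */
	#define	MAX_SRC	4				/* illustrative */

	static serdev_intr_t *chan_ihand[MAX_SRC];	/* collected handlers */

	static int
	umbrella_setup_channel(device_t child)
	{
		int src;

		/* Per the sysdev() contract, leave system (console) channels alone. */
		if (SERDEV_SYSDEV(child))
			return (0);

		/* Collect the child's handler, if any, for each interrupt source. */
		for (src = 0; src < MAX_SRC; src++)
			chan_ihand[src] = SERDEV_IHAND(child, 1 << src);

		/*
		 * At interrupt time the umbrella would call SERDEV_IPEND(child)
		 * and run the matching chan_ihand[] entries.
		 */
		return (0);
	}
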
Index: kern_linker.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_linker.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_linker.c -L sys/kern/kern_linker.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_linker.c
+++ sys/kern/kern_linker.c
@@ -25,9 +25,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_linker.c,v 1.117.2.1 2005/11/04 17:05:13 jdp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_linker.c,v 1.149 2007/05/31 11:51:51 kib Exp $");
#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
#include "opt_mac.h"
#include <sys/param.h>
@@ -36,31 +37,57 @@
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
-#include <sys/mac.h>
#include <sys/module.h>
+#include <sys/mount.h>
#include <sys/linker.h>
#include <sys/fcntl.h>
#include <sys/libkern.h>
#include <sys/namei.h>
#include <sys/vnode.h>
+#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
+#include <security/mac/mac_framework.h>
+
#include "linker_if.h"
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
#ifdef KLD_DEBUG
int kld_debug = 0;
#endif
+#define KLD_LOCK() sx_xlock(&kld_sx)
+#define KLD_UNLOCK() sx_xunlock(&kld_sx)
+#define KLD_LOCKED() sx_xlocked(&kld_sx)
+#define KLD_LOCK_ASSERT() do { \
+ if (!cold) \
+ sx_assert(&kld_sx, SX_XLOCKED); \
+} while (0)
+
/*
* static char *linker_search_path(const char *name, struct mod_depend
* *verinfo);
*/
static const char *linker_basename(const char *path);
+/*
+ * Find a currently loaded file given its filename.
+ */
+static linker_file_t linker_find_file_by_name(const char* _filename);
+
+/*
+ * Find a currently loaded file given its file id.
+ */
+static linker_file_t linker_find_file_by_id(int _fileid);
+
/* Metadata from the static kernel */
SET_DECLARE(modmetadata_set, struct mod_metadata);
@@ -68,7 +95,7 @@
linker_file_t linker_kernel_file;
-static struct mtx kld_mtx; /* kernel linker mutex */
+static struct sx kld_sx; /* kernel linker lock */
static linker_class_list_t classes;
static linker_file_list_t linker_files;
@@ -78,17 +105,15 @@
#define LINKER_GET_NEXT_FILE_ID(a) do { \
linker_file_t lftmp; \
\
+ KLD_LOCK_ASSERT(); \
retry: \
- mtx_lock(&kld_mtx); \
TAILQ_FOREACH(lftmp, &linker_files, link) { \
if (next_file_id == lftmp->id) { \
next_file_id++; \
- mtx_unlock(&kld_mtx); \
goto retry; \
} \
} \
(a) = next_file_id; \
- mtx_unlock(&kld_mtx); /* Hold for safe read of id variable */ \
} while(0)
@@ -103,8 +128,14 @@
typedef struct modlist *modlist_t;
static modlisthead_t found_modules;
-static modlist_t modlist_lookup2(const char *name,
- struct mod_depend *verinfo);
+static int linker_file_add_dependency(linker_file_t file,
+ linker_file_t dep);
+static caddr_t linker_file_lookup_symbol_internal(linker_file_t file,
+ const char* name, int deps);
+static int linker_load_module(const char *kldname,
+ const char *modname, struct linker_file *parent,
+ struct mod_depend *verinfo, struct linker_file **lfpp);
+static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo);
static char *
linker_strdup(const char *str)
@@ -120,7 +151,7 @@
linker_init(void *arg)
{
- mtx_init(&kld_mtx, "kernel linker", NULL, MTX_DEF);
+ sx_init(&kld_sx, "kernel linker");
TAILQ_INIT(&classes);
TAILQ_INIT(&linker_files);
}
@@ -166,7 +197,7 @@
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
- *
+ *
* Since some things care about execution order, this is the operation
* which ensures continued function.
*/
@@ -186,6 +217,7 @@
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
+ mtx_lock(&Giant);
for (sipp = start; sipp < stop; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s) */
@@ -193,6 +225,7 @@
/* Call function */
(*((*sipp)->func)) ((*sipp)->udata);
}
+ mtx_unlock(&Giant);
}
static void
@@ -210,7 +243,7 @@
/*
* Perform a reverse bubble sort of the system initialization objects
* by their subsystem (primary key) and order (secondary key).
- *
+ *
* Since some things care about execution order, this is the operation
* which ensures continued function.
*/
@@ -230,6 +263,7 @@
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
+ mtx_lock(&Giant);
for (sipp = start; sipp < stop; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s) */
@@ -237,6 +271,7 @@
/* Call function */
(*((*sipp)->func)) ((*sipp)->udata);
}
+ mtx_unlock(&Giant);
}
static void
@@ -251,8 +286,10 @@
if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
return;
+ mtx_lock(&Giant);
for (oidp = start; oidp < stop; oidp++)
sysctl_register_oid(*oidp);
+ mtx_unlock(&Giant);
}
static void
@@ -266,8 +303,10 @@
if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
return;
+ mtx_lock(&Giant);
for (oidp = start; oidp < stop; oidp++)
sysctl_unregister_oid(*oidp);
+ mtx_unlock(&Giant);
}
static int
@@ -281,7 +320,7 @@
" in %s\n", lf->filename));
if (linker_file_lookup_set(lf, "modmetadata_set", &start,
- &stop, 0) != 0) {
+ &stop, NULL) != 0) {
/*
* This fallback should be unnecessary, but if we get booted
* from boot2 instead of loader and we are missing our
@@ -325,22 +364,23 @@
{
linker_class_t lc;
linker_file_t lf;
- int foundfile, error = 0;
+ int foundfile, error;
/* Refuse to load modules if securelevel raised */
if (securelevel > 0)
return (EPERM);
+ KLD_LOCK_ASSERT();
lf = linker_find_file_by_name(filename);
if (lf) {
KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
" incrementing refs\n", filename));
*result = lf;
lf->refs++;
- goto out;
+ return (0);
}
- lf = NULL;
foundfile = 0;
+ error = 0;
/*
* We do not need to protect (lock) classes here because there is
@@ -361,14 +401,15 @@
error = linker_file_register_modules(lf);
if (error == EEXIST) {
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
- goto out;
+ return (error);
}
+ KLD_UNLOCK();
linker_file_register_sysctls(lf);
linker_file_sysinit(lf);
+ KLD_LOCK();
lf->flags |= LINKER_FILE_LINKED;
*result = lf;
- error = 0;
- goto out;
+ return (0);
}
}
/*
@@ -388,7 +429,6 @@
error = ENOEXEC;
} else
error = ENOENT; /* Nothing found */
-out:
return (error);
}
@@ -397,67 +437,107 @@
linker_file_t *result)
{
modlist_t mod;
+ int error;
+ KLD_LOCK();
if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
*result = mod->container;
(*result)->refs++;
+ KLD_UNLOCK();
return (0);
}
- return (linker_load_module(NULL, modname, NULL, verinfo, result));
+ error = linker_load_module(NULL, modname, NULL, verinfo, result);
+ KLD_UNLOCK();
+ return (error);
}
-linker_file_t
+int
+linker_release_module(const char *modname, struct mod_depend *verinfo,
+ linker_file_t lf)
+{
+ modlist_t mod;
+ int error;
+
+ KLD_LOCK();
+ if (lf == NULL) {
+ KASSERT(modname != NULL,
+ ("linker_release_module: no file or name"));
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod == NULL) {
+ KLD_UNLOCK();
+ return (ESRCH);
+ }
+ lf = mod->container;
+ } else
+ KASSERT(modname == NULL && verinfo == NULL,
+ ("linker_release_module: both file and name"));
+ error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL);
+ KLD_UNLOCK();
+ return (error);
+}
+
+static linker_file_t
linker_find_file_by_name(const char *filename)
{
- linker_file_t lf = 0;
+ linker_file_t lf;
char *koname;
koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
- if (koname == NULL)
- goto out;
sprintf(koname, "%s.ko", filename);
- mtx_lock(&kld_mtx);
+ KLD_LOCK_ASSERT();
TAILQ_FOREACH(lf, &linker_files, link) {
if (strcmp(lf->filename, koname) == 0)
break;
if (strcmp(lf->filename, filename) == 0)
break;
}
- mtx_unlock(&kld_mtx);
-out:
- if (koname)
- free(koname, M_LINKER);
+ free(koname, M_LINKER);
return (lf);
}
-linker_file_t
+static linker_file_t
linker_find_file_by_id(int fileid)
{
- linker_file_t lf = 0;
-
- mtx_lock(&kld_mtx);
+ linker_file_t lf;
+
+ KLD_LOCK_ASSERT();
TAILQ_FOREACH(lf, &linker_files, link)
- if (lf->id == fileid)
+ if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED)
break;
- mtx_unlock(&kld_mtx);
return (lf);
}
+int
+linker_file_foreach(linker_predicate_t *predicate, void *context)
+{
+ linker_file_t lf;
+ int retval = 0;
+
+ KLD_LOCK();
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ retval = predicate(lf, context);
+ if (retval != 0)
+ break;
+ }
+ KLD_UNLOCK();
+ return (retval);
+}
+
linker_file_t
linker_make_file(const char *pathname, linker_class_t lc)
{
linker_file_t lf;
const char *filename;
- lf = NULL;
+ KLD_LOCK_ASSERT();
filename = linker_basename(pathname);
KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename));
lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
if (lf == NULL)
- goto out;
+ return (NULL);
lf->refs = 1;
lf->userrefs = 0;
lf->flags = 0;
@@ -467,10 +547,7 @@
lf->deps = NULL;
STAILQ_INIT(&lf->common);
TAILQ_INIT(&lf->modules);
- mtx_lock(&kld_mtx);
TAILQ_INSERT_TAIL(&linker_files, lf, link);
- mtx_unlock(&kld_mtx);
-out:
return (lf);
}
@@ -482,66 +559,59 @@
struct common_symbol *cp;
int error, i;
- error = 0;
-
/* Refuse to unload modules if securelevel raised. */
if (securelevel > 0)
return (EPERM);
-#ifdef MAC
- error = mac_check_kld_unload(curthread->td_ucred);
- if (error)
- return (error);
-#endif
+ KLD_LOCK_ASSERT();
KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
- if (file->refs == 1) {
- KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
- " informing modules\n"));
+
+ /* Easy case of just dropping a reference. */
+ if (file->refs > 1) {
+ file->refs--;
+ return (0);
+ }
+
+ KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
+ " informing modules\n"));
+
+ /*
+ * Inform any modules associated with this file.
+ */
+ MOD_XLOCK;
+ for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
+ next = module_getfnext(mod);
+ MOD_XUNLOCK;
/*
- * Inform any modules associated with this file.
+ * Give the module a chance to veto the unload.
*/
- MOD_XLOCK;
- for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
- next = module_getfnext(mod);
- MOD_XUNLOCK;
-
- /*
- * Give the module a chance to veto the unload.
- */
- if ((error = module_unload(mod, flags)) != 0) {
- KLD_DPF(FILE, ("linker_file_unload: module %p"
- " vetoes unload\n", mod));
- goto out;
- } else
- MOD_XLOCK;
- module_release(mod);
+ if ((error = module_unload(mod, flags)) != 0) {
+ KLD_DPF(FILE, ("linker_file_unload: module %p"
+ " vetoes unload\n", mod));
+ return (error);
}
- MOD_XUNLOCK;
- }
- file->refs--;
- if (file->refs > 0) {
- goto out;
+ MOD_XLOCK;
+ module_release(mod);
}
- for (ml = TAILQ_FIRST(&found_modules); ml; ml = nextml) {
- nextml = TAILQ_NEXT(ml, link);
+ MOD_XUNLOCK;
+
+ TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) {
if (ml->container == file) {
TAILQ_REMOVE(&found_modules, ml, link);
free(ml, M_LINKER);
}
}
- /*
- * Don't try to run SYSUNINITs if we are unloaded due to a
+ /*
+ * Don't try to run SYSUNINITs if we are unloaded due to a
* link error.
*/
if (file->flags & LINKER_FILE_LINKED) {
linker_file_sysuninit(file);
linker_file_unregister_sysctls(file);
}
- mtx_lock(&kld_mtx);
TAILQ_REMOVE(&linker_files, file, link);
- mtx_unlock(&kld_mtx);
if (file->deps) {
for (i = 0; i < file->ndeps; i++)
@@ -549,9 +619,8 @@
free(file->deps, M_LINKER);
file->deps = NULL;
}
- for (cp = STAILQ_FIRST(&file->common); cp;
- cp = STAILQ_FIRST(&file->common)) {
- STAILQ_REMOVE(&file->common, cp, common_symbol, link);
+ while ((cp = STAILQ_FIRST(&file->common)) != NULL) {
+ STAILQ_REMOVE_HEAD(&file->common, link);
free(cp, M_LINKER);
}
@@ -561,15 +630,15 @@
file->filename = NULL;
}
kobj_delete((kobj_t) file, M_LINKER);
-out:
- return (error);
+ return (0);
}
-int
+static int
linker_file_add_dependency(linker_file_t file, linker_file_t dep)
{
linker_file_t *newdeps;
+ KLD_LOCK_ASSERT();
newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
M_LINKER, M_WAITOK | M_ZERO);
if (newdeps == NULL)
@@ -588,25 +657,51 @@
/*
* Locate a linker set and its contents. This is a helper function to avoid
- * linker_if.h exposure elsewhere. Note: firstp and lastp are really void ***
+ * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **.
+ * This function is used in this file so we can avoid having lots of (void **)
+ * casts.
*/
int
linker_file_lookup_set(linker_file_t file, const char *name,
void *firstp, void *lastp, int *countp)
{
+ int error, locked;
- return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp));
+ locked = KLD_LOCKED();
+ if (!locked)
+ KLD_LOCK();
+ error = LINKER_LOOKUP_SET(file, name, firstp, lastp, countp);
+ if (!locked)
+ KLD_UNLOCK();
+ return (error);
}
caddr_t
linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
{
+ caddr_t sym;
+ int locked;
+
+ locked = KLD_LOCKED();
+ if (!locked)
+ KLD_LOCK();
+ sym = linker_file_lookup_symbol_internal(file, name, deps);
+ if (!locked)
+ KLD_UNLOCK();
+ return (sym);
+}
+
+static caddr_t
+linker_file_lookup_symbol_internal(linker_file_t file, const char *name,
+ int deps)
+{
c_linker_sym_t sym;
linker_symval_t symval;
caddr_t address;
size_t common_size = 0;
int i;
+ KLD_LOCK_ASSERT();
KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n",
file, name, deps));
@@ -627,8 +722,8 @@
}
if (deps) {
for (i = 0; i < file->ndeps; i++) {
- address = linker_file_lookup_symbol(file->deps[i],
- name, 0);
+ address = linker_file_lookup_symbol_internal(
+ file->deps[i], name, 0);
if (address) {
KLD_DPF(SYM, ("linker_file_lookup_symbol:"
" deps value=%p\n", address));
@@ -658,10 +753,6 @@
cp = malloc(sizeof(struct common_symbol)
+ common_size + strlen(name) + 1, M_LINKER,
M_WAITOK | M_ZERO);
- if (cp == NULL) {
- KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n"));
- return (0);
- }
cp->address = (caddr_t)(cp + 1);
cp->name = cp->address + common_size;
strcpy(cp->name, name);
@@ -680,7 +771,7 @@
/*
* DDB Helpers. DDB has to look across multiple files with their own symbol
* tables and string tables.
- *
+ *
* Note that we do not obey list locking protocols here. We really don't need
* DDB to hang because somebody's got the lock held. We'll take the chance
  * that the files list is inconsistent instead.
@@ -745,73 +836,87 @@
/*
* Syscalls.
*/
-/*
- * MPSAFE
- */
int
-kldload(struct thread *td, struct kldload_args *uap)
+kern_kldload(struct thread *td, const char *file, int *fileid)
{
- char *kldname, *modname;
- char *pathname = NULL;
+#ifdef HWPMC_HOOKS
+ struct pmckern_map_in pkm;
+#endif
+ const char *kldname, *modname;
linker_file_t lf;
- int error = 0;
-
- td->td_retval[0] = -1;
-
- mtx_lock(&Giant);
+ int error;
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
- goto out;
-
- if ((error = suser(td)) != 0)
- goto out;
+ return (error);
- pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
- if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0)
- goto out;
+ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0)
+ return (error);
/*
- * If path do not contain qualified name or any dot in it
- * (kldname.ko, or kldname.ver.ko) treat it as interface
+ * If file does not contain a qualified name or any dot in it
+ * (kldname.ko, or kldname.ver.ko) treat it as an interface
* name.
*/
- if (index(pathname, '/') || index(pathname, '.')) {
- kldname = pathname;
+ if (index(file, '/') || index(file, '.')) {
+ kldname = file;
modname = NULL;
} else {
kldname = NULL;
- modname = pathname;
+ modname = file;
}
+
+ KLD_LOCK();
error = linker_load_module(kldname, modname, NULL, NULL, &lf);
if (error)
- goto out;
-
+ goto unlock;
+#ifdef HWPMC_HOOKS
+ pkm.pm_file = lf->filename;
+ pkm.pm_address = (uintptr_t) lf->address;
+ PMC_CALL_HOOK(td, PMC_FN_KLD_LOAD, (void *) &pkm);
+#endif
lf->userrefs++;
- td->td_retval[0] = lf->id;
-out:
- if (pathname)
- free(pathname, M_TEMP);
- mtx_unlock(&Giant);
+ if (fileid != NULL)
+ *fileid = lf->id;
+unlock:
+ KLD_UNLOCK();
return (error);
}
-/*
- * MPSAFE
- */
-static int
+int
+kldload(struct thread *td, struct kldload_args *uap)
+{
+ char *pathname = NULL;
+ int error, fileid;
+
+ td->td_retval[0] = -1;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL);
+ if (error == 0) {
+ error = kern_kldload(td, pathname, &fileid);
+ if (error == 0)
+ td->td_retval[0] = fileid;
+ }
+ free(pathname, M_TEMP);
+ return (error);
+}
+
+int
kern_kldunload(struct thread *td, int fileid, int flags)
{
+#ifdef HWPMC_HOOKS
+ struct pmckern_map_out pkm;
+#endif
linker_file_t lf;
int error = 0;
- mtx_lock(&Giant);
-
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
- goto out;
+ return (error);
- if ((error = suser(td)) != 0)
- goto out;
+ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0)
+ return (error);
+ KLD_LOCK();
lf = linker_find_file_by_id(fileid);
if (lf) {
KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
@@ -822,22 +927,28 @@
printf("kldunload: attempt to unload file that was"
" loaded by the kernel\n");
error = EBUSY;
- goto out;
+ } else {
+#ifdef HWPMC_HOOKS
+ /* Save data needed by hwpmc(4) before unloading. */
+ pkm.pm_address = (uintptr_t) lf->address;
+ pkm.pm_size = lf->size;
+#endif
+ lf->userrefs--;
+ error = linker_file_unload(lf, flags);
+ if (error)
+ lf->userrefs++;
}
- lf->userrefs--;
- error = linker_file_unload(lf, flags);
- if (error)
- lf->userrefs++;
} else
error = ENOENT;
-out:
- mtx_unlock(&Giant);
+
+#ifdef HWPMC_HOOKS
+ if (error == 0)
+ PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm);
+#endif
+ KLD_UNLOCK();
return (error);
}
-/*
- * MPSAFE
- */
int
kldunload(struct thread *td, struct kldunload_args *uap)
{
@@ -845,9 +956,6 @@
return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL));
}
-/*
- * MPSAFE
- */
int
kldunloadf(struct thread *td, struct kldunloadf_args *uap)
{
@@ -858,16 +966,13 @@
return (kern_kldunload(td, uap->fileid, uap->flags));
}
-/*
- * MPSAFE
- */
int
kldfind(struct thread *td, struct kldfind_args *uap)
{
char *pathname;
const char *filename;
linker_file_t lf;
- int error = 0;
+ int error;
#ifdef MAC
error = mac_check_kld_stat(td->td_ucred);
@@ -875,7 +980,6 @@
return (error);
#endif
- mtx_lock(&Giant);
td->td_retval[0] = -1;
pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
@@ -883,21 +987,18 @@
goto out;
filename = linker_basename(pathname);
+ KLD_LOCK();
lf = linker_find_file_by_name(filename);
if (lf)
td->td_retval[0] = lf->id;
else
error = ENOENT;
+ KLD_UNLOCK();
out:
- if (pathname)
- free(pathname, M_TEMP);
- mtx_unlock(&Giant);
+ free(pathname, M_TEMP);
return (error);
}
-/*
- * MPSAFE
- */
int
kldnext(struct thread *td, struct kldnext_args *uap)
{
@@ -910,40 +1011,46 @@
return (error);
#endif
- mtx_lock(&Giant);
-
- if (uap->fileid == 0) {
- mtx_lock(&kld_mtx);
- if (TAILQ_FIRST(&linker_files))
- td->td_retval[0] = TAILQ_FIRST(&linker_files)->id;
- else
- td->td_retval[0] = 0;
- mtx_unlock(&kld_mtx);
- goto out;
+ KLD_LOCK();
+ if (uap->fileid == 0)
+ lf = TAILQ_FIRST(&linker_files);
+ else {
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ lf = TAILQ_NEXT(lf, link);
}
- lf = linker_find_file_by_id(uap->fileid);
- if (lf) {
- if (TAILQ_NEXT(lf, link))
- td->td_retval[0] = TAILQ_NEXT(lf, link)->id;
- else
- td->td_retval[0] = 0;
- } else
- error = ENOENT;
+
+ /* Skip partially loaded files. */
+ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED))
+ lf = TAILQ_NEXT(lf, link);
+
+ if (lf)
+ td->td_retval[0] = lf->id;
+ else
+ td->td_retval[0] = 0;
out:
- mtx_unlock(&Giant);
+ KLD_UNLOCK();
return (error);
}
-/*
- * MPSAFE
- */
int
kldstat(struct thread *td, struct kldstat_args *uap)
{
+ struct kld_file_stat stat;
linker_file_t lf;
- int error = 0;
- int namelen, version;
- struct kld_file_stat *stat;
+ int error, namelen;
+
+ /*
+ * Check the version of the user's structure.
+ */
+ error = copyin(uap->stat, &stat, sizeof(struct kld_file_stat));
+ if (error)
+ return (error);
+ if (stat.version != sizeof(struct kld_file_stat))
+ return (EINVAL);
#ifdef MAC
error = mac_check_kld_stat(td->td_ucred);
@@ -951,48 +1058,28 @@
return (error);
#endif
- mtx_lock(&Giant);
-
+ KLD_LOCK();
lf = linker_find_file_by_id(uap->fileid);
if (lf == NULL) {
- error = ENOENT;
- goto out;
+ KLD_UNLOCK();
+ return (ENOENT);
}
- stat = uap->stat;
- /*
- * Check the version of the user's structure.
- */
- if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
- goto out;
- if (version != sizeof(struct kld_file_stat)) {
- error = EINVAL;
- goto out;
- }
namelen = strlen(lf->filename) + 1;
if (namelen > MAXPATHLEN)
namelen = MAXPATHLEN;
- if ((error = copyout(lf->filename, &stat->name[0], namelen)) != 0)
- goto out;
- if ((error = copyout(&lf->refs, &stat->refs, sizeof(int))) != 0)
- goto out;
- if ((error = copyout(&lf->id, &stat->id, sizeof(int))) != 0)
- goto out;
- if ((error = copyout(&lf->address, &stat->address,
- sizeof(caddr_t))) != 0)
- goto out;
- if ((error = copyout(&lf->size, &stat->size, sizeof(size_t))) != 0)
- goto out;
+ bcopy(lf->filename, &stat.name[0], namelen);
+ stat.refs = lf->refs;
+ stat.id = lf->id;
+ stat.address = lf->address;
+ stat.size = lf->size;
+ KLD_UNLOCK();
td->td_retval[0] = 0;
-out:
- mtx_unlock(&Giant);
- return (error);
+
+ return (copyout(&stat, uap->stat, sizeof(struct kld_file_stat)));
}
-/*
- * MPSAFE
- */
int
kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
{
@@ -1006,7 +1093,7 @@
return (error);
#endif
- mtx_lock(&Giant);
+ KLD_LOCK();
lf = linker_find_file_by_id(uap->fileid);
if (lf) {
MOD_SLOCK;
@@ -1018,13 +1105,10 @@
MOD_SUNLOCK;
} else
error = ENOENT;
- mtx_unlock(&Giant);
+ KLD_UNLOCK();
return (error);
}
-/*
- * MPSAFE
- */
int
kldsym(struct thread *td, struct kldsym_args *uap)
{
@@ -1041,25 +1125,20 @@
return (error);
#endif
- mtx_lock(&Giant);
-
if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0)
- goto out;
+ return (error);
if (lookup.version != sizeof(lookup) ||
- uap->cmd != KLDSYM_LOOKUP) {
- error = EINVAL;
- goto out;
- }
+ uap->cmd != KLDSYM_LOOKUP)
+ return (EINVAL);
symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
goto out;
+ KLD_LOCK();
if (uap->fileid != 0) {
lf = linker_find_file_by_id(uap->fileid);
- if (lf == NULL) {
+ if (lf == NULL)
error = ENOENT;
- goto out;
- }
- if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
lookup.symvalue = (uintptr_t) symval.value;
lookup.symsize = symval.size;
@@ -1067,7 +1146,6 @@
} else
error = ENOENT;
} else {
- mtx_lock(&kld_mtx);
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
@@ -1078,14 +1156,12 @@
break;
}
}
- mtx_unlock(&kld_mtx);
if (lf == NULL)
error = ENOENT;
}
+ KLD_UNLOCK();
out:
- if (symstr)
- free(symstr, M_TEMP);
- mtx_unlock(&Giant);
+ free(symstr, M_TEMP);
return (error);
}
@@ -1115,8 +1191,7 @@
if (verinfo == NULL)
return (modlist_lookup(name, 0));
bestmod = NULL;
- for (mod = TAILQ_FIRST(&found_modules); mod;
- mod = TAILQ_NEXT(mod, link)) {
+ TAILQ_FOREACH(mod, &found_modules, link) {
if (strcmp(mod->name, name) != 0)
continue;
ver = mod->version;
@@ -1174,7 +1249,7 @@
caddr_t modptr;
const char *modname, *nmodname;
char *modtype;
- linker_file_t lf;
+ linker_file_t lf, nlf;
linker_class_t lc;
int error;
linker_file_list_t loaded_files;
@@ -1228,8 +1303,8 @@
linker_addmodules(linker_kernel_file, start, stop, 1);
/*
- * this is a once-off kinky bubble sort resolve relocation dependency
- * requirements
+ * This is a once-off kinky bubble sort to resolve relocation
+ * dependency requirements.
*/
restart:
TAILQ_FOREACH(lf, &loaded_files, loaded) {
@@ -1257,7 +1332,7 @@
}
if (nmdp < stop) /* it's a self reference */
continue;
-
+
/*
* ok, the module isn't here yet, we
* are not finished
@@ -1284,10 +1359,10 @@
nver) != NULL) {
printf("module %s already"
" present!\n", modname);
- linker_file_unload(lf,
- LINKER_UNLOAD_FORCE);
TAILQ_REMOVE(&loaded_files,
lf, loaded);
+ linker_file_unload(lf,
+ LINKER_UNLOAD_FORCE);
/* we changed tailq next ptr */
goto restart;
}
@@ -1309,16 +1384,16 @@
/*
* At this point, we check to see what could not be resolved..
*/
- TAILQ_FOREACH(lf, &loaded_files, loaded) {
+ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
printf("KLD file %s is missing dependencies\n", lf->filename);
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
- TAILQ_REMOVE(&loaded_files, lf, loaded);
}
/*
* We made it. Finish off the linking in the order we determined.
*/
- TAILQ_FOREACH(lf, &depended_files, loaded) {
+ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) {
if (linker_kernel_file) {
linker_kernel_file->refs++;
error = linker_file_add_dependency(lf,
@@ -1353,6 +1428,7 @@
*/
error = LINKER_LINK_PRELOAD_FINISH(lf);
if (error) {
+ TAILQ_REMOVE(&depended_files, lf, loaded);
printf("KLD file %s - could not finalize loading\n",
lf->filename);
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
@@ -1372,15 +1448,15 @@
/*
* Search for a not-loaded module by name.
- *
+ *
* Modules may be found in the following locations:
- *
+ *
* - preloaded (result is just the module name) - on disk (result is full path
* to module)
- *
+ *
  * If the module name is qualified in any way (contains path, etc.) then we
* simply return a copy of it.
- *
+ *
* The search path can be manipulated via sysctl. Note that we use the ';'
* character as a separator to be consistent with the bootloader.
*/
@@ -1411,7 +1487,7 @@
struct nameidata nd;
struct thread *td = curthread; /* XXX */
char *result, **cpp, *sep;
- int error, len, extlen, reclen, flags;
+ int error, len, extlen, reclen, flags, vfslocked;
enum vtype type;
extlen = 0;
@@ -1432,16 +1508,18 @@
* Attempt to open the file, and return the path if
* we succeed and it's a regular file.
*/
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, result, td);
flags = FREAD;
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error == 0) {
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
type = nd.ni_vp->v_type;
if (vap)
VOP_GETATTR(nd.ni_vp, vap, td->td_ucred, td);
VOP_UNLOCK(nd.ni_vp, 0, td);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
if (type == VREG)
return (result);
}
@@ -1469,6 +1547,7 @@
u_char *hints = NULL;
u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
int error, ival, bestver, *intp, reclen, found, flags, clen, blen;
+ int vfslocked = 0;
result = NULL;
bestver = found = 0;
@@ -1480,11 +1559,12 @@
snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
linker_hintfile);
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pathbuf, td);
flags = FREAD;
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error)
goto bad;
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp->v_type != VREG)
goto bad;
@@ -1508,6 +1588,7 @@
goto bad;
VOP_UNLOCK(nd.ni_vp, 0, td);
vn_close(nd.ni_vp, FREAD, cred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
nd.ni_vp = NULL;
if (reclen != 0) {
printf("can't read %d\n", reclen);
@@ -1576,6 +1657,7 @@
if (nd.ni_vp != NULL) {
VOP_UNLOCK(nd.ni_vp, 0, td);
vn_close(nd.ni_vp, FREAD, cred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
}
/*
* If nothing found or hints is absent - fallback to the old
@@ -1618,21 +1700,13 @@
static char *
linker_search_kld(const char *name)
{
- char *cp, *ep, *result, **cpp;
- int extlen, len;
+ char *cp, *ep, *result;
+ int len;
/* qualified at all? */
if (index(name, '/'))
return (linker_strdup(name));
- extlen = 0;
- for (cpp = linker_ext_list; *cpp; cpp++) {
- len = strlen(*cpp);
- if (len > extlen)
- extlen = len;
- }
- extlen++; /* trailing '\0' */
-
/* traverse the linker path */
len = strlen(name);
for (ep = linker_path; *ep; ep++) {
@@ -1659,11 +1733,71 @@
return (filename);
}
+#ifdef HWPMC_HOOKS
+
+struct hwpmc_context {
+ int nobjects;
+ int nmappings;
+ struct pmckern_map_in *kobase;
+};
+
+static int
+linker_hwpmc_list_object(linker_file_t lf, void *arg)
+{
+ struct hwpmc_context *hc;
+
+ hc = arg;
+
+ /* If we run out of mappings, fail. */
+ if (hc->nobjects >= hc->nmappings)
+ return (1);
+
+ /* Save the info for this linker file. */
+ hc->kobase[hc->nobjects].pm_file = lf->filename;
+ hc->kobase[hc->nobjects].pm_address = (uintptr_t)lf->address;
+ hc->nobjects++;
+ return (0);
+}
+
+/*
+ * Inform hwpmc about the set of kernel modules currently loaded.
+ */
+void *
+linker_hwpmc_list_objects(void)
+{
+ struct hwpmc_context hc;
+
+ hc.nmappings = 15; /* a reasonable default */
+
+ retry:
+ /* allocate nmappings+1 entries */
+ MALLOC(hc.kobase, struct pmckern_map_in *,
+ (hc.nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER,
+ M_WAITOK | M_ZERO);
+
+ hc.nobjects = 0;
+ if (linker_file_foreach(linker_hwpmc_list_object, &hc) != 0) {
+ hc.nmappings = hc.nobjects;
+ FREE(hc.kobase, M_LINKER);
+ goto retry;
+ }
+
+	KASSERT(hc.nobjects > 0, ("linker_hwpmc_list_objects: no kernel "
+ "objects?"));
+
+	/* The last entry of the malloced area consists of all zeros. */
+ KASSERT(hc.kobase[hc.nobjects].pm_file == NULL,
+ ("linker_hwpmc_list_objects: last object not NULL"));
+
+ return ((void *)hc.kobase);
+}
+#endif
+
/*
* Find a file which contains given module and load it, if "parent" is not
* NULL, register a reference to it.
*/
-int
+static int
linker_load_module(const char *kldname, const char *modname,
struct linker_file *parent, struct mod_depend *verinfo,
struct linker_file **lfpp)
@@ -1673,6 +1807,7 @@
char *pathname;
int error;
+ KLD_LOCK_ASSERT();
if (modname == NULL) {
/*
* We have to load KLD
@@ -1704,11 +1839,9 @@
* provide different versions of the same modules.
*/
filename = linker_basename(pathname);
- if (linker_find_file_by_name(filename)) {
+ if (linker_find_file_by_name(filename))
error = EEXIST;
- goto out;
- }
- do {
+ else do {
error = linker_load_file(pathname, &lfdep);
if (error)
break;
@@ -1726,9 +1859,7 @@
if (lfpp)
*lfpp = lfdep;
} while (0);
-out:
- if (pathname)
- free(pathname, M_LINKER);
+ free(pathname, M_LINKER);
return (error);
}
@@ -1750,6 +1881,7 @@
/*
  * All files are dependent on /kernel.
*/
+ KLD_LOCK_ASSERT();
if (linker_kernel_file) {
linker_kernel_file->refs++;
error = linker_file_add_dependency(lf, linker_kernel_file);
@@ -1841,16 +1973,16 @@
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
- mtx_lock(&kld_mtx);
+ KLD_LOCK();
TAILQ_FOREACH(lf, &linker_files, link) {
error = LINKER_EACH_FUNCTION_NAME(lf,
sysctl_kern_function_list_iterate, req);
if (error) {
- mtx_unlock(&kld_mtx);
+ KLD_UNLOCK();
return (error);
}
}
- mtx_unlock(&kld_mtx);
+ KLD_UNLOCK();
return (SYSCTL_OUT(req, "", 1));
}
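
One practical consequence of the kern_linker.c rework above is the new linker_file_foreach() iterator, which lets subsystems such as hwpmc walk the loaded-file list without touching the KLD sx lock directly. A minimal sketch of a caller, assuming only the linker_file_foreach()/predicate API visible in the diff; the counting helper itself is hypothetical:

	#include <sys/param.h>
	#include <sys/linker.h>

	/* Predicate: return 0 to keep iterating, non-zero to stop early. */
	static int
	count_one_file(linker_file_t lf, void *arg)
	{
		int *count = arg;

		(*count)++;
		return (0);
	}

	/* Count loaded linker files; the iterator handles KLD locking itself. */
	static int
	count_linker_files(void)
	{
		int count = 0;

		(void)linker_file_foreach(count_one_file, &count);
		return (count);
	}
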
Index: vfs_aio.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_aio.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/vfs_aio.c -L sys/kern/vfs_aio.c -u -r1.3 -r1.4
--- sys/kern/vfs_aio.c
+++ sys/kern/vfs_aio.c
@@ -19,7 +19,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_aio.c,v 1.195.2.2 2005/11/08 16:08:40 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_aio.c,v 1.233.4.1 2008/01/28 10:43:10 dumbbell Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -38,43 +38,54 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
+#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
+#include <sys/sema.h>
+#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
+#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
+#include <sys/mount.h>
+
+#include <machine/atomic.h>
-#include <posix4/posix4.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
#include "opt_vfs_aio.h"
-NET_NEEDS_GIANT("aio");
-
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
- * overflow.
+ * overflow. (XXX will be removed soon.)
+ */
+static u_long jobrefid;
+
+/*
+ * Counter for aio_fsync.
*/
-static long jobrefid;
+static uint64_t jobseqno;
-#define JOBST_NULL 0x0
-#define JOBST_JOBQGLOBAL 0x2
-#define JOBST_JOBRUNNING 0x3
-#define JOBST_JOBFINISHED 0x4
-#define JOBST_JOBQBUF 0x5
-#define JOBST_JOBBFINISHED 0x6
+#define JOBST_NULL 0
+#define JOBST_JOBQSOCK 1
+#define JOBST_JOBQGLOBAL 2
+#define JOBST_JOBRUNNING 3
+#define JOBST_JOBFINISHED 4
+#define JOBST_JOBQBUF 5
+#define JOBST_JOBQSYNC 6
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC 32
@@ -141,7 +152,7 @@
"Number of aio requests presently handled by the buf subsystem");
/* Number of async I/O thread in the process of being started */
-/* XXX This should be local to _aio_aqueue() */
+/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_timeout;
@@ -170,26 +181,70 @@
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process (stored in the process)");
+typedef struct oaiocb {
+ int aio_fildes; /* File descriptor */
+ off_t aio_offset; /* File offset for I/O */
+ volatile void *aio_buf; /* I/O buffer in process space */
+ size_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private _aiocb_private;
+} oaiocb_t;
+
+/*
+ * Below is a key of the locks used to protect each member of struct
+ * aiocblist, aioliojob and kaioinfo, and any backends.
+ *
+ * * - need not be protected
+ * a - locked by the kaioinfo lock
+ * b - locked by the backend lock; the backend lock can be null in some
+ *     cases, for example for BIO, in which case the proc lock is reused
+ *     instead.
+ * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
+ */
+
+/*
+ * Currently there are only two backends: BIO and generic file I/O.
+ * Socket I/O is served by the generic file I/O backend, which is not a
+ * good idea: disk file I/O, and any other type without O_NONBLOCK set,
+ * can block the daemon threads, so if no thread is free to serve socket
+ * I/O, that I/O may be delayed too long or starved.  We should create
+ * threads dedicated to sockets for non-blocking I/O, and the same goes
+ * for pipes and fifos; for these we really need a non-blocking
+ * interface, since fiddling with O_NONBLOCK in the file structure is
+ * not safe because of races between userland and the aio daemons.
+ */
+
struct aiocblist {
- TAILQ_ENTRY(aiocblist) list; /* List of jobs */
- TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */
- int jobflags;
- int jobstate;
- int inputcharge;
- int outputcharge;
- struct buf *bp; /* Buffer pointer */
- struct proc *userproc; /* User process */ /* Not td! */
- struct ucred *cred; /* Active credential when created */
- struct file *fd_file; /* Pointer to file structure */
- struct aio_liojob *lio; /* Optional lio job */
- struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */
- struct knlist klist; /* list of knotes */
- struct aiocb uaiocb; /* Kernel I/O control block */
+ TAILQ_ENTRY(aiocblist) list; /* (b) internal list of for backend */
+ TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */
+ TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */
+ int jobflags; /* (a) job flags */
+ int jobstate; /* (b) job state */
+	int	inputcharge;		/* (*) input blocks */
+	int	outputcharge;		/* (*) output blocks */
+ struct buf *bp; /* (*) private to BIO backend,
+ * buffer pointer
+ */
+ struct proc *userproc; /* (*) user process */
+ struct ucred *cred; /* (*) active credential when created */
+ struct file *fd_file; /* (*) pointer to file structure */
+ struct aioliojob *lio; /* (*) optional lio job */
+ struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */
+ struct knlist klist; /* (a) list of knotes */
+ struct aiocb uaiocb; /* (*) kernel I/O control block */
+ ksiginfo_t ksi; /* (a) realtime signal info */
+ struct task biotask; /* (*) private to BIO backend */
+ uint64_t seqno; /* (*) job number */
+ int pending; /* (a) number of pending I/O, aio_fsync only */
};
/* jobflags */
-#define AIOCBLIST_RUNDOWN 0x4
-#define AIOCBLIST_DONE 0x10
+#define AIOCBLIST_DONE 0x01
+#define AIOCBLIST_BUFDONE 0x02
+#define AIOCBLIST_RUNDOWN 0x04
+#define AIOCBLIST_CHECKSYNC 0x08
/*
* AIO process info
@@ -197,71 +252,95 @@
#define AIOP_FREE 0x1 /* proc on free queue */
struct aiothreadlist {
- int aiothreadflags; /* AIO proc flags */
- TAILQ_ENTRY(aiothreadlist) list; /* List of processes */
- struct thread *aiothread; /* The AIO thread */
+ int aiothreadflags; /* (c) AIO proc flags */
+ TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */
+ struct thread *aiothread; /* (*) the AIO thread */
};
/*
* data-structure for lio signal management
*/
-struct aio_liojob {
- int lioj_flags;
- int lioj_buffer_count;
- int lioj_buffer_finished_count;
- int lioj_queue_count;
- int lioj_queue_finished_count;
- struct sigevent lioj_signal; /* signal on all I/O done */
- TAILQ_ENTRY(aio_liojob) lioj_list;
+struct aioliojob {
+ int lioj_flags; /* (a) listio flags */
+	int	lioj_count;		/* (a) count of jobs in this lio */
+	int	lioj_finished_count;	/* (a) count of finished jobs */
+ struct sigevent lioj_signal; /* (a) signal on all I/O done */
+ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
+ struct knlist klist; /* (a) list of knotes */
+ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
};
+
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
+#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
/*
* per process aio data structure
*/
struct kaioinfo {
- int kaio_flags; /* per process kaio flags */
- int kaio_maxactive_count; /* maximum number of AIOs */
- int kaio_active_count; /* number of currently used AIOs */
- int kaio_qallowed_count; /* maxiumu size of AIO queue */
- int kaio_queue_count; /* size of AIO queue */
- int kaio_ballowed_count; /* maximum number of buffers */
- int kaio_queue_finished_count; /* number of daemon jobs finished */
- int kaio_buffer_count; /* number of physio buffers */
- int kaio_buffer_finished_count; /* count of I/O done */
- TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
- TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */
- TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */
- TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */
- TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */
- TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
+ struct mtx kaio_mtx; /* the lock to protect this struct */
+ int kaio_flags; /* (a) per process kaio flags */
+ int kaio_maxactive_count; /* (*) maximum number of AIOs */
+ int kaio_active_count; /* (c) number of currently used AIOs */
+	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
+ int kaio_count; /* (a) size of AIO queue */
+ int kaio_ballowed_count; /* (*) maximum number of buffers */
+ int kaio_buffer_count; /* (a) number of physio buffers */
+ TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */
+ TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */
+ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
+ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets,
+ * NOT USED YET.
+ */
+ TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */
+ struct task kaio_task; /* (*) task to kick aio threads */
};
+#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
+#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
+#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
+#define AIO_MTX(ki) (&(ki)->kaio_mtx)
+
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
-static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* Idle daemons */
-static struct mtx aio_freeproc_mtx;
-
-static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
+static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */
+static struct sema aio_newproc_sem;
+static struct mtx aio_job_mtx;
+static struct mtx aio_sock_mtx;
+static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
+static struct unrhdr *aiod_unr;
-static void aio_init_aioinfo(struct proc *p);
+void aio_init_aioinfo(struct proc *p);
static void aio_onceonly(void);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
-static int aio_newproc(void);
-static int aio_aqueue(struct thread *td, struct aiocb *job, int type);
+static int aio_newproc(int *);
+int aio_aqueue(struct thread *td, struct aiocb *job,
+ struct aioliojob *lio, int type, int osigev);
static void aio_physwakeup(struct buf *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
-static int aio_fphysio(struct aiocblist *aiocbe);
+static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void aio_daemon(void *uproc);
+static void biohelper(void *, int);
+static void aio_daemon(void *param);
static void aio_swake_cb(struct socket *, struct sockbuf *);
static int aio_unload(void);
+static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
+#define DONE_BUF 1
+#define DONE_QUEUE 2
+static int do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev);
+static int aio_kick(struct proc *userp);
+static void aio_kick_nowait(struct proc *userp);
+static void aio_kick_helper(void *context, int pending);
static int filt_aioattach(struct knote *kn);
static void filt_aiodetach(struct knote *kn);
static int filt_aio(struct knote *kn, long hint);
+static int filt_lioattach(struct knote *kn);
+static void filt_liodetach(struct knote *kn);
+static int filt_lio(struct knote *kn, long hint);
/*
* Zones for:
@@ -276,9 +355,13 @@
/* kqueue filters for aio */
static struct filterops aio_filtops =
{ 0, filt_aioattach, filt_aiodetach, filt_aio };
+static struct filterops lio_filtops =
+ { 0, filt_lioattach, filt_liodetach, filt_lio };
static eventhandler_tag exit_tag, exec_tag;
+TASKQUEUE_DEFINE_THREAD(aiod_bio);
+
/*
* Main operations function for use as a kernel module.
*/
@@ -309,14 +392,18 @@
NULL
};
-SYSCALL_MODULE_HELPER(aio_return);
-SYSCALL_MODULE_HELPER(aio_suspend);
SYSCALL_MODULE_HELPER(aio_cancel);
SYSCALL_MODULE_HELPER(aio_error);
+SYSCALL_MODULE_HELPER(aio_fsync);
SYSCALL_MODULE_HELPER(aio_read);
-SYSCALL_MODULE_HELPER(aio_write);
+SYSCALL_MODULE_HELPER(aio_return);
+SYSCALL_MODULE_HELPER(aio_suspend);
SYSCALL_MODULE_HELPER(aio_waitcomplete);
+SYSCALL_MODULE_HELPER(aio_write);
SYSCALL_MODULE_HELPER(lio_listio);
+SYSCALL_MODULE_HELPER(oaio_read);
+SYSCALL_MODULE_HELPER(oaio_write);
+SYSCALL_MODULE_HELPER(olio_listio);
DECLARE_MODULE(aio, aio_mod,
SI_SUB_VFS, SI_ORDER_ANY);
@@ -333,12 +420,16 @@
aio_swake = &aio_swake_cb;
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
- exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown, NULL,
+ exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
+ kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
- mtx_init(&aio_freeproc_mtx, "aio_freeproc", NULL, MTX_DEF);
+ sema_init(&aio_newproc_sem, 0, "aio_new_proc");
+ mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
+ mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
+ aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
@@ -347,7 +438,7 @@
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL,
+ aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiod_timeout = AIOD_TIMEOUT_DEFAULT;
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
@@ -370,6 +461,9 @@
* XXX: no unloads by default, it's too dangerous.
* perhaps we could do it if locked out callers and then
* did an aio_proc_rundown() on each process.
+ *
+ * jhb: aio_proc_rundown() needs to run on curproc though,
+ * so I don't think that would fly.
*/
if (!unloadable)
return (EOPNOTSUPP);
@@ -377,11 +471,23 @@
error = kqueue_del_filteropts(EVFILT_AIO);
if (error)
return error;
-
+ error = kqueue_del_filteropts(EVFILT_LIO);
+ if (error)
+ return error;
async_io_version = 0;
aio_swake = NULL;
+ taskqueue_free(taskqueue_aiod_bio);
+ delete_unrhdr(aiod_unr);
+ uma_zdestroy(kaio_zone);
+ uma_zdestroy(aiop_zone);
+ uma_zdestroy(aiocb_zone);
+ uma_zdestroy(aiol_zone);
+ uma_zdestroy(aiolio_zone);
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
+ mtx_destroy(&aio_job_mtx);
+ mtx_destroy(&aio_sock_mtx);
+ sema_destroy(&aio_newproc_sem);
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
@@ -392,37 +498,55 @@
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
-static void
+void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
+ mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
ki->kaio_flags = 0;
ki->kaio_maxactive_count = max_aio_per_proc;
ki->kaio_active_count = 0;
ki->kaio_qallowed_count = max_aio_queue_per_proc;
- ki->kaio_queue_count = 0;
+ ki->kaio_count = 0;
ki->kaio_ballowed_count = max_buf_aio;
ki->kaio_buffer_count = 0;
- ki->kaio_buffer_finished_count = 0;
- TAILQ_INIT(&ki->kaio_jobdone);
+ TAILQ_INIT(&ki->kaio_all);
+ TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
- TAILQ_INIT(&ki->kaio_bufdone);
TAILQ_INIT(&ki->kaio_bufqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_sockqueue);
+ TAILQ_INIT(&ki->kaio_syncqueue);
+ TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
+ mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
while (num_aio_procs < target_aio_procs)
- aio_newproc();
+ aio_newproc(NULL);
+}
+
+static int
+aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+ int ret = 0;
+
+ PROC_LOCK(p);
+ if (!KSI_ONQ(ksi)) {
+ ksi->ksi_code = SI_ASYNCIO;
+ ksi->ksi_flags |= KSI_EXT | KSI_INS;
+ ret = psignal_event(p, sigev, ksi);
+ }
+ PROC_UNLOCK(p);
+ return (ret);
}
/*
@@ -434,225 +558,170 @@
aio_free_entry(struct aiocblist *aiocbe)
{
struct kaioinfo *ki;
- struct aio_liojob *lj;
+ struct aioliojob *lj;
struct proc *p;
- int error;
- int s;
-
- if (aiocbe->jobstate == JOBST_NULL)
- panic("aio_free_entry: freeing already free job");
p = aiocbe->userproc;
+ MPASS(curproc == p);
ki = p->p_aioinfo;
- lj = aiocbe->lio;
- if (ki == NULL)
- panic("aio_free_entry: missing p->p_aioinfo");
+ MPASS(ki != NULL);
- while (aiocbe->jobstate == JOBST_JOBRUNNING) {
- aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
- tsleep(aiocbe, PRIBIO, "jobwai", 0);
- }
- if (aiocbe->bp == NULL) {
- if (ki->kaio_queue_count <= 0)
- panic("aio_free_entry: process queue size <= 0");
- if (num_queue_count <= 0)
- panic("aio_free_entry: system wide queue size <= 0");
-
- if (lj) {
- lj->lioj_queue_count--;
- if (aiocbe->jobflags & AIOCBLIST_DONE)
- lj->lioj_queue_finished_count--;
- }
- ki->kaio_queue_count--;
- if (aiocbe->jobflags & AIOCBLIST_DONE)
- ki->kaio_queue_finished_count--;
- num_queue_count--;
- } else {
- if (lj) {
- lj->lioj_buffer_count--;
- if (aiocbe->jobflags & AIOCBLIST_DONE)
- lj->lioj_buffer_finished_count--;
- }
- if (aiocbe->jobflags & AIOCBLIST_DONE)
- ki->kaio_buffer_finished_count--;
- ki->kaio_buffer_count--;
- num_buf_aio--;
- }
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
- /* aiocbe is going away, we need to destroy any knotes */
- /* XXXKSE Note the thread here is used to eventually find the
- * owning process again, but it is also used to do a fo_close
- * and that requires the thread. (but does it require the
- * OWNING thread? (or maybe the running thread?)
- * There is a semantic problem here...
- */
- knlist_delete(&aiocbe->klist, FIRST_THREAD_IN_PROC(p), 0); /* XXXKSE */
+ atomic_subtract_int(&num_queue_count, 1);
- if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
- && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
- ki->kaio_flags &= ~KAIO_WAKEUP;
- wakeup(p);
- }
+ ki->kaio_count--;
+ MPASS(ki->kaio_count >= 0);
- if (aiocbe->jobstate == JOBST_JOBQBUF) {
- if ((error = aio_fphysio(aiocbe)) != 0)
- return (error);
- if (aiocbe->jobstate != JOBST_JOBBFINISHED)
- panic("aio_free_entry: invalid physio finish-up state");
- s = splbio();
- TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
- splx(s);
- } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
- s = splnet();
- TAILQ_REMOVE(&aio_jobs, aiocbe, list);
- TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
- splx(s);
- } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
- TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
- else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
- s = splbio();
- TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
- splx(s);
- if (aiocbe->bp) {
- vunmapbuf(aiocbe->bp);
- relpbuf(aiocbe->bp, NULL);
- aiocbe->bp = NULL;
+ TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
+ TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
+
+ lj = aiocbe->lio;
+ if (lj) {
+ lj->lioj_count--;
+ lj->lioj_finished_count--;
+
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ /* lio is going away, we need to destroy any knotes */
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
}
}
- if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
- TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
- uma_zfree(aiolio_zone, lj);
- }
+
+ /* aiocbe is going away, we need to destroy any knotes */
+ knlist_delete(&aiocbe->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&aiocbe->ksi);
+ PROC_UNLOCK(p);
+
+ MPASS(aiocbe->bp == NULL);
aiocbe->jobstate = JOBST_NULL;
+ AIO_UNLOCK(ki);
+
+ /*
+ * The thread argument here is used to find the owning process
+ * and is also passed to fo_close() which may pass it to various
+ * places such as devsw close() routines. Because of that, we
+ * need a thread pointer from the process owning the job that is
+ * persistent and won't disappear out from under us or move to
+ * another process.
+ *
+ * Currently, all the callers of this function call it to remove
+ * an aiocblist from the current process' job list either via a
+ * syscall or due to the current process calling exit() or
+ * execve(). Thus, we know that p == curproc. We also know that
+ * curthread can't exit since we are curthread.
+ *
+ * Therefore, we use curthread as the thread to pass to
+ * knlist_delete(). This does mean that it is possible for the
+ * thread pointer at close time to differ from the thread pointer
+ * at open time, but this is already true of file descriptors in
+ * a multithreaded process.
+ */
fdrop(aiocbe->fd_file, curthread);
crfree(aiocbe->cred);
uma_zfree(aiocb_zone, aiocbe);
+ AIO_LOCK(ki);
+
return (0);
}
+static void
+aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ aio_proc_rundown(arg, p);
+}
+
/*
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
- int s;
struct kaioinfo *ki;
- struct aio_liojob *lj, *ljn;
- struct aiocblist *aiocbe, *aiocbn;
+ struct aioliojob *lj;
+ struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
+ int remove;
+ KASSERT(curthread->td_proc == p,
+ ("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
- mtx_lock(&Giant);
- ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
- while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
- ki->kaio_buffer_finished_count)) {
- ki->kaio_flags |= KAIO_RUNDOWN;
- if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
- break;
- }
+ AIO_LOCK(ki);
+ ki->kaio_flags |= KAIO_RUNDOWN;
+
+restart:
/*
- * Move any aio ops that are waiting on socket I/O to the normal job
- * queues so they are cleaned up with any others.
+ * Try to cancel all pending requests. This code simulates
+ * aio_cancel on all pending I/O requests.
*/
- s = splnet();
- for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
- aiocbn) {
- aiocbn = TAILQ_NEXT(aiocbe, plist);
- fp = aiocbe->fd_file;
- if (fp != NULL) {
+ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
+ remove = 0;
+ mtx_lock(&aio_job_mtx);
+ if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSOCK) {
+ fp = cbe->fd_file;
+ MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
- TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
- if (TAILQ_EMPTY(&so->so_aiojobq)) {
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags &= ~SB_AIO;
- SOCKBUF_UNLOCK(&so->so_snd);
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_flags &= ~SB_AIO;
- SOCKBUF_UNLOCK(&so->so_rcv);
- }
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSYNC) {
+ TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+ remove = 1;
+ }
+ mtx_unlock(&aio_job_mtx);
+
+ if (remove) {
+ cbe->jobstate = JOBST_JOBFINISHED;
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+ TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+ aio_bio_done_notify(p, cbe, DONE_QUEUE);
}
- TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
- TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
- TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
- }
- splx(s);
-
-restart1:
- for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
- aiocbn = TAILQ_NEXT(aiocbe, plist);
- if (aio_free_entry(aiocbe))
- goto restart1;
}
-restart2:
- for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
- aiocbn) {
- aiocbn = TAILQ_NEXT(aiocbe, plist);
- if (aio_free_entry(aiocbe))
- goto restart2;
- }
-
-/*
- * Note the use of lots of splbio here, trying to avoid splbio for long chains
- * of I/O. Probably unnecessary.
- */
-restart3:
- s = splbio();
- while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
+ /* Wait for all running I/O to be finished */
+ if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
+ TAILQ_FIRST(&ki->kaio_jobqueue)) {
ki->kaio_flags |= KAIO_WAKEUP;
- tsleep(p, PRIBIO, "aioprn", 0);
- splx(s);
- goto restart3;
- }
- splx(s);
-
-restart4:
- s = splbio();
- for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
- aiocbn = TAILQ_NEXT(aiocbe, plist);
- if (aio_free_entry(aiocbe)) {
- splx(s);
- goto restart4;
- }
+ msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
+ goto restart;
}
- splx(s);
- /*
- * If we've slept, jobs might have moved from one queue to another.
- * Retry rundown if we didn't manage to empty the queues.
- */
- if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
- TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
- TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
- TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
- goto restart1;
-
- for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
- ljn = TAILQ_NEXT(lj, lioj_list);
- if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
- 0)) {
+ /* Free all completed I/O requests. */
+ while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
+ aio_free_entry(cbe);
+
+ while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
+ if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
-#ifdef DIAGNOSTIC
- printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
- "QF:%d\n", lj->lioj_buffer_count,
- lj->lioj_buffer_finished_count,
- lj->lioj_queue_count,
- lj->lioj_queue_finished_count);
-#endif
+ panic("LIO job not cleaned up: C:%d, FC:%d\n",
+ lj->lioj_count, lj->lioj_finished_count);
}
}
-
+ AIO_UNLOCK(ki);
+ taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
+ mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
- mtx_unlock(&Giant);
}
/*
@@ -661,26 +730,53 @@
static struct aiocblist *
aio_selectjob(struct aiothreadlist *aiop)
{
- int s;
struct aiocblist *aiocbe;
struct kaioinfo *ki;
struct proc *userp;
- s = splnet();
- for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
- TAILQ_NEXT(aiocbe, list)) {
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
TAILQ_REMOVE(&aio_jobs, aiocbe, list);
- splx(s);
- return (aiocbe);
+ /* Account for currently active jobs. */
+ ki->kaio_active_count++;
+ aiocbe->jobstate = JOBST_JOBRUNNING;
+ break;
}
}
- splx(s);
+ return (aiocbe);
+}
+
+/*
+ * Move all data to a permanent storage device, this code
+ * simulates fsync syscall.
+ */
+static int
+aio_fsync_vnode(struct thread *td, struct vnode *vp)
+{
+ struct mount *mp;
+ int vfslocked;
+ int error;
- return (NULL);
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_LOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_UNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+drop:
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
}
/*
@@ -688,15 +784,17 @@
* the non-physio version of the operations. The normal vn operations are used,
* and this code should work in all instances for every type of file, including
* pipes, sockets, fifos, and regular files.
+ *
+ * XXX I don't think it works well for socket, pipe, and fifo.
*/
static void
aio_process(struct aiocblist *aiocbe)
{
struct ucred *td_savedcred;
struct thread *td;
- struct proc *mycp;
struct aiocb *cb;
struct file *fp;
+ struct socket *so;
struct uio auio;
struct iovec aiov;
int cnt;
@@ -707,10 +805,20 @@
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = aiocbe->cred;
- mycp = td->td_proc;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
+ if (cb->aio_lio_opcode == LIO_SYNC) {
+ error = 0;
+ cnt = 0;
+ if (fp->f_vnode != NULL)
+ error = aio_fsync_vnode(td, fp->f_vnode);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+ td->td_ucred = td_savedcred;
+ return;
+ }
+
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
@@ -722,21 +830,26 @@
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
- inblock_st = mycp->p_stats->p_ru.ru_inblock;
- oublock_st = mycp->p_stats->p_ru.ru_oublock;
+ inblock_st = td->td_ru.ru_inblock;
+ oublock_st = td->td_ru.ru_oublock;
/*
- * _aio_aqueue() acquires a reference to the file that is
+ * aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
- error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ if (auio.uio_resid == 0)
+ error = 0;
+ else
+ error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
- inblock_end = mycp->p_stats->p_ru.ru_inblock;
- oublock_end = mycp->p_stats->p_ru.ru_oublock;
+ inblock_end = td->td_ru.ru_inblock;
+ oublock_end = td->td_ru.ru_oublock;
aiocbe->inputcharge = inblock_end - inblock_st;
aiocbe->outputcharge = oublock_end - oublock_st;
@@ -745,9 +858,17 @@
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
- PROC_LOCK(aiocbe->userproc);
- psignal(aiocbe->userproc, SIGPIPE);
- PROC_UNLOCK(aiocbe->userproc);
+ int sigpipe = 1;
+ if (fp->f_type == DTYPE_SOCKET) {
+ so = fp->f_data;
+ if (so->so_options & SO_NOSIGPIPE)
+ sigpipe = 0;
+ }
+ if (sigpipe) {
+ PROC_LOCK(aiocbe->userproc);
+ psignal(aiocbe->userproc, SIGPIPE);
+ PROC_UNLOCK(aiocbe->userproc);
+ }
}
}
@@ -757,24 +878,90 @@
td->td_ucred = td_savedcred;
}
+static void
+aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
+{
+ struct aioliojob *lj;
+ struct kaioinfo *ki;
+ struct aiocblist *scb, *scbn;
+ int lj_done;
+
+ ki = userp->p_aioinfo;
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ lj = aiocbe->lio;
+ lj_done = 0;
+ if (lj) {
+ lj->lioj_finished_count++;
+ if (lj->lioj_count == lj->lioj_finished_count)
+ lj_done = 1;
+ }
+ if (type == DONE_QUEUE) {
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ } else {
+ aiocbe->jobflags |= AIOCBLIST_BUFDONE;
+ }
+ TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBFINISHED;
+
+ if (ki->kaio_flags & KAIO_RUNDOWN)
+ goto notification_done;
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
+ aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
+
+ KNOTE_LOCKED(&aiocbe->klist, 1);
+
+ if (lj_done) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+
+notification_done:
+ if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
+ TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
+ if (aiocbe->fd_file == scb->fd_file &&
+ aiocbe->seqno < scb->seqno) {
+ if (--scb->pending == 0) {
+ mtx_lock(&aio_job_mtx);
+ scb->jobstate = JOBST_JOBQGLOBAL;
+ TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
+ TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
+ aio_kick_nowait(userp);
+ mtx_unlock(&aio_job_mtx);
+ }
+ }
+ }
+ }
+ if (ki->kaio_flags & KAIO_WAKEUP) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(&userp->p_aioinfo);
+ }
+}
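
[Aside, not part of the diff: aio_bio_done_notify() is also where SIGEV_KEVENT completions are posted via KNOTE_LOCKED(). A hedged userland sketch of consuming completions through a kqueue; it assumes the FreeBSD-specific sigev_notify_kqueue interface, the file path is illustrative, and error handling is trimmed.]

    #include <sys/types.h>
    #include <sys/event.h>
    #include <aio.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
    	static char buf[4096];
    	struct aiocb cb;
    	struct kevent ev;
    	int fd, kq;

    	kq = kqueue();
    	fd = open("/etc/motd", O_RDONLY);	/* illustrative path */
    	if (kq == -1 || fd == -1)
    		exit(1);

    	memset(&cb, 0, sizeof(cb));
    	cb.aio_fildes = fd;
    	cb.aio_buf = buf;
    	cb.aio_nbytes = sizeof(buf);
    	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
    	cb.aio_sigevent.sigev_notify_kqueue = kq;
    	cb.aio_sigevent.sigev_value.sival_ptr = &cb;

    	if (aio_read(&cb) == -1)
    		exit(1);

    	/* The returned event's ident is the userland aiocb pointer. */
    	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1) {
    		struct aiocb *iocb = (struct aiocb *)ev.ident;
    		printf("aio_return: %zd\n", aio_return(iocb));
    	}
    	return (0);
    }
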
+
/*
* The AIO daemon, most of the actual work is done in aio_process,
* but the setup (and address space mgmt) is done in this routine.
*/
static void
-aio_daemon(void *uproc)
+aio_daemon(void *_id)
{
- int s;
- struct aio_liojob *lj;
- struct aiocb *cb;
struct aiocblist *aiocbe;
struct aiothreadlist *aiop;
struct kaioinfo *ki;
struct proc *curcp, *mycp, *userp;
struct vmspace *myvm, *tmpvm;
struct thread *td = curthread;
- struct pgrp *newpgrp;
- struct session *newsess;
+ int id = (intptr_t)_id;
/*
* Local copies of curproc (cp) and vmspace (myvm)
@@ -790,32 +977,18 @@
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aiothread = td;
- aiop->aiothreadflags |= AIOP_FREE;
-
- /*
- * Place thread (lightweight process) onto the AIO free thread list.
- */
- mtx_lock(&aio_freeproc_mtx);
- TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
- mtx_unlock(&aio_freeproc_mtx);
+ aiop->aiothreadflags = 0;
/* The daemon resides in its own pgrp. */
- MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
- M_WAITOK | M_ZERO);
- MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
- M_WAITOK | M_ZERO);
-
- sx_xlock(&proctree_lock);
- enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
- sx_xunlock(&proctree_lock);
- mtx_lock(&Giant);
+ setsid(td, NULL);
/*
* Wakeup parent process. (Parent sleeps to keep from blasting away
* and creating too many daemons.)
*/
- wakeup(mycp);
+ sema_post(&aio_newproc_sem);
+ mtx_lock(&aio_job_mtx);
for (;;) {
/*
* curcp is the current daemon process context.
@@ -826,22 +999,18 @@
/*
* Take daemon off of free queue
*/
- mtx_lock(&aio_freeproc_mtx);
if (aiop->aiothreadflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
}
- mtx_unlock(&aio_freeproc_mtx);
/*
* Check for jobs.
*/
while ((aiocbe = aio_selectjob(aiop)) != NULL) {
- cb = &aiocbe->uaiocb;
+ mtx_unlock(&aio_job_mtx);
userp = aiocbe->userproc;
- aiocbe->jobstate = JOBST_JOBRUNNING;
-
/*
* Connect to process address space for user program.
*/
@@ -875,71 +1044,30 @@
}
ki = userp->p_aioinfo;
- lj = aiocbe->lio;
-
- /* Account for currently active jobs. */
- ki->kaio_active_count++;
/* Do the I/O function. */
aio_process(aiocbe);
+ mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
+ mtx_unlock(&aio_job_mtx);
- /*
- * Increment the completion count for wakeup/signal
- * comparisons.
- */
- aiocbe->jobflags |= AIOCBLIST_DONE;
- ki->kaio_queue_finished_count++;
- if (lj)
- lj->lioj_queue_finished_count++;
- if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
- & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
- ki->kaio_flags &= ~KAIO_WAKEUP;
- wakeup(userp);
- }
-
- s = splbio();
- if (lj && (lj->lioj_flags &
- (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
- if ((lj->lioj_queue_finished_count ==
- lj->lioj_queue_count) &&
- (lj->lioj_buffer_finished_count ==
- lj->lioj_buffer_count)) {
- PROC_LOCK(userp);
- psignal(userp,
- lj->lioj_signal.sigev_signo);
- PROC_UNLOCK(userp);
- lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
- }
- }
- splx(s);
-
- aiocbe->jobstate = JOBST_JOBFINISHED;
-
- s = splnet();
+ AIO_LOCK(ki);
TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
- TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
- splx(s);
- KNOTE_UNLOCKED(&aiocbe->klist, 0);
-
- if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
- wakeup(aiocbe);
- aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
- }
+ aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
+ AIO_UNLOCK(ki);
- if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
- PROC_LOCK(userp);
- psignal(userp, cb->aio_sigevent.sigev_signo);
- PROC_UNLOCK(userp);
- }
+ mtx_lock(&aio_job_mtx);
}
/*
* Disconnect from user address space.
*/
if (curcp != mycp) {
+
+ mtx_unlock(&aio_job_mtx);
+
/* Get the user address space to disconnect from. */
tmpvm = mycp->p_vmspace;
@@ -958,9 +1086,18 @@
vmspace_free(tmpvm);
curcp = mycp;
+
+ mtx_lock(&aio_job_mtx);
+ /*
+ * We have to restart to avoid race, we only sleep if
+ * no job can be selected, that should be
+ * curcp == mycp.
+ */
+ continue;
}
- mtx_lock(&aio_freeproc_mtx);
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aiothreadflags |= AIOP_FREE;
@@ -968,18 +1105,16 @@
* If daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
- if (msleep(aiop->aiothread, &aio_freeproc_mtx, PDROP | PRIBIO,
- "aiordy", aiod_lifetime)) {
- s = splnet();
+ if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
+ aiod_lifetime)) {
if (TAILQ_EMPTY(&aio_jobs)) {
- mtx_lock(&aio_freeproc_mtx);
if ((aiop->aiothreadflags & AIOP_FREE) &&
(num_aio_procs > target_aio_procs)) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
- mtx_unlock(&aio_freeproc_mtx);
- splx(s);
- uma_zfree(aiop_zone, aiop);
num_aio_procs--;
+ mtx_unlock(&aio_job_mtx);
+ uma_zfree(aiop_zone, aiop);
+ free_unr(aiod_unr, id);
#ifdef DIAGNOSTIC
if (mycp->p_vmspace->vm_refcnt <= 1) {
printf("AIOD: bad vm refcnt for"
@@ -989,36 +1124,40 @@
#endif
kthread_exit(0);
}
- mtx_unlock(&aio_freeproc_mtx);
}
- splx(s);
}
}
+ mtx_unlock(&aio_job_mtx);
+ panic("shouldn't be here\n");
}
/*
- * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
-aio_newproc(void)
+aio_newproc(int *start)
{
int error;
struct proc *p;
+ int id;
- error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, 0, "aiod%d",
- num_aio_procs);
- if (error)
- return (error);
-
- /*
- * Wait until daemon is started, but continue on just in case to
- * handle error conditions.
- */
- error = tsleep(p, PZERO, "aiosta", aiod_timeout);
-
- num_aio_procs++;
-
+ id = alloc_unr(aiod_unr);
+ error = kthread_create(aio_daemon, (void *)(intptr_t)id, &p,
+ RFNOWAIT, 0, "aiod%d", id);
+ if (error == 0) {
+ /*
+ * Wait until daemon is started.
+ */
+ sema_wait(&aio_newproc_sem);
+ mtx_lock(&aio_job_mtx);
+ num_aio_procs++;
+ if (start != NULL)
+ (*start)--;
+ mtx_unlock(&aio_job_mtx);
+ } else {
+ free_unr(aiod_unr, id);
+ }
return (error);
}
@@ -1027,22 +1166,20 @@
* VCHR devices. This method doesn't use an aio helper thread, and
* thus has very low overhead.
*
- * Assumes that the caller, _aio_aqueue(), has incremented the file
+ * Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
* duration of this call.
*/
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
- int error;
struct aiocb *cb;
struct file *fp;
struct buf *bp;
struct vnode *vp;
struct kaioinfo *ki;
- struct aio_liojob *lj;
- int s;
- int notify;
+ struct aioliojob *lj;
+ int error;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
@@ -1070,6 +1207,9 @@
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
+ if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
+ return (-1);
+
if (cb->aio_nbytes >
MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
return (-1);
@@ -1078,16 +1218,18 @@
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
return (-1);
- ki->kaio_buffer_count++;
-
- lj = aiocbe->lio;
- if (lj)
- lj->lioj_buffer_count++;
-
/* Create and build a buffer header for a transfer. */
bp = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(bp);
+ AIO_LOCK(ki);
+ ki->kaio_count++;
+ ki->kaio_buffer_count++;
+ lj = aiocbe->lio;
+ if (lj)
+ lj->lioj_count++;
+ AIO_UNLOCK(ki);
+
/*
* Get a copy of the kva from the physical buffer.
*/
@@ -1111,96 +1253,34 @@
goto doerror;
}
- s = splbio();
+ AIO_LOCK(ki);
aiocbe->bp = bp;
bp->b_caller1 = (void *)aiocbe;
TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
aiocbe->jobstate = JOBST_JOBQBUF;
cb->_aiocb_private.status = cb->aio_nbytes;
- num_buf_aio++;
+ AIO_UNLOCK(ki);
+
+ atomic_add_int(&num_queue_count, 1);
+ atomic_add_int(&num_buf_aio, 1);
+
bp->b_error = 0;
- splx(s);
+ TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
/* Perform transfer. */
dev_strategy(vp->v_rdev, bp);
-
- notify = 0;
- s = splbio();
-
- /*
- * If we had an error invoking the request, or an error in processing
- * the request before we have returned, we process it as an error in
- * transfer. Note that such an I/O error is not indicated immediately,
- * but is returned using the aio_error mechanism. In this case,
- * aio_suspend will return immediately.
- */
- if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
- struct aiocb *job = aiocbe->uuaiocb;
-
- aiocbe->uaiocb._aiocb_private.status = 0;
- suword(&job->_aiocb_private.status, 0);
- aiocbe->uaiocb._aiocb_private.error = bp->b_error;
- suword(&job->_aiocb_private.error, bp->b_error);
-
- ki->kaio_buffer_finished_count++;
-
- if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
- aiocbe->jobstate = JOBST_JOBBFINISHED;
- aiocbe->jobflags |= AIOCBLIST_DONE;
- TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
- TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
- notify = 1;
- }
- }
- splx(s);
- if (notify)
- KNOTE_UNLOCKED(&aiocbe->klist, 0);
return (0);
doerror:
+ AIO_LOCK(ki);
+ ki->kaio_count--;
ki->kaio_buffer_count--;
if (lj)
- lj->lioj_buffer_count--;
+ lj->lioj_count--;
aiocbe->bp = NULL;
- relpbuf(bp, NULL);
- return (error);
-}
-
-/*
- * This waits/tests physio completion.
- */
-static int
-aio_fphysio(struct aiocblist *iocb)
-{
- int s;
- struct buf *bp;
- int error;
-
- bp = iocb->bp;
-
- s = splbio();
- while ((bp->b_flags & B_DONE) == 0) {
- if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
- if ((bp->b_flags & B_DONE) == 0) {
- splx(s);
- return (EINPROGRESS);
- } else
- break;
- }
- }
- splx(s);
-
- /* Release mapping into kernel space. */
- vunmapbuf(bp);
- iocb->bp = 0;
-
- error = 0;
-
- /* Check for an error. */
- if (bp->b_ioflags & BIO_ERROR)
- error = bp->b_error;
-
+ AIO_UNLOCK(ki);
relpbuf(bp, NULL);
return (error);
}
@@ -1211,94 +1291,106 @@
static void
aio_swake_cb(struct socket *so, struct sockbuf *sb)
{
- struct aiocblist *cb,*cbn;
- struct proc *p;
- struct kaioinfo *ki = NULL;
- int opcode, wakecount = 0;
- struct aiothreadlist *aiop;
+ struct aiocblist *cb, *cbn;
+ int opcode;
- if (sb == &so->so_snd) {
+ if (sb == &so->so_snd)
opcode = LIO_WRITE;
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags &= ~SB_AIO;
- SOCKBUF_UNLOCK(&so->so_snd);
- } else {
+ else
opcode = LIO_READ;
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_flags &= ~SB_AIO;
- SOCKBUF_UNLOCK(&so->so_rcv);
- }
- for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
- cbn = TAILQ_NEXT(cb, list);
+ SOCKBUF_LOCK(sb);
+ sb->sb_flags &= ~SB_AIO;
+ mtx_lock(&aio_job_mtx);
+ TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
if (opcode == cb->uaiocb.aio_lio_opcode) {
- p = cb->userproc;
- ki = p->p_aioinfo;
+ if (cb->jobstate != JOBST_JOBQSOCK)
+ panic("invalid queue value");
+ /* XXX
+ * We don't have actual sockets backend yet,
+ * so we simply move the requests to the generic
+ * file I/O backend.
+ */
TAILQ_REMOVE(&so->so_aiojobq, cb, list);
- TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
- TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
- wakecount++;
- if (cb->jobstate != JOBST_JOBQGLOBAL)
- panic("invalid queue value");
- }
- }
-
- while (wakecount--) {
- mtx_lock(&aio_freeproc_mtx);
- if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
- TAILQ_REMOVE(&aio_freeproc, aiop, list);
- aiop->aiothreadflags &= ~AIOP_FREE;
- wakeup(aiop->aiothread);
+ aio_kick_nowait(cb->userproc);
}
- mtx_unlock(&aio_freeproc_mtx);
}
+ mtx_unlock(&aio_job_mtx);
+ SOCKBUF_UNLOCK(sb);
}
/*
* Queue a new AIO request. Choosing either the threaded or direct physio VCHR
* technique is done in this code.
*/
-static int
-_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
+int
+aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
+ int type, int oldsigev)
{
struct proc *p = td->td_proc;
- struct filedesc *fdp;
struct file *fp;
- unsigned int fd;
struct socket *so;
- int s;
- int error;
- int opcode;
- struct aiocblist *aiocbe;
- struct aiothreadlist *aiop;
+ struct aiocblist *aiocbe, *cb;
struct kaioinfo *ki;
struct kevent kev;
- struct kqueue *kq;
- struct file *kq_fp;
struct sockbuf *sb;
+ int opcode;
+ int error;
+ int fd, kqfd;
+ int jid;
- aiocbe = uma_zalloc(aiocb_zone, M_WAITOK);
- aiocbe->inputcharge = 0;
- aiocbe->outputcharge = 0;
- /* XXX - need a lock */
- knlist_init(&aiocbe->klist, NULL, NULL, NULL, NULL);
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
suword(&job->_aiocb_private.status, -1);
suword(&job->_aiocb_private.error, 0);
suword(&job->_aiocb_private.kernelinfo, -1);
- error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
- if (error) {
- suword(&job->_aiocb_private.error, error);
- uma_zfree(aiocb_zone, aiocbe);
- return (error);
+ if (num_queue_count >= max_queue_count ||
+ ki->kaio_count >= ki->kaio_qallowed_count) {
+ suword(&job->_aiocb_private.error, EAGAIN);
+ return (EAGAIN);
}
- if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
- !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
- uma_zfree(aiocb_zone, aiocbe);
+
+ aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
+ aiocbe->inputcharge = 0;
+ aiocbe->outputcharge = 0;
+ knlist_init(&aiocbe->klist, AIO_MTX(ki), NULL, NULL, NULL);
+
+ if (oldsigev) {
+ bzero(&aiocbe->uaiocb, sizeof(struct aiocb));
+ error = copyin(job, &aiocbe->uaiocb, sizeof(struct oaiocb));
+ bcopy(&aiocbe->uaiocb.__spare__, &aiocbe->uaiocb.aio_sigevent,
+ sizeof(struct osigevent));
+ } else {
+ error = copyin(job, &aiocbe->uaiocb, sizeof(struct aiocb));
+ }
+ if (error) {
+ suword(&job->_aiocb_private.error, error);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (error);
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
+ suword(&job->_aiocb_private.error, EINVAL);
+ uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
+
+ if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ ksiginfo_init(&aiocbe->ksi);
/* Save userspace address of the job info. */
aiocbe->uuaiocb = job;
@@ -1308,90 +1400,72 @@
aiocbe->uaiocb.aio_lio_opcode = type;
opcode = aiocbe->uaiocb.aio_lio_opcode;
- /* Get the fd info for process. */
- fdp = p->p_fd;
-
- /*
- * Range check file descriptor.
- */
- FILEDESC_LOCK(fdp);
+ /* Fetch the file object for the specified file descriptor. */
fd = aiocbe->uaiocb.aio_fildes;
- if (fd >= fdp->fd_nfiles) {
- FILEDESC_UNLOCK(fdp);
+ switch (opcode) {
+ case LIO_WRITE:
+ error = fget_write(td, fd, &fp);
+ break;
+ case LIO_READ:
+ error = fget_read(td, fd, &fp);
+ break;
+ default:
+ error = fget(td, fd, &fp);
+ }
+ if (error) {
uma_zfree(aiocb_zone, aiocbe);
- if (type == 0)
- suword(&job->_aiocb_private.error, EBADF);
- return (EBADF);
+ suword(&job->_aiocb_private.error, error);
+ return (error);
}
- fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
- if ((fp == NULL) ||
- ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0)) ||
- ((opcode == LIO_READ) && ((fp->f_flag & FREAD) == 0))) {
- FILEDESC_UNLOCK(fdp);
- uma_zfree(aiocb_zone, aiocbe);
- if (type == 0)
- suword(&job->_aiocb_private.error, EBADF);
- return (EBADF);
+ if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
+ error = EINVAL;
+ goto aqueue_fail;
}
- fhold(fp);
- FILEDESC_UNLOCK(fdp);
- if (aiocbe->uaiocb.aio_offset == -1LL) {
+ if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
error = EINVAL;
goto aqueue_fail;
}
- error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
+
+ aiocbe->fd_file = fp;
+
+ mtx_lock(&aio_job_mtx);
+ jid = jobrefid++;
+ aiocbe->seqno = jobseqno++;
+ mtx_unlock(&aio_job_mtx);
+ error = suword(&job->_aiocb_private.kernelinfo, jid);
if (error) {
error = EINVAL;
goto aqueue_fail;
}
- aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
- if (jobrefid == LONG_MAX)
- jobrefid = 1;
- else
- jobrefid++;
+ aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
- if (type == 0) {
- suword(&job->_aiocb_private.error, 0);
- suword(&job->_aiocb_private.status, 0);
- suword(&job->_aiocb_private.kernelinfo, 0);
- }
return (0);
}
- if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
- if (type == 0)
- suword(&job->_aiocb_private.status, 0);
+ if ((opcode != LIO_READ) && (opcode != LIO_WRITE) &&
+ (opcode != LIO_SYNC)) {
error = EINVAL;
goto aqueue_fail;
}
- if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
- kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
- kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
- } else
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
- if ((u_int)kev.ident >= fdp->fd_nfiles ||
- (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
- (kq_fp->f_type != DTYPE_KQUEUE)) {
- error = EBADF;
- goto aqueue_fail;
- }
- kq = kq_fp->f_data;
+ kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
kev.ident = (uintptr_t)aiocbe->uuaiocb;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.data = (intptr_t)aiocbe;
- error = kqueue_register(kq, &kev, td, 1);
+ kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
+ error = kqfd_register(kqfd, &kev, td, 1);
aqueue_fail:
if (error) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
- if (type == 0)
- suword(&job->_aiocb_private.error, error);
+ suword(&job->_aiocb_private.error, error);
goto done;
}
no_kqueue:
@@ -1402,7 +1476,9 @@
aiocbe->cred = crhold(td->td_ucred);
aiocbe->jobflags = 0;
aiocbe->lio = lj;
- ki = p->p_aioinfo;
+
+ if (opcode == LIO_SYNC)
+ goto queueit;
if (fp->f_type == DTYPE_SOCKET) {
/*
@@ -1421,56 +1497,111 @@
so = fp->f_data;
sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
SOCKBUF_LOCK(sb);
- s = splnet();
if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
LIO_WRITE) && (!sowriteable(so)))) {
- TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
- TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
sb->sb_flags |= SB_AIO;
- aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
- ki->kaio_queue_count++;
- num_queue_count++;
+
+ mtx_lock(&aio_job_mtx);
+ TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
+ mtx_unlock(&aio_job_mtx);
+
+ AIO_LOCK(ki);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBQSOCK;
+ ki->kaio_count++;
+ if (lj)
+ lj->lioj_count++;
+ AIO_UNLOCK(ki);
SOCKBUF_UNLOCK(sb);
- splx(s);
+ atomic_add_int(&num_queue_count, 1);
error = 0;
goto done;
}
SOCKBUF_UNLOCK(sb);
- splx(s);
}
if ((error = aio_qphysio(p, aiocbe)) == 0)
goto done;
+#if 0
if (error > 0) {
- suword(&job->_aiocb_private.status, 0);
aiocbe->uaiocb._aiocb_private.error = error;
suword(&job->_aiocb_private.error, error);
goto done;
}
-
+#endif
+queueit:
/* No buffer for daemon I/O. */
aiocbe->bp = NULL;
+ atomic_add_int(&num_queue_count, 1);
- ki->kaio_queue_count++;
+ AIO_LOCK(ki);
+ ki->kaio_count++;
if (lj)
- lj->lioj_queue_count++;
- s = splnet();
+ lj->lioj_count++;
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ if (opcode == LIO_SYNC) {
+ TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
+ if (cb->fd_file == aiocbe->fd_file &&
+ cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ cb->seqno < aiocbe->seqno) {
+ cb->jobflags |= AIOCBLIST_CHECKSYNC;
+ aiocbe->pending++;
+ }
+ }
+ TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
+ if (cb->fd_file == aiocbe->fd_file &&
+ cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ cb->seqno < aiocbe->seqno) {
+ cb->jobflags |= AIOCBLIST_CHECKSYNC;
+ aiocbe->pending++;
+ }
+ }
+ if (aiocbe->pending != 0) {
+ TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
+ aiocbe->jobstate = JOBST_JOBQSYNC;
+ AIO_UNLOCK(ki);
+ goto done;
+ }
+ }
+ mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
- splx(s);
aiocbe->jobstate = JOBST_JOBQGLOBAL;
-
- num_queue_count++;
+ aio_kick_nowait(p);
+ mtx_unlock(&aio_job_mtx);
+ AIO_UNLOCK(ki);
error = 0;
+done:
+ return (error);
+}
- /*
- * If we don't have a free AIO process, and we are below our quota, then
- * start one. Otherwise, depend on the subsequent I/O completions to
- * pick-up this job. If we don't sucessfully create the new process
- * (thread) due to resource issues, we return an error for now (EAGAIN),
- * which is likely not the correct thing to do.
- */
- mtx_lock(&aio_freeproc_mtx);
+static void
+aio_kick_nowait(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aiothreadlist *aiop;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
+ }
+}
+
+static int
+aio_kick(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aiothreadlist *aiop;
+ int error, ret = 0;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
@@ -1480,40 +1611,30 @@
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
num_aio_resv_start++;
- mtx_unlock(&aio_freeproc_mtx);
- if ((error = aio_newproc()) == 0) {
- mtx_lock(&aio_freeproc_mtx);
+ mtx_unlock(&aio_job_mtx);
+ error = aio_newproc(&num_aio_resv_start);
+ mtx_lock(&aio_job_mtx);
+ if (error) {
num_aio_resv_start--;
goto retryproc;
}
- mtx_lock(&aio_freeproc_mtx);
- num_aio_resv_start--;
+ } else {
+ ret = -1;
}
- mtx_unlock(&aio_freeproc_mtx);
-done:
- return (error);
+ return (ret);
}
-/*
- * This routine queues an AIO request, checking for quotas.
- */
-static int
-aio_aqueue(struct thread *td, struct aiocb *job, int type)
+static void
+aio_kick_helper(void *context, int pending)
{
- struct proc *p = td->td_proc;
- struct kaioinfo *ki;
-
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ struct proc *userp = context;
- if (num_queue_count >= max_queue_count)
- return (EAGAIN);
-
- ki = p->p_aioinfo;
- if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
- return (EAGAIN);
-
- return _aio_aqueue(td, job, NULL, type);
+ mtx_lock(&aio_job_mtx);
+ while (--pending >= 0) {
+ if (aio_kick(userp))
+ break;
+ }
+ mtx_unlock(&aio_job_mtx);
}
/*
@@ -1524,56 +1645,41 @@
aio_return(struct thread *td, struct aio_return_args *uap)
{
struct proc *p = td->td_proc;
- int s;
- long jobref;
- struct aiocblist *cb, *ncb;
- struct aiocb *ujob;
+ struct aiocblist *cb;
+ struct aiocb *uaiocb;
struct kaioinfo *ki;
-
- ujob = uap->aiocbp;
- jobref = fuword(&ujob->_aiocb_private.kernelinfo);
- if (jobref == -1 || jobref == 0)
- return (EINVAL);
+ int status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
- PROC_LOCK(p);
- TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
- if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
- jobref) {
- if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
- p->p_stats->p_ru.ru_oublock +=
- cb->outputcharge;
- cb->outputcharge = 0;
- } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
- p->p_stats->p_ru.ru_inblock += cb->inputcharge;
- cb->inputcharge = 0;
- }
- goto done;
- }
- }
- s = splbio();
- for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
- ncb = TAILQ_NEXT(cb, plist);
- if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
- == jobref) {
+ uaiocb = uap->aiocbp;
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
+ if (cb->uuaiocb == uaiocb)
break;
- }
}
- splx(s);
- done:
- PROC_UNLOCK(p);
if (cb != NULL) {
- if (ujob == cb->uuaiocb) {
- td->td_retval[0] =
- cb->uaiocb._aiocb_private.status;
- } else
- td->td_retval[0] = EFAULT;
+ MPASS(cb->jobstate == JOBST_JOBFINISHED);
+ status = cb->uaiocb._aiocb_private.status;
+ error = cb->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ td->td_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ td->td_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
aio_free_entry(cb);
- return (0);
+ AIO_UNLOCK(ki);
+ suword(&uaiocb->_aiocb_private.error, error);
+ suword(&uaiocb->_aiocb_private.status, status);
+ } else {
+ error = EINVAL;
+ AIO_UNLOCK(ki);
}
- return (EINVAL);
+ return (error);
}
/*
@@ -1587,12 +1693,12 @@
struct timespec ts;
struct aiocb *const *cbptr, *cbp;
struct kaioinfo *ki;
- struct aiocblist *cb;
- int i;
- int njoblist;
- int error, s, timo;
- long *ijoblist;
+ struct aiocblist *cb, *cbfirst;
struct aiocb **ujoblist;
+ int njoblist;
+ int error;
+ int timo;
+ int i;
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
return (EINVAL);
@@ -1617,7 +1723,6 @@
return (EAGAIN);
njoblist = 0;
- ijoblist = uma_zalloc(aiol_zone, M_WAITOK);
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
cbptr = uap->aiocbp;
@@ -1626,70 +1731,44 @@
if (cbp == 0)
continue;
ujoblist[njoblist] = cbp;
- ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
njoblist++;
}
if (njoblist == 0) {
- uma_zfree(aiol_zone, ijoblist);
uma_zfree(aiol_zone, ujoblist);
return (0);
}
- error = 0;
+ AIO_LOCK(ki);
for (;;) {
- PROC_LOCK(p);
- TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
+ cbfirst = NULL;
+ error = 0;
+ TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
- if (((intptr_t)
- cb->uaiocb._aiocb_private.kernelinfo) ==
- ijoblist[i]) {
- PROC_UNLOCK(p);
- if (ujoblist[i] != cb->uuaiocb)
- error = EINVAL;
- uma_zfree(aiol_zone, ijoblist);
- uma_zfree(aiol_zone, ujoblist);
- return (error);
- }
- }
- }
-
- s = splbio();
- for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
- TAILQ_NEXT(cb, plist)) {
- for (i = 0; i < njoblist; i++) {
- if (((intptr_t)
- cb->uaiocb._aiocb_private.kernelinfo) ==
- ijoblist[i]) {
- PROC_UNLOCK(p);
- splx(s);
- if (ujoblist[i] != cb->uuaiocb)
- error = EINVAL;
- uma_zfree(aiol_zone, ijoblist);
- uma_zfree(aiol_zone, ujoblist);
- return (error);
+ if (cb->uuaiocb == ujoblist[i]) {
+ if (cbfirst == NULL)
+ cbfirst = cb;
+ if (cb->jobstate == JOBST_JOBFINISHED)
+ goto RETURN;
}
}
}
+ /* All tasks were finished. */
+ if (cbfirst == NULL)
+ break;
ki->kaio_flags |= KAIO_WAKEUP;
- error = msleep(p, &p->p_mtx, PDROP | PRIBIO | PCATCH, "aiospn",
- timo);
- splx(s);
-
- if (error == ERESTART || error == EINTR) {
- uma_zfree(aiol_zone, ijoblist);
- uma_zfree(aiol_zone, ujoblist);
- return (EINTR);
- } else if (error == EWOULDBLOCK) {
- uma_zfree(aiol_zone, ijoblist);
- uma_zfree(aiol_zone, ujoblist);
- return (EAGAIN);
- }
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiospn", timo);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
}
-
-/* NOTREACHED */
- return (EINVAL);
+RETURN:
+ AIO_UNLOCK(ki);
+ uma_zfree(aiol_zone, ujoblist);
+ return (error);
}
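
[Aside, not part of the diff: the rewritten aio_suspend() now scans kaio_all and sleeps on the per-process kaioinfo, but from userland it still behaves as the POSIX call. A minimal sketch; the path, offsets, and timeout are illustrative and error handling is trimmed.]

    #include <aio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    int
    main(void)
    {
    	static char b1[512], b2[512];
    	struct aiocb cb1, cb2;
    	const struct aiocb *list[2];
    	struct timespec ts = { 5, 0 };	/* give up after five seconds */
    	int fd;

    	fd = open("/etc/motd", O_RDONLY);	/* illustrative path */
    	if (fd == -1)
    		exit(1);
    	memset(&cb1, 0, sizeof(cb1));
    	memset(&cb2, 0, sizeof(cb2));
    	cb1.aio_fildes = cb2.aio_fildes = fd;
    	cb1.aio_buf = b1;  cb1.aio_nbytes = sizeof(b1);  cb1.aio_offset = 0;
    	cb2.aio_buf = b2;  cb2.aio_nbytes = sizeof(b2);  cb2.aio_offset = 512;
    	if (aio_read(&cb1) == -1 || aio_read(&cb2) == -1)
    		exit(1);
    	list[0] = &cb1;
    	list[1] = &cb2;

    	/* Returns 0 once at least one listed request has finished. */
    	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
    		fprintf(stderr, "timed out\n");
    	return (0);
    }
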
/*
@@ -1703,237 +1782,195 @@
struct kaioinfo *ki;
struct aiocblist *cbe, *cbn;
struct file *fp;
- struct filedesc *fdp;
struct socket *so;
- struct proc *po;
- int s,error;
- int cancelled=0;
- int notcancelled=0;
+ int error;
+ int remove;
+ int cancelled = 0;
+ int notcancelled = 0;
struct vnode *vp;
- fdp = p->p_fd;
- if ((u_int)uap->fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[uap->fd]) == NULL)
- return (EBADF);
+ /* Lookup file object. */
+ error = fget(td, uap->fd, &fp);
+ if (error)
+ return (error);
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ goto done;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
-
- if (vn_isdisk(vp,&error)) {
+ if (vn_isdisk(vp, &error)) {
+ fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
- } else if (fp->f_type == DTYPE_SOCKET) {
- so = fp->f_data;
-
- s = splnet();
-
- for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
- cbn = TAILQ_NEXT(cbe, list);
- if ((uap->aiocbp == NULL) ||
- (uap->aiocbp == cbe->uuaiocb) ) {
- po = cbe->userproc;
- ki = po->p_aioinfo;
- TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
- TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
- TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
- if (ki->kaio_flags & KAIO_WAKEUP) {
- wakeup(po);
- }
- cbe->jobstate = JOBST_JOBFINISHED;
- cbe->uaiocb._aiocb_private.status=-1;
- cbe->uaiocb._aiocb_private.error=ECANCELED;
- cancelled++;
-/* XXX cancelled, knote? */
- if (cbe->uaiocb.aio_sigevent.sigev_notify ==
- SIGEV_SIGNAL) {
- PROC_LOCK(cbe->userproc);
- psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
- PROC_UNLOCK(cbe->userproc);
- }
- if (uap->aiocbp)
- break;
- }
- }
- splx(s);
-
- if ((cancelled) && (uap->aiocbp)) {
- td->td_retval[0] = AIO_CANCELED;
- return (0);
- }
}
- ki=p->p_aioinfo;
- if (ki == NULL)
- goto done;
- s = splnet();
-
- for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
- cbn = TAILQ_NEXT(cbe, plist);
+ AIO_LOCK(ki);
+ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
if ((uap->fd == cbe->uaiocb.aio_fildes) &&
- ((uap->aiocbp == NULL ) ||
+ ((uap->aiocbp == NULL) ||
(uap->aiocbp == cbe->uuaiocb))) {
+ remove = 0;
+ mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSOCK) {
+ MPASS(fp->f_type == DTYPE_SOCKET);
+ so = fp->f_data;
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSYNC) {
+ TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+ remove = 1;
+ }
+ mtx_unlock(&aio_job_mtx);
+
+ if (remove) {
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
- TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
- plist);
- cancelled++;
- ki->kaio_queue_finished_count++;
- cbe->jobstate = JOBST_JOBFINISHED;
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
-/* XXX cancelled, knote? */
- if (cbe->uaiocb.aio_sigevent.sigev_notify ==
- SIGEV_SIGNAL) {
- PROC_LOCK(cbe->userproc);
- psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
- PROC_UNLOCK(cbe->userproc);
- }
+ aio_bio_done_notify(p, cbe, DONE_QUEUE);
+ cancelled++;
} else {
notcancelled++;
}
+ if (uap->aiocbp != NULL)
+ break;
}
}
- splx(s);
+ AIO_UNLOCK(ki);
+
done:
+ fdrop(fp, td);
+
+ if (uap->aiocbp != NULL) {
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+ }
+
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
+
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
+
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
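
[Aside, not part of the diff: as the code above shows, only requests still sitting on a queue (global, socket, or sync) are cancelable, and raw-disk physio returns AIO_NOTCANCELED early. A small userland sketch of interpreting the three return values; the scratch file is hypothetical and error handling is trimmed.]

    #include <aio.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
    	static char buf[512];
    	struct aiocb cb;
    	int fd, r;

    	fd = open("/tmp/scratch", O_RDWR | O_CREAT, 0600);	/* illustrative path */
    	if (fd == -1)
    		exit(1);
    	memset(&cb, 0, sizeof(cb));
    	cb.aio_fildes = fd;
    	cb.aio_buf = buf;
    	cb.aio_nbytes = sizeof(buf);
    	if (aio_read(&cb) == -1)
    		exit(1);

    	r = aio_cancel(fd, &cb);
    	if (r == AIO_CANCELED) {
    		/* Canceled requests still must be retired with aio_return(). */
    		(void)aio_return(&cb);
    		printf("canceled; status was set to ECANCELED\n");
    	} else if (r == AIO_NOTCANCELED) {
    		printf("still in flight; keep polling with aio_error()\n");
    	} else if (r == AIO_ALLDONE) {
    		printf("already completed: %zd\n", aio_return(&cb));
    	}
    	return (0);
    }
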
/*
- * aio_error is implemented in the kernel level for compatibility purposes only.
- * For a user mode async implementation, it would be best to do it in a userland
- * subroutine.
+ * aio_error is implemented in the kernel level for compatibility purposes
+ * only. For a user mode async implementation, it would be best to do it in
+ * a userland subroutine.
*/
int
aio_error(struct thread *td, struct aio_error_args *uap)
{
struct proc *p = td->td_proc;
- int s;
struct aiocblist *cb;
struct kaioinfo *ki;
- long jobref;
+ int status;
ki = p->p_aioinfo;
- if (ki == NULL)
- return (EINVAL);
-
- jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
- if ((jobref == -1) || (jobref == 0))
- return (EINVAL);
-
- PROC_LOCK(p);
- TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
- jobref) {
- PROC_UNLOCK(p);
- td->td_retval[0] = cb->uaiocb._aiocb_private.error;
- return (0);
- }
- }
-
- s = splnet();
-
- for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
- plist)) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
- jobref) {
- PROC_UNLOCK(p);
- td->td_retval[0] = EINPROGRESS;
- splx(s);
- return (0);
- }
- }
-
- for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
- plist)) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
- jobref) {
- PROC_UNLOCK(p);
- td->td_retval[0] = EINPROGRESS;
- splx(s);
- return (0);
- }
- }
- splx(s);
-
- s = splbio();
- for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
- plist)) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
- jobref) {
- PROC_UNLOCK(p);
- td->td_retval[0] = cb->uaiocb._aiocb_private.error;
- splx(s);
- return (0);
- }
+ if (ki == NULL) {
+ td->td_retval[0] = EINVAL;
+ return (0);
}
- for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
- plist)) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
- jobref) {
- PROC_UNLOCK(p);
- td->td_retval[0] = EINPROGRESS;
- splx(s);
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
+ if (cb->uuaiocb == uap->aiocbp) {
+ if (cb->jobstate == JOBST_JOBFINISHED)
+ td->td_retval[0] =
+ cb->uaiocb._aiocb_private.error;
+ else
+ td->td_retval[0] = EINPROGRESS;
+ AIO_UNLOCK(ki);
return (0);
}
}
- splx(s);
- PROC_UNLOCK(p);
+ AIO_UNLOCK(ki);
-#if (0)
/*
- * Hack for lio.
+ * Hack for failure of aio_aqueue.
*/
status = fuword(&uap->aiocbp->_aiocb_private.status);
- if (status == -1)
- return fuword(&uap->aiocbp->_aiocb_private.error);
-#endif
- return (EINVAL);
+ if (status == -1) {
+ td->td_retval[0] = fuword(&uap->aiocbp->_aiocb_private.error);
+ return (0);
+ }
+
+ td->td_retval[0] = EINVAL;
+ return (0);
}
/* syscall - asynchronous read from a file (REALTIME) */
int
+oaio_read(struct thread *td, struct oaio_read_args *uap)
+{
+
+ return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 1);
+}
+
+int
aio_read(struct thread *td, struct aio_read_args *uap)
{
- return aio_aqueue(td, uap->aiocbp, LIO_READ);
+ return aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, 0);
}
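
[Aside, not part of the diff: a minimal consumer of the plain aio_read() path, polling with aio_error() until the job reaches the done queue and then retiring it with aio_return(); the path is illustrative and error handling is trimmed.]

    #include <aio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
    	static char buf[1024];
    	struct aiocb cb;
    	ssize_t n;
    	int fd;

    	fd = open("/etc/motd", O_RDONLY);	/* illustrative path */
    	if (fd == -1)
    		exit(1);
    	memset(&cb, 0, sizeof(cb));
    	cb.aio_fildes = fd;
    	cb.aio_buf = buf;
    	cb.aio_nbytes = sizeof(buf);
    	cb.aio_offset = 0;

    	if (aio_read(&cb) == -1)
    		exit(1);
    	/* aio_error() reports EINPROGRESS until the job is finished. */
    	while (aio_error(&cb) == EINPROGRESS)
    		usleep(1000);
    	n = aio_return(&cb);	/* retires the request */
    	printf("read %zd bytes\n", n);
    	return (0);
    }
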
/* syscall - asynchronous write to a file (REALTIME) */
int
+oaio_write(struct thread *td, struct oaio_write_args *uap)
+{
+
+ return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 1);
+}
+
+int
aio_write(struct thread *td, struct aio_write_args *uap)
{
- return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
+ return aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, 0);
+}
+
+/* syscall - list directed I/O (REALTIME) */
+int
+olio_listio(struct thread *td, struct olio_listio_args *uap)
+{
+ return do_lio_listio(td, (struct lio_listio_args *)uap, 1);
}
/* syscall - list directed I/O (REALTIME) */
int
lio_listio(struct thread *td, struct lio_listio_args *uap)
{
+ return do_lio_listio(td, uap, 0);
+}
+
+static int
+do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev)
+{
struct proc *p = td->td_proc;
- int nent, nentqueued;
struct aiocb *iocb, * const *cbptr;
- struct aiocblist *cb;
struct kaioinfo *ki;
- struct aio_liojob *lj;
- int error, runningcode;
+ struct aioliojob *lj;
+ struct kevent kev;
+ int nent;
+ int error;
int nerror;
int i;
- int s;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
@@ -1945,224 +1982,175 @@
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
- if ((nent + num_queue_count) > max_queue_count)
- return (EAGAIN);
-
ki = p->p_aioinfo;
- if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
- return (EAGAIN);
lj = uma_zalloc(aiolio_zone, M_WAITOK);
- if (!lj)
- return (EAGAIN);
-
lj->lioj_flags = 0;
- lj->lioj_buffer_count = 0;
- lj->lioj_buffer_finished_count = 0;
- lj->lioj_queue_count = 0;
- lj->lioj_queue_finished_count = 0;
+ lj->lioj_count = 0;
+ lj->lioj_finished_count = 0;
+ knlist_init(&lj->klist, AIO_MTX(ki), NULL, NULL, NULL);
+ ksiginfo_init(&lj->lioj_ksi);
/*
* Setup signal.
*/
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ bzero(&lj->lioj_signal, sizeof(&lj->lioj_signal));
+ bzero(&lj->lioj_signal, sizeof(lj->lioj_signal));
error = copyin(uap->sig, &lj->lioj_signal,
- sizeof(lj->lioj_signal));
+ oldsigev ? sizeof(struct osigevent) :
+ sizeof(struct sigevent));
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
- if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ /* Assume only new style KEVENT */
+ kev.filter = EVFILT_LIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.ident = (uintptr_t)uap->acb_list; /* something unique */
+ kev.data = (intptr_t)lj;
+ /* pass user defined sigval data */
+ kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
+ error = kqfd_register(
+ lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
+ if (error) {
+ uma_zfree(aiolio_zone, lj);
+ return (error);
+ }
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
+ ;
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
+ if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ lj->lioj_flags |= LIOJ_SIGNAL;
+ } else {
uma_zfree(aiolio_zone, lj);
- return (EINVAL);
+ return EINVAL;
}
- lj->lioj_flags |= LIOJ_SIGNAL;
}
+
+ AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
/*
+ * Add extra aiocb count to avoid the lio to be freed
+ * by other threads doing aio_waitcomplete or aio_return,
+ * and prevent event from being sent until we have queued
+ * all tasks.
+ */
+ lj->lioj_count = 1;
+ AIO_UNLOCK(ki);
+
+ /*
* Get pointers to the list of I/O requests.
*/
nerror = 0;
- nentqueued = 0;
cbptr = uap->acb_list;
for (i = 0; i < uap->nent; i++) {
iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
- error = _aio_aqueue(td, iocb, lj, 0);
- if (error == 0)
- nentqueued++;
- else
+ error = aio_aqueue(td, iocb, lj, LIO_NOP, oldsigev);
+ if (error != 0)
nerror++;
}
}
- /*
- * If we haven't queued any, then just return error.
- */
- if (nentqueued == 0)
- return (0);
-
- /*
- * Calculate the appropriate error return.
- */
- runningcode = 0;
- if (nerror)
- runningcode = EIO;
-
+ error = 0;
+ AIO_LOCK(ki);
if (uap->mode == LIO_WAIT) {
- int command, found;
- long jobref;
-
- for (;;) {
- found = 0;
- for (i = 0; i < uap->nent; i++) {
- /*
- * Fetch address of the control buf pointer in
- * user space.
- */
- iocb = (struct aiocb *)
- (intptr_t)fuword(&cbptr[i]);
- if (((intptr_t)iocb == -1) || ((intptr_t)iocb
- == 0))
- continue;
-
- /*
- * Fetch the associated command from user space.
- */
- command = fuword(&iocb->aio_lio_opcode);
- if (command == LIO_NOP) {
- found++;
- continue;
- }
-
- jobref =
- fuword(&iocb->_aiocb_private.kernelinfo);
-
- TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
- == jobref) {
- if (cb->uaiocb.aio_lio_opcode
- == LIO_WRITE) {
- p->p_stats->p_ru.ru_oublock
- +=
- cb->outputcharge;
- cb->outputcharge = 0;
- } else if (cb->uaiocb.aio_lio_opcode
- == LIO_READ) {
- p->p_stats->p_ru.ru_inblock
- += cb->inputcharge;
- cb->inputcharge = 0;
- }
- found++;
- break;
- }
- }
-
- s = splbio();
- TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
- if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
- == jobref) {
- found++;
- break;
- }
- }
- splx(s);
- }
-
- /*
- * If all I/Os have been disposed of, then we can
- * return.
- */
- if (found == nentqueued)
- return (runningcode);
-
+ while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
- error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
-
- if (error == EINTR)
- return (EINTR);
- else if (error == EWOULDBLOCK)
- return (EAGAIN);
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki),
+ PRIBIO | PCATCH, "aiospn", 0);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+ } else {
+ if (lj->lioj_count - 1 == lj->lioj_finished_count) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(p, &lj->lioj_signal,
+ &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
}
}
+ lj->lioj_count--;
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ AIO_UNLOCK(ki);
+ uma_zfree(aiolio_zone, lj);
+ } else
+ AIO_UNLOCK(ki);
- return (runningcode);
+ if (nerror)
+ return (EIO);
+ return (error);
}
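
[Aside, not part of the diff: do_lio_listio() holds an extra lioj_count reference while queuing and, for LIO_WAIT, sleeps until lioj_finished_count catches up. A minimal userland sketch of the LIO_WAIT mode; the path and offsets are illustrative and error handling is trimmed.]

    #include <aio.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
    	static char b1[512], b2[512];
    	struct aiocb cb1, cb2;
    	struct aiocb *list[2];
    	int fd;

    	fd = open("/etc/motd", O_RDONLY);	/* illustrative path */
    	if (fd == -1)
    		exit(1);
    	memset(&cb1, 0, sizeof(cb1));
    	memset(&cb2, 0, sizeof(cb2));
    	cb1.aio_fildes = cb2.aio_fildes = fd;
    	cb1.aio_lio_opcode = cb2.aio_lio_opcode = LIO_READ;
    	cb1.aio_buf = b1;  cb1.aio_nbytes = sizeof(b1);  cb1.aio_offset = 0;
    	cb2.aio_buf = b2;  cb2.aio_nbytes = sizeof(b2);  cb2.aio_offset = 512;
    	list[0] = &cb1;
    	list[1] = &cb2;

    	/* LIO_WAIT returns only after every listed request has finished. */
    	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
    		exit(1);
    	printf("%zd and %zd bytes\n", aio_return(&cb1), aio_return(&cb2));
    	return (0);
    }
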
/*
- * Interrupt handler for physio, performs the necessary process wakeups, and
- * signals.
+ * Called from interrupt thread for physio, we should return as fast
+ * as possible, so we schedule a biohelper task.
*/
static void
aio_physwakeup(struct buf *bp)
{
struct aiocblist *aiocbe;
- struct proc *p;
- struct kaioinfo *ki;
- struct aio_liojob *lj;
-
- mtx_lock(&Giant);
- bp->b_flags |= B_DONE;
- wakeup(bp);
aiocbe = (struct aiocblist *)bp->b_caller1;
- if (aiocbe) {
- p = aiocbe->userproc;
-
- aiocbe->jobstate = JOBST_JOBBFINISHED;
- aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
- aiocbe->uaiocb._aiocb_private.error = 0;
- aiocbe->jobflags |= AIOCBLIST_DONE;
-
- if (bp->b_ioflags & BIO_ERROR)
- aiocbe->uaiocb._aiocb_private.error = bp->b_error;
-
- lj = aiocbe->lio;
- if (lj) {
- lj->lioj_buffer_finished_count++;
+ taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
+}
- /*
- * wakeup/signal if all of the interrupt jobs are done.
- */
- if (lj->lioj_buffer_finished_count ==
- lj->lioj_buffer_count &&
- lj->lioj_queue_finished_count ==
- lj->lioj_queue_count) {
- /*
- * Post a signal if it is called for.
- */
- if ((lj->lioj_flags &
- (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
- LIOJ_SIGNAL) {
- PROC_LOCK(p);
- psignal(p, lj->lioj_signal.sigev_signo);
- PROC_UNLOCK(p);
- lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
- }
- }
- }
+/*
+ * Task routine to perform heavy tasks, process wakeup, and signals.
+ */
+static void
+biohelper(void *context, int pending)
+{
+ struct aiocblist *aiocbe = context;
+ struct buf *bp;
+ struct proc *userp;
+ struct kaioinfo *ki;
+ int nblks;
- ki = p->p_aioinfo;
- if (ki) {
- ki->kaio_buffer_finished_count++;
- TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
- TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
-
- KNOTE_UNLOCKED(&aiocbe->klist, 0);
- /* Do the wakeup. */
- if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
- ki->kaio_flags &= ~KAIO_WAKEUP;
- wakeup(p);
- }
- }
+ bp = aiocbe->bp;
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+ AIO_LOCK(ki);
+ aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.error = 0;
+ if (bp->b_ioflags & BIO_ERROR)
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ nblks = btodb(aiocbe->uaiocb.aio_nbytes);
+ if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
+ aiocbe->outputcharge += nblks;
+ else
+ aiocbe->inputcharge += nblks;
+ aiocbe->bp = NULL;
+ TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
+ ki->kaio_buffer_count--;
+ aio_bio_done_notify(userp, aiocbe, DONE_BUF);
+ AIO_UNLOCK(ki);
- if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
- PROC_LOCK(p);
- psignal(p, aiocbe->uaiocb.aio_sigevent.sigev_signo);
- PROC_UNLOCK(p);
- }
- }
- mtx_unlock(&Giant);
+ /* Release mapping into kernel space. */
+ vunmapbuf(bp);
+ relpbuf(bp, NULL);
+ atomic_subtract_int(&num_buf_aio, 1);
}
/* syscall - wait for the next completion of an aio request */
@@ -2173,10 +2161,11 @@
struct timeval atv;
struct timespec ts;
struct kaioinfo *ki;
- struct aiocblist *cb = NULL;
- int error, s, timo;
+ struct aiocblist *cb;
+ struct aiocb *uuaiocb;
+ int error, status, timo;
- suword(uap->aiocbp, (int)NULL);
+ suword(uap->aiocbp, (long)NULL);
timo = 0;
if (uap->timeout) {
@@ -2194,54 +2183,59 @@
timo = tvtohz(&atv);
}
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
ki = p->p_aioinfo;
- if (ki == NULL)
- return (EAGAIN);
- for (;;) {
- PROC_LOCK(p);
- if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
- PROC_UNLOCK(p);
- suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
- td->td_retval[0] = cb->uaiocb._aiocb_private.status;
- if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
- p->p_stats->p_ru.ru_oublock +=
- cb->outputcharge;
- cb->outputcharge = 0;
- } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
- p->p_stats->p_ru.ru_inblock += cb->inputcharge;
- cb->inputcharge = 0;
- }
- error = cb->uaiocb._aiocb_private.error;
- aio_free_entry(cb);
- return (error);
- }
+ error = 0;
+ cb = NULL;
+ AIO_LOCK(ki);
+ while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiowc", timo);
+ if (timo && error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
- s = splbio();
- if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
- PROC_UNLOCK(p);
- splx(s);
- suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
- error = cb->uaiocb._aiocb_private.error;
- td->td_retval[0] = cb->uaiocb._aiocb_private.status;
- aio_free_entry(cb);
- return (error);
+ if (cb != NULL) {
+ MPASS(cb->jobstate == JOBST_JOBFINISHED);
+ uuaiocb = cb->uuaiocb;
+ status = cb->uaiocb._aiocb_private.status;
+ error = cb->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ td->td_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ td->td_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
}
+ aio_free_entry(cb);
+ AIO_UNLOCK(ki);
+ suword(uap->aiocbp, (long)uuaiocb);
+ suword(&uuaiocb->_aiocb_private.error, error);
+ suword(&uuaiocb->_aiocb_private.status, status);
+ } else
+ AIO_UNLOCK(ki);
- ki->kaio_flags |= KAIO_WAKEUP;
- error = msleep(p, &p->p_mtx, PDROP | PRIBIO | PCATCH, "aiowc",
- timo);
- splx(s);
+ return (error);
+}
- if (error == ERESTART)
- return (EINTR);
- else if (error < 0)
- return (error);
- else if (error == EINTR)
- return (EINTR);
- else if (error == EWOULDBLOCK)
- return (EAGAIN);
- }
+int
+aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+
+ if (uap->op != O_SYNC) /* XXX lack of O_DSYNC */
+ return (EINVAL);
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ aio_init_aioinfo(p);
+ return aio_aqueue(td, uap->aiocbp, NULL, LIO_SYNC, 0);
}
/* kqueue attach function */
@@ -2257,6 +2251,7 @@
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
+ kn->kn_ptr.p_aio = aiocbe;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&aiocbe->klist, kn, 0);
@@ -2268,9 +2263,10 @@
static void
filt_aiodetach(struct knote *kn)
{
- struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
+ struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
- knlist_remove(&aiocbe->klist, kn, 0);
+ if (!knlist_empty(&aiocbe->klist))
+ knlist_remove(&aiocbe->klist, kn, 0);
}
/* kqueue filter function */
@@ -2278,12 +2274,52 @@
static int
filt_aio(struct knote *kn, long hint)
{
- struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
+ struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
- if (aiocbe->jobstate != JOBST_JOBFINISHED &&
- aiocbe->jobstate != JOBST_JOBBFINISHED)
+ if (aiocbe->jobstate != JOBST_JOBFINISHED)
return (0);
kn->kn_flags |= EV_EOF;
return (1);
}
+
+/* kqueue attach function */
+static int
+filt_lioattach(struct knote *kn)
+{
+ struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
+
+ /*
+ * The aioliojob pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_lio = lj;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&lj->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_liodetach(struct knote *kn)
+{
+ struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+ if (!knlist_empty(&lj->klist))
+ knlist_remove(&lj->klist, kn, 0);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_lio(struct knote *kn, long hint)
+{
+ struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+ return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
+}
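
Because both new filters refuse registration unless EV_FLAG1 is set, userland never attaches them directly; it requests them through a SIGEV_KEVENT sigevent passed to the aio calls. A hedged userland sketch of waiting for an lio_listio(2) batch via EVFILT_LIO (file name and sizes are arbitrary, error handling abbreviated):

#include <sys/types.h>
#include <sys/event.h>
#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        struct aiocb cb, *list[1];
        struct sigevent sev;
        struct kevent ev;
        char buf[512];
        int fd, kq;

        fd = open("/tmp/somefile", O_RDONLY);
        kq = kqueue();

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_buf = buf;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_lio_opcode = LIO_READ;
        list[0] = &cb;

        /* Ask for a kevent when the whole list has completed. */
        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_KEVENT;
        sev.sigev_notify_kqueue = kq;

        if (lio_listio(LIO_NOWAIT, list, 1, &sev) == -1)
                err(1, "lio_listio");

        /* The kernel attaches the EVFILT_LIO knote itself (EV_FLAG1). */
        if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
                printf("list done, aio_return = %zd\n", aio_return(&cb));

        close(kq);
        close(fd);
        return (0);
}

An EVFILT_AIO event for a single request is obtained the same way, with the sigevent placed in that aiocb's aio_sigevent field.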
Index: kern_malloc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_malloc.c -L sys/kern/kern_malloc.c -u -r1.2 -r1.3
--- sys/kern/kern_malloc.c
+++ sys/kern/kern_malloc.c
@@ -1,7 +1,7 @@
/*-
* Copyright (c) 1987, 1991, 1993
* The Regents of the University of California.
- * Copyright (c) 2005 Robert N. M. Watson
+ * Copyright (c) 2005-2006 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -31,8 +31,19 @@
* @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
*/
+/*
+ * Kernel malloc(9) implementation -- general purpose kernel memory allocator
+ * based on memory types. Back end is implemented using the UMA(9) zone
+ * allocator. A set of fixed-size buckets are used for smaller allocations,
+ * and a special UMA allocation interface is used for larger allocations.
+ * Callers declare memory types, and statistics are maintained independently
+ * for each memory type. Statistics are maintained per-CPU for performance
+ * reasons. See malloc(9) and comments in malloc.h for a detailed
+ * description.
+ */
+
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_malloc.c,v 1.142.2.7 2006/01/17 10:19:37 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_malloc.c,v 1.162 2007/06/27 13:39:38 rwatson Exp $");
#include "opt_ddb.h"
#include "opt_vm.h"
@@ -65,6 +76,9 @@
#ifdef DEBUG_MEMGUARD
#include <vm/memguard.h>
#endif
+#ifdef DEBUG_REDZONE
+#include <vm/redzone.h>
+#endif
#if defined(INVARIANTS) && defined(__i386__)
#include <machine/cpu.h>
@@ -82,6 +96,9 @@
#define REALLOC_FRACTION 1 /* new block if <= half the size */
#endif
+/*
+ * Centrally define some common malloc types.
+ */
MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
@@ -95,8 +112,8 @@
static MALLOC_DEFINE(M_FREE, "free", "should be on free list");
static struct malloc_type *kmemstatistics;
-static char *kmembase;
-static char *kmemlimit;
+static vm_offset_t kmembase;
+static vm_offset_t kmemlimit;
static int kmemcount;
#define KMEM_ZSHIFT 4
@@ -107,7 +124,14 @@
#define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT)
static u_int8_t kmemsize[KMEM_ZSIZE + 1];
-/* These won't be powers of two for long */
+/*
+ * Small malloc(9) memory allocations are allocated from a set of UMA buckets
+ * of various sizes.
+ *
+ * XXX: The comment here used to read "These won't be powers of two for
+ * long." It's possible that a significant amount of wasted memory could be
+ * recovered by tuning the sizes of these buckets.
+ */
struct {
int kz_size;
char *kz_name;
@@ -140,18 +164,24 @@
{0, NULL},
};
+/*
+ * Zone to allocate malloc type descriptions from. For ABI reasons, memory
+ * types are described by a data structure passed by the declaring code, but
+ * the malloc(9) implementation has its own data structure describing the
+ * type and statistics. This permits the malloc(9)-internal data structures
+ * to be modified without breaking binary-compiled kernel modules that
+ * declare malloc types.
+ */
static uma_zone_t mt_zone;
-#ifdef DEBUG_MEMGUARD
-u_int vm_memguard_divisor;
-SYSCTL_UINT(_vm, OID_AUTO, memguard_divisor, CTLFLAG_RD, &vm_memguard_divisor,
- 0, "(kmem_size/memguard_divisor) == memguard submap size");
-#endif
-
u_int vm_kmem_size;
SYSCTL_UINT(_vm, OID_AUTO, kmem_size, CTLFLAG_RD, &vm_kmem_size, 0,
"Size of kernel memory");
+u_int vm_kmem_size_min;
+SYSCTL_UINT(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RD, &vm_kmem_size_min, 0,
+ "Minimum size of kernel memory");
+
u_int vm_kmem_size_max;
SYSCTL_UINT(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RD, &vm_kmem_size_max, 0,
"Maximum size of kernel memory");
@@ -163,7 +193,6 @@
/*
* The malloc_mtx protects the kmemstatistics linked list.
*/
-
struct mtx malloc_mtx;
#ifdef MALLOC_PROFILE
@@ -172,17 +201,18 @@
static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
#endif
-static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
-/* time_uptime of last malloc(9) failure */
+/*
+ * time_uptime of the last malloc(9) failure (induced or real).
+ */
static time_t t_malloc_fail;
-#ifdef MALLOC_MAKE_FAILURES
/*
- * Causes malloc failures every (n) mallocs with M_NOWAIT. If set to 0,
- * doesn't cause failures.
+ * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
+ * the caller specifies M_NOWAIT. If set to 0, no failures are caused.
*/
+#ifdef MALLOC_MAKE_FAILURES
SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
"Kernel malloc debugging options");
@@ -204,7 +234,10 @@
}
/*
- * Add this to the informational malloc_type bucket.
+ * An allocation has succeeded -- update malloc type statistics for the
+ * amount of bucket size. Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
*/
static void
malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
@@ -234,7 +267,10 @@
}
/*
- * Remove this allocation from the informational malloc_type bucket.
+ * A free operation has occurred -- update malloc type statistics for the
+ * amount of the bucket size. Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
*/
void
malloc_type_freed(struct malloc_type *mtp, unsigned long size)
@@ -265,7 +301,7 @@
caddr_t va;
uma_zone_t zone;
uma_keg_t keg;
-#ifdef DIAGNOSTIC
+#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
unsigned long osize = size;
#endif
@@ -285,10 +321,6 @@
}
}
#endif
-#if 0
- if (size == 0)
- kdb_enter("zero size malloc");
-#endif
#ifdef MALLOC_MAKE_FAILURES
if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
atomic_add_int(&malloc_nowait_count, 1);
@@ -304,11 +336,14 @@
("malloc(M_WAITOK) in interrupt context"));
#ifdef DEBUG_MEMGUARD
- /* XXX CHANGEME! */
- if (mtp == M_SUBPROC)
+ if (memguard_cmp(mtp))
return memguard_alloc(size, flags);
#endif
+#ifdef DEBUG_REDZONE
+ size = redzone_size_ntor(size);
+#endif
+
if (size <= KMEM_ZMAX) {
if (size & KMEM_ZMASK)
size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
@@ -338,6 +373,10 @@
memset(va, 0x70, osize);
}
#endif
+#ifdef DEBUG_REDZONE
+ if (va != NULL)
+ va = redzone_setup(va, osize);
+#endif
return ((void *) va);
}
@@ -359,13 +398,17 @@
return;
#ifdef DEBUG_MEMGUARD
- /* XXX CHANGEME! */
- if (mtp == M_SUBPROC) {
+ if (memguard_cmp(mtp)) {
memguard_free(addr);
return;
}
#endif
+#ifdef DEBUG_REDZONE
+ redzone_check(addr);
+ addr = redzone_addr_ntor(addr);
+#endif
+
size = 0;
slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
@@ -423,13 +466,16 @@
*/
#ifdef DEBUG_MEMGUARD
-/* XXX: CHANGEME! */
-if (mtp == M_SUBPROC) {
+if (memguard_cmp(mtp)) {
slab = NULL;
alloc = size;
} else {
#endif
+#ifdef DEBUG_REDZONE
+ slab = NULL;
+ alloc = redzone_get_size(addr);
+#else
slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
/* Sanity check */
@@ -446,6 +492,7 @@
if (size <= alloc
&& (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
return (addr);
+#endif /* !DEBUG_REDZONE */
#ifdef DEBUG_MEMGUARD
}
@@ -510,6 +557,14 @@
(mem_size / vm_kmem_size_scale) > (vm_kmem_size / PAGE_SIZE))
vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
+#if defined(VM_KMEM_SIZE_MIN)
+ vm_kmem_size_min = VM_KMEM_SIZE_MIN;
+#endif
+ TUNABLE_INT_FETCH("vm.kmem_size_min", &vm_kmem_size_min);
+ if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) {
+ vm_kmem_size = vm_kmem_size_min;
+ }
+
#if defined(VM_KMEM_SIZE_MAX)
vm_kmem_size_max = VM_KMEM_SIZE_MAX;
#endif
@@ -538,8 +593,8 @@
*/
init_param3(vm_kmem_size / PAGE_SIZE);
- kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
- (vm_offset_t *)&kmemlimit, vm_kmem_size);
+ kmem_map = kmem_suballoc(kernel_map, &kmembase, &kmemlimit,
+ vm_kmem_size);
kmem_map->system_map = 1;
#ifdef DEBUG_MEMGUARD
@@ -549,7 +604,7 @@
* scenarios as they occur. It is only used for debugging.
*/
vm_memguard_divisor = 10;
- TUNABLE_INT_FETCH("vm.memguard_divisor", &vm_memguard_divisor);
+ TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor);
/* Pick a conservative value if provided value sucks. */
if ((vm_memguard_divisor <= 0) ||
@@ -647,113 +702,23 @@
temp_allocs, temp_bytes);
}
- slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
+ slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
uma_zfree_arg(mt_zone, mtip, slab);
}
-static int
-sysctl_kern_malloc(SYSCTL_HANDLER_ARGS)
+struct malloc_type *
+malloc_desc2type(const char *desc)
{
- struct malloc_type_stats mts_local, *mtsp;
- struct malloc_type_internal *mtip;
struct malloc_type *mtp;
- struct sbuf sbuf;
- long temp_allocs, temp_bytes;
- int linesize = 128;
- int bufsize;
- int first;
- int error;
- char *buf;
- int cnt;
- int i;
-
- cnt = 0;
-
- /* Guess at how much room is needed. */
- mtx_lock(&malloc_mtx);
- cnt = kmemcount;
- mtx_unlock(&malloc_mtx);
-
- bufsize = linesize * (cnt + 1);
- buf = malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
- sbuf_new(&sbuf, buf, bufsize, SBUF_FIXEDLEN);
-
- mtx_lock(&malloc_mtx);
- sbuf_printf(&sbuf,
- "\n Type InUse MemUse HighUse Requests Size(s)\n");
- for (mtp = kmemstatistics; cnt != 0 && mtp != NULL;
- mtp = mtp->ks_next, cnt--) {
- mtip = mtp->ks_handle;
- bzero(&mts_local, sizeof(mts_local));
- for (i = 0; i < MAXCPU; i++) {
- mtsp = &mtip->mti_stats[i];
- mts_local.mts_memalloced += mtsp->mts_memalloced;
- mts_local.mts_memfreed += mtsp->mts_memfreed;
- mts_local.mts_numallocs += mtsp->mts_numallocs;
- mts_local.mts_numfrees += mtsp->mts_numfrees;
- mts_local.mts_size |= mtsp->mts_size;
- }
- if (mts_local.mts_numallocs == 0)
- continue;
-
- /*
- * Due to races in per-CPU statistics gather, it's possible to
- * get a slightly negative number here. If we do, approximate
- * with 0.
- */
- if (mts_local.mts_numallocs > mts_local.mts_numfrees)
- temp_allocs = mts_local.mts_numallocs -
- mts_local.mts_numfrees;
- else
- temp_allocs = 0;
- /*
- * Ditto for bytes allocated.
- */
- if (mts_local.mts_memalloced > mts_local.mts_memfreed)
- temp_bytes = mts_local.mts_memalloced -
- mts_local.mts_memfreed;
- else
- temp_bytes = 0;
-
- /*
- * High-waterwark is no longer easily available, so we just
- * print '-' for that column.
- */
- sbuf_printf(&sbuf, "%13s%6lu%6luK -%9llu",
- mtp->ks_shortdesc,
- temp_allocs,
- (temp_bytes + 1023) / 1024,
- (unsigned long long)mts_local.mts_numallocs);
-
- first = 1;
- for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1;
- i++) {
- if (mts_local.mts_size & (1 << i)) {
- if (first)
- sbuf_printf(&sbuf, " ");
- else
- sbuf_printf(&sbuf, ",");
- sbuf_printf(&sbuf, "%s",
- kmemzones[i].kz_name);
- first = 0;
- }
- }
- sbuf_printf(&sbuf, "\n");
+ mtx_assert(&malloc_mtx, MA_OWNED);
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ if (strcmp(mtp->ks_shortdesc, desc) == 0)
+ return (mtp);
}
- sbuf_finish(&sbuf);
- mtx_unlock(&malloc_mtx);
-
- error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
-
- sbuf_delete(&sbuf);
- free(buf, M_TEMP);
- return (error);
+ return (NULL);
}
-SYSCTL_OID(_kern, OID_AUTO, malloc, CTLTYPE_STRING|CTLFLAG_RD,
- NULL, 0, sysctl_kern_malloc, "A", "Malloc Stats");
-
static int
sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
{
@@ -845,20 +810,26 @@
struct malloc_type_internal *mtip;
struct malloc_type *mtp;
u_int64_t allocs, frees;
+ u_int64_t alloced, freed;
int i;
- db_printf("%18s %12s %12s %12s\n", "Type", "Allocs", "Frees",
- "Used");
+ db_printf("%18s %12s %12s %12s\n", "Type", "InUse", "MemUse",
+ "Requests");
for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
mtip = (struct malloc_type_internal *)mtp->ks_handle;
allocs = 0;
frees = 0;
+ alloced = 0;
+ freed = 0;
for (i = 0; i < MAXCPU; i++) {
allocs += mtip->mti_stats[i].mts_numallocs;
frees += mtip->mti_stats[i].mts_numfrees;
+ alloced += mtip->mti_stats[i].mts_memalloced;
+ freed += mtip->mti_stats[i].mts_memfreed;
}
- db_printf("%18s %12ju %12ju %12ju\n", mtp->ks_shortdesc,
- allocs, frees, allocs - frees);
+ db_printf("%18s %12ju %12juK %12ju\n",
+ mtp->ks_shortdesc, allocs - frees,
+ (alloced - freed + 1023) / 1024, allocs);
}
}
#endif
Index: vfs_init.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_init.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vfs_init.c -L sys/kern/vfs_init.c -u -r1.1.1.1 -r1.2
--- sys/kern/vfs_init.c
+++ sys/kern/vfs_init.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_init.c,v 1.81 2005/02/20 23:02:20 das Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_init.c,v 1.85 2007/02/16 17:32:41 pjd Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -43,6 +43,7 @@
#include <sys/linker.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
@@ -108,30 +109,21 @@
vfs_byname_kld(const char *fstype, struct thread *td, int *error)
{
struct vfsconf *vfsp;
- linker_file_t lf;
+ int fileid;
vfsp = vfs_byname(fstype);
if (vfsp != NULL)
return (vfsp);
- /* Only load modules for root (very important!). */
- *error = suser(td);
+ /* Try to load the respective module. */
+ *error = kern_kldload(td, fstype, &fileid);
if (*error)
return (NULL);
- *error = securelevel_gt(td->td_ucred, 0);
- if (*error)
- return (NULL);
- *error = linker_load_module(NULL, fstype, NULL, NULL, &lf);
- if (lf == NULL)
- *error = ENODEV;
- if (*error)
- return (NULL);
- lf->userrefs++;
+
/* Look up again to see if the VFS was loaded. */
vfsp = vfs_byname(fstype);
if (vfsp == NULL) {
- lf->userrefs--;
- linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ (void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE);
*error = ENODEV;
return (NULL);
}
@@ -223,9 +215,6 @@
if (vfsops->vfs_checkexp == NULL)
/* check if file system is exported */
vfsops->vfs_checkexp = vfs_stdcheckexp;
- if (vfsops->vfs_vptofh == NULL)
- /* turn a vnode into an NFS file handle */
- vfsops->vfs_vptofh = vfs_stdvptofh;
if (vfsops->vfs_init == NULL)
/* file system specific initialisation */
vfsops->vfs_init = vfs_stdinit;
Index: subr_bus.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_bus.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_bus.c -L sys/kern/subr_bus.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_bus.c
+++ sys/kern/subr_bus.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_bus.c,v 1.184.2.1 2005/10/06 23:15:18 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_bus.c,v 1.201.4.1 2008/02/06 03:35:40 iwasaki Exp $");
#include "opt_bus.h"
@@ -50,6 +50,7 @@
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <machine/stdarg.h>
@@ -417,7 +418,7 @@
* userland in realtime. We are required to free the data as well as
* the n1 object because we allocate them separately. Also note that
* we return one record at a time. If you try to read this device a
- * character at a time, you will loose the rest of the data. Listening
+ * character at a time, you will lose the rest of the data. Listening
* programs are expected to cope.
*/
static int
@@ -498,6 +499,15 @@
}
/**
+ * @brief Return whether the userland process is running
+ */
+boolean_t
+devctl_process_running(void)
+{
+ return (devsoftc.async_proc != NULL);
+}
+
+/**
* @brief Queue data to be read from the devctl device
*
* Generic interface to queue data to the devctl device. It is
@@ -781,8 +791,18 @@
bus_data_generation_update();
}
- if (parentname && dc && !dc->parent) {
- dc->parent = devclass_find_internal(parentname, 0, FALSE);
+
+ /*
+ * If a parent class is specified, then set that as our parent so
+ * that this devclass will support drivers for the parent class as
+ * well. If the parent class has the same name don't do this though
+ * as it creates a cycle that can trigger an infinite loop in
+ * device_probe_child() if a device exists for which there is no
+ * suitable driver.
+ */
+ if (parentname && dc && !dc->parent &&
+ strcmp(classname, parentname) != 0) {
+ dc->parent = devclass_find_internal(parentname, NULL, FALSE);
}
return (dc);
@@ -799,7 +819,7 @@
devclass_t
devclass_create(const char *classname)
{
- return (devclass_find_internal(classname, 0, TRUE));
+ return (devclass_find_internal(classname, NULL, TRUE));
}
/**
@@ -813,7 +833,7 @@
devclass_t
devclass_find(const char *classname)
{
- return (devclass_find_internal(classname, 0, FALSE));
+ return (devclass_find_internal(classname, NULL, FALSE));
}
/**
@@ -850,7 +870,7 @@
/*
* Make sure the devclass which the driver is implementing exists.
*/
- devclass_find_internal(driver->name, 0, TRUE);
+ devclass_find_internal(driver->name, NULL, TRUE);
dl->driver = driver;
TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
@@ -1427,7 +1447,7 @@
PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
if (name) {
- dc = devclass_find_internal(name, 0, TRUE);
+ dc = devclass_find_internal(name, NULL, TRUE);
if (!dc) {
printf("make_device: can't find device class %s\n",
name);
@@ -1677,11 +1697,11 @@
/**
* @internal
*/
-static int
+int
device_probe_child(device_t dev, device_t child)
{
devclass_t dc;
- driverlink_t best = 0;
+ driverlink_t best = NULL;
driverlink_t dl;
int result, pri = 0;
int hasclass = (child->devclass != 0);
@@ -1717,7 +1737,7 @@
/* Reset flags and devclass before the next probe. */
child->devflags = 0;
if (!hasclass)
- device_set_devclass(child, 0);
+ device_set_devclass(child, NULL);
/*
* If the driver returns SUCCESS, there can be
@@ -1734,7 +1754,7 @@
* certainly doesn't match.
*/
if (result > 0) {
- device_set_driver(child, 0);
+ device_set_driver(child, NULL);
continue;
}
@@ -1743,7 +1763,7 @@
* best matching driver. Initialise the value
* of pri for the first match.
*/
- if (best == 0 || result > pri) {
+ if (best == NULL || result > pri) {
best = dl;
pri = result;
continue;
@@ -2230,7 +2250,7 @@
return (EINVAL);
}
- dc = devclass_find_internal(classname, 0, TRUE);
+ dc = devclass_find_internal(classname, NULL, TRUE);
if (!dc)
return (ENOMEM);
@@ -2260,7 +2280,7 @@
free(dev->softc, M_BUS_SC);
dev->softc = NULL;
}
- kobj_delete((kobj_t) dev, 0);
+ kobj_delete((kobj_t) dev, NULL);
dev->driver = driver;
if (driver) {
kobj_init((kobj_t) dev, (kobj_class_t) driver);
@@ -2268,7 +2288,7 @@
dev->softc = malloc(driver->size, M_BUS_SC,
M_NOWAIT | M_ZERO);
if (!dev->softc) {
- kobj_delete((kobj_t) dev, 0);
+ kobj_delete((kobj_t) dev, NULL);
kobj_init((kobj_t) dev, &null_class);
dev->driver = NULL;
return (ENOMEM);
@@ -2369,8 +2389,8 @@
printf("device_attach: %s%d attach returned %d\n",
dev->driver->name, dev->unit, error);
/* Unset the class; set in device_probe_child */
- if (dev->devclass == 0)
- device_set_devclass(dev, 0);
+ if (dev->devclass == NULL)
+ device_set_devclass(dev, NULL);
device_set_driver(dev, NULL);
device_sysctl_fini(dev);
dev->state = DS_NOTPRESENT;
@@ -2681,7 +2701,7 @@
resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
{
- struct resource_list_entry *rle = 0;
+ struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
int isdefault = (start == 0UL && end == ~0UL);
@@ -2740,7 +2760,7 @@
resource_list_release(struct resource_list *rl, device_t bus, device_t child,
int type, int rid, struct resource *res)
{
- struct resource_list_entry *rle = 0;
+ struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
int error;
@@ -2820,7 +2840,7 @@
{
struct resource_list_entry *rle;
- STAILQ_FOREACH(rle, rl, link) {
+ while ((rle = STAILQ_FIRST(rl)) != NULL) {
if (rle->res)
bus_release_resource(rman_get_device(rle->res),
rle->type, rle->rid, rle->res);
@@ -2829,6 +2849,13 @@
}
}
+device_t
+bus_generic_add_child(device_t dev, int order, const char *name, int unit)
+{
+
+ return (device_add_child_ordered(dev, order, name, unit));
+}
+
/**
* @brief Helper function for implementing DEVICE_PROBE()
*
@@ -3078,12 +3105,13 @@
*/
int
bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
- int flags, driver_intr_t *intr, void *arg, void **cookiep)
+ int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg,
+ void **cookiep)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
- intr, arg, cookiep));
+ filter, intr, arg, cookiep));
return (EINVAL);
}
@@ -3189,6 +3217,22 @@
}
/**
+ * @brief Helper function for implementing BUS_GET_DMA_TAG().
+ *
+ * This simple implementation of BUS_GET_DMA_TAG() simply calls the
+ * BUS_GET_DMA_TAG() method of the parent of @p dev.
+ */
+bus_dma_tag_t
+bus_generic_get_dma_tag(device_t dev, device_t child)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent != NULL)
+ return (BUS_GET_DMA_TAG(dev->parent, child));
+ return (NULL);
+}
+
+/**
* @brief Helper function for implementing BUS_GET_RESOURCE().
*
* This implementation of BUS_GET_RESOURCE() uses the
@@ -3325,6 +3369,39 @@
* to maintain some sort of a list of resources allocated by each device.
*/
+int
+bus_alloc_resources(device_t dev, struct resource_spec *rs,
+ struct resource **res)
+{
+ int i;
+
+ for (i = 0; rs[i].type != -1; i++)
+ res[i] = NULL;
+ for (i = 0; rs[i].type != -1; i++) {
+ res[i] = bus_alloc_resource_any(dev,
+ rs[i].type, &rs[i].rid, rs[i].flags);
+ if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) {
+ bus_release_resources(dev, rs, res);
+ return (ENXIO);
+ }
+ }
+ return (0);
+}
+
+void
+bus_release_resources(device_t dev, const struct resource_spec *rs,
+ struct resource **res)
+{
+ int i;
+
+ for (i = 0; rs[i].type != -1; i++)
+ if (res[i] != NULL) {
+ bus_release_resource(
+ dev, rs[i].type, rs[i].rid, res[i]);
+ res[i] = NULL;
+ }
+}
+
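
The new bus_alloc_resources()/bus_release_resources() helpers are table-driven: a driver supplies a resource_spec array terminated by a type of -1, and either every non-optional entry is allocated or everything is released and ENXIO returned. A hedged sketch of typical driver usage (all mydev_* names are hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/bus.h>
#include <machine/resource.h>

/* One memory window and one IRQ; the -1 entry terminates the table. */
static struct resource_spec mydev_res_spec[] = {
        { SYS_RES_MEMORY,       0,      RF_ACTIVE },
        { SYS_RES_IRQ,          0,      RF_ACTIVE | RF_SHAREABLE },
        { -1, 0, 0 }
};

struct mydev_softc {
        struct resource *res[2];        /* filled in table order */
};

static int
mydev_attach(device_t dev)
{
        struct mydev_softc *sc = device_get_softc(dev);

        /* Either everything is allocated, or ENXIO and nothing is held. */
        if (bus_alloc_resources(dev, mydev_res_spec, sc->res) != 0)
                return (ENXIO);
        return (0);
}

static int
mydev_detach(device_t dev)
{
        struct mydev_softc *sc = device_get_softc(dev);

        bus_release_resources(dev, mydev_res_spec, sc->res);
        return (0);
}

Entries flagged RF_OPTIONAL may legitimately come back NULL without failing the whole allocation.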
/**
* @brief Wrapper function for BUS_ALLOC_RESOURCE().
*
@@ -3335,8 +3412,8 @@
bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
u_long count, u_int flags)
{
- if (dev->parent == 0)
- return (0);
+ if (dev->parent == NULL)
+ return (NULL);
return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
count, flags));
}
@@ -3350,7 +3427,7 @@
int
bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
{
- if (dev->parent == 0)
+ if (dev->parent == NULL)
return (EINVAL);
return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
}
@@ -3364,7 +3441,7 @@
int
bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
{
- if (dev->parent == 0)
+ if (dev->parent == NULL)
return (EINVAL);
return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
}
@@ -3378,7 +3455,7 @@
int
bus_release_resource(device_t dev, int type, int rid, struct resource *r)
{
- if (dev->parent == 0)
+ if (dev->parent == NULL)
return (EINVAL);
return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
}
@@ -3391,23 +3468,25 @@
*/
int
bus_setup_intr(device_t dev, struct resource *r, int flags,
- driver_intr_t handler, void *arg, void **cookiep)
+ driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep)
{
int error;
- if (dev->parent != 0) {
- if ((flags &~ INTR_ENTROPY) == (INTR_TYPE_NET | INTR_MPSAFE) &&
- !debug_mpsafenet)
- flags &= ~INTR_MPSAFE;
+ if (dev->parent != NULL) {
error = BUS_SETUP_INTR(dev->parent, dev, r, flags,
- handler, arg, cookiep);
+ filter, handler, arg, cookiep);
if (error == 0) {
- if (!(flags & (INTR_MPSAFE | INTR_FAST)))
+ if (handler != NULL && !(flags & INTR_MPSAFE))
device_printf(dev, "[GIANT-LOCKED]\n");
if (bootverbose && (flags & INTR_MPSAFE))
device_printf(dev, "[MPSAFE]\n");
- if (flags & INTR_FAST)
- device_printf(dev, "[FAST]\n");
+ if (filter != NULL) {
+ if (handler == NULL)
+ device_printf(dev, "[FILTER]\n");
+ else
+ device_printf(dev, "[FILTER+ITHREAD]\n");
+ } else
+ device_printf(dev, "[ITHREAD]\n");
}
} else
error = EINVAL;
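
With this change bus_setup_intr() takes both a filter and a handler; a driver passes NULL for whichever it does not provide, and the boot messages report [FILTER], [ITHREAD], or [FILTER+ITHREAD] accordingly. A hedged sketch of registering a plain MPSAFE ithread handler under the new signature (mydev_* names are hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>

struct mydev_softc {
        struct resource *irq_res;       /* allocated elsewhere as SYS_RES_IRQ */
        void            *intr_cookie;
};

/* Runs in an interrupt thread; regular mutexes may be acquired here. */
static void
mydev_intr(void *arg)
{
        struct mydev_softc *sc = arg;

        (void)sc;               /* acknowledge and process hardware events */
}

static int
mydev_setup_irq(device_t dev, struct mydev_softc *sc)
{
        /* No filter (NULL); only an MPSAFE ithread handler. */
        return (bus_setup_intr(dev, sc->irq_res,
            INTR_TYPE_MISC | INTR_MPSAFE,
            NULL, mydev_intr, sc, &sc->intr_cookie));
}

A driver that also supplies a filter gets it run in primary interrupt context, with the handler (if any) scheduled only when the filter returns FILTER_SCHEDULE_THREAD.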
@@ -3423,7 +3502,7 @@
int
bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
{
- if (dev->parent == 0)
+ if (dev->parent == NULL)
return (EINVAL);
return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
}
@@ -3556,6 +3635,35 @@
return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
}
+/**
+ * @brief Wrapper function for BUS_GET_DMA_TAG().
+ *
+ * This function simply calls the BUS_GET_DMA_TAG() method of the
+ * parent of @p dev.
+ */
+bus_dma_tag_t
+bus_get_dma_tag(device_t dev)
+{
+ device_t parent;
+
+ parent = device_get_parent(dev);
+ if (parent == NULL)
+ return (NULL);
+ return (BUS_GET_DMA_TAG(parent, dev));
+}
+
+/* Resume all devices and then notify userland that we're up again. */
+static int
+root_resume(device_t dev)
+{
+ int error;
+
+ error = bus_generic_resume(dev);
+ if (error == 0)
+ devctl_notify("kern", "power", "resume", NULL);
+ return (error);
+}
+
static int
root_print_child(device_t dev, device_t child)
{
@@ -3594,7 +3702,7 @@
/* Device interface */
KOBJMETHOD(device_shutdown, bus_generic_shutdown),
KOBJMETHOD(device_suspend, bus_generic_suspend),
- KOBJMETHOD(device_resume, bus_generic_resume),
+ KOBJMETHOD(device_resume, root_resume),
/* Bus interface */
KOBJMETHOD(bus_print_child, root_print_child),
@@ -3627,7 +3735,7 @@
kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
root_bus->driver = &root_driver;
root_bus->state = DS_ATTACHED;
- root_devclass = devclass_find_internal("root", 0, FALSE);
+ root_devclass = devclass_find_internal("root", NULL, FALSE);
devinit();
return (0);
@@ -3683,7 +3791,7 @@
kobj_class_t driver;
dmd = (struct driver_module_data *)arg;
- bus_devclass = devclass_find_internal(dmd->dmd_busname, 0, TRUE);
+ bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE);
error = 0;
switch (what) {
@@ -3713,7 +3821,7 @@
parentname, TRUE);
} else {
*dmd->dmd_devclass =
- devclass_find_internal(driver->name, 0, TRUE);
+ devclass_find_internal(driver->name, NULL, TRUE);
}
break;
@@ -3745,6 +3853,40 @@
return (error);
}
+/**
+ * @brief Enumerate all hinted devices for this bus.
+ *
+ * Walks through the hints for this bus and calls the bus_hinted_child
+ * routine for each one it finds. It searches first for the specific
+ * bus that's being probed for hinted children (e.g. isa0), and then for
+ * generic children (e.g. isa).
+ *
+ * @param bus the bus device to enumerate
+ */
+void
+bus_enumerate_hinted_children(device_t bus)
+{
+ int i;
+ const char *dname, *busname;
+ int dunit;
+
+ /*
+ * enumerate all devices on the specific bus
+ */
+ busname = device_get_nameunit(bus);
+ i = 0;
+ while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+ BUS_HINTED_CHILD(bus, dname, dunit);
+
+ /*
+ * and all the generic ones.
+ */
+ busname = device_get_name(bus);
+ i = 0;
+ while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+ BUS_HINTED_CHILD(bus, dname, dunit);
+}
+
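
bus_enumerate_hinted_children() only walks the hint namespace; creating the child devices is left to the bus driver's BUS_HINTED_CHILD method. A hedged sketch of such a method (the mybus/maddr/irq hint names are illustrative only), pulling per-device values with resource_long_value(9):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>

static void
mybus_hinted_child(device_t bus, const char *dname, int dunit)
{
        device_t child;
        long maddr, irq;

        /* Create the child named by the hint, e.g. hint.foo.0.at="mybus0". */
        child = BUS_ADD_CHILD(bus, 0, dname, dunit);
        if (child == NULL)
                return;

        /* Optional per-device hints such as hint.foo.0.maddr / hint.foo.0.irq. */
        if (resource_long_value(dname, dunit, "maddr", &maddr) == 0)
                bus_set_resource(child, SYS_RES_MEMORY, 0, maddr, 0x1000);
        if (resource_long_value(dname, dunit, "irq", &irq) == 0)
                bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1);
}

The method would typically be wired into the bus driver's method table with KOBJMETHOD(bus_hinted_child, mybus_hinted_child).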
#ifdef BUS_DEBUG
/* the _short versions avoid iteration by not calling anything that prints
Index: vnode_if.src
===================================================================
RCS file: /home/cvs/src/sys/kern/vnode_if.src,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vnode_if.src -L sys/kern/vnode_if.src -u -r1.1.1.1 -r1.2
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -27,16 +27,16 @@
# SUCH DAMAGE.
#
# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
-# $FreeBSD: src/sys/kern/vnode_if.src,v 1.78 2005/06/09 20:20:29 ssouhlal Exp $
+# $FreeBSD: src/sys/kern/vnode_if.src,v 1.87 2007/05/31 11:51:52 kib Exp $
#
#
-# Above each of the vop descriptors is a specification of the locking
-# protocol used by each vop call. The first column is the name of
-# the variable, the remaining three columns are in, out and error
-# respectively. The "in" column defines the lock state on input,
-# the "out" column defines the state on succesful return, and the
-# "error" column defines the locking state on error exit.
+# Above each of the vop descriptors in lines starting with %%
+# is a specification of the locking protocol used by each vop call.
+# The first column is the name of the variable, the remaining three
+# columns are in, out and error respectively. The "in" column defines
+# the lock state on input, the "out" column defines the state on successful
+# return, and the "error" column defines the locking state on error exit.
#
# The locking value can take the following values:
# L: locked; not converted to type of lock.
@@ -52,51 +52,48 @@
# The parameter named "vpp" is assumed to be always used with double
# indirection (**vpp) and that name is hard-coded in vnode_if.awk!
#
+# Lines starting with %! specify a pre or post-condition function
+# to call before/after the vop call.
+#
# If other such parameters are introduced, they have to be added to
# the AWK script at the head of the definition of "add_debug_code()".
#
-#
-# islocked vp = = =
-#
vop_islocked {
IN struct vnode *vp;
IN struct thread *td;
};
-#
-# lookup dvp L ? ?
-# lookup vpp - L -
-#! lookup pre vop_lookup_pre
-#! lookup post vop_lookup_post
-#
+%% lookup dvp L ? ?
+%% lookup vpp - L -
+%! lookup pre vop_lookup_pre
+%! lookup post vop_lookup_post
+
# XXX - the lookup locking protocol defies simple description and depends
# on the flags and operation fields in the (cnp) structure. Note
# especially that *vpp may equal dvp and both may be locked.
-#
+
vop_lookup {
IN struct vnode *dvp;
INOUT struct vnode **vpp;
IN struct componentname *cnp;
};
-#
-#% cachedlookup dvp L ? ?
-#% cachedlookup vpp - L -
-#
+%% cachedlookup dvp L ? ?
+%% cachedlookup vpp - L -
+
# This must be an exact copy of lookup. See kern/vfs_cache.c for details.
-#
+
vop_cachedlookup {
IN struct vnode *dvp;
INOUT struct vnode **vpp;
IN struct componentname *cnp;
};
-#
-#% create dvp E E E
-#% create vpp - L -
-#! create post vop_create_post
-#
+%% create dvp E E E
+%% create vpp - L -
+%! create post vop_create_post
+
vop_create {
IN struct vnode *dvp;
OUT struct vnode **vpp;
@@ -104,20 +101,20 @@
IN struct vattr *vap;
};
-#
-#% whiteout dvp E E E
-#
+
+%% whiteout dvp E E E
+
vop_whiteout {
IN struct vnode *dvp;
IN struct componentname *cnp;
IN int flags;
};
-#
-#% mknod dvp E E E
-#% mknod vpp - L -
-#! mknod post vop_mknod_post
-#
+
+%% mknod dvp E E E
+%% mknod vpp - L -
+%! mknod post vop_mknod_post
+
vop_mknod {
IN struct vnode *dvp;
OUT struct vnode **vpp;
@@ -125,20 +122,20 @@
IN struct vattr *vap;
};
-#
-#% open vp L L L
-#
+
+%% open vp L L L
+
vop_open {
IN struct vnode *vp;
IN int mode;
IN struct ucred *cred;
IN struct thread *td;
- IN int fdidx;
+ IN struct file *fp;
};
-#
-#% close vp E E E
-#
+
+%% close vp E E E
+
vop_close {
IN struct vnode *vp;
IN int fflag;
@@ -146,9 +143,9 @@
IN struct thread *td;
};
-#
-#% access vp L L L
-#
+
+%% access vp L L L
+
vop_access {
IN struct vnode *vp;
IN int mode;
@@ -156,9 +153,9 @@
IN struct thread *td;
};
-#
-#% getattr vp L L L
-#
+
+%% getattr vp L L L
+
vop_getattr {
IN struct vnode *vp;
OUT struct vattr *vap;
@@ -166,10 +163,10 @@
IN struct thread *td;
};
-#
-#% setattr vp E E E
-#! setattr post vop_setattr_post
-#
+
+%% setattr vp E E E
+%! setattr post vop_setattr_post
+
vop_setattr {
IN struct vnode *vp;
IN struct vattr *vap;
@@ -177,9 +174,9 @@
IN struct thread *td;
};
-#
-#% read vp L L L
-#
+
+%% read vp L L L
+
vop_read {
IN struct vnode *vp;
INOUT struct uio *uio;
@@ -187,11 +184,11 @@
IN struct ucred *cred;
};
-#
-#% write vp E E E
-#! write pre VOP_WRITE_PRE
-#! write post VOP_WRITE_POST
-#
+
+%% write vp E E E
+%! write pre VOP_WRITE_PRE
+%! write post VOP_WRITE_POST
+
vop_write {
IN struct vnode *vp;
INOUT struct uio *uio;
@@ -199,9 +196,9 @@
IN struct ucred *cred;
};
-#
-#% lease vp = = =
-#
+
+%% lease vp = = =
+
vop_lease {
IN struct vnode *vp;
IN struct thread *td;
@@ -209,21 +206,21 @@
IN int flag;
};
-#
-#% ioctl vp U U U
-#
+
+%% ioctl vp U U U
+
vop_ioctl {
IN struct vnode *vp;
IN u_long command;
- IN caddr_t data;
+ IN void *data;
IN int fflag;
IN struct ucred *cred;
IN struct thread *td;
};
-#
-#% poll vp U U U
-#
+
+%% poll vp U U U
+
vop_poll {
IN struct vnode *vp;
IN int events;
@@ -231,61 +228,57 @@
IN struct thread *td;
};
-#
-#% kqfilter vp U U U
-#
+
+%% kqfilter vp U U U
+
vop_kqfilter {
IN struct vnode *vp;
IN struct knote *kn;
};
-#
-#% revoke vp L L L
-#
+
+%% revoke vp L L L
+
vop_revoke {
IN struct vnode *vp;
IN int flags;
};
-#
-#% fsync vp E E E
-#
+
+%% fsync vp E E E
+
vop_fsync {
IN struct vnode *vp;
IN int waitfor;
IN struct thread *td;
};
-#
-#% remove dvp E E E
-#% remove vp E E E
-#! remove post vop_remove_post
-#
+
+%% remove dvp E E E
+%% remove vp E E E
+%! remove post vop_remove_post
+
vop_remove {
IN struct vnode *dvp;
IN struct vnode *vp;
IN struct componentname *cnp;
};
-#
-#% link tdvp E E E
-#% link vp E E E
-#! link post vop_link_post
-#
+
+%% link tdvp E E E
+%% link vp E E E
+%! link post vop_link_post
+
vop_link {
IN struct vnode *tdvp;
IN struct vnode *vp;
IN struct componentname *cnp;
};
-#
-# rename fdvp U U U
-# rename fvp U U U
-# rename tdvp E U U
-# rename tvp X U U
-#! rename pre vop_rename_pre
-#! rename post vop_rename_post
-#
+
+%! rename pre vop_rename_pre
+%! rename post vop_rename_post
+
vop_rename {
IN WILLRELE struct vnode *fdvp;
IN WILLRELE struct vnode *fvp;
@@ -295,11 +288,11 @@
IN struct componentname *tcnp;
};
-#
-#% mkdir dvp E E E
-#% mkdir vpp - E -
-#! mkdir post vop_mkdir_post
-#
+
+%% mkdir dvp E E E
+%% mkdir vpp - E -
+%! mkdir post vop_mkdir_post
+
vop_mkdir {
IN struct vnode *dvp;
OUT struct vnode **vpp;
@@ -307,22 +300,22 @@
IN struct vattr *vap;
};
-#
-#% rmdir dvp E E E
-#% rmdir vp E E E
-#! rmdir post vop_rmdir_post
-#
+
+%% rmdir dvp E E E
+%% rmdir vp E E E
+%! rmdir post vop_rmdir_post
+
vop_rmdir {
IN struct vnode *dvp;
IN struct vnode *vp;
IN struct componentname *cnp;
};
-#
-#% symlink dvp E E E
-#% symlink vpp - E -
-#! symlink post vop_symlink_post
-#
+
+%% symlink dvp E E E
+%% symlink vpp - E -
+%! symlink post vop_symlink_post
+
vop_symlink {
IN struct vnode *dvp;
OUT struct vnode **vpp;
@@ -331,9 +324,9 @@
IN char *target;
};
-#
-#% readdir vp L L L
-#
+
+%% readdir vp L L L
+
vop_readdir {
IN struct vnode *vp;
INOUT struct uio *uio;
@@ -343,56 +336,56 @@
INOUT u_long **cookies;
};
-#
-#% readlink vp L L L
-#
+
+%% readlink vp L L L
+
vop_readlink {
IN struct vnode *vp;
INOUT struct uio *uio;
IN struct ucred *cred;
};
-#
-#% inactive vp E E E
-#
+
+%% inactive vp E E E
+
vop_inactive {
IN struct vnode *vp;
IN struct thread *td;
};
-#
-#% reclaim vp E E E
-#
+
+%% reclaim vp E E E
+
vop_reclaim {
IN struct vnode *vp;
IN struct thread *td;
};
-#
-#lock vp ? ? ?
-#! lock pre vop_lock_pre
-#! lock post vop_lock_post
-#
-vop_lock {
+
+%! lock1 pre vop_lock_pre
+%! lock1 post vop_lock_post
+
+vop_lock1 {
IN struct vnode *vp;
IN int flags;
IN struct thread *td;
+ IN char *file;
+ IN int line;
};
-#
-#unlock vp L ? L
-#! unlock pre vop_unlock_pre
-#! unlock post vop_unlock_post
-#
+
+%! unlock pre vop_unlock_pre
+%! unlock post vop_unlock_post
+
vop_unlock {
IN struct vnode *vp;
IN int flags;
IN struct thread *td;
};
-#
-#% bmap vp L L L
-#
+
+%% bmap vp L L L
+
vop_bmap {
IN struct vnode *vp;
IN daddr_t bn;
@@ -402,61 +395,61 @@
OUT int *runb;
};
-#
-# strategy vp L L L
-#! strategy pre vop_strategy_pre
-#
+
+%% strategy vp L L L
+%! strategy pre vop_strategy_pre
+
vop_strategy {
IN struct vnode *vp;
IN struct buf *bp;
};
-#
-#% getwritemount vp = = =
-#
+
+%% getwritemount vp = = =
+
vop_getwritemount {
IN struct vnode *vp;
OUT struct mount **mpp;
};
-#
-#% print vp = = =
-#
+
+%% print vp = = =
+
vop_print {
IN struct vnode *vp;
};
-#
-#% pathconf vp L L L
-#
+
+%% pathconf vp L L L
+
vop_pathconf {
IN struct vnode *vp;
IN int name;
OUT register_t *retval;
};
-#
-#% advlock vp U U U
-#
+
+%% advlock vp U U U
+
vop_advlock {
IN struct vnode *vp;
- IN caddr_t id;
+ IN void *id;
IN int op;
IN struct flock *fl;
IN int flags;
};
-#
-#% reallocblks vp E E E
-#
+
+%% reallocblks vp E E E
+
vop_reallocblks {
IN struct vnode *vp;
IN struct cluster_save *buflist;
};
-#
-#% getpages vp L L L
-#
+
+%% getpages vp L L L
+
vop_getpages {
IN struct vnode *vp;
IN vm_page_t *m;
@@ -465,9 +458,9 @@
IN vm_ooffset_t offset;
};
-#
-#% putpages vp E E E
-#
+
+%% putpages vp E E E
+
vop_putpages {
IN struct vnode *vp;
IN vm_page_t *m;
@@ -477,9 +470,9 @@
IN vm_ooffset_t offset;
};
-#
-#% getacl vp L L L
-#
+
+%% getacl vp L L L
+
vop_getacl {
IN struct vnode *vp;
IN acl_type_t type;
@@ -488,9 +481,9 @@
IN struct thread *td;
};
-#
-#% setacl vp E E E
-#
+
+%% setacl vp E E E
+
vop_setacl {
IN struct vnode *vp;
IN acl_type_t type;
@@ -499,9 +492,9 @@
IN struct thread *td;
};
-#
-#% aclcheck vp = = =
-#
+
+%% aclcheck vp = = =
+
vop_aclcheck {
IN struct vnode *vp;
IN acl_type_t type;
@@ -510,9 +503,9 @@
IN struct thread *td;
};
-#
-#% closeextattr vp L L L
-#
+
+%% closeextattr vp L L L
+
vop_closeextattr {
IN struct vnode *vp;
IN int commit;
@@ -520,9 +513,9 @@
IN struct thread *td;
};
-#
-#% getextattr vp L L L
-#
+
+%% getextattr vp L L L
+
vop_getextattr {
IN struct vnode *vp;
IN int attrnamespace;
@@ -533,9 +526,9 @@
IN struct thread *td;
};
-#
-#% listextattr vp L L L
-#
+
+%% listextattr vp L L L
+
vop_listextattr {
IN struct vnode *vp;
IN int attrnamespace;
@@ -545,18 +538,18 @@
IN struct thread *td;
};
-#
-#% openextattr vp L L L
-#
+
+%% openextattr vp L L L
+
vop_openextattr {
IN struct vnode *vp;
IN struct ucred *cred;
IN struct thread *td;
};
-#
-#% deleteextattr vp E E E
-#
+
+%% deleteextattr vp E E E
+
vop_deleteextattr {
IN struct vnode *vp;
IN int attrnamespace;
@@ -565,9 +558,9 @@
IN struct thread *td;
};
-#
-#% setextattr vp E E E
-#
+
+%% setextattr vp E E E
+
vop_setextattr {
IN struct vnode *vp;
IN int attrnamespace;
@@ -577,12 +570,20 @@
IN struct thread *td;
};
-#
-#% setlabel vp E E E
-#
+
+%% setlabel vp E E E
+
vop_setlabel {
IN struct vnode *vp;
IN struct label *label;
IN struct ucred *cred;
IN struct thread *td;
};
+
+
+%% vptofh vp = = =
+
+vop_vptofh {
+ IN struct vnode *vp;
+ IN struct fid *fhp;
+};
Index: kern_idle.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_idle.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_idle.c -L sys/kern/kern_idle.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_idle.c
+++ sys/kern/kern_idle.c
@@ -24,7 +24,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_idle.c,v 1.43 2005/04/04 21:53:54 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_idle.c,v 1.48 2007/06/05 00:00:53 jeff Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -43,8 +43,6 @@
static void idle_setup(void *dummy);
SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL)
-static void idle_proc(void *dummy);
-
/*
* Set up per-cpu idle process contexts. The AP's shouldn't be running or
* accessing their idle processes at this point, so don't bother with
@@ -62,11 +60,11 @@
#ifdef SMP
SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
- error = kthread_create(idle_proc, NULL, &p,
+ error = kthread_create(sched_idletd, NULL, &p,
RFSTOPPED | RFHIGHPID, 0, "idle: cpu%d", pc->pc_cpuid);
pc->pc_idlethread = FIRST_THREAD_IN_PROC(p);
#else
- error = kthread_create(idle_proc, NULL, &p,
+ error = kthread_create(sched_idletd, NULL, &p,
RFSTOPPED | RFHIGHPID, 0, "idle");
PCPU_SET(idlethread, FIRST_THREAD_IN_PROC(p));
#endif
@@ -75,53 +73,15 @@
PROC_LOCK(p);
p->p_flag |= P_NOLOAD;
- mtx_lock_spin(&sched_lock);
td = FIRST_THREAD_IN_PROC(p);
+ thread_lock(td);
TD_SET_CAN_RUN(td);
td->td_flags |= TDF_IDLETD;
- sched_class(td->td_ksegrp, PRI_IDLE);
+ sched_class(td, PRI_IDLE);
sched_prio(td, PRI_MAX_IDLE);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
PROC_UNLOCK(p);
#ifdef SMP
}
#endif
}
-
-/*
- * The actual idle process.
- */
-static void
-idle_proc(void *dummy)
-{
- struct proc *p;
- struct thread *td;
-#ifdef SMP
- cpumask_t mycpu;
-#endif
-
- td = curthread;
- p = td->td_proc;
-#ifdef SMP
- mycpu = PCPU_GET(cpumask);
- mtx_lock_spin(&sched_lock);
- idle_cpus_mask |= mycpu;
- mtx_unlock_spin(&sched_lock);
-#endif
- for (;;) {
- mtx_assert(&Giant, MA_NOTOWNED);
-
- while (sched_runnable() == 0)
- cpu_idle();
-
- mtx_lock_spin(&sched_lock);
-#ifdef SMP
- idle_cpus_mask &= ~mycpu;
-#endif
- mi_switch(SW_VOL, NULL);
-#ifdef SMP
- idle_cpus_mask |= mycpu;
-#endif
- mtx_unlock_spin(&sched_lock);
- }
-}
Index: uipc_domain.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_domain.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/uipc_domain.c -L sys/kern/uipc_domain.c -u -r1.2 -r1.3
--- sys/kern/uipc_domain.c
+++ sys/kern/uipc_domain.c
@@ -30,12 +30,13 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_domain.c,v 1.44.2.2 2006/03/01 20:58:36 andre Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_domain.c,v 1.51 2007/08/06 14:26:00 rwatson Exp $");
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>
+#include <sys/eventhandler.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
@@ -79,14 +80,12 @@
* All functions return EOPNOTSUPP.
*/
struct pr_usrreqs nousrreqs = {
- .pru_abort = pru_abort_notsupp,
.pru_accept = pru_accept_notsupp,
.pru_attach = pru_attach_notsupp,
.pru_bind = pru_bind_notsupp,
.pru_connect = pru_connect_notsupp,
.pru_connect2 = pru_connect2_notsupp,
.pru_control = pru_control_notsupp,
- .pru_detach = pru_detach_notsupp,
.pru_disconnect = pru_disconnect_notsupp,
.pru_listen = pru_listen_notsupp,
.pru_peeraddr = pru_peeraddr_notsupp,
@@ -99,7 +98,6 @@
.pru_sosend = pru_sosend_notsupp,
.pru_soreceive = pru_soreceive_notsupp,
.pru_sopoll = pru_sopoll_notsupp,
- .pru_sosetlabel = pru_sosetlabel_null
};
static void
@@ -121,10 +119,9 @@
DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp);
DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp);
DEFAULT(pu->pru_sense, pru_sense_null);
- DEFAULT(pu->pru_sosend, sosend);
- DEFAULT(pu->pru_soreceive, soreceive);
- DEFAULT(pu->pru_sopoll, sopoll);
- DEFAULT(pu->pru_sosetlabel, pru_sosetlabel_null);
+ DEFAULT(pu->pru_sosend, sosend_generic);
+ DEFAULT(pu->pru_soreceive, soreceive_generic);
+ DEFAULT(pu->pru_sopoll, sopoll_generic);
#undef DEFAULT
if (pr->pr_init)
(*pr->pr_init)();
@@ -181,39 +178,41 @@
("attempt to net_add_domain(%s) after domainfinalize()",
dp->dom_name));
#else
-#ifdef DIAGNOSTIC
if (domain_init_status >= 2)
printf("WARNING: attempt to net_add_domain(%s) after "
"domainfinalize()\n", dp->dom_name);
#endif
-#endif
mtx_unlock(&dom_mtx);
net_init_domain(dp);
}
+static void
+socket_zone_change(void *tag)
+{
+
+ uma_zone_set_max(socket_zone, maxsockets);
+}
+
/* ARGSUSED*/
static void
domaininit(void *dummy)
{
+
/*
* Before we do any setup, make sure to initialize the
* zone allocator we get struct sockets from.
*/
-
socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(socket_zone, maxsockets);
+ EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
+ EVENTHANDLER_PRI_FIRST);
if (max_linkhdr < 16) /* XXX */
max_linkhdr = 16;
- if (debug_mpsafenet) {
- callout_init(&pffast_callout, CALLOUT_MPSAFE);
- callout_init(&pfslow_callout, CALLOUT_MPSAFE);
- } else {
- callout_init(&pffast_callout, 0);
- callout_init(&pfslow_callout, 0);
- }
+ callout_init(&pffast_callout, CALLOUT_MPSAFE);
+ callout_init(&pfslow_callout, CALLOUT_MPSAFE);
mtx_lock(&dom_mtx);
KASSERT(domain_init_status == 0, ("domaininit called too late!"));
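
socket_zone_change() is registered through the generic EVENTHANDLER(9) machinery, so the socket zone limit now tracks later changes to maxsockets instead of being fixed at boot. A minimal hedged sketch of the declare/register/invoke pattern with a purely hypothetical event (mylimit_change and the mymod_* names are illustrative):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>

/* Declare an event that passes the new limit to each subscriber. */
typedef void (*mylimit_change_fn)(void *arg, int newlimit);
EVENTHANDLER_DECLARE(mylimit_change, mylimit_change_fn);

static void
mymod_limit_change(void *arg, int newlimit)
{
        printf("mymod: limit is now %d\n", newlimit);
}

/* e.g. called from the module's MOD_LOAD handler */
static void
mymod_init(void)
{
        EVENTHANDLER_REGISTER(mylimit_change, mymod_limit_change, NULL,
            EVENTHANDLER_PRI_ANY);
}

/* Whoever changes the limit fires the event for all subscribers: */
static void
mylimit_set(int newlimit)
{
        EVENTHANDLER_INVOKE(mylimit_change, newlimit);
}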
@@ -225,6 +224,7 @@
static void
domainfinalize(void *dummy)
{
+
mtx_lock(&dom_mtx);
KASSERT(domain_init_status == 1, ("domainfinalize called too late!"));
domain_init_status = 2;
@@ -235,12 +235,10 @@
}
struct protosw *
-pffindtype(family, type)
- int family;
- int type;
+pffindtype(int family, int type)
{
- register struct domain *dp;
- register struct protosw *pr;
+ struct domain *dp;
+ struct protosw *pr;
for (dp = domains; dp; dp = dp->dom_next)
if (dp->dom_family == family)
@@ -254,13 +252,10 @@
}
struct protosw *
-pffindproto(family, protocol, type)
- int family;
- int protocol;
- int type;
+pffindproto(int family, int protocol, int type)
{
- register struct domain *dp;
- register struct protosw *pr;
+ struct domain *dp;
+ struct protosw *pr;
struct protosw *maybe = 0;
if (family == 0)
@@ -286,9 +281,7 @@
* accept requests before it is registered.
*/
int
-pf_proto_register(family, npr)
- int family;
- struct protosw *npr;
+pf_proto_register(int family, struct protosw *npr)
{
struct domain *dp;
struct protosw *pr, *fpr;
@@ -355,10 +348,7 @@
* all sockets and release all locks and memory references.
*/
int
-pf_proto_unregister(family, protocol, type)
- int family;
- int protocol;
- int type;
+pf_proto_unregister(int family, int protocol, int type)
{
struct domain *dp;
struct protosw *pr, *dpr;
@@ -423,12 +413,10 @@
}
void
-pfctlinput(cmd, sa)
- int cmd;
- struct sockaddr *sa;
+pfctlinput(int cmd, struct sockaddr *sa)
{
- register struct domain *dp;
- register struct protosw *pr;
+ struct domain *dp;
+ struct protosw *pr;
for (dp = domains; dp; dp = dp->dom_next)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
@@ -437,10 +425,7 @@
}
void
-pfctlinput2(cmd, sa, ctlparam)
- int cmd;
- struct sockaddr *sa;
- void *ctlparam;
+pfctlinput2(int cmd, struct sockaddr *sa, void *ctlparam)
{
struct domain *dp;
struct protosw *pr;
@@ -463,13 +448,10 @@
}
static void
-pfslowtimo(arg)
- void *arg;
+pfslowtimo(void *arg)
{
- register struct domain *dp;
- register struct protosw *pr;
-
- NET_ASSERT_GIANT();
+ struct domain *dp;
+ struct protosw *pr;
for (dp = domains; dp; dp = dp->dom_next)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
@@ -479,13 +461,10 @@
}
static void
-pffasttimo(arg)
- void *arg;
+pffasttimo(void *arg)
{
- register struct domain *dp;
- register struct protosw *pr;
-
- NET_ASSERT_GIANT();
+ struct domain *dp;
+ struct protosw *pr;
for (dp = domains; dp; dp = dp->dom_next)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
Index: init_main.c
===================================================================
RCS file: /home/cvs/src/sys/kern/init_main.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/init_main.c -L sys/kern/init_main.c -u -r1.3 -r1.4
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -39,12 +39,12 @@
* SUCH DAMAGE.
*
* @(#)init_main.c 8.9 (Berkeley) 1/21/94
- * $FreeBSD: src/sys/kern/init_main.c,v 1.256.2.2 2005/10/05 10:31:03 rwatson Exp $
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: src/sys/kern/init_main.c,v 1.283.2.2 2007/12/14 13:41:08 rrs Exp $");
+#include "opt_ddb.h"
#include "opt_init_path.h"
#include "opt_mac.h"
@@ -55,7 +55,6 @@
#include <sys/filedesc.h>
#include <sys/ktr.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
@@ -77,20 +76,25 @@
#include <machine/cpu.h>
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/copyright.h>
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+
void mi_startup(void); /* Should be elsewhere */
/* Components of the first process -- never freed. */
static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
-struct thread thread0 __aligned(8);
-struct ksegrp ksegrp0;
+struct thread thread0 __aligned(16);
struct vmspace vmspace0;
struct proc *initproc;
@@ -168,6 +172,11 @@
register struct sysinit **xipp; /* interior loop of sort*/
register struct sysinit *save; /* bubble*/
+#if defined(VERBOSE_SYSINIT)
+ int last;
+ int verbose;
+#endif
+
if (sysinit == NULL) {
sysinit = SET_BEGIN(sysinit_set);
sysinit_end = SET_LIMIT(sysinit_set);
@@ -190,6 +199,14 @@
}
}
+#if defined(VERBOSE_SYSINIT)
+ last = SI_SUB_COPYRIGHT;
+ verbose = 0;
+#if !defined(DDB)
+ printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
+#endif
+#endif
+
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
@@ -205,9 +222,38 @@
if ((*sipp)->subsystem == SI_SUB_DONE)
continue;
+#if defined(VERBOSE_SYSINIT)
+ if ((*sipp)->subsystem > last) {
+ verbose = 1;
+ last = (*sipp)->subsystem;
+ printf("subsystem %x\n", last);
+ }
+ if (verbose) {
+#if defined(DDB)
+ const char *name;
+ c_db_sym_t sym;
+ db_expr_t offset;
+
+ sym = db_search_symbol((vm_offset_t)(*sipp)->func,
+ DB_STGY_PROC, &offset);
+ db_symbol_values(sym, &name, NULL);
+ if (name != NULL)
+ printf(" %s(%p)... ", name, (*sipp)->udata);
+ else
+#endif
+ printf(" %p(%p)... ", (*sipp)->func,
+ (*sipp)->udata);
+ }
+#endif
+
/* Call function */
(*((*sipp)->func))((*sipp)->udata);
+#if defined(VERBOSE_SYSINIT)
+ if (verbose)
+ printf("done.\n");
+#endif
+
/* Check off the one we're just done */
(*sipp)->subsystem = SI_SUB_DONE;
@@ -242,19 +288,24 @@
printf("%s", (char *)data);
}
SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
-SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version)
+SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark)
+SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_caddr_t, version)
#ifdef WITNESS
static char wit_warn[] =
"WARNING: WITNESS option enabled, expect reduced performance.\n";
-SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 1,
+SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
+ print_caddr_t, wit_warn)
+SYSINIT(witwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 1,
print_caddr_t, wit_warn)
#endif
#ifdef DIAGNOSTIC
static char diag_warn[] =
"WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
-SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 2,
+SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
+ print_caddr_t, diag_warn)
+SYSINIT(diagwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 2,
print_caddr_t, diag_warn)
#endif
@@ -316,27 +367,26 @@
struct proc *p;
unsigned i;
struct thread *td;
- struct ksegrp *kg;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
- kg = &ksegrp0;
/*
- * Initialize magic number.
+ * Initialize magic number and osrel.
*/
p->p_magic = P_MAGIC;
+ p->p_osrel = osreldate;
/*
- * Initialize thread, process and ksegrp structures.
+ * Initialize thread and process structures.
*/
procinit(); /* set up proc zone */
- threadinit(); /* set up thead, upcall and KSEGRP zones */
+ threadinit(); /* set up UMA zones */
/*
* Initialise scheduler resources.
- * Add scheduler specific parts to proc, ksegrp, thread as needed.
+ * Add scheduler specific parts to proc, thread as needed.
*/
schedinit(); /* scheduler gets its house in order */
/*
@@ -366,17 +416,19 @@
session0.s_leader = p;
p->p_sysent = &null_sysvec;
- p->p_flag = P_SYSTEM;
- p->p_sflag = PS_INMEM;
+ p->p_flag = P_SYSTEM | P_INMEM;
p->p_state = PRS_NORMAL;
knlist_init(&p->p_klist, &p->p_mtx, NULL, NULL, NULL);
+ STAILQ_INIT(&p->p_ktr);
p->p_nice = NZERO;
td->td_state = TDS_RUNNING;
- kg->kg_pri_class = PRI_TIMESHARE;
- kg->kg_user_pri = PUSER;
+ td->td_pri_class = PRI_TIMESHARE;
+ td->td_user_pri = PUSER;
+ td->td_base_user_pri = PUSER;
td->td_priority = PVM;
td->td_base_pri = PUSER;
td->td_oncpu = 0;
+ td->td_flags = TDF_INMEM;
p->p_peers = 0;
p->p_leader = p;
@@ -384,6 +436,7 @@
bcopy("swapper", p->p_comm, sizeof ("swapper"));
callout_init(&p->p_itcallout, CALLOUT_MPSAFE);
+ callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
/* Create credentials. */
@@ -392,6 +445,9 @@
p->p_ucred->cr_uidinfo = uifind(0);
p->p_ucred->cr_ruidinfo = uifind(0);
p->p_ucred->cr_prison = NULL; /* Don't jail it. */
+#ifdef AUDIT
+ audit_cred_kproc0(p->p_ucred);
+#endif
#ifdef MAC
mac_create_proc0(p->p_ucred);
#endif
@@ -431,6 +487,15 @@
vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser,
p->p_sysent->sv_maxuser);
vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);
+ /*-
+ * call the init and ctor for the new thread and proc
+ * we wait to do this until all other structures
+ * are fairly sane.
+ */
+ EVENTHANDLER_INVOKE(process_init, p);
+ EVENTHANDLER_INVOKE(thread_init, td);
+ EVENTHANDLER_INVOKE(process_ctor, p);
+ EVENTHANDLER_INVOKE(thread_ctor, td);
/*
* Charge root for one process.
@@ -445,19 +510,25 @@
{
struct timespec ts;
struct proc *p;
+ struct rusage ru;
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
*/
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
microuptime(&p->p_stats->p_start);
- p->p_rux.rux_runtime.sec = 0;
- p->p_rux.rux_runtime.frac = 0;
+ PROC_SLOCK(p);
+ rufetch(p, &ru); /* Clears thread stats */
+ PROC_SUNLOCK(p);
+ p->p_rux.rux_runtime = 0;
+ p->p_rux.rux_uticks = 0;
+ p->p_rux.rux_sticks = 0;
+ p->p_rux.rux_iticks = 0;
}
sx_sunlock(&allproc_lock);
- binuptime(PCPU_PTR(switchtime));
+ PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
/*
@@ -649,19 +720,19 @@
/* divorce init's credentials from the kernel's */
newcred = crget();
PROC_LOCK(initproc);
- initproc->p_flag |= P_SYSTEM;
+ initproc->p_flag |= P_SYSTEM | P_INMEM;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_create_proc1(newcred);
#endif
+#ifdef AUDIT
+ audit_cred_proc1(newcred);
+#endif
initproc->p_ucred = newcred;
PROC_UNLOCK(initproc);
crfree(oldcred);
cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
- mtx_lock_spin(&sched_lock);
- initproc->p_sflag |= PS_INMEM;
- mtx_unlock_spin(&sched_lock);
cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
@@ -675,9 +746,9 @@
struct thread *td;
td = FIRST_THREAD_IN_PROC(initproc);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
TD_SET_CAN_RUN(td);
- setrunqueue(td, SRQ_BORING); /* XXXKSE */
- mtx_unlock_spin(&sched_lock);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
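The SI_SUB_COPYRIGHT hunks at the top of this init_main.c diff show the whole SYSINIT banner pattern: a static string handed to print_caddr_t, with the SI_ORDER_* constant deciding where it fires inside the subsystem. A minimal sketch of one more banner in the same style; the identifier, message, and the SI_ORDER_THIRD + 3 slot are invented purely for illustration:

/* Hypothetical example only: one more early console banner, modeled on the
 * wit_warn/diag_warn registrations above. */
static char example_warn[] =
"NOTICE: example banner, printed after the DIAGNOSTIC warning.\n";
SYSINIT(examplewarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 3,
    print_caddr_t, example_warn)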
Index: kern_ktrace.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_ktrace.c -L sys/kern/kern_ktrace.c -u -r1.2 -r1.3
--- sys/kern/kern_ktrace.c
+++ sys/kern/kern_ktrace.c
@@ -1,6 +1,8 @@
/*-
* Copyright (c) 1989, 1993
- * The Regents of the University of California. All rights reserved.
+ * The Regents of the University of California.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,7 +32,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_ktrace.c,v 1.101.2.3 2006/03/13 03:05:47 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_ktrace.c,v 1.121 2007/08/29 21:17:11 jhb Exp $");
#include "opt_ktrace.h"
#include "opt_mac.h"
@@ -42,10 +44,10 @@
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
@@ -55,6 +57,27 @@
#include <sys/syslog.h>
#include <sys/sysproto.h>
+#include <security/mac/mac_framework.h>
+
+/*
+ * The ktrace facility allows the tracing of certain key events in user space
+ * processes, such as system calls, signal delivery, context switches, and
+ * user generated events using utrace(2). It works by streaming event
+ * records and data to a vnode associated with the process using the
+ * ktrace(2) system call. In general, records can be written directly from
+ * the context that generates the event. One important exception to this is
+ * during a context switch, where sleeping is not permitted. To handle this
+ * case, trace events are generated using in-kernel ktr_request records, and
+ * then delivered to disk at a convenient moment -- either immediately, the
+ * next traceable event, at system call return, or at process exit.
+ *
+ * When dealing with multiple threads or processes writing to the same event
+ * log, ordering guarantees are weak: specifically, if an event has multiple
+ * records (i.e., system call enter and return), they may be interlaced with
+ * records from another event. Process and thread ID information is provided
+ * in the record, and user applications can de-interlace events if required.
+ */
+
static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
#ifdef KTRACE
@@ -66,8 +89,6 @@
struct ktr_request {
struct ktr_header ktr_header;
void *ktr_buffer;
- struct ucred *ktr_cred;
- struct vnode *ktr_vp;
union {
struct ktr_syscall ktr_syscall;
struct ktr_sysret ktr_sysret;
@@ -89,7 +110,6 @@
0 /* KTR_USER */
};
-static STAILQ_HEAD(, ktr_request) ktr_todo;
static STAILQ_HEAD(, ktr_request) ktr_free;
static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
@@ -104,20 +124,48 @@
static int print_message = 1;
struct mtx ktrace_mtx;
-static struct cv ktrace_cv;
+static struct sx ktrace_sx;
static void ktrace_init(void *dummy);
static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
static u_int ktrace_resize_pool(u_int newsize);
static struct ktr_request *ktr_getrequest(int type);
-static void ktr_submitrequest(struct ktr_request *req);
+static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
static void ktr_freerequest(struct ktr_request *req);
-static void ktr_loop(void *dummy);
-static void ktr_writerequest(struct ktr_request *req);
+static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
+/*
+ * ktrace itself generates events, such as context switches, which we do not
+ * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
+ * whether or not it is in a region where tracing of events should be
+ * suppressed.
+ */
+static void
+ktrace_enter(struct thread *td)
+{
+
+ KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
+ td->td_pflags |= TDP_INKTRACE;
+}
+
+static void
+ktrace_exit(struct thread *td)
+{
+
+ KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
+ td->td_pflags &= ~TDP_INKTRACE;
+}
+
+static void
+ktrace_assert(struct thread *td)
+{
+
+ KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
+}
+
static void
ktrace_init(void *dummy)
{
@@ -125,14 +173,12 @@
int i;
mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
- cv_init(&ktrace_cv, "ktrace");
- STAILQ_INIT(&ktr_todo);
+ sx_init(&ktrace_sx, "ktrace_sx");
STAILQ_INIT(&ktr_free);
for (i = 0; i < ktr_requestpool; i++) {
req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
- kthread_create(ktr_loop, NULL, NULL, RFHIGHPID, 0, "ktrace");
}
SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
@@ -155,12 +201,12 @@
if (error)
return (error);
td = curthread;
- td->td_pflags |= TDP_INKTRACE;
+ ktrace_enter(td);
mtx_lock(&ktrace_mtx);
oldsize = ktr_requestpool;
newsize = ktrace_resize_pool(wantsize);
mtx_unlock(&ktrace_mtx);
- td->td_pflags &= ~TDP_INKTRACE;
+ ktrace_exit(td);
error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
if (error)
return (error);
@@ -215,11 +261,11 @@
struct proc *p = td->td_proc;
int pm;
- td->td_pflags |= TDP_INKTRACE;
+ ktrace_enter(td); /* XXX: In caller instead? */
mtx_lock(&ktrace_mtx);
if (!KTRCHECK(td, type)) {
mtx_unlock(&ktrace_mtx);
- td->td_pflags &= ~TDP_INKTRACE;
+ ktrace_exit(td);
return (NULL);
}
req = STAILQ_FIRST(&ktr_free);
@@ -230,11 +276,6 @@
req->ktr_header.ktr_type |= KTR_DROP;
p->p_traceflag &= ~KTRFAC_DROP;
}
- KASSERT(p->p_tracevp != NULL, ("ktrace: no trace vnode"));
- KASSERT(p->p_tracecred != NULL, ("ktrace: no trace cred"));
- req->ktr_vp = p->p_tracevp;
- VREF(p->p_tracevp);
- req->ktr_cred = crhold(p->p_tracecred);
mtx_unlock(&ktrace_mtx);
microtime(&req->ktr_header.ktr_time);
req->ktr_header.ktr_pid = p->p_pid;
@@ -249,74 +290,89 @@
mtx_unlock(&ktrace_mtx);
if (pm)
printf("Out of ktrace request objects.\n");
- td->td_pflags &= ~TDP_INKTRACE;
+ ktrace_exit(td);
}
return (req);
}
+/*
+ * Some trace generation environments don't permit direct access to VFS,
+ * such as during a context switch where sleeping is not allowed. Under these
+ * circumstances, queue a request to the thread to be written asynchronously
+ * later.
+ */
static void
-ktr_submitrequest(struct ktr_request *req)
+ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
{
mtx_lock(&ktrace_mtx);
- STAILQ_INSERT_TAIL(&ktr_todo, req, ktr_list);
- cv_signal(&ktrace_cv);
+ STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
mtx_unlock(&ktrace_mtx);
- curthread->td_pflags &= ~TDP_INKTRACE;
+ ktrace_exit(td);
}
+/*
+ * Drain any pending ktrace records from the per-thread queue to disk. This
+ * is used both internally before committing other records, and also on
+ * system call return. We drain all the ones we can find at the time when
+ * drain is requested, but don't keep draining after that as those events
+ * may be approximately "after" the current event.
+ */
static void
-ktr_freerequest(struct ktr_request *req)
+ktr_drain(struct thread *td)
{
+ struct ktr_request *queued_req;
+ STAILQ_HEAD(, ktr_request) local_queue;
- crfree(req->ktr_cred);
- if (req->ktr_vp != NULL) {
- mtx_lock(&Giant);
- vrele(req->ktr_vp);
- mtx_unlock(&Giant);
- }
- if (req->ktr_buffer != NULL)
- free(req->ktr_buffer, M_KTRACE);
- mtx_lock(&ktrace_mtx);
- STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
- mtx_unlock(&ktrace_mtx);
-}
+ ktrace_assert(td);
+ sx_assert(&ktrace_sx, SX_XLOCKED);
-static void
-ktr_loop(void *dummy)
-{
- struct ktr_request *req;
- struct thread *td;
- struct ucred *cred;
+ STAILQ_INIT(&local_queue); /* XXXRW: needed? */
- /* Only cache these values once. */
- td = curthread;
- cred = td->td_ucred;
- for (;;) {
+ if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
mtx_lock(&ktrace_mtx);
- while (STAILQ_EMPTY(&ktr_todo))
- cv_wait(&ktrace_cv, &ktrace_mtx);
- req = STAILQ_FIRST(&ktr_todo);
- STAILQ_REMOVE_HEAD(&ktr_todo, ktr_list);
- KASSERT(req != NULL, ("got a NULL request"));
+ STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
mtx_unlock(&ktrace_mtx);
- /*
- * It is not enough just to pass the cached cred
- * to the VOP's in ktr_writerequest(). Some VFS
- * operations use curthread->td_ucred, so we need
- * to modify our thread's credentials as well.
- * Evil.
- */
- td->td_ucred = req->ktr_cred;
- ktr_writerequest(req);
- td->td_ucred = cred;
- ktr_freerequest(req);
+
+ while ((queued_req = STAILQ_FIRST(&local_queue))) {
+ STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
+ ktr_writerequest(td, queued_req);
+ ktr_freerequest(queued_req);
+ }
}
}
/*
- * MPSAFE
+ * Submit a trace record for immediate commit to disk -- to be used only
+ * where entering VFS is OK. First drain any pending records that may have
+ * been cached in the thread.
*/
+static void
+ktr_submitrequest(struct thread *td, struct ktr_request *req)
+{
+
+ ktrace_assert(td);
+
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ ktr_writerequest(td, req);
+ ktr_freerequest(req);
+ sx_xunlock(&ktrace_sx);
+
+ ktrace_exit(td);
+}
+
+static void
+ktr_freerequest(struct ktr_request *req)
+{
+
+ if (req->ktr_buffer != NULL)
+ free(req->ktr_buffer, M_KTRACE);
+ mtx_lock(&ktrace_mtx);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+ mtx_unlock(&ktrace_mtx);
+}
+
void
ktrsyscall(code, narg, args)
int code, narg;
@@ -345,12 +401,9 @@
req->ktr_header.ktr_len = buflen;
req->ktr_buffer = buf;
}
- ktr_submitrequest(req);
+ ktr_submitrequest(curthread, req);
}
-/*
- * MPSAFE
- */
void
ktrsysret(code, error, retval)
int code, error;
@@ -366,7 +419,36 @@
ktp->ktr_code = code;
ktp->ktr_error = error;
ktp->ktr_retval = retval; /* what about val2 ? */
- ktr_submitrequest(req);
+ ktr_submitrequest(curthread, req);
+}
+
+/*
+ * When a process exits, drain per-process asynchronous trace records.
+ */
+void
+ktrprocexit(struct thread *td)
+{
+
+ ktrace_enter(td);
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ sx_xunlock(&ktrace_sx);
+ ktrace_exit(td);
+}
+
+/*
+ * When a thread returns, drain any asynchronous records generated by the
+ * system call.
+ */
+void
+ktruserret(struct thread *td)
+{
+
+ ktrace_enter(td);
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ sx_xunlock(&ktrace_sx);
+ ktrace_exit(td);
}
void
@@ -392,18 +474,9 @@
req->ktr_header.ktr_len = namelen;
req->ktr_buffer = buf;
}
- ktr_submitrequest(req);
+ ktr_submitrequest(curthread, req);
}
-/*
- * Since the uio may not stay valid, we can not hand off this request to
- * the thread and need to process it synchronously. However, we wish to
- * keep the relative order of records in a trace file correct, so we
- * do put this request on the queue (if it isn't empty) and then block.
- * The ktrace thread waks us back up when it is time for this event to
- * be posted and blocks until we have completed writing out the event
- * and woken it back up.
- */
void
ktrgenio(fd, rw, uio, error)
int fd;
@@ -440,7 +513,7 @@
ktg->ktr_rw = rw;
req->ktr_header.ktr_len = datalen;
req->ktr_buffer = buf;
- ktr_submitrequest(req);
+ ktr_submitrequest(curthread, req);
}
void
@@ -461,7 +534,7 @@
kp->action = action;
kp->mask = *mask;
kp->code = code;
- ktr_submitrequest(req);
+ ktr_enqueuerequest(curthread, req);
}
void
@@ -477,17 +550,12 @@
kc = &req->ktr_data.ktr_csw;
kc->out = out;
kc->user = user;
- ktr_submitrequest(req);
+ ktr_enqueuerequest(curthread, req);
}
#endif /* KTRACE */
/* Interface and common routines */
-/*
- * ktrace system call
- *
- * MPSAFE
- */
#ifndef _SYS_SYSPROTO_H_
struct ktrace_args {
char *fname;
@@ -510,7 +578,7 @@
int ops = KTROP(uap->ops);
int descend = uap->ops & KTRFLAG_DESCEND;
int nfound, ret = 0;
- int flags, error = 0;
+ int flags, error = 0, vfslocked;
struct nameidata nd;
struct ucred *cred;
@@ -520,37 +588,40 @@
if (ops != KTROP_CLEARFILE && facs == 0)
return (EINVAL);
- td->td_pflags |= TDP_INKTRACE;
+ ktrace_enter(td);
if (ops != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
+ uap->fname, td);
flags = FREAD | FWRITE | O_NOFOLLOW;
- mtx_lock(&Giant);
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error) {
- mtx_unlock(&Giant);
- td->td_pflags &= ~TDP_INKTRACE;
+ ktrace_exit(td);
return (error);
}
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
VOP_UNLOCK(vp, 0, td);
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
- mtx_unlock(&Giant);
- td->td_pflags &= ~TDP_INKTRACE;
+ VFS_UNLOCK_GIANT(vfslocked);
+ ktrace_exit(td);
return (EACCES);
}
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
+ int vrele_count;
+
+ vrele_count = 0;
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
if (ktrcanset(td, p)) {
@@ -560,20 +631,20 @@
p->p_tracevp = NULL;
p->p_traceflag = 0;
mtx_unlock(&ktrace_mtx);
- PROC_UNLOCK(p);
- mtx_lock(&Giant);
- (void) vn_close(vp, FREAD|FWRITE,
- cred, td);
- mtx_unlock(&Giant);
+ vrele_count++;
crfree(cred);
- } else {
- PROC_UNLOCK(p);
+ } else
error = EPERM;
- }
- } else
- PROC_UNLOCK(p);
+ }
+ PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
+ if (vrele_count > 0) {
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ while (vrele_count-- > 0)
+ vrele(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
goto done;
}
/*
@@ -644,22 +715,17 @@
error = EPERM;
done:
if (vp != NULL) {
- mtx_lock(&Giant);
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) vn_close(vp, FWRITE, td->td_ucred, td);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
}
- td->td_pflags &= ~TDP_INKTRACE;
+ ktrace_exit(td);
return (error);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
-/*
- * utrace system call
- *
- * MPSAFE
- */
/* ARGSUSED */
int
utrace(td, uap)
@@ -689,7 +755,7 @@
}
req->ktr_buffer = cp;
req->ktr_header.ktr_len = uap->len;
- ktr_submitrequest(req);
+ ktr_submitrequest(td, req);
return (0);
#else /* !KTRACE */
return (ENOSYS);
@@ -727,7 +793,7 @@
p->p_tracecred = crhold(td->td_ucred);
}
p->p_traceflag |= facs;
- if (td->td_ucred->cr_uid == 0)
+ if (priv_check(td, PRIV_KTRACE) == 0)
p->p_traceflag |= KTRFAC_ROOT;
} else {
/* KTROP_CLEAR */
@@ -790,31 +856,48 @@
}
static void
-ktr_writerequest(struct ktr_request *req)
+ktr_writerequest(struct thread *td, struct ktr_request *req)
{
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
- struct thread *td;
struct ucred *cred;
struct uio auio;
struct iovec aiov[3];
struct mount *mp;
int datalen, buflen, vrele_count;
- int error;
+ int error, vfslocked;
+
+ /*
+ * We hold the vnode and credential for use in I/O in case ktrace is
+ * disabled on the process as we write out the request.
+ *
+ * XXXRW: This is not ideal: we could end up performing a write after
+ * the vnode has been closed.
+ */
+ mtx_lock(&ktrace_mtx);
+ vp = td->td_proc->p_tracevp;
+ if (vp != NULL)
+ VREF(vp);
+ cred = td->td_proc->p_tracecred;
+ if (cred != NULL)
+ crhold(cred);
+ mtx_unlock(&ktrace_mtx);
- vp = req->ktr_vp;
/*
* If vp is NULL, the vp has been cleared out from under this
- * request, so just drop it.
+ * request, so just drop it. Make sure the credential and vnode are
+ * in sync: we should have both or neither.
*/
- if (vp == NULL)
+ if (vp == NULL) {
+ KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
return;
+ }
+ KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
+
kth = &req->ktr_header;
datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
buflen = kth->ktr_len;
- cred = req->ktr_cred;
- td = curthread;
auio.uio_iov = &aiov[0];
auio.uio_offset = 0;
auio.uio_segflg = UIO_SYSSPACE;
@@ -838,7 +921,8 @@
auio.uio_resid += buflen;
auio.uio_iovcnt++;
}
- mtx_lock(&Giant);
+
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
(void)VOP_LEASE(vp, td, cred, LEASE_WRITE);
@@ -849,7 +933,8 @@
error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
VOP_UNLOCK(vp, 0, td);
vn_finished_write(mp);
- mtx_unlock(&Giant);
+ vrele(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
if (!error)
return;
/*
@@ -869,7 +954,7 @@
*/
cred = NULL;
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
mtx_lock(&ktrace_mtx);
@@ -887,21 +972,16 @@
}
}
sx_sunlock(&allproc_lock);
+
/*
- * Second, clear this vnode from any pending requests.
+ * We can't clear any pending requests in threads that have cached
+ * them but not yet committed them, as those are per-thread. The
+ * thread will have to clear it itself on system call return.
*/
- mtx_lock(&ktrace_mtx);
- STAILQ_FOREACH(req, &ktr_todo, ktr_list) {
- if (req->ktr_vp == vp) {
- req->ktr_vp = NULL;
- vrele_count++;
- }
- }
- mtx_unlock(&ktrace_mtx);
- mtx_lock(&Giant);
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
- mtx_unlock(&Giant);
+ VFS_UNLOCK_GIANT(vfslocked);
}
/*
@@ -919,7 +999,7 @@
PROC_LOCK_ASSERT(targetp, MA_OWNED);
if (targetp->p_traceflag & KTRFAC_ROOT &&
- suser_cred(td->td_ucred, SUSER_ALLOWJAIL))
+ priv_check(td, PRIV_KTRACE))
return (0);
if (p_candebug(td, targetp) != 0)
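The kern_ktrace.c change above drops the dedicated ktrace worker thread: records that cannot be written from their generating context are queued on the new per-process p_ktr list under ktrace_mtx, and ktr_drain() later moves the whole batch onto a local list with STAILQ_CONCAT before doing the slow vnode writes with the mutex dropped. Below is a userland sketch of that batch-drain pattern, not the kernel code; it assumes a BSD <sys/queue.h> that provides STAILQ_CONCAT and uses a pthread mutex in place of ktrace_mtx:

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct rec {
    int seq;                        /* stands in for a ktr_request */
    STAILQ_ENTRY(rec) link;
};
STAILQ_HEAD(rec_head, rec);

static struct rec_head pending = STAILQ_HEAD_INITIALIZER(pending);
static pthread_mutex_t pending_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Producer side: cheap enqueue under the mutex (compare ktr_enqueuerequest). */
static void
enqueue(int seq)
{
    struct rec *r;

    if ((r = malloc(sizeof(*r))) == NULL)
        return;
    r->seq = seq;
    pthread_mutex_lock(&pending_mtx);
    STAILQ_INSERT_TAIL(&pending, r, link);
    pthread_mutex_unlock(&pending_mtx);
}

/* Consumer side: grab the whole batch, then do the slow work unlocked. */
static void
drain(void)
{
    struct rec_head local = STAILQ_HEAD_INITIALIZER(local);
    struct rec *r;

    pthread_mutex_lock(&pending_mtx);
    STAILQ_CONCAT(&local, &pending);        /* empties "pending" */
    pthread_mutex_unlock(&pending_mtx);

    while ((r = STAILQ_FIRST(&local)) != NULL) {
        STAILQ_REMOVE_HEAD(&local, link);
        printf("writing record %d\n", r->seq);  /* stands in for ktr_writerequest() */
        free(r);
    }
}

int
main(void)
{
    for (int i = 0; i < 3; i++)
        enqueue(i);
    drain();
    return (0);
}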
Index: vfs_mount.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_mount.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -L sys/kern/vfs_mount.c -L sys/kern/vfs_mount.c -u -r1.4 -r1.5
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -35,18 +35,19 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_mount.c,v 1.196.2.8 2006/03/13 03:06:27 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_mount.c,v 1.265.2.1.2.1 2008/01/20 02:38:42 rodrigc Exp $");
#include <sys/param.h>
#include <sys/conf.h>
+#include <sys/clock.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/reboot.h>
@@ -57,11 +58,15 @@
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
+#include <vm/uma.h>
#include <geom/geom.h>
#include <machine/stdarg.h>
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#include "opt_rootdevname.h"
#include "opt_ddb.h"
#include "opt_mac.h"
@@ -75,14 +80,12 @@
static int vfs_domount(struct thread *td, const char *fstype,
char *fspath, int fsflags, void *fsdata);
-static int vfs_mount_alloc(struct vnode *dvp, struct vfsconf *vfsp,
- const char *fspath, struct thread *td, struct mount **mpp);
static int vfs_mountroot_ask(void);
static int vfs_mountroot_try(const char *mountfrom);
static int vfs_donmount(struct thread *td, int fsflags,
struct uio *fsoptions);
static void free_mntarg(struct mntarg *ma);
-static void vfs_mount_destroy(struct mount *, struct thread *);
+static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
static int usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
@@ -90,6 +93,7 @@
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
+static uma_zone_t mount_zone;
/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
@@ -127,13 +131,14 @@
* Global opts, taken by all filesystems
*/
static const char *global_opts[] = {
+ "errmsg",
"fstype",
"fspath",
- "rdonly",
"ro",
"rw",
- "suid",
- "exec",
+ "nosuid",
+ "noexec",
+ "update",
NULL
};
@@ -176,7 +181,7 @@
}
/* Release all resources related to the mount options. */
-static void
+void
vfs_freeopts(struct vfsoptlist *opts)
{
struct vfsopt *opt;
@@ -188,6 +193,17 @@
free(opts, M_MOUNT);
}
+void
+vfs_deleteopt(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt, *temp;
+
+ TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
+ if (strcmp(opt->name, name) == 0)
+ vfs_freeopt(opts, opt);
+ }
+}
+
/*
* Check if options are equal (with or without the "no" prefix).
*/
@@ -351,8 +367,7 @@
}
/*
- * ---------------------------------------------------------------------
- * Mount a filesystem
+ * Mount a filesystem.
*/
int
nmount(td, uap)
@@ -369,9 +384,15 @@
int error;
u_int iovcnt;
- /* Kick out MNT_ROOTFS early as it is legal internally */
- if (uap->flags & MNT_ROOTFS)
- return (EINVAL);
+ AUDIT_ARG(fflags, uap->flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of nmount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
+ */
+ uap->flags &= ~MNT_ROOTFS;
iovcnt = uap->iovcnt;
/*
@@ -393,6 +414,7 @@
iov++;
}
error = vfs_donmount(td, uap->flags, auio);
+
free(auio, M_IOV);
return (error);
}
@@ -420,27 +442,48 @@
MNT_IUNLOCK(mp);
}
+static int
+mount_init(void *mem, int size, int flags)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+ return (0);
+}
+
+static void
+mount_fini(void *mem, int size)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ lockdestroy(&mp->mnt_lock);
+ mtx_destroy(&mp->mnt_mtx);
+}
+
/*
* Allocate and initialize the mount point struct.
*/
-static int
+struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
- const char *fspath, struct thread *td, struct mount **mpp)
+ const char *fspath, struct thread *td)
{
struct mount *mp;
- mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+ mp = uma_zalloc(mount_zone, M_WAITOK);
+ bzero(&mp->mnt_startzero,
+ __rangeof(struct mount, mnt_startzero, mnt_endzero));
TAILQ_INIT(&mp->mnt_nvnodelist);
mp->mnt_nvnodelistsize = 0;
- mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
- lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
- (void) vfs_busy(mp, LK_NOWAIT, 0, td);
mp->mnt_ref = 0;
+ (void) vfs_busy(mp, LK_NOWAIT, 0, td);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
vfsp->vfc_refcount++; /* XXX Unlocked */
mp->mnt_stat.f_type = vfsp->vfc_typenum;
- mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ mp->mnt_gen++;
strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
mp->mnt_vnodecovered = vp;
mp->mnt_cred = crdup(td->td_ucred);
@@ -452,19 +495,17 @@
mac_create_mount(td->td_ucred, mp);
#endif
arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
- *mpp = mp;
- return (0);
+ return (mp);
}
/*
* Destroy the mount struct previously allocated by vfs_mount_alloc().
*/
-static void
-vfs_mount_destroy(struct mount *mp, struct thread *td)
+void
+vfs_mount_destroy(struct mount *mp)
{
int i;
- vfs_unbusy(mp, td);
MNT_ILOCK(mp);
for (i = 0; mp->mnt_ref && i < 3; i++)
msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
@@ -508,9 +549,13 @@
}
MNT_IUNLOCK(mp);
mp->mnt_vfc->vfc_refcount--;
- if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
+ if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
+ struct vnode *vp;
+
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
+ vprint("", vp);
panic("unmount: dangling vnode");
- lockdestroy(&mp->mnt_lock);
+ }
MNT_ILOCK(mp);
if (mp->mnt_kern_flag & MNTK_MWAIT)
wakeup(mp);
@@ -524,27 +569,37 @@
mp->mnt_nvnodelistsize = -1000;
mp->mnt_secondary_writes = -1000;
MNT_IUNLOCK(mp);
- mtx_destroy(&mp->mnt_mtx);
#ifdef MAC
mac_destroy_mount(mp);
#endif
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
crfree(mp->mnt_cred);
- free(mp, M_MOUNT);
+ uma_zfree(mount_zone, mp);
}
static int
vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
{
struct vfsoptlist *optlist;
- char *fstype, *fspath;
- int error, fstypelen, fspathlen;
+ struct vfsopt *opt, *noro_opt;
+ char *fstype, *fspath, *errmsg;
+ int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+ int has_rw, has_noro;
+
+ errmsg = NULL;
+ errmsg_len = 0;
+ errmsg_pos = -1;
+ has_rw = 0;
+ has_noro = 0;
error = vfs_buildopts(fsoptions, &optlist);
if (error)
return (error);
+ if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
+ errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
+
/*
* We need these two options before the others,
* and they are mandatory for any filesystem.
@@ -554,12 +609,16 @@
error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
if (error || fstype[fstypelen - 1] != '\0') {
error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fstype", errmsg_len);
goto bail;
}
fspathlen = 0;
error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
if (error || fspath[fspathlen - 1] != '\0') {
error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fspath", errmsg_len);
goto bail;
}
@@ -568,63 +627,92 @@
* before we call vfs_domount(), since vfs_domount() has special
* logic based on MNT_UPDATE. This is very important
* when we want to update the root filesystem.
- */
- if (vfs_getopt(optlist, "update", NULL, NULL) == 0)
- fsflags |= MNT_UPDATE;
-
- if (vfs_getopt(optlist, "async", NULL, NULL) == 0)
- fsflags |= MNT_ASYNC;
-
- if (vfs_getopt(optlist, "force", NULL, NULL) == 0)
- fsflags |= MNT_FORCE;
-
- if (vfs_getopt(optlist, "multilabel", NULL, NULL) == 0)
- fsflags |= MNT_MULTILABEL;
-
- if (vfs_getopt(optlist, "noasync", NULL, NULL) == 0)
- fsflags &= ~MNT_ASYNC;
-
- if (vfs_getopt(optlist, "noatime", NULL, NULL) == 0)
- fsflags |= MNT_NOATIME;
-
- if (vfs_getopt(optlist, "noclusterr", NULL, NULL) == 0)
- fsflags |= MNT_NOCLUSTERR;
-
- if (vfs_getopt(optlist, "noclusterw", NULL, NULL) == 0)
- fsflags |= MNT_NOCLUSTERW;
-
- if (vfs_getopt(optlist, "noexec", NULL, NULL) == 0)
- fsflags |= MNT_NOEXEC;
-
- if (vfs_getopt(optlist, "nosuid", NULL, NULL) == 0)
- fsflags |= MNT_NOSUID;
-
- if (vfs_getopt(optlist, "nosymfollow", NULL, NULL) == 0)
- fsflags |= MNT_NOSYMFOLLOW;
-
- if (vfs_getopt(optlist, "noro", NULL, NULL) == 0)
- fsflags &= ~MNT_RDONLY;
-
- if (vfs_getopt(optlist, "ro", NULL, NULL) == 0)
- fsflags |= MNT_RDONLY;
-
- if (vfs_getopt(optlist, "rdonly", NULL, NULL) == 0)
- fsflags |= MNT_RDONLY;
-
- if (vfs_getopt(optlist, "rw", NULL, NULL) == 0)
- fsflags &= ~MNT_RDONLY;
-
- if (vfs_getopt(optlist, "snapshot", NULL, NULL) == 0)
- fsflags |= MNT_SNAPSHOT;
-
- if (vfs_getopt(optlist, "suiddir", NULL, NULL) == 0)
- fsflags |= MNT_SUIDDIR;
-
- if (vfs_getopt(optlist, "sync", NULL, NULL) == 0)
- fsflags |= MNT_SYNCHRONOUS;
-
- if (vfs_getopt(optlist, "union", NULL, NULL) == 0)
- fsflags |= MNT_UNION;
+ */
+ TAILQ_FOREACH(opt, optlist, link) {
+ if (strcmp(opt->name, "update") == 0)
+ fsflags |= MNT_UPDATE;
+ else if (strcmp(opt->name, "async") == 0)
+ fsflags |= MNT_ASYNC;
+ else if (strcmp(opt->name, "force") == 0)
+ fsflags |= MNT_FORCE;
+ else if (strcmp(opt->name, "multilabel") == 0)
+ fsflags |= MNT_MULTILABEL;
+ else if (strcmp(opt->name, "noasync") == 0)
+ fsflags &= ~MNT_ASYNC;
+ else if (strcmp(opt->name, "noatime") == 0)
+ fsflags |= MNT_NOATIME;
+ else if (strcmp(opt->name, "atime") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoatime", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterr") == 0)
+ fsflags |= MNT_NOCLUSTERR;
+ else if (strcmp(opt->name, "clusterr") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterr", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterw") == 0)
+ fsflags |= MNT_NOCLUSTERW;
+ else if (strcmp(opt->name, "clusterw") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterw", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noexec") == 0)
+ fsflags |= MNT_NOEXEC;
+ else if (strcmp(opt->name, "exec") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoexec", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosuid") == 0)
+ fsflags |= MNT_NOSUID;
+ else if (strcmp(opt->name, "suid") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosuid", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosymfollow") == 0)
+ fsflags |= MNT_NOSYMFOLLOW;
+ else if (strcmp(opt->name, "symfollow") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosymfollow", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noro") == 0) {
+ fsflags &= ~MNT_RDONLY;
+ has_noro = 1;
+ }
+ else if (strcmp(opt->name, "rw") == 0) {
+ fsflags &= ~MNT_RDONLY;
+ has_rw = 1;
+ }
+ else if (strcmp(opt->name, "ro") == 0)
+ fsflags |= MNT_RDONLY;
+ else if (strcmp(opt->name, "rdonly") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("ro", M_MOUNT);
+ fsflags |= MNT_RDONLY;
+ }
+ else if (strcmp(opt->name, "snapshot") == 0)
+ fsflags |= MNT_SNAPSHOT;
+ else if (strcmp(opt->name, "suiddir") == 0)
+ fsflags |= MNT_SUIDDIR;
+ else if (strcmp(opt->name, "sync") == 0)
+ fsflags |= MNT_SYNCHRONOUS;
+ else if (strcmp(opt->name, "union") == 0)
+ fsflags |= MNT_UNION;
+ }
+
+ /*
+ * If "rw" was specified as a mount option, and we
+ * are trying to update a mount-point from "ro" to "rw",
+ * we need a mount option "noro", since in vfs_mergeopts(),
+ * "noro" will cancel "ro", but "rw" will not do anything.
+ */
+ if (has_rw && !has_noro) {
+ noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ noro_opt->name = strdup("noro", M_MOUNT);
+ noro_opt->value = NULL;
+ noro_opt->len = 0;
+ TAILQ_INSERT_TAIL(optlist, noro_opt, link);
+ }
/*
* Be ultra-paranoid about making sure the type and fspath
@@ -640,13 +728,26 @@
error = vfs_domount(td, fstype, fspath, fsflags, optlist);
mtx_unlock(&Giant);
bail:
- if (error)
+ /* copyout the errmsg */
+ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
+ && errmsg_len > 0 && errmsg != NULL) {
+ if (fsoptions->uio_segflg == UIO_SYSSPACE) {
+ bcopy(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ } else {
+ copyout(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ }
+ }
+
+ if (error != 0)
vfs_freeopts(optlist);
return (error);
}
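Since nmount(2) passes options as name/value iovec pairs, the value slot for the option at index i is iov[2 * i + 1], which is exactly where the new errmsg copyout above writes the failure text. A hedged userland sketch of a caller supplying that buffer; the ufs/fspath/from values are placeholders, and __DECONST is the FreeBSD helper pulled in via <sys/param.h>:

#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Fill one name/value pair; string values include their terminating NUL. */
static void
pair(struct iovec *iov, int i, const char *name, void *val, size_t len)
{
    iov[i].iov_base = __DECONST(void *, name);
    iov[i].iov_len = strlen(name) + 1;
    iov[i + 1].iov_base = val;
    iov[i + 1].iov_len = len;
}

int
main(void)
{
    struct iovec iov[8];
    char fstype[] = "ufs";          /* placeholders, not a recommendation */
    char fspath[] = "/mnt";
    char from[] = "/dev/da0s1a";
    char errmsg[255] = "";

    pair(iov, 0, "fstype", fstype, sizeof(fstype));
    pair(iov, 2, "fspath", fspath, sizeof(fspath));
    pair(iov, 4, "from", from, sizeof(from));
    pair(iov, 6, "errmsg", errmsg, sizeof(errmsg));     /* kernel writes here */

    if (nmount(iov, 8, 0) == -1)
        fprintf(stderr, "nmount: %s%s%s\n", strerror(errno),
            errmsg[0] != '\0' ? ": " : "", errmsg);
    return (0);
}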
/*
- * ---------------------------------------------------------------------
* Old mount API.
*/
#ifndef _SYS_SYSPROTO_H_
@@ -673,26 +774,35 @@
struct mntarg *ma = NULL;
int error;
- /* Kick out MNT_ROOTFS early as it is legal internally */
- uap->flags &= ~MNT_ROOTFS;
+ AUDIT_ARG(fflags, uap->flags);
- if (uap->data == NULL)
- return (EINVAL);
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of mount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
+ */
+ uap->flags &= ~MNT_ROOTFS;
fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
- if (!error) {
- mtx_lock(&Giant); /* XXX ? */
- vfsp = vfs_byname_kld(fstype, td, &error);
- mtx_unlock(&Giant);
+ if (error) {
+ free(fstype, M_TEMP);
+ return (error);
}
+
+ AUDIT_ARG(text, fstype);
+ mtx_lock(&Giant);
+ vfsp = vfs_byname_kld(fstype, td, &error);
free(fstype, M_TEMP);
- if (error)
- return (error);
- if (vfsp == NULL)
+ if (vfsp == NULL) {
+ mtx_unlock(&Giant);
return (ENOENT);
- if (vfsp->vfc_vfsops->vfs_cmount == NULL)
+ }
+ if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
+ mtx_unlock(&Giant);
return (EOPNOTSUPP);
+ }
ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
@@ -701,6 +811,7 @@
ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
+ mtx_unlock(&Giant);
return (error);
}
@@ -710,7 +821,7 @@
*/
static int
vfs_domount(
- struct thread *td, /* Flags common to all filesystems. */
+ struct thread *td, /* Calling thread. */
const char *fstype, /* Filesystem type. */
char *fspath, /* Mount path. */
int fsflags, /* Flags common to all filesystems. */
@@ -720,7 +831,8 @@
struct vnode *vp;
struct mount *mp;
struct vfsconf *vfsp;
- int error, flag = 0, kern_flag = 0;
+ struct export_args export;
+ int error, flag = 0;
struct vattr va;
struct nameidata nd;
@@ -733,26 +845,31 @@
if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
return (ENAMETOOLONG);
- if (jailed(td->td_ucred))
- return (EPERM);
- if (usermount == 0) {
- if ((error = suser(td)) != 0)
+ if (jailed(td->td_ucred) || usermount == 0) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
return (error);
}
/*
* Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
*/
- if (fsflags & (MNT_EXPORTED | MNT_SUIDDIR)) {
- if ((error = suser(td)) != 0)
+ if (fsflags & MNT_EXPORTED) {
+ error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
+ if (error)
+ return (error);
+ }
+ if (fsflags & MNT_SUIDDIR) {
+ error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
+ if (error)
return (error);
}
/*
- * Silently enforce MNT_NOSUID and MNT_USER for
- * unprivileged users.
+ * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
*/
- if (suser(td) != 0)
- fsflags |= MNT_NOSUID | MNT_USER;
+ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
+ if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
+ fsflags |= MNT_NOSUID | MNT_USER;
+ }
/* Load KLDs before we lock the covered vnode to avoid reversals. */
vfsp = NULL;
@@ -764,11 +881,14 @@
vfsp = vfs_byname_kld(fstype, td, &error);
if (vfsp == NULL)
return (ENODEV);
+ if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
+ return (EPERM);
}
/*
* Get vnode to be covered
*/
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
+ fspath, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -779,17 +899,19 @@
return (EINVAL);
}
mp = vp->v_mount;
+ MNT_ILOCK(mp);
flag = mp->mnt_flag;
- kern_flag = mp->mnt_kern_flag;
/*
* We only allow the filesystem to be reloaded if it
* is currently mounted read-only.
*/
if ((fsflags & MNT_RELOAD) &&
((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ MNT_IUNLOCK(mp);
vput(vp);
return (EOPNOTSUPP); /* Needs translation */
}
+ MNT_IUNLOCK(mp);
/*
* Only privileged root, or (if MNT_USER is set) the user that
* did the original mount is permitted to update it.
@@ -813,8 +935,10 @@
}
vp->v_iflag |= VI_MOUNT;
VI_UNLOCK(vp);
+ MNT_ILOCK(mp);
mp->mnt_flag |= fsflags &
(MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
+ MNT_IUNLOCK(mp);
VOP_UNLOCK(vp, 0, td);
mp->mnt_optnew = fsdata;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -829,7 +953,9 @@
return (error);
}
if (va.va_uid != td->td_ucred->cr_uid) {
- if ((error = suser(td)) != 0) {
+ error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
+ 0);
+ if (error) {
vput(vp);
return (error);
}
@@ -856,11 +982,7 @@
/*
* Allocate and initialize the filesystem.
*/
- error = vfs_mount_alloc(vp, vfsp, fspath, td, &mp);
- if (error) {
- vput(vp);
- return (error);
- }
+ mp = vfs_mount_alloc(vp, vfsp, fspath, td);
VOP_UNLOCK(vp, 0, td);
/* XXXMAC: pass to vfs_mount_alloc? */
@@ -870,16 +992,30 @@
/*
* Set the mount level flags.
*/
- if (fsflags & MNT_RDONLY)
- mp->mnt_flag |= MNT_RDONLY;
- mp->mnt_flag &=~ MNT_UPDATEMASK;
- mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
+ MNT_ILOCK(mp);
+ mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) |
+ (fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS |
+ MNT_RDONLY));
+ if ((mp->mnt_flag & MNT_ASYNC) == 0)
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
/*
* Mount the filesystem.
* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
* get. No freeing of cn_pnbuf.
*/
error = VFS_MOUNT(mp, td);
+
+ /*
+ * Process the export option only if we are
+ * updating mount options.
+ */
+ if (!error && (fsflags & MNT_UPDATE)) {
+ if (vfs_copyopt(mp->mnt_optnew, "export", &export,
+ sizeof(export)) == 0)
+ error = vfs_export(mp, &export);
+ }
+
if (!error) {
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
@@ -892,12 +1028,18 @@
*/
mp->mnt_optnew = NULL;
if (mp->mnt_flag & MNT_UPDATE) {
- mp->mnt_flag &=
- ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
- if (error) {
- mp->mnt_flag = flag;
- mp->mnt_kern_flag = kern_flag;
- }
+ MNT_ILOCK(mp);
+ if (error)
+ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) |
+ (flag & ~MNT_QUOTA);
+ else
+ mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD |
+ MNT_FORCE | MNT_SNAPSHOT);
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
if (mp->mnt_syncer == NULL)
error = vfs_allocate_syncvnode(mp);
@@ -913,6 +1055,12 @@
vrele(vp);
return (error);
}
+ MNT_ILOCK(mp);
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
/*
* Put the new filesystem on the mount list after root.
@@ -943,18 +1091,18 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
- vfs_mount_destroy(mp, td);
+ vfs_unbusy(mp, td);
+ vfs_mount_destroy(mp);
vput(vp);
}
return (error);
}
/*
- * ---------------------------------------------------------------------
* Unmount a filesystem.
*
- * Note: unmount takes a path to the vnode mounted on as argument,
- * not special file (as before).
+ * Note: unmount takes a path to the vnode mounted on as argument, not
+ * special file (as before).
*/
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
@@ -975,10 +1123,9 @@
char *pathbuf;
int error, id0, id1;
- if (jailed(td->td_ucred))
- return (EPERM);
- if (usermount == 0) {
- if ((error = suser(td)) != 0)
+ if (jailed(td->td_ucred) || usermount == 0) {
+ error = priv_check(td, PRIV_VFS_UNMOUNT);
+ if (error)
return (error);
}
@@ -988,9 +1135,12 @@
free(pathbuf, M_TEMP);
return (error);
}
+ AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1);
+ mtx_lock(&Giant);
if (uap->flags & MNT_BYFSID) {
/* Decode the filesystem ID. */
if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
+ mtx_unlock(&Giant);
free(pathbuf, M_TEMP);
return (EINVAL);
}
@@ -1018,23 +1168,17 @@
* now, so in the !MNT_BYFSID case return the more likely
* EINVAL for compatibility.
*/
+ mtx_unlock(&Giant);
return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
}
/*
- * Only privileged root, or (if MNT_USER is set) the user that did the
- * original mount is permitted to unmount this filesystem.
- */
- error = vfs_suser(mp, td);
- if (error)
- return (error);
-
- /*
* Don't allow unmounting the root filesystem.
*/
- if (mp->mnt_flag & MNT_ROOTFS)
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ mtx_unlock(&Giant);
return (EINVAL);
- mtx_lock(&Giant);
+ }
error = dounmount(mp, uap->flags, td);
mtx_unlock(&Giant);
return (error);
@@ -1052,11 +1196,37 @@
struct vnode *coveredvp, *fsrootvp;
int error;
int async_flag;
+ int mnt_gen_r;
mtx_assert(&Giant, MA_OWNED);
- if ((coveredvp = mp->mnt_vnodecovered) != NULL)
- vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY, td);
+ if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
+ mnt_gen_r = mp->mnt_gen;
+ VI_LOCK(coveredvp);
+ vholdl(coveredvp);
+ vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, td);
+ vdrop(coveredvp);
+ /*
+ * Check for mp being unmounted while waiting for the
+ * covered vnode lock.
+ */
+ if (coveredvp->v_mountedhere != mp ||
+ coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
+ VOP_UNLOCK(coveredvp, 0, td);
+ return (EBUSY);
+ }
+ }
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that did the
+ * original mount is permitted to unmount this filesystem.
+ */
+ error = vfs_suser(mp, td);
+ if (error) {
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0, td);
+ return (error);
+ }
+
MNT_ILOCK(mp);
if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
MNT_IUNLOCK(mp);
@@ -1064,7 +1234,7 @@
VOP_UNLOCK(coveredvp, 0, td);
return (EBUSY);
}
- mp->mnt_kern_flag |= MNTK_UNMOUNT;
+ mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
/* Allow filesystems to detect that a forced unmount is in progress. */
if (flags & MNT_FORCE)
mp->mnt_kern_flag |= MNTK_UNMOUNTF;
@@ -1072,7 +1242,8 @@
((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td);
if (error) {
MNT_ILOCK(mp);
- mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
+ MNTK_UNMOUNTF);
if (mp->mnt_kern_flag & MNTK_MWAIT)
wakeup(mp);
MNT_IUNLOCK(mp);
@@ -1086,8 +1257,11 @@
vfs_setpublicfs(NULL, NULL, NULL);
vfs_msync(mp, MNT_WAIT);
+ MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
cache_purgevfs(mp); /* remove cache entries for this file sys */
if (mp->mnt_syncer != NULL)
vrele(mp->mnt_syncer);
@@ -1124,11 +1298,17 @@
}
vput(fsrootvp);
}
- if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
- (void) vfs_allocate_syncvnode(mp);
MNT_ILOCK(mp);
+ mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) {
+ MNT_IUNLOCK(mp);
+ (void) vfs_allocate_syncvnode(mp);
+ MNT_ILOCK(mp);
+ }
mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
mp->mnt_flag |= async_flag;
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
if (mp->mnt_kern_flag & MNTK_MWAIT)
wakeup(mp);
@@ -1145,7 +1325,8 @@
vput(coveredvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
- vfs_mount_destroy(mp, td);
+ lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
+ vfs_mount_destroy(mp);
return (0);
}
@@ -1156,13 +1337,18 @@
*/
struct root_hold_token {
- const char *who;
+ const char *who;
LIST_ENTRY(root_hold_token) list;
};
static LIST_HEAD(, root_hold_token) root_holds =
LIST_HEAD_INITIALIZER(&root_holds);
+static int root_mount_complete;
+
+/*
+ * Hold root mount.
+ */
struct root_hold_token *
root_mount_hold(const char *identifier)
{
@@ -1176,6 +1362,9 @@
return (h);
}
+/*
+ * Release root mount.
+ */
void
root_mount_rel(struct root_hold_token *h)
{
@@ -1187,8 +1376,11 @@
free(h, M_DEVBUF);
}
+/*
+ * Wait for all subsystems to release root mount.
+ */
static void
-root_mount_wait(void)
+root_mount_prepare(void)
{
struct root_hold_token *h;
@@ -1210,6 +1402,55 @@
}
}
+/*
+ * Root was mounted, share the good news.
+ */
+static void
+root_mount_done(void)
+{
+
+ /*
+ * Use a mutex to prevent the wakeup being missed and waiting for
+ * an extra 1 second sleep.
+ */
+ mtx_lock(&mountlist_mtx);
+ root_mount_complete = 1;
+ wakeup(&root_mount_complete);
+ mtx_unlock(&mountlist_mtx);
+}
+
+/*
+ * Return true if root is already mounted.
+ */
+int
+root_mounted(void)
+{
+
+ /* No mutex is acquired here because int stores are atomic. */
+ return (root_mount_complete);
+}
+
+/*
+ * Wait until root is mounted.
+ */
+void
+root_mount_wait(void)
+{
+
+ /*
+ * Panic on an obvious deadlock - the function can't be called from
+ * a thread which is doing the whole SYSINIT stuff.
+ */
+ KASSERT(curthread->td_proc->p_pid != 0,
+ ("root_mount_wait: cannot be called from the swapper thread"));
+ mtx_lock(&mountlist_mtx);
+ while (!root_mount_complete) {
+ msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
+ hz);
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+
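root_mount_done() and root_mount_wait() above avoid a missed wakeup by setting the flag and issuing the wakeup while holding the same mutex the waiter holds across its check-then-sleep. A userland sketch of that idea only, with pthreads standing in for mtx_lock()/msleep()/wakeup():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int complete;

static void
done(void)                          /* compare root_mount_done() */
{
    pthread_mutex_lock(&lock);
    complete = 1;
    pthread_cond_broadcast(&cv);    /* analogous to wakeup() */
    pthread_mutex_unlock(&lock);
}

static void
wait_for_it(void)                   /* compare root_mount_wait() */
{
    pthread_mutex_lock(&lock);
    while (!complete)
        pthread_cond_wait(&cv, &lock);  /* analogous to msleep() */
    pthread_mutex_unlock(&lock);
}

int
main(void)
{
    done();
    wait_for_it();
    printf("root mounted\n");
    return (0);
}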
static void
set_rootvnode(struct thread *td)
{
@@ -1219,7 +1460,7 @@
panic("Cannot find root vnode");
p = td->td_proc;
- FILEDESC_LOCK(p->p_fd);
+ FILEDESC_SLOCK(p->p_fd);
if (p->p_fd->fd_cdir != NULL)
vrele(p->p_fd->fd_cdir);
@@ -1231,7 +1472,7 @@
p->p_fd->fd_rdir = rootvnode;
VREF(rootvnode);
- FILEDESC_UNLOCK(p->p_fd);
+ FILEDESC_SUNLOCK(p->p_fd);
VOP_UNLOCK(rootvnode, 0, td);
}
@@ -1245,25 +1486,27 @@
devfs_first(void)
{
struct thread *td = curthread;
+ struct vfsoptlist *opts;
struct vfsconf *vfsp;
struct mount *mp = NULL;
int error;
vfsp = vfs_byname("devfs");
KASSERT(vfsp != NULL, ("Could not find devfs by name"));
- if (vfsp == NULL)
+ if (vfsp == NULL)
return;
- error = vfs_mount_alloc(NULLVP, vfsp, "/dev", td, &mp);
- KASSERT(error == 0, ("vfs_mount_alloc failed %d", error));
- if (error)
- return;
+ mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td);
- error = VFS_MOUNT(mp, curthread);
+ error = VFS_MOUNT(mp, td);
KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
if (error)
return;
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ mp->mnt_opt = opts;
+
mtx_lock(&mountlist_mtx);
TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
mtx_unlock(&mountlist_mtx);
@@ -1297,8 +1540,8 @@
VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
VI_LOCK(dvp);
dvp->v_iflag &= ~VI_MOUNT;
- dvp->v_mountedhere = NULL;
VI_UNLOCK(dvp);
+ dvp->v_mountedhere = NULL;
/* Set up the real rootvnode, and purge the cache */
TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
@@ -1335,6 +1578,26 @@
}
/*
+ * Report errors during filesystem mounting.
+ */
+void
+vfs_mount_error(struct mount *mp, const char *fmt, ...)
+{
+ struct vfsoptlist *moptlist = mp->mnt_optnew;
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
+
+/*
* Find and mount the root filesystem
*/
void
@@ -1343,8 +1606,11 @@
char *cp;
int error, i, asked = 0;
- root_mount_wait();
+ root_mount_prepare();
+ mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount),
+ NULL, NULL, mount_init, mount_fini,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
devfs_first();
/*
@@ -1352,7 +1618,7 @@
*/
if (boothowto & RB_ASKNAME) {
if (!vfs_mountroot_ask())
- return;
+ goto mounted;
asked = 1;
}
@@ -1362,7 +1628,7 @@
*/
if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
if (!vfs_mountroot_try(ctrootdevname))
- return;
+ goto mounted;
ctrootdevname = NULL;
}
@@ -1374,7 +1640,7 @@
if (boothowto & RB_CDROM) {
for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
- return;
+ goto mounted;
}
}
@@ -1388,32 +1654,35 @@
error = vfs_mountroot_try(cp);
freeenv(cp);
if (!error)
- return;
+ goto mounted;
}
/*
* Try values that may have been computed by code during boot
*/
if (!vfs_mountroot_try(rootdevnames[0]))
- return;
+ goto mounted;
if (!vfs_mountroot_try(rootdevnames[1]))
- return;
+ goto mounted;
/*
* If we (still) have a compiled-in default, try it.
*/
if (ctrootdevname != NULL)
if (!vfs_mountroot_try(ctrootdevname))
- return;
+ goto mounted;
/*
* Everything so far has failed, prompt on the console if we haven't
* already tried that.
*/
if (!asked)
if (!vfs_mountroot_ask())
- return;
+ goto mounted;
panic("Root mount failed, startup aborted.");
+
+mounted:
+ root_mount_done();
}
/*
@@ -1422,7 +1691,7 @@
static int
vfs_mountroot_try(const char *mountfrom)
{
- struct mount *mp;
+ struct mount *mp;
char *vfsname, *path;
time_t timebase;
int error;
@@ -1499,7 +1768,7 @@
for(;;) {
printf("\nManual root filesystem specification:\n");
printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n");
-#if defined(__i386__) || defined(__ia64__)
+#if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
printf(" eg. ufs:da0s1a\n");
#else
printf(" eg. ufs:/dev/da0a\n");
@@ -1532,27 +1801,47 @@
vfs_filteropt(struct vfsoptlist *opts, const char **legal)
{
struct vfsopt *opt;
- const char **t, *p;
-
+ char errmsg[255];
+ const char **t, *p, *q;
+ int ret = 0;
TAILQ_FOREACH(opt, opts, link) {
p = opt->name;
+ q = NULL;
if (p[0] == 'n' && p[1] == 'o')
- p += 2;
- for(t = global_opts; *t != NULL; t++)
- if (!strcmp(*t, p))
+ q = p + 2;
+ for(t = global_opts; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
if (*t != NULL)
continue;
- for(t = legal; *t != NULL; t++)
- if (!strcmp(*t, p))
+ for(t = legal; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
if (*t != NULL)
continue;
- printf("mount option <%s> is unknown\n", p);
- return (EINVAL);
+ sprintf(errmsg, "mount option <%s> is unknown", p);
+ printf("%s\n", errmsg);
+ ret = EINVAL;
+ }
+ if (ret != 0) {
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(opt->name, "errmsg") == 0) {
+ strncpy((char *)opt->value, errmsg, opt->len);
+ }
+ }
}
- return (0);
+ return (ret);
}
/*
@@ -1586,6 +1875,24 @@
return (ENOENT);
}
+static int
+vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt;
+ int i;
+
+ if (opts == NULL)
+ return (-1);
+
+ i = 0;
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0)
+ return (i);
+ ++i;
+ }
+ return (-1);
+}
+
char *
vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
{
@@ -1601,6 +1908,7 @@
}
return (opt->value);
}
+ *error = ENOENT;
return (NULL);
}
@@ -1633,6 +1941,8 @@
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
+ if (opt->len == 0 || opt->value == NULL)
+ return (0);
if (((char *)opt->value)[opt->len - 1] != '\0')
return (0);
va_start(ap, fmt);
@@ -1687,9 +1997,9 @@
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
- while (vp != NULL && vp->v_type == VMARKER)
+ while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
+
/* Check if we are done */
if (vp == NULL) {
__mnt_vnode_markerfree(mvp, mp);
@@ -1708,9 +2018,9 @@
mtx_assert(MNT_MTX(mp), MA_OWNED);
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
- while (vp != NULL && vp->v_type == VMARKER)
+ while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
+
/* Check if we are done */
if (vp == NULL) {
*mvp = NULL;
@@ -1725,9 +2035,9 @@
(*mvp)->v_type = VMARKER;
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
- while (vp != NULL && vp->v_type == VMARKER)
+ while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
+
/* Check if we are done */
if (vp == NULL) {
MNT_IUNLOCK(mp);
@@ -1752,7 +2062,7 @@
if (*mvp == NULL)
return;
-
+
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
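The reworked vfs_filteropt() above now treats an option as known if either its literal name or, when it starts with "no", the name with that prefix stripped appears in a legal list, and it collects the complaint into the "errmsg" option rather than failing on the first unknown name. A small userland sketch of just the matching rule; the legal list here is a stand-in, not the kernel's global_opts table:

#include <stdio.h>
#include <string.h>

static const char *legal[] = { "atime", "exec", "suid", "ro", "rw", NULL };

static int
opt_is_known(const char *p)
{
    const char **t;
    const char *q = NULL;

    if (p[0] == 'n' && p[1] == 'o')
        q = p + 2;                  /* also try the name without "no" */
    for (t = legal; *t != NULL; t++) {
        if (strcmp(*t, p) == 0)
            return (1);
        if (q != NULL && strcmp(*t, q) == 0)
            return (1);
    }
    return (0);
}

int
main(void)
{
    const char *tests[] = { "noatime", "atime", "nosuid", "bogus", NULL };

    for (int i = 0; tests[i] != NULL; i++)
        printf("%-8s -> %s\n", tests[i],
            opt_is_known(tests[i]) ? "accepted" : "unknown");
    return (0);
}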
Index: subr_trap.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_trap.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_trap.c -L sys/kern/subr_trap.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -38,19 +38,19 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_trap.c,v 1.281 2005/03/28 12:52:46 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_trap.c,v 1.299 2007/09/17 05:27:20 jeff Exp $");
#include "opt_ktrace.h"
#include "opt_mac.h"
#ifdef __i386__
#include "opt_npx.h"
#endif
+#include "opt_sched.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/ktr.h>
@@ -67,17 +67,14 @@
#include <machine/cpu.h>
#include <machine/pcb.h>
+#include <security/mac/mac_framework.h>
+
/*
- * Define the code needed before returning to user mode, for
- * trap and syscall.
- *
- * MPSAFE
+ * Define the code needed before returning to user mode, for trap and
+ * syscall.
*/
void
-userret(td, frame, oticks)
- struct thread *td;
- struct trapframe *frame;
- u_int oticks;
+userret(struct thread *td, struct trapframe *frame)
{
struct proc *p = td->td_proc;
@@ -86,14 +83,18 @@
#ifdef DIAGNOSTIC
/* Check that we called signotify() enough. */
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
(td->td_flags & TDF_ASTPENDING) == 0))
printf("failed to set signal flags properly for ast()\n");
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
PROC_UNLOCK(p);
#endif
+#ifdef KTRACE
+ KTRUSERRET(td);
+#endif
+
/*
* If this thread tickled GEOM, we need to wait for the giggling to
* stop before we return to userland
@@ -113,20 +114,20 @@
PROC_UNLOCK(p);
}
+#ifdef KSE
/*
* Do special thread processing, e.g. upcall tweaking and such.
*/
if (p->p_flag & P_SA)
thread_userret(td, frame);
+#endif
/*
* Charge system time if profiling.
*/
if (p->p_flag & P_PROFIL) {
- quad_t ticks;
- ticks = td->td_sticks - oticks;
- addupc_task(td, TRAPF_PC(frame), (u_int)ticks * psratio);
+ addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
}
/*
@@ -147,54 +148,48 @@
{
struct thread *td;
struct proc *p;
- struct ksegrp *kg;
- struct rlimit rlim;
- u_int sticks;
- int sflag;
int flags;
int sig;
#if defined(DEV_NPX) && !defined(SMP)
int ucode;
+ ksiginfo_t ksi;
#endif
td = curthread;
p = td->td_proc;
- kg = td->td_ksegrp;
CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
mtx_assert(&Giant, MA_NOTOWNED);
- mtx_assert(&sched_lock, MA_NOTOWNED);
+ THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
td->td_frame = framep;
- sticks = td->td_sticks;
+ td->td_pticks = 0;
+#ifdef KSE
if ((p->p_flag & P_SA) && (td->td_mailbox == NULL))
thread_user_enter(td);
+#endif
/*
- * This updates the p_sflag's for the checks below in one
+ * This updates the td_flag's for the checks below in one
* "atomic" operation with turning off the astpending flag.
* If another AST is triggered while we are handling the
- * AST's saved in sflag, the astpending flag will be set and
+ * AST's saved in flags, the astpending flag will be set and
* ast() will be called again.
*/
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
flags = td->td_flags;
- sflag = p->p_sflag;
- p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND | PS_XCPU);
-#ifdef MAC
- p->p_sflag &= ~PS_MACPEND;
-#endif
td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK |
- TDF_NEEDRESCHED | TDF_INTERRUPT);
- cnt.v_soft++;
- mtx_unlock_spin(&sched_lock);
+ TDF_NEEDRESCHED | TDF_INTERRUPT | TDF_ALRMPEND | TDF_PROFPEND |
+ TDF_MACPEND);
+ thread_unlock(td);
+ PCPU_INC(cnt.v_trap);
/*
* XXXKSE While the fact that we owe a user profiling
- * tick is stored per KSE in this code, the statistics
+ * tick is stored per thread in this code, the statistics
* themselves are still stored per process.
* This should probably change, by which I mean that
* possibly the location of both might change.
@@ -206,7 +201,7 @@
td->td_profil_ticks = 0;
td->td_pflags &= ~TDP_OWEUPC;
}
- if (sflag & PS_ALRMPEND) {
+ if (flags & TDF_ALRMPEND) {
PROC_LOCK(p);
psignal(p, SIGVTALRM);
PROC_UNLOCK(p);
@@ -217,32 +212,20 @@
PCB_NPXTRAP);
ucode = npxtrap();
if (ucode != -1) {
- trapsignal(td, SIGFPE, ucode);
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGFPE;
+ ksi.ksi_code = ucode;
+ trapsignal(td, &ksi);
}
}
#endif
- if (sflag & PS_PROFPEND) {
+ if (flags & TDF_PROFPEND) {
PROC_LOCK(p);
psignal(p, SIGPROF);
PROC_UNLOCK(p);
}
- if (sflag & PS_XCPU) {
- PROC_LOCK(p);
- lim_rlimit(p, RLIMIT_CPU, &rlim);
- mtx_lock_spin(&sched_lock);
- if (p->p_rux.rux_runtime.sec >= rlim.rlim_max) {
- mtx_unlock_spin(&sched_lock);
- killproc(p, "exceeded maximum CPU limit");
- } else {
- if (p->p_cpulimit < rlim.rlim_max)
- p->p_cpulimit += 5;
- mtx_unlock_spin(&sched_lock);
- psignal(p, SIGXCPU);
- }
- PROC_UNLOCK(p);
- }
#ifdef MAC
- if (sflag & PS_MACPEND)
+ if (flags & TDF_MACPEND)
mac_thread_userret(td);
#endif
if (flags & TDF_NEEDRESCHED) {
@@ -250,10 +233,11 @@
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 1);
#endif
- mtx_lock_spin(&sched_lock);
- sched_prio(td, kg->kg_user_pri);
+ thread_lock(td);
+ sched_prio(td, td->td_user_pri);
+ SCHED_STAT_INC(switch_needresched);
mi_switch(SW_INVOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 1);
@@ -268,6 +252,6 @@
PROC_UNLOCK(p);
}
- userret(td, framep, sticks);
+ userret(td, framep);
mtx_assert(&Giant, MA_NOTOWNED);
}
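
The ast() changes above move the pending-signal bookkeeping from the per-process p_sflag word (guarded by sched_lock) to per-thread td_flags guarded by the new thread lock. A minimal sketch of how a caller would now ask ast() to post SIGVTALRM on the next return to user mode, using only the thread_lock()/thread_unlock() helpers and TDF_* flags that appear in this patch (the helper name is illustrative):

/*
 * Sketch only: mark a pending virtual-alarm AST on a thread.
 * ast() later clears both flags under thread_lock() and posts SIGVTALRM.
 */
static void
request_vtalrm_ast(struct thread *td)
{

        thread_lock(td);                /* replaces mtx_lock_spin(&sched_lock) */
        td->td_flags |= TDF_ALRMPEND | TDF_ASTPENDING;
        thread_unlock(td);
}
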
Index: kern_clock.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_clock.c -L sys/kern/kern_clock.c -u -r1.2 -r1.3
--- sys/kern/kern_clock.c
+++ sys/kern/kern_clock.c
@@ -35,8 +35,9 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_clock.c,v 1.178.2.3 2006/03/10 19:37:33 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_clock.c,v 1.202 2007/09/17 05:27:20 jeff Exp $");
+#include "opt_kdb.h"
#include "opt_device_polling.h"
#include "opt_hwpmc_hooks.h"
#include "opt_ntp.h"
@@ -65,8 +66,6 @@
#include <sys/limits.h>
#include <sys/timetc.h>
-#include <machine/cpu.h>
-
#ifdef GPROF
#include <sys/gmon.h>
#endif
@@ -85,6 +84,9 @@
/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];
+/* Spin-lock protecting profiling statistics. */
+static struct mtx time_lock;
+
static int
sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
{
@@ -92,7 +94,7 @@
#ifdef SCTL_MASK32
int i;
unsigned int cp_time32[CPUSTATES];
-
+
if (req->flags & SCTL_MASK32) {
if (!req->oldptr)
return SYSCTL_OUT(req, 0, sizeof(cp_time32));
@@ -109,7 +111,7 @@
return error;
}
-SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD,
+SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD,
0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
#ifdef SW_WATCHDOG
@@ -173,6 +175,7 @@
* Set divisors to 1 (normal case) and let the machine-specific
* code do its bit.
*/
+ mtx_init(&time_lock, "time lock", NULL, MTX_SPIN);
cpu_initclocks();
/*
@@ -189,38 +192,39 @@
/*
* Each time the real-time timer fires, this function is called on all CPUs.
- * Note that hardclock() calls hardclock_process() for the boot CPU, so only
+ * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
* the other CPUs in the system need to call this function.
*/
void
-hardclock_process(frame)
- register struct clockframe *frame;
+hardclock_cpu(int usermode)
{
struct pstats *pstats;
struct thread *td = curthread;
struct proc *p = td->td_proc;
+ int flags;
/*
* Run current process's virtual and profile time, as needed.
*/
- mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
- if (p->p_flag & P_SA) {
- /* XXXKSE What to do? */
- } else {
- pstats = p->p_stats;
- if (CLKF_USERMODE(frame) &&
- timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
- itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
- p->p_sflag |= PS_ALRMPEND;
- td->td_flags |= TDF_ASTPENDING;
- }
- if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
- itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
- p->p_sflag |= PS_PROFPEND;
- td->td_flags |= TDF_ASTPENDING;
- }
- }
- mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
+ pstats = p->p_stats;
+ flags = 0;
+ if (usermode &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ flags |= TDF_ALRMPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ flags |= TDF_PROFPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ thread_lock(td);
+ sched_tick();
+ td->td_flags |= flags;
+ thread_unlock(td);
#ifdef HWPMC_HOOKS
if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
@@ -232,13 +236,11 @@
* The real-time timer, interrupting hz times per second.
*/
void
-hardclock(frame)
- register struct clockframe *frame;
+hardclock(int usermode, uintfptr_t pc)
{
int need_softclock = 0;
- CTR0(KTR_CLK, "hardclock fired");
- hardclock_process(frame);
+ hardclock_cpu(usermode);
tc_ticktock();
/*
@@ -247,8 +249,8 @@
* XXX: this only works for UP
*/
if (stathz == 0) {
- profclock(frame);
- statclock(frame);
+ profclock(usermode, pc);
+ statclock(usermode);
}
#ifdef DEVICE_POLLING
@@ -261,15 +263,15 @@
*/
mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
ticks++;
- if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
+ if (!TAILQ_EMPTY(&callwheel[ticks & callwheelmask])) {
need_softclock = 1;
} else if (softticks + 1 == ticks)
++softticks;
mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);
/*
- * swi_sched acquires sched_lock, so we don't want to call it with
- * callout_lock held; incorrect locking order.
+ * swi_sched acquires the thread lock, so we don't want to call it
+ * with callout_lock held; incorrect locking order.
*/
if (need_softclock)
swi_sched(softclock_ih, 0);
@@ -350,20 +352,15 @@
register struct proc *p;
{
- /*
- * XXX; Right now sched_lock protects statclock(), but perhaps
- * it should be protected later on by a time_lock, which would
- * cover psdiv, etc. as well.
- */
PROC_LOCK_ASSERT(p, MA_OWNED);
if (p->p_flag & P_STOPPROF)
return;
if ((p->p_flag & P_PROFIL) == 0) {
- mtx_lock_spin(&sched_lock);
p->p_flag |= P_PROFIL;
+ mtx_lock_spin(&time_lock);
if (++profprocs == 1)
cpu_startprofclock();
- mtx_unlock_spin(&sched_lock);
+ mtx_unlock_spin(&time_lock);
}
}
@@ -386,24 +383,22 @@
}
if ((p->p_flag & P_PROFIL) == 0)
return;
- mtx_lock_spin(&sched_lock);
p->p_flag &= ~P_PROFIL;
+ mtx_lock_spin(&time_lock);
if (--profprocs == 0)
cpu_stopprofclock();
- mtx_unlock_spin(&sched_lock);
+ mtx_unlock_spin(&time_lock);
}
}
/*
- * Statistics clock. Grab profile sample, and if divider reaches 0,
- * do process and kernel statistics. Most of the statistics are only
- * used by user-level statistics programs. The main exceptions are
- * ke->ke_uticks, p->p_rux.rux_sticks, p->p_rux.rux_iticks, and p->p_estcpu.
+ * Statistics clock. Updates rusage information and calls the scheduler
+ * to adjust priorities of the active thread.
+ *
* This should be called by all active processors.
*/
void
-statclock(frame)
- register struct clockframe *frame;
+statclock(int usermode)
{
struct rusage *ru;
struct vmspace *vm;
@@ -414,18 +409,20 @@
td = curthread;
p = td->td_proc;
- mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
- if (CLKF_USERMODE(frame)) {
+ thread_lock_flags(td, MTX_QUIET);
+ if (usermode) {
/*
* Charge the time as appropriate.
*/
+#ifdef KSE
if (p->p_flag & P_SA)
thread_statclock(1);
- p->p_rux.rux_uticks++;
+#endif
+ td->td_uticks++;
if (p->p_nice > NZERO)
- cp_time[CP_NICE]++;
+ atomic_add_long(&cp_time[CP_NICE], 1);
else
- cp_time[CP_USER]++;
+ atomic_add_long(&cp_time[CP_USER], 1);
} else {
/*
* Came from kernel mode, so we were:
@@ -441,50 +438,49 @@
*/
if ((td->td_pflags & TDP_ITHREAD) ||
td->td_intr_nesting_level >= 2) {
- p->p_rux.rux_iticks++;
- cp_time[CP_INTR]++;
+ td->td_iticks++;
+ atomic_add_long(&cp_time[CP_INTR], 1);
} else {
+#ifdef KSE
if (p->p_flag & P_SA)
thread_statclock(0);
+#endif
+ td->td_pticks++;
td->td_sticks++;
- p->p_rux.rux_sticks++;
- if (td != PCPU_GET(idlethread))
- cp_time[CP_SYS]++;
+ if (!TD_IS_IDLETHREAD(td))
+ atomic_add_long(&cp_time[CP_SYS], 1);
else
- cp_time[CP_IDLE]++;
+ atomic_add_long(&cp_time[CP_IDLE], 1);
}
}
- CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d",
- td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz);
-
- sched_clock(td);
/* Update resource usage integrals and maximums. */
- MPASS(p->p_stats != NULL);
MPASS(p->p_vmspace != NULL);
vm = p->p_vmspace;
- ru = &p->p_stats->p_ru;
+ ru = &td->td_ru;
ru->ru_ixrss += pgtok(vm->vm_tsize);
ru->ru_idrss += pgtok(vm->vm_dsize);
ru->ru_isrss += pgtok(vm->vm_ssize);
rss = pgtok(vmspace_resident_count(vm));
if (ru->ru_maxrss < rss)
ru->ru_maxrss = rss;
- mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
+ CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d",
+ td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz);
+ sched_clock(td);
+ thread_unlock(td);
}
void
-profclock(frame)
- register struct clockframe *frame;
+profclock(int usermode, uintfptr_t pc)
{
struct thread *td;
#ifdef GPROF
struct gmonparam *g;
- int i;
+ uintfptr_t i;
#endif
td = curthread;
- if (CLKF_USERMODE(frame)) {
+ if (usermode) {
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled, record the tick.
@@ -492,7 +488,7 @@
* bother trying to count it.
*/
if (td->td_proc->p_flag & P_PROFIL)
- addupc_intr(td, CLKF_PC(frame), 1);
+ addupc_intr(td, pc, 1);
}
#ifdef GPROF
else {
@@ -500,11 +496,10 @@
* Kernel statistics are just like addupc_intr, only easier.
*/
g = &_gmonparam;
- if (g->state == GMON_PROF_ON) {
- i = CLKF_PC(frame) - g->lowpc;
+ if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
+ i = PC_TO_I(g, pc);
if (i < g->textsize) {
- i /= HISTFRACTION * sizeof(*g->kcount);
- g->kcount[i]++;
+ KCOUNT(g, i)++;
}
}
}
@@ -536,15 +531,15 @@
#ifdef SW_WATCHDOG
static void
-watchdog_config(void *unused __unused, u_int cmd, int *err)
+watchdog_config(void *unused __unused, u_int cmd, int *error)
{
u_int u;
u = cmd & WD_INTERVAL;
- if ((cmd & WD_ACTIVE) && u >= WD_TO_1SEC) {
+ if (u >= WD_TO_1SEC) {
watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
watchdog_enabled = 1;
- *err = 0;
+ *error = 0;
} else {
watchdog_enabled = 0;
}
@@ -552,7 +547,7 @@
/*
* Handle a watchdog timeout by dumping interrupt information and
- * then either dropping to DDB or panicing.
+ * then either dropping to DDB or panicking.
*/
static void
watchdog_fire(void)
@@ -566,7 +561,7 @@
curname = intrnames;
inttotal = 0;
nintr = eintrcnt - intrcnt;
-
+
printf("interrupt total\n");
while (--nintr >= 0) {
if (*curintr)
@@ -576,12 +571,12 @@
}
printf("Total %20ju\n", (uintmax_t)inttotal);
-#ifdef KDB
+#if defined(KDB) && !defined(KDB_UNATTENDED)
kdb_backtrace();
kdb_enter("watchdog timeout");
#else
panic("watchdog timeout");
-#endif /* KDB */
+#endif
}
#endif /* SW_WATCHDOG */
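
The kern_clock.c hunks above change the MI clock entry points from taking a struct clockframe to taking the usermode flag and interrupted program counter directly. A hedged sketch of how machine-dependent clock interrupt glue would invoke them under the new interface; the handler name is hypothetical and the TRAPF_USERMODE()/TRAPF_PC() accessors are assumed from their use in subr_trap.c above, real platforms differ in how they dispatch per CPU:

/* Illustrative MD glue; exact per-CPU dispatch varies by platform. */
static void
md_clock_interrupt(struct trapframe *frame)
{
        int usermode = TRAPF_USERMODE(frame);

        if (PCPU_GET(cpuid) == 0)
                hardclock(usermode, TRAPF_PC(frame));   /* drives ticks, callouts */
        else
                hardclock_cpu(usermode);                /* per-CPU accounting only */
}
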
Index: kern_mutex.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mutex.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_mutex.c -L sys/kern/kern_mutex.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_mutex.c
+++ sys/kern/kern_mutex.c
@@ -34,12 +34,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.154.2.5 2005/12/20 19:28:23 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.198.2.1 2007/12/01 11:28:37 attilio Exp $");
#include "opt_adaptive_mutexes.h"
#include "opt_ddb.h"
-#include "opt_mprof.h"
-#include "opt_mutex_wake_all.h"
+#include "opt_global.h"
#include "opt_sched.h"
#include <sys/param.h>
@@ -59,10 +58,10 @@
#include <sys/sysctl.h>
#include <sys/turnstile.h>
#include <sys/vmmeter.h>
+#include <sys/lock_profile.h>
#include <machine/atomic.h>
#include <machine/bus.h>
-#include <machine/clock.h>
#include <machine/cpu.h>
#include <ddb/ddb.h>
@@ -72,13 +71,8 @@
#include <vm/vm.h>
#include <vm/vm_extern.h>
-/*
- * Force MUTEX_WAKE_ALL for now.
- * single thread wakeup needs fixes to avoid race conditions with
- * priority inheritance.
- */
-#ifndef MUTEX_WAKE_ALL
-#define MUTEX_WAKE_ALL
+#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
+#define ADAPTIVE_MUTEXES
#endif
/*
@@ -86,188 +80,91 @@
*/
#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED)
-#define mtx_owner(m) (mtx_unowned((m)) ? NULL \
- : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK))
+#define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)
+
+#define mtx_owner(m) ((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))
#ifdef DDB
static void db_show_mtx(struct lock_object *lock);
#endif
+static void lock_mtx(struct lock_object *lock, int how);
+static void lock_spin(struct lock_object *lock, int how);
+static int unlock_mtx(struct lock_object *lock);
+static int unlock_spin(struct lock_object *lock);
/*
* Lock classes for sleep and spin mutexes.
*/
struct lock_class lock_class_mtx_sleep = {
- "sleep mutex",
- LC_SLEEPLOCK | LC_RECURSABLE,
+ .lc_name = "sleep mutex",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
#ifdef DDB
- db_show_mtx
+ .lc_ddb_show = db_show_mtx,
#endif
+ .lc_lock = lock_mtx,
+ .lc_unlock = unlock_mtx,
};
struct lock_class lock_class_mtx_spin = {
- "spin mutex",
- LC_SPINLOCK | LC_RECURSABLE,
+ .lc_name = "spin mutex",
+ .lc_flags = LC_SPINLOCK | LC_RECURSABLE,
#ifdef DDB
- db_show_mtx
+ .lc_ddb_show = db_show_mtx,
#endif
+ .lc_lock = lock_spin,
+ .lc_unlock = unlock_spin,
};
/*
* System-wide mutexes
*/
-struct mtx sched_lock;
+struct mtx blocked_lock;
struct mtx Giant;
-#ifdef MUTEX_PROFILING
-SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging");
-SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling");
-static int mutex_prof_enable = 0;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW,
- &mutex_prof_enable, 0, "Enable tracing of mutex holdtime");
-
-struct mutex_prof {
- const char *name;
- const char *file;
- int line;
- uintmax_t cnt_max;
- uintmax_t cnt_tot;
- uintmax_t cnt_cur;
- uintmax_t cnt_contest_holding;
- uintmax_t cnt_contest_locking;
- struct mutex_prof *next;
-};
-
-/*
- * mprof_buf is a static pool of profiling records to avoid possible
- * reentrance of the memory allocation functions.
- *
- * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE.
- */
-#ifdef MPROF_BUFFERS
-#define NUM_MPROF_BUFFERS MPROF_BUFFERS
+#ifdef LOCK_PROFILING
+static inline void lock_profile_init(void)
+{
+ int i;
+ /* Initialize the mutex profiling locks */
+ for (i = 0; i < LPROF_LOCK_SIZE; i++) {
+ mtx_init(&lprof_locks[i], "mprof lock",
+ NULL, MTX_SPIN|MTX_QUIET|MTX_NOPROFILE);
+ }
+}
#else
-#define NUM_MPROF_BUFFERS 1000
+static inline void lock_profile_init(void) {;}
#endif
-static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS];
-static int first_free_mprof_buf;
-#ifndef MPROF_HASH_SIZE
-#define MPROF_HASH_SIZE 1009
-#endif
-#if NUM_MPROF_BUFFERS >= MPROF_HASH_SIZE
-#error MPROF_BUFFERS must be larger than MPROF_HASH_SIZE
-#endif
-static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE];
-/* SWAG: sbuf size = avg stat. line size * number of locks */
-#define MPROF_SBUF_SIZE 256 * 400
-
-static int mutex_prof_acquisitions;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
- &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded");
-static int mutex_prof_records;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD,
- &mutex_prof_records, 0, "Number of profiling records");
-static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
- &mutex_prof_maxrecords, 0, "Maximum number of profiling records");
-static int mutex_prof_rejected;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD,
- &mutex_prof_rejected, 0, "Number of rejected profiling records");
-static int mutex_prof_hashsize = MPROF_HASH_SIZE;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD,
- &mutex_prof_hashsize, 0, "Hash size");
-static int mutex_prof_collisions = 0;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD,
- &mutex_prof_collisions, 0, "Number of hash collisions");
-
-/*
- * mprof_mtx protects the profiling buffers and the hash.
- */
-static struct mtx mprof_mtx;
-MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET);
-
-static u_int64_t
-nanoseconds(void)
-{
- struct timespec tv;
-
- nanotime(&tv);
- return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
-}
-
-static int
-dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
-{
- struct sbuf *sb;
- int error, i;
- static int multiplier = 1;
-
- if (first_free_mprof_buf == 0)
- return (SYSCTL_OUT(req, "No locking recorded",
- sizeof("No locking recorded")));
-
-retry_sbufops:
- sb = sbuf_new(NULL, NULL, MPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN);
- sbuf_printf(sb, "\n%6s %12s %11s %5s %12s %12s %s\n",
- "max", "total", "count", "avg", "cnt_hold", "cnt_lock", "name");
- /*
- * XXX this spinlock seems to be by far the largest perpetrator
- * of spinlock latency (1.6 msec on an Athlon1600 was recorded
- * even before I pessimized it further by moving the average
- * computation here).
- */
- mtx_lock_spin(&mprof_mtx);
- for (i = 0; i < first_free_mprof_buf; ++i) {
- sbuf_printf(sb, "%6ju %12ju %11ju %5ju %12ju %12ju %s:%d (%s)\n",
- mprof_buf[i].cnt_max / 1000,
- mprof_buf[i].cnt_tot / 1000,
- mprof_buf[i].cnt_cur,
- mprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
- mprof_buf[i].cnt_tot / (mprof_buf[i].cnt_cur * 1000),
- mprof_buf[i].cnt_contest_holding,
- mprof_buf[i].cnt_contest_locking,
- mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name);
- if (sbuf_overflowed(sb)) {
- mtx_unlock_spin(&mprof_mtx);
- sbuf_delete(sb);
- multiplier++;
- goto retry_sbufops;
- }
- }
- mtx_unlock_spin(&mprof_mtx);
- sbuf_finish(sb);
- error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
- sbuf_delete(sb);
- return (error);
-}
-SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
- NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics");
-
-static int
-reset_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
-{
- int error, v;
-
- if (first_free_mprof_buf == 0)
- return (0);
-
- v = 0;
- error = sysctl_handle_int(oidp, &v, 0, req);
- if (error)
- return (error);
- if (req->newptr == NULL)
- return (error);
- if (v == 0)
- return (0);
-
- mtx_lock_spin(&mprof_mtx);
- bzero(mprof_buf, sizeof(*mprof_buf) * first_free_mprof_buf);
- bzero(mprof_hash, sizeof(struct mtx *) * MPROF_HASH_SIZE);
- first_free_mprof_buf = 0;
- mtx_unlock_spin(&mprof_mtx);
+
+void
+lock_mtx(struct lock_object *lock, int how)
+{
+
+ mtx_lock((struct mtx *)lock);
+}
+
+void
+lock_spin(struct lock_object *lock, int how)
+{
+
+ panic("spin locks can only use msleep_spin");
+}
+
+int
+unlock_mtx(struct lock_object *lock)
+{
+ struct mtx *m;
+
+ m = (struct mtx *)lock;
+ mtx_assert(m, MA_OWNED | MA_NOTRECURSED);
+ mtx_unlock(m);
return (0);
}
-SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
- NULL, 0, reset_mutex_prof_stats, "I", "Reset mutex profiling statistics");
-#endif
+
+int
+unlock_spin(struct lock_object *lock)
+{
+
+ panic("spin locks can only use msleep_spin");
+}
/*
* Function versions of the inlined __mtx_* macros. These are used by
@@ -278,119 +175,57 @@
{
MPASS(curthread != NULL);
- KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
- ("mtx_lock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
file, line));
- WITNESS_CHECKORDER(&m->mtx_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
+ WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
file, line);
+
_get_sleep_lock(m, curthread, opts, file, line);
- LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
line);
- WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
-#ifdef MUTEX_PROFILING
- /* don't reset the timer when/if recursing */
- if (m->mtx_acqtime == 0) {
- m->mtx_filename = file;
- m->mtx_lineno = line;
- m->mtx_acqtime = mutex_prof_enable ? nanoseconds() : 0;
- ++mutex_prof_acquisitions;
- }
-#endif
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
}
void
_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
{
-
MPASS(curthread != NULL);
- KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
- ("mtx_unlock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
file, line));
- WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
- LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ curthread->td_locks--;
+ WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
line);
mtx_assert(m, MA_OWNED);
-#ifdef MUTEX_PROFILING
- if (m->mtx_acqtime != 0) {
- static const char *unknown = "(unknown)";
- struct mutex_prof *mpp;
- u_int64_t acqtime, now;
- const char *p, *q;
- volatile u_int hash;
-
- now = nanoseconds();
- acqtime = m->mtx_acqtime;
- m->mtx_acqtime = 0;
- if (now <= acqtime)
- goto out;
- for (p = m->mtx_filename;
- p != NULL && strncmp(p, "../", 3) == 0; p += 3)
- /* nothing */ ;
- if (p == NULL || *p == '\0')
- p = unknown;
- for (hash = m->mtx_lineno, q = p; *q != '\0'; ++q)
- hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
- mtx_lock_spin(&mprof_mtx);
- for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next)
- if (mpp->line == m->mtx_lineno &&
- strcmp(mpp->file, p) == 0)
- break;
- if (mpp == NULL) {
- /* Just exit if we cannot get a trace buffer */
- if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
- ++mutex_prof_rejected;
- goto unlock;
- }
- mpp = &mprof_buf[first_free_mprof_buf++];
- mpp->name = mtx_name(m);
- mpp->file = p;
- mpp->line = m->mtx_lineno;
- mpp->next = mprof_hash[hash];
- if (mprof_hash[hash] != NULL)
- ++mutex_prof_collisions;
- mprof_hash[hash] = mpp;
- ++mutex_prof_records;
- }
- /*
- * Record if the mutex has been held longer now than ever
- * before.
- */
- if (now - acqtime > mpp->cnt_max)
- mpp->cnt_max = now - acqtime;
- mpp->cnt_tot += now - acqtime;
- mpp->cnt_cur++;
- /*
- * There's a small race, really we should cmpxchg
- * 0 with the current value, but that would bill
- * the contention to the wrong lock instance if
- * it followed this also.
- */
- mpp->cnt_contest_holding += m->mtx_contest_holding;
- m->mtx_contest_holding = 0;
- mpp->cnt_contest_locking += m->mtx_contest_locking;
- m->mtx_contest_locking = 0;
-unlock:
- mtx_unlock_spin(&mprof_mtx);
- }
-out:
-#endif
+
+ if (m->mtx_recurse == 0)
+ lock_profile_release_lock(&m->lock_object);
_rel_sleep_lock(m, curthread, opts, file, line);
}
void
_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
{
-
+
MPASS(curthread != NULL);
- KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
("mtx_lock_spin() of sleep mutex %s @ %s:%d",
- m->mtx_object.lo_name, file, line));
- WITNESS_CHECKORDER(&m->mtx_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
+ m->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
file, line);
_get_spin_lock(m, curthread, opts, file, line);
- LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
line);
- WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
}
void
@@ -398,13 +233,16 @@
{
MPASS(curthread != NULL);
- KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
- m->mtx_object.lo_name, file, line));
- WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
- LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ m->lock_object.lo_name, file, line));
+ WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
line);
mtx_assert(m, MA_OWNED);
+
_rel_spin_lock(m);
}
@@ -416,24 +254,33 @@
int
_mtx_trylock(struct mtx *m, int opts, const char *file, int line)
{
- int rval;
-
+ int rval, contested = 0;
+ uint64_t waittime = 0;
+
MPASS(curthread != NULL);
- KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
- ("mtx_trylock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
file, line));
- if (mtx_owned(m) && (m->mtx_object.lo_flags & LO_RECURSABLE) != 0) {
+ if (mtx_owned(m) && (m->lock_object.lo_flags & LO_RECURSABLE) != 0) {
m->mtx_recurse++;
atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
rval = 1;
} else
rval = _obtain_lock(m, (uintptr_t)curthread);
- LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
- if (rval)
- WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
+ LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
file, line);
+ curthread->td_locks++;
+ if (m->mtx_recurse == 0)
+ lock_profile_obtain_lock_success(&m->lock_object, contested,
+ waittime, file, line);
+
+ }
return (rval);
}
@@ -448,42 +295,62 @@
_mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts, const char *file,
int line)
{
-#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
- struct thread *owner;
+ struct turnstile *ts;
+#ifdef ADAPTIVE_MUTEXES
+ volatile struct thread *owner;
#endif
- uintptr_t v;
#ifdef KTR
int cont_logged = 0;
#endif
-#ifdef MUTEX_PROFILING
- int contested;
-#endif
-
+ int contested = 0;
+ uint64_t waittime = 0;
+ uintptr_t v;
+
if (mtx_owned(m)) {
- KASSERT((m->mtx_object.lo_flags & LO_RECURSABLE) != 0,
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0,
("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
- m->mtx_object.lo_name, file, line));
+ m->lock_object.lo_name, file, line));
m->mtx_recurse++;
atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
return;
}
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ lock_profile_obtain_lock_failed(&m->lock_object,
+ &contested, &waittime);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
CTR4(KTR_LOCK,
"_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
- m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);
+ m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
-#ifdef MUTEX_PROFILING
- contested = 0;
+ while (!_obtain_lock(m, tid)) {
+#ifdef ADAPTIVE_MUTEXES
+ /*
+ * If the owner is running on another CPU, spin until the
+ * owner stops running or the state of the lock changes.
+ */
+ v = m->mtx_lock;
+ if (v != MTX_UNOWNED) {
+ owner = (struct thread *)(v & ~MTX_FLAGMASK);
+#ifdef ADAPTIVE_GIANT
+ if (TD_IS_RUNNING(owner)) {
+#else
+ if (m != &Giant && TD_IS_RUNNING(owner)) {
#endif
- while (!_obtain_lock(m, tid)) {
-#ifdef MUTEX_PROFILING
- contested = 1;
- atomic_add_int(&m->mtx_contest_holding, 1);
+ if (LOCK_LOG_TEST(&m->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, m, owner);
+ while (mtx_owner(m) == owner &&
+ TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ continue;
+ }
+ }
#endif
- turnstile_lock(&m->mtx_object);
+
+ ts = turnstile_trywait(&m->lock_object);
v = m->mtx_lock;
/*
@@ -491,24 +358,27 @@
* the turnstile chain lock.
*/
if (v == MTX_UNOWNED) {
- turnstile_release(&m->mtx_object);
+ turnstile_cancel(ts);
cpu_spinwait();
continue;
}
-#ifdef MUTEX_WAKE_ALL
MPASS(v != MTX_CONTESTED);
-#else
+
+#ifdef ADAPTIVE_MUTEXES
/*
- * The mutex was marked contested on release. This means that
- * there are other threads blocked on it. Grab ownership of
- * it and propagate its priority to the current thread if
- * necessary.
+ * If the current owner of the lock is executing on another
+ * CPU, quit the hard path and try to spin.
*/
- if (v == MTX_CONTESTED) {
- m->mtx_lock = tid | MTX_CONTESTED;
- turnstile_claim(&m->mtx_object);
- break;
+ owner = (struct thread *)(v & ~MTX_FLAGMASK);
+#ifdef ADAPTIVE_GIANT
+ if (TD_IS_RUNNING(owner)) {
+#else
+ if (m != &Giant && TD_IS_RUNNING(owner)) {
+#endif
+ turnstile_cancel(ts);
+ cpu_spinwait();
+ continue;
}
#endif
@@ -519,30 +389,11 @@
*/
if ((v & MTX_CONTESTED) == 0 &&
!atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
- turnstile_release(&m->mtx_object);
+ turnstile_cancel(ts);
cpu_spinwait();
continue;
}
-#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
- /*
- * If the current owner of the lock is executing on another
- * CPU, spin instead of blocking.
- */
- owner = (struct thread *)(v & MTX_FLAGMASK);
-#ifdef ADAPTIVE_GIANT
- if (TD_IS_RUNNING(owner)) {
-#else
- if (m != &Giant && TD_IS_RUNNING(owner)) {
-#endif
- turnstile_release(&m->mtx_object);
- while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) {
- cpu_spinwait();
- }
- continue;
- }
-#endif /* SMP && !NO_ADAPTIVE_MUTEXES */
-
/*
* We definitely must sleep for this lock.
*/
@@ -552,9 +403,9 @@
if (!cont_logged) {
CTR6(KTR_CONTENTION,
"contention: %p at %s:%d wants %s, taken by %s:%d",
- (void *)tid, file, line, m->mtx_object.lo_name,
- WITNESS_FILE(&m->mtx_object),
- WITNESS_LINE(&m->mtx_object));
+ (void *)tid, file, line, m->lock_object.lo_name,
+ WITNESS_FILE(&m->lock_object),
+ WITNESS_LINE(&m->lock_object));
cont_logged = 1;
}
#endif
@@ -562,22 +413,36 @@
/*
* Block on the turnstile.
*/
- turnstile_wait(&m->mtx_object, mtx_owner(m));
+ turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
}
-
#ifdef KTR
if (cont_logged) {
CTR4(KTR_CONTENTION,
"contention end: %s acquired by %p at %s:%d",
- m->mtx_object.lo_name, (void *)tid, file, line);
+ m->lock_object.lo_name, (void *)tid, file, line);
}
#endif
-#ifdef MUTEX_PROFILING
- if (contested)
- m->mtx_contest_locking++;
- m->mtx_contest_holding = 0;
+ lock_profile_obtain_lock_success(&m->lock_object, contested,
+ waittime, (file), (line));
+}
+
+static void
+_mtx_lock_spin_failed(struct mtx *m)
+{
+ struct thread *td;
+
+ td = mtx_owner(m);
+
+ /* If the mutex is unlocked, try again. */
+ if (td == NULL)
+ return;
+
+ printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
+ m, m->lock_object.lo_name, td, td->td_tid);
+#ifdef WITNESS
+ witness_display_spinlock(&m->lock_object, td);
#endif
- return;
+ panic("spin lock held too long");
}
#ifdef SMP
@@ -591,14 +456,14 @@
_mtx_lock_spin(struct mtx *m, uintptr_t tid, int opts, const char *file,
int line)
{
- int i = 0;
-
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ int i = 0, contested = 0;
+ uint64_t waittime = 0;
+
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
- for (;;) {
- if (_obtain_lock(m, tid))
- break;
+ lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
+ while (!_obtain_lock(m, tid)) {
/* Give interrupts a chance while we spin. */
spinlock_exit();
@@ -607,29 +472,107 @@
cpu_spinwait();
continue;
}
- if (i < 60000000)
+ if (i < 60000000 || kdb_active || panicstr != NULL)
DELAY(1);
- else if (!kdb_active && !panicstr) {
- printf("spin lock %s held by %p for > 5 seconds\n",
- m->mtx_object.lo_name, (void *)m->mtx_lock);
-#ifdef WITNESS
- witness_display_spinlock(&m->mtx_object,
- mtx_owner(m));
-#endif
- panic("spin lock held too long");
- }
+ else
+ _mtx_lock_spin_failed(m);
cpu_spinwait();
}
spinlock_enter();
}
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
- return;
+ lock_profile_obtain_lock_success(&m->lock_object, contested,
+ waittime, (file), (line));
}
#endif /* SMP */
+void
+_thread_lock_flags(struct thread *td, int opts, const char *file, int line)
+{
+ struct mtx *m;
+ uintptr_t tid;
+ int i, contested;
+ uint64_t waittime;
+
+
+ contested = i = 0;
+ waittime = 0;
+ tid = (uintptr_t)curthread;
+ for (;;) {
+retry:
+ spinlock_enter();
+ m = td->td_lock;
+ WITNESS_CHECKORDER(&m->lock_object,
+ opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line);
+ while (!_obtain_lock(m, tid)) {
+ if (m->mtx_lock == tid) {
+ m->mtx_recurse++;
+ break;
+ }
+ lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
+ /* Give interrupts a chance while we spin. */
+ spinlock_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000)
+ cpu_spinwait();
+ else if (i < 60000000 ||
+ kdb_active || panicstr != NULL)
+ DELAY(1);
+ else
+ _mtx_lock_spin_failed(m);
+ cpu_spinwait();
+ if (m != td->td_lock)
+ goto retry;
+ }
+ spinlock_enter();
+ }
+ if (m == td->td_lock)
+ break;
+ _rel_spin_lock(m); /* does spinlock_exit() */
+ }
+ lock_profile_obtain_lock_success(&m->lock_object, contested,
+ waittime, (file), (line));
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+}
+
+struct mtx *
+thread_lock_block(struct thread *td)
+{
+ struct mtx *lock;
+
+ spinlock_enter();
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = &blocked_lock;
+ mtx_unlock_spin(lock);
+
+ return (lock);
+}
+
+void
+thread_lock_unblock(struct thread *td, struct mtx *new)
+{
+ mtx_assert(new, MA_OWNED);
+ MPASS(td->td_lock == &blocked_lock);
+ atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
+ spinlock_exit();
+}
+
+void
+thread_lock_set(struct thread *td, struct mtx *new)
+{
+ struct mtx *lock;
+
+ mtx_assert(new, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = new;
+ mtx_unlock_spin(lock);
+}
+
/*
* _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
*
@@ -640,95 +583,33 @@
_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
{
struct turnstile *ts;
-#ifndef PREEMPTION
- struct thread *td, *td1;
-#endif
if (mtx_recursed(m)) {
if (--(m->mtx_recurse) == 0)
atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
return;
}
- turnstile_lock(&m->mtx_object);
- ts = turnstile_lookup(&m->mtx_object);
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ /*
+ * We have to lock the chain before the turnstile so this turnstile
+ * can be removed from the hash list if it is empty.
+ */
+ turnstile_chain_lock(&m->lock_object);
+ ts = turnstile_lookup(&m->lock_object);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
-#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
- if (ts == NULL) {
- _release_lock_quick(m);
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
- CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m);
- turnstile_release(&m->mtx_object);
- return;
- }
-#else
MPASS(ts != NULL);
-#endif
-#ifndef PREEMPTION
- /* XXX */
- td1 = turnstile_head(ts);
-#endif
-#ifdef MUTEX_WAKE_ALL
- turnstile_broadcast(ts);
+ turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
_release_lock_quick(m);
-#else
- if (turnstile_signal(ts)) {
- _release_lock_quick(m);
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
- CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
- } else {
- m->mtx_lock = MTX_CONTESTED;
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
- CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p still contested",
- m);
- }
-#endif
- turnstile_unpend(ts);
-
-#ifndef PREEMPTION
/*
- * XXX: This is just a hack until preemption is done. However,
- * once preemption is done we need to either wrap the
- * turnstile_signal() and release of the actual lock in an
- * extra critical section or change the preemption code to
- * always just set a flag and never do instant-preempts.
+ * This turnstile is now no longer associated with the mutex. We can
+ * unlock the chain lock so a new turnstile may take its place.
*/
- td = curthread;
- if (td->td_critnest > 0 || td1->td_priority >= td->td_priority)
- return;
- mtx_lock_spin(&sched_lock);
- if (!TD_IS_RUNNING(td1)) {
-#ifdef notyet
- if (td->td_ithd != NULL) {
- struct ithd *it = td->td_ithd;
-
- if (it->it_interrupted) {
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
- CTR2(KTR_LOCK,
- "_mtx_unlock_sleep: %p interrupted %p",
- it, it->it_interrupted);
- intr_thd_fixup(it);
- }
- }
-#endif
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
- CTR2(KTR_LOCK,
- "_mtx_unlock_sleep: %p switching out lock=%p", m,
- (void *)m->mtx_lock);
-
- mi_switch(SW_INVOL, NULL);
- if (LOCK_LOG_TEST(&m->mtx_object, opts))
- CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
- m, (void *)m->mtx_lock);
- }
- mtx_unlock_spin(&sched_lock);
-#endif
-
- return;
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&m->lock_object);
}
/*
@@ -752,20 +633,20 @@
case MA_OWNED | MA_NOTRECURSED:
if (!mtx_owned(m))
panic("mutex %s not owned at %s:%d",
- m->mtx_object.lo_name, file, line);
+ m->lock_object.lo_name, file, line);
if (mtx_recursed(m)) {
if ((what & MA_NOTRECURSED) != 0)
panic("mutex %s recursed at %s:%d",
- m->mtx_object.lo_name, file, line);
+ m->lock_object.lo_name, file, line);
} else if ((what & MA_RECURSED) != 0) {
panic("mutex %s unrecursed at %s:%d",
- m->mtx_object.lo_name, file, line);
+ m->lock_object.lo_name, file, line);
}
break;
case MA_NOTOWNED:
if (mtx_owned(m))
panic("mutex %s owned at %s:%d",
- m->mtx_object.lo_name, file, line);
+ m->lock_object.lo_name, file, line);
break;
default:
panic("unknown mtx_assert at %s:%d", file, line);
@@ -791,11 +672,6 @@
* XXX: When kernacc() does not require Giant we can reenable this check
*/
#ifdef notyet
-/*
- * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
- * we can re-enable the kernacc() checks.
- */
-#ifndef __alpha__
/*
* Can't call kernacc() from early init386(), especially when
* initializing Giant mutex, because some stuff in kernacc()
@@ -806,7 +682,6 @@
VM_PROT_READ | VM_PROT_WRITE))
panic("Can't read and write to mutex %p", m);
#endif
-#endif
}
#endif
@@ -830,40 +705,39 @@
void
mtx_init(struct mtx *m, const char *name, const char *type, int opts)
{
- struct lock_object *lock;
+ struct lock_class *class;
+ int flags;
MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
- MTX_NOWITNESS | MTX_DUPOK)) == 0);
+ MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE)) == 0);
#ifdef MUTEX_DEBUG
/* Diagnostic and error correction */
mtx_validate(m);
#endif
- lock = &m->mtx_object;
- KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
- ("mutex \"%s\" %p already initialized", name, m));
- bzero(m, sizeof(*m));
+ /* Determine lock class and lock flags. */
if (opts & MTX_SPIN)
- lock->lo_class = &lock_class_mtx_spin;
+ class = &lock_class_mtx_spin;
else
- lock->lo_class = &lock_class_mtx_sleep;
- lock->lo_name = name;
- lock->lo_type = type != NULL ? type : name;
+ class = &lock_class_mtx_sleep;
+ flags = 0;
if (opts & MTX_QUIET)
- lock->lo_flags = LO_QUIET;
+ flags |= LO_QUIET;
if (opts & MTX_RECURSE)
- lock->lo_flags |= LO_RECURSABLE;
+ flags |= LO_RECURSABLE;
if ((opts & MTX_NOWITNESS) == 0)
- lock->lo_flags |= LO_WITNESS;
+ flags |= LO_WITNESS;
if (opts & MTX_DUPOK)
- lock->lo_flags |= LO_DUPOK;
+ flags |= LO_DUPOK;
+ if (opts & MTX_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ /* Initialize mutex. */
m->mtx_lock = MTX_UNOWNED;
+ m->mtx_recurse = 0;
- LOCK_LOG_INIT(lock, opts);
-
- WITNESS_INIT(lock);
+ lock_init(&m->lock_object, class, name, type, flags);
}
/*
@@ -876,19 +750,24 @@
mtx_destroy(struct mtx *m)
{
- LOCK_LOG_DESTROY(&m->mtx_object, 0);
-
if (!mtx_owned(m))
MPASS(mtx_unowned(m));
else {
MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+ /* Perform the non-mtx related part of mtx_unlock_spin(). */
+ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin)
+ spinlock_exit();
+ else
+ curthread->td_locks--;
+
/* Tell witness this isn't locked to make it happy. */
- WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
+ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__,
__LINE__);
}
- WITNESS_DESTROY(&m->mtx_object);
+ m->mtx_lock = MTX_DESTROYED;
+ lock_destroy(&m->lock_object);
}
/*
@@ -900,9 +779,6 @@
mutex_init(void)
{
- /* Setup thread0 so that mutexes work. */
- LIST_INIT(&thread0.td_contested);
-
/* Setup turnstiles so that sleep mutexes work. */
init_turnstiles();
@@ -910,34 +786,17 @@
* Initialize mutexes.
*/
mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
- mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
+ mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
+ blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */
mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
mtx_init(&devmtx, "cdev", NULL, MTX_DEF);
mtx_lock(&Giant);
+
+ lock_profile_init();
}
#ifdef DDB
-/* XXX: This function is not mutex-specific. */
-DB_SHOW_COMMAND(lock, db_show_lock)
-{
- struct lock_object *lock;
-
- if (!have_addr)
- return;
- lock = (struct lock_object *)addr;
- if (lock->lo_class != &lock_class_mtx_sleep &&
- lock->lo_class != &lock_class_mtx_spin &&
- lock->lo_class != &lock_class_sx) {
- db_printf("Unknown lock class\n");
- return;
- }
- db_printf(" class: %s\n", lock->lo_class->lc_name);
- db_printf(" name: %s\n", lock->lo_name);
- if (lock->lo_type && lock->lo_type != lock->lo_name)
- db_printf(" type: %s\n", lock->lo_type);
- lock->lo_class->lc_ddb_show(lock);
-}
-
void
db_show_mtx(struct lock_object *lock)
{
@@ -947,18 +806,20 @@
m = (struct mtx *)lock;
db_printf(" flags: {");
- if (m->mtx_object.lo_class == &lock_class_mtx_spin)
+ if (LOCK_CLASS(lock) == &lock_class_mtx_spin)
db_printf("SPIN");
else
db_printf("DEF");
- if (m->mtx_object.lo_flags & LO_RECURSABLE)
+ if (m->lock_object.lo_flags & LO_RECURSABLE)
db_printf(", RECURSE");
- if (m->mtx_object.lo_flags & LO_DUPOK)
+ if (m->lock_object.lo_flags & LO_DUPOK)
db_printf(", DUPOK");
db_printf("}\n");
db_printf(" state: {");
if (mtx_unowned(m))
db_printf("UNOWNED");
+ else if (mtx_destroyed(m))
+ db_printf("DESTROYED");
else {
db_printf("OWNED");
if (m->mtx_lock & MTX_CONTESTED)
@@ -967,7 +828,7 @@
db_printf(", RECURSED");
}
db_printf("}\n");
- if (!mtx_unowned(m)) {
+ if (!mtx_unowned(m) && !mtx_destroyed(m)) {
td = mtx_owner(m);
db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td,
td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm);
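
The thread_lock_block()/thread_lock_unblock()/thread_lock_set() helpers added above let td_lock point at different spin mutexes over a thread's lifetime (a run-queue, turnstile, or sleep-queue lock), with blocked_lock standing in while the thread is being moved. A rough sketch of the handoff pattern, assuming a hypothetical new container lock; concurrent thread_lock() callers simply spin on blocked_lock until the new lock is published:

/* Sketch: repoint td's thread lock at new_lock. */
static void
switch_container_lock(struct thread *td, struct mtx *new_lock)
{

        thread_lock(td);                        /* lock the current container */
        thread_lock_block(td);                  /* td_lock = &blocked_lock, old lock dropped */
        mtx_lock_spin(new_lock);
        /* ... requeue td under new_lock ... */
        thread_lock_unblock(td, new_lock);      /* publish new_lock as td's lock */
        mtx_unlock_spin(new_lock);
}
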
Index: kern_acct.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_acct.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_acct.c -L sys/kern/kern_acct.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_acct.c
+++ sys/kern/kern_acct.c
@@ -2,13 +2,39 @@
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
* Copyright (c) 1994 Christopher G. Demetriou
- * Copyright (c) 2005 Robert N. M. Watson
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -42,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_acct.c,v 1.74.2.3 2006/02/14 23:13:17 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_acct.c,v 1.95 2007/08/31 13:56:26 dds Exp $");
#include "opt_mac.h"
@@ -52,11 +78,12 @@
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
+#include <sys/limits.h>
#include <sys/lock.h>
-#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
@@ -68,23 +95,33 @@
#include <sys/tty.h>
#include <sys/vnode.h>
+#include <security/mac/mac_framework.h>
+
/*
* The routines implemented in this file are described in:
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison Welley, 1989)
* on pages 62-63.
+ * In May 2007 the historic 3-bit base-8 exponent, 13-bit fraction
+ * comp_t representation described in the above reference was replaced
+ * with that of IEEE-754 floats.
*
* Arguably, to simplify accounting operations, this mechanism should
* be replaced by one in which an accounting log file (similar to /dev/klog)
* is read by a user process, etc. However, that has its own problems.
*/
+/* Floating point definitions from <float.h>. */
+#define FLT_MANT_DIG 24 /* p */
+#define FLT_MAX_EXP 128 /* emax */
+
/*
* Internal accounting functions.
* The former's operation is described in Leffler, et al., and the latter
* was provided by UCB with the 4.4BSD-Lite release
*/
-static comp_t encode_comp_t(u_long, u_long);
+static uint32_t encode_timeval(struct timeval);
+static uint32_t encode_long(long);
static void acctwatch(void);
static void acct_thread(void *);
static int acct_disable(struct thread *);
@@ -94,6 +131,7 @@
* acct_sx protects against changes to the active vnode and credentials
* while accounting records are being committed to disk.
*/
+static int acct_configured;
static int acct_suspended;
static struct vnode *acct_vp;
static struct ucred *acct_cred;
@@ -146,60 +184,60 @@
&acctchkfreq, 0, sysctl_acct_chkfreq, "I",
"frequency for checking the free space");
+SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
+ "Accounting configured or not");
+
SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
"Accounting suspended or not");
/*
- * Accounting system call. Written based on the specification and
- * previous implementation done by Mark Tinguely.
- *
- * MPSAFE
+ * Accounting system call. Written based on the specification and previous
+ * implementation done by Mark Tinguely.
*/
int
acct(struct thread *td, struct acct_args *uap)
{
struct nameidata nd;
- int error, flags;
+ int error, flags, vfslocked;
- /* Make sure that the caller is root. */
- error = suser(td);
+ error = priv_check(td, PRIV_ACCT);
if (error)
return (error);
/*
* If accounting is to be started to a file, open that file for
- * appending and make sure it's a 'normal'. While we could
- * conditionally acquire Giant here, we're actually interacting with
- * vnodes from possibly two file systems, making the logic a bit
- * complicated. For now, use Giant unconditionally.
+ * appending and make sure it's a 'normal'.
*/
- mtx_lock(&Giant);
if (uap->path != NULL) {
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
flags = FWRITE | O_APPEND;
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error)
- goto done;
+ return (error);
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
error = mac_check_system_acct(td->td_ucred, nd.ni_vp);
if (error) {
VOP_UNLOCK(nd.ni_vp, 0, td);
vn_close(nd.ni_vp, flags, td->td_ucred, td);
- goto done;
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
}
#endif
VOP_UNLOCK(nd.ni_vp, 0, td);
if (nd.ni_vp->v_type != VREG) {
vn_close(nd.ni_vp, flags, td->td_ucred, td);
- error = EACCES;
- goto done;
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (EACCES);
}
+ VFS_UNLOCK_GIANT(vfslocked);
#ifdef MAC
} else {
error = mac_check_system_acct(td->td_ucred, NULL);
if (error)
- goto done;
+ return (error);
#endif
}
@@ -216,15 +254,18 @@
* enabled.
*/
acct_suspended = 0;
- if (acct_vp != NULL)
+ if (acct_vp != NULL) {
+ vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
error = acct_disable(td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
if (uap->path == NULL) {
if (acct_state & ACCT_RUNNING) {
acct_state |= ACCT_EXITREQ;
wakeup(&acct_state);
}
sx_xunlock(&acct_sx);
- goto done;
+ return (error);
}
/*
@@ -245,20 +286,22 @@
error = kthread_create(acct_thread, NULL, NULL, 0, 0,
"accounting");
if (error) {
+ vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
(void) vn_close(acct_vp, acct_flags, acct_cred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
crfree(acct_cred);
+ acct_configured = 0;
acct_vp = NULL;
acct_cred = NULL;
acct_flags = 0;
sx_xunlock(&acct_sx);
log(LOG_NOTICE, "Unable to start accounting thread\n");
- goto done;
+ return (error);
}
}
+ acct_configured = 1;
sx_xunlock(&acct_sx);
log(LOG_NOTICE, "Accounting enabled\n");
-done:
- mtx_unlock(&Giant);
return (error);
}
@@ -274,6 +317,7 @@
sx_assert(&acct_sx, SX_XLOCKED);
error = vn_close(acct_vp, acct_flags, acct_cred, td);
crfree(acct_cred);
+ acct_configured = 0;
acct_vp = NULL;
acct_cred = NULL;
acct_flags = 0;
@@ -290,11 +334,11 @@
int
acct_process(struct thread *td)
{
- struct acct acct;
+ struct acctv2 acct;
struct timeval ut, st, tmp;
struct plimit *newlim, *oldlim;
struct proc *p;
- struct rusage *r;
+ struct rusage ru;
int t, ret, vfslocked;
/*
@@ -327,9 +371,9 @@
bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
/* (2) The amount of user and system time that was used */
- calcru(p, &ut, &st);
- acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec);
- acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec);
+ rufetchcalc(p, &ru, &ut, &st);
+ acct.ac_utime = encode_timeval(ut);
+ acct.ac_stime = encode_timeval(st);
/* (3) The elapsed time the command ran (and its starting time) */
tmp = boottime;
@@ -337,20 +381,21 @@
acct.ac_btime = tmp.tv_sec;
microuptime(&tmp);
timevalsub(&tmp, &p->p_stats->p_start);
- acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec);
+ acct.ac_etime = encode_timeval(tmp);
/* (4) The average amount of memory used */
- r = &p->p_stats->p_ru;
tmp = ut;
timevaladd(&tmp, &st);
+ /* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
t = tmp.tv_sec * hz + tmp.tv_usec / tick;
if (t)
- acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
+ acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
+ + ru.ru_isrss) / t);
else
acct.ac_mem = 0;
/* (5) The number of disk I/O operations done */
- acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
+ acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
/* (6) The UID and GID of the process */
acct.ac_uid = p->p_ucred->cr_ruid;
@@ -365,9 +410,15 @@
SESS_UNLOCK(p->p_session);
/* (8) The boolean flags that tell how the process terminated, etc. */
- acct.ac_flag = p->p_acflag;
+ acct.ac_flagx = p->p_acflag;
PROC_UNLOCK(p);
+ /* Setup ancillary structure fields. */
+ acct.ac_flagx |= ANVER;
+ acct.ac_zero = 0;
+ acct.ac_version = 2;
+ acct.ac_len = acct.ac_len2 = sizeof(acct);
+
/*
* Eliminate any file size rlimit.
*/
@@ -393,44 +444,107 @@
return (ret);
}
+/* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
+
+/* Convert timevals and longs into IEEE-754 bit patterns. */
+
+/* Mantissa mask (MSB is implied, so subtract 1). */
+#define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
+
/*
- * Encode_comp_t converts from ticks in seconds and microseconds
- * to ticks in 1/AHZ seconds. The encoding is described in
- * Leffler, et al., on page 63.
+ * We calculate integer values to a precision of approximately
+ * 28 bits.
+ * This is high-enough precision to fill the 24 float bits
+ * and low-enough to avoid overflowing the 32 int bits.
*/
+#define CALC_BITS 28
-#define MANTSIZE 13 /* 13 bit mantissa. */
-#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
-#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+/* log_2(1000000). */
+#define LOG2_1M 20
-static comp_t
-encode_comp_t(u_long s, u_long us)
+/*
+ * Convert the elements of a timeval into a 32-bit word holding
+ * the bits of a IEEE-754 float.
+ * The float value represents the timeval's value in microsecond units.
+ */
+static uint32_t
+encode_timeval(struct timeval tv)
{
- int exp, rnd;
-
- exp = 0;
- rnd = 0;
- s *= AHZ;
- s += us / (1000000 / AHZ); /* Maximize precision. */
+ int log2_s;
+ int val, exp; /* Unnormalized value and exponent */
+ int norm_exp; /* Normalized exponent */
+ int shift;
- while (s > MAXFRACT) {
- rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */
- s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
- exp++;
+ /*
+ * First calculate value and exponent to about CALC_BITS precision.
+ * Note that the following conditionals have been ordered so that
+ * the most common cases appear first.
+ */
+ if (tv.tv_sec == 0) {
+ if (tv.tv_usec == 0)
+ return (0);
+ exp = 0;
+ val = tv.tv_usec;
+ } else {
+ /*
+ * Calculate the value to a precision of approximately
+ * CALC_BITS.
+ */
+ log2_s = fls(tv.tv_sec) - 1;
+ if (log2_s + LOG2_1M < CALC_BITS) {
+ exp = 0;
+ val = 1000000 * tv.tv_sec + tv.tv_usec;
+ } else {
+ exp = log2_s + LOG2_1M - CALC_BITS;
+ val = (unsigned int)(((u_int64_t)1000000 * tv.tv_sec +
+ tv.tv_usec) >> exp);
+ }
}
+ /* Now normalize and pack the value into an IEEE-754 float. */
+ norm_exp = fls(val) - 1;
+ shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+ printf("val=%d exp=%d shift=%d log2(val)=%d\n",
+ val, exp, shift, norm_exp);
+ printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+ ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+ return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
+ ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
+}
- /* If we need to round up, do it (and handle overflow correctly). */
- if (rnd && (++s > MAXFRACT)) {
- s >>= EXPSIZE;
- exp++;
- }
+/*
+ * Convert a non-negative long value into the bit pattern of
+ * an IEEE-754 float value.
+ */
+static uint32_t
+encode_long(long val)
+{
+ int norm_exp; /* Normalized exponent */
+ int shift;
- /* Clean it up and polish it off. */
- exp <<= MANTSIZE; /* Shift the exponent into place */
- exp += s; /* and add on the mantissa. */
- return (exp);
+ if (val == 0)
+ return (0);
+ if (val < 0) {
+ log(LOG_NOTICE,
+ "encode_long: negative value %ld in accounting record\n",
+ val);
+ val = LONG_MAX;
+ }
+ norm_exp = fls(val) - 1;
+ shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+ printf("val=%d shift=%d log2(val)=%d\n",
+ val, shift, norm_exp);
+ printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+ ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+ return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
+ ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
}
+/* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
+
/*
* Periodically check the filesystem to see if accounting
* should be turned on or off. Beware the case where the vnode
@@ -503,9 +617,9 @@
/* This is a low-priority kernel thread. */
pri = PRI_MAX_KERN;
- mtx_lock_spin(&sched_lock);
+ thread_lock(curthread);
sched_prio(curthread, pri);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(curthread);
/* If another accounting kthread is already running, just die. */
sx_xlock(&acct_sx);
@@ -527,9 +641,8 @@
* to exit.
*/
if (!(acct_state & ACCT_EXITREQ)) {
- sx_xunlock(&acct_sx);
- tsleep(&acct_state, pri, "-", acctchkfreq * hz);
- sx_xlock(&acct_sx);
+ sx_sleep(&acct_state, &acct_sx, 0, "-",
+ acctchkfreq * hz);
}
}
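
Because the new acctv2 record stores its time fields as raw IEEE-754 single-precision bit patterns holding microseconds (see encode_timeval() above), post-processing tools need a small decode step. A minimal userland sketch, assuming the host's float format is IEEE-754, which the encoder presumes as well:

#include <stdint.h>
#include <string.h>

/* Sketch: convert an encoded ac_utime/ac_stime/ac_etime field to seconds. */
static double
decode_acct_time(uint32_t bits)
{
        float usecs;

        memcpy(&usecs, &bits, sizeof(usecs));   /* reinterpret the bit pattern */
        return ((double)usecs / 1000000.0);     /* field is stored in microseconds */
}
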
Index: kern_uuid.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_uuid.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_uuid.c -L sys/kern/kern_uuid.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_uuid.c
+++ sys/kern/kern_uuid.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_uuid.c,v 1.8 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_uuid.c,v 1.13 2007/04/23 12:53:00 pjd Exp $");
#include <sys/param.h>
#include <sys/endian.h>
@@ -116,7 +116,7 @@
/*
* Get the current time as a 60 bit count of 100-nanosecond intervals
* since 00:00:00.00, October 15,1582. We apply a magic offset to convert
- * the Unix time since 00:00:00.00, Januari 1, 1970 to the date of the
+ * the Unix time since 00:00:00.00, January 1, 1970 to the date of the
* Gregorian reform to the Christian calendar.
*/
static uint64_t
@@ -131,30 +131,12 @@
return (time & ((1LL << 60) - 1LL));
}
-#ifndef _SYS_SYSPROTO_H_
-struct uuidgen_args {
- struct uuid *store;
- int count;
-};
-#endif
-
-int
-uuidgen(struct thread *td, struct uuidgen_args *uap)
+struct uuid *
+kern_uuidgen(struct uuid *store, size_t count)
{
struct uuid_private uuid;
uint64_t time;
- int error;
-
- /*
- * Limit the number of UUIDs that can be created at the same time
- * to some arbitrary number. This isn't really necessary, but I
- * like to have some sort of upper-bound that's less than 2G :-)
- * XXX needs to be tunable.
- */
- if (uap->count < 1 || uap->count > 2048)
- return (EINVAL);
-
- /* XXX: pre-validate accessibility to the whole of the UUID store? */
+ size_t n;
mtx_lock(&uuid_mutex);
@@ -171,25 +153,52 @@
uuid.seq = uuid_last.seq;
uuid_last = uuid;
- uuid_last.time.ll = (time + uap->count - 1) & ((1LL << 60) - 1LL);
+ uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
mtx_unlock(&uuid_mutex);
/* Set sequence and variant and deal with byte order. */
uuid.seq = htobe16(uuid.seq | 0x8000);
- /* XXX: this should copyout larger chunks at a time. */
- do {
- /* Set time and version (=1) and deal with byte order. */
+ for (n = 0; n < count; n++) {
+ /* Set time and version (=1). */
uuid.time.x.low = (uint32_t)time;
uuid.time.x.mid = (uint16_t)(time >> 32);
uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
- error = copyout(&uuid, uap->store, sizeof(uuid));
- uap->store++;
- uap->count--;
+ store[n] = *(struct uuid *)&uuid;
time++;
- } while (uap->count > 0 && !error);
+ }
+
+ return (store);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct uuidgen_args {
+ struct uuid *store;
+ int count;
+};
+#endif
+int
+uuidgen(struct thread *td, struct uuidgen_args *uap)
+{
+ struct uuid *store;
+ size_t count;
+ int error;
+ /*
+ * Limit the number of UUIDs that can be created at the same time
+ * to some arbitrary number. This isn't really necessary, but I
+ * like to have some sort of upper-bound that's less than 2G :-)
+ * XXX probably needs to be tunable.
+ */
+ if (uap->count < 1 || uap->count > 2048)
+ return (EINVAL);
+
+ count = uap->count;
+ store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
+ kern_uuidgen(store, count);
+ error = copyout(store, uap->store, count * sizeof(struct uuid));
+ free(store, M_TEMP);
return (error);
}
@@ -272,6 +281,7 @@
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
+
void
be_uuid_enc(void *buf, struct uuid const *uuid)
{
@@ -303,3 +313,49 @@
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
+
+int
+parse_uuid(const char *str, struct uuid *uuid)
+{
+ u_int c[11];
+ int n;
+
+ /* An empty string represents a nil UUID. */
+ if (*str == '\0') {
+ bzero(uuid, sizeof(*uuid));
+ return (0);
+ }
+
+ /* The UUID string representation has a fixed length. */
+ if (strlen(str) != 36)
+ return (EINVAL);
+
+ /*
+ * We only work with "new" UUIDs. New UUIDs have the form:
+ * 01234567-89ab-cdef-0123-456789abcdef
+ * The so called "old" UUIDs, which we don't support, have the form:
+ * 0123456789ab.cd.ef.01.23.45.67.89.ab
+ */
+ if (str[8] != '-')
+ return (EINVAL);
+
+ n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
+ c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
+ /* Make sure we have all conversions. */
+ if (n != 11)
+ return (EINVAL);
+
+ /* Successful scan. Build the UUID. */
+ uuid->time_low = c[0];
+ uuid->time_mid = c[1];
+ uuid->time_hi_and_version = c[2];
+ uuid->clock_seq_hi_and_reserved = c[3];
+ uuid->clock_seq_low = c[4];
+ for (n = 0; n < 6; n++)
+ uuid->node[n] = c[n + 5];
+
+ /* Check semantics... */
+ return (((c[3] & 0x80) != 0x00 && /* variant 0? */
+ (c[3] & 0xc0) != 0x80 && /* variant 1? */
+ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
+}
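
For illustration (not part of this commit): a userland sketch exercising the
uuidgen(2) interface whose kernel side is reworked above. It assumes FreeBSD's
<sys/uuid.h> declares the syscall and the struct uuid fields that parse_uuid()
fills in, and it prints UUIDs in the 36-character form parse_uuid() accepts.

    #include <sys/types.h>
    #include <sys/uuid.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct uuid id[2];
        int i;

        if (uuidgen(id, 2) != 0) {      /* one syscall, the kernel fills both entries */
            perror("uuidgen");
            return (1);
        }
        for (i = 0; i < 2; i++) {
            printf("%08x-%04x-%04x-%02x%02x-", id[i].time_low,
                id[i].time_mid, id[i].time_hi_and_version,
                id[i].clock_seq_hi_and_reserved, id[i].clock_seq_low);
            printf("%02x%02x%02x%02x%02x%02x\n", id[i].node[0],
                id[i].node[1], id[i].node[2], id[i].node[3],
                id[i].node[4], id[i].node[5]);
        }
        return (0);
    }
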
Index: kern_exit.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_exit.c -L sys/kern/kern_exit.c -u -r1.2 -r1.3
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_exit.c,v 1.263.2.7 2006/03/18 23:36:21 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_exit.c,v 1.304 2007/06/13 20:01:42 jhb Exp $");
#include "opt_compat.h"
#include "opt_ktrace.h"
@@ -56,20 +56,24 @@
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
+#include <sys/syslog.h>
#include <sys/ptrace.h>
#include <sys/acct.h> /* for acct_process() function prototype */
#include <sys/filedesc.h>
-#include <sys/mac.h>
#include <sys/shm.h>
#include <sys/sem.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
@@ -85,10 +89,7 @@
void (*nlminfo_release_p)(struct proc *p);
/*
- * exit --
- * Death of process.
- *
- * MPSAFE
+ * exit -- death of process.
*/
void
sys_exit(struct thread *td, struct sys_exit_args *uap)
@@ -99,9 +100,9 @@
}
/*
- * Exit: deallocate address space and other resources, change proc state
- * to zombie, and unlink proc from allproc and parent's lists. Save exit
- * status and rusage for wait(). Check for child processes and orphan them.
+ * Exit: deallocate address space and other resources, change proc state to
+ * zombie, and unlink proc from allproc and parent's lists. Save exit status
+ * and rusage for wait(). Check for child processes and orphan them.
*/
void
exit1(struct thread *td, int rv)
@@ -109,14 +110,13 @@
struct proc *p, *nq, *q;
struct tty *tp;
struct vnode *ttyvp;
- struct vmspace *vm;
struct vnode *vtmp;
#ifdef KTRACE
struct vnode *tracevp;
struct ucred *tracecred;
#endif
struct plimit *plim;
- int locked, refcnt;
+ int locked;
/*
* Drop Giant if caller has it. Eventually we should warn about
@@ -169,7 +169,8 @@
* Threading support has been turned off.
*/
}
-
+ KASSERT(p->p_numthreads == 1,
+ ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
/*
* Wakeup anyone in procfs' PIOCWAIT. They should have a hold
* on our vmspace, so we should block below until they have
@@ -193,7 +194,21 @@
*/
while (p->p_lock > 0)
msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
+
PROC_UNLOCK(p);
+ /* Drain the limit callout while we don't have the proc locked */
+ callout_drain(&p->p_limco);
+
+#ifdef AUDIT
+ /*
+ * The Sun BSM exit token contains two components: an exit status as
+ * passed to exit(), and a return value to indicate what sort of exit
+ * it was. The exit status is WEXITSTATUS(rv), but it's not clear
+ * what the return value is.
+ */
+ AUDIT_ARG(exit, WEXITSTATUS(rv), 0);
+ AUDIT_SYSCALL_EXIT(0, td);
+#endif
/* Are we a task leader? */
if (p == p->p_leader) {
@@ -217,8 +232,6 @@
*/
EVENTHANDLER_INVOKE(process_exit, p);
- MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
- M_ZOMBIE, M_WAITOK);
/*
* If parent is waiting for us to exit or exec,
* P_PPWAIT is set; we will wakeup the parent below.
@@ -226,8 +239,6 @@
PROC_LOCK(p);
stopprofclock(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT);
- SIGEMPTYSET(p->p_siglist);
- SIGEMPTYSET(td->td_siglist);
/*
* Stop the real interval timer. If the handler is currently
@@ -246,9 +257,7 @@
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pid.
*/
- mtx_lock(&Giant); /* XXX: not sure if needed */
funsetownlst(&p->p_sigiolst);
- mtx_unlock(&Giant);
/*
* If this process has an nlminfo data area (for lockd), release it
@@ -282,42 +291,15 @@
}
mtx_unlock(&ppeers_lock);
- /* The next two chunks should probably be moved to vmspace_exit. */
- vm = p->p_vmspace;
- /*
- * Release user portion of address space.
- * This releases references to vnodes,
- * which could cause I/O if the file has been unlinked.
- * Need to do this early enough that we can still sleep.
- * Can't free the entire vmspace as the kernel stack
- * may be mapped within that space also.
- *
- * Processes sharing the same vmspace may exit in one order, and
- * get cleaned up by vmspace_exit() in a different order. The
- * last exiting process to reach this point releases as much of
- * the environment as it can, and the last process cleaned up
- * by vmspace_exit() (which decrements exitingcnt) cleans up the
- * remainder.
- */
- atomic_add_int(&vm->vm_exitingcnt, 1);
- do
- refcnt = vm->vm_refcnt;
- while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
- if (refcnt == 1) {
- shmexit(vm);
- pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map),
- vm_map_max(&vm->vm_map));
- (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
- vm_map_max(&vm->vm_map));
- }
+ vmspace_exit(td);
+ mtx_lock(&Giant); /* XXX TTY */
sx_xlock(&proctree_lock);
if (SESS_LEADER(p)) {
struct session *sp;
sp = p->p_session;
if (sp->s_ttyvp) {
- locked = VFS_LOCK_GIANT(sp->s_ttyvp->v_mount);
/*
* Controlling process.
* Signal foreground pgrp,
@@ -363,7 +345,6 @@
* that the session once had a controlling terminal.
* (for logging and informational purposes)
*/
- VFS_UNLOCK_GIANT(locked);
}
SESS_LOCK(p->p_session);
sp->s_leader = NULL;
@@ -372,26 +353,35 @@
fixjobc(p, p->p_pgrp, 0);
sx_xunlock(&proctree_lock);
(void)acct_process(td);
+ mtx_unlock(&Giant);
#ifdef KTRACE
/*
- * release trace file
+ * Disable tracing, then drain any pending records and release
+ * the trace file.
*/
- PROC_LOCK(p);
- mtx_lock(&ktrace_mtx);
- p->p_traceflag = 0; /* don't trace the vrele() */
- tracevp = p->p_tracevp;
- p->p_tracevp = NULL;
- tracecred = p->p_tracecred;
- p->p_tracecred = NULL;
- mtx_unlock(&ktrace_mtx);
- PROC_UNLOCK(p);
- if (tracevp != NULL) {
- locked = VFS_LOCK_GIANT(tracevp->v_mount);
- vrele(tracevp);
- VFS_UNLOCK_GIANT(locked);
+ if (p->p_traceflag != 0) {
+ PROC_LOCK(p);
+ mtx_lock(&ktrace_mtx);
+ p->p_traceflag = 0;
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ ktrprocexit(td);
+ PROC_LOCK(p);
+ mtx_lock(&ktrace_mtx);
+ tracevp = p->p_tracevp;
+ p->p_tracevp = NULL;
+ tracecred = p->p_tracecred;
+ p->p_tracecred = NULL;
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ if (tracevp != NULL) {
+ locked = VFS_LOCK_GIANT(tracevp->v_mount);
+ vrele(tracevp);
+ VFS_UNLOCK_GIANT(locked);
+ }
+ if (tracecred != NULL)
+ crfree(tracecred);
}
- if (tracecred != NULL)
- crfree(tracecred);
#endif
/*
* Release reference to text vnode
@@ -422,6 +412,19 @@
LIST_REMOVE(p, p_hash);
sx_xunlock(&allproc_lock);
+ /*
+ * Call machine-dependent code to release any
+ * machine-dependent resources other than the address space.
+ * The address space is released by "vmspace_exitfree(p)" in
+ * vm_waitproc().
+ */
+ cpu_exit(td);
+
+ WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
+
+ /*
+ * Reparent all of our children to init.
+ */
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
if (q != NULL) /* only need this if any child is S_ZOMB */
@@ -442,16 +445,10 @@
PROC_UNLOCK(q);
}
- /*
- * Save exit status and finalize rusage info except for times,
- * adding in child rusage info.
- */
+ /* Save exit status. */
PROC_LOCK(p);
p->p_xstat = rv;
p->p_xthread = td;
- p->p_stats->p_ru.ru_nvcsw++;
- *p->p_ru = p->p_stats->p_ru;
-
/*
* Notify interested parties of our demise.
*/
@@ -492,31 +489,21 @@
if (p->p_pptr == initproc)
psignal(p->p_pptr, SIGCHLD);
- else if (p->p_sigparent != 0)
- psignal(p->p_pptr, p->p_sigparent);
- PROC_UNLOCK(p->p_pptr);
+ else if (p->p_sigparent != 0) {
+ if (p->p_sigparent == SIGCHLD)
+ childproc_exited(p);
+ else /* LINUX thread */
+ psignal(p->p_pptr, p->p_sigparent);
+ }
+ sx_xunlock(&proctree_lock);
/*
- * If this is a kthread, then wakeup anyone waiting for it to exit.
+ * The state PRS_ZOMBIE prevents other processes from sending
+ * signals to the process.  To avoid a memory leak, we free the
+ * memory for its signal queue at the time the state is set.
*/
- if (p->p_flag & P_KTHREAD)
- wakeup(p);
- PROC_UNLOCK(p);
-
- /*
- * Finally, call machine-dependent code to release the remaining
- * resources including address space.
- * The address space is released by "vmspace_exitfree(p)" in
- * vm_waitproc().
- */
- cpu_exit(td);
-
- WITNESS_WARN(WARN_PANIC, &proctree_lock.sx_object,
- "process (pid %d) exiting", p->p_pid);
-
- PROC_LOCK(p);
- PROC_LOCK(p->p_pptr);
- sx_xunlock(&proctree_lock);
+ sigqueue_flush(&p->p_sigqueue);
+ sigqueue_flush(&td->td_sigqueue);
/*
* We have to wait until after acquiring all locks before
@@ -529,12 +516,13 @@
* proc lock.
*/
wakeup(p->p_pptr);
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p->p_pptr);
+ sched_exit(p->p_pptr, td);
+ PROC_SUNLOCK(p->p_pptr);
+ PROC_SLOCK(p);
p->p_state = PRS_ZOMBIE;
PROC_UNLOCK(p->p_pptr);
- sched_exit(p->p_pptr, td);
-
/*
* Hopefully no one will try to deliver a signal to the process this
* late in the game.
@@ -542,6 +530,11 @@
knlist_destroy(&p->p_klist);
/*
+ * Save our children's rusage information in our exit rusage.
+ */
+ ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
+
+ /*
* Make sure the scheduler takes this thread out of its tables etc.
* This will also release this thread's reference to the ucred.
* Other thread parts to release include pcb bits and such.
@@ -549,11 +542,87 @@
thread_exit();
}
+
+#ifndef _SYS_SYSPROTO_H_
+struct abort2_args {
+ char *why;
+ int nargs;
+ void **args;
+};
+#endif
+
+int
+abort2(struct thread *td, struct abort2_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct sbuf *sb;
+ void *uargs[16];
+ int error, i, sig;
+
+ error = 0; /* satisfy compiler */
+
+ /*
+ * Do it right now so we can log either a proper call of abort2(), or
+ * note that an invalid argument was passed. 512 is big enough to
+ * handle 16 arguments' descriptions with additional comments.
+ */
+ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
+ sbuf_clear(sb);
+ sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
+ p->p_comm, p->p_pid, td->td_ucred->cr_uid);
+ /*
+ * Since we can't return from abort2(), send SIGKILL in cases where
+ * abort2() was called improperly.
+ */
+ sig = SIGKILL;
+ /* Prevent DoSes from user-space. */
+ if (uap->nargs < 0 || uap->nargs > 16)
+ goto out;
+ if (uap->args == NULL)
+ goto out;
+ error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
+ if (error != 0)
+ goto out;
+ /*
+ * Limit the size of the 'reason' string to 128. It will fit even when
+ * the maximal number of arguments is logged.
+ */
+ if (uap->why != NULL) {
+ error = sbuf_copyin(sb, uap->why, 128);
+ if (error < 0)
+ goto out;
+ } else {
+ sbuf_printf(sb, "(null)");
+ }
+ if (uap->nargs) {
+ sbuf_printf(sb, "(");
+ for (i = 0; i < uap->nargs; i++)
+ sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
+ sbuf_printf(sb, ")");
+ }
+ /*
+ * Final stage: the arguments were proper, the string has been
+ * copied from userspace successfully, and copying pointers
+ * from user-space succeeded.
+ */
+ sig = SIGABRT;
+out:
+ if (sig == SIGKILL) {
+ sbuf_trim(sb);
+ sbuf_printf(sb, " (Reason text inaccessible)");
+ }
+ sbuf_cat(sb, "\n");
+ sbuf_finish(sb);
+ log(LOG_INFO, "%s", sbuf_data(sb));
+ sbuf_delete(sb);
+ exit1(td, W_EXITCODE(0, sig));
+ return (0);
+}
+
+
#ifdef COMPAT_43
/*
* The dirty work is handled by kern_wait().
- *
- * MPSAFE.
*/
int
owait(struct thread *td, struct owait_args *uap __unused)
@@ -569,8 +638,6 @@
/*
* The dirty work is handled by kern_wait().
- *
- * MPSAFE.
*/
int
wait4(struct thread *td, struct wait_args *uap)
@@ -597,6 +664,8 @@
struct proc *p, *q, *t;
int error, nfound;
+ AUDIT_ARG(pid, pid);
+
q = td->td_proc;
if (pid == 0) {
PROC_LOCK(q);
@@ -640,28 +709,19 @@
}
nfound++;
+ PROC_SLOCK(p);
if (p->p_state == PRS_ZOMBIE) {
-
- /*
- * It is possible that the last thread of this
- * process is still running on another CPU
- * in thread_exit() after having dropped the process
- * lock via PROC_UNLOCK() but before it has completed
- * cpu_throw(). In that case, the other thread must
- * still hold sched_lock, so simply by acquiring
- * sched_lock once we will wait long enough for the
- * thread to exit in that case.
- */
- mtx_lock_spin(&sched_lock);
- mtx_unlock_spin(&sched_lock);
-
- td->td_retval[0] = p->p_pid;
- if (status)
- *status = p->p_xstat; /* convert to int */
if (rusage) {
- *rusage = *p->p_ru;
+ *rusage = p->p_ru;
calcru(p, &rusage->ru_utime, &rusage->ru_stime);
}
+ PROC_SUNLOCK(p);
+ td->td_retval[0] = p->p_pid;
+ if (status)
+ *status = p->p_xstat; /* convert to int */
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
/*
* If we got the child via a ptrace 'attach',
@@ -673,7 +733,7 @@
p->p_oppid = 0;
proc_reparent(p, t);
PROC_UNLOCK(p);
- psignal(t, SIGCHLD);
+ tdsignal(t, NULL, SIGCHLD, p->p_ksi);
wakeup(t);
PROC_UNLOCK(t);
sx_xunlock(&proctree_lock);
@@ -700,11 +760,9 @@
p->p_xstat = 0; /* XXX: why? */
PROC_UNLOCK(p);
PROC_LOCK(q);
- ruadd(&q->p_stats->p_cru, &q->p_crux, p->p_ru,
+ ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru,
&p->p_rux);
PROC_UNLOCK(q);
- FREE(p->p_ru, M_ZOMBIE);
- p->p_ru = NULL;
/*
* Decrement the count of procs running with this uid.
@@ -743,25 +801,33 @@
sx_xunlock(&allproc_lock);
return (0);
}
- mtx_lock_spin(&sched_lock);
if ((p->p_flag & P_STOPPED_SIG) &&
(p->p_suspcount == p->p_numthreads) &&
(p->p_flag & P_WAITED) == 0 &&
(p->p_flag & P_TRACED || options & WUNTRACED)) {
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
p->p_flag |= P_WAITED;
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
if (status)
*status = W_STOPCODE(p->p_xstat);
+
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
PROC_UNLOCK(p);
+
return (0);
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
p->p_flag &= ~P_CONTINUED;
+
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
PROC_UNLOCK(p);
if (status)
@@ -805,6 +871,9 @@
if (child->p_pptr == parent)
return;
+ PROC_LOCK(child->p_pptr);
+ sigqueue_take(child->p_ksi);
+ PROC_UNLOCK(child->p_pptr);
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
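
For illustration (not part of this commit): a userland sketch of the new
abort2(2) syscall added above, assuming the FreeBSD 7 prototype from
<stdlib.h>. The reason string and the (at most 16) pointer arguments end up
in the system log through the sbuf code in kern_exit.c; the values logged
here are purely hypothetical.

    #include <stdint.h>
    #include <stdlib.h>

    int
    main(void)
    {
        void *args[2];
        int state = 42;                         /* hypothetical value worth logging */

        args[0] = &state;
        args[1] = (void *)(uintptr_t)0xdeadc0de;
        abort2("invariant violated in example", 2, args);
        /* NOTREACHED: the kernel delivers SIGABRT (or SIGKILL on misuse). */
        return (0);
    }
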
Index: subr_disk.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_disk.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/subr_disk.c -L sys/kern/subr_disk.c -u -r1.1.1.2 -r1.2
--- sys/kern/subr_disk.c
+++ sys/kern/subr_disk.c
@@ -8,7 +8,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_disk.c,v 1.85.2.1 2006/02/14 03:29:31 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_disk.c,v 1.88 2006/10/31 21:11:21 pjd Exp $");
#include "opt_geom.h"
@@ -43,6 +43,7 @@
case BIO_WRITE: printf("cmd=write "); break;
case BIO_DELETE: printf("cmd=delete "); break;
case BIO_GETATTR: printf("cmd=getattr "); break;
+ case BIO_FLUSH: printf("cmd=flush "); break;
default: printf("cmd=%x ", bp->bio_cmd); break;
}
sn = bp->bio_pblkno;
@@ -99,7 +100,7 @@
bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
{
- if (TAILQ_FIRST(&head->queue) == NULL)
+ if (TAILQ_EMPTY(&head->queue))
head->insert_point = bp;
TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
}
@@ -108,7 +109,7 @@
bioq_insert_tail(struct bio_queue_head *head, struct bio *bp)
{
- if (TAILQ_FIRST(&head->queue) == NULL)
+ if (TAILQ_EMPTY(&head->queue))
head->insert_point = bp;
TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
}
--- /dev/null
+++ sys/kern/subr_rtc.c
@@ -0,0 +1,166 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to initiate this.
+ * This code is not yet used by all architectures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/subr_rtc.c,v 1.9 2006/10/02 18:23:37 phk Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#include <sys/timetc.h>
+
+#include "clock_if.h"
+
+static device_t clock_dev = NULL;
+static long clock_res;
+
+void
+clock_register(device_t dev, long res) /* res has units of microseconds */
+{
+
+ if (clock_dev != NULL) {
+ if (clock_res > res) {
+ if (bootverbose) {
+ device_printf(dev, "not installed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(clock_dev));
+ }
+ return;
+ } else {
+ if (bootverbose) {
+ device_printf(clock_dev, "removed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(dev));
+ }
+ }
+ }
+ clock_dev = dev;
+ clock_res = res;
+ if (bootverbose) {
+ device_printf(dev, "registered as a time-of-day clock "
+ "(resolution %ldus)\n", res);
+ }
+}
+
+/*
+ * inittodr and settodr derived from the i386 versions written
+ * by Christoph Robitschko <chmr at edvz.tu-graz.ac.at>, reintroduced and
+ * updated by Chris Stenton <chris at gnome.co.uk> 8/10/94
+ */
+
+/*
+ * Initialize the time of day register, based on the time base which is, e.g.
+ * from a filesystem.
+ */
+void
+inittodr(time_t base)
+{
+ struct timespec diff, ref, ts;
+ int error;
+
+ if (base) {
+ ref.tv_sec = base;
+ ref.tv_nsec = 0;
+ tc_setclock(&ref);
+ }
+
+ if (clock_dev == NULL) {
+ printf("warning: no time-of-day clock registered, system time "
+ "will not be set accurately\n");
+ return;
+ }
+ error = CLOCK_GETTIME(clock_dev, &ts);
+ if (error != 0 && error != EINVAL) {
+ printf("warning: clock_gettime failed (%d), the system time "
+ "will not be set accurately\n", error);
+ return;
+ }
+ if (error == EINVAL || ts.tv_sec < 0) {
+ printf("Invalid time in real time clock.\n");
+ printf("Check and reset the date immediately!\n");
+ }
+
+ ts.tv_sec += utc_offset();
+
+ if (timespeccmp(&ref, &ts, >)) {
+ diff = ref;
+ timespecsub(&ref, &ts);
+ } else {
+ diff = ts;
+ timespecsub(&diff, &ref);
+ }
+ if (ts.tv_sec >= 2) {
+ /* badly off, adjust it */
+ tc_setclock(&ts);
+ }
+}
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr()
+{
+ struct timespec ts;
+ int error;
+
+ if (disable_rtc_set || clock_dev == NULL)
+ return;
+
+ getnanotime(&ts);
+ ts.tv_sec -= utc_offset();
+ if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) {
+ printf("warning: clock_settime failed (%d), time-of-day clock "
+ "not adjusted to system time\n", error);
+ return;
+ }
+}
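
For illustration (not part of this commit): a hedged sketch of how an RTC
driver is expected to plug into the helpers above -- call clock_register()
from its attach routine and provide the clock_if gettime/settime methods that
CLOCK_GETTIME()/CLOCK_SETTIME() dispatch to. The "foortc" driver, its probe
and bus attachment glue, and the hardware accesses are hypothetical and are
stubbed or omitted here.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>
    #include <sys/bus.h>
    #include <sys/clock.h>
    #include <sys/time.h>

    #include "clock_if.h"

    static int
    foortc_attach(device_t dev)
    {

        /* The hardware keeps whole seconds, so the resolution is 1000000us. */
        clock_register(dev, 1000000);
        return (0);
    }

    static int
    foortc_gettime(device_t dev, struct timespec *ts)
    {

        ts->tv_sec = 0;         /* hypothetical: read seconds from the chip */
        ts->tv_nsec = 0;
        return (0);
    }

    static int
    foortc_settime(device_t dev, struct timespec *ts)
    {

        /* hypothetical: write ts->tv_sec back to the chip */
        return (0);
    }

    /* Referenced by the (omitted) driver_t declaration and DRIVER_MODULE(). */
    static device_method_t foortc_methods[] = {
        DEVMETHOD(device_attach,    foortc_attach),
        DEVMETHOD(clock_gettime,    foortc_gettime),
        DEVMETHOD(clock_settime,    foortc_settime),
        { 0, 0 }
    };
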
Index: kern_subr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_subr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_subr.c -L sys/kern/kern_subr.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_subr.c
+++ sys/kern/kern_subr.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_subr.c,v 1.96 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_subr.c,v 1.103 2007/06/05 00:00:54 jeff Exp $");
#include "opt_zero.h"
@@ -105,9 +105,9 @@
VM_OBJECT_LOCK(uobject);
retry:
if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
- vm_page_lock_queues();
- if (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"))
+ if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
goto retry;
+ vm_page_lock_queues();
pmap_remove_all(user_pg);
vm_page_free(user_pg);
} else {
@@ -358,10 +358,11 @@
}
/*
- * General routine to allocate a hash table.
+ * General routine to allocate a hash table with control of memory flags.
*/
void *
-hashinit(int elements, struct malloc_type *type, u_long *hashmask)
+hashinit_flags(int elements, struct malloc_type *type, u_long *hashmask,
+ int flags)
{
long hashsize;
LIST_HEAD(generic, generic) *hashtbl;
@@ -369,16 +370,40 @@
if (elements <= 0)
panic("hashinit: bad elements");
+
+ /* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
+ KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
+ ("Bad flags (0x%x) passed to hashinit_flags", flags));
+
for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
continue;
hashsize >>= 1;
- hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
- for (i = 0; i < hashsize; i++)
- LIST_INIT(&hashtbl[i]);
- *hashmask = hashsize - 1;
+
+ if (flags & HASH_NOWAIT)
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+ type, M_NOWAIT);
+ else
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+ type, M_WAITOK);
+
+ if (hashtbl != NULL) {
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *hashmask = hashsize - 1;
+ }
return (hashtbl);
}
+/*
+ * Allocate and initialize a hash table with default flag: may sleep.
+ */
+void *
+hashinit(int elements, struct malloc_type *type, u_long *hashmask)
+{
+
+ return (hashinit_flags(elements, type, hashmask, HASH_WAITOK));
+}
+
void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
@@ -428,11 +453,11 @@
struct thread *td;
td = curthread;
- mtx_lock_spin(&sched_lock);
DROP_GIANT();
- sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
+ thread_lock(td);
+ sched_prio(td, td->td_user_pri);
mi_switch(SW_INVOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
PICKUP_GIANT();
}
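
For illustration (not part of this commit): a tiny userland sketch of the
sizing rule that hashinit_flags() keeps from hashinit() -- round the element
count down to a power of two and derive the lookup mask from the table size.

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long elements = 100, hashsize, hashmask;

        for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
            continue;
        hashsize >>= 1;                 /* 100 elements -> 64 buckets */
        hashmask = hashsize - 1;
        printf("elements=%lu buckets=%lu bucket(key=1234)=%lu\n",
            elements, hashsize, 1234UL & hashmask);
        return (0);
    }
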
--- /dev/null
+++ sys/kern/kern_priv.c
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2006 nCircle Network Security, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
+ * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/kern/kern_priv.c,v 1.4 2007/07/02 14:03:29 rwatson Exp $
+ */
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * `suser_enabled' (which can be set by the security.bsd.suser_enabled
+ * sysctl) determines whether the system 'super-user' policy is in effect. If
+ * it is nonzero, an effective uid of 0 connotes special privilege,
+ * overriding many mandatory and discretionary protections. If it is zero,
+ * uid 0 is offered no special privilege in the kernel security policy.
+ * Setting it to zero may seriously impact the functionality of many existing
+ * userland programs, and should not be done without careful consideration of
+ * the consequences.
+ */
+static int suser_enabled = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
+ &suser_enabled, 0, "processes with uid 0 have privilege");
+TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
+
+/*
+ * Check a credential for privilege. Lots of good reasons to deny privilege;
+ * only a few to grant it.
+ */
+int
+priv_check_cred(struct ucred *cred, int priv, int flags)
+{
+ int error;
+
+ KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d",
+ priv));
+
+ /*
+ * We first evaluate policies that may deny the granting of
+ * privilege unilaterally.
+ */
+#ifdef MAC
+ error = mac_priv_check(cred, priv);
+ if (error)
+ return (error);
+#endif
+
+ /*
+ * Jail policy will restrict certain privileges that may otherwise
+ * be granted.
+ */
+ error = prison_priv_check(cred, priv);
+ if (error)
+ return (error);
+
+ /*
+ * Having determined if privilege is restricted by various policies,
+ * now determine if privilege is granted. At this point, any policy
+ * may grant privilege. For now, we allow short-circuit boolean
+ * evaluation, so we may not call all policies. Perhaps we should.
+ *
+ * Superuser policy grants privilege based on the effective (or in
+ * the case of specific privileges, real) uid being 0. We allow the
+ * superuser policy to be globally disabled, although this is
+ * currently of limited utility.
+ */
+ if (suser_enabled) {
+ switch (priv) {
+ case PRIV_MAXFILES:
+ case PRIV_MAXPROC:
+ case PRIV_PROC_LIMIT:
+ if (cred->cr_ruid == 0)
+ return (0);
+ break;
+
+ default:
+ if (cred->cr_uid == 0)
+ return (0);
+ break;
+ }
+ }
+
+ /*
+ * Now check with MAC, if enabled, to see if a policy module grants
+ * privilege.
+ */
+#ifdef MAC
+ if (mac_priv_grant(cred, priv) == 0)
+ return (0);
+#endif
+ return (EPERM);
+}
+
+int
+priv_check(struct thread *td, int priv)
+{
+
+ KASSERT(td == curthread, ("priv_check: td != curthread"));
+
+ return (priv_check_cred(td->td_ucred, priv, 0));
+}
+
+/*
+ * Historical suser() wrapper functions, which now simply request PRIV_ROOT.
+ * These will be removed in the near future, and exist solely because
+ * the kernel and modules are not yet fully adapted to the new model.
+ */
+int
+suser_cred(struct ucred *cred, int flags)
+{
+
+ return (priv_check_cred(cred, PRIV_ROOT, flags));
+}
+
+int
+suser(struct thread *td)
+{
+
+ KASSERT(td == curthread, ("suser: td != curthread"));
+
+ return (suser_cred(td->td_ucred, 0));
+}
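
For illustration (not part of this commit): a hedged sketch of how kernel
code is expected to consume the new priv(9) interface instead of the suser()
wrappers. PRIV_EXAMPLE is a placeholder for this sketch only; real callers
pass one of the specific PRIV_* constants from <sys/priv.h>.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/priv.h>
    #include <sys/proc.h>

    #define PRIV_EXAMPLE    PRIV_ROOT   /* placeholder privilege for the sketch */

    static int
    example_privileged_op(struct thread *td)
    {
        int error;

        /* Ask for one specific privilege; any policy may grant or deny it. */
        error = priv_check(td, PRIV_EXAMPLE);
        if (error != 0)
            return (error);             /* typically EPERM */
        /* ... perform the privileged operation here ... */
        return (0);
    }
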
Index: vfs_export.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_export.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vfs_export.c -L sys/kern/vfs_export.c -u -r1.1.1.1 -r1.2
--- sys/kern/vfs_export.c
+++ sys/kern/vfs_export.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_export.c,v 1.333 2005/05/11 18:25:42 kan Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_export.c,v 1.341 2007/02/15 22:08:35 pjd Exp $");
#include <sys/param.h>
#include <sys/dirent.h>
@@ -46,13 +46,14 @@
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/mutex.h>
+#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <net/radix.h>
-static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure");
static void vfs_free_addrlist(struct netexport *nep);
static int vfs_free_netcred(struct radix_node *rn, void *w);
@@ -82,10 +83,8 @@
* Called by ufs_mount() to set up the lists of export addresses.
*/
static int
-vfs_hang_addrlist(mp, nep, argp)
- struct mount *mp;
- struct netexport *nep;
- struct export_args *argp;
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
{
register struct netcred *np;
register struct radix_node_head *rnh;
@@ -102,12 +101,18 @@
* with fields like cr_uidinfo and cr_prison? Currently, this
* routine does not touch them (leaves them as NULL).
*/
- if (argp->ex_anon.cr_version != XUCRED_VERSION)
+ if (argp->ex_anon.cr_version != XUCRED_VERSION) {
+ vfs_mount_error(mp, "ex_anon.cr_version: %d != %d",
+ argp->ex_anon.cr_version, XUCRED_VERSION);
return (EINVAL);
+ }
if (argp->ex_addrlen == 0) {
- if (mp->mnt_flag & MNT_DEFEXPORTED)
+ if (mp->mnt_flag & MNT_DEFEXPORTED) {
+ vfs_mount_error(mp,
+ "MNT_DEFEXPORTED already set for mount %p", mp);
return (EPERM);
+ }
np = &nep->ne_defexported;
np->netc_exflags = argp->ex_flags;
bzero(&np->netc_anon, sizeof(np->netc_anon));
@@ -115,14 +120,19 @@
np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
sizeof(np->netc_anon.cr_groups));
- np->netc_anon.cr_ref = 1;
+ refcount_init(&np->netc_anon.cr_ref, 1);
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_DEFEXPORTED;
+ MNT_IUNLOCK(mp);
return (0);
}
#if MSIZE <= 256
- if (argp->ex_addrlen > MLEN)
+ if (argp->ex_addrlen > MLEN) {
+ vfs_mount_error(mp, "ex_addrlen %d is greater than %d",
+ argp->ex_addrlen, MLEN);
return (EINVAL);
+ }
#endif
i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
@@ -130,8 +140,9 @@
saddr = (struct sockaddr *) (np + 1);
if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen)))
goto out;
- if (saddr->sa_family > AF_MAX) {
+ if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) {
error = EINVAL;
+ vfs_mount_error(mp, "Invalid saddr->sa_family: %d");
goto out;
}
if (saddr->sa_len > argp->ex_addrlen)
@@ -158,6 +169,9 @@
}
if ((rnh = nep->ne_rtable[i]) == NULL) {
error = ENOBUFS;
+ vfs_mount_error(mp, "%s %s %d",
+ "Unable to initialize radix node head ",
+ "for address family", i);
goto out;
}
}
@@ -166,6 +180,8 @@
RADIX_NODE_HEAD_UNLOCK(rnh);
if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
error = EPERM;
+ vfs_mount_error(mp, "Invalid radix node head, rn: %p %p",
+ rn, np);
goto out;
}
np->netc_exflags = argp->ex_flags;
@@ -174,7 +190,7 @@
np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
sizeof(np->netc_anon.cr_groups));
- np->netc_anon.cr_ref = 1;
+ refcount_init(&np->netc_anon.cr_ref, 1);
return (0);
out:
free(np, M_NETADDR);
@@ -184,9 +200,7 @@
/* Helper for vfs_free_addrlist. */
/* ARGSUSED */
static int
-vfs_free_netcred(rn, w)
- struct radix_node *rn;
- void *w;
+vfs_free_netcred(struct radix_node *rn, void *w)
{
register struct radix_node_head *rnh = (struct radix_node_head *) w;
@@ -199,8 +213,7 @@
* Free the net address hash lists that are hanging off the mount points.
*/
static void
-vfs_free_addrlist(nep)
- struct netexport *nep;
+vfs_free_addrlist(struct netexport *nep)
{
register int i;
register struct radix_node_head *rnh;
@@ -222,26 +235,31 @@
* the structure is described in sys/mount.h
*/
int
-vfs_export(mp, argp)
- struct mount *mp;
- struct export_args *argp;
+vfs_export(struct mount *mp, struct export_args *argp)
{
struct netexport *nep;
int error;
nep = mp->mnt_export;
+ error = 0;
if (argp->ex_flags & MNT_DELEXPORT) {
- if (nep == NULL)
- return (ENOENT);
+ if (nep == NULL) {
+ error = ENOENT;
+ goto out;
+ }
if (mp->mnt_flag & MNT_EXPUBLIC) {
vfs_setpublicfs(NULL, NULL, NULL);
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
}
vfs_free_addrlist(nep);
mp->mnt_export = NULL;
free(nep, M_MOUNT);
nep = NULL;
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ MNT_IUNLOCK(mp);
}
if (argp->ex_flags & MNT_EXPORTED) {
if (nep == NULL) {
@@ -250,14 +268,30 @@
}
if (argp->ex_flags & MNT_EXPUBLIC) {
if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
- return (error);
+ goto out;
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
}
if ((error = vfs_hang_addrlist(mp, nep, argp)))
- return (error);
+ goto out;
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_EXPORTED;
+ MNT_IUNLOCK(mp);
}
- return (0);
+
+out:
+ /*
+ * Once we have executed the vfs_export() command, we do
+ * not want to keep the "export" option around in the
+ * options list, since that will cause subsequent MNT_UPDATE
+ * calls to fail. The export information is saved in
+ * mp->mnt_export, so we can safely delete the "export" mount option
+ * here.
+ */
+ vfs_deleteopt(mp->mnt_optnew, "export");
+ vfs_deleteopt(mp->mnt_opt, "export");
+ return (error);
}
/*
@@ -265,10 +299,8 @@
* one public filesystem is possible in the spec (RFC 2054 and 2055)
*/
int
-vfs_setpublicfs(mp, nep, argp)
- struct mount *mp;
- struct netexport *nep;
- struct export_args *argp;
+vfs_setpublicfs(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
{
int error;
struct vnode *rvp;
@@ -305,7 +337,7 @@
if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp, curthread /* XXX */)))
return (error);
- if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
return (error);
vput(rvp);
@@ -393,11 +425,8 @@
*/
int
-vfs_stdcheckexp(mp, nam, extflagsp, credanonp)
- struct mount *mp;
- struct sockaddr *nam;
- int *extflagsp;
- struct ucred **credanonp;
+vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp)
{
struct netcred *np;
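
For illustration (not part of this commit): a hedged sketch of the locking
pattern the vfs_export.c changes above adopt -- hold the per-mount interlock
around mnt_flag updates and use vfs_mount_error() to attach a readable reason
to an error return. The "examplefs" function and its policy are hypothetical.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/mount.h>

    static int
    examplefs_set_readonly(struct mount *mp)
    {

        if (mp->mnt_flag & MNT_ROOTFS) {
            vfs_mount_error(mp, "refusing to downgrade the root file system");
            return (EOPNOTSUPP);
        }
        MNT_ILOCK(mp);
        mp->mnt_flag |= MNT_RDONLY;     /* mnt_flag updates need the interlock */
        MNT_IUNLOCK(mp);
        return (0);
    }
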
Index: kern_switch.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_switch.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_switch.c -L sys/kern/kern_switch.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_switch.c
+++ sys/kern/kern_switch.c
@@ -24,69 +24,9 @@
* SUCH DAMAGE.
*/
-/***
-Here is the logic..
-
-If there are N processors, then there are at most N KSEs (kernel
-schedulable entities) working to process threads that belong to a
-KSEGROUP (kg). If there are X of these KSEs actually running at the
-moment in question, then there are at most M (N-X) of these KSEs on
-the run queue, as running KSEs are not on the queue.
-
-Runnable threads are queued off the KSEGROUP in priority order.
-If there are M or more threads runnable, the top M threads
-(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
-their priority from those threads and are put on the run queue.
-
-The last thread that had a priority high enough to have a KSE associated
-with it, AND IS ON THE RUN QUEUE is pointed to by
-kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
-assigned as all the available KSEs are activly running, or because there
-are no threads queued, that pointer is NULL.
-
-When a KSE is removed from the run queue to become runnable, we know
-it was associated with the highest priority thread in the queue (at the head
-of the queue). If it is also the last assigned we know M was 1 and must
-now be 0. Since the thread is no longer queued that pointer must be
-removed from it. Since we know there were no more KSEs available,
-(M was 1 and is now 0) and since we are not FREEING our KSE
-but using it, we know there are STILL no more KSEs available, we can prove
-that the next thread in the ksegrp list will not have a KSE to assign to
-it, so we can show that the pointer must be made 'invalid' (NULL).
-
-The pointer exists so that when a new thread is made runnable, it can
-have its priority compared with the last assigned thread to see if
-it should 'steal' its KSE or not.. i.e. is it 'earlier'
-on the list than that thread or later.. If it's earlier, then the KSE is
-removed from the last assigned (which is now not assigned a KSE)
-and reassigned to the new thread, which is placed earlier in the list.
-The pointer is then backed up to the previous thread (which may or may not
-be the new thread).
-
-When a thread sleeps or is removed, the KSE becomes available and if there
-are queued threads that are not assigned KSEs, the highest priority one of
-them is assigned the KSE, which is then placed back on the run queue at
-the approipriate place, and the kg->kg_last_assigned pointer is adjusted down
-to point to it.
-
-The following diagram shows 2 KSEs and 3 threads from a single process.
-
- RUNQ: --->KSE---KSE--... (KSEs queued at priorities from threads)
- \ \____
- \ \
- KSEGROUP---thread--thread--thread (queued in priority order)
- \ /
- \_______________/
- (last_assigned)
-
-The result of this scheme is that the M available KSEs are always
-queued at the priorities they have inherrited from the M highest priority
-threads for that KSEGROUP. If this situation changes, the KSEs are
-reassigned to keep this true.
-***/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_switch.c,v 1.116.2.1 2005/08/06 03:06:25 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_switch.c,v 1.137 2007/10/08 23:37:28 jeff Exp $");
#include "opt_sched.h"
@@ -109,6 +49,15 @@
#include <sys/sysctl.h>
#endif
+#include <machine/cpu.h>
+
+/* Uncomment this to enable logging of critical_enter/exit. */
+#if 0
+#define KTR_CRITICAL KTR_SCHED
+#else
+#define KTR_CRITICAL 0
+#endif
+
#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
@@ -117,8 +66,6 @@
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
-#define td_kse td_sched
-
/*
* kern.sched.preemption allows user space to determine if preemption support
* is compiled in or not. It is not currently a boot or runtime flag that
@@ -132,55 +79,62 @@
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
&kern_sched_preemption, 0, "Kernel preemption enabled");
+#ifdef SCHED_STATS
+long switch_preempt;
+long switch_owepreempt;
+long switch_turnstile;
+long switch_sleepq;
+long switch_sleepqtimo;
+long switch_relinquish;
+long switch_needresched;
+static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, "");
+static int
+sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int val;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val == 0)
+ return (0);
+ switch_preempt = 0;
+ switch_owepreempt = 0;
+ switch_turnstile = 0;
+ switch_sleepq = 0;
+ switch_sleepqtimo = 0;
+ switch_relinquish = 0;
+ switch_needresched = 0;
+
+ return (0);
+}
+
+SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
+ 0, sysctl_stats_reset, "I", "Reset scheduler statistics");
+#endif
+
/************************************************************************
* Functions that manipulate runnability from a thread perspective. *
************************************************************************/
/*
- * Select the KSE that will be run next. From that find the thread, and
- * remove it from the KSEGRP's run queue. If there is thread clustering,
- * this will be what does it.
+ * Select the thread that will be run next.
*/
struct thread *
choosethread(void)
{
- struct kse *ke;
struct thread *td;
- struct ksegrp *kg;
-
-#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
- if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
- /* Shutting down, run idlethread on AP's */
- td = PCPU_GET(idlethread);
- ke = td->td_kse;
- CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
- ke->ke_flags |= KEF_DIDRUN;
- TD_SET_RUNNING(td);
- return (td);
- }
-#endif
retry:
- ke = sched_choose();
- if (ke) {
- td = ke->ke_thread;
- KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
- kg = ke->ke_ksegrp;
- if (td->td_proc->p_flag & P_HADTHREADS) {
- if (kg->kg_last_assigned == td) {
- kg->kg_last_assigned = TAILQ_PREV(td,
- threadqueue, td_runq);
- }
- TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
- }
- CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
- td, td->td_priority);
- } else {
- /* Simulate runq_choose() having returned the idle thread */
- td = PCPU_GET(idlethread);
- ke = td->td_kse;
- CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
- }
- ke->ke_flags |= KEF_DIDRUN;
+ td = sched_choose();
/*
* If we are in panic, only allow system threads,
@@ -198,395 +152,6 @@
}
/*
- * Given a surplus system slot, try assign a new runnable thread to it.
- * Called from:
- * sched_thread_exit() (local)
- * sched_switch() (local)
- * sched_thread_exit() (local)
- * remrunqueue() (local) (not at the moment)
- */
-static void
-slot_fill(struct ksegrp *kg)
-{
- struct thread *td;
-
- mtx_assert(&sched_lock, MA_OWNED);
- while (kg->kg_avail_opennings > 0) {
- /*
- * Find the first unassigned thread
- */
- if ((td = kg->kg_last_assigned) != NULL)
- td = TAILQ_NEXT(td, td_runq);
- else
- td = TAILQ_FIRST(&kg->kg_runq);
-
- /*
- * If we found one, send it to the system scheduler.
- */
- if (td) {
- kg->kg_last_assigned = td;
- sched_add(td, SRQ_YIELDING);
- CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
- } else {
- /* no threads to use up the slots. quit now */
- break;
- }
- }
-}
-
-#ifdef SCHED_4BSD
-/*
- * Remove a thread from its KSEGRP's run queue.
- * This in turn may remove it from a KSE if it was already assigned
- * to one, possibly causing a new thread to be assigned to the KSE
- * and the KSE getting a new priority.
- */
-static void
-remrunqueue(struct thread *td)
-{
- struct thread *td2, *td3;
- struct ksegrp *kg;
- struct kse *ke;
-
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
- kg = td->td_ksegrp;
- ke = td->td_kse;
- CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
- TD_SET_CAN_RUN(td);
- /*
- * If it is not a threaded process, take the shortcut.
- */
- if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
- /* remve from sys run queue and free up a slot */
- sched_rem(td);
- ke->ke_state = KES_THREAD;
- return;
- }
- td3 = TAILQ_PREV(td, threadqueue, td_runq);
- TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
- if (ke->ke_state == KES_ONRUNQ) {
- /*
- * This thread has been assigned to the system run queue.
- * We need to dissociate it and try assign the
- * KSE to the next available thread. Then, we should
- * see if we need to move the KSE in the run queues.
- */
- sched_rem(td);
- ke->ke_state = KES_THREAD;
- td2 = kg->kg_last_assigned;
- KASSERT((td2 != NULL), ("last assigned has wrong value"));
- if (td2 == td)
- kg->kg_last_assigned = td3;
- /* slot_fill(kg); */ /* will replace it with another */
- }
-}
-#endif
-
-/*
- * Change the priority of a thread that is on the run queue.
- */
-void
-adjustrunqueue( struct thread *td, int newpri)
-{
- struct ksegrp *kg;
- struct kse *ke;
-
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
-
- ke = td->td_kse;
- CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
- /*
- * If it is not a threaded process, take the shortcut.
- */
- if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
- /* We only care about the kse in the run queue. */
- td->td_priority = newpri;
- if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
- sched_rem(td);
- sched_add(td, SRQ_BORING);
- }
- return;
- }
-
- /* It is a threaded process */
- kg = td->td_ksegrp;
- if (ke->ke_state == KES_ONRUNQ
-#ifdef SCHED_ULE
- || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
- (ke->ke_flags & KEF_REMOVED) == 0)
-#endif
- ) {
- if (kg->kg_last_assigned == td) {
- kg->kg_last_assigned =
- TAILQ_PREV(td, threadqueue, td_runq);
- }
- sched_rem(td);
- }
- TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
- TD_SET_CAN_RUN(td);
- td->td_priority = newpri;
- setrunqueue(td, SRQ_BORING);
-}
-
-/*
- * This function is called when a thread is about to be put on a
- * ksegrp run queue because it has been made runnable or its
- * priority has been adjusted and the ksegrp does not have a
- * free kse slot. It determines if a thread from the same ksegrp
- * should be preempted. If so, it tries to switch threads
- * if the thread is on the same cpu or notifies another cpu that
- * it should switch threads.
- */
-
-static void
-maybe_preempt_in_ksegrp(struct thread *td)
-#if !defined(SMP)
-{
- struct thread *running_thread;
-
- mtx_assert(&sched_lock, MA_OWNED);
- running_thread = curthread;
-
- if (running_thread->td_ksegrp != td->td_ksegrp)
- return;
-
- if (td->td_priority >= running_thread->td_priority)
- return;
-#ifdef PREEMPTION
-#ifndef FULL_PREEMPTION
- if (td->td_priority > PRI_MAX_ITHD) {
- running_thread->td_flags |= TDF_NEEDRESCHED;
- return;
- }
-#endif /* FULL_PREEMPTION */
-
- if (running_thread->td_critnest > 1)
- running_thread->td_owepreempt = 1;
- else
- mi_switch(SW_INVOL, NULL);
-
-#else /* PREEMPTION */
- running_thread->td_flags |= TDF_NEEDRESCHED;
-#endif /* PREEMPTION */
- return;
-}
-
-#else /* SMP */
-{
- struct thread *running_thread;
- int worst_pri;
- struct ksegrp *kg;
- cpumask_t cpumask,dontuse;
- struct pcpu *pc;
- struct pcpu *best_pcpu;
- struct thread *cputhread;
-
- mtx_assert(&sched_lock, MA_OWNED);
-
- running_thread = curthread;
-
-#if !defined(KSEG_PEEMPT_BEST_CPU)
- if (running_thread->td_ksegrp != td->td_ksegrp) {
-#endif
- kg = td->td_ksegrp;
-
- /* if someone is ahead of this thread, wait our turn */
- if (td != TAILQ_FIRST(&kg->kg_runq))
- return;
-
- worst_pri = td->td_priority;
- best_pcpu = NULL;
- dontuse = stopped_cpus | idle_cpus_mask;
-
- /*
- * Find a cpu with the worst priority that runs at thread from
- * the same ksegrp - if multiple exist give first the last run
- * cpu and then the current cpu priority
- */
-
- SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
- cpumask = pc->pc_cpumask;
- cputhread = pc->pc_curthread;
-
- if ((cpumask & dontuse) ||
- cputhread->td_ksegrp != kg)
- continue;
-
- if (cputhread->td_priority > worst_pri) {
- worst_pri = cputhread->td_priority;
- best_pcpu = pc;
- continue;
- }
-
- if (cputhread->td_priority == worst_pri &&
- best_pcpu != NULL &&
- (td->td_lastcpu == pc->pc_cpuid ||
- (PCPU_GET(cpumask) == cpumask &&
- td->td_lastcpu != best_pcpu->pc_cpuid)))
- best_pcpu = pc;
- }
-
- /* Check if we need to preempt someone */
- if (best_pcpu == NULL)
- return;
-
-#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
-#if !defined(FULL_PREEMPTION)
- if (td->td_priority <= PRI_MAX_ITHD)
-#endif /* ! FULL_PREEMPTION */
- {
- ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
- return;
- }
-#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
-
- if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
- best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
- ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
- return;
- }
-#if !defined(KSEG_PEEMPT_BEST_CPU)
- }
-#endif
-
- if (td->td_priority >= running_thread->td_priority)
- return;
-#ifdef PREEMPTION
-
-#if !defined(FULL_PREEMPTION)
- if (td->td_priority > PRI_MAX_ITHD) {
- running_thread->td_flags |= TDF_NEEDRESCHED;
- }
-#endif /* ! FULL_PREEMPTION */
-
- if (running_thread->td_critnest > 1)
- running_thread->td_owepreempt = 1;
- else
- mi_switch(SW_INVOL, NULL);
-
-#else /* PREEMPTION */
- running_thread->td_flags |= TDF_NEEDRESCHED;
-#endif /* PREEMPTION */
- return;
-}
-#endif /* !SMP */
-
-
-int limitcount;
-void
-setrunqueue(struct thread *td, int flags)
-{
- struct ksegrp *kg;
- struct thread *td2;
- struct thread *tda;
-
- CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
- td, td->td_ksegrp, td->td_proc->p_pid);
- CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
- td, td->td_proc->p_comm, td->td_priority, curthread,
- curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
- KASSERT((td->td_inhibitors == 0),
- ("setrunqueue: trying to run inhibitted thread"));
- KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
- ("setrunqueue: bad thread state"));
- TD_SET_RUNQ(td);
- kg = td->td_ksegrp;
- if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
- /*
- * Common path optimisation: Only one of everything
- * and the KSE is always already attached.
- * Totally ignore the ksegrp run queue.
- */
- if (kg->kg_avail_opennings != 1) {
- if (limitcount < 1) {
- limitcount++;
- printf("pid %d: corrected slot count (%d->1)\n",
- td->td_proc->p_pid, kg->kg_avail_opennings);
-
- }
- kg->kg_avail_opennings = 1;
- }
- sched_add(td, flags);
- return;
- }
-
- /*
- * If the concurrency has reduced, and we would go in the
- * assigned section, then keep removing entries from the
- * system run queue, until we are not in that section
- * or there is room for us to be put in that section.
- * What we MUST avoid is the case where there are threads of less
- * priority than the new one scheduled, but it can not
- * be scheduled itself. That would lead to a non contiguous set
- * of scheduled threads, and everything would break.
- */
- tda = kg->kg_last_assigned;
- while ((kg->kg_avail_opennings <= 0) &&
- (tda && (tda->td_priority > td->td_priority))) {
- /*
- * None free, but there is one we can commandeer.
- */
- CTR2(KTR_RUNQ,
- "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
- sched_rem(tda);
- tda = kg->kg_last_assigned =
- TAILQ_PREV(tda, threadqueue, td_runq);
- }
-
- /*
- * Add the thread to the ksegrp's run queue at
- * the appropriate place.
- */
- TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
- if (td2->td_priority > td->td_priority) {
- TAILQ_INSERT_BEFORE(td2, td, td_runq);
- break;
- }
- }
- if (td2 == NULL) {
- /* We ran off the end of the TAILQ or it was empty. */
- TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
- }
-
- /*
- * If we have a slot to use, then put the thread on the system
- * run queue and if needed, readjust the last_assigned pointer.
- * it may be that we need to schedule something anyhow
- * even if the availabel slots are -ve so that
- * all the items < last_assigned are scheduled.
- */
- if (kg->kg_avail_opennings > 0) {
- if (tda == NULL) {
- /*
- * No pre-existing last assigned so whoever is first
- * gets the slot.. (maybe us)
- */
- td2 = TAILQ_FIRST(&kg->kg_runq);
- kg->kg_last_assigned = td2;
- } else if (tda->td_priority > td->td_priority) {
- td2 = td;
- } else {
- /*
- * We are past last_assigned, so
- * give the next slot to whatever is next,
- * which may or may not be us.
- */
- td2 = TAILQ_NEXT(tda, td_runq);
- kg->kg_last_assigned = td2;
- }
- sched_add(td2, flags);
- } else {
- CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
- td, td->td_ksegrp, td->td_proc->p_pid);
- if ((flags & SRQ_YIELDING) == 0)
- maybe_preempt_in_ksegrp(td);
- }
-}
-
-/*
* Kernel thread preemption implementation. Critical sections mark
* regions of code in which preemptions are not allowed.
*/
@@ -609,22 +174,20 @@
td = curthread;
KASSERT(td->td_critnest != 0,
("critical_exit: td_critnest == 0"));
-#ifdef PREEMPTION
+
if (td->td_critnest == 1) {
td->td_critnest = 0;
- mtx_assert(&sched_lock, MA_NOTOWNED);
if (td->td_owepreempt) {
td->td_critnest = 1;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_critnest--;
- mi_switch(SW_INVOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ SCHED_STAT_INC(switch_owepreempt);
+ mi_switch(SW_INVOL|SW_PREEMPT, NULL);
+ thread_unlock(td);
}
- } else
-#endif
+ } else
td->td_critnest--;
-
-
+
CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
(long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}
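[For context on the critical_exit() change above: td_critnest counts nested critical sections, and a preemption that arrives while nested is only recorded in td_owepreempt; the deferred switch happens when the outermost section exits. A toy userland model of that rule (deliberately simplified, no locking, not the kernel code):

#include <stdio.h>

static int critnest;            /* like td_critnest */
static int owepreempt;          /* like td_owepreempt */

static void
toy_critical_enter(void)
{
        critnest++;
}

static void
toy_critical_exit(void)
{
        /* Only the outermost exit acts on a deferred preemption. */
        if (critnest == 1 && owepreempt) {
                owepreempt = 0;
                printf("outermost exit: performing deferred switch\n");
        }
        critnest--;
}

int
main(void)
{
        toy_critical_enter();
        toy_critical_enter();
        owepreempt = 1;         /* a preemption request arrives while nested */
        toy_critical_exit();    /* still nested: nothing happens */
        toy_critical_exit();    /* outermost: the deferred switch runs */
        return (0);
}
]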
@@ -644,7 +207,6 @@
int cpri, pri;
#endif
- mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
/*
* The new thread should not preempt the current thread if any of the
@@ -670,14 +232,15 @@
* to the new thread.
*/
ctd = curthread;
- KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd),
("thread has no (or wrong) sched-private part."));
KASSERT((td->td_inhibitors == 0),
- ("maybe_preempt: trying to run inhibitted thread"));
+ ("maybe_preempt: trying to run inhibited thread"));
pri = td->td_priority;
cpri = ctd->td_priority;
if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
- TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
+ TD_IS_INHIBITED(ctd))
return (0);
#ifndef FULL_PREEMPTION
if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
@@ -690,32 +253,24 @@
ctd->td_owepreempt = 1;
return (0);
}
-
/*
* Thread is runnable but not yet put on system run queue.
*/
+ MPASS(ctd->td_lock == td->td_lock);
MPASS(TD_ON_RUNQ(td));
- MPASS(td->td_sched->ke_state != KES_ONRUNQ);
- if (td->td_proc->p_flag & P_HADTHREADS) {
- /*
- * If this is a threaded process we actually ARE on the
- * ksegrp run queue so take it off that first.
- * Also undo any damage done to the last_assigned pointer.
- * XXX Fix setrunqueue so this isn't needed
- */
- struct ksegrp *kg;
-
- kg = td->td_ksegrp;
- if (kg->kg_last_assigned == td)
- kg->kg_last_assigned =
- TAILQ_PREV(td, threadqueue, td_runq);
- TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
- }
-
TD_SET_RUNNING(td);
CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
td->td_proc->p_pid, td->td_proc->p_comm);
+ SCHED_STAT_INC(switch_preempt);
mi_switch(SW_INVOL|SW_PREEMPT, td);
+ /*
+ * td's lock pointer may have changed. We have to return with it
+ * locked.
+ */
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
return (1);
#else
return (0);
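[The #ifndef FULL_PREEMPTION test in maybe_preempt() above restricts involuntary switches to a window: the incoming thread must be an interrupt thread, or the current thread must be in the idle class. A standalone sketch of that predicate; the priority constants below are illustrative stand-ins for the values in sys/priority.h, not taken from this diff:

#include <stdio.h>

/* Illustrative values only; lower number means higher priority. */
#define PRI_MAX_ITHD    47
#define PRI_MIN_IDLE    224

/*
 * Without FULL_PREEMPTION, only interrupt threads (or threads about to
 * displace the idle class) force an immediate switch; everything else
 * waits for the next scheduling point.
 */
static int
should_preempt(int pri, int cpri)
{
        if (pri >= cpri)                /* not strictly better */
                return (0);
        if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
                return (0);             /* outside the allowed window */
        return (1);
}

int
main(void)
{
        printf("%d\n", should_preempt(20, 100));        /* ithread: 1 */
        printf("%d\n", should_preempt(90, 100));        /* timeshare: 0 */
        return (0);
}
]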
@@ -793,6 +348,38 @@
return (-1);
}
+static __inline int
+runq_findbit_from(struct runq *rq, u_char pri)
+{
+ struct rqbits *rqb;
+ rqb_word_t mask;
+ int i;
+
+ /*
+ * Set the mask for the first word so we ignore priorities before 'pri'.
+ */
+ mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
+ rqb = &rq->rq_status;
+again:
+ for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
+ mask = rqb->rqb_bits[i] & mask;
+ if (mask == 0)
+ continue;
+ pri = RQB_FFS(mask) + (i << RQB_L2BPW);
+ CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
+ mask, i, pri);
+ return (pri);
+ }
+ if (pri == 0)
+ return (-1);
+ /*
+ * Wrap back around to the beginning of the list just once so we
+ * scan the whole thing.
+ */
+ pri = 0;
+ goto again;
+}
+
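[The new runq_findbit_from() scans the status bitmap for the first non-empty queue at or after a given index, wrapping around once so a circular caller still sees every queue. A self-contained userland sketch of the same masking-and-wrap technique (toy sizes, ffs() in place of the kernel's RQB_FFS):

#include <limits.h>
#include <stdio.h>
#include <strings.h>    /* ffs() */

#define NBITS   128     /* toy priority space */
#define BPW     (sizeof(unsigned int) * CHAR_BIT)
#define NWORDS  (NBITS / BPW)

/*
 * Find the lowest set bit at or above 'from', wrapping to the start of
 * the bitmap exactly once, mirroring the masked-first-word plus
 * "goto again" structure above.  Returns -1 if no bit is set.
 */
static int
findbit_from(unsigned int *bits, unsigned int from)
{
        unsigned int mask, i;

        mask = ~0U << (from % BPW);
again:
        for (i = from / BPW; i < NWORDS; mask = ~0U, i++) {
                mask &= bits[i];
                if (mask == 0)
                        continue;
                return (ffs((int)mask) - 1 + (int)(i * BPW));
        }
        if (from == 0)
                return (-1);
        from = 0;
        mask = ~0U;
        goto again;
}

int
main(void)
{
        unsigned int bits[NWORDS] = { 0 };

        bits[0] |= 1U << 5;                     /* "priority" 5 is runnable */
        printf("%d\n", findbit_from(bits, 40)); /* wraps around, prints 5 */
        printf("%d\n", findbit_from(bits, 3));  /* prints 5 */
        return (0);
}
]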
/*
* Set the status bit of the queue corresponding to priority level pri,
* indicating that it is non-empty.
@@ -811,28 +398,45 @@
}
/*
- * Add the KSE to the queue specified by its priority, and set the
+ * Add the thread to the queue specified by its priority, and set the
* corresponding status bit.
*/
void
-runq_add(struct runq *rq, struct kse *ke, int flags)
+runq_add(struct runq *rq, struct td_sched *ts, int flags)
{
struct rqhead *rqh;
int pri;
- pri = ke->ke_thread->td_priority / RQ_PPQ;
- ke->ke_rqindex = pri;
+ pri = ts->ts_thread->td_priority / RQ_PPQ;
+ ts->ts_rqindex = pri;
runq_setbit(rq, pri);
rqh = &rq->rq_queues[pri];
- CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
- ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
+ CTR5(KTR_RUNQ, "runq_add: td=%p ts=%p pri=%d %d rqh=%p",
+ ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
if (flags & SRQ_PREEMPTED) {
- TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
+ TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
} else {
- TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
+ TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
}
}
+void
+runq_add_pri(struct runq *rq, struct td_sched *ts, u_char pri, int flags)
+{
+ struct rqhead *rqh;
+
+ KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
+ ts->ts_rqindex = pri;
+ runq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ CTR5(KTR_RUNQ, "runq_add_pri: td=%p ke=%p pri=%d idx=%d rqh=%p",
+ ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
+ if (flags & SRQ_PREEMPTED) {
+ TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
+ } else {
+ TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
+ }
+}
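[In runq_add above, pri = td_priority / RQ_PPQ folds the priority space into a smaller set of queues, and runq_setbit records the queue as non-empty in a word-array bitmap. A worked standalone example of the index arithmetic; RQ_PPQ, RQ_NQS and the 32-bit status word size are quoted here as assumptions about the contemporary sys/runq.h values:

#include <stdio.h>

#define RQ_NQS          64      /* assumed: number of run queues */
#define RQ_PPQ          4       /* assumed: priorities per queue */
#define RQB_BPW         32      /* assumed: bits per status word */

int
main(void)
{
        int td_priority = 130;                  /* an example kernel priority */
        int pri = td_priority / RQ_PPQ;         /* queue index: 32 */
        int word = pri / RQB_BPW;               /* status word holding the bit: 1 */
        int bit = pri % RQB_BPW;                /* bit within that word: 0 */

        printf("priority %d -> queue %d (word %d, bit %d of %d queues)\n",
            td_priority, pri, word, bit, RQ_NQS);
        return (0);
}
]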
/*
* Return true if there are runnable processes of any priority on the run
* queue, false otherwise. Has no side effects, does not modify the run
@@ -864,14 +468,13 @@
/*
* Find the highest priority process on the run queue.
*/
-struct kse *
+struct td_sched *
runq_choose(struct runq *rq)
{
struct rqhead *rqh;
- struct kse *ke;
+ struct td_sched *ts;
int pri;
- mtx_assert(&sched_lock, MA_OWNED);
while ((pri = runq_findbit(rq)) != -1) {
rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
@@ -883,72 +486,106 @@
*/
int count = runq_fuzz;
int cpu = PCPU_GET(cpuid);
- struct kse *ke2;
- ke2 = ke = TAILQ_FIRST(rqh);
+ struct td_sched *ts2;
+ ts2 = ts = TAILQ_FIRST(rqh);
- while (count-- && ke2) {
- if (ke->ke_thread->td_lastcpu == cpu) {
- ke = ke2;
+ while (count-- && ts2) {
+ if (ts->ts_thread->td_lastcpu == cpu) {
+ ts = ts2;
break;
}
- ke2 = TAILQ_NEXT(ke2, ke_procq);
+ ts2 = TAILQ_NEXT(ts2, ts_procq);
}
- } else
+ } else
#endif
- ke = TAILQ_FIRST(rqh);
- KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
+ ts = TAILQ_FIRST(rqh);
+ KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
CTR3(KTR_RUNQ,
- "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
- return (ke);
+ "runq_choose: pri=%d td_sched=%p rqh=%p", pri, ts, rqh);
+ return (ts);
}
CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
return (NULL);
}
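[The SMP/SCHED_4BSD block in runq_choose above is a small affinity heuristic: among roughly the first runq_fuzz entries of the selected queue, prefer one that last ran on the current CPU, otherwise take the head. A minimal userland sketch of that intent (hypothetical types; not the kernel's exact loop):

#include <stddef.h>
#include <stdio.h>

struct fake_ts {
        int lastcpu;                    /* CPU this entry last ran on */
        struct fake_ts *next;           /* next entry in the queue */
};

/*
 * Among the first 'fuzz' entries, prefer one that last ran on 'cpu';
 * otherwise fall back to the head of the queue.
 */
static struct fake_ts *
choose_with_affinity(struct fake_ts *head, int cpu, int fuzz)
{
        struct fake_ts *ts;

        for (ts = head; fuzz-- > 0 && ts != NULL; ts = ts->next)
                if (ts->lastcpu == cpu)
                        return (ts);
        return (head);
}

int
main(void)
{
        struct fake_ts c = { 1, NULL };
        struct fake_ts b = { 2, &c };
        struct fake_ts a = { 0, &b };

        printf("chose entry with lastcpu %d\n",
            choose_with_affinity(&a, 2, 3)->lastcpu);   /* picks the middle entry */
        return (0);
}
]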
+struct td_sched *
+runq_choose_from(struct runq *rq, u_char idx)
+{
+ struct rqhead *rqh;
+ struct td_sched *ts;
+ int pri;
+
+ if ((pri = runq_findbit_from(rq, idx)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ ts = TAILQ_FIRST(rqh);
+ KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
+ CTR4(KTR_RUNQ,
+ "runq_choose_from: pri=%d kse=%p idx=%d rqh=%p",
+ pri, ts, ts->ts_rqindex, rqh);
+ return (ts);
+ }
+ CTR1(KTR_RUNQ, "runq_choose_from: idleproc pri=%d", pri);
+
+ return (NULL);
+}
/*
- * Remove the KSE from the queue specified by its priority, and clear the
+ * Remove the thread from the queue specified by its priority, and clear the
* corresponding status bit if the queue becomes empty.
- * Caller must set ke->ke_state afterwards.
+ * Caller must set state afterwards.
*/
void
-runq_remove(struct runq *rq, struct kse *ke)
+runq_remove(struct runq *rq, struct td_sched *ts)
+{
+
+ runq_remove_idx(rq, ts, NULL);
+}
+
+void
+runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
{
struct rqhead *rqh;
- int pri;
+ u_char pri;
- KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
- ("runq_remove: process swapped out"));
- pri = ke->ke_rqindex;
+ KASSERT(ts->ts_thread->td_flags & TDF_INMEM,
+ ("runq_remove_idx: thread swapped out"));
+ pri = ts->ts_rqindex;
+ KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
rqh = &rq->rq_queues[pri];
- CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
- ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
- KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
- TAILQ_REMOVE(rqh, ke, ke_procq);
+ CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
+ ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
+ {
+ struct td_sched *nts;
+
+ TAILQ_FOREACH(nts, rqh, ts_procq)
+ if (nts == ts)
+ break;
+ if (ts != nts)
+ panic("runq_remove_idx: ts %p not on rqindex %d",
+ ts, pri);
+ }
+ TAILQ_REMOVE(rqh, ts, ts_procq);
if (TAILQ_EMPTY(rqh)) {
- CTR0(KTR_RUNQ, "runq_remove: empty");
+ CTR0(KTR_RUNQ, "runq_remove_idx: empty");
runq_clrbit(rq, pri);
+ if (idx != NULL && *idx == pri)
+ *idx = (pri + 1) % RQ_NQS;
}
}
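[The new idx out-parameter of runq_remove_idx serves a circular consumer (SCHED_ULE keeps a rotating queue index): when removal empties the very queue the caller's index points at, the index is advanced so the next runq_choose_from() scan does not begin on a slot known to be empty. A trivial standalone illustration of that wrap, with RQ_NQS assumed to be 64:

#include <stdio.h>

#define RQ_NQS  64      /* assumed number of run queues */

int
main(void)
{
        unsigned char idx = 63;         /* caller's circular scan position */
        unsigned char pri = 63;         /* queue that just became empty */

        if (idx == pri)
                idx = (pri + 1) % RQ_NQS;       /* 63 wraps to 0 */
        printf("next scan starts at queue %u\n", idx);
        return (0);
}
]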
/****** functions that are temporarily here ***********/
#include <vm/uma.h>
-extern struct mtx kse_zombie_lock;
/*
* Allocate scheduler specific per-process resources.
- * The thread and ksegrp have already been linked in.
- * In this case just set the default concurrency value.
+ * The thread and proc have already been linked in.
*
* Called from:
* proc_init() (UMA init method)
*/
void
-sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
+sched_newproc(struct proc *p, struct thread *td)
{
-
- /* This can go in sched_fork */
- sched_init_concurrency(kg);
}
/*
@@ -962,70 +599,12 @@
void
sched_newthread(struct thread *td)
{
- struct td_sched *ke;
-
- ke = (struct td_sched *) (td + 1);
- bzero(ke, sizeof(*ke));
- td->td_sched = ke;
- ke->ke_thread = td;
- ke->ke_state = KES_THREAD;
-}
-
-/*
- * Set up an initial concurrency of 1
- * and set the given thread (if given) to be using that
- * concurrency slot.
- * May be used "offline"..before the ksegrp is attached to the world
- * and thus wouldn't need schedlock in that case.
- * Called from:
- * thr_create()
- * proc_init() (UMA) via sched_newproc()
- */
-void
-sched_init_concurrency(struct ksegrp *kg)
-{
-
- CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
- kg->kg_concurrency = 1;
- kg->kg_avail_opennings = 1;
-}
-
-/*
- * Change the concurrency of an existing ksegrp to N
- * Called from:
- * kse_create()
- * kse_exit()
- * thread_exit()
- * thread_single()
- */
-void
-sched_set_concurrency(struct ksegrp *kg, int concurrency)
-{
-
- CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
- kg,
- concurrency,
- kg->kg_avail_opennings,
- kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
- kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
- kg->kg_concurrency = concurrency;
-}
-
-/*
- * Called from thread_exit() for all exiting thread
- *
- * Not to be confused with sched_exit_thread()
- * that is only called from thread_exit() for threads exiting
- * without the rest of the process exiting because it is also called from
- * sched_exit() and we wouldn't want to call it twice.
- * XXX This can probably be fixed.
- */
-void
-sched_thread_exit(struct thread *td)
-{
+ struct td_sched *ts;
- SLOT_RELEASE(td->td_ksegrp);
- slot_fill(td->td_ksegrp);
+ ts = (struct td_sched *) (td + 1);
+ bzero(ts, sizeof(*ts));
+ td->td_sched = ts;
+ ts->ts_thread = td;
}
#endif /* KERN_SWITCH_INCLUDE */
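[Both versions of sched_newthread() above rely on the same layout trick: the scheduler-private td_sched is not a separate allocation but lives immediately after struct thread in the same memory block, so (td + 1) addresses it. A userland sketch of that co-allocation pattern (hypothetical names; the kernel sizes its thread zone to hold both structures, and the real types carry their own alignment requirements):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_td_sched {
        int runq_index;                 /* stand-in for the private state */
};

struct fake_thread {
        int tid;
        struct fake_td_sched *td_sched; /* points just past the thread itself */
};

int
main(void)
{
        struct fake_thread *td;
        struct fake_td_sched *ts;

        /* One allocation holds both structures back to back. */
        td = malloc(sizeof(*td) + sizeof(*ts));
        ts = (struct fake_td_sched *)(td + 1);  /* same trick as above */
        memset(ts, 0, sizeof(*ts));
        td->td_sched = ts;
        printf("td at %p, its sched data at %p\n", (void *)td, (void *)ts);
        free(td);
        return (0);
}
]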