[Midnightbsd-cvs] src: sys/nfsclient: merge

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Tue Dec 2 16:48:59 EST 2008


Log Message:
-----------
merge

Modified Files:
--------------
    src/sys/nfsclient:
        bootp_subr.c (r1.1.1.1 -> r1.2)
        krpc_subr.c (r1.1.1.1 -> r1.2)
        nfs.h (r1.1.1.1 -> r1.2)
        nfs_bio.c (r1.1.1.1 -> r1.2)
        nfs_diskless.c (r1.2 -> r1.3)
        nfs_lock.c (r1.1.1.2 -> r1.2)
        nfs_nfsiod.c (r1.1.1.1 -> r1.2)
        nfs_node.c (r1.2 -> r1.3)
        nfs_socket.c (r1.5 -> r1.6)
        nfs_subs.c (r1.1.1.1 -> r1.2)
        nfs_vfsops.c (r1.2 -> r1.3)
        nfs_vnops.c (r1.2 -> r1.3)
        nfsdiskless.h (r1.1.1.1 -> r1.2)
        nfsm_subs.h (r1.1.1.1 -> r1.2)
        nfsmount.h (r1.1.1.1 -> r1.2)
        nfsnode.h (r1.1.1.1 -> r1.2)
        nlminfo.h (r1.1.1.1 -> r1.2)

-------------- next part --------------
Index: nfs_socket.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_socket.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -L sys/nfsclient/nfs_socket.c -L sys/nfsclient/nfs_socket.c -u -r1.5 -r1.6
--- sys/nfsclient/nfs_socket.c
+++ sys/nfsclient/nfs_socket.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.125.2.6 2006/02/16 02:39:52 rees Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.154.2.1 2007/10/12 19:18:46 mohans Exp $");
 
 /*
  * Socket operations for use by nfs
@@ -78,44 +78,14 @@
 #define	FALSE	0
 
 extern u_int32_t nfs_xid;
-
-/*
- * Estimate rto for an nfs rpc sent via. an unreliable datagram.
- * Use the mean and mean deviation of rtt for the appropriate type of rpc
- * for the frequent rpcs and a default for the others.
- * The justification for doing "other" this way is that these rpcs
- * happen so infrequently that timer est. would probably be stale.
- * Also, since many of these rpcs are
- * non-idempotent, a conservative timeout is desired.
- * getattr, lookup - A+2D
- * read, write     - A+4D
- * other           - nm_timeo
- */
-#define	NFS_RTO(n, t) \
-	((t) == 0 ? (n)->nm_timeo : \
-	 ((t) < 3 ? \
-	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
-	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
-#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
-#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
-
-/*
- * Defines which timer to use for the procnum.
- * 0 - default
- * 1 - getattr
- * 2 - lookup
- * 3 - read
- * 4 - write
- */
-static int proct[NFS_NPROCS] = {
-	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-};
+extern struct mtx nfs_xid_mtx;
 
 static int	nfs_realign_test;
 static int	nfs_realign_count;
 static int	nfs_bufpackets = 4;
 static int	nfs_reconnects;
-static int	nfs3_jukebox_delay = 10;
+static int     nfs3_jukebox_delay = 10;
+static int     nfs_skip_wcc_data_onerr = 1;
 
 SYSCTL_DECL(_vfs_nfs);
 
@@ -125,7 +95,8 @@
 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
     "number of times the nfs client has had to reconnect");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
-    "number of seconds to delay a retry after receiving EJUKEBOX");
+	   "number of seconds to delay a retry after receiving EJUKEBOX");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0, "");
 
 /*
  * There is a congestion window for outstanding rpcs maintained per mount
@@ -153,10 +124,134 @@
 static int	nfs_reconnect(struct nfsreq *rep);
 static void nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
 static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
-static void wakeup_nfsreq(struct nfsreq *req);
 
 extern struct mtx nfs_reqq_mtx;
-extern struct mtx nfs_reply_mtx;
+
+/*
+ * RTT estimator
+ */
+
+static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
+	NFS_DEFAULT_TIMER,	/* NULL */
+	NFS_GETATTR_TIMER,	/* GETATTR */
+	NFS_DEFAULT_TIMER,	/* SETATTR */
+	NFS_LOOKUP_TIMER,	/* LOOKUP */
+	NFS_GETATTR_TIMER,	/* ACCESS */
+	NFS_READ_TIMER,		/* READLINK */
+	NFS_READ_TIMER,		/* READ */
+	NFS_WRITE_TIMER,	/* WRITE */
+	NFS_DEFAULT_TIMER,	/* CREATE */
+	NFS_DEFAULT_TIMER,	/* MKDIR */
+	NFS_DEFAULT_TIMER,	/* SYMLINK */
+	NFS_DEFAULT_TIMER,	/* MKNOD */
+	NFS_DEFAULT_TIMER,	/* REMOVE */
+	NFS_DEFAULT_TIMER,	/* RMDIR */
+	NFS_DEFAULT_TIMER,	/* RENAME */
+	NFS_DEFAULT_TIMER,	/* LINK */
+	NFS_READ_TIMER,		/* READDIR */
+	NFS_READ_TIMER,		/* READDIRPLUS */
+	NFS_DEFAULT_TIMER,	/* FSSTAT */
+	NFS_DEFAULT_TIMER,	/* FSINFO */
+	NFS_DEFAULT_TIMER,	/* PATHCONF */
+	NFS_DEFAULT_TIMER,	/* COMMIT */
+	NFS_DEFAULT_TIMER,	/* NOOP */
+};
+
+/*
+ * Choose the correct RTT timer for this NFS procedure.
+ */
+static inline enum nfs_rto_timer_t
+nfs_rto_timer(u_int32_t procnum)
+{
+	return nfs_proct[procnum];
+}
+
+/*
+ * Initialize the RTT estimator state for a new mount point.
+ */
+static void
+nfs_init_rtt(struct nfsmount *nmp)
+{
+	int i;
+
+	for (i = 0; i < NFS_MAX_TIMER; i++)
+		nmp->nm_srtt[i] = NFS_INITRTT;
+	for (i = 0; i < NFS_MAX_TIMER; i++)
+		nmp->nm_sdrtt[i] = 0;
+}
+
+/*
+ * Update a mount point's RTT estimator state using data from the
+ * passed-in request.
+ * 
+ * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
+ *
+ * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
+ * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
+ * between N + dt and N + 2 - dt ticks, add 1 before calculating the
+ * update values.
+ */
+static void
+nfs_update_rtt(struct nfsreq *rep)
+{
+	int t1 = rep->r_rtt + 1;
+	int index = nfs_rto_timer(rep->r_procnum) - 1;
+	int *srtt = &rep->r_nmp->nm_srtt[index];
+	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
+
+	t1 -= *srtt >> 3;
+	*srtt += t1;
+	if (t1 < 0)
+		t1 = -t1;
+	t1 -= *sdrtt >> 2;
+	*sdrtt += t1;
+}
+
+/*
+ * Estimate RTO for an NFS RPC sent via an unreliable datagram.
+ *
+ * Use the mean and mean deviation of RTT for the appropriate type
+ * of RPC for the frequent RPCs and a default for the others.
+ * The justification for doing "other" this way is that these RPCs
+ * happen so infrequently that timer est. would probably be stale.
+ * Also, since many of these RPCs are non-idempotent, a conservative
+ * timeout is desired.
+ *
+ * getattr, lookup - A+2D
+ * read, write     - A+4D
+ * other           - nm_timeo
+ */
+static int
+nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
+{
+	enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
+	int index = timer - 1;
+	int rto;
+
+	switch (timer) {
+	case NFS_GETATTR_TIMER:
+	case NFS_LOOKUP_TIMER:
+		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
+				((nmp->nm_sdrtt[index] + 1) >> 1);
+		break;
+	case NFS_READ_TIMER:
+	case NFS_WRITE_TIMER:
+		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
+				(nmp->nm_sdrtt[index] + 1);
+		break;
+	default:
+		rto = nmp->nm_timeo;
+		return (rto);
+	}
+
+	if (rto < NFS_MINRTO)
+		rto = NFS_MINRTO;
+	else if (rto > NFS_MAXRTO)
+		rto = NFS_MAXRTO;
+
+	return (rto);
+}
+
 
 /*
  * Initialize sockets and congestion for a new NFS connection.
@@ -171,13 +266,11 @@
 	struct sockaddr *saddr;
 	struct thread *td = &thread0; /* only used for socreate and sobind */
 
-	NET_ASSERT_GIANT();
-
 	if (nmp->nm_sotype == SOCK_STREAM) {
-		mtx_lock(&nmp->nm_nfstcpstate.mtx);
+		mtx_lock(&nmp->nm_mtx);
  		nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
  		nmp->nm_nfstcpstate.rpcresid = 0;
-		mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+		mtx_unlock(&nmp->nm_mtx);
  	}	
 	nmp->nm_so = NULL;
 	saddr = nmp->nm_nam;
@@ -242,12 +335,16 @@
 	 * Protocols that do not require connections may be optionally left
 	 * unconnected for servers that reply from a port other than NFS_PORT.
 	 */
+	mtx_lock(&nmp->nm_mtx);
 	if (nmp->nm_flag & NFSMNT_NOCONN) {
 		if (nmp->nm_soflags & PR_CONNREQUIRED) {
 			error = ENOTCONN;
+			mtx_unlock(&nmp->nm_mtx);
 			goto bad;
-		}
+		} else
+			mtx_unlock(&nmp->nm_mtx);
 	} else {
+		mtx_unlock(&nmp->nm_mtx);
 		error = soconnect(so, nmp->nm_nam, td);
 		if (error)
 			goto bad;
@@ -278,7 +375,10 @@
 		SOCK_UNLOCK(so);
 	}
 	so->so_rcv.sb_timeo = 12 * hz;
-	so->so_snd.sb_timeo = 5 * hz;
+	if (nmp->nm_sotype == SOCK_STREAM)
+		so->so_snd.sb_timeo = 1 * hz;	/* 1s snd timeout for NFS/TCP */
+	else
+		so->so_snd.sb_timeo = 5 * hz;
 
 	/*
 	 * Get buffer reservation size from sysctl, but impose reasonable
@@ -289,7 +389,7 @@
 		pktscale = 2;
 	if (pktscale > 64)
 		pktscale = 64;
-
+	mtx_lock(&nmp->nm_mtx);
 	if (nmp->nm_sotype == SOCK_DGRAM) {
 		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
 		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
@@ -312,7 +412,9 @@
 			sopt.sopt_val = &val;
 			sopt.sopt_valsize = sizeof val;
 			val = 1;
+			mtx_unlock(&nmp->nm_mtx);
 			sosetopt(so, &sopt);
+			mtx_lock(&nmp->nm_mtx);
 		}
 		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
 			struct sockopt sopt;
@@ -325,13 +427,16 @@
 			sopt.sopt_val = &val;
 			sopt.sopt_valsize = sizeof val;
 			val = 1;
+			mtx_unlock(&nmp->nm_mtx);
 			sosetopt(so, &sopt);
+			mtx_lock(&nmp->nm_mtx);
 		}
 		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
 		    sizeof (u_int32_t)) * pktscale;
 		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
 		    sizeof (u_int32_t)) * pktscale;
 	}
+	mtx_unlock(&nmp->nm_mtx);
 	error = soreserve(so, sndreserve, rcvreserve);
 	if (error)
 		goto bad;
@@ -348,14 +453,13 @@
 	so->so_snd.sb_flags |= SB_NOINTR;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
+	mtx_lock(&nmp->nm_mtx);
 	/* Initialize other non-zero congestion variables */
-	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
-		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
-	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
-		nmp->nm_sdrtt[3] = 0;
+	nfs_init_rtt(nmp);
 	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
 	nmp->nm_sent = 0;
 	nmp->nm_timeouts = 0;
+	mtx_unlock(&nmp->nm_mtx);
 	return (0);
 
 bad:
@@ -363,6 +467,17 @@
 	return (error);
 }
 
+static void
+nfs_wakup_reconnectors(struct nfsmount *nmp)
+{
+	KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
+	if (--nmp->nm_nfstcpstate.sock_send_inprog == 0 &&
+	    (nmp->nm_nfstcpstate.flags & NFS_TCP_WAIT_WRITE_DRAIN)) {
+		nmp->nm_nfstcpstate.flags &= ~NFS_TCP_WAIT_WRITE_DRAIN;
+		wakeup((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog);
+	}
+}
+
 /*
  * Reconnect routine:
  * Called when a connection is broken on a reliable protocol.
@@ -378,14 +493,41 @@
 	struct nfsreq *rp;
 	struct nfsmount *nmp = rep->r_nmp;
 	int error;
+	int slpflag = 0;
+
+	KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
+	if (nmp->nm_flag & NFSMNT_INT)
+		slpflag = PCATCH;
+	/*
+	 * Wait for any pending writes to this socket to drain (or timeout).
+	 */
+	while (nmp->nm_nfstcpstate.sock_send_inprog > 0) {
+		nmp->nm_nfstcpstate.flags |= NFS_TCP_WAIT_WRITE_DRAIN;
+		error = msleep((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog,
+			       &nmp->nm_mtx, slpflag | (PZERO - 1), "nfscon", 0);		
+	}
+	/*
+	 * Grab the nfs_connect_lock to serialize connects. 
+	 * After grabbing the nfs_connect_lock, check if a reconnect is necessary or
+	 * if someone else beat us to the connect !
+	 */
+	error = nfs_connect_lock(rep);
+	if (error)
+		goto unlock_exit;
+	if ((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) == 0)
+		goto unlock_exit;
+	else
+		mtx_unlock(&nmp->nm_mtx);
 
 	nfs_reconnects++;
 	nfs_disconnect(nmp);
 	while ((error = nfs_connect(nmp, rep)) != 0) {
 		if (error == ERESTART)
 			error = EINTR;
-		if (error == EIO || error == EINTR)
-			return (error);
+		if (error == EIO || error == EINTR) {
+			mtx_lock(&nmp->nm_mtx);
+			goto unlock_exit;
+		}
 		(void) tsleep(&lbolt, PSOCK, "nfscon", 0);
 	}
 
@@ -398,9 +540,10 @@
  	 * until the connection is established successfully, and 
  	 * then re-transmit the request.
  	 */
-	mtx_lock(&nmp->nm_nfstcpstate.mtx);
+	mtx_lock(&nmp->nm_mtx);
 	nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
-	mtx_unlock(&nmp->nm_nfstcpstate.mtx);	
+	nmp->nm_nfstcpstate.rpcresid = 0;
+	mtx_unlock(&nmp->nm_mtx);	
 
 	/*
 	 * Loop through outstanding request list and fix up all requests
@@ -408,11 +551,18 @@
 	 */
 	mtx_lock(&nfs_reqq_mtx);
 	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
-		if (rp->r_nmp == nmp)
+		if (rp->r_nmp == nmp) {
+			mtx_lock(&rp->r_mtx);			
 			rp->r_flags |= R_MUSTRESEND;
+			mtx_unlock(&rp->r_mtx);
+		}
 	}
 	mtx_unlock(&nfs_reqq_mtx);
-	return (0);
+	mtx_lock(&nmp->nm_mtx);
+unlock_exit:
+	nfs_connect_unlock(rep);
+	mtx_unlock(&nmp->nm_mtx);		
+	return (error);
 }
 
 /*
@@ -423,11 +573,11 @@
 {
 	struct socket *so;
 
-	NET_ASSERT_GIANT();
-
+	mtx_lock(&nmp->nm_mtx);
 	if (nmp->nm_so) {
 		so = nmp->nm_so;
 		nmp->nm_so = NULL;
+		mtx_unlock(&nmp->nm_mtx);
 		SOCKBUF_LOCK(&so->so_rcv);
  		so->so_upcallarg = NULL;
  		so->so_upcall = NULL;
@@ -435,7 +585,8 @@
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		soshutdown(so, SHUT_WR);
 		soclose(so);
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 }
 
 void
@@ -462,8 +613,6 @@
 	struct sockaddr *sendnam;
 	int error, error2, soflags, flags;
 
-	NET_ASSERT_GIANT();
-
 	KASSERT(rep, ("nfs_send: called with rep == NULL"));
 
 	error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
@@ -471,13 +620,19 @@
 		m_freem(top);
 		return (error);
 	}
+	mtx_lock(&rep->r_nmp->nm_mtx);
+	mtx_lock(&rep->r_mtx);
 	if ((so = rep->r_nmp->nm_so) == NULL) {
 		rep->r_flags |= R_MUSTRESEND;
+		mtx_unlock(&rep->r_mtx);
+		mtx_unlock(&rep->r_nmp->nm_mtx);
 		m_freem(top);
-		return (0);
+		return (EPIPE);
 	}
 	rep->r_flags &= ~R_MUSTRESEND;
 	soflags = rep->r_nmp->nm_soflags;
+	mtx_unlock(&rep->r_mtx);
+	mtx_unlock(&rep->r_nmp->nm_mtx);
 
 	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
 		sendnam = NULL;
@@ -488,11 +643,12 @@
 	else
 		flags = 0;
 
-	error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0,
-						     flags, curthread /*XXX*/);
+	error = sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/);
 	if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
 		error = 0;
+		mtx_lock(&rep->r_mtx);
 		rep->r_flags |= R_MUSTRESEND;
+		mtx_unlock(&rep->r_mtx);
 	}
 
 	if (error) {
@@ -512,15 +668,17 @@
 		error2 = NFS_SIGREP(rep);
 		if (error2)
 			error = error2;
-		else
+		else {
+			mtx_lock(&rep->r_mtx);
 			rep->r_flags |= R_MUSTRESEND;
+			mtx_unlock(&rep->r_mtx);
+		}
 
 		/*
 		 * Handle any recoverable (soft) socket errors here. (?)
-		 * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer()
+		 * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer().
 		 */
-		if (error != EINTR && error != ERESTART && error != EIO &&
-			error != EPIPE)
+		if (error != EINTR && error != ERESTART && error != EIO && error != EPIPE)
 			error = 0;
 	}
 	return (error);
@@ -532,82 +690,98 @@
 	register struct socket *so;
 	register struct mbuf *m;
 	int error = 0, sotype, slpflag;
-
-	NET_ASSERT_GIANT();
-
-	sotype = rep->r_nmp->nm_sotype;
+	struct nfsmount *nmp = rep->r_nmp;
+	
+	sotype = nmp->nm_sotype;
 	/*
 	 * For reliable protocols, lock against other senders/receivers
 	 * in case a reconnect is necessary.
 	 */
 	if (sotype != SOCK_DGRAM) {
-		error = nfs_sndlock(rep);
-		if (error)
-			return (error);
 tryagain:
+		mtx_lock(&nmp->nm_mtx);
+		mtx_lock(&rep->r_mtx);
 		if (rep->r_mrep) {
-			nfs_sndunlock(rep);
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 			return (0);
 		}
 		if (rep->r_flags & R_SOFTTERM) {
-			nfs_sndunlock(rep);
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 			return (EINTR);
 		}
-		so = rep->r_nmp->nm_so;
-		mtx_lock(&rep->r_nmp->nm_nfstcpstate.mtx);
+		so = nmp->nm_so;
 		if (!so || 
-		    (rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
-			mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
+		    (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
+			mtx_unlock(&rep->r_mtx);
+			nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
 			error = nfs_reconnect(rep);
-			if (error) {
-				nfs_sndunlock(rep);
+			if (error)
 				return (error);
-			}
 			goto tryagain;
-		} else
-			mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
+		}
 		while (rep->r_flags & R_MUSTRESEND) {
+			mtx_unlock(&rep->r_mtx);
+			nmp->nm_nfstcpstate.sock_send_inprog++;
+			mtx_unlock(&nmp->nm_mtx);
 			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
 			nfsstats.rpcretries++;
-			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
+			error = nfs_send(so, nmp->nm_nam, m, rep);
 			if (error) {
-				if (error == EINTR || error == ERESTART ||
-				    (error = nfs_reconnect(rep)) != 0) {
-					nfs_sndunlock(rep);
+				mtx_lock(&nmp->nm_mtx);
+				nfs_wakup_reconnectors(nmp);
+				if (!(error == EINTR || error == ERESTART)) {
+					nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
+					error = nfs_reconnect(rep);
+				} else
+					mtx_unlock(&nmp->nm_mtx);
+				if (error)
 					return (error);
-				}
 				goto tryagain;
-			}
+			} else {
+				mtx_lock(&nmp->nm_mtx);
+				nfs_wakup_reconnectors(nmp);
+				mtx_lock(&rep->r_mtx);
+ 			}
 		}
-		nfs_sndunlock(rep);
+		mtx_unlock(&rep->r_mtx);
+		mtx_unlock(&nmp->nm_mtx);
 	}
 	slpflag = 0;
-	if (rep->r_nmp->nm_flag & NFSMNT_INT)
+	mtx_lock(&nmp->nm_mtx);
+	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
-	mtx_lock(&nfs_reply_mtx);
+	mtx_unlock(&nmp->nm_mtx);
+	mtx_lock(&rep->r_mtx);
 	while ((rep->r_mrep == NULL) && (error == 0) && 
 	       ((rep->r_flags & R_SOFTTERM) == 0) &&
 	       ((sotype == SOCK_DGRAM) || ((rep->r_flags & R_MUSTRESEND) == 0)))
-		error = msleep((caddr_t)rep, &nfs_reply_mtx, 
+		error = msleep((caddr_t)rep, &rep->r_mtx, 
 			       slpflag | (PZERO - 1), "nfsreq", 0);
-	mtx_unlock(&nfs_reply_mtx);
-	if (error == EINTR || error == ERESTART)
+	if (error == EINTR || error == ERESTART) {
 		/* NFS operations aren't restartable. Map ERESTART to EINTR */
+		mtx_unlock(&rep->r_mtx);
 		return (EINTR);
-	if (rep->r_flags & R_SOFTTERM)
+	}
+	if (rep->r_flags & R_SOFTTERM) {
 		/* Request was terminated because we exceeded the retries (soft mount) */
+		mtx_unlock(&rep->r_mtx);
 		return (ETIMEDOUT);
+	}
+	mtx_unlock(&rep->r_mtx);
 	if (sotype == SOCK_STREAM) {
-		mtx_lock(&rep->r_nmp->nm_nfstcpstate.mtx);
-		if (((rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) || 
+		mtx_lock(&nmp->nm_mtx);
+		mtx_lock(&rep->r_mtx);
+		if (((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) || 
 		     (rep->r_flags & R_MUSTRESEND))) {
-			mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);	
-			error = nfs_sndlock(rep);
-			if (error)
-				return (error);
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);	
 			goto tryagain;
-		} else
-			mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
+		} else {
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);	
+		}
 	}
 	return (error);
 }
@@ -625,7 +799,6 @@
 	caddr_t dpos;
 	u_int32_t rxid, *tl;
 	struct nfsreq *rep;
-	register int32_t t1;
 	int error;
 	
 	/*
@@ -660,6 +833,8 @@
 	 * Iff no match, just drop the datagram
 	 */
 	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
+		mtx_lock(&nmp->nm_mtx);
+		mtx_lock(&rep->r_mtx);
 		if (rep->r_mrep == NULL && rxid == rep->r_xid) {
 			/* Found it.. */
 			rep->r_mrep = mrep;
@@ -681,30 +856,16 @@
 				rep->r_flags &= ~R_SENT;
 				nmp->nm_sent -= NFS_CWNDSCALE;
 			}
-			/*
-			 * Update rtt using a gain of 0.125 on the mean
-			 * and a gain of 0.25 on the deviation.
-			 */
-			if (rep->r_flags & R_TIMING) {
-				/*
-				 * Since the timer resolution of
-				 * NFS_HZ is so course, it can often
-				 * result in r_rtt == 0. Since
-				 * r_rtt == N means that the actual
-				 * rtt is between N+dt and N+2-dt ticks,
-				 * add 1.
-				 */
-				t1 = rep->r_rtt + 1;
-				t1 -= (NFS_SRTT(rep) >> 3);
-				NFS_SRTT(rep) += t1;
-				if (t1 < 0)
-					t1 = -t1;
-				t1 -= (NFS_SDRTT(rep) >> 2);
-				NFS_SDRTT(rep) += t1;
-			}
+			if (rep->r_flags & R_TIMING)
+				nfs_update_rtt(rep);
 			nmp->nm_timeouts = 0;
+			wakeup((caddr_t)rep);
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 			break;
 		}
+		mtx_unlock(&rep->r_mtx);
+		mtx_unlock(&nmp->nm_mtx);
 	}
 	/*
 	 * If not matched to a request, drop it.
@@ -713,31 +874,18 @@
 	if (rep == 0) {
 		nfsstats.rpcunexpected++;
 		m_freem(mrep);
-	} else
-		wakeup_nfsreq(rep);
+	}
 	mtx_unlock(&nfs_reqq_mtx);
 }
 
-/* 
- * The wakeup of the requestor should be done under the mutex
- * to avoid potential missed wakeups.
- */
-static void 
-wakeup_nfsreq(struct nfsreq *req)
-{
-	mtx_lock(&nfs_reply_mtx);
-	wakeup((caddr_t)req);
-	mtx_unlock(&nfs_reply_mtx);	
-}
-
 static void
 nfs_mark_for_reconnect(struct nfsmount *nmp)
 {
 	struct nfsreq *rp;
 
-	mtx_lock(&nmp->nm_nfstcpstate.mtx);
+	mtx_lock(&nmp->nm_mtx);
 	nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
-	mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+	mtx_unlock(&nmp->nm_mtx);
 	/* 
 	 * Wakeup all processes that are waiting for replies 
 	 * on this mount point. One of them does the reconnect.
@@ -745,8 +893,10 @@
 	mtx_lock(&nfs_reqq_mtx);
 	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
 		if (rp->r_nmp == nmp) {
+			mtx_lock(&rp->r_mtx);
 			rp->r_flags |= R_MUSTRESEND;
-			wakeup_nfsreq(rp);
+			wakeup((caddr_t)rp);
+			mtx_unlock(&rp->r_mtx);
 		}
 	}
 	mtx_unlock(&nfs_reqq_mtx);
@@ -767,6 +917,20 @@
 
 #define nfstcp_marker_readable(so)	nfstcp_readable(so, sizeof(u_int32_t))
 
+static int
+nfs_copy_len(struct mbuf *mp, char *buf, int len)
+{
+	while (len > 0 && mp != NULL) {
+		int copylen = min(len, mp->m_len);
+		
+		bcopy(mp->m_data, buf, copylen);
+		buf += copylen;
+		len -= copylen;
+		mp = mp->m_next;
+	}
+	return (len);
+}
+
 static void
 nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag)
 {
@@ -781,17 +945,21 @@
 	 * Don't pick any more data from the socket if we've marked the 
 	 * mountpoint for reconnect.
 	 */
-	mtx_lock(&nmp->nm_nfstcpstate.mtx);
+	mtx_lock(&nmp->nm_mtx);
 	if (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) {
-		mtx_unlock(&nmp->nm_nfstcpstate.mtx);		
+		mtx_unlock(&nmp->nm_mtx);		
 		return;
 	} else			
-		mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+		mtx_unlock(&nmp->nm_mtx);
 	auio.uio_td = curthread;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	for ( ; ; ) {
+		mtx_lock(&nmp->nm_mtx);
 		if (nmp->nm_nfstcpstate.flags & NFS_TCP_EXPECT_RPCMARKER) {
+			int resid;
+
+			mtx_unlock(&nmp->nm_mtx);
 			if (!nfstcp_marker_readable(so)) {
 				/* Marker is not readable */
 				return;
@@ -801,9 +969,8 @@
 			auio.uio_iovcnt = 0;
 			mp = NULL;
 			rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
-			error =  so->so_proto->pr_usrreqs->pru_soreceive
-				(so, (struct sockaddr **)0,
-				 &auio, &mp, (struct mbuf **)0, &rcvflg);
+			error =  soreceive(so, (struct sockaddr **)0, &auio,
+			    &mp, (struct mbuf **)0, &rcvflg);
 			/*
 			 * We've already tested that the socket is readable. 2 cases 
 			 * here, we either read 0 bytes (client closed connection), 
@@ -820,7 +987,21 @@
 			}
 			if (mp == NULL)
 				panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
-			bcopy(mtod(mp, u_int32_t *), &len, sizeof(len));
+			/*
+			 * Sigh. We can't do the obvious thing here (which would
+			 * be to have soreceive copy the length from mbufs for us).
+			 * Calling uiomove() from the context of a socket callback
+			 * (even for kernel-kernel copies) leads to LORs (since
+			 * we hold network locks at this point).
+			 */
+			if ((resid = nfs_copy_len(mp, (char *)&len, 
+						  sizeof(u_int32_t)))) {
+				log(LOG_ERR, "%s (%d) from nfs server %s\n",
+				    "Bad RPC HDR length",
+				    (int)(sizeof(u_int32_t) - resid),
+				    nmp->nm_mountp->mnt_stat.f_mntfromname);
+				goto mark_reconnect;
+			}				
 			len = ntohl(len) & ~0x80000000;
 			m_freem(mp);
 			/*
@@ -834,14 +1015,20 @@
 				    nmp->nm_mountp->mnt_stat.f_mntfromname);
 				goto mark_reconnect;
 			}
+			mtx_lock(&nmp->nm_mtx);
 			nmp->nm_nfstcpstate.rpcresid = len;
 			nmp->nm_nfstcpstate.flags &= ~(NFS_TCP_EXPECT_RPCMARKER);
-		}
+			mtx_unlock(&nmp->nm_mtx);
+		} else
+			mtx_unlock(&nmp->nm_mtx);
+
 		/* 
 		 * Processed RPC marker or no RPC marker to process. 
 		 * Pull in and process data.
 		 */
+		mtx_lock(&nmp->nm_mtx);
 		if (nmp->nm_nfstcpstate.rpcresid > 0) {
+			mtx_unlock(&nmp->nm_mtx);
 			if (!nfstcp_readable(so, nmp->nm_nfstcpstate.rpcresid)) {
 				/* All data not readable */
 				return;
@@ -851,9 +1038,8 @@
 			auio.uio_iovcnt = 0;
 			mp = NULL;
 			rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
-			error =  so->so_proto->pr_usrreqs->pru_soreceive
-				(so, (struct sockaddr **)0,
-				 &auio, &mp, (struct mbuf **)0, &rcvflg);
+			error =  soreceive(so, (struct sockaddr **)0, &auio,
+			    &mp, (struct mbuf **)0, &rcvflg);
 			if (error || auio.uio_resid > 0) {
 				if (error && error != ECONNRESET) {
 					log(LOG_ERR, 
@@ -864,11 +1050,14 @@
 			}
 			if (mp == NULL)
 				panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
+			mtx_lock(&nmp->nm_mtx);
 			nmp->nm_nfstcpstate.rpcresid = 0;
 			nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
+			mtx_unlock(&nmp->nm_mtx);
 			/* We got the entire RPC reply. Match XIDs and wake up requestor */
 			nfs_clnt_match_xid(so, nmp, mp);
-		}
+		} else
+			mtx_unlock(&nmp->nm_mtx);
 	}
 
 mark_reconnect:
@@ -890,9 +1079,7 @@
 	auio.uio_resid = 1000000000;
 	do {
 		mp = control = NULL;
-		error = so->so_proto->pr_usrreqs->pru_soreceive(so,
-					NULL, &auio, &mp,
-					&control, &rcvflag);
+		error = soreceive(so, NULL, &auio, &mp, &control, &rcvflag);
 		if (control)
 			m_freem(control);
 		if (mp)
@@ -910,7 +1097,6 @@
  *	  by mrep or error
  * nb: always frees up mreq mbuf list
  */
-
 int
 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
     struct thread *td, struct ucred *cred, struct mbuf **mrp,
@@ -924,7 +1110,7 @@
 	struct mbuf *m, *md, *mheadend;
 	time_t waituntil;
 	caddr_t dpos;
-	int s, error = 0, mrest_len, auth_len, auth_type;
+	int error = 0, mrest_len, auth_len, auth_type;
 	struct timeval now;
 	u_int32_t *xidp;
 
@@ -937,11 +1123,12 @@
 	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
 		return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
 	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
-	rep->r_mrep = rep->r_md = NULL;
+	bzero(rep, sizeof(struct nfsreq));
 	rep->r_nmp = nmp;
 	rep->r_vp = vp;
 	rep->r_td = td;
 	rep->r_procnum = procnum;
+	mtx_init(&rep->r_mtx, "NFSrep lock", NULL, MTX_DEF);
 
 	getmicrouptime(&now);
 	rep->r_lastmsg = now.tv_sec -
@@ -976,7 +1163,7 @@
 	else
 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
 	rep->r_rtt = rep->r_rexmit = 0;
-	if (proct[procnum] > 0)
+	if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
 		rep->r_flags = R_TIMING;
 	else
 		rep->r_flags = 0;
@@ -990,7 +1177,6 @@
 	 * Chain request into list of outstanding requests. Be sure
 	 * to put it LAST so timer finds oldest requests first.
 	 */
-	s = splsoftclock();
 	mtx_lock(&nfs_reqq_mtx);
 	if (TAILQ_EMPTY(&nfs_reqq))
 		callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
@@ -1002,24 +1188,31 @@
 	 * send this one now but let timer do it. If not timing a request,
 	 * do it now.
 	 */
-	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
-		(nmp->nm_flag & NFSMNT_DUMBTIMR) ||
-		nmp->nm_sent < nmp->nm_cwnd)) {
-		splx(s);
-		error = nfs_sndlock(rep);
-		if (!error) {
-			m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
-			error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
-			nfs_sndunlock(rep);
-		}
-		mtx_lock(&nfs_reqq_mtx);
-		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
+	mtx_lock(&nmp->nm_mtx);
+	if (nmp->nm_so && 
+	    (((nmp->nm_sotype == SOCK_STREAM) && !(nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) || 
+	     (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) {
+		if (nmp->nm_sotype == SOCK_STREAM)
+			nmp->nm_nfstcpstate.sock_send_inprog++;
+		mtx_unlock(&nmp->nm_mtx);
+		m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
+		error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
+		mtx_lock(&nmp->nm_mtx);
+		mtx_lock(&rep->r_mtx);
+		/* 
+		 * nfs_timer() could've re-transmitted the request if we ended up
+		 * blocking on nfs_send() too long, so check for R_SENT here.
+		 */
+		if (!error && (rep->r_flags & (R_SENT | R_MUSTRESEND)) == 0) {
 			nmp->nm_sent += NFS_CWNDSCALE;
 			rep->r_flags |= R_SENT;
 		}
-		mtx_unlock(&nfs_reqq_mtx);
+		mtx_unlock(&rep->r_mtx);
+		if (nmp->nm_sotype == SOCK_STREAM)
+			nfs_wakup_reconnectors(rep->r_nmp);
+		mtx_unlock(&nmp->nm_mtx);
 	} else {
-		splx(s);
+		mtx_unlock(&nmp->nm_mtx);
 		rep->r_rtt = -1;
 	}
 
@@ -1030,31 +1223,54 @@
 		error = nfs_reply(rep);
 
 	/*
-	 * RPC done, unlink the request.
-	 */
-	s = splsoftclock();
+	 * nfs_timer() may be in the process of re-transmitting this request.
+	 * nfs_timer() drops the nfs_reqq_mtx before the pru_send() (to avoid LORs).
+	 * Wait till nfs_timer() completes the re-transmission. When the reply 
+	 * comes back, it will be discarded (since the req struct for it no longer 
+	 * exists).
+	 */
+wait_for_pinned_req:
+	mtx_lock(&rep->r_mtx);
+	while (rep->r_flags & R_PIN_REQ) {
+		msleep((caddr_t)&rep->r_flags, &rep->r_mtx, 
+		       (PZERO - 1), "nfsrxmt", 0);
+	}
+	mtx_unlock(&rep->r_mtx);
+
 	mtx_lock(&nfs_reqq_mtx);
+	/* Have to check for R_PIN_REQ after grabbing wlock again */
+	mtx_lock(&rep->r_mtx);
+	if (rep->r_flags & R_PIN_REQ) {
+		mtx_unlock(&rep->r_mtx);
+		mtx_unlock(&nfs_reqq_mtx);
+		goto wait_for_pinned_req;
+	} else
+		mtx_unlock(&rep->r_mtx);
+	/* RPC done (timer not active, request not pinned), unlink the request */
 	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
 	if (TAILQ_EMPTY(&nfs_reqq))
 		callout_stop(&nfs_callout);
+	mtx_unlock(&nfs_reqq_mtx);
+
 	/*
 	 * Decrement the outstanding request count.
 	 */
+	mtx_lock(&rep->r_mtx);
 	if (rep->r_flags & R_SENT) {
 		rep->r_flags &= ~R_SENT;	/* paranoia */
+		mtx_unlock(&rep->r_mtx);
+		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_sent -= NFS_CWNDSCALE;
-	}
-	mtx_unlock(&nfs_reqq_mtx);
-	splx(s);
+		mtx_unlock(&nmp->nm_mtx);
+	} else
+		mtx_unlock(&rep->r_mtx);
 
 	/*
 	 * If there was a successful reply and a tprintf msg.
 	 * tprintf a response.
 	 */
 	if (!error) {
-		mtx_lock(&Giant);
 		nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO);
-		mtx_unlock(&Giant);
 	}
 	mrep = rep->r_mrep;
 	md = rep->r_md;
@@ -1069,6 +1285,7 @@
 		if (rep->r_mrep != NULL)
 			m_freem(rep->r_mrep);
 		m_freem(rep->r_mreq);
+		mtx_destroy(&rep->r_mtx);
 		free((caddr_t)rep, M_NFSREQ);
 		return (error);
 	}
@@ -1087,6 +1304,7 @@
 			error = EACCES;
 		m_freem(mrep);
 		m_freem(rep->r_mreq);
+		mtx_destroy(&rep->r_mtx);
 		free((caddr_t)rep, M_NFSREQ);
 		return (error);
 	}
@@ -1109,12 +1327,14 @@
 				m_freem(mrep);
 				error = 0;
 				waituntil = time_second + nfs3_jukebox_delay;
-				while (time_second < waituntil)
-					(void) tsleep(&lbolt,
-						PSOCK, "nqnfstry", 0);
+				while (time_second < waituntil) {
+					(void) tsleep(&lbolt, PSOCK, "nqnfstry", 0);
+				}
+				mtx_lock(&nfs_xid_mtx);
 				if (++nfs_xid == 0)
 					nfs_xid++;
 				rep->r_xid = *xidp = txdr_unsigned(nfs_xid);
+				mtx_unlock(&nfs_xid_mtx);
 				goto tryagain;
 			}
 
@@ -1124,7 +1344,12 @@
 			 */
 			if (error == ESTALE)
 				cache_purge(vp);
-			if (nmp->nm_flag & NFSMNT_NFSV3) {
+			/*
+			 * Skip wcc data on NFS errors for now. NetApp filers return corrupt
+			 * postop attrs in the wcc data for NFS err EROFS. Not sure if they 
+			 * could return corrupt postop attrs for others errors.
+			 */
+			if ((nmp->nm_flag & NFSMNT_NFSV3) && !nfs_skip_wcc_data_onerr) {
 				*mrp = mrep;
 				*mdp = md;
 				*dposp = dpos;
@@ -1132,6 +1357,7 @@
 			} else
 				m_freem(mrep);
 			m_freem(rep->r_mreq);
+			mtx_destroy(&rep->r_mtx);
 			free((caddr_t)rep, M_NFSREQ);
 			return (error);
 		}
@@ -1140,6 +1366,7 @@
 		*mdp = md;
 		*dposp = dpos;
 		m_freem(rep->r_mreq);
+		mtx_destroy(&rep->r_mtx);
 		FREE((caddr_t)rep, M_NFSREQ);
 		return (0);
 	}
@@ -1147,6 +1374,7 @@
 	error = EPROTONOSUPPORT;
 nfsmout:
 	m_freem(rep->r_mreq);
+	mtx_destroy(&rep->r_mtx);
 	free((caddr_t)rep, M_NFSREQ);
 	return (error);
 }
@@ -1157,19 +1385,11 @@
  * To avoid retransmission attempts on STREAM sockets (in the future) make
  * sure to set the r_retry field to 0 (implies nm_retry == 0).
  * 
- * XXX - 
- * For now, since we don't register MPSAFE callouts for the NFS client -
- * softclock() acquires Giant before calling us. That prevents req entries
- * from being removed from the list (from nfs_request()). But we still 
- * acquire the nfs reqq mutex to make sure the state of individual req
- * entries is not modified from RPC reply handling (from socket callback)
- * while nfs_timer is walking the list of reqs.
  * The nfs reqq lock cannot be held while we do the pru_send() because of a
  * lock ordering violation. The NFS client socket callback acquires 
  * inp_lock->nfsreq mutex and pru_send acquires inp_lock. So we drop the 
- * reqq mutex (and reacquire it after the pru_send()). This won't work
- * when we move to fine grained locking for NFS. When we get to that point, 
- * a rewrite of nfs_timer() will be needed.
+ * reqq mutex (and reacquire it after the pru_send()). The req structure
+ * (for the rexmit) is prevented from being removed by the R_PIN_REQ flag.
  */
 void
 nfs_timer(void *arg)
@@ -1179,51 +1399,76 @@
 	struct socket *so;
 	struct nfsmount *nmp;
 	int timeo;
-	int s, error;
+	int error;
 	struct timeval now;
 
 	getmicrouptime(&now);
-	s = splnet();
-	mtx_lock(&Giant);	/* nfs_down -> tprintf */
 	mtx_lock(&nfs_reqq_mtx);
 	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
 		nmp = rep->r_nmp;
-		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
+		mtx_lock(&rep->r_mtx);
+		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
+			mtx_unlock(&rep->r_mtx);			
 			continue;
+		} else {
+			/*
+			 * Terminate request if force-unmount in progress.
+			 * Note that NFS could have vfs_busy'ed the mount,
+			 * causing the unmount to wait for the mnt_lock, making
+			 * this bit of logic necessary.
+			 */
+			if (rep->r_nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) {
+				nfs_softterm(rep);
+				mtx_unlock(&rep->r_mtx);
+				continue;
+			}				
+			mtx_unlock(&rep->r_mtx);			
+		}
 		if (nfs_sigintr(nmp, rep, rep->r_td))
 			continue;
+		mtx_lock(&nmp->nm_mtx);
+		mtx_lock(&rep->r_mtx);
 		if (nmp->nm_tprintf_initial_delay != 0 &&
 		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
 		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
 			rep->r_lastmsg = now.tv_sec;
+			/*
+			 * Pin down the request and drop locks for the acquisition
+			 * of Giant from tprintf() in nfs_down().
+			 */
+			rep->r_flags |= R_PIN_REQ;
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
+			mtx_unlock(&nfs_reqq_mtx);
 			nfs_down(rep, nmp, rep->r_td, "not responding",
-			    0, NFSSTA_TIMEO);
-#if 0
-			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
-				/* we're not yet completely mounted and */
-				/* we can't complete an RPC, so we fail */
-				nfsstats.rpctimeouts++;
-				nfs_softterm(rep);
-				continue;
-			}
-#endif
+				 0, NFSSTA_TIMEO);
+			mtx_lock(&nfs_reqq_mtx);
+			mtx_lock(&nmp->nm_mtx);
+			mtx_lock(&rep->r_mtx);
+			rep->r_flags &= ~R_PIN_REQ;
+			wakeup((caddr_t)&rep->r_flags);
 		}
 		if (rep->r_rtt >= 0) {
 			rep->r_rtt++;
 			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
 				timeo = nmp->nm_timeo;
 			else
-				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
+				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
 			if (nmp->nm_timeouts > 0)
 				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
-			if (rep->r_rtt <= timeo)
+			if (rep->r_rtt <= timeo) {
+				mtx_unlock(&rep->r_mtx);
+				mtx_unlock(&nmp->nm_mtx);
 				continue;
+			}
 			if (nmp->nm_timeouts < NFS_NBACKOFF)
 				nmp->nm_timeouts++;
 		}
 		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
 			nfsstats.rpctimeouts++;
 			nfs_softterm(rep);
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 			continue;
 		}
 		if (nmp->nm_sotype != SOCK_DGRAM) {
@@ -1236,12 +1481,17 @@
 			 * if necessary.
 			 */
 			rep->r_flags |= R_MUSTRESEND;
-			wakeup_nfsreq(rep);
+			wakeup((caddr_t)rep);
 			rep->r_rtt = 0;
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 			continue;
 		}
-		if ((so = nmp->nm_so) == NULL)
+		if ((so = nmp->nm_so) == NULL) {
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 			continue;
+		}
 		/*
 		 * If there is enough space and the window allows..
 		 *	Resend it
@@ -1249,48 +1499,66 @@
 		 */
 		rep->r_rtt = -1;
 		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
-		   ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
-		    (rep->r_flags & R_SENT) ||
-		    nmp->nm_sent < nmp->nm_cwnd) &&
-		   (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
-			mtx_unlock(&nfs_reqq_mtx);
-			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
-			    error = (*so->so_proto->pr_usrreqs->pru_send)
-				    (so, 0, m, NULL, NULL, curthread);
-			else
-			    error = (*so->so_proto->pr_usrreqs->pru_send)
-				    (so, 0, m, nmp->nm_nam, NULL, curthread);
-			mtx_lock(&nfs_reqq_mtx);
-			if (error) {
-				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
-					so->so_error = 0;
-				rep->r_flags |= R_RESENDERR;
-			} else {
+		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) || (rep->r_flags & R_SENT) ||
+		     nmp->nm_sent < nmp->nm_cwnd)) {
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
+			if ((m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
 				/*
-				 * Iff first send, start timing
-				 * else turn timing off, backoff timer
-				 * and divide congestion window by 2.
+				 * Mark the request to indicate that an XMIT is in 
+				 * progress to prevent the req structure being 
+				 * removed in nfs_request().
 				 */
-				rep->r_flags &= ~R_RESENDERR;
-				if (rep->r_flags & R_SENT) {
-					rep->r_flags &= ~R_TIMING;
-					if (++rep->r_rexmit > NFS_MAXREXMIT)
-						rep->r_rexmit = NFS_MAXREXMIT;
-					nmp->nm_cwnd >>= 1;
-					if (nmp->nm_cwnd < NFS_CWNDSCALE)
-						nmp->nm_cwnd = NFS_CWNDSCALE;
-					nfsstats.rpcretries++;
+				mtx_lock(&rep->r_mtx);
+				rep->r_flags |= R_PIN_REQ;
+				mtx_unlock(&rep->r_mtx);
+				mtx_unlock(&nfs_reqq_mtx);
+				if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
+					error = (*so->so_proto->pr_usrreqs->pru_send)
+						(so, 0, m, NULL, NULL, curthread);
+				else	
+					error = (*so->so_proto->pr_usrreqs->pru_send)
+						(so, 0, m, nmp->nm_nam, NULL, 
+						 curthread);
+				mtx_lock(&nfs_reqq_mtx);
+				mtx_lock(&nmp->nm_mtx);
+				mtx_lock(&rep->r_mtx);
+				rep->r_flags &= ~R_PIN_REQ;
+				wakeup((caddr_t)&rep->r_flags);
+				if (error) {
+					if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+						so->so_error = 0;
+					rep->r_flags |= R_RESENDERR;
 				} else {
-					rep->r_flags |= R_SENT;
-					nmp->nm_sent += NFS_CWNDSCALE;
+					/*
+					 * Iff first send, start timing
+					 * else turn timing off, backoff timer
+					 * and divide congestion window by 2.
+					 */
+					rep->r_flags &= ~R_RESENDERR;
+					if (rep->r_flags & R_SENT) {
+						rep->r_flags &= ~R_TIMING;
+						if (++rep->r_rexmit > NFS_MAXREXMIT)
+							rep->r_rexmit = NFS_MAXREXMIT;
+						nmp->nm_cwnd >>= 1;
+						if (nmp->nm_cwnd < NFS_CWNDSCALE)
+							nmp->nm_cwnd = NFS_CWNDSCALE;
+						nfsstats.rpcretries++;
+					} else {
+						rep->r_flags |= R_SENT;
+						nmp->nm_sent += NFS_CWNDSCALE;
+					}
+					rep->r_rtt = 0;
 				}
-				rep->r_rtt = 0;
+				mtx_unlock(&rep->r_mtx);
+				mtx_unlock(&nmp->nm_mtx);
 			}
+		} else {
+			mtx_unlock(&rep->r_mtx);
+			mtx_unlock(&nmp->nm_mtx);
 		}
 	}
 	mtx_unlock(&nfs_reqq_mtx);
-	mtx_unlock(&Giant);	/* nfs_down -> tprintf */
-	splx(s);
 	callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
 }
 
@@ -1304,28 +1572,28 @@
 	struct nfsmount *nmp;
 {
 	struct nfsreq *req;
-	int i, s;
+	int i;
 
-	s = splnet();
 	mtx_lock(&nfs_reqq_mtx);
 	TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
+		mtx_lock(&req->r_mtx);
 		if (nmp != req->r_nmp || req->r_mrep != NULL ||
-		    (req->r_flags & R_SOFTTERM))
+		    (req->r_flags & R_SOFTTERM)) {
+			mtx_unlock(&req->r_mtx);			
 			continue;
+		}
 		nfs_softterm(req);
+		mtx_unlock(&req->r_mtx);
 	}
 	mtx_unlock(&nfs_reqq_mtx);
-	splx(s);
 
 	for (i = 0; i < 30; i++) {
-		s = splnet();
 		mtx_lock(&nfs_reqq_mtx);
 		TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 			if (nmp == req->r_nmp)
 				break;
 		}
 		mtx_unlock(&nfs_reqq_mtx);
-		splx(s);
 		if (req == NULL)
 			return (0);
 		tsleep(&lbolt, PSOCK, "nfscancel", 0);
@@ -1342,7 +1610,7 @@
 static void
 nfs_softterm(struct nfsreq *rep)
 {
-
+	KASSERT(mtx_owned(&rep->r_mtx), ("NFS req lock not owned !"));
 	rep->r_flags |= R_SOFTTERM;
 	if (rep->r_flags & R_SENT) {
 		rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
@@ -1352,7 +1620,7 @@
 	 * Request terminated, wakeup the blocked process, so that we
 	 * can return EINTR back.
 	 */
-	wakeup_nfsreq(rep);
+	wakeup((caddr_t)rep);
 }
 
 /*
@@ -1449,28 +1717,6 @@
 }
 
 /*
- * NFS wrapper to tsleep(), that shoves a new p_sigmask and restores the
- * old one after tsleep() returns.
- */
-int
-nfs_tsleep(struct thread *td, void *ident, int priority, char *wmesg, int timo)
-{
-	sigset_t oldset;
-	int error;
-	struct proc *p;
-	
-	if ((priority & PCATCH) == 0)
-		return tsleep(ident, priority, wmesg, timo);
-	if (td == NULL)
-		td = curthread; /* XXX */
-	nfs_set_sigmask(td, &oldset);
-	error = tsleep(ident, priority, wmesg, timo);
-	nfs_restore_sigmask(td, &oldset);
-	p = td->td_proc;
-	return (error);
-}
-
-/*
  * Test for a termination condition pending on the process.
  * This is used for NFSMNT_INT mounts.
  */
@@ -1479,11 +1725,17 @@
 {
 	struct proc *p;
 	sigset_t tmpset;
-
+	
 	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
 		return nfs4_sigintr(nmp, rep, td);
-	if (rep && (rep->r_flags & R_SOFTTERM))
-		return (EIO);
+	if (rep) {
+		mtx_lock(&rep->r_mtx);
+		if (rep->r_flags & R_SOFTTERM) {
+			mtx_unlock(&rep->r_mtx);
+			return (EIO);
+		} else
+			mtx_unlock(&rep->r_mtx);
+	}
 	/* Terminate all requests while attempting a forced unmount. */
 	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
 		return (EIO);
@@ -1491,7 +1743,6 @@
 		return (0);
 	if (td == NULL)
 		return (0);
-
 	p = td->td_proc;
 	PROC_LOCK(p);
 	tmpset = p->p_siglist;
@@ -1500,12 +1751,12 @@
 	mtx_lock(&p->p_sigacts->ps_mtx);
 	SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
 	mtx_unlock(&p->p_sigacts->ps_mtx);
-	if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) && nfs_sig_pending(tmpset)) {
+	if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
+	    && nfs_sig_pending(tmpset)) {
 		PROC_UNLOCK(p);
 		return (EINTR);
 	}
 	PROC_UNLOCK(p);
-
 	return (0);
 }
 
@@ -1516,7 +1767,7 @@
  * in progress when a reconnect is necessary.
  */
 int
-nfs_sndlock(struct nfsreq *rep)
+nfs_connect_lock(struct nfsreq *rep)
 {
 	int *statep = &rep->r_nmp->nm_state;
 	struct thread *td;
@@ -1527,11 +1778,12 @@
 		slpflag = PCATCH;
 	while (*statep & NFSSTA_SNDLOCK) {
 		error = nfs_sigintr(rep->r_nmp, rep, td);
-		if (error)
+		if (error) {
 			return (error);
+		}
 		*statep |= NFSSTA_WANTSND;
-		(void) tsleep(statep, slpflag | (PZERO - 1),
-			"nfsndlck", slptimeo);
+		(void) msleep(statep, &rep->r_nmp->nm_mtx,
+			      slpflag | (PZERO - 1), "nfsndlck", slptimeo);
 		if (slpflag == PCATCH) {
 			slpflag = 0;
 			slptimeo = 2 * hz;
@@ -1545,7 +1797,7 @@
  * Unlock the stream socket for others.
  */
 void
-nfs_sndunlock(struct nfsreq *rep)
+nfs_connect_unlock(struct nfsreq *rep)
 {
 	int *statep = &rep->r_nmp->nm_state;
 
@@ -1622,8 +1874,6 @@
 {
 	struct proc *p;
 
-	GIANT_REQUIRED;	/* tprintf */
-
 	p = td ? td->td_proc : NULL;
 	if (error) {
 		tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
@@ -1642,25 +1892,31 @@
 	const char *msg;
 	int error, flags;
 {
-
-	GIANT_REQUIRED;	/* nfs_msg */
-
 	if (nmp == NULL)
 		return;
+	mtx_lock(&nmp->nm_mtx);
 	if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
+		nmp->nm_state |= NFSSTA_TIMEO;
+		mtx_unlock(&nmp->nm_mtx);
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 0);
-		nmp->nm_state |= NFSSTA_TIMEO;
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 #ifdef NFSSTA_LOCKTIMEO
+	mtx_lock(&nmp->nm_mtx);
 	if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
+		nmp->nm_state |= NFSSTA_LOCKTIMEO;
+		mtx_unlock(&nmp->nm_mtx);
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 0);
-		nmp->nm_state |= NFSSTA_LOCKTIMEO;
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 #endif
-	if (rep)
+	if (rep != NULL) {
+		mtx_lock(&rep->r_mtx);
 		rep->r_flags |= R_TPRINTFMSG;
+		mtx_unlock(&rep->r_mtx);
+	}
 	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
 }
 
@@ -1672,24 +1928,32 @@
 	const char *msg;
 	int flags;
 {
-
-	GIANT_REQUIRED;	/* nfs_msg */
-
-	if (nmp == NULL)
+	if (nmp == NULL || rep == NULL)
 		return;
-	if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0)
+	mtx_lock(&rep->r_mtx);
+	if ((rep->r_flags & R_TPRINTFMSG) != 0) {
+		mtx_unlock(&rep->r_mtx);
 		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
+	} else
+		mtx_unlock(&rep->r_mtx);
+
+	mtx_lock(&nmp->nm_mtx);
 	if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
 		nmp->nm_state &= ~NFSSTA_TIMEO;
+		mtx_unlock(&nmp->nm_mtx);
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 1);
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
+	
 #ifdef NFSSTA_LOCKTIMEO
+	mtx_lock(&nmp->nm_mtx);
 	if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
 		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
+		mtx_unlock(&nmp->nm_mtx);
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 1);
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 #endif
 }
-
Index: nfs_lock.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_lock.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/nfsclient/nfs_lock.c -L sys/nfsclient/nfs_lock.c -u -r1.1.1.2 -r1.2
--- sys/nfsclient/nfs_lock.c
+++ sys/nfsclient/nfs_lock.c
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_lock.c,v 1.40.2.2 2006/02/14 00:06:32 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_lock.c,v 1.45 2007/04/21 18:11:18 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -43,6 +43,7 @@
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/socket.h>
@@ -64,8 +65,8 @@
 
 extern void (*nlminfo_release_p)(struct proc *p);
 
-MALLOC_DEFINE(M_NFSLOCK, "NFS lock", "NFS lock request");
-MALLOC_DEFINE(M_NLMINFO, "nlminfo", "NFS lock process structure");
+MALLOC_DEFINE(M_NFSLOCK, "nfsclient_lock", "NFS lock request");
+MALLOC_DEFINE(M_NLMINFO, "nfsclient_nlminfo", "NFS lock process structure");
 
 static int nfslockdans(struct thread *td, struct lockd_ans *ansp);
 static void nlminfo_release(struct proc *p);
@@ -85,6 +86,10 @@
 {
 	int error;
 
+	error = priv_check(td, PRIV_NFS_LOCKD);
+	if (error)
+		return (error);
+
 	mtx_lock(&nfslock_mtx);
 	if (!nfslock_isopen) {
 		error = 0;
@@ -290,7 +295,7 @@
 			return (error);
 
 		/*
-		 * retry after 20 seconds if we haven't gotten a responce yet.
+		 * Retry after 20 seconds if we haven't gotten a response yet.
 		 * This number was picked out of thin air... but is longer
 		 * then even a reasonably loaded system should take (at least
 		 * on a local network).  XXX Probably should use a back-off
@@ -339,17 +344,6 @@
 nfslockdans(struct thread *td, struct lockd_ans *ansp)
 {
 	struct proc *targetp;
-	int error;
-
-	/* Let root, or someone who once was root (lockd generally
-	 * switches to the daemon uid once it is done setting up) make
-	 * this call.
-	 *
-	 * XXX This authorization check is probably not right.
-	 */
-	if ((error = suser(td)) != 0 &&
-	    td->td_ucred->cr_svuid != 0)
-		return (error);
 
 	/* the version should match, or we're out of sync */
 	if (ansp->la_vers != LOCKD_ANS_VERSION)
Index: krpc_subr.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/krpc_subr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/krpc_subr.c -L sys/nfsclient/krpc_subr.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/krpc_subr.c
+++ sys/nfsclient/krpc_subr.c
@@ -43,7 +43,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/krpc_subr.c,v 1.29 2005/03/16 08:13:08 jmg Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/krpc_subr.c,v 1.30 2007/08/06 14:26:02 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -215,8 +215,6 @@
 	nam = mhead = NULL;
 	from = NULL;
 
-	NET_ASSERT_GIANT();
-
 	/*
 	 * Create socket and set its recieve timeout.
 	 */
Index: nfs.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs.h -L sys/nfsclient/nfs.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs.h
+++ sys/nfsclient/nfs.h
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs.h	8.4 (Berkeley) 5/1/95
- * $FreeBSD: src/sys/nfsclient/nfs.h,v 1.90 2005/01/24 12:31:06 phk Exp $
+ * $FreeBSD: src/sys/nfsclient/nfs.h,v 1.98.2.1 2007/10/12 19:18:46 mohans Exp $
  */
 
 #ifndef _NFSCLIENT_NFS_H_
@@ -53,7 +53,8 @@
 #define	NFS_MAXTIMEO	(60 * NFS_HZ)	/* Max timeout to backoff to */
 #define	NFS_MINIDEMTIMEO (5 * NFS_HZ)	/* Min timeout for non-idempotent ops*/
 #define	NFS_MAXREXMIT	100		/* Stop counting after this many */
-#define	NFS_RETRANS	10		/* Num of retrans for soft mounts */
+#define	NFS_RETRANS	10		/* Num of retrans for UDP soft mounts */
+#define	NFS_RETRANS_TCP	2		/* Num of retrans for TCP soft mounts */
 #define	NFS_MAXGRPS	16		/* Max. size of groups list */
 #ifndef NFS_MINATTRTIMO
 #define	NFS_MINATTRTIMO 3		/* VREG attrib cache timeout in sec */
@@ -85,6 +86,7 @@
 #define NFS_CMPFH(n, f, s) \
 	((n)->n_fhsize == (s) && !bcmp((caddr_t)(n)->n_fhp, (caddr_t)(f), (s)))
 #define NFS_ISV3(v)	(VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV3)
+#define NFS_ISV4(v)	(VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV4)
 
 #define NFSSTA_HASWRITEVERF	0x00040000  /* Has write verifier for V3 */
 #define NFSSTA_GOTFSINFO	0x00100000  /* Got the V3 fsinfo */
@@ -131,6 +133,7 @@
 
 extern struct callout nfs_callout;
 extern struct nfsstats nfsstats;
+extern struct mtx nfs_iod_mtx;
 
 extern int nfs_numasync;
 extern unsigned int nfs_iodmax;
@@ -144,18 +147,13 @@
 
 extern int nfsv3_procid[NFS_NPROCS];
 
-struct uio;
-struct buf;
-struct vattr;
-struct nameidata;
-
 /*
  * Socket errors ignored for connectionless sockets??
  * For now, ignore them all
  */
 #define	NFSIGNORE_SOERROR(s, e) \
 		((e) != EINTR && (e) != EIO && \
-		 (e) != ERESTART && (e) != EWOULDBLOCK && \
+		(e) != ERESTART && (e) != EWOULDBLOCK && \
 		((s) & PR_CONNREQUIRED) == 0)
 
 /*
@@ -178,6 +176,7 @@
 	int		r_rtt;		/* RTT for rpc */
 	int		r_lastmsg;	/* last tprintf */
 	struct thread	*r_td;		/* Proc that did I/O system call */
+	struct mtx	r_mtx;		/* Protects nfsreq fields */
 };
 
 /*
@@ -194,19 +193,27 @@
 #define	R_TPRINTFMSG	0x20		/* Did a tprintf msg. */
 #define	R_MUSTRESEND	0x40		/* Must resend request */
 #define	R_GETONEREP	0x80		/* Probe for one reply only */
+#define	R_PIN_REQ	0x100		/* Pin request down (rexmit in prog or other) */
+
+struct buf;
+struct socket;
+struct uio;
+struct vattr;
 
 /*
  * Pointers to ops that differ from v3 to v4
  */
 struct nfs_rpcops {
-	int	(*nr_readrpc)(struct vnode *vp, struct uio *uiop, struct ucred *cred);
-	int	(*nr_writerpc)(struct vnode *vp, struct uio *uiop, struct ucred *cred,
-			       int *iomode, int *must_commit);
+	int	(*nr_readrpc)(struct vnode *vp, struct uio *uiop,
+		    struct ucred *cred);
+	int	(*nr_writerpc)(struct vnode *vp, struct uio *uiop,
+		    struct ucred *cred, int *iomode, int *must_commit);
 	int	(*nr_writebp)(struct buf *bp, int force, struct thread *td);
-	int	(*nr_readlinkrpc)(struct vnode *vp, struct uio *uiop, struct ucred *cred);
+	int	(*nr_readlinkrpc)(struct vnode *vp, struct uio *uiop,
+		    struct ucred *cred);
 	void	(*nr_invaldir)(struct vnode *vp);
 	int	(*nr_commit)(struct vnode *vp, u_quad_t offset, int cnt,
-			     struct ucred *cred, struct thread *td);
+		    struct ucred *cred, struct thread *td);
 };
 
 /*
@@ -254,6 +261,31 @@
 
 #endif
 
+/*
+ * On fast networks, the estimator will try to reduce the
+ * timeout lower than the latency of the server's disks,
+ * which results in too many timeouts, so cap the lower
+ * bound.
+ */
+#define NFS_MINRTO	(NFS_HZ >> 2)
+
+/*
+ * Keep the RTO from increasing to unreasonably large values
+ * when a server is not responding.
+ */
+#define NFS_MAXRTO	(20 * NFS_HZ)
+
+enum nfs_rto_timer_t {
+	NFS_DEFAULT_TIMER,
+	NFS_GETATTR_TIMER,
+	NFS_LOOKUP_TIMER,
+	NFS_READ_TIMER,
+	NFS_WRITE_TIMER,
+};
+#define NFS_MAX_TIMER	(NFS_WRITE_TIMER)
+
+#define NFS_INITRTT	(NFS_HZ << 3)
+
 vfs_init_t nfs_init;
 vfs_uninit_t nfs_uninit;
 int	nfs_mountroot(struct mount *mp, struct thread *td);
@@ -261,8 +293,8 @@
 #ifndef NFS4_USE_RPCCLNT
 int	nfs_send(struct socket *, struct sockaddr *, struct mbuf *,
 	    struct nfsreq *);
-int	nfs_sndlock(struct nfsreq *);
-void	nfs_sndunlock(struct nfsreq *);
+int	nfs_connect_lock(struct nfsreq *);
+void	nfs_connect_unlock(struct nfsreq *);
 #endif /* ! NFS4_USE_RPCCLNT */
 
 int	nfs_vinvalbuf(struct vnode *, int, struct thread *, int);
@@ -275,8 +307,8 @@
 int	nfs_nfsiodnew(void);
 int	nfs_asyncio(struct nfsmount *, struct buf *, struct ucred *, struct thread *);
 int	nfs_doio(struct vnode *, struct buf *, struct ucred *, struct thread *);
-void    nfs_doio_directwrite (struct buf *);
-void    nfs_up(struct nfsreq *, struct nfsmount *, struct thread *,
+void	nfs_doio_directwrite (struct buf *);
+void	nfs_up(struct nfsreq *, struct nfsmount *, struct thread *,
 	    const char *, int);
 void	nfs_down(struct nfsreq *, struct nfsmount *, struct thread *,
 	    const char *, int, int);
@@ -297,6 +329,7 @@
 void	nfs_disconnect(struct nfsmount *);
 void	nfs_safedisconnect(struct nfsmount *);
 int	nfs_getattrcache(struct vnode *, struct vattr *);
+int	nfs_iosize(struct nfsmount *nmp);
 int	nfsm_strtmbuf(struct mbuf **, char **, const char *, long);
 int	nfs_bioread(struct vnode *, struct uio *, int, struct ucred *);
 int	nfsm_uiotombuf(struct uio *, struct mbuf **, int, caddr_t *);
@@ -307,12 +340,10 @@
 int	nfs_meta_setsize (struct vnode *, struct ucred *,
 	    struct thread *, u_quad_t);
 
-void    nfs_set_sigmask __P((struct thread *td, sigset_t *oldset));
-void    nfs_restore_sigmask __P((struct thread *td, sigset_t *set));
-int     nfs_tsleep __P((struct thread *td, void *ident, int priority, char *wmesg, 
-			int timo));
-int     nfs_msleep __P((struct thread *td, void *ident, struct mtx *mtx, int priority, 
-			char *wmesg, int timo));
+void	nfs_set_sigmask(struct thread *td, sigset_t *oldset);
+void	nfs_restore_sigmask(struct thread *td, sigset_t *set);
+int	nfs_msleep(struct thread *td, void *ident, struct mtx *mtx,
+	    int priority, char *wmesg, int timo);
 
 #endif	/* _KERNEL */
 
Index: nfsmount.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsmount.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsmount.h -L sys/nfsclient/nfsmount.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsmount.h
+++ sys/nfsclient/nfsmount.h
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfsmount.h	8.3 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/nfsclient/nfsmount.h,v 1.30 2005/06/10 23:50:40 green Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsmount.h,v 1.32.2.1 2007/10/12 19:18:46 mohans Exp $
  */
 
 #ifndef _NFSCLIENT_NFSMOUNT_H_
@@ -40,8 +40,9 @@
  	int rpcresid;
 #define NFS_TCP_EXPECT_RPCMARKER 	0x0001 /* Expect to see a RPC/TCP marker next */
 #define NFS_TCP_FORCE_RECONNECT 	0x0002 /* Force a TCP reconnect */
+#define NFS_TCP_WAIT_WRITE_DRAIN 	0x0004 /* Waiting for socket writers to finish */
  	int flags;
-	struct mtx mtx;
+	int sock_send_inprog;
 };
 
 /*
@@ -50,6 +51,7 @@
  * Holds NFS specific information for mount.
  */
 struct	nfsmount {
+	struct mtx	nm_mtx;
 	int	nm_flag;		/* Flags for soft/hard... */
 	int	nm_state;		/* Internal state flags */
 	struct	mount *nm_mountp;	/* Vfs structure for this filesystem */
@@ -64,8 +66,8 @@
 	struct	sockaddr *nm_nam;	/* Addr of server */
 	int	nm_timeo;		/* Init timer for NFSMNT_DUMBTIMR */
 	int	nm_retry;		/* Max retries */
-	int	nm_srtt[4];		/* Timers for rpcs */
-	int	nm_sdrtt[4];
+	int	nm_srtt[NFS_MAX_TIMER],	/* RTT Timers for rpcs */
+		nm_sdrtt[NFS_MAX_TIMER];
 	int	nm_sent;		/* Request send count */
 	int	nm_cwnd;		/* Request send window */
 	int	nm_timeouts;		/* Request timeouts */
Index: nfs_diskless.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_diskless.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_diskless.c -L sys/nfsclient/nfs_diskless.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_diskless.c
+++ sys/nfsclient/nfs_diskless.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_diskless.c,v 1.11.2.2 2006/03/20 15:45:14 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_diskless.c,v 1.17 2006/12/06 02:15:25 sam Exp $");
 
 #include "opt_bootp.h"
 
@@ -60,10 +60,34 @@
 static int hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa);
 static int decode_nfshandle(char *ev, u_char *fh);
 
-static void
-nfs_parse_options(const char *envopts, struct nfs_diskless *nd)
+/*
+ * Validate/sanity check a rsize/wsize parameter.
+ */
+static int
+checkrwsize(unsigned long v, const char *name)
+{
+	/*
+	 * 32K is used as an upper bound because most servers
+	 * limit block size to satisfy IPv4's limit of
+	 * 64K/reassembled packet.  The lower bound is pretty
+	 * much arbitrary.
+	 */
+	if (!(4 <= v && v <= 32*1024)) {
+		printf("nfs_parse_options: invalid %s %lu ignored\n", name, v);
+		return 0;
+	} else
+		return 1;
+}
+
+/*
+ * Parse mount options and apply them to the supplied
+ * nfs_diskless state.  Used also by bootp/dhcp support.
+ */
+void
+nfs_parse_options(const char *envopts, struct nfs_args *nd)
 {
 	char *opts, *o, *otmp;
+	unsigned long v;
 
 	opts = strdup(envopts, M_TEMP);
 	otmp = opts;
@@ -71,15 +95,37 @@
 		if (*o == '\0')
 			; /* Skip empty options. */
 		else if (strcmp(o, "soft") == 0)
-			nd->root_args.flags |= NFSMNT_SOFT;
+			nd->flags |= NFSMNT_SOFT;
 		else if (strcmp(o, "intr") == 0)
-			nd->root_args.flags |= NFSMNT_INT;
+			nd->flags |= NFSMNT_INT;
 		else if (strcmp(o, "conn") == 0)
-			nd->root_args.flags |= NFSMNT_NOCONN;
+			nd->flags |= NFSMNT_NOCONN;
 		else if (strcmp(o, "nolockd") == 0)
-			nd->root_args.flags |= NFSMNT_NOLOCKD;
-		else
-			printf("nfs_diskless: unknown option: %s\n", o);
+			nd->flags |= NFSMNT_NOLOCKD;
+		else if (strcmp(o, "nfsv2") == 0)
+			nd->flags &= ~(NFSMNT_NFSV3 | NFSMNT_NFSV4);
+		else if (strcmp(o, "nfsv3") == 0) {
+			nd->flags &= ~NFSMNT_NFSV4;
+			nd->flags |= NFSMNT_NFSV3;
+		} else if (strcmp(o, "tcp") == 0)
+			nd->sotype = SOCK_STREAM;
+		else if (strcmp(o, "udp") == 0)
+			nd->sotype = SOCK_DGRAM;
+		else if (strncmp(o, "rsize=", 6) == 0) {
+			v = strtoul(o+6, NULL, 10);
+			if (checkrwsize(v, "rsize")) {
+				nd->rsize = (int) v;
+				nd->flags |= NFSMNT_RSIZE;
+			}
+		} else if (strncmp(o, "wsize=", 6) == 0) {
+			v = strtoul(o+6, NULL, 10);
+			if (checkrwsize(v, "wsize")) {
+				nd->wsize = (int) v;
+				nd->flags |= NFSMNT_WSIZE;
+			}
+		} else
+			printf("%s: skipping unknown option \"%s\"\n",
+			    __func__, o);
 	}
 	free(opts, M_TEMP);
 }
@@ -132,12 +178,12 @@
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
-			if ((ifa->ifa_addr->sa_family == AF_LINK) &&
-			    (sdl = ((struct sockaddr_dl *)ifa->ifa_addr))) {
+			if (ifa->ifa_addr->sa_family == AF_LINK) {
+				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				if ((sdl->sdl_type == ourdl.sdl_type) &&
 				    (sdl->sdl_alen == ourdl.sdl_alen) &&
-				    !bcmp(sdl->sdl_data + sdl->sdl_nlen,
-					  ourdl.sdl_data + ourdl.sdl_nlen, 
+				    !bcmp(LLADDR(sdl),
+					  LLADDR(&ourdl),
 					  sdl->sdl_alen)) {
 				    IFNET_RUNLOCK();
 				    goto match_done;
@@ -174,7 +220,18 @@
 		freeenv(cp);
 	}
 	if ((cp = getenv("boot.nfsroot.options")) != NULL) {
-		nfs_parse_options(cp, nd);
+		struct nfs_args args;
+
+		/* XXX yech, convert between old and current arg format */
+		args.flags = nd->root_args.flags;
+		args.sotype = nd->root_args.sotype;
+		args.rsize = nd->root_args.rsize;
+		args.wsize = nd->root_args.wsize;
+		nfs_parse_options(cp, &args);
+		nd->root_args.flags = args.flags;
+		nd->root_args.sotype = args.sotype;
+		nd->root_args.rsize = args.rsize;
+		nd->root_args.wsize = args.wsize;
 		freeenv(cp);
 	}
 
Index: bootp_subr.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/bootp_subr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/bootp_subr.c -L sys/nfsclient/bootp_subr.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/bootp_subr.c
+++ sys/nfsclient/bootp_subr.c
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/bootp_subr.c,v 1.64 2005/04/26 20:45:29 des Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/bootp_subr.c,v 1.70 2007/08/06 14:26:02 rwatson Exp $");
 
 #include "opt_bootp.h"
 
@@ -220,7 +220,6 @@
 		    const struct in_addr *siaddr);
 static int	getdec(char **ptr);
 static int	getip(char **ptr, struct in_addr *ip);
-static char	*substr(char *a, char *b);
 static void	mountopts(struct nfs_args *args, char *p);
 static int	xdr_opaque_decode(struct mbuf **ptr, u_char *buf, int len);
 static int	xdr_int_decode(struct mbuf **ptr, int *iptr);
@@ -591,8 +590,6 @@
 	int retry;
 	const char *s;
 
-	NET_ASSERT_GIANT();
-
 	/*
 	 * Create socket and set its recieve timeout.
 	 */
@@ -760,7 +757,7 @@
 			}
 
 			/* XXX: Is this needed ? */
-			tsleep(&error, PZERO + 8, "bootpw", 10);
+			pause("bootpw", hz/10);
 
 			/* Set netmask to 255.0.0.0 */
 
@@ -983,8 +980,6 @@
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
-	NET_ASSERT_GIANT();
-
 	error = socreate(AF_INET, &ifctx->so, SOCK_DGRAM, 0, td->td_ucred, td);
 	if (error != 0)
 		panic("nfs_boot: socreate, error=%d", error);
@@ -1047,13 +1042,12 @@
 	/* Get HW address */
 
 	sdl = NULL;
-	for (ifa = TAILQ_FIRST(&ifctx->ifp->if_addrhead);
-	     ifa != NULL;
-	     ifa = TAILQ_NEXT(ifa, ifa_link))
-		if (ifa->ifa_addr->sa_family == AF_LINK &&
-		    (sdl = ((struct sockaddr_dl *) ifa->ifa_addr)) != NULL &&
-		    sdl->sdl_type == IFT_ETHER)
-			break;
+	TAILQ_FOREACH(ifa, &ifctx->ifp->if_addrhead, ifa_link)
+		if (ifa->ifa_addr->sa_family == AF_LINK) {
+			sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+			if (sdl->sdl_type == IFT_ETHER)
+				break;
+		}
 
 	if (sdl == NULL)
 		panic("bootpc: Unable to find HW address for %s",
@@ -1235,51 +1229,16 @@
 	return ret;
 }
 
-static char *
-substr(char *a, char *b)
-{
-	char *loc1;
-	char *loc2;
-
-        while (*a != '\0') {
-                loc1 = a;
-                loc2 = b;
-                while (*loc1 == *loc2++) {
-                        if (*loc1 == '\0')
-				return 0;
-                        loc1++;
-                        if (*loc2 == '\0')
-				return loc1;
-                }
-		a++;
-        }
-        return 0;
-}
-
 static void
 mountopts(struct nfs_args *args, char *p)
 {
-	char *tmp;
-
 	args->version = NFS_ARGSVERSION;
 	args->rsize = 8192;
 	args->wsize = 8192;
 	args->flags = NFSMNT_RSIZE | NFSMNT_WSIZE | NFSMNT_RESVPORT;
 	args->sotype = SOCK_DGRAM;
-	if (p == NULL)
-		return;
-	if ((tmp = (char *)substr(p, "rsize=")))
-		args->rsize = getdec(&tmp);
-	if ((tmp = (char *)substr(p, "wsize=")))
-		args->wsize = getdec(&tmp);
-	if ((tmp = (char *)substr(p, "intr")))
-		args->flags |= NFSMNT_INT;
-	if ((tmp = (char *)substr(p, "soft")))
-		args->flags |= NFSMNT_SOFT;
-	if ((tmp = (char *)substr(p, "noconn")))
-		args->flags |= NFSMNT_NOCONN;
-	if ((tmp = (char *)substr(p, "tcp")))
-		args->sotype = SOCK_STREAM;
+	if (p != NULL)
+		nfs_parse_options(p, args);
 }
 
 static int
@@ -1815,6 +1774,7 @@
 	int authcount;
 	int authver;
 
+	/* XXX honor v2/v3 flags in args->flags? */
 #ifdef BOOTP_NFSV3
 	/* First try NFS v3 */
 	/* Get port number for MOUNTD. */
Index: nfs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_vnops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_vnops.c -L sys/nfsclient/nfs_vnops.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_vnops.c
+++ sys/nfsclient/nfs_vnops.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.258.2.4.2.2 2006/04/18 14:15:50 jon Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.276.2.1 2007/11/27 12:20:58 rwatson Exp $");
 
 /*
  * vnode op calls for Sun NFS version 2 and 3
@@ -192,6 +192,7 @@
 /*
  * Global variables
  */
+struct mtx 	nfs_iod_mtx;
 struct proc	*nfs_iodwant[NFS_MAXASYNCDAEMON];
 struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
 int		 nfs_numasync = 0;
@@ -241,6 +242,23 @@
 #define	NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY		\
 			 | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE	\
 			 | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
+
+/*
+ * SMP Locking Note :
+ * The list of locks after the description of the lock is the ordering
+ * of other locks acquired with the lock held.
+ * np->n_mtx : Protects the fields in the nfsnode.
+       VM Object Lock
+       VI_MTX (acquired indirectly)
+ * nmp->nm_mtx : Protects the fields in the nfsmount.
+       rep->r_mtx
+ * nfs_iod_mtx : Global lock, protects shared nfsiod state.
+ * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
+       nmp->nm_mtx
+       rep->r_mtx
+ * rep->r_mtx : Protects the fields in an nfsreq.
+ */
+
 static int
 nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred)
@@ -266,9 +284,11 @@
 	if (!error) {
 		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 		rmode = fxdr_unsigned(u_int32_t, *tl);
+		mtx_lock(&np->n_mtx);
 		np->n_mode = rmode;
 		np->n_modeuid = cred->cr_uid;
 		np->n_modestamp = time_second;
+		mtx_unlock(&np->n_mtx);
 	}
 	m_freem(mrep);
 nfsmout:
@@ -343,6 +363,7 @@
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?
 		 */
+		mtx_lock(&np->n_mtx);
 		if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) &&
 		    (ap->a_cred->cr_uid == np->n_modeuid) &&
 		    ((np->n_mode & mode) == mode)) {
@@ -352,18 +373,21 @@
 			 * Either a no, or a don't know.  Go to the wire.
 			 */
 			nfsstats.accesscache_misses++;
+			mtx_unlock(&np->n_mtx);
 		        error = nfs3_access_otw(vp, wmode, ap->a_td,ap->a_cred);
+			mtx_lock(&np->n_mtx);
 			if (!error) {
 				if ((np->n_mode & mode) != mode) {
 					error = EACCES;
 				}
 			}
 		}
+		mtx_unlock(&np->n_mtx);
 		return (error);
 	} else {
-		if ((error = nfsspec_access(ap)) != 0)
+		if ((error = nfsspec_access(ap)) != 0) {
 			return (error);
-
+		}
 		/*
 		 * Attempt to prevent a mapped root from accessing a file
 		 * which it shouldn't.  We try to read a byte from the file
@@ -371,12 +395,14 @@
 		 * After calling nfsspec_access, we should have the correct
 		 * file size cached.
 		 */
+		mtx_lock(&np->n_mtx);
 		if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD)
 		    && VTONFS(vp)->n_size > 0) {
 			struct iovec aiov;
 			struct uio auio;
 			char buf[1];
 
+			mtx_unlock(&np->n_mtx);
 			aiov.iov_base = buf;
 			aiov.iov_len = 1;
 			auio.uio_iov = &aiov;
@@ -400,11 +426,14 @@
 				error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
 			else
 				error = EACCES;
-		}
+		} else
+			mtx_unlock(&np->n_mtx);
 		return (error);
 	}
 }
 
+int nfs_otw_getattr_avoid = 0;
+
 /*
  * nfs open vnode op
  * Check to see if the type is ok
@@ -428,7 +457,9 @@
 	/*
 	 * Get a valid lease. If cached data is stale, flush it.
 	 */
+	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
+		mtx_unlock(&np->n_mtx);			
 		error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		if (error == EINTR || error == EIO)
 			return (error);
@@ -438,20 +469,35 @@
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error)
 			return (error);
+		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
+		mtx_unlock(&np->n_mtx);
 	} else {
-		np->n_attrstamp = 0;
+		struct thread *td = curthread;
+
+		if (np->n_ac_ts_syscalls != td->td_syscalls ||
+		    np->n_ac_ts_tid != td->td_tid || 
+		    td->td_proc == NULL ||
+		    np->n_ac_ts_pid != td->td_proc->p_pid) {
+			np->n_attrstamp = 0;
+		}
+		mtx_unlock(&np->n_mtx);						
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error)
 			return (error);
+		mtx_lock(&np->n_mtx);
 		if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			if (vp->v_type == VDIR)
 				np->n_direofoffset = 0;
+			mtx_unlock(&np->n_mtx);
 			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
-			if (error == EINTR || error == EIO)
+			if (error == EINTR || error == EIO) {
 				return (error);
+			}
+			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 		}
+		mtx_unlock(&np->n_mtx);
 	}
 	/*
 	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
@@ -461,12 +507,13 @@
 			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error)
 				return (error);
+			mtx_lock(&np->n_mtx);
 			np->n_flag |= NNONCACHE;
+			mtx_unlock(&np->n_mtx);
 		}
 		np->n_directio_opens++;
 	}
-	np->ra_expect_lbn = 0;
-	vnode_create_vobject_off(vp, vattr.va_size, ap->a_td);
+	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
 
@@ -519,7 +566,9 @@
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_UNLOCK(vp->v_object);
 	    }
+	    mtx_lock(&np->n_mtx);
 	    if (np->n_flag & NMODIFIED) {
+		mtx_unlock(&np->n_mtx);
 		if (NFS_ISV3(vp)) {
 		    /*
 		     * Under NFSv3 we have dirty buffers to dispose of.  We
@@ -539,6 +588,7 @@
 		    /* np->n_flag &= ~NMODIFIED; */
 		} else
 		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
+		mtx_lock(&np->n_mtx);
 	    }
  	    /* 
  	     * Invalidate the attribute cache in all cases.
@@ -551,13 +601,20 @@
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	    }
+	    mtx_unlock(&np->n_mtx);
 	}
+	if (nfs_directio_enable)
+		KASSERT((np->n_directio_asyncwr == 0),
+			("nfs_close: dirty unflushed (%d) directio buffers\n",
+			 np->n_directio_asyncwr));
 	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
+		mtx_lock(&np->n_mtx);
 		KASSERT((np->n_directio_opens > 0), 
-			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));		
+			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 		np->n_directio_opens--;
 		if (np->n_directio_opens == 0)
 			np->n_flag &= ~NNONCACHE;
+		mtx_unlock(&np->n_mtx);
 	}
 	return (error);
 }
@@ -578,21 +635,21 @@
 	/*
 	 * Update local times for special files.
 	 */
+	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
+	mtx_unlock(&np->n_mtx);
 	/*
 	 * First look in the cache.
 	 */
 	if (nfs_getattrcache(vp, ap->a_vap) == 0)
-		return (0);
-
+		goto nfsmout;
 	if (v3 && nfsaccess_cache_timeout > 0) {
 		nfsstats.accesscache_misses++;
 		nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_td, ap->a_cred);
 		if (nfs_getattrcache(vp, ap->a_vap) == 0)
-			return (0);
+			goto nfsmout;
 	}
-
 	nfsstats.rpccnt[NFSPROC_GETATTR]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
 	mb = mreq;
@@ -624,9 +681,9 @@
 #endif
 
 	/*
-	 * Setting of flags is not supported.
+	 * Setting of flags and marking of atimes are not supported.
 	 */
-	if (vap->va_flags != VNOVAL)
+	if (vap->va_flags != VNOVAL || (vap->va_vaflags & VA_MARK_ATIME))
 		return (EOPNOTSUPP);
 
 	/*
@@ -635,8 +692,10 @@
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
-	    (vp->v_mount->mnt_flag & MNT_RDONLY))
-		return (EROFS);
+	    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
+		error = EROFS;
+		goto out;
+	}
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
@@ -650,7 +709,7 @@
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
-				return (0);
+				return (0);		
  			vap->va_size = VNOVAL;
  			break;
  		default:
@@ -660,47 +719,60 @@
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
-
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
-
+			mtx_lock(&np->n_mtx);
 			tsize = np->n_size;
+			mtx_unlock(&np->n_mtx);
 			error = nfs_meta_setsize(vp, ap->a_cred, 
-						ap->a_td, vap->va_size);
-
+						 ap->a_td, vap->va_size);
+			mtx_lock(&np->n_mtx);
  			if (np->n_flag & NMODIFIED) {
+			    tsize = np->n_size;
+			    mtx_unlock(&np->n_mtx);
  			    if (vap->va_size == 0)
  				error = nfs_vinvalbuf(vp, 0, ap->a_td, 1);
  			    else
  				error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
  			    if (error) {
-				vnode_pager_setsize(vp, np->n_size);
- 				return (error);
+				vnode_pager_setsize(vp, tsize);
+				goto out;
 			    }
- 			}
+ 			} else
+			    mtx_unlock(&np->n_mtx);
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in nfs_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * nfs_meta_setsize() and could modify np->n_size.
 			 */
+			mtx_lock(&np->n_mtx);
  			np->n_vattr.va_size = np->n_size = vap->va_size;
+			mtx_unlock(&np->n_mtx);
   		};
-  	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
-		vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
-		vp->v_type == VREG &&
-  		(error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 &&
-		    (error == EINTR || error == EIO))
-		return (error);
+  	} else {
+		mtx_lock(&np->n_mtx);
+		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && 
+		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
+			mtx_unlock(&np->n_mtx);
+			if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 &&
+			    (error == EINTR || error == EIO))
+				return error;
+		} else
+			mtx_unlock(&np->n_mtx);
+	}
 	error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_td);
 	if (error && vap->va_size != VNOVAL) {
+		mtx_lock(&np->n_mtx);
 		np->n_size = np->n_vattr.va_size = tsize;
-		vnode_pager_setsize(vp, np->n_size);
+		vnode_pager_setsize(vp, tsize);
+		mtx_unlock(&np->n_mtx);
 	}
+out:
 	return (error);
 }
 
@@ -779,7 +851,7 @@
 	int error = 0, attrflag, fhsize;
 	int v3 = NFS_ISV3(dvp);
 	struct thread *td = cnp->cn_thread;
-
+	
 	*vpp = NULLVP;
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
@@ -840,7 +912,7 @@
 			m_freem(mrep);
 			return (EISDIR);
 		}
-		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np);
+		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
 		if (error) {
 			m_freem(mrep);
 			return (error);
@@ -859,7 +931,7 @@
 
 	if (flags & ISDOTDOT) {
 		VOP_UNLOCK(dvp, 0, td);
-		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np);
+		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
 		if (error)
 			return (error);
@@ -868,7 +940,7 @@
 		VREF(dvp);
 		newvp = dvp;
 	} else {
-		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np);
+		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
 		if (error) {
 			m_freem(mrep);
 			return (error);
@@ -964,8 +1036,10 @@
 		nfsm_strsiz(len, NFS_MAXPATHLEN);
 		if (len == NFS_MAXPATHLEN) {
 			struct nfsnode *np = VTONFS(vp);
+			mtx_lock(&np->n_mtx);
 			if (np->n_size && np->n_size < NFS_MAXPATHLEN)
 				len = np->n_size;
+			mtx_unlock(&np->n_mtx);
 		}
 		nfsm_mtouio(uiop, len);
 	}
@@ -987,17 +1061,23 @@
 	struct nfsmount *nmp;
 	int error = 0, len, retlen, tsiz, eof, attrflag;
 	int v3 = NFS_ISV3(vp);
+	int rsize;
 
 #ifndef nolint
 	eof = 0;
 #endif
 	nmp = VFSTONFS(vp->v_mount);
 	tsiz = uiop->uio_resid;
-	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
+	mtx_lock(&nmp->nm_mtx);
+	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
+		mtx_unlock(&nmp->nm_mtx);
 		return (EFBIG);
+	}
+	rsize = nmp->nm_rsize;
+	mtx_unlock(&nmp->nm_mtx);
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_READ]++;
-		len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz;
+		len = (tsiz > rsize) ? rsize : tsiz;
 		mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
@@ -1020,9 +1100,10 @@
 			}
 			tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *(tl + 1));
-		} else
+		} else {
 			nfsm_loadattr(vp, NULL);
-		nfsm_strsiz(retlen, nmp->nm_rsize);
+		}
+		nfsm_strsiz(retlen, rsize);
 		nfsm_mtouio(uiop, retlen);
 		m_freem(mrep);
 		tsiz -= retlen;
@@ -1043,7 +1124,7 @@
  */
 int
 nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
-    int *iomode, int *must_commit)
+	     int *iomode, int *must_commit)
 {
 	u_int32_t *tl;
 	int32_t backup;
@@ -1052,18 +1133,24 @@
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
 	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
-
+	int wsize;
+	
 #ifndef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1)
 		panic("nfs: writerpc iovcnt > 1");
 #endif
 	*must_commit = 0;
 	tsiz = uiop->uio_resid;
-	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
+	mtx_lock(&nmp->nm_mtx);
+	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
+		mtx_unlock(&nmp->nm_mtx);		
 		return (EFBIG);
+	}
+	wsize = nmp->nm_wsize;
+	mtx_unlock(&nmp->nm_mtx);
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_WRITE]++;
-		len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
+		len = (tsiz > wsize) ? wsize : tsiz;
 		mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
 			NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
 		mb = mreq;
@@ -1122,6 +1209,7 @@
 				else if (committed == NFSV3WRITE_DATASYNC &&
 					commit == NFSV3WRITE_UNSTABLE)
 					committed = commit;
+				mtx_lock(&nmp->nm_mtx);
 				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
 				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 					NFSX_V3WRITEVERF);
@@ -1132,18 +1220,23 @@
 				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 					NFSX_V3WRITEVERF);
 				}
+				mtx_unlock(&nmp->nm_mtx);
 			}
-		} else
-		    nfsm_loadattr(vp, NULL);
-		if (wccflag)
-		    VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
+		} else {
+			nfsm_loadattr(vp, NULL);
+		}
+		if (wccflag) {
+			mtx_lock(&(VTONFS(vp))->n_mtx);
+			VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
+			mtx_unlock(&(VTONFS(vp))->n_mtx);
+		}
 		m_freem(mrep);
 		if (error)
 			break;
 		tsiz -= len;
 	}
 nfsmout:
-	if (vp->v_mount->mnt_flag & MNT_ASYNC)
+	if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
 		committed = NFSV3WRITE_FILESYNC;
 	*iomode = committed;
 	if (error)
@@ -1232,9 +1325,11 @@
 			cache_enter(dvp, newvp, cnp);
 		*vpp = newvp;
 	}
+	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
+	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
@@ -1246,7 +1341,6 @@
 static int
 nfs_mknod(struct vop_mknod_args *ap)
 {
-
 	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
 }
 
@@ -1353,15 +1447,19 @@
 		if (vap->va_atime.tv_sec == VNOVAL)
 			vap->va_atime = vap->va_mtime;
 		error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_thread);
+		if (error)
+			vput(newvp);
 	}
 	if (!error) {
 		if (cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, newvp, cnp);
 		*ap->a_vpp = newvp;
 	}
+	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
+	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
@@ -1434,7 +1532,6 @@
 int
 nfs_removeit(struct sillyrename *sp)
 {
-
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
@@ -1469,9 +1566,11 @@
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
+	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
+	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
@@ -1502,7 +1601,7 @@
 	}
 
 	if (fvp == tvp) {
-		printf("nfs_rename: fvp == tvp (can't happen)\n");
+		nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
@@ -1609,8 +1708,12 @@
 	}
 	m_freem(mrep);
 nfsmout:
+	mtx_lock(&(VTONFS(fdvp))->n_mtx);
 	VTONFS(fdvp)->n_flag |= NMODIFIED;
+	mtx_unlock(&(VTONFS(fdvp))->n_mtx);
+	mtx_lock(&(VTONFS(tdvp))->n_mtx);
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
+	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
 	if (!fwccflag)
 		VTONFS(fdvp)->n_attrstamp = 0;
 	if (!twccflag)
@@ -1659,16 +1762,13 @@
 	}
 	m_freem(mrep);
 nfsmout:
+	mtx_lock(&(VTONFS(tdvp))->n_mtx);
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
+	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
 	if (!attrflag)
 		VTONFS(vp)->n_attrstamp = 0;
 	if (!wccflag)
 		VTONFS(tdvp)->n_attrstamp = 0;
-	/*
-	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
-	 */
-	if (error == EEXIST)
-		error = 0;
 	return (error);
 }
 
@@ -1732,17 +1832,9 @@
 nfsmout:
 
 	/*
-	 * If we get an EEXIST error, silently convert it to no-error
-	 * in case of an NFS retry.
-	 */
-	if (error == EEXIST)
-		error = 0;
-
-	/*
-	 * If we do not have (or no longer have) an error, and we could
-	 * not extract the newvp from the response due to the request being
-	 * NFSv2 or the error being EEXIST.  We have to do a lookup in order
-	 * to obtain a newvp to return.
+	 * If we do not have an error and we could not extract the newvp from
+	 * the response due to the request being NFSv2, we have to do a
+	 * lookup in order to obtain a newvp to return.
 	 */
 	if (error == 0 && newvp == NULL) {
 		struct nfsnode *np = NULL;
@@ -1758,7 +1850,9 @@
 	} else {
 		*ap->a_vpp = newvp;
 	}
+	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
+	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	return (error);
@@ -1813,18 +1907,12 @@
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
+	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
+	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
-	/*
-	 * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
-	 * if we can succeed in looking up the directory.
-	 */
-	if (error == EEXIST || (!error && !gotvp)) {
-		if (newvp) {
-			vput(newvp);
-			newvp = NULL;
-		}
+	if (error == 0 && newvp == NULL) {
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
 			cnp->cn_thread, &np);
 		if (!error) {
@@ -1869,7 +1957,9 @@
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
+	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
+	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	cache_purge(dvp);
@@ -1891,20 +1981,25 @@
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
-	int tresid, error;
+	int tresid, error = 0;
 	struct vattr vattr;
+	
+	if (vp->v_type != VDIR) 
+		return(EPERM);
 
-	if (vp->v_type != VDIR)
-		return (EPERM);
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
-		if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0 &&
-		    !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
-			nfsstats.direofcache_hits++;
-			return (0);
+		if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0) {
+			mtx_lock(&np->n_mtx);
+			if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
+				mtx_unlock(&np->n_mtx);
+				nfsstats.direofcache_hits++;
+				goto out;
+			} else
+				mtx_unlock(&np->n_mtx);
 		}
 	}
 
@@ -1914,8 +2009,10 @@
 	tresid = uio->uio_resid;
 	error = nfs_bioread(vp, uio, 0, ap->a_cred);
 
-	if (!error && uio->uio_resid == tresid)
+	if (!error && uio->uio_resid == tresid) {
 		nfsstats.direofcache_misses++;
+	}
+out:
 	return (error);
 }
 
@@ -1950,11 +2047,16 @@
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
+	nfs_dircookie_lock(dnp);
 	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
-	if (cookiep)
+	if (cookiep) {
 		cookie = *cookiep;
-	else
+		nfs_dircookie_unlock(dnp);
+	} else {
+		nfs_dircookie_unlock(dnp);		
 		return (NFSERR_BAD_COOKIE);
+	}
+
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
@@ -1971,8 +2073,10 @@
 			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
 			*tl++ = cookie.nfsuquad[0];
 			*tl++ = cookie.nfsuquad[1];
+			mtx_lock(&dnp->n_mtx);
 			*tl++ = dnp->n_cookieverf.nfsuquad[0];
 			*tl++ = dnp->n_cookieverf.nfsuquad[1];
+			mtx_unlock(&dnp->n_mtx);
 		} else {
 			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = cookie.nfsuquad[0];
@@ -1984,8 +2088,10 @@
 			if (!error) {
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
+				mtx_lock(&dnp->n_mtx);
 				dnp->n_cookieverf.nfsuquad[0] = *tl++;
 				dnp->n_cookieverf.nfsuquad[1] = *tl;
+				mtx_unlock(&dnp->n_mtx);
 			} else {
 				m_freem(mrep);
 				goto nfsmout;
@@ -2100,9 +2206,11 @@
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
-			printf("EEK! readdirrpc resid > 0\n");
+			nfs_printf("EEK! readdirrpc resid > 0\n");
+		nfs_dircookie_lock(dnp);
 		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
+		nfs_dircookie_unlock(dnp);
 	}
 nfsmout:
 	return (error);
@@ -2146,11 +2254,15 @@
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
+	nfs_dircookie_lock(dnp);
 	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
-	if (cookiep)
+	if (cookiep) {
 		cookie = *cookiep;
-	else
+		nfs_dircookie_unlock(dnp);
+	} else {
+		nfs_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
+	}
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
@@ -2166,8 +2278,10 @@
  		tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
 		*tl++ = cookie.nfsuquad[0];
 		*tl++ = cookie.nfsuquad[1];
+		mtx_lock(&dnp->n_mtx);
 		*tl++ = dnp->n_cookieverf.nfsuquad[0];
 		*tl++ = dnp->n_cookieverf.nfsuquad[1];
+		mtx_unlock(&dnp->n_mtx);
 		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
 		*tl = txdr_unsigned(nmp->nm_rsize);
 		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
@@ -2177,8 +2291,10 @@
 			goto nfsmout;
 		}
 		tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
+		mtx_lock(&dnp->n_mtx);
 		dnp->n_cookieverf.nfsuquad[0] = *tl++;
 		dnp->n_cookieverf.nfsuquad[1] = *tl++;
+		mtx_unlock(&dnp->n_mtx);
 		more_dirs = fxdr_unsigned(int, *tl);
 
 		/* loop thru the dir entries, doctoring them to 4bsd form */
@@ -2288,7 +2404,7 @@
 				    np = dnp;
 				} else {
 				    error = nfs_nget(vp->v_mount, fhp,
-					fhsize, &np);
+					fhsize, &np, LK_EXCLUSIVE);
 				    if (error)
 					doit = 0;
 				    else
@@ -2306,6 +2422,8 @@
 				dp->d_type =
 				    IFTODT(VTTOIF(np->n_vattr.va_type));
 				ndp->ni_vp = newvp;
+				/* Update n_ctime, so subsequent lookup doesn't purge entry */
+				np->n_ctime = np->n_vattr.va_ctime.tv_sec;
 			        cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
 			    }
 			} else {
@@ -2313,9 +2431,9 @@
 			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			    i = fxdr_unsigned(int, *tl);
 			    if (i) {
-				tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
-				fhsize = fxdr_unsigned(int, *tl);
-				nfsm_adv(nfsm_rndup(fhsize));
+				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
+				    fhsize = fxdr_unsigned(int, *tl);
+				    nfsm_adv(nfsm_rndup(fhsize));
 			    }
 			}
 			if (newvp != NULLVP) {
@@ -2359,9 +2477,11 @@
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
-			printf("EEK! readdirplusrpc resid > 0\n");
+			nfs_printf("EEK! readdirplusrpc resid > 0\n");
+		nfs_dircookie_lock(dnp);
 		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
+		nfs_dircookie_unlock(dnp);
 	}
 nfsmout:
 	if (newvp != NULLVP) {
@@ -2480,7 +2600,7 @@
 		    VREF(dvp);
 		    newvp = dvp;
 		} else {
-		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np);
+		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
 		    if (error) {
 			m_freem(mrep);
 			return (error);
@@ -2521,7 +2641,7 @@
  */
 int
 nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
-    struct thread *td)
+	   struct thread *td)
 {
 	u_int32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
@@ -2529,8 +2649,12 @@
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 
-	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0)
+	mtx_lock(&nmp->nm_mtx);
+	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
+		mtx_unlock(&nmp->nm_mtx);
 		return (0);
+	}
+	mtx_unlock(&nmp->nm_mtx);
 	nfsstats.rpccnt[NFSPROC_COMMIT]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
 	mb = mreq;
@@ -2567,16 +2691,10 @@
 {
 	struct buf *bp = ap->a_bp;
 	struct ucred *cr;
-	struct thread *td;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
 	KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp));
 
-	if (bp->b_flags & B_ASYNC)
-		td = NULL;
-	else
-		td = curthread;	/* XXX */
-
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
@@ -2588,8 +2706,8 @@
 	 * otherwise just do it ourselves.
 	 */
 	if ((bp->b_flags & B_ASYNC) == 0 ||
-	    nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, td))
-		(void)nfs_doio(ap->a_vp, bp, cr, td);
+	    nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
+		(void)nfs_doio(ap->a_vp, bp, cr, curthread);
 	return (0);
 }
 
@@ -2600,7 +2718,6 @@
 static int
 nfs_fsync(struct vop_fsync_args *ap)
 {
-
 	return (nfs_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1));
 }
 
@@ -2821,8 +2938,10 @@
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
 			splx(s);
-			if (error == 0)
-				panic("nfs_fsync: inconsistent lock");
+			if (error == 0) {
+				BUF_UNLOCK(bp);
+				goto loop;
+			}
 			if (error == ENOLCK)
 				goto loop;
 			if (nfs_sigintr(nmp, NULL, td)) {
@@ -2880,14 +2999,36 @@
 			VI_UNLOCK(vp);
 			goto loop;
 		}
-	}
-	VI_UNLOCK(vp);
+		/*
+		 * Wait for all the async IO requests to drain
+		 */
+		VI_UNLOCK(vp);
+		mtx_lock(&np->n_mtx);
+		while (np->n_directio_asyncwr > 0) {
+			np->n_flag |= NFSYNCWAIT;
+			error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
+					   &np->n_mtx, slpflag | (PRIBIO + 1), 
+					   "nfsfsync", 0);
+			if (error) {
+				if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
+					mtx_unlock(&np->n_mtx);
+					error = EINTR;	
+					goto done;
+				}
+			}
+		}
+		mtx_unlock(&np->n_mtx);
+	} else
+		VI_UNLOCK(vp);
+	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
-  	if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0)
+  	if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
+	    vp->v_bufobj.bo_numoutput == 0 && np->n_directio_asyncwr == 0)
   		np->n_flag &= ~NMODIFIED;
+	mtx_unlock(&np->n_mtx);
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
@@ -2900,13 +3041,19 @@
 static int
 nfs_advlock(struct vop_advlock_args *ap)
 {
-
+	int error;
+	
+	mtx_lock(&Giant);
 	if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 		struct nfsnode *np = VTONFS(ap->a_vp);
 
-		return (lf_advlock(ap, &(np->n_lockf), np->n_size));
+		error = lf_advlock(ap, &(np->n_lockf), np->n_size);
+		goto out;
 	}
-	return (nfs_dolock(ap));
+	error = nfs_dolock(ap);
+out:	
+	mtx_unlock(&Giant);
+	return (error);
 }
 
 /*
@@ -2918,7 +3065,7 @@
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 
-	printf("\tfileid %ld fsid 0x%x",
+	nfs_printf("\tfileid %ld fsid 0x%x",
 	   np->n_vattr.va_fileid, np->n_vattr.va_fsid);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
@@ -2961,7 +3108,7 @@
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
-	curthread->td_proc->p_stats->p_ru.ru_oublock++;
+	curthread->td_ru.ru_oublock++;
 	splx(s);
 
 	/*
@@ -2982,7 +3129,6 @@
 			reassignbuf(bp);
 			splx(s);
 		}
-
 		brelse(bp);
 		return (rtval);
 	}
@@ -3023,9 +3169,11 @@
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, cred, ap->a_td);
 	if (error)
-		return (error);
-	return (vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
-	    mode, cred, NULL));
+		goto out;
+	error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
+			 mode, cred, NULL);
+out:
+	return error;
 }
 
 /*
@@ -3035,13 +3183,17 @@
 nfsfifo_read(struct vop_read_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
+	int error;
 
 	/*
 	 * Set access flag.
 	 */
+	mtx_lock(&np->n_mtx);
 	np->n_flag |= NACC;
 	getnanotime(&np->n_atim);
-	return (fifo_specops.vop_read(ap));
+	mtx_unlock(&np->n_mtx);
+	error = fifo_specops.vop_read(ap);
+	return error;	
 }
 
 /*
@@ -3055,9 +3207,11 @@
 	/*
 	 * Set update flag.
 	 */
+	mtx_lock(&np->n_mtx);
 	np->n_flag |= NUPD;
 	getnanotime(&np->n_mtim);
-	return (fifo_specops.vop_write(ap));
+	mtx_unlock(&np->n_mtx);
+	return(fifo_specops.vop_write(ap));
 }
 
 /*
@@ -3073,6 +3227,7 @@
 	struct vattr vattr;
 	struct timespec ts;
 
+	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD)) {
 		getnanotime(&ts);
 		if (np->n_flag & NACC)
@@ -3087,9 +3242,13 @@
 				vattr.va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vattr.va_mtime = np->n_mtim;
+			mtx_unlock(&np->n_mtx);
 			(void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_td);
+			goto out;
 		}
 	}
+	mtx_unlock(&np->n_mtx);
+out:
 	return (fifo_specops.vop_close(ap));
 }
 
@@ -3110,4 +3269,5 @@
 	.bop_write	=	nfs_bwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
+	.bop_bdflush	=	bufbdflush,
 };
Index: nfsm_subs.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsm_subs.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsm_subs.h -L sys/nfsclient/nfsm_subs.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsm_subs.h
+++ sys/nfsclient/nfsm_subs.h
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfsm_subs.h	8.2 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/nfsclient/nfsm_subs.h,v 1.36.2.1 2005/12/13 21:29:26 rees Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsm_subs.h,v 1.37 2005/11/21 18:39:18 rees Exp $
  */
 
 #ifndef _NFSCLIENT_NFSM_SUBS_H_
Index: nfsdiskless.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsdiskless.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsdiskless.h -L sys/nfsclient/nfsdiskless.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsdiskless.h
+++ sys/nfsclient/nfsdiskless.h
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfsdiskless.h	8.2 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/nfsclient/nfsdiskless.h,v 1.17 2005/01/07 01:45:51 imp Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsdiskless.h,v 1.18 2006/12/06 02:15:25 sam Exp $
  */
 
 #ifndef _NFSCLIENT_NFSDISKLESS_H_
@@ -108,6 +108,7 @@
 extern int	nfs_diskless_valid;
 void bootpc_init(void);
 void nfs_setup_diskless(void);
+void nfs_parse_options(const char *, struct nfs_args *);
 #endif
 
 #endif
Index: nfs_vfsops.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_vfsops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_vfsops.c -L sys/nfsclient/nfs_vfsops.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_vfsops.c
+++ sys/nfsclient/nfs_vfsops.c
@@ -33,7 +33,8 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vfsops.c,v 1.177.2.1 2006/01/14 01:18:02 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vfsops.c,v 1.193.2.1 2007/10/26 21:46:31 jhb Exp $");
+
 
 #include "opt_bootp.h"
 #include "opt_nfsroot.h"
@@ -43,6 +44,7 @@
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
+#include <sys/clock.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
@@ -75,17 +77,18 @@
 #include <nfsclient/nfsm_subs.h>
 #include <nfsclient/nfsdiskless.h>
 
-MALLOC_DEFINE(M_NFSREQ, "NFS req", "NFS request header");
-MALLOC_DEFINE(M_NFSBIGFH, "NFSV3 bigfh", "NFS version 3 file handle");
-MALLOC_DEFINE(M_NFSDIROFF, "NFSV3 diroff", "NFS directory offset data");
-MALLOC_DEFINE(M_NFSHASH, "NFS hash", "NFS hash tables");
-MALLOC_DEFINE(M_NFSDIRECTIO, "NFS DirectIO", "NFS Direct IO async write state");
+MALLOC_DEFINE(M_NFSREQ, "nfsclient_req", "NFS request header");
+MALLOC_DEFINE(M_NFSBIGFH, "nfsclient_bigfh", "NFS version 3 file handle");
+MALLOC_DEFINE(M_NFSDIROFF, "nfsclient_diroff", "NFS directory offset data");
+MALLOC_DEFINE(M_NFSHASH, "nfsclient_hash", "NFS hash tables");
+MALLOC_DEFINE(M_NFSDIRECTIO, "nfsclient_directio", "NFS Direct IO async write state");
 
 uma_zone_t nfsmount_zone;
 
 struct nfsstats	nfsstats;
+
 SYSCTL_NODE(_vfs, OID_AUTO, nfs, CTLFLAG_RW, 0, "NFS filesystem");
-SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RD,
+SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RW,
 	&nfsstats, nfsstats, "S,nfsstats");
 static int nfs_ip_paranoia = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW,
@@ -102,8 +105,8 @@
 SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY,
         downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, "");
 
-static int	nfs_iosize(struct nfsmount *nmp);
-static void	nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp);
+static void	nfs_decode_args(struct mount *mp, struct nfsmount *nmp,
+		    struct nfs_args *argp);
 static int	mountnfs(struct nfs_args *, struct mount *,
 		    struct sockaddr *, char *, struct vnode **,
 		    struct ucred *cred);
@@ -171,7 +174,7 @@
 static void	nfs_convert_oargs(struct nfs_args *args,
 		    struct onfs_args *oargs);
 
-static int
+int
 nfs_iosize(struct nfsmount *nmp)
 {
 	int iosize;
@@ -182,9 +185,9 @@
 	 * that it is at least one VM page to avoid wasting buffer
 	 * space.
 	 */
-	iosize = max(nmp->nm_rsize, nmp->nm_wsize);
-	if (iosize < PAGE_SIZE) iosize = PAGE_SIZE;
-	return iosize;
+	iosize = imax(nmp->nm_rsize, nmp->nm_wsize);
+	iosize = imax(iosize, PAGE_SIZE);
+	return (iosize);
 }
 
 static void
@@ -219,8 +222,13 @@
 	bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway,
 		sizeof(struct sockaddr_in));
 	nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args);
-	nfsv3_diskless.root_fhsize = NFSX_V2FH;
-	bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH);
+	if (nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) {
+		nfsv3_diskless.root_fhsize = NFSX_V3FH;
+		bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V3FH);
+	} else {
+		nfsv3_diskless.root_fhsize = NFSX_V2FH;
+		bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH);
+	}
 	bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr,
 		sizeof(struct sockaddr_in));
 	bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN);
@@ -248,12 +256,21 @@
 #ifndef nolint
 	sfp = NULL;
 #endif
-	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np);
+	error = vfs_busy(mp, LK_NOWAIT, NULL, td);
 	if (error)
 		return (error);
+	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
+	if (error) {
+		vfs_unbusy(mp, td);
+		return (error);
+	}
 	vp = NFSTOV(np);
-	if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
+	mtx_lock(&nmp->nm_mtx);
+	if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+		mtx_unlock(&nmp->nm_mtx);		
 		(void)nfs_fsinfo(nmp, vp, td->td_ucred, td);
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 	nfsstats.rpccnt[NFSPROC_FSSTAT]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3));
 	mb = mreq;
@@ -268,7 +285,9 @@
 		goto nfsmout;
 	}
 	sfp = nfsm_dissect(struct nfs_statfs *, NFSX_STATFS(v3));
+	mtx_lock(&nmp->nm_mtx);
 	sbp->f_iosize = nfs_iosize(nmp);
+	mtx_unlock(&nmp->nm_mtx);
 	if (v3) {
 		sbp->f_bsize = NFS_FABLKSIZE;
 		tquad = fxdr_hyper(&sfp->sf_tbytes);
@@ -292,6 +311,7 @@
 	m_freem(mrep);
 nfsmout:
 	vput(vp);
+	vfs_unbusy(mp, td);
 	return (error);
 }
 
@@ -308,7 +328,7 @@
 	int error = 0, retattr;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	u_int64_t maxfsize;
-
+	
 	nfsstats.rpccnt[NFSPROC_FSINFO]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_FSINFO, NFSX_FH(1));
 	mb = mreq;
@@ -319,6 +339,7 @@
 	if (!error) {
 		fsp = nfsm_dissect(struct nfsv3_fsinfo *, NFSX_V3FSINFO);
 		pref = fxdr_unsigned(u_int32_t, fsp->fs_wtpref);
+		mtx_lock(&nmp->nm_mtx);
 		if (pref < nmp->nm_wsize && pref >= NFS_FABLKSIZE)
 			nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) &
 				~(NFS_FABLKSIZE - 1);
@@ -352,6 +373,7 @@
 			nmp->nm_maxfilesize = maxfsize;
 		nmp->nm_mountp->mnt_stat.f_iosize = nfs_iosize(nmp);
 		nmp->nm_state |= NFSSTA_GOTFSINFO;
+		mtx_unlock(&nmp->nm_mtx);
 	}
 	m_freem(mrep);
 nfsmout:
@@ -380,11 +402,11 @@
 	struct nfsv3_diskless *nd = &nfsv3_diskless;
 	struct socket *so;
 	struct vnode *vp;
+	struct ifreq ir;
 	int error, i;
 	u_long l;
 	char buf[128];
-
-	NET_ASSERT_GIANT();
+	char *cp;
 
 #if defined(BOOTP_NFSROOT) && defined(BOOTP)
 	bootpc_init();		/* use bootp to get nfs_diskless filled in */
@@ -406,7 +428,7 @@
 	 * Do enough of ifconfig(8) so that the critical net interface can
 	 * talk to the server.
 	 */
-	error = socreate(nd->myif.ifra_addr.sa_family, &so, SOCK_DGRAM, 0,
+	error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0,
 	    td->td_ucred, td);
 	if (error)
 		panic("nfs_mountroot: socreate(%04x): %d",
@@ -431,6 +453,14 @@
 	error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td);
 	if (error)
 		panic("nfs_mountroot: SIOCAIFADDR: %d", error);
+	if ((cp = getenv("boot.netif.mtu")) != NULL) {
+		ir.ifr_mtu = strtol(cp, NULL, 10);
+		bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ);
+		freeenv(cp);
+		error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td);
+		if (error)
+			printf("nfs_mountroot: SIOCSIFMTU: %d", error);
+	}
 	soclose(so);
 
 	/*
@@ -495,8 +525,10 @@
 	struct sockaddr *nam;
 	int error;
 
+	MNT_ILOCK(mp);
 	mp->mnt_kern_flag = 0;
 	mp->mnt_flag = mountflag;
+	MNT_IUNLOCK(mp);
 	nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK);
 	if ((error = mountnfs(args, mp, nam, path, vpp,
 	    td->td_ucred)) != 0) {
@@ -521,17 +553,27 @@
 	 * flag is already clear, or this is a root mount and it was set
 	 * intentionally at some previous point.
 	 */
-	if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0)
+	if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) {
+		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_RDONLY;
-	else if (mp->mnt_flag & MNT_UPDATE)
+		MNT_IUNLOCK(mp);
+	} else if (mp->mnt_flag & MNT_UPDATE) {
+		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~MNT_RDONLY;
+		MNT_IUNLOCK(mp);
+	}
 
 	/*
 	 * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes
-	 * no sense in that context.
+	 * no sense in that context.  Also, set up appropriate retransmit
+	 * and soft timeout behavior.
 	 */
-	if (argp->sotype == SOCK_STREAM)
+	if (argp->sotype == SOCK_STREAM) {
 		nmp->nm_flag &= ~NFSMNT_NOCONN;
+		nmp->nm_flag |= NFSMNT_DUMBTIMR;
+		nmp->nm_timeo = NFS_MAXTIMEO;
+		nmp->nm_retry = NFS_RETRANS_TCP;
+	}
 
 	/* Also clear RDIRPLUS if not NFSv3, it crashes some servers */
 	if ((argp->flags & NFSMNT_NFSV3) == 0)
@@ -658,8 +700,7 @@
 		if (nmp->nm_sotype == SOCK_DGRAM)
 			while (nfs_connect(nmp, NULL)) {
 				printf("nfs_args: retrying connect\n");
-				(void) tsleep((caddr_t)&lbolt,
-					      PSOCK, "nfscon", 0);
+				(void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
 			}
 	}
 }
@@ -687,24 +728,31 @@
 	size_t len;
 	u_char nfh[NFSX_V3FHMAX];
 
-	if (vfs_filteropt(mp->mnt_optnew, nfs_opts))
-		return (EINVAL);
+	if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) {
+		error = EINVAL;
+		goto out;
+	}
 
-	if (mp->mnt_flag & MNT_ROOTFS)
-		return (nfs_mountroot(mp, td));
+	if (mp->mnt_flag & MNT_ROOTFS) {
+		error = nfs_mountroot(mp, td);
+		goto out;
+	}
 
 	error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof args);
 	if (error)
-		return (error);
+		goto out;
 
 	if (args.version != NFS_ARGSVERSION) {
-		return (EPROGMISMATCH);
+		error = EPROGMISMATCH;
+		goto out;
 	}
 	if (mp->mnt_flag & MNT_UPDATE) {
 		struct nfsmount *nmp = VFSTONFS(mp);
 
-		if (nmp == NULL)
-			return (EIO);
+		if (nmp == NULL) {
+			error = EIO;
+			goto out;
+		}
 		/*
 		 * When doing an update, we can't change from or to
 		 * v3, switch lockd strategies or change cookie translation
@@ -714,7 +762,7 @@
 		    (nmp->nm_flag &
 			(NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/));
 		nfs_decode_args(mp, nmp, &args);
-		return (0);
+		goto out;
 	}
 
 	/*
@@ -728,21 +776,29 @@
 	 */
 	if (nfs_ip_paranoia == 0)
 		args.flags |= NFSMNT_NOCONN;
-	if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX)
-		return (EINVAL);
+	if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) {
+		error = EINVAL;
+		goto out;
+	}
 	error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize);
 	if (error)
-		return (error);
+		goto out;
 	error = copyinstr(args.hostname, hst, MNAMELEN-1, &len);
 	if (error)
-		return (error);
+		goto out;
 	bzero(&hst[len], MNAMELEN - len);
 	/* sockargs() call must be after above copyin() calls */
 	error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen);
 	if (error)
-		return (error);
+		goto out;
 	args.fh = nfh;
 	error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred);
+out:
+	if (!error) {
+		MNT_ILOCK(mp);
+		mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED);
+		MNT_IUNLOCK(mp);
+	}
 	return (error);
 }
 
@@ -765,12 +821,11 @@
 
 	error = copyin(data, &args, sizeof (struct nfs_args));
 	if (error)
-		return (error);
+		return error;
 
 	ma = mount_arg(ma, "nfs_args", &args, sizeof args);
 
 	error = kernel_mount(ma, flags);
-
 	return (error);
 }
 
@@ -799,6 +854,7 @@
 	}
 	vfs_getnewfsid(mp);
 	nmp->nm_mountp = mp;
+	mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF);			
 
 	/*
 	 * V2 can only handle 32 bit filesizes.  A 4GB-1 limit may be too
@@ -823,11 +879,6 @@
 		nmp->nm_wsize = NFS_WSIZE;
 		nmp->nm_rsize = NFS_RSIZE;
 	}
-        if ((desiredvnodes / 1000) == 0) {
-		printf("Increasing desiredvnodes from %i to 1000\n",
-		    desiredvnodes);
-		desiredvnodes = 1000;
-	}
 	nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
 	nmp->nm_readdirsize = NFS_READDIRSIZE;
 	nmp->nm_numgrps = NFS_MAXGRPS;
@@ -850,10 +901,6 @@
 
 	nfs_decode_args(mp, nmp, argp);
 
-	if (nmp->nm_sotype == SOCK_STREAM)
-		mtx_init(&nmp->nm_nfstcpstate.mtx, "NFS/TCP state lock", 
-			 NULL, MTX_DEF);		
-
 	/*
 	 * For Connection based sockets (TCP,...) defer the connect until
 	 * the first request, in case the server is not responding.
@@ -868,7 +915,9 @@
 	 * stuck on a dead server and we are holding a lock on the mount
 	 * point.
 	 */
+	mtx_lock(&nmp->nm_mtx);
 	mp->mnt_stat.f_iosize = nfs_iosize(nmp);
+	mtx_unlock(&nmp->nm_mtx);
 	/*
 	 * A reference count is needed on the nfsnode representing the
 	 * remote root.  If this object is not persistent, then backward
@@ -877,7 +926,7 @@
 	 * this problem, because one can identify root inodes by their
 	 * number == ROOTINO (2).
 	 */
-	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np);
+	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
 	if (error)
 		goto bad;
 	*vpp = NFSTOV(np);
@@ -899,9 +948,8 @@
 
 	return (0);
 bad:
-	if (nmp->nm_sotype == SOCK_STREAM)
-		mtx_destroy(&nmp->nm_nfstcpstate.mtx);
 	nfs_disconnect(nmp);
+	mtx_destroy(&nmp->nm_mtx);
 	uma_zfree(nfsmount_zone, nmp);
 	FREE(nam, M_SONAME);
 	return (error);
@@ -929,12 +977,12 @@
 	if (flags & FORCECLOSE) {
 		error = nfs_nmcancelreqs(nmp);
 		if (error)
-			return (error);
+			goto out;
 	}
 	/* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */
 	error = vflush(mp, 1, flags, td);
 	if (error)
-		return (error);
+		goto out;
 
 	/*
 	 * We are now committed to the unmount.
@@ -942,11 +990,10 @@
 	nfs_disconnect(nmp);
 	FREE(nmp->nm_nam, M_SONAME);
 
-	if (nmp->nm_sotype == SOCK_STREAM)
-		mtx_destroy(&nmp->nm_nfstcpstate.mtx);
-	
+	mtx_destroy(&nmp->nm_mtx);
 	uma_zfree(nfsmount_zone, nmp);
-	return (0);
+out:
+	return (error);
 }
 
 /*
@@ -961,17 +1008,20 @@
 	int error;
 
 	nmp = VFSTONFS(mp);
-	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np);
+	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, flags);
 	if (error)
-		return (error);
+		return error;
 	vp = NFSTOV(np);
 	/*
 	 * Get transfer parameters and attributes for root vnode once.
 	 */
+	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_state & NFSSTA_GOTFSINFO) == 0 &&
 	    (nmp->nm_flag & NFSMNT_NFSV3)) {
+		mtx_unlock(&nmp->nm_mtx);
 		nfs_fsinfo(nmp, vp, curthread->td_ucred, curthread);
-	}
+	} else 
+		mtx_unlock(&nmp->nm_mtx);
 	if (vp->v_type == VNON)
 	    vp->v_type = VDIR;
 	vp->v_vflag |= VV_ROOT;
@@ -1050,8 +1100,10 @@
 		break;
 #endif
 	case VFS_CTL_QUERY:
+		mtx_lock(&nmp->nm_mtx);
 		if (nmp->nm_state & NFSSTA_TIMEO)
 			vq.vq_flags |= VQ_NOTRESP;
+		mtx_unlock(&nmp->nm_mtx);
 #if 0
 		if (!(nmp->nm_flag & NFSMNT_NOLOCKS) &&
 		    (nmp->nm_state & NFSSTA_LOCKTIMEO))
Index: nfs_subs.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_subs.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs_subs.c -L sys/nfsclient/nfs_subs.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs_subs.c
+++ sys/nfsclient/nfs_subs.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_subs.c,v 1.140.2.1 2005/12/13 21:29:26 rees Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_subs.c,v 1.146.2.1 2007/10/12 19:18:46 mohans Exp $");
 
 /*
  * These functions support the macros and help fiddle mbuf chains for
@@ -76,6 +76,12 @@
 #include <netinet/in.h>
 
 /*
+ * Note that stdarg.h and the ANSI style va_start macro is used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+/*
  * Data items converted to xdr at startup, since they are constant
  * This is kinda hokey, but may save a little time doing byte swaps
  */
@@ -95,8 +101,8 @@
 
 struct nfs_reqq	nfs_reqq;
 struct mtx nfs_reqq_mtx;
-struct mtx nfs_reply_mtx;
 struct nfs_bufq	nfs_bufq;
+struct mtx nfs_xid_mtx;
 
 /*
  * and the reverse mapping from generic to Version 2 procedure numbers
@@ -182,6 +188,7 @@
 	 */
 	tl = nfsm_build(u_int32_t *, 8 * NFSX_UNSIGNED);
 
+	mtx_lock(&nfs_xid_mtx);
 	/* Get a pretty random xid to start with */
 	if (!nfs_xid)
 		nfs_xid = random();
@@ -193,6 +200,7 @@
 
 	*xidpp = tl;
 	*tl++ = txdr_unsigned(nfs_xid);
+	mtx_unlock(&nfs_xid_mtx);
 	*tl++ = rpc_call;
 	*tl++ = rpc_vers;
 	*tl++ = txdr_unsigned(NFS_PROG);
@@ -414,9 +422,10 @@
 	 * Initialize reply list and start timer
 	 */
 	TAILQ_INIT(&nfs_reqq);
-	callout_init(&nfs_callout, 0);
+	callout_init(&nfs_callout, CALLOUT_MPSAFE);
 	mtx_init(&nfs_reqq_mtx, "NFS reqq lock", NULL, MTX_DEF);
-	mtx_init(&nfs_reply_mtx, "Synch NFS reply posting", NULL, MTX_DEF);
+	mtx_init(&nfs_iod_mtx, "NFS iod lock", NULL, MTX_DEF);
+	mtx_init(&nfs_xid_mtx, "NFS xid lock", NULL, MTX_DEF);
 
 	nfs_pbuf_freecnt = nswbuf / 2 + 1;
 
@@ -437,19 +446,80 @@
 	 * Tell all nfsiod processes to exit. Clear nfs_iodmax, and wakeup
 	 * any sleeping nfsiods so they check nfs_iodmax and exit.
 	 */
+	mtx_lock(&nfs_iod_mtx);
 	nfs_iodmax = 0;
 	for (i = 0; i < nfs_numasync; i++)
 		if (nfs_iodwant[i])
 			wakeup(&nfs_iodwant[i]);
 	/* The last nfsiod to exit will wake us up when nfs_numasync hits 0 */
 	while (nfs_numasync)
-		tsleep(&nfs_numasync, PWAIT, "ioddie", 0);
-
+		msleep(&nfs_numasync, &nfs_iod_mtx, PWAIT, "ioddie", 0);
+	mtx_unlock(&nfs_iod_mtx);
 	nfs_nhuninit();
 	uma_zdestroy(nfsmount_zone);
 	return (0);
 }
 
+void 
+nfs_dircookie_lock(struct nfsnode *np)
+{
+	mtx_lock(&np->n_mtx);
+	while (np->n_flag & NDIRCOOKIELK)
+		(void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0);
+	np->n_flag |= NDIRCOOKIELK;
+	mtx_unlock(&np->n_mtx);
+}
+
+void 
+nfs_dircookie_unlock(struct nfsnode *np)
+{
+	mtx_lock(&np->n_mtx);
+	np->n_flag &= ~NDIRCOOKIELK;
+	wakeup(&np->n_flag);
+	mtx_unlock(&np->n_mtx);
+}
+
+int
+nfs_upgrade_vnlock(struct vnode *vp, struct thread *td)
+{
+	int old_lock;
+	
+ 	if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
+ 		if (old_lock == LK_SHARED) {
+ 			/* Upgrade to exclusive lock, this might block */
+ 			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
+ 		} else {
+ 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ 		}
+  	}
+	return old_lock;
+}
+
+void
+nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock)
+{
+	if (old_lock != LK_EXCLUSIVE) {
+ 		if (old_lock == LK_SHARED) {
+ 			/* Downgrade from exclusive lock, this might block */
+ 			vn_lock(vp, LK_DOWNGRADE, td);
+ 		} else {
+ 			VOP_UNLOCK(vp, 0, td);
+ 		}
+  	}
+}
+
+void
+nfs_printf(const char *fmt, ...)
+{
+	va_list ap;
+
+	mtx_lock(&Giant);
+	va_start(ap, fmt);
+	vprintf(fmt, ap);
+	va_end(ap);
+	mtx_unlock(&Giant);
+}
+
 /*
  * Attribute cache routines.
  * nfs_loadattrcache() - loads or updates the cache contents from attributes
@@ -466,7 +536,7 @@
  */
 int
 nfs_loadattrcache(struct vnode **vpp, struct mbuf **mdp, caddr_t *dposp,
-    struct vattr *vaper, int dontshrink)
+		  struct vattr *vaper, int dontshrink)
 {
 	struct vnode *vp = *vpp;
 	struct vattr *vap;
@@ -478,8 +548,9 @@
 	struct mbuf *md;
 	enum vtype vtyp;
 	u_short vmode;
-	struct timespec mtime;
+	struct timespec mtime, mtime_save;
 	int v3 = NFS_ISV3(vp);
+	struct thread *td = curthread;
 
 	md = *mdp;
 	t1 = (mtod(md, caddr_t) + md->m_len) - *dposp;
@@ -535,6 +606,7 @@
 	 * information.
 	 */
 	np = VTONFS(vp);
+	mtx_lock(&np->n_mtx);
 	if (vp->v_type != vtyp) {
 		vp->v_type = vtyp;
 		if (vp->v_type == VFIFO)
@@ -545,6 +617,7 @@
 	vap->va_type = vtyp;
 	vap->va_mode = (vmode & 07777);
 	vap->va_rdev = rdev;
+	mtime_save = vap->va_mtime;
 	vap->va_mtime = mtime;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	if (v3) {
@@ -578,6 +651,14 @@
 		vap->va_filerev = 0;
 	}
 	np->n_attrstamp = time_second;
+	/* Timestamp the NFS otw getattr fetch */
+	if (td->td_proc) {
+		np->n_ac_ts_tid = td->td_tid;
+		np->n_ac_ts_pid = td->td_proc->p_pid;
+		np->n_ac_ts_syscalls = td->td_syscalls;
+	} else
+		bzero(&np->n_ac_ts, sizeof(struct nfs_attrcache_timestamp));
+	
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (dontshrink && vap->va_size < np->n_size) {
@@ -608,6 +689,21 @@
 			np->n_size = vap->va_size;
 		}
 	}
+	/*
+	 * The following checks are added to prevent a race between (say)
+	 * a READDIR+ and a WRITE. 
+	 * READDIR+, WRITE requests sent out.
+	 * READDIR+ resp, WRITE resp received on client.
+	 * However, the WRITE resp was handled before the READDIR+ resp
+	 * causing the post op attrs from the write to be loaded first
+	 * and the attrs from the READDIR+ to be loaded later. If this 
+	 * happens, we have stale attrs loaded into the attrcache.
+	 * We detect this by for the mtime moving back. We invalidate the 
+	 * attrcache when this happens.
+	 */
+	if (timespeccmp(&mtime_save, &vap->va_mtime, >))
+		/* mtime went backwards; invalidate the attribute cache */
+		np->n_attrstamp = 0;
 	if (vaper != NULL) {
 		bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
 		if (np->n_flag & NCHG) {
@@ -617,6 +713,7 @@
 				vaper->va_mtime = np->n_mtim;
 		}
 	}
+	mtx_unlock(&np->n_mtx);
 	return (0);
 }
 
@@ -639,16 +736,20 @@
 	struct vattr *vap;
 	struct nfsmount *nmp;
 	int timeo;
-
+	
 	np = VTONFS(vp);
 	vap = &np->n_vattr;
 	nmp = VFSTONFS(vp->v_mount);
+#ifdef NFS_ACDEBUG
+	mtx_lock(&Giant);	/* nfs_printf() */
+#endif
+	mtx_lock(&np->n_mtx);
 	/* XXX n_mtime doesn't seem to be updated on a miss-and-reload */
 	timeo = (time_second - np->n_mtime.tv_sec) / 10;
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug>1)
-		printf("nfs_getattrcache: initial timeo = %d\n", timeo);
+		nfs_printf("nfs_getattrcache: initial timeo = %d\n", timeo);
 #endif
 
 	if (vap->va_type == VDIR) {
@@ -665,18 +766,19 @@
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug > 2)
-		printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
-			nmp->nm_acregmin, nmp->nm_acregmax,
-			nmp->nm_acdirmin, nmp->nm_acdirmax);
+		nfs_printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
+			   nmp->nm_acregmin, nmp->nm_acregmax,
+			   nmp->nm_acdirmin, nmp->nm_acdirmax);
 
 	if (nfs_acdebug)
-		printf("nfs_getattrcache: age = %d; final timeo = %d\n",
-			(time_second - np->n_attrstamp), timeo);
+		nfs_printf("nfs_getattrcache: age = %d; final timeo = %d\n",
+			   (time_second - np->n_attrstamp), timeo);
 #endif
 
 	if ((time_second - np->n_attrstamp) >= timeo) {
 		nfsstats.attrcache_misses++;
-		return (ENOENT);
+		mtx_unlock(&np->n_mtx);
+		return (ENOENT);
 	}
 	nfsstats.attrcache_hits++;
 	if (vap->va_size != np->n_size) {
@@ -701,6 +803,10 @@
 		if (np->n_flag & NUPD)
 			vaper->va_mtime = np->n_mtim;
 	}
+	mtx_unlock(&np->n_mtx);
+#ifdef NFS_ACDEBUG
+	mtx_unlock(&Giant);	/* nfs_printf() */
+#endif
 	return (0);
 }
 
@@ -714,7 +820,8 @@
 {
 	struct nfsdmap *dp, *dp2;
 	int pos;
-
+	nfsuint64 *retval = NULL;
+	
 	pos = (uoff_t)off / NFS_DIRBLKSIZ;
 	if (pos == 0 || off < 0) {
 #ifdef DIAGNOSTIC
@@ -732,14 +839,14 @@
 			dp->ndm_eocookie = 0;
 			LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list);
 		} else
-			return (NULL);
+			goto out;
 	}
 	while (pos >= NFSNUMCOOKIES) {
 		pos -= NFSNUMCOOKIES;
 		if (LIST_NEXT(dp, ndm_list)) {
 			if (!add && dp->ndm_eocookie < NFSNUMCOOKIES &&
-				pos >= dp->ndm_eocookie)
-				return (NULL);
+			    pos >= dp->ndm_eocookie)
+				goto out;
 			dp = LIST_NEXT(dp, ndm_list);
 		} else if (add) {
 			MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap),
@@ -748,15 +855,17 @@
 			LIST_INSERT_AFTER(dp, dp2, ndm_list);
 			dp = dp2;
 		} else
-			return (NULL);
+			goto out;
 	}
 	if (pos >= dp->ndm_eocookie) {
 		if (add)
 			dp->ndm_eocookie = pos + 1;
 		else
-			return (NULL);
+			goto out;
 	}
-	return (&dp->ndm_cookies[pos]);
+	retval = &dp->ndm_cookies[pos];
+out:
+	return (retval);
 }
 
 /*
@@ -773,11 +882,13 @@
 	if (vp->v_type != VDIR)
 		panic("nfs: invaldir not dir");
 #endif
+	nfs_dircookie_lock(np);
 	np->n_direofoffset = 0;
 	np->n_cookieverf.nfsuquad[0] = 0;
 	np->n_cookieverf.nfsuquad[1] = 0;
 	if (LIST_FIRST(&np->n_cookies))
 		LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0;
+	nfs_dircookie_unlock(np);
 }
 
 /*
@@ -797,8 +908,6 @@
 	struct buf *bp, *nbp;
 	int s;
 
-	GIANT_REQUIRED;
-
 	s = splbio();
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, nvp) {
@@ -848,7 +957,7 @@
 		t1 = nfsm_getfh_xx(&ttfhp, &ttfhsize, (v3), md, dpos);
 		if (t1 != 0)
 			return t1;
-		t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp);
+		t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp, LK_EXCLUSIVE);
 		if (t1 != 0)
 			return t1;
 		*v = NFSTOV(ttnp);
@@ -896,7 +1005,7 @@
 
 int
 nfsm_loadattr_xx(struct vnode **v, struct vattr *va, struct mbuf **md,
-    caddr_t *dpos)
+		 caddr_t *dpos)
 {
 	int t1;
 
@@ -910,7 +1019,7 @@
 
 int
 nfsm_postop_attr_xx(struct vnode **v, int *f, struct mbuf **md,
-    caddr_t *dpos)
+		    caddr_t *dpos)
 {
 	u_int32_t *tl;
 	int t1;
@@ -945,9 +1054,11 @@
 		tl = nfsm_dissect_xx(6 * NFSX_UNSIGNED, md, dpos);
 		if (tl == NULL)
 			return EBADRPC;
+		mtx_lock(&(VTONFS(*v))->n_mtx);
 		if (*f)
  			ttretf = (VTONFS(*v)->n_mtime.tv_sec == fxdr_unsigned(u_int32_t, *(tl + 2)) && 
 				  VTONFS(*v)->n_mtime.tv_nsec == fxdr_unsigned(u_int32_t, *(tl + 3))); 
+		mtx_unlock(&(VTONFS(*v))->n_mtx);
 	}
 	t1 = nfsm_postop_attr_xx(v, &ttattrf, md, dpos);
 	if (t1)
Index: nfs_node.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_node.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_node.c -L sys/nfsclient/nfs_node.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_node.c
+++ sys/nfsclient/nfs_node.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_node.c,v 1.76.2.2 2006/03/12 21:50:02 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_node.c,v 1.86 2007/03/13 01:50:26 tegge Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -99,7 +99,7 @@
  * nfsnode structure is returned.
  */
 int
-nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp)
+nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp, int flags)
 {
 	struct thread *td = curthread;	/* XXX */
 	struct nfsnode *np;
@@ -107,27 +107,17 @@
 	struct vnode *nvp;
 	int error;
 	u_int hash;
-	int rsflags;
 	struct nfsmount *nmp;
 	struct nfs_vncmp ncmp;
 
-	/*
-	 * Calculate nfs mount point and figure out whether the rslock should
-	 * be interruptible or not.
-	 */
 	nmp = VFSTONFS(mntp);
-	if (nmp->nm_flag & NFSMNT_INT)
-		rsflags = PCATCH;
-	else
-		rsflags = 0;
-
 	*npp = NULL;
 
 	hash = fnv_32_buf(fhp->fh_bytes, fhsize, FNV1_32_INIT);
 	ncmp.fhsize = fhsize;
 	ncmp.fh = fhp;
 
-	error = vfs_hash_get(mntp, hash, LK_EXCLUSIVE,
+	error = vfs_hash_get(mntp, hash, flags,
 	    td, &nvp, nfs_vncmpf, &ncmp);
 	if (error)
 		return (error);
@@ -158,23 +148,44 @@
 		vp->v_bufobj.bo_ops = &buf_ops_nfs;
 	vp->v_data = np;
 	np->n_vnode = vp;
-	error = vfs_hash_insert(vp, hash, LK_EXCLUSIVE,
+	/* 
+	 * Initialize the mutex even if the vnode is going to be a loser.
+	 * This simplifies the logic in reclaim, which can then unconditionally
+	 * destroy the mutex (in the case of the loser, or if hash_insert happened
+	 * to return an error no special casing is needed).
+	 */
+	mtx_init(&np->n_mtx, "NFSnode lock", NULL, MTX_DEF);
+	/*
+	 * NFS supports recursive and shared locking.
+	 */
+	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
+	vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
+	if (fhsize > NFS_SMALLFH) {
+		MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
+	} else
+		np->n_fhp = &np->n_fh;
+	bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
+	np->n_fhsize = fhsize;
+	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
+	error = insmntque(vp, mntp);
+	if (error != 0) {
+		*npp = NULL;
+		if (np->n_fhsize > NFS_SMALLFH) {
+			FREE((caddr_t)np->n_fhp, M_NFSBIGFH);
+		}
+		mtx_destroy(&np->n_mtx);
+		uma_zfree(nfsnode_zone, np);
+		return (error);
+	}
+	error = vfs_hash_insert(vp, hash, flags, 
 	    td, &nvp, nfs_vncmpf, &ncmp);
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
-		/* vrele() the duplicate allocated here, to get it recycled */
-		vrele(vp);
+		/* vfs_hash_insert() vput()'s the losing vnode */
 		return (0);
 	}
-	if (fhsize > NFS_SMALLFH) {
-		MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
-	} else
-		np->n_fhp = &np->n_fh;
-	bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
-	np->n_fhsize = fhsize;
-	lockinit(&np->n_rslock, PVFS | rsflags, "nfrslk", 0, 0);
 	*npp = np;
 
 	return (0);
@@ -245,8 +256,7 @@
 	if (np->n_fhsize > NFS_SMALLFH) {
 		FREE((caddr_t)np->n_fhp, M_NFSBIGFH);
 	}
-
-	lockdestroy(&np->n_rslock);
+	mtx_destroy(&np->n_mtx);
 	uma_zfree(nfsnode_zone, vp->v_data);
 	vp->v_data = NULL;
 	return (0);
Index: nfsnode.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsnode.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsnode.h -L sys/nfsclient/nfsnode.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsnode.h
+++ sys/nfsclient/nfsnode.h
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfsnode.h	8.9 (Berkeley) 5/14/95
- * $FreeBSD: src/sys/nfsclient/nfsnode.h,v 1.55 2005/03/16 11:28:19 phk Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsnode.h,v 1.60 2007/03/09 04:02:37 mohans Exp $
  */
 
 #ifndef _NFSCLIENT_NFSNODE_H_
@@ -74,6 +74,16 @@
 #define ndm_cookies	ndm_un1.ndmu3_cookies
 #define ndm4_cookies	ndm_un1.ndmu4_cookies
 
+#define n_ac_ts_tid		n_ac_ts.nfs_ac_ts_tid
+#define n_ac_ts_pid		n_ac_ts.nfs_ac_ts_pid
+#define n_ac_ts_syscalls	n_ac_ts.nfs_ac_ts_syscalls
+
+struct nfs_attrcache_timestamp {
+	lwpid_t		nfs_ac_ts_tid;
+	pid_t		nfs_ac_ts_pid;
+	unsigned long	nfs_ac_ts_syscalls;	
+};
+
 /*
  * The nfsnode is the nfs equivalent to ufs's inode. Any similarity
  * is purely coincidental.
@@ -88,6 +98,7 @@
  *     be well aligned and, therefore, tightly packed.
  */
 struct nfsnode {
+	struct mtx 		n_mtx;		/* Protects all of these members */
 	u_quad_t		n_size;		/* Current size of file */
 	u_quad_t		n_brev;		/* Modify rev when cached */
 	u_quad_t		n_lrev;		/* Modify rev for lease */
@@ -120,13 +131,13 @@
 	short			n_fhsize;	/* size in bytes, of fh */
 	short			n_flag;		/* Flag for locking.. */
 	nfsfh_t			n_fh;		/* Small File Handle */
-	struct lock		n_rslock;
 	struct nfs4_fctx	n_rfc;
 	struct nfs4_fctx	n_wfc;
 	u_char			*n_name;	/* leaf name, for v4 OPEN op */
 	uint32_t		n_namelen;
-	daddr_t			ra_expect_lbn;
 	int			n_directio_opens;
+	int                     n_directio_asyncwr;
+	struct nfs_attrcache_timestamp n_ac_ts;
 };
 
 #define n_atim		n_un1.nf_atim
@@ -140,6 +151,8 @@
 /*
  * Flags for n_flag
  */
+#define NFSYNCWAIT      0x0002  /* fsync waiting for all directio async writes
+				  to drain */
 #define	NMODIFIED	0x0004	/* Might have a modified buffer in bio */
 #define	NWRITEERR	0x0008	/* Flag write errors so close will know */
 /* 0x20, 0x40, 0x80 free */
@@ -150,6 +163,7 @@
 #define	NTRUNCATE	0x1000	/* Opened by nfs_setattr() */
 #define	NSIZECHANGED	0x2000  /* File size has changed: need cache inval */
 #define NNONCACHE	0x4000  /* Node marked as noncacheable */
+#define NDIRCOOKIELK	0x8000	/* Lock to serialize access to directory cookies */
 
 /*
  * Convert between nfsnode pointers and vnode pointers
@@ -168,31 +182,6 @@
 
 #if defined(_KERNEL)
 
-/*
- *	nfs_rslock -	Attempt to obtain lock on nfsnode
- *
- *	Attempt to obtain a lock on the passed nfsnode, returning ENOLCK
- *	if the lock could not be obtained due to our having to sleep.  This
- *	function is generally used to lock around code that modifies an
- *	NFS file's size.  In order to avoid deadlocks the lock
- *	should not be obtained while other locks are being held.
- */
-
-static __inline int
-nfs_rslock(struct nfsnode *np, struct thread *td)
-{
-
-        return(lockmgr(&np->n_rslock,
-            LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, td));
-}
-
-static __inline void
-nfs_rsunlock(struct nfsnode *np, struct thread *td)
-{
-
-	(void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, td);
-}
-
 extern	struct vop_vector	nfs_fifoops;
 extern	struct vop_vector	nfs_vnodeops;
 extern	struct vop_vector	nfs4_vnodeops;
@@ -211,11 +200,17 @@
 /* other stuff */
 int	nfs_removeit(struct sillyrename *);
 int	nfs4_removeit(struct sillyrename *);
-int	nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **);
+int	nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **, int flags);
 nfsuint64 *nfs_getcookie(struct nfsnode *, off_t, int);
 uint64_t *nfs4_getcookie(struct nfsnode *, off_t, int);
 void	nfs_invaldir(struct vnode *);
 void	nfs4_invaldir(struct vnode *);
+int	nfs_upgrade_vnlock(struct vnode *vp, struct thread *td);
+void	nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock);
+void	nfs_printf(const char *fmt, ...);
+
+void nfs_dircookie_lock(struct nfsnode *np);
+void nfs_dircookie_unlock(struct nfsnode *np);
 
 #endif /* _KERNEL */
 
Index: nfs_bio.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_bio.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs_bio.c -L sys/nfsclient/nfs_bio.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs_bio.c
+++ sys/nfsclient/nfs_bio.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_bio.c,v 1.152 2005/06/16 15:43:17 green Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_bio.c,v 1.165 2007/09/25 21:08:48 mohans Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -71,6 +71,7 @@
 
 extern int nfs_directio_enable;
 extern int nfs_directio_allow_mmap;
+
 /*
  * Vnode op for VM getpages.
  */
@@ -90,8 +91,6 @@
 	vm_page_t *pages;
 	struct nfsnode *np;
 
-	GIANT_REQUIRED;
-
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
@@ -101,21 +100,28 @@
 	count = ap->a_count;
 
 	if ((object = vp->v_object) == NULL) {
-		printf("nfs_getpages: called with non-merged cache vnode??\n");
+		nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
 		return VM_PAGER_ERROR;
 	}
 
-	if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 
-	    (vp->v_type == VREG)) {
-		printf("nfs_getpages: called on non-cacheable vnode??\n");
-		return VM_PAGER_ERROR;
+	if (nfs_directio_enable && !nfs_directio_allow_mmap) {
+		mtx_lock(&np->n_mtx);
+		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
+			mtx_unlock(&np->n_mtx);
+			nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
+			return VM_PAGER_ERROR;
+		} else
+			mtx_unlock(&np->n_mtx);
 	}
 
+	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
-	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {	
+		mtx_unlock(&nmp->nm_mtx);
 		/* We'll never get here for v4, because we always have fsinfo */
 		(void)nfs_fsinfo(nmp, vp, cred, td);
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 
 	npages = btoc(count);
 
@@ -153,8 +159,8 @@
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
-	cnt.v_vnodein++;
-	cnt.v_vnodepgsin += npages;
+	PCPU_INC(cnt.v_vnodein);
+	PCPU_ADD(cnt.v_vnodepgsin, npages);
 
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
@@ -172,7 +178,7 @@
 	relpbuf(bp, &nfs_pbuf_freecnt);
 
 	if (error && (uio.uio_resid == count)) {
-		printf("nfs_getpages: error %d\n", error);
+		nfs_printf("nfs_getpages: error %d\n", error);
 		VM_OBJECT_LOCK(object);
 		vm_page_lock_queues();
 		for (i = 0; i < npages; ++i) {
@@ -234,7 +240,7 @@
 			 * now tell them that it is ok to use.
 			 */
 			if (!error) {
-				if (m->flags & PG_WANTED)
+				if (m->oflags & VPO_WANTED)
 					vm_page_activate(m);
 				else
 					vm_page_deactivate(m);
@@ -269,8 +275,6 @@
 	struct nfsnode *np;
 	vm_page_t *pages;
 
-	GIANT_REQUIRED;
-
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
@@ -281,15 +285,22 @@
 	rtvals = ap->a_rtvals;
 	npages = btoc(count);
 	offset = IDX_TO_OFF(pages[0]->pindex);
-
+	
+	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+		mtx_unlock(&nmp->nm_mtx);
 		(void)nfs_fsinfo(nmp, vp, cred, td);
-	}
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 
-	if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 
-	    (vp->v_type == VREG))
-		printf("nfs_putpages: called on noncache-able vnode??\n");
+	mtx_lock(&np->n_mtx);
+	if (nfs_directio_enable && !nfs_directio_allow_mmap && 
+	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
+		mtx_unlock(&np->n_mtx);		
+		nfs_printf("nfs_putpages: called on noncache-able vnode??\n");
+		mtx_lock(&np->n_mtx);
+	}
 
 	for (i = 0; i < npages; i++)
 		rtvals[i] = VM_PAGER_AGAIN;
@@ -297,12 +308,12 @@
 	/*
 	 * When putting pages, do not extend file past EOF.
 	 */
-
 	if (offset + count > np->n_size) {
 		count = np->n_size - offset;
 		if (count < 0)
 			count = 0;
 	}
+	mtx_unlock(&np->n_mtx);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
@@ -312,8 +323,8 @@
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
-	cnt.v_vnodeout++;
-	cnt.v_vnodepgsout += count;
+	PCPU_INC(cnt.v_vnodeout);
+	PCPU_ADD(cnt.v_vnodepgsout, count);
 
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
@@ -349,6 +360,81 @@
 }
 
 /*
+ * For nfs, cache consistency can only be maintained approximately.
+ * Although RFC1094 does not specify the criteria, the following is
+ * believed to be compatible with the reference port.
+ * For nfs:
+ * If the file's modify time on the server has changed since the
+ * last read rpc or you have written to the file,
+ * you may have lost data cache consistency with the
+ * server, so flush all of the file's data out of the cache.
+ * Then force a getattr rpc to ensure that you have up to date
+ * attributes.
+ * NB: This implies that cache data can be read when up to
+ * NFS_ATTRTIMEO seconds out of date. If you find that you need current
+ * attributes this could be forced by setting n_attrstamp to 0 before
+ * the VOP_GETATTR() call.
+ */
+static inline int
+nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
+{
+	int error = 0;
+	struct vattr vattr;
+	struct nfsnode *np = VTONFS(vp);
+	int old_lock;
+	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
+	
+	/*
+	 * Grab the exclusive lock before checking whether the cache is
+	 * consistent.
+	 * XXX - We can make this cheaper later (by acquiring cheaper locks).
+	 * But for now, this suffices.
+	 */
+	old_lock = nfs_upgrade_vnlock(vp, td);
+	mtx_lock(&np->n_mtx);
+	if (np->n_flag & NMODIFIED) {
+		mtx_unlock(&np->n_mtx);
+		if (vp->v_type != VREG) {
+			if (vp->v_type != VDIR)
+				panic("nfs: bioread, not dir");
+			(nmp->nm_rpcops->nr_invaldir)(vp);
+			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
+			if (error)
+				goto out;
+		}
+		np->n_attrstamp = 0;
+		error = VOP_GETATTR(vp, &vattr, cred, td);
+		if (error)
+			goto out;
+		mtx_lock(&np->n_mtx);
+		np->n_mtime = vattr.va_mtime;
+		mtx_unlock(&np->n_mtx);
+	} else {
+		mtx_unlock(&np->n_mtx);
+		error = VOP_GETATTR(vp, &vattr, cred, td);
+		if (error)
+			return (error);
+		mtx_lock(&np->n_mtx);
+		if ((np->n_flag & NSIZECHANGED)
+		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
+			mtx_unlock(&np->n_mtx);
+			if (vp->v_type == VDIR)
+				(nmp->nm_rpcops->nr_invaldir)(vp);
+			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
+			if (error)
+				goto out;
+			mtx_lock(&np->n_mtx);
+			np->n_mtime = vattr.va_mtime;
+			np->n_flag &= ~NSIZECHANGED;
+		}
+		mtx_unlock(&np->n_mtx);
+	}
+out:	
+	nfs_downgrade_vnlock(vp, td, old_lock);
+	return error;
+}
+
+/*
  * Vnode op for read using bio
  */
 int
@@ -357,7 +443,6 @@
 	struct nfsnode *np = VTONFS(vp);
 	int biosize, i;
 	struct buf *bp, *rabp;
-	struct vattr vattr;
 	struct thread *td;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn, rabn;
@@ -375,9 +460,14 @@
 		return (EINVAL);
 	td = uio->uio_td;
 
+	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
-	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
+	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+		mtx_unlock(&nmp->nm_mtx);
 		(void)nfs_fsinfo(nmp, vp, cred, td);
+	} else
+		mtx_unlock(&nmp->nm_mtx);		
+
 	if (vp->v_type != VDIR &&
 	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 		return (EFBIG);
@@ -388,52 +478,18 @@
 
 	biosize = vp->v_mount->mnt_stat.f_iosize;
 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
-	/*
-	 * For nfs, cache consistency can only be maintained approximately.
-	 * Although RFC1094 does not specify the criteria, the following is
-	 * believed to be compatible with the reference port.
-	 * For nfs:
-	 * If the file's modify time on the server has changed since the
-	 * last read rpc or you have written to the file,
-	 * you may have lost data cache consistency with the
-	 * server, so flush all of the file's data out of the cache.
-	 * Then force a getattr rpc to ensure that you have up to date
-	 * attributes.
-	 * NB: This implies that cache data can be read when up to
-	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
-	 * attributes this could be forced by setting n_attrstamp to 0 before
-	 * the VOP_GETATTR() call.
-	 */
-	if (np->n_flag & NMODIFIED) {
-		if (vp->v_type != VREG) {
-			if (vp->v_type != VDIR)
-				panic("nfs: bioread, not dir");
-			(nmp->nm_rpcops->nr_invaldir)(vp);
-			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
-			if (error)
-				return (error);
-		}
-		np->n_attrstamp = 0;
-		error = VOP_GETATTR(vp, &vattr, cred, td);
-		if (error)
-			return (error);
-		np->n_mtime = vattr.va_mtime;
-	} else {
-		error = VOP_GETATTR(vp, &vattr, cred, td);
-		if (error)
-			return (error);
-		if ((np->n_flag & NSIZECHANGED)
-		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
-			if (vp->v_type == VDIR)
-				(nmp->nm_rpcops->nr_invaldir)(vp);
-			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
-			if (error)
-				return (error);
-			np->n_mtime = vattr.va_mtime;
-			np->n_flag &= ~NSIZECHANGED;
-		}
-	}
+	
+	error = nfs_bioread_check_cons(vp, td, cred);
+	if (error)
+		return error;
+
 	do {
+	    u_quad_t nsize;
+			
+	    mtx_lock(&np->n_mtx);
+	    nsize = np->n_size;
+	    mtx_unlock(&np->n_mtx);		    
+
 	    switch (vp->v_type) {
 	    case VREG:
 		nfsstats.biocache_reads++;
@@ -442,12 +498,10 @@
 
 		/*
 		 * Start the read ahead(s), as required.
-		 * The readahead is kicked off only if sequential access
-		 * is detected, based on the readahead hint (ra_expect_lbn).
 		 */
-		if (nmp->nm_readahead > 0 && np->ra_expect_lbn == lbn) {
+		if (nmp->nm_readahead > 0) {
 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
-			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
+			(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
 			rabn = lbn + 1 + nra;
 			if (incore(&vp->v_bufobj, rabn) == NULL) {
 			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
@@ -471,48 +525,17 @@
 			    }
 			}
 		    }
-		    np->ra_expect_lbn = lbn + 1;
 		}
 
-		/*
-		 * Obtain the buffer cache block.  Figure out the buffer size
-		 * when we are at EOF.  If we are modifying the size of the
-		 * buffer based on an EOF condition we need to hold
-		 * nfs_rslock() through obtaining the buffer to prevent
-		 * a potential writer-appender from messing with n_size.
-		 * Otherwise we may accidently truncate the buffer and
-		 * lose dirty data.
-		 *
-		 * Note that bcount is *not* DEV_BSIZE aligned.
-		 */
-
-again:
+		/* Note that bcount is *not* DEV_BSIZE aligned. */
 		bcount = biosize;
-		if ((off_t)lbn * biosize >= np->n_size) {
+		if ((off_t)lbn * biosize >= nsize) {
 			bcount = 0;
-		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
-			bcount = np->n_size - (off_t)lbn * biosize;
+		} else if ((off_t)(lbn + 1) * biosize > nsize) {
+			bcount = nsize - (off_t)lbn * biosize;
 		}
-		if (bcount != biosize) {
-			switch(nfs_rslock(np, td)) {
-			case ENOLCK:
-				goto again;
-				/* not reached */
-			case EIO:
-				return (EIO);
-			case EINTR:
-			case ERESTART:
-				return(EINTR);
-				/* not reached */
-			default:
-				break;
-			}
-		}
-
 		bp = nfs_getcacheblk(vp, lbn, bcount, td);
 
-		if (bcount != biosize)
-			nfs_rsunlock(np, td);
 		if (!bp) {
 			error = nfs_sigintr(nmp, NULL, td);
 			return (error ? error : EINTR);
@@ -681,7 +704,7 @@
 			n = np->n_direofoffset - uio->uio_offset;
 		break;
 	    default:
-		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
+		nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
 		bp = NULL;
 		break;
 	    };
@@ -719,14 +742,18 @@
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct thread *td = uiop->uio_td;
 	int size;
-
+	int wsize;
+	
+	mtx_lock(&nmp->nm_mtx);
+	wsize = nmp->nm_wsize;
+	mtx_unlock(&nmp->nm_mtx);
 	if (ioflag & IO_SYNC) {
 		int iomode, must_commit;
 		struct uio uio;
 		struct iovec iov;
 do_sync:
 		while (uiop->uio_resid > 0) {
-			size = min(uiop->uio_resid, nmp->nm_wsize);
+			size = min(uiop->uio_resid, wsize);
 			size = min(uiop->uio_iov->iov_len, size);
 			iov.iov_base = uiop->uio_iov->iov_base;
 			iov.iov_len = size;
@@ -775,7 +802,7 @@
 		 * in NFS directio access.
 		 */
 		while (uiop->uio_resid > 0) {
-			size = min(uiop->uio_resid, nmp->nm_wsize);
+			size = min(uiop->uio_resid, wsize);
 			size = min(uiop->uio_iov->iov_len, size);
 			bp = getpbuf(&nfs_pbuf_freecnt);
 			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
@@ -799,13 +826,11 @@
 				bp->b_wcred = NOCRED;			
 			bp->b_caller1 = (void *)t_uio;
 			bp->b_vp = vp;
-			vhold(vp);
 			error = nfs_asyncio(nmp, bp, NOCRED, td);
 			if (error) {
 				free(t_iov->iov_base, M_NFSDIRECTIO);
 				free(t_iov, M_NFSDIRECTIO);
 				free(t_uio, M_NFSDIRECTIO);
-				vdrop(bp->b_vp);
 				bp->b_vp = NULL;
 				relpbuf(bp, &nfs_pbuf_freecnt);
 				if (error == EINTR)
@@ -846,11 +871,8 @@
 	daddr_t lbn;
 	int bcount;
 	int n, on, error = 0;
-	int haverslock = 0;
 	struct proc *p = td?td->td_proc:NULL;
 
-	GIANT_REQUIRED;
-
 #ifdef DIAGNOSTIC
 	if (uio->uio_rw != UIO_WRITE)
 		panic("nfs_write mode");
@@ -859,20 +881,29 @@
 #endif
 	if (vp->v_type != VREG)
 		return (EIO);
+	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
+		mtx_unlock(&np->n_mtx);
 		return (np->n_error);
-	}
+	} else
+		mtx_unlock(&np->n_mtx);
+	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
-	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
+	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+		mtx_unlock(&nmp->nm_mtx);
 		(void)nfs_fsinfo(nmp, vp, cred, td);
+	} else
+		mtx_unlock(&nmp->nm_mtx);
 
 	/*
 	 * Synchronously flush pending buffers if we are in synchronous
 	 * mode or if we are appending.
 	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
+		mtx_lock(&np->n_mtx);
 		if (np->n_flag & NMODIFIED) {
+			mtx_unlock(&np->n_mtx);
 #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
 			/*
 			 * Require non-blocking, synchronous writes to
@@ -887,20 +918,22 @@
 			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				return (error);
-		}
+		} else
+			mtx_unlock(&np->n_mtx);
 	}
 
 	/*
 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
 	 * get the append lock.
 	 */
-restart:
 	if (ioflag & IO_APPEND) {
 		np->n_attrstamp = 0;
 		error = VOP_GETATTR(vp, &vattr, cred, td);
 		if (error)
 			return (error);
+		mtx_lock(&np->n_mtx);
 		uio->uio_offset = np->n_size;
+		mtx_unlock(&np->n_mtx);
 	}
 
 	if (uio->uio_offset < 0)
@@ -914,38 +947,6 @@
 		return nfs_directio_write(vp, uio, cred, ioflag);
 
 	/*
-	 * We need to obtain the rslock if we intend to modify np->n_size
-	 * in order to guarentee the append point with multiple contending
-	 * writers, to guarentee that no other appenders modify n_size
-	 * while we are trying to obtain a truncated buffer (i.e. to avoid
-	 * accidently truncating data written by another appender due to
-	 * the race), and to ensure that the buffer is populated prior to
-	 * our extending of the file.  We hold rslock through the entire
-	 * operation.
-	 *
-	 * Note that we do not synchronize the case where someone truncates
-	 * the file while we are appending to it because attempting to lock
-	 * this case may deadlock other parts of the system unexpectedly.
-	 */
-	if ((ioflag & IO_APPEND) ||
-	    uio->uio_offset + uio->uio_resid > np->n_size) {
-		switch(nfs_rslock(np, td)) {
-		case ENOLCK:
-			goto restart;
-			/* not reached */
-		case EIO:
-			return (EIO);
-		case EINTR:
-		case ERESTART:
-			return(EINTR);
-			/* not reached */
-		default:
-			break;
-		}
-		haverslock = 1;
-	}
-
-	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, i don't think it matters
 	 */
@@ -955,8 +956,6 @@
 		    lim_cur(p, RLIMIT_FSIZE)) {
 			psignal(p, SIGXFSZ);
 			PROC_UNLOCK(p);
-			if (haverslock)
-				nfs_rsunlock(np, td);
 			return (EFBIG);
 		}
 		PROC_UNLOCK(p);
@@ -972,6 +971,11 @@
 	 * no point optimizing for something that really won't ever happen.
 	 */
 	if (!(ioflag & IO_SYNC)) {
+		int nflag;
+
+		mtx_lock(&np->n_mtx);
+		nflag = np->n_flag;
+		mtx_unlock(&np->n_mtx);		
 		int needrestart = 0;
 		if (nmp->nm_wcommitsize < uio->uio_resid) {
 			/*
@@ -983,9 +987,9 @@
 			if (ioflag & IO_NDELAY)
 				return (EAGAIN);
 			ioflag |= IO_SYNC;
-			if (np->n_flag & NMODIFIED)
+			if (nflag & NMODIFIED)
 				needrestart = 1;
-		} else if (np->n_flag & NMODIFIED) {
+		} else if (nflag & NMODIFIED) {
 			int wouldcommit = 0;
 			BO_LOCK(&vp->v_bufobj);
 			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
@@ -1012,13 +1016,8 @@
 			if (wouldcommit > nmp->nm_wcommitsize)
 				needrestart = 1;
 		}
-		if (needrestart) {
-			if (haverslock) {
-				nfs_rsunlock(np, td);
-				haverslock = 0;
-			}
+		if (needrestart)
 			goto flush_and_restart;
-		}
 	}
 
 	do {
@@ -1031,8 +1030,9 @@
 		 * Handle direct append and file extension cases, calculate
 		 * unaligned buffer size.
 		 */
-
+		mtx_lock(&np->n_mtx);
 		if (uio->uio_offset == np->n_size && n) {
+			mtx_unlock(&np->n_mtx);
 			/*
 			 * Get the buffer (in its pre-append state to maintain
 			 * B_CACHE if it was previously set).  Resize the
@@ -1045,9 +1045,11 @@
 			if (bp != NULL) {
 				long save;
 
+				mtx_lock(&np->n_mtx);
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
+				mtx_unlock(&np->n_mtx);
 
 				save = bp->b_flags & B_CACHE;
 				bcount += n;
@@ -1066,12 +1068,15 @@
 				else
 					bcount = np->n_size - (off_t)lbn * biosize;
 			}
+			mtx_unlock(&np->n_mtx);
 			bp = nfs_getcacheblk(vp, lbn, bcount, td);
+			mtx_lock(&np->n_mtx);
 			if (uio->uio_offset + n > np->n_size) {
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
+			mtx_unlock(&np->n_mtx);
 		}
 
 		if (!bp) {
@@ -1117,7 +1122,9 @@
 		}
 		if (bp->b_wcred == NOCRED)
 			bp->b_wcred = crhold(cred);
+		mtx_lock(&np->n_mtx);
 		np->n_flag |= NMODIFIED;
+		mtx_unlock(&np->n_mtx);
 
 		/*
 		 * If dirtyend exceeds file size, chop it down.  This should
@@ -1129,7 +1136,7 @@
 		 */
 
 		if (bp->b_dirtyend > bcount) {
-			printf("NFS append race @%lx:%d\n",
+			nfs_printf("NFS append race @%lx:%d\n",
 			    (long)bp->b_blkno * DEV_BSIZE,
 			    bp->b_dirtyend - bcount);
 			bp->b_dirtyend = bcount;
@@ -1209,15 +1216,12 @@
 				break;
 		} else if ((n + on) == biosize) {
 			bp->b_flags |= B_ASYNC;
-			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0);
+			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
 		} else {
 			bdwrite(bp);
 		}
 	} while (uio->uio_resid > 0 && n > 0);
 
-	if (haverslock)
-		nfs_rsunlock(np, td);
-
 	return (error);
 }
 
@@ -1302,34 +1306,35 @@
 		slptimeo = 0;
 	}
 
- 	if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
- 		if (old_lock == LK_SHARED) {
- 			/* Upgrade to exclusive lock, this might block */
- 			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
- 		} else {
- 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- 		}
-  	}
-
+	old_lock = nfs_upgrade_vnlock(vp, td);
 	/*
 	 * Now, flush as required.
 	 */
+	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
+		VM_OBJECT_LOCK(vp->v_bufobj.bo_object);
+		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
+		VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object);
+		/*
+		 * If the page clean was interrupted, fail the invalidation.
+		 * Not doing so, we run the risk of losing dirty pages in the 
+		 * vinvalbuf() call below.
+		 */
+		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
+			goto out;
+	}
+
 	error = vinvalbuf(vp, flags, td, slpflag, 0);
 	while (error) {
 		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
 			goto out;
 		error = vinvalbuf(vp, flags, td, 0, slptimeo);
 	}
-	np->n_flag &= ~NMODIFIED;
+	mtx_lock(&np->n_mtx);
+	if (np->n_directio_asyncwr == 0)
+		np->n_flag &= ~NMODIFIED;
+	mtx_unlock(&np->n_mtx);
 out:
- 	if (old_lock != LK_EXCLUSIVE) {
- 		if (old_lock == LK_SHARED) {
- 			/* Downgrade from exclusive lock, this might block */
- 			vn_lock(vp, LK_DOWNGRADE, td);
- 		} else {
- 			VOP_UNLOCK(vp, 0, td);
- 		}
-  	}
+	nfs_downgrade_vnlock(vp, td, old_lock);
 	return error;
 }
 
@@ -1355,11 +1360,12 @@
 	 * leave the async daemons for more important rpc's (such as reads
 	 * and writes).
 	 */
+	mtx_lock(&nfs_iod_mtx);
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
 	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
+		mtx_unlock(&nfs_iod_mtx);
 		return(EIO);
 	}
-
 again:
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
@@ -1422,12 +1428,15 @@
 			NFS_DPF(ASYNCIO,
 				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
 			nmp->nm_bufqwant = TRUE;
- 			error = nfs_tsleep(td, &nmp->nm_bufq, slpflag | PRIBIO,
+ 			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx, 
+					   slpflag | PRIBIO,
  					   "nfsaio", slptimeo);
 			if (error) {
 				error2 = nfs_sigintr(nmp, NULL, td);
-				if (error2)
+				if (error2) {
+					mtx_unlock(&nfs_iod_mtx);					
 					return (error2);
+				}
 				if (slpflag == PCATCH) {
 					slpflag = 0;
 					slptimeo = 2 * hz;
@@ -1444,6 +1453,13 @@
 			}
 		}
 
+		/* We might have lost our nfsiod */
+		if (nmp->nm_bufqiods == 0) {
+			NFS_DPF(ASYNCIO,
+				("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
+			goto again;
+		}
+
 		if (bp->b_iocmd == BIO_READ) {
 			if (bp->b_rcred == NOCRED && cred != NOCRED)
 				bp->b_rcred = crhold(cred);
@@ -1457,9 +1473,18 @@
 		BUF_KERNPROC(bp);
 		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 		nmp->nm_bufqlen++;
+		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
+			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);			
+			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
+			VTONFS(bp->b_vp)->n_directio_asyncwr++;
+			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
+		}
+		mtx_unlock(&nfs_iod_mtx);
 		return (0);
 	}
 
+	mtx_unlock(&nfs_iod_mtx);
+
 	/*
 	 * All the iods are busy on other mounts, so return EIO to
 	 * force the caller to process the i/o synchronously.
@@ -1483,7 +1508,19 @@
 	free(iov_base, M_NFSDIRECTIO);
 	free(uiop->uio_iov, M_NFSDIRECTIO);
 	free(uiop, M_NFSDIRECTIO);
-	vdrop(bp->b_vp);
+	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
+		struct nfsnode *np = VTONFS(bp->b_vp);
+		mtx_lock(&np->n_mtx);
+		np->n_directio_asyncwr--;
+		if (np->n_directio_asyncwr == 0) {
+			VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
+			if ((np->n_flag & NFSYNCWAIT)) {
+				np->n_flag &= ~NFSYNCWAIT;
+				wakeup((caddr_t)&np->n_directio_asyncwr);
+			}
+		}
+		mtx_unlock(&np->n_mtx);
+	}
 	bp->b_vp = NULL;
 	relpbuf(bp, &nfs_pbuf_freecnt);
 }
@@ -1502,7 +1539,8 @@
 	struct uio uio;
 	struct iovec io;
 	struct proc *p = td ? td->td_proc : NULL;
-
+	uint8_t	iocmd;
+	
 	np = VTONFS(vp);
 	nmp = VFSTONFS(vp->v_mount);
 	uiop = &uio;
@@ -1520,8 +1558,8 @@
 	bp->b_ioflags &= ~BIO_ERROR;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
-
-	if (bp->b_iocmd == BIO_READ) {
+	iocmd = bp->b_iocmd;
+	if (iocmd == BIO_READ) {
 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
 	    io.iov_base = bp->b_data;
 	    uiop->uio_rw = UIO_READ;
@@ -1551,11 +1589,15 @@
 		    }
 		}
 		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
-		if (p && (vp->v_vflag & VV_TEXT) &&
-		    (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime))) {
-			PROC_LOCK(p);
-			killproc(p, "text file modification");
-			PROC_UNLOCK(p);
+		if (p && (vp->v_vflag & VV_TEXT)) {
+			mtx_lock(&np->n_mtx);
+			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
+				mtx_unlock(&np->n_mtx);
+				PROC_LOCK(p);
+				killproc(p, "text file modification");
+				PROC_UNLOCK(p);
+			} else
+				mtx_unlock(&np->n_mtx);
 		}
 		break;
 	    case VLNK:
@@ -1585,7 +1627,7 @@
 			bp->b_flags |= B_INVAL;
 		break;
 	    default:
-		printf("nfs_doio:  type %x unexpected\n", vp->v_type);
+		nfs_printf("nfs_doio:  type %x unexpected\n", vp->v_type);
 		break;
 	    };
 	    if (error) {
@@ -1619,9 +1661,10 @@
 	    /*
 	     * Setup for actual write
 	     */
-
+	    mtx_lock(&np->n_mtx);
 	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
 		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
+	    mtx_unlock(&np->n_mtx);
 
 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
 		io.iov_len = uiop->uio_resid = bp->b_dirtyend
@@ -1678,8 +1721,21 @@
 		 * the vp's paging queues so we cannot call bdirty().  The
 		 * bp in this case is not an NFS cache block so we should
 		 * be safe. XXX
+		 *
+		 * The logic below breaks up errors into recoverable and 
+		 * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
+		 * and keep the buffer around for potential write retries.
+		 * For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
+		 * and save the error in the nfsnode. This is less than ideal 
+		 * but necessary. Keeping such buffers around could potentially
+		 * cause buffer exhaustion eventually (they can never be written
+		 * out, so they will constantly get re-dirtied). It also causes
+		 * all sorts of vfs panics. For non-recoverable write errors, 
+		 * also invalidate the attrcache, so we'll be forced to go over
+		 * the wire for this object, returning an error to user on next
+		 * call (most of the time).
 		 */
-    		if (error == EINTR || error == EIO
+    		if (error == EINTR || error == EIO || error == ETIMEDOUT
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
 			int s;
 
@@ -1695,8 +1751,12 @@
 	    	} else {
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
+			bp->b_flags |= B_INVAL;
 			bp->b_error = np->n_error = error;
+			mtx_lock(&np->n_mtx);
 			np->n_flag |= NWRITEERR;
+			np->n_attrstamp = 0;
+			mtx_unlock(&np->n_mtx);
 		    }
 		    bp->b_dirtyoff = bp->b_dirtyend = 0;
 		}
@@ -1725,13 +1785,16 @@
 nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
 {
 	struct nfsnode *np = VTONFS(vp);
-	u_quad_t tsize = np->n_size;
+	u_quad_t tsize;
 	int biosize = vp->v_mount->mnt_stat.f_iosize;
 	int error = 0;
 
+	mtx_lock(&np->n_mtx);
+	tsize = np->n_size;
 	np->n_size = nsize;
+	mtx_unlock(&np->n_mtx);
 
-	if (np->n_size < tsize) {
+	if (nsize < tsize) {
 		struct buf *bp;
 		daddr_t lbn;
 		int bufsize;
Index: nfs_nfsiod.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_nfsiod.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs_nfsiod.c -L sys/nfsclient/nfs_nfsiod.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs_nfsiod.c
+++ sys/nfsclient/nfs_nfsiod.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_nfsiod.c,v 1.86 2005/02/07 18:21:50 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_nfsiod.c,v 1.91 2007/09/25 21:08:49 mohans Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -74,7 +74,7 @@
 #include <nfsclient/nfsnode.h>
 #include <nfsclient/nfs_lock.h>
 
-static MALLOC_DEFINE(M_NFSSVC, "NFS srvsock", "Nfs server structure");
+static MALLOC_DEFINE(M_NFSSVC, "nfsclient_srvsock", "Nfs server structure");
 
 static void	nfssvc_iod(void *);
 
@@ -90,7 +90,7 @@
 unsigned int nfs_iodmax = 20;
 
 /* Minimum number of nfsiod kthreads to keep as spares */
-static unsigned int nfs_iodmin = 4;
+static unsigned int nfs_iodmin = 0;
 
 static int
 sysctl_iodmin(SYSCTL_HANDLER_ARGS)
@@ -102,17 +102,22 @@
 	error = sysctl_handle_int(oidp, &newmin, 0, req);
 	if (error || (req->newptr == NULL))
 		return (error);
-	if (newmin > nfs_iodmax)
-		return (EINVAL);
+	mtx_lock(&nfs_iod_mtx);
+	if (newmin > nfs_iodmax) {
+		error = EINVAL;
+		goto out;
+	}
 	nfs_iodmin = newmin;
 	if (nfs_numasync >= nfs_iodmin)
-		return (0);
+		goto out;
 	/*
 	 * If the current number of nfsiod is lower
 	 * than the new minimum, create some more.
 	 */
 	for (i = nfs_iodmin - nfs_numasync; i > 0; i--)
 		nfs_nfsiodnew();
+out:
+	mtx_unlock(&nfs_iod_mtx);	
 	return (0);
 }
 SYSCTL_PROC(_vfs_nfs, OID_AUTO, iodmin, CTLTYPE_UINT | CTLFLAG_RW, 0,
@@ -131,9 +136,10 @@
 		return (error);
 	if (newmax > NFS_MAXASYNCDAEMON)
 		return (EINVAL);
+	mtx_lock(&nfs_iod_mtx);
 	nfs_iodmax = newmax;
 	if (nfs_numasync <= nfs_iodmax)
-		return (0);
+		goto out;
 	/*
 	 * If there are some asleep nfsiods that should
 	 * exit, wakeup() them so that they check nfs_iodmax
@@ -146,6 +152,8 @@
 			wakeup(&nfs_iodwant[iod]);
 		iod--;
 	}
+out:
+	mtx_unlock(&nfs_iod_mtx);
 	return (0);
 }
 SYSCTL_PROC(_vfs_nfs, OID_AUTO, iodmax, CTLTYPE_UINT | CTLFLAG_RW, 0,
@@ -168,8 +176,10 @@
 		}
 	if (newiod == -1)
 		return (-1);
+	mtx_unlock(&nfs_iod_mtx);
 	error = kthread_create(nfssvc_iod, nfs_asyncdaemon + i, NULL, RFHIGHPID,
 	    0, "nfsiod %d", newiod);
+	mtx_lock(&nfs_iod_mtx);
 	if (error)
 		return (-1);
 	nfs_numasync++;
@@ -183,6 +193,7 @@
 	int error;
 
 	TUNABLE_INT_FETCH("vfs.nfs.iodmin", &nfs_iodmin);
+	mtx_lock(&nfs_iod_mtx);
 	/* Silently limit the start number of nfsiod's */
 	if (nfs_iodmin > NFS_MAXASYNCDAEMON)
 		nfs_iodmin = NFS_MAXASYNCDAEMON;
@@ -192,6 +203,7 @@
 		if (error == -1)
 			panic("nfsiod_setup: nfs_nfsiodnew failed");
 	}
+	mtx_unlock(&nfs_iod_mtx);
 }
 SYSINIT(nfsiod, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, nfsiod_setup, NULL);
 
@@ -211,15 +223,14 @@
 	int myiod, timo;
 	int error = 0;
 
-	mtx_lock(&Giant);
+	mtx_lock(&nfs_iod_mtx);
 	myiod = (int *)instance - nfs_asyncdaemon;
 	/*
 	 * Main loop
 	 */
 	for (;;) {
-	    while (((nmp = nfs_iodmount[myiod]) == NULL
-		   || !TAILQ_FIRST(&nmp->nm_bufq))
-		   && error == 0) {
+	    while (((nmp = nfs_iodmount[myiod]) == NULL)
+		   || !TAILQ_FIRST(&nmp->nm_bufq)) {
 		if (myiod >= nfs_iodmax)
 			goto finish;
 		if (nmp)
@@ -230,12 +241,25 @@
 		 * Always keep at least nfs_iodmin kthreads.
 		 */
 		timo = (myiod < nfs_iodmin) ? 0 : nfs_iodmaxidle * hz;
-		error = tsleep(&nfs_iodwant[myiod], PWAIT | PCATCH,
+		error = msleep(&nfs_iodwant[myiod], &nfs_iod_mtx, PWAIT | PCATCH,
 		    "-", timo);
+		if (error) {
+			nmp = nfs_iodmount[myiod];
+			/*
+			 * Rechecking the nm_bufq closes a rare race where the 
+			 * nfsiod is woken up at the exact time the idle timeout
+			 * fires
+			 */
+			if (nmp && TAILQ_FIRST(&nmp->nm_bufq))
+				error = 0;
+			break;
+		}
 	    }
 	    if (error)
 		    break;
 	    while ((bp = TAILQ_FIRST(&nmp->nm_bufq)) != NULL) {
+	        int giant_locked = 0;
+		    
 		/* Take one off the front of the list */
 		TAILQ_REMOVE(&nmp->nm_bufq, bp, b_freelist);
 		nmp->nm_bufqlen--;
@@ -243,6 +267,11 @@
 		    nmp->nm_bufqwant = 0;
 		    wakeup(&nmp->nm_bufq);
 		}
+		mtx_unlock(&nfs_iod_mtx);
+		if (NFS_ISV4(bp->b_vp)) {
+			giant_locked = 1;
+			mtx_lock(&Giant);
+		}
 		if (bp->b_flags & B_DIRECT) {
 			KASSERT((bp->b_iocmd == BIO_WRITE), ("nfscvs_iod: BIO_WRITE not set"));
 			(void)nfs_doio_directwrite(bp);
@@ -252,7 +281,9 @@
 			else
 				(void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL);
 		}
-
+		if (giant_locked)
+			mtx_unlock(&Giant);
+		mtx_lock(&nfs_iod_mtx);
 		/*
 		 * If there are more than one iod on this mount, then defect
 		 * so that the iods can be shared out fairly between the mounts
@@ -276,7 +307,7 @@
 	/* Someone may be waiting for the last nfsiod to terminate. */
 	if (--nfs_numasync == 0)
 		wakeup(&nfs_numasync);
-	mtx_unlock(&Giant);
+	mtx_unlock(&nfs_iod_mtx);
 	if ((error == 0) || (error == EWOULDBLOCK))
 		kthread_exit(0);
 	/* Abnormal termination */
Index: nlminfo.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nlminfo.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nlminfo.h -L sys/nfsclient/nlminfo.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nlminfo.h
+++ sys/nfsclient/nlminfo.h
@@ -25,7 +25,7 @@
  * SUCH DAMAGE.
  *
  *      from BSDI nlminfo.h,v 2.1 1998/03/18 01:30:38 don Exp
- * $FreeBSD: src/sys/nfsclient/nlminfo.h,v 1.2.14.1 2005/10/27 18:32:39 glebius Exp $
+ * $FreeBSD: src/sys/nfsclient/nlminfo.h,v 1.3 2005/10/26 07:18:36 glebius Exp $
  */
 
 /*


More information about the Midnightbsd-cvs mailing list