[Midnightbsd-cvs] src: sys/nfsclient: merge
laffer1 at midnightbsd.org
Tue Dec 2 16:48:59 EST 2008
Log Message:
-----------
merge
Modified Files:
--------------
src/sys/nfsclient:
bootp_subr.c (r1.1.1.1 -> r1.2)
krpc_subr.c (r1.1.1.1 -> r1.2)
nfs.h (r1.1.1.1 -> r1.2)
nfs_bio.c (r1.1.1.1 -> r1.2)
nfs_diskless.c (r1.2 -> r1.3)
nfs_lock.c (r1.1.1.2 -> r1.2)
nfs_nfsiod.c (r1.1.1.1 -> r1.2)
nfs_node.c (r1.2 -> r1.3)
nfs_socket.c (r1.5 -> r1.6)
nfs_subs.c (r1.1.1.1 -> r1.2)
nfs_vfsops.c (r1.2 -> r1.3)
nfs_vnops.c (r1.2 -> r1.3)
nfsdiskless.h (r1.1.1.1 -> r1.2)
nfsm_subs.h (r1.1.1.1 -> r1.2)
nfsmount.h (r1.1.1.1 -> r1.2)
nfsnode.h (r1.1.1.1 -> r1.2)
nlminfo.h (r1.1.1.1 -> r1.2)
-------------- next part --------------
Index: nfs_socket.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_socket.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -L sys/nfsclient/nfs_socket.c -L sys/nfsclient/nfs_socket.c -u -r1.5 -r1.6
--- sys/nfsclient/nfs_socket.c
+++ sys/nfsclient/nfs_socket.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.125.2.6 2006/02/16 02:39:52 rees Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.154.2.1 2007/10/12 19:18:46 mohans Exp $");
/*
* Socket operations for use by nfs
@@ -78,44 +78,14 @@
#define FALSE 0
extern u_int32_t nfs_xid;
-
-/*
- * Estimate rto for an nfs rpc sent via. an unreliable datagram.
- * Use the mean and mean deviation of rtt for the appropriate type of rpc
- * for the frequent rpcs and a default for the others.
- * The justification for doing "other" this way is that these rpcs
- * happen so infrequently that timer est. would probably be stale.
- * Also, since many of these rpcs are
- * non-idempotent, a conservative timeout is desired.
- * getattr, lookup - A+2D
- * read, write - A+4D
- * other - nm_timeo
- */
-#define NFS_RTO(n, t) \
- ((t) == 0 ? (n)->nm_timeo : \
- ((t) < 3 ? \
- (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
- ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
-#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
-#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
-
-/*
- * Defines which timer to use for the procnum.
- * 0 - default
- * 1 - getattr
- * 2 - lookup
- * 3 - read
- * 4 - write
- */
-static int proct[NFS_NPROCS] = {
- 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-};
+extern struct mtx nfs_xid_mtx;
static int nfs_realign_test;
static int nfs_realign_count;
static int nfs_bufpackets = 4;
static int nfs_reconnects;
-static int nfs3_jukebox_delay = 10;
+static int nfs3_jukebox_delay = 10;
+static int nfs_skip_wcc_data_onerr = 1;
SYSCTL_DECL(_vfs_nfs);
@@ -125,7 +95,8 @@
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
"number of times the nfs client has had to reconnect");
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
- "number of seconds to delay a retry after receiving EJUKEBOX");
+ "number of seconds to delay a retry after receiving EJUKEBOX");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0, "");
/*
* There is a congestion window for outstanding rpcs maintained per mount
@@ -153,10 +124,134 @@
static int nfs_reconnect(struct nfsreq *rep);
static void nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
-static void wakeup_nfsreq(struct nfsreq *req);
extern struct mtx nfs_reqq_mtx;
-extern struct mtx nfs_reply_mtx;
+
+/*
+ * RTT estimator
+ */
+
+static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
+ NFS_DEFAULT_TIMER, /* NULL */
+ NFS_GETATTR_TIMER, /* GETATTR */
+ NFS_DEFAULT_TIMER, /* SETATTR */
+ NFS_LOOKUP_TIMER, /* LOOKUP */
+ NFS_GETATTR_TIMER, /* ACCESS */
+ NFS_READ_TIMER, /* READLINK */
+ NFS_READ_TIMER, /* READ */
+ NFS_WRITE_TIMER, /* WRITE */
+ NFS_DEFAULT_TIMER, /* CREATE */
+ NFS_DEFAULT_TIMER, /* MKDIR */
+ NFS_DEFAULT_TIMER, /* SYMLINK */
+ NFS_DEFAULT_TIMER, /* MKNOD */
+ NFS_DEFAULT_TIMER, /* REMOVE */
+ NFS_DEFAULT_TIMER, /* RMDIR */
+ NFS_DEFAULT_TIMER, /* RENAME */
+ NFS_DEFAULT_TIMER, /* LINK */
+ NFS_READ_TIMER, /* READDIR */
+ NFS_READ_TIMER, /* READDIRPLUS */
+ NFS_DEFAULT_TIMER, /* FSSTAT */
+ NFS_DEFAULT_TIMER, /* FSINFO */
+ NFS_DEFAULT_TIMER, /* PATHCONF */
+ NFS_DEFAULT_TIMER, /* COMMIT */
+ NFS_DEFAULT_TIMER, /* NOOP */
+};
+
+/*
+ * Choose the correct RTT timer for this NFS procedure.
+ */
+static inline enum nfs_rto_timer_t
+nfs_rto_timer(u_int32_t procnum)
+{
+ return nfs_proct[procnum];
+}
+
+/*
+ * Initialize the RTT estimator state for a new mount point.
+ */
+static void
+nfs_init_rtt(struct nfsmount *nmp)
+{
+ int i;
+
+ for (i = 0; i < NFS_MAX_TIMER; i++)
+ nmp->nm_srtt[i] = NFS_INITRTT;
+ for (i = 0; i < NFS_MAX_TIMER; i++)
+ nmp->nm_sdrtt[i] = 0;
+}
+
+/*
+ * Update a mount point's RTT estimator state using data from the
+ * passed-in request.
+ *
+ * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
+ *
+ * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
+ * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
+ * between N + dt and N + 2 - dt ticks, add 1 before calculating the
+ * update values.
+ */
+static void
+nfs_update_rtt(struct nfsreq *rep)
+{
+ int t1 = rep->r_rtt + 1;
+ int index = nfs_rto_timer(rep->r_procnum) - 1;
+ int *srtt = &rep->r_nmp->nm_srtt[index];
+ int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
+
+ t1 -= *srtt >> 3;
+ *srtt += t1;
+ if (t1 < 0)
+ t1 = -t1;
+ t1 -= *sdrtt >> 2;
+ *sdrtt += t1;
+}
+
+/*
+ * Estimate RTO for an NFS RPC sent via an unreliable datagram.
+ *
+ * Use the mean and mean deviation of RTT for the appropriate type
+ * of RPC for the frequent RPCs and a default for the others.
+ * The justification for doing "other" this way is that these RPCs
+ * happen so infrequently that timer est. would probably be stale.
+ * Also, since many of these RPCs are non-idempotent, a conservative
+ * timeout is desired.
+ *
+ * getattr, lookup - A+2D
+ * read, write - A+4D
+ * other - nm_timeo
+ */
+static int
+nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
+{
+ enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
+ int index = timer - 1;
+ int rto;
+
+ switch (timer) {
+ case NFS_GETATTR_TIMER:
+ case NFS_LOOKUP_TIMER:
+ rto = ((nmp->nm_srtt[index] + 3) >> 2) +
+ ((nmp->nm_sdrtt[index] + 1) >> 1);
+ break;
+ case NFS_READ_TIMER:
+ case NFS_WRITE_TIMER:
+ rto = ((nmp->nm_srtt[index] + 7) >> 3) +
+ (nmp->nm_sdrtt[index] + 1);
+ break;
+ default:
+ rto = nmp->nm_timeo;
+ return (rto);
+ }
+
+ if (rto < NFS_MINRTO)
+ rto = NFS_MINRTO;
+ else if (rto > NFS_MAXRTO)
+ rto = NFS_MAXRTO;
+
+ return (rto);
+}
+
/*
* Initialize sockets and congestion for a new NFS connection.
@@ -171,13 +266,11 @@
struct sockaddr *saddr;
struct thread *td = &thread0; /* only used for socreate and sobind */
- NET_ASSERT_GIANT();
-
if (nmp->nm_sotype == SOCK_STREAM) {
- mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ mtx_lock(&nmp->nm_mtx);
nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
nmp->nm_nfstcpstate.rpcresid = 0;
- mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ mtx_unlock(&nmp->nm_mtx);
}
nmp->nm_so = NULL;
saddr = nmp->nm_nam;
@@ -242,12 +335,16 @@
* Protocols that do not require connections may be optionally left
* unconnected for servers that reply from a port other than NFS_PORT.
*/
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_flag & NFSMNT_NOCONN) {
if (nmp->nm_soflags & PR_CONNREQUIRED) {
error = ENOTCONN;
+ mtx_unlock(&nmp->nm_mtx);
goto bad;
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
} else {
+ mtx_unlock(&nmp->nm_mtx);
error = soconnect(so, nmp->nm_nam, td);
if (error)
goto bad;
@@ -278,7 +375,10 @@
SOCK_UNLOCK(so);
}
so->so_rcv.sb_timeo = 12 * hz;
- so->so_snd.sb_timeo = 5 * hz;
+ if (nmp->nm_sotype == SOCK_STREAM)
+ so->so_snd.sb_timeo = 1 * hz; /* 1s snd timeout for NFS/TCP */
+ else
+ so->so_snd.sb_timeo = 5 * hz;
/*
* Get buffer reservation size from sysctl, but impose reasonable
@@ -289,7 +389,7 @@
pktscale = 2;
if (pktscale > 64)
pktscale = 64;
-
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_sotype == SOCK_DGRAM) {
sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
@@ -312,7 +412,9 @@
sopt.sopt_val = &val;
sopt.sopt_valsize = sizeof val;
val = 1;
+ mtx_unlock(&nmp->nm_mtx);
sosetopt(so, &sopt);
+ mtx_lock(&nmp->nm_mtx);
}
if (so->so_proto->pr_protocol == IPPROTO_TCP) {
struct sockopt sopt;
@@ -325,13 +427,16 @@
sopt.sopt_val = &val;
sopt.sopt_valsize = sizeof val;
val = 1;
+ mtx_unlock(&nmp->nm_mtx);
sosetopt(so, &sopt);
+ mtx_lock(&nmp->nm_mtx);
}
sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
sizeof (u_int32_t)) * pktscale;
rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
sizeof (u_int32_t)) * pktscale;
}
+ mtx_unlock(&nmp->nm_mtx);
error = soreserve(so, sndreserve, rcvreserve);
if (error)
goto bad;
@@ -348,14 +453,13 @@
so->so_snd.sb_flags |= SB_NOINTR;
SOCKBUF_UNLOCK(&so->so_snd);
+ mtx_lock(&nmp->nm_mtx);
/* Initialize other non-zero congestion variables */
- nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
- nmp->nm_srtt[3] = (NFS_TIMEO << 3);
- nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
- nmp->nm_sdrtt[3] = 0;
+ nfs_init_rtt(nmp);
nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
nmp->nm_sent = 0;
nmp->nm_timeouts = 0;
+ mtx_unlock(&nmp->nm_mtx);
return (0);
bad:
@@ -363,6 +467,17 @@
return (error);
}
+static void
+nfs_wakup_reconnectors(struct nfsmount *nmp)
+{
+ KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
+ if (--nmp->nm_nfstcpstate.sock_send_inprog == 0 &&
+ (nmp->nm_nfstcpstate.flags & NFS_TCP_WAIT_WRITE_DRAIN)) {
+ nmp->nm_nfstcpstate.flags &= ~NFS_TCP_WAIT_WRITE_DRAIN;
+ wakeup((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog);
+ }
+}
+
/*
* Reconnect routine:
* Called when a connection is broken on a reliable protocol.
@@ -378,14 +493,41 @@
struct nfsreq *rp;
struct nfsmount *nmp = rep->r_nmp;
int error;
+ int slpflag = 0;
+
+ KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
+ if (nmp->nm_flag & NFSMNT_INT)
+ slpflag = PCATCH;
+ /*
+ * Wait for any pending writes to this socket to drain (or timeout).
+ */
+ while (nmp->nm_nfstcpstate.sock_send_inprog > 0) {
+ nmp->nm_nfstcpstate.flags |= NFS_TCP_WAIT_WRITE_DRAIN;
+ error = msleep((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog,
+ &nmp->nm_mtx, slpflag | (PZERO - 1), "nfscon", 0);
+ }
+ /*
+ * Grab the nfs_connect_lock to serialize connects.
+ * After grabbing the nfs_connect_lock, check if a reconnect is necessary or
+ * if someone else beat us to the connect!
+ */
+ error = nfs_connect_lock(rep);
+ if (error)
+ goto unlock_exit;
+ if ((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) == 0)
+ goto unlock_exit;
+ else
+ mtx_unlock(&nmp->nm_mtx);
nfs_reconnects++;
nfs_disconnect(nmp);
while ((error = nfs_connect(nmp, rep)) != 0) {
if (error == ERESTART)
error = EINTR;
- if (error == EIO || error == EINTR)
- return (error);
+ if (error == EIO || error == EINTR) {
+ mtx_lock(&nmp->nm_mtx);
+ goto unlock_exit;
+ }
(void) tsleep(&lbolt, PSOCK, "nfscon", 0);
}
@@ -398,9 +540,10 @@
* until the connection is established successfully, and
* then re-transmit the request.
*/
- mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ mtx_lock(&nmp->nm_mtx);
nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
- mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ nmp->nm_nfstcpstate.rpcresid = 0;
+ mtx_unlock(&nmp->nm_mtx);
/*
* Loop through outstanding request list and fix up all requests
@@ -408,11 +551,18 @@
*/
mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
- if (rp->r_nmp == nmp)
+ if (rp->r_nmp == nmp) {
+ mtx_lock(&rp->r_mtx);
rp->r_flags |= R_MUSTRESEND;
+ mtx_unlock(&rp->r_mtx);
+ }
}
mtx_unlock(&nfs_reqq_mtx);
- return (0);
+ mtx_lock(&nmp->nm_mtx);
+unlock_exit:
+ nfs_connect_unlock(rep);
+ mtx_unlock(&nmp->nm_mtx);
+ return (error);
}
/*
@@ -423,11 +573,11 @@
{
struct socket *so;
- NET_ASSERT_GIANT();
-
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_so) {
so = nmp->nm_so;
nmp->nm_so = NULL;
+ mtx_unlock(&nmp->nm_mtx);
SOCKBUF_LOCK(&so->so_rcv);
so->so_upcallarg = NULL;
so->so_upcall = NULL;
@@ -435,7 +585,8 @@
SOCKBUF_UNLOCK(&so->so_rcv);
soshutdown(so, SHUT_WR);
soclose(so);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
}
void
@@ -462,8 +613,6 @@
struct sockaddr *sendnam;
int error, error2, soflags, flags;
- NET_ASSERT_GIANT();
-
KASSERT(rep, ("nfs_send: called with rep == NULL"));
error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
@@ -471,13 +620,19 @@
m_freem(top);
return (error);
}
+ mtx_lock(&rep->r_nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
if ((so = rep->r_nmp->nm_so) == NULL) {
rep->r_flags |= R_MUSTRESEND;
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&rep->r_nmp->nm_mtx);
m_freem(top);
- return (0);
+ return (EPIPE);
}
rep->r_flags &= ~R_MUSTRESEND;
soflags = rep->r_nmp->nm_soflags;
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&rep->r_nmp->nm_mtx);
if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
sendnam = NULL;
@@ -488,11 +643,12 @@
else
flags = 0;
- error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0,
- flags, curthread /*XXX*/);
+ error = sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/);
if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
error = 0;
+ mtx_lock(&rep->r_mtx);
rep->r_flags |= R_MUSTRESEND;
+ mtx_unlock(&rep->r_mtx);
}
if (error) {
@@ -512,15 +668,17 @@
error2 = NFS_SIGREP(rep);
if (error2)
error = error2;
- else
+ else {
+ mtx_lock(&rep->r_mtx);
rep->r_flags |= R_MUSTRESEND;
+ mtx_unlock(&rep->r_mtx);
+ }
/*
* Handle any recoverable (soft) socket errors here. (?)
- * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer()
+ * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer().
*/
- if (error != EINTR && error != ERESTART && error != EIO &&
- error != EPIPE)
+ if (error != EINTR && error != ERESTART && error != EIO && error != EPIPE)
error = 0;
}
return (error);
@@ -532,82 +690,98 @@
register struct socket *so;
register struct mbuf *m;
int error = 0, sotype, slpflag;
-
- NET_ASSERT_GIANT();
-
- sotype = rep->r_nmp->nm_sotype;
+ struct nfsmount *nmp = rep->r_nmp;
+
+ sotype = nmp->nm_sotype;
/*
* For reliable protocols, lock against other senders/receivers
* in case a reconnect is necessary.
*/
if (sotype != SOCK_DGRAM) {
- error = nfs_sndlock(rep);
- if (error)
- return (error);
tryagain:
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
if (rep->r_mrep) {
- nfs_sndunlock(rep);
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
return (0);
}
if (rep->r_flags & R_SOFTTERM) {
- nfs_sndunlock(rep);
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
return (EINTR);
}
- so = rep->r_nmp->nm_so;
- mtx_lock(&rep->r_nmp->nm_nfstcpstate.mtx);
+ so = nmp->nm_so;
if (!so ||
- (rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
- mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
+ (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
+ mtx_unlock(&rep->r_mtx);
+ nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
error = nfs_reconnect(rep);
- if (error) {
- nfs_sndunlock(rep);
+ if (error)
return (error);
- }
goto tryagain;
- } else
- mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
+ }
while (rep->r_flags & R_MUSTRESEND) {
+ mtx_unlock(&rep->r_mtx);
+ nmp->nm_nfstcpstate.sock_send_inprog++;
+ mtx_unlock(&nmp->nm_mtx);
m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
nfsstats.rpcretries++;
- error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
+ error = nfs_send(so, nmp->nm_nam, m, rep);
if (error) {
- if (error == EINTR || error == ERESTART ||
- (error = nfs_reconnect(rep)) != 0) {
- nfs_sndunlock(rep);
+ mtx_lock(&nmp->nm_mtx);
+ nfs_wakup_reconnectors(nmp);
+ if (!(error == EINTR || error == ERESTART)) {
+ nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
+ error = nfs_reconnect(rep);
+ } else
+ mtx_unlock(&nmp->nm_mtx);
+ if (error)
return (error);
- }
goto tryagain;
- }
+ } else {
+ mtx_lock(&nmp->nm_mtx);
+ nfs_wakup_reconnectors(nmp);
+ mtx_lock(&rep->r_mtx);
+ }
}
- nfs_sndunlock(rep);
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
}
slpflag = 0;
- if (rep->r_nmp->nm_flag & NFSMNT_INT)
+ mtx_lock(&nmp->nm_mtx);
+ if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
- mtx_lock(&nfs_reply_mtx);
+ mtx_unlock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
while ((rep->r_mrep == NULL) && (error == 0) &&
((rep->r_flags & R_SOFTTERM) == 0) &&
((sotype == SOCK_DGRAM) || ((rep->r_flags & R_MUSTRESEND) == 0)))
- error = msleep((caddr_t)rep, &nfs_reply_mtx,
+ error = msleep((caddr_t)rep, &rep->r_mtx,
slpflag | (PZERO - 1), "nfsreq", 0);
- mtx_unlock(&nfs_reply_mtx);
- if (error == EINTR || error == ERESTART)
+ if (error == EINTR || error == ERESTART) {
/* NFS operations aren't restartable. Map ERESTART to EINTR */
+ mtx_unlock(&rep->r_mtx);
return (EINTR);
- if (rep->r_flags & R_SOFTTERM)
+ }
+ if (rep->r_flags & R_SOFTTERM) {
/* Request was terminated because we exceeded the retries (soft mount) */
+ mtx_unlock(&rep->r_mtx);
return (ETIMEDOUT);
+ }
+ mtx_unlock(&rep->r_mtx);
if (sotype == SOCK_STREAM) {
- mtx_lock(&rep->r_nmp->nm_nfstcpstate.mtx);
- if (((rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
+ if (((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
(rep->r_flags & R_MUSTRESEND))) {
- mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
- error = nfs_sndlock(rep);
- if (error)
- return (error);
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
goto tryagain;
- } else
- mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
+ } else {
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
+ }
}
return (error);
}
@@ -625,7 +799,6 @@
caddr_t dpos;
u_int32_t rxid, *tl;
struct nfsreq *rep;
- register int32_t t1;
int error;
/*
@@ -660,6 +833,8 @@
* Iff no match, just drop the datagram
*/
TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
if (rep->r_mrep == NULL && rxid == rep->r_xid) {
/* Found it.. */
rep->r_mrep = mrep;
@@ -681,30 +856,16 @@
rep->r_flags &= ~R_SENT;
nmp->nm_sent -= NFS_CWNDSCALE;
}
- /*
- * Update rtt using a gain of 0.125 on the mean
- * and a gain of 0.25 on the deviation.
- */
- if (rep->r_flags & R_TIMING) {
- /*
- * Since the timer resolution of
- * NFS_HZ is so course, it can often
- * result in r_rtt == 0. Since
- * r_rtt == N means that the actual
- * rtt is between N+dt and N+2-dt ticks,
- * add 1.
- */
- t1 = rep->r_rtt + 1;
- t1 -= (NFS_SRTT(rep) >> 3);
- NFS_SRTT(rep) += t1;
- if (t1 < 0)
- t1 = -t1;
- t1 -= (NFS_SDRTT(rep) >> 2);
- NFS_SDRTT(rep) += t1;
- }
+ if (rep->r_flags & R_TIMING)
+ nfs_update_rtt(rep);
nmp->nm_timeouts = 0;
+ wakeup((caddr_t)rep);
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
break;
}
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
}
/*
* If not matched to a request, drop it.
@@ -713,31 +874,18 @@
if (rep == 0) {
nfsstats.rpcunexpected++;
m_freem(mrep);
- } else
- wakeup_nfsreq(rep);
+ }
mtx_unlock(&nfs_reqq_mtx);
}
-/*
- * The wakeup of the requestor should be done under the mutex
- * to avoid potential missed wakeups.
- */
-static void
-wakeup_nfsreq(struct nfsreq *req)
-{
- mtx_lock(&nfs_reply_mtx);
- wakeup((caddr_t)req);
- mtx_unlock(&nfs_reply_mtx);
-}
-
static void
nfs_mark_for_reconnect(struct nfsmount *nmp)
{
struct nfsreq *rp;
- mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ mtx_lock(&nmp->nm_mtx);
nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
- mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ mtx_unlock(&nmp->nm_mtx);
/*
* Wakeup all processes that are waiting for replies
* on this mount point. One of them does the reconnect.
@@ -745,8 +893,10 @@
mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
if (rp->r_nmp == nmp) {
+ mtx_lock(&rp->r_mtx);
rp->r_flags |= R_MUSTRESEND;
- wakeup_nfsreq(rp);
+ wakeup((caddr_t)rp);
+ mtx_unlock(&rp->r_mtx);
}
}
mtx_unlock(&nfs_reqq_mtx);
@@ -767,6 +917,20 @@
#define nfstcp_marker_readable(so) nfstcp_readable(so, sizeof(u_int32_t))
+static int
+nfs_copy_len(struct mbuf *mp, char *buf, int len)
+{
+ while (len > 0 && mp != NULL) {
+ int copylen = min(len, mp->m_len);
+
+ bcopy(mp->m_data, buf, copylen);
+ buf += copylen;
+ len -= copylen;
+ mp = mp->m_next;
+ }
+ return (len);
+}
+
static void
nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag)
{
@@ -781,17 +945,21 @@
* Don't pick any more data from the socket if we've marked the
* mountpoint for reconnect.
*/
- mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) {
- mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ mtx_unlock(&nmp->nm_mtx);
return;
} else
- mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ mtx_unlock(&nmp->nm_mtx);
auio.uio_td = curthread;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_READ;
for ( ; ; ) {
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_nfstcpstate.flags & NFS_TCP_EXPECT_RPCMARKER) {
+ int resid;
+
+ mtx_unlock(&nmp->nm_mtx);
if (!nfstcp_marker_readable(so)) {
/* Marker is not readable */
return;
@@ -801,9 +969,8 @@
auio.uio_iovcnt = 0;
mp = NULL;
rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
- error = so->so_proto->pr_usrreqs->pru_soreceive
- (so, (struct sockaddr **)0,
- &auio, &mp, (struct mbuf **)0, &rcvflg);
+ error = soreceive(so, (struct sockaddr **)0, &auio,
+ &mp, (struct mbuf **)0, &rcvflg);
/*
* We've already tested that the socket is readable. 2 cases
* here, we either read 0 bytes (client closed connection),
@@ -820,7 +987,21 @@
}
if (mp == NULL)
panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
- bcopy(mtod(mp, u_int32_t *), &len, sizeof(len));
+ /*
+ * Sigh. We can't do the obvious thing here (which would
+ * be to have soreceive copy the length from mbufs for us).
+ * Calling uiomove() from the context of a socket callback
+ * (even for kernel-kernel copies) leads to LORs (since
+ * we hold network locks at this point).
+ */
+ if ((resid = nfs_copy_len(mp, (char *)&len,
+ sizeof(u_int32_t)))) {
+ log(LOG_ERR, "%s (%d) from nfs server %s\n",
+ "Bad RPC HDR length",
+ (int)(sizeof(u_int32_t) - resid),
+ nmp->nm_mountp->mnt_stat.f_mntfromname);
+ goto mark_reconnect;
+ }
len = ntohl(len) & ~0x80000000;
m_freem(mp);
/*
@@ -834,14 +1015,20 @@
nmp->nm_mountp->mnt_stat.f_mntfromname);
goto mark_reconnect;
}
+ mtx_lock(&nmp->nm_mtx);
nmp->nm_nfstcpstate.rpcresid = len;
nmp->nm_nfstcpstate.flags &= ~(NFS_TCP_EXPECT_RPCMARKER);
- }
+ mtx_unlock(&nmp->nm_mtx);
+ } else
+ mtx_unlock(&nmp->nm_mtx);
+
/*
* Processed RPC marker or no RPC marker to process.
* Pull in and process data.
*/
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_nfstcpstate.rpcresid > 0) {
+ mtx_unlock(&nmp->nm_mtx);
if (!nfstcp_readable(so, nmp->nm_nfstcpstate.rpcresid)) {
/* All data not readable */
return;
@@ -851,9 +1038,8 @@
auio.uio_iovcnt = 0;
mp = NULL;
rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
- error = so->so_proto->pr_usrreqs->pru_soreceive
- (so, (struct sockaddr **)0,
- &auio, &mp, (struct mbuf **)0, &rcvflg);
+ error = soreceive(so, (struct sockaddr **)0, &auio,
+ &mp, (struct mbuf **)0, &rcvflg);
if (error || auio.uio_resid > 0) {
if (error && error != ECONNRESET) {
log(LOG_ERR,
@@ -864,11 +1050,14 @@
}
if (mp == NULL)
panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
+ mtx_lock(&nmp->nm_mtx);
nmp->nm_nfstcpstate.rpcresid = 0;
nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
+ mtx_unlock(&nmp->nm_mtx);
/* We got the entire RPC reply. Match XIDs and wake up requestor */
nfs_clnt_match_xid(so, nmp, mp);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
}
mark_reconnect:
@@ -890,9 +1079,7 @@
auio.uio_resid = 1000000000;
do {
mp = control = NULL;
- error = so->so_proto->pr_usrreqs->pru_soreceive(so,
- NULL, &auio, &mp,
- &control, &rcvflag);
+ error = soreceive(so, NULL, &auio, &mp, &control, &rcvflag);
if (control)
m_freem(control);
if (mp)
@@ -910,7 +1097,6 @@
* by mrep or error
* nb: always frees up mreq mbuf list
*/
-
int
nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
struct thread *td, struct ucred *cred, struct mbuf **mrp,
@@ -924,7 +1110,7 @@
struct mbuf *m, *md, *mheadend;
time_t waituntil;
caddr_t dpos;
- int s, error = 0, mrest_len, auth_len, auth_type;
+ int error = 0, mrest_len, auth_len, auth_type;
struct timeval now;
u_int32_t *xidp;
@@ -937,11 +1123,12 @@
if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
- rep->r_mrep = rep->r_md = NULL;
+ bzero(rep, sizeof(struct nfsreq));
rep->r_nmp = nmp;
rep->r_vp = vp;
rep->r_td = td;
rep->r_procnum = procnum;
+ mtx_init(&rep->r_mtx, "NFSrep lock", NULL, MTX_DEF);
getmicrouptime(&now);
rep->r_lastmsg = now.tv_sec -
@@ -976,7 +1163,7 @@
else
rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
rep->r_rtt = rep->r_rexmit = 0;
- if (proct[procnum] > 0)
+ if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
rep->r_flags = R_TIMING;
else
rep->r_flags = 0;
@@ -990,7 +1177,6 @@
* Chain request into list of outstanding requests. Be sure
* to put it LAST so timer finds oldest requests first.
*/
- s = splsoftclock();
mtx_lock(&nfs_reqq_mtx);
if (TAILQ_EMPTY(&nfs_reqq))
callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
@@ -1002,24 +1188,31 @@
* send this one now but let timer do it. If not timing a request,
* do it now.
*/
- if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
- (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
- nmp->nm_sent < nmp->nm_cwnd)) {
- splx(s);
- error = nfs_sndlock(rep);
- if (!error) {
- m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
- error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
- nfs_sndunlock(rep);
- }
- mtx_lock(&nfs_reqq_mtx);
- if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
+ mtx_lock(&nmp->nm_mtx);
+ if (nmp->nm_so &&
+ (((nmp->nm_sotype == SOCK_STREAM) && !(nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) ||
+ (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) {
+ if (nmp->nm_sotype == SOCK_STREAM)
+ nmp->nm_nfstcpstate.sock_send_inprog++;
+ mtx_unlock(&nmp->nm_mtx);
+ m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
+ error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
+ /*
+ * nfs_timer() could've re-transmitted the request if we ended up
+ * blocking on nfs_send() too long, so check for R_SENT here.
+ */
+ if (!error && (rep->r_flags & (R_SENT | R_MUSTRESEND)) == 0) {
nmp->nm_sent += NFS_CWNDSCALE;
rep->r_flags |= R_SENT;
}
- mtx_unlock(&nfs_reqq_mtx);
+ mtx_unlock(&rep->r_mtx);
+ if (nmp->nm_sotype == SOCK_STREAM)
+ nfs_wakup_reconnectors(rep->r_nmp);
+ mtx_unlock(&nmp->nm_mtx);
} else {
- splx(s);
+ mtx_unlock(&nmp->nm_mtx);
rep->r_rtt = -1;
}
@@ -1030,31 +1223,54 @@
error = nfs_reply(rep);
/*
- * RPC done, unlink the request.
- */
- s = splsoftclock();
+ * nfs_timer() may be in the process of re-transmitting this request.
+ * nfs_timer() drops the nfs_reqq_mtx before the pru_send() (to avoid LORs).
+ * Wait till nfs_timer() completes the re-transmission. When the reply
+ * comes back, it will be discarded (since the req struct for it no longer
+ * exists).
+ */
+wait_for_pinned_req:
+ mtx_lock(&rep->r_mtx);
+ while (rep->r_flags & R_PIN_REQ) {
+ msleep((caddr_t)&rep->r_flags, &rep->r_mtx,
+ (PZERO - 1), "nfsrxmt", 0);
+ }
+ mtx_unlock(&rep->r_mtx);
+
mtx_lock(&nfs_reqq_mtx);
+ /* Have to check for R_PIN_REQ after grabbing wlock again */
+ mtx_lock(&rep->r_mtx);
+ if (rep->r_flags & R_PIN_REQ) {
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nfs_reqq_mtx);
+ goto wait_for_pinned_req;
+ } else
+ mtx_unlock(&rep->r_mtx);
+ /* RPC done (timer not active, request not pinned), unlink the request */
TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
if (TAILQ_EMPTY(&nfs_reqq))
callout_stop(&nfs_callout);
+ mtx_unlock(&nfs_reqq_mtx);
+
/*
* Decrement the outstanding request count.
*/
+ mtx_lock(&rep->r_mtx);
if (rep->r_flags & R_SENT) {
rep->r_flags &= ~R_SENT; /* paranoia */
+ mtx_unlock(&rep->r_mtx);
+ mtx_lock(&nmp->nm_mtx);
nmp->nm_sent -= NFS_CWNDSCALE;
- }
- mtx_unlock(&nfs_reqq_mtx);
- splx(s);
+ mtx_unlock(&nmp->nm_mtx);
+ } else
+ mtx_unlock(&rep->r_mtx);
/*
* If there was a successful reply and a tprintf msg.
* tprintf a response.
*/
if (!error) {
- mtx_lock(&Giant);
nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO);
- mtx_unlock(&Giant);
}
mrep = rep->r_mrep;
md = rep->r_md;
@@ -1069,6 +1285,7 @@
if (rep->r_mrep != NULL)
m_freem(rep->r_mrep);
m_freem(rep->r_mreq);
+ mtx_destroy(&rep->r_mtx);
free((caddr_t)rep, M_NFSREQ);
return (error);
}
@@ -1087,6 +1304,7 @@
error = EACCES;
m_freem(mrep);
m_freem(rep->r_mreq);
+ mtx_destroy(&rep->r_mtx);
free((caddr_t)rep, M_NFSREQ);
return (error);
}
@@ -1109,12 +1327,14 @@
m_freem(mrep);
error = 0;
waituntil = time_second + nfs3_jukebox_delay;
- while (time_second < waituntil)
- (void) tsleep(&lbolt,
- PSOCK, "nqnfstry", 0);
+ while (time_second < waituntil) {
+ (void) tsleep(&lbolt, PSOCK, "nqnfstry", 0);
+ }
+ mtx_lock(&nfs_xid_mtx);
if (++nfs_xid == 0)
nfs_xid++;
rep->r_xid = *xidp = txdr_unsigned(nfs_xid);
+ mtx_unlock(&nfs_xid_mtx);
goto tryagain;
}
@@ -1124,7 +1344,12 @@
*/
if (error == ESTALE)
cache_purge(vp);
- if (nmp->nm_flag & NFSMNT_NFSV3) {
+ /*
+ * Skip wcc data on NFS errors for now. NetApp filers return corrupt
+ * postop attrs in the wcc data for NFS err EROFS. Not sure if they
+ * could return corrupt postop attrs for other errors.
+ */
+ if ((nmp->nm_flag & NFSMNT_NFSV3) && !nfs_skip_wcc_data_onerr) {
*mrp = mrep;
*mdp = md;
*dposp = dpos;
@@ -1132,6 +1357,7 @@
} else
m_freem(mrep);
m_freem(rep->r_mreq);
+ mtx_destroy(&rep->r_mtx);
free((caddr_t)rep, M_NFSREQ);
return (error);
}
@@ -1140,6 +1366,7 @@
*mdp = md;
*dposp = dpos;
m_freem(rep->r_mreq);
+ mtx_destroy(&rep->r_mtx);
FREE((caddr_t)rep, M_NFSREQ);
return (0);
}
@@ -1147,6 +1374,7 @@
error = EPROTONOSUPPORT;
nfsmout:
m_freem(rep->r_mreq);
+ mtx_destroy(&rep->r_mtx);
free((caddr_t)rep, M_NFSREQ);
return (error);
}
@@ -1157,19 +1385,11 @@
* To avoid retransmission attempts on STREAM sockets (in the future) make
* sure to set the r_retry field to 0 (implies nm_retry == 0).
*
- * XXX -
- * For now, since we don't register MPSAFE callouts for the NFS client -
- * softclock() acquires Giant before calling us. That prevents req entries
- * from being removed from the list (from nfs_request()). But we still
- * acquire the nfs reqq mutex to make sure the state of individual req
- * entries is not modified from RPC reply handling (from socket callback)
- * while nfs_timer is walking the list of reqs.
* The nfs reqq lock cannot be held while we do the pru_send() because of a
* lock ordering violation. The NFS client socket callback acquires
* inp_lock->nfsreq mutex and pru_send acquires inp_lock. So we drop the
- * reqq mutex (and reacquire it after the pru_send()). This won't work
- * when we move to fine grained locking for NFS. When we get to that point,
- * a rewrite of nfs_timer() will be needed.
+ * reqq mutex (and reacquire it after the pru_send()). The req structure
+ * (for the rexmit) is prevented from being removed by the R_PIN_REQ flag.
*/
void
nfs_timer(void *arg)
@@ -1179,51 +1399,76 @@
struct socket *so;
struct nfsmount *nmp;
int timeo;
- int s, error;
+ int error;
struct timeval now;
getmicrouptime(&now);
- s = splnet();
- mtx_lock(&Giant); /* nfs_down -> tprintf */
mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
nmp = rep->r_nmp;
- if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
+ mtx_lock(&rep->r_mtx);
+ if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
+ mtx_unlock(&rep->r_mtx);
continue;
+ } else {
+ /*
+ * Terminate request if force-unmount in progress.
+ * Note that NFS could have vfs_busy'ed the mount,
+ * causing the unmount to wait for the mnt_lock, making
+ * this bit of logic necessary.
+ */
+ if (rep->r_nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) {
+ nfs_softterm(rep);
+ mtx_unlock(&rep->r_mtx);
+ continue;
+ }
+ mtx_unlock(&rep->r_mtx);
+ }
if (nfs_sigintr(nmp, rep, rep->r_td))
continue;
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
if (nmp->nm_tprintf_initial_delay != 0 &&
(rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
rep->r_lastmsg = now.tv_sec;
+ /*
+ * Pin down the request and drop locks for the acquisition
+ * of Giant from tprintf() in nfs_down().
+ */
+ rep->r_flags |= R_PIN_REQ;
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
+ mtx_unlock(&nfs_reqq_mtx);
nfs_down(rep, nmp, rep->r_td, "not responding",
- 0, NFSSTA_TIMEO);
-#if 0
- if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
- /* we're not yet completely mounted and */
- /* we can't complete an RPC, so we fail */
- nfsstats.rpctimeouts++;
- nfs_softterm(rep);
- continue;
- }
-#endif
+ 0, NFSSTA_TIMEO);
+ mtx_lock(&nfs_reqq_mtx);
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
+ rep->r_flags &= ~R_PIN_REQ;
+ wakeup((caddr_t)&rep->r_flags);
}
if (rep->r_rtt >= 0) {
rep->r_rtt++;
if (nmp->nm_flag & NFSMNT_DUMBTIMR)
timeo = nmp->nm_timeo;
else
- timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
+ timeo = nfs_estimate_rto(nmp, rep->r_procnum);
if (nmp->nm_timeouts > 0)
timeo *= nfs_backoff[nmp->nm_timeouts - 1];
- if (rep->r_rtt <= timeo)
+ if (rep->r_rtt <= timeo) {
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
continue;
+ }
if (nmp->nm_timeouts < NFS_NBACKOFF)
nmp->nm_timeouts++;
}
if (rep->r_rexmit >= rep->r_retry) { /* too many */
nfsstats.rpctimeouts++;
nfs_softterm(rep);
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
continue;
}
if (nmp->nm_sotype != SOCK_DGRAM) {
@@ -1236,12 +1481,17 @@
* if necessary.
*/
rep->r_flags |= R_MUSTRESEND;
- wakeup_nfsreq(rep);
+ wakeup((caddr_t)rep);
rep->r_rtt = 0;
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
continue;
}
- if ((so = nmp->nm_so) == NULL)
+ if ((so = nmp->nm_so) == NULL) {
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
continue;
+ }
/*
* If there is enough space and the window allows..
* Resend it
@@ -1249,48 +1499,66 @@
*/
rep->r_rtt = -1;
if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
- ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
- (rep->r_flags & R_SENT) ||
- nmp->nm_sent < nmp->nm_cwnd) &&
- (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
- mtx_unlock(&nfs_reqq_mtx);
- if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
- error = (*so->so_proto->pr_usrreqs->pru_send)
- (so, 0, m, NULL, NULL, curthread);
- else
- error = (*so->so_proto->pr_usrreqs->pru_send)
- (so, 0, m, nmp->nm_nam, NULL, curthread);
- mtx_lock(&nfs_reqq_mtx);
- if (error) {
- if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
- so->so_error = 0;
- rep->r_flags |= R_RESENDERR;
- } else {
+ ((nmp->nm_flag & NFSMNT_DUMBTIMR) || (rep->r_flags & R_SENT) ||
+ nmp->nm_sent < nmp->nm_cwnd)) {
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
+ if ((m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
/*
- * Iff first send, start timing
- * else turn timing off, backoff timer
- * and divide congestion window by 2.
+ * Mark the request to indicate that a XMIT is in
+ * progress to prevent the req structure being
+ * removed in nfs_request().
*/
- rep->r_flags &= ~R_RESENDERR;
- if (rep->r_flags & R_SENT) {
- rep->r_flags &= ~R_TIMING;
- if (++rep->r_rexmit > NFS_MAXREXMIT)
- rep->r_rexmit = NFS_MAXREXMIT;
- nmp->nm_cwnd >>= 1;
- if (nmp->nm_cwnd < NFS_CWNDSCALE)
- nmp->nm_cwnd = NFS_CWNDSCALE;
- nfsstats.rpcretries++;
+ mtx_lock(&rep->r_mtx);
+ rep->r_flags |= R_PIN_REQ;
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nfs_reqq_mtx);
+ if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
+ error = (*so->so_proto->pr_usrreqs->pru_send)
+ (so, 0, m, NULL, NULL, curthread);
+ else
+ error = (*so->so_proto->pr_usrreqs->pru_send)
+ (so, 0, m, nmp->nm_nam, NULL,
+ curthread);
+ mtx_lock(&nfs_reqq_mtx);
+ mtx_lock(&nmp->nm_mtx);
+ mtx_lock(&rep->r_mtx);
+ rep->r_flags &= ~R_PIN_REQ;
+ wakeup((caddr_t)&rep->r_flags);
+ if (error) {
+ if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+ so->so_error = 0;
+ rep->r_flags |= R_RESENDERR;
} else {
- rep->r_flags |= R_SENT;
- nmp->nm_sent += NFS_CWNDSCALE;
+ /*
+ * Iff first send, start timing
+ * else turn timing off, backoff timer
+ * and divide congestion window by 2.
+ */
+ rep->r_flags &= ~R_RESENDERR;
+ if (rep->r_flags & R_SENT) {
+ rep->r_flags &= ~R_TIMING;
+ if (++rep->r_rexmit > NFS_MAXREXMIT)
+ rep->r_rexmit = NFS_MAXREXMIT;
+ nmp->nm_cwnd >>= 1;
+ if (nmp->nm_cwnd < NFS_CWNDSCALE)
+ nmp->nm_cwnd = NFS_CWNDSCALE;
+ nfsstats.rpcretries++;
+ } else {
+ rep->r_flags |= R_SENT;
+ nmp->nm_sent += NFS_CWNDSCALE;
+ }
+ rep->r_rtt = 0;
}
- rep->r_rtt = 0;
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
}
+ } else {
+ mtx_unlock(&rep->r_mtx);
+ mtx_unlock(&nmp->nm_mtx);
}
}
mtx_unlock(&nfs_reqq_mtx);
- mtx_unlock(&Giant); /* nfs_down -> tprintf */
- splx(s);
callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
}
@@ -1304,28 +1572,28 @@
struct nfsmount *nmp;
{
struct nfsreq *req;
- int i, s;
+ int i;
- s = splnet();
mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
+ mtx_lock(&req->r_mtx);
if (nmp != req->r_nmp || req->r_mrep != NULL ||
- (req->r_flags & R_SOFTTERM))
+ (req->r_flags & R_SOFTTERM)) {
+ mtx_unlock(&req->r_mtx);
continue;
+ }
nfs_softterm(req);
+ mtx_unlock(&req->r_mtx);
}
mtx_unlock(&nfs_reqq_mtx);
- splx(s);
for (i = 0; i < 30; i++) {
- s = splnet();
mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
if (nmp == req->r_nmp)
break;
}
mtx_unlock(&nfs_reqq_mtx);
- splx(s);
if (req == NULL)
return (0);
tsleep(&lbolt, PSOCK, "nfscancel", 0);
@@ -1342,7 +1610,7 @@
static void
nfs_softterm(struct nfsreq *rep)
{
-
+ KASSERT(mtx_owned(&rep->r_mtx), ("NFS req lock not owned !"));
rep->r_flags |= R_SOFTTERM;
if (rep->r_flags & R_SENT) {
rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
@@ -1352,7 +1620,7 @@
* Request terminated, wakeup the blocked process, so that we
* can return EINTR back.
*/
- wakeup_nfsreq(rep);
+ wakeup((caddr_t)rep);
}
/*
@@ -1449,28 +1717,6 @@
}
/*
- * NFS wrapper to tsleep(), that shoves a new p_sigmask and restores the
- * old one after tsleep() returns.
- */
-int
-nfs_tsleep(struct thread *td, void *ident, int priority, char *wmesg, int timo)
-{
- sigset_t oldset;
- int error;
- struct proc *p;
-
- if ((priority & PCATCH) == 0)
- return tsleep(ident, priority, wmesg, timo);
- if (td == NULL)
- td = curthread; /* XXX */
- nfs_set_sigmask(td, &oldset);
- error = tsleep(ident, priority, wmesg, timo);
- nfs_restore_sigmask(td, &oldset);
- p = td->td_proc;
- return (error);
-}
-
-/*
* Test for a termination condition pending on the process.
* This is used for NFSMNT_INT mounts.
*/
@@ -1479,11 +1725,17 @@
{
struct proc *p;
sigset_t tmpset;
-
+
if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
return nfs4_sigintr(nmp, rep, td);
- if (rep && (rep->r_flags & R_SOFTTERM))
- return (EIO);
+ if (rep) {
+ mtx_lock(&rep->r_mtx);
+ if (rep->r_flags & R_SOFTTERM) {
+ mtx_unlock(&rep->r_mtx);
+ return (EIO);
+ } else
+ mtx_unlock(&rep->r_mtx);
+ }
/* Terminate all requests while attempting a forced unmount. */
if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
return (EIO);
@@ -1491,7 +1743,6 @@
return (0);
if (td == NULL)
return (0);
-
p = td->td_proc;
PROC_LOCK(p);
tmpset = p->p_siglist;
@@ -1500,12 +1751,12 @@
mtx_lock(&p->p_sigacts->ps_mtx);
SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
mtx_unlock(&p->p_sigacts->ps_mtx);
- if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) && nfs_sig_pending(tmpset)) {
+ if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
+ && nfs_sig_pending(tmpset)) {
PROC_UNLOCK(p);
return (EINTR);
}
PROC_UNLOCK(p);
-
return (0);
}
@@ -1516,7 +1767,7 @@
* in progress when a reconnect is necessary.
*/
int
-nfs_sndlock(struct nfsreq *rep)
+nfs_connect_lock(struct nfsreq *rep)
{
int *statep = &rep->r_nmp->nm_state;
struct thread *td;
@@ -1527,11 +1778,12 @@
slpflag = PCATCH;
while (*statep & NFSSTA_SNDLOCK) {
error = nfs_sigintr(rep->r_nmp, rep, td);
- if (error)
+ if (error) {
return (error);
+ }
*statep |= NFSSTA_WANTSND;
- (void) tsleep(statep, slpflag | (PZERO - 1),
- "nfsndlck", slptimeo);
+ (void) msleep(statep, &rep->r_nmp->nm_mtx,
+ slpflag | (PZERO - 1), "nfsndlck", slptimeo);
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
@@ -1545,7 +1797,7 @@
* Unlock the stream socket for others.
*/
void
-nfs_sndunlock(struct nfsreq *rep)
+nfs_connect_unlock(struct nfsreq *rep)
{
int *statep = &rep->r_nmp->nm_state;
@@ -1622,8 +1874,6 @@
{
struct proc *p;
- GIANT_REQUIRED; /* tprintf */
-
p = td ? td->td_proc : NULL;
if (error) {
tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
@@ -1642,25 +1892,31 @@
const char *msg;
int error, flags;
{
-
- GIANT_REQUIRED; /* nfs_msg */
-
if (nmp == NULL)
return;
+ mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
+ nmp->nm_state |= NFSSTA_TIMEO;
+ mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESP, 0);
- nmp->nm_state |= NFSSTA_TIMEO;
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
#ifdef NFSSTA_LOCKTIMEO
+ mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
+ nmp->nm_state |= NFSSTA_LOCKTIMEO;
+ mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESPLOCK, 0);
- nmp->nm_state |= NFSSTA_LOCKTIMEO;
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
#endif
- if (rep)
+ if (rep != NULL) {
+ mtx_lock(&rep->r_mtx);
rep->r_flags |= R_TPRINTFMSG;
+ mtx_unlock(&rep->r_mtx);
+ }
nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
}
@@ -1672,24 +1928,32 @@
const char *msg;
int flags;
{
-
- GIANT_REQUIRED; /* nfs_msg */
-
- if (nmp == NULL)
+ if (nmp == NULL || rep == NULL)
return;
- if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0)
+ mtx_lock(&rep->r_mtx);
+ if ((rep->r_flags & R_TPRINTFMSG) != 0) {
+ mtx_unlock(&rep->r_mtx);
nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
+ } else
+ mtx_unlock(&rep->r_mtx);
+
+ mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
nmp->nm_state &= ~NFSSTA_TIMEO;
+ mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESP, 1);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
+
#ifdef NFSSTA_LOCKTIMEO
+ mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
+ mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESPLOCK, 1);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
#endif
}
-
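For reference, the estimator consolidated into nfs_init_rtt()/nfs_update_rtt()/
nfs_estimate_rto() above is classic Jacobson-style fixed-point smoothing: srtt
is kept scaled by 8 and sdrtt by 4, which is what the shifts undo. A minimal
userland sketch of the same arithmetic (NFS_HZ hard-wired to 10 ticks/second
purely for illustration; only the read/write A+4D formula is shown):

#include <stdio.h>

#define NFS_HZ		10			/* assumed tick rate */
#define NFS_MINRTO	(NFS_HZ >> 2)
#define NFS_MAXRTO	(20 * NFS_HZ)

static int srtt = NFS_HZ << 3;		/* NFS_INITRTT: 1 sec, scaled by 8 */
static int sdrtt;			/* mean deviation, scaled by 4 */

/* Gain 1/8 on the mean, 1/4 on the deviation, as in nfs_update_rtt(). */
static void
update_rtt(int rtt_ticks)
{
	int t1 = rtt_ticks + 1;		/* +1 compensates for coarse ticks */

	t1 -= srtt >> 3;
	srtt += t1;			/* srtt += rtt - srtt/8 */
	if (t1 < 0)
		t1 = -t1;
	t1 -= sdrtt >> 2;
	sdrtt += t1;			/* sdrtt += |err| - sdrtt/4 */
}

/* Read/write timer class: A + 4D, clamped to [NFS_MINRTO, NFS_MAXRTO]. */
static int
estimate_rto(void)
{
	int rto = ((srtt + 7) >> 3) + (sdrtt + 1);

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;
	return (rto);
}

int
main(void)
{
	int i, samples[] = { 3, 5, 2, 8, 4 };

	for (i = 0; i < 5; i++) {
		update_rtt(samples[i]);
		printf("sample=%d srtt=%d sdrtt=%d rto=%d\n",
		    samples[i], srtt, sdrtt, estimate_rto());
	}
	return (0);
}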
Index: nfs_lock.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_lock.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/nfsclient/nfs_lock.c -L sys/nfsclient/nfs_lock.c -u -r1.1.1.2 -r1.2
--- sys/nfsclient/nfs_lock.c
+++ sys/nfsclient/nfs_lock.c
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_lock.c,v 1.40.2.2 2006/02/14 00:06:32 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_lock.c,v 1.45 2007/04/21 18:11:18 rwatson Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -43,6 +43,7 @@
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
@@ -64,8 +65,8 @@
extern void (*nlminfo_release_p)(struct proc *p);
-MALLOC_DEFINE(M_NFSLOCK, "NFS lock", "NFS lock request");
-MALLOC_DEFINE(M_NLMINFO, "nlminfo", "NFS lock process structure");
+MALLOC_DEFINE(M_NFSLOCK, "nfsclient_lock", "NFS lock request");
+MALLOC_DEFINE(M_NLMINFO, "nfsclient_nlminfo", "NFS lock process structure");
static int nfslockdans(struct thread *td, struct lockd_ans *ansp);
static void nlminfo_release(struct proc *p);
@@ -85,6 +86,10 @@
{
int error;
+ error = priv_check(td, PRIV_NFS_LOCKD);
+ if (error)
+ return (error);
+
mtx_lock(&nfslock_mtx);
if (!nfslock_isopen) {
error = 0;
@@ -290,7 +295,7 @@
return (error);
/*
- * retry after 20 seconds if we haven't gotten a responce yet.
+ * Retry after 20 seconds if we haven't gotten a response yet.
* This number was picked out of thin air... but is longer
* than even a reasonably loaded system should take (at least
* on a local network). XXX Probably should use a back-off
@@ -339,17 +344,6 @@
nfslockdans(struct thread *td, struct lockd_ans *ansp)
{
struct proc *targetp;
- int error;
-
- /* Let root, or someone who once was root (lockd generally
- * switches to the daemon uid once it is done setting up) make
- * this call.
- *
- * XXX This authorization check is probably not right.
- */
- if ((error = suser(td)) != 0 &&
- td->td_ucred->cr_svuid != 0)
- return (error);
/* the version should match, or we're out of sync */
if (ansp->la_vers != LOCKD_ANS_VERSION)
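The nfs_lock.c change above swaps the old ad-hoc authorization in
nfslockdans() (a suser() call plus a saved-uid test on every lockd answer)
for a single priv_check(9) when the lockd device is opened. A sketch of the
resulting pattern; the handler itself is hypothetical, only priv_check()
and PRIV_NFS_LOCKD come from the commit:

#include <sys/param.h>
#include <sys/priv.h>
#include <sys/proc.h>

static int
example_lockd_open(struct thread *td)
{
	int error;

	/* One privilege test at open time... */
	error = priv_check(td, PRIV_NFS_LOCKD);
	if (error)
		return (error);
	/* ...and every later request on the descriptor is implicitly
	   authorized, instead of re-testing credentials per answer. */
	return (0);
}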
Index: krpc_subr.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/krpc_subr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/krpc_subr.c -L sys/nfsclient/krpc_subr.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/krpc_subr.c
+++ sys/nfsclient/krpc_subr.c
@@ -43,7 +43,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/krpc_subr.c,v 1.29 2005/03/16 08:13:08 jmg Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/krpc_subr.c,v 1.30 2007/08/06 14:26:02 rwatson Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -215,8 +215,6 @@
nam = mhead = NULL;
from = NULL;
- NET_ASSERT_GIANT();
-
/*
* Create socket and set its receive timeout.
*/
Index: nfs.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs.h -L sys/nfsclient/nfs.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs.h
+++ sys/nfsclient/nfs.h
@@ -30,7 +30,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $FreeBSD: src/sys/nfsclient/nfs.h,v 1.90 2005/01/24 12:31:06 phk Exp $
+ * $FreeBSD: src/sys/nfsclient/nfs.h,v 1.98.2.1 2007/10/12 19:18:46 mohans Exp $
*/
#ifndef _NFSCLIENT_NFS_H_
@@ -53,7 +53,8 @@
#define NFS_MAXTIMEO (60 * NFS_HZ) /* Max timeout to backoff to */
#define NFS_MINIDEMTIMEO (5 * NFS_HZ) /* Min timeout for non-idempotent ops*/
#define NFS_MAXREXMIT 100 /* Stop counting after this many */
-#define NFS_RETRANS 10 /* Num of retrans for soft mounts */
+#define NFS_RETRANS 10 /* Num of retrans for UDP soft mounts */
+#define NFS_RETRANS_TCP 2 /* Num of retrans for TCP soft mounts */
#define NFS_MAXGRPS 16 /* Max. size of groups list */
#ifndef NFS_MINATTRTIMO
#define NFS_MINATTRTIMO 3 /* VREG attrib cache timeout in sec */
@@ -85,6 +86,7 @@
#define NFS_CMPFH(n, f, s) \
((n)->n_fhsize == (s) && !bcmp((caddr_t)(n)->n_fhp, (caddr_t)(f), (s)))
#define NFS_ISV3(v) (VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV3)
+#define NFS_ISV4(v) (VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV4)
#define NFSSTA_HASWRITEVERF 0x00040000 /* Has write verifier for V3 */
#define NFSSTA_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */
@@ -131,6 +133,7 @@
extern struct callout nfs_callout;
extern struct nfsstats nfsstats;
+extern struct mtx nfs_iod_mtx;
extern int nfs_numasync;
extern unsigned int nfs_iodmax;
@@ -144,18 +147,13 @@
extern int nfsv3_procid[NFS_NPROCS];
-struct uio;
-struct buf;
-struct vattr;
-struct nameidata;
-
/*
* Socket errors ignored for connectionless sockets??
* For now, ignore them all
*/
#define NFSIGNORE_SOERROR(s, e) \
((e) != EINTR && (e) != EIO && \
- (e) != ERESTART && (e) != EWOULDBLOCK && \
+ (e) != ERESTART && (e) != EWOULDBLOCK && \
((s) & PR_CONNREQUIRED) == 0)
/*
@@ -178,6 +176,7 @@
int r_rtt; /* RTT for rpc */
int r_lastmsg; /* last tprintf */
struct thread *r_td; /* Proc that did I/O system call */
+ struct mtx r_mtx; /* Protects nfsreq fields */
};
/*
@@ -194,19 +193,27 @@
#define R_TPRINTFMSG 0x20 /* Did a tprintf msg. */
#define R_MUSTRESEND 0x40 /* Must resend request */
#define R_GETONEREP 0x80 /* Probe for one reply only */
+#define R_PIN_REQ 0x100 /* Pin request down (rexmit in prog or other) */
+
+struct buf;
+struct socket;
+struct uio;
+struct vattr;
/*
* Pointers to ops that differ from v3 to v4
*/
struct nfs_rpcops {
- int (*nr_readrpc)(struct vnode *vp, struct uio *uiop, struct ucred *cred);
- int (*nr_writerpc)(struct vnode *vp, struct uio *uiop, struct ucred *cred,
- int *iomode, int *must_commit);
+ int (*nr_readrpc)(struct vnode *vp, struct uio *uiop,
+ struct ucred *cred);
+ int (*nr_writerpc)(struct vnode *vp, struct uio *uiop,
+ struct ucred *cred, int *iomode, int *must_commit);
int (*nr_writebp)(struct buf *bp, int force, struct thread *td);
- int (*nr_readlinkrpc)(struct vnode *vp, struct uio *uiop, struct ucred *cred);
+ int (*nr_readlinkrpc)(struct vnode *vp, struct uio *uiop,
+ struct ucred *cred);
void (*nr_invaldir)(struct vnode *vp);
int (*nr_commit)(struct vnode *vp, u_quad_t offset, int cnt,
- struct ucred *cred, struct thread *td);
+ struct ucred *cred, struct thread *td);
};
/*
@@ -254,6 +261,31 @@
#endif
+/*
+ * On fast networks, the estimator will try to reduce the
+ * timeout lower than the latency of the server's disks,
+ * which results in too many timeouts, so cap the lower
+ * bound.
+ */
+#define NFS_MINRTO (NFS_HZ >> 2)
+
+/*
+ * Keep the RTO from increasing to unreasonably large values
+ * when a server is not responding.
+ */
+#define NFS_MAXRTO (20 * NFS_HZ)
+
+enum nfs_rto_timer_t {
+ NFS_DEFAULT_TIMER,
+ NFS_GETATTR_TIMER,
+ NFS_LOOKUP_TIMER,
+ NFS_READ_TIMER,
+ NFS_WRITE_TIMER,
+};
+#define NFS_MAX_TIMER (NFS_WRITE_TIMER)
+
+#define NFS_INITRTT (NFS_HZ << 3)
+
vfs_init_t nfs_init;
vfs_uninit_t nfs_uninit;
int nfs_mountroot(struct mount *mp, struct thread *td);
@@ -261,8 +293,8 @@
#ifndef NFS4_USE_RPCCLNT
int nfs_send(struct socket *, struct sockaddr *, struct mbuf *,
struct nfsreq *);
-int nfs_sndlock(struct nfsreq *);
-void nfs_sndunlock(struct nfsreq *);
+int nfs_connect_lock(struct nfsreq *);
+void nfs_connect_unlock(struct nfsreq *);
#endif /* ! NFS4_USE_RPCCLNT */
int nfs_vinvalbuf(struct vnode *, int, struct thread *, int);
@@ -275,8 +307,8 @@
int nfs_nfsiodnew(void);
int nfs_asyncio(struct nfsmount *, struct buf *, struct ucred *, struct thread *);
int nfs_doio(struct vnode *, struct buf *, struct ucred *, struct thread *);
-void nfs_doio_directwrite (struct buf *);
-void nfs_up(struct nfsreq *, struct nfsmount *, struct thread *,
+void nfs_doio_directwrite (struct buf *);
+void nfs_up(struct nfsreq *, struct nfsmount *, struct thread *,
const char *, int);
void nfs_down(struct nfsreq *, struct nfsmount *, struct thread *,
const char *, int, int);
@@ -297,6 +329,7 @@
void nfs_disconnect(struct nfsmount *);
void nfs_safedisconnect(struct nfsmount *);
int nfs_getattrcache(struct vnode *, struct vattr *);
+int nfs_iosize(struct nfsmount *nmp);
int nfsm_strtmbuf(struct mbuf **, char **, const char *, long);
int nfs_bioread(struct vnode *, struct uio *, int, struct ucred *);
int nfsm_uiotombuf(struct uio *, struct mbuf **, int, caddr_t *);
@@ -307,12 +340,10 @@
int nfs_meta_setsize (struct vnode *, struct ucred *,
struct thread *, u_quad_t);
-void nfs_set_sigmask __P((struct thread *td, sigset_t *oldset));
-void nfs_restore_sigmask __P((struct thread *td, sigset_t *set));
-int nfs_tsleep __P((struct thread *td, void *ident, int priority, char *wmesg,
- int timo));
-int nfs_msleep __P((struct thread *td, void *ident, struct mtx *mtx, int priority,
- char *wmesg, int timo));
+void nfs_set_sigmask(struct thread *td, sigset_t *oldset);
+void nfs_restore_sigmask(struct thread *td, sigset_t *set);
+int nfs_msleep(struct thread *td, void *ident, struct mtx *mtx,
+ int priority, char *wmesg, int timo);
#endif /* _KERNEL */
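A note on the indexing convention these nfs.h additions establish:
NFS_DEFAULT_TIMER (value 0) carries no per-class estimator state, so the
per-mount arrays are dimensioned NFS_MAX_TIMER (4, not 5) and indexed with
(timer - 1). A compilable fragment making that explicit; the helper is
illustrative, not part of the header:

#include <stddef.h>

enum nfs_rto_timer_t {
	NFS_DEFAULT_TIMER,	/* 0: no per-class state, nm_timeo is used */
	NFS_GETATTR_TIMER,
	NFS_LOOKUP_TIMER,
	NFS_READ_TIMER,
	NFS_WRITE_TIMER,
};
#define NFS_MAX_TIMER	(NFS_WRITE_TIMER)

static int nm_srtt[NFS_MAX_TIMER];	/* one slot per non-default class */

static int *
srtt_slot(enum nfs_rto_timer_t t)
{
	return (t == NFS_DEFAULT_TIMER ? NULL : &nm_srtt[t - 1]);
}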
Index: nfsmount.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsmount.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsmount.h -L sys/nfsclient/nfsmount.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsmount.h
+++ sys/nfsclient/nfsmount.h
@@ -30,7 +30,7 @@
* SUCH DAMAGE.
*
* @(#)nfsmount.h 8.3 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/nfsclient/nfsmount.h,v 1.30 2005/06/10 23:50:40 green Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsmount.h,v 1.32.2.1 2007/10/12 19:18:46 mohans Exp $
*/
#ifndef _NFSCLIENT_NFSMOUNT_H_
@@ -40,8 +40,9 @@
int rpcresid;
#define NFS_TCP_EXPECT_RPCMARKER 0x0001 /* Expect to see a RPC/TCP marker next */
#define NFS_TCP_FORCE_RECONNECT 0x0002 /* Force a TCP reconnect */
+#define NFS_TCP_WAIT_WRITE_DRAIN 0x0004 /* Waiting for socket writers to finish */
int flags;
- struct mtx mtx;
+ int sock_send_inprog;
};
/*
@@ -50,6 +51,7 @@
* Holds NFS specific information for mount.
*/
struct nfsmount {
+ struct mtx nm_mtx;
int nm_flag; /* Flags for soft/hard... */
int nm_state; /* Internal state flags */
struct mount *nm_mountp; /* Vfs structure for this filesystem */
@@ -64,8 +66,8 @@
struct sockaddr *nm_nam; /* Addr of server */
int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */
int nm_retry; /* Max retries */
- int nm_srtt[4]; /* Timers for rpcs */
- int nm_sdrtt[4];
+ int nm_srtt[NFS_MAX_TIMER], /* RTT Timers for rpcs */
+ nm_sdrtt[NFS_MAX_TIMER];
int nm_sent; /* Request send count */
int nm_cwnd; /* Request send window */
int nm_timeouts; /* Request timeouts */
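The sock_send_inprog counter and NFS_TCP_WAIT_WRITE_DRAIN flag added here
back the writer-drain handshake in nfs_socket.c: a sender counts itself in
before dropping nm_mtx to call sosend(), and nfs_reconnect() waits for the
count to reach zero before tearing the socket down. A self-contained sketch
of the same handshake, with a pthread mutex/condvar standing in for nm_mtx
and the kernel's msleep()/wakeup() channel:

#include <pthread.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int sock_send_inprog;		/* writers currently off-lock */

static void
send_begin(void)			/* before dropping the lock to send */
{
	pthread_mutex_lock(&mtx);
	sock_send_inprog++;
	pthread_mutex_unlock(&mtx);
}

static void
send_end(void)				/* cf. nfs_wakup_reconnectors() */
{
	pthread_mutex_lock(&mtx);
	if (--sock_send_inprog == 0)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&mtx);
}

static void
reconnect_wait_for_drain(void)		/* cf. the loop in nfs_reconnect() */
{
	pthread_mutex_lock(&mtx);
	while (sock_send_inprog > 0)
		pthread_cond_wait(&drained, &mtx);
	/* safe to tear down and rebuild the socket here */
	pthread_mutex_unlock(&mtx);
}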
Index: nfs_diskless.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_diskless.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_diskless.c -L sys/nfsclient/nfs_diskless.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_diskless.c
+++ sys/nfsclient/nfs_diskless.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_diskless.c,v 1.11.2.2 2006/03/20 15:45:14 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_diskless.c,v 1.17 2006/12/06 02:15:25 sam Exp $");
#include "opt_bootp.h"
@@ -60,10 +60,34 @@
static int hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa);
static int decode_nfshandle(char *ev, u_char *fh);
-static void
-nfs_parse_options(const char *envopts, struct nfs_diskless *nd)
+/*
+ * Validate/sanity check a rsize/wsize parameter.
+ */
+static int
+checkrwsize(unsigned long v, const char *name)
+{
+ /*
+ * 32K is used as an upper bound because most servers
+ * limit block size to satisfy IPv4's limit of
+ * 64K/reassembled packet. The lower bound is pretty
+ * much arbitrary.
+ */
+ if (!(4 <= v && v <= 32*1024)) {
+ printf("nfs_parse_options: invalid %s %lu ignored\n", name, v);
+ return 0;
+ } else
+ return 1;
+}
+
+/*
+ * Parse mount options and apply them to the supplied
+ * nfs_diskless state. Used also by bootp/dhcp support.
+ */
+void
+nfs_parse_options(const char *envopts, struct nfs_args *nd)
{
char *opts, *o, *otmp;
+ unsigned long v;
opts = strdup(envopts, M_TEMP);
otmp = opts;
@@ -71,15 +95,37 @@
if (*o == '\0')
; /* Skip empty options. */
else if (strcmp(o, "soft") == 0)
- nd->root_args.flags |= NFSMNT_SOFT;
+ nd->flags |= NFSMNT_SOFT;
else if (strcmp(o, "intr") == 0)
- nd->root_args.flags |= NFSMNT_INT;
+ nd->flags |= NFSMNT_INT;
else if (strcmp(o, "conn") == 0)
- nd->root_args.flags |= NFSMNT_NOCONN;
+ nd->flags |= NFSMNT_NOCONN;
else if (strcmp(o, "nolockd") == 0)
- nd->root_args.flags |= NFSMNT_NOLOCKD;
- else
- printf("nfs_diskless: unknown option: %s\n", o);
+ nd->flags |= NFSMNT_NOLOCKD;
+ else if (strcmp(o, "nfsv2") == 0)
+ nd->flags &= ~(NFSMNT_NFSV3 | NFSMNT_NFSV4);
+ else if (strcmp(o, "nfsv3") == 0) {
+ nd->flags &= ~NFSMNT_NFSV4;
+ nd->flags |= NFSMNT_NFSV3;
+ } else if (strcmp(o, "tcp") == 0)
+ nd->sotype = SOCK_STREAM;
+ else if (strcmp(o, "udp") == 0)
+ nd->sotype = SOCK_DGRAM;
+ else if (strncmp(o, "rsize=", 6) == 0) {
+ v = strtoul(o+6, NULL, 10);
+ if (checkrwsize(v, "rsize")) {
+ nd->rsize = (int) v;
+ nd->flags |= NFSMNT_RSIZE;
+ }
+ } else if (strncmp(o, "wsize=", 6) == 0) {
+ v = strtoul(o+6, NULL, 10);
+ if (checkrwsize(v, "wsize")) {
+ nd->wsize = (int) v;
+ nd->flags |= NFSMNT_WSIZE;
+ }
+ } else
+ printf("%s: skipping unknown option \"%s\"\n",
+ __func__, o);
}
free(opts, M_TEMP);
}
@@ -132,12 +178,12 @@
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if ((ifa->ifa_addr->sa_family == AF_LINK) &&
- (sdl = ((struct sockaddr_dl *)ifa->ifa_addr))) {
+ if (ifa->ifa_addr->sa_family == AF_LINK) {
+ sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if ((sdl->sdl_type == ourdl.sdl_type) &&
(sdl->sdl_alen == ourdl.sdl_alen) &&
- !bcmp(sdl->sdl_data + sdl->sdl_nlen,
- ourdl.sdl_data + ourdl.sdl_nlen,
+ !bcmp(LLADDR(sdl),
+ LLADDR(&ourdl),
sdl->sdl_alen)) {
IFNET_RUNLOCK();
goto match_done;
@@ -174,7 +220,18 @@
freeenv(cp);
}
if ((cp = getenv("boot.nfsroot.options")) != NULL) {
- nfs_parse_options(cp, nd);
+ struct nfs_args args;
+
+ /* XXX yech, convert between old and current arg format */
+ args.flags = nd->root_args.flags;
+ args.sotype = nd->root_args.sotype;
+ args.rsize = nd->root_args.rsize;
+ args.wsize = nd->root_args.wsize;
+ nfs_parse_options(cp, &args);
+ nd->root_args.flags = args.flags;
+ nd->root_args.sotype = args.sotype;
+ nd->root_args.rsize = args.rsize;
+ nd->root_args.wsize = args.wsize;
freeenv(cp);
}
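
With the conversion shim above, the loader environment stays the single way to hand options to the diskless root mount; the new keywords just extend it. Assuming the usual loader(8) setup, a line of this shape (values are only examples) exercises the new parser:

    boot.nfsroot.options="nfsv3,tcp,rsize=16384,wsize=16384"

The copy-in/copy-out dance around nfs_parse_options() is only there because the function now takes a struct nfs_args instead of the whole nfs_diskless.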
Index: bootp_subr.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/bootp_subr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/bootp_subr.c -L sys/nfsclient/bootp_subr.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/bootp_subr.c
+++ sys/nfsclient/bootp_subr.c
@@ -41,7 +41,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/bootp_subr.c,v 1.64 2005/04/26 20:45:29 des Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/bootp_subr.c,v 1.70 2007/08/06 14:26:02 rwatson Exp $");
#include "opt_bootp.h"
@@ -220,7 +220,6 @@
const struct in_addr *siaddr);
static int getdec(char **ptr);
static int getip(char **ptr, struct in_addr *ip);
-static char *substr(char *a, char *b);
static void mountopts(struct nfs_args *args, char *p);
static int xdr_opaque_decode(struct mbuf **ptr, u_char *buf, int len);
static int xdr_int_decode(struct mbuf **ptr, int *iptr);
@@ -591,8 +590,6 @@
int retry;
const char *s;
- NET_ASSERT_GIANT();
-
/*
* Create socket and set its receive timeout.
*/
@@ -760,7 +757,7 @@
}
/* XXX: Is this needed ? */
- tsleep(&error, PZERO + 8, "bootpw", 10);
+ pause("bootpw", hz/10);
/* Set netmask to 255.0.0.0 */
@@ -983,8 +980,6 @@
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
- NET_ASSERT_GIANT();
-
error = socreate(AF_INET, &ifctx->so, SOCK_DGRAM, 0, td->td_ucred, td);
if (error != 0)
panic("nfs_boot: socreate, error=%d", error);
@@ -1047,13 +1042,12 @@
/* Get HW address */
sdl = NULL;
- for (ifa = TAILQ_FIRST(&ifctx->ifp->if_addrhead);
- ifa != NULL;
- ifa = TAILQ_NEXT(ifa, ifa_link))
- if (ifa->ifa_addr->sa_family == AF_LINK &&
- (sdl = ((struct sockaddr_dl *) ifa->ifa_addr)) != NULL &&
- sdl->sdl_type == IFT_ETHER)
- break;
+ TAILQ_FOREACH(ifa, &ifctx->ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr->sa_family == AF_LINK) {
+ sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+ if (sdl->sdl_type == IFT_ETHER)
+ break;
+ }
if (sdl == NULL)
panic("bootpc: Unable to find HW address for %s",
@@ -1235,51 +1229,16 @@
return ret;
}
-static char *
-substr(char *a, char *b)
-{
- char *loc1;
- char *loc2;
-
- while (*a != '\0') {
- loc1 = a;
- loc2 = b;
- while (*loc1 == *loc2++) {
- if (*loc1 == '\0')
- return 0;
- loc1++;
- if (*loc2 == '\0')
- return loc1;
- }
- a++;
- }
- return 0;
-}
-
static void
mountopts(struct nfs_args *args, char *p)
{
- char *tmp;
-
args->version = NFS_ARGSVERSION;
args->rsize = 8192;
args->wsize = 8192;
args->flags = NFSMNT_RSIZE | NFSMNT_WSIZE | NFSMNT_RESVPORT;
args->sotype = SOCK_DGRAM;
- if (p == NULL)
- return;
- if ((tmp = (char *)substr(p, "rsize=")))
- args->rsize = getdec(&tmp);
- if ((tmp = (char *)substr(p, "wsize=")))
- args->wsize = getdec(&tmp);
- if ((tmp = (char *)substr(p, "intr")))
- args->flags |= NFSMNT_INT;
- if ((tmp = (char *)substr(p, "soft")))
- args->flags |= NFSMNT_SOFT;
- if ((tmp = (char *)substr(p, "noconn")))
- args->flags |= NFSMNT_NOCONN;
- if ((tmp = (char *)substr(p, "tcp")))
- args->sotype = SOCK_STREAM;
+ if (p != NULL)
+ nfs_parse_options(p, args);
}
static int
@@ -1815,6 +1774,7 @@
int authcount;
int authver;
+ /* XXX honor v2/v3 flags in args->flags? */
#ifdef BOOTP_NFSV3
/* First try NFS v3 */
/* Get port number for MOUNTD. */
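
The bootp change mirrors the diskless one: option strings from the BOOTP/DHCP server now run through the shared comma-separated parser rather than the ad-hoc substr() scanning deleted above. The tokenizing style, sketched in userland C (strsep() is an assumption about the loop elided between the hunks; it is not shown in this diff):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
            char *opts, *otmp, *o;

            /* Work on a copy, as the kernel parser does via strdup(M_TEMP). */
            opts = otmp = strdup("soft,intr,tcp,wsize=8192");
            if (opts == NULL)
                    return (1);
            while ((o = strsep(&otmp, ",")) != NULL) {
                    if (*o == '\0')
                            continue;       /* skip empty options */
                    printf("option: %s\n", o);
            }
            free(opts);
            return (0);
    }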
Index: nfs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_vnops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_vnops.c -L sys/nfsclient/nfs_vnops.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_vnops.c
+++ sys/nfsclient/nfs_vnops.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.258.2.4.2.2 2006/04/18 14:15:50 jon Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.276.2.1 2007/11/27 12:20:58 rwatson Exp $");
/*
* vnode op calls for Sun NFS version 2 and 3
@@ -192,6 +192,7 @@
/*
* Global variables
*/
+struct mtx nfs_iod_mtx;
struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
int nfs_numasync = 0;
@@ -241,6 +242,23 @@
#define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY \
| NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \
| NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
+
+/*
+ * SMP Locking Note :
+ * The list of locks after the description of the lock is the ordering
+ * of other locks acquired with the lock held.
+ * np->n_mtx : Protects the fields in the nfsnode.
+ VM Object Lock
+ VI_MTX (acquired indirectly)
+ * nmp->nm_mtx : Protects the fields in the nfsmount.
+ rep->r_mtx
+ * nfs_iod_mtx : Global lock, protects shared nfsiod state.
+ * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
+ nmp->nm_mtx
+ rep->r_mtx
+ * rep->r_mtx : Protects the fields in an nfsreq.
+ */
+
static int
nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
struct ucred *cred)
@@ -266,9 +284,11 @@
if (!error) {
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
rmode = fxdr_unsigned(u_int32_t, *tl);
+ mtx_lock(&np->n_mtx);
np->n_mode = rmode;
np->n_modeuid = cred->cr_uid;
np->n_modestamp = time_second;
+ mtx_unlock(&np->n_mtx);
}
m_freem(mrep);
nfsmout:
@@ -343,6 +363,7 @@
* Does our cached result allow us to give a definite yes to
* this request?
*/
+ mtx_lock(&np->n_mtx);
if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) &&
(ap->a_cred->cr_uid == np->n_modeuid) &&
((np->n_mode & mode) == mode)) {
@@ -352,18 +373,21 @@
* Either a no, or a don't know. Go to the wire.
*/
nfsstats.accesscache_misses++;
+ mtx_unlock(&np->n_mtx);
error = nfs3_access_otw(vp, wmode, ap->a_td,ap->a_cred);
+ mtx_lock(&np->n_mtx);
if (!error) {
if ((np->n_mode & mode) != mode) {
error = EACCES;
}
}
}
+ mtx_unlock(&np->n_mtx);
return (error);
} else {
- if ((error = nfsspec_access(ap)) != 0)
+ if ((error = nfsspec_access(ap)) != 0) {
return (error);
-
+ }
/*
* Attempt to prevent a mapped root from accessing a file
* which it shouldn't. We try to read a byte from the file
@@ -371,12 +395,14 @@
* After calling nfsspec_access, we should have the correct
* file size cached.
*/
+ mtx_lock(&np->n_mtx);
if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD)
&& VTONFS(vp)->n_size > 0) {
struct iovec aiov;
struct uio auio;
char buf[1];
+ mtx_unlock(&np->n_mtx);
aiov.iov_base = buf;
aiov.iov_len = 1;
auio.uio_iov = &aiov;
@@ -400,11 +426,14 @@
error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
else
error = EACCES;
- }
+ } else
+ mtx_unlock(&np->n_mtx);
return (error);
}
}
+int nfs_otw_getattr_avoid = 0;
+
/*
* nfs open vnode op
* Check to see if the type is ok
@@ -428,7 +457,9 @@
/*
* Get a valid lease. If cached data is stale, flush it.
*/
+ mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
+ mtx_unlock(&np->n_mtx);
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
if (error == EINTR || error == EIO)
return (error);
@@ -438,20 +469,35 @@
error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
if (error)
return (error);
+ mtx_lock(&np->n_mtx);
np->n_mtime = vattr.va_mtime;
+ mtx_unlock(&np->n_mtx);
} else {
- np->n_attrstamp = 0;
+ struct thread *td = curthread;
+
+ if (np->n_ac_ts_syscalls != td->td_syscalls ||
+ np->n_ac_ts_tid != td->td_tid ||
+ td->td_proc == NULL ||
+ np->n_ac_ts_pid != td->td_proc->p_pid) {
+ np->n_attrstamp = 0;
+ }
+ mtx_unlock(&np->n_mtx);
error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
if (error)
return (error);
+ mtx_lock(&np->n_mtx);
if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
if (vp->v_type == VDIR)
np->n_direofoffset = 0;
+ mtx_unlock(&np->n_mtx);
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
- if (error == EINTR || error == EIO)
+ if (error == EINTR || error == EIO) {
return (error);
+ }
+ mtx_lock(&np->n_mtx);
np->n_mtime = vattr.va_mtime;
}
+ mtx_unlock(&np->n_mtx);
}
/*
* If the object has >= 1 O_DIRECT active opens, we disable caching.
@@ -461,12 +507,13 @@
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
if (error)
return (error);
+ mtx_lock(&np->n_mtx);
np->n_flag |= NNONCACHE;
+ mtx_unlock(&np->n_mtx);
}
np->n_directio_opens++;
}
- np->ra_expect_lbn = 0;
- vnode_create_vobject_off(vp, vattr.va_size, ap->a_td);
+ vnode_create_vobject(vp, vattr.va_size, ap->a_td);
return (0);
}
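
The pattern running through nfs_open() (and most hunks below) is: take n_mtx, sample or update the nfsnode fields, and drop the mutex before anything that can sleep, such as nfs_vinvalbuf() or VOP_GETATTR(). A userland model of that discipline with POSIX threads (nothing here is kernel API):

    #include <pthread.h>
    #include <stdio.h>

    struct node {
            pthread_mutex_t mtx;
            int modified;                   /* plays the NMODIFIED bit */
    };

    static void
    flush_node(struct node *np)
    {
            int need_flush;

            pthread_mutex_lock(&np->mtx);
            need_flush = np->modified;      /* sample state under the lock */
            pthread_mutex_unlock(&np->mtx); /* drop before blocking work */
            if (need_flush)
                    printf("flushing...\n");    /* nfs_vinvalbuf() stand-in */
    }

    int
    main(void)
    {
            struct node n = { PTHREAD_MUTEX_INITIALIZER, 1 };

            flush_node(&n);
            return (0);
    }

Sleeping with a default mutex held would be a deadlock hazard, which is why the diff brackets every blocking call with an unlock/relock pair.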
@@ -519,7 +566,9 @@
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp->v_object);
}
+ mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
+ mtx_unlock(&np->n_mtx);
if (NFS_ISV3(vp)) {
/*
* Under NFSv3 we have dirty buffers to dispose of. We
@@ -539,6 +588,7 @@
/* np->n_flag &= ~NMODIFIED; */
} else
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
+ mtx_lock(&np->n_mtx);
}
/*
* Invalidate the attribute cache in all cases.
@@ -551,13 +601,20 @@
np->n_flag &= ~NWRITEERR;
error = np->n_error;
}
+ mtx_unlock(&np->n_mtx);
}
+ if (nfs_directio_enable)
+ KASSERT((np->n_directio_asyncwr == 0),
+ ("nfs_close: dirty unflushed (%d) directio buffers\n",
+ np->n_directio_asyncwr));
if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
+ mtx_lock(&np->n_mtx);
KASSERT((np->n_directio_opens > 0),
- ("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
+ ("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
np->n_directio_opens--;
if (np->n_directio_opens == 0)
np->n_flag &= ~NNONCACHE;
+ mtx_unlock(&np->n_mtx);
}
return (error);
}
@@ -578,21 +635,21 @@
/*
* Update local times for special files.
*/
+ mtx_lock(&np->n_mtx);
if (np->n_flag & (NACC | NUPD))
np->n_flag |= NCHG;
+ mtx_unlock(&np->n_mtx);
/*
* First look in the cache.
*/
if (nfs_getattrcache(vp, ap->a_vap) == 0)
- return (0);
-
+ goto nfsmout;
if (v3 && nfsaccess_cache_timeout > 0) {
nfsstats.accesscache_misses++;
nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_td, ap->a_cred);
if (nfs_getattrcache(vp, ap->a_vap) == 0)
- return (0);
+ goto nfsmout;
}
-
nfsstats.rpccnt[NFSPROC_GETATTR]++;
mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
mb = mreq;
@@ -624,9 +681,9 @@
#endif
/*
- * Setting of flags is not supported.
+ * Setting of flags and marking of atimes are not supported.
*/
- if (vap->va_flags != VNOVAL)
+ if (vap->va_flags != VNOVAL || (vap->va_vaflags & VA_MARK_ATIME))
return (EOPNOTSUPP);
/*
@@ -635,8 +692,10 @@
if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
- (vp->v_mount->mnt_flag & MNT_RDONLY))
- return (EROFS);
+ (vp->v_mount->mnt_flag & MNT_RDONLY)) {
+ error = EROFS;
+ goto out;
+ }
if (vap->va_size != VNOVAL) {
switch (vp->v_type) {
case VDIR:
@@ -650,7 +709,7 @@
vap->va_mode == (mode_t)VNOVAL &&
vap->va_uid == (uid_t)VNOVAL &&
vap->va_gid == (gid_t)VNOVAL)
- return (0);
+ return (0);
vap->va_size = VNOVAL;
break;
default:
@@ -660,47 +719,60 @@
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
-
/*
* We run vnode_pager_setsize() early (why?),
* we must set np->n_size now to avoid vinvalbuf
* V_SAVE races that might setsize a lower
* value.
*/
-
+ mtx_lock(&np->n_mtx);
tsize = np->n_size;
+ mtx_unlock(&np->n_mtx);
error = nfs_meta_setsize(vp, ap->a_cred,
- ap->a_td, vap->va_size);
-
+ ap->a_td, vap->va_size);
+ mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
+ tsize = np->n_size;
+ mtx_unlock(&np->n_mtx);
if (vap->va_size == 0)
error = nfs_vinvalbuf(vp, 0, ap->a_td, 1);
else
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
if (error) {
- vnode_pager_setsize(vp, np->n_size);
- return (error);
+ vnode_pager_setsize(vp, tsize);
+ goto out;
}
- }
+ } else
+ mtx_unlock(&np->n_mtx);
/*
* np->n_size has already been set to vap->va_size
* in nfs_meta_setsize(). We must set it again since
* nfs_loadattrcache() could be called through
* nfs_meta_setsize() and could modify np->n_size.
*/
+ mtx_lock(&np->n_mtx);
np->n_vattr.va_size = np->n_size = vap->va_size;
+ mtx_unlock(&np->n_mtx);
};
- } else if ((vap->va_mtime.tv_sec != VNOVAL ||
- vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
- vp->v_type == VREG &&
- (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 &&
- (error == EINTR || error == EIO))
- return (error);
+ } else {
+ mtx_lock(&np->n_mtx);
+ if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) &&
+ (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
+ mtx_unlock(&np->n_mtx);
+ if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 &&
+ (error == EINTR || error == EIO))
+ return error;
+ } else
+ mtx_unlock(&np->n_mtx);
+ }
error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_td);
if (error && vap->va_size != VNOVAL) {
+ mtx_lock(&np->n_mtx);
np->n_size = np->n_vattr.va_size = tsize;
- vnode_pager_setsize(vp, np->n_size);
+ vnode_pager_setsize(vp, tsize);
+ mtx_unlock(&np->n_mtx);
}
+out:
return (error);
}
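
Note how tsize is re-sampled under n_mtx after nfs_meta_setsize(): once the mutex is dropped np->n_size can move, so the error path must roll the pager back to whatever was current, not to a stale first sample. The save-and-roll-back shape, as a toy userland sketch (made-up names, no kernel calls):

    #include <stdio.h>

    static unsigned long n_size = 8192;     /* plays np->n_size */

    static int
    set_size(unsigned long newsize, int rpc_fails)
    {
            unsigned long tsize = n_size;   /* remember before changing */

            n_size = newsize;               /* optimistic update */
            if (rpc_fails) {
                    n_size = tsize;         /* roll back on error */
                    return (-1);
            }
            return (0);
    }

    int
    main(void)
    {
            if (set_size(0, 1) != 0)
                    printf("setattr failed, size restored to %lu\n", n_size);
            return (0);
    }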
@@ -779,7 +851,7 @@
int error = 0, attrflag, fhsize;
int v3 = NFS_ISV3(dvp);
struct thread *td = cnp->cn_thread;
-
+
*vpp = NULLVP;
if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
@@ -840,7 +912,7 @@
m_freem(mrep);
return (EISDIR);
}
- error = nfs_nget(dvp->v_mount, fhp, fhsize, &np);
+ error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
if (error) {
m_freem(mrep);
return (error);
@@ -859,7 +931,7 @@
if (flags & ISDOTDOT) {
VOP_UNLOCK(dvp, 0, td);
- error = nfs_nget(dvp->v_mount, fhp, fhsize, &np);
+ error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
if (error)
return (error);
@@ -868,7 +940,7 @@
VREF(dvp);
newvp = dvp;
} else {
- error = nfs_nget(dvp->v_mount, fhp, fhsize, &np);
+ error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
if (error) {
m_freem(mrep);
return (error);
@@ -964,8 +1036,10 @@
nfsm_strsiz(len, NFS_MAXPATHLEN);
if (len == NFS_MAXPATHLEN) {
struct nfsnode *np = VTONFS(vp);
+ mtx_lock(&np->n_mtx);
if (np->n_size && np->n_size < NFS_MAXPATHLEN)
len = np->n_size;
+ mtx_unlock(&np->n_mtx);
}
nfsm_mtouio(uiop, len);
}
@@ -987,17 +1061,23 @@
struct nfsmount *nmp;
int error = 0, len, retlen, tsiz, eof, attrflag;
int v3 = NFS_ISV3(vp);
+ int rsize;
#ifndef nolint
eof = 0;
#endif
nmp = VFSTONFS(vp->v_mount);
tsiz = uiop->uio_resid;
- if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
+ mtx_lock(&nmp->nm_mtx);
+ if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
+ mtx_unlock(&nmp->nm_mtx);
return (EFBIG);
+ }
+ rsize = nmp->nm_rsize;
+ mtx_unlock(&nmp->nm_mtx);
while (tsiz > 0) {
nfsstats.rpccnt[NFSPROC_READ]++;
- len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz;
+ len = (tsiz > rsize) ? rsize : tsiz;
mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
mb = mreq;
bpos = mtod(mb, caddr_t);
@@ -1020,9 +1100,10 @@
}
tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
eof = fxdr_unsigned(int, *(tl + 1));
- } else
+ } else {
nfsm_loadattr(vp, NULL);
- nfsm_strsiz(retlen, nmp->nm_rsize);
+ }
+ nfsm_strsiz(retlen, rsize);
nfsm_mtouio(uiop, retlen);
m_freem(mrep);
tsiz -= retlen;
@@ -1043,7 +1124,7 @@
*/
int
nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
- int *iomode, int *must_commit)
+ int *iomode, int *must_commit)
{
u_int32_t *tl;
int32_t backup;
@@ -1052,18 +1133,24 @@
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
-
+ int wsize;
+
#ifndef DIAGNOSTIC
if (uiop->uio_iovcnt != 1)
panic("nfs: writerpc iovcnt > 1");
#endif
*must_commit = 0;
tsiz = uiop->uio_resid;
- if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
+ mtx_lock(&nmp->nm_mtx);
+ if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
+ mtx_unlock(&nmp->nm_mtx);
return (EFBIG);
+ }
+ wsize = nmp->nm_wsize;
+ mtx_unlock(&nmp->nm_mtx);
while (tsiz > 0) {
nfsstats.rpccnt[NFSPROC_WRITE]++;
- len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
+ len = (tsiz > wsize) ? wsize : tsiz;
mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
mb = mreq;
@@ -1122,6 +1209,7 @@
else if (committed == NFSV3WRITE_DATASYNC &&
commit == NFSV3WRITE_UNSTABLE)
committed = commit;
+ mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
NFSX_V3WRITEVERF);
@@ -1132,18 +1220,23 @@
bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
NFSX_V3WRITEVERF);
}
+ mtx_unlock(&nmp->nm_mtx);
}
- } else
- nfsm_loadattr(vp, NULL);
- if (wccflag)
- VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
+ } else {
+ nfsm_loadattr(vp, NULL);
+ }
+ if (wccflag) {
+ mtx_lock(&(VTONFS(vp))->n_mtx);
+ VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
+ mtx_unlock(&(VTONFS(vp))->n_mtx);
+ }
m_freem(mrep);
if (error)
break;
tsiz -= len;
}
nfsmout:
- if (vp->v_mount->mnt_flag & MNT_ASYNC)
+ if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
committed = NFSV3WRITE_FILESYNC;
*iomode = committed;
if (error)
@@ -1232,9 +1325,11 @@
cache_enter(dvp, newvp, cnp);
*vpp = newvp;
}
+ mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
+ mtx_unlock(&(VTONFS(dvp))->n_mtx);
return (error);
}
@@ -1246,7 +1341,6 @@
static int
nfs_mknod(struct vop_mknod_args *ap)
{
-
return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
}
@@ -1353,15 +1447,19 @@
if (vap->va_atime.tv_sec == VNOVAL)
vap->va_atime = vap->va_mtime;
error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_thread);
+ if (error)
+ vput(newvp);
}
if (!error) {
if (cnp->cn_flags & MAKEENTRY)
cache_enter(dvp, newvp, cnp);
*ap->a_vpp = newvp;
}
+ mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
+ mtx_unlock(&(VTONFS(dvp))->n_mtx);
return (error);
}
@@ -1434,7 +1532,6 @@
int
nfs_removeit(struct sillyrename *sp)
{
-
/*
* Make sure that the directory vnode is still valid.
* XXX we should lock sp->s_dvp here.
@@ -1469,9 +1566,11 @@
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
+ mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
+ mtx_unlock(&(VTONFS(dvp))->n_mtx);
return (error);
}
@@ -1502,7 +1601,7 @@
}
if (fvp == tvp) {
- printf("nfs_rename: fvp == tvp (can't happen)\n");
+ nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
error = 0;
goto out;
}
@@ -1609,8 +1708,12 @@
}
m_freem(mrep);
nfsmout:
+ mtx_lock(&(VTONFS(fdvp))->n_mtx);
VTONFS(fdvp)->n_flag |= NMODIFIED;
+ mtx_unlock(&(VTONFS(fdvp))->n_mtx);
+ mtx_lock(&(VTONFS(tdvp))->n_mtx);
VTONFS(tdvp)->n_flag |= NMODIFIED;
+ mtx_unlock(&(VTONFS(tdvp))->n_mtx);
if (!fwccflag)
VTONFS(fdvp)->n_attrstamp = 0;
if (!twccflag)
@@ -1659,16 +1762,13 @@
}
m_freem(mrep);
nfsmout:
+ mtx_lock(&(VTONFS(tdvp))->n_mtx);
VTONFS(tdvp)->n_flag |= NMODIFIED;
+ mtx_unlock(&(VTONFS(tdvp))->n_mtx);
if (!attrflag)
VTONFS(vp)->n_attrstamp = 0;
if (!wccflag)
VTONFS(tdvp)->n_attrstamp = 0;
- /*
- * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
- */
- if (error == EEXIST)
- error = 0;
return (error);
}
@@ -1732,17 +1832,9 @@
nfsmout:
/*
- * If we get an EEXIST error, silently convert it to no-error
- * in case of an NFS retry.
- */
- if (error == EEXIST)
- error = 0;
-
- /*
- * If we do not have (or no longer have) an error, and we could
- * not extract the newvp from the response due to the request being
- * NFSv2 or the error being EEXIST. We have to do a lookup in order
- * to obtain a newvp to return.
+ * If we do not have an error and we could not extract the newvp from
+ * the response due to the request being NFSv2, we have to do a
+ * lookup in order to obtain a newvp to return.
*/
if (error == 0 && newvp == NULL) {
struct nfsnode *np = NULL;
@@ -1758,7 +1850,9 @@
} else {
*ap->a_vpp = newvp;
}
+ mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
+ mtx_unlock(&(VTONFS(dvp))->n_mtx);
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
return (error);
@@ -1813,18 +1907,12 @@
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
+ mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
+ mtx_unlock(&(VTONFS(dvp))->n_mtx);
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
- /*
- * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
- * if we can succeed in looking up the directory.
- */
- if (error == EEXIST || (!error && !gotvp)) {
- if (newvp) {
- vput(newvp);
- newvp = NULL;
- }
+ if (error == 0 && newvp == NULL) {
error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
cnp->cn_thread, &np);
if (!error) {
@@ -1869,7 +1957,9 @@
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
+ mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
+ mtx_unlock(&(VTONFS(dvp))->n_mtx);
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
cache_purge(dvp);
@@ -1891,20 +1981,25 @@
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
struct uio *uio = ap->a_uio;
- int tresid, error;
+ int tresid, error = 0;
struct vattr vattr;
+
+ if (vp->v_type != VDIR)
+ return(EPERM);
- if (vp->v_type != VDIR)
- return (EPERM);
/*
* First, check for hit on the EOF offset cache
*/
if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
(np->n_flag & NMODIFIED) == 0) {
- if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0 &&
- !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
- nfsstats.direofcache_hits++;
- return (0);
+ if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0) {
+ mtx_lock(&np->n_mtx);
+ if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
+ mtx_unlock(&np->n_mtx);
+ nfsstats.direofcache_hits++;
+ goto out;
+ } else
+ mtx_unlock(&np->n_mtx);
}
}
@@ -1914,8 +2009,10 @@
tresid = uio->uio_resid;
error = nfs_bioread(vp, uio, 0, ap->a_cred);
- if (!error && uio->uio_resid == tresid)
+ if (!error && uio->uio_resid == tresid) {
nfsstats.direofcache_misses++;
+ }
+out:
return (error);
}
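
The EOF-offset cache hit now re-checks the cached mtime while holding n_mtx before trusting it. NFS_TIMESPEC_COMPARE() is a plain two-field comparison; in userland terms the test is roughly:

    #include <stdio.h>
    #include <time.h>

    /* Nonzero if the timestamps differ (NFS_TIMESPEC_COMPARE analogue). */
    static int
    ts_differ(const struct timespec *a, const struct timespec *b)
    {
            return (a->tv_sec != b->tv_sec || a->tv_nsec != b->tv_nsec);
    }

    int
    main(void)
    {
            struct timespec cached = { 100, 0 }, fresh = { 100, 0 };

            if (!ts_differ(&cached, &fresh))
                    printf("EOF cache hit, directory unchanged\n");
            return (0);
    }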
@@ -1950,11 +2047,16 @@
/*
* If there is no cookie, assume directory was stale.
*/
+ nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
- if (cookiep)
+ if (cookiep) {
cookie = *cookiep;
- else
+ nfs_dircookie_unlock(dnp);
+ } else {
+ nfs_dircookie_unlock(dnp);
return (NFSERR_BAD_COOKIE);
+ }
+
/*
* Loop around doing readdir rpc's of size nm_readdirsize
* truncated to a multiple of DIRBLKSIZ.
@@ -1971,8 +2073,10 @@
tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
*tl++ = cookie.nfsuquad[0];
*tl++ = cookie.nfsuquad[1];
+ mtx_lock(&dnp->n_mtx);
*tl++ = dnp->n_cookieverf.nfsuquad[0];
*tl++ = dnp->n_cookieverf.nfsuquad[1];
+ mtx_unlock(&dnp->n_mtx);
} else {
tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = cookie.nfsuquad[0];
@@ -1984,8 +2088,10 @@
if (!error) {
tl = nfsm_dissect(u_int32_t *,
2 * NFSX_UNSIGNED);
+ mtx_lock(&dnp->n_mtx);
dnp->n_cookieverf.nfsuquad[0] = *tl++;
dnp->n_cookieverf.nfsuquad[1] = *tl;
+ mtx_unlock(&dnp->n_mtx);
} else {
m_freem(mrep);
goto nfsmout;
@@ -2100,9 +2206,11 @@
dnp->n_direofoffset = uiop->uio_offset;
else {
if (uiop->uio_resid > 0)
- printf("EEK! readdirrpc resid > 0\n");
+ nfs_printf("EEK! readdirrpc resid > 0\n");
+ nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
*cookiep = cookie;
+ nfs_dircookie_unlock(dnp);
}
nfsmout:
return (error);
@@ -2146,11 +2254,15 @@
/*
* If there is no cookie, assume directory was stale.
*/
+ nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
- if (cookiep)
+ if (cookiep) {
cookie = *cookiep;
- else
+ nfs_dircookie_unlock(dnp);
+ } else {
+ nfs_dircookie_unlock(dnp);
return (NFSERR_BAD_COOKIE);
+ }
/*
* Loop around doing readdir rpc's of size nm_readdirsize
* truncated to a multiple of DIRBLKSIZ.
@@ -2166,8 +2278,10 @@
tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
*tl++ = cookie.nfsuquad[0];
*tl++ = cookie.nfsuquad[1];
+ mtx_lock(&dnp->n_mtx);
*tl++ = dnp->n_cookieverf.nfsuquad[0];
*tl++ = dnp->n_cookieverf.nfsuquad[1];
+ mtx_unlock(&dnp->n_mtx);
*tl++ = txdr_unsigned(nmp->nm_readdirsize);
*tl = txdr_unsigned(nmp->nm_rsize);
nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
@@ -2177,8 +2291,10 @@
goto nfsmout;
}
tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
+ mtx_lock(&dnp->n_mtx);
dnp->n_cookieverf.nfsuquad[0] = *tl++;
dnp->n_cookieverf.nfsuquad[1] = *tl++;
+ mtx_unlock(&dnp->n_mtx);
more_dirs = fxdr_unsigned(int, *tl);
/* loop thru the dir entries, doctoring them to 4bsd form */
@@ -2288,7 +2404,7 @@
np = dnp;
} else {
error = nfs_nget(vp->v_mount, fhp,
- fhsize, &np);
+ fhsize, &np, LK_EXCLUSIVE);
if (error)
doit = 0;
else
@@ -2306,6 +2422,8 @@
dp->d_type =
IFTODT(VTTOIF(np->n_vattr.va_type));
ndp->ni_vp = newvp;
+ /* Update n_ctime, so subsequent lookup doesn't purge entry */
+ np->n_ctime = np->n_vattr.va_ctime.tv_sec;
cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
}
} else {
@@ -2313,9 +2431,9 @@
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i) {
- tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
- fhsize = fxdr_unsigned(int, *tl);
- nfsm_adv(nfsm_rndup(fhsize));
+ tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
+ fhsize = fxdr_unsigned(int, *tl);
+ nfsm_adv(nfsm_rndup(fhsize));
}
}
if (newvp != NULLVP) {
@@ -2359,9 +2477,11 @@
dnp->n_direofoffset = uiop->uio_offset;
else {
if (uiop->uio_resid > 0)
- printf("EEK! readdirplusrpc resid > 0\n");
+ nfs_printf("EEK! readdirplusrpc resid > 0\n");
+ nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
*cookiep = cookie;
+ nfs_dircookie_unlock(dnp);
}
nfsmout:
if (newvp != NULLVP) {
@@ -2480,7 +2600,7 @@
VREF(dvp);
newvp = dvp;
} else {
- error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np);
+ error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
if (error) {
m_freem(mrep);
return (error);
@@ -2521,7 +2641,7 @@
*/
int
nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
- struct thread *td)
+ struct thread *td)
{
u_int32_t *tl;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
@@ -2529,8 +2649,12 @@
int error = 0, wccflag = NFSV3_WCCRATTR;
struct mbuf *mreq, *mrep, *md, *mb;
- if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0)
+ mtx_lock(&nmp->nm_mtx);
+ if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
+ mtx_unlock(&nmp->nm_mtx);
return (0);
+ }
+ mtx_unlock(&nmp->nm_mtx);
nfsstats.rpccnt[NFSPROC_COMMIT]++;
mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
mb = mreq;
@@ -2567,16 +2691,10 @@
{
struct buf *bp = ap->a_bp;
struct ucred *cr;
- struct thread *td;
KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp));
- if (bp->b_flags & B_ASYNC)
- td = NULL;
- else
- td = curthread; /* XXX */
-
if (bp->b_iocmd == BIO_READ)
cr = bp->b_rcred;
else
@@ -2588,8 +2706,8 @@
* otherwise just do it ourselves.
*/
if ((bp->b_flags & B_ASYNC) == 0 ||
- nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, td))
- (void)nfs_doio(ap->a_vp, bp, cr, td);
+ nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
+ (void)nfs_doio(ap->a_vp, bp, cr, curthread);
return (0);
}
@@ -2600,7 +2718,6 @@
static int
nfs_fsync(struct vop_fsync_args *ap)
{
-
return (nfs_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1));
}
@@ -2821,8 +2938,10 @@
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
splx(s);
- if (error == 0)
- panic("nfs_fsync: inconsistent lock");
+ if (error == 0) {
+ BUF_UNLOCK(bp);
+ goto loop;
+ }
if (error == ENOLCK)
goto loop;
if (nfs_sigintr(nmp, NULL, td)) {
@@ -2880,14 +2999,36 @@
VI_UNLOCK(vp);
goto loop;
}
- }
- VI_UNLOCK(vp);
+ /*
+ * Wait for all the async IO requests to drain
+ */
+ VI_UNLOCK(vp);
+ mtx_lock(&np->n_mtx);
+ while (np->n_directio_asyncwr > 0) {
+ np->n_flag |= NFSYNCWAIT;
+ error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
+ &np->n_mtx, slpflag | (PRIBIO + 1),
+ "nfsfsync", 0);
+ if (error) {
+ if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
+ mtx_unlock(&np->n_mtx);
+ error = EINTR;
+ goto done;
+ }
+ }
+ }
+ mtx_unlock(&np->n_mtx);
+ } else
+ VI_UNLOCK(vp);
+ mtx_lock(&np->n_mtx);
if (np->n_flag & NWRITEERR) {
error = np->n_error;
np->n_flag &= ~NWRITEERR;
}
- if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0)
+ if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
+ vp->v_bufobj.bo_numoutput == 0 && np->n_directio_asyncwr == 0)
np->n_flag &= ~NMODIFIED;
+ mtx_unlock(&np->n_mtx);
done:
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
@@ -2900,13 +3041,19 @@
static int
nfs_advlock(struct vop_advlock_args *ap)
{
-
+ int error;
+
+ mtx_lock(&Giant);
if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
struct nfsnode *np = VTONFS(ap->a_vp);
- return (lf_advlock(ap, &(np->n_lockf), np->n_size));
+ error = lf_advlock(ap, &(np->n_lockf), np->n_size);
+ goto out;
}
- return (nfs_dolock(ap));
+ error = nfs_dolock(ap);
+out:
+ mtx_unlock(&Giant);
+ return (error);
}
/*
@@ -2918,7 +3065,7 @@
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
- printf("\tfileid %ld fsid 0x%x",
+ nfs_printf("\tfileid %ld fsid 0x%x",
np->n_vattr.va_fileid, np->n_vattr.va_fsid);
if (vp->v_type == VFIFO)
fifo_printinfo(vp);
@@ -2961,7 +3108,7 @@
bp->b_iocmd = BIO_WRITE;
bufobj_wref(bp->b_bufobj);
- curthread->td_proc->p_stats->p_ru.ru_oublock++;
+ curthread->td_ru.ru_oublock++;
splx(s);
/*
@@ -2982,7 +3129,6 @@
reassignbuf(bp);
splx(s);
}
-
brelse(bp);
return (rtval);
}
@@ -3023,9 +3169,11 @@
vap = &vattr;
error = VOP_GETATTR(vp, vap, cred, ap->a_td);
if (error)
- return (error);
- return (vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
- mode, cred, NULL));
+ goto out;
+ error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
+ mode, cred, NULL);
+out:
+ return error;
}
/*
@@ -3035,13 +3183,17 @@
nfsfifo_read(struct vop_read_args *ap)
{
struct nfsnode *np = VTONFS(ap->a_vp);
+ int error;
/*
* Set access flag.
*/
+ mtx_lock(&np->n_mtx);
np->n_flag |= NACC;
getnanotime(&np->n_atim);
- return (fifo_specops.vop_read(ap));
+ mtx_unlock(&np->n_mtx);
+ error = fifo_specops.vop_read(ap);
+ return error;
}
/*
@@ -3055,9 +3207,11 @@
/*
* Set update flag.
*/
+ mtx_lock(&np->n_mtx);
np->n_flag |= NUPD;
getnanotime(&np->n_mtim);
- return (fifo_specops.vop_write(ap));
+ mtx_unlock(&np->n_mtx);
+ return(fifo_specops.vop_write(ap));
}
/*
@@ -3073,6 +3227,7 @@
struct vattr vattr;
struct timespec ts;
+ mtx_lock(&np->n_mtx);
if (np->n_flag & (NACC | NUPD)) {
getnanotime(&ts);
if (np->n_flag & NACC)
@@ -3087,9 +3242,13 @@
vattr.va_atime = np->n_atim;
if (np->n_flag & NUPD)
vattr.va_mtime = np->n_mtim;
+ mtx_unlock(&np->n_mtx);
(void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_td);
+ goto out;
}
}
+ mtx_unlock(&np->n_mtx);
+out:
return (fifo_specops.vop_close(ap));
}
@@ -3110,4 +3269,5 @@
.bop_write = nfs_bwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
};
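
The SMP locking note near the top of this file fixes a single order: nmp->nm_mtx before rep->r_mtx, np->n_mtx before the VM object lock. Keeping one global order is what makes all the fine-grained mtx_lock()/mtx_unlock() additions in this file safe against deadlock. A two-lock model of the discipline in POSIX threads (illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    /* Model of the documented order: nm_mtx always before r_mtx. */
    static pthread_mutex_t nm_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t r_mtx = PTHREAD_MUTEX_INITIALIZER;

    static void
    update_request(void)
    {
            pthread_mutex_lock(&nm_mtx);    /* mount-wide state first */
            pthread_mutex_lock(&r_mtx);     /* then the per-request lock */
            printf("both locks held, in order\n");
            pthread_mutex_unlock(&r_mtx);
            pthread_mutex_unlock(&nm_mtx);
    }

    int
    main(void)
    {
            update_request();
            return (0);
    }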
Index: nfsm_subs.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsm_subs.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsm_subs.h -L sys/nfsclient/nfsm_subs.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsm_subs.h
+++ sys/nfsclient/nfsm_subs.h
@@ -30,7 +30,7 @@
* SUCH DAMAGE.
*
* @(#)nfsm_subs.h 8.2 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/nfsclient/nfsm_subs.h,v 1.36.2.1 2005/12/13 21:29:26 rees Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsm_subs.h,v 1.37 2005/11/21 18:39:18 rees Exp $
*/
#ifndef _NFSCLIENT_NFSM_SUBS_H_
Index: nfsdiskless.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsdiskless.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsdiskless.h -L sys/nfsclient/nfsdiskless.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsdiskless.h
+++ sys/nfsclient/nfsdiskless.h
@@ -30,7 +30,7 @@
* SUCH DAMAGE.
*
* @(#)nfsdiskless.h 8.2 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/nfsclient/nfsdiskless.h,v 1.17 2005/01/07 01:45:51 imp Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsdiskless.h,v 1.18 2006/12/06 02:15:25 sam Exp $
*/
#ifndef _NFSCLIENT_NFSDISKLESS_H_
@@ -108,6 +108,7 @@
extern int nfs_diskless_valid;
void bootpc_init(void);
void nfs_setup_diskless(void);
+void nfs_parse_options(const char *, struct nfs_args *);
#endif
#endif
Index: nfs_vfsops.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_vfsops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_vfsops.c -L sys/nfsclient/nfs_vfsops.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_vfsops.c
+++ sys/nfsclient/nfs_vfsops.c
@@ -33,7 +33,8 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vfsops.c,v 1.177.2.1 2006/01/14 01:18:02 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vfsops.c,v 1.193.2.1 2007/10/26 21:46:31 jhb Exp $");
+
#include "opt_bootp.h"
#include "opt_nfsroot.h"
@@ -43,6 +44,7 @@
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/buf.h>
+#include <sys/clock.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
@@ -75,17 +77,18 @@
#include <nfsclient/nfsm_subs.h>
#include <nfsclient/nfsdiskless.h>
-MALLOC_DEFINE(M_NFSREQ, "NFS req", "NFS request header");
-MALLOC_DEFINE(M_NFSBIGFH, "NFSV3 bigfh", "NFS version 3 file handle");
-MALLOC_DEFINE(M_NFSDIROFF, "NFSV3 diroff", "NFS directory offset data");
-MALLOC_DEFINE(M_NFSHASH, "NFS hash", "NFS hash tables");
-MALLOC_DEFINE(M_NFSDIRECTIO, "NFS DirectIO", "NFS Direct IO async write state");
+MALLOC_DEFINE(M_NFSREQ, "nfsclient_req", "NFS request header");
+MALLOC_DEFINE(M_NFSBIGFH, "nfsclient_bigfh", "NFS version 3 file handle");
+MALLOC_DEFINE(M_NFSDIROFF, "nfsclient_diroff", "NFS directory offset data");
+MALLOC_DEFINE(M_NFSHASH, "nfsclient_hash", "NFS hash tables");
+MALLOC_DEFINE(M_NFSDIRECTIO, "nfsclient_directio", "NFS Direct IO async write state");
uma_zone_t nfsmount_zone;
struct nfsstats nfsstats;
+
SYSCTL_NODE(_vfs, OID_AUTO, nfs, CTLFLAG_RW, 0, "NFS filesystem");
-SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RD,
+SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RW,
&nfsstats, nfsstats, "S,nfsstats");
static int nfs_ip_paranoia = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW,
@@ -102,8 +105,8 @@
SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY,
downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, "");
-static int nfs_iosize(struct nfsmount *nmp);
-static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp);
+static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp,
+ struct nfs_args *argp);
static int mountnfs(struct nfs_args *, struct mount *,
struct sockaddr *, char *, struct vnode **,
struct ucred *cred);
@@ -171,7 +174,7 @@
static void nfs_convert_oargs(struct nfs_args *args,
struct onfs_args *oargs);
-static int
+int
nfs_iosize(struct nfsmount *nmp)
{
int iosize;
@@ -182,9 +185,9 @@
* that it is at least one VM page to avoid wasting buffer
* space.
*/
- iosize = max(nmp->nm_rsize, nmp->nm_wsize);
- if (iosize < PAGE_SIZE) iosize = PAGE_SIZE;
- return iosize;
+ iosize = imax(nmp->nm_rsize, nmp->nm_wsize);
+ iosize = imax(iosize, PAGE_SIZE);
+ return (iosize);
}
static void
@@ -219,8 +222,13 @@
bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway,
sizeof(struct sockaddr_in));
nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args);
- nfsv3_diskless.root_fhsize = NFSX_V2FH;
- bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH);
+ if (nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) {
+ nfsv3_diskless.root_fhsize = NFSX_V3FH;
+ bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V3FH);
+ } else {
+ nfsv3_diskless.root_fhsize = NFSX_V2FH;
+ bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH);
+ }
bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr,
sizeof(struct sockaddr_in));
bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN);
@@ -248,12 +256,21 @@
#ifndef nolint
sfp = NULL;
#endif
- error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np);
+ error = vfs_busy(mp, LK_NOWAIT, NULL, td);
if (error)
return (error);
+ error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
+ if (error) {
+ vfs_unbusy(mp, td);
+ return (error);
+ }
vp = NFSTOV(np);
- if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
+ mtx_lock(&nmp->nm_mtx);
+ if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+ mtx_unlock(&nmp->nm_mtx);
(void)nfs_fsinfo(nmp, vp, td->td_ucred, td);
+ } else
+ mtx_unlock(&nmp->nm_mtx);
nfsstats.rpccnt[NFSPROC_FSSTAT]++;
mreq = nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3));
mb = mreq;
@@ -268,7 +285,9 @@
goto nfsmout;
}
sfp = nfsm_dissect(struct nfs_statfs *, NFSX_STATFS(v3));
+ mtx_lock(&nmp->nm_mtx);
sbp->f_iosize = nfs_iosize(nmp);
+ mtx_unlock(&nmp->nm_mtx);
if (v3) {
sbp->f_bsize = NFS_FABLKSIZE;
tquad = fxdr_hyper(&sfp->sf_tbytes);
@@ -292,6 +311,7 @@
m_freem(mrep);
nfsmout:
vput(vp);
+ vfs_unbusy(mp, td);
return (error);
}
@@ -308,7 +328,7 @@
int error = 0, retattr;
struct mbuf *mreq, *mrep, *md, *mb;
u_int64_t maxfsize;
-
+
nfsstats.rpccnt[NFSPROC_FSINFO]++;
mreq = nfsm_reqhead(vp, NFSPROC_FSINFO, NFSX_FH(1));
mb = mreq;
@@ -319,6 +339,7 @@
if (!error) {
fsp = nfsm_dissect(struct nfsv3_fsinfo *, NFSX_V3FSINFO);
pref = fxdr_unsigned(u_int32_t, fsp->fs_wtpref);
+ mtx_lock(&nmp->nm_mtx);
if (pref < nmp->nm_wsize && pref >= NFS_FABLKSIZE)
nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) &
~(NFS_FABLKSIZE - 1);
@@ -352,6 +373,7 @@
nmp->nm_maxfilesize = maxfsize;
nmp->nm_mountp->mnt_stat.f_iosize = nfs_iosize(nmp);
nmp->nm_state |= NFSSTA_GOTFSINFO;
+ mtx_unlock(&nmp->nm_mtx);
}
m_freem(mrep);
nfsmout:
@@ -380,11 +402,11 @@
struct nfsv3_diskless *nd = &nfsv3_diskless;
struct socket *so;
struct vnode *vp;
+ struct ifreq ir;
int error, i;
u_long l;
char buf[128];
-
- NET_ASSERT_GIANT();
+ char *cp;
#if defined(BOOTP_NFSROOT) && defined(BOOTP)
bootpc_init(); /* use bootp to get nfs_diskless filled in */
@@ -406,7 +428,7 @@
* Do enough of ifconfig(8) so that the critical net interface can
* talk to the server.
*/
- error = socreate(nd->myif.ifra_addr.sa_family, &so, SOCK_DGRAM, 0,
+ error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0,
td->td_ucred, td);
if (error)
panic("nfs_mountroot: socreate(%04x): %d",
@@ -431,6 +453,14 @@
error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td);
if (error)
panic("nfs_mountroot: SIOCAIFADDR: %d", error);
+ if ((cp = getenv("boot.netif.mtu")) != NULL) {
+ ir.ifr_mtu = strtol(cp, NULL, 10);
+ bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ);
+ freeenv(cp);
+ error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td);
+ if (error)
+ printf("nfs_mountroot: SIOCSIFMTU: %d", error);
+ }
soclose(so);
/*
@@ -495,8 +525,10 @@
struct sockaddr *nam;
int error;
+ MNT_ILOCK(mp);
mp->mnt_kern_flag = 0;
mp->mnt_flag = mountflag;
+ MNT_IUNLOCK(mp);
nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK);
if ((error = mountnfs(args, mp, nam, path, vpp,
td->td_ucred)) != 0) {
@@ -521,17 +553,27 @@
* flag is already clear, or this is a root mount and it was set
* intentionally at some previous point.
*/
- if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0)
+ if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) {
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
- else if (mp->mnt_flag & MNT_UPDATE)
+ MNT_IUNLOCK(mp);
+ } else if (mp->mnt_flag & MNT_UPDATE) {
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
+ MNT_IUNLOCK(mp);
+ }
/*
* Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes
- * no sense in that context.
+ * no sense in that context. Also, set up appropriate retransmit
+ * and soft timeout behavior.
*/
- if (argp->sotype == SOCK_STREAM)
+ if (argp->sotype == SOCK_STREAM) {
nmp->nm_flag &= ~NFSMNT_NOCONN;
+ nmp->nm_flag |= NFSMNT_DUMBTIMR;
+ nmp->nm_timeo = NFS_MAXTIMEO;
+ nmp->nm_retry = NFS_RETRANS_TCP;
+ }
/* Also clear RDIRPLUS if not NFSv3, it crashes some servers */
if ((argp->flags & NFSMNT_NFSV3) == 0)
@@ -658,8 +700,7 @@
if (nmp->nm_sotype == SOCK_DGRAM)
while (nfs_connect(nmp, NULL)) {
printf("nfs_args: retrying connect\n");
- (void) tsleep((caddr_t)&lbolt,
- PSOCK, "nfscon", 0);
+ (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
}
}
}
@@ -687,24 +728,31 @@
size_t len;
u_char nfh[NFSX_V3FHMAX];
- if (vfs_filteropt(mp->mnt_optnew, nfs_opts))
- return (EINVAL);
+ if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) {
+ error = EINVAL;
+ goto out;
+ }
- if (mp->mnt_flag & MNT_ROOTFS)
- return (nfs_mountroot(mp, td));
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ error = nfs_mountroot(mp, td);
+ goto out;
+ }
error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof args);
if (error)
- return (error);
+ goto out;
if (args.version != NFS_ARGSVERSION) {
- return (EPROGMISMATCH);
+ error = EPROGMISMATCH;
+ goto out;
}
if (mp->mnt_flag & MNT_UPDATE) {
struct nfsmount *nmp = VFSTONFS(mp);
- if (nmp == NULL)
- return (EIO);
+ if (nmp == NULL) {
+ error = EIO;
+ goto out;
+ }
/*
* When doing an update, we can't change from or to
* v3, switch lockd strategies or change cookie translation
@@ -714,7 +762,7 @@
(nmp->nm_flag &
(NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/));
nfs_decode_args(mp, nmp, &args);
- return (0);
+ goto out;
}
/*
@@ -728,21 +776,29 @@
*/
if (nfs_ip_paranoia == 0)
args.flags |= NFSMNT_NOCONN;
- if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX)
- return (EINVAL);
+ if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) {
+ error = EINVAL;
+ goto out;
+ }
error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize);
if (error)
- return (error);
+ goto out;
error = copyinstr(args.hostname, hst, MNAMELEN-1, &len);
if (error)
- return (error);
+ goto out;
bzero(&hst[len], MNAMELEN - len);
/* sockargs() call must be after above copyin() calls */
error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen);
if (error)
- return (error);
+ goto out;
args.fh = nfh;
error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred);
+out:
+ if (!error) {
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED);
+ MNT_IUNLOCK(mp);
+ }
return (error);
}
@@ -765,12 +821,11 @@
error = copyin(data, &args, sizeof (struct nfs_args));
if (error)
- return (error);
+ return error;
ma = mount_arg(ma, "nfs_args", &args, sizeof args);
error = kernel_mount(ma, flags);
-
return (error);
}
@@ -799,6 +854,7 @@
}
vfs_getnewfsid(mp);
nmp->nm_mountp = mp;
+ mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF);
/*
* V2 can only handle 32 bit filesizes. A 4GB-1 limit may be too
@@ -823,11 +879,6 @@
nmp->nm_wsize = NFS_WSIZE;
nmp->nm_rsize = NFS_RSIZE;
}
- if ((desiredvnodes / 1000) == 0) {
- printf("Increasing desiredvnodes from %i to 1000\n",
- desiredvnodes);
- desiredvnodes = 1000;
- }
nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
nmp->nm_readdirsize = NFS_READDIRSIZE;
nmp->nm_numgrps = NFS_MAXGRPS;
@@ -850,10 +901,6 @@
nfs_decode_args(mp, nmp, argp);
- if (nmp->nm_sotype == SOCK_STREAM)
- mtx_init(&nmp->nm_nfstcpstate.mtx, "NFS/TCP state lock",
- NULL, MTX_DEF);
-
/*
* For Connection based sockets (TCP,...) defer the connect until
* the first request, in case the server is not responding.
@@ -868,7 +915,9 @@
* stuck on a dead server and we are holding a lock on the mount
* point.
*/
+ mtx_lock(&nmp->nm_mtx);
mp->mnt_stat.f_iosize = nfs_iosize(nmp);
+ mtx_unlock(&nmp->nm_mtx);
/*
* A reference count is needed on the nfsnode representing the
* remote root. If this object is not persistent, then backward
@@ -877,7 +926,7 @@
* this problem, because one can identify root inodes by their
* number == ROOTINO (2).
*/
- error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np);
+ error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
if (error)
goto bad;
*vpp = NFSTOV(np);
@@ -899,9 +948,8 @@
return (0);
bad:
- if (nmp->nm_sotype == SOCK_STREAM)
- mtx_destroy(&nmp->nm_nfstcpstate.mtx);
nfs_disconnect(nmp);
+ mtx_destroy(&nmp->nm_mtx);
uma_zfree(nfsmount_zone, nmp);
FREE(nam, M_SONAME);
return (error);
@@ -929,12 +977,12 @@
if (flags & FORCECLOSE) {
error = nfs_nmcancelreqs(nmp);
if (error)
- return (error);
+ goto out;
}
/* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */
error = vflush(mp, 1, flags, td);
if (error)
- return (error);
+ goto out;
/*
* We are now committed to the unmount.
@@ -942,11 +990,10 @@
nfs_disconnect(nmp);
FREE(nmp->nm_nam, M_SONAME);
- if (nmp->nm_sotype == SOCK_STREAM)
- mtx_destroy(&nmp->nm_nfstcpstate.mtx);
-
+ mtx_destroy(&nmp->nm_mtx);
uma_zfree(nfsmount_zone, nmp);
- return (0);
+out:
+ return (error);
}
/*
@@ -961,17 +1008,20 @@
int error;
nmp = VFSTONFS(mp);
- error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np);
+ error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, flags);
if (error)
- return (error);
+ return error;
vp = NFSTOV(np);
/*
* Get transfer parameters and attributes for root vnode once.
*/
+ mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_state & NFSSTA_GOTFSINFO) == 0 &&
(nmp->nm_flag & NFSMNT_NFSV3)) {
+ mtx_unlock(&nmp->nm_mtx);
nfs_fsinfo(nmp, vp, curthread->td_ucred, curthread);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
if (vp->v_type == VNON)
vp->v_type = VDIR;
vp->v_vflag |= VV_ROOT;
@@ -1050,8 +1100,10 @@
break;
#endif
case VFS_CTL_QUERY:
+ mtx_lock(&nmp->nm_mtx);
if (nmp->nm_state & NFSSTA_TIMEO)
vq.vq_flags |= VQ_NOTRESP;
+ mtx_unlock(&nmp->nm_mtx);
#if 0
if (!(nmp->nm_flag & NFSMNT_NOLOCKS) &&
(nmp->nm_state & NFSSTA_LOCKTIMEO))
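
nfs_mount() and nfs_unmount() were rewritten in a single-exit style so the MNTK_MPSAFE|MNTK_LOOKUP_SHARED flagging (and any future cleanup) runs on every path out. The shape, reduced to a userland sketch:

    #include <errno.h>
    #include <stdio.h>

    static int
    do_mount(int bad_args)
    {
            int error = 0;

            if (bad_args) {
                    error = EINVAL;
                    goto out;
            }
            /* ... the real work would go here ... */
    out:
            /* common exit: flagging/cleanup happens exactly once */
            return (error);
    }

    int
    main(void)
    {
            printf("do_mount: %d\n", do_mount(1));
            return (0);
    }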
Index: nfs_subs.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_subs.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs_subs.c -L sys/nfsclient/nfs_subs.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs_subs.c
+++ sys/nfsclient/nfs_subs.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_subs.c,v 1.140.2.1 2005/12/13 21:29:26 rees Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_subs.c,v 1.146.2.1 2007/10/12 19:18:46 mohans Exp $");
/*
* These functions support the macros and help fiddle mbuf chains for
@@ -76,6 +76,12 @@
#include <netinet/in.h>
/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+/*
* Data items converted to xdr at startup, since they are constant
* This is kinda hokey, but may save a little time doing byte swaps
*/
@@ -95,8 +101,8 @@
struct nfs_reqq nfs_reqq;
struct mtx nfs_reqq_mtx;
-struct mtx nfs_reply_mtx;
struct nfs_bufq nfs_bufq;
+struct mtx nfs_xid_mtx;
/*
* and the reverse mapping from generic to Version 2 procedure numbers
@@ -182,6 +188,7 @@
*/
tl = nfsm_build(u_int32_t *, 8 * NFSX_UNSIGNED);
+ mtx_lock(&nfs_xid_mtx);
/* Get a pretty random xid to start with */
if (!nfs_xid)
nfs_xid = random();
@@ -193,6 +200,7 @@
*xidpp = tl;
*tl++ = txdr_unsigned(nfs_xid);
+ mtx_unlock(&nfs_xid_mtx);
*tl++ = rpc_call;
*tl++ = rpc_vers;
*tl++ = txdr_unsigned(NFS_PROG);
@@ -414,9 +422,10 @@
* Initialize reply list and start timer
*/
TAILQ_INIT(&nfs_reqq);
- callout_init(&nfs_callout, 0);
+ callout_init(&nfs_callout, CALLOUT_MPSAFE);
mtx_init(&nfs_reqq_mtx, "NFS reqq lock", NULL, MTX_DEF);
- mtx_init(&nfs_reply_mtx, "Synch NFS reply posting", NULL, MTX_DEF);
+ mtx_init(&nfs_iod_mtx, "NFS iod lock", NULL, MTX_DEF);
+ mtx_init(&nfs_xid_mtx, "NFS xid lock", NULL, MTX_DEF);
nfs_pbuf_freecnt = nswbuf / 2 + 1;
@@ -437,19 +446,80 @@
* Tell all nfsiod processes to exit. Clear nfs_iodmax, and wakeup
* any sleeping nfsiods so they check nfs_iodmax and exit.
*/
+ mtx_lock(&nfs_iod_mtx);
nfs_iodmax = 0;
for (i = 0; i < nfs_numasync; i++)
if (nfs_iodwant[i])
wakeup(&nfs_iodwant[i]);
/* The last nfsiod to exit will wake us up when nfs_numasync hits 0 */
while (nfs_numasync)
- tsleep(&nfs_numasync, PWAIT, "ioddie", 0);
-
+ msleep(&nfs_numasync, &nfs_iod_mtx, PWAIT, "ioddie", 0);
+ mtx_unlock(&nfs_iod_mtx);
nfs_nhuninit();
uma_zdestroy(nfsmount_zone);
return (0);
}
+void
+nfs_dircookie_lock(struct nfsnode *np)
+{
+ mtx_lock(&np->n_mtx);
+ while (np->n_flag & NDIRCOOKIELK)
+ (void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0);
+ np->n_flag |= NDIRCOOKIELK;
+ mtx_unlock(&np->n_mtx);
+}
+
+void
+nfs_dircookie_unlock(struct nfsnode *np)
+{
+ mtx_lock(&np->n_mtx);
+ np->n_flag &= ~NDIRCOOKIELK;
+ wakeup(&np->n_flag);
+ mtx_unlock(&np->n_mtx);
+}
+
+int
+nfs_upgrade_vnlock(struct vnode *vp, struct thread *td)
+{
+ int old_lock;
+
+ if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
+ if (old_lock == LK_SHARED) {
+ /* Upgrade to exclusive lock, this might block */
+ vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
+ } else {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ }
+ }
+ return old_lock;
+}
+
+void
+nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock)
+{
+ if (old_lock != LK_EXCLUSIVE) {
+ if (old_lock == LK_SHARED) {
+ /* Downgrade from exclusive lock, this might block */
+ vn_lock(vp, LK_DOWNGRADE, td);
+ } else {
+ VOP_UNLOCK(vp, 0, td);
+ }
+ }
+}
+
+void
+nfs_printf(const char *fmt, ...)
+{
+ va_list ap;
+
+ mtx_lock(&Giant);
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ mtx_unlock(&Giant);
+}
+
/*
* Attribute cache routines.
* nfs_loadattrcache() - loads or updates the cache contents from attributes
@@ -466,7 +536,7 @@
*/
int
nfs_loadattrcache(struct vnode **vpp, struct mbuf **mdp, caddr_t *dposp,
- struct vattr *vaper, int dontshrink)
+ struct vattr *vaper, int dontshrink)
{
struct vnode *vp = *vpp;
struct vattr *vap;
@@ -478,8 +548,9 @@
struct mbuf *md;
enum vtype vtyp;
u_short vmode;
- struct timespec mtime;
+ struct timespec mtime, mtime_save;
int v3 = NFS_ISV3(vp);
+ struct thread *td = curthread;
md = *mdp;
t1 = (mtod(md, caddr_t) + md->m_len) - *dposp;
@@ -535,6 +606,7 @@
* information.
*/
np = VTONFS(vp);
+ mtx_lock(&np->n_mtx);
if (vp->v_type != vtyp) {
vp->v_type = vtyp;
if (vp->v_type == VFIFO)
@@ -545,6 +617,7 @@
vap->va_type = vtyp;
vap->va_mode = (vmode & 07777);
vap->va_rdev = rdev;
+ mtime_save = vap->va_mtime;
vap->va_mtime = mtime;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
if (v3) {
@@ -578,6 +651,14 @@
vap->va_filerev = 0;
}
np->n_attrstamp = time_second;
+ /* Timestamp the NFS otw getattr fetch */
+ if (td->td_proc) {
+ np->n_ac_ts_tid = td->td_tid;
+ np->n_ac_ts_pid = td->td_proc->p_pid;
+ np->n_ac_ts_syscalls = td->td_syscalls;
+ } else
+ bzero(&np->n_ac_ts, sizeof(struct nfs_attrcache_timestamp));
+
if (vap->va_size != np->n_size) {
if (vap->va_type == VREG) {
if (dontshrink && vap->va_size < np->n_size) {
@@ -608,6 +689,21 @@
np->n_size = vap->va_size;
}
}
+ /*
+ * The following checks are added to prevent a race between (say)
+ * a READDIR+ and a WRITE.
+ * READDIR+, WRITE requests sent out.
+ * READDIR+ resp, WRITE resp received on client.
+ * However, the WRITE resp was handled before the READDIR+ resp
+ * causing the post op attrs from the write to be loaded first
+ * and the attrs from the READDIR+ to be loaded later. If this
+ * happens, we have stale attrs loaded into the attrcache.
+ * We detect this by checking for the mtime moving back. We invalidate the
+ * attrcache when this happens.
+ */
+ if (timespeccmp(&mtime_save, &vap->va_mtime, >))
+ /* mtime went backwards */
+ np->n_attrstamp = 0;
if (vaper != NULL) {
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
if (np->n_flag & NCHG) {
@@ -617,6 +713,7 @@
vaper->va_mtime = np->n_mtim;
}
}
+ mtx_unlock(&np->n_mtx);
return (0);
}
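
The reordered-reply check above answers one question: does the mtime about to be loaded precede the one already cached? timespeccmp(&mtime_save, &vap->va_mtime, >) is exactly that test; modeled in plain C:

    #include <stdio.h>
    #include <time.h>

    /* timespeccmp(a, b, >) equivalent for the sketch. */
    static int
    ts_gt(const struct timespec *a, const struct timespec *b)
    {
            return (a->tv_sec == b->tv_sec ?
                a->tv_nsec > b->tv_nsec : a->tv_sec > b->tv_sec);
    }

    int
    main(void)
    {
            struct timespec cached = { 200, 0 };    /* mtime in the cache */
            struct timespec reply  = { 100, 0 };    /* older mtime, late reply */

            if (ts_gt(&cached, &reply))
                    printf("mtime went backwards: invalidate attrcache\n");
            return (0);
    }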
@@ -639,16 +736,20 @@
struct vattr *vap;
struct nfsmount *nmp;
int timeo;
-
+
np = VTONFS(vp);
vap = &np->n_vattr;
nmp = VFSTONFS(vp->v_mount);
+#ifdef NFS_ACDEBUG
+ mtx_lock(&Giant); /* nfs_printf() */
+#endif
+ mtx_lock(&np->n_mtx);
/* XXX n_mtime doesn't seem to be updated on a miss-and-reload */
timeo = (time_second - np->n_mtime.tv_sec) / 10;
#ifdef NFS_ACDEBUG
if (nfs_acdebug>1)
- printf("nfs_getattrcache: initial timeo = %d\n", timeo);
+ nfs_printf("nfs_getattrcache: initial timeo = %d\n", timeo);
#endif
if (vap->va_type == VDIR) {
@@ -665,18 +766,19 @@
#ifdef NFS_ACDEBUG
if (nfs_acdebug > 2)
- printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
- nmp->nm_acregmin, nmp->nm_acregmax,
- nmp->nm_acdirmin, nmp->nm_acdirmax);
+ nfs_printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
+ nmp->nm_acregmin, nmp->nm_acregmax,
+ nmp->nm_acdirmin, nmp->nm_acdirmax);
if (nfs_acdebug)
- printf("nfs_getattrcache: age = %d; final timeo = %d\n",
- (time_second - np->n_attrstamp), timeo);
+ nfs_printf("nfs_getattrcache: age = %d; final timeo = %d\n",
+ (time_second - np->n_attrstamp), timeo);
#endif
if ((time_second - np->n_attrstamp) >= timeo) {
nfsstats.attrcache_misses++;
- return (ENOENT);
+ mtx_unlock(&np->n_mtx);
+ return (ENOENT);
}
nfsstats.attrcache_hits++;
if (vap->va_size != np->n_size) {
@@ -701,6 +803,10 @@
if (np->n_flag & NUPD)
vaper->va_mtime = np->n_mtim;
}
+ mtx_unlock(&np->n_mtx);
+#ifdef NFS_ACDEBUG
+ mtx_unlock(&Giant); /* nfs_printf() */
+#endif
return (0);
}
@@ -714,7 +820,8 @@
{
struct nfsdmap *dp, *dp2;
int pos;
-
+ nfsuint64 *retval = NULL;
+
pos = (uoff_t)off / NFS_DIRBLKSIZ;
if (pos == 0 || off < 0) {
#ifdef DIAGNOSTIC
@@ -732,14 +839,14 @@
dp->ndm_eocookie = 0;
LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list);
} else
- return (NULL);
+ goto out;
}
while (pos >= NFSNUMCOOKIES) {
pos -= NFSNUMCOOKIES;
if (LIST_NEXT(dp, ndm_list)) {
if (!add && dp->ndm_eocookie < NFSNUMCOOKIES &&
- pos >= dp->ndm_eocookie)
- return (NULL);
+ pos >= dp->ndm_eocookie)
+ goto out;
dp = LIST_NEXT(dp, ndm_list);
} else if (add) {
MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap),
@@ -748,15 +855,17 @@
LIST_INSERT_AFTER(dp, dp2, ndm_list);
dp = dp2;
} else
- return (NULL);
+ goto out;
}
if (pos >= dp->ndm_eocookie) {
if (add)
dp->ndm_eocookie = pos + 1;
else
- return (NULL);
+ goto out;
}
- return (&dp->ndm_cookies[pos]);
+ retval = &dp->ndm_cookies[pos];
+out:
+ return (retval);
}
/*
@@ -773,11 +882,13 @@
if (vp->v_type != VDIR)
panic("nfs: invaldir not dir");
#endif
+ nfs_dircookie_lock(np);
np->n_direofoffset = 0;
np->n_cookieverf.nfsuquad[0] = 0;
np->n_cookieverf.nfsuquad[1] = 0;
if (LIST_FIRST(&np->n_cookies))
LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0;
+ nfs_dircookie_unlock(np);
}
/*
@@ -797,8 +908,6 @@
struct buf *bp, *nbp;
int s;
- GIANT_REQUIRED;
-
s = splbio();
MNT_ILOCK(mp);
MNT_VNODE_FOREACH(vp, mp, nvp) {
@@ -848,7 +957,7 @@
t1 = nfsm_getfh_xx(&ttfhp, &ttfhsize, (v3), md, dpos);
if (t1 != 0)
return t1;
- t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp);
+ t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp, LK_EXCLUSIVE);
if (t1 != 0)
return t1;
*v = NFSTOV(ttnp);
@@ -896,7 +1005,7 @@
int
nfsm_loadattr_xx(struct vnode **v, struct vattr *va, struct mbuf **md,
- caddr_t *dpos)
+ caddr_t *dpos)
{
int t1;
@@ -910,7 +1019,7 @@
int
nfsm_postop_attr_xx(struct vnode **v, int *f, struct mbuf **md,
- caddr_t *dpos)
+ caddr_t *dpos)
{
u_int32_t *tl;
int t1;
@@ -945,9 +1054,11 @@
tl = nfsm_dissect_xx(6 * NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
+ mtx_lock(&(VTONFS(*v))->n_mtx);
if (*f)
ttretf = (VTONFS(*v)->n_mtime.tv_sec == fxdr_unsigned(u_int32_t, *(tl + 2)) &&
VTONFS(*v)->n_mtime.tv_nsec == fxdr_unsigned(u_int32_t, *(tl + 3)));
+ mtx_unlock(&(VTONFS(*v))->n_mtx);
}
t1 = nfsm_postop_attr_xx(v, &ttattrf, md, dpos);
if (t1)
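
nfs_dircookie_lock() is a small sleepable lock built from a flag bit (NDIRCOOKIELK) plus msleep()/wakeup() under n_mtx; it serializes the directory cookie list without holding n_mtx across the readdir RPCs themselves. The same construction in userland, with a condition variable standing in for msleep()/wakeup():

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t n_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t n_cv = PTHREAD_COND_INITIALIZER;
    static int n_flag;                      /* bit 0 plays NDIRCOOKIELK */

    static void
    dircookie_lock(void)
    {
            pthread_mutex_lock(&n_mtx);
            while (n_flag & 1)
                    pthread_cond_wait(&n_cv, &n_mtx);   /* msleep() */
            n_flag |= 1;
            pthread_mutex_unlock(&n_mtx);
    }

    static void
    dircookie_unlock(void)
    {
            pthread_mutex_lock(&n_mtx);
            n_flag &= ~1;
            pthread_cond_broadcast(&n_cv);              /* wakeup() */
            pthread_mutex_unlock(&n_mtx);
    }

    int
    main(void)
    {
            dircookie_lock();
            printf("cookie list is stable here\n");
            dircookie_unlock();
            return (0);
    }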
Index: nfs_node.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_node.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/nfsclient/nfs_node.c -L sys/nfsclient/nfs_node.c -u -r1.2 -r1.3
--- sys/nfsclient/nfs_node.c
+++ sys/nfsclient/nfs_node.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_node.c,v 1.76.2.2 2006/03/12 21:50:02 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_node.c,v 1.86 2007/03/13 01:50:26 tegge Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -99,7 +99,7 @@
* nfsnode structure is returned.
*/
int
-nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp)
+nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp, int flags)
{
struct thread *td = curthread; /* XXX */
struct nfsnode *np;
@@ -107,27 +107,17 @@
struct vnode *nvp;
int error;
u_int hash;
- int rsflags;
struct nfsmount *nmp;
struct nfs_vncmp ncmp;
- /*
- * Calculate nfs mount point and figure out whether the rslock should
- * be interruptible or not.
- */
nmp = VFSTONFS(mntp);
- if (nmp->nm_flag & NFSMNT_INT)
- rsflags = PCATCH;
- else
- rsflags = 0;
-
*npp = NULL;
hash = fnv_32_buf(fhp->fh_bytes, fhsize, FNV1_32_INIT);
ncmp.fhsize = fhsize;
ncmp.fh = fhp;
- error = vfs_hash_get(mntp, hash, LK_EXCLUSIVE,
+ error = vfs_hash_get(mntp, hash, flags,
td, &nvp, nfs_vncmpf, &ncmp);
if (error)
return (error);
@@ -158,23 +148,44 @@
vp->v_bufobj.bo_ops = &buf_ops_nfs;
vp->v_data = np;
np->n_vnode = vp;
- error = vfs_hash_insert(vp, hash, LK_EXCLUSIVE,
+ /*
+ * Initialize the mutex even if the vnode is going to be a loser.
+ * This simplifies the logic in reclaim, which can then unconditionally
+ * destroy the mutex (in the case of the loser, or if hash_insert happened
+ * to return an error no special casing is needed).
+ */
+ mtx_init(&np->n_mtx, "NFSnode lock", NULL, MTX_DEF);
+ /*
+ * NFS supports recursive and shared locking.
+ */
+ vp->v_vnlock->lk_flags |= LK_CANRECURSE;
+ vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
+ if (fhsize > NFS_SMALLFH) {
+ MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
+ } else
+ np->n_fhp = &np->n_fh;
+ bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
+ np->n_fhsize = fhsize;
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
+ error = insmntque(vp, mntp);
+ if (error != 0) {
+ *npp = NULL;
+ if (np->n_fhsize > NFS_SMALLFH) {
+ FREE((caddr_t)np->n_fhp, M_NFSBIGFH);
+ }
+ mtx_destroy(&np->n_mtx);
+ uma_zfree(nfsnode_zone, np);
+ return (error);
+ }
+ error = vfs_hash_insert(vp, hash, flags,
td, &nvp, nfs_vncmpf, &ncmp);
if (error)
return (error);
if (nvp != NULL) {
*npp = VTONFS(nvp);
- /* vrele() the duplicate allocated here, to get it recycled */
- vrele(vp);
+ /* vfs_hash_insert() vput()'s the losing vnode */
return (0);
}
- if (fhsize > NFS_SMALLFH) {
- MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
- } else
- np->n_fhp = &np->n_fh;
- bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
- np->n_fhsize = fhsize;
- lockinit(&np->n_rslock, PVFS | rsflags, "nfrslk", 0, 0);
*npp = np;
return (0);
@@ -245,8 +256,7 @@
if (np->n_fhsize > NFS_SMALLFH) {
FREE((caddr_t)np->n_fhp, M_NFSBIGFH);
}
-
- lockdestroy(&np->n_rslock);
+ mtx_destroy(&np->n_mtx);
uma_zfree(nfsnode_zone, vp->v_data);
vp->v_data = NULL;
return (0);
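
Two things worth noting in the rewritten nfs_nget(): the node mutex is initialized before insmntque()/vfs_hash_insert() can fail, so the reclaim path above may destroy it unconditionally, and the vnode lock is switched to allow recursion and shared lockers. The call site in the nfsm_getfh_xx() hunk above passes LK_EXCLUSIVE; a hypothetical read-only caller could now ask for a shared lock instead (illustrative fragment, assuming mntp/fhp/fhsize come from the enclosing scope):

	struct nfsnode *np;
	struct vnode *vp;
	int error;

	error = nfs_nget(mntp, fhp, fhsize, &np, LK_SHARED);
	if (error == 0) {
		vp = NFSTOV(np);
		/* ... read-only use of the shared-locked vnode ... */
		vput(vp);	/* unlock and release */
	}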
Index: nfsnode.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfsnode.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfsnode.h -L sys/nfsclient/nfsnode.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfsnode.h
+++ sys/nfsclient/nfsnode.h
@@ -30,7 +30,7 @@
* SUCH DAMAGE.
*
* @(#)nfsnode.h 8.9 (Berkeley) 5/14/95
- * $FreeBSD: src/sys/nfsclient/nfsnode.h,v 1.55 2005/03/16 11:28:19 phk Exp $
+ * $FreeBSD: src/sys/nfsclient/nfsnode.h,v 1.60 2007/03/09 04:02:37 mohans Exp $
*/
#ifndef _NFSCLIENT_NFSNODE_H_
@@ -74,6 +74,16 @@
#define ndm_cookies ndm_un1.ndmu3_cookies
#define ndm4_cookies ndm_un1.ndmu4_cookies
+#define n_ac_ts_tid n_ac_ts.nfs_ac_ts_tid
+#define n_ac_ts_pid n_ac_ts.nfs_ac_ts_pid
+#define n_ac_ts_syscalls n_ac_ts.nfs_ac_ts_syscalls
+
+struct nfs_attrcache_timestamp {
+ lwpid_t nfs_ac_ts_tid;
+ pid_t nfs_ac_ts_pid;
+ unsigned long nfs_ac_ts_syscalls;
+};
+
/*
* The nfsnode is the nfs equivalent to ufs's inode. Any similarity
* is purely coincidental.
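
The new nfs_attrcache_timestamp records which thread last refreshed a node's attribute cache, apparently as a debugging aid. The code that fills it in is not part of this excerpt; a hypothetical updater (the helper name and the sequence counter are invented for illustration, only the field names come from the struct above) might look like:

static unsigned long nfs_attrcache_seq;		/* hypothetical counter */

static void
nfs_attrcache_stamp(struct nfsnode *np)
{
	struct thread *td = curthread;

	np->n_ac_ts_tid = td->td_tid;		/* lwpid_t, per the struct */
	np->n_ac_ts_pid = td->td_proc->p_pid;
	np->n_ac_ts_syscalls = nfs_attrcache_seq++;
}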
@@ -88,6 +98,7 @@
* be well aligned and, therefore, tightly packed.
*/
struct nfsnode {
+ struct mtx n_mtx; /* Protects all of these members */
u_quad_t n_size; /* Current size of file */
u_quad_t n_brev; /* Modify rev when cached */
u_quad_t n_lrev; /* Modify rev for lease */
@@ -120,13 +131,13 @@
short n_fhsize; /* size in bytes, of fh */
short n_flag; /* Flag for locking.. */
nfsfh_t n_fh; /* Small File Handle */
- struct lock n_rslock;
struct nfs4_fctx n_rfc;
struct nfs4_fctx n_wfc;
u_char *n_name; /* leaf name, for v4 OPEN op */
uint32_t n_namelen;
- daddr_t ra_expect_lbn;
int n_directio_opens;
+ int n_directio_asyncwr;
+ struct nfs_attrcache_timestamp n_ac_ts;
};
#define n_atim n_un1.nf_atim
@@ -140,6 +151,8 @@
/*
* Flags for n_flag
*/
+#define NFSYNCWAIT 0x0002 /* fsync waiting for all directio async writes
+   to drain */
#define NMODIFIED 0x0004 /* Might have a modified buffer in bio */
#define NWRITEERR 0x0008 /* Flag write errors so close will know */
/* 0x20, 0x40, 0x80 free */
@@ -150,6 +163,7 @@
#define NTRUNCATE 0x1000 /* Opened by nfs_setattr() */
#define NSIZECHANGED 0x2000 /* File size has changed: need cache inval */
#define NNONCACHE 0x4000 /* Node marked as noncacheable */
+#define NDIRCOOKIELK 0x8000 /* Lock to serialize access to directory cookies */
/*
* Convert between nfsnode pointers and vnode pointers
@@ -168,31 +182,6 @@
#if defined(_KERNEL)
-/*
- * nfs_rslock - Attempt to obtain lock on nfsnode
- *
- * Attempt to obtain a lock on the passed nfsnode, returning ENOLCK
- * if the lock could not be obtained due to our having to sleep. This
- * function is generally used to lock around code that modifies an
- * NFS file's size. In order to avoid deadlocks the lock
- * should not be obtained while other locks are being held.
- */
-
-static __inline int
-nfs_rslock(struct nfsnode *np, struct thread *td)
-{
-
- return(lockmgr(&np->n_rslock,
- LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, td));
-}
-
-static __inline void
-nfs_rsunlock(struct nfsnode *np, struct thread *td)
-{
-
- (void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, td);
-}
-
extern struct vop_vector nfs_fifoops;
extern struct vop_vector nfs_vnodeops;
extern struct vop_vector nfs4_vnodeops;
@@ -211,11 +200,17 @@
/* other stuff */
int nfs_removeit(struct sillyrename *);
int nfs4_removeit(struct sillyrename *);
-int nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **);
+int nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **, int flags);
nfsuint64 *nfs_getcookie(struct nfsnode *, off_t, int);
uint64_t *nfs4_getcookie(struct nfsnode *, off_t, int);
void nfs_invaldir(struct vnode *);
void nfs4_invaldir(struct vnode *);
+int nfs_upgrade_vnlock(struct vnode *vp, struct thread *td);
+void nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock);
+void nfs_printf(const char *fmt, ...);
+
+void nfs_dircookie_lock(struct nfsnode *np);
+void nfs_dircookie_unlock(struct nfsnode *np);
#endif /* _KERNEL */
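
Three of the prototypes added above have no bodies in this diff. nfs_upgrade_vnlock()/nfs_downgrade_vnlock() factor out the open-coded lock upgrade/downgrade that the nfs_vinvalbuf() hunk in nfs_bio.c (below) deletes, so they can be reconstructed almost verbatim from the removed lines (a sketch, not the committed bodies):

int
nfs_upgrade_vnlock(struct vnode *vp, struct thread *td)
{
	int old_lock;

	if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
		if (old_lock == LK_SHARED) {
			/* Upgrade to exclusive lock, this might block */
			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}
	}
	return (old_lock);
}

void
nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock)
{
	if (old_lock != LK_EXCLUSIVE) {
		if (old_lock == LK_SHARED) {
			/* Downgrade from exclusive lock, this might block */
			vn_lock(vp, LK_DOWNGRADE, td);
		} else {
			VOP_UNLOCK(vp, 0, td);
		}
	}
}

nfs_printf() replaces bare printf() throughout this merge; a plausible reason is that console output still wanted Giant while the client code no longer holds it. Assuming that is the motivation (va_list comes from machine/stdarg.h):

void
nfs_printf(const char *fmt, ...)
{
	va_list ap;

	mtx_lock(&Giant);
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
	mtx_unlock(&Giant);
}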
Index: nfs_bio.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_bio.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs_bio.c -L sys/nfsclient/nfs_bio.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs_bio.c
+++ sys/nfsclient/nfs_bio.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_bio.c,v 1.152 2005/06/16 15:43:17 green Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_bio.c,v 1.165 2007/09/25 21:08:48 mohans Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -71,6 +71,7 @@
extern int nfs_directio_enable;
extern int nfs_directio_allow_mmap;
+
/*
* Vnode op for VM getpages.
*/
@@ -90,8 +91,6 @@
vm_page_t *pages;
struct nfsnode *np;
- GIANT_REQUIRED;
-
vp = ap->a_vp;
np = VTONFS(vp);
td = curthread; /* XXX */
@@ -101,21 +100,28 @@
count = ap->a_count;
if ((object = vp->v_object) == NULL) {
- printf("nfs_getpages: called with non-merged cache vnode??\n");
+ nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
return VM_PAGER_ERROR;
}
- if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) &&
- (vp->v_type == VREG)) {
- printf("nfs_getpages: called on non-cacheable vnode??\n");
- return VM_PAGER_ERROR;
+ if (nfs_directio_enable && !nfs_directio_allow_mmap) {
+ mtx_lock(&np->n_mtx);
+ if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
+ mtx_unlock(&np->n_mtx);
+ nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
+ return VM_PAGER_ERROR;
+ } else
+ mtx_unlock(&np->n_mtx);
}
+ mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
- (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+ (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+ mtx_unlock(&nmp->nm_mtx);
/* We'll never get here for v4, because we always have fsinfo */
(void)nfs_fsinfo(nmp, vp, cred, td);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
npages = btoc(count);
@@ -153,8 +159,8 @@
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
- cnt.v_vnodein++;
- cnt.v_vnodepgsin += npages;
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_ADD(cnt.v_vnodepgsin, npages);
iov.iov_base = (caddr_t) kva;
iov.iov_len = count;
@@ -172,7 +178,7 @@
relpbuf(bp, &nfs_pbuf_freecnt);
if (error && (uio.uio_resid == count)) {
- printf("nfs_getpages: error %d\n", error);
+ nfs_printf("nfs_getpages: error %d\n", error);
VM_OBJECT_LOCK(object);
vm_page_lock_queues();
for (i = 0; i < npages; ++i) {
@@ -234,7 +240,7 @@
* now tell them that it is ok to use.
*/
if (!error) {
- if (m->flags & PG_WANTED)
+ if (m->oflags & VPO_WANTED)
vm_page_activate(m);
else
vm_page_deactivate(m);
@@ -269,8 +275,6 @@
struct nfsnode *np;
vm_page_t *pages;
- GIANT_REQUIRED;
-
vp = ap->a_vp;
np = VTONFS(vp);
td = curthread; /* XXX */
@@ -281,15 +285,22 @@
rtvals = ap->a_rtvals;
npages = btoc(count);
offset = IDX_TO_OFF(pages[0]->pindex);
-
+
+ mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+ mtx_unlock(&nmp->nm_mtx);
(void)nfs_fsinfo(nmp, vp, cred, td);
- }
+ } else
+ mtx_unlock(&nmp->nm_mtx);
- if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) &&
- (vp->v_type == VREG))
- printf("nfs_putpages: called on noncache-able vnode??\n");
+ mtx_lock(&np->n_mtx);
+ if (nfs_directio_enable && !nfs_directio_allow_mmap &&
+ (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
+ mtx_unlock(&np->n_mtx);
+ nfs_printf("nfs_putpages: called on noncache-able vnode??\n");
+ mtx_lock(&np->n_mtx);
+ }
for (i = 0; i < npages; i++)
rtvals[i] = VM_PAGER_AGAIN;
@@ -297,12 +308,12 @@
/*
* When putting pages, do not extend file past EOF.
*/
-
if (offset + count > np->n_size) {
count = np->n_size - offset;
if (count < 0)
count = 0;
}
+ mtx_unlock(&np->n_mtx);
/*
* We use only the kva address for the buffer, but this is extremely
@@ -312,8 +323,8 @@
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
- cnt.v_vnodeout++;
- cnt.v_vnodepgsout += count;
+ PCPU_INC(cnt.v_vnodeout);
+ PCPU_ADD(cnt.v_vnodepgsout, count);
iov.iov_base = (caddr_t) kva;
iov.iov_len = count;
@@ -349,6 +360,81 @@
}
/*
+ * For nfs, cache consistency can only be maintained approximately.
+ * Although RFC1094 does not specify the criteria, the following is
+ * believed to be compatible with the reference port.
+ * For nfs:
+ * If the file's modify time on the server has changed since the
+ * last read rpc or you have written to the file,
+ * you may have lost data cache consistency with the
+ * server, so flush all of the file's data out of the cache.
+ * Then force a getattr rpc to ensure that you have up to date
+ * attributes.
+ * NB: This implies that cache data can be read when up to
+ * NFS_ATTRTIMEO seconds out of date. If you find that you need current
+ * attributes this could be forced by setting n_attrstamp to 0 before
+ * the VOP_GETATTR() call.
+ */
+static inline int
+nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
+{
+ int error = 0;
+ struct vattr vattr;
+ struct nfsnode *np = VTONFS(vp);
+ int old_lock;
+ struct nfsmount *nmp = VFSTONFS(vp->v_mount);
+
+ /*
+ * Grab the exclusive lock before checking whether the cache is
+ * consistent.
+ * XXX - We can make this cheaper later (by acquiring cheaper locks).
+ * But for now, this suffices.
+ */
+ old_lock = nfs_upgrade_vnlock(vp, td);
+ mtx_lock(&np->n_mtx);
+ if (np->n_flag & NMODIFIED) {
+ mtx_unlock(&np->n_mtx);
+ if (vp->v_type != VREG) {
+ if (vp->v_type != VDIR)
+ panic("nfs: bioread, not dir");
+ (nmp->nm_rpcops->nr_invaldir)(vp);
+ error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
+ if (error)
+ goto out;
+ }
+ np->n_attrstamp = 0;
+ error = VOP_GETATTR(vp, &vattr, cred, td);
+ if (error)
+ goto out;
+ mtx_lock(&np->n_mtx);
+ np->n_mtime = vattr.va_mtime;
+ mtx_unlock(&np->n_mtx);
+ } else {
+ mtx_unlock(&np->n_mtx);
+ error = VOP_GETATTR(vp, &vattr, cred, td);
+ if (error)
+ return (error);
+ mtx_lock(&np->n_mtx);
+ if ((np->n_flag & NSIZECHANGED)
+ || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
+ mtx_unlock(&np->n_mtx);
+ if (vp->v_type == VDIR)
+ (nmp->nm_rpcops->nr_invaldir)(vp);
+ error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
+ if (error)
+ goto out;
+ mtx_lock(&np->n_mtx);
+ np->n_mtime = vattr.va_mtime;
+ np->n_flag &= ~NSIZECHANGED;
+ }
+ mtx_unlock(&np->n_mtx);
+ }
+out:
+ nfs_downgrade_vnlock(vp, td, old_lock);
+ return error;
+}
+
+/*
* Vnode op for read using bio
*/
int
@@ -357,7 +443,6 @@
struct nfsnode *np = VTONFS(vp);
int biosize, i;
struct buf *bp, *rabp;
- struct vattr vattr;
struct thread *td;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
@@ -375,9 +460,14 @@
return (EINVAL);
td = uio->uio_td;
+ mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
- (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
+ (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+ mtx_unlock(&nmp->nm_mtx);
(void)nfs_fsinfo(nmp, vp, cred, td);
+ } else
+ mtx_unlock(&nmp->nm_mtx);
+
if (vp->v_type != VDIR &&
(uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
return (EFBIG);
@@ -388,52 +478,18 @@
biosize = vp->v_mount->mnt_stat.f_iosize;
seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
- /*
- * For nfs, cache consistency can only be maintained approximately.
- * Although RFC1094 does not specify the criteria, the following is
- * believed to be compatible with the reference port.
- * For nfs:
- * If the file's modify time on the server has changed since the
- * last read rpc or you have written to the file,
- * you may have lost data cache consistency with the
- * server, so flush all of the file's data out of the cache.
- * Then force a getattr rpc to ensure that you have up to date
- * attributes.
- * NB: This implies that cache data can be read when up to
- * NFS_ATTRTIMEO seconds out of date. If you find that you need current
- * attributes this could be forced by setting n_attrstamp to 0 before
- * the VOP_GETATTR() call.
- */
- if (np->n_flag & NMODIFIED) {
- if (vp->v_type != VREG) {
- if (vp->v_type != VDIR)
- panic("nfs: bioread, not dir");
- (nmp->nm_rpcops->nr_invaldir)(vp);
- error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
- if (error)
- return (error);
- }
- np->n_attrstamp = 0;
- error = VOP_GETATTR(vp, &vattr, cred, td);
- if (error)
- return (error);
- np->n_mtime = vattr.va_mtime;
- } else {
- error = VOP_GETATTR(vp, &vattr, cred, td);
- if (error)
- return (error);
- if ((np->n_flag & NSIZECHANGED)
- || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
- if (vp->v_type == VDIR)
- (nmp->nm_rpcops->nr_invaldir)(vp);
- error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
- if (error)
- return (error);
- np->n_mtime = vattr.va_mtime;
- np->n_flag &= ~NSIZECHANGED;
- }
- }
+
+ error = nfs_bioread_check_cons(vp, td, cred);
+ if (error)
+ return error;
+
do {
+ u_quad_t nsize;
+
+ mtx_lock(&np->n_mtx);
+ nsize = np->n_size;
+ mtx_unlock(&np->n_mtx);
+
switch (vp->v_type) {
case VREG:
nfsstats.biocache_reads++;
@@ -442,12 +498,10 @@
/*
* Start the read ahead(s), as required.
- * The readahead is kicked off only if sequential access
- * is detected, based on the readahead hint (ra_expect_lbn).
*/
- if (nmp->nm_readahead > 0 && np->ra_expect_lbn == lbn) {
+ if (nmp->nm_readahead > 0) {
for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
- (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
+ (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
rabn = lbn + 1 + nra;
if (incore(&vp->v_bufobj, rabn) == NULL) {
rabp = nfs_getcacheblk(vp, rabn, biosize, td);
@@ -471,48 +525,17 @@
}
}
}
- np->ra_expect_lbn = lbn + 1;
}
- /*
- * Obtain the buffer cache block. Figure out the buffer size
- * when we are at EOF. If we are modifying the size of the
- * buffer based on an EOF condition we need to hold
- * nfs_rslock() through obtaining the buffer to prevent
- * a potential writer-appender from messing with n_size.
- * Otherwise we may accidently truncate the buffer and
- * lose dirty data.
- *
- * Note that bcount is *not* DEV_BSIZE aligned.
- */
-
-again:
+ /* Note that bcount is *not* DEV_BSIZE aligned. */
bcount = biosize;
- if ((off_t)lbn * biosize >= np->n_size) {
+ if ((off_t)lbn * biosize >= nsize) {
bcount = 0;
- } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
- bcount = np->n_size - (off_t)lbn * biosize;
+ } else if ((off_t)(lbn + 1) * biosize > nsize) {
+ bcount = nsize - (off_t)lbn * biosize;
}
- if (bcount != biosize) {
- switch(nfs_rslock(np, td)) {
- case ENOLCK:
- goto again;
- /* not reached */
- case EIO:
- return (EIO);
- case EINTR:
- case ERESTART:
- return(EINTR);
- /* not reached */
- default:
- break;
- }
- }
-
bp = nfs_getcacheblk(vp, lbn, bcount, td);
- if (bcount != biosize)
- nfs_rsunlock(np, td);
if (!bp) {
error = nfs_sigintr(nmp, NULL, td);
return (error ? error : EINTR);
@@ -681,7 +704,7 @@
n = np->n_direofoffset - uio->uio_offset;
break;
default:
- printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
+ nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
bp = NULL;
break;
};
@@ -719,14 +742,18 @@
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
struct thread *td = uiop->uio_td;
int size;
-
+ int wsize;
+
+ mtx_lock(&nmp->nm_mtx);
+ wsize = nmp->nm_wsize;
+ mtx_unlock(&nmp->nm_mtx);
if (ioflag & IO_SYNC) {
int iomode, must_commit;
struct uio uio;
struct iovec iov;
do_sync:
while (uiop->uio_resid > 0) {
- size = min(uiop->uio_resid, nmp->nm_wsize);
+ size = min(uiop->uio_resid, wsize);
size = min(uiop->uio_iov->iov_len, size);
iov.iov_base = uiop->uio_iov->iov_base;
iov.iov_len = size;
@@ -775,7 +802,7 @@
* in NFS directio access.
*/
while (uiop->uio_resid > 0) {
- size = min(uiop->uio_resid, nmp->nm_wsize);
+ size = min(uiop->uio_resid, wsize);
size = min(uiop->uio_iov->iov_len, size);
bp = getpbuf(&nfs_pbuf_freecnt);
t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
@@ -799,13 +826,11 @@
bp->b_wcred = NOCRED;
bp->b_caller1 = (void *)t_uio;
bp->b_vp = vp;
- vhold(vp);
error = nfs_asyncio(nmp, bp, NOCRED, td);
if (error) {
free(t_iov->iov_base, M_NFSDIRECTIO);
free(t_iov, M_NFSDIRECTIO);
free(t_uio, M_NFSDIRECTIO);
- vdrop(bp->b_vp);
bp->b_vp = NULL;
relpbuf(bp, &nfs_pbuf_freecnt);
if (error == EINTR)
@@ -846,11 +871,8 @@
daddr_t lbn;
int bcount;
int n, on, error = 0;
- int haverslock = 0;
struct proc *p = td?td->td_proc:NULL;
- GIANT_REQUIRED;
-
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
panic("nfs_write mode");
@@ -859,20 +881,29 @@
#endif
if (vp->v_type != VREG)
return (EIO);
+ mtx_lock(&np->n_mtx);
if (np->n_flag & NWRITEERR) {
np->n_flag &= ~NWRITEERR;
+ mtx_unlock(&np->n_mtx);
return (np->n_error);
- }
+ } else
+ mtx_unlock(&np->n_mtx);
+ mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
- (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
+ (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
+ mtx_unlock(&nmp->nm_mtx);
(void)nfs_fsinfo(nmp, vp, cred, td);
+ } else
+ mtx_unlock(&nmp->nm_mtx);
/*
* Synchronously flush pending buffers if we are in synchronous
* mode or if we are appending.
*/
if (ioflag & (IO_APPEND | IO_SYNC)) {
+ mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
+ mtx_unlock(&np->n_mtx);
#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
/*
* Require non-blocking, synchronous writes to
@@ -887,20 +918,22 @@
error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
if (error)
return (error);
- }
+ } else
+ mtx_unlock(&np->n_mtx);
}
/*
* If IO_APPEND then load uio_offset. We restart here if we cannot
* get the append lock.
*/
-restart:
if (ioflag & IO_APPEND) {
np->n_attrstamp = 0;
error = VOP_GETATTR(vp, &vattr, cred, td);
if (error)
return (error);
+ mtx_lock(&np->n_mtx);
uio->uio_offset = np->n_size;
+ mtx_unlock(&np->n_mtx);
}
if (uio->uio_offset < 0)
@@ -914,38 +947,6 @@
return nfs_directio_write(vp, uio, cred, ioflag);
/*
- * We need to obtain the rslock if we intend to modify np->n_size
- * in order to guarentee the append point with multiple contending
- * writers, to guarentee that no other appenders modify n_size
- * while we are trying to obtain a truncated buffer (i.e. to avoid
- * accidently truncating data written by another appender due to
- * the race), and to ensure that the buffer is populated prior to
- * our extending of the file. We hold rslock through the entire
- * operation.
- *
- * Note that we do not synchronize the case where someone truncates
- * the file while we are appending to it because attempting to lock
- * this case may deadlock other parts of the system unexpectedly.
- */
- if ((ioflag & IO_APPEND) ||
- uio->uio_offset + uio->uio_resid > np->n_size) {
- switch(nfs_rslock(np, td)) {
- case ENOLCK:
- goto restart;
- /* not reached */
- case EIO:
- return (EIO);
- case EINTR:
- case ERESTART:
- return(EINTR);
- /* not reached */
- default:
- break;
- }
- haverslock = 1;
- }
-
- /*
* Maybe this should be above the vnode op call, but so long as
* file servers have no limits, i don't think it matters
*/
@@ -955,8 +956,6 @@
lim_cur(p, RLIMIT_FSIZE)) {
psignal(p, SIGXFSZ);
PROC_UNLOCK(p);
- if (haverslock)
- nfs_rsunlock(np, td);
return (EFBIG);
}
PROC_UNLOCK(p);
@@ -972,6 +971,11 @@
* no point optimizing for something that really won't ever happen.
*/
if (!(ioflag & IO_SYNC)) {
+ int nflag;
+
+ mtx_lock(&np->n_mtx);
+ nflag = np->n_flag;
+ mtx_unlock(&np->n_mtx);
int needrestart = 0;
if (nmp->nm_wcommitsize < uio->uio_resid) {
/*
@@ -983,9 +987,9 @@
if (ioflag & IO_NDELAY)
return (EAGAIN);
ioflag |= IO_SYNC;
- if (np->n_flag & NMODIFIED)
+ if (nflag & NMODIFIED)
needrestart = 1;
- } else if (np->n_flag & NMODIFIED) {
+ } else if (nflag & NMODIFIED) {
int wouldcommit = 0;
BO_LOCK(&vp->v_bufobj);
if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
@@ -1012,13 +1016,8 @@
if (wouldcommit > nmp->nm_wcommitsize)
needrestart = 1;
}
- if (needrestart) {
- if (haverslock) {
- nfs_rsunlock(np, td);
- haverslock = 0;
- }
+ if (needrestart)
goto flush_and_restart;
- }
}
do {
@@ -1031,8 +1030,9 @@
* Handle direct append and file extension cases, calculate
* unaligned buffer size.
*/
-
+ mtx_lock(&np->n_mtx);
if (uio->uio_offset == np->n_size && n) {
+ mtx_unlock(&np->n_mtx);
/*
* Get the buffer (in its pre-append state to maintain
* B_CACHE if it was previously set). Resize the
@@ -1045,9 +1045,11 @@
if (bp != NULL) {
long save;
+ mtx_lock(&np->n_mtx);
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
+ mtx_unlock(&np->n_mtx);
save = bp->b_flags & B_CACHE;
bcount += n;
@@ -1066,12 +1068,15 @@
else
bcount = np->n_size - (off_t)lbn * biosize;
}
+ mtx_unlock(&np->n_mtx);
bp = nfs_getcacheblk(vp, lbn, bcount, td);
+ mtx_lock(&np->n_mtx);
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
}
+ mtx_unlock(&np->n_mtx);
}
if (!bp) {
@@ -1117,7 +1122,9 @@
}
if (bp->b_wcred == NOCRED)
bp->b_wcred = crhold(cred);
+ mtx_lock(&np->n_mtx);
np->n_flag |= NMODIFIED;
+ mtx_unlock(&np->n_mtx);
/*
* If dirtyend exceeds file size, chop it down. This should
@@ -1129,7 +1136,7 @@
*/
if (bp->b_dirtyend > bcount) {
- printf("NFS append race @%lx:%d\n",
+ nfs_printf("NFS append race @%lx:%d\n",
(long)bp->b_blkno * DEV_BSIZE,
bp->b_dirtyend - bcount);
bp->b_dirtyend = bcount;
@@ -1209,15 +1216,12 @@
break;
} else if ((n + on) == biosize) {
bp->b_flags |= B_ASYNC;
- (void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0);
+ (void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
} else {
bdwrite(bp);
}
} while (uio->uio_resid > 0 && n > 0);
- if (haverslock)
- nfs_rsunlock(np, td);
-
return (error);
}
@@ -1302,34 +1306,35 @@
slptimeo = 0;
}
- if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
- if (old_lock == LK_SHARED) {
- /* Upgrade to exclusive lock, this might block */
- vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
- } else {
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- }
- }
-
+ old_lock = nfs_upgrade_vnlock(vp, td);
/*
* Now, flush as required.
*/
+ if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
+ VM_OBJECT_LOCK(vp->v_bufobj.bo_object);
+ vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
+ VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object);
+ /*
+ * If the page clean was interrupted, fail the invalidation.
+ * Not doing so, we run the risk of losing dirty pages in the
+ * vinvalbuf() call below.
+ */
+ if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
+ goto out;
+ }
+
error = vinvalbuf(vp, flags, td, slpflag, 0);
while (error) {
if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
goto out;
error = vinvalbuf(vp, flags, td, 0, slptimeo);
}
- np->n_flag &= ~NMODIFIED;
+ mtx_lock(&np->n_mtx);
+ if (np->n_directio_asyncwr == 0)
+ np->n_flag &= ~NMODIFIED;
+ mtx_unlock(&np->n_mtx);
out:
- if (old_lock != LK_EXCLUSIVE) {
- if (old_lock == LK_SHARED) {
- /* Downgrade from exclusive lock, this might block */
- vn_lock(vp, LK_DOWNGRADE, td);
- } else {
- VOP_UNLOCK(vp, 0, td);
- }
- }
+ nfs_downgrade_vnlock(vp, td, old_lock);
return error;
}
@@ -1355,11 +1360,12 @@
* leave the async daemons for more important rpc's (such as reads
* and writes).
*/
+ mtx_lock(&nfs_iod_mtx);
if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
(nmp->nm_bufqiods > nfs_numasync / 2)) {
+ mtx_unlock(&nfs_iod_mtx);
return(EIO);
}
-
again:
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
@@ -1422,12 +1428,15 @@
NFS_DPF(ASYNCIO,
("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
nmp->nm_bufqwant = TRUE;
- error = nfs_tsleep(td, &nmp->nm_bufq, slpflag | PRIBIO,
+ error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx,
+ slpflag | PRIBIO,
"nfsaio", slptimeo);
if (error) {
error2 = nfs_sigintr(nmp, NULL, td);
- if (error2)
+ if (error2) {
+ mtx_unlock(&nfs_iod_mtx);
return (error2);
+ }
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
@@ -1444,6 +1453,13 @@
}
}
+ /* We might have lost our nfsiod */
+ if (nmp->nm_bufqiods == 0) {
+ NFS_DPF(ASYNCIO,
+ ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
+ goto again;
+ }
+
if (bp->b_iocmd == BIO_READ) {
if (bp->b_rcred == NOCRED && cred != NOCRED)
bp->b_rcred = crhold(cred);
@@ -1457,9 +1473,18 @@
BUF_KERNPROC(bp);
TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
nmp->nm_bufqlen++;
+ if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
+ mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
+ VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
+ VTONFS(bp->b_vp)->n_directio_asyncwr++;
+ mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
+ }
+ mtx_unlock(&nfs_iod_mtx);
return (0);
}
+ mtx_unlock(&nfs_iod_mtx);
+
/*
* All the iods are busy on other mounts, so return EIO to
* force the caller to process the i/o synchronously.
@@ -1483,7 +1508,19 @@
free(iov_base, M_NFSDIRECTIO);
free(uiop->uio_iov, M_NFSDIRECTIO);
free(uiop, M_NFSDIRECTIO);
- vdrop(bp->b_vp);
+ if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
+ struct nfsnode *np = VTONFS(bp->b_vp);
+ mtx_lock(&np->n_mtx);
+ np->n_directio_asyncwr--;
+ if (np->n_directio_asyncwr == 0) {
+ VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
+ if ((np->n_flag & NFSYNCWAIT)) {
+ np->n_flag &= ~NFSYNCWAIT;
+ wakeup((caddr_t)&np->n_directio_asyncwr);
+ }
+ }
+ mtx_unlock(&np->n_mtx);
+ }
bp->b_vp = NULL;
relpbuf(bp, &nfs_pbuf_freecnt);
}
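
The completion path above decrements n_directio_asyncwr and, once it reaches zero, clears NMODIFIED and wakes anyone flagged NFSYNCWAIT. The waiting side lives in the fsync path (nfs_vnops.c, not part of this diff); a sketch consistent with the flag's comment and the wakeup channel used above (the helper name is invented):

static int
nfs_directio_drain(struct nfsnode *np)
{
	int error = 0;

	mtx_lock(&np->n_mtx);
	while (error == 0 && np->n_directio_asyncwr > 0) {
		np->n_flag |= NFSYNCWAIT;
		error = msleep(&np->n_directio_asyncwr, &np->n_mtx,
		    PRIBIO + 1, "nfsfsync", 0);
	}
	mtx_unlock(&np->n_mtx);
	return (error);
}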
@@ -1502,7 +1539,8 @@
struct uio uio;
struct iovec io;
struct proc *p = td ? td->td_proc : NULL;
-
+ uint8_t iocmd;
+
np = VTONFS(vp);
nmp = VFSTONFS(vp->v_mount);
uiop = &uio;
@@ -1520,8 +1558,8 @@
bp->b_ioflags &= ~BIO_ERROR;
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
-
- if (bp->b_iocmd == BIO_READ) {
+ iocmd = bp->b_iocmd;
+ if (iocmd == BIO_READ) {
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
@@ -1551,11 +1589,15 @@
}
}
/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
- if (p && (vp->v_vflag & VV_TEXT) &&
- (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime))) {
- PROC_LOCK(p);
- killproc(p, "text file modification");
- PROC_UNLOCK(p);
+ if (p && (vp->v_vflag & VV_TEXT)) {
+ mtx_lock(&np->n_mtx);
+ if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
+ mtx_unlock(&np->n_mtx);
+ PROC_LOCK(p);
+ killproc(p, "text file modification");
+ PROC_UNLOCK(p);
+ } else
+ mtx_unlock(&np->n_mtx);
}
break;
case VLNK:
@@ -1585,7 +1627,7 @@
bp->b_flags |= B_INVAL;
break;
default:
- printf("nfs_doio: type %x unexpected\n", vp->v_type);
+ nfs_printf("nfs_doio: type %x unexpected\n", vp->v_type);
break;
};
if (error) {
@@ -1619,9 +1661,10 @@
/*
* Setup for actual write
*/
-
+ mtx_lock(&np->n_mtx);
if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
+ mtx_unlock(&np->n_mtx);
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
@@ -1678,8 +1721,21 @@
* the vp's paging queues so we cannot call bdirty(). The
* bp in this case is not an NFS cache block so we should
* be safe. XXX
+ *
+ * The logic below breaks up errors into recoverable and
+ * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
+ * and keep the buffer around for potential write retries.
+ * For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
+ * and save the error in the nfsnode. This is less than ideal
+ * but necessary. Keeping such buffers around could potentially
+ * cause buffer exhaustion eventually (they can never be written
+ * out, so will get constantly be re-dirtied). It also causes
+ * all sorts of vfs panics. For non-recoverable write errors,
+ * also invalidate the attrcache, so we'll be forced to go over
+ * the wire for this object, returning an error to user on next
+ * call (most of the time).
*/
- if (error == EINTR || error == EIO
+ if (error == EINTR || error == EIO || error == ETIMEDOUT
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
int s;
@@ -1695,8 +1751,12 @@
} else {
if (error) {
bp->b_ioflags |= BIO_ERROR;
+ bp->b_flags |= B_INVAL;
bp->b_error = np->n_error = error;
+ mtx_lock(&np->n_mtx);
np->n_flag |= NWRITEERR;
+ np->n_attrstamp = 0;
+ mtx_unlock(&np->n_mtx);
}
bp->b_dirtyoff = bp->b_dirtyend = 0;
}
@@ -1725,13 +1785,16 @@
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
struct nfsnode *np = VTONFS(vp);
- u_quad_t tsize = np->n_size;
+ u_quad_t tsize;
int biosize = vp->v_mount->mnt_stat.f_iosize;
int error = 0;
+ mtx_lock(&np->n_mtx);
+ tsize = np->n_size;
np->n_size = nsize;
+ mtx_unlock(&np->n_mtx);
- if (np->n_size < tsize) {
+ if (nsize < tsize) {
struct buf *bp;
daddr_t lbn;
int bufsize;
Index: nfs_nfsiod.c
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nfs_nfsiod.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nfs_nfsiod.c -L sys/nfsclient/nfs_nfsiod.c -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nfs_nfsiod.c
+++ sys/nfsclient/nfs_nfsiod.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_nfsiod.c,v 1.86 2005/02/07 18:21:50 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_nfsiod.c,v 1.91 2007/09/25 21:08:49 mohans Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -74,7 +74,7 @@
#include <nfsclient/nfsnode.h>
#include <nfsclient/nfs_lock.h>
-static MALLOC_DEFINE(M_NFSSVC, "NFS srvsock", "Nfs server structure");
+static MALLOC_DEFINE(M_NFSSVC, "nfsclient_srvsock", "Nfs server structure");
static void nfssvc_iod(void *);
@@ -90,7 +90,7 @@
unsigned int nfs_iodmax = 20;
/* Minimum number of nfsiod kthreads to keep as spares */
-static unsigned int nfs_iodmin = 4;
+static unsigned int nfs_iodmin = 0;
static int
sysctl_iodmin(SYSCTL_HANDLER_ARGS)
@@ -102,17 +102,22 @@
error = sysctl_handle_int(oidp, &newmin, 0, req);
if (error || (req->newptr == NULL))
return (error);
- if (newmin > nfs_iodmax)
- return (EINVAL);
+ mtx_lock(&nfs_iod_mtx);
+ if (newmin > nfs_iodmax) {
+ error = EINVAL;
+ goto out;
+ }
nfs_iodmin = newmin;
if (nfs_numasync >= nfs_iodmin)
- return (0);
+ goto out;
/*
* If the current number of nfsiod is lower
* than the new minimum, create some more.
*/
for (i = nfs_iodmin - nfs_numasync; i > 0; i--)
nfs_nfsiodnew();
+out:
+ mtx_unlock(&nfs_iod_mtx);
return (0);
}
SYSCTL_PROC(_vfs_nfs, OID_AUTO, iodmin, CTLTYPE_UINT | CTLFLAG_RW, 0,
@@ -131,9 +136,10 @@
return (error);
if (newmax > NFS_MAXASYNCDAEMON)
return (EINVAL);
+ mtx_lock(&nfs_iod_mtx);
nfs_iodmax = newmax;
if (nfs_numasync <= nfs_iodmax)
- return (0);
+ goto out;
/*
* If there are some asleep nfsiods that should
* exit, wakeup() them so that they check nfs_iodmax
@@ -146,6 +152,8 @@
wakeup(&nfs_iodwant[iod]);
iod--;
}
+out:
+ mtx_unlock(&nfs_iod_mtx);
return (0);
}
SYSCTL_PROC(_vfs_nfs, OID_AUTO, iodmax, CTLTYPE_UINT | CTLFLAG_RW, 0,
@@ -168,8 +176,10 @@
}
if (newiod == -1)
return (-1);
+ mtx_unlock(&nfs_iod_mtx);
error = kthread_create(nfssvc_iod, nfs_asyncdaemon + i, NULL, RFHIGHPID,
0, "nfsiod %d", newiod);
+ mtx_lock(&nfs_iod_mtx);
if (error)
return (-1);
nfs_numasync++;
@@ -183,6 +193,7 @@
int error;
TUNABLE_INT_FETCH("vfs.nfs.iodmin", &nfs_iodmin);
+ mtx_lock(&nfs_iod_mtx);
/* Silently limit the start number of nfsiod's */
if (nfs_iodmin > NFS_MAXASYNCDAEMON)
nfs_iodmin = NFS_MAXASYNCDAEMON;
@@ -192,6 +203,7 @@
if (error == -1)
panic("nfsiod_setup: nfs_nfsiodnew failed");
}
+ mtx_unlock(&nfs_iod_mtx);
}
SYSINIT(nfsiod, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, nfsiod_setup, NULL);
@@ -211,15 +223,14 @@
int myiod, timo;
int error = 0;
- mtx_lock(&Giant);
+ mtx_lock(&nfs_iod_mtx);
myiod = (int *)instance - nfs_asyncdaemon;
/*
* Main loop
*/
for (;;) {
- while (((nmp = nfs_iodmount[myiod]) == NULL
- || !TAILQ_FIRST(&nmp->nm_bufq))
- && error == 0) {
+ while (((nmp = nfs_iodmount[myiod]) == NULL)
+ || !TAILQ_FIRST(&nmp->nm_bufq)) {
if (myiod >= nfs_iodmax)
goto finish;
if (nmp)
@@ -230,12 +241,25 @@
* Always keep at least nfs_iodmin kthreads.
*/
timo = (myiod < nfs_iodmin) ? 0 : nfs_iodmaxidle * hz;
- error = tsleep(&nfs_iodwant[myiod], PWAIT | PCATCH,
+ error = msleep(&nfs_iodwant[myiod], &nfs_iod_mtx, PWAIT | PCATCH,
"-", timo);
+ if (error) {
+ nmp = nfs_iodmount[myiod];
+ /*
+ * Rechecking the nm_bufq closes a rare race where the
+ * nfsiod is woken up at the exact time the idle timeout
+ * fires
+ */
+ if (nmp && TAILQ_FIRST(&nmp->nm_bufq))
+ error = 0;
+ break;
+ }
}
if (error)
break;
while ((bp = TAILQ_FIRST(&nmp->nm_bufq)) != NULL) {
+ int giant_locked = 0;
+
/* Take one off the front of the list */
TAILQ_REMOVE(&nmp->nm_bufq, bp, b_freelist);
nmp->nm_bufqlen--;
@@ -243,6 +267,11 @@
nmp->nm_bufqwant = 0;
wakeup(&nmp->nm_bufq);
}
+ mtx_unlock(&nfs_iod_mtx);
+ if (NFS_ISV4(bp->b_vp)) {
+ giant_locked = 1;
+ mtx_lock(&Giant);
+ }
if (bp->b_flags & B_DIRECT) {
KASSERT((bp->b_iocmd == BIO_WRITE), ("nfscvs_iod: BIO_WRITE not set"));
(void)nfs_doio_directwrite(bp);
@@ -252,7 +281,9 @@
else
(void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL);
}
-
+ if (giant_locked)
+ mtx_unlock(&Giant);
+ mtx_lock(&nfs_iod_mtx);
/*
* If there are more than one iod on this mount, then defect
* so that the iods can be shared out fairly between the mounts
@@ -276,7 +307,7 @@
/* Someone may be waiting for the last nfsiod to terminate. */
if (--nfs_numasync == 0)
wakeup(&nfs_numasync);
- mtx_unlock(&Giant);
+ mtx_unlock(&nfs_iod_mtx);
if ((error == 0) || (error == EWOULDBLOCK))
kthread_exit(0);
/* Abnormal termination */
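
nfs_asyncio() now sleeps via nfs_msleep() with nfs_iod_mtx held, while nfssvc_iod() calls msleep() directly. nfs_msleep() is not defined anywhere in this diff; judging by the call site it is most likely a trivial veneer over msleep(9) that merely carries the thread pointer along (an assumption, not the committed definition):

#define	nfs_msleep(td, chan, mtx, pri, wmesg, timo)			\
	msleep((chan), (mtx), (pri), (wmesg), (timo))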
Index: nlminfo.h
===================================================================
RCS file: /home/cvs/src/sys/nfsclient/nlminfo.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/nfsclient/nlminfo.h -L sys/nfsclient/nlminfo.h -u -r1.1.1.1 -r1.2
--- sys/nfsclient/nlminfo.h
+++ sys/nfsclient/nlminfo.h
@@ -25,7 +25,7 @@
* SUCH DAMAGE.
*
* from BSDI nlminfo.h,v 2.1 1998/03/18 01:30:38 don Exp
- * $FreeBSD: src/sys/nfsclient/nlminfo.h,v 1.2.14.1 2005/10/27 18:32:39 glebius Exp $
+ * $FreeBSD: src/sys/nfsclient/nlminfo.h,v 1.3 2005/10/26 07:18:36 glebius Exp $
*/
/*