/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__MBSDID("$MidnightBSD$");

#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfsnode.h>
#include <nfs/nfs_kdtrace.h>

static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
		    struct thread *td);
static int	nfs_directio_write(struct vnode *vp, struct uio *uiop,
		    struct ucred *cred, int ioflag);

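/*
 * Knobs controlling uncached ("direct") I/O.  They are defined elsewhere
 * in the NFS client and, in FreeBSD-derived kernels, are sysctl-tunable.
 */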
extern int nfs_directio_enable;
extern int nfs_directio_allow_mmap;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_object_t object;
	vm_page_t *pages;
	struct nfsnode *np;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if ((object = vp->v_object) == NULL) {
		nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
		return (VM_PAGER_ERROR);
	}

	if (nfs_directio_enable && !nfs_directio_allow_mmap) {
		mtx_lock(&np->n_mtx);
		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
			mtx_unlock(&np->n_mtx);
			nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
			return (VM_PAGER_ERROR);
		} else
			mtx_unlock(&np->n_mtx);
	}

	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		/* We'll never get here for v4, because we always have fsinfo */
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */
	VM_OBJECT_LOCK(object);
	if (pages[ap->a_reqpage]->valid != 0) {
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage) {
				vm_page_lock(pages[i]);
				vm_page_free(pages[i]);
				vm_page_unlock(pages[i]);
			}
		}
		VM_OBJECT_UNLOCK(object);
		return (0);
	}
	VM_OBJECT_UNLOCK(object);

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

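	/*
	 * Fetch the entire page run with a single READ RPC; the uio set up
	 * above spans every page that was mapped into the pbuf's KVA.
	 */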
	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		nfs_printf("nfs_getpages: error %d\n", error);
		VM_OBJECT_LOCK(object);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage) {
				vm_page_lock(pages[i]);
				vm_page_free(pages[i]);
				vm_page_unlock(pages[i]);
			}
		}
		VM_OBJECT_UNLOCK(object);
		return (VM_PAGER_ERROR);
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
	VM_OBJECT_LOCK(object);
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			KASSERT(m->dirty == 0,
			    ("nfs_getpages: page %p is dirty", m));
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_valid(m, 0, size - toff);
			KASSERT(m->dirty == 0,
			    ("nfs_getpages: page %p is dirty", m));
		} else {
			/*
			 * Read operation was short.  If no error
			 * occurred we may have hit a zero-fill
			 * section.  We leave valid set to 0, and page
			 * is freed by vm_page_readahead_finish() if
			 * its index is not equal to requested, or
			 * page is zeroed and set valid by
			 * vm_pager_get_pages() for requested page.
			 */
			;
		}
		if (i != ap->a_reqpage)
			vm_page_readahead_finish(m);
	}
	VM_OBJECT_UNLOCK(object);
	return (0);
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	/* Set the cred to n_writecred for the write rpcs. */
	if (np->n_writecred != NULL)
		cred = crhold(np->n_writecred);
	else
		cred = crhold(curthread->td_ucred);	/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	mtx_lock(&np->n_mtx);
	if (nfs_directio_enable && !nfs_directio_allow_mmap &&
	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
		mtx_unlock(&np->n_mtx);
		nfs_printf("nfs_putpages: called on non-cacheable vnode??\n");
		mtx_lock(&np->n_mtx);
	}

	for (i = 0; i < npages; i++)
		rtvals[i] = VM_PAGER_ERROR;

	/*
	 * When putting pages, do not extend file past EOF.
	 */
	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}
	mtx_unlock(&np->n_mtx);

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	PCPU_INC(cnt.v_vnodeout);
	PCPU_ADD(cnt.v_vnodepgsout, count);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

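	/*
	 * Unless the pager requires a synchronous write, use an UNSTABLE
	 * write RPC: the server may reply before the data reaches stable
	 * storage and a later commit finishes the job.  VM_PAGER_PUT_SYNC
	 * forces FILESYNC up front.
	 */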
	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);
	crfree(cred);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid);
		if (must_commit) {
			nfs_clearcommit(vp->v_mount);
		}
	}
	return rtvals[0];
}

/*
 * For nfs, cache consistency can only be maintained approximately.
 * Although RFC1094 does not specify the criteria, the following is
 * believed to be compatible with the reference port.
 * For nfs:
 * If the file's modify time on the server has changed since the
 * last read rpc or you have written to the file,
 * you may have lost data cache consistency with the
 * server, so flush all of the file's data out of the cache.
 * Then force a getattr rpc to ensure that you have up to date
 * attributes.
 * NB: This implies that cache data can be read when up to
 * NFS_ATTRTIMEO seconds out of date.  If you find that you need current
 * attributes this could be forced by setting n_attrstamp to 0 before
 * the VOP_GETATTR() call.
 */
static inline int
nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
{
	int error = 0;
	struct vattr vattr;
	struct nfsnode *np = VTONFS(vp);
	int old_lock;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	/*
	 * Grab the exclusive lock before checking whether the cache is
	 * consistent.
	 * XXX - We can make this cheaper later (by acquiring cheaper locks).
	 * But for now, this suffices.
	 */
	old_lock = nfs_upgrade_vnlock(vp);
	if (vp->v_iflag & VI_DOOMED) {
		nfs_downgrade_vnlock(vp, old_lock);
		return (EBADF);
	}

	mtx_lock(&np->n_mtx);
	if (np->n_flag & NMODIFIED) {
		mtx_unlock(&np->n_mtx);
		if (vp->v_type != VREG) {
			if (vp->v_type != VDIR)
				panic("nfs: bioread, not dir");
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				goto out;
		}
		np->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		mtx_lock(&np->n_mtx);
		np->n_mtime = vattr.va_mtime;
		mtx_unlock(&np->n_mtx);
	} else {
		mtx_unlock(&np->n_mtx);
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		if ((np->n_flag & NSIZECHANGED)
		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
			mtx_unlock(&np->n_mtx);
			if (vp->v_type == VDIR)
				(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				goto out;
			mtx_lock(&np->n_mtx);
			np->n_mtime = vattr.va_mtime;
			np->n_flag &= ~NSIZECHANGED;
		}
		mtx_unlock(&np->n_mtx);
	}
out:
	nfs_downgrade_vnlock(vp, old_lock);
	return error;
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, i;
	struct buf *bp, *rabp;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	off_t end;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

	KASSERT(uio->uio_rw == UIO_READ, ("nfs_read mode"));
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	end = uio->uio_offset + uio->uio_resid;
	if (vp->v_type != VDIR &&
	    (end > nmp->nm_maxfilesize || end < uio->uio_offset))
		return (EFBIG);

	if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
		/* No caching/no readaheads.  Just read data into the user buffer */
		return nfs_readrpc(vp, uio, cred);

	biosize = vp->v_bufobj.bo_bsize;
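	/*
	 * The upper bits of ioflag carry the VFS sequential-access heuristic
	 * (IO_SEQSHIFT); scaling it by biosize/BKVASIZE turns that hint into
	 * a count of NFS-sized blocks to read ahead.
	 */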
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);

	error = nfs_bioread_check_cons(vp, td, cred);
	if (error)
		return error;

	do {
		u_quad_t nsize;

		mtx_lock(&np->n_mtx);
		nsize = np->n_size;
		mtx_unlock(&np->n_mtx);

		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
				    (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
					rabn = lbn + 1 + nra;
					if (incore(&vp->v_bufobj, rabn) == NULL) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, td);
						if (!rabp) {
							error = nfs_sigintr(nmp, td);
							return (error ? error : EINTR);
						}
						if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
							rabp->b_flags |= B_ASYNC;
							rabp->b_iocmd = BIO_READ;
							vfs_busy_pages(rabp, 0);
							if (nfs_asyncio(nmp, rabp, cred, td)) {
								rabp->b_flags |= B_INVAL;
								rabp->b_ioflags |= BIO_ERROR;
								vfs_unbusy_pages(rabp);
								brelse(rabp);
								break;
							}
						} else {
							brelse(rabp);
						}
					}
				}
			}

			/* Note that bcount is *not* DEV_BSIZE aligned. */
			bcount = biosize;
			if ((off_t)lbn * biosize >= nsize) {
				bcount = 0;
			} else if ((off_t)(lbn + 1) * biosize > nsize) {
				bcount = nsize - (off_t)lbn * biosize;
			}
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (!bp) {
				error = nfs_sigintr(nmp, td);
				return (error ? error : EINTR);
			}

			/*
			 * If B_CACHE is not set, we must issue the read.  If this
			 * fails, we return an error.
			 */

			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(vp, bp, cred, td);
				if (error) {
					brelse(bp);
					return (error);
				}
			}

			/*
			 * on is the offset into the current bp.  Figure out how many
			 * bytes we can copy out of the bp.  Note that bcount is
			 * NOT DEV_BSIZE aligned.
			 *
			 * Then figure out how many bytes we can copy into the uio.
			 */

			n = 0;
			if (on < bcount)
				n = MIN((unsigned)(bcount - on), uio->uio_resid);
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
			if (!bp) {
				error = nfs_sigintr(nmp, td);
				return (error ? error : EINTR);
			}
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(vp, bp, cred, td);
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					brelse(bp);
					return (error);
				}
			}
			n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
			if (!bp) {
				error = nfs_sigintr(nmp, td);
				return (error ? error : EINTR);
			}
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(vp, bp, cred, td);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					(nmp->nm_rpcops->nr_invaldir)(vp);
					error = nfs_vinvalbuf(vp, 0, td, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server.  The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 *
					 * Leave the last bp intact unless there is an error.
					 * Loop back up to the while if the error is another
					 * NFSERR_BAD_COOKIE (double yuck!).
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
						if (!bp) {
							error = nfs_sigintr(nmp, td);
							return (error ? error : EINTR);
						}
						if ((bp->b_flags & B_CACHE) == 0) {
							bp->b_iocmd = BIO_READ;
							vfs_busy_pages(bp, 0);
							error = nfs_doio(vp, bp, cred, td);
							/*
							 * no error + B_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out.  If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error.  If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nmp->nm_readahead > 0 &&
			    (bp->b_flags & B_INVAL) == 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    incore(&vp->v_bufobj, lbn + 1) == NULL) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
				if (rabp) {
					if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
						rabp->b_flags |= B_ASYNC;
						rabp->b_iocmd = BIO_READ;
						vfs_busy_pages(rabp, 0);
						if (nfs_asyncio(nmp, rabp, cred, td)) {
							rabp->b_flags |= B_INVAL;
							rabp->b_ioflags |= BIO_ERROR;
							vfs_unbusy_pages(rabp);
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF.  So
			 * an NFS directory buffer is *not* chopped to its EOF.  Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF.  *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state ( due to
			 * being VMIO ) later.  So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			bp = NULL;
			break;
		}

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		if (vp->v_type == VLNK)
			n = 0;
		if (bp != NULL)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * The NFS write path cannot handle iovecs with len > 1.  So we need to
 * break up iovecs accordingly (restricting them to wsize).
 * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
 * For the ASYNC case, 2 copies are needed.  The first is a copy from the
 * user buffer to a staging buffer and then a second copy from the staging
 * buffer to mbufs.  This can be optimized by copying from the user buffer
 * directly into mbufs and passing the chain down, but that requires a
 * fair amount of re-working of the relevant codepaths (and can be done
 * later).
 */
static int
nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred,
    int ioflag)
{
	int error;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct thread *td = uiop->uio_td;
	int size;
	int wsize;

	mtx_lock(&nmp->nm_mtx);
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	if (ioflag & IO_SYNC) {
		int iomode, must_commit;
		struct uio uio;
		struct iovec iov;
do_sync:
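		/*
		 * Push the data as a series of synchronous FILESYNC write
		 * RPCs, each at most wsize bytes and never crossing an
		 * iovec boundary.
		 */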
		while (uiop->uio_resid > 0) {
			size = MIN(uiop->uio_resid, wsize);
			size = MIN(uiop->uio_iov->iov_len, size);
			iov.iov_base = uiop->uio_iov->iov_base;
			iov.iov_len = size;
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			uio.uio_offset = uiop->uio_offset;
			uio.uio_resid = size;
			uio.uio_segflg = UIO_USERSPACE;
			uio.uio_rw = UIO_WRITE;
			uio.uio_td = td;
			iomode = NFSV3WRITE_FILESYNC;
			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
			    &iomode, &must_commit);
			KASSERT((must_commit == 0),
			    ("nfs_directio_write: Did not commit write"));
			if (error)
				return (error);
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	} else {
		struct uio *t_uio;
		struct iovec *t_iov;
		struct buf *bp;

		/*
		 * Break up the write into blocksize chunks and hand these
		 * over to nfsiod's for write back.
		 * Unfortunately, this incurs a copy of the data, since
		 * the user could modify the buffer before the write is
		 * initiated.
		 *
		 * The obvious optimization here is that one of the 2 copies
		 * in the async write path can be eliminated by copying the
		 * data here directly into mbufs and passing the mbuf chain
		 * down.  But that will require a fair amount of re-working
		 * of the code and can be done if there's enough interest
		 * in NFS directio access.
		 */
		while (uiop->uio_resid > 0) {
			size = MIN(uiop->uio_resid, wsize);
			size = MIN(uiop->uio_iov->iov_len, size);
			bp = getpbuf(&nfs_pbuf_freecnt);
			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_len = size;
			t_uio->uio_iov = t_iov;
			t_uio->uio_iovcnt = 1;
			t_uio->uio_offset = uiop->uio_offset;
			t_uio->uio_resid = size;
			t_uio->uio_segflg = UIO_SYSSPACE;
			t_uio->uio_rw = UIO_WRITE;
			t_uio->uio_td = td;
			KASSERT(uiop->uio_segflg == UIO_USERSPACE ||
			    uiop->uio_segflg == UIO_SYSSPACE,
			    ("nfs_directio_write: Bad uio_segflg"));
			if (uiop->uio_segflg == UIO_USERSPACE) {
				error = copyin(uiop->uio_iov->iov_base,
				    t_iov->iov_base, size);
				if (error != 0)
					goto err_free;
			} else
				/*
				 * UIO_SYSSPACE may never happen, but handle
				 * it just in case it does.
				 */
				bcopy(uiop->uio_iov->iov_base, t_iov->iov_base,
				    size);
			bp->b_flags |= B_DIRECT;
			bp->b_iocmd = BIO_WRITE;
			if (cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			} else
				bp->b_wcred = NOCRED;
			bp->b_caller1 = (void *)t_uio;
			bp->b_vp = vp;
			error = nfs_asyncio(nmp, bp, NOCRED, td);
err_free:
			if (error) {
				free(t_iov->iov_base, M_NFSDIRECTIO);
				free(t_iov, M_NFSDIRECTIO);
				free(t_uio, M_NFSDIRECTIO);
				bp->b_vp = NULL;
				relpbuf(bp, &nfs_pbuf_freecnt);
				if (error == EINTR)
					return (error);
				goto do_sync;
			}
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	}
	return (0);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	off_t end;
	int bcount;
	int n, on, error = 0;

	KASSERT(uio->uio_rw == UIO_WRITE, ("nfs_write mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("nfs_write proc"));
	if (vp->v_type != VREG)
		return (EIO);
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		mtx_unlock(&np->n_mtx);
		return (np->n_error);
	} else
		mtx_unlock(&np->n_mtx);
	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		mtx_lock(&np->n_mtx);
		if (np->n_flag & NMODIFIED) {
			mtx_unlock(&np->n_mtx);
#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
			/*
			 * Require non-blocking, synchronous writes to
			 * dirty files to inform the program it needs
			 * to fsync(2) explicitly.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
#endif
flush_and_restart:
			np->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		uio->uio_offset = np->n_size;
		mtx_unlock(&np->n_mtx);
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	end = uio->uio_offset + uio->uio_resid;
	if (end > nmp->nm_maxfilesize || end < uio->uio_offset)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
		return nfs_directio_write(vp, uio, cred, ioflag);

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, td))
		return (EFBIG);

	biosize = vp->v_bufobj.bo_bsize;
	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return error.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	if (!(ioflag & IO_SYNC)) {
		int nflag, needrestart = 0;

		mtx_lock(&np->n_mtx);
		nflag = np->n_flag;
		mtx_unlock(&np->n_mtx);
		if (nmp->nm_wcommitsize < uio->uio_resid) {
			/*
			 * If this request could not possibly be completed
			 * without exceeding the maximum outstanding write
			 * commit size, see if we can convert it into a
			 * synchronous write operation.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
			ioflag |= IO_SYNC;
			if (nflag & NMODIFIED)
				needrestart = 1;
		} else if (nflag & NMODIFIED) {
			int wouldcommit = 0;

			BO_LOCK(&vp->v_bufobj);
			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
				    b_bobufs) {
					if (bp->b_flags & B_NEEDCOMMIT)
						wouldcommit += bp->b_bcount;
				}
			}
			BO_UNLOCK(&vp->v_bufobj);
			/*
			 * Since we're not operating synchronously and
			 * bypassing the buffer cache, we are in a commit
			 * and holding all of these buffers whether
			 * transmitted or not.  If not limited, this
			 * will lead to the buffer cache deadlocking,
			 * as no one else can flush our uncommitted buffers.
			 */
			wouldcommit += uio->uio_resid;
			/*
			 * If we would initially exceed the maximum
			 * outstanding write commit size, flush and restart.
			 */
			if (wouldcommit > nmp->nm_wcommitsize)
				needrestart = 1;
		}
		if (needrestart)
			goto flush_and_restart;
	}

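	/*
	 * Main loop: push the data through the buffer cache one NFS block
	 * (biosize bytes) at a time.
	 */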
	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = MIN((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		mtx_lock(&np->n_mtx);
		if (uio->uio_offset == np->n_size && n) {
			mtx_unlock(&np->n_mtx);
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				mtx_lock(&np->n_mtx);
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
				mtx_unlock(&np->n_mtx);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			mtx_unlock(&np->n_mtx);
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			mtx_lock(&np->n_mtx);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
			mtx_unlock(&np->n_mtx);
		}

		if (!bp) {
			error = nfs_sigintr(nmp, td);
			if (!error)
				error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		mtx_lock(&np->n_mtx);
		np->n_flag |= NMODIFIED;
		mtx_unlock(&np->n_mtx);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			nfs_printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_valid(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

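	/*
	 * On interruptible mounts, let a signal abort the getblk() sleep;
	 * if getblk() fails, poll with a 2 second timeout, checking for a
	 * pending signal before each retry.
	 */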
1240 |
if (nmp->nm_flag & NFSMNT_INT) { |
1241 |
sigset_t oldset; |
1242 |
|
1243 |
nfs_set_sigmask(td, &oldset); |
1244 |
bp = getblk(vp, bn, size, NFS_PCATCH, 0, 0); |
1245 |
nfs_restore_sigmask(td, &oldset); |
1246 |
while (bp == NULL) { |
1247 |
if (nfs_sigintr(nmp, td)) |
1248 |
return (NULL); |
1249 |
bp = getblk(vp, bn, size, 0, 2 * hz, 0); |
1250 |
} |
1251 |
} else { |
1252 |
bp = getblk(vp, bn, size, 0, 0, 0); |
1253 |
} |
1254 |
|
1255 |
if (vp->v_type == VREG) |
1256 |
bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE); |
1257 |
return (bp); |
1258 |
} |

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int old_lock = 0;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = NFS_PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	old_lock = nfs_upgrade_vnlock(vp);
	if (vp->v_iflag & VI_DOOMED) {
		/*
		 * Since vgonel() uses the generic vinvalbuf() to flush
		 * dirty buffers and it does not call this function, it
		 * is safe to just return OK when VI_DOOMED is set.
		 */
		nfs_downgrade_vnlock(vp, old_lock);
		return (0);
	}

	/*
	 * Now, flush as required.
	 */
	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
		VM_OBJECT_LOCK(vp->v_bufobj.bo_object);
		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
		VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object);
		/*
		 * If the page clean was interrupted, fail the invalidation.
		 * Not doing so, we run the risk of losing dirty pages in the
		 * vinvalbuf() call below.
		 */
		if (intrflg && (error = nfs_sigintr(nmp, td)))
			goto out;
	}

	error = vinvalbuf(vp, flags, slpflag, 0);
	while (error) {
		if (intrflg && (error = nfs_sigintr(nmp, td)))
			goto out;
		error = vinvalbuf(vp, flags, 0, slptimeo);
	}
	mtx_lock(&np->n_mtx);
	if (np->n_directio_asyncwr == 0)
		np->n_flag &= ~NMODIFIED;
	mtx_unlock(&np->n_mtx);
out:
	nfs_downgrade_vnlock(vp, old_lock);
	return error;
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
{
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error, error2;

	/*
	 * Commits are usually short and sweet so let's save some CPU and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	mtx_lock(&nfs_iod_mtx);
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		mtx_unlock(&nfs_iod_mtx);
		return (EIO);
	}
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = NFS_PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod] == NFSIOD_AVAILABLE) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod)
		nfs_nfsiodnew();
	else {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: %d iods are already processing mount %p\n",
			    nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx,
			    slpflag | PRIBIO, "nfsaio", slptimeo);
			if (error) {
				error2 = nfs_sigintr(nmp, td);
				if (error2) {
					mtx_unlock(&nfs_iod_mtx);
					return (error2);
				}
				if (slpflag == NFS_PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			goto again;
		}

		/* We might have lost our nfsiod */
		if (nmp->nm_bufqiods == 0) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
			goto again;
		}

		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

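		/*
		 * Queue the buffer on the mount's bufq; an nfsiod will
		 * dequeue it and hand it to nfs_doio() (or, for direct
		 * writes, nfs_doio_directwrite()).
		 */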
		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
			VTONFS(bp->b_vp)->n_directio_asyncwr++;
			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
		}
		mtx_unlock(&nfs_iod_mtx);
		return (0);
	}

	mtx_unlock(&nfs_iod_mtx);

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

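/*
 * Completion side of nfs_directio_write()'s async path, run from an
 * nfsiod: push the staged data to the server with a FILESYNC write RPC,
 * free the staging uio/iovec/data, and wake any fsync() waiter once the
 * vnode's last outstanding direct write drains.
 */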
void
nfs_doio_directwrite(struct buf *bp)
{
	int iomode, must_commit;
	struct uio *uiop = (struct uio *)bp->b_caller1;
	char *iov_base = uiop->uio_iov->iov_base;
	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);

	iomode = NFSV3WRITE_FILESYNC;
	uiop->uio_td = NULL;	/* NULL since we're in nfsiod */
	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
	free(iov_base, M_NFSDIRECTIO);
	free(uiop->uio_iov, M_NFSDIRECTIO);
	free(uiop, M_NFSDIRECTIO);
	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
		struct nfsnode *np = VTONFS(bp->b_vp);

		mtx_lock(&np->n_mtx);
		np->n_directio_asyncwr--;
		if (np->n_directio_asyncwr == 0) {
			VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
			if ((np->n_flag & NFSYNCWAIT)) {
				np->n_flag &= ~NFSYNCWAIT;
				wakeup((caddr_t)&np->n_directio_asyncwr);
			}
		}
		mtx_unlock(&np->n_mtx);
	}
	bp->b_vp = NULL;
	relpbuf(bp, &nfs_pbuf_freecnt);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
{
	struct uio *uiop;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td ? td->td_proc : NULL;
	uint8_t iocmd;

	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * Clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
	iocmd = bp->b_iocmd;
	if (iocmd == BIO_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;

		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);

			if (!error) {
				if (uiop->uio_resid) {
					/*
					 * If we had a short read with no error, we must have
					 * hit a file hole.  We should zero-fill the remainder.
					 * This can also occur if the server hits the file EOF.
					 *
					 * Holes used to be able to occur due to pending
					 * writes, but that is not possible any longer.
					 */
					int nread = bp->b_bcount - uiop->uio_resid;
					int left = uiop->uio_resid;

					if (left > 0)
						bzero((char *)bp->b_data + nread, left);
					uiop->uio_resid = 0;
				}
			}
			/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
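			/*
			 * If the file backs a running executable (VV_TEXT)
			 * and its modification time changed on the server,
			 * kill the process: its text pages can no longer be
			 * trusted.
			 */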
			if (p && (vp->v_vflag & VV_TEXT)) {
				mtx_lock(&np->n_mtx);
				if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
					mtx_unlock(&np->n_mtx);
					PROC_LOCK(p);
					killproc(p, "text file modification");
					PROC_UNLOCK(p);
				} else
					mtx_unlock(&np->n_mtx);
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			/*
			 * end-of-directory sets B_INVAL but does not generate an
			 * error.
			 */
			if (error == 0 && uiop->uio_resid == bp->b_bcount)
				bp->b_flags |= B_INVAL;
			break;
		default:
			nfs_printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If we only need to commit, try to commit
		 */
		if (bp->b_flags & B_NEEDCOMMIT) {
			int retv;
			off_t off;

			off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
			retv = (nmp->nm_rpcops->nr_commit)(
			    vp, off, bp->b_dirtyend - bp->b_dirtyoff,
			    bp->b_wcred, td);
			if (retv == 0) {
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
				bp->b_resid = 0;
				bufdone(bp);
				return (0);
			}
			if (retv == NFSERR_STALEWRITEVERF) {
				nfs_clearcommit(vp->v_mount);
			}
		}

		/*
		 * Setup for actual write
		 */
		mtx_lock(&np->n_mtx);
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
		mtx_unlock(&np->n_mtx);

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
			    - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
			    + bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;

			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;

			error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);

			/*
			 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
			 * to cluster the buffers needing commit.  This will allow
			 * the system to submit a single commit rpc for the whole
			 * cluster.  We can do this even if the buffer is not 100%
			 * dirty (relative to the NFS blocksize), so we optimize the
			 * append-to-file-case.
			 *
			 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
			 * cleared because write clustering only works for commit
			 * rpc's, not for the data portion of the write).
			 */

			if (!error && iomode == NFSV3WRITE_UNSTABLE) {
				bp->b_flags |= B_NEEDCOMMIT;
				if (bp->b_dirtyoff == 0
				    && bp->b_dirtyend == bp->b_bcount)
					bp->b_flags |= B_CLUSTEROK;
			} else {
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			}

			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set BIO_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 *
			 * If the buffer is marked B_PAGING, it does not reside on
			 * the vp's paging queues so we cannot call bdirty().  The
			 * bp in this case is not an NFS cache block so we should
			 * be safe. XXX
			 *
			 * The logic below breaks up errors into recoverable and
			 * unrecoverable.  For the former, we clear B_INVAL|B_NOCACHE
			 * and keep the buffer around for potential write retries.
			 * For the latter (e.g. ESTALE), we toss the buffer away
			 * (B_INVAL) and save the error in the nfsnode.  This is
			 * less than ideal but necessary.  Keeping such buffers
			 * around could potentially cause buffer exhaustion
			 * eventually (they can never be written out, so will
			 * constantly get re-dirtied).  It also causes all sorts of
			 * vfs panics.  For non-recoverable write errors, also
			 * invalidate the attrcache, so we'll be forced to go over
			 * the wire for this object, returning an error to user on
			 * next call (most of the time).
			 */
			if (error == EINTR || error == EIO || error == ETIMEDOUT
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				int s;

				s = splbio();
				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
				if ((bp->b_flags & B_PAGING) == 0) {
					bdirty(bp);
					bp->b_flags &= ~B_DONE;
				}
				if (error && (bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
				splx(s);
			} else {
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					bp->b_flags |= B_INVAL;
					bp->b_error = np->n_error = error;
					mtx_lock(&np->n_mtx);
					np->n_flag |= NWRITEERR;
					np->n_attrstamp = 0;
					KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
					mtx_unlock(&np->n_mtx);
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			bufdone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */
int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize;
	int biosize = vp->v_bufobj.bo_bsize;
	int error = 0;

	mtx_lock(&np->n_mtx);
	tsize = np->n_size;
	np->n_size = nsize;
	mtx_unlock(&np->n_mtx);

	if (nsize < tsize) {
		struct buf *bp;
		daddr_t lbn;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, cred, td, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
		if (!bp)
			return EINTR;
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
		brelse(bp);
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return (error);
}