1 /*      $NetBSD: coalesce.c,v 1.33 2015/10/10 22:34:46 dholland Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39 
40 #include <ufs/lfs/lfs.h>
41 
42 #include <fcntl.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <util.h>
50 #include <errno.h>
51 #include <err.h>
52 #include <assert.h>
53 
54 #include <syslog.h>
55 
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 #include "kernelops.h"
60 
61 extern int debug, do_mmap;
62 
63 /*
64  * XXX return the arg to just int when/if we don't need it for
65  * potentially huge block counts any more.
66  */
67 static int
log2int(intmax_t n)68 log2int(intmax_t n)
69 {
70           int log;
71 
72           log = 0;
73           while (n > 0) {
74                     ++log;
75                     n >>= 1;
76           }
77           return log - 1;
78 }
79 
80 enum coalesce_returncodes {
81           COALESCE_OK = 0,
82           COALESCE_NOINODE,
83           COALESCE_TOOSMALL,
84           COALESCE_BADSIZE,
85           COALESCE_BADBLOCKSIZE,
86           COALESCE_NOMEM,
87           COALESCE_BADBMAPV,
88           COALESCE_BADMARKV,
89           COALESCE_NOTWORTHIT,
90           COALESCE_NOTHINGLEFT,
91           COALESCE_EIO,
92 
93           COALESCE_MAXERROR
94 };
95 
96 const char *coalesce_return[] = {
97           "Successfully coalesced",
98           "File not in use or inode not found",
99           "Not large enough to coalesce",
100           "Negative size",
101           "Not enough blocks to account for size",
102           "Malloc failed",
103           "LFCNBMAPV failed",
104           "Not broken enough to fix",
105           "Too many blocks not found",
106           "Too many blocks found in active segments",
107           "I/O error",
108 
109           "No such error"
110 };
111 
112 static union lfs_dinode *
get_dinode(struct clfs * fs,ino_t ino)113 get_dinode(struct clfs *fs, ino_t ino)
114 {
115           IFILE *ifp;
116           daddr_t daddr;
117           struct ubuf *bp;
118           union lfs_dinode *dip, *r;
119           unsigned i;
120 
121           lfs_ientry(&ifp, fs, ino, &bp);
122           daddr = lfs_if_getdaddr(fs, ifp);
123           brelse(bp, 0);
124 
125           if (daddr == 0x0)
126                     return NULL;
127 
128           bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
129           for (i = 0; i < LFS_INOPB(fs); i++) {
130                     dip = DINO_IN_BLOCK(fs, bp->b_data, i);
131                     if (lfs_dino_getinumber(fs, dip) == ino) {
132                               r = malloc(sizeof(*r));
133                               if (r == NULL)
134                                         break;
135                               /*
136                                * Don't just assign the union, as if we're
137                                * 32-bit and it's the last inode in the block
138                                * that will run off the end of the buffer.
139                                */
140                               if (fs->lfs_is64) {
141                                         r->u_64 = dip->u_64;
142                               } else {
143                                         r->u_32 = dip->u_32;
144                               }
145                               brelse(bp, 0);
146                               return r;
147                     }
148           }
149           brelse(bp, 0);
150           return NULL;
151 }
152 
153 /*
154  * Find out if this inode's data blocks are discontinuous; if they are,
155  * rewrite them using markv.  Return the number of inodes rewritten.
156  */
157 static int
clean_inode(struct clfs * fs,ino_t ino)158 clean_inode(struct clfs *fs, ino_t ino)
159 {
160           BLOCK_INFO *bip = NULL, *tbip;
161           CLEANERINFO cip;
162           struct ubuf *bp;
163           union lfs_dinode *dip;
164           struct clfs_seguse *sup;
165           struct lfs_fcntl_markv /* {
166                     BLOCK_INFO *blkiov;
167                     int blkcnt;
168           } */ lim;
169           daddr_t toff;
170           int noff;
171           blkcnt_t i, nb, onb;
172           int retval;
173           int bps;
174 
175           dip = get_dinode(fs, ino);
176           if (dip == NULL)
177                     return COALESCE_NOINODE;
178 
179           /* Compute file block size, set up for bmapv */
180           onb = nb = lfs_lblkno(fs, lfs_dino_getsize(fs, dip));
181 
182           /* XXX for now, don't do any file small enough to have fragments */
183           if (nb < ULFS_NDADDR) {
184                     free(dip);
185                     return COALESCE_TOOSMALL;
186           }
187 
188           /* Sanity checks */
189 #if 0     /* di_size is uint64_t -- this is a noop */
190           if (lfs_dino_getsize(fs, dip) < 0) {
191                     dlog("ino %d, negative size (%" PRId64 ")", ino,
192                          lfs_dino_getsize(fs, dip));
193                     free(dip);
194                     return COALESCE_BADSIZE;
195           }
196 #endif
197           if (nb > lfs_dino_getblocks(fs, dip)) {
198                     dlog("ino %ju, computed blocks %jd > held blocks %ju",
199                          (uintmax_t)ino, (intmax_t)nb,
200                          (uintmax_t)lfs_dino_getblocks(fs, dip));
201                     free(dip);
202                     return COALESCE_BADBLOCKSIZE;
203           }
204 
205           /*
206            * XXX: We should really coalesce really large files in
207            * chunks, as there's substantial diminishing returns and
208            * mallocing huge amounts of memory just to get those returns
209            * is pretty silly. But that requires a big rework of this
210            * code. (On the plus side though then we can stop worrying
211            * about block counts > 2^31.)
212            */
213 
214           /* ugh, no DADDR_T_MAX */
215           __CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
216           if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
217                     syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
218                            (uintmax_t)ino, (uintmax_t)nb);
219                     free(dip);
220                     return COALESCE_NOMEM;
221           }
222 
223           bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
224           if (bip == NULL) {
225                     syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
226                         (unsigned long long)ino, (intmax_t)nb,
227                         strerror(errno));
228                     free(dip);
229                     return COALESCE_NOMEM;
230           }
231           for (i = 0; i < nb; i++) {
232                     memset(bip + i, 0, sizeof(BLOCK_INFO));
233                     bip[i].bi_inode = ino;
234                     bip[i].bi_lbn = i;
235                     bip[i].bi_version = lfs_dino_getgen(fs, dip);
236                     /* Don't set the size, but let lfs_bmap fill it in */
237           }
238           /*
239            * The kernel also contains this check; but as lim.blkcnt is
240            * only 32 bits wide, we need to check ourselves too in case
241            * we'd otherwise truncate a value > 2^31, as that might
242            * succeed and create bizarre results.
243            */
244           if (nb > LFS_MARKV_MAXBLKCNT) {
245                     syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
246                            lfs_sb_getfsmnt(fs));
247                     retval = COALESCE_BADBMAPV;
248                     goto out;
249           }
250           lim.blkiov = bip;
251           lim.blkcnt = nb;
252           if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
253                     syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
254                            lfs_sb_getfsmnt(fs));
255                     retval = COALESCE_BADBMAPV;
256                     goto out;
257           }
258 #if 0
259           for (i = 0; i < nb; i++) {
260                     printf("bi_size = %d, bi_ino = %ju, "
261                         "bi_lbn = %jd, bi_daddr = %jd\n",
262                         bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
263                         (intmax_t)bip[i].bi_lbn,
264                         (intmax_t)bip[i].bi_daddr);
265           }
266 #endif
267           noff = 0;
268           toff = 0;
269           for (i = 1; i < nb; i++) {
270                     if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
271                               ++noff;
272                     toff += llabs(bip[i].bi_daddr - bip[i - 1].bi_daddr
273                         - lfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
274           }
275 
276           /*
277            * If this file is not discontinuous, there's no point in rewriting it.
278            *
279            * Explicitly allow a certain amount of discontinuity, since large
280            * files will be broken among segments and medium-sized files
281            * can have a break or two and it's okay.
282            */
283           if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
284               lfs_segtod(fs, noff) * 2 < nb) {
285                     retval = COALESCE_NOTWORTHIT;
286                     goto out;
287           } else if (debug)
288                     syslog(LOG_DEBUG, "ino %llu total discontinuity "
289                         "%d (%jd) for %jd blocks", (unsigned long long)ino,
290                         noff, (intmax_t)toff, (intmax_t)nb);
291 
292           /* Search for blocks in active segments; don't move them. */
293           for (i = 0; i < nb; i++) {
294                     if (bip[i].bi_daddr <= 0)
295                               continue;
296                     sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
297                     if (sup->flags & SEGUSE_ACTIVE)
298                               bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
299           }
300 
301           /*
302            * Get rid of any blocks we've marked dead.  If this is an older
303            * kernel that doesn't have bmapv fill in the block sizes, we'll
304            * toss everything here.
305            */
306           onb = nb;
307           toss_old_blocks(fs, &bip, &nb, NULL);
308           nb = i;
309 
310           /*
311            * We may have tossed enough blocks that it is no longer worthwhile
312            * to rewrite this inode.
313            */
314           if (nb == 0 || onb - nb > log2int(onb)) {
315                     if (debug)
316                               syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
317                     retval = COALESCE_NOTHINGLEFT;
318                     goto out;
319           }
320 
321           /*
322            * We are going to rewrite this inode.
323            * For any remaining blocks, read in their contents.
324            */
325           for (i = 0; i < nb; i++) {
326                     bip[i].bi_bp = malloc(bip[i].bi_size);
327                     if (bip[i].bi_bp == NULL) {
328                               syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
329                                   bip[i].bi_size, strerror(errno));
330                               retval = COALESCE_NOMEM;
331                               goto out;
332                     }
333 
334                     if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
335                                 lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
336                               retval = COALESCE_EIO;
337                               goto out;
338                     }
339           }
340           if (debug)
341                     syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
342                         (uintmax_t)ino, (intmax_t)nb);
343 
344           /*
345            * Write in segment-sized chunks.  If at any point we'd write more
346            * than half of the available segments, sleep until that's not
347            * true any more.
348            *
349            * XXX the pointer arithmetic in this loop is illegal; replace
350            * TBIP with an integer (blkcnt_t) offset.
351            */
352           bps = lfs_segtod(fs, 1);
353           for (tbip = bip; tbip < bip + nb; tbip += bps) {
354                     do {
355                               bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp);
356                               cip = *(CLEANERINFO *)bp->b_data;
357                               brelse(bp, B_INVAL);
358 
359                               if (lfs_ci_getclean(fs, &cip) < 4) /* XXX magic number 4 */
360                                         kops.ko_fcntl(fs->clfs_ifilefd,
361                                             LFCNSEGWAIT, NULL);
362                     } while (lfs_ci_getclean(fs, &cip) < 4);
363 
364                     /*
365                      * Note that although lim.blkcnt is 32 bits wide, bps
366                      * (which is blocks-per-segment) is < 2^32 so the
367                      * value assigned here is always in range.
368                      */
369                     lim.blkiov = tbip;
370                     lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
371                     if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
372                               retval = COALESCE_BADMARKV;
373                               goto out;
374                     }
375           }
376 
377           retval = COALESCE_OK;
378 out:
379           free(dip);
380           if (bip) {
381                     for (i = 0; i < onb; i++)
382                               if (bip[i].bi_bp)
383                                         free(bip[i].bi_bp);
384                     free(bip);
385           }
386           return retval;
387 }
388 
389 /*
390  * Try coalescing every inode in the filesystem.
391  * Return the number of inodes actually altered.
392  */
clean_all_inodes(struct clfs * fs)393 int clean_all_inodes(struct clfs *fs)
394 {
395           int i, r, maxino;
396           int totals[COALESCE_MAXERROR];
397           struct stat st;
398 
399           memset(totals, 0, sizeof(totals));
400 
401           fstat(fs->clfs_ifilefd, &st);
402           maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
403                     lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
404 
405           for (i = 0; i < maxino; i++) {
406                     r = clean_inode(fs, i);
407                     ++totals[r];
408           }
409 
410           for (i = 0; i < COALESCE_MAXERROR; i++)
411                     if (totals[i])
412                               syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
413                                      totals[i]);
414 
415           return totals[COALESCE_OK];
416 }
417 
418 /*
419  * Fork a child process to coalesce this fs.
420  */
421 int
fork_coalesce(struct clfs * fs)422 fork_coalesce(struct clfs *fs)
423 {
424           static pid_t childpid;
425           int num;
426 
427           /*
428            * If already running a coalescing child, don't start a new one.
429            */
430           if (childpid) {
431                     if (waitpid(childpid, NULL, WNOHANG) == childpid)
432                               childpid = 0;
433           }
434           if (childpid && kill(childpid, 0) >= 0) {
435                     /* already running a coalesce process */
436                     if (debug)
437                               syslog(LOG_DEBUG, "coalescing already in progress");
438                     return 0;
439           }
440 
441           /*
442            * Fork a child and let the child coalease
443            */
444           childpid = fork();
445           if (childpid < 0) {
446                     syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
447                     return 0;
448           } else if (childpid == 0) {
449                     syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
450                            lfs_sb_getfsmnt(fs), getpid());
451                     num = clean_all_inodes(fs);
452                     syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
453                            lfs_sb_getfsmnt(fs), num);
454                     exit(0);
455           }
456 
457           return 0;
458 }
459