1 /*        $NetBSD: rcache.c,v 1.25 2015/08/24 17:34:03 bouyer Exp $   */
2 
3 /*-
4  * Copyright (c) 1999 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Martin J. Laubach <mjl@emsi.priv.at> and
9  *    Manuel Bouyer <Manuel.Bouyer@lip6.fr>.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __RCSID("$NetBSD: rcache.c,v 1.25 2015/08/24 17:34:03 bouyer Exp $");
36 #endif /* not lint */
37 
38 #include <sys/types.h>
39 #include <sys/uio.h>
40 #include <sys/mman.h>
41 #include <sys/param.h>
42 #include <sys/sysctl.h>
43 
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <unistd.h>
47 #include <fcntl.h>
48 #include <errno.h>
49 #include <string.h>
50 
51 #include "dump.h"
52 
53 /*-----------------------------------------------------------------------*/
#define	MAXCACHEBUFS	512	/* upper bound on the number of cache buffers */
#define	MAXMEMPART	6	/* auto-sized cache uses at most 1/6 of user memory */
56 
57 /*-----------------------------------------------------------------------*/
58 union cdesc {
59           volatile size_t cd_count;
60           struct {
61                     volatile daddr_t blkstart;
62                     volatile daddr_t blkend;      /* start + nblksread */
63                     volatile daddr_t blocksRead;
64                     volatile size_t time;
65 #ifdef DIAGNOSTICS
66                     volatile pid_t owner;
67 #endif
68           } desc;
69 #define cd_blkstart desc.blkstart
70 #define cd_blkend   desc.blkend
71 #define cd_blocksRead         desc.blocksRead
72 #define cd_time               desc.time
73 #define cd_owner    desc.owner
74 };
75 
static int findlru(void);

/*
 * Cache state.  initcache() carves a single mapping into one header
 * descriptor, `cachebufs' buffer descriptors and `cachebufs' data buffers.
 */
static void *shareBuffer = NULL;	/* base of the mapping; bread() bypasses
					 * the cache while this is NULL */
static union cdesc *cheader;		/* header descriptor (cd_count) */
static union cdesc *cdesc;		/* array of buffer descriptors */
static char *cdata;			/* start of the data buffers */
static int cachebufs;			/* number of cache buffers */
static int nblksread;			/* device blocks per cache buffer */

#ifdef STATS
/* Per-process counters reported by printcachestats(). */
static int nreads;			/* calls to bread() */
static int nphysread;			/* reads actually issued to the disk */
static int64_t readsize;		/* bytes requested through bread() */
static int64_t physreadsize;		/* bytes requested from the disk */
#endif

/* Size in bytes of one cache buffer. */
#define	CSIZE		(nblksread << dev_bshift)
/* Data buffer that belongs to the descriptor `desc' (a pointer into cdesc). */
#define	CDATA(desc)	(cdata + ((desc) - cdesc) * CSIZE)
94 
/*
 * Set up the read cache used by bread().
 *
 * cachesize	number of cache buffers; -1 derives it from hw.usermem64
 *		(1/MAXMEMPART of it, counted in cache buffers), 0 disables
 *		caching.  The count is capped at MAXCACHEBUFS.
 * readblksize	bytes to read per cache buffer; -1 means kern.maxphys.
 *		It is rounded up to whole filesystem blocks.
 *
 * The cache (one header descriptor, cachebufs buffer descriptors and
 * cachebufs data buffers) lives in a single anonymous MAP_SHARED mapping.
 * Failures are reported through msg() and leave shareBuffer NULL, which
 * makes bread() read without the cache.
 */
void
initcache(int cachesize, int readblksize)
{
	size_t len;
	size_t sharedSize;

	if (readblksize == -1) { /* use kern.maxphys */
		int kern_maxphys;
		int mib[2] = { CTL_KERN, KERN_MAXPHYS };

		len = sizeof(kern_maxphys);
		if (sysctl(mib, 2, &kern_maxphys, &len, NULL, 0) < 0) {
			msg("sysctl(kern.maxphys) failed: %s\n",
			    strerror(errno));
			return;
		}
		readblksize = kern_maxphys;
	}

	/* Convert read block size in terms of filesystem block size */
	nblksread = howmany(readblksize, ufsib->ufs_bsize);

	/* Then, convert it in terms of device block size */
	nblksread <<= ufsib->ufs_bshift - dev_bshift;

	/*
	 * An empty cache buffer cannot hold anything, and CSIZE is used as
	 * a divisor below: run without a cache in that case.
	 */
	if (nblksread <= 0)
		return;

	if (cachesize == -1) {	/* Compute from memory available */
		uint64_t usermem, cachetmp;
		int mib[2] = { CTL_HW, HW_USERMEM64 };

		len = sizeof(usermem);
		if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) {
			msg("sysctl(hw.usermem) failed: %s\n",
			    strerror(errno));
			return;
		}
		cachetmp = (usermem / MAXMEMPART) / CSIZE;
		/* for those with TB of RAM */
		cachebufs = (cachetmp > INT_MAX) ? INT_MAX : cachetmp;
	} else {		/* User specified */
		cachebufs = cachesize;
	}

	/* Don't allocate if zero (or a meaningless negative) --> no caching */
	if (cachebufs <= 0)
		return;

	if (cachebufs > MAXCACHEBUFS)
		cachebufs = MAXCACHEBUFS;

	/* Do the data-area product in size_t so it cannot overflow an int. */
	sharedSize = sizeof(union cdesc) +
	    sizeof(union cdesc) * cachebufs +
	    (size_t)cachebufs * CSIZE;
#ifdef STATS
	fprintf(stderr, "Using %d buffers (%zu bytes)\n", cachebufs,
	    sharedSize);
#endif
	shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_SHARED, -1, 0);
	if (shareBuffer == MAP_FAILED) {
		msg("can't mmap shared memory for buffer: %s\n",
		    strerror(errno));
		/*
		 * MAP_FAILED is not NULL: reset the pointer, otherwise
		 * bread() would believe the cache is usable.
		 */
		shareBuffer = NULL;
		return;
	}

	/* Header first, then the descriptor array, then the data buffers. */
	cheader = shareBuffer;
	cdesc = (union cdesc *) (((char *) shareBuffer) +
	    sizeof(union cdesc));
	cdata = ((char *) shareBuffer) + sizeof(union cdesc) +
	    sizeof(union cdesc) * cachebufs;

	memset(shareBuffer, '\0', sharedSize);
}
164 
165 /*
166  * Find the cache buffer descriptor that shows the minimal access time
167  */
168 static int
findlru(void)169 findlru(void)
170 {
171           int       i;
172           size_t    minTime = cdesc[0].cd_time;
173           int       minIdx = 0;
174 
175           for (i = 0; i < cachebufs; i++) {
176                     if (cdesc[i].cd_time < minTime) {
177                               minIdx = i;
178                               minTime = cdesc[i].cd_time;
179                     }
180           }
181 
182           return minIdx;
183 }
184 
185 /*
186  * Read data directly from disk, with smart error handling.
187  * Try to recover from hard errors by reading in sector sized pieces.
188  * Error recovery is attempted at most BREADEMAX times before seeking
189  * consent from the operator to continue.
190  */
191 
192 static int breaderrors = 0;
193 #define BREADEMAX 32
194 
195 void
rawread(daddr_t blkno,char * buf,int size)196 rawread(daddr_t blkno, char *buf, int size)
197 {
198           int cnt, i;
199 
200 #ifdef STATS
201           nphysread++;
202           physreadsize += size;
203 #endif
204 
205 loop:
206           if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) {
207                     msg("rawread: lseek fails\n");
208                     goto err;
209           }
210           if ((cnt = read(diskfd, buf, size)) == size)
211                     return;
212           if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) {
213                     /*
214                      * Trying to read the final fragment.
215                      *
216                      * NB - dump only works in TP_BSIZE blocks, hence
217                      * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
218                      * It should be smarter about not actually trying to
219                      * read more than it can get, but for the time being
220                      * we punt and scale back the read only when it gets
221                      * us into trouble. (mkm 9/25/83)
222                      */
223                     size -= dev_bsize;
224                     goto loop;
225           }
226           if (cnt == -1)
227                     msg("read error from %s: %s: [block %lld]: count=%d\n",
228                         disk, strerror(errno), (long long)blkno, size);
229           else
230                     msg("short read error from %s: [block %lld]: "
231                         "count=%d, got=%d\n",
232                         disk, (long long)blkno, size, cnt);
233 err:
234           if (++breaderrors > BREADEMAX) {
235                     msg("More than %d block read errors from %s\n",
236                         BREADEMAX, disk);
237                     broadcast("DUMP IS AILING!\n");
238                     msg("This is an unrecoverable error.\n");
239                     if (!query("Do you want to attempt to continue?")) {
240                               dumpabort(0);
241                               /*NOTREACHED*/
242                     } else
243                               breaderrors = 0;
244           }
245           /*
246            * Zero buffer, then try to read each sector of buffer separately.
247            */
248           memset(buf, 0, size);
249           for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) {
250                     if (lseek(diskfd, ((off_t)blkno << dev_bshift),
251                         SEEK_SET) == -1) {
252                               msg("rawread: lseek2 fails: %s!\n",
253                                   strerror(errno));
254                               continue;
255                     }
256                     if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize)
257                               continue;
258                     if (cnt == -1) {
259                               msg("read error from %s: %s: [sector %lld]: "
260                                   "count=%ld\n", disk, strerror(errno),
261                                   (long long)blkno, dev_bsize);
262                               continue;
263                     }
264                     msg("short read error from %s: [sector %lld]: "
265                         "count=%ld, got=%d\n",
266                         disk, (long long)blkno, dev_bsize, cnt);
267           }
268 }
269 
270 void
bread(daddr_t blkno,char * buf,int size)271 bread(daddr_t blkno, char *buf, int size)
272 {
273           int       osize = size, idx;
274           daddr_t oblkno = blkno;
275           char   *obuf = buf;
276           daddr_t numBlocks = howmany(size, dev_bsize);
277 
278 #ifdef STATS
279           nreads++;
280           readsize += size;
281 #endif
282 
283           if (!shareBuffer) {
284                     rawread(blkno, buf, size);
285                     return;
286           }
287 
288           if (flock(diskfd, LOCK_EX)) {
289                     msg("flock(LOCK_EX) failed: %s\n",
290                         strerror(errno));
291                     rawread(blkno, buf, size);
292                     return;
293           }
294 
295 retry:
296           idx = 0;
297           while (size > 0) {
298                     int       i;
299 
300                     for (i = 0; i < cachebufs; i++) {
301                               union cdesc *curr = &cdesc[(i + idx) % cachebufs];
302 
303 #ifdef DIAGNOSTICS
304                               if (curr->cd_owner) {
305                                         fprintf(stderr, "Owner is set (%d, me=%d), can"
306                                             "not happen.\n", curr->cd_owner, getpid());
307                               }
308 #endif
309 
310                               if (curr->cd_blkend == 0)
311                                         continue;
312                               /*
313                                * If we find a bit of the read in the buffers,
314                                * now compute how many blocks we can copy,
315                                * copy them out, adjust blkno, buf and size,
316                                * and restart
317                                */
318                               if (curr->cd_blkstart <= blkno &&
319                                   blkno < curr->cd_blkend) {
320                                         /* Number of data blocks to be copied */
321                                         int toCopy = MIN(size,
322                                             (curr->cd_blkend - blkno) << dev_bshift);
323 #ifdef DIAGNOSTICS
324                                         if (toCopy <= 0 || toCopy > CSIZE) {
325                                                   fprintf(stderr, "toCopy %d !\n",
326                                                       toCopy);
327                                                   dumpabort(0);
328                                         }
329                                         if (CDATA(curr) +
330                                             ((blkno - curr->cd_blkstart) <<
331                                             dev_bshift) < CDATA(curr) ||
332                                             CDATA(curr) +
333                                             ((blkno - curr->cd_blkstart) <<
334                                             dev_bshift) > CDATA(curr) + CSIZE) {
335                                                   fprintf(stderr, "%p < %p !!!\n",
336                                                      CDATA(curr) + ((blkno -
337                                                      curr->cd_blkstart) << dev_bshift),
338                                                      CDATA(curr));
339                                                   fprintf(stderr,
340                                                       "cdesc[i].cd_blkstart %lld "
341                                                       "blkno %lld dev_bsize %ld\n",
342                                                       (long long)curr->cd_blkstart,
343                                                       (long long)blkno,
344                                                       dev_bsize);
345                                                   dumpabort(0);
346                                         }
347 #endif
348                                         memcpy(buf, CDATA(curr) +
349                                             ((blkno - curr->cd_blkstart) <<
350                                             dev_bshift),
351                                             toCopy);
352 
353                                         buf       += toCopy;
354                                         size      -= toCopy;
355                                         blkno     += howmany(toCopy, dev_bsize);
356                                         numBlocks -= howmany(toCopy, dev_bsize);
357 
358                                         curr->cd_time = cheader->cd_count++;
359 
360                                         /*
361                                          * If all data of a cache block have been
362                                          * read, chances are good no more reads
363                                          * will occur, so expire the cache immediately
364                                          */
365 
366                                         curr->cd_blocksRead +=
367                                             howmany(toCopy, dev_bsize);
368                                         if (curr->cd_blocksRead >= nblksread)
369                                                   curr->cd_time = 0;
370 
371                                         goto retry;
372                               }
373                     }
374 
375                     /* No more to do? */
376                     if (size == 0)
377                               break;
378 
379                     /*
380                      * This does actually not happen if fs blocks are not greater
381                      * than nblksread.
382                      */
383                     if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) {
384                               rawread(oblkno, obuf, osize);
385                               break;
386                     } else {
387                               ssize_t   rsize;
388                               daddr_t   blockBlkNo;
389 
390                               blockBlkNo = (blkno / nblksread) * nblksread;
391                               idx = findlru();
392                               rsize = MIN(nblksread,
393                                   ufsib->ufs_dsize - blockBlkNo) << dev_bshift;
394 
395 #ifdef DIAGNOSTICS
396                               if (cdesc[idx].cd_owner)
397                                         fprintf(stderr, "Owner is set (%d, me=%d), can"
398                                             "not happen(2).\n", cdesc[idx].cd_owner,
399                                             getpid());
400                               cdesc[idx].cd_owner = getpid();
401 #endif
402                               cdesc[idx].cd_time = cheader->cd_count++;
403                               cdesc[idx].cd_blkstart = blockBlkNo;
404                               cdesc[idx].cd_blkend = 0;
405                               cdesc[idx].cd_blocksRead = 0;
406 
407                               if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift),
408                                   SEEK_SET) == -1) {
409                                         msg("readBlocks: lseek fails: %s\n",
410                                             strerror(errno));
411                                         rsize = -1;
412                               } else {
413                                         rsize = read(diskfd,
414                                             CDATA(&cdesc[idx]), rsize);
415                                         if (rsize < 0) {
416                                                   msg("readBlocks: read fails: %s\n",
417                                                       strerror(errno));
418                                         }
419                               }
420 
421                               /* On errors, panic, punt, try to read without
422                                * cache and let raw read routine do the rest.
423                                */
424 
425                               if (rsize <= 0) {
426                                         rawread(oblkno, obuf, osize);
427 #ifdef DIAGNOSTICS
428                                         if (cdesc[idx].cd_owner != getpid())
429                                                   fprintf(stderr, "Owner changed from "
430                                                       "%d to %d, can't happen\n",
431                                                       getpid(), cdesc[idx].cd_owner);
432                                         cdesc[idx].cd_owner = 0;
433 #endif
434                                         break;
435                               }
436 
437                               /* On short read, just note the fact and go on */
438                               cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize;
439 
440 #ifdef STATS
441                               nphysread++;
442                               physreadsize += rsize;
443 #endif
444 #ifdef DIAGNOSTICS
445                               if (cdesc[idx].cd_owner != getpid())
446                                         fprintf(stderr, "Owner changed from "
447                                             "%d to %d, can't happen\n",
448                                             getpid(), cdesc[idx].cd_owner);
449                               cdesc[idx].cd_owner = 0;
450 #endif
451                               /*
452                                * We swapped some of data in, let the loop fetch
453                                * them from cache
454                                */
455                     }
456           }
457 
458           if (flock(diskfd, LOCK_UN))
459                     msg("flock(LOCK_UN) failed: %s\n",
460                         strerror(errno));
461 }
462 
/*
 * Print this process' read statistics to stderr: number and volume of the
 * reads requested through bread(), number and volume of the reads issued
 * to the disk, and the resulting hit and overhead percentages.
 * Compiled to an empty function unless STATS is defined.
 */
void
printcachestats(void)
{

#ifdef STATS
	int hitpct = 0;
	int ovhpct = 0;

	/* No read may have been done yet: do not divide by zero. */
	if (nreads != 0)
		hitpct = (nreads - nphysread) * 100 / nreads;
	if (readsize != 0)
		ovhpct = (int) (((physreadsize - readsize) * 100) / readsize);

	/* The byte counters are 64 bits wide; print them without truncation. */
	fprintf(stderr, "Pid %d: %d reads (%lld bytes) "
	    "%d physical reads (%lld bytes) %d%% hits, %d%% overhead\n",
	    (int) getpid(), nreads, (long long) readsize, nphysread,
	    (long long) physreadsize, hitpct, ovhpct);
#endif
}
475