xref: /dragonfly/sbin/hammer/cmd_dedup.c (revision d50f9ae3448247db98eb135b85b2a32e6e4187f4)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 
37 #include <sys/tree.h>
38 #include <libutil.h>
39 #include <openssl/sha.h>
40 
41 #define DEDUP_BUF (64 * 1024)
42 
43 /* Sorted list of block CRCs - light version for dedup-simulate */
44 struct sim_dedup_entry_rb_tree;
45 static RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
46                                                   RB_INITIALIZER(&sim_dedup_tree);
47 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
48                     rb_sim_dedup_entry_compare, hammer_crc_t);
49 
50 struct sim_dedup_entry {
51           hammer_crc_t        crc;
52           uint64_t  ref_blks; /* number of blocks referenced */
53           uint64_t  ref_size; /* size of data referenced */
54           RB_ENTRY(sim_dedup_entry) rb_entry;
55 };
56 
57 struct dedup_entry {
58           struct hammer_btree_leaf_elm leaf;
59           union {
60                     struct {
61                               uint64_t ref_blks;
62                               uint64_t ref_size;
63                     } de;
64                     RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
65           } u;
66           uint8_t flags;
67           RB_ENTRY(dedup_entry) rb_entry;
68 };
69 
70 #define HAMMER_DEDUP_ENTRY_FICTITIOUS   0x0001
71 
72 struct sha_dedup_entry {
73           struct hammer_btree_leaf_elm  leaf;
74           uint64_t                      ref_blks;
75           uint64_t                      ref_size;
76           uint8_t                                 sha_hash[SHA256_DIGEST_LENGTH];
77           RB_ENTRY(sha_dedup_entry)     fict_entry;
78 };
79 
80 /* Sorted list of HAMMER B-Tree keys */
81 struct dedup_entry_rb_tree;
82 struct sha_dedup_entry_rb_tree;
83 
84 static RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
85                                                   RB_INITIALIZER(&dedup_tree);
86 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
87                     rb_dedup_entry_compare, hammer_crc_t);
88 
89 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
90                     rb_sha_dedup_entry_compare);
91 
92 /*
93  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
94  */
95 static STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
96                                         STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
97 
98 struct pass2_dedup_entry {
99           struct hammer_btree_leaf_elm  leaf;
100           STAILQ_ENTRY(pass2_dedup_entry)         sq_entry;
101 };
102 
103 #define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */
104 
105 static int SigInfoFlag;
106 static int SigAlrmFlag;
107 static int64_t DedupDataReads;
108 static int64_t DedupCurrentRecords;
109 static int64_t DedupTotalRecords;
110 static uint32_t DedupCrcStart;
111 static uint32_t DedupCrcEnd;
112 static uint64_t MemoryUse;
113 
114 /* PFS global ids - we deal with just one PFS at a run */
115 static int glob_fd;
116 static struct hammer_ioc_pseudofs_rw glob_pfs;
117 
118 /*
119  * Global accounting variables
120  *
121  * Last three don't have to be 64-bit, just to be safe..
122  */
123 static uint64_t dedup_alloc_size;
124 static uint64_t dedup_ref_size;
125 static uint64_t dedup_skipped_size;
126 static uint64_t dedup_crc_failures;
127 static uint64_t dedup_sha_failures;
128 static uint64_t dedup_underflows;
129 static uint64_t dedup_successes_count;
130 static uint64_t dedup_successes_bytes;
131 
132 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
133                                         struct sim_dedup_entry *sim_de2);
134 static int rb_dedup_entry_compare(struct dedup_entry *de1,
135                                         struct dedup_entry *de2);
136 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
137                                         struct sha_dedup_entry *sha_de2);
138 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
139 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
140 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
143 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
144 static void dump_simulated_dedup(void);
145 static void dump_real_dedup(void);
146 static void dedup_usage(int code);
147 
148 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
149                     rb_sim_dedup_entry_compare, hammer_crc_t, crc);
150 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
151                     rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
152 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
153                     rb_sha_dedup_entry_compare);
154 
155 static
156 int
rb_sim_dedup_entry_compare(struct sim_dedup_entry * sim_de1,struct sim_dedup_entry * sim_de2)157 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
158                               struct sim_dedup_entry *sim_de2)
159 {
160           if (sim_de1->crc < sim_de2->crc)
161                     return (-1);
162           if (sim_de1->crc > sim_de2->crc)
163                     return (1);
164 
165           return (0);
166 }
167 
168 static
169 int
rb_dedup_entry_compare(struct dedup_entry * de1,struct dedup_entry * de2)170 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
171 {
172           if (de1->leaf.data_crc < de2->leaf.data_crc)
173                     return (-1);
174           if (de1->leaf.data_crc > de2->leaf.data_crc)
175                     return (1);
176 
177           return (0);
178 }
179 
180 static
181 int
rb_sha_dedup_entry_compare(struct sha_dedup_entry * sha_de1,struct sha_dedup_entry * sha_de2)182 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
183                               struct sha_dedup_entry *sha_de2)
184 {
185           unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
186           unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
187           int i;
188 
189           for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
190                     if (h1[i] < h2[i])
191                               return (-1);
192                     if (h1[i] > h2[i])
193                               return (1);
194           }
195 
196           return (0);
197 }
198 
199 /*
200  * dedup-simulate <filesystem>
201  */
202 void
hammer_cmd_dedup_simulate(char ** av,int ac)203 hammer_cmd_dedup_simulate(char **av, int ac)
204 {
205           struct sim_dedup_entry *sim_de;
206 
207           if (ac != 1) {
208                     dedup_usage(1);
209                     /* not reached */
210           }
211 
212           glob_fd = getpfs(&glob_pfs, av[0]);
213 
214           /*
215            * Collection passes (memory limited)
216            */
217           printf("Dedup-simulate running\n");
218           do {
219                     DedupCrcStart = DedupCrcEnd;
220                     DedupCrcEnd = 0;
221                     MemoryUse = 0;
222 
223                     if (VerboseOpt) {
224                               printf("B-Tree pass  crc-range %08x-max\n",
225                                         DedupCrcStart);
226                               fflush(stdout);
227                     }
228                     scan_pfs(av[0], collect_btree_elm, "simu-pass");
229 
230                     if (VerboseOpt >= 2)
231                               dump_simulated_dedup();
232 
233                     /*
234                      * Calculate simulated dedup ratio and get rid of the tree
235                      */
236                     while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
237                               assert(sim_de->ref_blks != 0);
238                               dedup_ref_size += sim_de->ref_size;
239                               dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
240 
241                               RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
242                               free(sim_de);
243                     }
244                     if (DedupCrcEnd && VerboseOpt == 0)
245                               printf(".");
246           } while (DedupCrcEnd);
247 
248           printf("Dedup-simulate %s succeeded\n", av[0]);
249           relpfs(glob_fd, &glob_pfs);
250 
251           printf("Simulated dedup ratio = %.2f\n",
252               (dedup_alloc_size != 0) ?
253                     (double)dedup_ref_size / dedup_alloc_size : 0);
254 }
255 
256 /*
257  * dedup <filesystem>
258  */
259 void
hammer_cmd_dedup(char ** av,int ac)260 hammer_cmd_dedup(char **av, int ac)
261 {
262           struct dedup_entry *de;
263           struct sha_dedup_entry *sha_de;
264           struct pass2_dedup_entry *pass2_de;
265           char *tmp;
266           char buf[8];
267           int needfree = 0;
268 
269           if (TimeoutOpt > 0)
270                     alarm(TimeoutOpt);
271 
272           if (ac != 1) {
273                     dedup_usage(1);
274                     /* not reached */
275           }
276 
277           STAILQ_INIT(&pass2_dedup_queue);
278 
279           glob_fd = getpfs(&glob_pfs, av[0]);
280 
281           /*
282            * A cycle file is _required_ for resuming dedup after the timeout
283            * specified with -t has expired. If no -c option, then place a
284            * .dedup.cycle file either in the PFS snapshots directory or in
285            * the default snapshots directory.
286            */
287           if (!CyclePath) {
288                     if (glob_pfs.ondisk->snapshots[0] != '/') {
289                               asprintf(&tmp, "%s/%s/.dedup.cycle",
290                                   SNAPSHOTS_BASE, av[0]);
291                     } else {
292                               asprintf(&tmp, "%s/.dedup.cycle",
293                                   glob_pfs.ondisk->snapshots);
294                     }
295                     CyclePath = tmp;
296                     needfree = 1;
297           }
298 
299           /*
300            * Pre-pass to cache the btree
301            */
302           scan_pfs(av[0], count_btree_elm, "pre-pass ");
303           DedupTotalRecords = DedupCurrentRecords;
304 
305           /*
306            * Collection passes (memory limited)
307            */
308           printf("Dedup running\n");
309           do {
310                     DedupCrcStart = DedupCrcEnd;
311                     DedupCrcEnd = 0;
312                     MemoryUse = 0;
313 
314                     if (VerboseOpt) {
315                               printf("B-Tree pass  crc-range %08x-max\n",
316                                         DedupCrcStart);
317                               fflush(stdout);
318                     }
319                     scan_pfs(av[0], process_btree_elm, "main-pass");
320 
321                     while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
322                               if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
323                                         dedup_skipped_size -= pass2_de->leaf.data_len;
324 
325                               STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
326                               free(pass2_de);
327                     }
328                     assert(STAILQ_EMPTY(&pass2_dedup_queue));
329 
330                     if (VerboseOpt >= 2)
331                               dump_real_dedup();
332 
333                     /*
334                      * Calculate dedup ratio and get rid of the trees
335                      */
336                     while ((de = RB_ROOT(&dedup_tree)) != NULL) {
337                               if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
338                                         while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
339                                                   assert(sha_de->ref_blks != 0);
340                                                   dedup_ref_size += sha_de->ref_size;
341                                                   dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
342 
343                                                   RB_REMOVE(sha_dedup_entry_rb_tree,
344                                                                       &de->u.fict_root, sha_de);
345                                                   free(sha_de);
346                                         }
347                                         assert(RB_EMPTY(&de->u.fict_root));
348                               } else {
349                                         assert(de->u.de.ref_blks != 0);
350                                         dedup_ref_size += de->u.de.ref_size;
351                                         dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
352                               }
353 
354                               RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
355                               free(de);
356                     }
357                     assert(RB_EMPTY(&dedup_tree));
358                     if (DedupCrcEnd && VerboseOpt == 0)
359                               printf(".");
360           } while (DedupCrcEnd);
361 
362           printf("Dedup %s succeeded\n", av[0]);
363           relpfs(glob_fd, &glob_pfs);
364 
365           humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
366           printf("Dedup ratio = %.2f (in this run)\n"
367                  "    %8s referenced\n",
368                  ((dedup_alloc_size != 0) ?
369                               (double)dedup_ref_size / dedup_alloc_size : 0),
370                  buf
371           );
372           humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
373           printf("    %8s allocated\n", buf);
374           humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
375           printf("    %8s skipped\n", buf);
376           printf("    %8jd CRC collisions\n"
377                  "    %8jd SHA collisions\n"
378                  "    %8jd big-block underflows\n"
379                  "    %8jd new dedup records\n"
380                  "    %8jd new dedup bytes\n",
381                  (intmax_t)dedup_crc_failures,
382                  (intmax_t)dedup_sha_failures,
383                (intmax_t)dedup_underflows,
384                (intmax_t)dedup_successes_count,
385                (intmax_t)dedup_successes_bytes
386           );
387 
388           /* Once completed remove cycle file */
389           hammer_reset_cycle();
390 
391           /* We don't want to mess up with other directives */
392           if (needfree) {
393                     free(tmp);
394                     CyclePath = NULL;
395           }
396 }
397 
398 static
399 int
count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused,int flags __unused)400 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
401 {
402           return(1);
403 }
404 
405 static
406 int
collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf,int flags __unused)407 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
408 {
409           struct sim_dedup_entry *sim_de;
410 
411           /*
412            * If we are using too much memory we have to clean some out, which
413            * will cause the run to use multiple passes.  Be careful of integer
414            * overflows!
415            */
416           if (MemoryUse > MemoryLimit) {
417                     DedupCrcEnd = DedupCrcStart +
418                                     (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
419                     if (VerboseOpt) {
420                               printf("memory limit  crc-range %08x-%08x\n",
421                                         DedupCrcStart, DedupCrcEnd);
422                               fflush(stdout);
423                     }
424                     for (;;) {
425                               sim_de = RB_MAX(sim_dedup_entry_rb_tree,
426                                                   &sim_dedup_tree);
427                               if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
428                                         break;
429                               RB_REMOVE(sim_dedup_entry_rb_tree,
430                                           &sim_dedup_tree, sim_de);
431                               MemoryUse -= sizeof(*sim_de);
432                               free(sim_de);
433                     }
434           }
435 
436           /*
437            * Collect statistics based on the CRC only, do not try to read
438            * any data blocks or run SHA hashes.
439            */
440           sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
441                                  scan_leaf->data_crc);
442 
443           if (sim_de == NULL) {
444                     sim_de = calloc(1, sizeof(*sim_de));
445                     sim_de->crc = scan_leaf->data_crc;
446                     RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
447                     MemoryUse += sizeof(*sim_de);
448           }
449 
450           sim_de->ref_blks += 1;
451           sim_de->ref_size += scan_leaf->data_len;
452           return (1);
453 }
454 
455 static __inline
456 int
validate_dedup_pair(hammer_btree_leaf_elm_t p,hammer_btree_leaf_elm_t q)457 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
458 {
459           if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset))
460                     return (1);
461           if (p->data_len != q->data_len)
462                     return (1);
463 
464           return (0);
465 }
466 
467 #define DEDUP_TECH_FAILURE    1
468 #define DEDUP_CMP_FAILURE     2
469 #define DEDUP_INVALID_ZONE    3
470 #define DEDUP_UNDERFLOW                 4
471 #define DEDUP_VERS_FAILURE    5
472 
473 static __inline
474 int
deduplicate(hammer_btree_leaf_elm_t p,hammer_btree_leaf_elm_t q)475 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
476 {
477           struct hammer_ioc_dedup dedup;
478 
479           bzero(&dedup, sizeof(dedup));
480 
481           /*
482            * If data_offset fields are the same there is no need to run ioctl,
483            * candidate is already dedup'ed.
484            */
485           if (p->data_offset == q->data_offset)
486                     return (0);
487 
488           dedup.elm1 = p->base;
489           dedup.elm2 = q->base;
490           RunningIoctl = 1;
491           if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
492                     if (errno == EOPNOTSUPP)
493                               return (DEDUP_VERS_FAILURE); /* must be at least version 5 */
494                     /* Technical failure - locking or w/e */
495                     return (DEDUP_TECH_FAILURE);
496           }
497           if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
498                     return (DEDUP_CMP_FAILURE);
499           if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
500                     return (DEDUP_INVALID_ZONE);
501           if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
502                     return (DEDUP_UNDERFLOW);
503           RunningIoctl = 0;
504           ++dedup_successes_count;
505           dedup_successes_bytes += p->data_len;
506           return (0);
507 }
508 
509 static
510 int
process_btree_elm(hammer_btree_leaf_elm_t scan_leaf,int flags)511 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
512 {
513           struct dedup_entry *de;
514           struct sha_dedup_entry *sha_de, temp;
515           struct pass2_dedup_entry *pass2_de;
516           int error;
517 
518           /*
519            * If we are using too much memory we have to clean some out, which
520            * will cause the run to use multiple passes.  Be careful of integer
521            * overflows!
522            */
523           while (MemoryUse > MemoryLimit) {
524                     DedupCrcEnd = DedupCrcStart +
525                                     (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
526                     if (VerboseOpt) {
527                               printf("memory limit  crc-range %08x-%08x\n",
528                                         DedupCrcStart, DedupCrcEnd);
529                               fflush(stdout);
530                     }
531 
532                     for (;;) {
533                               de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
534                               if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
535                                         break;
536                               if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
537                                         while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
538                                                NULL) {
539                                                   RB_REMOVE(sha_dedup_entry_rb_tree,
540                                                               &de->u.fict_root, sha_de);
541                                                   MemoryUse -= sizeof(*sha_de);
542                                                   free(sha_de);
543                                         }
544                               }
545                               RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
546                               MemoryUse -= sizeof(*de);
547                               free(de);
548                     }
549           }
550 
551           /*
552            * Collect statistics based on the CRC.  Colliding CRCs usually
553            * cause a SHA sub-tree to be created under the de.
554            *
555            * Trivial case if de not found.
556            */
557           de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
558           if (de == NULL) {
559                     de = calloc(1, sizeof(*de));
560                     de->leaf = *scan_leaf;
561                     RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
562                     MemoryUse += sizeof(*de);
563                     goto upgrade_stats;
564           }
565 
566           /*
567            * Found entry in CRC tree
568            */
569           if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
570                     /*
571                      * Optimize the case where a CRC failure results in multiple
572                      * SHA entries.  If we unconditionally issue a data-read a
573                      * degenerate situation where a colliding CRC's second SHA
574                      * entry contains the lion's share of the deduplication
575                      * candidates will result in excessive data block reads.
576                      *
577                      * Deal with the degenerate case by looking for a matching
578                      * data_offset/data_len in the SHA elements we already have
579                      * before reading the data block and generating a new SHA.
580                      */
581                     RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
582                               if (sha_de->leaf.data_offset ==
583                                                             scan_leaf->data_offset &&
584                                   sha_de->leaf.data_len == scan_leaf->data_len) {
585                                         memcpy(temp.sha_hash, sha_de->sha_hash,
586                                                   SHA256_DIGEST_LENGTH);
587                                         break;
588                               }
589                     }
590 
591                     /*
592                      * Entry in CRC tree is fictitious, so we already had problems
593                      * with this CRC. Upgrade (compute SHA) the candidate and
594                      * dive into SHA subtree. If upgrade fails insert the candidate
595                      * into Pass2 list (it will be processed later).
596                      */
597                     if (sha_de == NULL) {
598                               if (upgrade_chksum(scan_leaf, temp.sha_hash))
599                                         goto pass2_insert;
600 
601                               sha_de = RB_FIND(sha_dedup_entry_rb_tree,
602                                                    &de->u.fict_root, &temp);
603                     }
604 
605                     /*
606                      * Nothing in SHA subtree so far, so this is a new
607                      * 'dataset'. Insert new entry into SHA subtree.
608                      */
609                     if (sha_de == NULL) {
610                               sha_de = calloc(1, sizeof(*sha_de));
611                               sha_de->leaf = *scan_leaf;
612                               memcpy(sha_de->sha_hash, temp.sha_hash,
613                                      SHA256_DIGEST_LENGTH);
614                               RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
615                                           sha_de);
616                               MemoryUse += sizeof(*sha_de);
617                               goto upgrade_stats_sha;
618                     }
619 
620                     /*
621                      * Found entry in SHA subtree, it means we have a potential
622                      * dedup pair. Validate it (zones have to match and data_len
623                      * field have to be the same too. If validation fails, treat
624                      * it as a SHA collision (jump to sha256_failure).
625                      */
626                     if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
627                               goto sha256_failure;
628 
629                     /*
630                      * We have a valid dedup pair (SHA match, validated).
631                      *
632                      * In case of technical failure (dedup pair was good, but
633                      * ioctl failed anyways) insert the candidate into Pass2 list
634                      * (we will try to dedup it after we are done with the rest of
635                      * the tree).
636                      *
637                      * If ioctl fails because either of blocks is in the non-dedup
638                      * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
639                      * bother with the candidate and terminate early.
640                      *
641                      * If ioctl fails because of big-block underflow replace the
642                      * leaf node that found dedup entry represents with scan_leaf.
643                      */
644                     error = deduplicate(&sha_de->leaf, scan_leaf);
645                     switch(error) {
646                     case 0:
647                               goto upgrade_stats_sha;
648                     case DEDUP_TECH_FAILURE:
649                               goto pass2_insert;
650                     case DEDUP_CMP_FAILURE:
651                               goto sha256_failure;
652                     case DEDUP_INVALID_ZONE:
653                               goto terminate_early;
654                     case DEDUP_UNDERFLOW:
655                               ++dedup_underflows;
656                               sha_de->leaf = *scan_leaf;
657                               memcpy(sha_de->sha_hash, temp.sha_hash,
658                                         SHA256_DIGEST_LENGTH);
659                               goto upgrade_stats_sha;
660                     case DEDUP_VERS_FAILURE:
661                               errx(1, "HAMMER filesystem must be at least "
662                                         "version 5 to dedup");
663                               /* not reached */
664                     default:
665                               fprintf(stderr, "Unknown error\n");
666                               goto terminate_early;
667                     }
668 
669                     /*
670                      * Ooh la la.. SHA-256 collision. Terminate early, there's
671                      * nothing we can do here.
672                      */
673 sha256_failure:
674                     ++dedup_sha_failures;
675                     goto terminate_early;
676           } else {
677                     /*
678                      * Candidate CRC is good for now (we found an entry in CRC
679                      * tree and it's not fictitious). This means we have a
680                      * potential dedup pair.
681                      */
682                     if (validate_dedup_pair(&de->leaf, scan_leaf))
683                               goto crc_failure;
684 
685                     /*
686                      * We have a valid dedup pair (CRC match, validated)
687                      */
688                     error = deduplicate(&de->leaf, scan_leaf);
689                     switch(error) {
690                     case 0:
691                               goto upgrade_stats;
692                     case DEDUP_TECH_FAILURE:
693                               goto pass2_insert;
694                     case DEDUP_CMP_FAILURE:
695                               goto crc_failure;
696                     case DEDUP_INVALID_ZONE:
697                               goto terminate_early;
698                     case DEDUP_UNDERFLOW:
699                               ++dedup_underflows;
700                               de->leaf = *scan_leaf;
701                               goto upgrade_stats;
702                     case DEDUP_VERS_FAILURE:
703                               errx(1, "HAMMER filesystem must be at least "
704                                         "version 5 to dedup");
705                               /* not reached */
706                     default:
707                               fprintf(stderr, "Unknown error\n");
708                               goto terminate_early;
709                     }
710 
711 crc_failure:
712                     /*
713                      * We got a CRC collision - either ioctl failed because of
714                      * the comparison failure or validation of the potential
715                      * dedup pair went bad. In all cases insert both blocks
716                      * into SHA subtree (this requires checksum upgrade) and mark
717                      * entry that corresponds to this CRC in the CRC tree
718                      * fictitious, so that all futher operations with this CRC go
719                      * through SHA subtree.
720                      */
721                     ++dedup_crc_failures;
722 
723                     /*
724                      * Insert block that was represented by now fictitious dedup
725                      * entry (create a new SHA entry and preserve stats of the
726                      * old CRC one). If checksum upgrade fails insert the
727                      * candidate into Pass2 list and return - keep both trees
728                      * unmodified.
729                      */
730                     sha_de = calloc(1, sizeof(*sha_de));
731                     sha_de->leaf = de->leaf;
732                     sha_de->ref_blks = de->u.de.ref_blks;
733                     sha_de->ref_size = de->u.de.ref_size;
734                     if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
735                               free(sha_de);
736                               goto pass2_insert;
737                     }
738                     MemoryUse += sizeof(*sha_de);
739 
740                     RB_INIT(&de->u.fict_root);
741                     /*
742                      * Here we can insert without prior checking because the tree
743                      * is empty at this point
744                      */
745                     RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
746 
747                     /*
748                      * Mark entry in CRC tree fictitious
749                      */
750                     de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
751 
752                     /*
753                      * Upgrade checksum of the candidate and insert it into
754                      * SHA subtree. If upgrade fails insert the candidate into
755                      * Pass2 list.
756                      */
757                     if (upgrade_chksum(scan_leaf, temp.sha_hash))
758                               goto pass2_insert;
759                     sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
760                                          &temp);
761                     if (sha_de != NULL) {
762                               /* There is an entry with this SHA already, but the only
763                                * RB-tree element at this point is that entry we just
764                                * added. We know for sure these blocks are different
765                                * (this is crc_failure branch) so treat it as SHA
766                                * collision.
767                                */
768                               goto sha256_failure;
769                     }
770 
771                     sha_de = calloc(1, sizeof(*sha_de));
772                     sha_de->leaf = *scan_leaf;
773                     memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
774                     RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
775                     MemoryUse += sizeof(*sha_de);
776                     goto upgrade_stats_sha;
777           }
778 
779 upgrade_stats:
780           de->u.de.ref_blks += 1;
781           de->u.de.ref_size += scan_leaf->data_len;
782           return (1);
783 
784 upgrade_stats_sha:
785           sha_de->ref_blks += 1;
786           sha_de->ref_size += scan_leaf->data_len;
787           return (1);
788 
789 pass2_insert:
790           /*
791            * If in pass2 mode don't insert anything, fall through to
792            * terminate_early
793            */
794           if ((flags & DEDUP_PASS2) == 0) {
795                     pass2_de = calloc(1, sizeof(*pass2_de));
796                     pass2_de->leaf = *scan_leaf;
797                     STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
798                     dedup_skipped_size += scan_leaf->data_len;
799                     return (1);
800           }
801 
802 terminate_early:
803           /*
804            * Early termination path. Fixup stats.
805            */
806           dedup_alloc_size += scan_leaf->data_len;
807           dedup_ref_size += scan_leaf->data_len;
808           return (0);
809 }
810 
811 static
812 int
upgrade_chksum(hammer_btree_leaf_elm_t leaf,uint8_t * sha_hash)813 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash)
814 {
815           struct hammer_ioc_data data;
816           char *buf = malloc(DEDUP_BUF);
817           SHA256_CTX ctx;
818           int error;
819 
820           bzero(&data, sizeof(data));
821           data.elm = leaf->base;
822           data.ubuf = buf;
823           data.size = DEDUP_BUF;
824 
825           error = 0;
826           if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
827                     fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
828                     error = 1;
829                     goto done;
830           }
831           DedupDataReads += leaf->data_len;
832 
833           if (data.leaf.data_len != leaf->data_len) {
834                     error = 1;
835                     goto done;
836           }
837 
838           if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
839               data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
840                     SHA256_Init(&ctx);
841                     SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
842                     SHA256_Final(sha_hash, &ctx);
843           }
844 
845 done:
846           free(buf);
847           return (error);
848 }
849 
850 static
851 void
sigAlrm(int signo __unused)852 sigAlrm(int signo __unused)
853 {
854           SigAlrmFlag = 1;
855 }
856 
857 static
858 void
sigInfo(int signo __unused)859 sigInfo(int signo __unused)
860 {
861           SigInfoFlag = 1;
862 }
863 
864 static
865 void
scan_pfs(char * filesystem,scan_pfs_cb_t func,const char * id)866 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
867 {
868           struct hammer_ioc_mirror_rw mirror;
869           hammer_ioc_mrecord_any_t mrec;
870           struct hammer_btree_leaf_elm elm;
871           char *buf = malloc(DEDUP_BUF);
872           char buf1[8];
873           char buf2[8];
874           int offset, bytes;
875 
876           SigInfoFlag = 0;
877           DedupDataReads = 0;
878           DedupCurrentRecords = 0;
879           signal(SIGINFO, sigInfo);
880           signal(SIGALRM, sigAlrm);
881 
882           /*
883            * Deduplication happens per element so hammer(8) is in full
884            * control of the ioctl()s to actually perform it. SIGALRM
885            * needs to be handled within hammer(8) but a checkpoint
886            * is needed for resuming. Use cycle file for that.
887            *
888            * Try to obtain the previous obj_id from the cycle file and
889            * if not available just start from the beginning.
890            */
891           bzero(&mirror, sizeof(mirror));
892           hammer_key_beg_init(&mirror.key_beg);
893           hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
894 
895           if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
896                     if (VerboseOpt) {
897                               fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
898                                   id, (uintmax_t)mirror.key_beg.obj_id);
899                     }
900           }
901 
902           hammer_key_end_init(&mirror.key_end);
903 
904           mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
905           mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
906           mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
907           mirror.ubuf = buf;
908           mirror.size = DEDUP_BUF;
909           mirror.pfs_id = glob_pfs.pfs_id;
910           mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
911 
912           if (VerboseOpt && DedupCrcStart == 0) {
913                     printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
914                               id, filesystem,
915                               (uintmax_t)mirror.key_beg.obj_id,
916                               mirror.key_beg.localization,
917                               (uintmax_t)mirror.key_end.obj_id,
918                               mirror.key_end.localization);
919                     printf("%s %s: pfs_id %d\n",
920                               id, filesystem, glob_pfs.pfs_id);
921           }
922           fflush(stdout);
923           fflush(stderr);
924 
925           do {
926                     mirror.count = 0;
927                     mirror.pfs_id = glob_pfs.pfs_id;
928                     mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
929                     if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
930                               err(1, "Mirror-read %s failed", filesystem);
931                               /* not reached */
932                     }
933                     if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
934                               errx(1, "Mirror-read %s fatal error %d",
935                                         filesystem, mirror.head.error);
936                               /* not reached */
937                     }
938                     if (mirror.count) {
939                               offset = 0;
940                               while (offset < mirror.count) {
941                                         mrec = (void *)((char *)buf + offset);
942                                         bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
943                                         if (offset + bytes > mirror.count) {
944                                                   errx(1, "Misaligned record");
945                                                   /* not reached */
946                                         }
947                                         assert((mrec->head.type &
948                                                HAMMER_MRECF_TYPE_MASK) ==
949                                                HAMMER_MREC_TYPE_REC);
950                                         offset += bytes;
951                                         elm = mrec->rec.leaf;
952                                         if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
953                                                   continue;
954                                         if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
955                                                   continue;
956                                         ++DedupCurrentRecords;
957                                         if (DedupCrcStart != DedupCrcEnd) {
958                                                   if (elm.data_crc < DedupCrcStart)
959                                                             continue;
960                                                   if (DedupCrcEnd &&
961                                                       elm.data_crc >= DedupCrcEnd) {
962                                                             continue;
963                                                   }
964                                         }
965                                         func(&elm, 0);
966                               }
967                     }
968                     mirror.key_beg = mirror.key_cur;
969                     if (DidInterrupt || SigAlrmFlag) {
970                               if (VerboseOpt) {
971                                         fprintf(stderr, "%s\n",
972                                             (DidInterrupt ? "Interrupted" : "Timeout"));
973                               }
974                               hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
975                               if (VerboseOpt) {
976                                         fprintf(stderr, "Cyclefile %s updated for "
977                                             "continuation\n", CyclePath);
978                               }
979                               exit(1);
980                     }
981                     if (SigInfoFlag) {
982                               if (DedupTotalRecords) {
983                                         humanize_unsigned(buf1, sizeof(buf1),
984                                                               DedupDataReads,
985                                                               "B", 1024);
986                                         humanize_unsigned(buf2, sizeof(buf2),
987                                                               dedup_successes_bytes,
988                                                               "B", 1024);
989                                         fprintf(stderr, "%s count %7jd/%jd "
990                                                             "(%02d.%02d%%) "
991                                                             "ioread %s newddup %s\n",
992                                                   id,
993                                                   (intmax_t)DedupCurrentRecords,
994                                                   (intmax_t)DedupTotalRecords,
995                                                   (int)(DedupCurrentRecords * 100 /
996                                                             DedupTotalRecords),
997                                                   (int)(DedupCurrentRecords * 10000 /
998                                                             DedupTotalRecords % 100),
999                                                   buf1, buf2);
1000                               } else {
1001                                         fprintf(stderr, "%s count %-7jd\n",
1002                                                   id,
1003                                                   (intmax_t)DedupCurrentRecords);
1004                               }
1005                               SigInfoFlag = 0;
1006                     }
1007           } while (mirror.count != 0);
1008 
1009           signal(SIGINFO, SIG_IGN);
1010           signal(SIGALRM, SIG_IGN);
1011 
1012           free(buf);
1013 }
1014 
1015 static
1016 void
dump_simulated_dedup(void)1017 dump_simulated_dedup(void)
1018 {
1019           struct sim_dedup_entry *sim_de;
1020 
1021           printf("=== Dumping simulated dedup entries:\n");
1022           RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1023                     printf("\tcrc=%08x cnt=%ju size=%ju\n",
1024                               sim_de->crc,
1025                               (uintmax_t)sim_de->ref_blks,
1026                               (uintmax_t)sim_de->ref_size);
1027           }
1028           printf("end of dump ===\n");
1029 }
1030 
1031 static
1032 void
dump_real_dedup(void)1033 dump_real_dedup(void)
1034 {
1035           struct dedup_entry *de;
1036           struct sha_dedup_entry *sha_de;
1037           int i;
1038 
1039           printf("=== Dumping dedup entries:\n");
1040           RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1041                     if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1042                               printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1043 
1044                               RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1045                                         printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1046                                                "\t\tsha=",
1047                                                sha_de->leaf.data_crc,
1048                                                (uintmax_t)sha_de->ref_blks,
1049                                                (uintmax_t)sha_de->ref_size);
1050                                         for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1051                                                   printf("%02x", sha_de->sha_hash[i]);
1052                                         printf("\n");
1053                               }
1054                     } else {
1055                               printf("\tcrc=%08x cnt=%ju size=%ju\n",
1056                                      de->leaf.data_crc,
1057                                      (uintmax_t)de->u.de.ref_blks,
1058                                      (uintmax_t)de->u.de.ref_size);
1059                     }
1060           }
1061           printf("end of dump ===\n");
1062 }
1063 
1064 static
1065 void
dedup_usage(int code)1066 dedup_usage(int code)
1067 {
1068           fprintf(stderr,
1069                     "hammer dedup-simulate <filesystem>\n"
1070                     "hammer dedup <filesystem>\n"
1071           );
1072           exit(code);
1073 }
1074