1 /*-
2  * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This material is based upon work partially supported by The
6  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 /*
31  * BPF byte-code generation for NPF rules.
32  *
33  * Overview
34  *
35  *        Each NPF rule is compiled into a BPF micro-program.  There is a
36  *        BPF byte-code fragment for each higher-level filtering logic,
37  *        e.g. to match L4 protocol, IP/mask, etc.  The generation process
38  *        combines multiple BPF byte-code fragments into one program.
39  *
40  * Basic case
41  *
42  *        Consider a basic case where all filters should match.  They
43  *        are expressed as logical conjunction, e.g.:
44  *
45  *                  A and B and C and D
46  *
47  *        Each test (filter) criterion can be evaluated to true (match) or
48  *        false (no match) and the logic is as follows:
49  *
50  *        - If the value is true, then jump to the "next" test (offset 0).
51  *
52  *        - If the value is false, then jump to the JUMP_MAGIC value (0xff).
53  *        This "magic" value is used to indicate that it will have to be
54  *        patched at a later stage.
55  *
56  *        Once all byte-code fragments are combined into one, then there
57  *        are two additional steps:
58  *
59  *        - Two instructions are appended at the end of the program: "return
60  *        success" followed by "return failure".
61  *
62  *        - All jumps with the JUMP_MAGIC value are patched to point to the
63  *        "return failure" instruction.
64  *
65  *        Therefore, if all filter criteria match, then the "return
66  *        success" instruction will be reached, indicating a successful
67  *        match of the rule.  Otherwise, if any of the criteria does not
68  *        match, the failure path is taken and the rule does not match.
69  *
70  * Grouping
71  *
72  *        Filters can have groups, which have an effect of logical
73  *        disjunction, e.g.:
74  *
75  *                  A and B and (C or D)
76  *
77  *        In such case, the logic inside the group has to be inverted i.e.
78  *        the jump values swapped.  If the test value is true, then jump
79  *        out of the group; if false, then jump "next".  At the end of the
80  *        group, an additional failure path is appended and the JUMP_MAGIC
81  *        uses within the group are patched to jump past the said path.
82  *
83  *        For multi-word comparisons (IPv6 addresses), there is another
84  *        layer of grouping:
85  *
86  *                  A and B and ((C and D) or (E and F))
87  *
88  *        This strains the simple-minded JUMP_MAGIC logic, so for now,
89  *        when generating the jump-if-false targets for (C and D), we
90  *        simply count the number of instructions left to skip over.
91  *
92  *        A better architecture might be to create asm-type labels for
93  *        the jt and jf continuations in the first pass, and then, once
94  *        their offsets are determined, go back and fill them in in the
95  *        second pass.  This would simplify the logic (no need to compute
96  *        exactly how many instructions we're about to generate in a
97  *        chain of conditionals) and eliminate redundant RET #0
98  *        instructions which are currently generated after some groups.
99  */
100 
101 #include <sys/cdefs.h>
102 __RCSID("$NetBSD: npf_bpf_comp.c,v 1.17 2024/10/30 11:19:38 riastradh Exp $");
103 
104 #include <stdlib.h>
105 #include <stdbool.h>
106 #include <stddef.h>
107 #include <string.h>
108 #include <inttypes.h>
109 #include <err.h>
110 #include <assert.h>
111 
112 #include <netinet/in.h>
113 #include <netinet/in_systm.h>
114 #define   __FAVOR_BSD
115 #include <netinet/ip.h>
116 #include <netinet/ip6.h>
117 #include <netinet/udp.h>
118 #include <netinet/tcp.h>
119 #include <netinet/ip_icmp.h>
120 #include <netinet/icmp6.h>
121 
122 #include <net/bpf.h>
123 
124 #include "npfctl.h"
125 
/*
 * Code generation state bits kept in the 'flags' member of npf_bpf_t.
 *
 * Note: X_EQ_L4OFF has to be cleared whenever register X is invalidated,
 * i.e. whenever it holds something other than the L4 header offset.
 * Generally, that is the case when BPF_LDX is used.
 */
#define	FETCHED_L3		0x01	/* IP version check was emitted */
#define	CHECKED_L4_PROTO	0x02	/* L4 protocol check was emitted */
#define	X_EQ_L4OFF		0x04	/* register X == L4 header offset */
133 
134 struct npf_bpf {
135           /*
136            * BPF program code, the allocated length (in bytes), the number
137            * of logical blocks and the flags.
138            */
139           struct bpf_program  prog;
140           size_t                        alen;
141           unsigned            nblocks;
142           sa_family_t                   af;
143           uint32_t            flags;
144 
145           /*
146            * Indicators whether we are inside the group and whether this
147            * group is implementing inverted logic.
148            *
149            * The current group offset (counted in BPF instructions)
150            * and block number at the start of the group.
151            */
152           unsigned            ingroup;
153           bool                          invert;
154           bool                          multiword;
155           unsigned            goff;
156           unsigned            gblock;
157 
158           /* Track inversion (excl. mark). */
159           uint32_t            invflags;
160 
161           /* BPF marks, allocated length and the real length. */
162           uint32_t *                    marks;
163           size_t                        malen;
164           size_t                        mlen;
165 };
166 
/*
 * Values returned from the BPF program to NPF: success (rule matches)
 * and failure (rule does not match).
 */
#define	NPF_BPF_SUCCESS		((u_int)-1)
#define	NPF_BPF_FAILURE		0

/*
 * Placeholder jump offset marking the failure path; it is replaced with
 * the real offset on completion.  Note: since the jump offset is one byte
 * wide, this is the longest jump offset in BPF.
 */
#define	JUMP_MAGIC		0xff

/* Grow buffers in multiples of 64 bytes to reduce re-allocations. */
#define	ALLOC_MASK		(64 - 1)
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)

/* Fallback for systems where the headers do not provide it. */
#ifndef IPV6_VERSION
#define	IPV6_VERSION		0x60
#endif
186 
187 npf_bpf_t *
npfctl_bpf_create(void)188 npfctl_bpf_create(void)
189 {
190           return ecalloc(1, sizeof(npf_bpf_t));
191 }
192 
193 static void
fixup_jumps(npf_bpf_t * ctx,u_int start,u_int end,bool swap)194 fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
195 {
196           struct bpf_program *bp = &ctx->prog;
197 
198           for (u_int i = start; i < end; i++) {
199                     struct bpf_insn *insn = &bp->bf_insns[i];
200                     const u_int fail_off = end - i;
201                     bool seen_magic = false;
202 
203                     if (fail_off >= JUMP_MAGIC) {
204                               errx(EXIT_FAILURE, "BPF generation error: "
205                                   "the number of instructions is over the limit");
206                     }
207                     if (BPF_CLASS(insn->code) != BPF_JMP) {
208                               continue;
209                     }
210                     if (BPF_OP(insn->code) == BPF_JA) {
211                               /*
212                                * BPF_JA can be used to jump to the failure path.
213                                * If we are swapping i.e. inside the group, then
214                                * jump "next"; groups have a failure path appended
215                                * at their end.
216                                */
217                               if (insn->k == JUMP_MAGIC) {
218                                         insn->k = swap ? 0 : fail_off;
219                               }
220                               continue;
221                     }
222 
223                     /*
224                      * Fixup the "magic" value.  Swap only the "magic" jumps.
225                      */
226 
227                     if (insn->jt == JUMP_MAGIC) {
228                               insn->jt = fail_off;
229                               seen_magic = true;
230                     }
231                     if (insn->jf == JUMP_MAGIC) {
232                               insn->jf = fail_off;
233                               seen_magic = true;
234                     }
235 
236                     if (seen_magic && swap) {
237                               uint8_t jt = insn->jt;
238                               insn->jt = insn->jf;
239                               insn->jf = jt;
240                     }
241           }
242 }
243 
244 static void
add_insns(npf_bpf_t * ctx,struct bpf_insn * insns,size_t count)245 add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
246 {
247           struct bpf_program *bp = &ctx->prog;
248           size_t offset, len, reqlen;
249 
250           /* Note: bf_len is the count of instructions. */
251           offset = bp->bf_len * sizeof(struct bpf_insn);
252           len = count * sizeof(struct bpf_insn);
253 
254           /* Ensure the memory buffer for the program. */
255           reqlen = ALLOC_ROUND(offset + len);
256           if (reqlen > ctx->alen) {
257                     bp->bf_insns = erealloc(bp->bf_insns, reqlen);
258                     ctx->alen = reqlen;
259           }
260 
261           /* Add the code block. */
262           memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
263           bp->bf_len += count;
264 }
265 
266 static void
add_bmarks(npf_bpf_t * ctx,const uint32_t * m,size_t len)267 add_bmarks(npf_bpf_t *ctx, const uint32_t *m, size_t len)
268 {
269           size_t reqlen, nargs = m[1];
270 
271           if ((len / sizeof(uint32_t) - 2) != nargs) {
272                     errx(EXIT_FAILURE, "invalid BPF block description");
273           }
274           reqlen = ALLOC_ROUND(ctx->mlen + len);
275           if (reqlen > ctx->malen) {
276                     ctx->marks = erealloc(ctx->marks, reqlen);
277                     ctx->malen = reqlen;
278           }
279           memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
280           ctx->mlen += len;
281 }
282 
283 static void
done_block(npf_bpf_t * ctx,const uint32_t * m,size_t len)284 done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
285 {
286           add_bmarks(ctx, m, len);
287           ctx->nblocks++;
288 }
289 
290 struct bpf_program *
npfctl_bpf_complete(npf_bpf_t * ctx)291 npfctl_bpf_complete(npf_bpf_t *ctx)
292 {
293           struct bpf_program *bp = &ctx->prog;
294           const u_int retoff = bp->bf_len;
295 
296           /* No instructions (optimised out). */
297           if (!bp->bf_len)
298                     return NULL;
299 
300           /* Add the return fragment (success and failure paths). */
301           struct bpf_insn insns_ret[] = {
302                     BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
303                     BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
304           };
305           add_insns(ctx, insns_ret, __arraycount(insns_ret));
306 
307           /* Fixup all jumps to the main failure path. */
308           fixup_jumps(ctx, 0, retoff, false);
309 
310           return &ctx->prog;
311 }
312 
313 const void *
npfctl_bpf_bmarks(npf_bpf_t * ctx,size_t * len)314 npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
315 {
316           *len = ctx->mlen;
317           return ctx->marks;
318 }
319 
320 void
npfctl_bpf_destroy(npf_bpf_t * ctx)321 npfctl_bpf_destroy(npf_bpf_t *ctx)
322 {
323           free(ctx->prog.bf_insns);
324           free(ctx->marks);
325           free(ctx);
326 }
327 
328 /*
329  * npfctl_bpf_group_enter: begin a logical group.  It merely uses logical
330  * disjunction (OR) for comparisons within the group.
331  */
332 void
npfctl_bpf_group_enter(npf_bpf_t * ctx,bool invert)333 npfctl_bpf_group_enter(npf_bpf_t *ctx, bool invert)
334 {
335           struct bpf_program *bp = &ctx->prog;
336 
337           assert(ctx->goff == 0);
338           assert(ctx->gblock == 0);
339 
340           ctx->goff = bp->bf_len;
341           ctx->gblock = ctx->nblocks;
342           ctx->invert = invert;
343           ctx->multiword = false;
344           ctx->ingroup++;
345 }
346 
347 void
npfctl_bpf_group_exit(npf_bpf_t * ctx)348 npfctl_bpf_group_exit(npf_bpf_t *ctx)
349 {
350           struct bpf_program *bp = &ctx->prog;
351           const size_t curoff = bp->bf_len;
352 
353           assert(ctx->ingroup);
354           ctx->ingroup--;
355 
356           /*
357            * If we're not inverting, there were only zero or one options,
358            * and the last comparison was not a multi-word comparison
359            * requiring a fallthrough failure -- nothing to do.
360            */
361           if (!ctx->invert &&
362               (ctx->nblocks - ctx->gblock) <= 1 &&
363               !ctx->multiword) {
364                     ctx->goff = ctx->gblock = 0;
365                     return;
366           }
367 
368           /*
369            * If inverting, then prepend a jump over the statement below.
370            * On match, it will skip-through and the fail path will be taken.
371            */
372           if (ctx->invert) {
373                     struct bpf_insn insns_ret[] = {
374                               BPF_STMT(BPF_JMP+BPF_JA, 1),
375                     };
376                     add_insns(ctx, insns_ret, __arraycount(insns_ret));
377           }
378 
379           /*
380            * Append a failure return as a fall-through i.e. if there is
381            * no match within the group.
382            */
383           struct bpf_insn insns_ret[] = {
384                     BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
385           };
386           add_insns(ctx, insns_ret, __arraycount(insns_ret));
387 
388           /*
389            * Adjust jump offsets: on match - jump outside the group i.e.
390            * to the current offset.  Otherwise, jump to the next instruction
391            * which would lead to the fall-through code above if none matches.
392            */
393           fixup_jumps(ctx, ctx->goff, curoff, true);
394           ctx->goff = ctx->gblock = 0;
395 }
396 
397 static void
fetch_l3(npf_bpf_t * ctx,sa_family_t af,unsigned flags)398 fetch_l3(npf_bpf_t *ctx, sa_family_t af, unsigned flags)
399 {
400           unsigned ver;
401 
402           switch (af) {
403           case AF_INET:
404                     ver = IPVERSION;
405                     break;
406           case AF_INET6:
407                     ver = IPV6_VERSION >> 4;
408                     break;
409           case AF_UNSPEC:
410                     ver = 0;
411                     break;
412           default:
413                     abort();
414           }
415 
416           /*
417            * The memory store is populated with:
418            * - BPF_MW_IPVER: IP version (4 or 6).
419            * - BPF_MW_L4OFF: L4 header offset.
420            * - BPF_MW_L4PROTO: L4 protocol.
421            */
422           if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
423                     const uint8_t jt = ver ? 0 : JUMP_MAGIC;
424                     const uint8_t jf = ver ? JUMP_MAGIC : 0;
425                     const bool ingroup = ctx->ingroup != 0;
426                     const bool invert = ctx->invert;
427 
428                     /*
429                      * L3 block cannot be inserted in the middle of a group.
430                      * In fact, it never is.  Check and start the group after.
431                      */
432                     if (ingroup) {
433                               assert(ctx->nblocks == ctx->gblock);
434                               npfctl_bpf_group_exit(ctx);
435                     }
436 
437                     /*
438                      * A <- IP version; A == expected-version?
439                      * If no particular version specified, check for non-zero.
440                      */
441                     struct bpf_insn insns_af[] = {
442                               BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
443                               BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
444                     };
445                     add_insns(ctx, insns_af, __arraycount(insns_af));
446                     ctx->flags |= FETCHED_L3;
447                     ctx->af = af;
448 
449                     if (af) {
450                               uint32_t mwords[] = { BM_IPVER, 1, af };
451                               add_bmarks(ctx, mwords, sizeof(mwords));
452                     }
453                     if (ingroup) {
454                               npfctl_bpf_group_enter(ctx, invert);
455                     }
456 
457           } else if (af && af != ctx->af) {
458                     errx(EXIT_FAILURE, "address family mismatch");
459           }
460 
461           if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
462                     /* X <- IP header length */
463                     struct bpf_insn insns_hlen[] = {
464                               BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
465                     };
466                     add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
467                     ctx->flags |= X_EQ_L4OFF;
468           }
469 }
470 
471 static void
bm_invert_checkpoint(npf_bpf_t * ctx,const unsigned opts)472 bm_invert_checkpoint(npf_bpf_t *ctx, const unsigned opts)
473 {
474           uint32_t bm = 0;
475 
476           if (ctx->ingroup && ctx->invert) {
477                     const unsigned seen = ctx->invflags;
478 
479                     if ((opts & MATCH_SRC) != 0 && (seen & MATCH_SRC) == 0) {
480                               bm = BM_SRC_NEG;
481                     }
482                     if ((opts & MATCH_DST) != 0 && (seen & MATCH_DST) == 0) {
483                               bm = BM_DST_NEG;
484                     }
485                     ctx->invflags |= opts & (MATCH_SRC | MATCH_DST);
486           }
487           if (bm) {
488                     uint32_t mwords[] = { bm, 0 };
489                     add_bmarks(ctx, mwords, sizeof(mwords));
490           }
491 }
492 
493 /*
494  * npfctl_bpf_ipver: match the IP version.
495  */
496 void
npfctl_bpf_ipver(npf_bpf_t * ctx,sa_family_t af)497 npfctl_bpf_ipver(npf_bpf_t *ctx, sa_family_t af)
498 {
499           fetch_l3(ctx, af, 0);
500 }
501 
502 /*
503  * npfctl_bpf_proto: code block to match IP version and L4 protocol.
504  */
505 void
npfctl_bpf_proto(npf_bpf_t * ctx,unsigned proto)506 npfctl_bpf_proto(npf_bpf_t *ctx, unsigned proto)
507 {
508           struct bpf_insn insns_proto[] = {
509                     /* A <- L4 protocol; A == expected-protocol? */
510                     BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
511                     BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
512           };
513           add_insns(ctx, insns_proto, __arraycount(insns_proto));
514 
515           uint32_t mwords[] = { BM_PROTO, 1, proto };
516           done_block(ctx, mwords, sizeof(mwords));
517           ctx->flags |= CHECKED_L4_PROTO;
518 }
519 
520 /*
521  * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR.
522  *
523  * => IP address shall be in the network byte order.
524  */
525 void
npfctl_bpf_cidr(npf_bpf_t * ctx,unsigned opts,sa_family_t af,const npf_addr_t * addr,const npf_netmask_t mask)526 npfctl_bpf_cidr(npf_bpf_t *ctx, unsigned opts, sa_family_t af,
527     const npf_addr_t *addr, const npf_netmask_t mask)
528 {
529           const uint32_t *awords = (const uint32_t *)addr;
530           unsigned nwords, origlength, length, maxmask, off;
531 
532           assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
533           assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);
534 
535           switch (af) {
536           case AF_INET:
537                     maxmask = 32;
538                     off = (opts & MATCH_SRC) ?
539                         offsetof(struct ip, ip_src) :
540                         offsetof(struct ip, ip_dst);
541                     nwords = sizeof(struct in_addr) / sizeof(uint32_t);
542                     break;
543           case AF_INET6:
544                     maxmask = 128;
545                     off = (opts & MATCH_SRC) ?
546                         offsetof(struct ip6_hdr, ip6_src) :
547                         offsetof(struct ip6_hdr, ip6_dst);
548                     nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
549                     break;
550           default:
551                     abort();
552           }
553 
554           /* Ensure address family. */
555           fetch_l3(ctx, af, 0);
556 
557           length = origlength = (mask == NPF_NO_NETMASK) ? maxmask : mask;
558 
559           /* CAUTION: BPF operates in host byte-order. */
560           for (unsigned i = 0; i < nwords; i++) {
561                     const unsigned woff = i * sizeof(uint32_t);
562                     uint32_t word = ntohl(awords[i]);
563                     uint32_t wordmask;
564 
565                     if (length >= 32) {
566                               /* The mask is a full word - do not apply it. */
567                               wordmask = 0;
568                               length -= 32;
569                     } else if (length) {
570                               wordmask = 0xffffffff << (32 - length);
571                               length = 0;
572                     } else {
573                               /* The mask became zero - skip the rest. */
574                               break;
575                     }
576 
577                     /* A <- IP address (or one word of it) */
578                     struct bpf_insn insns_ip[] = {
579                               BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
580                     };
581                     add_insns(ctx, insns_ip, __arraycount(insns_ip));
582 
583                     /* A <- (A & MASK) */
584                     if (wordmask) {
585                               struct bpf_insn insns_mask[] = {
586                                         BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
587                               };
588                               add_insns(ctx, insns_mask, __arraycount(insns_mask));
589                     }
590 
591                     /*
592                      * Determine how many instructions we have to jump
593                      * ahead if the match fails.
594                      *
595                      * - If this is the last word, we jump to the final
596                  *   failure, JUMP_MAGIC.
597                      *
598                      * - If this is not the last word, we jump past the
599                      *   remaining instructions to match this sequence.
600                      *   Each 32-bit word in the sequence takes two
601                      *   instructions (BPF_LD and BPF_JMP).  If there is a
602                      *   partial-word mask ahead, there will be one
603                      *   additional instruction (BPF_ALU).
604                      */
605                     uint8_t jf;
606                     if (i + 1 == (origlength + 31)/32) {
607                               jf = JUMP_MAGIC;
608                     } else {
609                               jf = 2*((origlength + 31)/32 - i - 1);
610                               if (origlength % 32 != 0 && wordmask == 0)
611                                         jf += 1;
612                     }
613 
614                     /* A == expected-IP-word ? */
615                     struct bpf_insn insns_cmp[] = {
616                               BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, jf),
617                     };
618                     add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
619           }
620 
621           /*
622            * If we checked a chain of words in sequence, mark this as a
623            * multi-word comparison so if this is in a group there will be
624            * a fallthrough case.
625            *
626            * XXX This is a little silly; the compiler should really just
627            * record holes where conditional jumps need success/failure
628            * continuations, and go back to fill in the holes when the
629            * locations of the continuations are determined later.  But
630            * that requires restructuring this code a little more.
631            */
632           ctx->multiword = (origlength + 31)/32 > 1;
633 
634           uint32_t mwords[] = {
635                     (opts & MATCH_SRC) ? BM_SRC_CIDR: BM_DST_CIDR, 6,
636                     af, mask, awords[0], awords[1], awords[2], awords[3],
637           };
638           bm_invert_checkpoint(ctx, opts);
639           done_block(ctx, mwords, sizeof(mwords));
640 }
641 
642 /*
643  * npfctl_bpf_ports: code block to match TCP/UDP port range.
644  *
645  * => Port numbers shall be in the network byte order.
646  */
647 void
npfctl_bpf_ports(npf_bpf_t * ctx,unsigned opts,in_port_t from,in_port_t to)648 npfctl_bpf_ports(npf_bpf_t *ctx, unsigned opts, in_port_t from, in_port_t to)
649 {
650           const unsigned sport_off = offsetof(struct udphdr, uh_sport);
651           const unsigned dport_off = offsetof(struct udphdr, uh_dport);
652           unsigned off;
653 
654           /* TCP and UDP port offsets are the same. */
655           assert(sport_off == offsetof(struct tcphdr, th_sport));
656           assert(dport_off == offsetof(struct tcphdr, th_dport));
657           assert(ctx->flags & CHECKED_L4_PROTO);
658 
659           assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
660           off = (opts & MATCH_SRC) ? sport_off : dport_off;
661 
662           /* X <- IP header length */
663           fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
664 
665           struct bpf_insn insns_fetch[] = {
666                     /* A <- port */
667                     BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
668           };
669           add_insns(ctx, insns_fetch, __arraycount(insns_fetch));
670 
671           /* CAUTION: BPF operates in host byte-order. */
672           from = ntohs(from);
673           to = ntohs(to);
674 
675           if (from == to) {
676                     /* Single port case. */
677                     struct bpf_insn insns_port[] = {
678                               BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
679                     };
680                     add_insns(ctx, insns_port, __arraycount(insns_port));
681           } else {
682                     /* Port range case. */
683                     struct bpf_insn insns_range[] = {
684                               BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, 1),
685                               BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, 0, 1),
686                               BPF_STMT(BPF_JMP+BPF_JA, JUMP_MAGIC),
687                     };
688                     add_insns(ctx, insns_range, __arraycount(insns_range));
689           }
690 
691           uint32_t mwords[] = {
692                     (opts & MATCH_SRC) ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
693           };
694           done_block(ctx, mwords, sizeof(mwords));
695 }
696 
697 /*
698  * npfctl_bpf_tcpfl: code block to match TCP flags.
699  */
700 void
npfctl_bpf_tcpfl(npf_bpf_t * ctx,uint8_t tf,uint8_t tf_mask)701 npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask)
702 {
703           const unsigned tcpfl_off = offsetof(struct tcphdr, th_flags);
704           const bool usingmask = tf_mask != tf;
705 
706           /* X <- IP header length */
707           fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
708 
709           if ((ctx->flags & CHECKED_L4_PROTO) == 0) {
710                     const unsigned jf = usingmask ? 3 : 2;
711                     assert(ctx->ingroup == 0);
712 
713                     /*
714                      * A <- L4 protocol; A == TCP?  If not, jump out.
715                      *
716                      * Note: the TCP flag matching might be without 'proto tcp'
717                      * when using a plain 'stateful' rule.  In such case it also
718                      * handles other protocols, thus no strict TCP check.
719                      */
720                     struct bpf_insn insns_tcp[] = {
721                               BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
722                               BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
723                     };
724                     add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
725           }
726 
727           struct bpf_insn insns_tf[] = {
728                     /* A <- TCP flags */
729                     BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
730           };
731           add_insns(ctx, insns_tf, __arraycount(insns_tf));
732 
733           if (usingmask) {
734                     /* A <- (A & mask) */
735                     struct bpf_insn insns_mask[] = {
736                               BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
737                     };
738                     add_insns(ctx, insns_mask, __arraycount(insns_mask));
739           }
740 
741           struct bpf_insn insns_cmp[] = {
742                     /* A == expected-TCP-flags? */
743                     BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
744           };
745           add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
746 
747           uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
748           done_block(ctx, mwords, sizeof(mwords));
749 }
750 
751 /*
752  * npfctl_bpf_icmp: code block to match ICMP type and/or code.
753  * Note: suitable for both the ICMPv4 and ICMPv6.
754  */
755 void
npfctl_bpf_icmp(npf_bpf_t * ctx,int type,int code)756 npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
757 {
758           const u_int type_off = offsetof(struct icmp, icmp_type);
759           const u_int code_off = offsetof(struct icmp, icmp_code);
760 
761           assert(ctx->flags & CHECKED_L4_PROTO);
762           assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
763           assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
764           assert(type != -1 || code != -1);
765 
766           /* X <- IP header length */
767           fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
768 
769           if (type != -1) {
770                     struct bpf_insn insns_type[] = {
771                               BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
772                               BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
773                     };
774                     add_insns(ctx, insns_type, __arraycount(insns_type));
775 
776                     uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
777                     done_block(ctx, mwords, sizeof(mwords));
778           }
779 
780           if (code != -1) {
781                     struct bpf_insn insns_code[] = {
782                               BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
783                               BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
784                     };
785                     add_insns(ctx, insns_code, __arraycount(insns_code));
786 
787                     uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
788                     done_block(ctx, mwords, sizeof(mwords));
789           }
790 }
791 
792 #define   SRC_FLAG_BIT        (1U << 31)
793 
794 /*
795  * npfctl_bpf_table: code block to match source/destination IP address
796  * against NPF table specified by ID.
797  */
798 void
npfctl_bpf_table(npf_bpf_t * ctx,unsigned opts,unsigned tid)799 npfctl_bpf_table(npf_bpf_t *ctx, unsigned opts, unsigned tid)
800 {
801           const bool src = (opts & MATCH_SRC) != 0;
802 
803           struct bpf_insn insns_table[] = {
804                     BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
805                     BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
806                     BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
807           };
808           add_insns(ctx, insns_table, __arraycount(insns_table));
809 
810           uint32_t mwords[] = { src ? BM_SRC_TABLE: BM_DST_TABLE, 1, tid };
811           bm_invert_checkpoint(ctx, opts);
812           done_block(ctx, mwords, sizeof(mwords));
813 }
814