1 /*-
2  * Copyright (c) 2014 Sebastian Freundt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "archive_platform.h"
27 
28 /**
29  * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
30  * ISO 28500:2009.
31  * For the purposes of this file we used the final draft from:
32  * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
33  *
34  * Todo:
35  * [ ] real-world warcs can contain resources at endpoints ending in /
36  *     e.g. http://bibnum.bnf.fr/warc/
37  *     if you're lucky their response contains a Content-Location: header
38  *     pointing to a unix-compliant filename, in the example above it's
39  *     Content-Location: http://bibnum.bnf.fr/warc/index.html
40  *     however, that's not mandated and github for example doesn't follow
41  *     this convention.
42  *     We need a set of archive options to control what to do with
43  *     entries like these, at the moment care is taken to skip them.
44  *
45  **/
46 
47 #ifdef HAVE_SYS_STAT_H
48 #include <sys/stat.h>
49 #endif
50 #ifdef HAVE_ERRNO_H
51 #include <errno.h>
52 #endif
53 #ifdef HAVE_STDLIB_H
54 #include <stdlib.h>
55 #endif
56 #ifdef HAVE_STRING_H
57 #include <string.h>
58 #endif
59 #ifdef HAVE_LIMITS_H
60 #include <limits.h>
61 #endif
62 #ifdef HAVE_CTYPE_H
63 #include <ctype.h>
64 #endif
65 #ifdef HAVE_TIME_H
66 #include <time.h>
67 #endif
68 
69 #include "archive.h"
70 #include "archive_entry.h"
71 #include "archive_private.h"
72 #include "archive_read_private.h"
73 
/*
 * Record types a WARC-Type header can name.
 * Of these, the type reader in this file only ever reports WT_RSRC and
 * WT_RSP; anything else comes back as WT_NONE.
 */
typedef enum {
	WT_NONE,	/* no type found, or one we do not recognise */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type */
} warc_type_t;
95 
/*
 * Read-only, length-counted string view.
 * STR is not required to be NUL-terminated; LEN is the number of valid bytes.
 */
typedef struct {
	/* number of bytes at STR */
	size_t len;
	/* first byte of the string, NULL when LEN is 0 */
	const char *str;
} warc_string_t;
100 
/*
 * Writable, length-counted buffer; used as the file-name string pool.
 */
typedef struct {
	/* allocated size of STR in bytes */
	size_t len;
	/* the buffer itself */
	char *str;
} warc_strbuf_t;
105 
106 struct warc_s {
107           /* content length ahead */
108           size_t cntlen;
109           /* and how much we've processed so far */
110           size_t cntoff;
111           /* and how much we need to consume between calls */
112           size_t unconsumed;
113 
114           /* string pool */
115           warc_strbuf_t pool;
116           /* previous version */
117           unsigned int pver;
118           /* stringified format name */
119           struct archive_string sver;
120 };
121 
122 static int _warc_bid(struct archive_read *a, int);
123 static int _warc_cleanup(struct archive_read *a);
124 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
125 static int _warc_skip(struct archive_read *a);
126 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
127 
128 /* private routines */
129 static unsigned int _warc_rdver(const char *buf, size_t bsz);
130 static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
131 static warc_string_t _warc_rduri(const char *buf, size_t bsz);
132 static ssize_t _warc_rdlen(const char *buf, size_t bsz);
133 static time_t _warc_rdrtm(const char *buf, size_t bsz);
134 static time_t _warc_rdmtm(const char *buf, size_t bsz);
135 static const char *_warc_find_eoh(const char *buf, size_t bsz);
136 static const char *_warc_find_eol(const char *buf, size_t bsz);
137 
138 int
archive_read_support_format_warc(struct archive * _a)139 archive_read_support_format_warc(struct archive *_a)
140 {
141           struct archive_read *a = (struct archive_read *)_a;
142           struct warc_s *w;
143           int r;
144 
145           archive_check_magic(_a, ARCHIVE_READ_MAGIC,
146               ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
147 
148           if ((w = calloc(1, sizeof(*w))) == NULL) {
149                     archive_set_error(&a->archive, ENOMEM,
150                         "Can't allocate warc data");
151                     return (ARCHIVE_FATAL);
152           }
153 
154           r = __archive_read_register_format(
155                     a, w, "warc",
156                     _warc_bid, NULL, _warc_rdhdr, _warc_read,
157                     _warc_skip, NULL, _warc_cleanup, NULL, NULL);
158 
159           if (r != ARCHIVE_OK) {
160                     free(w);
161                     return (r);
162           }
163           return (ARCHIVE_OK);
164 }
165 
166 static int
_warc_cleanup(struct archive_read * a)167 _warc_cleanup(struct archive_read *a)
168 {
169           struct warc_s *w = a->format->data;
170 
171           if (w->pool.len > 0U) {
172                     free(w->pool.str);
173           }
174           archive_string_free(&w->sver);
175           free(w);
176           a->format->data = NULL;
177           return (ARCHIVE_OK);
178 }
179 
/*
 * Bid callback: decide whether the input looks like a WARC file.
 *
 * Peeks at the first 12 bytes and parses them as a "WARC/x.y" version
 * line.  Returns 64 when the version lies within 0.12 .. 1.0 and -1
 * otherwise (including when fewer than 12 bytes are available).
 * BEST_BID is not used.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* the very first line of the file should already be a record,
	 * and our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* we only support WARC 0.12 to 1.0 */
	version = _warc_rdver(probe, avail);
	if (version >= 1200U && version <= 10000U) {
		/* be confident */
		return (64);
	}
	return -1;
}
208 
209 static int
_warc_rdhdr(struct archive_read * a,struct archive_entry * entry)210 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
211 {
212 #define HDR_PROBE_LEN                   (12U)
213           struct warc_s *w = a->format->data;
214           unsigned int ver;
215           const char *buf;
216           ssize_t nrd;
217           const char *eoh;
218           char *tmp;
219           /* for the file name, saves some strndup()'ing */
220           warc_string_t fnam;
221           /* warc record type, not that we really use it a lot */
222           warc_type_t ftyp;
223           /* content-length+error monad */
224           ssize_t cntlen;
225           /* record time is the WARC-Date time we reinterpret it as ctime */
226           time_t rtime;
227           /* mtime is the Last-Modified time which will be the entry's mtime */
228           time_t mtime;
229 
230 start_over:
231           /* just use read_ahead() they keep track of unconsumed
232            * bits and bobs for us; no need to put an extra shift in
233            * and reproduce that functionality here */
234           buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235 
236           if (nrd < 0) {
237                     /* no good */
238                     archive_set_error(
239                               &a->archive, ARCHIVE_ERRNO_MISC,
240                               "Bad record header");
241                     return (ARCHIVE_FATAL);
242           } else if (buf == NULL) {
243                     /* there should be room for at least WARC/bla\r\n
244                      * must be EOF therefore */
245                     return (ARCHIVE_EOF);
246           }
247           /* looks good so far, try and find the end of the header now */
248           eoh = _warc_find_eoh(buf, nrd);
249           if (eoh == NULL) {
250                     /* still no good, the header end might be beyond the
251                      * probe we've requested, but then again who'd cram
252                      * so much stuff into the header *and* be 28500-compliant */
253                     archive_set_error(
254                               &a->archive, ARCHIVE_ERRNO_MISC,
255                               "Bad record header");
256                     return (ARCHIVE_FATAL);
257           }
258           ver = _warc_rdver(buf, eoh - buf);
259           /* we currently support WARC 0.12 to 1.0 */
260           if (ver == 0U) {
261                     archive_set_error(
262                               &a->archive, ARCHIVE_ERRNO_MISC,
263                               "Invalid record version");
264                     return (ARCHIVE_FATAL);
265           } else if (ver < 1200U || ver > 10000U) {
266                     archive_set_error(
267                               &a->archive, ARCHIVE_ERRNO_MISC,
268                               "Unsupported record version: %u.%u",
269                               ver / 10000, (ver % 10000) / 100);
270                     return (ARCHIVE_FATAL);
271           }
272           cntlen = _warc_rdlen(buf, eoh - buf);
273           if (cntlen < 0) {
274                     /* nightmare!  the specs say content-length is mandatory
275                      * so I don't feel overly bad stopping the reader here */
276                     archive_set_error(
277                               &a->archive, EINVAL,
278                               "Bad content length");
279                     return (ARCHIVE_FATAL);
280           }
281           rtime = _warc_rdrtm(buf, eoh - buf);
282           if (rtime == (time_t)-1) {
283                     /* record time is mandatory as per WARC/1.0,
284                      * so just barf here, fast and loud */
285                     archive_set_error(
286                               &a->archive, EINVAL,
287                               "Bad record time");
288                     return (ARCHIVE_FATAL);
289           }
290 
291           /* let the world know we're a WARC archive */
292           a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293           if (ver != w->pver) {
294                     /* stringify this entry's version */
295                     archive_string_sprintf(&w->sver,
296                               "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297                     /* remember the version */
298                     w->pver = ver;
299           }
300           /* start off with the type */
301           ftyp = _warc_rdtyp(buf, eoh - buf);
302           /* and let future calls know about the content */
303           w->cntlen = cntlen;
304           w->cntoff = 0U;
305           mtime = 0;/* Avoid compiling error on some platform. */
306 
307           switch (ftyp) {
308           case WT_RSRC:
309           case WT_RSP:
310                     /* only try and read the filename in the cases that are
311                      * guaranteed to have one */
312                     fnam = _warc_rduri(buf, eoh - buf);
313                     /* check the last character in the URI to avoid creating
314                      * directory endpoints as files, see Todo above */
315                     if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316                               /* break here for now */
317                               fnam.len = 0U;
318                               fnam.str = NULL;
319                               break;
320                     }
321                     /* bang to our string pool, so we save a
322                      * malloc()+free() roundtrip */
323                     if (fnam.len + 1U > w->pool.len) {
324                               w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325                               tmp = realloc(w->pool.str, w->pool.len);
326                               if (tmp == NULL) {
327                                         archive_set_error(
328                                                   &a->archive, ENOMEM,
329                                                   "Out of memory");
330                                         return (ARCHIVE_FATAL);
331                               }
332                               w->pool.str = tmp;
333                     }
334                     memcpy(w->pool.str, fnam.str, fnam.len);
335                     w->pool.str[fnam.len] = '\0';
336                     /* let no one else know about the pool, it's a secret, shhh */
337                     fnam.str = w->pool.str;
338 
339                     /* snarf mtime or deduce from rtime
340                      * this is a custom header added by our writer, it's quite
341                      * hard to believe anyone else would go through with it
342                      * (apart from being part of some http responses of course) */
343                     if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
344                               mtime = rtime;
345                     }
346                     break;
347           case WT_NONE:
348           case WT_INFO:
349           case WT_META:
350           case WT_REQ:
351           case WT_RVIS:
352           case WT_CONV:
353           case WT_CONT:
354           case LAST_WT:
355           default:
356                     fnam.len = 0U;
357                     fnam.str = NULL;
358                     break;
359           }
360 
361           /* now eat some of those delicious buffer bits */
362           __archive_read_consume(a, eoh - buf);
363 
364           switch (ftyp) {
365           case WT_RSRC:
366           case WT_RSP:
367                     if (fnam.len > 0U) {
368                               /* populate entry object */
369                               archive_entry_set_filetype(entry, AE_IFREG);
370                               archive_entry_copy_pathname(entry, fnam.str);
371                               archive_entry_set_size(entry, cntlen);
372                               archive_entry_set_perm(entry, 0644);
373                               /* rtime is the new ctime, mtime stays mtime */
374                               archive_entry_set_ctime(entry, rtime, 0L);
375                               archive_entry_set_mtime(entry, mtime, 0L);
376                               break;
377                     }
378                     /* FALLTHROUGH */
379           case WT_NONE:
380           case WT_INFO:
381           case WT_META:
382           case WT_REQ:
383           case WT_RVIS:
384           case WT_CONV:
385           case WT_CONT:
386           case LAST_WT:
387           default:
388                     /* consume the content and start over */
389                     _warc_skip(a);
390                     goto start_over;
391           }
392           return (ARCHIVE_OK);
393 }
394 
395 static int
_warc_read(struct archive_read * a,const void ** buf,size_t * bsz,int64_t * off)396 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
397 {
398           struct warc_s *w = a->format->data;
399           const char *rab;
400           ssize_t nrd;
401 
402           if (w->cntoff >= w->cntlen) {
403           eof:
404                     /* it's our lucky day, no work, we can leave early */
405                     *buf = NULL;
406                     *bsz = 0U;
407                     *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
408                     w->unconsumed = 0U;
409                     return (ARCHIVE_EOF);
410           }
411 
412           if (w->unconsumed) {
413                     __archive_read_consume(a, w->unconsumed);
414                     w->unconsumed = 0U;
415           }
416 
417           rab = __archive_read_ahead(a, 1U, &nrd);
418           if (nrd < 0) {
419                     *bsz = 0U;
420                     /* big catastrophe */
421                     return (int)nrd;
422           } else if (nrd == 0) {
423                     goto eof;
424           } else if ((size_t)nrd > w->cntlen - w->cntoff) {
425                     /* clamp to content-length */
426                     nrd = w->cntlen - w->cntoff;
427           }
428           *off = w->cntoff;
429           *bsz = nrd;
430           *buf = rab;
431 
432           w->cntoff += nrd;
433           w->unconsumed = (size_t)nrd;
434           return (ARCHIVE_OK);
435 }
436 
437 static int
_warc_skip(struct archive_read * a)438 _warc_skip(struct archive_read *a)
439 {
440           struct warc_s *w = a->format->data;
441 
442           __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
443           w->cntlen = 0U;
444           w->cntoff = 0U;
445           return (ARCHIVE_OK);
446 }
447 
448 
449 /* private routines */
/*
 * Strip the const qualifier off C by round-tripping the pointer through
 * uintptr_t, which keeps compilers from warning about the cast.
 */
static void*
deconst(const void *c)
{
	const uintptr_t bits = (uintptr_t)c;

	return (void *)bits;
}
455 
/*
 * Find the first occurrence of the NEEDLESIZE bytes at NEEDLE within the
 * HAYSIZE bytes at HAY; neither needs to be NUL-terminated.
 *
 * Returns a pointer to the match, HAY itself for an empty needle, or NULL
 * when there is no match.  The search keeps a rolling XOR of the current
 * window and only falls back to memcmp() when it equals the needle's XOR.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *hcur;
	const char *ncur;
	const char *win;
	unsigned int win_x;
	unsigned int needle_x;
	unsigned int same;

	/* a 0-sized needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* jump to the first spot that begins with *NEEDLE */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* First characters of haystack and needle are the same now.
	 * Fold the needle and the first window of the haystack into their
	 * XOR checksums, noting on the way whether they are equal. */
	win_x = *hay;
	needle_x = *hay;
	same = 1U;
	hcur = hay + 1U;
	ncur = needle + 1U;
	while (hcur < hay_end && ncur < needle_end) {
		win_x ^= *hcur;
		needle_x ^= *ncur;
		same &= *hcur == *ncur;
		hcur++;
		ncur++;
	}

	/* HCUR now references the (NEEDLESIZE + 1)-th character. */
	if (ncur < needle_end) {
		/* what is left of the haystack is shorter than the needle */
		return NULL;
	}
	if (same) {
		/* the very first window matched */
		return deconst(hay);
	}

	/* slide the window over the rest of the haystack */
	win = hay;
	while (hcur < hay_end) {
		/* drop the byte leaving the window, add the one entering */
		win_x ^= *win++;
		win_x ^= *hcur;

		/* With equal checksums it is enough to compare
		 * NEEDLESIZE - 1 characters, the last one is implied;
		 * WIN is by design < HCUR, so no range checks needed */
		if (win_x == needle_x &&
		    memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		hcur++;
	}
	return NULL;
}
513 
/*
 * Parse an unsigned decimal number at STR, reading no more digits than
 * the upper limit ULIM has (two digits when ULIM is 10 or less) and
 * stopping early once another digit would exceed ULIM.
 *
 * *EP is set to the first character not read.  Returns the value when it
 * lies within [LLIM, ULIM], -1 when STR does not start with a digit, and
 * -2 when the value is out of range.
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	int value = 0;
	/* dividing this by 10 per digit read bounds the number of digits */
	int budget = (ulim > 10) ? ulim : 10;

	while (value * 10 <= ulim && budget != 0 &&
	    *cur >= '0' && *cur <= '9') {
		value = value * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}
	*ep = cur;

	if (cur == str) {
		/* not a single digit */
		return -1;
	}
	if (value < llim || value > ulim) {
		return -2;
	}
	return value;
}
536 
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses _mkgmtime() or timegm() where the build says they exist.  The
 * fallback calls mktime() only to have tm_yday filled in (T is modified by
 * that call) and then computes the result from first principles; it
 * returns (time_t)-1 when mktime() fails.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE__MKGMTIME
	return _mkgmtime(t);
#elif HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t before multiplying: in plain int
	 * the year term alone overflows from 2039 onwards. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
561 
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ, with optional leading blanks and tabs.
 *
 * Returns the time stamp in seconds since the epoch, or (time_t)-1 when
 * S does not parse.  If ENDPTR is not NULL it receives the position at
 * which parsing stopped; after a complete parse that is the character
 * following the 'Z'.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	/* value range and trailing separator of each field, in the order
	 * year, month, day-of-month, hour, minute, second */
	static const struct {
		int lo;
		int hi;
		char sep;
	} fld[] = {
		{1583, 4095, '-'},
		{1, 12, '-'},
		{1, 31, 'T'},
		{0, 23, ':'},
		{0, 59, ':'},
		{0, 60, 'Z'}
	};
	int val[sizeof(fld) / sizeof(fld[0])];
	struct tm tm;
	time_t res = (time_t)-1;
	size_t i;

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	while (*s == ' ' || *s == '\t')
		++s;

	for (i = 0U; i < sizeof(fld) / sizeof(fld[0]); i++) {
		val[i] = strtoi_lim(s, &s, fld[i].lo, fld[i].hi);
		if (val[i] < 0) {
			goto out;
		}
		/* note that S moves past the separator slot even when the
		 * character found there is the wrong one */
		if (*s++ != fld[i].sep) {
			goto out;
		}
	}

	/* start from a clean tm and massage the values to fulfill
	 * some of POSIX' constraints */
	memset(&tm, 0, sizeof(tm));
	tm.tm_year = val[0] - 1900;
	tm.tm_mon = val[1] - 1;
	tm.tm_mday = val[2];
	tm.tm_hour = val[3];
	tm.tm_min = val[4];
	tm.tm_sec = val[5];

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
615 
/*
 * Parse the "WARC/x.y" or "WARC/x.yy" version line at the start of BUF.
 *
 * Returns major * 10000 plus the minor part, where a one-digit minor
 * counts in hundreds and a two-digit minor counts its first digit in
 * thousands and its second in hundreds (so 1.0 is 10000 and 0.12 is
 * 1200).  Returns 0 when BSZ is below 12, the magic or the digits are
 * missing, or the version is not terminated properly: CRLF for results
 * of 1200 and up, a blank or tab below that.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t magic_len = sizeof(magic) - 1U;
	const char *num;
	const char *term;
	unsigned int minor;
	unsigned int ver;

	if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	num = buf + magic_len;

	/* digit, dot, digit is the least we expect */
	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U])) {
		return 0U;
	}

	/* we support a maximum of 2 digits in the minor version */
	if (isdigit((unsigned char)num[3U])) {
		minor = (num[2U] - '0') * 1000U;
		minor += (num[3U] - '0') * 100U;
		term = num + 4U;
	} else {
		minor = (num[2U] - '0') * 100U;
		term = num + 3U;
	}
	ver = (num[0U] - '0') * 10000U + minor;

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (ver >= 1200U) {
		if (memcmp(term, "\r\n", 2U) != 0) {
			return 0U;
		}
	} else if (*term != ' ' && *term != '\t') {
		return 0U;
	}
	return ver;
}
660 
661 static unsigned int
_warc_rdtyp(const char * buf,size_t bsz)662 _warc_rdtyp(const char *buf, size_t bsz)
663 {
664           static const char _key[] = "\r\nWARC-Type:";
665           const char *val, *eol;
666 
667           if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
668                     /* no bother */
669                     return WT_NONE;
670           }
671           val += sizeof(_key) - 1U;
672           if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
673                     /* no end of line */
674                     return WT_NONE;
675           }
676 
677           /* overread whitespace */
678           while (val < eol && (*val == ' ' || *val == '\t'))
679                     ++val;
680 
681           if (val + 8U == eol) {
682                     if (memcmp(val, "resource", 8U) == 0)
683                               return WT_RSRC;
684                     else if (memcmp(val, "response", 8U) == 0)
685                               return WT_RSP;
686           }
687           return WT_NONE;
688 }
689 
690 static warc_string_t
_warc_rduri(const char * buf,size_t bsz)691 _warc_rduri(const char *buf, size_t bsz)
692 {
693           static const char _key[] = "\r\nWARC-Target-URI:";
694           const char *val, *uri, *eol, *p;
695           warc_string_t res = {0U, NULL};
696 
697           if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
698                     /* no bother */
699                     return res;
700           }
701           /* overread whitespace */
702           val += sizeof(_key) - 1U;
703           if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
704                     /* no end of line */
705                     return res;
706           }
707 
708           while (val < eol && (*val == ' ' || *val == '\t'))
709                     ++val;
710 
711           /* overread URL designators */
712           if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
713                     /* not touching that! */
714                     return res;
715           }
716 
717           /* spaces inside uri are not allowed, CRLF should follow */
718           for (p = val; p < eol; p++) {
719                     if (isspace((unsigned char)*p))
720                               return res;
721           }
722 
723           /* there must be at least space for ftp */
724           if (uri < (val + 3U))
725                     return res;
726 
727           /* move uri to point to after :// */
728           uri += 3U;
729 
730           /* now then, inspect the URI */
731           if (memcmp(val, "file", 4U) == 0) {
732                     /* perfect, nothing left to do here */
733 
734           } else if (memcmp(val, "http", 4U) == 0 ||
735                        memcmp(val, "ftp", 3U) == 0) {
736                     /* overread domain, and the first / */
737                     while (uri < eol && *uri++ != '/');
738           } else {
739                     /* not sure what to do? best to bugger off */
740                     return res;
741           }
742           res.str = uri;
743           res.len = eol - uri;
744           return res;
745 }
746 
/*
 * Read the Content-Length header out of the BSZ header bytes at BUF.
 *
 * Returns the length, or -1 when the header or its line end is missing,
 * the value does not start with a digit, strtol() reports an error, or
 * anything but the line end follows the number.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char hdr_key[] = "\r\nContent-Length:";
	const char *cur;
	const char *line_end;
	char *num_end = NULL;
	long int len;

	cur = xmemmem(buf, bsz, hdr_key, sizeof(hdr_key) - 1U);
	if (cur == NULL) {
		/* no such header */
		return -1;
	}
	cur += sizeof(hdr_key) - 1U;

	line_end = _warc_find_eol(cur, buf + bsz - cur);
	if (line_end == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; cur < line_end && (*cur == ' ' || *cur == '\t'); cur++)
		;

	/* there must be at least one digit */
	if (!isdigit((unsigned char)*cur)) {
		return -1;
	}

	errno = 0;
	len = strtol(cur, &num_end, 10);
	if (errno != 0) {
		return -1;
	}
	if (num_end != line_end) {
		/* line must end here */
		return -1;
	}

	return (size_t)len;
}
780 
/*
 * Read the record time, i.e. the WARC-Date header, out of the BSZ header
 * bytes at BUF.
 *
 * Returns the time stamp, or (time_t)-1 when the header or its line end
 * is missing, or the ISO 8601 parser did not stop exactly at the line end.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char hdr_key[] = "\r\nWARC-Date:";
	const char *cur;
	const char *line_end;
	char *stamp_end = NULL;
	time_t stamp;

	cur = xmemmem(buf, bsz, hdr_key, sizeof(hdr_key) - 1U);
	if (cur == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	cur += sizeof(hdr_key) - 1U;

	line_end = _warc_find_eol(cur, buf + bsz - cur);
	if (line_end == NULL) {
		/* no end of line */
		return -1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(cur, &stamp_end);
	if (stamp_end == line_end) {
		return stamp;
	}
	/* line must end right after the time stamp */
	return -1;
}
807 
/*
 * Read the modification time, i.e. the Last-Modified header, out of the
 * BSZ header bytes at BUF.
 *
 * Returns the time stamp, or (time_t)-1 when the header or its line end
 * is missing, or the ISO 8601 parser did not stop exactly at the line end.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char hdr_key[] = "\r\nLast-Modified:";
	const char *cur;
	const char *line_end;
	char *stamp_end = NULL;
	time_t stamp;

	cur = xmemmem(buf, bsz, hdr_key, sizeof(hdr_key) - 1U);
	if (cur == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	cur += sizeof(hdr_key) - 1U;

	line_end = _warc_find_eol(cur, buf + bsz - cur);
	if (line_end == NULL) {
		/* no end of line */
		return -1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(cur, &stamp_end);
	if (stamp_end == line_end) {
		return stamp;
	}
	/* line must end right after the time stamp */
	return -1;
}
834 
/*
 * Find the end of the record header within the BSZ bytes at BUF.
 * Returns a pointer just past the first CRLF CRLF, or NULL if there is none.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char eoh_mark[] = "\r\n\r\n";
	const size_t mark_len = sizeof(eoh_mark) - 1U;
	const char *at = xmemmem(buf, bsz, eoh_mark, mark_len);

	/* the marker itself still belongs to the header */
	return (at == NULL) ? NULL : at + mark_len;
}
846 
/*
 * Find the end of the line within the BSZ bytes at BUF.
 * Returns a pointer to the first CRLF, or NULL if there is none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char eol_mark[] = "\r\n";

	return xmemmem(buf, bsz, eol_mark, sizeof(eol_mark) - 1U);
}
855 /* archive_read_support_format_warc.c ends here */
856