1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regex_internal.c,v 1.2 2016/05/17 14:00:09 christos Exp $");
21 
22 
23 static void re_string_construct_common (const char *str, Idx len,
24                                                   re_string_t *pstr,
25                                                   REG_TRANSLATE_TYPE trans, bool icase,
26                                                   const re_dfa_t *dfa) internal_function;
27 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28                                                     const re_node_set *nodes,
29                                                     re_hashval_t hash) internal_function;
30 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31                                                     const re_node_set *nodes,
32                                                     unsigned int context,
33                                                     re_hashval_t hash) internal_function;
34 
35 /* Functions for string operation.  */
36 
37 /* This function allocate the buffers.  It is necessary to call
38    re_string_reconstruct before using the object.  */
39 
40 static reg_errcode_t
41 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)42 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43                         REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44 {
45   reg_errcode_t ret;
46   Idx init_buf_len;
47 
48   /* Ensure at least one character fits into the buffers.  */
49   if (init_len < dfa->mb_cur_max)
50     init_len = dfa->mb_cur_max;
51   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52   re_string_construct_common (str, len, pstr, trans, icase, dfa);
53 
54   ret = re_string_realloc_buffers (pstr, init_buf_len);
55   if (BE (ret != REG_NOERROR, 0))
56     return ret;
57 
58   pstr->word_char = dfa->word_char;
59   pstr->word_ops_used = dfa->word_ops_used;
60   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62   pstr->valid_raw_len = pstr->valid_len;
63   return REG_NOERROR;
64 }
65 
66 /* This function allocate the buffers, and initialize them.  */
67 
68 static reg_errcode_t
69 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)70 re_string_construct (re_string_t *pstr, const char *str, Idx len,
71                          REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72 {
73   reg_errcode_t ret;
74   memset (pstr, '\0', sizeof (re_string_t));
75   re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 
77   if (len > 0)
78     {
79       ret = re_string_realloc_buffers (pstr, len + 1);
80       if (BE (ret != REG_NOERROR, 0))
81           return ret;
82     }
83   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84 
85   if (icase)
86     {
87 #ifdef RE_ENABLE_I18N
88       if (dfa->mb_cur_max > 1)
89           {
90             while (1)
91               {
92                 ret = build_wcs_upper_buffer (pstr);
93                 if (BE (ret != REG_NOERROR, 0))
94                     return ret;
95                 if (pstr->valid_raw_len >= len)
96                     break;
97                 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98                     break;
99                 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100                 if (BE (ret != REG_NOERROR, 0))
101                     return ret;
102               }
103           }
104       else
105 #endif /* RE_ENABLE_I18N  */
106           build_upper_buffer (pstr);
107     }
108   else
109     {
110 #ifdef RE_ENABLE_I18N
111       if (dfa->mb_cur_max > 1)
112           build_wcs_buffer (pstr);
113       else
114 #endif /* RE_ENABLE_I18N  */
115           {
116             if (trans != NULL)
117               re_string_translate_buffer (pstr);
118             else
119               {
120                 pstr->valid_len = pstr->bufs_len;
121                 pstr->valid_raw_len = pstr->bufs_len;
122               }
123           }
124     }
125 
126   return REG_NOERROR;
127 }
128 
129 /* Helper functions for re_string_allocate, and re_string_construct.  */
130 
131 static reg_errcode_t
132 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)133 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134 {
135 #ifdef RE_ENABLE_I18N
136   if (pstr->mb_cur_max > 1)
137     {
138       wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
139       if (BE (new_wcs == NULL, 0))
140           return REG_ESPACE;
141       pstr->wcs = new_wcs;
142       if (pstr->offsets != NULL)
143           {
144             Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
145             if (BE (new_offsets == NULL, 0))
146               return REG_ESPACE;
147             pstr->offsets = new_offsets;
148           }
149     }
150 #endif /* RE_ENABLE_I18N  */
151   if (pstr->mbs_allocated)
152     {
153       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
154                                                      new_buf_len);
155       if (BE (new_mbs == NULL, 0))
156           return REG_ESPACE;
157       pstr->mbs = new_mbs;
158     }
159   pstr->bufs_len = new_buf_len;
160   return REG_NOERROR;
161 }
162 
163 
164 static void
165 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)166 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
167                                   REG_TRANSLATE_TYPE trans, bool icase,
168                                   const re_dfa_t *dfa)
169 {
170   pstr->raw_mbs = (const unsigned char *) str;
171   pstr->len = len;
172   pstr->raw_len = len;
173   pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
174   pstr->icase = icase;
175   pstr->mbs_allocated = (trans != NULL || icase);
176   pstr->mb_cur_max = dfa->mb_cur_max;
177   pstr->is_utf8 = dfa->is_utf8;
178   pstr->map_notascii = dfa->map_notascii;
179   pstr->stop = pstr->len;
180   pstr->raw_stop = pstr->stop;
181 }
182 
183 #ifdef RE_ENABLE_I18N
184 
185 /* Build wide character buffer PSTR->WCS.
186    If the byte sequence of the string are:
187      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
188    Then wide character buffer will be:
189      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
190    We use WEOF for padding, they indicate that the position isn't
191    a first byte of a multibyte character.
192 
193    Note that this function assumes PSTR->VALID_LEN elements are already
194    built and starts from PSTR->VALID_LEN.  */
195 
196 static void
197 internal_function
build_wcs_buffer(re_string_t * pstr)198 build_wcs_buffer (re_string_t *pstr)
199 {
200 #ifdef _LIBC
201   unsigned char buf[MB_LEN_MAX];
202   assert (MB_LEN_MAX >= pstr->mb_cur_max);
203 #else
204   unsigned char buf[64];
205 #endif
206   mbstate_t prev_st;
207   Idx byte_idx, end_idx, remain_len;
208   size_t mbclen;
209 
210   /* Build the buffers from pstr->valid_len to either pstr->len or
211      pstr->bufs_len.  */
212   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
213   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
214     {
215       wchar_t wc;
216       const char *p;
217 
218       remain_len = end_idx - byte_idx;
219       prev_st = pstr->cur_state;
220       /* Apply the translation if we need.  */
221       if (BE (pstr->trans != NULL, 0))
222           {
223             int i, ch;
224 
225             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
226               {
227                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
228                 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
229               }
230             p = (const char *) buf;
231           }
232       else
233           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
234       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
235       if (BE (mbclen == (size_t) -2, 0))
236           {
237             /* The buffer doesn't have enough space, finish to build.  */
238             pstr->cur_state = prev_st;
239             break;
240           }
241       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
242           {
243             /* We treat these cases as a singlebyte character.  */
244             mbclen = 1;
245             wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246             if (BE (pstr->trans != NULL, 0))
247               wc = pstr->trans[wc];
248             pstr->cur_state = prev_st;
249           }
250 
251       /* Write wide character and padding.  */
252       pstr->wcs[byte_idx++] = wc;
253       /* Write paddings.  */
254       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
255           pstr->wcs[byte_idx++] = WEOF;
256     }
257   pstr->valid_len = byte_idx;
258   pstr->valid_raw_len = byte_idx;
259 }
260 
261 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
262    but for REG_ICASE.  */
263 
264 static reg_errcode_t
265 internal_function
build_wcs_upper_buffer(re_string_t * pstr)266 build_wcs_upper_buffer (re_string_t *pstr)
267 {
268   mbstate_t prev_st;
269   Idx src_idx, byte_idx, end_idx, remain_len;
270   size_t mbclen;
271 #ifdef _LIBC
272   char buf[MB_LEN_MAX];
273   assert (MB_LEN_MAX >= pstr->mb_cur_max);
274 #else
275   char buf[64];
276 #endif
277 
278   byte_idx = pstr->valid_len;
279   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
280 
281   /* The following optimization assumes that ASCII characters can be
282      mapped to wide characters with a simple cast.  */
283   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
284     {
285       while (byte_idx < end_idx)
286           {
287             wchar_t wc;
288 
289             if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
290                 && mbsinit (&pstr->cur_state))
291               {
292                 /* In case of a singlebyte character.  */
293                 pstr->mbs[byte_idx]
294                     = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
295                 /* The next step uses the assumption that wchar_t is encoded
296                      ASCII-safe: all ASCII values can be converted like this.  */
297                 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
298                 ++byte_idx;
299                 continue;
300               }
301 
302             remain_len = end_idx - byte_idx;
303             prev_st = pstr->cur_state;
304             mbclen = mbrtowc (&wc,
305                                   ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
306                                    + byte_idx), remain_len, &pstr->cur_state);
307             if (BE ((size_t) (mbclen + 2) > 2, 1))
308               {
309                 wchar_t wcu = wc;
310                 if (iswlower (wc))
311                     {
312                       size_t mbcdlen;
313 
314                       wcu = towupper (wc);
315                       mbcdlen = wcrtomb (buf, wcu, &prev_st);
316                       if (BE (mbclen == mbcdlen, 1))
317                         memcpy (pstr->mbs + byte_idx, buf, mbclen);
318                       else
319                         {
320                           src_idx = byte_idx;
321                           goto offsets_needed;
322                         }
323                     }
324                 else
325                     memcpy (pstr->mbs + byte_idx,
326                               pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
327                 pstr->wcs[byte_idx++] = wcu;
328                 /* Write paddings.  */
329                 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
330                     pstr->wcs[byte_idx++] = WEOF;
331               }
332             else if (mbclen == (size_t) -1 || mbclen == 0)
333               {
334                 /* It is an invalid character or '\0'.  Just use the byte.  */
335                 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
336                 pstr->mbs[byte_idx] = ch;
337                 /* And also cast it to wide char.  */
338                 pstr->wcs[byte_idx++] = (wchar_t) ch;
339                 if (BE (mbclen == (size_t) -1, 0))
340                     pstr->cur_state = prev_st;
341               }
342             else
343               {
344                 /* The buffer doesn't have enough space, finish to build.  */
345                 pstr->cur_state = prev_st;
346                 break;
347               }
348           }
349       pstr->valid_len = byte_idx;
350       pstr->valid_raw_len = byte_idx;
351       return REG_NOERROR;
352     }
353   else
354     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
355       {
356           wchar_t wc;
357           const char *p;
358       offsets_needed:
359           remain_len = end_idx - byte_idx;
360           prev_st = pstr->cur_state;
361           if (BE (pstr->trans != NULL, 0))
362             {
363               int i, ch;
364 
365               for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
366                 {
367                     ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
368                     buf[i] = pstr->trans[ch];
369                 }
370               p = (const char *) buf;
371             }
372           else
373             p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
374           mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
375           if (BE ((size_t) (mbclen + 2) > 2, 1))
376             {
377               wchar_t wcu = wc;
378               if (iswlower (wc))
379                 {
380                     size_t mbcdlen;
381 
382                     wcu = towupper (wc);
383                     mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
384                     if (BE (mbclen == mbcdlen, 1))
385                       memcpy (pstr->mbs + byte_idx, buf, mbclen);
386                     else if (mbcdlen != (size_t) -1)
387                       {
388                         size_t i;
389 
390                         if (byte_idx + mbcdlen > pstr->bufs_len)
391                           {
392                               pstr->cur_state = prev_st;
393                               break;
394                           }
395 
396                         if (pstr->offsets == NULL)
397                           {
398                               pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
399 
400                               if (pstr->offsets == NULL)
401                                 return REG_ESPACE;
402                           }
403                         if (!pstr->offsets_needed)
404                           {
405                               for (i = 0; i < (size_t) byte_idx; ++i)
406                                 pstr->offsets[i] = i;
407                               pstr->offsets_needed = 1;
408                           }
409 
410                         memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
411                         pstr->wcs[byte_idx] = wcu;
412                         pstr->offsets[byte_idx] = src_idx;
413                         for (i = 1; i < mbcdlen; ++i)
414                           {
415                               pstr->offsets[byte_idx + i]
416                                 = src_idx + (i < mbclen ? i : mbclen - 1);
417                               pstr->wcs[byte_idx + i] = WEOF;
418                           }
419                         pstr->len += mbcdlen - mbclen;
420                         if (pstr->raw_stop > src_idx)
421                           pstr->stop += mbcdlen - mbclen;
422                         end_idx = (pstr->bufs_len > pstr->len)
423                                     ? pstr->len : pstr->bufs_len;
424                         byte_idx += mbcdlen;
425                         src_idx += mbclen;
426                         continue;
427                       }
428                 else
429                   memcpy (pstr->mbs + byte_idx, p, mbclen);
430                 }
431               else
432                 memcpy (pstr->mbs + byte_idx, p, mbclen);
433 
434               if (BE (pstr->offsets_needed != 0, 0))
435                 {
436                     size_t i;
437                     for (i = 0; i < mbclen; ++i)
438                       pstr->offsets[byte_idx + i] = src_idx + i;
439                 }
440               src_idx += mbclen;
441 
442               pstr->wcs[byte_idx++] = wcu;
443               /* Write paddings.  */
444               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
445                 pstr->wcs[byte_idx++] = WEOF;
446             }
447           else if (mbclen == (size_t) -1 || mbclen == 0)
448             {
449               /* It is an invalid character or '\0'.  Just use the byte.  */
450               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
451 
452               if (BE (pstr->trans != NULL, 0))
453                 ch = pstr->trans [ch];
454               pstr->mbs[byte_idx] = ch;
455 
456               if (BE (pstr->offsets_needed != 0, 0))
457                 pstr->offsets[byte_idx] = src_idx;
458               ++src_idx;
459 
460               /* And also cast it to wide char.  */
461               pstr->wcs[byte_idx++] = (wchar_t) ch;
462               if (BE (mbclen == (size_t) -1, 0))
463                 pstr->cur_state = prev_st;
464             }
465           else
466             {
467               /* The buffer doesn't have enough space, finish to build.  */
468               pstr->cur_state = prev_st;
469               break;
470             }
471       }
472   pstr->valid_len = byte_idx;
473   pstr->valid_raw_len = src_idx;
474   return REG_NOERROR;
475 }
476 
477 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
478    Return the index.  */
479 
480 static Idx
481 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)482 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
483 {
484   mbstate_t prev_st;
485   Idx rawbuf_idx;
486   size_t mbclen;
487   wchar_t wc = 0;
488 
489   /* Skip the characters which are not necessary to check.  */
490   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
491        rawbuf_idx < new_raw_idx;)
492     {
493       Idx remain_len;
494       remain_len = pstr->len - rawbuf_idx;
495       prev_st = pstr->cur_state;
496       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
497                               remain_len, &pstr->cur_state);
498       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
499           {
500             /* We treat these cases as a singlebyte character.  */
501             mbclen = 1;
502             pstr->cur_state = prev_st;
503           }
504       /* Then proceed the next character.  */
505       rawbuf_idx += mbclen;
506     }
507   *last_wc = (wint_t) wc;
508   return rawbuf_idx;
509 }
510 #endif /* RE_ENABLE_I18N  */
511 
512 /* Build the buffer PSTR->MBS, and apply the translation if we need.
513    This function is used in case of REG_ICASE.  */
514 
515 static void
516 internal_function
build_upper_buffer(re_string_t * pstr)517 build_upper_buffer (re_string_t *pstr)
518 {
519   Idx char_idx, end_idx;
520   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
521 
522   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
523     {
524       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
525       if (BE (pstr->trans != NULL, 0))
526           ch = pstr->trans[ch];
527       if (islower (ch))
528           pstr->mbs[char_idx] = toupper (ch);
529       else
530           pstr->mbs[char_idx] = ch;
531     }
532   pstr->valid_len = char_idx;
533   pstr->valid_raw_len = char_idx;
534 }
535 
536 /* Apply TRANS to the buffer in PSTR.  */
537 
538 static void
539 internal_function
re_string_translate_buffer(re_string_t * pstr)540 re_string_translate_buffer (re_string_t *pstr)
541 {
542   Idx buf_idx, end_idx;
543   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544 
545   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
546     {
547       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
548       pstr->mbs[buf_idx] = pstr->trans[ch];
549     }
550 
551   pstr->valid_len = buf_idx;
552   pstr->valid_raw_len = buf_idx;
553 }
554 
555 /* This function re-construct the buffers.
556    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
557    convert to upper case in case of REG_ICASE, apply translation.  */
558 
559 static reg_errcode_t
560 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)561 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
562 {
563   Idx offset;
564 
565   if (BE (pstr->raw_mbs_idx <= idx, 0))
566     offset = idx - pstr->raw_mbs_idx;
567   else
568     {
569       /* Reset buffer.  */
570 #ifdef RE_ENABLE_I18N
571       if (pstr->mb_cur_max > 1)
572           memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
573 #endif /* RE_ENABLE_I18N */
574       pstr->len = pstr->raw_len;
575       pstr->stop = pstr->raw_stop;
576       pstr->valid_len = 0;
577       pstr->raw_mbs_idx = 0;
578       pstr->valid_raw_len = 0;
579       pstr->offsets_needed = 0;
580       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
581                                  : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
582       if (!pstr->mbs_allocated)
583           pstr->mbs = (unsigned char *) pstr->raw_mbs;
584       offset = idx;
585     }
586 
587   if (BE (offset != 0, 1))
588     {
589       /* Are the characters which are already checked remain?  */
590       if (BE (offset < pstr->valid_raw_len, 1)
591 #ifdef RE_ENABLE_I18N
592             /* Handling this would enlarge the code too much.
593                Accept a slowdown in that case.  */
594             && pstr->offsets_needed == 0
595 #endif
596            )
597           {
598             /* Yes, move them to the front of the buffer.  */
599             pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
600 #ifdef RE_ENABLE_I18N
601             if (pstr->mb_cur_max > 1)
602               memmove (pstr->wcs, pstr->wcs + offset,
603                          (pstr->valid_len - offset) * sizeof (wint_t));
604 #endif /* RE_ENABLE_I18N */
605             if (BE (pstr->mbs_allocated, 0))
606               memmove (pstr->mbs, pstr->mbs + offset,
607                          pstr->valid_len - offset);
608             pstr->valid_len -= offset;
609             pstr->valid_raw_len -= offset;
610 #if DEBUG
611             assert (pstr->valid_len > 0);
612 #endif
613           }
614       else
615           {
616             /* No, skip all characters until IDX.  */
617 #ifdef RE_ENABLE_I18N
618             if (BE (pstr->offsets_needed, 0))
619               {
620                 pstr->len = pstr->raw_len - idx + offset;
621                 pstr->stop = pstr->raw_stop - idx + offset;
622                 pstr->offsets_needed = 0;
623               }
624 #endif
625             pstr->valid_len = 0;
626             pstr->valid_raw_len = 0;
627 #ifdef RE_ENABLE_I18N
628             if (pstr->mb_cur_max > 1)
629               {
630                 Idx wcs_idx;
631                 wint_t wc = WEOF;
632 
633                 if (pstr->is_utf8)
634                     {
635                       const unsigned char *raw, *p, *q, *end;
636 
637                       /* Special case UTF-8.  Multi-byte chars start with any
638                          byte other than 0x80 - 0xbf.  */
639                       raw = pstr->raw_mbs + pstr->raw_mbs_idx;
640                       end = raw + (offset - pstr->mb_cur_max);
641                       for (p = raw + offset - 1; p >= end; --p)
642                         if ((*p & 0xc0) != 0x80)
643                           {
644                               mbstate_t cur_state;
645                               wchar_t wc2;
646                               Idx mlen = raw + pstr->len - p;
647                               unsigned char buf[6];
648                               size_t mbclen;
649 
650                               q = p;
651                               if (BE (pstr->trans != NULL, 0))
652                                 {
653                                   int i = mlen < 6 ? mlen : 6;
654                                   while (--i >= 0)
655                                     buf[i] = pstr->trans[p[i]];
656                                   q = buf;
657                                 }
658                               /* XXX Don't use mbrtowc, we know which conversion
659                                  to use (UTF-8 -> UCS4).  */
660                               memset (&cur_state, 0, sizeof (cur_state));
661                               mbclen = mbrtowc (&wc2, (const char *) p, mlen,
662                                                     &cur_state);
663                               if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
664                                 {
665                                   memset (&pstr->cur_state, '\0',
666                                             sizeof (mbstate_t));
667                                   pstr->valid_len = mbclen - (raw + offset - p);
668                                   wc = wc2;
669                                 }
670                               break;
671                           }
672                     }
673 
674                 if (wc == WEOF)
675                     pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
676                 if (BE (pstr->valid_len, 0))
677                     {
678                       for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
679                         pstr->wcs[wcs_idx] = WEOF;
680                       if (pstr->mbs_allocated)
681                         memset (pstr->mbs, -1, pstr->valid_len);
682                     }
683                 pstr->valid_raw_len = pstr->valid_len;
684                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
685                                             && IS_WIDE_WORD_CHAR (wc))
686                                            ? CONTEXT_WORD
687                                            : ((IS_WIDE_NEWLINE (wc)
688                                                && pstr->newline_anchor)
689                                               ? CONTEXT_NEWLINE : 0));
690               }
691             else
692 #endif /* RE_ENABLE_I18N */
693               {
694                 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
695                 if (pstr->trans)
696                     c = pstr->trans[c];
697                 pstr->tip_context = (bitset_contain (pstr->word_char, c)
698                                            ? CONTEXT_WORD
699                                            : ((IS_NEWLINE (c) && pstr->newline_anchor)
700                                               ? CONTEXT_NEWLINE : 0));
701               }
702           }
703       if (!BE (pstr->mbs_allocated, 0))
704           pstr->mbs += offset;
705     }
706   pstr->raw_mbs_idx = idx;
707   pstr->len -= offset;
708   pstr->stop -= offset;
709 
710   /* Then build the buffers.  */
711 #ifdef RE_ENABLE_I18N
712   if (pstr->mb_cur_max > 1)
713     {
714       if (pstr->icase)
715           {
716             reg_errcode_t ret = build_wcs_upper_buffer (pstr);
717             if (BE (ret != REG_NOERROR, 0))
718               return ret;
719           }
720       else
721           build_wcs_buffer (pstr);
722     }
723   else
724 #endif /* RE_ENABLE_I18N */
725   if (BE (pstr->mbs_allocated, 0))
726     {
727       if (pstr->icase)
728           build_upper_buffer (pstr);
729       else if (pstr->trans != NULL)
730           re_string_translate_buffer (pstr);
731     }
732   else
733     pstr->valid_len = pstr->len;
734 
735   pstr->cur_idx = 0;
736   return REG_NOERROR;
737 }
738 
739 static unsigned char
internal_function(pure)740 internal_function __attribute ((pure))
741 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
742 {
743   int ch;
744   Idx off;
745 
746   /* Handle the common (easiest) cases first.  */
747   if (BE (!pstr->mbs_allocated, 1))
748     return re_string_peek_byte (pstr, idx);
749 
750 #ifdef RE_ENABLE_I18N
751   if (pstr->mb_cur_max > 1
752       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
753     return re_string_peek_byte (pstr, idx);
754 #endif
755 
756   off = pstr->cur_idx + idx;
757 #ifdef RE_ENABLE_I18N
758   if (pstr->offsets_needed)
759     off = pstr->offsets[off];
760 #endif
761 
762   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
763 
764 #ifdef RE_ENABLE_I18N
765   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
766      this function returns CAPITAL LETTER I instead of first byte of
767      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
768      since peek_byte_case doesn't advance cur_idx in any way.  */
769   if (pstr->offsets_needed && !isascii (ch))
770     return re_string_peek_byte (pstr, idx);
771 #endif
772 
773   return ch;
774 }
775 
776 static unsigned char
internal_function(pure)777 internal_function __attribute ((pure))
778 re_string_fetch_byte_case (re_string_t *pstr)
779 {
780   if (BE (!pstr->mbs_allocated, 1))
781     return re_string_fetch_byte (pstr);
782 
783 #ifdef RE_ENABLE_I18N
784   if (pstr->offsets_needed)
785     {
786       Idx off;
787       int ch;
788 
789       /* For tr_TR.UTF-8 [[:islower:]] there is
790            [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
791            in that case the whole multi-byte character and return
792            the original letter.  On the other side, with
793            [[: DOTLESS SMALL LETTER I return [[:I, as doing
794            anything else would complicate things too much.  */
795 
796       if (!re_string_first_byte (pstr, pstr->cur_idx))
797           return re_string_fetch_byte (pstr);
798 
799       off = pstr->offsets[pstr->cur_idx];
800       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
801 
802       if (! isascii (ch))
803           return re_string_fetch_byte (pstr);
804 
805       re_string_skip_bytes (pstr,
806                                   re_string_char_size_at (pstr, pstr->cur_idx));
807       return ch;
808     }
809 #endif
810 
811   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
812 }
813 
814 static void
815 internal_function
re_string_destruct(re_string_t * pstr)816 re_string_destruct (re_string_t *pstr)
817 {
818 #ifdef RE_ENABLE_I18N
819   re_free (pstr->wcs);
820   re_free (pstr->offsets);
821 #endif /* RE_ENABLE_I18N  */
822   if (pstr->mbs_allocated)
823     re_free (pstr->mbs);
824 }
825 
826 /* Return the context at IDX in INPUT.  */
827 
828 static unsigned int
829 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)830 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
831 {
832   int c;
833   if (BE (! REG_VALID_INDEX (idx), 0))
834     /* In this case, we use the value stored in input->tip_context,
835        since we can't know the character in input->mbs[-1] here.  */
836     return input->tip_context;
837   if (BE (idx == input->len, 0))
838     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
839               : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
840 #ifdef RE_ENABLE_I18N
841   if (input->mb_cur_max > 1)
842     {
843       wint_t wc;
844       Idx wc_idx = idx;
845       while(input->wcs[wc_idx] == WEOF)
846           {
847 #ifdef DEBUG
848             /* It must not happen.  */
849             assert (REG_VALID_INDEX (wc_idx));
850 #endif
851             --wc_idx;
852             if (! REG_VALID_INDEX (wc_idx))
853               return input->tip_context;
854           }
855       wc = input->wcs[wc_idx];
856       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
857           return CONTEXT_WORD;
858       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
859                 ? CONTEXT_NEWLINE : 0);
860     }
861   else
862 #endif
863     {
864       c = re_string_byte_at (input, idx);
865       if (bitset_contain (input->word_char, c))
866           return CONTEXT_WORD;
867       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
868     }
869 }
870 
871 /* Functions for set operation.  */
872 
873 static reg_errcode_t
874 internal_function
re_node_set_alloc(re_node_set * set,Idx size)875 re_node_set_alloc (re_node_set *set, Idx size)
876 {
877   set->alloc = size;
878   set->nelem = 0;
879   set->elems = re_xmalloc (Idx, size);
880   if (BE (set->elems == NULL, 0))
881     return REG_ESPACE;
882   return REG_NOERROR;
883 }
884 
885 static reg_errcode_t
886 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)887 re_node_set_init_1 (re_node_set *set, Idx elem)
888 {
889   set->alloc = 1;
890   set->nelem = 1;
891   set->elems = re_malloc (Idx, 1);
892   if (BE (set->elems == NULL, 0))
893     {
894       set->alloc = set->nelem = 0;
895       return REG_ESPACE;
896     }
897   set->elems[0] = elem;
898   return REG_NOERROR;
899 }
900 
901 static reg_errcode_t
902 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)903 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
904 {
905   set->alloc = 2;
906   set->elems = re_malloc (Idx, 2);
907   if (BE (set->elems == NULL, 0))
908     return REG_ESPACE;
909   if (elem1 == elem2)
910     {
911       set->nelem = 1;
912       set->elems[0] = elem1;
913     }
914   else
915     {
916       set->nelem = 2;
917       if (elem1 < elem2)
918           {
919             set->elems[0] = elem1;
920             set->elems[1] = elem2;
921           }
922       else
923           {
924             set->elems[0] = elem2;
925             set->elems[1] = elem1;
926           }
927     }
928   return REG_NOERROR;
929 }
930 
931 static reg_errcode_t
932 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)933 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
934 {
935   dest->nelem = src->nelem;
936   if (src->nelem > 0)
937     {
938       dest->alloc = dest->nelem;
939       dest->elems = re_malloc (Idx, dest->alloc);
940       if (BE (dest->elems == NULL, 0))
941           {
942             dest->alloc = dest->nelem = 0;
943             return REG_ESPACE;
944           }
945       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
946     }
947   else
948     re_node_set_init_empty (dest);
949   return REG_NOERROR;
950 }
951 
952 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
953    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
954    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
955 
956 static reg_errcode_t
957 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)958 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
959                                  const re_node_set *src2)
960 {
961   Idx i1, i2, is, id, delta, sbase;
962   if (src1->nelem == 0 || src2->nelem == 0)
963     return REG_NOERROR;
964 
965   /* We need dest->nelem + 2 * elems_in_intersection; this is a
966      conservative estimate.  */
967   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
968     {
969       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
970       Idx *new_elems;
971       if (sizeof (Idx) < 3
972             && (new_alloc < dest->alloc
973                 || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
974           return REG_ESPACE;
975       new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
976       if (BE (new_elems == NULL, 0))
977         return REG_ESPACE;
978       dest->elems = new_elems;
979       dest->alloc = new_alloc;
980     }
981 
982   /* Find the items in the intersection of SRC1 and SRC2, and copy
983      into the top of DEST those that are not already in DEST itself.  */
984   sbase = dest->nelem + src1->nelem + src2->nelem;
985   i1 = src1->nelem - 1;
986   i2 = src2->nelem - 1;
987   id = dest->nelem - 1;
988   for (;;)
989     {
990       if (src1->elems[i1] == src2->elems[i2])
991           {
992             /* Try to find the item in DEST.  Maybe we could binary search?  */
993             while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
994               --id;
995 
996           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
997             dest->elems[--sbase] = src1->elems[i1];
998 
999             if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1000               break;
1001           }
1002 
1003       /* Lower the highest of the two items.  */
1004       else if (src1->elems[i1] < src2->elems[i2])
1005           {
1006             if (! REG_VALID_INDEX (--i2))
1007               break;
1008           }
1009       else
1010           {
1011             if (! REG_VALID_INDEX (--i1))
1012               break;
1013           }
1014     }
1015 
1016   id = dest->nelem - 1;
1017   is = dest->nelem + src1->nelem + src2->nelem - 1;
1018   delta = is - sbase + 1;
1019 
1020   /* Now copy.  When DELTA becomes zero, the remaining
1021      DEST elements are already in place; this is more or
1022      less the same loop that is in re_node_set_merge.  */
1023   dest->nelem += delta;
1024   if (delta > 0 && REG_VALID_INDEX (id))
1025     for (;;)
1026       {
1027         if (dest->elems[is] > dest->elems[id])
1028           {
1029             /* Copy from the top.  */
1030             dest->elems[id + delta--] = dest->elems[is--];
1031             if (delta == 0)
1032               break;
1033           }
1034         else
1035           {
1036             /* Slide from the bottom.  */
1037             dest->elems[id + delta] = dest->elems[id];
1038             if (! REG_VALID_INDEX (--id))
1039               break;
1040           }
1041       }
1042 
1043   /* Copy remaining SRC elements.  */
1044   memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1045 
1046   return REG_NOERROR;
1047 }
1048 
1049 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1050    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1051 
1052 static reg_errcode_t
1053 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1054 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1055                               const re_node_set *src2)
1056 {
1057   Idx i1, i2, id;
1058   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1059     {
1060       dest->alloc = src1->nelem + src2->nelem;
1061       if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1062           return REG_ESPACE;
1063       dest->elems = re_xmalloc (Idx, dest->alloc);
1064       if (BE (dest->elems == NULL, 0))
1065           return REG_ESPACE;
1066     }
1067   else
1068     {
1069       if (src1 != NULL && src1->nelem > 0)
1070           return re_node_set_init_copy (dest, src1);
1071       else if (src2 != NULL && src2->nelem > 0)
1072           return re_node_set_init_copy (dest, src2);
1073       else
1074           re_node_set_init_empty (dest);
1075       return REG_NOERROR;
1076     }
1077   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1078     {
1079       if (src1->elems[i1] > src2->elems[i2])
1080           {
1081             dest->elems[id++] = src2->elems[i2++];
1082             continue;
1083           }
1084       if (src1->elems[i1] == src2->elems[i2])
1085           ++i2;
1086       dest->elems[id++] = src1->elems[i1++];
1087     }
1088   if (i1 < src1->nelem)
1089     {
1090       memcpy (dest->elems + id, src1->elems + i1,
1091                (src1->nelem - i1) * sizeof dest->elems[0]);
1092       id += src1->nelem - i1;
1093     }
1094   else if (i2 < src2->nelem)
1095     {
1096       memcpy (dest->elems + id, src2->elems + i2,
1097                (src2->nelem - i2) * sizeof dest->elems[0]);
1098       id += src2->nelem - i2;
1099     }
1100   dest->nelem = id;
1101   return REG_NOERROR;
1102 }
1103 
1104 /* Calculate the union set of the sets DEST and SRC. And store it to
1105    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1106 
1107 static reg_errcode_t
1108 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1109 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1110 {
1111   Idx is, id, sbase, delta;
1112   if (src == NULL || src->nelem == 0)
1113     return REG_NOERROR;
1114   if (sizeof (Idx) < 3
1115       && ((Idx) (2 * src->nelem) < src->nelem
1116             || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1117     return REG_ESPACE;
1118   if (dest->alloc < 2 * src->nelem + dest->nelem)
1119     {
1120       Idx new_alloc = src->nelem + dest->alloc;
1121       Idx *new_buffer;
1122       if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1123           return REG_ESPACE;
1124       new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1125       if (BE (new_buffer == NULL, 0))
1126           return REG_ESPACE;
1127       dest->elems = new_buffer;
1128       dest->alloc = new_alloc;
1129     }
1130 
1131   if (BE (dest->nelem == 0, 0))
1132     {
1133       dest->nelem = src->nelem;
1134       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1135       return REG_NOERROR;
1136     }
1137 
1138   /* Copy into the top of DEST the items of SRC that are not
1139      found in DEST.  Maybe we could binary search in DEST?  */
1140   for (sbase = dest->nelem + 2 * src->nelem,
1141        is = src->nelem - 1, id = dest->nelem - 1;
1142        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1143     {
1144       if (dest->elems[id] == src->elems[is])
1145         is--, id--;
1146       else if (dest->elems[id] < src->elems[is])
1147         dest->elems[--sbase] = src->elems[is--];
1148       else /* if (dest->elems[id] > src->elems[is]) */
1149         --id;
1150     }
1151 
1152   if (REG_VALID_INDEX (is))
1153     {
1154       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1155       sbase -= is + 1;
1156       memcpy (dest->elems + sbase, src->elems,
1157                 (is + 1) * sizeof dest->elems[0]);
1158     }
1159 
1160   id = dest->nelem - 1;
1161   is = dest->nelem + 2 * src->nelem - 1;
1162   delta = is - sbase + 1;
1163   if (delta == 0)
1164     return REG_NOERROR;
1165 
1166   /* Now copy.  When DELTA becomes zero, the remaining
1167      DEST elements are already in place.  */
1168   dest->nelem += delta;
1169   for (;;)
1170     {
1171       if (dest->elems[is] > dest->elems[id])
1172         {
1173             /* Copy from the top.  */
1174           dest->elems[id + delta--] = dest->elems[is--];
1175             if (delta == 0)
1176               break;
1177           }
1178       else
1179         {
1180           /* Slide from the bottom.  */
1181           dest->elems[id + delta] = dest->elems[id];
1182             if (! REG_VALID_INDEX (--id))
1183               {
1184                 /* Copy remaining SRC elements.  */
1185                 memcpy (dest->elems, dest->elems + sbase,
1186                         delta * sizeof dest->elems[0]);
1187                 break;
1188               }
1189           }
1190     }
1191 
1192   return REG_NOERROR;
1193 }
1194 
1195 /* Insert the new element ELEM to the re_node_set* SET.
1196    SET should not already have ELEM.
1197    Return true if successful.  */
1198 
1199 static bool
1200 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1201 re_node_set_insert (re_node_set *set, Idx elem)
1202 {
1203   Idx idx;
1204   /* In case the set is empty.  */
1205   if (set->alloc == 0)
1206     return re_node_set_init_1 (set, elem) == REG_NOERROR;
1207 
1208   if (BE (set->nelem, 0) == 0)
1209     {
1210       /* We already guaranteed above that set->alloc != 0.  */
1211       set->elems[0] = elem;
1212       ++set->nelem;
1213       return true;
1214     }
1215 
1216   /* Realloc if we need.  */
1217   if (set->alloc == set->nelem)
1218     {
1219       Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1220       if (BE (new_elems == NULL, 0))
1221           return false;
1222       set->elems = new_elems;
1223     }
1224 
1225   /* Move the elements which follows the new element.  Test the
1226      first element separately to skip a check in the inner loop.  */
1227   if (elem < set->elems[0])
1228     {
1229       idx = 0;
1230       for (idx = set->nelem; idx > 0; idx--)
1231         set->elems[idx] = set->elems[idx - 1];
1232     }
1233   else
1234     {
1235       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1236         set->elems[idx] = set->elems[idx - 1];
1237     }
1238 
1239   /* Insert the new element.  */
1240   set->elems[idx] = elem;
1241   ++set->nelem;
1242   return true;
1243 }
1244 
1245 /* Insert the new element ELEM to the re_node_set* SET.
1246    SET should not already have any element greater than or equal to ELEM.
1247    Return true if successful.  */
1248 
1249 static bool
1250 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1251 re_node_set_insert_last (re_node_set *set, Idx elem)
1252 {
1253   /* Realloc if we need.  */
1254   if (set->alloc == set->nelem)
1255     {
1256       Idx *new_elems;
1257       new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1258       if (BE (new_elems == NULL, 0))
1259           return false;
1260       set->elems = new_elems;
1261     }
1262 
1263   /* Insert the new element.  */
1264   set->elems[set->nelem++] = elem;
1265   return true;
1266 }
1267 
1268 /* Compare two node sets SET1 and SET2.
1269    Return true if SET1 and SET2 are equivalent.  */
1270 
1271 static bool
internal_function(pure)1272 internal_function __attribute ((pure))
1273 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1274 {
1275   Idx i;
1276   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1277     return false;
1278   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1279     if (set1->elems[i] != set2->elems[i])
1280       return false;
1281   return true;
1282 }
1283 
1284 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1285 
1286 static Idx
internal_function(pure)1287 internal_function __attribute ((pure))
1288 re_node_set_contains (const re_node_set *set, Idx elem)
1289 {
1290   __re_size_t idx, right, mid;
1291   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1292     return 0;
1293 
1294   /* Binary search the element.  */
1295   idx = 0;
1296   right = set->nelem - 1;
1297   while (idx < right)
1298     {
1299       mid = (idx + right) / 2;
1300       if (set->elems[mid] < elem)
1301           idx = mid + 1;
1302       else
1303           right = mid;
1304     }
1305   return set->elems[idx] == elem ? idx + 1 : 0;
1306 }
1307 
1308 static void
1309 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1310 re_node_set_remove_at (re_node_set *set, Idx idx)
1311 {
1312   if (idx < 0 || idx >= set->nelem)
1313     return;
1314   --set->nelem;
1315   for (; idx < set->nelem; idx++)
1316     set->elems[idx] = set->elems[idx + 1];
1317 }
1318 
1319 
1320 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1321    Or return REG_MISSING if an error occurred.  */
1322 
1323 static Idx
1324 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1325 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1326 {
1327   int type = token.type;
1328   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1329     {
1330       Idx new_nodes_alloc = dfa->nodes_alloc;
1331       Idx *new_nexts, *new_indices;
1332       re_node_set *new_edests, *new_eclosures;
1333 
1334       re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1335                                                       &new_nodes_alloc);
1336       if (BE (new_nodes == NULL, 0))
1337           return REG_MISSING;
1338       dfa->nodes = new_nodes;
1339       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1340       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1341       new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1342       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1343       if (BE (new_nexts == NULL || new_indices == NULL
1344                 || new_edests == NULL || new_eclosures == NULL, 0))
1345           return REG_MISSING;
1346       dfa->nexts = new_nexts;
1347       dfa->org_indices = new_indices;
1348       dfa->edests = new_edests;
1349       dfa->eclosures = new_eclosures;
1350       dfa->nodes_alloc = new_nodes_alloc;
1351     }
1352   dfa->nodes[dfa->nodes_len] = token;
1353   dfa->nodes[dfa->nodes_len].constraint = 0;
1354 #ifdef RE_ENABLE_I18N
1355   dfa->nodes[dfa->nodes_len].accept_mb =
1356     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1357 #endif
1358   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1359   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1360   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1361   return dfa->nodes_len++;
1362 }
1363 
1364 static inline re_hashval_t
1365 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1366 calc_state_hash (const re_node_set *nodes, unsigned int context)
1367 {
1368   re_hashval_t hash = nodes->nelem + context;
1369   Idx i;
1370   for (i = 0 ; i < nodes->nelem ; i++)
1371     hash += nodes->elems[i];
1372   return hash;
1373 }
1374 
1375 /* Search for the state whose node_set is equivalent to NODES.
1376    Return the pointer to the state, if we found it in the DFA.
1377    Otherwise create the new one and return it.  In case of an error
1378    return NULL and set the error code in ERR.
1379    Note: - We assume NULL as the invalid state, then it is possible that
1380              return value is NULL and ERR is REG_NOERROR.
1381            - We never return non-NULL value in case of any errors, it is for
1382              optimization.  */
1383 
1384 static re_dfastate_t*
1385 internal_function
re_acquire_state(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes)1386 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1387 {
1388   re_hashval_t hash;
1389   re_dfastate_t *new_state;
1390   struct re_state_table_entry *spot;
1391   Idx i;
1392 #ifdef lint
1393   /* Suppress bogus uninitialized-variable warnings.  */
1394   *err = REG_NOERROR;
1395 #endif
1396   if (BE (nodes->nelem == 0, 0))
1397     {
1398       *err = REG_NOERROR;
1399       return NULL;
1400     }
1401   hash = calc_state_hash (nodes, 0);
1402   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1403 
1404   for (i = 0 ; i < spot->num ; i++)
1405     {
1406       re_dfastate_t *state = spot->array[i];
1407       if (hash != state->hash)
1408           continue;
1409       if (re_node_set_compare (&state->nodes, nodes))
1410           return state;
1411     }
1412 
1413   /* There are no appropriate state in the dfa, create the new one.  */
1414   new_state = create_ci_newstate (dfa, nodes, hash);
1415   if (BE (new_state != NULL, 1))
1416     return new_state;
1417   else
1418     {
1419       *err = REG_ESPACE;
1420       return NULL;
1421     }
1422 }
1423 
1424 /* Search for the state whose node_set is equivalent to NODES and
1425    whose context is equivalent to CONTEXT.
1426    Return the pointer to the state, if we found it in the DFA.
1427    Otherwise create the new one and return it.  In case of an error
1428    return NULL and set the error code in ERR.
1429    Note: - We assume NULL as the invalid state, then it is possible that
1430              return value is NULL and ERR is REG_NOERROR.
1431            - We never return non-NULL value in case of any errors, it is for
1432              optimization.  */
1433 
1434 static re_dfastate_t*
1435 internal_function
re_acquire_state_context(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1436 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1437                                 const re_node_set *nodes, unsigned int context)
1438 {
1439   re_hashval_t hash;
1440   re_dfastate_t *new_state;
1441   struct re_state_table_entry *spot;
1442   Idx i;
1443 #ifdef lint
1444   /* Suppress bogus uninitialized-variable warnings.  */
1445   *err = REG_NOERROR;
1446 #endif
1447   if (nodes->nelem == 0)
1448     {
1449       *err = REG_NOERROR;
1450       return NULL;
1451     }
1452   hash = calc_state_hash (nodes, context);
1453   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1454 
1455   for (i = 0 ; i < spot->num ; i++)
1456     {
1457       re_dfastate_t *state = spot->array[i];
1458       if (state->hash == hash
1459             && state->context == context
1460             && re_node_set_compare (state->entrance_nodes, nodes))
1461           return state;
1462     }
1463   /* There are no appropriate state in `dfa', create the new one.  */
1464   new_state = create_cd_newstate (dfa, nodes, context, hash);
1465   if (BE (new_state != NULL, 1))
1466     return new_state;
1467   else
1468     {
1469       *err = REG_ESPACE;
1470       return NULL;
1471     }
1472 }
1473 
1474 /* Finish initialization of the new state NEWSTATE, and using its hash value
1475    HASH put in the appropriate bucket of DFA's state table.  Return value
1476    indicates the error code if failed.  */
1477 
1478 static reg_errcode_t
1479 internal_function
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1480 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1481 {
1482   struct re_state_table_entry *spot;
1483   reg_errcode_t err;
1484   Idx i;
1485 
1486   newstate->hash = hash;
1487   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1488   if (BE (err != REG_NOERROR, 0))
1489     return REG_ESPACE;
1490   for (i = 0; i < newstate->nodes.nelem; i++)
1491     {
1492       Idx elem = newstate->nodes.elems[i];
1493       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1494           {
1495             bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1496             if (BE (! ok, 0))
1497               return REG_ESPACE;
1498           }
1499     }
1500 
1501   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502   if (BE (spot->alloc <= spot->num, 0))
1503     {
1504       Idx new_alloc = spot->num;
1505       re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1506                                                             &new_alloc);
1507       if (BE (new_array == NULL, 0))
1508           return REG_ESPACE;
1509       spot->array = new_array;
1510       spot->alloc = new_alloc;
1511     }
1512   spot->array[spot->num++] = newstate;
1513   return REG_NOERROR;
1514 }
1515 
1516 /* Create the new state which is independ of contexts.
1517    Return the new state if succeeded, otherwise return NULL.  */
1518 
1519 static re_dfastate_t *
1520 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1521 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1522                         re_hashval_t hash)
1523 {
1524   Idx i;
1525   reg_errcode_t err;
1526   re_dfastate_t *newstate;
1527 
1528   newstate = re_calloc (re_dfastate_t, 1);
1529   if (BE (newstate == NULL, 0))
1530     return NULL;
1531   err = re_node_set_init_copy (&newstate->nodes, nodes);
1532   if (BE (err != REG_NOERROR, 0))
1533     {
1534       re_free (newstate);
1535       return NULL;
1536     }
1537 
1538   newstate->entrance_nodes = &newstate->nodes;
1539   for (i = 0 ; i < nodes->nelem ; i++)
1540     {
1541       re_token_t *node = dfa->nodes + nodes->elems[i];
1542       re_token_type_t type = node->type;
1543       if (type == CHARACTER && !node->constraint)
1544           continue;
1545 #ifdef RE_ENABLE_I18N
1546       newstate->accept_mb |= node->accept_mb;
1547 #endif /* RE_ENABLE_I18N */
1548 
1549       /* If the state has the halt node, the state is a halt state.  */
1550       if (type == END_OF_RE)
1551           newstate->halt = 1;
1552       else if (type == OP_BACK_REF)
1553           newstate->has_backref = 1;
1554       else if (type == ANCHOR || node->constraint)
1555           newstate->has_constraint = 1;
1556     }
1557   err = register_state (dfa, newstate, hash);
1558   if (BE (err != REG_NOERROR, 0))
1559     {
1560       free_state (newstate);
1561       newstate = NULL;
1562     }
1563   return newstate;
1564 }
1565 
1566 /* Create the new state which is depend on the context CONTEXT.
1567    Return the new state if succeeded, otherwise return NULL.  */
1568 
1569 static re_dfastate_t *
1570 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1571 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1572                         unsigned int context, re_hashval_t hash)
1573 {
1574   Idx i, nctx_nodes = 0;
1575   reg_errcode_t err;
1576   re_dfastate_t *newstate;
1577 
1578   newstate = re_calloc (re_dfastate_t, 1);
1579   if (BE (newstate == NULL, 0))
1580     return NULL;
1581   err = re_node_set_init_copy (&newstate->nodes, nodes);
1582   if (BE (err != REG_NOERROR, 0))
1583     {
1584       re_free (newstate);
1585       return NULL;
1586     }
1587 
1588   newstate->context = context;
1589   newstate->entrance_nodes = &newstate->nodes;
1590 
1591   for (i = 0 ; i < nodes->nelem ; i++)
1592     {
1593       unsigned int constraint = 0;
1594       re_token_t *node = dfa->nodes + nodes->elems[i];
1595       re_token_type_t type = node->type;
1596       if (node->constraint)
1597           constraint = node->constraint;
1598 
1599       if (type == CHARACTER && !constraint)
1600           continue;
1601 #ifdef RE_ENABLE_I18N
1602       newstate->accept_mb |= node->accept_mb;
1603 #endif /* RE_ENABLE_I18N */
1604 
1605       /* If the state has the halt node, the state is a halt state.  */
1606       if (type == END_OF_RE)
1607           newstate->halt = 1;
1608       else if (type == OP_BACK_REF)
1609           newstate->has_backref = 1;
1610       else if (type == ANCHOR)
1611           constraint = node->opr.ctx_type;
1612 
1613       if (constraint)
1614           {
1615             if (newstate->entrance_nodes == &newstate->nodes)
1616               {
1617                 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1618                 if (BE (newstate->entrance_nodes == NULL, 0))
1619                     {
1620                       free_state (newstate);
1621                       return NULL;
1622                     }
1623                 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1624                 nctx_nodes = 0;
1625                 newstate->has_constraint = 1;
1626               }
1627 
1628             if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1629               {
1630                 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1631                 ++nctx_nodes;
1632               }
1633           }
1634     }
1635   err = register_state (dfa, newstate, hash);
1636   if (BE (err != REG_NOERROR, 0))
1637     {
1638       free_state (newstate);
1639       newstate = NULL;
1640     }
1641   return  newstate;
1642 }
1643 
1644 static void
1645 internal_function
free_state(re_dfastate_t * state)1646 free_state (re_dfastate_t *state)
1647 {
1648   re_node_set_free (&state->non_eps_nodes);
1649   re_node_set_free (&state->inveclosure);
1650   if (state->entrance_nodes != &state->nodes)
1651     {
1652       re_node_set_free (state->entrance_nodes);
1653       re_free (state->entrance_nodes);
1654     }
1655   re_node_set_free (&state->nodes);
1656   re_free (state->word_trtable);
1657   re_free (state->trtable);
1658   re_free (state);
1659 }
1660