1 /* kwset.c - search for any of a set of keywords.
2 Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2023 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
18 02110-1301, USA. */
19
20 /* Written August 1989 by Mike Haertel. */
21
22 /* For more on the Aho-Corasick and Boyer-Moore algorithms,
23 as well as other algorithms that might help improve performance,
24 see the grep manual's "Performance" chapter. */
25
26 #include <config.h>
27
28 #include "kwset.h"
29
30 #include <stdckdint.h>
31 #include <stdint.h>
32 #include <sys/types.h>
33
34 #include "system.h"
35 #include "memchr2.h"
36 #include "obstack.h"
37 #include "xalloc.h"
38 #include "verify.h"
39
40 #define obstack_chunk_alloc xmalloc
41 #define obstack_chunk_free free
42
43 static unsigned char
44 U (char ch)
45 {
46 return to_uchar (ch);
47 }
48
49 /* Balanced tree of edges and labels leaving a given trie node. */
50 struct tree
51 {
52 struct tree *llink; /* Left link; MUST be first field. */
53 struct tree *rlink; /* Right link (to larger labels). */
54 struct trie *trie; /* Trie node pointed to by this edge. */
55 unsigned char label; /* Label on this edge. */
56 char balance; /* Difference in depths of subtrees. */
57 };
58
59 /* Node of a trie representing a set of keywords. */
60 struct trie
61 {
62 /* If an accepting node, this is either 2*W + 1 where W is the word
63 index, or is -1 if Aho-Corasick is in use and FAIL
64 specifies where to look for more info. If not an accepting node,
65 this is zero. */
66 ptrdiff_t accepting;
67
68 struct tree *links; /* Tree of edges leaving this node. */
69 struct trie *parent; /* Parent of this node. */
70 struct trie *next; /* List of all trie nodes in level order. */
71 struct trie *fail; /* Aho-Corasick failure function. */
72 idx_t depth; /* Depth of this node from the root. */
73 idx_t shift; /* Shift function for search failures. */
74 idx_t maxshift; /* Max shift of self and descendants. */
75 };
76
77 /* Structure returned opaquely to the caller, containing everything. */
78 struct kwset
79 {
80 struct obstack obstack; /* Obstack for node allocation. */
81 idx_t words; /* Number of words in the trie. */
82 struct trie *trie; /* The trie itself. */
83 idx_t mind; /* Minimum depth of an accepting node. */
84 unsigned char delta[NCHAR]; /* Delta table for rapid search. */
85 struct trie *next[NCHAR]; /* Table of children of the root. */
86 char *target; /* Target string if there's only one. */
87 idx_t *shift; /* Used in Boyer-Moore search for one
88 string. */
89 char const *trans; /* Character translation table. */
90
91 /* This helps to match a terminal byte, which is the first byte
92 for Aho-Corasick, and the last byte for Boyer-More. If all the
93 patterns have the same terminal byte (after translation via TRANS
94 if TRANS is nonnull), then this is that byte as an unsigned char.
95 Otherwise this is -1 if there is disagreement among the strings
96 about terminal bytes, and -2 if there are no terminal bytes and
97 no disagreement because all the patterns are empty. */
98 int gc1;
99
100 /* This helps to match a terminal byte. If 0 <= GC1HELP, B is
101 terminal when B == GC1 || B == GC1HELP (note that GC1 == GCHELP
102 is common here). This is typically faster than evaluating
103 to_uchar (TRANS[B]) == GC1. */
104 int gc1help;
105
106 /* If the string has two or more bytes, this is the penultimate byte,
107 after translation via TRANS if TRANS is nonnull. This variable
108 is used only by Boyer-Moore. */
109 char gc2;
110
111 /* kwsexec implementation. */
112 ptrdiff_t (*kwsexec) (kwset_t, char const *, idx_t, struct kwsmatch *, bool);
113 };
114
115 /* Use TRANS to transliterate C. A null TRANS does no transliteration. */
116 static inline char
117 tr (char const *trans, char c)
118 {
119 return trans ? trans[U(c)] : c;
120 }
121
122 static ptrdiff_t acexec (kwset_t, char const *, idx_t,
123 struct kwsmatch *, bool);
124 static ptrdiff_t bmexec (kwset_t, char const *, idx_t,
125 struct kwsmatch *, bool);
126
127 /* Return a newly allocated keyword set. A nonnull TRANS specifies a
128 table of character translations to be applied to all pattern and
129 search text. */
130 kwset_t
131 kwsalloc (char const *trans)
132 {
133 struct kwset *kwset = xmalloc (sizeof *kwset);
134
135 obstack_init (&kwset->obstack);
136 kwset->words = 0;
137 kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie);
138 kwset->trie->accepting = 0;
139 kwset->trie->links = NULL;
140 kwset->trie->parent = NULL;
141 kwset->trie->next = NULL;
142 kwset->trie->fail = NULL;
143 kwset->trie->depth = 0;
144 kwset->trie->shift = 0;
145 kwset->mind = IDX_MAX;
146 kwset->target = NULL;
147 kwset->trans = trans;
148 kwset->kwsexec = acexec;
149
150 return kwset;
151 }
152
153 /* This upper bound is valid for CHAR_BIT >= 4 and
154 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
155 enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
156
157 /* Add the given string to the contents of the keyword set. */
158 void
159 kwsincr (kwset_t kwset, char const *text, idx_t len)
160 {
161 assume (0 <= len);
162 struct trie *trie = kwset->trie;
163 char const *trans = kwset->trans;
164 bool reverse = kwset->kwsexec == bmexec;
165
166 if (reverse)
167 text += len;
168
169 /* Descend the trie (built of keywords) character-by-character,
170 installing new nodes when necessary. */
171 while (len--)
172 {
173 unsigned char uc = reverse ? *--text : *text++;
174 unsigned char label = trans ? trans[uc] : uc;
175
176 /* Descend the tree of outgoing links for this trie node,
177 looking for the current character and keeping track
178 of the path followed. */
179 struct tree *cur = trie->links;
180 struct tree *links[DEPTH_SIZE];
181 enum { L, R } dirs[DEPTH_SIZE];
182 links[0] = (struct tree *) &trie->links;
183 dirs[0] = L;
184 idx_t depth = 1;
185
186 while (cur && label != cur->label)
187 {
188 links[depth] = cur;
189 if (label < cur->label)
190 dirs[depth++] = L, cur = cur->llink;
191 else
192 dirs[depth++] = R, cur = cur->rlink;
193 }
194
195 /* The current character doesn't have an outgoing link at
196 this trie node, so build a new trie node and install
197 a link in the current trie node's tree. */
198 if (!cur)
199 {
200 cur = obstack_alloc (&kwset->obstack, sizeof *cur);
201 cur->llink = NULL;
202 cur->rlink = NULL;
203 cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
204 cur->trie->accepting = 0;
205 cur->trie->links = NULL;
206 cur->trie->parent = trie;
207 cur->trie->next = NULL;
208 cur->trie->fail = NULL;
209 cur->trie->depth = trie->depth + 1;
210 cur->trie->shift = 0;
211 cur->label = label;
212 cur->balance = 0;
213
214 /* Install the new tree node in its parent. */
215 if (dirs[--depth] == L)
216 links[depth]->llink = cur;
217 else
218 links[depth]->rlink = cur;
219
220 /* Back up the tree fixing the balance flags. */
221 while (depth && !links[depth]->balance)
222 {
223 if (dirs[depth] == L)
224 --links[depth]->balance;
225 else
226 ++links[depth]->balance;
227 --depth;
228 }
229
230 /* Rebalance the tree by pointer rotations if necessary. */
231 if (depth && ((dirs[depth] == L && --links[depth]->balance)
232 || (dirs[depth] == R && ++links[depth]->balance)))
233 {
234 struct tree *t, *r, *l, *rl, *lr;
235
236 switch (links[depth]->balance)
237 {
238 case (char) -2:
239 switch (dirs[depth + 1])
240 {
241 case L:
242 r = links[depth], t = r->llink, rl = t->rlink;
243 t->rlink = r, r->llink = rl;
244 t->balance = r->balance = 0;
245 break;
246 case R:
247 r = links[depth], l = r->llink, t = l->rlink;
248 rl = t->rlink, lr = t->llink;
249 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
250 l->balance = t->balance != 1 ? 0 : -1;
251 r->balance = t->balance != (char) -1 ? 0 : 1;
252 t->balance = 0;
253 break;
254 default:
255 abort ();
256 }
257 break;
258 case 2:
259 switch (dirs[depth + 1])
260 {
261 case R:
262 l = links[depth], t = l->rlink, lr = t->llink;
263 t->llink = l, l->rlink = lr;
264 t->balance = l->balance = 0;
265 break;
266 case L:
267 l = links[depth], r = l->rlink, t = r->llink;
268 lr = t->llink, rl = t->rlink;
269 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
270 l->balance = t->balance != 1 ? 0 : -1;
271 r->balance = t->balance != (char) -1 ? 0 : 1;
272 t->balance = 0;
273 break;
274 default:
275 abort ();
276 }
277 break;
278 default:
279 abort ();
280 }
281
282 if (dirs[depth - 1] == L)
283 links[depth - 1]->llink = t;
284 else
285 links[depth - 1]->rlink = t;
286 }
287 }
288
289 trie = cur->trie;
290 }
291
292 /* Mark the node finally reached as accepting, encoding the
293 index number of this word in the keyword set so far. */
294 if (!trie->accepting)
295 trie->accepting = 2 * kwset->words + 1;
296 ++kwset->words;
297
298 /* Keep track of the longest and shortest string of the keyword set. */
299 if (trie->depth < kwset->mind)
300 kwset->mind = trie->depth;
301 }
302
303 idx_t
304 kwswords (kwset_t kwset)
305 {
306 return kwset->words;
307 }
308
309 /* Enqueue the trie nodes referenced from the given tree in the
310 given queue. */
311 static void
312 enqueue (struct tree *tree, struct trie **last)
313 {
314 if (!tree)
315 return;
316 enqueue (tree->llink, last);
317 enqueue (tree->rlink, last);
318 (*last) = (*last)->next = tree->trie;
319 }
320
321 /* Compute the Aho-Corasick failure function for the trie nodes referenced
322 from the given tree, given the failure function for their parent as
323 well as a last resort failure node. */
324 static void
325 treefails (struct tree const *tree, struct trie const *fail,
326 struct trie *recourse, bool reverse)
327 {
328 struct tree *cur;
329
330 if (!tree)
331 return;
332
333 treefails (tree->llink, fail, recourse, reverse);
334 treefails (tree->rlink, fail, recourse, reverse);
335
336 /* Find, in the chain of fails going back to the root, the first
337 node that has a descendant on the current label. */
338 while (fail)
339 {
340 cur = fail->links;
341 while (cur && tree->label != cur->label)
342 if (tree->label < cur->label)
343 cur = cur->llink;
344 else
345 cur = cur->rlink;
346 if (cur)
347 {
348 tree->trie->fail = cur->trie;
349 if (!reverse && cur->trie->accepting && !tree->trie->accepting)
350 tree->trie->accepting = -1;
351 return;
352 }
353 fail = fail->fail;
354 }
355
356 tree->trie->fail = recourse;
357 }
358
359 /* Set delta entries for the links of the given tree such that
360 the preexisting delta value is larger than the current depth. */
361 static void
362 treedelta (struct tree const *tree, idx_t depth, unsigned char delta[])
363 {
364 if (!tree)
365 return;
366 treedelta (tree->llink, depth, delta);
367 treedelta (tree->rlink, depth, delta);
368 if (depth < delta[tree->label])
369 delta[tree->label] = depth;
370 }
371
372 /* Return true if A has every label in B. */
373 static bool _GL_ATTRIBUTE_PURE
374 hasevery (struct tree const *a, struct tree const *b)
375 {
376 if (!b)
377 return true;
378 if (!hasevery (a, b->llink))
379 return false;
380 if (!hasevery (a, b->rlink))
381 return false;
382 while (a && b->label != a->label)
383 if (b->label < a->label)
384 a = a->llink;
385 else
386 a = a->rlink;
387 return !!a;
388 }
389
390 /* Compute a vector, indexed by character code, of the trie nodes
391 referenced from the given tree. */
392 static void
393 treenext (struct tree const *tree, struct trie *next[])
394 {
395 if (!tree)
396 return;
397 treenext (tree->llink, next);
398 treenext (tree->rlink, next);
399 next[tree->label] = tree->trie;
400 }
401
402 /* Prepare a built keyword set for use. */
403 void
404 kwsprep (kwset_t kwset)
405 {
406 char const *trans = kwset->trans;
407 unsigned char deltabuf[NCHAR];
408 unsigned char *delta = trans ? deltabuf : kwset->delta;
409 struct trie *curr, *last;
410
411 /* Use Boyer-Moore if just one pattern, Aho-Corasick otherwise. */
412 bool reverse = kwset->words == 1;
413
414 if (reverse)
415 {
416 kwset_t new_kwset;
417
418 /* Enqueue the immediate descendants in the level order queue. */
419 for (curr = last = kwset->trie; curr; curr = curr->next)
420 enqueue (curr->links, &last);
421
422 /* Looking for just one string. Extract it from the trie. */
423 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
424 curr = kwset->trie;
425 for (idx_t i = 0; i < kwset->mind; i++)
426 {
427 kwset->target[i] = curr->links->label;
428 curr = curr->next;
429 }
430
431 new_kwset = kwsalloc (kwset->trans);
432 new_kwset->kwsexec = bmexec;
433 kwsincr (new_kwset, kwset->target, kwset->mind);
434 obstack_free (&kwset->obstack, NULL);
435 *kwset = *new_kwset;
436 free (new_kwset);
437 }
438
439 /* Initial values for the delta table; will be changed later. The
440 delta entry for a given character is the smallest depth of any
441 node at which an outgoing edge is labeled by that character. */
442 memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
443
444 /* Traverse the nodes of the trie in level order, simultaneously
445 computing the delta table, failure function, and shift function. */
446 for (curr = last = kwset->trie; curr; curr = curr->next)
447 {
448 /* Enqueue the immediate descendants in the level order queue. */
449 enqueue (curr->links, &last);
450
451 /* Update the delta table for the descendants of this node. */
452 treedelta (curr->links, curr->depth, delta);
453
454 /* Compute the failure function for the descendants of this node. */
455 treefails (curr->links, curr->fail, kwset->trie, reverse);
456
457 if (reverse)
458 {
459 curr->shift = kwset->mind;
460 curr->maxshift = kwset->mind;
461
462 /* Update the shifts at each node in the current node's chain
463 of fails back to the root. */
464 struct trie *fail;
465 for (fail = curr->fail; fail; fail = fail->fail)
466 {
467 /* If the current node has some outgoing edge that the fail
468 doesn't, then the shift at the fail should be no larger
469 than the difference of their depths. */
470 if (!hasevery (fail->links, curr->links))
471 if (curr->depth - fail->depth < fail->shift)
472 fail->shift = curr->depth - fail->depth;
473
474 /* If the current node is accepting then the shift at the
475 fail and its descendants should be no larger than the
476 difference of their depths. */
477 if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
478 fail->maxshift = curr->depth - fail->depth;
479 }
480 }
481 }
482
483 if (reverse)
484 {
485 /* Traverse the trie in level order again, fixing up all nodes whose
486 shift exceeds their inherited maxshift. */
487 for (curr = kwset->trie->next; curr; curr = curr->next)
488 {
489 if (curr->maxshift > curr->parent->maxshift)
490 curr->maxshift = curr->parent->maxshift;
491 if (curr->shift > curr->maxshift)
492 curr->shift = curr->maxshift;
493 }
494 }
495
496 /* Create a vector, indexed by character code, of the outgoing links
497 from the root node. Accumulate GC1 and GC1HELP. */
498 struct trie *nextbuf[NCHAR];
499 struct trie **next = trans ? nextbuf : kwset->next;
500 memset (next, 0, sizeof nextbuf);
501 treenext (kwset->trie->links, next);
502 int gc1 = -2;
503 int gc1help = -1;
504 for (int i = 0; i < NCHAR; i++)
505 {
506 int ti = i;
507 if (trans)
508 {
509 ti = U(trans[i]);
510 kwset->next[i] = next[ti];
511 }
512 if (kwset->next[i])
513 {
514 if (gc1 < -1)
515 {
516 gc1 = ti;
517 gc1help = i;
518 }
519 else if (gc1 == ti)
520 gc1help = gc1help == ti ? i : -1;
521 else if (i == ti && gc1 == gc1help)
522 gc1help = i;
523 else
524 gc1 = -1;
525 }
526 }
527 kwset->gc1 = gc1;
528 kwset->gc1help = gc1help;
529
530 if (reverse)
531 {
532 /* Looking for just one string. Extract it from the trie. */
533 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
534 curr = kwset->trie;
535 for (idx_t i = kwset->mind; 0 < i; i--)
536 {
537 kwset->target[i - 1] = curr->links->label;
538 curr = curr->next;
539 }
540
541 if (kwset->mind > 1)
542 {
543 /* Looking for the delta2 shift that might be made after a
544 backwards match has failed. Extract it from the trie. */
545 kwset->shift
546 = obstack_alloc (&kwset->obstack,
547 sizeof *kwset->shift * (kwset->mind - 1));
548 curr = kwset->trie->next;
549 for (idx_t i = 0; i < kwset->mind - 1; i++)
550 {
551 kwset->shift[i] = curr->shift;
552 curr = curr->next;
553 }
554
555 /* The penultimate byte. */
556 kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
557 }
558 }
559
560 /* Fix things up for any translation table. */
561 if (trans)
562 for (int i = 0; i < NCHAR; ++i)
563 kwset->delta[i] = delta[U(trans[i])];
564 }
565
566 /* Delta2 portion of a Boyer-Moore search. *TP is the string text
567 pointer; it is updated in place. EP is the end of the string text,
568 and SP the end of the pattern. LEN is the pattern length; it must
569 be at least 2. TRANS, if nonnull, is the input translation table.
570 GC1 and GC2 are the last and second-from last bytes of the pattern,
571 transliterated by TRANS; the caller precomputes them for
572 efficiency. If D1 is nonnull, it is a delta1 table for shifting *TP
573 when failing. KWSET->shift says how much to shift. */
574 static inline bool
575 bm_delta2_search (char const **tpp, char const *ep, char const *sp,
576 idx_t len,
577 char const *trans, char gc1, char gc2,
578 unsigned char const *d1, kwset_t kwset)
579 {
580 char const *tp = *tpp;
581 idx_t d = len, skip = 0;
582
583 while (true)
584 {
585 idx_t i = 2;
586 if (tr (trans, tp[-2]) == gc2)
587 {
588 while (++i <= d)
589 if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
590 break;
591 if (i > d)
592 {
593 for (i = d + skip + 1; i <= len; ++i)
594 if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
595 break;
596 if (i > len)
597 {
598 *tpp = tp - len;
599 return true;
600 }
601 }
602 }
603
604 tp += d = kwset->shift[i - 2];
605 if (tp > ep)
606 break;
607 if (tr (trans, tp[-1]) != gc1)
608 {
609 if (d1)
610 tp += d1[U(tp[-1])];
611 break;
612 }
613 skip = i - 1;
614 }
615
616 *tpp = tp;
617 return false;
618 }
619
620 /* Return the address of the first byte in the buffer S (of size N)
621 that matches the terminal byte specified by KWSET, or NULL if there
622 is no match. KWSET->gc1 should be nonnegative. */
623 static char const *
624 memchr_kwset (char const *s, idx_t n, kwset_t kwset)
625 {
626 char const *slim = s + n;
627 if (kwset->gc1help < 0)
628 {
629 for (; s < slim; s++)
630 if (kwset->next[U(*s)])
631 return s;
632 }
633 else
634 {
635 int small_heuristic = 2;
636 idx_t small_bytes = small_heuristic * sizeof (unsigned long int);
637 while (s < slim)
638 {
639 if (kwset->next[U(*s)])
640 return s;
641 s++;
642 if ((uintptr_t) s % small_bytes == 0)
643 return memchr2 (s, kwset->gc1, kwset->gc1help, slim - s);
644 }
645 }
646 return NULL;
647 }
648
649 /* Fast Boyer-Moore search (inlinable version). */
650 static inline ptrdiff_t _GL_ATTRIBUTE_PURE
651 bmexec_trans (kwset_t kwset, char const *text, idx_t size)
652 {
653 assume (0 <= size);
654 unsigned char const *d1;
655 char const *ep, *sp, *tp;
656 int d;
657 idx_t len = kwset->mind;
658 char const *trans = kwset->trans;
659
660 if (len == 0)
661 return 0;
662 if (len > size)
663 return -1;
664 if (len == 1)
665 {
666 tp = memchr_kwset (text, size, kwset);
667 return tp ? tp - text : -1;
668 }
669
670 d1 = kwset->delta;
671 sp = kwset->target + len;
672 tp = text + len;
673 char gc1 = kwset->gc1;
674 char gc2 = kwset->gc2;
675
676 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
677 idx_t len12;
678 if (!ckd_mul (&len12, len, 12) && len12 < size)
679 /* 11 is not a bug, the initial offset happens only once. */
680 for (ep = text + size - 11 * len; tp <= ep; )
681 {
682 char const *tp0 = tp;
683 d = d1[U(tp[-1])], tp += d;
684 d = d1[U(tp[-1])], tp += d;
685 if (d != 0)
686 {
687 d = d1[U(tp[-1])], tp += d;
688 d = d1[U(tp[-1])], tp += d;
689 d = d1[U(tp[-1])], tp += d;
690 if (d != 0)
691 {
692 d = d1[U(tp[-1])], tp += d;
693 d = d1[U(tp[-1])], tp += d;
694 d = d1[U(tp[-1])], tp += d;
695 if (d != 0)
696 {
697 d = d1[U(tp[-1])], tp += d;
698 d = d1[U(tp[-1])], tp += d;
699
700 /* As a heuristic, prefer memchr to seeking by
701 delta1 when the latter doesn't advance much. */
702 int advance_heuristic = 16 * sizeof (long);
703 if (advance_heuristic <= tp - tp0)
704 continue;
705 tp--;
706 tp = memchr_kwset (tp, text + size - tp, kwset);
707 if (! tp)
708 return -1;
709 tp++;
710 if (ep <= tp)
711 break;
712 }
713 }
714 }
715 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
716 return tp - text;
717 }
718
719 /* Now only a few characters are left to search. Carefully avoid
720 ever producing an out-of-bounds pointer. */
721 ep = text + size;
722 d = d1[U(tp[-1])];
723 while (d <= ep - tp)
724 {
725 d = d1[U((tp += d)[-1])];
726 if (d != 0)
727 continue;
728 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
729 return tp - text;
730 }
731
732 return -1;
733 }
734
735 /* Fast Boyer-Moore search. */
736 static ptrdiff_t
737 bmexec (kwset_t kwset, char const *text, idx_t size,
738 struct kwsmatch *kwsmatch, bool longest)
739 {
740 /* Help the compiler inline in two ways, depending on whether
741 kwset->trans is null. */
742 ptrdiff_t ret = (IGNORE_DUPLICATE_BRANCH_WARNING
743 (kwset->trans
744 ? bmexec_trans (kwset, text, size)
745 : bmexec_trans (kwset, text, size)));
746 kwsmatch->index = 0;
747 kwsmatch->offset = ret;
748 kwsmatch->size = kwset->mind;
749 return ret;
750 }
751
752 /* Hairy multiple string search with the Aho-Corasick algorithm.
753 (inlinable version) */
754 static inline ptrdiff_t
755 acexec_trans (kwset_t kwset, char const *text, idx_t len,
756 struct kwsmatch *kwsmatch, bool longest)
757 {
758 struct trie const *trie, *accept;
759 char const *tp, *left, *lim;
760 struct tree const *tree;
761 char const *trans;
762
763 /* Initialize register copies and look for easy ways out. */
764 if (len < kwset->mind)
765 return -1;
766 trans = kwset->trans;
767 trie = kwset->trie;
768 lim = text + len;
769 tp = text;
770
771 if (!trie->accepting)
772 {
773 unsigned char c;
774 int gc1 = kwset->gc1;
775
776 while (true)
777 {
778 if (gc1 < 0)
779 {
780 while (! (trie = kwset->next[c = tr (trans, *tp++)]))
781 if (tp >= lim)
782 return -1;
783 }
784 else
785 {
786 tp = memchr_kwset (tp, lim - tp, kwset);
787 if (!tp)
788 return -1;
789 c = tr (trans, *tp++);
790 trie = kwset->next[c];
791 }
792
793 while (true)
794 {
795 if (trie->accepting)
796 goto match;
797 if (tp >= lim)
798 return -1;
799 c = tr (trans, *tp++);
800
801 for (tree = trie->links; c != tree->label; )
802 {
803 tree = c < tree->label ? tree->llink : tree->rlink;
804 if (! tree)
805 {
806 trie = trie->fail;
807 if (!trie)
808 {
809 trie = kwset->next[c];
810 if (trie)
811 goto have_trie;
812 if (tp >= lim)
813 return -1;
814 goto next_c;
815 }
816 if (trie->accepting)
817 {
818 --tp;
819 goto match;
820 }
821 tree = trie->links;
822 }
823 }
824 trie = tree->trie;
825 have_trie:;
826 }
827 next_c:;
828 }
829 }
830
831 match:
832 accept = trie;
833 while (accept->accepting < 0)
834 accept = accept->fail;
835 left = tp - accept->depth;
836
837 /* Try left-most longest match. */
838 if (longest)
839 {
840 while (tp < lim)
841 {
842 struct trie const *accept1;
843 char const *left1;
844 unsigned char c = tr (trans, *tp++);
845
846 do
847 {
848 tree = trie->links;
849 while (tree && c != tree->label)
850 tree = c < tree->label ? tree->llink : tree->rlink;
851 }
852 while (!tree && (trie = trie->fail) && accept->depth <= trie->depth);
853
854 if (!tree)
855 break;
856 trie = tree->trie;
857 if (trie->accepting)
858 {
859 accept1 = trie;
860 while (accept1->accepting < 0)
861 accept1 = accept1->fail;
862 left1 = tp - accept1->depth;
863 if (left1 <= left)
864 {
865 left = left1;
866 accept = accept1;
867 }
868 }
869 }
870 }
871
872 kwsmatch->index = accept->accepting >> 1;
873 kwsmatch->offset = left - text;
874 kwsmatch->size = accept->depth;
875
876 return left - text;
877 }
878
879 /* Hairy multiple string search with Aho-Corasick algorithm. */
880 static ptrdiff_t
881 acexec (kwset_t kwset, char const *text, idx_t size,
882 struct kwsmatch *kwsmatch, bool longest)
883 {
884 assume (0 <= size);
885 /* Help the compiler inline in two ways, depending on whether
886 kwset->trans is null. */
887 return (IGNORE_DUPLICATE_BRANCH_WARNING
888 (kwset->trans
889 ? acexec_trans (kwset, text, size, kwsmatch, longest)
890 : acexec_trans (kwset, text, size, kwsmatch, longest)));
891 }
892
893 /* Find the first instance of a KWSET member in TEXT, which has SIZE bytes.
894 Return the offset (into TEXT) of the first byte of the matching substring,
895 or -1 if no match is found. Upon a match, store details in
896 *KWSMATCH: index of matched keyword, start offset (same as the return
897 value), and length. If LONGEST, find the longest match; otherwise
898 any match will do. */
899 ptrdiff_t
900 kwsexec (kwset_t kwset, char const *text, idx_t size,
901 struct kwsmatch *kwsmatch, bool longest)
902 {
903 return kwset->kwsexec (kwset, text, size, kwsmatch, longest);
904 }
905
906 /* Free the components of the given keyword set. */
907 void
908 kwsfree (kwset_t kwset)
909 {
910 obstack_free (&kwset->obstack, NULL);
911 free (kwset);
912 }