1 /* search.c -- searching large bodies of text.
2
3 Copyright 1993-2023 Free Software Foundation, Inc.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 Originally written by Brian Fox. */
19
20 #include "info.h"
21 #include <regex.h>
22
23 #include "session.h"
24 #include "scan.h"
25 #include "search.h"
26
27
28 /* **************************************************************** */
29 /* */
30 /* The Actual Searching Functions */
31 /* */
32 /* **************************************************************** */
33
34 /* Search forwards or backwards for the text delimited by BINDING.
35 The search is forwards if BINDING->start is greater than BINDING->end. */
36 enum search_result
37 search (char *string, SEARCH_BINDING *binding, long *poff)
38 {
39 enum search_result result;
40
41 /* If the search is backwards, then search backwards, otherwise forwards. */
42 if (binding->start > binding->end)
43 result = search_backward (string, binding, poff);
44 else
45 result = search_forward (string, binding, poff);
46
47 return result;
48 }
49
50 /* Expand \n and \t in regexp to newlines and tabs */
51 static char *
52 regexp_expand_newlines_and_tabs (char *regexp)
53 {
54 char *unescaped_regexp = xmalloc (1 + strlen (regexp));
55 char *p, *q;
56
57 for (p = regexp, q = unescaped_regexp; *p != '\0'; p++, q++)
58 {
59 if (*p == '\\')
60 switch(*++p)
61 {
62 case 'n':
63 *q = '\n';
64 break;
65 case 't':
66 *q = '\t';
67 break;
68 case '\0':
69 *q = '\\';
70 p--;
71 break;
72 default:
73 *q++ = '\\';
74 *q = *p;
75 break;
76 }
77 else
78 *q = *p;
79 }
80 *q = '\0';
81
82 return unescaped_regexp;
83 }
84
85 /* Escape any special characters in SEARCH_STRING. */
86 static char *
87 regexp_escape_string (char *search_string)
88 {
89 char *special_chars = "\\[]^$.*(){}|+?";
90 char *p, *q;
91
92 char *escaped_string = xmalloc (strlen (search_string) * 2 + 1);
93
94 for (p = search_string, q = escaped_string; *p != '\0'; )
95 {
96 if (strchr (special_chars, *p))
97 {
98 *q++ = '\\';
99 }
100 *q++ = *p++;
101 }
102
103 *q = '\0';
104
105 return escaped_string;
106 }
107
108
109 static void
110 extend_matches (MATCH_STATE *state)
111 {
112 regmatch_t *matches = state->matches;
113 size_t match_alloc = state->match_alloc;
114 size_t match_count = state->match_count;
115 char *buffer = state->buffer;
116 size_t buflen = state->buflen;
117
118 regoff_t offset = 0;
119 char saved_char;
120 size_t initial_match_count = match_count;
121
122 if (state->finished)
123 return;
124
125 saved_char = buffer[buflen];
126 buffer[buflen] = '\0';
127
128 if (match_count > 0)
129 {
130 offset = matches[match_count - 1].rm_eo;
131
132 /* move past zero-length match */
133 if (offset == matches[match_count - 1].rm_so)
134 offset++;
135 }
136
137 while (offset < buflen && match_count < initial_match_count + 5)
138 {
139 int result = 0;
140 regmatch_t m;
141
142 result = regexec (&state->regex, &buffer[offset], 1, &m, REG_NOTBOL);
143 if (result == 0)
144 {
145 if (match_count == match_alloc)
146 {
147 /* The match list is full. */
148 if (match_alloc == 0)
149 match_alloc = 50;
150 matches = x2nrealloc
151 (matches, &match_alloc, sizeof matches[0]);
152 }
153
154 matches[match_count] = m;
155 matches[match_count].rm_so += offset;
156 matches[match_count].rm_eo += offset;
157 offset = matches[match_count++].rm_eo;
158
159 if (m.rm_eo == 0)
160 offset++; /* Avoid finding match again for a pattern of "$". */
161 }
162 else
163 {
164 state->finished = 1;
165 break;
166 }
167 }
168 buffer[buflen] = saved_char;
169
170 state->matches = matches;
171 state->match_alloc = match_alloc;
172 state->match_count = match_count;
173 }
174
175 /* Search BUFFER for REGEXP. If matches are found, pass back the list of
176 matches in MATCH_STATE. */
177 enum search_result
178 regexp_search (char *regexp, int is_literal, int is_insensitive,
179 char *buffer, size_t buflen,
180 MATCH_STATE *match_state)
181 {
182 regex_t preg; /* Compiled pattern buffer for regexp. */
183 int result;
184 char *regexp_str;
185
186 if (!is_literal)
187 regexp_str = regexp_expand_newlines_and_tabs (regexp);
188 else
189 regexp_str = regexp_escape_string (regexp);
190
191 result = regcomp (&preg, regexp_str,
192 REG_EXTENDED | REG_NEWLINE
193 | (is_insensitive ? REG_ICASE : 0));
194 free (regexp_str);
195
196 if (result != 0)
197 {
198 int size = regerror (result, &preg, NULL, 0);
199 char *buf = xmalloc (size);
200 regerror (result, &preg, buf, size);
201 info_error (_("regexp error: %s"), buf);
202 free (buf);
203 return search_invalid;
204 }
205
206 match_state->matches = 0;
207 match_state->match_count = 0;
208 match_state->match_alloc = 0;
209 match_state->finished = 0;
210 match_state->regex = preg;
211 match_state->buffer = buffer;
212 match_state->buflen = buflen;
213
214 extend_matches (match_state);
215
216 if (match_state->match_count == 0)
217 {
218 free_matches (match_state);
219 return search_not_found;
220 }
221 else
222 return search_success;
223 }
224
225 /* Search forwards for STRING through the text delimited in BINDING. */
226 enum search_result
227 search_forward (char *string, SEARCH_BINDING *binding, long *poff)
228 {
229 register int c, i, len;
230 register char *buff, *end;
231 char *alternate = NULL;
232
233 len = strlen (string);
234
235 /* We match characters in the search buffer against STRING and ALTERNATE.
236 ALTERNATE is a case reversed version of STRING; this is cheaper than
237 case folding each character before comparison. Alternate is only
238 used if the case folding bit is turned on in the passed BINDING. */
239
240 if (binding->flags & S_FoldCase)
241 {
242 alternate = xstrdup (string);
243
244 for (i = 0; i < len; i++)
245 {
246 if (islower (alternate[i]))
247 alternate[i] = toupper (alternate[i]);
248 else if (isupper (alternate[i]))
249 alternate[i] = tolower (alternate[i]);
250 }
251 }
252
253 buff = binding->buffer + binding->start;
254 end = binding->buffer + binding->end + 1;
255
256 while (buff < (end - len))
257 {
258 for (i = 0; i < len; i++)
259 {
260 c = buff[i];
261
262 if ((c != string[i]) && (!alternate || c != alternate[i]))
263 break;
264 }
265
266 if (!string[i])
267 {
268 if (alternate)
269 free (alternate);
270 if (binding->flags & S_SkipDest)
271 buff += len;
272 *poff = buff - binding->buffer;
273 return search_success;
274 }
275
276 buff++;
277 }
278
279 if (alternate)
280 free (alternate);
281
282 return search_not_found;
283 }
284
285 /* Search for STRING backwards through the text delimited in BINDING. */
286 enum search_result
287 search_backward (char *input_string, SEARCH_BINDING *binding, long *poff)
288 {
289 register int c, i, len;
290 register char *buff, *end;
291 char *string;
292 char *alternate = NULL;
293
294 len = strlen (input_string);
295
296 /* Reverse the characters in the search string. */
297 string = xmalloc (1 + len);
298 for (c = 0, i = len - 1; input_string[c]; c++, i--)
299 string[i] = input_string[c];
300
301 string[c] = '\0';
302
303 /* We match characters in the search buffer against STRING and ALTERNATE.
304 ALTERNATE is a case reversed version of STRING; this is cheaper than
305 case folding each character before comparison. ALTERNATE is only
306 used if the case folding bit is turned on in the passed BINDING. */
307
308 if (binding->flags & S_FoldCase)
309 {
310 alternate = xstrdup (string);
311
312 for (i = 0; i < len; i++)
313 {
314 if (islower (alternate[i]))
315 alternate[i] = toupper (alternate[i]);
316 else if (isupper (alternate[i]))
317 alternate[i] = tolower (alternate[i]);
318 }
319 }
320
321 buff = binding->buffer + binding->start - 1;
322 end = binding->buffer + binding->end;
323
324 while (buff > (end + len))
325 {
326 for (i = 0; i < len; i++)
327 {
328 c = *(buff - i);
329
330 if (c != string[i] && (!alternate || c != alternate[i]))
331 break;
332 }
333
334 if (!string[i])
335 {
336 free (string);
337 if (alternate)
338 free (alternate);
339
340 if (binding->flags & S_SkipDest)
341 buff -= len;
342 *poff = 1 + buff - binding->buffer;
343 return search_success;
344 }
345
346 buff--;
347 }
348
349 free (string);
350 if (alternate)
351 free (alternate);
352
353 return search_not_found;
354 }
355
356 /* Find STRING in LINE, returning the offset of the end of the string.
357 Return an offset of -1 if STRING does not appear in LINE. The search
358 is bound by the end of the line (i.e., either NEWLINE or 0). */
359 int
360 string_in_line (char *string, char *line)
361 {
362 register int end;
363 SEARCH_BINDING binding;
364 long offset;
365
366 /* Find the end of the line. */
367 for (end = 0; line[end] && line[end] != '\n'; end++);
368
369 /* Search for STRING within these confines. */
370 binding.buffer = line;
371 binding.start = 0;
372 binding.end = end;
373 binding.flags = S_FoldCase | S_SkipDest;
374
375 if (search_forward (string, &binding, &offset) == search_success)
376 return offset;
377 return -1;
378 }
379
380 /* Return non-zero if STRING is the first text to appear at BINDING. */
381 int
382 looking_at (char *string, SEARCH_BINDING *binding)
383 {
384 long search_end;
385
386 if (search (string, binding, &search_end) != search_success)
387 return 0;
388
389 /* If the string was not found, SEARCH_END is -1. If the string was found,
390 but not right away, SEARCH_END is != binding->start. Otherwise, the
391 string was found at binding->start. */
392 return search_end == binding->start;
393 }
394
395 /* Return non-zero if POINTER is looking at the text at STRING before an
396 end-of-line. */
397 int
398 looking_at_line (char *string, char *pointer)
399 {
400 int len;
401
402 len = strlen (string);
403 if (strncasecmp (pointer, string, len) != 0)
404 return 0;
405
406 pointer += len;
407 if (*pointer == '\n' || !strncmp (pointer, "\r\n", 2)
408 || *pointer == '\0')
409 return 1;
410 return 0;
411 }
412
413 /* **************************************************************** */
414 /* */
415 /* Accessing matches */
416 /* */
417 /* **************************************************************** */
418 /* Search forwards or backwards for entries in MATCHES that start within
419 the search area. The search is forwards if DIR > 0, backward if
420 DIR < 0. Return index of match in *MATCH_INDEX. */
421 enum search_result
422 match_in_match_list (MATCH_STATE *match_state,
423 long start, long end, int dir,
424 int *match_index)
425 {
426 regmatch_t *matches = match_state->matches;
427 size_t match_count = match_state->match_count;
428
429 int i;
430 int index = -1;
431
432 for (i = 0; i < match_count || !match_state->finished; i++)
433 {
434 /* get more matches as we need them */
435 if (i == match_count)
436 {
437 extend_matches (match_state);
438 matches = match_state->matches;
439 match_count = match_state->match_count;
440
441 if (i == match_count)
442 break;
443 }
444
445 if (matches[i].rm_so >= end)
446 break; /* No more matches found in search area. */
447
448 if (matches[i].rm_so >= start)
449 {
450 index = i;
451 if (dir > 0)
452 {
453 *match_index = index;
454 return search_success;
455 }
456 }
457 }
458
459 if (index != -1)
460 {
461 *match_index = index;
462 return search_success;
463 }
464
465 /* not found */
466 return search_not_found;
467 }
468
469 /* Return match INDEX in STATE. INDEX must be a valid index. */
470 regmatch_t
471 match_by_index (MATCH_STATE *state, int index)
472 {
473 while (state->match_alloc <= index)
474 extend_matches (state);
475 return state->matches[index];
476 }
477
478 /* Free and clear all data in STATE. */
479 void
480 free_matches (MATCH_STATE *state)
481 {
482 free (state->matches);
483 state->matches = 0;
484 state->match_count = state->match_alloc = state->finished = 0;
485 state->buffer = 0; /* do not free as it is kept elsewhere */
486 state->buflen = 0;
487 regfree (&state->regex);
488 }
489
490 int
491 matches_ready (MATCH_STATE *state)
492 {
493 return state->matches ? 1 : 0;
494 }
495
496 /* Starting at index *MATCH_INDEX, decide if we are inside a match
497 in MATCHES at offset OFF. The matches are assumed not to overlap
498 and to be in order. */
499 void
500 decide_if_in_match (long off, int *in_match,
501 MATCH_STATE *matches, size_t *match_index)
502 {
503 size_t i = *match_index;
504 int m = *in_match;
505
506 for (; !at_end_of_matches (matches, i); i++)
507 {
508 if (match_by_index (matches, i).rm_so > off)
509 break;
510
511 m = 1;
512
513 if (match_by_index (matches, i).rm_eo > off)
514 break;
515
516 m = 0;
517 }
518
519 *match_index = i;
520 *in_match = m;
521 }
522
523 /* Used for iterating through a match list. */
524 int
525 at_end_of_matches (MATCH_STATE *state, int index)
526 {
527 if (index < state->match_count)
528 return 0;
529 else
530 {
531 if (!state->finished)
532 extend_matches (state);
533
534 if (state->finished)
535 return (state->match_count == index) ? 1 : 0;
536 else
537 return 0;
538 }
539 }
540
541
542
543 /* **************************************************************** */
544 /* */
545 /* Small String Searches */
546 /* */
547 /* **************************************************************** */
548
549 /* Function names that start with "skip" are passed a string, and return
550 an offset from the start of that string. Function names that start
551 with "find" are passed a SEARCH_BINDING, and return an absolute position
552 marker of the item being searched for. "Find" functions return a value
553 of -1 if the item being looked for couldn't be found. */
554
555 /* Return the index of the first non-whitespace character in STRING. */
556 int
557 skip_whitespace (char *string)
558 {
559 register int i;
560
561 for (i = 0; string && whitespace (string[i]); i++);
562 return i;
563 }
564
565 /* Return the index of the first non-whitespace or newline character in
566 STRING. */
567 int
568 skip_whitespace_and_newlines (char *string)
569 {
570 register int i;
571
572 for (i = 0; string && whitespace_or_newline (string[i]); i++);
573 return i;
574 }
575
576 /* Return the index of the first whitespace character in STRING. */
577 int
578 skip_non_whitespace (char *string)
579 {
580 register int i;
581
582 for (i = 0; string && string[i] && !whitespace (string[i]); i++);
583 return i;
584 }
585
586 /* **************************************************************** */
587 /* */
588 /* Searching FILE_BUFFER's */
589 /* */
590 /* **************************************************************** */
591
592 /* Return the absolute position of the first occurence of a node separator
593 starting in BINDING->buffer between BINDING->start and BINDING->end
594 inclusive. Return -1 if no node separator was found. */
595 long
596 find_node_separator (SEARCH_BINDING *binding)
597 {
598 register long i;
599 char *body;
600 int dir;
601
602 body = binding->buffer;
603 dir = binding->start < binding->end ? 1 : -1;
604
605 /* A node is started by [^L]^_[^L][\r]\n. That is to say, the C-l's are
606 optional, but the US and NEWLINE are not. This separator holds
607 true for all separated elements in an Info file, including the tags
608 table (if present) and the indirect tags table (if present). */
609 i = binding->start;
610 while (1)
611 {
612 /* Note that bytes are read in order from the buffer, so if at any
613 point a null byte is encountered signifying the end of the buffer,
614 no more bytes will be read past that point. */
615 if (body[i] == INFO_COOKIE)
616 {
617 int j = i + 1;
618
619 if (body[j] == INFO_FF)
620 j++;
621 if (body[j] == '\r')
622 j++;
623
624 if (body[j] == '\n')
625 return i;
626 }
627
628 if (i == binding->end)
629 break;
630 i += dir;
631 }
632
633 return -1;
634 }
635
636 /* Return the length of the node separator characters that BODY is currently
637 pointing at. If it's not pointing at a node separator, return 0. */
638 int
639 skip_node_separator (char *body)
640 {
641 register int i;
642
643 i = 0;
644
645 if (body[i] == INFO_FF)
646 i++;
647
648 if (body[i++] != INFO_COOKIE)
649 return 0;
650
651 if (body[i] == INFO_FF)
652 i++;
653
654 if (body[i] == '\r')
655 i++;
656
657 if (body[i++] != '\n')
658 return 0;
659
660 return i;
661 }
662
663 /* Return the absolute position of the beginning of a section in this file
664 whose first line is LABEL, starting the search at binding->start. */
665 long
666 find_file_section (SEARCH_BINDING *binding, char *label)
667 {
668 SEARCH_BINDING s;
669 long position;
670 int dir;
671
672 s.buffer = binding->buffer;
673 s.start = binding->start;
674 s.end = binding->end;
675 s.flags = S_FoldCase;
676 dir = binding->start < binding->end ? 1 : -1;
677
678 while ((position = find_node_separator (&s)) != -1 )
679 {
680 long offset = position;
681 offset += skip_node_separator (s.buffer + offset);
682 if (looking_at_line (label, s.buffer + offset))
683 return position;
684
685 if (dir > 0)
686 {
687 s.start = offset;
688 if (s.start >= s.end)
689 break;
690 }
691 else
692 {
693 s.start = position - 1;
694 if (s.start <= s.end)
695 break;
696 }
697 }
698 return -1;
699 }
700
701 /* Return the absolute position of the node named NODENAME in BINDING.
702 This is a brute force search, and we wish to avoid it when possible.
703 This function is called when a tag (indirect or otherwise) doesn't
704 really point to the right node. It returns the absolute position of
705 the separator preceding the node. */
706 long
707 find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
708 {
709 long position;
710 int offset;
711 SEARCH_BINDING s;
712
713 s.buffer = binding->buffer;
714 s.start = binding->start;
715 s.end = binding->end;
716 s.flags = 0;
717
718 while (s.start < s.end && (position = find_node_separator (&s)) != -1)
719 {
720 char *nodename_start;
721 char *read_nodename;
722 int found;
723
724 s.start = position;
725 s.start += skip_node_separator (s.buffer + s.start);
726
727 offset = string_in_line (INFO_NODE_LABEL, s.buffer + s.start);
728
729 if (offset == -1)
730 continue;
731
732 s.start += offset;
733 s.start += skip_whitespace (s.buffer + s.start);
734 nodename_start = s.buffer + s.start;
735 read_quoted_string (nodename_start, "\n\r\t,", 0, &read_nodename);
736 if (!read_nodename)
737 return -1;
738
739 found = !strcmp (read_nodename, nodename);
740 free (read_nodename);
741
742 if (found)
743 return position;
744 }
745 return -1;
746 }