(root)/
texinfo-7.1/
info/
search.c
       1  /* search.c -- searching large bodies of text.
       2  
       3     Copyright 1993-2023 Free Software Foundation, Inc.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <http://www.gnu.org/licenses/>.
      17  
      18     Originally written by Brian Fox. */
      19  
      20  #include "info.h"
      21  #include <regex.h>
      22  
      23  #include "session.h"
      24  #include "scan.h"
      25  #include "search.h"
      26  
      27  
      28  /* **************************************************************** */
      29  /*                                                                  */
      30  /*                 The Actual Searching Functions                   */
      31  /*                                                                  */
      32  /* **************************************************************** */
      33  
      34  /* Search forwards or backwards for the text delimited by BINDING.
      35     The search is forwards if BINDING->start is greater than BINDING->end. */
      36  enum search_result
      37  search (char *string, SEARCH_BINDING *binding, long *poff)
      38  {
      39    enum search_result result;
      40  
      41    /* If the search is backwards, then search backwards, otherwise forwards. */
      42    if (binding->start > binding->end)
      43      result = search_backward (string, binding, poff);
      44    else
      45      result = search_forward (string, binding, poff);
      46  
      47    return result;
      48  }
      49  
      50  /* Expand \n and \t in regexp to newlines and tabs */
      51  static char *
      52  regexp_expand_newlines_and_tabs (char *regexp)
      53  {
      54    char *unescaped_regexp = xmalloc (1 + strlen (regexp));
      55    char *p, *q;
      56  
      57    for (p = regexp, q = unescaped_regexp; *p != '\0'; p++, q++)
      58      {
      59        if (*p == '\\')
      60          switch(*++p)
      61            {
      62            case 'n':
      63              *q = '\n';
      64              break;
      65            case 't':
      66              *q = '\t';
      67              break;
      68            case '\0':
      69              *q = '\\';
      70              p--;
      71              break;
      72            default:
      73              *q++ = '\\';
      74              *q = *p;
      75              break;
      76            }
      77        else
      78          *q = *p;
      79      }
      80    *q = '\0';
      81  
      82    return unescaped_regexp;
      83  }
      84  
      85  /* Escape any special characters in SEARCH_STRING. */
      86  static char *
      87  regexp_escape_string (char *search_string)
      88  {
      89    char *special_chars = "\\[]^$.*(){}|+?";
      90    char *p, *q;
      91  
      92    char *escaped_string = xmalloc (strlen (search_string) * 2 + 1);
      93  
      94    for (p = search_string, q = escaped_string; *p != '\0'; )
      95      {
      96        if (strchr (special_chars, *p))
      97          {
      98            *q++ = '\\';
      99          }
     100        *q++ = *p++;
     101      }
     102  
     103    *q = '\0';
     104  
     105    return escaped_string;
     106  }
     107  
     108  
     109  static void
     110  extend_matches (MATCH_STATE *state)
     111  {
     112    regmatch_t *matches = state->matches;
     113    size_t match_alloc = state->match_alloc;
     114    size_t match_count = state->match_count;
     115    char *buffer = state->buffer;
     116    size_t buflen = state->buflen;
     117  
     118    regoff_t offset = 0;
     119    char saved_char;
     120    size_t initial_match_count = match_count;
     121  
     122    if (state->finished)
     123      return;
     124  
     125    saved_char = buffer[buflen];
     126    buffer[buflen] = '\0';
     127  
     128    if (match_count > 0)
     129      {
     130        offset = matches[match_count - 1].rm_eo;
     131  
     132        /* move past zero-length match */
     133        if (offset == matches[match_count - 1].rm_so)
     134          offset++;
     135      }
     136  
     137    while (offset < buflen && match_count < initial_match_count + 5)
     138      {
     139        int result = 0;
     140        regmatch_t m;
     141  
     142        result = regexec (&state->regex, &buffer[offset], 1, &m, REG_NOTBOL);
     143        if (result == 0)
     144          {
     145            if (match_count == match_alloc)
     146              {
     147                /* The match list is full. */
     148                if (match_alloc == 0)
     149                  match_alloc = 50;
     150                matches = x2nrealloc
     151                  (matches, &match_alloc, sizeof matches[0]);
     152              }
     153  
     154            matches[match_count] = m;
     155            matches[match_count].rm_so += offset;
     156            matches[match_count].rm_eo += offset;
     157            offset = matches[match_count++].rm_eo;
     158  
     159            if (m.rm_eo == 0)
     160              offset++; /* Avoid finding match again for a pattern of "$". */
     161          }
     162        else
     163          {
     164            state->finished = 1;
     165            break;
     166          }
     167      }
     168    buffer[buflen] = saved_char;
     169  
     170    state->matches = matches;
     171    state->match_alloc = match_alloc;
     172    state->match_count = match_count;
     173  }
     174  
     175  /* Search BUFFER for REGEXP.  If matches are found, pass back the list of 
     176     matches in MATCH_STATE. */
     177  enum search_result
     178  regexp_search (char *regexp, int is_literal, int is_insensitive,
     179                 char *buffer, size_t buflen,
     180                 MATCH_STATE *match_state)
     181  {
     182    regex_t preg; /* Compiled pattern buffer for regexp. */
     183    int result;
     184    char *regexp_str;
     185  
     186    if (!is_literal)
     187      regexp_str = regexp_expand_newlines_and_tabs (regexp);
     188    else
     189      regexp_str = regexp_escape_string (regexp);
     190  
     191    result = regcomp (&preg, regexp_str,
     192                      REG_EXTENDED | REG_NEWLINE
     193                      | (is_insensitive ? REG_ICASE : 0));
     194    free (regexp_str);
     195  
     196    if (result != 0)
     197      {
     198        int size = regerror (result, &preg, NULL, 0);
     199        char *buf = xmalloc (size);
     200        regerror (result, &preg, buf, size);
     201        info_error (_("regexp error: %s"), buf);
     202        free (buf);
     203        return search_invalid;
     204      }
     205  
     206    match_state->matches = 0;
     207    match_state->match_count = 0;
     208    match_state->match_alloc = 0;
     209    match_state->finished = 0;
     210    match_state->regex = preg;
     211    match_state->buffer = buffer;
     212    match_state->buflen = buflen;
     213  
     214    extend_matches (match_state);
     215  
     216    if (match_state->match_count == 0)
     217      {
     218        free_matches (match_state);
     219        return search_not_found;
     220      }
     221    else
     222      return search_success;
     223  }
     224  
     225  /* Search forwards for STRING through the text delimited in BINDING. */
     226  enum search_result
     227  search_forward (char *string, SEARCH_BINDING *binding, long *poff)
     228  {
     229    register int c, i, len;
     230    register char *buff, *end;
     231    char *alternate = NULL;
     232  
     233    len = strlen (string);
     234  
     235    /* We match characters in the search buffer against STRING and ALTERNATE.
     236       ALTERNATE is a case reversed version of STRING; this is cheaper than
     237       case folding each character before comparison.   Alternate is only
     238       used if the case folding bit is turned on in the passed BINDING. */
     239  
     240    if (binding->flags & S_FoldCase)
     241      {
     242        alternate = xstrdup (string);
     243  
     244        for (i = 0; i < len; i++)
     245          {
     246            if (islower (alternate[i]))
     247              alternate[i] = toupper (alternate[i]);
     248            else if (isupper (alternate[i]))
     249              alternate[i] = tolower (alternate[i]);
     250          }
     251      }
     252  
     253    buff = binding->buffer + binding->start;
     254    end = binding->buffer + binding->end + 1;
     255  
     256    while (buff < (end - len))
     257      {
     258        for (i = 0; i < len; i++)
     259          {
     260            c = buff[i];
     261  
     262            if ((c != string[i]) && (!alternate || c != alternate[i]))
     263              break;
     264          }
     265  
     266        if (!string[i])
     267          {
     268            if (alternate)
     269              free (alternate);
     270            if (binding->flags & S_SkipDest)
     271              buff += len;
     272            *poff = buff - binding->buffer;
     273  	  return search_success;
     274          }
     275  
     276        buff++;
     277      }
     278  
     279    if (alternate)
     280      free (alternate);
     281  
     282    return search_not_found;
     283  }
     284  
     285  /* Search for STRING backwards through the text delimited in BINDING. */
     286  enum search_result
     287  search_backward (char *input_string, SEARCH_BINDING *binding, long *poff)
     288  {
     289    register int c, i, len;
     290    register char *buff, *end;
     291    char *string;
     292    char *alternate = NULL;
     293  
     294    len = strlen (input_string);
     295  
     296    /* Reverse the characters in the search string. */
     297    string = xmalloc (1 + len);
     298    for (c = 0, i = len - 1; input_string[c]; c++, i--)
     299      string[i] = input_string[c];
     300  
     301    string[c] = '\0';
     302  
     303    /* We match characters in the search buffer against STRING and ALTERNATE.
     304       ALTERNATE is a case reversed version of STRING; this is cheaper than
     305       case folding each character before comparison.   ALTERNATE is only
     306       used if the case folding bit is turned on in the passed BINDING. */
     307  
     308    if (binding->flags & S_FoldCase)
     309      {
     310        alternate = xstrdup (string);
     311  
     312        for (i = 0; i < len; i++)
     313          {
     314            if (islower (alternate[i]))
     315              alternate[i] = toupper (alternate[i]);
     316            else if (isupper (alternate[i]))
     317              alternate[i] = tolower (alternate[i]);
     318          }
     319      }
     320  
     321    buff = binding->buffer + binding->start - 1;
     322    end = binding->buffer + binding->end;
     323  
     324    while (buff > (end + len))
     325      {
     326        for (i = 0; i < len; i++)
     327          {
     328            c = *(buff - i);
     329  
     330            if (c != string[i] && (!alternate || c != alternate[i]))
     331              break;
     332          }
     333  
     334        if (!string[i])
     335          {
     336            free (string);
     337            if (alternate)
     338              free (alternate);
     339  
     340            if (binding->flags & S_SkipDest)
     341              buff -= len;
     342            *poff = 1 + buff - binding->buffer;
     343  	  return search_success;
     344          }
     345  
     346        buff--;
     347      }
     348  
     349    free (string);
     350    if (alternate)
     351      free (alternate);
     352  
     353    return search_not_found;
     354  }
     355  
     356  /* Find STRING in LINE, returning the offset of the end of the string.
     357     Return an offset of -1 if STRING does not appear in LINE.  The search
     358     is bound by the end of the line (i.e., either NEWLINE or 0). */
     359  int
     360  string_in_line (char *string, char *line)
     361  {
     362    register int end;
     363    SEARCH_BINDING binding;
     364    long offset;
     365    
     366    /* Find the end of the line. */
     367    for (end = 0; line[end] && line[end] != '\n'; end++);
     368  
     369    /* Search for STRING within these confines. */
     370    binding.buffer = line;
     371    binding.start = 0;
     372    binding.end = end;
     373    binding.flags = S_FoldCase | S_SkipDest;
     374  
     375    if (search_forward (string, &binding, &offset) == search_success)
     376      return offset;
     377    return -1;
     378  }
     379  
     380  /* Return non-zero if STRING is the first text to appear at BINDING. */
     381  int
     382  looking_at (char *string, SEARCH_BINDING *binding)
     383  {
     384    long search_end;
     385  
     386    if (search (string, binding, &search_end) != search_success)
     387      return 0;
     388  
     389    /* If the string was not found, SEARCH_END is -1.  If the string was found,
     390       but not right away, SEARCH_END is != binding->start.  Otherwise, the
     391       string was found at binding->start. */
     392    return search_end == binding->start;
     393  }
     394  
     395  /* Return non-zero if POINTER is looking at the text at STRING before an 
     396     end-of-line. */
     397  int
     398  looking_at_line (char *string, char *pointer)
     399  {
     400    int len;
     401  
     402    len = strlen (string);
     403    if (strncasecmp (pointer, string, len) != 0)
     404      return 0;
     405  
     406    pointer += len;
     407    if (*pointer == '\n' || !strncmp (pointer, "\r\n", 2)
     408        || *pointer == '\0')
     409      return 1;
     410    return 0;
     411  }
     412  
     413  /* **************************************************************** */
     414  /*                                                                  */
     415  /*                      Accessing matches                           */
     416  /*                                                                  */
     417  /* **************************************************************** */
     418  /* Search forwards or backwards for entries in MATCHES that start within
     419     the search area.  The search is forwards if DIR > 0, backward if
     420     DIR < 0.  Return index of match in *MATCH_INDEX. */
     421  enum search_result
     422  match_in_match_list (MATCH_STATE *match_state,
     423                       long start, long end, int dir,
     424                       int *match_index)
     425  {
     426    regmatch_t *matches = match_state->matches;
     427    size_t match_count = match_state->match_count;
     428  
     429    int i;
     430    int index = -1;
     431  
     432    for (i = 0; i < match_count || !match_state->finished; i++)
     433      {
     434        /* get more matches as we need them */
     435        if (i == match_count)
     436          {
     437            extend_matches (match_state);
     438            matches = match_state->matches;
     439            match_count = match_state->match_count;
     440  
     441            if (i == match_count)
     442              break;
     443          }
     444  
     445        if (matches[i].rm_so >= end)
     446          break; /* No  more matches found in search area. */
     447  
     448        if (matches[i].rm_so >= start)
     449          {
     450            index = i;
     451            if (dir > 0)
     452              {
     453                *match_index = index;
     454                return search_success;
     455              }
     456          }
     457      }
     458  
     459    if (index != -1)
     460      {
     461        *match_index = index;
     462        return search_success;
     463      }
     464  
     465    /* not found */
     466    return search_not_found;
     467  }
     468  
     469  /* Return match INDEX in STATE.  INDEX must be a valid index. */
     470  regmatch_t
     471  match_by_index (MATCH_STATE *state, int index)
     472  {
     473    while (state->match_alloc <= index)
     474      extend_matches (state);
     475    return state->matches[index];
     476  }
     477  
     478  /* Free and clear all data in STATE. */
     479  void
     480  free_matches (MATCH_STATE *state)
     481  {
     482    free (state->matches);
     483    state->matches = 0;
     484    state->match_count = state->match_alloc = state->finished = 0;
     485    state->buffer = 0; /* do not free as it is kept elsewhere */
     486    state->buflen = 0;
     487    regfree (&state->regex);
     488  }
     489  
     490  int
     491  matches_ready (MATCH_STATE *state)
     492  {
     493    return state->matches ? 1 : 0;
     494  }
     495  
     496  /* Starting at index *MATCH_INDEX, decide if we are inside a match
     497     in MATCHES at offset OFF.  The matches are assumed not to overlap
     498     and to be in order. */
     499  void
     500  decide_if_in_match (long off, int *in_match,
     501                      MATCH_STATE *matches, size_t *match_index)
     502  {
     503    size_t i = *match_index;
     504    int m = *in_match;
     505  
     506    for (; !at_end_of_matches (matches, i); i++)
     507      {
     508        if (match_by_index (matches, i).rm_so > off)
     509          break;
     510  
     511        m = 1;
     512  
     513        if (match_by_index (matches, i).rm_eo > off)
     514          break;
     515  
     516        m = 0;
     517      }
     518  
     519    *match_index = i;
     520    *in_match = m;
     521  }
     522  
     523  /* Used for iterating through a match list. */
     524  int
     525  at_end_of_matches (MATCH_STATE *state, int index)
     526  {
     527    if (index < state->match_count)
     528      return 0;
     529    else
     530      {
     531        if (!state->finished)
     532          extend_matches (state);
     533  
     534        if (state->finished)
     535          return (state->match_count == index) ? 1 : 0;
     536        else
     537          return 0;
     538      }
     539  }
     540  
     541  
     542  
     543  /* **************************************************************** */
     544  /*                                                                  */
     545  /*                    Small String Searches                         */
     546  /*                                                                  */
     547  /* **************************************************************** */
     548  
     549  /* Function names that start with "skip" are passed a string, and return
     550     an offset from the start of that string.  Function names that start
     551     with "find" are passed a SEARCH_BINDING, and return an absolute position
     552     marker of the item being searched for.  "Find" functions return a value
     553     of -1 if the item being looked for couldn't be found. */
     554  
     555  /* Return the index of the first non-whitespace character in STRING. */
     556  int
     557  skip_whitespace (char *string)
     558  {
     559    register int i;
     560  
     561    for (i = 0; string && whitespace (string[i]); i++);
     562    return i;
     563  }
     564  
     565  /* Return the index of the first non-whitespace or newline character in
     566     STRING. */
     567  int
     568  skip_whitespace_and_newlines (char *string)
     569  {
     570    register int i;
     571  
     572    for (i = 0; string && whitespace_or_newline (string[i]); i++);
     573    return i;
     574  }
     575  
     576  /* Return the index of the first whitespace character in STRING. */
     577  int
     578  skip_non_whitespace (char *string)
     579  {
     580    register int i;
     581  
     582    for (i = 0; string && string[i] && !whitespace (string[i]); i++);
     583    return i;
     584  }
     585  
     586  /* **************************************************************** */
     587  /*                                                                  */
     588  /*                   Searching FILE_BUFFER's                        */
     589  /*                                                                  */
     590  /* **************************************************************** */
     591  
     592  /* Return the absolute position of the first occurence of a node separator
     593     starting in BINDING->buffer between BINDING->start and BINDING->end 
     594     inclusive.  Return -1 if no node separator was found. */
     595  long
     596  find_node_separator (SEARCH_BINDING *binding)
     597  {
     598    register long i;
     599    char *body;
     600    int dir;
     601  
     602    body = binding->buffer;
     603    dir = binding->start < binding->end ? 1 : -1;
     604  
     605    /* A node is started by [^L]^_[^L][\r]\n.  That is to say, the C-l's are
     606       optional, but the US and NEWLINE are not.  This separator holds
     607       true for all separated elements in an Info file, including the tags
     608       table (if present) and the indirect tags table (if present). */
     609    i = binding->start;
     610    while (1)
     611      {
     612        /* Note that bytes are read in order from the buffer, so if at any
     613           point a null byte is encountered signifying the end of the buffer,
     614           no more bytes will be read past that point. */
     615        if (body[i] == INFO_COOKIE)
     616          {
     617            int j = i + 1;
     618  
     619            if (body[j] == INFO_FF)
     620              j++;
     621            if (body[j] == '\r')
     622              j++;
     623  
     624            if (body[j] == '\n')
     625              return i;
     626          }
     627  
     628        if (i == binding->end)
     629          break;
     630        i += dir;
     631      }
     632  
     633    return -1;
     634  }
     635  
     636  /* Return the length of the node separator characters that BODY is currently
     637     pointing at.  If it's not pointing at a node separator, return 0. */
     638  int
     639  skip_node_separator (char *body)
     640  {
     641    register int i;
     642  
     643    i = 0;
     644  
     645    if (body[i] == INFO_FF)
     646      i++;
     647  
     648    if (body[i++] != INFO_COOKIE)
     649      return 0;
     650  
     651    if (body[i] == INFO_FF)
     652      i++;
     653  
     654    if (body[i] == '\r')
     655      i++;
     656  
     657    if (body[i++] != '\n')
     658      return 0;
     659  
     660    return i;
     661  }
     662  
     663  /* Return the absolute position of the beginning of a section in this file
     664     whose first line is LABEL, starting the search at binding->start. */
     665  long
     666  find_file_section (SEARCH_BINDING *binding, char *label)
     667  {
     668    SEARCH_BINDING s;
     669    long position;
     670    int dir;
     671  
     672    s.buffer = binding->buffer;
     673    s.start = binding->start;
     674    s.end = binding->end;
     675    s.flags = S_FoldCase;
     676    dir = binding->start < binding->end ? 1 : -1;
     677  
     678    while ((position = find_node_separator (&s)) != -1 )
     679      {
     680        long offset = position;
     681        offset += skip_node_separator (s.buffer + offset);
     682        if (looking_at_line (label, s.buffer + offset))
     683          return position;
     684  
     685        if (dir > 0)
     686          {
     687            s.start = offset;
     688            if (s.start >= s.end)
     689              break;
     690          }
     691        else
     692          {
     693            s.start = position - 1;
     694            if (s.start <= s.end)
     695              break;
     696          }
     697      }
     698    return -1;
     699  }
     700  
     701  /* Return the absolute position of the node named NODENAME in BINDING.
     702     This is a brute force search, and we wish to avoid it when possible.
     703     This function is called when a tag (indirect or otherwise) doesn't
     704     really point to the right node.  It returns the absolute position of
     705     the separator preceding the node. */
     706  long
     707  find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
     708  {
     709    long position;
     710    int offset;
     711    SEARCH_BINDING s;
     712  
     713    s.buffer = binding->buffer;
     714    s.start = binding->start;
     715    s.end = binding->end;
     716    s.flags = 0;
     717  
     718    while (s.start < s.end && (position = find_node_separator (&s)) != -1)
     719      {
     720        char *nodename_start;
     721        char *read_nodename;
     722        int found;
     723  
     724        s.start = position;
     725        s.start += skip_node_separator (s.buffer + s.start);
     726  
     727        offset = string_in_line (INFO_NODE_LABEL, s.buffer + s.start);
     728  
     729        if (offset == -1)
     730          continue;
     731  
     732        s.start += offset;
     733        s.start += skip_whitespace (s.buffer + s.start); 
     734        nodename_start = s.buffer + s.start;
     735        read_quoted_string (nodename_start, "\n\r\t,", 0, &read_nodename);
     736        if (!read_nodename)
     737          return -1;
     738  
     739        found = !strcmp (read_nodename, nodename);
     740        free (read_nodename);
     741  
     742        if (found)
     743          return position;
     744      }
     745    return -1;
     746  }