(root)/
glibc-2.38/
posix/
bug-regex20.c
       1  /* Test for UTF-8 regular expression optimizations.
       2     Copyright (C) 2003-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <sys/types.h>
      20  #include <mcheck.h>
      21  #include <regex.h>
      22  #include <stdio.h>
      23  #include <stdlib.h>
      24  #include <string.h>
      25  #include <locale.h>
      26  
      27  #define RE_NO_INTERNAL_PROTOTYPES 1
      28  #include "regex_internal.h"
      29  
      30  #define BRE RE_SYNTAX_POSIX_BASIC
      31  #define ERE RE_SYNTAX_POSIX_EXTENDED
      32  
      33  static struct
      34  {
      35    int syntax;
      36    const char *pattern;
      37    const char *string;
      38    int res, optimize;
      39  } tests[] = {
      40    /* \xc3\x84		LATIN CAPITAL LETTER A WITH DIAERESIS
      41       \xc3\x96		LATIN CAPITAL LETTER O WITH DIAERESIS
      42       \xc3\xa4		LATIN SMALL LETTER A WITH DIAERESIS
      43       \xc3\xb6		LATIN SMALL LETTER O WITH DIAERESIS
      44       \xe2\x80\x94	EM DASH  */
      45    /* Should be optimized.  */
      46    {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
      47    {BRE, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
      48    {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
      49    {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1},
      50    {BRE, "b\xc3\xa4\\+z", "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1},
      51    {BRE, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
      52    {BRE, "b\xc3\xa4\\{1,2\\}z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
      53    {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1},
      54    {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
      55    {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
      56    {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
      57    {BRE, "^x\\|x\xc3\xa4*z$", "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1},
      58    {BRE, "^x\\\\\xc3\x84\\{6\\}z\\+",
      59     "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
      60    {BRE, "^x\\\\\xc3\x84\\{2,36\\}z\\+", "x\\\xc3\x84zz\xc3\xb6", -1, 1},
      61    {BRE, "^x\\\\\xc3\x84\\{,3\\}z\\+",
      62     "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
      63    {BRE, "x[C]y", "axCy", 1, 1},
      64    {BRE, "x[ABC]y", "axCy", 1, 1},
      65    {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
      66    {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
      67    {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
      68    {BRE, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1},
      69    {BRE, "\\(x\xc3\x84\\)z\\1\x61\\1",
      70     "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1},
      71    {BRE, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
      72    {BRE, "x.y", "ax\xe2\x80\x94yz", 1, 1},
      73    {BRE, "x.*z", "\xc3\x84xz", 2, 1},
      74    {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1},
      75    {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1},
      76    {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1},
      77    {BRE, "x.\\?z", "axz", 1, 1},
      78    {BRE, "x.\\?z", "axyz", 1, 1},
      79    {BRE, "x.\\?z", "ax\xc3\x84z", 1, 1},
      80    {BRE, "x.\\?z", "ax\xe2\x80\x94z", 1, 1},
      81    {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80z", 1, 1},
      82    {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84z", 1, 1},
      83    {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1},
      84    {BRE, ".", "y", 0, 1},
      85    {BRE, ".", "\xc3\x84", 0, 1},
      86    {BRE, ".", "\xe2\x80\x94", 0, 1},
      87    {BRE, ".", "\xf0\x9d\x80\x80", 0, 1},
      88    {BRE, ".", "\xf9\x81\x82\x83\x84", 0, 1},
      89    {BRE, ".", "\xfd\xbf\xbf\xbf\xbf\xbf", 0, 1},
      90    {BRE, "x.\\?z", "axyyz", -1, 1},
      91    {BRE, "x.\\?z", "ax\xc3\x84\xc3\x96z", -1, 1},
      92    {BRE, "x.\\?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1},
      93    {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80yz", -1, 1},
      94    {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1},
      95    {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1},
      96    {BRE, "x.\\+z", "\xe2\x80\x94xz", -1, 1},
      97    {BRE, "x.\\+z", "\xe2\x80\x94xyz", 3, 1},
      98    {BRE, "x.\\+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1},
      99    {BRE, "x.\\+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
     100    {BRE, "x.\\+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
     101    {BRE, "x.\\+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1},
     102    {BRE, "x.\\+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
     103    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xz", -1, 1},
     104    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1},
     105    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xyz", 3, 1},
     106    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1},
     107    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
     108    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
     109    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1},
     110    {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
     111    {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "axz", 1, 1},
     112    {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1},
     113    {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xc3\x86z", 1, 1},
     114    {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xe2\x80\x96wz", 1, 1},
     115    {ERE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
     116    {ERE, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1},
     117    {ERE, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
     118    {ERE, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1},
     119    {ERE, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1},
     120    {ERE, "x[C]y", "axCy", 1, 1},
     121    {ERE, "x[ABC]y", "axCy", 1, 1},
     122    {ERE, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1},
     123    {ERE, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
     124    {ERE, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
     125    {ERE, "x.y", "ax\xe2\x80\x94yz", 1, 1},
     126    {ERE, "x.*z", "\xc3\x84xz", 2, 1},
     127    {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1},
     128    {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1},
     129    {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1},
     130    {ERE, "x.?z", "axz", 1, 1},
     131    {ERE, "x.?z", "axyz", 1, 1},
     132    {ERE, "x.?z", "ax\xc3\x84z", 1, 1},
     133    {ERE, "x.?z", "ax\xe2\x80\x94z", 1, 1},
     134    {ERE, "x.?z", "ax\xf0\x9d\x80\x80z", 1, 1},
     135    {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84z", 1, 1},
     136    {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1},
     137    {ERE, "x.?z", "axyyz", -1, 1},
     138    {ERE, "x.?z", "ax\xc3\x84\xc3\x96z", -1, 1},
     139    {ERE, "x.?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1},
     140    {ERE, "x.?z", "ax\xf0\x9d\x80\x80yz", -1, 1},
     141    {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1},
     142    {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1},
     143    {ERE, "x.+z", "\xe2\x80\x94xz", -1, 1},
     144    {ERE, "x.+z", "\xe2\x80\x94xyz", 3, 1},
     145    {ERE, "x.+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1},
     146    {ERE, "x.+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
     147    {ERE, "x.+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
     148    {ERE, "x.+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1},
     149    {ERE, "x.+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
     150    {ERE, "x.{1,2}z", "\xe2\x80\x94xz", -1, 1},
     151    {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1},
     152    {ERE, "x.{1,2}z", "\xe2\x80\x94xyz", 3, 1},
     153    {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1},
     154    {ERE, "x.{1,2}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
     155    {ERE, "x.{1,2}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
     156    {ERE, "x.{1,2}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1},
     157    {ERE, "x.{1,2}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
     158    {ERE, "x(.w|\xc3\x86)?z", "axz", 1, 1},
     159    {ERE, "x(.w|\xc3\x86)?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1},
     160    {ERE, "x(.w|\xc3\x86)?z", "ax\xc3\x86z", 1, 1},
     161    {ERE, "x(.w|\xc3\x86)?z", "ax\xe2\x80\x96wz", 1, 1},
     162    /* Should not be optimized.  */
     163    {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
     164    {BRE, "x[A-Z,]y", "axCy", 1, 0},
     165    {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
     166    {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
     167    {BRE, "x[[=A=]]z", "axAz", 1, 0},
     168    {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
     169    {BRE, "\\<g", "\xe2\x80\x94g", 3, 0},
     170    {BRE, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
     171    {BRE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
     172    {BRE, "a\\wz", "a\xc3\x84z", 0, 0},
     173    {BRE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
     174    {ERE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
     175    {ERE, "x[A-Z,]y", "axCy", 1, 0},
     176    {ERE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
     177    {ERE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
     178    {ERE, "x[[=A=]]z", "axAz", 1, 0},
     179    {ERE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
     180    {ERE, "\\<g", "\xe2\x80\x94g", 3, 0},
     181    {ERE, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
     182    {ERE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
     183    {ERE, "a\\wz", "a\xc3\x84z", 0, 0},
     184    {ERE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
     185  };
     186  
     187  int
     188  main (void)
     189  {
     190    struct re_pattern_buffer regbuf;
     191    const char *err;
     192    size_t i;
     193    int ret = 0;
     194  
     195    mtrace ();
     196  
     197    setlocale (LC_ALL, "de_DE.UTF-8");
     198    for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
     199      {
     200        int res, optimized;
     201  
     202        re_set_syntax (tests[i].syntax);
     203        memset (&regbuf, '\0', sizeof (regbuf));
     204        err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
     205                                  &regbuf);
     206        if (err != NULL)
     207  	{
     208  	  printf ("re_compile_pattern failed: %s\n", err);
     209  	  ret = 1;
     210  	  continue;
     211  	}
     212  
     213        /* Check if re_search will be done as multi-byte or single-byte.  */
     214        optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1;
     215        if (optimized != tests[i].optimize)
     216          {
     217            printf ("pattern %zd %soptimized while it should%s be\n",
     218  		  i, optimized ? "" : "not ", tests[i].optimize ? "" : " not");
     219  	  ret = 1;
     220          }
     221  
     222        int str_len = strlen (tests[i].string);
     223        res = re_search (&regbuf, tests[i].string, str_len, 0, str_len, NULL);
     224        if (res != tests[i].res)
     225  	{
     226  	  printf ("re_search %zd failed: %d\n", i, res);
     227  	  ret = 1;
     228  	  regfree (&regbuf);
     229  	  continue;
     230  	}
     231  
     232        res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len,
     233  		       NULL);
     234        if (res != tests[i].res)
     235  	{
     236  	  printf ("backward re_search %zd failed: %d\n", i, res);
     237  	  ret = 1;
     238  	  regfree (&regbuf);
     239  	  continue;
     240  	}
     241        regfree (&regbuf);
     242  
     243        re_set_syntax (tests[i].syntax | RE_ICASE);
     244        memset (&regbuf, '\0', sizeof (regbuf));
     245        err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
     246                                  &regbuf);
     247        if (err != NULL)
     248  	{
     249  	  printf ("re_compile_pattern failed: %s\n", err);
     250  	  ret = 1;
     251  	  continue;
     252  	}
     253  
     254        /* Check if re_search will be done as multi-byte or single-byte.  */
     255        optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1;
     256        if (optimized)
     257          {
     258            printf ("pattern %zd optimized while it should not be when case insensitive\n",
     259  		  i);
     260  	  ret = 1;
     261          }
     262  
     263        res = re_search (&regbuf, tests[i].string, str_len, 0, str_len, NULL);
     264        if (res != tests[i].res)
     265  	{
     266  	  printf ("ICASE re_search %zd failed: %d\n", i, res);
     267  	  ret = 1;
     268  	  regfree (&regbuf);
     269  	  continue;
     270  	}
     271  
     272        res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len,
     273  		       NULL);
     274        if (res != tests[i].res)
     275  	{
     276  	  printf ("ICASE backward re_search %zd failed: %d\n", i, res);
     277  	  ret = 1;
     278  	  regfree (&regbuf);
     279  	  continue;
     280  	}
     281        regfree (&regbuf);
     282      }
     283  
     284    return ret;
     285  }