(root)/
glibc-2.38/
string/
strcoll_l.c
       1  /* Copyright (C) 1995-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     The GNU C Library is free software; you can redistribute it and/or
       5     modify it under the terms of the GNU Lesser General Public
       6     License as published by the Free Software Foundation; either
       7     version 2.1 of the License, or (at your option) any later version.
       8  
       9     The GNU C Library is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      12     Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public
      15     License along with the GNU C Library; if not, see
      16     <https://www.gnu.org/licenses/>.  */
      17  
      18  
      19  #include <assert.h>
      20  #include <langinfo.h>
      21  #include <locale.h>
      22  #include <stddef.h>
      23  #include <stdint.h>
      24  #include <string.h>
      25  #include <sys/param.h>
      26  #include <libc-diag.h>
      27  
      28  #ifndef STRING_TYPE
      29  # define STRING_TYPE char
      30  # define USTRING_TYPE unsigned char
      31  # define STRCOLL __strcoll_l
      32  # define STRCMP strcmp
      33  # define WEIGHT_H "../locale/weight.h"
      34  # define SUFFIX	MB
      35  # define L(arg) arg
      36  #endif
      37  
      38  #define CONCAT(a,b) CONCAT1(a,b)
      39  #define CONCAT1(a,b) a##b
      40  
      41  #include "../locale/localeinfo.h"
      42  #include WEIGHT_H
      43  
      44  /* Track status while looking for sequences in a string.  */
      45  typedef struct
      46  {
      47    int len;			/* Length of the current sequence.  */
      48    size_t val;			/* Position of the sequence relative to the
      49  				   previous non-ignored sequence.  */
      50    size_t idxmax;		/* Maximum index in sequences.  */
      51    size_t idxcnt;		/* Current count of indices.  */
      52    size_t backw;			/* Current Backward sequence index.  */
      53    size_t backw_stop;		/* Index where the backward sequences stop.  */
      54    const USTRING_TYPE *us;	/* The string.  */
      55    unsigned char rule;		/* Saved rule for the first sequence.  */
      56    int32_t idx;			/* Index to weight of the current sequence.  */
      57    int32_t save_idx;		/* Save looked up index of a forward
      58  				   sequence after the last backward
      59  				   sequence.  */
      60    const USTRING_TYPE *back_us;	/* Beginning of the backward sequence.  */
      61  } coll_seq;
      62  
      63  /* Get next sequence.  Traverse the string as required.  */
      64  static __always_inline void
      65  get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
      66  	      const USTRING_TYPE *weights, const int32_t *table,
      67  	      const USTRING_TYPE *extra, const int32_t *indirect,
      68  	      int pass)
      69  {
      70    size_t val = seq->val = 0;
      71    int len = seq->len;
      72    size_t backw_stop = seq->backw_stop;
      73    size_t backw = seq->backw;
      74    size_t idxcnt = seq->idxcnt;
      75    size_t idxmax = seq->idxmax;
      76    int32_t idx = seq->idx;
      77    const USTRING_TYPE *us = seq->us;
      78  
      79    while (len == 0)
      80      {
      81        ++val;
      82        if (backw_stop != ~0ul)
      83  	{
      84  	  /* There is something pushed.  */
      85  	  if (backw == backw_stop)
      86  	    {
      87  	      /* The last pushed character was handled.  Continue
      88  		 with forward characters.  */
      89  	      if (idxcnt < idxmax)
      90  		{
      91  		  idx = seq->save_idx;
      92  		  backw_stop = ~0ul;
      93  		}
      94  	      else
      95  		{
      96  		  /* Nothing anymore.  The backward sequence ended with
      97  		     the last sequence in the string.  Note that len is
      98  		     still zero.  */
      99  		  idx = 0;
     100  		  break;
     101  	        }
     102  	    }
     103  	  else
     104  	    {
     105  	      /* XXX Traverse BACKW sequences from the beginning of
     106  		 BACKW_STOP to get the next sequence.  Is there a quicker way
     107  	         to do this?  */
     108  	      size_t i = backw_stop;
     109  	      us = seq->back_us;
     110  	      while (i < backw)
     111  		{
     112  		  int32_t tmp = findidx (table, indirect, extra, &us, -1);
     113  		  idx = tmp & 0xffffff;
     114  		  i++;
     115  		}
     116  	      --backw;
     117  	      us = seq->us;
     118  	    }
     119  	}
     120        else
     121  	{
     122  	  backw_stop = idxmax;
     123  	  int32_t prev_idx = idx;
     124  
     125  	  while (*us != L('\0'))
     126  	    {
     127  	      int32_t tmp = findidx (table, indirect, extra, &us, -1);
     128  	      unsigned char rule = tmp >> 24;
     129  	      prev_idx = idx;
     130  	      idx = tmp & 0xffffff;
     131  	      idxcnt = idxmax++;
     132  
     133  	      /* Save the rule for the first sequence.  */
     134  	      if (__glibc_unlikely (idxcnt == 0))
     135  	        seq->rule = rule;
     136  
     137  	      if ((rulesets[rule * nrules + pass]
     138  		   & sort_backward) == 0)
     139  		/* No more backward characters to push.  */
     140  		break;
     141  	      ++idxcnt;
     142  	    }
     143  
     144  	  if (backw_stop >= idxcnt)
     145  	    {
     146  	      /* No sequence at all or just one.  */
     147  	      if (idxcnt == idxmax || backw_stop > idxcnt)
     148  		/* Note that len is still zero.  */
     149  		break;
     150  
     151  	      backw_stop = ~0ul;
     152  	    }
     153  	  else
     154  	    {
     155  	      /* We pushed backward sequences.  If the stream ended with the
     156  		 backward sequence, then we process the last sequence we
     157  		 found.  Otherwise we process the sequence before the last
     158  		 one since the last one was a forward sequence.  */
     159  	      seq->back_us = seq->us;
     160  	      seq->us = us;
     161  	      backw = idxcnt;
     162  	      if (idxmax > idxcnt)
     163  		{
     164  		  backw--;
     165  		  seq->save_idx = idx;
     166  		  idx = prev_idx;
     167  		}
     168  	      if (backw > backw_stop)
     169  		backw--;
     170  	    }
     171  	}
     172  
     173        /* With GCC 5.3 when compiling with -Os the compiler complains
     174  	 that idx, taken from seq->idx (seq1 or seq2 from STRCOLL) may
     175  	 be used uninitialized.  In general this can't possibly be true
     176  	 since seq1.idx and seq2.idx are initialized to zero in the
     177  	 outer function.  Only one case where seq->idx is restored from
     178  	 seq->save_idx might result in an uninitialized idx value, but
     179  	 it is guarded by a sequence of checks against backw_stop which
     180  	 ensures that seq->save_idx was saved to first and contains a
     181  	 valid value.  */
     182        DIAG_PUSH_NEEDS_COMMENT;
     183        DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
     184        len = weights[idx++];
     185        DIAG_POP_NEEDS_COMMENT;
     186        /* Skip over indices of previous levels.  */
     187        for (int i = 0; i < pass; i++)
     188  	{
     189  	  idx += len;
     190  	  len = weights[idx];
     191  	  idx++;
     192  	}
     193      }
     194  
     195    /* Update the structure.  */
     196    seq->val = val;
     197    seq->len = len;
     198    seq->backw_stop = backw_stop;
     199    seq->backw = backw;
     200    seq->idxcnt = idxcnt;
     201    seq->idxmax = idxmax;
     202    seq->us = us;
     203    seq->idx = idx;
     204  }
     205  
     206  /* Compare two sequences.  */
     207  static __always_inline int
     208  do_compare (coll_seq *seq1, coll_seq *seq2, int position,
     209  	    const USTRING_TYPE *weights)
     210  {
     211    int seq1len = seq1->len;
     212    int seq2len = seq2->len;
     213    size_t val1 = seq1->val;
     214    size_t val2 = seq2->val;
     215    int idx1 = seq1->idx;
     216    int idx2 = seq2->idx;
     217    int result = 0;
     218  
     219    /* Test for position if necessary.  */
     220    if (position && val1 != val2)
     221      {
     222        result = val1 > val2 ? 1 : -1;
     223        goto out;
     224      }
     225  
     226    /* Compare the two sequences.  */
     227    do
     228      {
     229        if (weights[idx1] != weights[idx2])
     230  	{
     231  	  /* The sequences differ.  */
     232  	  result = weights[idx1] - weights[idx2];
     233  	  goto out;
     234  	}
     235  
     236        /* Increment the offsets.  */
     237        ++idx1;
     238        ++idx2;
     239  
     240        --seq1len;
     241        --seq2len;
     242      }
     243    while (seq1len > 0 && seq2len > 0);
     244  
     245    if (position && seq1len != seq2len)
     246      result = seq1len - seq2len;
     247  
     248  out:
     249    seq1->len = seq1len;
     250    seq2->len = seq2len;
     251    seq1->idx = idx1;
     252    seq2->idx = idx2;
     253    return result;
     254  }
     255  
     256  int
     257  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, locale_t l)
     258  {
     259    struct __locale_data *current = l->__locales[LC_COLLATE];
     260    uint32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
     261    /* We don't assign the following values right away since it might be
     262       unnecessary in case there are no rules.  */
     263    const unsigned char *rulesets;
     264    const int32_t *table;
     265    const USTRING_TYPE *weights;
     266    const USTRING_TYPE *extra;
     267    const int32_t *indirect;
     268  
     269    if (nrules == 0)
     270      return STRCMP (s1, s2);
     271  
     272    /* Catch empty strings.  */
     273    if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
     274      return (*s1 != '\0') - (*s2 != '\0');
     275  
     276    rulesets = (const unsigned char *)
     277      current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
     278    table = (const int32_t *)
     279      current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
     280    weights = (const USTRING_TYPE *)
     281      current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
     282    extra = (const USTRING_TYPE *)
     283      current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
     284    indirect = (const int32_t *)
     285      current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
     286  
     287    assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
     288    assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
     289    assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
     290    assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
     291  
     292    int result = 0, rule = 0;
     293  
     294    /* With GCC 7 when compiling with -Os the compiler warns that
     295       seq1.back_us and seq2.back_us might be used uninitialized.
     296       Sometimes this warning appears at locations in locale/weightwc.h
     297       where the actual use is, but on architectures other than x86_64,
     298       x86 and s390x, a warning appears at the definitions of seq1 and
     299       seq2.  This uninitialized use is impossible for the same reason
     300       as described in comments in locale/weightwc.h.  */
     301    DIAG_PUSH_NEEDS_COMMENT;
     302    DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized");
     303    coll_seq seq1, seq2;
     304    DIAG_POP_NEEDS_COMMENT;
     305    seq1.len = 0;
     306    seq1.idxmax = 0;
     307    seq1.rule = 0;
     308    seq2.len = 0;
     309    seq2.idxmax = 0;
     310  
     311    for (int pass = 0; pass < nrules; ++pass)
     312      {
     313        seq1.idxcnt = 0;
     314        seq1.idx = 0;
     315        seq2.idx = 0;
     316        seq1.backw_stop = ~0ul;
     317        seq1.backw = ~0ul;
     318        seq2.idxcnt = 0;
     319        seq2.backw_stop = ~0ul;
     320        seq2.backw = ~0ul;
     321  
     322        /* We need the elements of the strings as unsigned values since they
     323  	 are used as indices.  */
     324        seq1.us = (const USTRING_TYPE *) s1;
     325        seq2.us = (const USTRING_TYPE *) s2;
     326  
     327        /* We assume that if a rule has defined `position' in one section
     328  	 this is true for all of them.  Please note that the localedef programs
     329  	 makes sure that `position' is not used at the first level.  */
     330  
     331        int position = rulesets[rule * nrules + pass] & sort_position;
     332  
     333        while (1)
     334  	{
     335  	  get_next_seq (&seq1, nrules, rulesets, weights, table,
     336  				    extra, indirect, pass);
     337  	  get_next_seq (&seq2, nrules, rulesets, weights, table,
     338  				    extra, indirect, pass);
     339  	  /* See whether any or both strings are empty.  */
     340  	  if (seq1.len == 0 || seq2.len == 0)
     341  	    {
     342  	      if (seq1.len == seq2.len)
     343  		{
     344  		  /* Both strings ended and are equal at this level.  Do a
     345  		     byte-level comparison to ensure that we don't waste time
     346  		     going through multiple passes for totally equal strings
     347  		     before proceeding to subsequent passes.  */
     348  		  if (pass == 0 && STRCMP (s1, s2) == 0)
     349  		    return result;
     350  		  else
     351  		    break;
     352  	        }
     353  
     354  	      /* This means one string is shorter than the other.  Find out
     355  		 which one and return an appropriate value.  */
     356  	      return seq1.len == 0 ? -1 : 1;
     357  	    }
     358  
     359  	  result = do_compare (&seq1, &seq2, position, weights);
     360  	  if (result != 0)
     361  	    return result;
     362  	}
     363  
     364        rule = seq1.rule;
     365      }
     366  
     367    return result;
     368  }
     369  libc_hidden_def (STRCOLL)
     370  
     371  #ifndef WIDE_CHAR_VERSION
     372  weak_alias (__strcoll_l, strcoll_l)
     373  #endif