1  /*
       2   * node.c -- routines for node management
       3   */
       4  
       5  /*
       6   * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2015, 2017-2019, 2021, 2022, 2023,
       7   * the Free Software Foundation, Inc.
       8   *
       9   * This file is part of GAWK, the GNU implementation of the
      10   * AWK Programming Language.
      11   *
      12   * GAWK is free software; you can redistribute it and/or modify
      13   * it under the terms of the GNU General Public License as published by
      14   * the Free Software Foundation; either version 3 of the License, or
      15   * (at your option) any later version.
      16   *
      17   * GAWK is distributed in the hope that it will be useful,
      18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      20   * GNU General Public License for more details.
      21   *
      22   * You should have received a copy of the GNU General Public License
      23   * along with this program; if not, write to the Free Software
      24   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
      25   */
      26  
      27  #include "awk.h"
      28  
/* Prototypes for the static helpers that are defined later in this file. */
static NODE *r_make_number(double x);
static AWKNUM get_ieee_magic_val(char *val);
extern NODE **fmt_list;          /* declared in eval.c */

/*
 * Function-pointer hooks for number creation, string-to-number conversion,
 * number formatting and numeric comparison.  They are initialized here to
 * the routines that work on the double-valued `numbr' field.
 * NOTE(review): presumably these are re-pointed elsewhere when a different
 * number representation (e.g. MPFR) is selected --- confirm.
 */
NODE *(*make_number)(double) = r_make_number;
NODE *(*str2number)(NODE *) = r_force_number;
NODE *(*format_val)(const char *, int, NODE *) = r_format_val;
int (*cmp_numbers)(const NODE *, const NODE *) = cmp_awknums;
      37  
      38  /* is_hex --- return true if a string looks like a hex value */
      39  
      40  static bool
      41  is_hex(const char *str, const char *cpend)
      42  {
      43  	/* on entry, we know the string length is >= 1 */
      44  	if (*str == '-' || *str == '+')
      45  		str++;
      46  
      47  	if (str + 1 < cpend && str[0] == '0' && (str[1] == 'x' || str[1] == 'X'))
      48  		return true;
      49  
      50  	return false;
      51  }
      52  
/*
 * r_force_number --- force a value to be numeric
 *
 * If NUMCUR is already set, n is returned untouched.  Otherwise the text
 * in n->stptr (n->stlen bytes) is examined, n->numbr is filled in and
 * NUMCUR is set.
 *
 * When the text, ignoring leading and trailing white space, is entirely
 * a valid number and USER_INPUT is set, the node becomes a strnum:
 * STRING is cleared and NUMBER is set.  When the text is not entirely
 * numeric, USER_INPUT is cleared; n->numbr then holds whatever leading
 * numeric value the conversion produced (0 if there was none).
 *
 * A Node_elem_new node is handled separately: it is turned into a
 * Node_val whose string value is "0", and is returned without going
 * through the conversion code below.
 *
 * Always returns n.
 */

NODE *
r_force_number(NODE *n)
{
	char *cp;	/* first non-blank character of the text */
	char *cpend;	/* one past the last non-blank character */
	char save;	/* byte temporarily replaced by '\0' for strtod */
	char *ptr;	/* where the numeric conversion stopped */

	if (n->type == Node_elem_new) {
		n->type = Node_val;
		n->flags &= ~STRING;
		n->stptr[0] = '0';	// STRCUR is still set
		n->stlen = 1;

		return n;
	}

	if ((n->flags & NUMCUR) != 0)
		return n;

	/*
	 * We should always set NUMCUR. If USER_INPUT is set and it's a
	 * numeric string, we clear STRING and enable NUMBER, but if it's not
	 * numeric, we disable USER_INPUT.
	 */

	/* All the conditionals are an attempt to avoid the expensive strtod */

	n->flags |= NUMCUR;
	n->numbr = 0.0;

	/* Trim leading white space, bailing out if there's nothing else */
	for (cp = n->stptr, cpend = cp + n->stlen;
	     cp < cpend && isspace((unsigned char) *cp); cp++)
		continue;

	if (cp == cpend)
		goto badnum;

	/* At this point, we know the string is not entirely white space */
	/* Trim trailing white space */
	while (isspace((unsigned char) cpend[-1]))
		cpend--;

	/*
	 * 2/2007:
	 * POSIX, by way of severe language lawyering, seems to
	 * allow things like "inf" and "nan" to mean something.
	 * So if do_posix, the user gets what he deserves.
	 * This also allows hexadecimal floating point. Ugh.
	 */
	if (! do_posix) {
		if (is_alpha((unsigned char) *cp))
			goto badnum;
		else if (is_ieee_magic_val(cp)) {
			/* only exactly "+inf", "-inf", "+nan", "-nan" qualify */
			if (cpend == cp + 4) {
				n->numbr = get_ieee_magic_val(cp);
				goto goodnum;
			} else
				goto badnum;
		}
		/* else
			fall through */
	}
	/* else POSIX, so
		fall through */

	if (   (! do_posix		/* not POSIXLY paranoid and */
	        && (is_alpha((unsigned char) *cp)	/* letter, or */
					/* CANNOT do non-decimal and saw 0x */
		    || (! do_non_decimal_data && is_hex(cp, cpend))))) {
		goto badnum;
	}

	if (cpend - cp == 1) {		/* only one character */
		if (isdigit((unsigned char) *cp)) {	/* it's a digit! */
			n->numbr = (AWKNUM)(*cp - '0');
			if (n->stlen == 1)		/* no white space */
				n->flags |= NUMINT;
			goto goodnum;
		}
		goto badnum;
	}

	/* clear errno so that a failure of the conversion below can be detected */
	errno = 0;
	if (do_non_decimal_data		/* main.c assures false if do_posix */
		&& ! do_traditional && get_numbase(cp, cpend - cp, true) != 10) {
		/* nondec2awknum() saves and restores the byte after the string itself */
		n->numbr = nondec2awknum(cp, cpend - cp, &ptr);
	} else {
		/* strtod needs a terminated string: patch in a NUL, then undo it */
		save = *cpend;
		*cpend = '\0';
		n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
		*cpend = save;
	}

	if (errno == 0 || errno == ERANGE) {
		errno = 0;	/* reset in case of ERANGE */
		/* numeric only if the conversion consumed the whole trimmed text */
		if (ptr == cpend)
			goto goodnum;
		/* else keep the leading numeric value without updating flags */
		/* fall through to badnum */
	} else {
		errno = 0;
		/*
		 * N.B. For subnormal values, strtod may return the
		 * floating-point representation while setting errno to ERANGE.
		 * We force the numeric value to 0 in such cases.
		 */
		n->numbr = 0;
		/*
		 * Or should we accept it as a NUMBER even though strtod
		 * threw an error?
		 */
		/* fall through to badnum */
	}
badnum:
	/* not a number: the value can no longer be treated as a strnum */
	n->flags &= ~USER_INPUT;
	return n;

goodnum:
	/* give a NaN parsed from text with a leading '-' a negative sign */
	if (isnan(n->numbr) && *cp == '-' && signbit(n->numbr) == 0)
		n->numbr = -(n->numbr);

	if ((n->flags & USER_INPUT) != 0) {
		/* leave USER_INPUT enabled to indicate that this is a strnum */
		n->flags &= ~STRING;
		n->flags |= NUMBER;
	}
	return n;
}
     186  
     187  
     188  /*
     189   * The following lookup table is used as an optimization in force_string;
     190   * (more complicated) variations on this theme didn't seem to pay off, but
     191   * systematic testing might be in order at some point.
     192   */
     193  static const char *values[] = {
     194  	"0",
     195  	"1",
     196  	"2",
     197  	"3",
     198  	"4",
     199  	"5",
     200  	"6",
     201  	"7",
     202  	"8",
     203  	"9",
     204  };
     205  #define	NVAL	(sizeof(values)/sizeof(values[0]))
     206  
/*
 * r_format_val --- format a numeric value based on format
 *
 * format is the conversion format to apply to non-integral values and
 * index is its position in fmt_list; index is also recorded in s->stfmt
 * when that format is actually used.
 *
 * For a value that out_of_range() reports, a NEW string node holding the
 * text from format_nan_inf() is returned and s itself is left unchanged.
 * In every other case the string value of s is replaced (an existing
 * malloc'ed string is freed first), STRCUR is set, any wide-string copy
 * is released, and s is returned.
 */

NODE *
r_format_val(const char *format, int index, NODE *s)
{
	char buf[BUFSIZ];	/* scratch space for the sprintf("%ld") path */
	char *sp = buf;		/* text that the shared tail copies into s */
	double val;

	/*
	 * 2/2007: Simplify our lives here. Instead of worrying about
	 * whether or not the value will fit into a long just so we
	 * can use sprintf("%ld", val) on it, always format it ourselves.
	 * The only thing to worry about is that integral values always
	 * format as integers. %.0f does that very well.
	 *
	 * 6/2008: Would that things were so simple. Always using %.0f
	 * imposes a notable performance penalty for applications that
	 * do a lot of conversion of integers to strings. So, we reinstate
	 * the old code, but use %.0f for integral values that are outside
	 * the range of a long.  This seems a reasonable compromise.
	 *
	 * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
	 * < and > so that things work correctly on systems with 64 bit integers.
	 */

	if (out_of_range(s)) {
		const char *result = format_nan_inf(s, 'g');
		return make_string(result, strlen(result));
	} else if ((val = double_to_int(s->numbr)) != s->numbr
			|| val <= LONG_MIN || val >= LONG_MAX
	) {
		/* not an integral value, or out of integer range */
		/*
		 * Once upon a time, we just blindly did this:
		 *	sprintf(sp, format, s->numbr);
		 *	s->stlen = strlen(sp);
		 *	s->stfmt = index;
		 * but that's no good if, e.g., OFMT is %s. So we punt,
		 * and just always format the value ourselves.
		 */

		NODE *dummy[2], *r;
		unsigned int oflags;

		/* create dummy node for a sole use of format_tree */
		/*
		 * NOTE(review): dummy[0] is never assigned; presumably
		 * format_tree does not look at slot 0 --- confirm.
		 */
		dummy[1] = s;
		/* remember the flags; they are put back after format_tree returns */
		oflags = s->flags;

		if (val == s->numbr) {
			/* integral value, but outside range of %ld, use %.0f */
			r = format_tree("%.0f", 4, dummy, 2);
			s->stfmt = STFMT_UNUSED;
		} else {
			r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
			assert(r != NULL);
			s->stfmt = index;
		}
		s->flags = oflags;
		s->stlen = r->stlen;
		if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
			efree(s->stptr);
		/* take over r's buffer instead of copying it */
		s->stptr = r->stptr;
#ifdef HAVE_MPFR
		s->strndmode = MPFR_round_mode;
#endif
		freenode(r);	/* Do not unref(r)! We want to keep s->stptr == r->stptr.  */

		/* s->stptr is already in place, so skip the allocate-and-copy tail */
		goto no_malloc;
	} else {
		/*
		 * integral value; force conversion to long only once.
		 */
		long num = (long) val;

		if (num < NVAL && num >= 0) {
			/* single digit: take the text from the lookup table */
			sp = (char *) values[num];
			s->stlen = 1;
		} else {
			(void) sprintf(sp, "%ld", num);
			s->stlen = strlen(sp);
		}
		s->stfmt = STFMT_UNUSED;
		if ((s->flags & INTIND) != 0) {
			s->flags &= ~(INTIND|NUMBER);
			s->flags |= STRING;
		}
#ifdef HAVE_MPFR
		s->strndmode = MPFR_round_mode;
#endif
	}
	/* release any previous malloc'ed string, then store a copy of sp */
	if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
		efree(s->stptr);
	emalloc(s->stptr, char *, s->stlen + 1, "format_val");
	memcpy(s->stptr, sp, s->stlen + 1);
no_malloc:
	s->flags |= STRCUR;
	free_wstr(s);
	return s;
}
     307  
     308  /* r_dupnode --- duplicate a node */
     309  
     310  NODE *
     311  r_dupnode(NODE *n)
     312  {
     313  	NODE *r;
     314  
     315  	assert(n->type == Node_val);
     316  
     317  #ifdef GAWKDEBUG
     318  	/* Do the same as in awk.h:dupnode().  */
     319  	if ((n->flags & MALLOC) != 0) {
     320  		n->valref++;
     321  		return n;
     322  	}
     323  #endif
     324  	getnode(r);
     325  	*r = *n;
     326  
     327  #ifdef HAVE_MPFR
     328  	if ((n->flags & MPZN) != 0) {
     329  		mpz_init(r->mpg_i);
     330  		mpz_set(r->mpg_i, n->mpg_i);
     331  	} else if ((n->flags & MPFN) != 0) {
     332  		mpfr_init(r->mpg_numbr);
     333  		int tval = mpfr_set(r->mpg_numbr, n->mpg_numbr, ROUND_MODE);
     334  		IEEE_FMT(r->mpg_numbr, tval);
     335  	}
     336  #endif
     337  
     338  	r->flags |= MALLOC;
     339  	r->valref = 1;
     340  	/*
     341  	 * DON'T call free_wstr(r) here!
     342  	 * r->wstptr still points at n->wstptr's value, and we
     343  	 * don't want to free it!
     344  	 */
     345  	r->wstptr = NULL;
     346  	r->wstlen = 0;
     347  
     348  	if ((n->flags & STRCUR) != 0) {
     349  		emalloc(r->stptr, char *, n->stlen + 1, "r_dupnode");
     350  		memcpy(r->stptr, n->stptr, n->stlen);
     351  		r->stptr[n->stlen] = '\0';
     352  		r->stlen = n->stlen;
     353  		if ((n->flags & WSTRCUR) != 0) {
     354  			r->wstlen = n->wstlen;
     355  			emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "r_dupnode");
     356  			memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
     357  			r->wstptr[n->wstlen] = L'\0';
     358  			r->flags |= WSTRCUR;
     359  		}
     360  	}
     361  
     362  	return r;
     363  }
     364  
     365  /* r_make_number --- allocate a node with defined number */
     366  
     367  static NODE *
     368  r_make_number(double x)
     369  {
     370  	NODE *r = make_number_node(0);
     371  	r->numbr = x;
     372  	return r;
     373  }
     374  
     375  /* cmp_awknums --- compare two AWKNUMs */
     376  
     377  int
     378  cmp_awknums(const NODE *t1, const NODE *t2)
     379  {
     380  	/*
     381  	 * This routine is used to sort numeric array indices or values.
     382  	 * For the purposes of sorting, NaN is considered greater than
     383  	 * any other value, and all NaN values are considered equivalent and equal.
     384  	 * This isn't in compliance with IEEE standard, but compliance w.r.t. NaN
     385  	 * comparison at the awk level is a different issue, and needs to be dealt
     386  	 * with in the interpreter for each opcode seperately.
     387  	 */
     388  
     389  	if (isnan(t1->numbr))
     390  		return ! isnan(t2->numbr);
     391  	if (isnan(t2->numbr))
     392  		return -1;
     393  	/* don't subtract, in case one or both are infinite */
     394  	if (t1->numbr == t2->numbr)
     395  		return 0;
     396  	if (t1->numbr < t2->numbr)
     397  		return -1;
     398  	return 1;
     399  }
     400  
     401  /* make_str_node --- make a string node */
     402  
     403  NODE *
     404  make_str_node(const char *s, size_t len, int flags)
     405  {
     406  	NODE *r;
     407  	getnode(r);
     408  	r->type = Node_val;
     409  	r->numbr = 0;
     410  	r->flags = (MALLOC|STRING|STRCUR);
     411  	r->valref = 1;
     412  	r->stfmt = STFMT_UNUSED;
     413  #ifdef HAVE_MPFR
     414  	r->strndmode = MPFR_round_mode;
     415  #endif
     416  	r->wstptr = NULL;
     417  	r->wstlen = 0;
     418  
     419  	if ((flags & ALREADY_MALLOCED) != 0)
     420  		r->stptr = (char *) s;
     421  	else {
     422  		emalloc(r->stptr, char *, len + 1, "make_str_node");
     423  		memcpy(r->stptr, s, len);
     424  	}
     425  	r->stptr[len] = '\0';
     426  
     427  	if ((flags & SCAN) != 0) {	/* scan for escape sequences */
     428  		const char *pf;
     429  		char *ptm;
     430  		int c;
     431  		const char *end;
     432  		mbstate_t cur_state;
     433  
     434  		memset(& cur_state, 0, sizeof(cur_state));
     435  
     436  		end = &(r->stptr[len]);
     437  		for (pf = ptm = r->stptr; pf < end;) {
     438  			/*
     439  			 * Keep multibyte characters together. This avoids
     440  			 * problems if a subsequent byte of a multibyte
     441  			 * character happens to be a backslash.
     442  			 */
     443  			if (gawk_mb_cur_max > 1) {
     444  				int mblen = mbrlen(pf, end-pf, &cur_state);
     445  
     446  				if (mblen > 1) {
     447  					int i;
     448  
     449  					for (i = 0; i < mblen; i++)
     450  						*ptm++ = *pf++;
     451  					continue;
     452  				}
     453  			}
     454  
     455  			c = *pf++;
     456  			if (c == '\\') {
     457  				c = parse_escape(&pf);
     458  				if (c < 0) {
     459  					if (do_lint)
     460  						lintwarn(_("backslash string continuation is not portable"));
     461  					if ((flags & ELIDE_BACK_NL) != 0)
     462  						continue;
     463  					c = '\\';
     464  				}
     465  				*ptm++ = c;
     466  			} else
     467  				*ptm++ = c;
     468  		}
     469  		len = ptm - r->stptr;
     470  		erealloc(r->stptr, char *, len + 1, "make_str_node");
     471  		r->stptr[len] = '\0';
     472  	}
     473  	r->stlen = len;
     474  
     475  	return r;
     476  }
     477  
     478  /* make_typed_regex --- make a typed regex node */
     479  
     480  NODE *
     481  make_typed_regex(const char *re, size_t len)
     482  {
     483  	NODE *n, *exp, *n2;
     484  
     485  	exp = make_str_node(re, len, ALREADY_MALLOCED);
     486  	n = make_regnode(Node_regex, exp);
     487  	if (n == NULL)
     488  		fatal(_("could not make typed regex"));
     489  
     490  	n2 = make_string(re, len);
     491  	n2->typed_re = n;
     492  #if HAVE_MPFR
     493  	if (do_mpfr)
     494  		mpg_zero(n2);
     495  	else
     496  #endif
     497  	n2->numbr = 0;
     498  	n2->flags |= NUMCUR|STRCUR|REGEX;
     499  	n2->flags &= ~(STRING|NUMBER);
     500  
     501  	return n2;
     502  }
     503  
     504  
     505  /* unref --- remove reference to a particular node */
     506  
     507  void
     508  r_unref(NODE *tmp)
     509  {
     510  #ifdef GAWKDEBUG
     511  	/* Do the same as in awk.h:unref().  */
     512  	assert(tmp == NULL || tmp->valref > 0);
     513  	if (tmp == NULL || --tmp->valref > 0)
     514  		return;
     515  #endif
     516  
     517  	if ((tmp->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
     518  		efree(tmp->stptr);
     519  
     520  	mpfr_unset(tmp);
     521  
     522  	free_wstr(tmp);
     523  	freenode(tmp);
     524  }
     525  
     526  
     527  /*
     528   * parse_escape:
     529   *
     530   * Parse a C escape sequence.  STRING_PTR points to a variable containing a
     531   * pointer to the string to parse.  That pointer is updated past the
     532   * characters we use.  The value of the escape sequence is returned.
     533   *
     534   * A negative value means the sequence \ newline was seen, which is supposed to
     535   * be equivalent to nothing at all.
     536   *
     537   * If \ is followed by a null character, we return a negative value and leave
     538   * the string pointer pointing at the null character.
     539   *
     540   * If \ is followed by 000, we return 0 and leave the string pointer after the
     541   * zeros.  A value of 0 does not mean end of string.
     542   *
     543   * POSIX doesn't allow \x.
     544   */
     545  
     546  int
     547  parse_escape(const char **string_ptr)
     548  {
     549  	int c = *(*string_ptr)++;
     550  	int i;
     551  	int count;
     552  	int j;
     553  	const char *start;
     554  
     555  	if (do_lint_old) {
     556  		switch (c) {
     557  		case 'a':
     558  		case 'b':
     559  		case 'f':
     560  		case 'r':
     561  			lintwarn(_("old awk does not support the `\\%c' escape sequence"), c);
     562  			break;
     563  		}
     564  	}
     565  
     566  	switch (c) {
     567  	case 'a':
     568  		return '\a';
     569  	case 'b':
     570  		return '\b';
     571  	case 'f':
     572  		return '\f';
     573  	case 'n':
     574  		return '\n';
     575  	case 'r':
     576  		return '\r';
     577  	case 't':
     578  		return '\t';
     579  	case 'v':
     580  		return '\v';
     581  	case '\n':
     582  		return -2;
     583  	case 0:
     584  		(*string_ptr)--;
     585  		return -1;
     586  	case '0':
     587  	case '1':
     588  	case '2':
     589  	case '3':
     590  	case '4':
     591  	case '5':
     592  	case '6':
     593  	case '7':
     594  		i = c - '0';
     595  		count = 0;
     596  		while (++count < 3) {
     597  			if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
     598  				i *= 8;
     599  				i += c - '0';
     600  			} else {
     601  				(*string_ptr)--;
     602  				break;
     603  			}
     604  		}
     605  		return i;
     606  	case 'x':
     607  		if (do_lint) {
     608  			static bool warned = false;
     609  
     610  			if (! warned) {
     611  				warned = true;
     612  				lintwarn(_("POSIX does not allow `\\x' escapes"));
     613  			}
     614  		}
     615  		if (do_posix)
     616  			return ('x');
     617  		if (! isxdigit((unsigned char) (*string_ptr)[0])) {
     618  			warning(_("no hex digits in `\\x' escape sequence"));
     619  			return ('x');
     620  		}
     621  		start = *string_ptr;
     622  		for (i = j = 0; j < 2; j++) {
     623  			/* do outside test to avoid multiple side effects */
     624  			c = (unsigned char) *(*string_ptr)++;
     625  			if (isxdigit(c)) {
     626  				i *= 16;
     627  				if (isdigit(c))
     628  					i += c - '0';
     629  				else if (isupper(c))
     630  					i += c - 'A' + 10;
     631  				else
     632  					i += c - 'a' + 10;
     633  			} else {
     634  				(*string_ptr)--;
     635  				break;
     636  			}
     637  		}
     638  		if (do_lint && j == 2 && isxdigit((unsigned char)*(*string_ptr)))
     639  			lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), 3, start, 3);
     640  		return i;
     641  	case '\\':
     642  	case '"':
     643  		return c;
     644  	default:
     645  	{
     646  		static bool warned[256];
     647  		unsigned char uc = (unsigned char) c;
     648  
     649  		/* N.B.: use unsigned char here to avoid Latin-1 problems */
     650  
     651  		if (! warned[uc]) {
     652  			warned[uc] = true;
     653  
     654  			warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
     655  		}
     656  	}
     657  		return c;
     658  	}
     659  }
     660  
     661  /* get_numbase --- return the base to use for the number in 's' */
     662  
     663  int
     664  get_numbase(const char *s, size_t len, bool use_locale)
     665  {
     666  	int dec_point = '.';
     667  	const char *str = s;
     668  
     669  #if defined(HAVE_LOCALE_H)
     670  	/*
     671  	 * loc.decimal_point may not have been initialized yet,
     672  	 * so double check it before using it.
     673  	 */
     674  	if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
     675  		dec_point = loc.decimal_point[0];	/* XXX --- assumes one char */
     676  #endif
     677  
     678  	if (len < 2 || str[0] != '0')
     679  		return 10;
     680  
     681  	/* leading 0x or 0X */
     682  	if (str[1] == 'x' || str[1] == 'X')
     683  		return 16;
     684  
     685  	/*
     686  	 * Numbers with '.', 'e', or 'E' are decimal.
     687  	 * Have to check so that things like 00.34 are handled right.
     688  	 *
     689  	 * These beasts can have trailing whitespace. Deal with that too.
     690  	 */
     691  	for (; len > 0; len--, str++) {
     692  		if (*str == 'e' || *str == 'E' || *str == dec_point)
     693  			return 10;
     694  		else if (! isdigit((unsigned char) *str))
     695  			break;
     696  	}
     697  
     698  	if (! isdigit((unsigned char) s[1])
     699  			|| s[1] == '8' || s[1] == '9'
     700  	)
     701  		return 10;
     702  	return 8;
     703  }
     704  
/*
 * str2wstr --- convert a multibyte string to a wide string
 *
 * Converts the n->stlen bytes at n->stptr into wide characters stored in
 * newly allocated memory at n->wstptr, sets n->wstlen to the number of
 * wide characters produced, and sets WSTRCUR.  The global null string
 * and null field nodes are returned unconverted.  If a wide string is
 * already present it is reused, unless ptr is non-NULL, in which case
 * it is rebuilt.
 *
 * If ptr is non-NULL, *ptr is set to a newly allocated, zero-filled
 * array with one entry for each byte of the original string; the entry
 * for a byte records the index of the wide character that the byte was
 * converted into.
 * NOTE(review): nothing here frees *ptr; presumably that is the
 * caller's job --- confirm.
 *
 * Returns n.
 */

NODE *
str2wstr(NODE *n, size_t **ptr)
{
	size_t i, count, src_count;
	char *sp;
	mbstate_t mbs;
	wchar_t wc, *wsp;
	static bool warned = false;	/* the invalid-data warning is issued only once */

	assert((n->flags & (STRING|STRCUR)) != 0);

	/*
	 * Don't convert global null string or global null field
	 * variables to a wide string. They are both zero-length anyway.
	 * This also avoids future double-free errors while releasing
	 * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
	 */
	if (n == Nnull_string || n == Null_field)
		return n;

	if ((n->flags & WSTRCUR) != 0) {
		if (ptr == NULL)
			return n;
		/* otherwise
			fall through and recompute to fill in the array */
		free_wstr(n);
	}

	/*
	 * After consideration and consultation, this
	 * code trades space for time. We allocate
	 * an array of wchar_t that is n->stlen long.
	 * This is needed in the worst case anyway, where
	 * each input byte maps to one wchar_t.  The
	 * advantage is that we only have to convert the string
	 * once, instead of twice, once to find out how many
	 * wide characters, and then again to actually fill in
	 * the info.  If there's a lot left over, we can
	 * realloc the wide string down in size.
	 */

	emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 1), "str2wstr");
	wsp = n->wstptr;

	/*
	 * For use by do_match, create and fill in an array.
	 * For each byte `i' in n->stptr (the original string),
	 * a[i] is equal to `j', where `j' is the corresponding wchar_t
	 * in the converted wide string.
	 *
	 * Create the array.
	 */
	if (ptr != NULL) {
		ezalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
	}

	/*
	 * sp walks the source bytes and src_count is the number of bytes
	 * still to convert; i is the value recorded in the index array and
	 * advances once per loop iteration.
	 */
	sp = n->stptr;
	src_count = n->stlen;
	memset(& mbs, 0, sizeof(mbs));
	for (i = 0; src_count > 0; i++) {
		/*
		 * 9/2010: Check the current byte; if it's a valid character,
		 * then it doesn't start a multibyte sequence. This brings a
		 * big speed up. Thanks to Ulrich Drepper for the tip.
		 * 11/2010: Thanks to Paolo Bonzini for some even faster code.
		 */
		if (is_valid_character(*sp)) {
			count = 1;
			wc = btowc_cache(*sp);
		} else
			count = mbrtowc(& wc, sp, src_count, & mbs);
		switch (count) {
		case (size_t) -2:
		case (size_t) -1:
			/*
			 * mbrtowc(3) says the state of mbs becomes undefined
			 * after a bad character, so reset it.
			 */
			memset(& mbs, 0, sizeof(mbs));

			/* Warn the user something's wrong */
			if (! warned) {
				warned = true;
				warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale"));
			}

			/*
			 * 8/2015: If we're using UTF, then instead of just
			 * skipping the character, plug in the Unicode
			 * replacement character. In most cases this gives
			 * us "better" results, in that character counts
			 * and string lengths tend to make more sense.
			 *
			 * Otherwise, just skip the bad byte and keep going,
			 * so that we get a more-or-less full string, instead of
			 * stopping early. This is particularly important
			 * for match() where we need to build the indices.
			 */
			if (using_utf8()) {
				count = 1;
				wc = 0xFFFD;	/* unicode replacement character */
				goto set_wc;
			} else {
				/* skip it and keep going */
				/*
				 * NOTE(review): no wide character is stored
				 * for the skipped byte, yet the loop still
				 * increments i, so from here on i is ahead
				 * of (wsp - n->wstptr) --- confirm that the
				 * index array is meant to work this way.
				 */
				sp++;
				src_count--;
			}
			break;

		case 0:
			/* mbrtowc returned 0; count it as a one-byte character */
			count = 1;
			/* fall through */
		default:
		set_wc:
			*wsp++ = wc;
			src_count -= count;
			/* every byte of this character maps to the same index */
			while (count--)  {
				if (ptr != NULL)
					(*ptr)[sp - n->stptr] = i;
				sp++;
			}
			break;
		}
	}

	*wsp = L'\0';
	n->wstlen = wsp - n->wstptr;
	n->flags |= WSTRCUR;
#define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
	/* shrink the allocation when it is noticeably larger than needed */
	if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
		erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "str2wstr");

	return n;
}
     841  
     842  /* wstr2str --- convert a wide string back into multibyte one */
     843  
     844  NODE *
     845  wstr2str(NODE *n)
     846  {
     847  	size_t result;
     848  	size_t length;
     849  	wchar_t *wp;
     850  	mbstate_t mbs;
     851  	char *newval, *cp;
     852  
     853  	assert(n->valref == 1);
     854  	assert((n->flags & WSTRCUR) != 0);
     855  
     856  	/*
     857  	 * Convert the wide chars in t1->wstptr back into m.b. chars.
     858  	 * This is pretty grotty, but it's the most straightforward
     859  	 * way to do things.
     860  	 */
     861  	memset(& mbs, 0, sizeof(mbs));
     862  
     863  	length = n->wstlen;
     864  	emalloc(newval, char *, (length * gawk_mb_cur_max) + 1, "wstr2str");
     865  
     866  	wp = n->wstptr;
     867  	for (cp = newval; length > 0; length--) {
     868  		result = wcrtomb(cp, *wp, & mbs);
     869  		if (result == (size_t) -1)	/* what to do? break seems best */
     870  			break;
     871  		cp += result;
     872  		wp++;
     873  	}
     874  	*cp = '\0';
     875  
     876  	/* N.B. caller just created n with make_string, so this free is safe */
     877  	efree(n->stptr);
     878  	n->stptr = newval;
     879  	n->stlen = cp - newval;
     880  
     881  	return n;
     882  }
     883  
     884  /* free_wstr --- release the wide string part of a node */
     885  
     886  void
     887  r_free_wstr(NODE *n)
     888  {
     889  	assert(n->type == Node_val);
     890  
     891  	if ((n->flags & WSTRCUR) != 0) {
     892  		assert(n->wstptr != NULL);
     893  		efree(n->wstptr);
     894  	}
     895  	n->wstptr = NULL;
     896  	n->wstlen = 0;
     897  	n->flags &= ~WSTRCUR;
     898  }
     899  
     900  static void __attribute__ ((unused))
     901  dump_wstr(FILE *fp, const wchar_t *str, size_t len)
     902  {
     903  	if (str == NULL || len == 0)
     904  		return;
     905  
     906  	for (; len--; str++)
     907  		putwc(*str, fp);
     908  }
     909  
     910  /* wstrstr --- walk haystack, looking for needle, wide char version */
     911  
     912  const wchar_t *
     913  wstrstr(const wchar_t *haystack, size_t hs_len,
     914  	const wchar_t *needle, size_t needle_len)
     915  {
     916  	size_t i;
     917  
     918  	if (haystack == NULL || needle == NULL || needle_len > hs_len)
     919  		return NULL;
     920  
     921  	for (i = 0; i < hs_len; i++) {
     922  		if (haystack[i] == needle[0]
     923  		    && i+needle_len-1 < hs_len
     924  		    && haystack[i+needle_len-1] == needle[needle_len-1]) {
     925  			/* first & last chars match, check string */
     926  			if (memcmp(haystack+i, needle, sizeof(wchar_t) * needle_len) == 0) {
     927  				return haystack + i;
     928  			}
     929  		}
     930  	}
     931  
     932  	return NULL;
     933  }
     934  
     935  /* wcasestrstr --- walk haystack, nocase look for needle, wide char version */
     936  
     937  const wchar_t *
     938  wcasestrstr(const wchar_t *haystack, size_t hs_len,
     939  	const wchar_t *needle, size_t needle_len)
     940  {
     941  	size_t i, j;
     942  
     943  	if (haystack == NULL || needle == NULL || needle_len > hs_len)
     944  		return NULL;
     945  
     946  	for (i = 0; i < hs_len; i++) {
     947  		if (towlower(haystack[i]) == towlower(needle[0])
     948  		    && i+needle_len-1 < hs_len
     949  		    && towlower(haystack[i+needle_len-1]) == towlower(needle[needle_len-1])) {
     950  			/* first & last chars match, check string */
     951  			const wchar_t *start;
     952  
     953  			start = haystack+i;
     954  			for (j = 0; j < needle_len; j++, start++) {
     955  				wchar_t h, n;
     956  
     957  				h = towlower(*start);
     958  				n = towlower(needle[j]);
     959  				if (h != n)
     960  					goto out;
     961  			}
     962  			return haystack + i;
     963  		}
     964  out:	;
     965  	}
     966  
     967  	return NULL;
     968  }
     969  
     970  /* is_ieee_magic_val --- return true for +inf, -inf, +nan, -nan */
     971  
     972  bool
     973  is_ieee_magic_val(const char *val)
     974  {
     975  	/*
     976  	 * Avoid strncasecmp: it mishandles ASCII bytes in some locales.
     977  	 * Assume the length is 4, as the caller checks this.
     978  	 */
     979  	return (   (val[0] == '+' || val[0] == '-')
     980  		&& (   (   (val[1] == 'i' || val[1] == 'I')
     981  			&& (val[2] == 'n' || val[2] == 'N')
     982  			&& (val[3] == 'f' || val[3] == 'F'))
     983  		    || (   (val[1] == 'n' || val[1] == 'N')
     984  			&& (val[2] == 'a' || val[2] == 'A')
     985  			&& (val[3] == 'n' || val[3] == 'N'))));
     986  }
     987  
     988  /* get_ieee_magic_val --- return magic value for string */
     989  
     990  static AWKNUM
     991  get_ieee_magic_val(char *val)
     992  {
     993  	static bool first = true;
     994  	static AWKNUM inf;
     995  	static AWKNUM nan;
     996  	char save;
     997  
     998  	char *ptr;
     999  	save = val[4];
    1000  	val[4] = '\0';
    1001  	AWKNUM v = strtod(val, &ptr);
    1002  	val[4] = save;
    1003  
    1004  	if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
    1005  		if (first) {
    1006  			first = false;
    1007  			nan = sqrt(-1.0);
    1008  			inf = -log(0.0);
    1009  		}
    1010  
    1011  		v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
    1012  		if (val[0] == '-')
    1013  			v = -v;
    1014  	}
    1015  
    1016  	return v;
    1017  }
    1018  
    1019  wint_t btowc_cache[256];
    1020  
    1021  /* init_btowc_cache --- initialize the cache */
    1022  
    1023  void init_btowc_cache()
    1024  {
    1025  	int i;
    1026  
    1027  	for (i = 0; i <= 255; i++) {
    1028  		btowc_cache[i] = btowc(i);
    1029  	}
    1030  }
    1031  
/* BLOCKCHUNK --- number of items more_blocks() allocates in one chunk */
#define BLOCKCHUNK 100

/*
 * nextfree --- allocation bookkeeping, one entry per block id.
 *
 * The allocator routines below index this table by id and use its
 * size, freep and highwater members (plus active under MEMDEBUG).
 * NOTE(review): the initializers are positional; presumably the three
 * values are the free-list pointer, the item size and a label, but
 * confirm against the declaration of struct block_header.
 */
struct block_header nextfree[BLOCK_MAX] = {
	{ NULL, sizeof(NODE), "node" },
	{ NULL, sizeof(BUCKET), "bucket" },
};
    1038  
    1039  #ifdef MEMDEBUG
    1040  
    1041  void *
    1042  r_getblock(int id)
    1043  {
    1044  	void *res;
    1045  	emalloc(res, void *, nextfree[id].size, "getblock");
    1046  	nextfree[id].active++;
    1047  	if (nextfree[id].highwater < nextfree[id].active)
    1048  		nextfree[id].highwater = nextfree[id].active;
    1049  	return res;
    1050  }
    1051  
    1052  void
    1053  r_freeblock(void *p, int id)
    1054  {
    1055  	nextfree[id].active--;
    1056  	free(p);
    1057  }
    1058  
    1059  #else
    1060  
    1061  /* more_blocks --- get more blocks of memory and add to the free list;
    1062  	size of a block must be >= sizeof(struct block_item)
    1063   */
    1064  
    1065  void *
    1066  more_blocks(int id)
    1067  {
    1068  	struct block_item *freep, *np, *next;
    1069  	char *p, *endp;
    1070  	size_t size;
    1071  
    1072  	size = nextfree[id].size;
    1073  
    1074  	assert(size >= sizeof(struct block_item));
    1075  	emalloc(freep, struct block_item *, BLOCKCHUNK * size, "more_blocks");
    1076  	p = (char *) freep;
    1077  	endp = p + BLOCKCHUNK * size;
    1078  
    1079  	for (np = freep; ; np = next) {
    1080  		next = (struct block_item *) (p += size);
    1081  		if (p >= endp) {
    1082  			np->freep = NULL;
    1083  			break;
    1084  		}
    1085  		np->freep = next;
    1086  	}
    1087  	nextfree[id].freep = freep->freep;
    1088  	nextfree[id].highwater += BLOCKCHUNK;
    1089  	return freep;
    1090  }
    1091  
    1092  #endif
    1093  
    1094  /* make_bool_node --- make a boolean-valued node */
    1095  
    1096  extern NODE *
    1097  make_bool_node(bool value)
    1098  {
    1099  	NODE *val;
    1100  	const char *sval;
    1101  	AWKNUM nval;
    1102  
    1103  	sval = (value ? "1" : "0");
    1104  	nval = (value ? 1.0 : 0.0);
    1105  
    1106  	val = make_number(nval);
    1107  	val->stptr = estrdup(sval, strlen(sval));
    1108  	val->stlen = strlen(sval);
    1109  	val->flags |= NUMCUR|STRCUR|BOOLVAL;
    1110  	val->stfmt = STFMT_UNUSED;
    1111  
    1112  	return val;
    1113  }