1 /*
2 * node.c -- routines for node management
3 */
4
5 /*
6 * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2015, 2017-2019, 2021, 2022, 2023,
7 * the Free Software Foundation, Inc.
8 *
9 * This file is part of GAWK, the GNU implementation of the
10 * AWK Programming Language.
11 *
12 * GAWK is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 3 of the License, or
15 * (at your option) any later version.
16 *
17 * GAWK is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 */
26
27 #include "awk.h"
28
/* Forward declarations for routines that are private to this file. */
static NODE *r_make_number(double x);
static AWKNUM get_ieee_magic_val(char *val);
extern NODE **fmt_list;          /* declared in eval.c */

/*
 * Function-pointer hooks for the basic numeric operations. They start out
 * pointing at the double-precision routines r_make_number, r_force_number,
 * r_format_val and cmp_awknums.
 * NOTE(review): presumably other code may repoint these at alternative
 * implementations --- confirm against the rest of the tree.
 */
NODE *(*make_number)(double) = r_make_number;
NODE *(*str2number)(NODE *) = r_force_number;
NODE *(*format_val)(const char *, int, NODE *) = r_format_val;
int (*cmp_numbers)(const NODE *, const NODE *) = cmp_awknums;
37
/* is_hex --- does the text begin with an optionally signed "0x" or "0X"? */

static bool
is_hex(const char *str, const char *cpend)
{
	const char *digits = str;

	/* the caller guarantees at least one character, so *str is readable */
	if (*digits == '+' || *digits == '-')
		digits++;

	/* both prefix characters have to lie before cpend */
	if (digits + 1 >= cpend)
		return false;

	if (digits[0] != '0')
		return false;

	return (digits[1] == 'x' || digits[1] == 'X');
}
52
/*
 * r_force_number --- force a value to be numeric
 *
 * Makes sure that n carries a current numeric value and returns n itself.
 *
 * A Node_elem_new node is turned into a Node_val whose string is "0"
 * (stlen 1) with STRING cleared. A node that already has NUMCUR set is
 * returned unchanged. Otherwise NUMCUR is set and the text in
 * n->stptr / n->stlen is converted, ignoring leading and trailing
 * white space:
 *
 *	- if the whole trimmed text is a number, n->numbr receives it and,
 *	  when USER_INPUT is set, STRING is cleared and NUMBER is set;
 *	- otherwise USER_INPUT is cleared and n->numbr holds either the
 *	  leading numeric value that was parsed or 0.
 */

NODE *
r_force_number(NODE *n)
{
	char *cp;
	char *cpend;
	char save;
	char *ptr;

	if (n->type == Node_elem_new) {
		n->type = Node_val;
		n->flags &= ~STRING;
		n->stptr[0] = '0';	// STRCUR is still set
		n->stlen = 1;

		return n;
	}

	/* the numeric value is already current, nothing to do */
	if ((n->flags & NUMCUR) != 0)
		return n;

	/*
	 * We should always set NUMCUR. If USER_INPUT is set and it's a
	 * numeric string, we clear STRING and enable NUMBER, but if it's not
	 * numeric, we disable USER_INPUT.
	 */

	/* All the conditionals are an attempt to avoid the expensive strtod */

	n->flags |= NUMCUR;
	n->numbr = 0.0;

	/* Trim leading white space, bailing out if there's nothing else */
	for (cp = n->stptr, cpend = cp + n->stlen;
	     cp < cpend && isspace((unsigned char) *cp); cp++)
		continue;

	if (cp == cpend)
		goto badnum;

	/* At this point, we know the string is not entirely white space */
	/* Trim trailing white space */
	while (isspace((unsigned char) cpend[-1]))
		cpend--;

	/*
	 * 2/2007:
	 * POSIX, by way of severe language lawyering, seems to
	 * allow things like "inf" and "nan" to mean something.
	 * So if do_posix, the user gets what he deserves.
	 * This also allows hexadecimal floating point. Ugh.
	 */
	if (! do_posix) {
		if (is_alpha((unsigned char) *cp))
			goto badnum;
		else if (is_ieee_magic_val(cp)) {
			/* only an exact four character "+inf", "-nan", etc. counts */
			if (cpend == cp + 4) {
				n->numbr = get_ieee_magic_val(cp);
				goto goodnum;
			} else
				goto badnum;
		}
		/* else
			fall through */
	}
	/* else POSIX, so
		fall through */

	/*
	 * NOTE(review): when ! do_posix, the is_alpha() test below repeats
	 * the one already made above; the new condition here is the hex check.
	 */
	if (   (! do_posix		/* not POSIXLY paranoid and */
	        && (is_alpha((unsigned char) *cp)	/* letter, or */
					/* CANNOT do non-decimal and saw 0x */
		    || (! do_non_decimal_data && is_hex(cp, cpend))))) {
		goto badnum;
	}

	if (cpend - cp == 1) {		/* only one character */
		if (isdigit((unsigned char) *cp)) {	/* it's a digit! */
			n->numbr = (AWKNUM)(*cp - '0');
			if (n->stlen == 1)		/* no white space */
				n->flags |= NUMINT;
			goto goodnum;
		}
		goto badnum;
	}

	errno = 0;
	if (do_non_decimal_data		/* main.c assures false if do_posix */
		&& ! do_traditional && get_numbase(cp, cpend - cp, true) != 10) {
		/* nondec2awknum() saves and restores the byte after the string itself */
		n->numbr = nondec2awknum(cp, cpend - cp, &ptr);
	} else {
		/*
		 * Temporarily terminate the trimmed text so that strtod stops
		 * there; the overwritten byte is put back right afterwards.
		 */
		save = *cpend;
		*cpend = '\0';
		n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
		*cpend = save;
	}

	if (errno == 0 || errno == ERANGE) {
		errno = 0;	/* reset in case of ERANGE */
		/* the conversion must have consumed the entire trimmed text */
		if (ptr == cpend)
			goto goodnum;
		/* else keep the leading numeric value without updating flags */
		/* fall through to badnum */
	} else {
		/*
		 * NOTE(review): this branch runs only when errno is neither 0
		 * nor ERANGE, so the ERANGE remark below does not appear to
		 * describe the code path it sits in --- confirm intent.
		 */
		errno = 0;
		/*
		 * N.B. For subnormal values, strtod may return the
		 * floating-point representation while setting errno to ERANGE.
		 * We force the numeric value to 0 in such cases.
		 */
		n->numbr = 0;
		/*
		 * Or should we accept it as a NUMBER even though strtod
		 * threw an error?
		 */
		/* fall through to badnum */
	}
badnum:
	/* not a number: it can no longer be treated as numeric user input */
	n->flags &= ~USER_INPUT;
	return n;

goodnum:
	/* a NaN that came from text starting with '-' gets its sign bit set */
	if (isnan(n->numbr) && *cp == '-' && signbit(n->numbr) == 0)
		n->numbr = -(n->numbr);

	if ((n->flags & USER_INPUT) != 0) {
		/* leave USER_INPUT enabled to indicate that this is a strnum */
		n->flags &= ~STRING;
		n->flags |= NUMBER;
	}
	return n;
}
186
187
/*
 * The following lookup table is used as an optimization in force_string;
 * (more complicated) variations on this theme didn't seem to pay off, but
 * systematic testing might be in order at some point.
 *
 * In this file the table is consulted by r_format_val(), which uses
 * values[num] directly for integral values 0 through NVAL - 1 instead of
 * formatting them with sprintf.
 */
static const char *values[] = {
	"0",
	"1",
	"2",
	"3",
	"4",
	"5",
	"6",
	"7",
	"8",
	"9",
};
/* number of entries in values[] */
#define	NVAL	(sizeof(values)/sizeof(values[0]))
206
/*
 * r_format_val --- format a numeric value based on format
 *
 * format: conversion format, handed to format_tree() for values that are
 *         not integral.
 * index:  position of that format in fmt_list; fmt_list[index]->stlen
 *         supplies the format's length and the index is recorded in
 *         s->stfmt when the format is actually used.
 * s:      node whose numeric value (s->numbr) is to be converted.
 *
 * Returns s with stptr and stlen replaced by the new text and STRCUR set.
 * The exception is a value for which out_of_range(s) is true: in that case
 * a new string node built from format_nan_inf(s, 'g') is returned and the
 * string fields of s are not updated here.
 */

NODE *
r_format_val(const char *format, int index, NODE *s)
{
	char buf[BUFSIZ];
	char *sp = buf;
	double val;

	/*
	 * 2/2007: Simplify our lives here. Instead of worrying about
	 * whether or not the value will fit into a long just so we
	 * can use sprintf("%ld", val) on it, always format it ourselves.
	 * The only thing to worry about is that integral values always
	 * format as integers. %.0f does that very well.
	 *
	 * 6/2008: Would that things were so simple. Always using %.0f
	 * imposes a notable performance penalty for applications that
	 * do a lot of conversion of integers to strings. So, we reinstate
	 * the old code, but use %.0f for integral values that are outside
	 * the range of a long. This seems a reasonable compromise.
	 *
	 * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
	 * < and > so that things work correctly on systems with 64 bit integers.
	 */

	if (out_of_range(s)) {
		const char *result = format_nan_inf(s, 'g');
		return make_string(result, strlen(result));
	} else if ((val = double_to_int(s->numbr)) != s->numbr
			|| val <= LONG_MIN || val >= LONG_MAX
	) {
		/* not an integral value, or out of integer range */
		/*
		 * Once upon a time, we just blindly did this:
		 *	sprintf(sp, format, s->numbr);
		 *	s->stlen = strlen(sp);
		 *	s->stfmt = index;
		 * but that's no good if, e.g., OFMT is %s. So we punt,
		 * and just always format the value ourselves.
		 */

		NODE *dummy[2], *r;
		unsigned int oflags;

		/* create dummy node for a sole use of format_tree */
		dummy[1] = s;
		/* remember the flags so they can be put back after format_tree() */
		oflags = s->flags;

		if (val == s->numbr) {
			/* integral value, but outside range of %ld, use %.0f */
			r = format_tree("%.0f", 4, dummy, 2);
			s->stfmt = STFMT_UNUSED;
		} else {
			r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
			assert(r != NULL);
			s->stfmt = index;
		}
		s->flags = oflags;
		s->stlen = r->stlen;
		/* release the previous text only if s owns it */
		if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
			efree(s->stptr);
		/* take over the buffer that format_tree() produced */
		s->stptr = r->stptr;
#ifdef HAVE_MPFR
		s->strndmode = MPFR_round_mode;
#endif
		freenode(r);	/* Do not unref(r)! We want to keep s->stptr == r->stptr.  */

		/* the text is already in place, skip the copy below */
		goto no_malloc;
	} else {
		/*
		 * integral value; force conversion to long only once.
		 */
		long num = (long) val;

		if (num < NVAL && num >= 0) {
			/* single digit: use the prebuilt string from values[] */
			sp = (char *) values[num];
			s->stlen = 1;
		} else {
			(void) sprintf(sp, "%ld", num);
			s->stlen = strlen(sp);
		}
		s->stfmt = STFMT_UNUSED;
		if ((s->flags & INTIND) != 0) {
			s->flags &= ~(INTIND|NUMBER);
			s->flags |= STRING;
		}
#ifdef HAVE_MPFR
		s->strndmode = MPFR_round_mode;
#endif
	}
	/* drop the old text if s owns it, then copy sp including its terminator */
	if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
		efree(s->stptr);
	emalloc(s->stptr, char *, s->stlen + 1, "format_val");
	memcpy(s->stptr, sp, s->stlen + 1);
no_malloc:
	s->flags |= STRCUR;
	/* the string changed, so discard the wide-character form */
	free_wstr(s);
	return s;
}
307
308 /* r_dupnode --- duplicate a node */
309
310 NODE *
311 r_dupnode(NODE *n)
312 {
313 NODE *r;
314
315 assert(n->type == Node_val);
316
317 #ifdef GAWKDEBUG
318 /* Do the same as in awk.h:dupnode(). */
319 if ((n->flags & MALLOC) != 0) {
320 n->valref++;
321 return n;
322 }
323 #endif
324 getnode(r);
325 *r = *n;
326
327 #ifdef HAVE_MPFR
328 if ((n->flags & MPZN) != 0) {
329 mpz_init(r->mpg_i);
330 mpz_set(r->mpg_i, n->mpg_i);
331 } else if ((n->flags & MPFN) != 0) {
332 mpfr_init(r->mpg_numbr);
333 int tval = mpfr_set(r->mpg_numbr, n->mpg_numbr, ROUND_MODE);
334 IEEE_FMT(r->mpg_numbr, tval);
335 }
336 #endif
337
338 r->flags |= MALLOC;
339 r->valref = 1;
340 /*
341 * DON'T call free_wstr(r) here!
342 * r->wstptr still points at n->wstptr's value, and we
343 * don't want to free it!
344 */
345 r->wstptr = NULL;
346 r->wstlen = 0;
347
348 if ((n->flags & STRCUR) != 0) {
349 emalloc(r->stptr, char *, n->stlen + 1, "r_dupnode");
350 memcpy(r->stptr, n->stptr, n->stlen);
351 r->stptr[n->stlen] = '\0';
352 r->stlen = n->stlen;
353 if ((n->flags & WSTRCUR) != 0) {
354 r->wstlen = n->wstlen;
355 emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "r_dupnode");
356 memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
357 r->wstptr[n->wstlen] = L'\0';
358 r->flags |= WSTRCUR;
359 }
360 }
361
362 return r;
363 }
364
365 /* r_make_number --- allocate a node with defined number */
366
367 static NODE *
368 r_make_number(double x)
369 {
370 NODE *r = make_number_node(0);
371 r->numbr = x;
372 return r;
373 }
374
375 /* cmp_awknums --- compare two AWKNUMs */
376
377 int
378 cmp_awknums(const NODE *t1, const NODE *t2)
379 {
380 /*
381 * This routine is used to sort numeric array indices or values.
382 * For the purposes of sorting, NaN is considered greater than
383 * any other value, and all NaN values are considered equivalent and equal.
384 * This isn't in compliance with IEEE standard, but compliance w.r.t. NaN
385 * comparison at the awk level is a different issue, and needs to be dealt
386 * with in the interpreter for each opcode seperately.
387 */
388
389 if (isnan(t1->numbr))
390 return ! isnan(t2->numbr);
391 if (isnan(t2->numbr))
392 return -1;
393 /* don't subtract, in case one or both are infinite */
394 if (t1->numbr == t2->numbr)
395 return 0;
396 if (t1->numbr < t2->numbr)
397 return -1;
398 return 1;
399 }
400
401 /* make_str_node --- make a string node */
402
403 NODE *
404 make_str_node(const char *s, size_t len, int flags)
405 {
406 NODE *r;
407 getnode(r);
408 r->type = Node_val;
409 r->numbr = 0;
410 r->flags = (MALLOC|STRING|STRCUR);
411 r->valref = 1;
412 r->stfmt = STFMT_UNUSED;
413 #ifdef HAVE_MPFR
414 r->strndmode = MPFR_round_mode;
415 #endif
416 r->wstptr = NULL;
417 r->wstlen = 0;
418
419 if ((flags & ALREADY_MALLOCED) != 0)
420 r->stptr = (char *) s;
421 else {
422 emalloc(r->stptr, char *, len + 1, "make_str_node");
423 memcpy(r->stptr, s, len);
424 }
425 r->stptr[len] = '\0';
426
427 if ((flags & SCAN) != 0) { /* scan for escape sequences */
428 const char *pf;
429 char *ptm;
430 int c;
431 const char *end;
432 mbstate_t cur_state;
433
434 memset(& cur_state, 0, sizeof(cur_state));
435
436 end = &(r->stptr[len]);
437 for (pf = ptm = r->stptr; pf < end;) {
438 /*
439 * Keep multibyte characters together. This avoids
440 * problems if a subsequent byte of a multibyte
441 * character happens to be a backslash.
442 */
443 if (gawk_mb_cur_max > 1) {
444 int mblen = mbrlen(pf, end-pf, &cur_state);
445
446 if (mblen > 1) {
447 int i;
448
449 for (i = 0; i < mblen; i++)
450 *ptm++ = *pf++;
451 continue;
452 }
453 }
454
455 c = *pf++;
456 if (c == '\\') {
457 c = parse_escape(&pf);
458 if (c < 0) {
459 if (do_lint)
460 lintwarn(_("backslash string continuation is not portable"));
461 if ((flags & ELIDE_BACK_NL) != 0)
462 continue;
463 c = '\\';
464 }
465 *ptm++ = c;
466 } else
467 *ptm++ = c;
468 }
469 len = ptm - r->stptr;
470 erealloc(r->stptr, char *, len + 1, "make_str_node");
471 r->stptr[len] = '\0';
472 }
473 r->stlen = len;
474
475 return r;
476 }
477
478 /* make_typed_regex --- make a typed regex node */
479
480 NODE *
481 make_typed_regex(const char *re, size_t len)
482 {
483 NODE *n, *exp, *n2;
484
485 exp = make_str_node(re, len, ALREADY_MALLOCED);
486 n = make_regnode(Node_regex, exp);
487 if (n == NULL)
488 fatal(_("could not make typed regex"));
489
490 n2 = make_string(re, len);
491 n2->typed_re = n;
492 #if HAVE_MPFR
493 if (do_mpfr)
494 mpg_zero(n2);
495 else
496 #endif
497 n2->numbr = 0;
498 n2->flags |= NUMCUR|STRCUR|REGEX;
499 n2->flags &= ~(STRING|NUMBER);
500
501 return n2;
502 }
503
504
505 /* unref --- remove reference to a particular node */
506
507 void
508 r_unref(NODE *tmp)
509 {
510 #ifdef GAWKDEBUG
511 /* Do the same as in awk.h:unref(). */
512 assert(tmp == NULL || tmp->valref > 0);
513 if (tmp == NULL || --tmp->valref > 0)
514 return;
515 #endif
516
517 if ((tmp->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
518 efree(tmp->stptr);
519
520 mpfr_unset(tmp);
521
522 free_wstr(tmp);
523 freenode(tmp);
524 }
525
526
527 /*
528 * parse_escape:
529 *
530 * Parse a C escape sequence. STRING_PTR points to a variable containing a
531 * pointer to the string to parse. That pointer is updated past the
532 * characters we use. The value of the escape sequence is returned.
533 *
534 * A negative value means the sequence \ newline was seen, which is supposed to
535 * be equivalent to nothing at all.
536 *
537 * If \ is followed by a null character, we return a negative value and leave
538 * the string pointer pointing at the null character.
539 *
540 * If \ is followed by 000, we return 0 and leave the string pointer after the
541 * zeros. A value of 0 does not mean end of string.
542 *
543 * POSIX doesn't allow \x.
544 */
545
546 int
547 parse_escape(const char **string_ptr)
548 {
549 int c = *(*string_ptr)++;
550 int i;
551 int count;
552 int j;
553 const char *start;
554
555 if (do_lint_old) {
556 switch (c) {
557 case 'a':
558 case 'b':
559 case 'f':
560 case 'r':
561 lintwarn(_("old awk does not support the `\\%c' escape sequence"), c);
562 break;
563 }
564 }
565
566 switch (c) {
567 case 'a':
568 return '\a';
569 case 'b':
570 return '\b';
571 case 'f':
572 return '\f';
573 case 'n':
574 return '\n';
575 case 'r':
576 return '\r';
577 case 't':
578 return '\t';
579 case 'v':
580 return '\v';
581 case '\n':
582 return -2;
583 case 0:
584 (*string_ptr)--;
585 return -1;
586 case '0':
587 case '1':
588 case '2':
589 case '3':
590 case '4':
591 case '5':
592 case '6':
593 case '7':
594 i = c - '0';
595 count = 0;
596 while (++count < 3) {
597 if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
598 i *= 8;
599 i += c - '0';
600 } else {
601 (*string_ptr)--;
602 break;
603 }
604 }
605 return i;
606 case 'x':
607 if (do_lint) {
608 static bool warned = false;
609
610 if (! warned) {
611 warned = true;
612 lintwarn(_("POSIX does not allow `\\x' escapes"));
613 }
614 }
615 if (do_posix)
616 return ('x');
617 if (! isxdigit((unsigned char) (*string_ptr)[0])) {
618 warning(_("no hex digits in `\\x' escape sequence"));
619 return ('x');
620 }
621 start = *string_ptr;
622 for (i = j = 0; j < 2; j++) {
623 /* do outside test to avoid multiple side effects */
624 c = (unsigned char) *(*string_ptr)++;
625 if (isxdigit(c)) {
626 i *= 16;
627 if (isdigit(c))
628 i += c - '0';
629 else if (isupper(c))
630 i += c - 'A' + 10;
631 else
632 i += c - 'a' + 10;
633 } else {
634 (*string_ptr)--;
635 break;
636 }
637 }
638 if (do_lint && j == 2 && isxdigit((unsigned char)*(*string_ptr)))
639 lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), 3, start, 3);
640 return i;
641 case '\\':
642 case '"':
643 return c;
644 default:
645 {
646 static bool warned[256];
647 unsigned char uc = (unsigned char) c;
648
649 /* N.B.: use unsigned char here to avoid Latin-1 problems */
650
651 if (! warned[uc]) {
652 warned[uc] = true;
653
654 warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
655 }
656 }
657 return c;
658 }
659 }
660
/*
 * get_numbase --- return the base to use for the number in 's'
 *
 * s, len:     the text and the number of characters to examine.
 * use_locale: when true (and locale support is compiled in), the locale's
 *             decimal point is recognized instead of '.'.
 *
 * Returns 16 for a leading 0x or 0X, 8 for a leading 0 followed by a digit
 * from 0 through 7, and 10 for everything else.
 */

int
get_numbase(const char *s, size_t len, bool use_locale)
{
	int dec_point = '.';
	size_t i;

#if defined(HAVE_LOCALE_H)
	/*
	 * loc.decimal_point may not have been initialized yet,
	 * so double check it before using it.
	 */
	if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
		dec_point = loc.decimal_point[0];	/* XXX --- assumes one char */
#endif

	/* only a leading zero with something after it can be non-decimal */
	if (len < 2 || s[0] != '0')
		return 10;

	/* leading 0x or 0X */
	if (s[1] == 'x' || s[1] == 'X')
		return 16;

	/*
	 * A decimal point or an exponent marker anywhere in the leading run
	 * of digits means decimal, so that things like 00.34 come out right.
	 * The scan ends at the first other non-digit, which copes with
	 * trailing white space.
	 */
	for (i = 0; i < len; i++) {
		const char ch = s[i];

		if (ch == dec_point || ch == 'e' || ch == 'E')
			return 10;
		if (! isdigit((unsigned char) ch))
			break;
	}

	/* octal only if the digit after the leading zero is 0 through 7 */
	if (isdigit((unsigned char) s[1]) && s[1] != '8' && s[1] != '9')
		return 8;

	return 10;
}
704
/*
 * str2wstr --- convert a multibyte string to a wide string
 *
 * n:   node whose string (n->stptr, n->stlen) is converted; the result is
 *      stored in n->wstptr / n->wstlen and WSTRCUR is set.
 * ptr: if not NULL, *ptr is set to a newly allocated, zero-filled array of
 *      n->stlen entries in which entry i holds the index of the wide
 *      character that byte i of the original string belongs to.
 *      NOTE(review): nothing here frees that array --- presumably the
 *      caller does; confirm.
 *
 * Returns n. Nnull_string and Null_field are returned untouched, and so is
 * a node whose wide string is already current when ptr is NULL.
 */

NODE *
str2wstr(NODE *n, size_t **ptr)
{
	size_t i, count, src_count;
	char *sp;
	mbstate_t mbs;
	wchar_t wc, *wsp;
	static bool warned = false;

	assert((n->flags & (STRING|STRCUR)) != 0);

	/*
	 * Don't convert global null string or global null field
	 * variables to a wide string. They are both zero-length anyway.
	 * This also avoids future double-free errors while releasing
	 * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
	 */
	if (n == Nnull_string || n == Null_field)
		return n;

	if ((n->flags & WSTRCUR) != 0) {
		if (ptr == NULL)
			return n;
		/* otherwise
			fall through and recompute to fill in the array */
		free_wstr(n);
	}

	/*
	 * After consideration and consultation, this
	 * code trades space for time. We allocate
	 * an array of wchar_t that is n->stlen long.
	 * This is needed in the worst case anyway, where
	 * each input byte maps to one wchar_t. The
	 * advantage is that we only have to convert the string
	 * once, instead of twice, once to find out how many
	 * wide characters, and then again to actually fill in
	 * the info.  If there's a lot left over, we can
	 * realloc the wide string down in size.
	 */

	emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 1), "str2wstr");
	wsp = n->wstptr;

	/*
	 * For use by do_match, create and fill in an array.
	 * For each byte `i' in n->stptr (the original string),
	 * a[i] is equal to `j', where `j' is the corresponding wchar_t
	 * in the converted wide string.
	 *
	 * Create the array.
	 */
	if (ptr != NULL) {
		ezalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
	}

	/* sp walks the source bytes, src_count is how many of them remain */
	sp = n->stptr;
	src_count = n->stlen;
	memset(& mbs, 0, sizeof(mbs));
	/* i advances once per pass, including passes that skip a bad byte */
	for (i = 0; src_count > 0; i++) {
		/*
		 * 9/2010: Check the current byte; if it's a valid character,
		 * then it doesn't start a multibyte sequence. This brings a
		 * big speed up. Thanks to Ulrich Drepper for the tip.
		 * 11/2010: Thanks to Paolo Bonzini for some even faster code.
		 */
		if (is_valid_character(*sp)) {
			count = 1;
			wc = btowc_cache(*sp);
		} else
			count = mbrtowc(& wc, sp, src_count, & mbs);
		switch (count) {
		case (size_t) -2:
		case (size_t) -1:
			/*
			 * mbrtowc(3) says the state of mbs becomes undefined
			 * after a bad character, so reset it.
			 */
			memset(& mbs, 0, sizeof(mbs));

			/* Warn the user something's wrong */
			if (! warned) {
				warned = true;
				warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale"));
			}

			/*
			 * 8/2015: If we're using UTF, then instead of just
			 * skipping the character, plug in the Unicode
			 * replacement character. In most cases this gives
			 * us "better" results, in that character counts
			 * and string lengths tend to make more sense.
			 *
			 * Otherwise, just skip the bad byte and keep going,
			 * so that we get a more-or-less full string, instead of
			 * stopping early. This is particularly important
			 * for match() where we need to build the indices.
			 */
			if (using_utf8()) {
				count = 1;
				wc = 0xFFFD;	/* unicode replacement character */
				goto set_wc;
			} else {
				/* skip it and keep going */
				sp++;
				src_count--;
			}
			break;

		case 0:
			/* mbrtowc() returned 0: store wc and consume one byte */
			count = 1;
			/* fall through */
		default:
		set_wc:
			/* store the wide character and consume its source bytes */
			*wsp++ = wc;
			src_count -= count;
			while (count--) {
				if (ptr != NULL)
					(*ptr)[sp - n->stptr] = i;
				sp++;
			}
			break;
		}
	}

	*wsp = L'\0';
	n->wstlen = wsp - n->wstptr;
	n->flags |= WSTRCUR;
	/* shrink the allocation only when more than this many entries went unused */
#define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
	if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
		erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "str2wstr");

	return n;
}
841
842 /* wstr2str --- convert a wide string back into multibyte one */
843
844 NODE *
845 wstr2str(NODE *n)
846 {
847 size_t result;
848 size_t length;
849 wchar_t *wp;
850 mbstate_t mbs;
851 char *newval, *cp;
852
853 assert(n->valref == 1);
854 assert((n->flags & WSTRCUR) != 0);
855
856 /*
857 * Convert the wide chars in t1->wstptr back into m.b. chars.
858 * This is pretty grotty, but it's the most straightforward
859 * way to do things.
860 */
861 memset(& mbs, 0, sizeof(mbs));
862
863 length = n->wstlen;
864 emalloc(newval, char *, (length * gawk_mb_cur_max) + 1, "wstr2str");
865
866 wp = n->wstptr;
867 for (cp = newval; length > 0; length--) {
868 result = wcrtomb(cp, *wp, & mbs);
869 if (result == (size_t) -1) /* what to do? break seems best */
870 break;
871 cp += result;
872 wp++;
873 }
874 *cp = '\0';
875
876 /* N.B. caller just created n with make_string, so this free is safe */
877 efree(n->stptr);
878 n->stptr = newval;
879 n->stlen = cp - newval;
880
881 return n;
882 }
883
884 /* free_wstr --- release the wide string part of a node */
885
886 void
887 r_free_wstr(NODE *n)
888 {
889 assert(n->type == Node_val);
890
891 if ((n->flags & WSTRCUR) != 0) {
892 assert(n->wstptr != NULL);
893 efree(n->wstptr);
894 }
895 n->wstptr = NULL;
896 n->wstlen = 0;
897 n->flags &= ~WSTRCUR;
898 }
899
/* dump_wstr --- write len wide characters from str to fp; a NULL or empty str writes nothing */

static void __attribute__ ((unused))
dump_wstr(FILE *fp, const wchar_t *str, size_t len)
{
	const wchar_t *stop;

	if (str == NULL || len == 0)
		return;

	for (stop = str + len; str < stop; str++)
		putwc(*str, fp);
}
909
/*
 * wstrstr --- walk haystack, looking for needle, wide char version
 *
 * haystack, hs_len:   the text to search and its length in wide characters.
 * needle, needle_len: the text to look for and its length in wide characters.
 *
 * Returns a pointer to the first occurrence of needle inside haystack, or
 * NULL if there is none, if either pointer is NULL, or if the needle is
 * longer than the haystack. An empty needle matches at the very start of
 * the haystack, as with wcsstr(); without that case the code below would
 * have to look at needle[needle_len - 1], which lies before the needle.
 */

const wchar_t *
wstrstr(const wchar_t *haystack, size_t hs_len,
	const wchar_t *needle, size_t needle_len)
{
	size_t i;
	size_t last_start;

	if (haystack == NULL || needle == NULL || needle_len > hs_len)
		return NULL;

	if (needle_len == 0)
		return haystack;

	/* a match cannot begin any later than this position */
	last_start = hs_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		/* compare the whole needle only when first & last chars match */
		if (haystack[i] == needle[0]
		    && haystack[i + needle_len - 1] == needle[needle_len - 1]
		    && memcmp(haystack + i, needle, sizeof(wchar_t) * needle_len) == 0)
			return haystack + i;
	}

	return NULL;
}
934
/*
 * wcasestrstr --- walk haystack, nocase look for needle, wide char version
 *
 * haystack, hs_len:   the text to search and its length in wide characters.
 * needle, needle_len: the text to look for and its length in wide characters.
 *
 * Characters are compared after folding both sides with towlower().
 *
 * Returns a pointer to the first occurrence of needle inside haystack, or
 * NULL if there is none, if either pointer is NULL, or if the needle is
 * longer than the haystack. An empty needle matches at the very start of
 * the haystack, as with wcsstr(); without that case the code below would
 * have to look at needle[needle_len - 1], which lies before the needle.
 */

const wchar_t *
wcasestrstr(const wchar_t *haystack, size_t hs_len,
	const wchar_t *needle, size_t needle_len)
{
	size_t i, j;
	size_t last_start;
	wint_t first, last;

	if (haystack == NULL || needle == NULL || needle_len > hs_len)
		return NULL;

	if (needle_len == 0)
		return haystack;

	/* the needle's ends do not change, so fold them just once */
	first = towlower(needle[0]);
	last = towlower(needle[needle_len - 1]);

	/* a match cannot begin any later than this position */
	last_start = hs_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		/* cheap rejection: first & last chars must match */
		if (towlower(haystack[i]) != first
		    || towlower(haystack[i + needle_len - 1]) != last)
			continue;

		for (j = 0; j < needle_len; j++) {
			if (towlower(haystack[i + j]) != towlower(needle[j]))
				break;
		}
		if (j == needle_len)
			return haystack + i;
	}

	return NULL;
}
969
/* is_ieee_magic_val --- return true for +inf, -inf, +nan, -nan */

bool
is_ieee_magic_val(const char *val)
{
	/*
	 * The letters are compared one by one, in either case, rather than
	 * with strncasecmp, which mishandles ASCII bytes in some locales.
	 * Only the first four characters are looked at, and a character is
	 * read only after all of the preceding ones have matched.
	 */
	if (val[0] != '+' && val[0] != '-')
		return false;

	switch (val[1]) {
	case 'i':
	case 'I':
		return (val[2] == 'n' || val[2] == 'N')
			&& (val[3] == 'f' || val[3] == 'F');
	case 'n':
	case 'N':
		return (val[2] == 'a' || val[2] == 'A')
			&& (val[3] == 'n' || val[3] == 'N');
	default:
		return false;
	}
}
987
988 /* get_ieee_magic_val --- return magic value for string */
989
990 static AWKNUM
991 get_ieee_magic_val(char *val)
992 {
993 static bool first = true;
994 static AWKNUM inf;
995 static AWKNUM nan;
996 char save;
997
998 char *ptr;
999 save = val[4];
1000 val[4] = '\0';
1001 AWKNUM v = strtod(val, &ptr);
1002 val[4] = save;
1003
1004 if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
1005 if (first) {
1006 first = false;
1007 nan = sqrt(-1.0);
1008 inf = -log(0.0);
1009 }
1010
1011 v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
1012 if (val[0] == '-')
1013 v = -v;
1014 }
1015
1016 return v;
1017 }
1018
/* btowc() results for every byte value, filled in by init_btowc_cache() */
wint_t btowc_cache[256];

/* init_btowc_cache --- initialize the cache */

void init_btowc_cache()
{
	int byte = 0;

	/* one btowc() call per possible byte value, 0 through 255 */
	while (byte <= 255) {
		btowc_cache[byte] = btowc(byte);
		byte++;
	}
}
1031
/* number of items carved out of each allocation made by more_blocks() */
#define BLOCKCHUNK 100

/*
 * Per-kind bookkeeping for block allocation, indexed by block id. The two
 * entries initialized here describe NODE-sized and BUCKET-sized items.
 * NOTE(review): the initializers are positional and the declaration of
 * struct block_header is not in this file --- presumably the order is
 * free-list head, item size, name; confirm against the header.
 */
struct block_header nextfree[BLOCK_MAX] = {
	{ NULL, sizeof(NODE), "node" },
	{ NULL, sizeof(BUCKET), "bucket" },
};
1038
1039 #ifdef MEMDEBUG
1040
1041 void *
1042 r_getblock(int id)
1043 {
1044 void *res;
1045 emalloc(res, void *, nextfree[id].size, "getblock");
1046 nextfree[id].active++;
1047 if (nextfree[id].highwater < nextfree[id].active)
1048 nextfree[id].highwater = nextfree[id].active;
1049 return res;
1050 }
1051
1052 void
1053 r_freeblock(void *p, int id)
1054 {
1055 nextfree[id].active--;
1056 free(p);
1057 }
1058
1059 #else
1060
1061 /* more_blocks --- get more blocks of memory and add to the free list;
1062 size of a block must be >= sizeof(struct block_item)
1063 */
1064
1065 void *
1066 more_blocks(int id)
1067 {
1068 struct block_item *freep, *np, *next;
1069 char *p, *endp;
1070 size_t size;
1071
1072 size = nextfree[id].size;
1073
1074 assert(size >= sizeof(struct block_item));
1075 emalloc(freep, struct block_item *, BLOCKCHUNK * size, "more_blocks");
1076 p = (char *) freep;
1077 endp = p + BLOCKCHUNK * size;
1078
1079 for (np = freep; ; np = next) {
1080 next = (struct block_item *) (p += size);
1081 if (p >= endp) {
1082 np->freep = NULL;
1083 break;
1084 }
1085 np->freep = next;
1086 }
1087 nextfree[id].freep = freep->freep;
1088 nextfree[id].highwater += BLOCKCHUNK;
1089 return freep;
1090 }
1091
1092 #endif
1093
1094 /* make_bool_node --- make a boolean-valued node */
1095
1096 extern NODE *
1097 make_bool_node(bool value)
1098 {
1099 NODE *val;
1100 const char *sval;
1101 AWKNUM nval;
1102
1103 sval = (value ? "1" : "0");
1104 nval = (value ? 1.0 : 0.0);
1105
1106 val = make_number(nval);
1107 val->stptr = estrdup(sval, strlen(sval));
1108 val->stlen = strlen(sval);
1109 val->flags |= NUMCUR|STRCUR|BOOLVAL;
1110 val->stfmt = STFMT_UNUSED;
1111
1112 return val;
1113 }