1 /* __gmp_doscan -- formatted input internals.
2
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
6
7 Copyright 2001-2003 Free Software Foundation, Inc.
8
9 This file is part of the GNU MP Library.
10
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
13
14 * the GNU Lesser General Public License as published by the Free
15 Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17
18 or
19
20 * the GNU General Public License as published by the Free Software
21 Foundation; either version 2 of the License, or (at your option) any
22 later version.
23
24 or both in parallel, as here.
25
26 The GNU MP Library is distributed in the hope that it will be useful, but
27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 for more details.
30
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library. If not,
33 see https://www.gnu.org/licenses/. */
34
35 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */
36
37 #include "config.h" /* needed for the HAVE_, could also move gmp incls */
38
39 #include <stdarg.h>
40 #include <ctype.h>
41 #include <stddef.h> /* for ptrdiff_t */
42 #include <stdio.h>
43 #include <stdlib.h> /* for strtol */
44 #include <string.h>
45
46 #if HAVE_LANGINFO_H
47 #include <langinfo.h> /* for nl_langinfo */
48 #endif
49
50 #if HAVE_LOCALE_H
51 #include <locale.h> /* for localeconv */
52 #endif
53
54 #if HAVE_INTTYPES_H
55 # include <inttypes.h> /* for intmax_t */
56 #endif
57 #if HAVE_STDINT_H
58 # include <stdint.h>
59 #endif
60
61 #if HAVE_SYS_TYPES_H
62 #include <sys/types.h> /* for quad_t */
63 #endif
64
65 #include "gmp-impl.h"
66
67
68 /* Change this to "#define TRACE(x) x" for some traces. */
69 #define TRACE(x)
70
71
72 /* General:
73
74 It's necessary to parse up the format string to recognise the GMP
75 extra types F, Q and Z. Other types and conversions are passed
76 across to the standard sscanf or fscanf via funs->scan, for ease of
77 implementation. This is essential in the case of something like glibc
78 %p where the pointer format isn't actually documented.
79
80 Because funs->scan doesn't get the whole input it can't put the right
81 values in for %n, so that's handled in __gmp_doscan. Neither sscanf
82 nor fscanf directly indicate how many characters were read, so an
83 extra %n is appended to each run for that. For fscanf this merely
84 supports our %n output, but for sscanf it lets funs->step move us
85 along the input string.
86
87 Whitespace and literal matches in the format string, including %%,
88 are handled directly within __gmp_doscan. This is reasonably
89 efficient, and avoids some suspicious behaviour observed in various
90 system libc's. GLIBC 2.2.4 for instance returns 0 on
91
92 sscanf(" ", " x")
93 or
94 sscanf(" ", " x%d",&n)
95
96 whereas we think they should return EOF, since end-of-string is
97 reached when a match of "x" is required.
98
99 For standard % conversions, funs->scan is called once for each
100 conversion. If we had vfscanf and vsscanf and could rely on their
101 fixed text matching behaviour then we could call them with multiple
102 consecutive standard conversions. But plain fscanf and sscanf work
103 fine, and parsing one field at a time shouldn't be too much of a
104 slowdown.
105
106 gmpscan:
107
108 gmpscan reads a gmp type. It's only used from one place, but is a
109 separate subroutine to avoid a big chunk of complicated code in the
110 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it
111 possible to share code for parsing integers, rationals and floats.
112
113 In gmpscan normally one char of lookahead is maintained, but when width
114 is reached that stops, on the principle that an fgetc/ungetc of a char
115 past where we're told to stop would be undesirable. "chars" is how many
116 characters have been read so far, including the current c. When
117 chars==width and another character is desired then a jump is done to the
118 "convert" stage. c is invalid and mustn't be unget'ed in this case;
119 chars is set to width+1 to indicate that.
120
121 gmpscan normally returns the number of characters read. -1 means an
122 invalid field, -2 means EOF reached before any matching characters
123 were read.
124
125 For hex floats, the mantissa part is passed to mpf_set_str, then the
126 exponent is applied with mpf_mul_2exp or mpf_div_2exp. This is easier
127 than teaching mpf_set_str about an exponent factor (ie. 2) differing
128 from the mantissa radix point factor (ie. 16). mpf_mul_2exp and
129 mpf_div_2exp will preserve the application requested precision, so
130 nothing in that respect is lost by making this a two-step process.
131
132 Matching and errors:
133
134 C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
135 string which is a match for the appropriate type, or a prefix of a
136 match. With that done, if it's only a prefix then the result is a
137 matching failure, ie. invalid input.
138
139 This rule seems fairly clear, but doesn't seem to be universally
140 applied in system C libraries. Even GLIBC doesn't seem to get it
141 right, insofar as it seems to accept some apparently invalid forms.
142 Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
143 standard would suggest a non-empty sequence of digits should be
144 required after an "0x".
145
146 A footnote to 7.19.6.2 para 17 notes how this input item reading can
147 mean inputs acceptable to strtol are not acceptable to fscanf. We
148 think this confirms our reading of "0x" as invalid.
149
150 Clearly gmp_sscanf could backtrack to a longest input which was a
151 valid match for a given item, but this is not done, since C99 says
152 sscanf is identical to fscanf, so we make gmp_sscanf identical to
153 gmp_fscanf.
154
155 Types:
156
157 C99 says "ll" is for long long, and "L" is for long double floats.
158 Unfortunately in GMP 4.1.1 we documented the two as equivalent. This
159 doesn't affect us directly, since both are passed through to plain
160 scanf. It seems wisest not to try to enforce the C99 rule. This is
161 consistent with what we said before, though whether it actually
162 worked was always up to the C library.
163
164 Alternatives:
165
166 Consideration was given to using separate code for gmp_fscanf and
167 gmp_sscanf. The sscanf case could zip across a string doing literal
168 matches or recognising digits in gmpscan, rather than making a
169 function call fun->get per character. The fscanf could use getc
170 rather than fgetc too, which might help those systems where getc is a
171 macro or otherwise inlined. But none of this scanning and converting
172 will be particularly fast, so the two are done together to keep it a
173 little simpler for now.
174
175 Various multibyte string issues are not addressed, for a start C99
176 scanf says the format string is multibyte. Since we pass %c, %s and
177 %[ to the system scanf, they might do multibyte reads already, but
178 it's another matter whether or not that can be used, since our digit
179 and whitespace parsing is only unibyte. The plan is to quietly
180 ignore multibyte locales for now. This is not as bad as it sounds,
181 since GMP is presumably used mostly on numbers, which can be
182 perfectly adequately treated in plain ASCII.
183
184 */
185
186
/* Parsed form of a single "%" conversion, filled in by __gmp_doscan and
   read by gmpscan.  */
struct gmp_doscan_params_t {
  /* Radix requested by the conversion character: 10 for d and u, 8 for o,
     16 for x and X.  0 (used for e, f, g and i) makes gmpscan pick the
     radix from a leading "0" or "0x" in the input.  */
  int   base;

  /* Non-zero when "*" was given: the field is parsed but not stored and
     not counted.  */
  int   ignore;

  /* Type modifier character, or '\0' if none was given.  'F', 'Q' and 'Z'
     select the GMP types; "hh" is recorded as 'H' and "ll" as 'L'.  */
  char  type;

  /* Maximum field width in characters; 0 means no width was given.  */
  int   width;
};
193
194
/* Read the next input character into "c", respecting the field width.

   "chars" is bumped first; if that takes it past "width" then nothing is
   read and control jumps to the "convert" label, leaving chars==width+1
   and "c" not valid for unget.  Otherwise "c" gets the result of
   funs->get.

   This macro relies on the enclosing function having locals named
   "chars", "width", "funs" and "data", and a label named "convert".  */
#define GET(c)                          \
  do {                                  \
    ASSERT (chars <= width);            \
    if (++chars > width)                \
      goto convert;                     \
    (c) = (*funs->get) (data);          \
  } while (0)
203
/* Number of chars by which the "s" buffer is grown each time it fills.  */
#define S_ALLOC_STEP  512

/* Append "c" to the string being built in "s", growing the buffer by
   S_ALLOC_STEP chars first if it's full.

   This macro relies on the enclosing function having locals named "s",
   "s_upto" (chars used so far) and "s_alloc" (chars allocated).  */
#define STORE(c)                                                          \
  do {                                                                    \
    ASSERT (s_upto <= s_alloc);                                           \
    if (s_upto >= s_alloc)                                                \
      {                                                                   \
        s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc,                       \
                                        s_alloc + S_ALLOC_STEP, char);    \
        s_alloc += S_ALLOC_STEP;                                          \
      }                                                                   \
    s[s_upto] = (c);                                                      \
    s_upto++;                                                             \
  } while (0)
218
219 static int
220 gmpscan (const struct gmp_doscan_funs_t *funs, void *data,
221 const struct gmp_doscan_params_t *p, void *dst)
222 {
223 int chars, c, base, first, width, seen_point, seen_digit, hexfloat;
224 size_t s_upto, s_alloc, hexexp;
225 char *s;
226 int invalid = 0;
227
228 TRACE (printf ("gmpscan\n"));
229
230 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z');
231
232 c = (*funs->get) (data);
233 if (c == EOF)
234 return -2;
235
236 chars = 1;
237 first = 1;
238 seen_point = 0;
239 width = (p->width == 0 ? INT_MAX-1 : p->width);
240 base = p->base;
241 s_alloc = S_ALLOC_STEP;
242 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char);
243 s_upto = 0;
244 hexfloat = 0;
245 hexexp = 0;
246
247 another:
248 seen_digit = 0;
249 if (c == '-')
250 {
251 STORE (c);
252 goto get_for_sign;
253 }
254 else if (c == '+')
255 {
256 /* don't store '+', it's not accepted by mpz_set_str etc */
257 get_for_sign:
258 GET (c);
259 }
260
261 if (base == 0)
262 {
263 base = 10; /* decimal if no base indicator */
264 if (c == '0')
265 {
266 seen_digit = 1; /* 0 alone is a valid number */
267 if (p->type != 'F')
268 base = 8; /* leading 0 is octal, for non-floats */
269 STORE (c);
270 GET (c);
271 if (c == 'x' || c == 'X')
272 {
273 base = 16;
274 seen_digit = 0; /* must have digits after an 0x */
275 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */
276 hexfloat = 1;
277 else
278 STORE (c);
279 GET (c);
280 }
281 }
282 }
283
284 digits:
285 for (;;)
286 {
287 if (base == 16)
288 {
289 if (! isxdigit (c))
290 break;
291 }
292 else
293 {
294 if (! isdigit (c))
295 break;
296 if (base == 8 && (c == '8' || c == '9'))
297 break;
298 }
299
300 seen_digit = 1;
301 STORE (c);
302 GET (c);
303 }
304
305 if (first)
306 {
307 /* decimal point */
308 if (p->type == 'F' && ! seen_point)
309 {
310 /* For a multi-character decimal point, if the first character is
311 present then all of it must be, otherwise the input is
312 considered invalid. */
313 const char *point = GMP_DECIMAL_POINT;
314 int pc = (unsigned char) *point++;
315 if (c == pc)
316 {
317 for (;;)
318 {
319 STORE (c);
320 GET (c);
321 pc = (unsigned char) *point++;
322 if (pc == '\0')
323 break;
324 if (c != pc)
325 goto set_invalid;
326 }
327 seen_point = 1;
328 goto digits;
329 }
330 }
331
332 /* exponent */
333 if (p->type == 'F')
334 {
335 if (hexfloat && (c == 'p' || c == 'P'))
336 {
337 hexexp = s_upto; /* exponent location */
338 base = 10; /* exponent in decimal */
339 goto exponent;
340 }
341 else if (! hexfloat && (c == 'e' || c == 'E'))
342 {
343 exponent:
344 /* must have at least one digit in the mantissa, just an exponent
345 is not good enough */
346 if (! seen_digit)
347 goto set_invalid;
348
349 do_second:
350 first = 0;
351 STORE (c);
352 GET (c);
353 goto another;
354 }
355 }
356
357 /* denominator */
358 if (p->type == 'Q' && c == '/')
359 {
360 /* must have at least one digit in the numerator */
361 if (! seen_digit)
362 goto set_invalid;
363
364 /* now look for at least one digit in the denominator */
365 seen_digit = 0;
366
367 /* allow the base to be redetermined for "%i" */
368 base = p->base;
369 goto do_second;
370 }
371 }
372
373 convert:
374 if (! seen_digit)
375 {
376 set_invalid:
377 invalid = 1;
378 goto done;
379 }
380
381 if (! p->ignore)
382 {
383 STORE ('\0');
384 TRACE (printf (" convert \"%s\"\n", s));
385
386 /* We ought to have parsed out a valid string above, so just test
387 mpz_set_str etc with an ASSERT. */
388 switch (p->type) {
389 case 'F':
390 {
391 mpf_ptr f = (mpf_ptr) dst;
392 if (hexexp != 0)
393 s[hexexp] = '\0';
394 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 16 : 10));
395 if (hexexp != 0)
396 {
397 char *dummy;
398 long exp;
399 exp = strtol (s + hexexp + 1, &dummy, 10);
400 if (exp >= 0)
401 mpf_mul_2exp (f, f, (unsigned long) exp);
402 else
403 mpf_div_2exp (f, f, NEG_CAST (unsigned long, exp));
404 }
405 }
406 break;
407 case 'Q':
408 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base));
409 break;
410 case 'Z':
411 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base));
412 break;
413 default:
414 ASSERT (0);
415 /*FALLTHRU*/
416 break;
417 }
418 }
419
420 done:
421 ASSERT (chars <= width+1);
422 if (chars != width+1)
423 {
424 (*funs->unget) (c, data);
425 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1));
426 }
427 chars--;
428
429 (*__gmp_free_func) (s, s_alloc);
430
431 if (invalid)
432 {
433 TRACE (printf (" invalid\n"));
434 return -1;
435 }
436
437 TRACE (printf (" return %d chars (cf width %d)\n", chars, width));
438 return chars;
439 }
440
441
442 /* Read and discard whitespace, if any. Return number of chars skipped.
443 Whitespace skipping never provokes the EOF return from __gmp_doscan, so
444 it's not necessary to watch for EOF from funs->get, */
445 static int
446 skip_white (const struct gmp_doscan_funs_t *funs, void *data)
447 {
448 int c;
449 int ret = 0;
450
451 do
452 {
453 c = (funs->get) (data);
454 ret++;
455 }
456 while (isspace (c));
457
458 (funs->unget) (c, data);
459 ret--;
460
461 TRACE (printf (" skip white %d\n", ret));
462 return ret;
463 }
464
465
466 int
467 __gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data,
468 const char *orig_fmt, va_list orig_ap)
469 {
470 struct gmp_doscan_params_t param;
471 va_list ap;
472 char *alloc_fmt;
473 const char *fmt, *this_fmt, *end_fmt;
474 size_t orig_fmt_len, alloc_fmt_size, len;
475 int new_fields, new_chars;
476 char fchar;
477 int fields = 0;
478 int chars = 0;
479
480 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt);
481 if (funs->scan == (gmp_doscan_scan_t) sscanf)
482 printf (" s=\"%s\"\n", * (const char **) data));
483
484 /* Don't modify orig_ap, if va_list is actually an array and hence call by
485 reference. It could be argued that it'd be more efficient to leave
486 callers to make a copy if they care, but doing so here is going to be a
487 very small part of the total work, and we may as well keep applications
488 out of trouble. */
489 va_copy (ap, orig_ap);
490
491 /* Parts of the format string are going to be copied so that a " %n" can
492 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be
493 needed if fmt consists of a single "%" specifier, but otherwise is an
494 overestimate. We're not going to be very fast here, so use
495 __gmp_allocate_func rather than TMP_ALLOC. */
496 orig_fmt_len = strlen (orig_fmt);
497 alloc_fmt_size = orig_fmt_len + 4;
498 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char);
499
500 fmt = orig_fmt;
501 ASSERT_CODE (end_fmt = orig_fmt + orig_fmt_len);
502
503 for (;;)
504 {
505 next:
506 fchar = *fmt++;
507
508 if (fchar == '\0')
509 break;
510
511 if (isspace (fchar))
512 {
513 chars += skip_white (funs, data);
514 continue;
515 }
516
517 if (fchar != '%')
518 {
519 int c;
520 literal:
521 c = (funs->get) (data);
522 if (c != fchar)
523 {
524 (funs->unget) (c, data);
525 if (c == EOF)
526 {
527 eof_no_match:
528 if (fields == 0)
529 fields = EOF;
530 }
531 goto done;
532 }
533 chars++;
534 continue;
535 }
536
537 param.type = '\0';
538 param.base = 0; /* for e,f,g,i */
539 param.ignore = 0;
540 param.width = 0;
541
542 this_fmt = fmt-1;
543 TRACE (printf (" this_fmt \"%s\"\n", this_fmt));
544
545 for (;;)
546 {
547 ASSERT (fmt <= end_fmt);
548
549 fchar = *fmt++;
550 switch (fchar) {
551
552 case '\0': /* unterminated % sequence */
553 ASSERT (0);
554 goto done;
555
556 case '%': /* literal % */
557 goto literal;
558
559 case '[': /* character range */
560 fchar = *fmt++;
561 if (fchar == '^')
562 fchar = *fmt++;
563 /* ']' allowed as the first char (possibly after '^') */
564 if (fchar == ']')
565 fchar = *fmt++;
566 for (;;)
567 {
568 ASSERT (fmt <= end_fmt);
569 if (fchar == '\0')
570 {
571 /* unterminated % sequence */
572 ASSERT (0);
573 goto done;
574 }
575 if (fchar == ']')
576 break;
577 fchar = *fmt++;
578 }
579 /*FALLTHRU*/
580 case 'c': /* characters */
581 case 's': /* string of non-whitespace */
582 case 'p': /* pointer */
583 libc_type:
584 len = fmt - this_fmt;
585 memcpy (alloc_fmt, this_fmt, len);
586 alloc_fmt[len++] = '%';
587 alloc_fmt[len++] = 'n';
588 alloc_fmt[len] = '\0';
589
590 TRACE (printf (" scan \"%s\"\n", alloc_fmt);
591 if (funs->scan == (gmp_doscan_scan_t) sscanf)
592 printf (" s=\"%s\"\n", * (const char **) data));
593
594 new_chars = -1;
595 if (param.ignore)
596 {
597 new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL);
598 ASSERT (new_fields == 0 || new_fields == EOF);
599 }
600 else
601 {
602 void *arg = va_arg (ap, void *);
603 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars);
604 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF);
605
606 if (new_fields == 0)
607 goto done; /* invalid input */
608
609 if (new_fields == 1)
610 ASSERT (new_chars != -1);
611 }
612 TRACE (printf (" new_fields %d new_chars %d\n",
613 new_fields, new_chars));
614
615 if (new_fields == -1)
616 goto eof_no_match; /* EOF before anything matched */
617
618 /* Under param.ignore, when new_fields==0 we don't know if
619 it's a successful match or an invalid field. new_chars
620 won't have been assigned if it was an invalid field. */
621 if (new_chars == -1)
622 goto done; /* invalid input */
623
624 chars += new_chars;
625 (*funs->step) (data, new_chars);
626
627 increment_fields:
628 if (! param.ignore)
629 fields++;
630 goto next;
631
632 case 'd': /* decimal */
633 case 'u': /* decimal */
634 param.base = 10;
635 goto numeric;
636
637 case 'e': /* float */
638 case 'E': /* float */
639 case 'f': /* float */
640 case 'g': /* float */
641 case 'G': /* float */
642 case 'i': /* integer with base marker */
643 numeric:
644 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z')
645 goto libc_type;
646
647 chars += skip_white (funs, data);
648
649 new_chars = gmpscan (funs, data, ¶m,
650 param.ignore ? NULL : va_arg (ap, void*));
651 if (new_chars == -2)
652 goto eof_no_match;
653 if (new_chars == -1)
654 goto done;
655
656 ASSERT (new_chars >= 0);
657 chars += new_chars;
658 goto increment_fields;
659
660 case 'a': /* glibc allocate string */
661 case '\'': /* glibc digit groupings */
662 break;
663
664 case 'F': /* mpf_t */
665 case 'j': /* intmax_t */
666 case 'L': /* long long */
667 case 'q': /* quad_t */
668 case 'Q': /* mpq_t */
669 case 't': /* ptrdiff_t */
670 case 'z': /* size_t */
671 case 'Z': /* mpz_t */
672 set_type:
673 param.type = fchar;
674 break;
675
676 case 'h': /* short or char */
677 if (param.type != 'h')
678 goto set_type;
679 param.type = 'H'; /* internal code for "hh" */
680 break;
681
682 goto numeric;
683
684 case 'l': /* long, long long, double or long double */
685 if (param.type != 'l')
686 goto set_type;
687 param.type = 'L'; /* "ll" means "L" */
688 break;
689
690 case 'n':
691 if (! param.ignore)
692 {
693 void *p;
694 p = va_arg (ap, void *);
695 TRACE (printf (" store %%n to %p\n", p));
696 switch (param.type) {
697 case '\0': * (int *) p = chars; break;
698 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break;
699 case 'H': * (char *) p = chars; break;
700 case 'h': * (short *) p = chars; break;
701 #if HAVE_INTMAX_T
702 case 'j': * (intmax_t *) p = chars; break;
703 #else
704 case 'j': ASSERT_FAIL (intmax_t not available); break;
705 #endif
706 case 'l': * (long *) p = chars; break;
707 #if HAVE_QUAD_T && HAVE_LONG_LONG
708 case 'q':
709 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long));
710 /*FALLTHRU*/
711 #else
712 case 'q': ASSERT_FAIL (quad_t not available); break;
713 #endif
714 #if HAVE_LONG_LONG
715 case 'L': * (long long *) p = chars; break;
716 #else
717 case 'L': ASSERT_FAIL (long long not available); break;
718 #endif
719 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break;
720 #if HAVE_PTRDIFF_T
721 case 't': * (ptrdiff_t *) p = chars; break;
722 #else
723 case 't': ASSERT_FAIL (ptrdiff_t not available); break;
724 #endif
725 case 'z': * (size_t *) p = chars; break;
726 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break;
727 default: ASSERT (0); break;
728 }
729 }
730 goto next;
731
732 case 'o':
733 param.base = 8;
734 goto numeric;
735
736 case 'x':
737 case 'X':
738 param.base = 16;
739 goto numeric;
740
741 case '0': case '1': case '2': case '3': case '4':
742 case '5': case '6': case '7': case '8': case '9':
743 param.width = 0;
744 do {
745 param.width = param.width * 10 + (fchar-'0');
746 fchar = *fmt++;
747 } while (isdigit (fchar));
748 fmt--; /* unget the non-digit */
749 break;
750
751 case '*':
752 param.ignore = 1;
753 break;
754
755 default:
756 /* something invalid in a % sequence */
757 ASSERT (0);
758 goto next;
759 }
760 }
761 }
762
763 done:
764 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size);
765 return fields;
766 }