1 /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */
2
3 /* GLIB - Library of useful routines for C programming
4 * Copyright (C) 2008 Red Hat, Inc.
5 *
6 * SPDX-License-Identifier: LGPL-2.1-or-later
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General
19 * Public License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include "config.h"
23 #include "glibconfig.h"
24
25 #include <string.h>
26
27 #ifdef G_OS_UNIX
28 #include <unistd.h>
29 #endif
30
31 #include "ghostutils.h"
32
33 #include "garray.h"
34 #include "gmem.h"
35 #include "gstring.h"
36 #include "gstrfuncs.h"
37 #include "glibintl.h"
38
39 #ifdef G_PLATFORM_WIN32
40 #include <windows.h>
41 #endif
42
43
/* Prefix that marks a hostname label as ASCII-encoded (Punycode) rather
 * than plain text. IDNA_ACE_PREFIX_LEN must stay equal to the length of
 * IDNA_ACE_PREFIX, since the two are passed together to strncmp()-style
 * calls.
 */
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */

/* Number of distinct digit values; digits are encoded as a-z then 0-9. */
#define PUNYCODE_BASE 36
/* Lower and upper clamp for the per-digit threshold `t`. */
#define PUNYCODE_TMIN 1
#define PUNYCODE_TMAX 26
/* Used by adapt() when computing the next bias. */
#define PUNYCODE_SKEW 38
/* Divisor applied to delta by adapt() on the first adaptation only. */
#define PUNYCODE_DAMP 700
/* Starting values of the `bias` and `n` state variables. */
#define PUNYCODE_INITIAL_BIAS 72
#define PUNYCODE_INITIAL_N 0x80

/* A "basic" code point is one below 0x80, i.e. plain ASCII. */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
58
59 /* Encode/decode a single base-36 digit */
60 static inline gchar
61 encode_digit (guint dig)
62 {
63 if (dig < 26)
64 return dig + 'a';
65 else
66 return dig - 26 + '0';
67 }
68
69 static inline guint
70 decode_digit (gchar dig)
71 {
72 if (dig >= 'A' && dig <= 'Z')
73 return dig - 'A';
74 else if (dig >= 'a' && dig <= 'z')
75 return dig - 'a';
76 else if (dig >= '0' && dig <= '9')
77 return dig - '0' + 26;
78 else
79 return G_MAXUINT;
80 }
81
82 /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */
83 static guint
84 adapt (guint delta,
85 guint numpoints,
86 gboolean firsttime)
87 {
88 guint k;
89
90 delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2;
91 delta += delta / numpoints;
92
93 k = 0;
94 while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2)
95 {
96 delta /= PUNYCODE_BASE - PUNYCODE_TMIN;
97 k += PUNYCODE_BASE;
98 }
99
100 return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta /
101 (delta + PUNYCODE_SKEW));
102 }
103
/* Punycode encoder, RFC 3492 section 6.3. The algorithm is
 * sufficiently bizarre that it's not really worth trying to explain
 * here.
 *
 * @input_utf8: the label to encode, as UTF-8
 * @input_utf8_length: length of @input_utf8, as passed on to
 *   g_utf8_to_ucs4()
 * @output: string to which the encoded label is appended
 *
 * The basic (ASCII) code points are appended first, lowercased, and
 * followed by a '-' delimiter if there were any; the encoded deltas for
 * the non-basic code points come after that. No ACE prefix is written
 * by this function.
 *
 * Returns %TRUE on success. Returns %FALSE if the input cannot be
 * converted to UCS-4 or if the delta arithmetic would overflow a guint;
 * in the overflow case @output may already hold part of the encoding.
 */
static gboolean
punycode_encode (const gchar *input_utf8,
                 gsize        input_utf8_length,
                 GString     *output)
{
  guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit;
  gunichar n, m, *input;
  glong written_chars;
  gsize input_length;
  gboolean success = FALSE;

  /* Convert from UTF-8 to Unicode code points */
  input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL,
                          &written_chars, NULL);
  if (!input)
    return FALSE;

  /* Clamp a negative count to zero before treating it as a size. */
  input_length = (gsize) (written_chars > 0 ? written_chars : 0);

  /* Copy basic chars */
  for (j = num_basic_chars = 0; j < input_length; j++)
    {
      if (PUNYCODE_IS_BASIC (input[j]))
        {
          g_string_append_c (output, g_ascii_tolower (input[j]));
          num_basic_chars++;
        }
    }
  /* The delimiter is only emitted when at least one basic char exists. */
  if (num_basic_chars)
    g_string_append_c (output, '-');

  handled_chars = num_basic_chars;

  /* Encode non-basic chars */
  delta = 0;
  bias = PUNYCODE_INITIAL_BIAS;
  n = PUNYCODE_INITIAL_N;
  while (handled_chars < input_length)
    {
      /* let m = the minimum {non-basic} code point >= n in the input */
      for (m = G_MAXUINT, j = 0; j < input_length; j++)
        {
          if (input[j] >= n && input[j] < m)
            m = input[j];
        }

      /* Refuse to continue if advancing delta would overflow. */
      if (m - n > (G_MAXUINT - delta) / (handled_chars + 1))
        goto fail;
      delta += (m - n) * (handled_chars + 1);
      n = m;

      /* Walk the input in order: code points below n only bump delta,
       * while each occurrence of n itself gets delta written out.
       */
      for (j = 0; j < input_length; j++)
        {
          if (input[j] < n)
            {
              /* A wrap to zero means delta overflowed. */
              if (++delta == 0)
                goto fail;
            }
          else if (input[j] == n)
            {
              /* Emit delta as a variable-length integer, least
               * significant digit first. The threshold t is k - bias
               * clamped to [PUNYCODE_TMIN, PUNYCODE_TMAX].
               */
              q = delta;
              for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)
                {
                  if (k <= bias)
                    t = PUNYCODE_TMIN;
                  else if (k >= bias + PUNYCODE_TMAX)
                    t = PUNYCODE_TMAX;
                  else
                    t = k - bias;
                  if (q < t)
                    break;
                  digit = t + (q - t) % (PUNYCODE_BASE - t);
                  g_string_append_c (output, encode_digit (digit));
                  q = (q - t) / (PUNYCODE_BASE - t);
                }

              /* Final (most significant) digit, then re-tune the bias.
               * It is the "first time" only for the first non-basic
               * code point.
               */
              g_string_append_c (output, encode_digit (q));
              bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars);
              delta = 0;
              handled_chars++;
            }
        }

      delta++;
      n++;
    }

  success = TRUE;

  /* Shared exit: the UCS-4 copy is released on success and failure. */
 fail:
  g_free (input);
  return success;
}
201
/* Code points from RFC 3454, Table B.1. remove_junk() strips these from
 * a hostname before any further processing. Note that @ch is evaluated
 * more than once, so it must not have side effects.
 */
#define idna_is_junk(ch) ( \
  (ch) == 0x00AD || \
  (ch) == 0x034F || \
  (ch) == 0x1806 || \
  (ch) == 0x180B || \
  (ch) == 0x180C || \
  (ch) == 0x180D || \
  (ch) == 0x200B || \
  (ch) == 0x200C || \
  (ch) == 0x200D || \
  (ch) == 0x2060 || \
  (ch) == 0xFEFF || \
  ((ch) >= 0xFE00 && (ch) <= 0xFE0F) )
204
205 /* Scan @str for "junk" and return a cleaned-up string if any junk
206 * is found. Else return %NULL.
207 */
208 static gchar *
209 remove_junk (const gchar *str,
210 gint len)
211 {
212 GString *cleaned = NULL;
213 const gchar *p;
214 gunichar ch;
215
216 for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))
217 {
218 ch = g_utf8_get_char (p);
219 if (idna_is_junk (ch))
220 {
221 if (!cleaned)
222 {
223 cleaned = g_string_new (NULL);
224 g_string_append_len (cleaned, str, p - str);
225 }
226 }
227 else if (cleaned)
228 g_string_append_unichar (cleaned, ch);
229 }
230
231 if (cleaned)
232 return g_string_free (cleaned, FALSE);
233 else
234 return NULL;
235 }
236
237 static inline gboolean
238 contains_uppercase_letters (const gchar *str,
239 gint len)
240 {
241 const gchar *p;
242
243 for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))
244 {
245 if (g_unichar_isupper (g_utf8_get_char (p)))
246 return TRUE;
247 }
248 return FALSE;
249 }
250
251 static inline gboolean
252 contains_non_ascii (const gchar *str,
253 gint len)
254 {
255 const gchar *p;
256
257 for (p = str; len == -1 ? *p : p < str + len; p++)
258 {
259 if ((guchar)*p > 0x80)
260 return TRUE;
261 }
262 return FALSE;
263 }
264
265 /* RFC 3454, Appendix C. ish. */
266 static inline gboolean
267 idna_is_prohibited (gunichar ch)
268 {
269 switch (g_unichar_type (ch))
270 {
271 case G_UNICODE_CONTROL:
272 case G_UNICODE_FORMAT:
273 case G_UNICODE_UNASSIGNED:
274 case G_UNICODE_PRIVATE_USE:
275 case G_UNICODE_SURROGATE:
276 case G_UNICODE_LINE_SEPARATOR:
277 case G_UNICODE_PARAGRAPH_SEPARATOR:
278 case G_UNICODE_SPACE_SEPARATOR:
279 return TRUE;
280
281 case G_UNICODE_OTHER_SYMBOL:
282 if (ch == 0xFFFC || ch == 0xFFFD ||
283 (ch >= 0x2FF0 && ch <= 0x2FFB))
284 return TRUE;
285 return FALSE;
286
287 case G_UNICODE_NON_SPACING_MARK:
288 if (ch == 0x0340 || ch == 0x0341)
289 return TRUE;
290 return FALSE;
291
292 default:
293 return FALSE;
294 }
295 }
296
297 /* RFC 3491 IDN cleanup algorithm. */
298 static gchar *
299 nameprep (const gchar *hostname,
300 gint len,
301 gboolean *is_unicode)
302 {
303 gchar *name, *tmp = NULL, *p;
304
305 /* It would be nice if we could do this without repeatedly
306 * allocating strings and converting back and forth between
307 * gunichars and UTF-8... The code does at least avoid doing most of
308 * the sub-operations when they would just be equivalent to a
309 * g_strdup().
310 */
311
312 /* Remove presentation-only characters */
313 name = remove_junk (hostname, len);
314 if (name)
315 {
316 tmp = name;
317 len = -1;
318 }
319 else
320 name = (gchar *)hostname;
321
322 /* Convert to lowercase */
323 if (contains_uppercase_letters (name, len))
324 {
325 name = g_utf8_strdown (name, len);
326 g_free (tmp);
327 tmp = name;
328 len = -1;
329 }
330
331 /* If there are no UTF8 characters, we're done. */
332 if (!contains_non_ascii (name, len))
333 {
334 *is_unicode = FALSE;
335 if (name == (gchar *)hostname)
336 return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len);
337 else
338 return name;
339 }
340
341 *is_unicode = TRUE;
342
343 /* Normalize */
344 name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC);
345 g_free (tmp);
346 tmp = name;
347
348 if (!name)
349 return NULL;
350
351 /* KC normalization may have created more capital letters (eg,
352 * angstrom -> capital A with ring). So we have to lowercasify a
353 * second time. (This is more-or-less how the nameprep algorithm
354 * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the
355 * same as tolower(nfkc(X)), then we could skip the first tolower,
356 * but I'm not sure it is.)
357 */
358 if (contains_uppercase_letters (name, -1))
359 {
360 name = g_utf8_strdown (name, -1);
361 g_free (tmp);
362 tmp = name;
363 }
364
365 /* Check for prohibited characters */
366 for (p = name; *p; p = g_utf8_next_char (p))
367 {
368 if (idna_is_prohibited (g_utf8_get_char (p)))
369 {
370 name = NULL;
371 g_free (tmp);
372 goto done;
373 }
374 }
375
376 /* FIXME: We're supposed to verify certain constraints on bidi
377 * characters, but glib does not appear to have that information.
378 */
379
380 done:
381 return name;
382 }
383
/* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as
 * label-separating dots. This macro tests whether one of them starts at
 * @str, matching the three non-ASCII ones by their 3-byte UTF-8
 * encodings (E3 80 82, EF BC 8E and EF BD A1 respectively).
 *
 * @str must be '\0'-terminated: up to three bytes are inspected, and
 * the short-circuiting comparisons are what stop the macro from reading
 * past the terminator. @str is evaluated several times, so it must not
 * have side effects.
 */
#define idna_is_dot(str) ( \
  ((guchar)(str)[0] == '.') ||                                                 \
  ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \
  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \
  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) )
392
393 static const gchar *
394 idna_end_of_label (const gchar *str)
395 {
396 for (; *str; str = g_utf8_next_char (str))
397 {
398 if (idna_is_dot (str))
399 return str;
400 }
401 return str;
402 }
403
/* Returns the platform's maximum hostname length, which callers use as
 * a sanity limit on input length. The value comes from, in order of
 * preference: MAX_COMPUTERNAME_LENGTH on Windows, sysconf() where
 * _SC_HOST_NAME_MAX exists, and otherwise a fixed fallback of 255.
 */
static gsize
get_hostname_max_length_bytes (void)
{
#if defined(G_OS_WIN32)
  /* The array only exists so that its element count can be taken; the
   * result is MAX_COMPUTERNAME_LENGTH.
   * NOTE(review): this is a count of wchar_t elements rather than of
   * bytes, despite the function name -- confirm that is intended.
   */
  wchar_t tmp[MAX_COMPUTERNAME_LENGTH];
  return sizeof (tmp) / sizeof (tmp[0]);
#elif defined(_SC_HOST_NAME_MAX)
  glong max = sysconf (_SC_HOST_NAME_MAX);
  if (max > 0)
    return (gsize) max;

  /* sysconf() gave no usable (positive) value; use compile-time limits. */
#ifdef HOST_NAME_MAX
  return HOST_NAME_MAX;
#else
  return _POSIX_HOST_NAME_MAX;
#endif /* HOST_NAME_MAX */
#else
  /* Fallback to some reasonable value
   * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */
  return 255;
#endif
}
426
427 /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually
428 * running `strlen(str)`, as that would take a very long time for long
429 * (untrusted) input strings. */
430 static gboolean
431 strlen_greater_than (const gchar *str,
432 gsize comparison_length)
433 {
434 gsize i;
435
436 for (i = 0; str[i] != '\0'; i++)
437 if (i > comparison_length)
438 return TRUE;
439
440 return FALSE;
441 }
442
443 /**
444 * g_hostname_to_ascii:
445 * @hostname: a valid UTF-8 or ASCII hostname
446 *
447 * Converts @hostname to its canonical ASCII form; an ASCII-only
448 * string containing no uppercase letters and not ending with a
449 * trailing dot.
450 *
451 * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed,
452 * or %NULL if @hostname is in some way invalid.
453 *
454 * Since: 2.22
455 **/
456 gchar *
457 g_hostname_to_ascii (const gchar *hostname)
458 {
459 gchar *name, *label, *p;
460 GString *out;
461 gssize llen, oldlen;
462 gboolean unicode;
463 gsize hostname_max_length_bytes = get_hostname_max_length_bytes ();
464
465 /* Do an initial check on the hostname length, as overlong hostnames take a
466 * long time in the IDN cleanup algorithm in nameprep(). The ultimate
467 * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be
468 * longer than 255 bytes. That’s the least restrictive limit on hostname
469 * length of all the ways hostnames can be interpreted. Typically, the
470 * hostname will be an FQDN, which is limited to 253 bytes long. POSIX
471 * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255
472 * bytes).
473 *
474 * See https://stackoverflow.com/a/28918017/2931197
475 *
476 * It’s possible for a hostname to be %-encoded, in which case its decoded
477 * length will be as much as 3× shorter.
478 *
479 * It’s also possible for a hostname to use overlong UTF-8 encodings, in which
480 * case its decoded length will be as much as 4× shorter.
481 *
482 * Note: This check is not intended as an absolute guarantee that a hostname
483 * is the right length and will be accepted by other systems. It’s intended to
484 * stop wildly-invalid hostnames from taking forever in nameprep().
485 */
486 if (hostname_max_length_bytes <= G_MAXSIZE / 4 &&
487 strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes)))
488 return NULL;
489
490 label = name = nameprep (hostname, -1, &unicode);
491 if (!name || !unicode)
492 return name;
493
494 out = g_string_new (NULL);
495
496 do
497 {
498 unicode = FALSE;
499 for (p = label; *p && !idna_is_dot (p); p++)
500 {
501 if ((guchar)*p > 0x80)
502 unicode = TRUE;
503 }
504
505 oldlen = out->len;
506 llen = p - label;
507 if (unicode)
508 {
509 if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
510 goto fail;
511
512 g_string_append (out, IDNA_ACE_PREFIX);
513 if (!punycode_encode (label, llen, out))
514 goto fail;
515 }
516 else
517 g_string_append_len (out, label, llen);
518
519 if (out->len - oldlen > 63)
520 goto fail;
521
522 label += llen;
523 if (*label)
524 label = g_utf8_next_char (label);
525 if (*label)
526 g_string_append_c (out, '.');
527 }
528 while (*label);
529
530 g_free (name);
531 return g_string_free (out, FALSE);
532
533 fail:
534 g_free (name);
535 g_string_free (out, TRUE);
536 return NULL;
537 }
538
/**
 * g_hostname_is_non_ascii:
 * @hostname: a hostname; must be NUL-terminated
 *
 * Tests if @hostname contains Unicode characters. If this returns
 * %TRUE, you need to encode the hostname with g_hostname_to_ascii()
 * before using it in non-IDN-aware contexts.
 *
 * Note that a hostname might contain a mix of encoded and unencoded
 * segments, and so it is possible for g_hostname_is_non_ascii() and
 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
 *
 * Returns: %TRUE if @hostname contains any non-ASCII characters
 *
 * Since: 2.22
 **/
gboolean
g_hostname_is_non_ascii (const gchar *hostname)
{
  /* A length of -1 makes the helper scan up to the terminating NUL. */
  return contains_non_ascii (hostname, -1);
}
560
/* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),
 * read the RFC if you want to understand what this is actually doing.
 *
 * @input: the encoded label; the caller in this file passes it with the
 *   ACE prefix already skipped
 * @input_length: number of bytes of @input that belong to the label
 * @output: string to which the decoded label is appended, as UTF-8
 *
 * Returns %TRUE on success. Returns %FALSE if the part before the last
 * '-' contains a non-basic byte, if an invalid digit is found, if the
 * input ends in the middle of a number, or if the arithmetic would
 * overflow a guint. @output is only written to on success.
 */
static gboolean
punycode_decode (const gchar *input,
                 gsize        input_length,
                 GString     *output)
{
  GArray *output_chars;
  gunichar n;
  guint i, bias;
  guint oldi, w, k, digit, t;
  const gchar *split;

  n = PUNYCODE_INITIAL_N;
  i = 0;
  bias = PUNYCODE_INITIAL_BIAS;

  /* Look backwards for the last '-' delimiter. A '-' at the very first
   * position does not count (the loop stops with split == input), so in
   * that case the whole input is treated as encoded digits.
   * NOTE(review): with input_length == 0 this forms the pointer
   * `input - 1`; it is never dereferenced, and the caller in this file
   * passes a pointer into the middle of a longer string.
   */
  split = input + input_length - 1;
  while (split > input && *split != '-')
    split--;
  if (split > input)
    {
      /* Everything before the delimiter is literal basic code points. */
      output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar),
                                        split - input);
      input_length -= (split - input) + 1;
      while (input < split)
        {
          gunichar ch = (gunichar)*input++;
          if (!PUNYCODE_IS_BASIC (ch))
            goto fail;
          g_array_append_val (output_chars, ch);
        }
      /* Skip the delimiter itself. */
      input++;
    }
  else
    output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar));

  /* Each iteration decodes one variable-length integer and inserts one
   * code point.
   */
  while (input_length)
    {
      oldi = i;
      w = 1;
      for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)
        {
          /* Running out of input in the middle of a number is an error. */
          if (!input_length--)
            goto fail;
          digit = decode_digit (*input++);
          if (digit >= PUNYCODE_BASE)
            goto fail;
          /* Overflow guard for i += digit * w. */
          if (digit > (G_MAXUINT - i) / w)
            goto fail;
          i += digit * w;
          /* Threshold t is k - bias clamped to [TMIN, TMAX]. */
          if (k <= bias)
            t = PUNYCODE_TMIN;
          else if (k >= bias + PUNYCODE_TMAX)
            t = PUNYCODE_TMAX;
          else
            t = k - bias;
          /* A digit below the threshold ends the number. */
          if (digit < t)
            break;
          /* Overflow guard for w *= (PUNYCODE_BASE - t). */
          if (w > G_MAXUINT / (PUNYCODE_BASE - t))
            goto fail;
          w *= (PUNYCODE_BASE - t);
        }

      bias = adapt (i - oldi, output_chars->len + 1, oldi == 0);

      /* Split i into how far n advances (the quotient, checked for
       * overflow) and the position at which to insert (the remainder).
       */
      if (i / (output_chars->len + 1) > G_MAXUINT - n)
        goto fail;
      n += i / (output_chars->len + 1);
      i %= (output_chars->len + 1);

      g_array_insert_val (output_chars, i++, n);
    }

  /* Success: convert the collected code points to UTF-8. */
  for (i = 0; i < output_chars->len; i++)
    g_string_append_unichar (output, g_array_index (output_chars, gunichar, i));
  g_array_free (output_chars, TRUE);
  return TRUE;

 fail:
  g_array_free (output_chars, TRUE);
  return FALSE;
}
645
646 /**
647 * g_hostname_to_unicode:
648 * @hostname: a valid UTF-8 or ASCII hostname
649 *
650 * Converts @hostname to its canonical presentation form; a UTF-8
651 * string in Unicode normalization form C, containing no uppercase
652 * letters, no forbidden characters, and no ASCII-encoded segments,
653 * and not ending with a trailing dot.
654 *
655 * Of course if @hostname is not an internationalized hostname, then
656 * the canonical presentation form will be entirely ASCII.
657 *
658 * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed,
659 * or %NULL if @hostname is in some way invalid.
660 *
661 * Since: 2.22
662 **/
663 gchar *
664 g_hostname_to_unicode (const gchar *hostname)
665 {
666 GString *out;
667 gssize llen;
668 gsize hostname_max_length_bytes = get_hostname_max_length_bytes ();
669
670 /* See the comment at the top of g_hostname_to_ascii(). */
671 if (hostname_max_length_bytes <= G_MAXSIZE / 4 &&
672 strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes)))
673 return NULL;
674
675 out = g_string_new (NULL);
676
677 do
678 {
679 llen = idna_end_of_label (hostname) - hostname;
680 if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
681 {
682 hostname += IDNA_ACE_PREFIX_LEN;
683 llen -= IDNA_ACE_PREFIX_LEN;
684 if (!punycode_decode (hostname, llen, out))
685 {
686 g_string_free (out, TRUE);
687 return NULL;
688 }
689 }
690 else
691 {
692 gboolean unicode;
693 gchar *canonicalized = nameprep (hostname, llen, &unicode);
694
695 if (!canonicalized)
696 {
697 g_string_free (out, TRUE);
698 return NULL;
699 }
700 g_string_append (out, canonicalized);
701 g_free (canonicalized);
702 }
703
704 hostname += llen;
705 if (*hostname)
706 hostname = g_utf8_next_char (hostname);
707 if (*hostname)
708 g_string_append_c (out, '.');
709 }
710 while (*hostname);
711
712 return g_string_free (out, FALSE);
713 }
714
715 /**
716 * g_hostname_is_ascii_encoded:
717 * @hostname: a hostname
718 *
719 * Tests if @hostname contains segments with an ASCII-compatible
720 * encoding of an Internationalized Domain Name. If this returns
721 * %TRUE, you should decode the hostname with g_hostname_to_unicode()
722 * before displaying it to the user.
723 *
724 * Note that a hostname might contain a mix of encoded and unencoded
725 * segments, and so it is possible for g_hostname_is_non_ascii() and
726 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
727 *
728 * Returns: %TRUE if @hostname contains any ASCII-encoded
729 * segments.
730 *
731 * Since: 2.22
732 **/
733 gboolean
734 g_hostname_is_ascii_encoded (const gchar *hostname)
735 {
736 while (1)
737 {
738 if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
739 return TRUE;
740 hostname = idna_end_of_label (hostname);
741 if (*hostname)
742 hostname = g_utf8_next_char (hostname);
743 if (!*hostname)
744 return FALSE;
745 }
746 }
747
/**
 * g_hostname_is_ip_address:
 * @hostname: a hostname (or IP address in string form)
 *
 * Tests if @hostname is the string form of an IPv4 or IPv6 address.
 * (Eg, "192.168.0.1".)
 *
 * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874).
 *
 * Returns: %TRUE if @hostname is an IP address
 *
 * Since: 2.22
 **/
gboolean
g_hostname_is_ip_address (const gchar *hostname)
{
  gchar *p, *end;
  gint nsegments, octet;

  /* On Linux we could implement this using inet_pton, but the Windows
   * equivalent of that requires linking against winsock, so we just
   * figure this out ourselves. Tested by tests/hostutils.c.
   */

  /* The const is cast away only so that the cursors can be plain
   * gchar pointers; the string is never written to.
   */
  p = (char *)hostname;

  if (strchr (p, ':'))
    {
      gboolean skipped;

      /* If it contains a ':', it's an IPv6 address (assuming it's an
       * IP address at all). This consists of eight ':'-separated
       * segments, each containing a 1-4 digit hex number, except that
       * optionally: (a) the last two segments can be replaced by an
       * IPv4 address, and (b) a single span of 1 to 8 "0000" segments
       * can be replaced with just "::".
       */

      nsegments = 0;
      skipped = FALSE;
      /* Stop at the end of the string, at a '%' (start of a zone-id),
       * or once eight segments have been counted.
       */
      while (*p && *p != '%' && nsegments < 8)
        {
          /* Each segment after the first must be preceded by a ':'.
           * (We also handle half of the "string starts with ::" case
           * here.)
           */
          if (p != (char *)hostname || (p[0] == ':' && p[1] == ':'))
            {
              if (*p != ':')
                return FALSE;
              p++;
            }

          /* If there's another ':', it means we're skipping some segments */
          if (*p == ':' && !skipped)
            {
              /* Only one "::" is allowed; it is counted as one segment. */
              skipped = TRUE;
              nsegments++;

              /* Handle the "string ends with ::" case */
              if (!p[1])
                p++;

              continue;
            }

          /* Read the segment, make sure it's valid. */
          for (end = p; g_ascii_isxdigit (*end); end++)
            ;
          if (end == p || end > p + 4)
            return FALSE;

          /* A '.' after the digits means this is the start of an
           * embedded IPv4 address. It is only accepted where the last
           * two segments would go: after exactly six segments, or after
           * at most six when a "::" was used. p still points at the
           * first IPv4 digit when we jump.
           */
          if (*end == '.')
            {
              if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped))
                goto parse_ipv4;
              else
                return FALSE;
            }

          nsegments++;
          p = end;
        }

      /* Valid if we stopped at the end of the string or at a '%'
       * followed by at least one more character, and we saw either all
       * eight segments or a "::".
       */
      return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped);
    }

 parse_ipv4:

  /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */
  for (nsegments = 0; nsegments < 4; nsegments++)
    {
      /* Every octet after the first must be preceded by a '.'. */
      if (nsegments != 0)
        {
          if (*p != '.')
            return FALSE;
          p++;
        }

      /* Check the segment; a little trickier than the IPv6 case since
       * we can't allow extra leading 0s, and we can't assume that all
       * strings of valid length are within range.
       */
      octet = 0;
      if (*p == '0')
        /* A leading '0' must be the whole octet, so consume just it;
         * any digit that follows will fail the next separator check.
         */
        end = p + 1;
      else
        {
          for (end = p; g_ascii_isdigit (*end); end++)
            {
              octet = 10 * octet + (*end - '0');

              /* Stop early so that a long run of digits cannot
               * overflow octet.
               */
              if (octet > 255)
                break;
            }
        }
      /* Reject an empty octet, more than three digits, or a value
       * above 255.
       */
      if (end == p || end > p + 3 || octet > 255)
        return FALSE;

      p = end;
    }

  /* If there's nothing left to parse, then it's ok. */
  return !*p;
}