1 /* GNU cmp - compare two files byte by byte
2
3 Copyright (C) 1990-1996, 1998, 2001-2002, 2004, 2006-2007, 2009-2013,
4 2015-2023 Free Software Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18
19 #include "system.h"
20 #include "paths.h"
21
22 #include <stdio.h>
23
24 #include <c-stack.h>
25 #include <cmpbuf.h>
26 #include "die.h"
27 #include <error.h>
28 #include <exitfail.h>
29 #include <file-type.h>
30 #include <getopt.h>
31 #include <hard-locale.h>
32 #include <inttostr.h>
33 #include <progname.h>
34 #include <unlocked-io.h>
35 #include <version-etc.h>
36 #include <xalloc.h>
37 #include <binary-io.h>
38 #include <xstdopen.h>
39 #include <xstrtol.h>
40
/* Canonical name of this program as passed to version_etc; it carries
   no installation prefix such as 'g'.  */
static char const PROGRAM_NAME[] = "cmp";

/* Author list passed to version_etc for --version output.  */
#define AUTHORS \
  proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
  proper_name ("David MacKenzie")
47
/* Return the result of hard_locale (LC_MESSAGES) when both LC_MESSAGES
   and native language support are available at build time; otherwise
   return false.  */
static bool
hard_locale_LC_MESSAGES (void)
{
  bool hard = false;
#if defined LC_MESSAGES && ENABLE_NLS
  hard = hard_locale (LC_MESSAGES);
#endif
  return hard;
}
57
58 static int cmp (void);
59 static off_t file_position (int);
60 static size_t block_compare (word const *, word const *) ATTRIBUTE_PURE;
61 static size_t count_newlines (char *, size_t);
62 static void sprintc (char *, unsigned char);
63
64 /* Filenames of the compared files. */
65 static char const *file[2];
66
67 /* File descriptors of the files. */
68 static int file_desc[2];
69
70 /* Status of the files. If st_size is negative, the status is unknown
71 and st_blksize (if it exists) is just a reasonable guess. */
72 static struct stat stat_buf[2];
73
74 /* Read buffers for the files. */
75 static word *buffer[2];
76
77 /* Optimal block size for the files. */
78 static size_t buf_size;
79
80 /* Initial prefix to ignore for each file. */
81 static off_t ignore_initial[2];
82
83 /* Number of bytes to compare, or -1 if there is no limit. */
84 static intmax_t bytes = -1;
85
/* Output formats.  The numeric order is significant: the zero value
   doubles as "no format chosen yet" in specify_comparison_type, and
   main tests for values below type_no_stdout.  */
enum comparison_type
{
  type_first_diff = 0,  /* Print the first difference.  */
  type_all_diffs = 1,   /* Print all differences.  */
  type_no_stdout = 2,   /* Do not output to stdout; only stderr.  */
  type_status = 3       /* Exit status only.  */
};

/* The output format in effect.  */
static enum comparison_type comparison_type = type_first_diff;

/* When true, print the values of differing bytes, quoted the way
   cat -t does.  */
static bool opt_print_bytes = false;
97
/* Option codes for long options that have no single-letter form.
   They start just above CHAR_MAX.  */
enum
{
  HELP_OPTION = CHAR_MAX + 1
};

/* Long option table handed to getopt_long.  Each entry maps a long
   name onto the code handled by the option switch in main.  */
static struct option const long_options[] =
{
  {"print-bytes", no_argument, NULL, 'b'},
  /* --print-chars is obsolescent as of diffutils 2.7.3.  */
  {"print-chars", no_argument, NULL, 'c'},
  {"ignore-initial", required_argument, NULL, 'i'},
  {"verbose", no_argument, NULL, 'l'},
  {"bytes", required_argument, NULL, 'n'},
  {"silent", no_argument, NULL, 's'},
  {"quiet", no_argument, NULL, 's'},
  {"version", no_argument, NULL, 'v'},
  {"help", no_argument, NULL, HELP_OPTION},
  {NULL, 0, NULL, 0}
};
117
118 static _Noreturn void
119 try_help (char const *reason_msgid, char const *operand)
120 {
121 if (reason_msgid)
122 error (0, 0, _(reason_msgid), operand);
123 die (EXIT_TROUBLE, 0,
124 _("Try '%s --help' for more information."), program_name);
125 }
126
127 static char const valid_suffixes[] = "kKMGTPEZY0";
128
129 /* Update ignore_initial[F] according to the result of parsing an
130 *operand ARGPTR of --ignore-initial, updating *ARGPTR to point
131 *after the operand. If DELIMITER is nonzero, the operand may be
132 *followed by DELIMITER; otherwise it must be null-terminated. */
133 static void
134 specify_ignore_initial (int f, char **argptr, char delimiter)
135 {
136 intmax_t val;
137 char const *arg = *argptr;
138 strtol_error e = xstrtoimax (arg, argptr, 0, &val, valid_suffixes);
139 if (! ((e == LONGINT_OK
140 || (e == LONGINT_INVALID_SUFFIX_CHAR && **argptr == delimiter))
141 && 0 <= val && val <= TYPE_MAXIMUM (off_t)))
142 try_help ("invalid --ignore-initial value '%s'", arg);
143 if (ignore_initial[f] < val)
144 ignore_initial[f] = val;
145 }
146
147 /* Specify the output format. */
148 static void
149 specify_comparison_type (enum comparison_type t)
150 {
151 if (comparison_type && comparison_type != t)
152 try_help ("options -l and -s are incompatible", 0);
153 comparison_type = t;
154 }
155
156 static void
157 check_stdout (void)
158 {
159 if (ferror (stdout))
160 die (EXIT_TROUBLE, 0, "%s", _("write failed"));
161 else if (fclose (stdout) != 0)
162 die (EXIT_TROUBLE, errno, "%s", _("standard output"));
163 }
164
165 static char const * const option_help_msgid[] = {
166 N_("-b, --print-bytes print differing bytes"),
167 N_("-i, --ignore-initial=SKIP skip first SKIP bytes of both inputs"),
168 N_("-i, --ignore-initial=SKIP1:SKIP2 skip first SKIP1 bytes of FILE1 and\n"
169 " first SKIP2 bytes of FILE2"),
170 N_("-l, --verbose output byte numbers and differing byte values"),
171 N_("-n, --bytes=LIMIT compare at most LIMIT bytes"),
172 N_("-s, --quiet, --silent suppress all normal output"),
173 N_(" --help display this help and exit"),
174 N_("-v, --version output version information and exit"),
175 0
176 };
177
178 static void
179 usage (void)
180 {
181 char const * const *p;
182
183 printf (_("Usage: %s [OPTION]... FILE1 [FILE2 [SKIP1 [SKIP2]]]\n"),
184 program_name);
185 printf ("%s\n", _("Compare two files byte by byte."));
186 printf ("\n%s\n\n",
187 _("The optional SKIP1 and SKIP2 specify the number of bytes to skip\n"
188 "at the beginning of each file (zero by default)."));
189
190 fputs (_("\
191 Mandatory arguments to long options are mandatory for short options too.\n\
192 "), stdout);
193 for (p = option_help_msgid; *p; p++)
194 printf (" %s\n", _(*p));
195 printf ("\n%s\n\n%s\n%s\n",
196 _("SKIP values may be followed by the following multiplicative suffixes:\n\
197 kB 1000, K 1024, MB 1,000,000, M 1,048,576,\n\
198 GB 1,000,000,000, G 1,073,741,824, and so on for T, P, E, Z, Y."),
199 _("If a FILE is '-' or missing, read standard input."),
200 _("Exit status is 0 if inputs are the same, 1 if different, 2 if trouble."));
201 emit_bug_reporting_address ();
202 }
203
204 int
205 main (int argc, char **argv)
206 {
207 int c, exit_status;
208 size_t words_per_buffer;
209
210 exit_failure = EXIT_TROUBLE;
211 initialize_main (&argc, &argv);
212 set_program_name (argv[0]);
213 setlocale (LC_ALL, "");
214 bindtextdomain (PACKAGE, LOCALEDIR);
215 textdomain (PACKAGE);
216 c_stack_action (0);
217 xstdopen ();
218
219 /* Parse command line options. */
220
221 while ((c = getopt_long (argc, argv, "bci:ln:sv", long_options, 0))
222 != -1)
223 switch (c)
224 {
225 case 'b':
226 case 'c': /* 'c' is obsolescent as of diffutils 2.7.3 */
227 opt_print_bytes = true;
228 break;
229
230 case 'i':
231 specify_ignore_initial (0, &optarg, ':');
232 if (*optarg++ == ':')
233 specify_ignore_initial (1, &optarg, 0);
234 else if (ignore_initial[1] < ignore_initial[0])
235 ignore_initial[1] = ignore_initial[0];
236 break;
237
238 case 'l':
239 specify_comparison_type (type_all_diffs);
240 break;
241
242 case 'n':
243 {
244 intmax_t n;
245 if (xstrtoimax (optarg, 0, 0, &n, valid_suffixes) != LONGINT_OK
246 || n < 0)
247 try_help ("invalid --bytes value '%s'", optarg);
248 if (! (0 <= bytes && bytes < n))
249 bytes = n;
250 }
251 break;
252
253 case 's':
254 specify_comparison_type (type_status);
255 break;
256
257 case 'v':
258 version_etc (stdout, PROGRAM_NAME, PACKAGE_NAME, Version,
259 AUTHORS, nullptr);
260 check_stdout ();
261 return EXIT_SUCCESS;
262
263 case HELP_OPTION:
264 usage ();
265 check_stdout ();
266 return EXIT_SUCCESS;
267
268 default:
269 try_help (0, 0);
270 }
271
272 if (optind == argc)
273 try_help ("missing operand after '%s'", argv[argc - 1]);
274
275 file[0] = argv[optind++];
276 file[1] = optind < argc ? argv[optind++] : "-";
277
278 for (int f = 0; f < 2 && optind < argc; f++)
279 {
280 char *arg = argv[optind++];
281 specify_ignore_initial (f, &arg, 0);
282 }
283
284 if (optind < argc)
285 try_help ("extra operand '%s'", argv[optind]);
286
287 for (int f = 0; f < 2; f++)
288 {
289 /* Two files with the same name and offset are identical.
290 But wait until we open the file once, for proper diagnostics. */
291 if (f && ignore_initial[0] == ignore_initial[1]
292 && file_name_cmp (file[0], file[1]) == 0)
293 return EXIT_SUCCESS;
294
295 if (STREQ (file[f], "-"))
296 {
297 file_desc[f] = STDIN_FILENO;
298 if (O_BINARY && ! isatty (STDIN_FILENO))
299 set_binary_mode (STDIN_FILENO, O_BINARY);
300 }
301 else
302 {
303 file_desc[f] = open (file[f], O_RDONLY | O_BINARY, 0);
304
305 if (file_desc[f] < 0)
306 {
307 if (comparison_type != type_status)
308 error (0, errno, "%s", file[f]);
309 exit (EXIT_TROUBLE);
310 }
311 }
312
313 if (fstat (file_desc[f], stat_buf + f) < 0)
314 {
315 stat_buf[f].st_size = -1;
316 #if HAVE_STRUCT_STAT_ST_BLKSIZE
317 stat_buf[f].st_blksize = 8 * 1024;
318 #endif
319 }
320 }
321
322 /* If the files are links to the same inode and have the same file position,
323 they are identical. */
324
325 if (0 <= stat_buf[0].st_size && 0 <= stat_buf[1].st_size
326 && 0 < same_file (&stat_buf[0], &stat_buf[1])
327 && same_file_attributes (&stat_buf[0], &stat_buf[1])
328 && file_position (0) == file_position (1))
329 return EXIT_SUCCESS;
330
331 /* If output is redirected to the null device, we can avoid some of
332 the work. */
333
334 if (comparison_type != type_status)
335 {
336 struct stat outstat, nullstat;
337
338 if (fstat (STDOUT_FILENO, &outstat) == 0
339 && stat (NULL_DEVICE, &nullstat) == 0
340 && 0 < same_file (&outstat, &nullstat))
341 comparison_type = type_no_stdout;
342 }
343
344 /* If only a return code is needed,
345 and if both input descriptors are associated with plain files,
346 conclude that the files differ if they have different sizes
347 and if more bytes will be compared than are in the smaller file. */
348
349 if (comparison_type == type_status
350 && 0 <= stat_buf[0].st_size && S_ISREG (stat_buf[0].st_mode)
351 && 0 <= stat_buf[1].st_size && S_ISREG (stat_buf[1].st_mode))
352 {
353 off_t s0 = stat_buf[0].st_size - file_position (0);
354 off_t s1 = stat_buf[1].st_size - file_position (1);
355 if (s0 < 0)
356 s0 = 0;
357 if (s1 < 0)
358 s1 = 0;
359 if (s0 != s1 && (bytes < 0 || MIN (s0, s1) < bytes))
360 exit (EXIT_FAILURE);
361 }
362
363 /* Guess a good block size for the files. */
364
365 buf_size = buffer_lcm (STAT_BLOCKSIZE (stat_buf[0]),
366 STAT_BLOCKSIZE (stat_buf[1]),
367 PTRDIFF_MAX - sizeof (word));
368
369 /* Allocate word-aligned buffers, with space for sentinels at the end. */
370
371 words_per_buffer = (buf_size + 2 * sizeof (word) - 1) / sizeof (word);
372 buffer[0] = xmalloc (2 * sizeof (word) * words_per_buffer);
373 buffer[1] = buffer[0] + words_per_buffer;
374
375 exit_status = cmp ();
376
377 for (int f = 0; f < 2; f++)
378 if (close (file_desc[f]) != 0)
379 die (EXIT_TROUBLE, errno, "%s", file[f]);
380 if (exit_status != EXIT_SUCCESS && comparison_type < type_no_stdout)
381 check_stdout ();
382 exit (exit_status);
383 return exit_status;
384 }
385
386 /* Compare the two files already open on 'file_desc[0]' and 'file_desc[1]',
387 using 'buffer[0]' and 'buffer[1]'.
388 Return EXIT_SUCCESS if identical, EXIT_FAILURE if different,
389 >1 if error. */
390
391 static int
392 cmp (void)
393 {
394 bool at_line_start = true;
395 off_t line_number = 1; /* Line number (1...) of difference. */
396 off_t byte_number = 1; /* Byte number (1...) of difference. */
397 intmax_t remaining = bytes; /* Remaining bytes to compare, or -1. */
398 size_t read0, read1; /* Number of bytes read from each file. */
399 size_t first_diff; /* Offset (0...) in buffers of 1st diff. */
400 size_t smaller; /* The lesser of 'read0' and 'read1'. */
401 word *buffer0 = buffer[0];
402 word *buffer1 = buffer[1];
403 char *buf0 = (char *) buffer0;
404 char *buf1 = (char *) buffer1;
405 int differing = 0;
406 int f;
407 int offset_width IF_LINT (= 0); /* IF_LINT due to GCC bug 101768. */
408
409 if (comparison_type == type_all_diffs)
410 {
411 off_t byte_number_max = (0 <= bytes && bytes <= TYPE_MAXIMUM (off_t)
412 ? bytes : TYPE_MAXIMUM (off_t));
413
414 for (f = 0; f < 2; f++)
415 if (0 <= stat_buf[f].st_size && S_ISREG (stat_buf[f].st_mode))
416 {
417 off_t file_bytes = stat_buf[f].st_size - file_position (f);
418 if (file_bytes < byte_number_max)
419 byte_number_max = file_bytes;
420 }
421
422 for (offset_width = 1; (byte_number_max /= 10) != 0; offset_width++)
423 continue;
424 }
425
426 for (f = 0; f < 2; f++)
427 {
428 off_t ig = ignore_initial[f];
429 if (ig && file_position (f) == -1)
430 {
431 /* lseek failed; read and discard the ignored initial prefix. */
432 do
433 {
434 size_t bytes_to_read = MIN (ig, buf_size);
435 size_t r = block_read (file_desc[f], buf0, bytes_to_read);
436 if (r != bytes_to_read)
437 {
438 if (r == SIZE_MAX)
439 die (EXIT_TROUBLE, errno, "%s", file[f]);
440 break;
441 }
442 ig -= r;
443 }
444 while (ig);
445 }
446 }
447
448 do
449 {
450 size_t bytes_to_read = buf_size;
451
452 if (0 <= remaining)
453 {
454 if (remaining < bytes_to_read)
455 bytes_to_read = remaining;
456 remaining -= bytes_to_read;
457 }
458
459 read0 = block_read (file_desc[0], buf0, bytes_to_read);
460 if (read0 == SIZE_MAX)
461 die (EXIT_TROUBLE, errno, "%s", file[0]);
462 read1 = block_read (file_desc[1], buf1, bytes_to_read);
463 if (read1 == SIZE_MAX)
464 die (EXIT_TROUBLE, errno, "%s", file[1]);
465
466 smaller = MIN (read0, read1);
467
468 /* Optimize the common case where the buffers are the same. */
469 if (memcmp (buf0, buf1, smaller) == 0)
470 first_diff = smaller;
471 else
472 {
473 /* Insert sentinels for the block compare. */
474 if (read0 >= read1)
475 buf1[read0] = 0x55; /* arbitrary */
476 if (read1 >= read0)
477 buf0[read1] = 0x79; /* arbitrary and distinct from the above */
478 buf0[read0] = ~buf1[read0];
479 buf1[read1] = ~buf0[read1];
480 /* Ensure all bytes of a final word-read are initialized. */
481 memset (buf0 + read0 + 1, 0,
482 sizeof (word) - read0 % sizeof (word) - 1);
483 memset (buf1 + read1 + 1, 0,
484 sizeof (word) - read1 % sizeof (word) - 1);
485
486 first_diff = block_compare (buffer0, buffer1);
487 }
488
489 byte_number += first_diff;
490 if (comparison_type == type_first_diff && first_diff != 0)
491 {
492 line_number += count_newlines (buf0, first_diff);
493 at_line_start = buf0[first_diff - 1] == '\n';
494 }
495
496 if (first_diff < smaller)
497 {
498 switch (comparison_type)
499 {
500 case type_first_diff:
501 {
502 char byte_buf[INT_BUFSIZE_BOUND (off_t)];
503 char line_buf[INT_BUFSIZE_BOUND (off_t)];
504 char const *byte_num = offtostr (byte_number, byte_buf);
505 char const *line_num = offtostr (line_number, line_buf);
506 if (!opt_print_bytes)
507 {
508 /* See POSIX for this format. This message is
509 used only in the POSIX locale, so it need not
510 be translated. */
511 static char const char_message[] =
512 "%s %s differ: char %s, line %s\n";
513
514 /* The POSIX rationale recommends using the word
515 "byte" outside the POSIX locale. Some gettext
516 implementations translate even in the POSIX
517 locale if certain other environment variables
518 are set, so use "byte" if a translation is
519 available, or if outside the POSIX locale. */
520 static char const byte_msgid[] =
521 N_("%s %s differ: byte %s, line %s\n");
522 char const *byte_message = _(byte_msgid);
523 bool use_byte_message = (byte_message != byte_msgid
524 || hard_locale_LC_MESSAGES ());
525
526 printf (use_byte_message ? byte_message : char_message,
527 file[0], file[1], byte_num, line_num);
528 }
529 else
530 {
531 unsigned char c0 = buf0[first_diff];
532 unsigned char c1 = buf1[first_diff];
533 char s0[5];
534 char s1[5];
535 sprintc (s0, c0);
536 sprintc (s1, c1);
537 printf (_("%s %s differ: byte %s, line %s is %3o %s %3o %s\n"),
538 file[0], file[1], byte_num, line_num,
539 c0, s0, c1, s1);
540 }
541 }
542 FALLTHROUGH;
543 case type_status:
544 return EXIT_FAILURE;
545
546 case type_all_diffs:
547 do
548 {
549 unsigned char c0 = buf0[first_diff];
550 unsigned char c1 = buf1[first_diff];
551 if (c0 != c1)
552 {
553 char byte_buf[INT_BUFSIZE_BOUND (off_t)];
554 char const *byte_num = offtostr (byte_number, byte_buf);
555 if (!opt_print_bytes)
556 {
557 /* See POSIX for this format. */
558 printf ("%*s %3o %3o\n",
559 offset_width, byte_num, c0, c1);
560 }
561 else
562 {
563 char s0[5];
564 char s1[5];
565 sprintc (s0, c0);
566 sprintc (s1, c1);
567 printf ("%*s %3o %-4s %3o %s\n",
568 offset_width, byte_num, c0, s0, c1, s1);
569 }
570 }
571 byte_number++;
572 first_diff++;
573 }
574 while (first_diff < smaller);
575 differing = -1;
576 break;
577
578 case type_no_stdout:
579 differing = 1;
580 break;
581 }
582 }
583
584 if (read0 != read1)
585 {
586 if (differing <= 0 && comparison_type != type_status)
587 {
588 char const *shorter_file = file[read1 < read0];
589
590 /* POSIX says that each of these format strings must be
591 "cmp: EOF on %s", optionally followed by a blank and
592 extra text sans newline, then terminated by "\n". */
593 if (byte_number == 1)
594 fprintf (stderr, _("cmp: EOF on %s which is empty\n"),
595 shorter_file);
596 else
597 {
598 char byte_buf[INT_BUFSIZE_BOUND (off_t)];
599 char const *byte_num = offtostr (byte_number - 1, byte_buf);
600
601 if (comparison_type == type_first_diff)
602 {
603 char line_buf[INT_BUFSIZE_BOUND (off_t)];
604 char const *line_num
605 = offtostr (line_number - at_line_start, line_buf);
606 fprintf (stderr,
607 (at_line_start
608 ? _("cmp: EOF on %s after byte %s, line %s\n")
609 : _("cmp: EOF on %s after byte %s,"
610 " in line %s\n")),
611 shorter_file, byte_num, line_num);
612 }
613 else
614 fprintf (stderr,
615 _("cmp: EOF on %s after byte %s\n"),
616 shorter_file, byte_num);
617 }
618 }
619
620 return EXIT_FAILURE;
621 }
622 }
623 while (differing <= 0 && read0 == buf_size);
624
625 return differing == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
626 }
627
628 /* Compare two blocks of memory P0 and P1 until they differ.
629 If the blocks are not guaranteed to be different, put sentinels at the ends
630 of the blocks before calling this function.
631
632 Return the offset of the first byte that differs. */
633
634 static size_t
635 block_compare (word const *p0, word const *p1)
636 {
637 word const *l0, *l1;
638 char const *c0, *c1;
639
640 /* Find the rough position of the first difference by reading words,
641 not bytes. */
642
643 for (l0 = p0, l1 = p1; *l0 == *l1; l0++, l1++)
644 continue;
645
646 /* Find the exact differing position (endianness independent). */
647
648 for (c0 = (char const *) l0, c1 = (char const *) l1;
649 *c0 == *c1;
650 c0++, c1++)
651 continue;
652
653 return c0 - (char const *) p0;
654 }
655
/* Return the number of newlines in the first BUFSIZE bytes of BUF.
   BUF[BUFSIZE] must be writable: it is temporarily overwritten with
   a sentinel newline and restored before returning.  */

static size_t
count_newlines (char *buf, size_t bufsize)
{
  char *end = buf + bufsize;
  char saved = *end;
  size_t newlines = 0;

  /* The sentinel guarantees that the unbounded search terminates.  */
  *end = '\n';

  char *hit = rawmemchr (buf, '\n');
  while (hit != end)
    {
      newlines++;
      hit = rawmemchr (hit + 1, '\n');
    }

  *end = saved;
  return newlines;
}
672
/* Store in BUF a null-terminated, visible rendering of the byte C,
   quoting unprintable bytes like cat -t does: "M-" marks a byte
   with the high bit set, and "^" marks a control character or DEL.
   The longest result, such as "M-^?", needs five bytes including
   the terminator.  */

static void
sprintc (char *buf, unsigned char c)
{
  char *out = buf;
  unsigned char ch = c;

  if (! isprint (ch))
    {
      if (ch & 0x80)
        {
          *out++ = 'M';
          *out++ = '-';
          ch &= 0x7f;
        }

      if (ch < 0x20)
        {
          /* Control character: show the matching letter, e.g. ^A.  */
          *out++ = '^';
          ch |= 0x40;
        }
      else if (ch == 0x7f)
        {
          *out++ = '^';
          ch = '?';
        }
    }

  *out++ = ch;
  *out = '\0';
}
702
703 /* Position file F to ignore_initial[F] bytes from its initial position,
704 and yield its new position. Don't try more than once. */
705
706 static off_t
707 file_position (int f)
708 {
709 static bool positioned[2];
710 static off_t position[2];
711
712 if (! positioned[f])
713 {
714 positioned[f] = true;
715 position[f] = lseek (file_desc[f], ignore_initial[f], SEEK_CUR);
716 }
717 return position[f];
718 }