(root)/
coreutils-9.4/
src/
uniq.c
       1  /* uniq -- remove duplicate lines from a sorted file
       2     Copyright (C) 1986-2023 Free Software Foundation, Inc.
       3  
       4     This program is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published by
       6     the Free Software Foundation, either version 3 of the License, or
       7     (at your option) any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Richard M. Stallman and David MacKenzie. */
      18  
      19  #include <config.h>
      20  
      21  #include <getopt.h>
      22  #include <sys/types.h>
      23  
      24  #include "system.h"
      25  #include "argmatch.h"
      26  #include "linebuffer.h"
      27  #include "fadvise.h"
      28  #include "posixver.h"
      29  #include "stdio--.h"
      30  #include "xstrtol.h"
      31  #include "memcasecmp.h"
      32  #include "quote.h"
      33  
      34  /* The official name of this program (e.g., no 'g' prefix).  */
      35  #define PROGRAM_NAME "uniq"
      36  
      37  #define AUTHORS \
      38    proper_name ("Richard M. Stallman"), \
      39    proper_name ("David MacKenzie")
      40  
      41  #define SWAP_LINES(A, B)			\
      42    do						\
      43      {						\
      44        struct linebuffer *_tmp;			\
      45        _tmp = (A);				\
      46        (A) = (B);				\
      47        (B) = _tmp;				\
      48      }						\
      49    while (0)
      50  
      51  /* Number of fields to skip on each line when doing comparisons. */
      52  static size_t skip_fields;
      53  
      54  /* Number of chars to skip after skipping any fields. */
      55  static size_t skip_chars;
      56  
      57  /* Number of chars to compare. */
      58  static size_t check_chars;
      59  
      60  enum countmode
      61  {
      62    count_occurrences,		/* -c Print count before output lines. */
      63    count_none			/* Default.  Do not print counts. */
      64  };
      65  
      66  /* Whether and how to precede the output lines with a count of the number of
      67     times they occurred in the input. */
      68  static enum countmode countmode;
      69  
      70  /* Which lines to output: unique lines, the first of a group of
      71     repeated lines, and the second and subsequent of a group of
      72     repeated lines.  */
      73  static bool output_unique;
      74  static bool output_first_repeated;
      75  static bool output_later_repeated;
      76  
      77  /* If true, ignore case when comparing.  */
      78  static bool ignore_case;
      79  
      80  enum delimit_method
      81  {
      82    /* No delimiters output.  --all-repeated[=none] */
      83    DM_NONE,
      84  
      85    /* Delimiter precedes all groups.  --all-repeated=prepend */
      86    DM_PREPEND,
      87  
      88    /* Delimit all groups.  --all-repeated=separate */
      89    DM_SEPARATE
      90  };
      91  
      92  static char const *const delimit_method_string[] =
      93  {
      94    "none", "prepend", "separate", nullptr
      95  };
      96  
      97  static enum delimit_method const delimit_method_map[] =
      98  {
      99    DM_NONE, DM_PREPEND, DM_SEPARATE
     100  };
     101  
     102  /* Select whether/how to delimit groups of duplicate lines.  */
     103  static enum delimit_method delimit_groups;
     104  
     105  enum grouping_method
     106  {
     107    /* No grouping, when "--group" isn't used */
     108    GM_NONE,
     109  
     110    /* Delimiter precedes all groups.  --group=prepend */
     111    GM_PREPEND,
     112  
     113    /* Delimiter follows all groups.   --group=append */
     114    GM_APPEND,
     115  
     116    /* Delimiter between groups.    --group[=separate] */
     117    GM_SEPARATE,
     118  
     119    /* Delimiter before and after each group. --group=both */
     120    GM_BOTH
     121  };
     122  
     123  static char const *const grouping_method_string[] =
     124  {
     125    "prepend", "append", "separate", "both", nullptr
     126  };
     127  
     128  static enum grouping_method const grouping_method_map[] =
     129  {
     130    GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
     131  };
     132  
     133  static enum grouping_method grouping = GM_NONE;
     134  
     135  enum
     136  {
     137    GROUP_OPTION = CHAR_MAX + 1
     138  };
     139  
     140  static struct option const longopts[] =
     141  {
     142    {"count", no_argument, nullptr, 'c'},
     143    {"repeated", no_argument, nullptr, 'd'},
     144    {"all-repeated", optional_argument, nullptr, 'D'},
     145    {"group", optional_argument, nullptr, GROUP_OPTION},
     146    {"ignore-case", no_argument, nullptr, 'i'},
     147    {"unique", no_argument, nullptr, 'u'},
     148    {"skip-fields", required_argument, nullptr, 'f'},
     149    {"skip-chars", required_argument, nullptr, 's'},
     150    {"check-chars", required_argument, nullptr, 'w'},
     151    {"zero-terminated", no_argument, nullptr, 'z'},
     152    {GETOPT_HELP_OPTION_DECL},
     153    {GETOPT_VERSION_OPTION_DECL},
     154    {nullptr, 0, nullptr, 0}
     155  };
     156  
     157  void
     158  usage (int status)
     159  {
     160    if (status != EXIT_SUCCESS)
     161      emit_try_help ();
     162    else
     163      {
     164        printf (_("\
     165  Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
     166  "),
     167                program_name);
     168        fputs (_("\
     169  Filter adjacent matching lines from INPUT (or standard input),\n\
     170  writing to OUTPUT (or standard output).\n\
     171  \n\
     172  With no options, matching lines are merged to the first occurrence.\n\
     173  "), stdout);
     174  
     175        emit_mandatory_arg_note ();
     176  
     177       fputs (_("\
     178    -c, --count           prefix lines by the number of occurrences\n\
     179    -d, --repeated        only print duplicate lines, one for each group\n\
     180  "), stdout);
     181       fputs (_("\
     182    -D                    print all duplicate lines\n\
     183        --all-repeated[=METHOD]  like -D, but allow separating groups\n\
     184                                   with an empty line;\n\
     185                                   METHOD={none(default),prepend,separate}\n\
     186  "), stdout);
     187       fputs (_("\
     188    -f, --skip-fields=N   avoid comparing the first N fields\n\
     189  "), stdout);
     190       fputs (_("\
     191        --group[=METHOD]  show all items, separating groups with an empty line;\n\
     192                            METHOD={separate(default),prepend,append,both}\n\
     193  "), stdout);
     194       fputs (_("\
     195    -i, --ignore-case     ignore differences in case when comparing\n\
     196    -s, --skip-chars=N    avoid comparing the first N characters\n\
     197    -u, --unique          only print unique lines\n\
     198  "), stdout);
     199        fputs (_("\
     200    -z, --zero-terminated     line delimiter is NUL, not newline\n\
     201  "), stdout);
     202       fputs (_("\
     203    -w, --check-chars=N   compare no more than N characters in lines\n\
     204  "), stdout);
     205       fputs (HELP_OPTION_DESCRIPTION, stdout);
     206       fputs (VERSION_OPTION_DESCRIPTION, stdout);
     207       fputs (_("\
     208  \n\
     209  A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
     210  characters.  Fields are skipped before chars.\n\
     211  "), stdout);
     212       fputs (_("\
     213  \n\
     214  Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
     215  You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
     216  "), stdout);
     217        emit_ancillary_info (PROGRAM_NAME);
     218      }
     219    exit (status);
     220  }
     221  
     222  static bool
     223  strict_posix2 (void)
     224  {
     225    int posix_ver = posix2_version ();
     226    return 200112 <= posix_ver && posix_ver < 200809;
     227  }
     228  
     229  /* Convert OPT to size_t, reporting an error using MSGID if OPT is
     230     invalid.  Silently convert too-large values to SIZE_MAX.  */
     231  
     232  static size_t
     233  size_opt (char const *opt, char const *msgid)
     234  {
     235    uintmax_t size;
     236  
     237    switch (xstrtoumax (opt, nullptr, 10, &size, ""))
     238      {
     239      case LONGINT_OK:
     240      case LONGINT_OVERFLOW:
     241        break;
     242  
     243      default:
     244        error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
     245      }
     246  
     247    return MIN (size, SIZE_MAX);
     248  }
     249  
     250  /* Given a linebuffer LINE,
     251     return a pointer to the beginning of the line's field to be compared. */
     252  
     253  ATTRIBUTE_PURE
     254  static char *
     255  find_field (struct linebuffer const *line)
     256  {
     257    size_t count;
     258    char const *lp = line->buffer;
     259    size_t size = line->length - 1;
     260    size_t i = 0;
     261  
     262    for (count = 0; count < skip_fields && i < size; count++)
     263      {
     264        while (i < size && field_sep (lp[i]))
     265          i++;
     266        while (i < size && !field_sep (lp[i]))
     267          i++;
     268      }
     269  
     270    i += MIN (skip_chars, size - i);
     271  
     272    return line->buffer + i;
     273  }
     274  
     275  /* Return false if two strings OLD and NEW match, true if not.
     276     OLD and NEW point not to the beginnings of the lines
     277     but rather to the beginnings of the fields to compare.
     278     OLDLEN and NEWLEN are their lengths. */
     279  
     280  static bool
     281  different (char *old, char *new, size_t oldlen, size_t newlen)
     282  {
     283    if (check_chars < oldlen)
     284      oldlen = check_chars;
     285    if (check_chars < newlen)
     286      newlen = check_chars;
     287  
     288    if (ignore_case)
     289      return oldlen != newlen || memcasecmp (old, new, oldlen);
     290    else
     291      return oldlen != newlen || memcmp (old, new, oldlen);
     292  }
     293  
     294  /* Output the line in linebuffer LINE to standard output
     295     provided that the switches say it should be output.
     296     MATCH is true if the line matches the previous line.
     297     If requested, print the number of times it occurred, as well;
     298     LINECOUNT + 1 is the number of times that the line occurred. */
     299  
     300  static void
     301  writeline (struct linebuffer const *line,
     302             bool match, uintmax_t linecount)
     303  {
     304    if (! (linecount == 0 ? output_unique
     305           : !match ? output_first_repeated
     306           : output_later_repeated))
     307      return;
     308  
     309    if (countmode == count_occurrences)
     310      printf ("%7" PRIuMAX " ", linecount + 1);
     311  
     312    if (fwrite (line->buffer, sizeof (char), line->length, stdout)
     313        != line->length)
     314      write_error ();
     315  }
     316  
     317  /* Process input file INFILE with output to OUTFILE.
     318     If either is "-", use the standard I/O stream for it instead. */
     319  
     320  static void
     321  check_file (char const *infile, char const *outfile, char delimiter)
     322  {
     323    struct linebuffer lb1, lb2;
     324    struct linebuffer *thisline, *prevline;
     325  
     326    if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
     327      error (EXIT_FAILURE, errno, "%s", quotef (infile));
     328    if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
     329      error (EXIT_FAILURE, errno, "%s", quotef (outfile));
     330  
     331    fadvise (stdin, FADVISE_SEQUENTIAL);
     332  
     333    thisline = &lb1;
     334    prevline = &lb2;
     335  
     336    initbuffer (thisline);
     337    initbuffer (prevline);
     338  
     339    /* The duplication in the following 'if' and 'else' blocks is an
     340       optimization to distinguish between when we can print input
     341       lines immediately (1. & 2.) or not.
     342  
     343       1. --group => all input lines are printed.
     344          checking for unique/duplicated lines is used only for printing
     345          group separators.
     346  
     347       2. The default case in which none of these options has been specified:
     348            --count, --repeated,  --all-repeated, --unique
     349          In the default case, this optimization lets uniq output each different
     350          line right away, without waiting to see if the next one is different.
     351  
     352       3. All other cases.
     353    */
     354    if (output_unique && output_first_repeated && countmode == count_none)
     355      {
     356        char *prevfield = nullptr;
     357        size_t prevlen;
     358        bool first_group_printed = false;
     359  
     360        while (!feof (stdin))
     361          {
     362            char *thisfield;
     363            size_t thislen;
     364            bool new_group;
     365  
     366            if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
     367              break;
     368  
     369            thisfield = find_field (thisline);
     370            thislen = thisline->length - 1 - (thisfield - thisline->buffer);
     371  
     372            new_group = (!prevfield
     373                         || different (thisfield, prevfield, thislen, prevlen));
     374  
     375            if (new_group && grouping != GM_NONE
     376                && (grouping == GM_PREPEND || grouping == GM_BOTH
     377                    || (first_group_printed && (grouping == GM_APPEND
     378                                                || grouping == GM_SEPARATE))))
     379              putchar (delimiter);
     380  
     381            if (new_group || grouping != GM_NONE)
     382              {
     383                if (fwrite (thisline->buffer, sizeof (char), thisline->length,
     384                    stdout) != thisline->length)
     385                  write_error ();
     386  
     387                SWAP_LINES (prevline, thisline);
     388                prevfield = thisfield;
     389                prevlen = thislen;
     390                first_group_printed = true;
     391              }
     392          }
     393        if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
     394          putchar (delimiter);
     395      }
     396    else
     397      {
     398        char *prevfield;
     399        size_t prevlen;
     400        uintmax_t match_count = 0;
     401        bool first_delimiter = true;
     402  
     403        if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
     404          goto closefiles;
     405        prevfield = find_field (prevline);
     406        prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
     407  
     408        while (!feof (stdin))
     409          {
     410            bool match;
     411            char *thisfield;
     412            size_t thislen;
     413            if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
     414              {
     415                if (ferror (stdin))
     416                  goto closefiles;
     417                break;
     418              }
     419            thisfield = find_field (thisline);
     420            thislen = thisline->length - 1 - (thisfield - thisline->buffer);
     421            match = !different (thisfield, prevfield, thislen, prevlen);
     422            match_count += match;
     423  
     424            if (match_count == UINTMAX_MAX)
     425              {
     426                if (count_occurrences)
     427                  error (EXIT_FAILURE, 0, _("too many repeated lines"));
     428                match_count--;
     429              }
     430  
     431            if (delimit_groups != DM_NONE)
     432              {
     433                if (!match)
     434                  {
     435                    if (match_count) /* a previous match */
     436                      first_delimiter = false; /* Only used when DM_SEPARATE */
     437                  }
     438                else if (match_count == 1)
     439                  {
     440                    if ((delimit_groups == DM_PREPEND)
     441                        || (delimit_groups == DM_SEPARATE
     442                            && !first_delimiter))
     443                      putchar (delimiter);
     444                  }
     445              }
     446  
     447            if (!match || output_later_repeated)
     448              {
     449                writeline (prevline, match, match_count);
     450                SWAP_LINES (prevline, thisline);
     451                prevfield = thisfield;
     452                prevlen = thislen;
     453                if (!match)
     454                  match_count = 0;
     455              }
     456          }
     457  
     458        writeline (prevline, false, match_count);
     459      }
     460  
     461   closefiles:
     462    if (ferror (stdin) || fclose (stdin) != 0)
     463      error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
     464  
     465    /* stdout is handled via the atexit-invoked close_stdout function.  */
     466  
     467    free (lb1.buffer);
     468    free (lb2.buffer);
     469  }
     470  
     471  enum Skip_field_option_type
     472    {
     473      SFO_NONE,
     474      SFO_OBSOLETE,
     475      SFO_NEW
     476    };
     477  
     478  int
     479  main (int argc, char **argv)
     480  {
     481    int optc = 0;
     482    bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
     483    enum Skip_field_option_type skip_field_option_type = SFO_NONE;
     484    unsigned int nfiles = 0;
     485    char const *file[2];
     486    char delimiter = '\n';	/* change with --zero-terminated, -z */
     487    bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
     488  
     489    file[0] = file[1] = "-";
     490    initialize_main (&argc, &argv);
     491    set_program_name (argv[0]);
     492    setlocale (LC_ALL, "");
     493    bindtextdomain (PACKAGE, LOCALEDIR);
     494    textdomain (PACKAGE);
     495  
     496    atexit (close_stdout);
     497  
     498    skip_chars = 0;
     499    skip_fields = 0;
     500    check_chars = SIZE_MAX;
     501    output_unique = output_first_repeated = true;
     502    output_later_repeated = false;
     503    countmode = count_none;
     504    delimit_groups = DM_NONE;
     505  
     506    while (true)
     507      {
     508        /* Parse an operand with leading "+" as a file after "--" was
     509           seen; or if pedantic and a file was seen; or if not
     510           obsolete.  */
     511  
     512        if (optc == -1
     513            || (posixly_correct && nfiles != 0)
     514            || ((optc = getopt_long (argc, argv,
     515                                     "-0123456789Dcdf:is:uw:z",
     516                                     longopts, nullptr))
     517                == -1))
     518          {
     519            if (argc <= optind)
     520              break;
     521            if (nfiles == 2)
     522              {
     523                error (0, 0, _("extra operand %s"), quote (argv[optind]));
     524                usage (EXIT_FAILURE);
     525              }
     526            file[nfiles++] = argv[optind++];
     527          }
     528        else switch (optc)
     529          {
     530          case 1:
     531            {
     532              uintmax_t size;
     533              if (optarg[0] == '+'
     534                  && ! strict_posix2 ()
     535                  && xstrtoumax (optarg, nullptr, 10, &size, "") == LONGINT_OK
     536                  && size <= SIZE_MAX)
     537                skip_chars = size;
     538              else if (nfiles == 2)
     539                {
     540                  error (0, 0, _("extra operand %s"), quote (optarg));
     541                  usage (EXIT_FAILURE);
     542                }
     543              else
     544                file[nfiles++] = optarg;
     545            }
     546            break;
     547  
     548          case '0':
     549          case '1':
     550          case '2':
     551          case '3':
     552          case '4':
     553          case '5':
     554          case '6':
     555          case '7':
     556          case '8':
     557          case '9':
     558            {
     559              if (skip_field_option_type == SFO_NEW)
     560                skip_fields = 0;
     561  
     562              if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
     563                skip_fields = SIZE_MAX;
     564  
     565              skip_field_option_type = SFO_OBSOLETE;
     566            }
     567            break;
     568  
     569          case 'c':
     570            countmode = count_occurrences;
     571            output_option_used = true;
     572            break;
     573  
     574          case 'd':
     575            output_unique = false;
     576            output_option_used = true;
     577            break;
     578  
     579          case 'D':
     580            output_unique = false;
     581            output_later_repeated = true;
     582            if (optarg == nullptr)
     583              delimit_groups = DM_NONE;
     584            else
     585              delimit_groups = XARGMATCH ("--all-repeated", optarg,
     586                                          delimit_method_string,
     587                                          delimit_method_map);
     588            output_option_used = true;
     589            break;
     590  
     591          case GROUP_OPTION:
     592            if (optarg == nullptr)
     593              grouping = GM_SEPARATE;
     594            else
     595              grouping = XARGMATCH ("--group", optarg,
     596                                    grouping_method_string,
     597                                    grouping_method_map);
     598            break;
     599  
     600          case 'f':
     601            skip_field_option_type = SFO_NEW;
     602            skip_fields = size_opt (optarg,
     603                                    N_("invalid number of fields to skip"));
     604            break;
     605  
     606          case 'i':
     607            ignore_case = true;
     608            break;
     609  
     610          case 's':
     611            skip_chars = size_opt (optarg,
     612                                   N_("invalid number of bytes to skip"));
     613            break;
     614  
     615          case 'u':
     616            output_first_repeated = false;
     617            output_option_used = true;
     618            break;
     619  
     620          case 'w':
     621            check_chars = size_opt (optarg,
     622                                    N_("invalid number of bytes to compare"));
     623            break;
     624  
     625          case 'z':
     626            delimiter = '\0';
     627            break;
     628  
     629          case_GETOPT_HELP_CHAR;
     630  
     631          case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
     632  
     633          default:
     634            usage (EXIT_FAILURE);
     635          }
     636      }
     637  
     638    /* Note we could allow --group with -D at least, and that would
     639       avoid the need to specify a grouping method to --all-repeated.
     640       It was thought best to avoid deprecating those parameters though
     641       and keep --group separate to other options.  */
     642    if (grouping != GM_NONE && output_option_used)
     643      {
     644        error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
     645        usage (EXIT_FAILURE);
     646      }
     647  
     648    if (grouping != GM_NONE && countmode != count_none)
     649      {
     650        error (0, 0,
     651             _("grouping and printing repeat counts is meaningless"));
     652        usage (EXIT_FAILURE);
     653      }
     654  
     655    if (countmode == count_occurrences && output_later_repeated)
     656      {
     657        error (0, 0,
     658             _("printing all duplicated lines and repeat counts is meaningless"));
     659        usage (EXIT_FAILURE);
     660      }
     661  
     662    check_file (file[0], file[1], delimiter);
     663  
     664    return EXIT_SUCCESS;
     665  }