(root)/
coreutils-9.4/
src/
shuf.c
       1  /* Shuffle lines of text.
       2  
       3     Copyright (C) 2006-2023 Free Software Foundation, Inc.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.
      17  
      18     Written by Paul Eggert.  */
      19  
      20  #include <config.h>
      21  
      22  #include <sys/types.h>
      23  #include "system.h"
      24  
      25  #include "fadvise.h"
      26  #include "getopt.h"
      27  #include "linebuffer.h"
      28  #include "quote.h"
      29  #include "randint.h"
      30  #include "randperm.h"
      31  #include "read-file.h"
      32  #include "stdio--.h"
      33  #include "xstrtol.h"
      34  
      35  /* The official name of this program (e.g., no 'g' prefix).  */
      36  #define PROGRAM_NAME "shuf"
      37  
      38  #define AUTHORS proper_name ("Paul Eggert")
      39  
      40  /* For reservoir-sampling, allocate the reservoir lines in batches.  */
      41  enum { RESERVOIR_LINES_INCREMENT = 1024 };
      42  
      43  /* reservoir-sampling introduces CPU overhead for small inputs.
      44     So only enable it for inputs >= this limit.
      45     This limit was determined using these commands:
      46       $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
      47       $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
      48       $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done  .*/
      49  enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
      50  
      51  
      52  void
      53  usage (int status)
      54  {
      55    if (status != EXIT_SUCCESS)
      56      emit_try_help ();
      57    else
      58      {
      59        printf (_("\
      60  Usage: %s [OPTION]... [FILE]\n\
      61    or:  %s -e [OPTION]... [ARG]...\n\
      62    or:  %s -i LO-HI [OPTION]...\n\
      63  "),
      64                program_name, program_name, program_name);
      65        fputs (_("\
      66  Write a random permutation of the input lines to standard output.\n\
      67  "), stdout);
      68  
      69        emit_stdin_note ();
      70        emit_mandatory_arg_note ();
      71  
      72        fputs (_("\
      73    -e, --echo                treat each ARG as an input line\n\
      74    -i, --input-range=LO-HI   treat each number LO through HI as an input line\n\
      75    -n, --head-count=COUNT    output at most COUNT lines\n\
      76    -o, --output=FILE         write result to FILE instead of standard output\n\
      77        --random-source=FILE  get random bytes from FILE\n\
      78    -r, --repeat              output lines can be repeated\n\
      79  "), stdout);
      80        fputs (_("\
      81    -z, --zero-terminated     line delimiter is NUL, not newline\n\
      82  "), stdout);
      83        fputs (HELP_OPTION_DESCRIPTION, stdout);
      84        fputs (VERSION_OPTION_DESCRIPTION, stdout);
      85        emit_ancillary_info (PROGRAM_NAME);
      86      }
      87  
      88    exit (status);
      89  }
      90  
      91  /* For long options that have no equivalent short option, use a
      92     non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
      93  enum
      94  {
      95    RANDOM_SOURCE_OPTION = CHAR_MAX + 1
      96  };
      97  
      98  static struct option const long_opts[] =
      99  {
     100    {"echo", no_argument, nullptr, 'e'},
     101    {"input-range", required_argument, nullptr, 'i'},
     102    {"head-count", required_argument, nullptr, 'n'},
     103    {"output", required_argument, nullptr, 'o'},
     104    {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
     105    {"repeat", no_argument, nullptr, 'r'},
     106    {"zero-terminated", no_argument, nullptr, 'z'},
     107    {GETOPT_HELP_OPTION_DECL},
     108    {GETOPT_VERSION_OPTION_DECL},
     109    {0, 0, 0, 0},
     110  };
     111  
     112  static void
     113  input_from_argv (char **operand, int n_operands, char eolbyte)
     114  {
     115    char *p;
     116    size_t size = n_operands;
     117    int i;
     118  
     119    for (i = 0; i < n_operands; i++)
     120      size += strlen (operand[i]);
     121    p = xmalloc (size);
     122  
     123    for (i = 0; i < n_operands; i++)
     124      {
     125        char *p1 = stpcpy (p, operand[i]);
     126        operand[i] = p;
     127        p = p1;
     128        *p++ = eolbyte;
     129      }
     130  
     131    operand[n_operands] = p;
     132  }
     133  
     134  /* Return the start of the next line after LINE, which is guaranteed
     135     to end in EOLBYTE.  */
     136  
     137  static char *
     138  next_line (char *line, char eolbyte)
     139  {
     140    char *p = rawmemchr (line, eolbyte);
     141    return p + 1;
     142  }
     143  
     144  /* Return the size of the input if possible or OFF_T_MAX if not.  */
     145  
     146  static off_t
     147  input_size (void)
     148  {
     149    off_t file_size;
     150  
     151    struct stat stat_buf;
     152    if (fstat (STDIN_FILENO, &stat_buf) != 0)
     153      return OFF_T_MAX;
     154    if (usable_st_size (&stat_buf))
     155      file_size = stat_buf.st_size;
     156    else
     157      return OFF_T_MAX;
     158  
     159    off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
     160    if (input_offset < 0)
     161      return OFF_T_MAX;
     162  
     163    file_size -= input_offset;
     164  
     165    return file_size;
     166  }
     167  
     168  /* Read all lines and store up to K permuted lines in *OUT_RSRV.
     169     Return the number of lines read, up to a maximum of K.  */
     170  
     171  static size_t
     172  read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
     173                                 struct randint_source *s,
     174                                 struct linebuffer **out_rsrv)
     175  {
     176    randint n_lines = 0;
     177    size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
     178    struct linebuffer *line = nullptr;
     179    struct linebuffer *rsrv;
     180  
     181    rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
     182  
     183    /* Fill the first K lines, directly into the reservoir.  */
     184    while (n_lines < k
     185           && (line =
     186               readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != nullptr)
     187      {
     188        n_lines++;
     189  
     190        /* Enlarge reservoir.  */
     191        if (n_lines >= n_alloc_lines)
     192          {
     193            n_alloc_lines += RESERVOIR_LINES_INCREMENT;
     194            rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
     195            memset (&rsrv[n_lines], 0,
     196                    RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
     197          }
     198      }
     199  
     200    /* last line wasn't null - so there may be more lines to read.  */
     201    if (line != nullptr)
     202      {
     203        struct linebuffer dummy;
     204        initbuffer (&dummy);  /* space for lines not put in reservoir.  */
     205  
     206        /* Choose the fate of the next line, with decreasing probability (as
     207           n_lines increases in size).
     208  
     209           If the line will be used, store it directly in the reservoir.
     210           Otherwise, store it in dummy space.
     211  
     212           With 'struct linebuffer', storing into existing buffer will reduce
     213           re-allocations (will only re-allocate if the new line is longer than
     214           the currently allocated space).  */
     215        do
     216          {
     217            randint j = randint_choose (s, n_lines + 1);  /* 0 .. n_lines.  */
     218            line = (j < k) ? (&rsrv[j]) : (&dummy);
     219          }
     220        while (readlinebuffer_delim (line, in, eolbyte) != nullptr && n_lines++);
     221  
     222        if (! n_lines)
     223          error (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
     224  
     225        freebuffer (&dummy);
     226      }
     227  
     228    /* no more input lines, or an input error.  */
     229    if (ferror (in))
     230      error (EXIT_FAILURE, errno, _("read error"));
     231  
     232    *out_rsrv = rsrv;
     233    return MIN (k, n_lines);
     234  }
     235  
     236  static int
     237  write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
     238                                   size_t const *permutation)
     239  {
     240    for (size_t i = 0; i < n_lines; i++)
     241      {
     242        const struct linebuffer *p = &lines[permutation[i]];
     243        if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
     244          return -1;
     245      }
     246  
     247    return 0;
     248  }
     249  
     250  /* Read data from file IN.  Input lines are delimited by EOLBYTE;
     251     silently append a trailing EOLBYTE if the file ends in some other
     252     byte.  Store a pointer to the resulting array of lines into *PLINE.
     253     Return the number of lines read.  Report an error and exit on
     254     failure.  */
     255  
     256  static size_t
     257  read_input (FILE *in, char eolbyte, char ***pline)
     258  {
     259    char *p;
     260    char *buf = nullptr;
     261    size_t used;
     262    char *lim;
     263    char **line;
     264    size_t n_lines;
     265  
     266    /* TODO: We should limit the amount of data read here,
     267       to less than RESERVOIR_MIN_INPUT.  I.e., adjust fread_file() to support
     268       taking a byte limit.  We'd then need to ensure we handle a line spanning
     269       this boundary.  With that in place we could set use_reservoir_sampling
     270       when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
     271       call a wrapper function to populate a linebuffer from the internal pline
     272       or if none left, stdin.  Doing that would give better performance by
     273       avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
     274       from a pipe, and allow us to dispense with the input_size() function.  */
     275    if (!(buf = fread_file (in, 0, &used)))
     276      error (EXIT_FAILURE, errno, _("read error"));
     277  
     278    if (used && buf[used - 1] != eolbyte)
     279      buf[used++] = eolbyte;
     280  
     281    lim = buf + used;
     282  
     283    n_lines = 0;
     284    for (p = buf; p < lim; p = next_line (p, eolbyte))
     285      n_lines++;
     286  
     287    *pline = line = xnmalloc (n_lines + 1, sizeof *line);
     288  
     289    line[0] = p = buf;
     290    for (size_t i = 1; i <= n_lines; i++)
     291      line[i] = p = next_line (p, eolbyte);
     292  
     293    return n_lines;
     294  }
     295  
     296  /* Output N_LINES lines to stdout from LINE array,
     297     chosen by the indices in PERMUTATION.
     298     PERMUTATION and LINE must have at least N_LINES elements.
     299     Strings in LINE must include the line-terminator character.  */
     300  static int
     301  write_permuted_lines (size_t n_lines, char *const *line,
     302                        size_t const *permutation)
     303  {
     304    for (size_t i = 0; i < n_lines; i++)
     305      {
     306        char *const *p = line + permutation[i];
     307        size_t len = p[1] - p[0];
     308        if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
     309          return -1;
     310      }
     311  
     312    return 0;
     313  }
     314  
     315  /* Output N_LINES of numbers to stdout, from PERMUTATION array.
     316     PERMUTATION must have at least N_LINES elements.  */
     317  static int
     318  write_permuted_numbers (size_t n_lines, size_t lo_input,
     319                          size_t const *permutation, char eolbyte)
     320  {
     321    for (size_t i = 0; i < n_lines; i++)
     322      {
     323        unsigned long int n = lo_input + permutation[i];
     324        if (printf ("%lu%c", n, eolbyte) < 0)
     325          return -1;
     326      }
     327  
     328    return 0;
     329  }
     330  
     331  /* Output COUNT numbers to stdout, chosen randomly from range
     332     LO_INPUT through HI_INPUT.  */
     333  static int
     334  write_random_numbers (struct randint_source *s, size_t count,
     335                        size_t lo_input, size_t hi_input, char eolbyte)
     336  {
     337    const randint range = hi_input - lo_input + 1;
     338  
     339    for (size_t i = 0; i < count; i++)
     340      {
     341        unsigned long int j = lo_input + randint_choose (s, range);
     342        if (printf ("%lu%c", j, eolbyte) < 0)
     343          return -1;
     344      }
     345  
     346    return 0;
     347  }
     348  
     349  /* Output COUNT lines to stdout from LINES array.
     350     LINES must have at least N_LINES elements in it.
     351     Strings in LINES_ must include the line-terminator character.  */
     352  static int
     353  write_random_lines (struct randint_source *s, size_t count,
     354                      char *const *lines, size_t n_lines)
     355  {
     356    for (size_t i = 0; i < count; i++)
     357      {
     358        const randint j = randint_choose (s, n_lines);
     359        char *const *p = lines + j;
     360        size_t len = p[1] - p[0];
     361        if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
     362          return -1;
     363      }
     364  
     365    return 0;
     366  }
     367  
     368  int
     369  main (int argc, char **argv)
     370  {
     371    bool echo = false;
     372    bool input_range = false;
     373    size_t lo_input = SIZE_MAX;
     374    size_t hi_input = 0;
     375    size_t head_lines = SIZE_MAX;
     376    char const *outfile = nullptr;
     377    char *random_source = nullptr;
     378    char eolbyte = '\n';
     379    char **input_lines = nullptr;
     380    bool use_reservoir_sampling = false;
     381    bool repeat = false;
     382  
     383    int optc;
     384    int n_operands;
     385    char **operand;
     386    size_t n_lines;
     387    char **line = nullptr;
     388    struct linebuffer *reservoir = nullptr;
     389    struct randint_source *randint_source;
     390    size_t *permutation = nullptr;
     391    int i;
     392  
     393    initialize_main (&argc, &argv);
     394    set_program_name (argv[0]);
     395    setlocale (LC_ALL, "");
     396    bindtextdomain (PACKAGE, LOCALEDIR);
     397    textdomain (PACKAGE);
     398  
     399    atexit (close_stdout);
     400  
     401    while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, nullptr))
     402           != -1)
     403      switch (optc)
     404        {
     405        case 'e':
     406          echo = true;
     407          break;
     408  
     409        case 'i':
     410          {
     411            if (input_range)
     412              error (EXIT_FAILURE, 0, _("multiple -i options specified"));
     413            input_range = true;
     414  
     415            uintmax_t u;
     416            char *lo_end;
     417            strtol_error err = xstrtoumax (optarg, &lo_end, 10, &u, nullptr);
     418            if (err == LONGINT_OK)
     419              {
     420                lo_input = u;
     421                if (lo_input != u)
     422                  err = LONGINT_OVERFLOW;
     423                else if (*lo_end != '-')
     424                  err = LONGINT_INVALID;
     425                else
     426                  {
     427                    err = xstrtoumax (lo_end + 1, nullptr, 10, &u, "");
     428                    if (err == LONGINT_OK)
     429                      {
     430                        hi_input = u;
     431                        if (hi_input != u)
     432                          err = LONGINT_OVERFLOW;
     433                      }
     434                  }
     435              }
     436  
     437            n_lines = hi_input - lo_input + 1;
     438  
     439            if (err != LONGINT_OK || (lo_input <= hi_input) == (n_lines == 0))
     440              error (EXIT_FAILURE, err == LONGINT_OVERFLOW ? EOVERFLOW : 0,
     441                     "%s: %s", _("invalid input range"), quote (optarg));
     442          }
     443          break;
     444  
     445        case 'n':
     446          {
     447            uintmax_t argval;
     448            strtol_error e = xstrtoumax (optarg, nullptr, 10, &argval, "");
     449  
     450            if (e == LONGINT_OK)
     451              head_lines = MIN (head_lines, argval);
     452            else if (e != LONGINT_OVERFLOW)
     453              error (EXIT_FAILURE, 0, _("invalid line count: %s"),
     454                     quote (optarg));
     455          }
     456          break;
     457  
     458        case 'o':
     459          if (outfile && !STREQ (outfile, optarg))
     460            error (EXIT_FAILURE, 0, _("multiple output files specified"));
     461          outfile = optarg;
     462          break;
     463  
     464        case RANDOM_SOURCE_OPTION:
     465          if (random_source && !STREQ (random_source, optarg))
     466            error (EXIT_FAILURE, 0, _("multiple random sources specified"));
     467          random_source = optarg;
     468          break;
     469  
     470        case 'r':
     471          repeat = true;
     472          break;
     473  
     474        case 'z':
     475          eolbyte = '\0';
     476          break;
     477  
     478        case_GETOPT_HELP_CHAR;
     479        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
     480        default:
     481          usage (EXIT_FAILURE);
     482        }
     483  
     484    n_operands = argc - optind;
     485    operand = argv + optind;
     486  
     487    /* Check invalid usage.  */
     488    if (echo && input_range)
     489      {
     490        error (0, 0, _("cannot combine -e and -i options"));
     491        usage (EXIT_FAILURE);
     492      }
     493    if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
     494      {
     495        error (0, 0, _("extra operand %s"), quote (operand[!input_range]));
     496        usage (EXIT_FAILURE);
     497      }
     498  
     499    /* Prepare input.  */
     500    if (head_lines == 0)
     501      {
     502        n_lines = 0;
     503        line = nullptr;
     504      }
     505    else if (echo)
     506      {
     507        input_from_argv (operand, n_operands, eolbyte);
     508        n_lines = n_operands;
     509        line = operand;
     510      }
     511    else if (input_range)
     512      {
     513        n_lines = hi_input - lo_input + 1;
     514        line = nullptr;
     515      }
     516    else
     517      {
     518        /* If an input file is specified, re-open it as stdin.  */
     519        if (n_operands == 1
     520            && ! (STREQ (operand[0], "-")
     521                  || freopen (operand[0], "r", stdin)))
     522          error (EXIT_FAILURE, errno, "%s", quotef (operand[0]));
     523  
     524        fadvise (stdin, FADVISE_SEQUENTIAL);
     525  
     526        if (repeat || head_lines == SIZE_MAX
     527            || input_size () <= RESERVOIR_MIN_INPUT)
     528          {
     529            n_lines = read_input (stdin, eolbyte, &input_lines);
     530            line = input_lines;
     531          }
     532        else
     533          {
     534            use_reservoir_sampling = true;
     535            n_lines = SIZE_MAX;   /* unknown number of input lines, for now.  */
     536          }
     537      }
     538  
     539    /* The adjusted head line count; can be less than HEAD_LINES if the
     540       input is small and if not repeating.  */
     541    size_t ahead_lines = repeat || head_lines < n_lines ? head_lines : n_lines;
     542  
     543    randint_source = randint_all_new (random_source,
     544                                      (use_reservoir_sampling || repeat
     545                                       ? SIZE_MAX
     546                                       : randperm_bound (ahead_lines, n_lines)));
     547    if (! randint_source)
     548      error (EXIT_FAILURE, errno, "%s",
     549             quotef (random_source ? random_source : "getrandom"));
     550  
     551    if (use_reservoir_sampling)
     552      {
     553        /* Instead of reading the entire file into 'line',
     554           use reservoir-sampling to store just AHEAD_LINES random lines.  */
     555        n_lines = read_input_reservoir_sampling (stdin, eolbyte, ahead_lines,
     556                                                 randint_source, &reservoir);
     557        ahead_lines = n_lines;
     558      }
     559  
     560    /* Close stdin now, rather than earlier, so that randint_all_new
     561       doesn't have to worry about opening something other than
     562       stdin.  */
     563    if (! (head_lines == 0 || echo || input_range || fclose (stdin) == 0))
     564      error (EXIT_FAILURE, errno, _("read error"));
     565  
     566    if (!repeat)
     567      permutation = randperm_new (randint_source, ahead_lines, n_lines);
     568  
     569    if (outfile && ! freopen (outfile, "w", stdout))
     570      error (EXIT_FAILURE, errno, "%s", quotef (outfile));
     571  
     572    /* Generate output according to requested method */
     573    if (repeat)
     574      {
     575        if (head_lines == 0)
     576          i = 0;
     577        else
     578          {
     579            if (n_lines == 0)
     580              error (EXIT_FAILURE, 0, _("no lines to repeat"));
     581            if (input_range)
     582              i = write_random_numbers (randint_source, ahead_lines,
     583                                        lo_input, hi_input, eolbyte);
     584            else
     585              i = write_random_lines (randint_source, ahead_lines, line, n_lines);
     586          }
     587      }
     588    else
     589      {
     590        if (use_reservoir_sampling)
     591          i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
     592        else if (input_range)
     593          i = write_permuted_numbers (ahead_lines, lo_input,
     594                                      permutation, eolbyte);
     595        else
     596          i = write_permuted_lines (ahead_lines, line, permutation);
     597      }
     598  
     599    if (i != 0)
     600      write_error ();
     601  
     602    main_exit (EXIT_SUCCESS);
     603  }