(root)/
coreutils-9.4/
src/
cut.c
       1  /* cut - remove parts of lines of files
       2     Copyright (C) 1997-2023 Free Software Foundation, Inc.
       3     Copyright (C) 1984 David M. Ihnat
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  /* Written by David Ihnat.  */
      19  
      20  /* POSIX changes, bug fixes, long-named options, and cleanup
      21     by David MacKenzie <djm@gnu.ai.mit.edu>.
      22  
      23     Rewrite cut_fields and cut_bytes -- Jim Meyering.  */
      24  
      25  #include <config.h>
      26  
      27  #include <stdio.h>
      28  #include <getopt.h>
      29  #include <sys/types.h>
      30  #include "system.h"
      31  
      32  #include "assure.h"
      33  #include "fadvise.h"
      34  #include "getndelim2.h"
      35  
      36  #include "set-fields.h"
      37  
/* The official name of this program (e.g., no 'g' prefix).
   Passed to emit_ancillary_info in usage and to the --version
   handler in main.  */
#define PROGRAM_NAME "cut"

/* Comma-separated list of author names.  It expands to several
   arguments, so it is only meaningful as the tail of an argument list
   (main hands it to case_GETOPT_VERSION_CHAR).  */
#define AUTHORS \
  proper_name ("David M. Ihnat"), \
  proper_name ("David MacKenzie"), \
  proper_name ("Jim Meyering")
      45  
      46  #define FATAL_ERROR(Message)						\
      47    do									\
      48      {									\
      49        error (0, 0, (Message));						\
      50        usage (EXIT_FAILURE);						\
      51      }									\
      52    while (0)
      53  
      54  
/* Cursor into the array of selected ranges, FRP (not declared in this
   file; presumably provided by "set-fields.h").  It is reset to FRP at
   the start of every input line and advanced by next_item.  When
   checking if a byte or field is selected by a finite range, we check
   if it is between CURRENT_RP.LO and CURRENT_RP.HI.  If the byte or
   field index is greater than CURRENT_RP.HI then we make CURRENT_RP
   point to the next range pair.  */
static struct field_range_pair *current_rp;

/* This buffer is used to support the semantics of the -s option
   (or lack of same) when the specified field list includes (does
   not include) the first field.  In both of those cases, the entire
   first field must be read into this buffer to determine whether it
   is followed by a delimiter or a newline before any of it may be
   output.  Otherwise, cut_fields can do the job without using this
   buffer.  The buffer is filled by getndelim2 and is freed (and the
   pointer reset to null) by cut_fields when that call fails.  */
static char *field_1_buffer;

/* The number of bytes allocated for FIELD_1_BUFFER, as maintained by
   getndelim2.
   NOTE(review): this is not reset when cut_fields frees the buffer;
   presumably getndelim2 ignores the size when handed a null buffer --
   confirm against its documentation.  */
static size_t field_1_bufsize;

/* If true, do not output lines containing no delimiter characters.
   Otherwise, all such lines are printed.  This option is valid only
   with field mode; main rejects -s together with -b or -c.  */
static bool suppress_non_delimited;

/* If true, print all bytes, characters, or fields _except_
   those that were specified.  Set by --complement and forwarded to
   set_fields as SETFLD_COMPLEMENT.  */
static bool complement;

/* The delimiter character for field mode.  TAB unless -d was given;
   an empty -d argument selects the NUL byte.  */
static unsigned char delim;

/* The delimiter for each line/record: newline, or NUL with -z.  */
static unsigned char line_delim = '\n';

/* The length of output_delimiter_string in bytes.  It is 1 both for
   the default delimiter and for an empty --output-delimiter argument
   (which stands for a single NUL byte).  */
static size_t output_delimiter_length;

/* The output field separator string.  Defaults to the 1-character
   string consisting of the input delimiter.  cut_bytes compares this
   pointer with OUTPUT_DELIMITER_DEFAULT to tell whether the user asked
   for an explicit output delimiter.  */
static char *output_delimiter_string;

/* The output delimiter string contents, if the default.  Not
   NUL-terminated: it is always written with an explicit length.  */
static char output_delimiter_default[1];

/* True if we have ever read standard input.  Set by cut_file; main
   uses it to decide whether stdin must be closed (and checked for
   errors) before exiting.  */
static bool have_read_stdin;
     100  
/* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.
   These values are what getopt_long returns for the corresponding
   LONGOPTS entries, so they can never collide with a real option
   character.  */
enum
{
  OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,
  COMPLEMENT_OPTION
};
     108  
/* Long-option table handed to getopt_long in main.  Each entry maps a
   long name to the value returned by getopt_long: the matching short
   option character, or one of the pseudo short options above for
   --output-delimiter and --complement.  The all-zero entry terminates
   the table.  */
static struct option const longopts[] =
{
  {"bytes", required_argument, nullptr, 'b'},
  {"characters", required_argument, nullptr, 'c'},
  {"fields", required_argument, nullptr, 'f'},
  {"delimiter", required_argument, nullptr, 'd'},
  {"only-delimited", no_argument, nullptr, 's'},
  {"output-delimiter", required_argument, nullptr, OUTPUT_DELIMITER_OPTION},
  {"complement", no_argument, nullptr, COMPLEMENT_OPTION},
  {"zero-terminated", no_argument, nullptr, 'z'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {nullptr, 0, nullptr, 0}
};
     123  
     124  void
     125  usage (int status)
     126  {
     127    if (status != EXIT_SUCCESS)
     128      emit_try_help ();
     129    else
     130      {
     131        printf (_("\
     132  Usage: %s OPTION... [FILE]...\n\
     133  "),
     134                program_name);
     135        fputs (_("\
     136  Print selected parts of lines from each FILE to standard output.\n\
     137  "), stdout);
     138  
     139        emit_stdin_note ();
     140        emit_mandatory_arg_note ();
     141  
     142        fputs (_("\
     143    -b, --bytes=LIST        select only these bytes\n\
     144    -c, --characters=LIST   select only these characters\n\
     145    -d, --delimiter=DELIM   use DELIM instead of TAB for field delimiter\n\
     146  "), stdout);
     147        fputs (_("\
     148    -f, --fields=LIST       select only these fields;  also print any line\n\
     149                              that contains no delimiter character, unless\n\
     150                              the -s option is specified\n\
     151    -n                      (ignored)\n\
     152  "), stdout);
     153        fputs (_("\
     154        --complement        complement the set of selected bytes, characters\n\
     155                              or fields\n\
     156  "), stdout);
     157        fputs (_("\
     158    -s, --only-delimited    do not print lines not containing delimiters\n\
     159        --output-delimiter=STRING  use STRING as the output delimiter\n\
     160                              the default is to use the input delimiter\n\
     161  "), stdout);
     162        fputs (_("\
     163    -z, --zero-terminated   line delimiter is NUL, not newline\n\
     164  "), stdout);
     165        fputs (HELP_OPTION_DESCRIPTION, stdout);
     166        fputs (VERSION_OPTION_DESCRIPTION, stdout);
     167        fputs (_("\
     168  \n\
     169  Use one, and only one of -b, -c or -f.  Each LIST is made up of one\n\
     170  range, or many ranges separated by commas.  Selected input is written\n\
     171  in the same order that it is read, and is written exactly once.\n\
     172  "), stdout);
     173        fputs (_("\
     174  Each range is one of:\n\
     175  \n\
     176    N     N'th byte, character or field, counted from 1\n\
     177    N-    from N'th byte, character or field, to end of line\n\
     178    N-M   from N'th to M'th (included) byte, character or field\n\
     179    -M    from first to M'th (included) byte, character or field\n\
     180  "), stdout);
     181        emit_ancillary_info (PROGRAM_NAME);
     182      }
     183    exit (status);
     184  }
     185  
     186  
     187  /* Increment *ITEM_IDX (i.e., a field or byte index),
     188     and if required CURRENT_RP.  */
     189  
     190  static inline void
     191  next_item (uintmax_t *item_idx)
     192  {
     193    (*item_idx)++;
     194    if ((*item_idx) > current_rp->hi)
     195      current_rp++;
     196  }
     197  
     198  /* Return nonzero if the K'th field or byte is printable.  */
     199  
     200  static inline bool
     201  print_kth (uintmax_t k)
     202  {
     203    return current_rp->lo <= k;
     204  }
     205  
     206  /* Return nonzero if K'th byte is the beginning of a range.  */
     207  
     208  static inline bool
     209  is_range_start_index (uintmax_t k)
     210  {
     211    return k == current_rp->lo;
     212  }
     213  
     214  /* Read from stream STREAM, printing to standard output any selected bytes.  */
     215  
     216  static void
     217  cut_bytes (FILE *stream)
     218  {
     219    uintmax_t byte_idx;	/* Number of bytes in the line so far.  */
     220    /* Whether to begin printing delimiters between ranges for the current line.
     221       Set after we've begun printing data corresponding to the first range.  */
     222    bool print_delimiter;
     223  
     224    byte_idx = 0;
     225    print_delimiter = false;
     226    current_rp = frp;
     227    while (true)
     228      {
     229        int c;		/* Each character from the file.  */
     230  
     231        c = getc (stream);
     232  
     233        if (c == line_delim)
     234          {
     235            if (putchar (c) < 0)
     236              write_error ();
     237            byte_idx = 0;
     238            print_delimiter = false;
     239            current_rp = frp;
     240          }
     241        else if (c == EOF)
     242          {
     243            if (byte_idx > 0)
     244            {
     245              if (putchar (line_delim) < 0)
     246                write_error ();
     247            }
     248            break;
     249          }
     250        else
     251          {
     252            next_item (&byte_idx);
     253            if (print_kth (byte_idx))
     254              {
     255                if (output_delimiter_string != output_delimiter_default)
     256                  {
     257                    if (print_delimiter && is_range_start_index (byte_idx))
     258                      {
     259                        if (fwrite (output_delimiter_string, sizeof (char),
     260                                    output_delimiter_length, stdout)
     261                            != output_delimiter_length)
     262                          write_error ();
     263                      }
     264                    print_delimiter = true;
     265                  }
     266  
     267                if (putchar (c) < 0)
     268                  write_error ();
     269              }
     270          }
     271      }
     272  }
     273  
/* Read from stream STREAM, printing to standard output any selected fields.

   Fields are separated by DELIM and lines by LINE_DELIM.  Selected
   fields of a line are joined with OUTPUT_DELIMITER_STRING and every
   output line is terminated with LINE_DELIM.  A line containing no
   DELIM is printed whole unless SUPPRESS_NON_DELIMITED is set.
   Output failures are reported through write_error.  */

static void
cut_fields (FILE *stream)
{
  int c;	/* Each character from the file.  */
  uintmax_t field_idx = 1;	/* 1-based index of the field being read.  */
  /* True once a selected field of the current line has been output,
     i.e. the next selected field must be preceded by the output
     delimiter.  */
  bool found_any_selected_field = false;
  bool buffer_first_field;

  current_rp = frp;

  /* Peek at the first byte: an empty stream produces no output at all.  */
  c = getc (stream);
  if (c == EOF)
    return;

  ungetc (c, stream);
  /* Start with a value that is not EOF; it seeds PREV_C below.  */
  c = 0;

  /* To support the semantics of the -s flag, we may have to buffer
     all of the first field to determine whether it is 'delimited.'
     But that is unnecessary if all non-delimited lines must be printed
     and the first field has been selected, or if non-delimited lines
     must be suppressed and the first field has *not* been selected.
     That is because a non-delimited line has exactly one field.  */
  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));

  while (true)
    {
      if (field_idx == 1 && buffer_first_field)
        {
          ssize_t len;
          size_t n_bytes;

          /* Read up to and including the first DELIM or LINE_DELIM.  */
          len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
                            GETNLINE_NO_LIMIT, delim, line_delim, stream);
          if (len < 0)
            {
              free (field_1_buffer);
              field_1_buffer = nullptr;
              /* A read error or end of input ends processing of this
                 stream; any other failure is treated as memory
                 exhaustion.  */
              if (ferror (stream) || feof (stream))
                break;
              xalloc_die ();
            }

          n_bytes = len;
          affirm (n_bytes != 0);

          c = 0;

          /* If the first field extends to the end of line (it is not
             delimited) and we are printing all non-delimited lines,
             print this one.  */
          if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
            {
              if (suppress_non_delimited)
                {
                  /* Empty.  */
                }
              else
                {
                  if (fwrite (field_1_buffer, sizeof (char), n_bytes, stdout)
                      != n_bytes)
                    write_error ();
                  /* Make sure the output line is newline terminated.  */
                  if (field_1_buffer[n_bytes - 1] != line_delim)
                    {
                      if (putchar (line_delim) < 0)
                        write_error ();
                    }
                  c = line_delim;
                }
              /* The whole line has been consumed; FIELD_IDX is still 1,
                 so the next iteration buffers the next line.  */
              continue;
            }

          if (print_kth (1))
            {
              /* Print the field, but not the trailing delimiter.  */
              if (fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout)
                  != n_bytes - 1)
                write_error ();

              /* With -d$'\n' don't treat the last '\n' as a delimiter.  */
              if (delim == line_delim)
                {
                  /* Peek: only if more input follows does this field
                     count as one that needs a separator after it.  */
                  int last_c = getc (stream);
                  if (last_c != EOF)
                    {
                      ungetc (last_c, stream);
                      found_any_selected_field = true;
                    }
                }
              else
                {
                  found_any_selected_field = true;
                }
            }
          next_item (&field_idx);
        }

      /* The byte that ended the previous field scan; updated below to
         the last byte consumed while scanning the current field.  It is
         consulted at end of input to avoid emitting a second line
         terminator.  */
      int prev_c = c;

      if (print_kth (field_idx))
        {
          if (found_any_selected_field)
            {
              if (fwrite (output_delimiter_string, sizeof (char),
                          output_delimiter_length, stdout)
                  != output_delimiter_length)
                write_error ();
            }
          found_any_selected_field = true;

          /* Copy the field's bytes up to the next delimiter.  */
          while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
            {
              if (putchar (c) < 0)
                write_error ();
              prev_c = c;
            }
        }
      else
        {
          /* Skip an unselected field.  */
          while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
            prev_c = c;
        }

      /* With -d$'\n' don't treat the last '\n' as a delimiter.  */
      if (delim == line_delim && c == delim)
        {
          int last_c = getc (stream);
          if (last_c != EOF)
            ungetc (last_c, stream);
          else
            c = last_c;
        }

      if (c == delim)
        next_item (&field_idx);
      else if (c == line_delim || c == EOF)
        {
          /* End of line or input: terminate the output line unless the
             line had no delimiter and such lines are suppressed.  */
          if (found_any_selected_field
              || !(suppress_non_delimited && field_idx == 1))
            {
              /* Make sure the output line is newline terminated.  */
              if (c == line_delim || prev_c != line_delim
                  || delim == line_delim)
                {
                  if (putchar (line_delim) < 0)
                    write_error ();
                }
            }
          if (c == EOF)
            break;

          /* Start processing the next input line.  */
          field_idx = 1;
          current_rp = frp;
          found_any_selected_field = false;
        }
    }
}
     435  
     436  /* Process file FILE to standard output, using CUT_STREAM.
     437     Return true if successful.  */
     438  
     439  static bool
     440  cut_file (char const *file, void (*cut_stream) (FILE *))
     441  {
     442    FILE *stream;
     443  
     444    if (STREQ (file, "-"))
     445      {
     446        have_read_stdin = true;
     447        stream = stdin;
     448        assume (stream);  /* Pacify GCC bug#109613.  */
     449      }
     450    else
     451      {
     452        stream = fopen (file, "r");
     453        if (stream == nullptr)
     454          {
     455            error (0, errno, "%s", quotef (file));
     456            return false;
     457          }
     458      }
     459  
     460    fadvise (stream, FADVISE_SEQUENTIAL);
     461  
     462    cut_stream (stream);
     463  
     464    int err = errno;
     465    if (!ferror (stream))
     466      err = 0;
     467    if (STREQ (file, "-"))
     468      clearerr (stream);		/* Also clear EOF.  */
     469    else if (fclose (stream) == EOF)
     470      err = errno;
     471    if (err)
     472      {
     473        error (0, err, "%s", quotef (file));
     474        return false;
     475      }
     476    return true;
     477  }
     478  
     479  int
     480  main (int argc, char **argv)
     481  {
     482    int optc;
     483    bool ok;
     484    bool delim_specified = false;
     485    bool byte_mode = false;
     486    char *spec_list_string = nullptr;
     487  
     488    initialize_main (&argc, &argv);
     489    set_program_name (argv[0]);
     490    setlocale (LC_ALL, "");
     491    bindtextdomain (PACKAGE, LOCALEDIR);
     492    textdomain (PACKAGE);
     493  
     494    atexit (close_stdout);
     495  
     496    /* By default, all non-delimited lines are printed.  */
     497    suppress_non_delimited = false;
     498  
     499    delim = '\0';
     500    have_read_stdin = false;
     501  
     502    while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, nullptr))
     503           != -1)
     504      {
     505        switch (optc)
     506          {
     507          case 'b':
     508          case 'c':
     509            /* Build the byte list.  */
     510            byte_mode = true;
     511            FALLTHROUGH;
     512          case 'f':
     513            /* Build the field list.  */
     514            if (spec_list_string)
     515              FATAL_ERROR (_("only one list may be specified"));
     516            spec_list_string = optarg;
     517            break;
     518  
     519          case 'd':
     520            /* New delimiter.  */
     521            /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
     522            if (optarg[0] != '\0' && optarg[1] != '\0')
     523              FATAL_ERROR (_("the delimiter must be a single character"));
     524            delim = optarg[0];
     525            delim_specified = true;
     526            break;
     527  
     528          case OUTPUT_DELIMITER_OPTION:
     529            /* Interpret --output-delimiter='' to mean
     530               'use the NUL byte as the delimiter.'  */
     531            output_delimiter_length = (optarg[0] == '\0'
     532                                       ? 1 : strlen (optarg));
     533            output_delimiter_string = optarg;
     534            break;
     535  
     536          case 'n':
     537            break;
     538  
     539          case 's':
     540            suppress_non_delimited = true;
     541            break;
     542  
     543          case 'z':
     544            line_delim = '\0';
     545            break;
     546  
     547          case COMPLEMENT_OPTION:
     548            complement = true;
     549            break;
     550  
     551          case_GETOPT_HELP_CHAR;
     552          case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
     553          default:
     554            usage (EXIT_FAILURE);
     555          }
     556      }
     557  
     558    if (!spec_list_string)
     559      FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
     560  
     561    if (byte_mode)
     562      {
     563        if (delim_specified)
     564          FATAL_ERROR (_("an input delimiter may be specified only\
     565   when operating on fields"));
     566  
     567        if (suppress_non_delimited)
     568          FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
     569  \tonly when operating on fields"));
     570      }
     571  
     572    set_fields (spec_list_string,
     573                ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0)
     574                 | (complement ? SETFLD_COMPLEMENT : 0)));
     575  
     576    if (!delim_specified)
     577      delim = '\t';
     578  
     579    if (output_delimiter_string == nullptr)
     580      {
     581        output_delimiter_default[0] = delim;
     582        output_delimiter_string = output_delimiter_default;
     583        output_delimiter_length = 1;
     584      }
     585  
     586    void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields;
     587    if (optind == argc)
     588      ok = cut_file ("-", cut_stream);
     589    else
     590      for (ok = true; optind < argc; optind++)
     591        ok &= cut_file (argv[optind], cut_stream);
     592  
     593  
     594    if (have_read_stdin && fclose (stdin) == EOF)
     595      {
     596        error (0, errno, "-");
     597        ok = false;
     598      }
     599  
     600    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
     601  }