1 /* GNU SED, a batch stream editor.
2 Copyright (C) 1989-2022 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; If not, see <https://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18 #include "basicdefs.h"
19 #include "dfa.h"
20 #include "localeinfo.h"
21 #include "regex.h"
22 #include <stdio.h>
23 #include "unlocked-io.h"
24
25 #include "utils.h"
26
/* A compiled sed program is described by a struct vector: an array of
   commands together with its capacity and its current length.  */
struct vector
{
  struct sed_cmd *v;            /* dynamically allocated array */
  size_t v_allocated;           /* slots allocated in the array */
  size_t v_length;              /* slots of the array that are in use */
};
33
/* Bookkeeping for a file used by sed, kept so that all such files may
   be closed cleanly at normal program termination.  MISSING_NEWLINE
   records that a missing newline was encountered; the newline is then
   added on the next line so that the two lines are not concatenated.  */
struct output
{
  char *name;
  bool missing_newline;
  FILE *fp;
  /* Another struct output.  NOTE(review): presumably the next entry in
     the list of tracked files -- confirm against the users of LINK.  */
  struct output *link;
};
44
/* A text pointer paired with a length.  */
struct text_buf
{
  char *text;
  size_t text_length;
};
49
/* A regular expression as used by sed: a regex_t, an optional
   struct dfa, and assorted flags.  */
struct regex
{
  regex_t pattern;
  int flags;
  size_t sz;                    /* NOTE(review): presumably the size of RE -- confirm */
  struct dfa *dfa;
  bool begline;
  bool endline;
  /* Declared with one element on purpose; sizeof (struct regex)
     therefore includes one byte of RE.  NOTE(review): presumably the
     allocator reserves extra room here for the pattern text -- confirm
     before changing this declaration.  */
  char re[1];
};
59
/* A file name plus the append/prepend choice that goes with it.  */
struct readcmd
{
  char *fname;
  /* true means append (the default);
     false means prepend (a GNU extension).  */
  bool append;
};
64
/* Replacement types.  Each basic type other than REPL_ASIS occupies its
   own bit, so that a *_FIRST modifier can be OR-ed with REPL_UPPERCASE
   or REPL_LOWERCASE.  */
enum replacement_types
{
  REPL_ASIS = 0,
  REPL_UPPERCASE = 1 << 0,
  REPL_LOWERCASE = 1 << 1,
  REPL_UPPERCASE_FIRST = 1 << 2,
  REPL_LOWERCASE_FIRST = 1 << 3,

  /* Mask covering both *_FIRST bits.  */
  REPL_MODIFIERS = REPL_UPPERCASE_FIRST | REPL_LOWERCASE_FIRST,

  /* Named combinations; these are given to aid in debugging.  */
  REPL_UPPERCASE_UPPERCASE = REPL_UPPERCASE_FIRST | REPL_UPPERCASE,
  REPL_UPPERCASE_LOWERCASE = REPL_UPPERCASE_FIRST | REPL_LOWERCASE,
  REPL_LOWERCASE_UPPERCASE = REPL_LOWERCASE_FIRST | REPL_UPPERCASE,
  REPL_LOWERCASE_LOWERCASE = REPL_LOWERCASE_FIRST | REPL_LOWERCASE
};
79
/* Kinds of text; passed as the BUFTYPE argument of normalize_text.  */
enum text_types
{
  TEXT_BUFFER = 0,
  TEXT_REPLACEMENT = 1,
  TEXT_REGEX = 2
};
85
/* Degrees of POSIX conformance.  */
enum posixicity_types
{
  /* With GNU extensions.  */
  POSIXLY_EXTENDED = 0,
  /* With POSIX-compatible GNU extensions.  */
  POSIXLY_CORRECT = 1,
  /* Pedantically POSIX.  */
  POSIXLY_BASIC = 2
};
91
/* State of an address range.  */
enum addr_state
{
  /* The range has never been active.  */
  RANGE_INACTIVE = 0,
  /* Between the first and the second address.  */
  RANGE_ACTIVE = 1,
  /* Like RANGE_INACTIVE, but the range has ended once.  */
  RANGE_CLOSED = 2
};
97
/* Kinds of address; the kind determines which members of struct addr
   are valid.  */
enum addr_types
{
  /* Null address.  */
  ADDR_IS_NULL = 0,
  /* addr_regex is valid.  */
  ADDR_IS_REGEX = 1,
  /* addr_number is valid.  */
  ADDR_IS_NUM = 2,
  /* addr_number is valid, and addr_step is the modulo.  */
  ADDR_IS_NUM_MOD = 3,
  /* Address is +N (only valid for addr2).  */
  ADDR_IS_STEP = 4,
  /* Address is ~N (only valid for addr2).  */
  ADDR_IS_STEP_MOD = 5,
  /* Address is $.  */
  ADDR_IS_LAST = 6
};
107
108 struct addr {
109 enum addr_types addr_type;
110 countT addr_number;
111 countT addr_step;
112 struct regex *addr_regex;
113 };
114
115
116 struct replacement {
117 char *prefix;
118 size_t prefix_length;
119 int subst_id;
120 enum replacement_types repl_type;
121 struct replacement *next;
122 };
123
124 struct subst {
125 struct regex *regx;
126 struct replacement *replacement;
127 countT numb; /* if >0, only substitute for match number "numb" */
128 struct output *outf; /* 'w' option given */
129 unsigned global : 1; /* 'g' option given */
130 unsigned print : 2; /* 'p' option given (before/after eval) */
131 unsigned eval : 1; /* 'e' option given */
132 unsigned max_id : 4; /* maximum backreference on the RHS */
133 #ifdef lint
134 char* replacement_buffer;
135 #endif
136 };
137
138
139
140
141 struct sed_cmd {
142 struct addr *a1; /* save space: usually is NULL */
143 struct addr *a2;
144
145 /* See description the enum, above. */
146 enum addr_state range_state;
147
148 /* Non-zero if command is to be applied to non-matches. */
149 char addr_bang;
150
151 /* The actual command character. */
152 char cmd;
153
154 /* auxiliary data for various commands */
155 union {
156 /* This structure is used for a, i, and c commands. */
157 struct text_buf cmd_txt;
158
159 /* This is used for the l, q and Q commands. */
160 int int_arg;
161
162 /* This is used for the {}, b, and t commands. */
163 countT jump_index;
164
165 /* This is used for the r command. */
166 struct readcmd readcmd;
167
168 /* This is used for the hairy s command. */
169 struct subst *cmd_subst;
170
171 /* This is used for the w command. */
172 struct output *outf;
173
174 /* This is used for the R command.
175 (despite the struct name, it is used for both in and out files). */
176 struct output *inf;
177
178 /* This is used for the y command. */
179 unsigned char *translate;
180 char **translatemb;
181
182 /* This is used for the ':' command (debug only). */
183 char* label_name;
184 } x;
185 };
186
187
/* Functions that are only declared in this header; their definitions
   live elsewhere.  bad_prog is declared _Noreturn.  */
_Noreturn void bad_prog (const char *why);
size_t normalize_text (char *text, size_t len, enum text_types buftype);
struct vector *compile_string (struct vector *prog, char *str, size_t len);
struct vector *compile_file (struct vector *prog, const char *cmdfile);
void check_final_program (struct vector *prog);
void rewind_read_files (void);
void finish_program (struct vector *prog);
195
/* Regex functions; declarations only.  release_regex is declared only
   when `lint' is defined.  */
struct regex *compile_regex (struct buffer *b, int flags, int needed_sub);
int match_regex (struct regex *regex, char *buf, size_t buflen,
                 size_t buf_start_offset, struct re_registers *regarray,
                 int regsize);
#if defined lint
void release_regex (struct regex *regex);
#endif
203
/* Debug-printing functions; declarations only.  */
void debug_print_command (const struct vector *program,
                          const struct sed_cmd *sc);
void debug_print_program (const struct vector *program);
void debug_print_char (char c);
210
/* Declarations only: process_files, and the program entry point.  */
int process_files (struct vector *prog, char **argv);

int main (int argc, char **argv);
214
215 extern struct localeinfo localeinfo;
216
217 extern int extended_regexp_flags;
218
219 /* one-byte buffer delimiter */
220 extern char buffer_delimiter;
221
222 /* If set, fflush(stdout) on every line output,
223 and turn off stream buffering on inputs. */
224 extern bool unbuffered;
225
226 /* If set, don't write out the line unless explicitly told to. */
227 extern bool no_default_output;
228
229 /* If set, reset line counts on every new file. */
230 extern bool separate_files;
231
232 /* If set, follow symlinks when invoked with -i option */
233 extern bool follow_symlinks;
234
235 /* Do we need to be pedantically POSIX compliant? */
236 extern enum posixicity_types posixicity;
237
238 /* How long should the `l' command's output line be? */
239 extern countT lcmd_out_line_len;
240
241 /* How do we edit files in-place? (we don't if NULL) */
242 extern char *in_place_extension;
243
244 /* The mode to use to read and write files, either "rt"/"w" or "rb"/"wb". */
245 extern char const *read_mode;
246 extern char const *write_mode;
247
248 /* Should we use EREs? */
249 extern bool use_extended_syntax_p;
250
251 /* Declarations for multibyte character sets. */
252 extern int mb_cur_max;
253 extern bool is_utf8;
254
255 /* If set, operate in 'sandbox' mode - disable e/r/w commands */
256 extern bool sandbox;
257
258 /* If set, print debugging information. */
259 extern bool debug;
260
/* Multibyte helper macros.  Every one of them tests mb_cur_max first:
   when it equals 1 the single-byte branch is taken and the function in
   the other branch is not called.  */

/* Single-byte branch: store btowc of the byte at S in *PWC and yield 1.
   Otherwise: mbrtowc (PWC, S, N, PS).  */
#define MBRTOWC(pwc, s, n, ps)                                  \
  (mb_cur_max == 1                                              \
   ? (*(pwc) = btowc (*(unsigned char *) (s)), 1)               \
   : mbrtowc ((pwc), (s), (n), (ps)))

/* Single-byte branch: store wctob of WC in *S and yield 1.
   Otherwise: wcrtomb (S, WC, PS).  */
#define WCRTOMB(s, wc, ps)                                      \
  (mb_cur_max == 1                                              \
   ? (*(s) = wctob ((wint_t) (wc)), 1)                          \
   : wcrtomb ((s), (wc), (ps)))

/* Single-byte branch: 1.  Otherwise: mbsinit (S).  */
#define MBSINIT(s)                                              \
  (mb_cur_max == 1 ? 1 : mbsinit ((s)))

/* Single-byte branch: 1.  Otherwise: mbrtowc with a null destination.  */
#define MBRLEN(s, n, ps)                                        \
  (mb_cur_max == 1 ? 1 : mbrtowc (NULL, (s), (n), (ps)))

/* Single-byte branch: 0.  Otherwise: is_mb_char (CH, PS).  */
#define IS_MB_CHAR(ch, ps)                                      \
  (mb_cur_max == 1 ? 0 : is_mb_char ((ch), (ps)))
279
/* Multibyte support functions; declarations only.  */
int is_mb_char (int ch, mbstate_t *ps);
void initialize_mbcs (void);
282
/* Use this to suppress gcc's '...may be used before initialized'
   warnings: CODE is kept when `lint' is defined and expands to nothing
   otherwise.  */
#ifndef lint
# define IF_LINT(Code) /* empty */
#else
# define IF_LINT(Code) Code
#endif
289
/* FALLTHROUGH, unless it is already defined: the fallthrough statement
   attribute when __GNUC__ is at least 7, and a no-op expression
   otherwise (including when __GNUC__ is not defined at all).  */
#ifndef FALLTHROUGH
# if defined __GNUC__ && 7 <= __GNUC__
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# else
#  define FALLTHROUGH ((void) 0)
# endif
#endif