1 /* cut - remove parts of lines of files
2 Copyright (C) 1997-2023 Free Software Foundation, Inc.
3 Copyright (C) 1984 David M. Ihnat
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 /* Written by David Ihnat. */
19
20 /* POSIX changes, bug fixes, long-named options, and cleanup
21 by David MacKenzie <djm@gnu.ai.mit.edu>.
22
23 Rewrite cut_fields and cut_bytes -- Jim Meyering. */
24
25 #include <config.h>
26
27 #include <stdio.h>
28 #include <getopt.h>
29 #include <sys/types.h>
30 #include "system.h"
31
32 #include "assure.h"
33 #include "fadvise.h"
34 #include "getndelim2.h"
35
36 #include "set-fields.h"
37
38 /* The official name of this program (e.g., no 'g' prefix). */
39 #define PROGRAM_NAME "cut"
40
41 #define AUTHORS \
42 proper_name ("David M. Ihnat"), \
43 proper_name ("David MacKenzie"), \
44 proper_name ("Jim Meyering")
45
/* Report MESSAGE as a diagnostic through error (), then call
   usage (EXIT_FAILURE).  In this file usage () finishes by calling
   exit (), so control does not continue past the macro.

   MESSAGE is handed to error () as the argument of a "%s" format rather
   than as the format itself, so a '%' occurring in the (possibly
   translated) text is printed literally instead of being interpreted
   as a conversion specification.  */
#define FATAL_ERROR(Message)						\
  do									\
    {									\
      error (0, 0, "%s", (Message));					\
      usage (EXIT_FAILURE);						\
    }									\
  while (0)
53
54
/* Pointer into the list of range pairs that starts at FRP.  When
   checking whether a byte or field is selected by a finite range, its
   index is compared against CURRENT_RP->lo and CURRENT_RP->hi.  Once
   the index becomes greater than CURRENT_RP->hi, next_item advances
   CURRENT_RP to the following range pair.  The cut_* functions reset it
   to FRP at the start of every input line.  */
static struct field_range_pair *current_rp;

/* This buffer is used to support the semantics of the -s option
   (or lack of same) when the specified field list includes (does
   not include) the first field.  In both of those cases, the entire
   first field must be read into this buffer to determine whether it
   is followed by a delimiter or a newline before any of it may be
   output.  Otherwise, cut_fields can do the job without using this
   buffer.  */
static char *field_1_buffer;

/* The number of bytes allocated for FIELD_1_BUFFER.  Passed together
   with FIELD_1_BUFFER to getndelim2 in cut_fields.  */
static size_t field_1_bufsize;

/* If true, do not output lines containing no delimiter characters.
   Otherwise, all such lines are printed.  This option is valid only
   with field mode.  Set by -s.  */
static bool suppress_non_delimited;

/* If true, print all bytes, characters, or fields _except_
   those that were specified.  Set by --complement and forwarded to
   set_fields as SETFLD_COMPLEMENT.  */
static bool complement;

/* The delimiter character for field mode.  Taken from -d; main falls
   back to TAB when -d is not given.  */
static unsigned char delim;

/* The delimiter for each line/record.  Newline unless -z switches it
   to NUL.  */
static unsigned char line_delim = '\n';

/* The length of output_delimiter_string.  Kept separately because the
   string may consist of a single NUL byte.  */
static size_t output_delimiter_length;

/* The output field separator string.  Defaults to the 1-character
   string consisting of the input delimiter.  */
static char *output_delimiter_string;

/* The output delimiter string contents, if the default.  cut_bytes
   compares OUTPUT_DELIMITER_STRING against this array to tell whether
   --output-delimiter was given.  */
static char output_delimiter_default[1];

/* True if we have ever read standard input.  Set by cut_file; main
   uses it to decide whether stdin must be closed.  */
static bool have_read_stdin;
100
/* Pseudo short-option codes for the long options that have no
   single-letter equivalent.  The values begin just above CHAR_MAX so
   that they can never coincide with a real option character.  */
enum
{
  OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,
  COMPLEMENT_OPTION = OUTPUT_DELIMITER_OPTION + 1
};
108
109 static struct option const longopts[] =
110 {
111 {"bytes", required_argument, nullptr, 'b'},
112 {"characters", required_argument, nullptr, 'c'},
113 {"fields", required_argument, nullptr, 'f'},
114 {"delimiter", required_argument, nullptr, 'd'},
115 {"only-delimited", no_argument, nullptr, 's'},
116 {"output-delimiter", required_argument, nullptr, OUTPUT_DELIMITER_OPTION},
117 {"complement", no_argument, nullptr, COMPLEMENT_OPTION},
118 {"zero-terminated", no_argument, nullptr, 'z'},
119 {GETOPT_HELP_OPTION_DECL},
120 {GETOPT_VERSION_OPTION_DECL},
121 {nullptr, 0, nullptr, 0}
122 };
123
124 void
usage(int status)125 usage (int status)
126 {
127 if (status != EXIT_SUCCESS)
128 emit_try_help ();
129 else
130 {
131 printf (_("\
132 Usage: %s OPTION... [FILE]...\n\
133 "),
134 program_name);
135 fputs (_("\
136 Print selected parts of lines from each FILE to standard output.\n\
137 "), stdout);
138
139 emit_stdin_note ();
140 emit_mandatory_arg_note ();
141
142 fputs (_("\
143 -b, --bytes=LIST select only these bytes\n\
144 -c, --characters=LIST select only these characters\n\
145 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
146 "), stdout);
147 fputs (_("\
148 -f, --fields=LIST select only these fields; also print any line\n\
149 that contains no delimiter character, unless\n\
150 the -s option is specified\n\
151 -n (ignored)\n\
152 "), stdout);
153 fputs (_("\
154 --complement complement the set of selected bytes, characters\n\
155 or fields\n\
156 "), stdout);
157 fputs (_("\
158 -s, --only-delimited do not print lines not containing delimiters\n\
159 --output-delimiter=STRING use STRING as the output delimiter\n\
160 the default is to use the input delimiter\n\
161 "), stdout);
162 fputs (_("\
163 -z, --zero-terminated line delimiter is NUL, not newline\n\
164 "), stdout);
165 fputs (HELP_OPTION_DESCRIPTION, stdout);
166 fputs (VERSION_OPTION_DESCRIPTION, stdout);
167 fputs (_("\
168 \n\
169 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
170 range, or many ranges separated by commas. Selected input is written\n\
171 in the same order that it is read, and is written exactly once.\n\
172 "), stdout);
173 fputs (_("\
174 Each range is one of:\n\
175 \n\
176 N N'th byte, character or field, counted from 1\n\
177 N- from N'th byte, character or field, to end of line\n\
178 N-M from N'th to M'th (included) byte, character or field\n\
179 -M from first to M'th (included) byte, character or field\n\
180 "), stdout);
181 emit_ancillary_info (PROGRAM_NAME);
182 }
183 exit (status);
184 }
185
186
187 /* Increment *ITEM_IDX (i.e., a field or byte index),
188 and if required CURRENT_RP. */
189
190 static inline void
next_item(uintmax_t * item_idx)191 next_item (uintmax_t *item_idx)
192 {
193 (*item_idx)++;
194 if ((*item_idx) > current_rp->hi)
195 current_rp++;
196 }
197
198 /* Return nonzero if the K'th field or byte is printable. */
199
200 static inline bool
print_kth(uintmax_t k)201 print_kth (uintmax_t k)
202 {
203 return current_rp->lo <= k;
204 }
205
206 /* Return nonzero if K'th byte is the beginning of a range. */
207
208 static inline bool
is_range_start_index(uintmax_t k)209 is_range_start_index (uintmax_t k)
210 {
211 return k == current_rp->lo;
212 }
213
214 /* Read from stream STREAM, printing to standard output any selected bytes. */
215
216 static void
cut_bytes(FILE * stream)217 cut_bytes (FILE *stream)
218 {
219 uintmax_t byte_idx; /* Number of bytes in the line so far. */
220 /* Whether to begin printing delimiters between ranges for the current line.
221 Set after we've begun printing data corresponding to the first range. */
222 bool print_delimiter;
223
224 byte_idx = 0;
225 print_delimiter = false;
226 current_rp = frp;
227 while (true)
228 {
229 int c; /* Each character from the file. */
230
231 c = getc (stream);
232
233 if (c == line_delim)
234 {
235 if (putchar (c) < 0)
236 write_error ();
237 byte_idx = 0;
238 print_delimiter = false;
239 current_rp = frp;
240 }
241 else if (c == EOF)
242 {
243 if (byte_idx > 0)
244 {
245 if (putchar (line_delim) < 0)
246 write_error ();
247 }
248 break;
249 }
250 else
251 {
252 next_item (&byte_idx);
253 if (print_kth (byte_idx))
254 {
255 if (output_delimiter_string != output_delimiter_default)
256 {
257 if (print_delimiter && is_range_start_index (byte_idx))
258 {
259 if (fwrite (output_delimiter_string, sizeof (char),
260 output_delimiter_length, stdout)
261 != output_delimiter_length)
262 write_error ();
263 }
264 print_delimiter = true;
265 }
266
267 if (putchar (c) < 0)
268 write_error ();
269 }
270 }
271 }
272 }
273
/* Read from stream STREAM, printing to standard output any selected fields.

   Fields are separated by DELIM and lines by LINE_DELIM.  Selected
   fields of a line are written joined by OUTPUT_DELIMITER_STRING and
   followed by LINE_DELIM.  A line that contains no DELIM is printed
   whole unless SUPPRESS_NON_DELIMITED is set.  An empty STREAM produces
   no output.  */

static void
cut_fields (FILE *stream)
{
  int c;	/* Each character from the file.  */
  uintmax_t field_idx = 1;	/* 1-based index of the field being read.  */
  /* True once a selected field has been output on the current line;
     later selected fields are then preceded by the output delimiter.  */
  bool found_any_selected_field = false;
  bool buffer_first_field;

  current_rp = frp;

  /* Peek at the first byte: nothing at all to do for empty input.  */
  c = getc (stream);
  if (c == EOF)
    return;

  ungetc (c, stream);
  c = 0;

  /* To support the semantics of the -s flag, we may have to buffer
     all of the first field to determine whether it is 'delimited.'
     But that is unnecessary if all non-delimited lines must be printed
     and the first field has been selected, or if non-delimited lines
     must be suppressed and the first field has *not* been selected.
     That is because a non-delimited line has exactly one field.  */
  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));

  while (true)
    {
      if (field_idx == 1 && buffer_first_field)
        {
          ssize_t len;
          size_t n_bytes;

          /* Read the first field into FIELD_1_BUFFER, stopping at
             DELIM or LINE_DELIM.  The code below inspects the last
             byte stored to learn which of the two ended the read.  */
          len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
                            GETNLINE_NO_LIMIT, delim, line_delim, stream);
          if (len < 0)
            {
              free (field_1_buffer);
              field_1_buffer = nullptr;
              /* A read error or end of input ends processing of this
                 stream; any other failure is treated as fatal
                 (presumably memory exhaustion -- confirm against
                 getndelim2).  */
              if (ferror (stream) || feof (stream))
                break;
              xalloc_die ();
            }

          n_bytes = len;
          affirm (n_bytes != 0);

          c = 0;

          /* If the first field extends to the end of line (it is not
             delimited) and we are printing all non-delimited lines,
             print this one.  */
          if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
            {
              if (suppress_non_delimited)
                {
                  /* Empty.  */
                }
              else
                {
                  if (fwrite (field_1_buffer, sizeof (char), n_bytes, stdout)
                      != n_bytes)
                    write_error ();
                  /* Make sure the output line is newline terminated.  */
                  if (field_1_buffer[n_bytes - 1] != line_delim)
                    {
                      if (putchar (line_delim) < 0)
                        write_error ();
                    }
                  c = line_delim;
                }
              /* The whole line has been consumed.  FIELD_IDX is still 1
                 and CURRENT_RP has not moved, so the next iteration
                 starts on the following line.  */
              continue;
            }

          if (print_kth (1))
            {
              /* Print the field, but not the trailing delimiter.  */
              if (fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout)
                  != n_bytes - 1)
                write_error ();

              /* With -d$'\n' don't treat the last '\n' as a delimiter.  */
              if (delim == line_delim)
                {
                  /* Peek one byte ahead: only when more input follows
                     does the byte just consumed count as a field
                     delimiter.  */
                  int last_c = getc (stream);
                  if (last_c != EOF)
                    {
                      ungetc (last_c, stream);
                      found_any_selected_field = true;
                    }
                }
              else
                {
                  found_any_selected_field = true;
                }
            }
          /* The first field and its delimiter are consumed; move on to
             field 2.  */
          next_item (&field_idx);
        }

      /* Last byte consumed so far; updated while scanning the field
         below and consulted when deciding how to terminate the line.  */
      int prev_c = c;

      if (print_kth (field_idx))
        {
          /* Separate this field from the previously printed one.  */
          if (found_any_selected_field)
            {
              if (fwrite (output_delimiter_string, sizeof (char),
                          output_delimiter_length, stdout)
                  != output_delimiter_length)
                write_error ();
            }
          found_any_selected_field = true;

          /* Copy the field's bytes up to, but not including, its
             terminator.  */
          while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
            {
              if (putchar (c) < 0)
                write_error ();
              prev_c = c;
            }
        }
      else
        {
          /* Field not selected: consume it without printing.  */
          while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
            prev_c = c;
        }

      /* With -d$'\n' don't treat the last '\n' as a delimiter.  */
      if (delim == line_delim && c == delim)
        {
          /* If nothing follows the delimiter, handle it as end of
             input rather than as the start of another field.  */
          int last_c = getc (stream);
          if (last_c != EOF)
            ungetc (last_c, stream);
          else
            c = last_c;
        }

      if (c == delim)
        next_item (&field_idx);
      else if (c == line_delim || c == EOF)
        {
          /* End of line or of input.  Terminate the output line unless
             the line consisted of a single field and -s is in effect
             with nothing selected.  */
          if (found_any_selected_field
              || !(suppress_non_delimited && field_idx == 1))
            {
              /* Make sure the output line is newline terminated.  */
              if (c == line_delim || prev_c != line_delim
                  || delim == line_delim)
                {
                  if (putchar (line_delim) < 0)
                    write_error ();
                }
            }
          if (c == EOF)
            break;

          /* Start processing the next input line.  */
          field_idx = 1;
          current_rp = frp;
          found_any_selected_field = false;
        }
    }
}
435
436 /* Process file FILE to standard output, using CUT_STREAM.
437 Return true if successful. */
438
439 static bool
cut_file(char const * file,void (* cut_stream)(FILE *))440 cut_file (char const *file, void (*cut_stream) (FILE *))
441 {
442 FILE *stream;
443
444 if (STREQ (file, "-"))
445 {
446 have_read_stdin = true;
447 stream = stdin;
448 assume (stream); /* Pacify GCC bug#109613. */
449 }
450 else
451 {
452 stream = fopen (file, "r");
453 if (stream == nullptr)
454 {
455 error (0, errno, "%s", quotef (file));
456 return false;
457 }
458 }
459
460 fadvise (stream, FADVISE_SEQUENTIAL);
461
462 cut_stream (stream);
463
464 int err = errno;
465 if (!ferror (stream))
466 err = 0;
467 if (STREQ (file, "-"))
468 clearerr (stream); /* Also clear EOF. */
469 else if (fclose (stream) == EOF)
470 err = errno;
471 if (err)
472 {
473 error (0, err, "%s", quotef (file));
474 return false;
475 }
476 return true;
477 }
478
479 int
main(int argc,char ** argv)480 main (int argc, char **argv)
481 {
482 int optc;
483 bool ok;
484 bool delim_specified = false;
485 bool byte_mode = false;
486 char *spec_list_string = nullptr;
487
488 initialize_main (&argc, &argv);
489 set_program_name (argv[0]);
490 setlocale (LC_ALL, "");
491 bindtextdomain (PACKAGE, LOCALEDIR);
492 textdomain (PACKAGE);
493
494 atexit (close_stdout);
495
496 /* By default, all non-delimited lines are printed. */
497 suppress_non_delimited = false;
498
499 delim = '\0';
500 have_read_stdin = false;
501
502 while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, nullptr))
503 != -1)
504 {
505 switch (optc)
506 {
507 case 'b':
508 case 'c':
509 /* Build the byte list. */
510 byte_mode = true;
511 FALLTHROUGH;
512 case 'f':
513 /* Build the field list. */
514 if (spec_list_string)
515 FATAL_ERROR (_("only one list may be specified"));
516 spec_list_string = optarg;
517 break;
518
519 case 'd':
520 /* New delimiter. */
521 /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
522 if (optarg[0] != '\0' && optarg[1] != '\0')
523 FATAL_ERROR (_("the delimiter must be a single character"));
524 delim = optarg[0];
525 delim_specified = true;
526 break;
527
528 case OUTPUT_DELIMITER_OPTION:
529 /* Interpret --output-delimiter='' to mean
530 'use the NUL byte as the delimiter.' */
531 output_delimiter_length = (optarg[0] == '\0'
532 ? 1 : strlen (optarg));
533 output_delimiter_string = optarg;
534 break;
535
536 case 'n':
537 break;
538
539 case 's':
540 suppress_non_delimited = true;
541 break;
542
543 case 'z':
544 line_delim = '\0';
545 break;
546
547 case COMPLEMENT_OPTION:
548 complement = true;
549 break;
550
551 case_GETOPT_HELP_CHAR;
552 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
553 default:
554 usage (EXIT_FAILURE);
555 }
556 }
557
558 if (!spec_list_string)
559 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
560
561 if (byte_mode)
562 {
563 if (delim_specified)
564 FATAL_ERROR (_("an input delimiter may be specified only\
565 when operating on fields"));
566
567 if (suppress_non_delimited)
568 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
569 \tonly when operating on fields"));
570 }
571
572 set_fields (spec_list_string,
573 ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0)
574 | (complement ? SETFLD_COMPLEMENT : 0)));
575
576 if (!delim_specified)
577 delim = '\t';
578
579 if (output_delimiter_string == nullptr)
580 {
581 output_delimiter_default[0] = delim;
582 output_delimiter_string = output_delimiter_default;
583 output_delimiter_length = 1;
584 }
585
586 void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields;
587 if (optind == argc)
588 ok = cut_file ("-", cut_stream);
589 else
590 for (ok = true; optind < argc; optind++)
591 ok &= cut_file (argv[optind], cut_stream);
592
593
594 if (have_read_stdin && fclose (stdin) == EOF)
595 {
596 error (0, errno, "-");
597 ok = false;
598 }
599
600 return ok ? EXIT_SUCCESS : EXIT_FAILURE;
601 }
602