1 /* csplit - split a file into sections determined by context lines
2    Copyright (C) 1991-2023 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 /* Written by Stuart Kemp, cpsrk@groper.jcu.edu.au.
18    Modified by David MacKenzie, djm@gnu.ai.mit.edu. */
19 
20 #include <config.h>
21 
22 #include <getopt.h>
23 #include <sys/types.h>
24 #include <signal.h>
25 #include <stdckdint.h>
26 
27 #include "system.h"
28 
29 #include <regex.h>
30 
31 #include "fd-reopen.h"
32 #include "quote.h"
33 #include "safe-read.h"
34 #include "stdio--.h"
35 #include "xdectoint.h"
36 #include "xstrtol.h"
37 
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "csplit"

/* The authors of this program, each passed through proper_name.  */
#define AUTHORS \
  proper_name ("Stuart Kemp"), \
  proper_name ("David MacKenzie")

/* The default prefix for output file names. */
#define DEFAULT_PREFIX	"xx"
47 
/* A compiled pattern arg.
   Records are handed out by new_control_record and consumed, in
   order, by split_file, which dispatches on REGEXPR.  */
struct control
{
  intmax_t offset;		/* Offset from regexp to split at. */
  intmax_t lines_required;	/* Number of lines required. */
  intmax_t repeat;		/* Repeat count. */
  int argnum;			/* ARGV index. */
  bool repeat_forever;		/* True if '*' used as a repeat count. */
  bool ignore;			/* If true, produce no output (for regexp). */
  bool regexpr;			/* True if regular expression was used. */
  struct re_pattern_buffer re_compiled;	/* Compiled regular expression. */
};
60 
/* START_SIZE is the initial size of the data area in buffers.
   CTRL_SIZE is the number of lines kept in each node in the line list.

   Each macro is defined exactly once: a DEBUG build selects small
   values to exercise the buffer-growing and list-chaining code, and
   a normal build selects the production values.  Defining the
   production values first and then redefining them under DEBUG
   would be an incompatible macro redefinition.  */
#ifdef DEBUG
/* Some small values to test the algorithms. */
# define START_SIZE	200
# define CTRL_SIZE	1
#else
# define START_SIZE	8191
# define CTRL_SIZE	80
#endif
72 
/* A string with a length count.
   Used to describe one input line: STR points at its first byte and
   LEN is its length in bytes.  */
struct cstring
{
  idx_t len;
  char *str;
};
79 
/* Pointers to the beginnings of lines in the buffer area.
   These structures are linked together if needed.
   Each node holds up to CTRL_SIZE line descriptors; keep_new_line
   chains a fresh node once USED reaches CTRL_SIZE.  */
struct line
{
  idx_t used;			/* Number of offsets used in this struct. */
  idx_t insert_index;		/* Next offset to use when inserting line. */
  idx_t retrieve_index;	/* Next index to use when retrieving line. */
  struct cstring starts[CTRL_SIZE]; /* Lines in the data area. */
  struct line *next;		/* Next in linked list. */
};
90 
/* The structure to hold the input lines.
   Contains a pointer to the data area and a list containing
   pointers to the individual lines.
   Records are chained through NEXT, starting at the global 'head'.  */
struct buffer_record
{
  idx_t bytes_alloc;		/* Size of the buffer area. */
  idx_t bytes_used;		/* Bytes used in the buffer area. */
  intmax_t start_line;		/* First line number in this buffer. */
  intmax_t first_available;	/* First line that can be retrieved. */
  idx_t num_lines;		/* Number of complete lines in this buffer. */
  char *buffer;			/* Data area. */
  struct line *line_start;	/* Head of list of pointers to lines. */
  struct line *curr_line;	/* The line start record currently in use. */
  struct buffer_record *next;	/* Next buffer in the list. */
};
106 
/* Forward declarations of functions that are used before they are
   defined later in this file.  */
static void close_output_file (void);
static void create_output_file (void);
static void delete_all_files (bool);
static void save_line_to_file (const struct cstring *line);
111 
/* Start of buffer list. */
static struct buffer_record *head = nullptr;

/* Partially read line. */
static char *hold_area = nullptr;

/* Number of bytes in 'hold_area'. */
static idx_t hold_count = 0;

/* Number of the last line in the buffers. */
static intmax_t last_line_number = 0;

/* Number of the line currently being examined. */
static intmax_t current_line = 0;

/* If true, we have read EOF. */
static bool have_read_eof = false;

/* The objects below that are declared volatile are the ones read by
   make_filename and delete_all_files, which interrupt_handler reaches
   when a caught signal arrives.  */

/* Name of output files. */
static char *volatile filename_space = nullptr;

/* Prefix part of output file names. */
static char const *volatile prefix = nullptr;

/* Suffix part of output file names. */
static char *volatile suffix = nullptr;

/* Number of digits to use in output file names. */
static int volatile digits = 2;

/* Number of files created so far. */
static int volatile files_created = 0;

/* Number of bytes written to current file. */
static intmax_t bytes_written;

/* Output file pointer. */
static FILE *output_stream = nullptr;

/* Output file name. */
static char *output_filename = nullptr;

/* Perhaps it would be cleaner to pass arg values instead of indexes. */
static char **global_argv;

/* If true, do not print the count of bytes in each output file. */
static bool suppress_count;

/* If true, remove output files on error. */
static bool volatile remove_files;

/* If true, remove all output files which have a zero length. */
static bool elide_empty_files;

/* If true, suppress the lines that match the PATTERN.  */
static bool suppress_matched;

/* The compiled pattern arguments, which determine how to split
   the input file. */
static struct control *controls;

/* Number of elements in 'controls'. */
static idx_t control_used;

/* The set of signals that are caught.  */
static sigset_t caught_signals;
178 
/* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.
   SUPPRESS_MATCHED_OPTION is the code for --suppress-matched.  */
enum
{
  SUPPRESS_MATCHED_OPTION = CHAR_MAX + 1
};
185 
/* Table of long options.  Each entry maps a long option name to the
   short option character (or pseudo short option) it is reported as;
   the all-zero entry terminates the table.  */
static struct option const longopts[] =
{
  {"digits", required_argument, nullptr, 'n'},
  {"quiet", no_argument, nullptr, 'q'},
  {"silent", no_argument, nullptr, 's'},
  {"keep-files", no_argument, nullptr, 'k'},
  {"elide-empty-files", no_argument, nullptr, 'z'},
  {"prefix", required_argument, nullptr, 'f'},
  {"suffix-format", required_argument, nullptr, 'b'},
  {"suppress-matched", no_argument, nullptr, SUPPRESS_MATCHED_OPTION},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {nullptr, 0, nullptr, 0}
};
200 
201 /* Optionally remove files created so far; then exit.
202    Called when an error detected. */
203 
204 static void
cleanup(void)205 cleanup (void)
206 {
207   sigset_t oldset;
208 
209   close_output_file ();
210 
211   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
212   delete_all_files (false);
213   sigprocmask (SIG_SETMASK, &oldset, nullptr);
214 }
215 
/* Run cleanup, then terminate the program with EXIT_FAILURE.
   Never returns.  */

static _Noreturn void
cleanup_fatal (void)
{
  cleanup ();
  exit (EXIT_FAILURE);
}
222 
/* Report memory exhaustion, clean up, and exit with failure.
   Unlike the other functions here it has external linkage.  */

extern void
xalloc_die (void)
{
  error (0, 0, "%s", _("memory exhausted"));
  cleanup_fatal ();
}
229 
/* Handler for signal SIG: remove the output files if requested,
   then let the signal take its default action.  */

static void
interrupt_handler (int sig)
{
  delete_all_files (true);

  /* SIG is blocked while this handler runs.  Restore its default
     disposition and re-raise it, so that the default action is taken
     once the handler returns and the block is lifted.  */
  signal (sig, SIG_DFL);
  raise (sig);
}
240 
241 /* Keep track of NUM bytes of a partial line in buffer START.
242    These bytes will be retrieved later when another large buffer is read.  */
243 
244 static void
save_to_hold_area(char * start,idx_t num)245 save_to_hold_area (char *start, idx_t num)
246 {
247   free (hold_area);
248   hold_area = start;
249   hold_count = num;
250 }
251 
252 /* Read up to MAX_N_BYTES bytes from the input stream into DEST.
253    Return the number of bytes read. */
254 
255 static idx_t
read_input(char * dest,idx_t max_n_bytes)256 read_input (char *dest, idx_t max_n_bytes)
257 {
258   idx_t bytes_read;
259 
260   if (max_n_bytes == 0)
261     return 0;
262 
263   bytes_read = safe_read (STDIN_FILENO, dest, max_n_bytes);
264 
265   if (bytes_read == 0)
266     have_read_eof = true;
267 
268   if (bytes_read == SAFE_READ_ERROR)
269     {
270       error (0, errno, _("read error"));
271       cleanup_fatal ();
272     }
273 
274   return bytes_read;
275 }
276 
277 /* Initialize existing line record P. */
278 
279 static void
clear_line_control(struct line * p)280 clear_line_control (struct line *p)
281 {
282   p->used = 0;
283   p->insert_index = 0;
284   p->retrieve_index = 0;
285 }
286 
287 /* Return a new, initialized line record. */
288 
289 static struct line *
new_line_control(void)290 new_line_control (void)
291 {
292   struct line *p = xmalloc (sizeof *p);
293 
294   p->next = nullptr;
295   clear_line_control (p);
296 
297   return p;
298 }
299 
300 /* Record LINE_START, which is the address of the start of a line
301    of length LINE_LEN in the large buffer, in the lines buffer of B. */
302 
303 static void
keep_new_line(struct buffer_record * b,char * line_start,idx_t line_len)304 keep_new_line (struct buffer_record *b, char *line_start, idx_t line_len)
305 {
306   struct line *l;
307 
308   /* If there is no existing area to keep line info, get some. */
309   if (b->line_start == nullptr)
310     b->line_start = b->curr_line = new_line_control ();
311 
312   /* If existing area for lines is full, get more. */
313   if (b->curr_line->used == CTRL_SIZE)
314     {
315       b->curr_line->next = new_line_control ();
316       b->curr_line = b->curr_line->next;
317     }
318 
319   l = b->curr_line;
320 
321   /* Record the start of the line, and update counters. */
322   l->starts[l->insert_index].str = line_start;
323   l->starts[l->insert_index].len = line_len;
324   l->used++;
325   l->insert_index++;
326 }
327 
328 /* Scan the buffer in B for newline characters
329    and record the line start locations and lengths in B.
330    Return the number of lines found in this buffer.
331 
332    There may be an incomplete line at the end of the buffer;
333    a pointer is kept to this area, which will be used when
334    the next buffer is filled. */
335 
336 static idx_t
record_line_starts(struct buffer_record * b)337 record_line_starts (struct buffer_record *b)
338 {
339   char *line_start;		/* Start of current line. */
340   idx_t lines;			/* Number of lines found. */
341   idx_t line_length;		/* Length of each line found. */
342 
343   if (b->bytes_used == 0)
344     return 0;
345 
346   lines = 0;
347   line_start = b->buffer;
348   char *buffer_end = line_start + b->bytes_used;
349   *buffer_end = '\n';
350 
351   while (true)
352     {
353       char *line_end = rawmemchr (line_start, '\n');
354       if (line_end == buffer_end)
355         break;
356       line_length = line_end - line_start + 1;
357       keep_new_line (b, line_start, line_length);
358       line_start = line_end + 1;
359       lines++;
360     }
361 
362   /* Check for an incomplete last line. */
363   idx_t bytes_left = buffer_end - line_start;
364   if (bytes_left)
365     {
366       if (have_read_eof)
367         {
368           keep_new_line (b, line_start, bytes_left);
369           lines++;
370         }
371       else
372         save_to_hold_area (ximemdup (line_start, bytes_left), bytes_left);
373     }
374 
375   b->num_lines = lines;
376   b->first_available = b->start_line = last_line_number + 1;
377   last_line_number += lines;
378 
379   return lines;
380 }
381 
/* Work around <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109614>
   by turning off the listed analyzer diagnostics when compiling with
   GCC 13 or later.  */
#if 13 <= __GNUC__
# pragma GCC diagnostic ignored "-Wanalyzer-mismatching-deallocation"
# pragma GCC diagnostic ignored "-Wanalyzer-use-after-free"
# pragma GCC diagnostic ignored "-Wanalyzer-use-of-uninitialized-value"
#endif
388 
389 static void
free_buffer(struct buffer_record * buf)390 free_buffer (struct buffer_record *buf)
391 {
392   for (struct line *l = buf->line_start; l;)
393     {
394       struct line *n = l->next;
395       free (l);
396       l = n;
397     }
398   free (buf->buffer);
399   free (buf);
400 }
401 
402 /* Return a new buffer of at least MINSIZE bytes.  */
403 
404 static ATTRIBUTE_DEALLOC (free_buffer, 1)
405 struct buffer_record *
get_new_buffer(idx_t min_size)406 get_new_buffer (idx_t min_size)
407 {
408   struct buffer_record *new_buffer = xmalloc (sizeof *new_buffer);
409   new_buffer->bytes_alloc = 0;
410   new_buffer->buffer = xpalloc (nullptr, &new_buffer->bytes_alloc, min_size,
411                                 -1, 1);
412   new_buffer->bytes_used = 0;
413   new_buffer->start_line = new_buffer->first_available = last_line_number + 1;
414   new_buffer->num_lines = 0;
415   new_buffer->line_start = new_buffer->curr_line = nullptr;
416   new_buffer->next = nullptr;
417 
418   return new_buffer;
419 }
420 
421 /* Append buffer BUF to the linked list of buffers that contain
422    some data yet to be processed. */
423 
424 static void
save_buffer(struct buffer_record * buf)425 save_buffer (struct buffer_record *buf)
426 {
427   struct buffer_record *p;
428 
429   buf->next = nullptr;
430   buf->curr_line = buf->line_start;
431 
432   if (head == nullptr)
433     head = buf;
434   else
435     {
436       for (p = head; p->next; p = p->next)
437         /* Do nothing. */ ;
438       p->next = buf;
439     }
440 }
441 
442 /* Fill a buffer of input.
443 
444    Set the initial size of the buffer to a default.
445    Fill the buffer (from the hold area and input stream)
446    and find the individual lines.
447    If no lines are found (the buffer is too small to hold the next line),
448    release the current buffer (whose contents would have been put in the
449    hold area) and repeat the process with another large buffer until at least
450    one entire line has been read.
451 
452    Return true if a new buffer was obtained, otherwise false
453    (in which case end-of-file must have been encountered). */
454 
455 static bool
load_buffer(void)456 load_buffer (void)
457 {
458   if (have_read_eof)
459     return false;
460 
461   /* We must make the buffer at least as large as the amount of data
462      in the partial line left over from the last call,
463      plus room for a sentinel '\n'. */
464   idx_t bytes_wanted = MAX (START_SIZE, hold_count + 1);
465 
466   while (true)
467     {
468       struct buffer_record *b = get_new_buffer (bytes_wanted);
469       idx_t bytes_alloc = b->bytes_alloc;
470       idx_t bytes_avail = bytes_alloc;
471       char *p = b->buffer;
472 
473       /* First check the 'holding' area for a partial line. */
474       if (hold_count)
475         {
476           p = mempcpy (p, hold_area, hold_count);
477           b->bytes_used += hold_count;
478           bytes_avail -= hold_count;
479           hold_count = 0;
480         }
481 
482       b->bytes_used += read_input (p, bytes_avail - 1);
483 
484       if (record_line_starts (b) != 0)
485         {
486           save_buffer (b);
487           return true;
488         }
489 
490       free_buffer (b);
491       if (have_read_eof)
492         return false;
493       if (ckd_add (&bytes_wanted, bytes_alloc, bytes_alloc >> 1))
494         xalloc_die ();
495     }
496 }
497 
498 /* Return the line number of the first line that has not yet been retrieved. */
499 
500 static intmax_t
get_first_line_in_buffer(void)501 get_first_line_in_buffer (void)
502 {
503   if (head == nullptr && !load_buffer ())
504     error (EXIT_FAILURE, errno, _("input disappeared"));
505 
506   return head->first_available;
507 }
508 
509 /* Return a pointer to the logical first line in the buffer and make the
510    next line the logical first line.
511    Return nullptr if there is no more input. */
512 
513 static struct cstring *
remove_line(void)514 remove_line (void)
515 {
516   /* If non-null, this is the buffer for which the previous call
517      returned the final line.  So now, presuming that line has been
518      processed, we can free the buffer and reset this pointer.  */
519   static struct buffer_record *prev_buf = nullptr;
520 
521   struct cstring *line;		/* Return value. */
522   struct line *l;		/* For convenience. */
523 
524   if (prev_buf)
525     {
526       free_buffer (prev_buf);
527       prev_buf = nullptr;
528     }
529 
530   if (head == nullptr && !load_buffer ())
531     return nullptr;
532 
533   if (current_line < head->first_available)
534     current_line = head->first_available;
535 
536   ++(head->first_available);
537 
538   l = head->curr_line;
539 
540   line = &l->starts[l->retrieve_index];
541 
542   /* Advance index to next line. */
543   if (++l->retrieve_index == l->used)
544     {
545       /* Go on to the next line record. */
546       head->curr_line = l->next;
547       if (head->curr_line == nullptr || head->curr_line->used == 0)
548         {
549           /* Go on to the next data block.
550              but first record the current one so we can free it
551              once the line we're returning has been processed.  */
552           prev_buf = head;
553           head = head->next;
554         }
555     }
556 
557   return line;
558 }
559 
560 /* Search the buffers for line LINENUM, reading more input if necessary.
561    Return a pointer to the line, or nullptr if it is not found in the file. */
562 
563 static struct cstring *
find_line(intmax_t linenum)564 find_line (intmax_t linenum)
565 {
566   struct buffer_record *b;
567 
568   if (head == nullptr && !load_buffer ())
569     return nullptr;
570 
571   if (linenum < head->start_line)
572     return nullptr;
573 
574   for (b = head;;)
575     {
576       if (linenum < b->start_line + b->num_lines)
577         {
578           /* The line is in this buffer. */
579           struct line *l;
580           idx_t offset;	/* How far into the buffer the line is. */
581 
582           l = b->line_start;
583           offset = linenum - b->start_line;
584           /* Find the control record. */
585           while (offset >= CTRL_SIZE)
586             {
587               l = l->next;
588               offset -= CTRL_SIZE;
589             }
590           return &l->starts[offset];
591         }
592       if (b->next == nullptr && !load_buffer ())
593         return nullptr;
594       b = b->next;		/* Try the next data block. */
595     }
596 }
597 
598 /* Return true if at least one more line is available for input. */
599 
600 static bool
no_more_lines(void)601 no_more_lines (void)
602 {
603   return find_line (current_line + 1) == nullptr;
604 }
605 
/* Arrange for standard input to read from the file NAME.  The name
   "-" leaves standard input as it is.  Failure to open NAME is
   fatal.  */

static void
set_input_file (char const *name)
{
  if (STREQ (name, "-"))
    return;

  if (fd_reopen (STDIN_FILENO, name, O_RDONLY, 0) < 0)
    error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
           quoteaf (name));
}
615 
616 /* Write all lines from the beginning of the buffer up to, but
617    not including, line LAST_LINE, to the current output file.
618    If IGNORE is true, do not output lines selected here.
619    ARGNUM is the index in ARGV of the current pattern. */
620 
621 static void
write_to_file(intmax_t last_line,bool ignore,int argnum)622 write_to_file (intmax_t last_line, bool ignore, int argnum)
623 {
624   struct cstring *line;
625   intmax_t first_line;		/* First available input line. */
626   intmax_t lines;		/* Number of lines to output. */
627   intmax_t i;
628 
629   first_line = get_first_line_in_buffer ();
630 
631   if (first_line > last_line)
632     {
633       error (0, 0, _("%s: line number out of range"),
634              quote (global_argv[argnum]));
635       cleanup_fatal ();
636     }
637 
638   lines = last_line - first_line;
639 
640   for (i = 0; i < lines; i++)
641     {
642       line = remove_line ();
643       if (line == nullptr)
644         {
645           error (0, 0, _("%s: line number out of range"),
646                  quote (global_argv[argnum]));
647           cleanup_fatal ();
648         }
649       if (!ignore)
650         save_line_to_file (line);
651     }
652 }
653 
654 /* Output any lines left after all regexps have been processed. */
655 
656 static void
dump_rest_of_file(void)657 dump_rest_of_file (void)
658 {
659   struct cstring *line;
660 
661   while ((line = remove_line ()) != nullptr)
662     save_line_to_file (line);
663 }
664 
665 /* Handle an attempt to read beyond EOF under the control of record P,
666    on iteration REPETITION if nonzero. */
667 
668 static void
handle_line_error(const struct control * p,intmax_t repetition)669 handle_line_error (const struct control *p, intmax_t repetition)
670 {
671   char buf[INT_BUFSIZE_BOUND (intmax_t)];
672 
673   fprintf (stderr, _("%s: %s: line number out of range"),
674            program_name, quote (imaxtostr (p->lines_required, buf)));
675   if (repetition)
676     fprintf (stderr, _(" on repetition %s\n"), imaxtostr (repetition, buf));
677   else
678     fprintf (stderr, "\n");
679 
680   cleanup_fatal ();
681 }
682 
683 /* Determine the line number that marks the end of this file,
684    then get those lines and save them to the output file.
685    P is the control record.
686    REPETITION is the repetition number. */
687 
688 static void
process_line_count(const struct control * p,intmax_t repetition)689 process_line_count (const struct control *p, intmax_t repetition)
690 {
691   intmax_t linenum;
692   intmax_t last_line_to_save = p->lines_required * (repetition + 1);
693 
694   create_output_file ();
695 
696   /* Ensure that the line number specified is not 1 greater than
697      the number of lines in the file.
698      When suppressing matched lines, check before the loop. */
699   if (no_more_lines () && suppress_matched)
700     handle_line_error (p, repetition);
701 
702   linenum = get_first_line_in_buffer ();
703   while (linenum++ < last_line_to_save)
704     {
705       struct cstring *line = remove_line ();
706       if (line == nullptr)
707         handle_line_error (p, repetition);
708       save_line_to_file (line);
709     }
710 
711   close_output_file ();
712 
713   if (suppress_matched)
714     remove_line ();
715 
716   /* Ensure that the line number specified is not 1 greater than
717      the number of lines in the file. */
718   if (no_more_lines () && !suppress_matched)
719     handle_line_error (p, repetition);
720 }
721 
722 static void
regexp_error(struct control * p,intmax_t repetition,bool ignore)723 regexp_error (struct control *p, intmax_t repetition, bool ignore)
724 {
725   fprintf (stderr, _("%s: %s: match not found"),
726            program_name, quote (global_argv[p->argnum]));
727 
728   if (repetition)
729     {
730       char buf[INT_BUFSIZE_BOUND (intmax_t)];
731       fprintf (stderr, _(" on repetition %s\n"), imaxtostr (repetition, buf));
732     }
733   else
734     fprintf (stderr, "\n");
735 
736   if (!ignore)
737     {
738       dump_rest_of_file ();
739       close_output_file ();
740     }
741   cleanup_fatal ();
742 }
743 
744 /* Read the input until a line matches the regexp in P, outputting
745    it unless P->IGNORE is true.
746    REPETITION is this repeat-count; 0 means the first time. */
747 
748 static void
process_regexp(struct control * p,intmax_t repetition)749 process_regexp (struct control *p, intmax_t repetition)
750 {
751   struct cstring *line;		/* From input file. */
752   idx_t line_len;		/* To make "$" in regexps work. */
753   intmax_t break_line;		/* First line number of next file. */
754   bool ignore = p->ignore;	/* If true, skip this section. */
755   regoff_t ret;
756 
757   if (!ignore)
758     create_output_file ();
759 
760   /* If there is no offset for the regular expression, or
761      it is positive, then it is not necessary to buffer the lines. */
762 
763   if (p->offset >= 0)
764     {
765       while (true)
766         {
767           line = find_line (++current_line);
768           if (line == nullptr)
769             {
770               if (p->repeat_forever)
771                 {
772                   if (!ignore)
773                     {
774                       dump_rest_of_file ();
775                       close_output_file ();
776                     }
777                   exit (EXIT_SUCCESS);
778                 }
779               else
780                 regexp_error (p, repetition, ignore);
781             }
782           line_len = line->len;
783           if (line->str[line_len - 1] == '\n')
784             line_len--;
785           ret = re_search (&p->re_compiled, line->str, line_len,
786                            0, line_len, nullptr);
787           if (ret == -2)
788             {
789               error (0, 0, _("error in regular expression search"));
790               cleanup_fatal ();
791             }
792           if (ret == -1)
793             {
794               line = remove_line ();
795               if (!ignore)
796                 save_line_to_file (line);
797             }
798           else
799             break;
800         }
801     }
802   else
803     {
804       /* Buffer the lines. */
805       while (true)
806         {
807           line = find_line (++current_line);
808           if (line == nullptr)
809             {
810               if (p->repeat_forever)
811                 {
812                   if (!ignore)
813                     {
814                       dump_rest_of_file ();
815                       close_output_file ();
816                     }
817                   exit (EXIT_SUCCESS);
818                 }
819               else
820                 regexp_error (p, repetition, ignore);
821             }
822           line_len = line->len;
823           if (line->str[line_len - 1] == '\n')
824             line_len--;
825           ret = re_search (&p->re_compiled, line->str, line_len,
826                            0, line_len, nullptr);
827           if (ret == -2)
828             {
829               error (0, 0, _("error in regular expression search"));
830               cleanup_fatal ();
831             }
832           if (ret != -1)
833             break;
834         }
835     }
836 
837   /* Account for any offset from this regexp. */
838   break_line = current_line + p->offset;
839 
840   write_to_file (break_line, ignore, p->argnum);
841 
842   if (!ignore)
843     close_output_file ();
844 
845   if (p->offset > 0)
846     current_line = break_line;
847 
848   if (suppress_matched)
849     remove_line ();
850 }
851 
852 /* Split the input file according to the control records we have built. */
853 
854 static void
split_file(void)855 split_file (void)
856 {
857   for (idx_t i = 0; i < control_used; i++)
858     {
859       intmax_t j;
860       if (controls[i].regexpr)
861         {
862           for (j = 0; (controls[i].repeat_forever
863                        || j <= controls[i].repeat); j++)
864             process_regexp (&controls[i], j);
865         }
866       else
867         {
868           for (j = 0; (controls[i].repeat_forever
869                        || j <= controls[i].repeat); j++)
870             process_line_count (&controls[i], j);
871         }
872     }
873 
874   create_output_file ();
875   dump_rest_of_file ();
876   close_output_file ();
877 }
878 
879 /* Return the name of output file number NUM.
880 
881    This function is called from a signal handler, so it should invoke
882    only reentrant functions that are async-signal-safe.  POSIX does
883    not guarantee this for the functions called below, but we don't
884    know of any hosts where this implementation isn't safe.  */
885 
886 static char *
make_filename(int num)887 make_filename (int num)
888 {
889   strcpy (filename_space, prefix);
890   if (suffix)
891     sprintf (filename_space + strlen (prefix), suffix, num);
892   else
893     sprintf (filename_space + strlen (prefix), "%0*d", digits, num);
894   return filename_space;
895 }
896 
897 /* Create the next output file. */
898 
899 static void
create_output_file(void)900 create_output_file (void)
901 {
902   int nfiles = files_created;
903   bool fopen_ok;
904   int fopen_errno;
905 
906   output_filename = make_filename (nfiles);
907 
908   if (nfiles == INT_MAX)
909     {
910       fopen_ok = false;
911       fopen_errno = EOVERFLOW;
912     }
913   else
914     {
915       /* Create the output file in a critical section, to avoid races.  */
916       sigset_t oldset;
917       sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
918       output_stream = fopen (output_filename, "w");
919       fopen_ok = (output_stream != nullptr);
920       fopen_errno = errno;
921       files_created = nfiles + fopen_ok;
922       sigprocmask (SIG_SETMASK, &oldset, nullptr);
923     }
924 
925   if (! fopen_ok)
926     {
927       error (0, fopen_errno, "%s", quotef (output_filename));
928       cleanup_fatal ();
929     }
930   bytes_written = 0;
931 }
932 
933 /* If requested, delete all the files we have created.  This function
934    must be called only from critical sections.  */
935 
936 static void
delete_all_files(bool in_signal_handler)937 delete_all_files (bool in_signal_handler)
938 {
939   if (! remove_files)
940     return;
941 
942   for (int i = files_created; 0 <= --i; )
943     {
944       char const *name = make_filename (i);
945       if (unlink (name) != 0 && errno != ENOENT && !in_signal_handler)
946         error (0, errno, "%s", quotef (name));
947     }
948 
949   files_created = 0;
950 }
951 
952 /* Close the current output file and print the count
953    of characters in this file. */
954 
955 static void
close_output_file(void)956 close_output_file (void)
957 {
958   if (output_stream)
959     {
960       if (ferror (output_stream))
961         {
962           error (0, 0, _("write error for %s"), quoteaf (output_filename));
963           output_stream = nullptr;
964           cleanup_fatal ();
965         }
966       if (fclose (output_stream) != 0)
967         {
968           error (0, errno, "%s", quotef (output_filename));
969           output_stream = nullptr;
970           cleanup_fatal ();
971         }
972       if (bytes_written == 0 && elide_empty_files)
973         {
974           sigset_t oldset;
975           bool unlink_ok;
976           int unlink_errno;
977 
978           /* Remove the output file in a critical section, to avoid races.  */
979           sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
980           unlink_ok = (unlink (output_filename) == 0);
981           unlink_errno = errno;
982           files_created--;
983           sigprocmask (SIG_SETMASK, &oldset, nullptr);
984 
985           if (! unlink_ok && unlink_errno != ENOENT)
986             error (0, unlink_errno, "%s", quotef (output_filename));
987         }
988       else
989         {
990           if (!suppress_count)
991             {
992               char buf[INT_BUFSIZE_BOUND (intmax_t)];
993               fprintf (stdout, "%s\n", imaxtostr (bytes_written, buf));
994             }
995         }
996       output_stream = nullptr;
997     }
998 }
999 
1000 /* Save line LINE to the output file and
1001    increment the character count for the current file. */
1002 
1003 static void
save_line_to_file(const struct cstring * line)1004 save_line_to_file (const struct cstring *line)
1005 {
1006   idx_t l = fwrite (line->str, sizeof (char), line->len, output_stream);
1007   if (l != line->len)
1008     {
1009       error (0, errno, _("write error for %s"), quoteaf (output_filename));
1010       output_stream = nullptr;
1011       cleanup_fatal ();
1012     }
1013   bytes_written += line->len;
1014 }
1015 
1016 /* Return a new, initialized control record. */
1017 
1018 static struct control *
new_control_record(void)1019 new_control_record (void)
1020 {
1021   static idx_t control_allocated = 0; /* Total space allocated. */
1022   struct control *p;
1023 
1024   if (control_used == control_allocated)
1025     controls = xpalloc (controls, &control_allocated, 1, -1, sizeof *controls);
1026   p = &controls[control_used++];
1027   p->regexpr = false;
1028   p->repeat = 0;
1029   p->repeat_forever = false;
1030   p->lines_required = 0;
1031   p->offset = 0;
1032   return p;
1033 }
1034 
1035 /* Check if there is a numeric offset after a regular expression.
1036    STR is the entire command line argument.
1037    P is the control record for this regular expression.
1038    NUM is the numeric part of STR. */
1039 
1040 static void
check_for_offset(struct control * p,char const * str,char const * num)1041 check_for_offset (struct control *p, char const *str, char const *num)
1042 {
1043   if (xstrtoimax (num, nullptr, 10, &p->offset, "") != LONGINT_OK)
1044     error (EXIT_FAILURE, 0, _("%s: integer expected after delimiter"),
1045            quote (str));
1046 }
1047 
1048 /* Given that the first character of command line arg STR is '{',
1049    make sure that the rest of the string is a valid repeat count
1050    and store its value in P.
1051    ARGNUM is the ARGV index of STR. */
1052 
1053 static void
parse_repeat_count(int argnum,struct control * p,char * str)1054 parse_repeat_count (int argnum, struct control *p, char *str)
1055 {
1056   char *end;
1057 
1058   end = str + strlen (str) - 1;
1059   if (*end != '}')
1060     error (EXIT_FAILURE, 0, _("%s: '}' is required in repeat count"),
1061            quote (str));
1062   *end = '\0';
1063 
1064   if (str + 1 == end - 1 && *(str + 1) == '*')
1065     p->repeat_forever = true;
1066   else
1067     {
1068       uintmax_t val;
1069       if (xstrtoumax (str + 1, nullptr, 10, &val, "") != LONGINT_OK
1070           || INTMAX_MAX < val)
1071         {
1072           error (EXIT_FAILURE, 0,
1073                  _("%s}: integer required between '{' and '}'"),
1074                  quote (global_argv[argnum]));
1075         }
1076       p->repeat = val;
1077     }
1078 
1079   *end = '}';
1080 }
1081 
1082 /* Extract the regular expression from STR and check for a numeric offset.
1083    STR should start with the regexp delimiter character.
1084    Return a new control record for the regular expression.
1085    ARGNUM is the ARGV index of STR.
1086    Unless IGNORE is true, mark these lines for output. */
1087 
1088 static struct control *
extract_regexp(int argnum,bool ignore,char const * str)1089 extract_regexp (int argnum, bool ignore, char const *str)
1090 {
1091   idx_t len;			/* Number of bytes in this regexp. */
1092   char delim = *str;
1093   char const *closing_delim;
1094   struct control *p;
1095   char const *err;
1096 
1097   closing_delim = strrchr (str + 1, delim);
1098   if (closing_delim == nullptr)
1099     error (EXIT_FAILURE, 0,
1100            _("%s: closing delimiter '%c' missing"), str, delim);
1101 
1102   len = closing_delim - str - 1;
1103   p = new_control_record ();
1104   p->argnum = argnum;
1105   p->ignore = ignore;
1106 
1107   p->regexpr = true;
1108   p->re_compiled.buffer = nullptr;
1109   p->re_compiled.allocated = 0;
1110   p->re_compiled.fastmap = xmalloc (UCHAR_MAX + 1);
1111   p->re_compiled.translate = nullptr;
1112   re_syntax_options =
1113     RE_SYNTAX_POSIX_BASIC & ~RE_CONTEXT_INVALID_DUP & ~RE_NO_EMPTY_RANGES;
1114   err = re_compile_pattern (str + 1, len, &p->re_compiled);
1115   if (err)
1116     {
1117       error (0, 0, _("%s: invalid regular expression: %s"), quote (str), err);
1118       cleanup_fatal ();
1119     }
1120 
1121   if (closing_delim[1])
1122     check_for_offset (p, str, closing_delim + 1);
1123 
1124   return p;
1125 }
1126 
1127 /* Extract the break patterns from args START through ARGC - 1 of ARGV.
1128    After each pattern, check if the next argument is a repeat count. */
1129 
1130 static void
parse_patterns(int argc,int start,char ** argv)1131 parse_patterns (int argc, int start, char **argv)
1132 {
1133   struct control *p;		/* New control record created. */
1134   static intmax_t last_val = 0;
1135 
1136   for (int i = start; i < argc; i++)
1137     {
1138       if (*argv[i] == '/' || *argv[i] == '%')
1139         {
1140           p = extract_regexp (i, *argv[i] == '%', argv[i]);
1141         }
1142       else
1143         {
1144           p = new_control_record ();
1145           p->argnum = i;
1146 
1147           uintmax_t val;
1148           if (xstrtoumax (argv[i], nullptr, 10, &val, "") != LONGINT_OK
1149               || INTMAX_MAX < val)
1150             error (EXIT_FAILURE, 0, _("%s: invalid pattern"), quote (argv[i]));
1151           if (val == 0)
1152             error (EXIT_FAILURE, 0,
1153                    _("%s: line number must be greater than zero"), argv[i]);
1154           if (val < last_val)
1155             {
1156               char buf[INT_BUFSIZE_BOUND (intmax_t)];
1157               error (EXIT_FAILURE, 0,
1158                      _("line number %s is smaller than preceding line number,"
1159                        " %s"),
1160                      quote (argv[i]), imaxtostr (last_val, buf));
1161             }
1162 
1163           if (val == last_val)
1164             error (0, 0,
1165            _("warning: line number %s is the same as preceding line number"),
1166                    quote (argv[i]));
1167 
1168           last_val = val;
1169 
1170           p->lines_required = val;
1171         }
1172 
1173       if (i + 1 < argc && *argv[i + 1] == '{')
1174         {
1175           /* We have a repeat count. */
1176           i++;
1177           parse_repeat_count (i, p, argv[i]);
1178         }
1179     }
1180 }
1181 
1182 
1183 
/* Bit names for the printf format flags ' and #.  They can be ORed
   together.  */
enum { FLAG_THOUSANDS = 1, FLAG_ALTERNATIVE = 2 };

/* Scan the leading run of printf flag characters ('-', '0', '\'' and
   '#') in FORMAT.  Store into *FLAGS_PTR the OR of FLAG_THOUSANDS and
   FLAG_ALTERNATIVE for whichever of '\'' and '#' were seen; '-' and
   '0' are skipped without being recorded.  Return the number of flag
   characters scanned.  */
static idx_t
get_format_flags (char const *format, int *flags_ptr)
{
  int seen = 0;
  char const *p = format;

  for (;; p++)
    {
      if (*p == '\'')
        seen |= FLAG_THOUSANDS;
      else if (*p == '#')
        seen |= FLAG_ALTERNATIVE;
      else if (*p != '-' && *p != '0')
        break;
    }

  *flags_ptr = seen;
  return p - format;
}
1216 
1217 /* Check that the printf format conversion specifier *FORMAT is valid
1218    and compatible with FLAGS.  Change it to 'd' if it is 'u',
1219    since the format will be used with a signed value.  */
1220 static void
check_format_conv_type(char * format,int flags)1221 check_format_conv_type (char *format, int flags)
1222 {
1223   unsigned char ch = *format;
1224   int compatible_flags = FLAG_THOUSANDS;
1225 
1226   switch (ch)
1227     {
1228     case 'd':
1229     case 'i':
1230       break;
1231 
1232     case 'u':
1233       *format = 'd';
1234       break;
1235 
1236     case 'o':
1237     case 'x':
1238     case 'X':
1239       compatible_flags = FLAG_ALTERNATIVE;
1240       break;
1241 
1242     case 0:
1243       error (EXIT_FAILURE, 0, _("missing conversion specifier in suffix"));
1244 
1245     default:
1246       if (isprint (ch))
1247         error (EXIT_FAILURE, 0,
1248                _("invalid conversion specifier in suffix: %c"), ch);
1249       else
1250         error (EXIT_FAILURE, 0,
1251                _("invalid conversion specifier in suffix: \\%.3o"), ch);
1252     }
1253 
1254   if (flags & ~ compatible_flags)
1255     error (EXIT_FAILURE, 0,
1256            _("invalid flags in conversion specification: %%%c%c"),
1257            (flags & ~ compatible_flags & FLAG_ALTERNATIVE ? '#' : '\''), ch);
1258 }
1259 
1260 /* Return the maximum number of bytes that can be generated by
1261    applying FORMAT to an int value.  If the format is
1262    invalid, diagnose the problem and exit.  */
1263 static idx_t
max_out(char * format)1264 max_out (char *format)
1265 {
1266   bool percent = false;
1267 
1268   for (char *f = format; *f; f++)
1269     if (*f == '%' && *++f != '%')
1270       {
1271         if (percent)
1272           error (EXIT_FAILURE, 0,
1273                  _("too many %% conversion specifications in suffix"));
1274         percent = true;
1275         int flags;
1276         f += get_format_flags (f, &flags);
1277         while (ISDIGIT (*f))
1278           f++;
1279         if (*f == '.')
1280           while (ISDIGIT (*++f))
1281             continue;
1282         check_format_conv_type (f, flags);
1283       }
1284 
1285   if (! percent)
1286     error (EXIT_FAILURE, 0,
1287            _("missing %% conversion specification in suffix"));
1288 
1289   int maxlen = snprintf (nullptr, 0, format, INT_MAX);
1290   if (maxlen < 0)
1291     xalloc_die ();
1292   return maxlen;
1293 }
1294 
/* Program entry point.  Parse the command-line options, check that a
   FILE operand and at least one PATTERN operand remain, allocate the
   buffer used for output file names, parse the patterns, install the
   signal handler, and then call split_file.
   Return EXIT_SUCCESS on the normal path; a failure to close standard
   input afterwards is reported as a read error and handed to
   cleanup_fatal.  */
int
main (int argc, char **argv)
{
  int optc;

  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  /* Set the option-controlled globals to their defaults before the
     options are parsed.  */
  global_argv = argv;
  controls = nullptr;
  control_used = 0;
  suppress_count = false;
  remove_files = true;
  suppress_matched = false;
  prefix = DEFAULT_PREFIX;

  while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, nullptr))
         != -1)
    switch (optc)
      {
      case 'f':
        prefix = optarg;
        break;

      case 'b':
        suffix = optarg;
        break;

      case 'k':
        remove_files = false;
        break;

      case 'n':
        /* xdectoimax is given 0 and MIN (INT_MAX, IDX_MAX) as the
           accepted range and an empty set of suffixes.  */
        digits = xdectoimax (optarg, 0, MIN (INT_MAX, IDX_MAX), "",
                             _("invalid number"), 0);
        break;

      /* -s and -q are synonyms.  */
      case 's':
      case 'q':
        suppress_count = true;
        break;

      case 'z':
        elide_empty_files = true;
        break;

      case SUPPRESS_MATCHED_OPTION:
        suppress_matched = true;
        break;

      case_GETOPT_HELP_CHAR;

      case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

      default:
        usage (EXIT_FAILURE);
      }

  /* At least two operands must remain: FILE and one PATTERN.  */
  if (argc - optind < 2)
    {
      if (argc <= optind)
        error (0, 0, _("missing operand"));
      else
        error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
      usage (EXIT_FAILURE);
    }

  /* Size the file name buffer: the prefix, plus the longest numeric
     suffix, plus one more byte (presumably for the terminating NUL).
     With -b the suffix length comes from max_out, which also validates
     the format; otherwise it is the larger of the bound for an int and
     the requested number of digits.  */
  idx_t prefix_len = strlen (prefix);
  idx_t max_digit_string_len
    = (suffix
       ? max_out (suffix)
       : MAX (INT_STRLEN_BOUND (int), digits));
  idx_t filename_size;
  if (ckd_add (&filename_size, prefix_len, max_digit_string_len + 1))
    xalloc_die ();
  filename_space = ximalloc (filename_size);

  /* The first operand is FILE; the operands after it are patterns.  */
  set_input_file (argv[optind++]);

  parse_patterns (argc, optind, argv);

  {
    int i;
    static int const sig[] =
      {
        /* The usual suspects.  */
        SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
#ifdef SIGPOLL
        SIGPOLL,
#endif
#ifdef SIGPROF
        SIGPROF,
#endif
#ifdef SIGVTALRM
        SIGVTALRM,
#endif
#ifdef SIGXCPU
        SIGXCPU,
#endif
#ifdef SIGXFSZ
        SIGXFSZ,
#endif
      };
    enum { nsigs = ARRAY_CARDINALITY (sig) };

    struct sigaction act;

    /* First pass: collect into caught_signals every listed signal
       whose current disposition is not SIG_IGN.  */
    sigemptyset (&caught_signals);
    for (i = 0; i < nsigs; i++)
      {
        sigaction (sig[i], nullptr, &act);
        if (act.sa_handler != SIG_IGN)
          sigaddset (&caught_signals, sig[i]);
      }

    /* The handler runs with the whole caught set blocked.  */
    act.sa_handler = interrupt_handler;
    act.sa_mask = caught_signals;
    act.sa_flags = 0;

    /* Second pass: install the handler only for the signals collected
       above, leaving ignored signals ignored.  */
    for (i = 0; i < nsigs; i++)
      if (sigismember (&caught_signals, sig[i]))
        sigaction (sig[i], &act, nullptr);
  }

  split_file ();

  if (close (STDIN_FILENO) != 0)
    {
      error (0, errno, _("read error"));
      cleanup_fatal ();
    }

  return EXIT_SUCCESS;
}
1434 
1435 void
usage(int status)1436 usage (int status)
1437 {
1438   if (status != EXIT_SUCCESS)
1439     emit_try_help ();
1440   else
1441     {
1442       printf (_("\
1443 Usage: %s [OPTION]... FILE PATTERN...\n\
1444 "),
1445               program_name);
1446       fputs (_("\
1447 Output pieces of FILE separated by PATTERN(s) to files 'xx00', 'xx01', ...,\n\
1448 and output byte counts of each piece to standard output.\n\
1449 "), stdout);
1450        fputs (_("\
1451 \n\
1452 Read standard input if FILE is -\n\
1453 "), stdout);
1454 
1455       emit_mandatory_arg_note ();
1456 
1457       fputs (_("\
1458   -b, --suffix-format=FORMAT  use sprintf FORMAT instead of %02d\n\
1459   -f, --prefix=PREFIX        use PREFIX instead of 'xx'\n\
1460   -k, --keep-files           do not remove output files on errors\n\
1461 "), stdout);
1462       fputs (_("\
1463       --suppress-matched     suppress the lines matching PATTERN\n\
1464 "), stdout);
1465       fputs (_("\
1466   -n, --digits=DIGITS        use specified number of digits instead of 2\n\
1467   -s, --quiet, --silent      do not print counts of output file sizes\n\
1468   -z, --elide-empty-files    suppress empty output files\n\
1469 "), stdout);
1470       fputs (HELP_OPTION_DESCRIPTION, stdout);
1471       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1472       fputs (_("\
1473 \n\
1474 Each PATTERN may be:\n\
1475   INTEGER            copy up to but not including specified line number\n\
1476   /REGEXP/[OFFSET]   copy up to but not including a matching line\n\
1477   %REGEXP%[OFFSET]   skip to, but not including a matching line\n\
1478   {INTEGER}          repeat the previous pattern specified number of times\n\
1479   {*}                repeat the previous pattern as many times as possible\n\
1480 \n\
1481 A line OFFSET is an integer optionally preceded by '+' or '-'\n\
1482 "), stdout);
1483       emit_ancillary_info (PROGRAM_NAME);
1484     }
1485   exit (status);
1486 }
1487