1 /* Shuffle lines of text.
2 
3    Copyright (C) 2006-2023 Free Software Foundation, Inc.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.
17 
18    Written by Paul Eggert.  */
19 
20 #include <config.h>
21 
22 #include <sys/types.h>
23 #include "system.h"
24 
25 #include "fadvise.h"
26 #include "getopt.h"
27 #include "linebuffer.h"
28 #include "quote.h"
29 #include "randint.h"
30 #include "randperm.h"
31 #include "read-file.h"
32 #include "stdio--.h"
33 #include "xstrtol.h"
34 
35 /* The official name of this program (e.g., no 'g' prefix).  */
36 #define PROGRAM_NAME "shuf"
37 
38 #define AUTHORS proper_name ("Paul Eggert")
39 
40 /* For reservoir-sampling, allocate the reservoir lines in batches.  */
41 enum { RESERVOIR_LINES_INCREMENT = 1024 };
42 
43 /* reservoir-sampling introduces CPU overhead for small inputs.
44    So only enable it for inputs >= this limit.
45    This limit was determined using these commands:
46      $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
47      $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
48      $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done  .*/
49 enum { RESERVOIR_MIN_INPUT = 8192 * 1024 };
50 
51 
52 void
usage(int status)53 usage (int status)
54 {
55   if (status != EXIT_SUCCESS)
56     emit_try_help ();
57   else
58     {
59       printf (_("\
60 Usage: %s [OPTION]... [FILE]\n\
61   or:  %s -e [OPTION]... [ARG]...\n\
62   or:  %s -i LO-HI [OPTION]...\n\
63 "),
64               program_name, program_name, program_name);
65       fputs (_("\
66 Write a random permutation of the input lines to standard output.\n\
67 "), stdout);
68 
69       emit_stdin_note ();
70       emit_mandatory_arg_note ();
71 
72       fputs (_("\
73   -e, --echo                treat each ARG as an input line\n\
74   -i, --input-range=LO-HI   treat each number LO through HI as an input line\n\
75   -n, --head-count=COUNT    output at most COUNT lines\n\
76   -o, --output=FILE         write result to FILE instead of standard output\n\
77       --random-source=FILE  get random bytes from FILE\n\
78   -r, --repeat              output lines can be repeated\n\
79 "), stdout);
80       fputs (_("\
81   -z, --zero-terminated     line delimiter is NUL, not newline\n\
82 "), stdout);
83       fputs (HELP_OPTION_DESCRIPTION, stdout);
84       fputs (VERSION_OPTION_DESCRIPTION, stdout);
85       emit_ancillary_info (PROGRAM_NAME);
86     }
87 
88   exit (status);
89 }
90 
91 /* For long options that have no equivalent short option, use a
92    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
93 enum
94 {
95   RANDOM_SOURCE_OPTION = CHAR_MAX + 1
96 };
97 
98 static struct option const long_opts[] =
99 {
100   {"echo", no_argument, nullptr, 'e'},
101   {"input-range", required_argument, nullptr, 'i'},
102   {"head-count", required_argument, nullptr, 'n'},
103   {"output", required_argument, nullptr, 'o'},
104   {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
105   {"repeat", no_argument, nullptr, 'r'},
106   {"zero-terminated", no_argument, nullptr, 'z'},
107   {GETOPT_HELP_OPTION_DECL},
108   {GETOPT_VERSION_OPTION_DECL},
109   {0, 0, 0, 0},
110 };
111 
112 static void
input_from_argv(char ** operand,int n_operands,char eolbyte)113 input_from_argv (char **operand, int n_operands, char eolbyte)
114 {
115   char *p;
116   size_t size = n_operands;
117   int i;
118 
119   for (i = 0; i < n_operands; i++)
120     size += strlen (operand[i]);
121   p = xmalloc (size);
122 
123   for (i = 0; i < n_operands; i++)
124     {
125       char *p1 = stpcpy (p, operand[i]);
126       operand[i] = p;
127       p = p1;
128       *p++ = eolbyte;
129     }
130 
131   operand[n_operands] = p;
132 }
133 
134 /* Return the start of the next line after LINE, which is guaranteed
135    to end in EOLBYTE.  */
136 
137 static char *
next_line(char * line,char eolbyte)138 next_line (char *line, char eolbyte)
139 {
140   char *p = rawmemchr (line, eolbyte);
141   return p + 1;
142 }
143 
144 /* Return the size of the input if possible or OFF_T_MAX if not.  */
145 
146 static off_t
input_size(void)147 input_size (void)
148 {
149   off_t file_size;
150 
151   struct stat stat_buf;
152   if (fstat (STDIN_FILENO, &stat_buf) != 0)
153     return OFF_T_MAX;
154   if (usable_st_size (&stat_buf))
155     file_size = stat_buf.st_size;
156   else
157     return OFF_T_MAX;
158 
159   off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR);
160   if (input_offset < 0)
161     return OFF_T_MAX;
162 
163   file_size -= input_offset;
164 
165   return file_size;
166 }
167 
168 /* Read all lines and store up to K permuted lines in *OUT_RSRV.
169    Return the number of lines read, up to a maximum of K.  */
170 
171 static size_t
read_input_reservoir_sampling(FILE * in,char eolbyte,size_t k,struct randint_source * s,struct linebuffer ** out_rsrv)172 read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k,
173                                struct randint_source *s,
174                                struct linebuffer **out_rsrv)
175 {
176   randint n_lines = 0;
177   size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT);
178   struct linebuffer *line = nullptr;
179   struct linebuffer *rsrv;
180 
181   rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer));
182 
183   /* Fill the first K lines, directly into the reservoir.  */
184   while (n_lines < k
185          && (line =
186              readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != nullptr)
187     {
188       n_lines++;
189 
190       /* Enlarge reservoir.  */
191       if (n_lines >= n_alloc_lines)
192         {
193           n_alloc_lines += RESERVOIR_LINES_INCREMENT;
194           rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer));
195           memset (&rsrv[n_lines], 0,
196                   RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer));
197         }
198     }
199 
200   /* last line wasn't null - so there may be more lines to read.  */
201   if (line != nullptr)
202     {
203       struct linebuffer dummy;
204       initbuffer (&dummy);  /* space for lines not put in reservoir.  */
205 
206       /* Choose the fate of the next line, with decreasing probability (as
207          n_lines increases in size).
208 
209          If the line will be used, store it directly in the reservoir.
210          Otherwise, store it in dummy space.
211 
212          With 'struct linebuffer', storing into existing buffer will reduce
213          re-allocations (will only re-allocate if the new line is longer than
214          the currently allocated space).  */
215       do
216         {
217           randint j = randint_choose (s, n_lines + 1);  /* 0 .. n_lines.  */
218           line = (j < k) ? (&rsrv[j]) : (&dummy);
219         }
220       while (readlinebuffer_delim (line, in, eolbyte) != nullptr && n_lines++);
221 
222       if (! n_lines)
223         error (EXIT_FAILURE, EOVERFLOW, _("too many input lines"));
224 
225       freebuffer (&dummy);
226     }
227 
228   /* no more input lines, or an input error.  */
229   if (ferror (in))
230     error (EXIT_FAILURE, errno, _("read error"));
231 
232   *out_rsrv = rsrv;
233   return MIN (k, n_lines);
234 }
235 
236 static int
write_permuted_output_reservoir(size_t n_lines,struct linebuffer * lines,size_t const * permutation)237 write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines,
238                                  size_t const *permutation)
239 {
240   for (size_t i = 0; i < n_lines; i++)
241     {
242       const struct linebuffer *p = &lines[permutation[i]];
243       if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length)
244         return -1;
245     }
246 
247   return 0;
248 }
249 
250 /* Read data from file IN.  Input lines are delimited by EOLBYTE;
251    silently append a trailing EOLBYTE if the file ends in some other
252    byte.  Store a pointer to the resulting array of lines into *PLINE.
253    Return the number of lines read.  Report an error and exit on
254    failure.  */
255 
256 static size_t
read_input(FILE * in,char eolbyte,char *** pline)257 read_input (FILE *in, char eolbyte, char ***pline)
258 {
259   char *p;
260   char *buf = nullptr;
261   size_t used;
262   char *lim;
263   char **line;
264   size_t n_lines;
265 
266   /* TODO: We should limit the amount of data read here,
267      to less than RESERVOIR_MIN_INPUT.  I.e., adjust fread_file() to support
268      taking a byte limit.  We'd then need to ensure we handle a line spanning
269      this boundary.  With that in place we could set use_reservoir_sampling
270      when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
271      call a wrapper function to populate a linebuffer from the internal pline
272      or if none left, stdin.  Doing that would give better performance by
273      avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
274      from a pipe, and allow us to dispense with the input_size() function.  */
275   if (!(buf = fread_file (in, 0, &used)))
276     error (EXIT_FAILURE, errno, _("read error"));
277 
278   if (used && buf[used - 1] != eolbyte)
279     buf[used++] = eolbyte;
280 
281   lim = buf + used;
282 
283   n_lines = 0;
284   for (p = buf; p < lim; p = next_line (p, eolbyte))
285     n_lines++;
286 
287   *pline = line = xnmalloc (n_lines + 1, sizeof *line);
288 
289   line[0] = p = buf;
290   for (size_t i = 1; i <= n_lines; i++)
291     line[i] = p = next_line (p, eolbyte);
292 
293   return n_lines;
294 }
295 
296 /* Output N_LINES lines to stdout from LINE array,
297    chosen by the indices in PERMUTATION.
298    PERMUTATION and LINE must have at least N_LINES elements.
299    Strings in LINE must include the line-terminator character.  */
300 static int
write_permuted_lines(size_t n_lines,char * const * line,size_t const * permutation)301 write_permuted_lines (size_t n_lines, char *const *line,
302                       size_t const *permutation)
303 {
304   for (size_t i = 0; i < n_lines; i++)
305     {
306       char *const *p = line + permutation[i];
307       size_t len = p[1] - p[0];
308       if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
309         return -1;
310     }
311 
312   return 0;
313 }
314 
315 /* Output N_LINES of numbers to stdout, from PERMUTATION array.
316    PERMUTATION must have at least N_LINES elements.  */
317 static int
write_permuted_numbers(size_t n_lines,size_t lo_input,size_t const * permutation,char eolbyte)318 write_permuted_numbers (size_t n_lines, size_t lo_input,
319                         size_t const *permutation, char eolbyte)
320 {
321   for (size_t i = 0; i < n_lines; i++)
322     {
323       unsigned long int n = lo_input + permutation[i];
324       if (printf ("%lu%c", n, eolbyte) < 0)
325         return -1;
326     }
327 
328   return 0;
329 }
330 
331 /* Output COUNT numbers to stdout, chosen randomly from range
332    LO_INPUT through HI_INPUT.  */
333 static int
write_random_numbers(struct randint_source * s,size_t count,size_t lo_input,size_t hi_input,char eolbyte)334 write_random_numbers (struct randint_source *s, size_t count,
335                       size_t lo_input, size_t hi_input, char eolbyte)
336 {
337   const randint range = hi_input - lo_input + 1;
338 
339   for (size_t i = 0; i < count; i++)
340     {
341       unsigned long int j = lo_input + randint_choose (s, range);
342       if (printf ("%lu%c", j, eolbyte) < 0)
343         return -1;
344     }
345 
346   return 0;
347 }
348 
349 /* Output COUNT lines to stdout from LINES array.
350    LINES must have at least N_LINES elements in it.
351    Strings in LINES_ must include the line-terminator character.  */
352 static int
write_random_lines(struct randint_source * s,size_t count,char * const * lines,size_t n_lines)353 write_random_lines (struct randint_source *s, size_t count,
354                     char *const *lines, size_t n_lines)
355 {
356   for (size_t i = 0; i < count; i++)
357     {
358       const randint j = randint_choose (s, n_lines);
359       char *const *p = lines + j;
360       size_t len = p[1] - p[0];
361       if (fwrite (p[0], sizeof *p[0], len, stdout) != len)
362         return -1;
363     }
364 
365   return 0;
366 }
367 
368 int
main(int argc,char ** argv)369 main (int argc, char **argv)
370 {
371   bool echo = false;
372   bool input_range = false;
373   size_t lo_input = SIZE_MAX;
374   size_t hi_input = 0;
375   size_t head_lines = SIZE_MAX;
376   char const *outfile = nullptr;
377   char *random_source = nullptr;
378   char eolbyte = '\n';
379   char **input_lines = nullptr;
380   bool use_reservoir_sampling = false;
381   bool repeat = false;
382 
383   int optc;
384   int n_operands;
385   char **operand;
386   size_t n_lines;
387   char **line = nullptr;
388   struct linebuffer *reservoir = nullptr;
389   struct randint_source *randint_source;
390   size_t *permutation = nullptr;
391   int i;
392 
393   initialize_main (&argc, &argv);
394   set_program_name (argv[0]);
395   setlocale (LC_ALL, "");
396   bindtextdomain (PACKAGE, LOCALEDIR);
397   textdomain (PACKAGE);
398 
399   atexit (close_stdout);
400 
401   while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, nullptr))
402          != -1)
403     switch (optc)
404       {
405       case 'e':
406         echo = true;
407         break;
408 
409       case 'i':
410         {
411           if (input_range)
412             error (EXIT_FAILURE, 0, _("multiple -i options specified"));
413           input_range = true;
414 
415           uintmax_t u;
416           char *lo_end;
417           strtol_error err = xstrtoumax (optarg, &lo_end, 10, &u, nullptr);
418           if (err == LONGINT_OK)
419             {
420               lo_input = u;
421               if (lo_input != u)
422                 err = LONGINT_OVERFLOW;
423               else if (*lo_end != '-')
424                 err = LONGINT_INVALID;
425               else
426                 {
427                   err = xstrtoumax (lo_end + 1, nullptr, 10, &u, "");
428                   if (err == LONGINT_OK)
429                     {
430                       hi_input = u;
431                       if (hi_input != u)
432                         err = LONGINT_OVERFLOW;
433                     }
434                 }
435             }
436 
437           n_lines = hi_input - lo_input + 1;
438 
439           if (err != LONGINT_OK || (lo_input <= hi_input) == (n_lines == 0))
440             error (EXIT_FAILURE, err == LONGINT_OVERFLOW ? EOVERFLOW : 0,
441                    "%s: %s", _("invalid input range"), quote (optarg));
442         }
443         break;
444 
445       case 'n':
446         {
447           uintmax_t argval;
448           strtol_error e = xstrtoumax (optarg, nullptr, 10, &argval, "");
449 
450           if (e == LONGINT_OK)
451             head_lines = MIN (head_lines, argval);
452           else if (e != LONGINT_OVERFLOW)
453             error (EXIT_FAILURE, 0, _("invalid line count: %s"),
454                    quote (optarg));
455         }
456         break;
457 
458       case 'o':
459         if (outfile && !STREQ (outfile, optarg))
460           error (EXIT_FAILURE, 0, _("multiple output files specified"));
461         outfile = optarg;
462         break;
463 
464       case RANDOM_SOURCE_OPTION:
465         if (random_source && !STREQ (random_source, optarg))
466           error (EXIT_FAILURE, 0, _("multiple random sources specified"));
467         random_source = optarg;
468         break;
469 
470       case 'r':
471         repeat = true;
472         break;
473 
474       case 'z':
475         eolbyte = '\0';
476         break;
477 
478       case_GETOPT_HELP_CHAR;
479       case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
480       default:
481         usage (EXIT_FAILURE);
482       }
483 
484   n_operands = argc - optind;
485   operand = argv + optind;
486 
487   /* Check invalid usage.  */
488   if (echo && input_range)
489     {
490       error (0, 0, _("cannot combine -e and -i options"));
491       usage (EXIT_FAILURE);
492     }
493   if (input_range ? 0 < n_operands : !echo && 1 < n_operands)
494     {
495       error (0, 0, _("extra operand %s"), quote (operand[!input_range]));
496       usage (EXIT_FAILURE);
497     }
498 
499   /* Prepare input.  */
500   if (head_lines == 0)
501     {
502       n_lines = 0;
503       line = nullptr;
504     }
505   else if (echo)
506     {
507       input_from_argv (operand, n_operands, eolbyte);
508       n_lines = n_operands;
509       line = operand;
510     }
511   else if (input_range)
512     {
513       n_lines = hi_input - lo_input + 1;
514       line = nullptr;
515     }
516   else
517     {
518       /* If an input file is specified, re-open it as stdin.  */
519       if (n_operands == 1
520           && ! (STREQ (operand[0], "-")
521                 || freopen (operand[0], "r", stdin)))
522         error (EXIT_FAILURE, errno, "%s", quotef (operand[0]));
523 
524       fadvise (stdin, FADVISE_SEQUENTIAL);
525 
526       if (repeat || head_lines == SIZE_MAX
527           || input_size () <= RESERVOIR_MIN_INPUT)
528         {
529           n_lines = read_input (stdin, eolbyte, &input_lines);
530           line = input_lines;
531         }
532       else
533         {
534           use_reservoir_sampling = true;
535           n_lines = SIZE_MAX;   /* unknown number of input lines, for now.  */
536         }
537     }
538 
539   /* The adjusted head line count; can be less than HEAD_LINES if the
540      input is small and if not repeating.  */
541   size_t ahead_lines = repeat || head_lines < n_lines ? head_lines : n_lines;
542 
543   randint_source = randint_all_new (random_source,
544                                     (use_reservoir_sampling || repeat
545                                      ? SIZE_MAX
546                                      : randperm_bound (ahead_lines, n_lines)));
547   if (! randint_source)
548     error (EXIT_FAILURE, errno, "%s",
549            quotef (random_source ? random_source : "getrandom"));
550 
551   if (use_reservoir_sampling)
552     {
553       /* Instead of reading the entire file into 'line',
554          use reservoir-sampling to store just AHEAD_LINES random lines.  */
555       n_lines = read_input_reservoir_sampling (stdin, eolbyte, ahead_lines,
556                                                randint_source, &reservoir);
557       ahead_lines = n_lines;
558     }
559 
560   /* Close stdin now, rather than earlier, so that randint_all_new
561      doesn't have to worry about opening something other than
562      stdin.  */
563   if (! (head_lines == 0 || echo || input_range || fclose (stdin) == 0))
564     error (EXIT_FAILURE, errno, _("read error"));
565 
566   if (!repeat)
567     permutation = randperm_new (randint_source, ahead_lines, n_lines);
568 
569   if (outfile && ! freopen (outfile, "w", stdout))
570     error (EXIT_FAILURE, errno, "%s", quotef (outfile));
571 
572   /* Generate output according to requested method */
573   if (repeat)
574     {
575       if (head_lines == 0)
576         i = 0;
577       else
578         {
579           if (n_lines == 0)
580             error (EXIT_FAILURE, 0, _("no lines to repeat"));
581           if (input_range)
582             i = write_random_numbers (randint_source, ahead_lines,
583                                       lo_input, hi_input, eolbyte);
584           else
585             i = write_random_lines (randint_source, ahead_lines, line, n_lines);
586         }
587     }
588   else
589     {
590       if (use_reservoir_sampling)
591         i = write_permuted_output_reservoir (n_lines, reservoir, permutation);
592       else if (input_range)
593         i = write_permuted_numbers (ahead_lines, lo_input,
594                                     permutation, eolbyte);
595       else
596         i = write_permuted_lines (ahead_lines, line, permutation);
597     }
598 
599   if (i != 0)
600     write_error ();
601 
602   main_exit (EXIT_SUCCESS);
603 }
604