1 /* uniq -- remove duplicate lines from a sorted file
2    Copyright (C) 1986-2023 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 /* Written by Richard M. Stallman and David MacKenzie. */
18 
19 #include <config.h>
20 
21 #include <getopt.h>
22 #include <sys/types.h>
23 
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "fadvise.h"
28 #include "posixver.h"
29 #include "stdio--.h"
30 #include "xstrtol.h"
31 #include "memcasecmp.h"
32 #include "quote.h"
33 
34 /* The official name of this program (e.g., no 'g' prefix).  */
35 #define PROGRAM_NAME "uniq"
36 
37 #define AUTHORS \
38   proper_name ("Richard M. Stallman"), \
39   proper_name ("David MacKenzie")
40 
41 #define SWAP_LINES(A, B)			\
42   do						\
43     {						\
44       struct linebuffer *_tmp;			\
45       _tmp = (A);				\
46       (A) = (B);				\
47       (B) = _tmp;				\
48     }						\
49   while (0)
50 
51 /* Number of fields to skip on each line when doing comparisons. */
52 static size_t skip_fields;
53 
54 /* Number of chars to skip after skipping any fields. */
55 static size_t skip_chars;
56 
57 /* Number of chars to compare. */
58 static size_t check_chars;
59 
60 enum countmode
61 {
62   count_occurrences,		/* -c Print count before output lines. */
63   count_none			/* Default.  Do not print counts. */
64 };
65 
66 /* Whether and how to precede the output lines with a count of the number of
67    times they occurred in the input. */
68 static enum countmode countmode;
69 
70 /* Which lines to output: unique lines, the first of a group of
71    repeated lines, and the second and subsequent of a group of
72    repeated lines.  */
73 static bool output_unique;
74 static bool output_first_repeated;
75 static bool output_later_repeated;
76 
77 /* If true, ignore case when comparing.  */
78 static bool ignore_case;
79 
80 enum delimit_method
81 {
82   /* No delimiters output.  --all-repeated[=none] */
83   DM_NONE,
84 
85   /* Delimiter precedes all groups.  --all-repeated=prepend */
86   DM_PREPEND,
87 
88   /* Delimit all groups.  --all-repeated=separate */
89   DM_SEPARATE
90 };
91 
92 static char const *const delimit_method_string[] =
93 {
94   "none", "prepend", "separate", nullptr
95 };
96 
97 static enum delimit_method const delimit_method_map[] =
98 {
99   DM_NONE, DM_PREPEND, DM_SEPARATE
100 };
101 
102 /* Select whether/how to delimit groups of duplicate lines.  */
103 static enum delimit_method delimit_groups;
104 
105 enum grouping_method
106 {
107   /* No grouping, when "--group" isn't used */
108   GM_NONE,
109 
110   /* Delimiter precedes all groups.  --group=prepend */
111   GM_PREPEND,
112 
113   /* Delimiter follows all groups.   --group=append */
114   GM_APPEND,
115 
116   /* Delimiter between groups.    --group[=separate] */
117   GM_SEPARATE,
118 
119   /* Delimiter before and after each group. --group=both */
120   GM_BOTH
121 };
122 
123 static char const *const grouping_method_string[] =
124 {
125   "prepend", "append", "separate", "both", nullptr
126 };
127 
128 static enum grouping_method const grouping_method_map[] =
129 {
130   GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
131 };
132 
133 static enum grouping_method grouping = GM_NONE;
134 
135 enum
136 {
137   GROUP_OPTION = CHAR_MAX + 1
138 };
139 
140 static struct option const longopts[] =
141 {
142   {"count", no_argument, nullptr, 'c'},
143   {"repeated", no_argument, nullptr, 'd'},
144   {"all-repeated", optional_argument, nullptr, 'D'},
145   {"group", optional_argument, nullptr, GROUP_OPTION},
146   {"ignore-case", no_argument, nullptr, 'i'},
147   {"unique", no_argument, nullptr, 'u'},
148   {"skip-fields", required_argument, nullptr, 'f'},
149   {"skip-chars", required_argument, nullptr, 's'},
150   {"check-chars", required_argument, nullptr, 'w'},
151   {"zero-terminated", no_argument, nullptr, 'z'},
152   {GETOPT_HELP_OPTION_DECL},
153   {GETOPT_VERSION_OPTION_DECL},
154   {nullptr, 0, nullptr, 0}
155 };
156 
157 void
usage(int status)158 usage (int status)
159 {
160   if (status != EXIT_SUCCESS)
161     emit_try_help ();
162   else
163     {
164       printf (_("\
165 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
166 "),
167               program_name);
168       fputs (_("\
169 Filter adjacent matching lines from INPUT (or standard input),\n\
170 writing to OUTPUT (or standard output).\n\
171 \n\
172 With no options, matching lines are merged to the first occurrence.\n\
173 "), stdout);
174 
175       emit_mandatory_arg_note ();
176 
177      fputs (_("\
178   -c, --count           prefix lines by the number of occurrences\n\
179   -d, --repeated        only print duplicate lines, one for each group\n\
180 "), stdout);
181      fputs (_("\
182   -D                    print all duplicate lines\n\
183       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
184                                  with an empty line;\n\
185                                  METHOD={none(default),prepend,separate}\n\
186 "), stdout);
187      fputs (_("\
188   -f, --skip-fields=N   avoid comparing the first N fields\n\
189 "), stdout);
190      fputs (_("\
191       --group[=METHOD]  show all items, separating groups with an empty line;\n\
192                           METHOD={separate(default),prepend,append,both}\n\
193 "), stdout);
194      fputs (_("\
195   -i, --ignore-case     ignore differences in case when comparing\n\
196   -s, --skip-chars=N    avoid comparing the first N characters\n\
197   -u, --unique          only print unique lines\n\
198 "), stdout);
199       fputs (_("\
200   -z, --zero-terminated     line delimiter is NUL, not newline\n\
201 "), stdout);
202      fputs (_("\
203   -w, --check-chars=N   compare no more than N characters in lines\n\
204 "), stdout);
205      fputs (HELP_OPTION_DESCRIPTION, stdout);
206      fputs (VERSION_OPTION_DESCRIPTION, stdout);
207      fputs (_("\
208 \n\
209 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
210 characters.  Fields are skipped before chars.\n\
211 "), stdout);
212      fputs (_("\
213 \n\
214 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
215 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
216 "), stdout);
217       emit_ancillary_info (PROGRAM_NAME);
218     }
219   exit (status);
220 }
221 
222 static bool
strict_posix2(void)223 strict_posix2 (void)
224 {
225   int posix_ver = posix2_version ();
226   return 200112 <= posix_ver && posix_ver < 200809;
227 }
228 
229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
230    invalid.  Silently convert too-large values to SIZE_MAX.  */
231 
232 static size_t
size_opt(char const * opt,char const * msgid)233 size_opt (char const *opt, char const *msgid)
234 {
235   uintmax_t size;
236 
237   switch (xstrtoumax (opt, nullptr, 10, &size, ""))
238     {
239     case LONGINT_OK:
240     case LONGINT_OVERFLOW:
241       break;
242 
243     default:
244       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
245     }
246 
247   return MIN (size, SIZE_MAX);
248 }
249 
250 /* Given a linebuffer LINE,
251    return a pointer to the beginning of the line's field to be compared. */
252 
253 ATTRIBUTE_PURE
254 static char *
find_field(struct linebuffer const * line)255 find_field (struct linebuffer const *line)
256 {
257   size_t count;
258   char const *lp = line->buffer;
259   size_t size = line->length - 1;
260   size_t i = 0;
261 
262   for (count = 0; count < skip_fields && i < size; count++)
263     {
264       while (i < size && field_sep (lp[i]))
265         i++;
266       while (i < size && !field_sep (lp[i]))
267         i++;
268     }
269 
270   i += MIN (skip_chars, size - i);
271 
272   return line->buffer + i;
273 }
274 
275 /* Return false if two strings OLD and NEW match, true if not.
276    OLD and NEW point not to the beginnings of the lines
277    but rather to the beginnings of the fields to compare.
278    OLDLEN and NEWLEN are their lengths. */
279 
280 static bool
different(char * old,char * new,size_t oldlen,size_t newlen)281 different (char *old, char *new, size_t oldlen, size_t newlen)
282 {
283   if (check_chars < oldlen)
284     oldlen = check_chars;
285   if (check_chars < newlen)
286     newlen = check_chars;
287 
288   if (ignore_case)
289     return oldlen != newlen || memcasecmp (old, new, oldlen);
290   else
291     return oldlen != newlen || memcmp (old, new, oldlen);
292 }
293 
294 /* Output the line in linebuffer LINE to standard output
295    provided that the switches say it should be output.
296    MATCH is true if the line matches the previous line.
297    If requested, print the number of times it occurred, as well;
298    LINECOUNT + 1 is the number of times that the line occurred. */
299 
300 static void
writeline(struct linebuffer const * line,bool match,uintmax_t linecount)301 writeline (struct linebuffer const *line,
302            bool match, uintmax_t linecount)
303 {
304   if (! (linecount == 0 ? output_unique
305          : !match ? output_first_repeated
306          : output_later_repeated))
307     return;
308 
309   if (countmode == count_occurrences)
310     printf ("%7ju ", linecount + 1);
311 
312   if (fwrite (line->buffer, sizeof (char), line->length, stdout)
313       != line->length)
314     write_error ();
315 }
316 
317 /* Process input file INFILE with output to OUTFILE.
318    If either is "-", use the standard I/O stream for it instead. */
319 
320 static void
check_file(char const * infile,char const * outfile,char delimiter)321 check_file (char const *infile, char const *outfile, char delimiter)
322 {
323   struct linebuffer lb1, lb2;
324   struct linebuffer *thisline, *prevline;
325 
326   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
327     error (EXIT_FAILURE, errno, "%s", quotef (infile));
328   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
329     error (EXIT_FAILURE, errno, "%s", quotef (outfile));
330 
331   fadvise (stdin, FADVISE_SEQUENTIAL);
332 
333   thisline = &lb1;
334   prevline = &lb2;
335 
336   initbuffer (thisline);
337   initbuffer (prevline);
338 
339   /* The duplication in the following 'if' and 'else' blocks is an
340      optimization to distinguish between when we can print input
341      lines immediately (1. & 2.) or not.
342 
343      1. --group => all input lines are printed.
344         checking for unique/duplicated lines is used only for printing
345         group separators.
346 
347      2. The default case in which none of these options has been specified:
348           --count, --repeated,  --all-repeated, --unique
349         In the default case, this optimization lets uniq output each different
350         line right away, without waiting to see if the next one is different.
351 
352      3. All other cases.
353   */
354   if (output_unique && output_first_repeated && countmode == count_none)
355     {
356       char *prevfield = nullptr;
357       size_t prevlen;
358       bool first_group_printed = false;
359 
360       while (!feof (stdin))
361         {
362           char *thisfield;
363           size_t thislen;
364           bool new_group;
365 
366           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
367             break;
368 
369           thisfield = find_field (thisline);
370           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
371 
372           new_group = (!prevfield
373                        || different (thisfield, prevfield, thislen, prevlen));
374 
375           if (new_group && grouping != GM_NONE
376               && (grouping == GM_PREPEND || grouping == GM_BOTH
377                   || (first_group_printed && (grouping == GM_APPEND
378                                               || grouping == GM_SEPARATE))))
379             putchar (delimiter);
380 
381           if (new_group || grouping != GM_NONE)
382             {
383               if (fwrite (thisline->buffer, sizeof (char), thisline->length,
384                   stdout) != thisline->length)
385                 write_error ();
386 
387               SWAP_LINES (prevline, thisline);
388               prevfield = thisfield;
389               prevlen = thislen;
390               first_group_printed = true;
391             }
392         }
393       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
394         putchar (delimiter);
395     }
396   else
397     {
398       char *prevfield;
399       size_t prevlen;
400       uintmax_t match_count = 0;
401       bool first_delimiter = true;
402 
403       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
404         goto closefiles;
405       prevfield = find_field (prevline);
406       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
407 
408       while (!feof (stdin))
409         {
410           bool match;
411           char *thisfield;
412           size_t thislen;
413           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
414             {
415               if (ferror (stdin))
416                 goto closefiles;
417               break;
418             }
419           thisfield = find_field (thisline);
420           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
421           match = !different (thisfield, prevfield, thislen, prevlen);
422           match_count += match;
423 
424           if (match_count == UINTMAX_MAX)
425             {
426               if (count_occurrences)
427                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
428               match_count--;
429             }
430 
431           if (delimit_groups != DM_NONE)
432             {
433               if (!match)
434                 {
435                   if (match_count) /* a previous match */
436                     first_delimiter = false; /* Only used when DM_SEPARATE */
437                 }
438               else if (match_count == 1)
439                 {
440                   if ((delimit_groups == DM_PREPEND)
441                       || (delimit_groups == DM_SEPARATE
442                           && !first_delimiter))
443                     putchar (delimiter);
444                 }
445             }
446 
447           if (!match || output_later_repeated)
448             {
449               writeline (prevline, match, match_count);
450               SWAP_LINES (prevline, thisline);
451               prevfield = thisfield;
452               prevlen = thislen;
453               if (!match)
454                 match_count = 0;
455             }
456         }
457 
458       writeline (prevline, false, match_count);
459     }
460 
461  closefiles:
462   if (ferror (stdin) || fclose (stdin) != 0)
463     error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
464 
465   /* stdout is handled via the atexit-invoked close_stdout function.  */
466 
467   free (lb1.buffer);
468   free (lb2.buffer);
469 }
470 
471 enum Skip_field_option_type
472   {
473     SFO_NONE,
474     SFO_OBSOLETE,
475     SFO_NEW
476   };
477 
478 int
main(int argc,char ** argv)479 main (int argc, char **argv)
480 {
481   int optc = 0;
482   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
483   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
484   unsigned int nfiles = 0;
485   char const *file[2];
486   char delimiter = '\n';	/* change with --zero-terminated, -z */
487   bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
488 
489   file[0] = file[1] = "-";
490   initialize_main (&argc, &argv);
491   set_program_name (argv[0]);
492   setlocale (LC_ALL, "");
493   bindtextdomain (PACKAGE, LOCALEDIR);
494   textdomain (PACKAGE);
495 
496   atexit (close_stdout);
497 
498   skip_chars = 0;
499   skip_fields = 0;
500   check_chars = SIZE_MAX;
501   output_unique = output_first_repeated = true;
502   output_later_repeated = false;
503   countmode = count_none;
504   delimit_groups = DM_NONE;
505 
506   while (true)
507     {
508       /* Parse an operand with leading "+" as a file after "--" was
509          seen; or if pedantic and a file was seen; or if not
510          obsolete.  */
511 
512       if (optc == -1
513           || (posixly_correct && nfiles != 0)
514           || ((optc = getopt_long (argc, argv,
515                                    "-0123456789Dcdf:is:uw:z",
516                                    longopts, nullptr))
517               == -1))
518         {
519           if (argc <= optind)
520             break;
521           if (nfiles == 2)
522             {
523               error (0, 0, _("extra operand %s"), quote (argv[optind]));
524               usage (EXIT_FAILURE);
525             }
526           file[nfiles++] = argv[optind++];
527         }
528       else switch (optc)
529         {
530         case 1:
531           {
532             uintmax_t size;
533             if (optarg[0] == '+'
534                 && ! strict_posix2 ()
535                 && xstrtoumax (optarg, nullptr, 10, &size, "") == LONGINT_OK
536                 && size <= SIZE_MAX)
537               skip_chars = size;
538             else if (nfiles == 2)
539               {
540                 error (0, 0, _("extra operand %s"), quote (optarg));
541                 usage (EXIT_FAILURE);
542               }
543             else
544               file[nfiles++] = optarg;
545           }
546           break;
547 
548         case '0':
549         case '1':
550         case '2':
551         case '3':
552         case '4':
553         case '5':
554         case '6':
555         case '7':
556         case '8':
557         case '9':
558           {
559             if (skip_field_option_type == SFO_NEW)
560               skip_fields = 0;
561 
562             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
563               skip_fields = SIZE_MAX;
564 
565             skip_field_option_type = SFO_OBSOLETE;
566           }
567           break;
568 
569         case 'c':
570           countmode = count_occurrences;
571           output_option_used = true;
572           break;
573 
574         case 'd':
575           output_unique = false;
576           output_option_used = true;
577           break;
578 
579         case 'D':
580           output_unique = false;
581           output_later_repeated = true;
582           if (optarg == nullptr)
583             delimit_groups = DM_NONE;
584           else
585             delimit_groups = XARGMATCH ("--all-repeated", optarg,
586                                         delimit_method_string,
587                                         delimit_method_map);
588           output_option_used = true;
589           break;
590 
591         case GROUP_OPTION:
592           if (optarg == nullptr)
593             grouping = GM_SEPARATE;
594           else
595             grouping = XARGMATCH ("--group", optarg,
596                                   grouping_method_string,
597                                   grouping_method_map);
598           break;
599 
600         case 'f':
601           skip_field_option_type = SFO_NEW;
602           skip_fields = size_opt (optarg,
603                                   N_("invalid number of fields to skip"));
604           break;
605 
606         case 'i':
607           ignore_case = true;
608           break;
609 
610         case 's':
611           skip_chars = size_opt (optarg,
612                                  N_("invalid number of bytes to skip"));
613           break;
614 
615         case 'u':
616           output_first_repeated = false;
617           output_option_used = true;
618           break;
619 
620         case 'w':
621           check_chars = size_opt (optarg,
622                                   N_("invalid number of bytes to compare"));
623           break;
624 
625         case 'z':
626           delimiter = '\0';
627           break;
628 
629         case_GETOPT_HELP_CHAR;
630 
631         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
632 
633         default:
634           usage (EXIT_FAILURE);
635         }
636     }
637 
638   /* Note we could allow --group with -D at least, and that would
639      avoid the need to specify a grouping method to --all-repeated.
640      It was thought best to avoid deprecating those parameters though
641      and keep --group separate to other options.  */
642   if (grouping != GM_NONE && output_option_used)
643     {
644       error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
645       usage (EXIT_FAILURE);
646     }
647 
648   if (grouping != GM_NONE && countmode != count_none)
649     {
650       error (0, 0,
651            _("grouping and printing repeat counts is meaningless"));
652       usage (EXIT_FAILURE);
653     }
654 
655   if (countmode == count_occurrences && output_later_repeated)
656     {
657       error (0, 0,
658            _("printing all duplicated lines and repeat counts is meaningless"));
659       usage (EXIT_FAILURE);
660     }
661 
662   check_file (file[0], file[1], delimiter);
663 
664   return EXIT_SUCCESS;
665 }
666