1 /* comm -- compare two sorted files line by line.
2    Copyright (C) 1986-2023 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 /* Written by Richard Stallman and David MacKenzie. */
18 
19 #include <config.h>
20 
21 #include <getopt.h>
22 #include <sys/types.h>
23 #include "system.h"
24 #include "linebuffer.h"
25 #include "fadvise.h"
26 #include "hard-locale.h"
27 #include "quote.h"
28 #include "stdio--.h"
29 #include "memcmp2.h"
30 #include "xmemcoll.h"
31 
32 /* The official name of this program (e.g., no 'g' prefix).  */
33 #define PROGRAM_NAME "comm"
34 
35 #define AUTHORS \
36   proper_name ("Richard M. Stallman"), \
37   proper_name ("David MacKenzie")
38 
39 /* Undefine, to avoid warning about redefinition on some systems.  */
40 #undef min
41 #define min(x, y) ((x) < (y) ? (x) : (y))
42 
43 /* True if the LC_COLLATE locale is hard.  */
44 static bool hard_LC_COLLATE;
45 
46 /* If true, print lines that are found only in file 1. */
47 static bool only_file_1;
48 
49 /* If true, print lines that are found only in file 2. */
50 static bool only_file_2;
51 
52 /* If true, print lines that are found in both files. */
53 static bool both;
54 
55 /* If nonzero, we have seen at least one unpairable line. */
56 static bool seen_unpairable;
57 
58 /* If nonzero, we have warned about disorder in that file. */
59 static bool issued_disorder_warning[2];
60 
61 /* line delimiter.  */
62 static unsigned char delim = '\n';
63 
64 /* If true, print a summary.  */
65 static bool total_option;
66 
67 /* If nonzero, check that the input is correctly ordered. */
68 static enum
69   {
70     CHECK_ORDER_DEFAULT,
71     CHECK_ORDER_ENABLED,
72     CHECK_ORDER_DISABLED
73   } check_input_order;
74 
75 /* Output columns will be delimited with this string, which may be set
76    on the command-line with --output-delimiter=STR.  */
77 static char const *col_sep = "\t";
78 static size_t col_sep_len = 0;
79 
80 /* For long options that have no equivalent short option, use a
81    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
82 enum
83 {
84   CHECK_ORDER_OPTION = CHAR_MAX + 1,
85   NOCHECK_ORDER_OPTION,
86   OUTPUT_DELIMITER_OPTION,
87   TOTAL_OPTION
88 };
89 
90 static struct option const long_options[] =
91 {
92   {"check-order", no_argument, nullptr, CHECK_ORDER_OPTION},
93   {"nocheck-order", no_argument, nullptr, NOCHECK_ORDER_OPTION},
94   {"output-delimiter", required_argument, nullptr, OUTPUT_DELIMITER_OPTION},
95   {"total", no_argument, nullptr, TOTAL_OPTION},
96   {"zero-terminated", no_argument, nullptr, 'z'},
97   {GETOPT_HELP_OPTION_DECL},
98   {GETOPT_VERSION_OPTION_DECL},
99   {nullptr, 0, nullptr, 0}
100 };
101 
102 
103 void
usage(int status)104 usage (int status)
105 {
106   if (status != EXIT_SUCCESS)
107     emit_try_help ();
108   else
109     {
110       printf (_("\
111 Usage: %s [OPTION]... FILE1 FILE2\n\
112 "),
113               program_name);
114       fputs (_("\
115 Compare sorted files FILE1 and FILE2 line by line.\n\
116 "), stdout);
117       fputs (_("\
118 \n\
119 When FILE1 or FILE2 (not both) is -, read standard input.\n\
120 "), stdout);
121       fputs (_("\
122 \n\
123 With no options, produce three-column output.  Column one contains\n\
124 lines unique to FILE1, column two contains lines unique to FILE2,\n\
125 and column three contains lines common to both files.\n\
126 "), stdout);
127       fputs (_("\
128 \n\
129   -1                      suppress column 1 (lines unique to FILE1)\n\
130   -2                      suppress column 2 (lines unique to FILE2)\n\
131   -3                      suppress column 3 (lines that appear in both files)\n\
132 "), stdout);
133       fputs (_("\
134 \n\
135       --check-order       check that the input is correctly sorted, even\n\
136                             if all input lines are pairable\n\
137       --nocheck-order     do not check that the input is correctly sorted\n\
138 "), stdout);
139       fputs (_("\
140       --output-delimiter=STR  separate columns with STR\n\
141 "), stdout);
142       fputs (_("\
143       --total             output a summary\n\
144 "), stdout);
145       fputs (_("\
146   -z, --zero-terminated   line delimiter is NUL, not newline\n\
147 "), stdout);
148       fputs (HELP_OPTION_DESCRIPTION, stdout);
149       fputs (VERSION_OPTION_DESCRIPTION, stdout);
150       fputs (_("\
151 \n\
152 Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\
153 "), stdout);
154       printf (_("\
155 \n\
156 Examples:\n\
157   %s -12 file1 file2  Print only lines present in both file1 and file2.\n\
158   %s -3 file1 file2  Print lines in file1 not in file2, and vice versa.\n\
159 "),
160               program_name, program_name);
161       emit_ancillary_info (PROGRAM_NAME);
162     }
163   exit (status);
164 }
165 
166 /* Output the line in linebuffer LINE to stdout
167    provided the switches say it should be output.
168    CLASS is 1 for a line found only in file 1,
169    2 for a line only in file 2, 3 for a line in both. */
170 
171 static void
writeline(struct linebuffer const * line,int class)172 writeline (struct linebuffer const *line, int class)
173 {
174   switch (class)
175     {
176     case 1:
177       if (!only_file_1)
178         return;
179       break;
180 
181     case 2:
182       if (!only_file_2)
183         return;
184       if (only_file_1)
185         fwrite (col_sep, 1, col_sep_len, stdout);
186       break;
187 
188     case 3:
189       if (!both)
190         return;
191       if (only_file_1)
192         fwrite (col_sep, 1, col_sep_len, stdout);
193       if (only_file_2)
194         fwrite (col_sep, 1, col_sep_len, stdout);
195       break;
196     }
197 
198   fwrite (line->buffer, sizeof (char), line->length, stdout);
199 
200   if (ferror (stdout))
201     write_error ();
202 }
203 
204 /* Check that successive input lines PREV and CURRENT from input file
205    WHATFILE are presented in order.
206 
207    If the user specified --nocheck-order, the check is not made.
208    If the user specified --check-order, the problem is fatal.
209    Otherwise (the default), the message is simply a warning.
210 
211    A message is printed at most once per input file.
212 
213    This function was copied (nearly) verbatim from 'src/join.c'. */
214 
215 static void
check_order(struct linebuffer const * prev,struct linebuffer const * current,int whatfile)216 check_order (struct linebuffer const *prev,
217              struct linebuffer const *current,
218              int whatfile)
219 {
220 
221   if (check_input_order != CHECK_ORDER_DISABLED
222       && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
223     {
224       if (!issued_disorder_warning[whatfile - 1])
225         {
226           int order;
227 
228           if (hard_LC_COLLATE)
229             order = xmemcoll (prev->buffer, prev->length - 1,
230                               current->buffer, current->length - 1);
231           else
232             order = memcmp2 (prev->buffer, prev->length - 1,
233                              current->buffer, current->length - 1);
234 
235           if (0 < order)
236             {
237               error ((check_input_order == CHECK_ORDER_ENABLED
238                       ? EXIT_FAILURE : 0),
239                      0, _("file %d is not in sorted order"), whatfile);
240 
241               /* If we get to here, the message was just a warning, but we
242                  want only to issue it once. */
243               issued_disorder_warning[whatfile - 1] = true;
244             }
245         }
246     }
247 }
248 
249 /* Compare INFILES[0] and INFILES[1].
250    If either is "-", use the standard input for that file.
251    Assume that each input file is sorted;
252    merge them and output the result.
253    Exit the program when done.  */
254 
255 static _Noreturn void
compare_files(char ** infiles)256 compare_files (char **infiles)
257 {
258   /* For each file, we have four linebuffers in lba. */
259   struct linebuffer lba[2][4];
260 
261   /* thisline[i] points to the linebuffer holding the next available line
262      in file i, or is null if there are no lines left in that file.  */
263   struct linebuffer *thisline[2];
264 
265   /* all_line[i][alt[i][0]] also points to the linebuffer holding the
266      current line in file i. We keep two buffers of history around so we
267      can look two lines back when we get to the end of a file. */
268   struct linebuffer *all_line[2][4];
269 
270   /* This is used to rotate through the buffers for each input file. */
271   int alt[2][3];
272 
273   /* streams[i] holds the input stream for file i.  */
274   FILE *streams[2];
275 
276   /* Counters for the summary.  */
277   uintmax_t total[] = {0, 0, 0};
278 
279   int i, j;
280 
281   /* Initialize the storage. */
282   for (i = 0; i < 2; i++)
283     {
284       for (j = 0; j < 4; j++)
285         {
286           initbuffer (&lba[i][j]);
287           all_line[i][j] = &lba[i][j];
288         }
289       alt[i][0] = 0;
290       alt[i][1] = 0;
291       alt[i][2] = 0;
292       streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r"));
293       if (!streams[i])
294         error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
295 
296       fadvise (streams[i], FADVISE_SEQUENTIAL);
297 
298       thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]], streams[i],
299                                           delim);
300       if (ferror (streams[i]))
301         error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
302     }
303 
304   while (thisline[0] || thisline[1])
305     {
306       int order;
307       bool fill_up[2] = { false, false };
308 
309       /* Compare the next available lines of the two files.  */
310 
311       if (!thisline[0])
312         order = 1;
313       else if (!thisline[1])
314         order = -1;
315       else
316         {
317           if (hard_LC_COLLATE)
318             order = xmemcoll (thisline[0]->buffer, thisline[0]->length - 1,
319                               thisline[1]->buffer, thisline[1]->length - 1);
320           else
321             {
322               size_t len = min (thisline[0]->length, thisline[1]->length) - 1;
323               order = memcmp (thisline[0]->buffer, thisline[1]->buffer, len);
324               if (order == 0)
325                 order = ((thisline[0]->length > thisline[1]->length)
326                          - (thisline[0]->length < thisline[1]->length));
327             }
328         }
329 
330       /* Output the line that is lesser. */
331       if (order == 0)
332         {
333           /* Line is seen in both files.  */
334           total[2]++;
335           writeline (thisline[1], 3);
336         }
337       else
338         {
339           seen_unpairable = true;
340           if (order <= 0)
341             {
342               /* Line is seen in file 1 only.  */
343               total[0]++;
344               writeline (thisline[0], 1);
345             }
346           else
347             {
348               /* Line is seen in file 2 only.  */
349               total[1]++;
350               writeline (thisline[1], 2);
351             }
352         }
353 
354       /* Step the file the line came from.
355          If the files match, step both files.  */
356       if (0 <= order)
357         fill_up[1] = true;
358       if (order <= 0)
359         fill_up[0] = true;
360 
361       for (i = 0; i < 2; i++)
362         if (fill_up[i])
363           {
364             /* Rotate the buffers for this file. */
365             alt[i][2] = alt[i][1];
366             alt[i][1] = alt[i][0];
367             alt[i][0] = (alt[i][0] + 1) & 0x03;
368 
369             thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]],
370                                                 streams[i], delim);
371 
372             if (thisline[i])
373               check_order (all_line[i][alt[i][1]], thisline[i], i + 1);
374 
375             /* If this is the end of the file we may need to re-check
376                the order of the previous two lines, since we might have
377                discovered an unpairable match since we checked before. */
378             else if (all_line[i][alt[i][2]]->buffer)
379               check_order (all_line[i][alt[i][2]],
380                            all_line[i][alt[i][1]], i + 1);
381 
382             if (ferror (streams[i]))
383               error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
384 
385             fill_up[i] = false;
386           }
387     }
388 
389   for (i = 0; i < 2; i++)
390     if (fclose (streams[i]) != 0)
391       error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
392 
393   if (total_option)
394     {
395       /* Print the summary, minding the column and line delimiters.  */
396       char buf1[INT_BUFSIZE_BOUND (uintmax_t)];
397       char buf2[INT_BUFSIZE_BOUND (uintmax_t)];
398       char buf3[INT_BUFSIZE_BOUND (uintmax_t)];
399       if (col_sep_len == 1)
400         { /* Separate to handle NUL char.  */
401           printf ("%s%c%s%c%s%c%s%c",
402                   umaxtostr (total[0], buf1), *col_sep,
403                   umaxtostr (total[1], buf2), *col_sep,
404                   umaxtostr (total[2], buf3), *col_sep,
405                   _("total"), delim);
406         }
407       else
408         {
409           printf ("%s%s%s%s%s%s%s%c",
410                   umaxtostr (total[0], buf1), col_sep,
411                   umaxtostr (total[1], buf2), col_sep,
412                   umaxtostr (total[2], buf3), col_sep,
413                   _("total"), delim);
414         }
415     }
416 
417   if (issued_disorder_warning[0] || issued_disorder_warning[1])
418     error (EXIT_FAILURE, 0, _("input is not in sorted order"));
419 
420   /* Exit here to pacify gcc -fsanitizer=leak.  */
421   exit (EXIT_SUCCESS);
422 }
423 
424 int
main(int argc,char ** argv)425 main (int argc, char **argv)
426 {
427   int c;
428 
429   initialize_main (&argc, &argv);
430   set_program_name (argv[0]);
431   setlocale (LC_ALL, "");
432   bindtextdomain (PACKAGE, LOCALEDIR);
433   textdomain (PACKAGE);
434   hard_LC_COLLATE = hard_locale (LC_COLLATE);
435 
436   atexit (close_stdout);
437 
438   only_file_1 = true;
439   only_file_2 = true;
440   both = true;
441 
442   seen_unpairable = false;
443   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
444   check_input_order = CHECK_ORDER_DEFAULT;
445   total_option = false;
446 
447   while ((c = getopt_long (argc, argv, "123z", long_options, nullptr)) != -1)
448     switch (c)
449       {
450       case '1':
451         only_file_1 = false;
452         break;
453 
454       case '2':
455         only_file_2 = false;
456         break;
457 
458       case '3':
459         both = false;
460         break;
461 
462       case 'z':
463         delim = '\0';
464         break;
465 
466       case NOCHECK_ORDER_OPTION:
467         check_input_order = CHECK_ORDER_DISABLED;
468         break;
469 
470       case CHECK_ORDER_OPTION:
471         check_input_order = CHECK_ORDER_ENABLED;
472         break;
473 
474       case OUTPUT_DELIMITER_OPTION:
475         if (col_sep_len && !STREQ (col_sep, optarg))
476           error (EXIT_FAILURE, 0, _("multiple output delimiters specified"));
477         col_sep = optarg;
478         col_sep_len = *optarg ? strlen (optarg) : 1;
479         break;
480 
481       case TOTAL_OPTION:
482         total_option = true;
483         break;
484 
485       case_GETOPT_HELP_CHAR;
486 
487       case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
488 
489       default:
490         usage (EXIT_FAILURE);
491       }
492 
493   if (! col_sep_len)
494     col_sep_len = 1;
495 
496   if (argc - optind < 2)
497     {
498       if (argc <= optind)
499         error (0, 0, _("missing operand"));
500       else
501         error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
502       usage (EXIT_FAILURE);
503     }
504 
505   if (2 < argc - optind)
506     {
507       error (0, 0, _("extra operand %s"), quote (argv[optind + 2]));
508       usage (EXIT_FAILURE);
509     }
510 
511   compare_files (argv + optind);
512 }
513