1 /* join - join lines of two files on a common field
2    Copyright (C) 1991-2023 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.
16 
17    Written by Mike Haertel, mike@gnu.ai.mit.edu.  */
18 
19 #include <config.h>
20 
21 #include <sys/types.h>
22 #include <getopt.h>
23 
24 #include "system.h"
25 #include "assure.h"
26 #include "fadvise.h"
27 #include "hard-locale.h"
28 #include "linebuffer.h"
29 #include "memcasecmp.h"
30 #include "quote.h"
31 #include "stdio--.h"
32 #include "xmemcoll.h"
33 #include "xstrtol.h"
34 #include "argmatch.h"
35 
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "join"

/* The program's author, passed through proper_name.  */
#define AUTHORS proper_name ("Mike Haertel")

/* From here on the preprocessor spells the identifier 'join' as
   'system_join', which includes the static function defined below.
   NOTE(review): presumably this avoids a clash with a 'join' symbol
   declared by some system header -- confirm before removing.  */
#define join system_join
42 
/* Exchange the two 'struct line *' lvalues A and B.

   Each argument is evaluated twice, so neither may have side effects.
   The expansion intentionally does NOT end in a semicolon: the caller
   writes 'SWAPLINES (x, y);', and the whole thing is then exactly one
   statement, usable as the unbraced body of an 'if' that is followed
   by 'else'.  The temporary has a macro-specific name so that it cannot
   capture an argument that happens to be called 'tmp'.  */
#define SWAPLINES(a, b) do { \
  struct line *swaplines_tmp = (a); \
  (a) = (b); \
  (b) = swaplines_tmp; \
} while (0)
48 
/* An element of the list identifying which fields to print for each
   output line.  Elements are chained through NEXT into a singly linked
   list that is walked in order when a line is printed.  */
struct outlist
  {
    /* File number: 0, 1, or 2.  0 means use the join field.
       1 means use the first file argument, 2 the second.  */
    int file;

    /* Field index (zero-based), specified only when FILE is 1 or 2.  */
    idx_t field;

    /* The following element, or a null pointer at the end of the list.  */
    struct outlist *next;
  };
62 
/* A field of a line.  BEG points into the text of the line the field
   was found in; the bytes are not copied, so the field is only valid
   while that line's buffer is.  The field is delimited by LEN, not by
   a terminating NUL.  */
struct field
  {
    char *beg;			/* First character in field.  */
    idx_t len;			/* The length of the field.  */
  };
69 
/* A line read from an input file, together with the positions of the
   fields it was split into.  */
struct line
  {
    struct linebuffer buf;	/* The line itself.  */
    idx_t nfields;		/* Number of elements in 'fields'.  */
    idx_t nfields_allocated;	/* Number of elements allocated for 'fields'. */
    struct field *fields;	/* The fields, each pointing into 'buf'.  */
  };
78 
/* One or more consecutive lines read from a file that all have the
   same join field value.  Slots at index COUNT and above are either
   null or hold line objects kept around so their storage can be
   reused by a later read.  */
struct seq
  {
    idx_t count;		/* Elements used in 'lines'.  */
    idx_t alloc;		/* Elements allocated in 'lines'.  */
    struct line **lines;	/* Array of ALLOC line pointers.  */
  };
87 
/* The previous line read from each file; get_line compares each new
   line against it when checking input order.  Index 0 is for file 1,
   index 1 for file 2.  */
static struct line *prevline[2] = {nullptr, nullptr};

/* The number of lines read from each file; reported in the
   "is not sorted" diagnostic.  */
static uintmax_t line_no[2] = {0, 0};

/* The input file names.  */
static char *g_names[2];

/* This provides an extra line buffer for each file.  We need these if we
   try to read two consecutive lines into the same buffer, since we don't
   want to overwrite the previous buffer before we check order. */
static struct line *spareline[2] = {nullptr, nullptr};

/* True if the LC_COLLATE locale is hard.  */
static bool hard_LC_COLLATE;

/* If true, print unpairable lines in file 1 or 2.  */
static bool print_unpairables_1, print_unpairables_2;

/* If true, print pairable lines.  */
static bool print_pairables;

/* If true, we have seen at least one unpairable line. */
static bool seen_unpairable;

/* If true, we have warned about disorder in that file. */
static bool issued_disorder_warning[2];

/* Empty output field filler, or a null pointer if none was given.  */
static char const *empty_filler;

/* Whether to ensure the same number of fields are output from each line.  */
static bool autoformat;
/* The number of fields to output for each line.
   Only significant when autoformat is true.  */
static idx_t autocount_1;
static idx_t autocount_2;

/* Field to join on (zero-based); -1 means they haven't been determined
   yet.  */
static ptrdiff_t join_field_1 = -1;
static ptrdiff_t join_field_2 = -1;

/* List of fields to print.  This object is a dummy head: the first real
   element, if any, is outlist_head.next.  */
static struct outlist outlist_head;

/* Last element in 'outlist', where a new element can be added.  */
static struct outlist *outlist_end = &outlist_head;

/* Tab character separating fields.  If negative, fields are separated
   by any nonempty string of blanks, otherwise by exactly one
   tab character whose value (when cast to unsigned char) equals TAB.  */
static int tab = -1;
141 
/* How strictly to check that the input is correctly ordered:
   CHECK_ORDER_DEFAULT checks only once an unpairable line has been
   seen and merely warns; CHECK_ORDER_ENABLED always checks and treats
   disorder as fatal; CHECK_ORDER_DISABLED never checks.  */
static enum
  {
    CHECK_ORDER_DEFAULT,
    CHECK_ORDER_ENABLED,
    CHECK_ORDER_DISABLED
  } check_input_order;

/* Codes returned by getopt_long for the long options that have no
   single-character equivalent.  They start above CHAR_MAX so they
   cannot coincide with any option character.  */
enum
{
  CHECK_ORDER_OPTION = CHAR_MAX + 1,
  NOCHECK_ORDER_OPTION,
  HEADER_LINE_OPTION
};
156 
157 
/* Long options accepted by the program, in the form getopt_long
   expects; the array ends with an all-zero entry.  None of these
   options takes an argument.  */
static struct option const longopts[] =
{
  {"ignore-case", no_argument, nullptr, 'i'},
  {"check-order", no_argument, nullptr, CHECK_ORDER_OPTION},
  {"nocheck-order", no_argument, nullptr, NOCHECK_ORDER_OPTION},
  {"zero-terminated", no_argument, nullptr, 'z'},
  {"header", no_argument, nullptr, HEADER_LINE_OPTION},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {nullptr, 0, nullptr, 0}
};
169 
/* Used to print non-joining lines: it stands in for the missing line
   of the other file.  Being zero-initialized it has no fields, and
   prjoin recognizes it by address.  */
static struct line uni_blank;

/* If true, ignore case when comparing join fields.  */
static bool ignore_case;

/* If true, treat the first line of each file as column headers --
   join them without checking for ordering */
static bool join_header_lines;

/* The character marking end of line. Default to \n. */
static char eolchar = '\n';
182 
/* Print usage information and exit with status STATUS.
   When STATUS is not EXIT_SUCCESS only the short "try --help" hint is
   emitted; otherwise the full help text is written to standard output.
   This function does not return.  */
void
usage (int status)
{
  if (status != EXIT_SUCCESS)
    emit_try_help ();
  else
    {
      printf (_("\
Usage: %s [OPTION]... FILE1 FILE2\n\
"),
              program_name);
      fputs (_("\
For each pair of input lines with identical join fields, write a line to\n\
standard output.  The default join field is the first, delimited by blanks.\
\n\
"), stdout);
      fputs (_("\
\n\
When FILE1 or FILE2 (not both) is -, read standard input.\n\
"), stdout);
      fputs (_("\
\n\
  -a FILENUM             also print unpairable lines from file FILENUM, where\n\
                           FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
"), stdout);
      fputs (_("\
  -e STRING              replace missing (empty) input fields with STRING;\n\
                           I.e., missing fields specified with '-12jo' options\
\n\
"), stdout);
      fputs (_("\
  -i, --ignore-case      ignore differences in case when comparing fields\n\
  -j FIELD               equivalent to '-1 FIELD -2 FIELD'\n\
  -o FORMAT              obey FORMAT while constructing output line\n\
  -t CHAR                use CHAR as input and output field separator\n\
"), stdout);
      fputs (_("\
  -v FILENUM             like -a FILENUM, but suppress joined output lines\n\
  -1 FIELD               join on this FIELD of file 1\n\
  -2 FIELD               join on this FIELD of file 2\n\
      --check-order      check that the input is correctly sorted, even\n\
                           if all input lines are pairable\n\
      --nocheck-order    do not check that the input is correctly sorted\n\
      --header           treat the first line in each file as field headers,\n\
                           print them without trying to pair them\n\
"), stdout);
      fputs (_("\
  -z, --zero-terminated  line delimiter is NUL, not newline\n\
"), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
      fputs (VERSION_OPTION_DESCRIPTION, stdout);
      fputs (_("\
\n\
Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
else fields are separated by CHAR.  Any FIELD is a field number counted\n\
from 1.  FORMAT is one or more comma or blank separated specifications,\n\
each being 'FILENUM.FIELD' or '0'.  Default FORMAT outputs the join field,\n\
the remaining fields from FILE1, the remaining fields from FILE2, all\n\
separated by CHAR.  If FORMAT is the keyword 'auto', then the first\n\
line of each file determines the number of fields output for each line.\n\
\n\
Important: FILE1 and FILE2 must be sorted on the join fields.\n\
E.g., use \"sort -k 1b,1\" if 'join' has no options,\n\
or use \"join -t ''\" if 'sort' has no options.\n\
Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\
If the input is not sorted and some lines cannot be joined, a\n\
warning message will be given.\n\
"), stdout);
      emit_ancillary_info (PROGRAM_NAME);
    }
  exit (status);
}
255 
256 /* Record a field in LINE, with location FIELD and size LEN.  */
257 
258 static void
extract_field(struct line * line,char * field,idx_t len)259 extract_field (struct line *line, char *field, idx_t len)
260 {
261   if (line->nfields >= line->nfields_allocated)
262     line->fields = xpalloc (line->fields, &line->nfields_allocated, 1,
263                             -1, sizeof *line->fields);
264   line->fields[line->nfields].beg = field;
265   line->fields[line->nfields].len = len;
266   ++(line->nfields);
267 }
268 
269 /* Fill in the 'fields' structure in LINE.  */
270 
271 static void
xfields(struct line * line)272 xfields (struct line *line)
273 {
274   char *ptr = line->buf.buffer;
275   char const *lim = ptr + line->buf.length - 1;
276 
277   if (ptr == lim)
278     return;
279 
280   if (0 <= tab && tab != '\n')
281     {
282       char *sep;
283       for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
284         extract_field (line, ptr, sep - ptr);
285     }
286   else if (tab < 0)
287     {
288       /* Skip leading blanks before the first field.  */
289       while (field_sep (*ptr))
290         if (++ptr == lim)
291           return;
292 
293       do
294         {
295           char *sep;
296           for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
297             continue;
298           extract_field (line, ptr, sep - ptr);
299           if (sep == lim)
300             return;
301           for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
302             continue;
303         }
304       while (ptr != lim);
305     }
306 
307   extract_field (line, ptr, lim - ptr);
308 }
309 
310 static void
freeline(struct line * line)311 freeline (struct line *line)
312 {
313   if (line == nullptr)
314     return;
315   free (line->fields);
316   line->fields = nullptr;
317   free (line->buf.buffer);
318   line->buf.buffer = nullptr;
319 }
320 
321 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
322    >0 if it compares greater; 0 if it compares equal.
323    Report an error and exit if the comparison fails.
324    Use join fields JF_1 and JF_2 respectively.  */
325 
326 static int
keycmp(struct line const * line1,struct line const * line2,idx_t jf_1,idx_t jf_2)327 keycmp (struct line const *line1, struct line const *line2,
328         idx_t jf_1, idx_t jf_2)
329 {
330   /* Start of field to compare in each file.  */
331   char *beg1;
332   char *beg2;
333 
334   idx_t len1;
335   idx_t len2;		/* Length of fields to compare.  */
336   int diff;
337 
338   if (jf_1 < line1->nfields)
339     {
340       beg1 = line1->fields[jf_1].beg;
341       len1 = line1->fields[jf_1].len;
342     }
343   else
344     {
345       beg1 = nullptr;
346       len1 = 0;
347     }
348 
349   if (jf_2 < line2->nfields)
350     {
351       beg2 = line2->fields[jf_2].beg;
352       len2 = line2->fields[jf_2].len;
353     }
354   else
355     {
356       beg2 = nullptr;
357       len2 = 0;
358     }
359 
360   if (len1 == 0)
361     return len2 == 0 ? 0 : -1;
362   if (len2 == 0)
363     return 1;
364 
365   if (ignore_case)
366     {
367       /* FIXME: ignore_case does not work with NLS (in particular,
368          with multibyte chars).  */
369       diff = memcasecmp (beg1, beg2, MIN (len1, len2));
370     }
371   else
372     {
373       if (hard_LC_COLLATE)
374         return xmemcoll (beg1, len1, beg2, len2);
375       diff = memcmp (beg1, beg2, MIN (len1, len2));
376     }
377 
378   if (diff)
379     return diff;
380   return (len1 > len2) - (len1 < len2);
381 }
382 
383 /* Check that successive input lines PREV and CURRENT from input file
384    WHATFILE are presented in order, unless the user may be relying on
385    the GNU extension that input lines may be out of order if no input
386    lines are unpairable.
387 
388    If the user specified --nocheck-order, the check is not made.
389    If the user specified --check-order, the problem is fatal.
390    Otherwise (the default), the message is simply a warning.
391 
392    A message is printed at most once per input file. */
393 
394 static void
check_order(const struct line * prev,const struct line * current,int whatfile)395 check_order (const struct line *prev,
396              const struct line *current,
397              int whatfile)
398 {
399   if (check_input_order != CHECK_ORDER_DISABLED
400       && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
401     {
402       if (!issued_disorder_warning[whatfile - 1])
403         {
404           idx_t join_field = whatfile == 1 ? join_field_1 : join_field_2;
405           if (keycmp (prev, current, join_field, join_field) > 0)
406             {
407               /* Exclude any trailing newline. */
408               idx_t len = current->buf.length;
409               if (0 < len && current->buf.buffer[len - 1] == '\n')
410                 --len;
411 
412               /* If the offending line is longer than INT_MAX, output
413                  only the first INT_MAX bytes in this diagnostic.  */
414               len = MIN (INT_MAX, len);
415 
416               error ((check_input_order == CHECK_ORDER_ENABLED
417                       ? EXIT_FAILURE : 0),
418                      0, _("%s:%ju: is not sorted: %.*s"),
419                      g_names[whatfile - 1], line_no[whatfile - 1],
420                      (int) len, current->buf.buffer);
421 
422               /* If we get to here, the message was merely a warning.
423                  Arrange to issue it only once per file.  */
424               issued_disorder_warning[whatfile - 1] = true;
425             }
426         }
427     }
428 }
429 
/* Mark LINE as having no fields so that it can be filled in again.
   The field array and the text buffer stay allocated for reuse.  */
static inline void
reset_line (struct line *line)
{
  line->nfields = 0;
}
435 
436 static struct line *
init_linep(struct line ** linep)437 init_linep (struct line **linep)
438 {
439   struct line *line = xzalloc (sizeof *line);
440   *linep = line;
441   return line;
442 }
443 
444 /* Read a line from FP into LINE and split it into fields.
445    Return true if successful.  */
446 
447 static bool
get_line(FILE * fp,struct line ** linep,int which)448 get_line (FILE *fp, struct line **linep, int which)
449 {
450   struct line *line = *linep;
451 
452   if (line == prevline[which - 1])
453     {
454       SWAPLINES (line, spareline[which - 1]);
455       *linep = line;
456     }
457 
458   if (line)
459     reset_line (line);
460   else
461     line = init_linep (linep);
462 
463   if (! readlinebuffer_delim (&line->buf, fp, eolchar))
464     {
465       if (ferror (fp))
466         error (EXIT_FAILURE, errno, _("read error"));
467       freeline (line);
468       return false;
469     }
470   ++line_no[which - 1];
471 
472   xfields (line);
473 
474   if (prevline[which - 1])
475     check_order (prevline[which - 1], line, which);
476 
477   prevline[which - 1] = line;
478   return true;
479 }
480 
481 static void
free_spareline(void)482 free_spareline (void)
483 {
484   for (idx_t i = 0; i < ARRAY_CARDINALITY (spareline); i++)
485     {
486       if (spareline[i])
487         {
488           freeline (spareline[i]);
489           free (spareline[i]);
490         }
491     }
492 }
493 
494 static void
initseq(struct seq * seq)495 initseq (struct seq *seq)
496 {
497   seq->count = 0;
498   seq->alloc = 0;
499   seq->lines = nullptr;
500 }
501 
502 /* Read a line from FP and add it to SEQ.  Return true if successful.  */
503 
504 static bool
getseq(FILE * fp,struct seq * seq,int whichfile)505 getseq (FILE *fp, struct seq *seq, int whichfile)
506 {
507   if (seq->count == seq->alloc)
508     {
509       seq->lines = xpalloc (seq->lines, &seq->alloc, 1, -1, sizeof *seq->lines);
510       for (idx_t i = seq->count; i < seq->alloc; i++)
511         seq->lines[i] = nullptr;
512     }
513 
514   if (get_line (fp, &seq->lines[seq->count], whichfile))
515     {
516       ++seq->count;
517       return true;
518     }
519   return false;
520 }
521 
522 /* Read a line from FP and add it to SEQ, as the first item if FIRST is
523    true, else as the next.  */
524 static bool
advance_seq(FILE * fp,struct seq * seq,bool first,int whichfile)525 advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
526 {
527   if (first)
528     seq->count = 0;
529 
530   return getseq (fp, seq, whichfile);
531 }
532 
533 static void
delseq(struct seq * seq)534 delseq (struct seq *seq)
535 {
536   for (idx_t i = 0; i < seq->alloc; i++)
537     {
538       freeline (seq->lines[i]);
539       free (seq->lines[i]);
540     }
541   free (seq->lines);
542 }
543 
544 
545 /* Print field N of LINE if it exists and is nonempty, otherwise
546    'empty_filler' if it is nonempty.  */
547 
548 static void
prfield(idx_t n,struct line const * line)549 prfield (idx_t n, struct line const *line)
550 {
551   if (n < line->nfields)
552     {
553       idx_t len = line->fields[n].len;
554       if (len)
555         fwrite (line->fields[n].beg, 1, len, stdout);
556       else if (empty_filler)
557         fputs (empty_filler, stdout);
558     }
559   else if (empty_filler)
560     fputs (empty_filler, stdout);
561 }
562 
563 /* Output all the fields in line, other than the join field.  */
564 
565 static void
prfields(struct line const * line,idx_t join_field,idx_t autocount)566 prfields (struct line const *line, idx_t join_field, idx_t autocount)
567 {
568   idx_t i;
569   idx_t nfields = autoformat ? autocount : line->nfields;
570   char output_separator = tab < 0 ? ' ' : tab;
571 
572   for (i = 0; i < join_field && i < nfields; ++i)
573     {
574       putchar (output_separator);
575       prfield (i, line);
576     }
577   for (i = join_field + 1; i < nfields; ++i)
578     {
579       putchar (output_separator);
580       prfield (i, line);
581     }
582 }
583 
584 /* Print the join of LINE1 and LINE2.  */
585 
586 static void
prjoin(struct line const * line1,struct line const * line2)587 prjoin (struct line const *line1, struct line const *line2)
588 {
589   const struct outlist *outlist;
590   char output_separator = tab < 0 ? ' ' : tab;
591   idx_t field;
592   struct line const *line;
593 
594   outlist = outlist_head.next;
595   if (outlist)
596     {
597       const struct outlist *o;
598 
599       o = outlist;
600       while (true)
601         {
602           if (o->file == 0)
603             {
604               if (line1 == &uni_blank)
605                 {
606                   line = line2;
607                   field = join_field_2;
608                 }
609               else
610                 {
611                   line = line1;
612                   field = join_field_1;
613                 }
614             }
615           else
616             {
617               line = (o->file == 1 ? line1 : line2);
618               field = o->field;
619             }
620           prfield (field, line);
621           o = o->next;
622           if (o == nullptr)
623             break;
624           putchar (output_separator);
625         }
626       putchar (eolchar);
627     }
628   else
629     {
630       if (line1 == &uni_blank)
631         {
632           line = line2;
633           field = join_field_2;
634         }
635       else
636         {
637           line = line1;
638           field = join_field_1;
639         }
640 
641       /* Output the join field.  */
642       prfield (field, line);
643 
644       /* Output other fields.  */
645       prfields (line1, join_field_1, autocount_1);
646       prfields (line2, join_field_2, autocount_2);
647 
648       putchar (eolchar);
649     }
650 
651   if (ferror (stdout))
652     write_error ();
653 }
654 
/* Print the join of the files in FP1 and FP2.

   Both inputs are read in step.  Whenever the current lines have equal
   join fields, all consecutive lines with that key are collected from
   each file and every combination is printed (if pairable lines are
   wanted).  A line whose key is smaller than the other file's current
   key is unpairable; it is printed if requested for its file.  After
   one input is exhausted, the rest of the other is read to print its
   unpairable lines and/or to finish the order check.  */

static void
join (FILE *fp1, FILE *fp2)
{
  struct seq seq1, seq2;
  int diff;
  bool eof1, eof2;

  fadvise (fp1, FADVISE_SEQUENTIAL);
  fadvise (fp2, FADVISE_SEQUENTIAL);

  /* Read the first line of each file.  */
  initseq (&seq1);
  getseq (fp1, &seq1, 1);
  initseq (&seq2);
  getseq (fp2, &seq2, 2);

  /* With automatic formatting, the first line of each file fixes how
     many fields are output for that file; an empty file gives 0.  */
  if (autoformat)
    {
      autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0;
      autocount_2 = seq2.count ? seq2.lines[0]->nfields : 0;
    }

  /* Join the header lines unconditionally, without comparing keys.
     Forgetting the previous lines keeps the headers out of the order
     check for the first data lines.  */
  if (join_header_lines && (seq1.count || seq2.count))
    {
      struct line const *hline1 = seq1.count ? seq1.lines[0] : &uni_blank;
      struct line const *hline2 = seq2.count ? seq2.lines[0] : &uni_blank;
      prjoin (hline1, hline2);
      prevline[0] = nullptr;
      prevline[1] = nullptr;
      if (seq1.count)
        advance_seq (fp1, &seq1, true, 1);
      if (seq2.count)
        advance_seq (fp2, &seq2, true, 2);
    }

  while (seq1.count && seq2.count)
    {
      diff = keycmp (seq1.lines[0], seq2.lines[0],
                     join_field_1, join_field_2);
      if (diff < 0)
        {
          /* File 1's line has no partner: report it and move on.  */
          if (print_unpairables_1)
            prjoin (seq1.lines[0], &uni_blank);
          advance_seq (fp1, &seq1, true, 1);
          seen_unpairable = true;
          continue;
        }
      if (diff > 0)
        {
          /* File 2's line has no partner: report it and move on.  */
          if (print_unpairables_2)
            prjoin (&uni_blank, seq2.lines[0]);
          advance_seq (fp2, &seq2, true, 2);
          seen_unpairable = true;
          continue;
        }

      /* Keep reading lines from file1 as long as they continue to
         match the current line from file2.  On end of file the count
         is bumped anyway, so that in either case the matching lines
         are those at indices 0 .. count - 2.  */
      eof1 = false;
      do
        if (!advance_seq (fp1, &seq1, false, 1))
          {
            eof1 = true;
            ++seq1.count;
            break;
          }
      while (!keycmp (seq1.lines[seq1.count - 1], seq2.lines[0],
                      join_field_1, join_field_2));

      /* Keep reading lines from file2 as long as they continue to
         match the current line from file1.  The count is adjusted at
         end of file in the same way as for file1.  */
      eof2 = false;
      do
        if (!advance_seq (fp2, &seq2, false, 2))
          {
            eof2 = true;
            ++seq2.count;
            break;
          }
      while (!keycmp (seq1.lines[0], seq2.lines[seq2.count - 1],
                      join_field_1, join_field_2));

      /* Print every pairing of the matching lines; the last slot of
         each sequence is excluded because it is not part of the
         match.  */
      if (print_pairables)
        {
          for (idx_t i = 0; i < seq1.count - 1; ++i)
            {
              idx_t j;
              for (j = 0; j < seq2.count - 1; ++j)
                prjoin (seq1.lines[i], seq2.lines[j]);
            }
        }

      /* The first non-matching line, if there is one, becomes the
         single current line of its sequence; the swap keeps the other
         line object in the array so its storage can be reused.  */
      if (!eof1)
        {
          SWAPLINES (seq1.lines[0], seq1.lines[seq1.count - 1]);
          seq1.count = 1;
        }
      else
        seq1.count = 0;

      if (!eof2)
        {
          SWAPLINES (seq2.lines[0], seq2.lines[seq2.count - 1]);
          seq2.count = 1;
        }
      else
        seq2.count = 0;
    }

  /* If the user did not specify --nocheck-order, then we read the
     tail ends of both inputs to verify that they are in order.  We
     skip the rest of the tail once we have issued a warning for that
     file, unless we actually need to print the unpairable lines.  */
  struct line *line = nullptr;
  bool checktail = false;

  if (check_input_order != CHECK_ORDER_DISABLED
      && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
    checktail = true;

  /* The loop above ends only when at least one sequence is empty, so
     at most one of the two blocks below has lines left to process.  */
  if ((print_unpairables_1 || checktail) && seq1.count)
    {
      if (print_unpairables_1)
        prjoin (seq1.lines[0], &uni_blank);
      if (seq2.count)
        seen_unpairable = true;
      while (get_line (fp1, &line, 1))
        {
          if (print_unpairables_1)
            prjoin (line, &uni_blank);
          if (issued_disorder_warning[0] && !print_unpairables_1)
            break;
        }
    }

  if ((print_unpairables_2 || checktail) && seq2.count)
    {
      if (print_unpairables_2)
        prjoin (&uni_blank, seq2.lines[0]);
      if (seq1.count)
        seen_unpairable = true;
      while (get_line (fp2, &line, 2))
        {
          if (print_unpairables_2)
            prjoin (&uni_blank, line);
          if (issued_disorder_warning[1] && !print_unpairables_2)
            break;
        }
    }

  freeline (line);
  free (line);

  delseq (&seq1);
  delseq (&seq2);
}
813 
814 /* Add a field spec for field FIELD of file FILE to 'outlist'.  */
815 
816 static void
add_field(int file,idx_t field)817 add_field (int file, idx_t field)
818 {
819   struct outlist *o;
820 
821   affirm (file == 0 || file == 1 || file == 2);
822   affirm (file != 0 || field == 0);
823 
824   o = xmalloc (sizeof *o);
825   o->file = file;
826   o->field = field;
827   o->next = nullptr;
828 
829   /* Add to the end of the list so the fields are in the right order.  */
830   outlist_end->next = o;
831   outlist_end = o;
832 }
833 
834 /* Convert a string of decimal digits, STR (the 1-based join field number),
835    to an integral value.  Upon successful conversion, return one less
836    (the zero-based field number).  Silently convert too-large values
837    to PTRDIFF_MAX.  Otherwise, if a value cannot be converted, give a
838    diagnostic and exit.  */
839 
840 static idx_t
string_to_join_field(char const * str)841 string_to_join_field (char const *str)
842 {
843   intmax_t val;
844 
845   strtol_error s_err = xstrtoimax (str, nullptr, 10, &val, "");
846   if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && PTRDIFF_MAX < val))
847     val = PTRDIFF_MAX;
848   else if (s_err != LONGINT_OK || val <= 0)
849     error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
850 
851   return val - 1;
852 }
853 
854 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
855    pair.  In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
856    If S is valid, return true.  Otherwise, give a diagnostic and exit.  */
857 
858 static void
decode_field_spec(char const * s,int * file_index,idx_t * field_index)859 decode_field_spec (char const *s, int *file_index, idx_t *field_index)
860 {
861   /* The first character must be 0, 1, or 2.  */
862   switch (s[0])
863     {
864     case '0':
865       if (s[1])
866         {
867           /* '0' must be all alone -- no '.FIELD'.  */
868           error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
869         }
870       *file_index = 0;
871       *field_index = 0;
872       break;
873 
874     case '1':
875     case '2':
876       if (s[1] != '.')
877         error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
878       *file_index = s[0] - '0';
879       *field_index = string_to_join_field (s + 2);
880       break;
881 
882     default:
883       error (EXIT_FAILURE, 0,
884              _("invalid file number in field spec: %s"), quote (s));
885     }
886 }
887 
888 /* Add the comma or blank separated field spec(s) in STR to 'outlist'.  */
889 
890 static void
add_field_list(char * str)891 add_field_list (char *str)
892 {
893   char *p = str;
894 
895   do
896     {
897       int file_index;
898       idx_t field_index;
899       char const *spec_item = p;
900 
901       p = strpbrk (p, ", \t");
902       if (p)
903         *p++ = '\0';
904       decode_field_spec (spec_item, &file_index, &field_index);
905       add_field (file_index, field_index);
906     }
907   while (p);
908 }
909 
910 /* Set the join field *VAR to VAL, but report an error if *VAR is set
911    more than once to incompatible values.  */
912 
913 static void
set_join_field(ptrdiff_t * var,idx_t val)914 set_join_field (ptrdiff_t *var, idx_t val)
915 {
916   if (0 <= *var && *var != val)
917     error (EXIT_FAILURE, 0,
918            _("incompatible join fields %td, %td"), *var, val);
919   *var = val;
920 }
921 
/* Status of command-line arguments.  An argument that does not begin
   with '-' may be a file operand or the argument of the option that
   precedes it; the status records which readings are still possible
   so that the decision can be deferred until more has been seen.  */

enum operand_status
  {
    /* This argument must be an operand, i.e., one of the files to be
       joined.  */
    MUST_BE_OPERAND,

    /* This might be the argument of the preceding -j1 or -j2 option,
       or it might be an operand.  */
    MIGHT_BE_J1_ARG,
    MIGHT_BE_J2_ARG,

    /* This might be the argument of the preceding -o option, or it might be
       an operand.  */
    MIGHT_BE_O_ARG
  };
939 
940 /* Add NAME to the array of input file NAMES with operand statuses
941    OPERAND_STATUS; currently there are NFILES names in the list.  */
942 
943 static void
add_file_name(char * name,char * names[2],int operand_status[2],int joption_count[2],int * nfiles,int * prev_optc_status,int * optc_status)944 add_file_name (char *name, char *names[2],
945                int operand_status[2], int joption_count[2], int *nfiles,
946                int *prev_optc_status, int *optc_status)
947 {
948   int n = *nfiles;
949 
950   if (n == 2)
951     {
952       bool op0 = (operand_status[0] == MUST_BE_OPERAND);
953       char *arg = names[op0];
954       switch (operand_status[op0])
955         {
956         case MUST_BE_OPERAND:
957           error (0, 0, _("extra operand %s"), quoteaf (name));
958           usage (EXIT_FAILURE);
959 
960         case MIGHT_BE_J1_ARG:
961           joption_count[0]--;
962           set_join_field (&join_field_1, string_to_join_field (arg));
963           break;
964 
965         case MIGHT_BE_J2_ARG:
966           joption_count[1]--;
967           set_join_field (&join_field_2, string_to_join_field (arg));
968           break;
969 
970         case MIGHT_BE_O_ARG:
971           add_field_list (arg);
972           break;
973         }
974       if (!op0)
975         {
976           operand_status[0] = operand_status[1];
977           names[0] = names[1];
978         }
979       n = 1;
980     }
981 
982   operand_status[n] = *prev_optc_status;
983   names[n] = name;
984   *nfiles = n + 1;
985   if (*prev_optc_status == MIGHT_BE_O_ARG)
986     *optc_status = MIGHT_BE_O_ARG;
987 }
988 
989 int
main(int argc,char ** argv)990 main (int argc, char **argv)
991 {
992   int optc_status;
993   int prev_optc_status = MUST_BE_OPERAND;
994   int operand_status[2];
995   int joption_count[2] = { 0, 0 };
996   FILE *fp1, *fp2;
997   int optc;
998   int nfiles = 0;
999   int i;
1000 
1001   initialize_main (&argc, &argv);
1002   set_program_name (argv[0]);
1003   setlocale (LC_ALL, "");
1004   bindtextdomain (PACKAGE, LOCALEDIR);
1005   textdomain (PACKAGE);
1006   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1007 
1008   atexit (close_stdout);
1009   atexit (free_spareline);
1010 
1011   print_pairables = true;
1012   seen_unpairable = false;
1013   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
1014   check_input_order = CHECK_ORDER_DEFAULT;
1015 
1016   while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z",
1017                               longopts, nullptr))
1018          != -1)
1019     {
1020       optc_status = MUST_BE_OPERAND;
1021 
1022       switch (optc)
1023         {
1024         case 'v':
1025             print_pairables = false;
1026             FALLTHROUGH;
1027 
1028         case 'a':
1029           {
1030             long int val;
1031             if (xstrtol (optarg, nullptr, 10, &val, "") != LONGINT_OK
1032                 || (val != 1 && val != 2))
1033               error (EXIT_FAILURE, 0,
1034                      _("invalid field number: %s"), quote (optarg));
1035             if (val == 1)
1036               print_unpairables_1 = true;
1037             else
1038               print_unpairables_2 = true;
1039           }
1040           break;
1041 
1042         case 'e':
1043           if (empty_filler && ! STREQ (empty_filler, optarg))
1044             error (EXIT_FAILURE, 0,
1045                    _("conflicting empty-field replacement strings"));
1046           empty_filler = optarg;
1047           break;
1048 
1049         case 'i':
1050           ignore_case = true;
1051           break;
1052 
1053         case '1':
1054           set_join_field (&join_field_1, string_to_join_field (optarg));
1055           break;
1056 
1057         case '2':
1058           set_join_field (&join_field_2, string_to_join_field (optarg));
1059           break;
1060 
1061         case 'j':
1062           if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
1063               && optarg == argv[optind - 1] + 2)
1064             {
1065               /* The argument was either "-j1" or "-j2".  */
1066               bool is_j2 = (optarg[0] == '2');
1067               joption_count[is_j2]++;
1068               optc_status = MIGHT_BE_J1_ARG + is_j2;
1069             }
1070           else
1071             {
1072               set_join_field (&join_field_1, string_to_join_field (optarg));
1073               set_join_field (&join_field_2, join_field_1);
1074             }
1075           break;
1076 
1077         case 'o':
1078           if (STREQ (optarg, "auto"))
1079             autoformat = true;
1080           else
1081             {
1082               add_field_list (optarg);
1083               optc_status = MIGHT_BE_O_ARG;
1084             }
1085           break;
1086 
1087         case 't':
1088           {
1089             unsigned char newtab = optarg[0];
1090             if (! newtab)
1091               newtab = '\n'; /* '' => process the whole line.  */
1092             else if (optarg[1])
1093               {
1094                 if (STREQ (optarg, "\\0"))
1095                   newtab = '\0';
1096                 else
1097                   error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1098                          quote (optarg));
1099               }
1100             if (0 <= tab && tab != newtab)
1101               error (EXIT_FAILURE, 0, _("incompatible tabs"));
1102             tab = newtab;
1103           }
1104           break;
1105 
1106         case 'z':
1107           eolchar = 0;
1108           break;
1109 
1110         case NOCHECK_ORDER_OPTION:
1111           check_input_order = CHECK_ORDER_DISABLED;
1112           break;
1113 
1114         case CHECK_ORDER_OPTION:
1115           check_input_order = CHECK_ORDER_ENABLED;
1116           break;
1117 
1118         case 1:		/* Non-option argument.  */
1119           add_file_name (optarg, g_names, operand_status, joption_count,
1120                          &nfiles, &prev_optc_status, &optc_status);
1121           break;
1122 
1123         case HEADER_LINE_OPTION:
1124           join_header_lines = true;
1125           break;
1126 
1127         case_GETOPT_HELP_CHAR;
1128 
1129         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1130 
1131         default:
1132           usage (EXIT_FAILURE);
1133         }
1134 
1135       prev_optc_status = optc_status;
1136     }
1137 
1138   /* Process any operands after "--".  */
1139   prev_optc_status = MUST_BE_OPERAND;
1140   while (optind < argc)
1141     add_file_name (argv[optind++], g_names, operand_status, joption_count,
1142                    &nfiles, &prev_optc_status, &optc_status);
1143 
1144   if (nfiles != 2)
1145     {
1146       if (nfiles == 0)
1147         error (0, 0, _("missing operand"));
1148       else
1149         error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1150       usage (EXIT_FAILURE);
1151     }
1152 
1153   /* If "-j1" was specified and it turns out not to have had an argument,
1154      treat it as "-j 1".  Likewise for -j2.  */
1155   for (i = 0; i < 2; i++)
1156     if (joption_count[i] != 0)
1157       {
1158         set_join_field (&join_field_1, i);
1159         set_join_field (&join_field_2, i);
1160       }
1161 
1162   if (join_field_1 < 0)
1163     join_field_1 = 0;
1164   if (join_field_2 < 0)
1165     join_field_2 = 0;
1166 
1167   fp1 = STREQ (g_names[0], "-") ? stdin : fopen (g_names[0], "r");
1168   if (!fp1)
1169     error (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1170   fp2 = STREQ (g_names[1], "-") ? stdin : fopen (g_names[1], "r");
1171   if (!fp2)
1172     error (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1173   if (fp1 == fp2)
1174     error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
1175   join (fp1, fp2);
1176 
1177   if (fclose (fp1) != 0)
1178     error (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1179   if (fclose (fp2) != 0)
1180     error (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1181 
1182   if (issued_disorder_warning[0] || issued_disorder_warning[1])
1183     error (EXIT_FAILURE, 0, _("input is not in sorted order"));
1184   else
1185     return EXIT_SUCCESS;
1186 }
1187