1 /* join - join lines of two files on a common field
2 Copyright (C) 1991-2023 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
18
19 #include <config.h>
20
21 #include <sys/types.h>
22 #include <getopt.h>
23
24 #include "system.h"
25 #include "assure.h"
26 #include "fadvise.h"
27 #include "hard-locale.h"
28 #include "linebuffer.h"
29 #include "memcasecmp.h"
30 #include "quote.h"
31 #include "stdio--.h"
32 #include "xmemcoll.h"
33 #include "xstrtol.h"
34 #include "argmatch.h"
35
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "join"

/* Author list, passed to the --version handling in main.  */
#define AUTHORS proper_name ("Mike Haertel")

/* From here on every use of the identifier 'join' in this file is
   replaced by 'system_join', including the static function defined
   below as 'join' and its call in main.
   NOTE(review): presumably this avoids a clash with a 'join' declared
   by one of the headers included above -- confirm.  */
#define join system_join
42
/* Exchange the two 'struct line *' lvalues A and B.
   Each argument is evaluated twice, so it must have no side effects.
   The expansion deliberately has no trailing semicolon: the caller
   supplies it, which keeps 'if (c) SWAPLINES (a, b); else ...' valid.  */
#define SWAPLINES(a, b) do { \
  struct line *tmp = (a); \
  (a) = (b); \
  (b) = tmp; \
} while (0)
48
/* An element of the list identifying which fields to print for each
   output line.  The list is singly linked; elements are appended by
   add_field and walked in order by prjoin.  */
struct outlist
  {
    /* File number: 0, 1, or 2.  0 means use the join field.
       1 means use the first file argument, 2 the second.  */
    int file;

    /* Field index (zero-based), specified only when FILE is 1 or 2.
       It is 0 when FILE is 0.  */
    idx_t field;

    /* Next element, or nullptr at the end of the list.  */
    struct outlist *next;
  };
62
/* A field of a line.  BEG points into the owning line's buffer; the
   field is not NUL-terminated, so LEN must always accompany it.  */
struct field
  {
    char *beg;			/* First character in field. */
    idx_t len;			/* The length of the field. */
  };
69
/* A line read from an input file, together with its split-out fields.
   The FIELDS array is grown on demand by extract_field and released
   by freeline.  */
struct line
  {
    struct linebuffer buf;	/* The line itself. */
    idx_t nfields;		/* Number of elements in 'fields'. */
    idx_t nfields_allocated;	/* Number of elements allocated for 'fields'. */
    struct field *fields;	/* Fields of the line; they point into BUF.  */
  };
78
/* One or more consecutive lines read from a file that all have the
   same join field value.  Slots of LINES at index COUNT and above may
   hold nullptr or a previously allocated line kept for reuse.  */
struct seq
  {
    idx_t count;			/* Elements used in 'lines'. */
    idx_t alloc;			/* Elements allocated in 'lines'. */
    struct line **lines;		/* Array of ALLOC line pointers.  */
  };
87
/* The previous line read from each file (index 0 for file 1, index 1
   for file 2).  Updated by get_line and compared against the next line
   by check_order; nullptr means there is nothing to compare against.  */
static struct line *prevline[2] = {nullptr, nullptr};

/* The number of lines read from each file; used in diagnostics.  */
static uintmax_t line_no[2] = {0, 0};

/* The input file names. */
static char *g_names[2];

/* This provides an extra line buffer for each file.  We need these if we
   try to read two consecutive lines into the same buffer, since we don't
   want to overwrite the previous buffer before we check order. */
static struct line *spareline[2] = {nullptr, nullptr};

/* True if the LC_COLLATE locale is hard. */
static bool hard_LC_COLLATE;

/* If true, print unpairable lines in file 1 or 2. */
static bool print_unpairables_1, print_unpairables_2;

/* If true, print pairable lines. */
static bool print_pairables;

/* If true, we have seen at least one unpairable line. */
static bool seen_unpairable;

/* If true, we have warned about disorder in that file. */
static bool issued_disorder_warning[2];

/* Empty output field filler, or nullptr if -e was not given.  */
static char const *empty_filler;

/* Whether to ensure the same number of fields are output from each line. */
static bool autoformat;
/* The number of fields to output for each line.
   Only significant when autoformat is true. */
static idx_t autocount_1;
static idx_t autocount_2;

/* Field to join on (zero-based); -1 means they haven't been determined
   yet.  */
static ptrdiff_t join_field_1 = -1;
static ptrdiff_t join_field_2 = -1;

/* List of fields to print.  This is a dummy head element: the first
   real element, if any, is outlist_head.next.  */
static struct outlist outlist_head;

/* Last element in 'outlist', where a new element can be added. */
static struct outlist *outlist_end = &outlist_head;

/* Tab character separating fields.  If negative, fields are separated
   by any nonempty string of blanks, otherwise by exactly one
   tab character whose value (when cast to unsigned char) equals TAB.
   The value '\n' (set by -t '') makes xfields treat the whole line as
   a single field.  */
static int tab = -1;

/* How to react to input that is not correctly ordered; see check_order.
   CHECK_ORDER_DEFAULT: warn, but only once an unpairable line was seen.
   CHECK_ORDER_ENABLED: always check, and treat disorder as fatal.
   CHECK_ORDER_DISABLED: never check.  */
static enum
  {
    CHECK_ORDER_DEFAULT,
    CHECK_ORDER_ENABLED,
    CHECK_ORDER_DISABLED
  } check_input_order;
149
/* Codes returned by getopt_long for long options that have no
   single-character equivalent.  They start above CHAR_MAX so they
   cannot collide with any short option character.  */
enum
{
  CHECK_ORDER_OPTION = CHAR_MAX + 1,
  NOCHECK_ORDER_OPTION,
  HEADER_LINE_OPTION
};


/* Long options accepted by main; the final all-zero entry terminates
   the table.  */
static struct option const longopts[] =
{
  {"ignore-case", no_argument, nullptr, 'i'},
  {"check-order", no_argument, nullptr, CHECK_ORDER_OPTION},
  {"nocheck-order", no_argument, nullptr, NOCHECK_ORDER_OPTION},
  {"zero-terminated", no_argument, nullptr, 'z'},
  {"header", no_argument, nullptr, HEADER_LINE_OPTION},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {nullptr, 0, nullptr, 0}
};
169
/* Used to print non-joining lines: a line with no fields that stands
   in for the missing side.  prjoin recognizes it by address.  */
static struct line uni_blank;

/* If true, ignore case when comparing join fields. */
static bool ignore_case;

/* If true, treat the first line of each file as column headers --
   join them without checking for ordering */
static bool join_header_lines;

/* The character marking end of line.  Default to \n; -z sets it to NUL. */
static char eolchar = '\n';
182
183 void
usage(int status)184 usage (int status)
185 {
186 if (status != EXIT_SUCCESS)
187 emit_try_help ();
188 else
189 {
190 printf (_("\
191 Usage: %s [OPTION]... FILE1 FILE2\n\
192 "),
193 program_name);
194 fputs (_("\
195 For each pair of input lines with identical join fields, write a line to\n\
196 standard output. The default join field is the first, delimited by blanks.\
197 \n\
198 "), stdout);
199 fputs (_("\
200 \n\
201 When FILE1 or FILE2 (not both) is -, read standard input.\n\
202 "), stdout);
203 fputs (_("\
204 \n\
205 -a FILENUM also print unpairable lines from file FILENUM, where\n\
206 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
207 "), stdout);
208 fputs (_("\
209 -e STRING replace missing (empty) input fields with STRING;\n\
210 I.e., missing fields specified with '-12jo' options\
211 \n\
212 "), stdout);
213 fputs (_("\
214 -i, --ignore-case ignore differences in case when comparing fields\n\
215 -j FIELD equivalent to '-1 FIELD -2 FIELD'\n\
216 -o FORMAT obey FORMAT while constructing output line\n\
217 -t CHAR use CHAR as input and output field separator\n\
218 "), stdout);
219 fputs (_("\
220 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
221 -1 FIELD join on this FIELD of file 1\n\
222 -2 FIELD join on this FIELD of file 2\n\
223 --check-order check that the input is correctly sorted, even\n\
224 if all input lines are pairable\n\
225 --nocheck-order do not check that the input is correctly sorted\n\
226 --header treat the first line in each file as field headers,\n\
227 print them without trying to pair them\n\
228 "), stdout);
229 fputs (_("\
230 -z, --zero-terminated line delimiter is NUL, not newline\n\
231 "), stdout);
232 fputs (HELP_OPTION_DESCRIPTION, stdout);
233 fputs (VERSION_OPTION_DESCRIPTION, stdout);
234 fputs (_("\
235 \n\
236 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
237 else fields are separated by CHAR. Any FIELD is a field number counted\n\
238 from 1. FORMAT is one or more comma or blank separated specifications,\n\
239 each being 'FILENUM.FIELD' or '0'. Default FORMAT outputs the join field,\n\
240 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
241 separated by CHAR. If FORMAT is the keyword 'auto', then the first\n\
242 line of each file determines the number of fields output for each line.\n\
243 \n\
244 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
245 E.g., use \"sort -k 1b,1\" if 'join' has no options,\n\
246 or use \"join -t ''\" if 'sort' has no options.\n\
247 Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\
248 If the input is not sorted and some lines cannot be joined, a\n\
249 warning message will be given.\n\
250 "), stdout);
251 emit_ancillary_info (PROGRAM_NAME);
252 }
253 exit (status);
254 }
255
256 /* Record a field in LINE, with location FIELD and size LEN. */
257
258 static void
extract_field(struct line * line,char * field,idx_t len)259 extract_field (struct line *line, char *field, idx_t len)
260 {
261 if (line->nfields >= line->nfields_allocated)
262 line->fields = xpalloc (line->fields, &line->nfields_allocated, 1,
263 -1, sizeof *line->fields);
264 line->fields[line->nfields].beg = field;
265 line->fields[line->nfields].len = len;
266 ++(line->nfields);
267 }
268
269 /* Fill in the 'fields' structure in LINE. */
270
271 static void
xfields(struct line * line)272 xfields (struct line *line)
273 {
274 char *ptr = line->buf.buffer;
275 char const *lim = ptr + line->buf.length - 1;
276
277 if (ptr == lim)
278 return;
279
280 if (0 <= tab && tab != '\n')
281 {
282 char *sep;
283 for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
284 extract_field (line, ptr, sep - ptr);
285 }
286 else if (tab < 0)
287 {
288 /* Skip leading blanks before the first field. */
289 while (field_sep (*ptr))
290 if (++ptr == lim)
291 return;
292
293 do
294 {
295 char *sep;
296 for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
297 continue;
298 extract_field (line, ptr, sep - ptr);
299 if (sep == lim)
300 return;
301 for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
302 continue;
303 }
304 while (ptr != lim);
305 }
306
307 extract_field (line, ptr, lim - ptr);
308 }
309
310 static void
freeline(struct line * line)311 freeline (struct line *line)
312 {
313 if (line == nullptr)
314 return;
315 free (line->fields);
316 line->fields = nullptr;
317 free (line->buf.buffer);
318 line->buf.buffer = nullptr;
319 }
320
321 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
322 >0 if it compares greater; 0 if it compares equal.
323 Report an error and exit if the comparison fails.
324 Use join fields JF_1 and JF_2 respectively. */
325
326 static int
keycmp(struct line const * line1,struct line const * line2,idx_t jf_1,idx_t jf_2)327 keycmp (struct line const *line1, struct line const *line2,
328 idx_t jf_1, idx_t jf_2)
329 {
330 /* Start of field to compare in each file. */
331 char *beg1;
332 char *beg2;
333
334 idx_t len1;
335 idx_t len2; /* Length of fields to compare. */
336 int diff;
337
338 if (jf_1 < line1->nfields)
339 {
340 beg1 = line1->fields[jf_1].beg;
341 len1 = line1->fields[jf_1].len;
342 }
343 else
344 {
345 beg1 = nullptr;
346 len1 = 0;
347 }
348
349 if (jf_2 < line2->nfields)
350 {
351 beg2 = line2->fields[jf_2].beg;
352 len2 = line2->fields[jf_2].len;
353 }
354 else
355 {
356 beg2 = nullptr;
357 len2 = 0;
358 }
359
360 if (len1 == 0)
361 return len2 == 0 ? 0 : -1;
362 if (len2 == 0)
363 return 1;
364
365 if (ignore_case)
366 {
367 /* FIXME: ignore_case does not work with NLS (in particular,
368 with multibyte chars). */
369 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
370 }
371 else
372 {
373 if (hard_LC_COLLATE)
374 return xmemcoll (beg1, len1, beg2, len2);
375 diff = memcmp (beg1, beg2, MIN (len1, len2));
376 }
377
378 if (diff)
379 return diff;
380 return (len1 > len2) - (len1 < len2);
381 }
382
383 /* Check that successive input lines PREV and CURRENT from input file
384 WHATFILE are presented in order, unless the user may be relying on
385 the GNU extension that input lines may be out of order if no input
386 lines are unpairable.
387
388 If the user specified --nocheck-order, the check is not made.
389 If the user specified --check-order, the problem is fatal.
390 Otherwise (the default), the message is simply a warning.
391
392 A message is printed at most once per input file. */
393
394 static void
check_order(const struct line * prev,const struct line * current,int whatfile)395 check_order (const struct line *prev,
396 const struct line *current,
397 int whatfile)
398 {
399 if (check_input_order != CHECK_ORDER_DISABLED
400 && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
401 {
402 if (!issued_disorder_warning[whatfile - 1])
403 {
404 idx_t join_field = whatfile == 1 ? join_field_1 : join_field_2;
405 if (keycmp (prev, current, join_field, join_field) > 0)
406 {
407 /* Exclude any trailing newline. */
408 idx_t len = current->buf.length;
409 if (0 < len && current->buf.buffer[len - 1] == '\n')
410 --len;
411
412 /* If the offending line is longer than INT_MAX, output
413 only the first INT_MAX bytes in this diagnostic. */
414 len = MIN (INT_MAX, len);
415
416 error ((check_input_order == CHECK_ORDER_ENABLED
417 ? EXIT_FAILURE : 0),
418 0, _("%s:%ju: is not sorted: %.*s"),
419 g_names[whatfile - 1], line_no[whatfile - 1],
420 (int) len, current->buf.buffer);
421
422 /* If we get to here, the message was merely a warning.
423 Arrange to issue it only once per file. */
424 issued_disorder_warning[whatfile - 1] = true;
425 }
426 }
427 }
428 }
429
430 static inline void
reset_line(struct line * line)431 reset_line (struct line *line)
432 {
433 line->nfields = 0;
434 }
435
436 static struct line *
init_linep(struct line ** linep)437 init_linep (struct line **linep)
438 {
439 struct line *line = xzalloc (sizeof *line);
440 *linep = line;
441 return line;
442 }
443
444 /* Read a line from FP into LINE and split it into fields.
445 Return true if successful. */
446
447 static bool
get_line(FILE * fp,struct line ** linep,int which)448 get_line (FILE *fp, struct line **linep, int which)
449 {
450 struct line *line = *linep;
451
452 if (line == prevline[which - 1])
453 {
454 SWAPLINES (line, spareline[which - 1]);
455 *linep = line;
456 }
457
458 if (line)
459 reset_line (line);
460 else
461 line = init_linep (linep);
462
463 if (! readlinebuffer_delim (&line->buf, fp, eolchar))
464 {
465 if (ferror (fp))
466 error (EXIT_FAILURE, errno, _("read error"));
467 freeline (line);
468 return false;
469 }
470 ++line_no[which - 1];
471
472 xfields (line);
473
474 if (prevline[which - 1])
475 check_order (prevline[which - 1], line, which);
476
477 prevline[which - 1] = line;
478 return true;
479 }
480
481 static void
free_spareline(void)482 free_spareline (void)
483 {
484 for (idx_t i = 0; i < ARRAY_CARDINALITY (spareline); i++)
485 {
486 if (spareline[i])
487 {
488 freeline (spareline[i]);
489 free (spareline[i]);
490 }
491 }
492 }
493
494 static void
initseq(struct seq * seq)495 initseq (struct seq *seq)
496 {
497 seq->count = 0;
498 seq->alloc = 0;
499 seq->lines = nullptr;
500 }
501
502 /* Read a line from FP and add it to SEQ. Return true if successful. */
503
504 static bool
getseq(FILE * fp,struct seq * seq,int whichfile)505 getseq (FILE *fp, struct seq *seq, int whichfile)
506 {
507 if (seq->count == seq->alloc)
508 {
509 seq->lines = xpalloc (seq->lines, &seq->alloc, 1, -1, sizeof *seq->lines);
510 for (idx_t i = seq->count; i < seq->alloc; i++)
511 seq->lines[i] = nullptr;
512 }
513
514 if (get_line (fp, &seq->lines[seq->count], whichfile))
515 {
516 ++seq->count;
517 return true;
518 }
519 return false;
520 }
521
522 /* Read a line from FP and add it to SEQ, as the first item if FIRST is
523 true, else as the next. */
524 static bool
advance_seq(FILE * fp,struct seq * seq,bool first,int whichfile)525 advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
526 {
527 if (first)
528 seq->count = 0;
529
530 return getseq (fp, seq, whichfile);
531 }
532
533 static void
delseq(struct seq * seq)534 delseq (struct seq *seq)
535 {
536 for (idx_t i = 0; i < seq->alloc; i++)
537 {
538 freeline (seq->lines[i]);
539 free (seq->lines[i]);
540 }
541 free (seq->lines);
542 }
543
544
545 /* Print field N of LINE if it exists and is nonempty, otherwise
546 'empty_filler' if it is nonempty. */
547
548 static void
prfield(idx_t n,struct line const * line)549 prfield (idx_t n, struct line const *line)
550 {
551 if (n < line->nfields)
552 {
553 idx_t len = line->fields[n].len;
554 if (len)
555 fwrite (line->fields[n].beg, 1, len, stdout);
556 else if (empty_filler)
557 fputs (empty_filler, stdout);
558 }
559 else if (empty_filler)
560 fputs (empty_filler, stdout);
561 }
562
563 /* Output all the fields in line, other than the join field. */
564
565 static void
prfields(struct line const * line,idx_t join_field,idx_t autocount)566 prfields (struct line const *line, idx_t join_field, idx_t autocount)
567 {
568 idx_t i;
569 idx_t nfields = autoformat ? autocount : line->nfields;
570 char output_separator = tab < 0 ? ' ' : tab;
571
572 for (i = 0; i < join_field && i < nfields; ++i)
573 {
574 putchar (output_separator);
575 prfield (i, line);
576 }
577 for (i = join_field + 1; i < nfields; ++i)
578 {
579 putchar (output_separator);
580 prfield (i, line);
581 }
582 }
583
584 /* Print the join of LINE1 and LINE2. */
585
586 static void
prjoin(struct line const * line1,struct line const * line2)587 prjoin (struct line const *line1, struct line const *line2)
588 {
589 const struct outlist *outlist;
590 char output_separator = tab < 0 ? ' ' : tab;
591 idx_t field;
592 struct line const *line;
593
594 outlist = outlist_head.next;
595 if (outlist)
596 {
597 const struct outlist *o;
598
599 o = outlist;
600 while (true)
601 {
602 if (o->file == 0)
603 {
604 if (line1 == &uni_blank)
605 {
606 line = line2;
607 field = join_field_2;
608 }
609 else
610 {
611 line = line1;
612 field = join_field_1;
613 }
614 }
615 else
616 {
617 line = (o->file == 1 ? line1 : line2);
618 field = o->field;
619 }
620 prfield (field, line);
621 o = o->next;
622 if (o == nullptr)
623 break;
624 putchar (output_separator);
625 }
626 putchar (eolchar);
627 }
628 else
629 {
630 if (line1 == &uni_blank)
631 {
632 line = line2;
633 field = join_field_2;
634 }
635 else
636 {
637 line = line1;
638 field = join_field_1;
639 }
640
641 /* Output the join field. */
642 prfield (field, line);
643
644 /* Output other fields. */
645 prfields (line1, join_field_1, autocount_1);
646 prfields (line2, join_field_2, autocount_2);
647
648 putchar (eolchar);
649 }
650
651 if (ferror (stdout))
652 write_error ();
653 }
654
/* Print the join of the files in FP1 and FP2.

   Both inputs are read sequentially.  At any time SEQ1 and SEQ2 hold
   the current run of lines from each file; a count of zero means that
   file is exhausted.  Lines whose join field has no match in the other
   file are printed only if the corresponding print_unpairables_* flag
   is set; matching runs are printed as a cross product when
   print_pairables is set.  */

static void
join (FILE *fp1, FILE *fp2)
{
  struct seq seq1, seq2;
  int diff;
  bool eof1, eof2;

  fadvise (fp1, FADVISE_SEQUENTIAL);
  fadvise (fp2, FADVISE_SEQUENTIAL);

  /* Read the first line of each file. */
  initseq (&seq1);
  getseq (fp1, &seq1, 1);
  initseq (&seq2);
  getseq (fp2, &seq2, 2);

  /* With -o auto, the first line of each file fixes how many fields
     are output for every line of that file.  */
  if (autoformat)
    {
      autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0;
      autocount_2 = seq2.count ? seq2.lines[0]->nfields : 0;
    }

  if (join_header_lines && (seq1.count || seq2.count))
    {
      /* Print the header lines joined unconditionally; an empty file
         contributes the blank line instead.  */
      struct line const *hline1 = seq1.count ? seq1.lines[0] : &uni_blank;
      struct line const *hline2 = seq2.count ? seq2.lines[0] : &uni_blank;
      prjoin (hline1, hline2);
      /* Forget the headers so that they take no part in order checks.  */
      prevline[0] = nullptr;
      prevline[1] = nullptr;
      if (seq1.count)
        advance_seq (fp1, &seq1, true, 1);
      if (seq2.count)
        advance_seq (fp2, &seq2, true, 2);
    }

  while (seq1.count && seq2.count)
    {
      diff = keycmp (seq1.lines[0], seq2.lines[0],
                     join_field_1, join_field_2);
      if (diff < 0)
        {
          /* The line from file 1 has no partner: skip past it.  */
          if (print_unpairables_1)
            prjoin (seq1.lines[0], &uni_blank);
          advance_seq (fp1, &seq1, true, 1);
          seen_unpairable = true;
          continue;
        }
      if (diff > 0)
        {
          /* The line from file 2 has no partner: skip past it.  */
          if (print_unpairables_2)
            prjoin (&uni_blank, seq2.lines[0]);
          advance_seq (fp2, &seq2, true, 2);
          seen_unpairable = true;
          continue;
        }

      /* Keep reading lines from file1 as long as they continue to
         match the current line from file2.  On exit the last slot
         counted in seq1 is not part of the match: it holds either the
         first non-matching line or, at end of file, nothing useful
         (the count is bumped so both cases look alike).  */
      eof1 = false;
      do
        if (!advance_seq (fp1, &seq1, false, 1))
          {
            eof1 = true;
            ++seq1.count;
            break;
          }
      while (!keycmp (seq1.lines[seq1.count - 1], seq2.lines[0],
                      join_field_1, join_field_2));

      /* Keep reading lines from file2 as long as they continue to
         match the current line from file1. */
      eof2 = false;
      do
        if (!advance_seq (fp2, &seq2, false, 2))
          {
            eof2 = true;
            ++seq2.count;
            break;
          }
      while (!keycmp (seq1.lines[0], seq2.lines[seq2.count - 1],
                      join_field_1, join_field_2));

      if (print_pairables)
        {
          /* Cross product of the matching runs; the final slot of each
             sequence is excluded, as explained above.  */
          for (idx_t i = 0; i < seq1.count - 1; ++i)
            {
              idx_t j;
              for (j = 0; j < seq2.count - 1; ++j)
                prjoin (seq1.lines[i], seq2.lines[j]);
            }
        }

      /* Move the first non-matching line, if there is one, to the
         front so that it starts the next run.  */
      if (!eof1)
        {
          SWAPLINES (seq1.lines[0], seq1.lines[seq1.count - 1]);
          seq1.count = 1;
        }
      else
        seq1.count = 0;

      if (!eof2)
        {
          SWAPLINES (seq2.lines[0], seq2.lines[seq2.count - 1]);
          seq2.count = 1;
        }
      else
        seq2.count = 0;
    }

  /* If the user did not specify --nocheck-order, then we read the
     tail ends of both inputs to verify that they are in order.  We
     skip the rest of the tail once we have issued a warning for that
     file, unless we actually need to print the unpairable lines.  */
  struct line *line = nullptr;
  bool checktail = false;

  if (check_input_order != CHECK_ORDER_DISABLED
      && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
    checktail = true;

  if ((print_unpairables_1 || checktail) && seq1.count)
    {
      if (print_unpairables_1)
        prjoin (seq1.lines[0], &uni_blank);
      /* Lines remaining in both files means they are all unpairable.  */
      if (seq2.count)
        seen_unpairable = true;
      while (get_line (fp1, &line, 1))
        {
          if (print_unpairables_1)
            prjoin (line, &uni_blank);
          if (issued_disorder_warning[0] && !print_unpairables_1)
            break;
        }
    }

  if ((print_unpairables_2 || checktail) && seq2.count)
    {
      if (print_unpairables_2)
        prjoin (&uni_blank, seq2.lines[0]);
      if (seq1.count)
        seen_unpairable = true;
      while (get_line (fp2, &line, 2))
        {
          if (print_unpairables_2)
            prjoin (&uni_blank, line);
          if (issued_disorder_warning[1] && !print_unpairables_2)
            break;
        }
    }

  /* LINE is still nullptr if neither tail was read; both calls accept
     that.  */
  freeline (line);
  free (line);

  delseq (&seq1);
  delseq (&seq2);
}
813
814 /* Add a field spec for field FIELD of file FILE to 'outlist'. */
815
816 static void
add_field(int file,idx_t field)817 add_field (int file, idx_t field)
818 {
819 struct outlist *o;
820
821 affirm (file == 0 || file == 1 || file == 2);
822 affirm (file != 0 || field == 0);
823
824 o = xmalloc (sizeof *o);
825 o->file = file;
826 o->field = field;
827 o->next = nullptr;
828
829 /* Add to the end of the list so the fields are in the right order. */
830 outlist_end->next = o;
831 outlist_end = o;
832 }
833
834 /* Convert a string of decimal digits, STR (the 1-based join field number),
835 to an integral value. Upon successful conversion, return one less
836 (the zero-based field number). Silently convert too-large values
837 to PTRDIFF_MAX. Otherwise, if a value cannot be converted, give a
838 diagnostic and exit. */
839
840 static idx_t
string_to_join_field(char const * str)841 string_to_join_field (char const *str)
842 {
843 intmax_t val;
844
845 strtol_error s_err = xstrtoimax (str, nullptr, 10, &val, "");
846 if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && PTRDIFF_MAX < val))
847 val = PTRDIFF_MAX;
848 else if (s_err != LONGINT_OK || val <= 0)
849 error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
850
851 return val - 1;
852 }
853
854 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
855 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
856 If S is valid, return true. Otherwise, give a diagnostic and exit. */
857
858 static void
decode_field_spec(char const * s,int * file_index,idx_t * field_index)859 decode_field_spec (char const *s, int *file_index, idx_t *field_index)
860 {
861 /* The first character must be 0, 1, or 2. */
862 switch (s[0])
863 {
864 case '0':
865 if (s[1])
866 {
867 /* '0' must be all alone -- no '.FIELD'. */
868 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
869 }
870 *file_index = 0;
871 *field_index = 0;
872 break;
873
874 case '1':
875 case '2':
876 if (s[1] != '.')
877 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
878 *file_index = s[0] - '0';
879 *field_index = string_to_join_field (s + 2);
880 break;
881
882 default:
883 error (EXIT_FAILURE, 0,
884 _("invalid file number in field spec: %s"), quote (s));
885 }
886 }
887
888 /* Add the comma or blank separated field spec(s) in STR to 'outlist'. */
889
890 static void
add_field_list(char * str)891 add_field_list (char *str)
892 {
893 char *p = str;
894
895 do
896 {
897 int file_index;
898 idx_t field_index;
899 char const *spec_item = p;
900
901 p = strpbrk (p, ", \t");
902 if (p)
903 *p++ = '\0';
904 decode_field_spec (spec_item, &file_index, &field_index);
905 add_field (file_index, field_index);
906 }
907 while (p);
908 }
909
910 /* Set the join field *VAR to VAL, but report an error if *VAR is set
911 more than once to incompatible values. */
912
913 static void
set_join_field(ptrdiff_t * var,idx_t val)914 set_join_field (ptrdiff_t *var, idx_t val)
915 {
916 if (0 <= *var && *var != val)
917 error (EXIT_FAILURE, 0,
918 _("incompatible join fields %td, %td"), *var, val);
919 *var = val;
920 }
921
/* Status of command-line arguments.  main records one of these for
   each non-option argument, based on the option that preceded it, so
   that add_file_name can later reinterpret an apparent file name as
   the argument of that option.  */

enum operand_status
  {
    /* This argument must be an operand, i.e., one of the files to be
       joined.  */
    MUST_BE_OPERAND,

    /* This might be the argument of the preceding -j1 or -j2 option,
       or it might be an operand.  main computes the second value as
       MIGHT_BE_J1_ARG + 1, so these two must stay adjacent and in
       this order.  */
    MIGHT_BE_J1_ARG,
    MIGHT_BE_J2_ARG,

    /* This might be the argument of the preceding -o option, or it might be
       an operand.  */
    MIGHT_BE_O_ARG
  };
939
/* Add NAME to the array of input file NAMES with operand statuses
   OPERAND_STATUS; currently there are *NFILES names in the list.

   If two names are already recorded, one of them must really have been
   an option argument: reinterpret the first one that is not marked
   MUST_BE_OPERAND as the argument of the option that preceded it, and
   drop it from the list.  If both are definite operands, NAME is an
   extra operand: diagnose it and exit via usage.

   JOPTION_COUNT counts pending "-j1"/"-j2" options whose argument has
   not been found yet.  *PREV_OPTC_STATUS is the status implied by the
   option preceding NAME; *OPTC_STATUS is set to MIGHT_BE_O_ARG when
   NAME might itself be part of a -o list.  */

static void
add_file_name (char *name, char *names[2],
               int operand_status[2], int joption_count[2], int *nfiles,
               int *prev_optc_status, int *optc_status)
{
  int n = *nfiles;

  if (n == 2)
    {
      /* OP0 doubles as an index: when names[0] is a definite operand
         the candidate for reinterpretation is names[1], else names[0].  */
      bool op0 = (operand_status[0] == MUST_BE_OPERAND);
      char *arg = names[op0];
      switch (operand_status[op0])
        {
        case MUST_BE_OPERAND:
          error (0, 0, _("extra operand %s"), quoteaf (name));
          /* usage exits, so control never reaches the next case.  */
          usage (EXIT_FAILURE);

        case MIGHT_BE_J1_ARG:
          joption_count[0]--;
          set_join_field (&join_field_1, string_to_join_field (arg));
          break;

        case MIGHT_BE_J2_ARG:
          joption_count[1]--;
          set_join_field (&join_field_2, string_to_join_field (arg));
          break;

        case MIGHT_BE_O_ARG:
          add_field_list (arg);
          break;
        }
      /* If names[0] was consumed, shift names[1] down to fill the gap.  */
      if (!op0)
        {
          operand_status[0] = operand_status[1];
          names[0] = names[1];
        }
      n = 1;
    }

  operand_status[n] = *prev_optc_status;
  names[n] = name;
  *nfiles = n + 1;
  /* A -o list may continue over several arguments.  */
  if (*prev_optc_status == MIGHT_BE_O_ARG)
    *optc_status = MIGHT_BE_O_ARG;
}
988
989 int
main(int argc,char ** argv)990 main (int argc, char **argv)
991 {
992 int optc_status;
993 int prev_optc_status = MUST_BE_OPERAND;
994 int operand_status[2];
995 int joption_count[2] = { 0, 0 };
996 FILE *fp1, *fp2;
997 int optc;
998 int nfiles = 0;
999 int i;
1000
1001 initialize_main (&argc, &argv);
1002 set_program_name (argv[0]);
1003 setlocale (LC_ALL, "");
1004 bindtextdomain (PACKAGE, LOCALEDIR);
1005 textdomain (PACKAGE);
1006 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1007
1008 atexit (close_stdout);
1009 atexit (free_spareline);
1010
1011 print_pairables = true;
1012 seen_unpairable = false;
1013 issued_disorder_warning[0] = issued_disorder_warning[1] = false;
1014 check_input_order = CHECK_ORDER_DEFAULT;
1015
1016 while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z",
1017 longopts, nullptr))
1018 != -1)
1019 {
1020 optc_status = MUST_BE_OPERAND;
1021
1022 switch (optc)
1023 {
1024 case 'v':
1025 print_pairables = false;
1026 FALLTHROUGH;
1027
1028 case 'a':
1029 {
1030 long int val;
1031 if (xstrtol (optarg, nullptr, 10, &val, "") != LONGINT_OK
1032 || (val != 1 && val != 2))
1033 error (EXIT_FAILURE, 0,
1034 _("invalid field number: %s"), quote (optarg));
1035 if (val == 1)
1036 print_unpairables_1 = true;
1037 else
1038 print_unpairables_2 = true;
1039 }
1040 break;
1041
1042 case 'e':
1043 if (empty_filler && ! STREQ (empty_filler, optarg))
1044 error (EXIT_FAILURE, 0,
1045 _("conflicting empty-field replacement strings"));
1046 empty_filler = optarg;
1047 break;
1048
1049 case 'i':
1050 ignore_case = true;
1051 break;
1052
1053 case '1':
1054 set_join_field (&join_field_1, string_to_join_field (optarg));
1055 break;
1056
1057 case '2':
1058 set_join_field (&join_field_2, string_to_join_field (optarg));
1059 break;
1060
1061 case 'j':
1062 if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
1063 && optarg == argv[optind - 1] + 2)
1064 {
1065 /* The argument was either "-j1" or "-j2". */
1066 bool is_j2 = (optarg[0] == '2');
1067 joption_count[is_j2]++;
1068 optc_status = MIGHT_BE_J1_ARG + is_j2;
1069 }
1070 else
1071 {
1072 set_join_field (&join_field_1, string_to_join_field (optarg));
1073 set_join_field (&join_field_2, join_field_1);
1074 }
1075 break;
1076
1077 case 'o':
1078 if (STREQ (optarg, "auto"))
1079 autoformat = true;
1080 else
1081 {
1082 add_field_list (optarg);
1083 optc_status = MIGHT_BE_O_ARG;
1084 }
1085 break;
1086
1087 case 't':
1088 {
1089 unsigned char newtab = optarg[0];
1090 if (! newtab)
1091 newtab = '\n'; /* '' => process the whole line. */
1092 else if (optarg[1])
1093 {
1094 if (STREQ (optarg, "\\0"))
1095 newtab = '\0';
1096 else
1097 error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1098 quote (optarg));
1099 }
1100 if (0 <= tab && tab != newtab)
1101 error (EXIT_FAILURE, 0, _("incompatible tabs"));
1102 tab = newtab;
1103 }
1104 break;
1105
1106 case 'z':
1107 eolchar = 0;
1108 break;
1109
1110 case NOCHECK_ORDER_OPTION:
1111 check_input_order = CHECK_ORDER_DISABLED;
1112 break;
1113
1114 case CHECK_ORDER_OPTION:
1115 check_input_order = CHECK_ORDER_ENABLED;
1116 break;
1117
1118 case 1: /* Non-option argument. */
1119 add_file_name (optarg, g_names, operand_status, joption_count,
1120 &nfiles, &prev_optc_status, &optc_status);
1121 break;
1122
1123 case HEADER_LINE_OPTION:
1124 join_header_lines = true;
1125 break;
1126
1127 case_GETOPT_HELP_CHAR;
1128
1129 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1130
1131 default:
1132 usage (EXIT_FAILURE);
1133 }
1134
1135 prev_optc_status = optc_status;
1136 }
1137
1138 /* Process any operands after "--". */
1139 prev_optc_status = MUST_BE_OPERAND;
1140 while (optind < argc)
1141 add_file_name (argv[optind++], g_names, operand_status, joption_count,
1142 &nfiles, &prev_optc_status, &optc_status);
1143
1144 if (nfiles != 2)
1145 {
1146 if (nfiles == 0)
1147 error (0, 0, _("missing operand"));
1148 else
1149 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1150 usage (EXIT_FAILURE);
1151 }
1152
1153 /* If "-j1" was specified and it turns out not to have had an argument,
1154 treat it as "-j 1". Likewise for -j2. */
1155 for (i = 0; i < 2; i++)
1156 if (joption_count[i] != 0)
1157 {
1158 set_join_field (&join_field_1, i);
1159 set_join_field (&join_field_2, i);
1160 }
1161
1162 if (join_field_1 < 0)
1163 join_field_1 = 0;
1164 if (join_field_2 < 0)
1165 join_field_2 = 0;
1166
1167 fp1 = STREQ (g_names[0], "-") ? stdin : fopen (g_names[0], "r");
1168 if (!fp1)
1169 error (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1170 fp2 = STREQ (g_names[1], "-") ? stdin : fopen (g_names[1], "r");
1171 if (!fp2)
1172 error (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1173 if (fp1 == fp2)
1174 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
1175 join (fp1, fp2);
1176
1177 if (fclose (fp1) != 0)
1178 error (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1179 if (fclose (fp2) != 0)
1180 error (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1181
1182 if (issued_disorder_warning[0] || issued_disorder_warning[1])
1183 error (EXIT_FAILURE, 0, _("input is not in sorted order"));
1184 else
1185 return EXIT_SUCCESS;
1186 }
1187