1#!/usr/bin/perl
2# Exercise expr with multibyte input
3
4# Copyright (C) 2017-2023 Free Software Foundation, Inc.
5
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15
16# You should have received a copy of the GNU General Public License
17# along with this program.  If not, see <https://www.gnu.org/licenses/>.
18
19use strict;
20
21(my $ME = $0) =~ s|.*/||;
22
23my $limits = getlimits ();
24my $UINTMAX_OFLOW = $limits->{UINTMAX_OFLOW};
25
26(my $program_name = $0) =~ s|.*/||;
27my $prog = 'expr';
28
29my $locale = $ENV{LOCALE_FR_UTF8};
30! defined $locale || $locale eq 'none'
31  and CuSkip::skip "$ME: this test requires FR-UTF8 locale\n";
32
33
34=pod
35ἔκφρασις (ekphrasis) - "expression" in Ancient Greek.
36=cut
37my $expression = "\x{1F14}\x{3BA}\x{3C6}\x{3C1}\x{3B1}\x{3C3}\x{3B9}\x{3C2}";
38
39
40## NOTE about tests locales:
41## Tests starting with 'mb' will have {ENV=>"LC_ALL=$locale"}
42## added to them automatically - results are multibyte-aware.
43## Tests starting with 'sb' have the same input but will be
44## run under C locale and will be treated as single-bytes.
45## This enables interleaving C/UTF8 tests
46## (for easier comparison of expected results).
47
48my @Tests =
49  (
50   ### length expressions ###
51
52   # sanity check
53   ['mb-l1', 'length abcdef',   {OUT=>"6"}],
54   ['st-l1', 'length abcdef',   {OUT=>"6"}],
55
56   # A single multibyte character in the beginning of the string
57   # \xCE\xB1 is UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA"
58   ['mb-l2', "length \xCE\xB1bcdef",   {OUT=>"6"}],
59   ['st-l2', "length \xCE\xB1bcdef",   {OUT=>"7"}],
60
61   # A single multibyte character in the middle of the string
62   # \xCE\xB4 is UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA"
63   ['mb-l3', "length abc\xCE\xB4ef",   {OUT=>"6"}],
64   ['st-l3', "length abc\xCE\xB4ef",   {OUT=>"7"}],
65
66   # A single multibyte character in the end of the string
67   ['mb-l4', "length fedcb\xCE\xB1",   {OUT=>"6"}],
68   ['st-l4', "length fedcb\xCE\xB1",   {OUT=>"7"}],
69
70   # A invalid multibyte sequence
71   ['mb-l5', "length \xB1aaa",   {OUT=>"4"}],
72   ['st-l5', "length \xB1aaa",   {OUT=>"4"}],
73
74   # An incomplete multibyte sequence at the end of the string
75   ['mb-l6', "length aaa\xCE",   {OUT=>"4"}],
76   ['st-l6', "length aaa\xCE",   {OUT=>"4"}],
77
78   # An incomplete multibyte sequence at the end of the string
79   ['mb-l7', "length $expression",   {OUT=>"8"}],
80   ['st-l7', "length $expression",   {OUT=>"17"}],
81
82
83
84   ### index expressions ###
85
86   # sanity check
87   ['mb-i1', 'index abcdef fb',   {OUT=>"2"}],
88   ['st-i1', 'index abcdef fb',   {OUT=>"2"}],
89
90   # Search for a single-octet
91   ['mb-i2', "index \xCE\xB1bc\xCE\xB4ef b",   {OUT=>"2"}],
92   ['st-i2', "index \xCE\xB1bc\xCE\xB4ef b",   {OUT=>"3"}],
93   ['mb-i3', "index \xCE\xB1bc\xCE\xB4ef f",   {OUT=>"6"}],
94   ['st-i3', "index \xCE\xB1bc\xCE\xB4ef f",   {OUT=>"8"}],
95
96   # Search for multibyte character.
97   # In the C locale, the search string is treated as two octets.
98   # the first of them (\xCE) matches the first octet of the input string.
99   ['mb-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4",   {OUT=>"4"}],
100   ['st-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4",   {OUT=>"1"}],
101
102   # Invalid multibyte sequence in the input string, treated as a single octet.
103   ['mb-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4",   {OUT=>"4"}],
104   ['st-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4",   {OUT=>"1"}],
105
106   # Invalid multibyte sequence in the search string, treated as a single octet.
107   # In multibyte locale, there should be no match, expr returns and prints
108   # zero, and terminates with exit-code 1 (as per POSIX).
109   ['mb-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4",   {OUT=>"0"}, {EXIT=>1}],
110   ['st-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4",   {OUT=>"6"}],
111
112   # Edge-case: invalid multibyte sequence BOTH in the input string
113   # and in the search string: expr should find a match.
114   ['mb-i7', "index \xCE\xB1bc\xB4ef \xB4",       {OUT=>"4"}],
115
116
117   ### substr expressions ###
118
119   # sanity check
120   ['mb-s1', 'substr abcdef 2 3',   {OUT=>"bcd"}],
121   ['st-s1', 'substr abcdef 2 3',   {OUT=>"bcd"}],
122
123   ['mb-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1",   {OUT=>"\xCE\xB1"}],
124   ['st-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1",   {OUT=>"\xCE"}],
125
126   ['mb-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2",   {OUT=>"c\xCE\xB4"}],
127   ['st-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2",   {OUT=>"bc"}],
128
129   ['mb-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1",   {OUT=>"\xCE\xB4"}],
130   ['st-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1",   {OUT=>"c"}],
131
132   ['mb-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2",   {OUT=>"\xCE\xB4e"}],
133   ['st-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2",   {OUT=>"c\xCE"}],
134
135   ['mb-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1",   {OUT=>"f"}],
136   ['st-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1",   {OUT=>"\xB4"}],
137
138   ['mb-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1",   {OUT=>""}, {EXIT=>1}],
139   ['st-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1",   {OUT=>"e"}],
140
141   # Invalid multibyte sequences
142   ['mb-s8', "substr \xCE\xB1bc\xB4ef 3 3",   {OUT=>"c\xB4e"}],
143   ['st-s8', "substr \xCE\xB1bc\xB4ef 3 3",   {OUT=>"bc\xB4"}],
144
145
146   ### match expressions ###
147
148   # sanity check
149   ['mb-m1', 'match abcdef ab',   {OUT=>"2"}],
150   ['st-m1', 'match abcdef ab',   {OUT=>"2"}],
151   ['mb-m2', 'match abcdef "\(ab\)"',   {OUT=>"ab"}],
152   ['st-m2', 'match abcdef "\(ab\)"',   {OUT=>"ab"}],
153
154   # The regex engine should match the '.' to the first multibyte character.
155   ['mb-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"3"}],
156   ['st-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"0"}, {EXIT=>1}],
157
158   # The opposite of the previous test: two dots should only match
159   # the two octets in single-byte locale.
160   ['mb-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"0"}, {EXIT=>1}],
161   ['st-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"4"}],
162
163   # Match with grouping - a single dot should return the two octets
164   ['mb-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>"\xCE\xB1b"}],
165   ['st-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>""}, {EXIT=>1}],
166
167   # Invalid multibyte sequences - regex should not match in multibyte locale
168   # (POSIX requirement)
169   ['mb-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>""}, {EXIT=>1}],
170   ['st-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>"\xCE"}],
171
172
173   # Character classes: in the multibyte case, the regex engine understands
174   # there is a single multibyte character in the brackets.
175   # In the single byte case, the regex engine sees two octets in the character
176   # class ('\xCE' and '\xB1') - and it matches the first one.
177   ['mb-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE\xB1"}],
178   ['st-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE"}],
179
180  );
181
182
183# Append a newline to end of each expected 'OUT' string.
184my $t;
185foreach $t (@Tests)
186  {
187    my $arg1 = $t->[1];
188    my $e;
189    foreach $e (@$t)
190      {
191        $e->{OUT} .= "\n"
192          if ref $e eq 'HASH' and exists $e->{OUT};
193      }
194  }
195
196
197# Force multibyte locale in all tests.
198#
199# NOTE about the ERR_SUBST:
200# The error tests above (e1/e2/e3/e4) expect error messages in C locale
201# having single-quote character (ASCII 0x27).
202# In UTF-8 locale, the error messages will use:
203#  'LEFT SINGLE QUOTATION MARK'  (U+2018) (UTF8: 0xE2 0x80 0x98)
204#  'RIGHT SINGLE QUOTATION MARK' (U+2019) (UTF8: 0xE2 0x80 0x99)
205# So we replace them with ascii single-quote and the results will
206# match the expected error string.
207if ($locale ne 'C')
208  {
209    my @new;
210    foreach my $t (@Tests)
211      {
212        my ($tname) = @$t;
213        if ($tname =~ /^mb/)
214          {
215            push @$t, ({ENV => "LC_ALL=$locale"},
216                       {ERR_SUBST => "s/\xe2\x80[\x98\x99]/'/g"});
217          }
218      }
219  }
220
221
222my $save_temps = $ENV{DEBUG};
223my $verbose = $ENV{VERBOSE};
224
225my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
226exit $fail;
227