1#!/usr/bin/perl 2# Exercise expr with multibyte input 3 4# Copyright (C) 2017-2023 Free Software Foundation, Inc. 5 6# This program is free software: you can redistribute it and/or modify 7# it under the terms of the GNU General Public License as published by 8# the Free Software Foundation, either version 3 of the License, or 9# (at your option) any later version. 10 11# This program is distributed in the hope that it will be useful, 12# but WITHOUT ANY WARRANTY; without even the implied warranty of 13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14# GNU General Public License for more details. 15 16# You should have received a copy of the GNU General Public License 17# along with this program. If not, see <https://www.gnu.org/licenses/>. 18 19use strict; 20 21(my $ME = $0) =~ s|.*/||; 22 23my $limits = getlimits (); 24my $UINTMAX_OFLOW = $limits->{UINTMAX_OFLOW}; 25 26(my $program_name = $0) =~ s|.*/||; 27my $prog = 'expr'; 28 29my $locale = $ENV{LOCALE_FR_UTF8}; 30! defined $locale || $locale eq 'none' 31 and CuSkip::skip "$ME: this test requires FR-UTF8 locale\n"; 32 33 34=pod 35ἔκφρασις (ekphrasis) - "expression" in Ancient Greek. 36=cut 37my $expression = "\x{1F14}\x{3BA}\x{3C6}\x{3C1}\x{3B1}\x{3C3}\x{3B9}\x{3C2}"; 38 39 40## NOTE about tests locales: 41## Tests starting with 'mb' will have {ENV=>"LC_ALL=$locale"} 42## added to them automatically - results are multibyte-aware. 43## Tests starting with 'sb' have the same input but will be 44## run under C locale and will be treated as single-bytes. 45## This enables interleaving C/UTF8 tests 46## (for easier comparison of expected results). 47 48my @Tests = 49 ( 50 ### length expressions ### 51 52 # sanity check 53 ['mb-l1', 'length abcdef', {OUT=>"6"}], 54 ['st-l1', 'length abcdef', {OUT=>"6"}], 55 56 # A single multibyte character in the beginning of the string 57 # \xCE\xB1 is UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA" 58 ['mb-l2', "length \xCE\xB1bcdef", {OUT=>"6"}], 59 ['st-l2', "length \xCE\xB1bcdef", {OUT=>"7"}], 60 61 # A single multibyte character in the middle of the string 62 # \xCE\xB4 is UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA" 63 ['mb-l3', "length abc\xCE\xB4ef", {OUT=>"6"}], 64 ['st-l3', "length abc\xCE\xB4ef", {OUT=>"7"}], 65 66 # A single multibyte character in the end of the string 67 ['mb-l4', "length fedcb\xCE\xB1", {OUT=>"6"}], 68 ['st-l4', "length fedcb\xCE\xB1", {OUT=>"7"}], 69 70 # A invalid multibyte sequence 71 ['mb-l5', "length \xB1aaa", {OUT=>"4"}], 72 ['st-l5', "length \xB1aaa", {OUT=>"4"}], 73 74 # An incomplete multibyte sequence at the end of the string 75 ['mb-l6', "length aaa\xCE", {OUT=>"4"}], 76 ['st-l6', "length aaa\xCE", {OUT=>"4"}], 77 78 # An incomplete multibyte sequence at the end of the string 79 ['mb-l7', "length $expression", {OUT=>"8"}], 80 ['st-l7', "length $expression", {OUT=>"17"}], 81 82 83 84 ### index expressions ### 85 86 # sanity check 87 ['mb-i1', 'index abcdef fb', {OUT=>"2"}], 88 ['st-i1', 'index abcdef fb', {OUT=>"2"}], 89 90 # Search for a single-octet 91 ['mb-i2', "index \xCE\xB1bc\xCE\xB4ef b", {OUT=>"2"}], 92 ['st-i2', "index \xCE\xB1bc\xCE\xB4ef b", {OUT=>"3"}], 93 ['mb-i3', "index \xCE\xB1bc\xCE\xB4ef f", {OUT=>"6"}], 94 ['st-i3', "index \xCE\xB1bc\xCE\xB4ef f", {OUT=>"8"}], 95 96 # Search for multibyte character. 97 # In the C locale, the search string is treated as two octets. 98 # the first of them (\xCE) matches the first octet of the input string. 99 ['mb-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4", {OUT=>"4"}], 100 ['st-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4", {OUT=>"1"}], 101 102 # Invalid multibyte sequence in the input string, treated as a single octet. 103 ['mb-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4", {OUT=>"4"}], 104 ['st-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4", {OUT=>"1"}], 105 106 # Invalid multibyte sequence in the search string, treated as a single octet. 107 # In multibyte locale, there should be no match, expr returns and prints 108 # zero, and terminates with exit-code 1 (as per POSIX). 109 ['mb-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4", {OUT=>"0"}, {EXIT=>1}], 110 ['st-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4", {OUT=>"6"}], 111 112 # Edge-case: invalid multibyte sequence BOTH in the input string 113 # and in the search string: expr should find a match. 114 ['mb-i7', "index \xCE\xB1bc\xB4ef \xB4", {OUT=>"4"}], 115 116 117 ### substr expressions ### 118 119 # sanity check 120 ['mb-s1', 'substr abcdef 2 3', {OUT=>"bcd"}], 121 ['st-s1', 'substr abcdef 2 3', {OUT=>"bcd"}], 122 123 ['mb-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1", {OUT=>"\xCE\xB1"}], 124 ['st-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1", {OUT=>"\xCE"}], 125 126 ['mb-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2", {OUT=>"c\xCE\xB4"}], 127 ['st-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2", {OUT=>"bc"}], 128 129 ['mb-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1", {OUT=>"\xCE\xB4"}], 130 ['st-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1", {OUT=>"c"}], 131 132 ['mb-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2", {OUT=>"\xCE\xB4e"}], 133 ['st-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2", {OUT=>"c\xCE"}], 134 135 ['mb-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1", {OUT=>"f"}], 136 ['st-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1", {OUT=>"\xB4"}], 137 138 ['mb-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1", {OUT=>""}, {EXIT=>1}], 139 ['st-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1", {OUT=>"e"}], 140 141 # Invalid multibyte sequences 142 ['mb-s8', "substr \xCE\xB1bc\xB4ef 3 3", {OUT=>"c\xB4e"}], 143 ['st-s8', "substr \xCE\xB1bc\xB4ef 3 3", {OUT=>"bc\xB4"}], 144 145 146 ### match expressions ### 147 148 # sanity check 149 ['mb-m1', 'match abcdef ab', {OUT=>"2"}], 150 ['st-m1', 'match abcdef ab', {OUT=>"2"}], 151 ['mb-m2', 'match abcdef "\(ab\)"', {OUT=>"ab"}], 152 ['st-m2', 'match abcdef "\(ab\)"', {OUT=>"ab"}], 153 154 # The regex engine should match the '.' to the first multibyte character. 155 ['mb-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"3"}], 156 ['st-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"0"}, {EXIT=>1}], 157 158 # The opposite of the previous test: two dots should only match 159 # the two octets in single-byte locale. 160 ['mb-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"0"}, {EXIT=>1}], 161 ['st-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"4"}], 162 163 # Match with grouping - a single dot should return the two octets 164 ['mb-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>"\xCE\xB1b"}], 165 ['st-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>""}, {EXIT=>1}], 166 167 # Invalid multibyte sequences - regex should not match in multibyte locale 168 # (POSIX requirement) 169 ['mb-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>""}, {EXIT=>1}], 170 ['st-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>"\xCE"}], 171 172 173 # Character classes: in the multibyte case, the regex engine understands 174 # there is a single multibyte character in the brackets. 175 # In the single byte case, the regex engine sees two octets in the character 176 # class ('\xCE' and '\xB1') - and it matches the first one. 177 ['mb-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE\xB1"}], 178 ['st-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE"}], 179 180 ); 181 182 183# Append a newline to end of each expected 'OUT' string. 184my $t; 185foreach $t (@Tests) 186 { 187 my $arg1 = $t->[1]; 188 my $e; 189 foreach $e (@$t) 190 { 191 $e->{OUT} .= "\n" 192 if ref $e eq 'HASH' and exists $e->{OUT}; 193 } 194 } 195 196 197# Force multibyte locale in all tests. 198# 199# NOTE about the ERR_SUBST: 200# The error tests above (e1/e2/e3/e4) expect error messages in C locale 201# having single-quote character (ASCII 0x27). 202# In UTF-8 locale, the error messages will use: 203# 'LEFT SINGLE QUOTATION MARK' (U+2018) (UTF8: 0xE2 0x80 0x98) 204# 'RIGHT SINGLE QUOTATION MARK' (U+2019) (UTF8: 0xE2 0x80 0x99) 205# So we replace them with ascii single-quote and the results will 206# match the expected error string. 207if ($locale ne 'C') 208 { 209 my @new; 210 foreach my $t (@Tests) 211 { 212 my ($tname) = @$t; 213 if ($tname =~ /^mb/) 214 { 215 push @$t, ({ENV => "LC_ALL=$locale"}, 216 {ERR_SUBST => "s/\xe2\x80[\x98\x99]/'/g"}); 217 } 218 } 219 } 220 221 222my $save_temps = $ENV{DEBUG}; 223my $verbose = $ENV{VERBOSE}; 224 225my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 226exit $fail; 227