#!/bin/sh
# before coreutils-8.32, uniq would not distinguish
# items which compared equal with strcoll()
# So ensure we avoid strcoll() for the following cases.

# Copyright (C) 2020-2023 Free Software Foundation, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ uniq printf

# gen_input FORMAT [ARG]...
# Write the printf expansion of FORMAT and ARGs to the file 'in',
# replacing any previous contents.  printf is run through env, so an
# external printf found via PATH is used rather than a shell builtin,
# and it runs with LC_ALL set to $LOCALE_FR_UTF8.
# Calls framework_failure_ if the input file cannot be generated.
gen_input()
{
  env LC_ALL="$LOCALE_FR_UTF8" printf "$@" > in || framework_failure_
}

# Every check below feeds two distinct lines to uniq and expects both
# to be output.  The $(...) substitutions are intentionally left
# unquoted so that any blank padding emitted by wc is removed by
# field splitting before the comparison with 2.

# strcoll() used to return 0 comparing the following strings
# which was fixed somewhere between glibc-2.22 and glibc-2.30
gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ'
test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1

# normalization in strcoll is inconsistent across platforms.
# glibc based systems at least do _not_ normalize in strcoll,
# while cygwin systems for example may do so.
# á composed and decomposed, are generally not compared equal
gen_input '\u00E1\na\u0301\n'
test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
# Similarly with the following equivalent hangul characters
gen_input '\uAC01\n\u1100\u1161\u11A8\n'
test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1

# Note if running in the wrong locale,
# strcoll may indicate the strings match when they don't.
# I.e., cjk and hangul will now work even if
# uniq is running in the wrong locale
# hangul (ko_KR.utf8)
gen_input '\uAC00\n\uAC01\n'
test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
# CJK (zh_CN.utf8)
gen_input '\u3400\n\u3401\n'
test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1

# Note strcoll() ignores certain characters,
# but not if the strings are otherwise equal.
# I.e., the following on glibc-2.30 at least,
# as expected, does not print a single item,
# but testing here for illustration
gen_input ',a\n.a\n'
test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1

Exit $fail