1#!/bin/sh
# Before coreutils-8.32, uniq treated lines as duplicates whenever
# strcoll() compared them equal, even when the lines themselves differed.
# So ensure we avoid strcoll() for the following cases.
5
6# Copyright (C) 2020-2023 Free Software Foundation, Inc.
7
8# This program is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16# GNU General Public License for more details.
17
18# You should have received a copy of the GNU General Public License
19# along with this program.  If not, see <https://www.gnu.org/licenses/>.
20
# Load the shared test framework from tests/init.sh below $srcdir;
# srcdir is first set to '.' when it is not already set.
. "${srcdir=.}/tests/init.sh"
# path_prepend_ is a framework helper; going by its name it puts ./src at
# the front of the command search path -- confirm in init.sh.
path_prepend_ ./src
# Name the programs this test exercises.  print_ver_ is a framework
# helper (presumably it reports their versions -- confirm in init.sh).
print_ver_ uniq printf
23
# gen_input FORMAT [ARGUMENT]...
# Write the output of printf FORMAT [ARGUMENT]... to the file 'in' in the
# current directory, replacing any previous contents.
# printf is started through env, so the printf found on the search path
# (not a shell builtin) is used, with LC_ALL set to $LOCALE_FR_UTF8.
# If that command or the redirection fails, framework_failure_ is invoked.
gen_input()
{
  # The whole name=value word is quoted so that env always receives it as
  # a single argument, whatever $LOCALE_FR_UTF8 holds; unquoted, a value
  # containing whitespace would be split and env would try to run the
  # second word as the command.
  env "LC_ALL=$LOCALE_FR_UTF8" printf "$@" > in || framework_failure_
}
28
# expect_distinct_ LOCALE CASE
# Run uniq on the file 'in' with LC_ALL set to LOCALE, and require that
# exactly 2 lines come out, i.e. that neither of the two input lines was
# dropped as a duplicate of the other.  Otherwise name CASE and LOCALE on
# stderr and set fail=1, without exiting, so the remaining cases still run.
expect_distinct_()
{
  uniq_lines_=$(LC_ALL=$1 uniq < in | wc -l)
  # Deliberately unquoted: word splitting discards any blank padding
  # that wc might put around the count.
  test $uniq_lines_ = 2 || {
    printf '%s\n' "uniq merged distinct lines: $2 (LC_ALL=$1)" >&2
    fail=1
  }
}

# strcoll() used to return 0 when comparing the following strings,
# which was fixed somewhere between glibc-2.22 and glibc-2.30.
gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ'
expect_distinct_ "$LOCALE_FR_UTF8" 'strings that old glibc collated as equal'

# Normalization in strcoll() is inconsistent across platforms:
# glibc based systems at least do _not_ normalize in strcoll(),
# while cygwin systems for example may do so.
# The composed and decomposed forms of á generally do not compare equal.
gen_input '\u00E1\na\u0301\n'
expect_distinct_ "$LOCALE_FR_UTF8" 'composed vs decomposed a-acute'
# Similarly with the following equivalent hangul characters.
gen_input '\uAC01\n\u1100\u1161\u11A8\n'
expect_distinct_ ko_KR.utf8 'precomposed vs decomposed hangul'

# When running in the wrong locale, strcoll() may indicate
# that strings match when they don't.
# I.e., CJK and hangul input must now work even if
# uniq is running in the wrong locale.
# hangul (ko_KR.utf8)
gen_input '\uAC00\n\uAC01\n'
expect_distinct_ en_US.utf8 'adjacent hangul syllables'
# CJK (zh_CN.utf8)
gen_input '\u3400\n\u3401\n'
expect_distinct_ en_US.utf8 'adjacent CJK ideographs'

# strcoll() ignores certain characters,
# but not if the strings are otherwise equal.
# I.e., on glibc-2.30 at least, the following is, as expected,
# not collapsed into a single item;
# it is tested here only for illustration.
gen_input ',a\n.a\n'
expect_distinct_ "$LOCALE_FR_UTF8" 'lines differing only in leading punctuation'

Exit $fail
64