1 /* cksum -- calculate and print POSIX checksums and sizes of files
2    Copyright (C) 1992-2023 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 #include <config.h>
18 
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include <stdint.h>
22 #include <x86intrin.h>
23 #include "system.h"
24 
25 /* Number of bytes to read at once.  */
26 #define BUFLEN (1 << 16)
27 
28 extern uint_fast32_t const crctab[8][256];
29 
30 extern bool
31 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
32 
33 /* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */
34 
35 bool
cksum_pclmul(FILE * fp,uint_fast32_t * crc_out,uintmax_t * length_out)36 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
37 {
38   __m128i buf[BUFLEN / sizeof (__m128i)];
39   uint_fast32_t crc = 0;
40   uintmax_t length = 0;
41   size_t bytes_read;
42   __m128i single_mult_constant;
43   __m128i four_mult_constant;
44   __m128i shuffle_constant;
45 
46   if (!fp || !crc_out || !length_out)
47     return false;
48 
49   /* These constants and general algorithms are taken from the Intel whitepaper
50      "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
51   */
52   single_mult_constant = _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605);
53   four_mult_constant = _mm_set_epi64x (0x8833794C, 0xE6228B11);
54 
55   /* Constant to byteswap a full SSE register */
56   shuffle_constant = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57                                    9, 10, 11, 12, 13, 14, 15);
58 
59   while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
60     {
61       __m128i *datap;
62       __m128i data;
63       __m128i data2;
64       __m128i data3;
65       __m128i data4;
66       __m128i data5;
67       __m128i data6;
68       __m128i data7;
69       __m128i data8;
70       __m128i fold_data;
71       __m128i xor_crc;
72 
73       if (length + bytes_read < length)
74         {
75           errno = EOVERFLOW;
76           return false;
77         }
78       length += bytes_read;
79 
80       datap = (__m128i *)buf;
81 
82       /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
83       if (bytes_read >= 16 * 8)
84         {
85           data = _mm_loadu_si128 (datap);
86           data = _mm_shuffle_epi8 (data, shuffle_constant);
87           /* XOR in initial CRC value (for us 0 so no effect), or CRC value
88              calculated for previous BUFLEN buffer from fread */
89           xor_crc = _mm_set_epi32 (crc, 0, 0, 0);
90           crc = 0;
91           data = _mm_xor_si128 (data, xor_crc);
92           data3 = _mm_loadu_si128 (datap + 1);
93           data3 = _mm_shuffle_epi8 (data3, shuffle_constant);
94           data5 = _mm_loadu_si128 (datap + 2);
95           data5 = _mm_shuffle_epi8 (data5, shuffle_constant);
96           data7 = _mm_loadu_si128 (datap + 3);
97           data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
98 
99 
100           while (bytes_read >= 16 * 8)
101             {
102               datap += 4;
103 
104               /* Do multiplication here for four consecutive 16 byte blocks */
105               data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
106               data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
107               data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
108               data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
109               data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
110               data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
111               data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
112               data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
113 
114               /* Now multiplication results for the four blocks is xor:ed with
115                  next four 16 byte blocks from the buffer. This effectively
116                  "consumes" the first four blocks from the buffer.
117                  Keep xor result in variables for multiplication in next
118                  round of loop. */
119               data = _mm_xor_si128 (data, data2);
120               data2 = _mm_loadu_si128 (datap);
121               data2 = _mm_shuffle_epi8 (data2, shuffle_constant);
122               data = _mm_xor_si128 (data, data2);
123 
124               data3 = _mm_xor_si128 (data3, data4);
125               data4 = _mm_loadu_si128 (datap + 1);
126               data4 = _mm_shuffle_epi8 (data4, shuffle_constant);
127               data3 = _mm_xor_si128 (data3, data4);
128 
129               data5 = _mm_xor_si128 (data5, data6);
130               data6 = _mm_loadu_si128 (datap + 2);
131               data6 = _mm_shuffle_epi8 (data6, shuffle_constant);
132               data5 = _mm_xor_si128 (data5, data6);
133 
134               data7 = _mm_xor_si128 (data7, data8);
135               data8 = _mm_loadu_si128 (datap + 3);
136               data8 = _mm_shuffle_epi8 (data8, shuffle_constant);
137               data7 = _mm_xor_si128 (data7, data8);
138 
139               bytes_read -= (16 * 4);
140             }
141           /* At end of loop we write out results from variables back into
142              the buffer, for use in single fold loop */
143           data = _mm_shuffle_epi8 (data, shuffle_constant);
144           _mm_storeu_si128 (datap, data);
145           data3 = _mm_shuffle_epi8 (data3, shuffle_constant);
146           _mm_storeu_si128 (datap + 1, data3);
147           data5 = _mm_shuffle_epi8 (data5, shuffle_constant);
148           _mm_storeu_si128 (datap + 2, data5);
149           data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
150           _mm_storeu_si128 (datap + 3, data7);
151         }
152 
153       /* Fold two 16-byte blocks into one 16-byte block */
154       if (bytes_read >= 32)
155         {
156           data = _mm_loadu_si128 (datap);
157           data = _mm_shuffle_epi8 (data, shuffle_constant);
158           xor_crc = _mm_set_epi32 (crc, 0, 0, 0);
159           crc = 0;
160           data = _mm_xor_si128 (data, xor_crc);
161           while (bytes_read >= 32)
162             {
163               datap++;
164 
165               data2 = _mm_clmulepi64_si128 (data, single_mult_constant, 0x00);
166               data = _mm_clmulepi64_si128 (data, single_mult_constant, 0x11);
167               fold_data = _mm_loadu_si128 (datap);
168               fold_data = _mm_shuffle_epi8 (fold_data, shuffle_constant);
169               data = _mm_xor_si128 (data, data2);
170               data = _mm_xor_si128 (data, fold_data);
171               bytes_read -= 16;
172             }
173           data = _mm_shuffle_epi8 (data, shuffle_constant);
174           _mm_storeu_si128 (datap, data);
175         }
176 
177       /* And finish up last 0-31 bytes in a byte by byte fashion */
178       unsigned char *cp = (unsigned char *)datap;
179       while (bytes_read--)
180         crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
181       if (feof (fp))
182         break;
183     }
184 
185   *crc_out = crc;
186   *length_out = length;
187 
188   return !ferror (fp);
189 }
190