1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2023 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
23
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
26
27 Code organization:
28
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
34
35 The factoring code for two words will fall into the code for one word when
36 progress allows that.
37
38 Algorithm:
39
40 (1) Perform trial division using a small primes table, but without hardware
41 division since the primes table store inverses modulo the word base.
42 (The GMP variant of this code doesn't make use of the precomputed
43 inverses, but instead relies on GMP for fast divisibility testing.)
44 (2) Check the nature of any non-factored part using Miller-Rabin for
45 detecting composites, and Lucas for detecting primes.
46 (3) Factor any remaining composite part using the Pollard-Brent rho
47 algorithm or if USE_SQUFOF is defined to 1, try that first.
48 Status of found factors are checked again using Miller-Rabin and Lucas.
49
50 We prefer using Hensel norm in the divisions, not the more familiar
51 Euclidean norm, since the former leads to much faster code. In the
52 Pollard-Brent rho code and the prime testing code, we use Montgomery's
53 trick of multiplying all n-residues by the word base, allowing cheap Hensel
54 reductions mod n.
55
56 The GMP code uses an algorithm that can be considerably slower;
57 for example, on a circa-2017 Intel Xeon Silver 4116, factoring
58 2^{127}-3 takes about 50 ms with the two-word algorithm but would
59 take about 750 ms with the GMP code.
60
61 Improvements:
62
63 * Use modular inverses also for exact division in the Lucas code, and
64 elsewhere. A problem is to locate the inverses not from an index, but
65 from a prime. We might instead compute the inverse on-the-fly.
66
67 * Tune trial division table size (not forgetting that this is a standalone
68 program where the table will be read from secondary storage for
69 each invocation).
70
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
72 perhaps k = 4.
73
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
79
80 * The redcify function could be vastly improved by using (plain Euclidean)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methods. These functions currently dominate run time when using
84 the -w option.
85 */
86
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
91 #endif
92
93 /* Faster for certain ranges but less general. */
94 #ifndef USE_SQUFOF
95 # define USE_SQUFOF 0
96 #endif
97
98 /* Output SQUFOF statistics. */
99 #ifndef STAT_SQUFOF
100 # define STAT_SQUFOF 0
101 #endif
102
103
104 #include <config.h>
105 #include <getopt.h>
106 #include <stdio.h>
107 #include <gmp.h>
108
109 #include "system.h"
110 #include "assure.h"
111 #include "full-write.h"
112 #include "quote.h"
113 #include "readtokens.h"
114 #include "xstrtol.h"
115
116 /* The official name of this program (e.g., no 'g' prefix). */
117 #define PROGRAM_NAME "factor"
118
119 #define AUTHORS \
120 proper_name ("Paul Rubin"), \
121 proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
122 proper_name_lite ("Niels Moller", "Niels M\303\266ller")
123
124 /* Token delimiters when reading from a file. */
125 #define DELIM "\n\t "
126
127 #ifndef USE_LONGLONG_H
128 /* With the way we use longlong.h, it's only safe to use
129 when UWtype = UHWtype, as there were various cases
130 (as can be seen in the history for longlong.h) where
131 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
132 to avoid compile time or run time issues. */
133 # if LONG_MAX == INTMAX_MAX
134 # define USE_LONGLONG_H 1
135 # endif
136 #endif
137
138 #if USE_LONGLONG_H
139
140 /* Make definitions for longlong.h to make it do what it can do for us */
141
142 /* bitcount for uintmax_t */
143 # if UINTMAX_MAX == UINT32_MAX
144 # define W_TYPE_SIZE 32
145 # elif UINTMAX_MAX == UINT64_MAX
146 # define W_TYPE_SIZE 64
147 # elif UINTMAX_MAX == UINT128_MAX
148 # define W_TYPE_SIZE 128
149 # endif
150
151 # define UWtype uintmax_t
152 # define UHWtype unsigned long int
153 # undef UDWtype
154 # if HAVE_ATTRIBUTE_MODE
155 typedef unsigned int UQItype __attribute__ ((mode (QI)));
156 typedef int SItype __attribute__ ((mode (SI)));
157 typedef unsigned int USItype __attribute__ ((mode (SI)));
158 typedef int DItype __attribute__ ((mode (DI)));
159 typedef unsigned int UDItype __attribute__ ((mode (DI)));
160 # else
161 typedef unsigned char UQItype;
162 typedef long SItype;
163 typedef unsigned long int USItype;
164 # if HAVE_LONG_LONG_INT
165 typedef long long int DItype;
166 typedef unsigned long long int UDItype;
167 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
168 typedef long int DItype;
169 typedef unsigned long int UDItype;
170 # endif
171 # endif
172 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
173 # define ASSERT(x) /* FIXME make longlong.h really standalone */
174 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
175 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
176 # ifndef __GMP_GNUC_PREREQ
177 # define __GMP_GNUC_PREREQ(a,b) 1
178 # endif
179
180 /* These stub macros are only used in longlong.h in certain system compiler
181 combinations, so ensure usage to avoid -Wunused-macros warnings. */
182 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
183 ASSERT (1)
184 __GMP_DECLSPEC
185 # endif
186
187 # if _ARCH_PPC
188 # define HAVE_HOST_CPU_FAMILY_powerpc 1
189 # endif
190 # include "longlong.h"
191 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
192 const unsigned char factor_clz_tab[129] =
193 {
194 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
195 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
196 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
197 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
198 9
199 };
200 # endif
201
202 #else /* not USE_LONGLONG_H */
203
204 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
205 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
206 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
207 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
208
209 #endif
210
211 #if !defined __clz_tab && !defined UHWtype
212 /* Without this seemingly useless conditional, gcc -Wunused-macros
213 warns that each of the two tested macros is unused on Fedora 18.
214 FIXME: this is just an ugly band-aid. Fix it properly. */
215 #endif
216
217 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
218 #define MAX_NFACTS 26
219
220 enum
221 {
222 DEV_DEBUG_OPTION = CHAR_MAX + 1
223 };
224
225 static struct option const long_options[] =
226 {
227 {"exponents", no_argument, nullptr, 'h'},
228 {"-debug", no_argument, nullptr, DEV_DEBUG_OPTION},
229 {GETOPT_HELP_OPTION_DECL},
230 {GETOPT_VERSION_OPTION_DECL},
231 {nullptr, 0, nullptr, 0}
232 };
233
234 /* If true, use p^e output format. */
235 static bool print_exponents;
236
237 struct factors
238 {
239 uintmax_t plarge[2]; /* Can have a single large factor */
240 uintmax_t p[MAX_NFACTS];
241 unsigned char e[MAX_NFACTS];
242 unsigned char nfactors;
243 };
244
245 struct mp_factors
246 {
247 mpz_t *p;
248 unsigned long int *e;
249 idx_t nfactors;
250 idx_t nalloc;
251 };
252
253 static void factor (uintmax_t, uintmax_t, struct factors *);
254
255 #ifndef umul_ppmm
256 # define umul_ppmm(w1, w0, u, v) \
257 do { \
258 uintmax_t __x0, __x1, __x2, __x3; \
259 unsigned long int __ul, __vl, __uh, __vh; \
260 uintmax_t __u = (u), __v = (v); \
261 \
262 __ul = __ll_lowpart (__u); \
263 __uh = __ll_highpart (__u); \
264 __vl = __ll_lowpart (__v); \
265 __vh = __ll_highpart (__v); \
266 \
267 __x0 = (uintmax_t) __ul * __vl; \
268 __x1 = (uintmax_t) __ul * __vh; \
269 __x2 = (uintmax_t) __uh * __vl; \
270 __x3 = (uintmax_t) __uh * __vh; \
271 \
272 __x1 += __ll_highpart (__x0);/* This can't give carry. */ \
273 __x1 += __x2; /* But this indeed can. */ \
274 if (__x1 < __x2) /* Did we get it? */ \
275 __x3 += __ll_B; /* Yes, add it in the proper pos. */ \
276 \
277 (w1) = __x3 + __ll_highpart (__x1); \
278 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
279 } while (0)
280 #endif
281
282 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
283 /* Define our own, not needing normalization. This function is
284 currently not performance critical, so keep it simple. Similar to
285 the mod macro below. */
286 # undef udiv_qrnnd
287 # define udiv_qrnnd(q, r, n1, n0, d) \
288 do { \
289 uintmax_t __d1, __d0, __q, __r1, __r0; \
290 \
291 __d1 = (d); __d0 = 0; \
292 __r1 = (n1); __r0 = (n0); \
293 affirm (__r1 < __d1); \
294 __q = 0; \
295 for (int __i = W_TYPE_SIZE; __i > 0; __i--) \
296 { \
297 rsh2 (__d1, __d0, __d1, __d0, 1); \
298 __q <<= 1; \
299 if (ge2 (__r1, __r0, __d1, __d0)) \
300 { \
301 __q++; \
302 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
303 } \
304 } \
305 (r) = __r0; \
306 (q) = __q; \
307 } while (0)
308 #endif
309
310 #if !defined add_ssaaaa
311 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
312 do { \
313 uintmax_t _add_x; \
314 _add_x = (al) + (bl); \
315 (sh) = (ah) + (bh) + (_add_x < (al)); \
316 (sl) = _add_x; \
317 } while (0)
318 #endif
319
320 #define rsh2(rh, rl, ah, al, cnt) \
321 do { \
322 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
323 (rh) = (ah) >> (cnt); \
324 } while (0)
325
326 #define lsh2(rh, rl, ah, al, cnt) \
327 do { \
328 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
329 (rl) = (al) << (cnt); \
330 } while (0)
331
332 #define ge2(ah, al, bh, bl) \
333 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
334
335 #define gt2(ah, al, bh, bl) \
336 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
337
338 #ifndef sub_ddmmss
339 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
340 do { \
341 uintmax_t _cy; \
342 _cy = (al) < (bl); \
343 (rl) = (al) - (bl); \
344 (rh) = (ah) - (bh) - _cy; \
345 } while (0)
346 #endif
347
348 #ifndef count_leading_zeros
349 # define count_leading_zeros(count, x) do { \
350 uintmax_t __clz_x = (x); \
351 int __clz_c; \
352 for (__clz_c = 0; \
353 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
354 __clz_c += 8) \
355 __clz_x <<= 8; \
356 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
357 __clz_x <<= 1; \
358 (count) = __clz_c; \
359 } while (0)
360 #endif
361
362 #ifndef count_trailing_zeros
363 # define count_trailing_zeros(count, x) do { \
364 uintmax_t __ctz_x = (x); \
365 int __ctz_c = 0; \
366 while ((__ctz_x & 1) == 0) \
367 { \
368 __ctz_x >>= 1; \
369 __ctz_c++; \
370 } \
371 (count) = __ctz_c; \
372 } while (0)
373 #endif
374
375 /* Requires that a < n and b <= n */
376 #define submod(r,a,b,n) \
377 do { \
378 uintmax_t _t = - (uintmax_t) (a < b); \
379 (r) = ((n) & _t) + (a) - (b); \
380 } while (0)
381
382 #define addmod(r,a,b,n) \
383 submod ((r), (a), ((n) - (b)), (n))
384
385 /* Modular two-word addition and subtraction. For performance reasons, the
386 most significant bit of n1 must be clear. The destination variables must be
387 distinct from the mod operand. */
388 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
389 do { \
390 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
391 if (ge2 ((r1), (r0), (n1), (n0))) \
392 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
393 } while (0)
394 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
395 do { \
396 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
397 if ((intmax_t) (r1) < 0) \
398 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
399 } while (0)
400
401 #define HIGHBIT_TO_MASK(x) \
402 (((intmax_t)-1 >> 1) < 0 \
403 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
404 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
405 ? UINTMAX_MAX : (uintmax_t) 0))
406
407 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
408 Requires that d1 != 0. */
409 static uintmax_t
mod2(uintmax_t * r1,uintmax_t a1,uintmax_t a0,uintmax_t d1,uintmax_t d0)410 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
411 {
412 int cntd, cnta;
413
414 affirm (d1 != 0);
415
416 if (a1 == 0)
417 {
418 *r1 = 0;
419 return a0;
420 }
421
422 count_leading_zeros (cntd, d1);
423 count_leading_zeros (cnta, a1);
424 int cnt = cntd - cnta;
425 lsh2 (d1, d0, d1, d0, cnt);
426 for (int i = 0; i < cnt; i++)
427 {
428 if (ge2 (a1, a0, d1, d0))
429 sub_ddmmss (a1, a0, a1, a0, d1, d0);
430 rsh2 (d1, d0, d1, d0, 1);
431 }
432
433 *r1 = a1;
434 return a0;
435 }
436
437 ATTRIBUTE_CONST
438 static uintmax_t
gcd_odd(uintmax_t a,uintmax_t b)439 gcd_odd (uintmax_t a, uintmax_t b)
440 {
441 if ((b & 1) == 0)
442 {
443 uintmax_t t = b;
444 b = a;
445 a = t;
446 }
447 if (a == 0)
448 return b;
449
450 /* Take out least significant one bit, to make room for sign */
451 b >>= 1;
452
453 for (;;)
454 {
455 uintmax_t t;
456 uintmax_t bgta;
457
458 while ((a & 1) == 0)
459 a >>= 1;
460 a >>= 1;
461
462 t = a - b;
463 if (t == 0)
464 return (a << 1) + 1;
465
466 bgta = HIGHBIT_TO_MASK (t);
467
468 /* b <-- min (a, b) */
469 b += (bgta & t);
470
471 /* a <-- |a - b| */
472 a = (t ^ bgta) - bgta;
473 }
474 }
475
476 static uintmax_t
gcd2_odd(uintmax_t * r1,uintmax_t a1,uintmax_t a0,uintmax_t b1,uintmax_t b0)477 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
478 {
479 affirm (b0 & 1);
480
481 if ((a0 | a1) == 0)
482 {
483 *r1 = b1;
484 return b0;
485 }
486
487 while ((a0 & 1) == 0)
488 rsh2 (a1, a0, a1, a0, 1);
489
490 for (;;)
491 {
492 if ((b1 | a1) == 0)
493 {
494 *r1 = 0;
495 return gcd_odd (b0, a0);
496 }
497
498 if (gt2 (a1, a0, b1, b0))
499 {
500 sub_ddmmss (a1, a0, a1, a0, b1, b0);
501 do
502 rsh2 (a1, a0, a1, a0, 1);
503 while ((a0 & 1) == 0);
504 }
505 else if (gt2 (b1, b0, a1, a0))
506 {
507 sub_ddmmss (b1, b0, b1, b0, a1, a0);
508 do
509 rsh2 (b1, b0, b1, b0, 1);
510 while ((b0 & 1) == 0);
511 }
512 else
513 break;
514 }
515
516 *r1 = a1;
517 return a0;
518 }
519
520 static void
factor_insert_multiplicity(struct factors * factors,uintmax_t prime,int m)521 factor_insert_multiplicity (struct factors *factors,
522 uintmax_t prime, int m)
523 {
524 int nfactors = factors->nfactors;
525 uintmax_t *p = factors->p;
526 unsigned char *e = factors->e;
527
528 /* Locate position for insert new or increment e. */
529 int i;
530 for (i = nfactors - 1; i >= 0; i--)
531 {
532 if (p[i] <= prime)
533 break;
534 }
535
536 if (i < 0 || p[i] != prime)
537 {
538 for (int j = nfactors - 1; j > i; j--)
539 {
540 p[j + 1] = p[j];
541 e[j + 1] = e[j];
542 }
543 p[i + 1] = prime;
544 e[i + 1] = m;
545 factors->nfactors = nfactors + 1;
546 }
547 else
548 {
549 e[i] += m;
550 }
551 }
552
553 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
554
555 static void
factor_insert_large(struct factors * factors,uintmax_t p1,uintmax_t p0)556 factor_insert_large (struct factors *factors,
557 uintmax_t p1, uintmax_t p0)
558 {
559 if (p1 > 0)
560 {
561 affirm (factors->plarge[1] == 0);
562 factors->plarge[0] = p0;
563 factors->plarge[1] = p1;
564 }
565 else
566 factor_insert (factors, p0);
567 }
568
569 #ifndef mpz_inits
570
571 # include <stdarg.h>
572
573 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
574 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
575
576 static void
mpz_va_init(void (* mpz_single_init)(mpz_t),...)577 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
578 {
579 va_list ap;
580
581 va_start (ap, mpz_single_init);
582
583 mpz_t *mpz;
584 while ((mpz = va_arg (ap, mpz_t *)))
585 mpz_single_init (*mpz);
586
587 va_end (ap);
588 }
589 #endif
590
591 static void mp_factor (mpz_t, struct mp_factors *);
592
593 static void
mp_factor_init(struct mp_factors * factors)594 mp_factor_init (struct mp_factors *factors)
595 {
596 factors->p = nullptr;
597 factors->e = nullptr;
598 factors->nfactors = 0;
599 factors->nalloc = 0;
600 }
601
602 static void
mp_factor_clear(struct mp_factors * factors)603 mp_factor_clear (struct mp_factors *factors)
604 {
605 for (idx_t i = 0; i < factors->nfactors; i++)
606 mpz_clear (factors->p[i]);
607
608 free (factors->p);
609 free (factors->e);
610 }
611
612 static void
mp_factor_insert(struct mp_factors * factors,mpz_t prime)613 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
614 {
615 idx_t nfactors = factors->nfactors;
616 mpz_t *p = factors->p;
617 unsigned long int *e = factors->e;
618 ptrdiff_t i;
619
620 /* Locate position for insert new or increment e. */
621 for (i = nfactors - 1; i >= 0; i--)
622 {
623 if (mpz_cmp (p[i], prime) <= 0)
624 break;
625 }
626
627 if (i < 0 || mpz_cmp (p[i], prime) != 0)
628 {
629 if (factors->nfactors == factors->nalloc)
630 {
631 p = xpalloc (p, &factors->nalloc, 1, -1, sizeof *p);
632 e = xireallocarray (e, factors->nalloc, sizeof *e);
633 }
634
635 mpz_init (p[nfactors]);
636 for (ptrdiff_t j = nfactors - 1; j > i; j--)
637 {
638 mpz_set (p[j + 1], p[j]);
639 e[j + 1] = e[j];
640 }
641 mpz_set (p[i + 1], prime);
642 e[i + 1] = 1;
643
644 factors->p = p;
645 factors->e = e;
646 factors->nfactors = nfactors + 1;
647 }
648 else
649 {
650 e[i] += 1;
651 }
652 }
653
654 static void
mp_factor_insert_ui(struct mp_factors * factors,unsigned long int prime)655 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
656 {
657 mpz_t pz;
658
659 mpz_init_set_ui (pz, prime);
660 mp_factor_insert (factors, pz);
661 mpz_clear (pz);
662 }
663
664
665 /* Number of bits in an uintmax_t. */
666 enum { W = sizeof (uintmax_t) * CHAR_BIT };
667
668 /* Verify that uintmax_t does not have holes in its representation. */
669 static_assert (UINTMAX_MAX >> (W - 1) == 1);
670
671 #define P(a,b,c,d) a,
672 static const unsigned char primes_diff[] = {
673 #include "primes.h"
674 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
675 };
676 #undef P
677
678 #define PRIMES_PTAB_ENTRIES \
679 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
680
681 #define P(a,b,c,d) b,
682 static const unsigned char primes_diff8[] = {
683 #include "primes.h"
684 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
685 };
686 #undef P
687
688 struct primes_dtab
689 {
690 uintmax_t binv, lim;
691 };
692
693 #define P(a,b,c,d) {c,d},
694 static const struct primes_dtab primes_dtab[] = {
695 #include "primes.h"
696 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
697 };
698 #undef P
699
700 /* Verify that uintmax_t is not wider than
701 the integers used to generate primes.h. */
702 static_assert (W <= WIDE_UINT_BITS);
703
704 /* debugging for developers. Enables devmsg().
705 This flag is used only in the GMP code. */
706 static bool dev_debug = false;
707
708 /* Prove primality or run probabilistic tests. */
709 static bool flag_prove_primality = PROVE_PRIMALITY;
710
711 /* Number of Miller-Rabin tests to run when not proving primality. */
712 #define MR_REPS 25
713
714 static void
factor_insert_refind(struct factors * factors,uintmax_t p,int i,int off)715 factor_insert_refind (struct factors *factors, uintmax_t p, int i, int off)
716 {
717 for (int j = 0; j < off; j++)
718 p += primes_diff[i + j];
719 factor_insert (factors, p);
720 }
721
722 /* Trial division with odd primes uses the following trick.
723
724 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
725 consider the case t < B (this is the second loop below).
726
727 From our tables we get
728
729 binv = p^{-1} (mod B)
730 lim = floor ((B-1) / p).
731
732 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
733 (and all quotients in this range occur for some t).
734
735 Then t = q * p is true also (mod B), and p is invertible we get
736
737 q = t * binv (mod B).
738
739 Next, assume that t is *not* divisible by p. Since multiplication
740 by binv (mod B) is a one-to-one mapping,
741
742 t * binv (mod B) > lim,
743
744 because all the smaller values are already taken.
745
746 This can be summed up by saying that the function
747
748 q(t) = binv * t (mod B)
749
750 is a permutation of the range 0 <= t < B, with the curious property
751 that it maps the multiples of p onto the range 0 <= q <= lim, in
752 order, and the non-multiples of p onto the range lim < q < B.
753 */
754
755 static uintmax_t
factor_using_division(uintmax_t * t1p,uintmax_t t1,uintmax_t t0,struct factors * factors)756 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
757 struct factors *factors)
758 {
759 if (t0 % 2 == 0)
760 {
761 int cnt;
762
763 if (t0 == 0)
764 {
765 count_trailing_zeros (cnt, t1);
766 t0 = t1 >> cnt;
767 t1 = 0;
768 cnt += W_TYPE_SIZE;
769 }
770 else
771 {
772 count_trailing_zeros (cnt, t0);
773 rsh2 (t1, t0, t1, t0, cnt);
774 }
775
776 factor_insert_multiplicity (factors, 2, cnt);
777 }
778
779 uintmax_t p = 3;
780 idx_t i;
781 for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
782 {
783 for (;;)
784 {
785 uintmax_t q1, q0, hi;
786 MAYBE_UNUSED uintmax_t lo;
787
788 q0 = t0 * primes_dtab[i].binv;
789 umul_ppmm (hi, lo, q0, p);
790 if (hi > t1)
791 break;
792 hi = t1 - hi;
793 q1 = hi * primes_dtab[i].binv;
794 if (LIKELY (q1 > primes_dtab[i].lim))
795 break;
796 t1 = q1; t0 = q0;
797 factor_insert (factors, p);
798 }
799 p += primes_diff[i + 1];
800 }
801 if (t1p)
802 *t1p = t1;
803
804 #define DIVBLOCK(I) \
805 do { \
806 for (;;) \
807 { \
808 q = t0 * pd[I].binv; \
809 if (LIKELY (q > pd[I].lim)) \
810 break; \
811 t0 = q; \
812 factor_insert_refind (factors, p, i + 1, I); \
813 } \
814 } while (0)
815
816 for (; i < PRIMES_PTAB_ENTRIES; i += 8)
817 {
818 uintmax_t q;
819 const struct primes_dtab *pd = &primes_dtab[i];
820 DIVBLOCK (0);
821 DIVBLOCK (1);
822 DIVBLOCK (2);
823 DIVBLOCK (3);
824 DIVBLOCK (4);
825 DIVBLOCK (5);
826 DIVBLOCK (6);
827 DIVBLOCK (7);
828
829 p += primes_diff8[i];
830 if (p * p > t0)
831 break;
832 }
833
834 return t0;
835 }
836
837 static void
mp_factor_using_division(mpz_t t,struct mp_factors * factors)838 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
839 {
840 mpz_t q;
841 mp_bitcnt_t p;
842
843 devmsg ("[trial division] ");
844
845 mpz_init (q);
846
847 p = mpz_scan1 (t, 0);
848 mpz_fdiv_q_2exp (t, t, p);
849 while (p)
850 {
851 mp_factor_insert_ui (factors, 2);
852 --p;
853 }
854
855 unsigned long int d = 3;
856 for (idx_t i = 1; i <= PRIMES_PTAB_ENTRIES;)
857 {
858 if (! mpz_divisible_ui_p (t, d))
859 {
860 d += primes_diff[i++];
861 if (mpz_cmp_ui (t, d * d) < 0)
862 break;
863 }
864 else
865 {
866 mpz_tdiv_q_ui (t, t, d);
867 mp_factor_insert_ui (factors, d);
868 }
869 }
870
871 mpz_clear (q);
872 }
873
874 /* Entry i contains (2i+1)^(-1) mod 2^8. */
875 static const unsigned char binvert_table[128] =
876 {
877 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
878 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
879 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
880 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
881 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
882 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
883 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
884 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
885 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
886 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
887 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
888 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
889 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
890 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
891 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
892 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
893 };
894
895 /* Compute n^(-1) mod B, using a Newton iteration. */
896 #define binv(inv,n) \
897 do { \
898 uintmax_t __n = (n); \
899 uintmax_t __inv; \
900 \
901 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
902 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
903 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
904 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
905 \
906 if (W_TYPE_SIZE > 64) \
907 { \
908 int __invbits = 64; \
909 do { \
910 __inv = 2 * __inv - __inv * __inv * __n; \
911 __invbits *= 2; \
912 } while (__invbits < W_TYPE_SIZE); \
913 } \
914 \
915 (inv) = __inv; \
916 } while (0)
917
918 /* q = u / d, assuming d|u. */
919 #define divexact_21(q1, q0, u1, u0, d) \
920 do { \
921 uintmax_t _di, _q0; \
922 binv (_di, (d)); \
923 _q0 = (u0) * _di; \
924 if ((u1) >= (d)) \
925 { \
926 uintmax_t _p1; \
927 MAYBE_UNUSED intmax_t _p0; \
928 umul_ppmm (_p1, _p0, _q0, d); \
929 (q1) = ((u1) - _p1) * _di; \
930 (q0) = _q0; \
931 } \
932 else \
933 { \
934 (q0) = _q0; \
935 (q1) = 0; \
936 } \
937 } while (0)
938
939 /* x B (mod n). */
940 #define redcify(r_prim, r, n) \
941 do { \
942 MAYBE_UNUSED uintmax_t _redcify_q; \
943 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
944 } while (0)
945
946 /* x B^2 (mod n). Requires x > 0, n1 < B/2. */
947 #define redcify2(r1, r0, x, n1, n0) \
948 do { \
949 uintmax_t _r1, _r0, _i; \
950 if ((x) < (n1)) \
951 { \
952 _r1 = (x); _r0 = 0; \
953 _i = W_TYPE_SIZE; \
954 } \
955 else \
956 { \
957 _r1 = 0; _r0 = (x); \
958 _i = 2 * W_TYPE_SIZE; \
959 } \
960 while (_i-- > 0) \
961 { \
962 lsh2 (_r1, _r0, _r1, _r0, 1); \
963 if (ge2 (_r1, _r0, (n1), (n0))) \
964 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
965 } \
966 (r1) = _r1; \
967 (r0) = _r0; \
968 } while (0)
969
970 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
971 Both a and b must be in redc form, the result will be in redc form too. */
972 static inline uintmax_t
mulredc(uintmax_t a,uintmax_t b,uintmax_t m,uintmax_t mi)973 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
974 {
975 uintmax_t rh, rl, q, th, xh;
976 MAYBE_UNUSED uintmax_t tl;
977
978 umul_ppmm (rh, rl, a, b);
979 q = rl * mi;
980 umul_ppmm (th, tl, q, m);
981 xh = rh - th;
982 if (rh < th)
983 xh += m;
984
985 return xh;
986 }
987
988 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
989 Both a and b must be in redc form, the result will be in redc form too.
990 For performance reasons, the most significant bit of m must be clear. */
991 static uintmax_t
mulredc2(uintmax_t * r1p,uintmax_t a1,uintmax_t a0,uintmax_t b1,uintmax_t b0,uintmax_t m1,uintmax_t m0,uintmax_t mi)992 mulredc2 (uintmax_t *r1p,
993 uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
994 uintmax_t m1, uintmax_t m0, uintmax_t mi)
995 {
996 uintmax_t r1, r0, q, p1, t1, t0, s1, s0;
997 MAYBE_UNUSED uintmax_t p0;
998 mi = -mi;
999 affirm ((a1 >> (W_TYPE_SIZE - 1)) == 0);
1000 affirm ((b1 >> (W_TYPE_SIZE - 1)) == 0);
1001 affirm ((m1 >> (W_TYPE_SIZE - 1)) == 0);
1002
1003 /* First compute a0 * <b1, b0> B^{-1}
1004 +-----+
1005 |a0 b0|
1006 +--+--+--+
1007 |a0 b1|
1008 +--+--+--+
1009 |q0 m0|
1010 +--+--+--+
1011 |q0 m1|
1012 -+--+--+--+
1013 |r1|r0| 0|
1014 +--+--+--+
1015 */
1016 umul_ppmm (t1, t0, a0, b0);
1017 umul_ppmm (r1, r0, a0, b1);
1018 q = mi * t0;
1019 umul_ppmm (p1, p0, q, m0);
1020 umul_ppmm (s1, s0, q, m1);
1021 r0 += (t0 != 0); /* Carry */
1022 add_ssaaaa (r1, r0, r1, r0, 0, p1);
1023 add_ssaaaa (r1, r0, r1, r0, 0, t1);
1024 add_ssaaaa (r1, r0, r1, r0, s1, s0);
1025
1026 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1027 +-----+
1028 |a1 b0|
1029 +--+--+
1030 |r1|r0|
1031 +--+--+--+
1032 |a1 b1|
1033 +--+--+--+
1034 |q1 m0|
1035 +--+--+--+
1036 |q1 m1|
1037 -+--+--+--+
1038 |r1|r0| 0|
1039 +--+--+--+
1040 */
1041 umul_ppmm (t1, t0, a1, b0);
1042 umul_ppmm (s1, s0, a1, b1);
1043 add_ssaaaa (t1, t0, t1, t0, 0, r0);
1044 q = mi * t0;
1045 add_ssaaaa (r1, r0, s1, s0, 0, r1);
1046 umul_ppmm (p1, p0, q, m0);
1047 umul_ppmm (s1, s0, q, m1);
1048 r0 += (t0 != 0); /* Carry */
1049 add_ssaaaa (r1, r0, r1, r0, 0, p1);
1050 add_ssaaaa (r1, r0, r1, r0, 0, t1);
1051 add_ssaaaa (r1, r0, r1, r0, s1, s0);
1052
1053 if (ge2 (r1, r0, m1, m0))
1054 sub_ddmmss (r1, r0, r1, r0, m1, m0);
1055
1056 *r1p = r1;
1057 return r0;
1058 }
1059
1060 ATTRIBUTE_CONST
1061 static uintmax_t
powm(uintmax_t b,uintmax_t e,uintmax_t n,uintmax_t ni,uintmax_t one)1062 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1063 {
1064 uintmax_t y = one;
1065
1066 if (e & 1)
1067 y = b;
1068
1069 while (e != 0)
1070 {
1071 b = mulredc (b, b, n, ni);
1072 e >>= 1;
1073
1074 if (e & 1)
1075 y = mulredc (y, b, n, ni);
1076 }
1077
1078 return y;
1079 }
1080
1081 static uintmax_t
powm2(uintmax_t * r1m,const uintmax_t * bp,const uintmax_t * ep,const uintmax_t * np,uintmax_t ni,const uintmax_t * one)1082 powm2 (uintmax_t *r1m,
1083 const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1084 uintmax_t ni, const uintmax_t *one)
1085 {
1086 uintmax_t r1, r0, b1, b0, n1, n0;
1087 int i;
1088 uintmax_t e;
1089
1090 b0 = bp[0];
1091 b1 = bp[1];
1092 n0 = np[0];
1093 n1 = np[1];
1094
1095 r0 = one[0];
1096 r1 = one[1];
1097
1098 for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1099 {
1100 if (e & 1)
1101 {
1102 r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1103 r1 = *r1m;
1104 }
1105 b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1106 b1 = *r1m;
1107 }
1108 for (e = ep[1]; e > 0; e >>= 1)
1109 {
1110 if (e & 1)
1111 {
1112 r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1113 r1 = *r1m;
1114 }
1115 b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1116 b1 = *r1m;
1117 }
1118 *r1m = r1;
1119 return r0;
1120 }
1121
1122 ATTRIBUTE_CONST
1123 static bool
millerrabin(uintmax_t n,uintmax_t ni,uintmax_t b,uintmax_t q,int k,uintmax_t one)1124 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1125 int k, uintmax_t one)
1126 {
1127 uintmax_t y = powm (b, q, n, ni, one);
1128
1129 uintmax_t nm1 = n - one; /* -1, but in redc representation. */
1130
1131 if (y == one || y == nm1)
1132 return true;
1133
1134 for (int i = 1; i < k; i++)
1135 {
1136 y = mulredc (y, y, n, ni);
1137
1138 if (y == nm1)
1139 return true;
1140 if (y == one)
1141 return false;
1142 }
1143 return false;
1144 }
1145
1146 ATTRIBUTE_PURE static bool
millerrabin2(const uintmax_t * np,uintmax_t ni,const uintmax_t * bp,const uintmax_t * qp,int k,const uintmax_t * one)1147 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1148 const uintmax_t *qp, int k, const uintmax_t *one)
1149 {
1150 uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1151
1152 y0 = powm2 (&r1m, bp, qp, np, ni, one);
1153 y1 = r1m;
1154
1155 if (y0 == one[0] && y1 == one[1])
1156 return true;
1157
1158 sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1159
1160 if (y0 == nm1_0 && y1 == nm1_1)
1161 return true;
1162
1163 for (int i = 1; i < k; i++)
1164 {
1165 y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1166 y1 = r1m;
1167
1168 if (y0 == nm1_0 && y1 == nm1_1)
1169 return true;
1170 if (y0 == one[0] && y1 == one[1])
1171 return false;
1172 }
1173 return false;
1174 }
1175
1176 static bool
mp_millerrabin(mpz_srcptr n,mpz_srcptr nm1,mpz_ptr x,mpz_ptr y,mpz_srcptr q,mp_bitcnt_t k)1177 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1178 mpz_srcptr q, mp_bitcnt_t k)
1179 {
1180 mpz_powm (y, x, q, n);
1181
1182 if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1183 return true;
1184
1185 for (mp_bitcnt_t i = 1; i < k; i++)
1186 {
1187 mpz_powm_ui (y, y, 2, n);
1188 if (mpz_cmp (y, nm1) == 0)
1189 return true;
1190 if (mpz_cmp_ui (y, 1) == 0)
1191 return false;
1192 }
1193 return false;
1194 }
1195
1196 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1197 have been observed. The average seem to be about 2. */
1198 static bool ATTRIBUTE_PURE
prime_p(uintmax_t n)1199 prime_p (uintmax_t n)
1200 {
1201 mp_bitcnt_t k;
1202 bool is_prime;
1203 uintmax_t a_prim, one, ni;
1204 struct factors factors;
1205
1206 if (n <= 1)
1207 return false;
1208
1209 /* We have already cast out small primes. */
1210 if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1211 return true;
1212
1213 /* Precomputation for Miller-Rabin. */
1214 uintmax_t q = n - 1;
1215 for (k = 0; (q & 1) == 0; k++)
1216 q >>= 1;
1217
1218 uintmax_t a = 2;
1219 binv (ni, n); /* ni <- 1/n mod B */
1220 redcify (one, 1, n);
1221 addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1222
1223 /* Perform a Miller-Rabin test, finds most composites quickly. */
1224 if (!millerrabin (n, ni, a_prim, q, k, one))
1225 return false;
1226
1227 if (flag_prove_primality)
1228 {
1229 /* Factor n-1 for Lucas. */
1230 factor (0, n - 1, &factors);
1231 }
1232
1233 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1234 number composite. */
1235 for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1236 {
1237 if (flag_prove_primality)
1238 {
1239 is_prime = true;
1240 for (int i = 0; i < factors.nfactors && is_prime; i++)
1241 {
1242 is_prime
1243 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1244 }
1245 }
1246 else
1247 {
1248 /* After enough Miller-Rabin runs, be content. */
1249 is_prime = (r == MR_REPS - 1);
1250 }
1251
1252 if (is_prime)
1253 return true;
1254
1255 a += primes_diff[r]; /* Establish new base. */
1256
1257 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1258 on most processors, since it avoids udiv_qrnnd. If we go down the
1259 udiv_qrnnd_preinv path, this code should be replaced. */
1260 {
1261 uintmax_t s1, s0;
1262 umul_ppmm (s1, s0, one, a);
1263 if (LIKELY (s1 == 0))
1264 a_prim = s0 % n;
1265 else
1266 {
1267 MAYBE_UNUSED uintmax_t dummy;
1268 udiv_qrnnd (dummy, a_prim, s1, s0, n);
1269 }
1270 }
1271
1272 if (!millerrabin (n, ni, a_prim, q, k, one))
1273 return false;
1274 }
1275
1276 affirm (!"Lucas prime test failure. This should not happen");
1277 }
1278
1279 static bool ATTRIBUTE_PURE
prime2_p(uintmax_t n1,uintmax_t n0)1280 prime2_p (uintmax_t n1, uintmax_t n0)
1281 {
1282 uintmax_t q[2], nm1[2];
1283 uintmax_t a_prim[2];
1284 uintmax_t one[2];
1285 uintmax_t na[2];
1286 uintmax_t ni;
1287 int k;
1288 struct factors factors;
1289
1290 if (n1 == 0)
1291 return prime_p (n0);
1292
1293 nm1[1] = n1 - (n0 == 0);
1294 nm1[0] = n0 - 1;
1295 if (nm1[0] == 0)
1296 {
1297 count_trailing_zeros (k, nm1[1]);
1298
1299 q[0] = nm1[1] >> k;
1300 q[1] = 0;
1301 k += W_TYPE_SIZE;
1302 }
1303 else
1304 {
1305 count_trailing_zeros (k, nm1[0]);
1306 rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1307 }
1308
1309 uintmax_t a = 2;
1310 binv (ni, n0);
1311 redcify2 (one[1], one[0], 1, n1, n0);
1312 addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1313
1314 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1315 na[0] = n0;
1316 na[1] = n1;
1317
1318 if (!millerrabin2 (na, ni, a_prim, q, k, one))
1319 return false;
1320
1321 if (flag_prove_primality)
1322 {
1323 /* Factor n-1 for Lucas. */
1324 factor (nm1[1], nm1[0], &factors);
1325 }
1326
1327 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1328 number composite. */
1329 for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1330 {
1331 bool is_prime;
1332 uintmax_t e[2], y[2];
1333
1334 if (flag_prove_primality)
1335 {
1336 is_prime = true;
1337 if (factors.plarge[1])
1338 {
1339 uintmax_t pi;
1340 binv (pi, factors.plarge[0]);
1341 e[0] = pi * nm1[0];
1342 e[1] = 0;
1343 y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1344 is_prime = (y[0] != one[0] || y[1] != one[1]);
1345 }
1346 for (int i = 0; i < factors.nfactors && is_prime; i++)
1347 {
1348 /* FIXME: We always have the factor 2. Do we really need to
1349 handle it here? We have done the same powering as part
1350 of millerrabin. */
1351 if (factors.p[i] == 2)
1352 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1353 else
1354 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1355 y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1356 is_prime = (y[0] != one[0] || y[1] != one[1]);
1357 }
1358 }
1359 else
1360 {
1361 /* After enough Miller-Rabin runs, be content. */
1362 is_prime = (r == MR_REPS - 1);
1363 }
1364
1365 if (is_prime)
1366 return true;
1367
1368 a += primes_diff[r]; /* Establish new base. */
1369 redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1370
1371 if (!millerrabin2 (na, ni, a_prim, q, k, one))
1372 return false;
1373 }
1374
1375 affirm (!"Lucas prime test failure. This should not happen");
1376 }
1377
1378 static bool
mp_prime_p(mpz_t n)1379 mp_prime_p (mpz_t n)
1380 {
1381 bool is_prime;
1382 mpz_t q, a, nm1, tmp;
1383 struct mp_factors factors;
1384
1385 if (mpz_cmp_ui (n, 1) <= 0)
1386 return false;
1387
1388 /* We have already cast out small primes. */
1389 if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1390 return true;
1391
1392 mpz_inits (q, a, nm1, tmp, nullptr);
1393
1394 /* Precomputation for Miller-Rabin. */
1395 mpz_sub_ui (nm1, n, 1);
1396
1397 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1398 mp_bitcnt_t k = mpz_scan1 (nm1, 0);
1399 mpz_tdiv_q_2exp (q, nm1, k);
1400
1401 mpz_set_ui (a, 2);
1402
1403 /* Perform a Miller-Rabin test, finds most composites quickly. */
1404 if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1405 {
1406 is_prime = false;
1407 goto ret2;
1408 }
1409
1410 if (flag_prove_primality)
1411 {
1412 /* Factor n-1 for Lucas. */
1413 mpz_set (tmp, nm1);
1414 mp_factor (tmp, &factors);
1415 }
1416
1417 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1418 number composite. */
1419 for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1420 {
1421 if (flag_prove_primality)
1422 {
1423 is_prime = true;
1424 for (idx_t i = 0; i < factors.nfactors && is_prime; i++)
1425 {
1426 mpz_divexact (tmp, nm1, factors.p[i]);
1427 mpz_powm (tmp, a, tmp, n);
1428 is_prime = mpz_cmp_ui (tmp, 1) != 0;
1429 }
1430 }
1431 else
1432 {
1433 /* After enough Miller-Rabin runs, be content. */
1434 is_prime = (r == MR_REPS - 1);
1435 }
1436
1437 if (is_prime)
1438 goto ret1;
1439
1440 mpz_add_ui (a, a, primes_diff[r]); /* Establish new base. */
1441
1442 if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1443 {
1444 is_prime = false;
1445 goto ret1;
1446 }
1447 }
1448
1449 affirm (!"Lucas prime test failure. This should not happen");
1450
1451 ret1:
1452 if (flag_prove_primality)
1453 mp_factor_clear (&factors);
1454 ret2:
1455 mpz_clears (q, a, nm1, tmp, nullptr);
1456
1457 return is_prime;
1458 }
1459
1460 static void
factor_using_pollard_rho(uintmax_t n,unsigned long int a,struct factors * factors)1461 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1462 struct factors *factors)
1463 {
1464 uintmax_t x, z, y, P, t, ni, g;
1465
1466 unsigned long int k = 1;
1467 unsigned long int l = 1;
1468
1469 redcify (P, 1, n);
1470 addmod (x, P, P, n); /* i.e., redcify(2) */
1471 y = z = x;
1472
1473 while (n != 1)
1474 {
1475 affirm (a < n);
1476
1477 binv (ni, n); /* FIXME: when could we use old 'ni' value? */
1478
1479 for (;;)
1480 {
1481 do
1482 {
1483 x = mulredc (x, x, n, ni);
1484 addmod (x, x, a, n);
1485
1486 submod (t, z, x, n);
1487 P = mulredc (P, t, n, ni);
1488
1489 if (k % 32 == 1)
1490 {
1491 if (gcd_odd (P, n) != 1)
1492 goto factor_found;
1493 y = x;
1494 }
1495 }
1496 while (--k != 0);
1497
1498 z = x;
1499 k = l;
1500 l = 2 * l;
1501 for (unsigned long int i = 0; i < k; i++)
1502 {
1503 x = mulredc (x, x, n, ni);
1504 addmod (x, x, a, n);
1505 }
1506 y = x;
1507 }
1508
1509 factor_found:
1510 do
1511 {
1512 y = mulredc (y, y, n, ni);
1513 addmod (y, y, a, n);
1514
1515 submod (t, z, y, n);
1516 g = gcd_odd (t, n);
1517 }
1518 while (g == 1);
1519
1520 if (n == g)
1521 {
1522 /* Found n itself as factor. Restart with different params. */
1523 factor_using_pollard_rho (n, a + 1, factors);
1524 return;
1525 }
1526
1527 n = n / g;
1528
1529 if (!prime_p (g))
1530 factor_using_pollard_rho (g, a + 1, factors);
1531 else
1532 factor_insert (factors, g);
1533
1534 if (prime_p (n))
1535 {
1536 factor_insert (factors, n);
1537 break;
1538 }
1539
1540 x = x % n;
1541 z = z % n;
1542 y = y % n;
1543 }
1544 }
1545
1546 static void
factor_using_pollard_rho2(uintmax_t n1,uintmax_t n0,unsigned long int a,struct factors * factors)1547 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1548 struct factors *factors)
1549 {
1550 uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1551
1552 unsigned long int k = 1;
1553 unsigned long int l = 1;
1554
1555 redcify2 (P1, P0, 1, n1, n0);
1556 addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1557 y1 = z1 = x1;
1558 y0 = z0 = x0;
1559
1560 while (n1 != 0 || n0 != 1)
1561 {
1562 binv (ni, n0);
1563
1564 for (;;)
1565 {
1566 do
1567 {
1568 x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1569 x1 = r1m;
1570 addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1571
1572 submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1573 P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1574 P1 = r1m;
1575
1576 if (k % 32 == 1)
1577 {
1578 g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1579 if (g1 != 0 || g0 != 1)
1580 goto factor_found;
1581 y1 = x1; y0 = x0;
1582 }
1583 }
1584 while (--k != 0);
1585
1586 z1 = x1; z0 = x0;
1587 k = l;
1588 l = 2 * l;
1589 for (unsigned long int i = 0; i < k; i++)
1590 {
1591 x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1592 x1 = r1m;
1593 addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1594 }
1595 y1 = x1; y0 = x0;
1596 }
1597
1598 factor_found:
1599 do
1600 {
1601 y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1602 y1 = r1m;
1603 addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1604
1605 submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1606 g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1607 }
1608 while (g1 == 0 && g0 == 1);
1609
1610 if (g1 == 0)
1611 {
1612 /* The found factor is one word, and > 1. */
1613 divexact_21 (n1, n0, n1, n0, g0); /* n = n / g */
1614
1615 if (!prime_p (g0))
1616 factor_using_pollard_rho (g0, a + 1, factors);
1617 else
1618 factor_insert (factors, g0);
1619 }
1620 else
1621 {
1622 /* The found factor is two words. This is highly unlikely, thus hard
1623 to trigger. Please be careful before you change this code! */
1624 uintmax_t ginv;
1625
1626 if (n1 == g1 && n0 == g0)
1627 {
1628 /* Found n itself as factor. Restart with different params. */
1629 factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1630 return;
1631 }
1632
1633 /* Compute n = n / g. Since the result will fit one word,
1634 we can compute the quotient modulo B, ignoring the high
1635 divisor word. */
1636 binv (ginv, g0);
1637 n0 = ginv * n0;
1638 n1 = 0;
1639
1640 if (!prime2_p (g1, g0))
1641 factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1642 else
1643 factor_insert_large (factors, g1, g0);
1644 }
1645
1646 if (n1 == 0)
1647 {
1648 if (prime_p (n0))
1649 {
1650 factor_insert (factors, n0);
1651 break;
1652 }
1653
1654 factor_using_pollard_rho (n0, a, factors);
1655 return;
1656 }
1657
1658 if (prime2_p (n1, n0))
1659 {
1660 factor_insert_large (factors, n1, n0);
1661 break;
1662 }
1663
1664 x0 = mod2 (&x1, x1, x0, n1, n0);
1665 z0 = mod2 (&z1, z1, z0, n1, n0);
1666 y0 = mod2 (&y1, y1, y0, n1, n0);
1667 }
1668 }
1669
1670 static void
mp_factor_using_pollard_rho(mpz_t n,unsigned long int a,struct mp_factors * factors)1671 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1672 struct mp_factors *factors)
1673 {
1674 mpz_t x, z, y, P;
1675 mpz_t t, t2;
1676
1677 devmsg ("[pollard-rho (%lu)] ", a);
1678
1679 mpz_inits (t, t2, nullptr);
1680 mpz_init_set_si (y, 2);
1681 mpz_init_set_si (x, 2);
1682 mpz_init_set_si (z, 2);
1683 mpz_init_set_ui (P, 1);
1684
1685 unsigned long long int k = 1;
1686 unsigned long long int l = 1;
1687
1688 while (mpz_cmp_ui (n, 1) != 0)
1689 {
1690 for (;;)
1691 {
1692 do
1693 {
1694 mpz_mul (t, x, x);
1695 mpz_mod (x, t, n);
1696 mpz_add_ui (x, x, a);
1697
1698 mpz_sub (t, z, x);
1699 mpz_mul (t2, P, t);
1700 mpz_mod (P, t2, n);
1701
1702 if (k % 32 == 1)
1703 {
1704 mpz_gcd (t, P, n);
1705 if (mpz_cmp_ui (t, 1) != 0)
1706 goto factor_found;
1707 mpz_set (y, x);
1708 }
1709 }
1710 while (--k != 0);
1711
1712 mpz_set (z, x);
1713 k = l;
1714 l = 2 * l;
1715 for (unsigned long long int i = 0; i < k; i++)
1716 {
1717 mpz_mul (t, x, x);
1718 mpz_mod (x, t, n);
1719 mpz_add_ui (x, x, a);
1720 }
1721 mpz_set (y, x);
1722 }
1723
1724 factor_found:
1725 do
1726 {
1727 mpz_mul (t, y, y);
1728 mpz_mod (y, t, n);
1729 mpz_add_ui (y, y, a);
1730
1731 mpz_sub (t, z, y);
1732 mpz_gcd (t, t, n);
1733 }
1734 while (mpz_cmp_ui (t, 1) == 0);
1735
1736 mpz_divexact (n, n, t); /* divide by t, before t is overwritten */
1737
1738 if (!mp_prime_p (t))
1739 {
1740 devmsg ("[composite factor--restarting pollard-rho] ");
1741 mp_factor_using_pollard_rho (t, a + 1, factors);
1742 }
1743 else
1744 {
1745 mp_factor_insert (factors, t);
1746 }
1747
1748 if (mp_prime_p (n))
1749 {
1750 mp_factor_insert (factors, n);
1751 break;
1752 }
1753
1754 mpz_mod (x, x, n);
1755 mpz_mod (z, z, n);
1756 mpz_mod (y, y, n);
1757 }
1758
1759 mpz_clears (P, t2, t, z, x, y, nullptr);
1760 }
1761
1762 #if USE_SQUFOF
1763 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1764 algorithm is replaced, consider also returning the remainder. */
1765 ATTRIBUTE_CONST
1766 static uintmax_t
isqrt(uintmax_t n)1767 isqrt (uintmax_t n)
1768 {
1769 uintmax_t x;
1770 int c;
1771 if (n == 0)
1772 return 0;
1773
1774 count_leading_zeros (c, n);
1775
1776 /* Make x > sqrt(n). This will be invariant through the loop. */
1777 x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) >> 1);
1778
1779 for (;;)
1780 {
1781 uintmax_t y = (x + n / x) / 2;
1782 if (y >= x)
1783 return x;
1784
1785 x = y;
1786 }
1787 }
1788
1789 ATTRIBUTE_CONST
1790 static uintmax_t
isqrt2(uintmax_t nh,uintmax_t nl)1791 isqrt2 (uintmax_t nh, uintmax_t nl)
1792 {
1793 int shift;
1794 uintmax_t x;
1795
1796 /* Ensures the remainder fits in an uintmax_t. */
1797 affirm (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1798
1799 if (nh == 0)
1800 return isqrt (nl);
1801
1802 count_leading_zeros (shift, nh);
1803 shift &= ~1;
1804
1805 /* Make x > sqrt (n). */
1806 x = isqrt ((nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1807 x <<= (W_TYPE_SIZE - shift) >> 1;
1808
1809 /* Do we need more than one iteration? */
1810 for (;;)
1811 {
1812 MAYBE_UNUSED uintmax_t r;
1813 uintmax_t q, y;
1814 udiv_qrnnd (q, r, nh, nl, x);
1815 y = (x + q) / 2;
1816
1817 if (y >= x)
1818 {
1819 uintmax_t hi, lo;
1820 umul_ppmm (hi, lo, x + 1, x + 1);
1821 affirm (gt2 (hi, lo, nh, nl));
1822
1823 umul_ppmm (hi, lo, x, x);
1824 affirm (ge2 (nh, nl, hi, lo));
1825 sub_ddmmss (hi, lo, nh, nl, hi, lo);
1826 affirm (hi == 0);
1827
1828 return x;
1829 }
1830
1831 x = y;
1832 }
1833 }
1834
1835 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1836 # define MAGIC64 0x0202021202030213ULL
1837 # define MAGIC63 0x0402483012450293ULL
1838 # define MAGIC65 0x218a019866014613ULL
1839 # define MAGIC11 0x23b
1840
1841 /* Return the square root if the input is a square, otherwise 0. */
1842 ATTRIBUTE_CONST
1843 static uintmax_t
is_square(uintmax_t x)1844 is_square (uintmax_t x)
1845 {
1846 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1847 computing the square root. */
1848 if (((MAGIC64 >> (x & 63)) & 1)
1849 && ((MAGIC63 >> (x % 63)) & 1)
1850 /* Both 0 and 64 are squares mod (65). */
1851 && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1852 && ((MAGIC11 >> (x % 11) & 1)))
1853 {
1854 uintmax_t r = isqrt (x);
1855 if (r * r == x)
1856 return r;
1857 }
1858 return 0;
1859 }
1860
1861 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1862 static short const invtab[0x81] =
1863 {
1864 0x200,
1865 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1866 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1867 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1868 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1869 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1870 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1871 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1872 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1873 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1874 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1875 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1876 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1877 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1878 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1879 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1880 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1881 };
1882
1883 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1884 that q < 0x40; here it instead uses a table of (Euclidean) inverses. */
1885 # define div_smallq(q, r, u, d) \
1886 do { \
1887 if ((u) / 0x40 < (d)) \
1888 { \
1889 int _cnt; \
1890 uintmax_t _dinv, _mask, _q, _r; \
1891 count_leading_zeros (_cnt, (d)); \
1892 _r = (u); \
1893 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1894 { \
1895 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1896 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1897 } \
1898 else \
1899 { \
1900 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1901 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1902 } \
1903 _r -= _q * (d); \
1904 \
1905 _mask = -(uintmax_t) (_r >= (d)); \
1906 (r) = _r - (_mask & (d)); \
1907 (q) = _q - _mask; \
1908 affirm ((q) * (d) + (r) == u); \
1909 } \
1910 else \
1911 { \
1912 uintmax_t _q = (u) / (d); \
1913 (r) = (u) - _q * (d); \
1914 (q) = _q; \
1915 } \
1916 } while (0)
1917
1918 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1919 = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1920 with 3025 = 55^2.
1921
1922 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1923 representing G_0 = (-55, 2 * 4652, 8653).
1924
1925 In the notation of the paper:
1926
1927 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1928
1929 Put
1930
1931 t_0 = floor([q_0 + R_0] / S0) = 1
1932 R_1 = t_0 * S_0 - R_0 = 4001
1933 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1934 */
1935
1936 /* Multipliers, in order of efficiency:
1937 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1938 0.7317 3*5*7 = 105 = 1
1939 0.7820 3*5*11 = 165 = 1
1940 0.7872 3*5 = 15 = 3
1941 0.8101 3*7*11 = 231 = 3
1942 0.8155 3*7 = 21 = 1
1943 0.8284 5*7*11 = 385 = 1
1944 0.8339 5*7 = 35 = 3
1945 0.8716 3*11 = 33 = 1
1946 0.8774 3 = 3 = 3
1947 0.8913 5*11 = 55 = 3
1948 0.8972 5 = 5 = 1
1949 0.9233 7*11 = 77 = 1
1950 0.9295 7 = 7 = 3
1951 0.9934 11 = 11 = 3
1952 */
1953 # define QUEUE_SIZE 50
1954 #endif
1955
1956 #if STAT_SQUFOF
1957 # define Q_FREQ_SIZE 50
1958 /* Element 0 keeps the total */
1959 static int q_freq[Q_FREQ_SIZE + 1];
1960 #endif
1961
1962 #if USE_SQUFOF
1963 /* Return true on success. Expected to fail only for numbers
1964 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1965 static bool
factor_using_squfof(uintmax_t n1,uintmax_t n0,struct factors * factors)1966 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1967 {
1968 /* Uses algorithm and notation from
1969
1970 SQUARE FORM FACTORIZATION
1971 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1972
1973 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1974 */
1975
1976 static short const multipliers_1[] =
1977 { /* = 1 (mod 4) */
1978 105, 165, 21, 385, 33, 5, 77, 1, 0
1979 };
1980 static short const multipliers_3[] =
1981 { /* = 3 (mod 4) */
1982 1155, 15, 231, 35, 3, 55, 7, 11, 0
1983 };
1984
1985 struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1986
1987 if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1988 return false;
1989
1990 uintmax_t sqrt_n = isqrt2 (n1, n0);
1991
1992 if (n0 == sqrt_n * sqrt_n)
1993 {
1994 uintmax_t p1, p0;
1995
1996 umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1997 affirm (p0 == n0);
1998
1999 if (n1 == p1)
2000 {
2001 if (prime_p (sqrt_n))
2002 factor_insert_multiplicity (factors, sqrt_n, 2);
2003 else
2004 {
2005 struct factors f;
2006
2007 f.nfactors = 0;
2008 if (!factor_using_squfof (0, sqrt_n, &f))
2009 {
2010 /* Try pollard rho instead */
2011 factor_using_pollard_rho (sqrt_n, 1, &f);
2012 }
2013 /* Duplicate the new factors */
2014 for (unsigned int i = 0; i < f.nfactors; i++)
2015 factor_insert_multiplicity (factors, f.p[i], 2 * f.e[i]);
2016 }
2017 return true;
2018 }
2019 }
2020
2021 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2022 for (short const *m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2023 *m; m++)
2024 {
2025 uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2026 unsigned int i;
2027 unsigned int mu = *m;
2028 int qpos = 0;
2029
2030 affirm (mu * n0 % 4 == 3);
2031
2032 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2033 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2034 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2035 mu^3 < n.
2036
2037 However, this seems insufficient: With n = 37243139 and mu =
2038 105, we get a trivial factor, from the square 38809 = 197^2,
2039 without any corresponding Q earlier in the iteration.
2040
2041 Requiring 64 mu^3 < n seems sufficient. */
2042 if (n1 == 0)
2043 {
2044 if ((uintmax_t) mu * mu * mu >= n0 / 64)
2045 continue;
2046 }
2047 else
2048 {
2049 if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2050 continue;
2051 }
2052 umul_ppmm (Dh, Dl, n0, mu);
2053 Dh += n1 * mu;
2054
2055 affirm (Dl % 4 != 1);
2056 affirm (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2057
2058 S = isqrt2 (Dh, Dl);
2059
2060 Q1 = 1;
2061 P = S;
2062
2063 /* Square root remainder fits in one word, so ignore high part. */
2064 Q = Dl - P * P;
2065 /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))? */
2066 L = isqrt (2 * S);
2067 B = 2 * L;
2068 L1 = mu * 2 * L;
2069
2070 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2071 4 D. */
2072
2073 for (i = 0; i <= B; i++)
2074 {
2075 uintmax_t q, P1, t, rem;
2076
2077 div_smallq (q, rem, S + P, Q);
2078 P1 = S - rem; /* P1 = q*Q - P */
2079
2080 affirm (q > 0 && Q > 0);
2081
2082 # if STAT_SQUFOF
2083 q_freq[0]++;
2084 q_freq[MIN (q, Q_FREQ_SIZE)]++;
2085 # endif
2086
2087 if (Q <= L1)
2088 {
2089 uintmax_t g = Q;
2090
2091 if ((Q & 1) == 0)
2092 g /= 2;
2093
2094 g /= gcd_odd (g, mu);
2095
2096 if (g <= L)
2097 {
2098 if (qpos >= QUEUE_SIZE)
2099 error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2100 queue[qpos].Q = g;
2101 queue[qpos].P = P % g;
2102 qpos++;
2103 }
2104 }
2105
2106 /* I think the difference can be either sign, but mod
2107 2^W_TYPE_SIZE arithmetic should be fine. */
2108 t = Q1 + q * (P - P1);
2109 Q1 = Q;
2110 Q = t;
2111 P = P1;
2112
2113 if ((i & 1) == 0)
2114 {
2115 uintmax_t r = is_square (Q);
2116 if (r)
2117 {
2118 for (int j = 0; j < qpos; j++)
2119 {
2120 if (queue[j].Q == r)
2121 {
2122 if (r == 1)
2123 /* Traversed entire cycle. */
2124 goto next_multiplier;
2125
2126 /* Need the absolute value for divisibility test. */
2127 if (P >= queue[j].P)
2128 t = P - queue[j].P;
2129 else
2130 t = queue[j].P - P;
2131 if (t % r == 0)
2132 {
2133 /* Delete entries up to and including entry
2134 j, which matched. */
2135 memmove (queue, queue + j + 1,
2136 (qpos - j - 1) * sizeof (queue[0]));
2137 qpos -= (j + 1);
2138 }
2139 goto next_i;
2140 }
2141 }
2142
2143 /* We have found a square form, which should give a
2144 factor. */
2145 Q1 = r;
2146 affirm (S >= P); /* What signs are possible? */
2147 P += r * ((S - P) / r);
2148
2149 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2150 for the case D = 2N. */
2151 /* Compute Q = (D - P*P) / Q1, but we need double
2152 precision. */
2153 uintmax_t hi, lo;
2154 umul_ppmm (hi, lo, P, P);
2155 sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2156 udiv_qrnnd (Q, rem, hi, lo, Q1);
2157 affirm (rem == 0);
2158
2159 for (;;)
2160 {
2161 /* Note: There appears to by a typo in the paper,
2162 Step 4a in the algorithm description says q <--
2163 floor([S+P]/\hat Q), but looking at the equations
2164 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2165 (In this code, \hat Q is Q1). */
2166 div_smallq (q, rem, S + P, Q);
2167 P1 = S - rem; /* P1 = q*Q - P */
2168
2169 # if STAT_SQUFOF
2170 q_freq[0]++;
2171 q_freq[MIN (q, Q_FREQ_SIZE)]++;
2172 # endif
2173 if (P == P1)
2174 break;
2175 t = Q1 + q * (P - P1);
2176 Q1 = Q;
2177 Q = t;
2178 P = P1;
2179 }
2180
2181 if ((Q & 1) == 0)
2182 Q /= 2;
2183 Q /= gcd_odd (Q, mu);
2184
2185 affirm (Q > 1 && (n1 || Q < n0));
2186
2187 if (prime_p (Q))
2188 factor_insert (factors, Q);
2189 else if (!factor_using_squfof (0, Q, factors))
2190 factor_using_pollard_rho (Q, 2, factors);
2191
2192 divexact_21 (n1, n0, n1, n0, Q);
2193
2194 if (prime2_p (n1, n0))
2195 factor_insert_large (factors, n1, n0);
2196 else
2197 {
2198 if (!factor_using_squfof (n1, n0, factors))
2199 {
2200 if (n1 == 0)
2201 factor_using_pollard_rho (n0, 1, factors);
2202 else
2203 factor_using_pollard_rho2 (n1, n0, 1, factors);
2204 }
2205 }
2206
2207 return true;
2208 }
2209 }
2210 next_i:;
2211 }
2212 next_multiplier:;
2213 }
2214 return false;
2215 }
2216 #endif
2217
2218 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2219 results in FACTORS. */
2220 static void
factor(uintmax_t t1,uintmax_t t0,struct factors * factors)2221 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2222 {
2223 factors->nfactors = 0;
2224 factors->plarge[1] = 0;
2225
2226 if (t1 == 0 && t0 < 2)
2227 return;
2228
2229 t0 = factor_using_division (&t1, t1, t0, factors);
2230
2231 if (t1 == 0 && t0 < 2)
2232 return;
2233
2234 if (prime2_p (t1, t0))
2235 factor_insert_large (factors, t1, t0);
2236 else
2237 {
2238 #if USE_SQUFOF
2239 if (factor_using_squfof (t1, t0, factors))
2240 return;
2241 #endif
2242
2243 if (t1 == 0)
2244 factor_using_pollard_rho (t0, 1, factors);
2245 else
2246 factor_using_pollard_rho2 (t1, t0, 1, factors);
2247 }
2248 }
2249
2250 /* Use Pollard-rho to compute the prime factors of
2251 arbitrary-precision T, and put the results in FACTORS. */
2252 static void
mp_factor(mpz_t t,struct mp_factors * factors)2253 mp_factor (mpz_t t, struct mp_factors *factors)
2254 {
2255 mp_factor_init (factors);
2256
2257 if (mpz_sgn (t) != 0)
2258 {
2259 mp_factor_using_division (t, factors);
2260
2261 if (mpz_cmp_ui (t, 1) != 0)
2262 {
2263 devmsg ("[is number prime?] ");
2264 if (mp_prime_p (t))
2265 mp_factor_insert (factors, t);
2266 else
2267 mp_factor_using_pollard_rho (t, 1, factors);
2268 }
2269 }
2270 }
2271
2272 static strtol_error
strto2uintmax(uintmax_t * hip,uintmax_t * lop,char const * s)2273 strto2uintmax (uintmax_t *hip, uintmax_t *lop, char const *s)
2274 {
2275 int lo_carry;
2276 uintmax_t hi = 0, lo = 0;
2277
2278 strtol_error err = LONGINT_INVALID;
2279
2280 /* Initial scan for invalid digits. */
2281 char const *p = s;
2282 for (;;)
2283 {
2284 unsigned char c = *p++;
2285 if (c == 0)
2286 break;
2287
2288 if (UNLIKELY (!ISDIGIT (c)))
2289 {
2290 err = LONGINT_INVALID;
2291 break;
2292 }
2293
2294 err = LONGINT_OK; /* we've seen at least one valid digit */
2295 }
2296
2297 while (err == LONGINT_OK)
2298 {
2299 unsigned char c = *s++;
2300 if (c == 0)
2301 break;
2302
2303 c -= '0';
2304
2305 if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2306 {
2307 err = LONGINT_OVERFLOW;
2308 break;
2309 }
2310 hi = 10 * hi;
2311
2312 lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2313 lo_carry += 10 * lo < 2 * lo;
2314
2315 lo = 10 * lo;
2316 lo += c;
2317
2318 lo_carry += lo < c;
2319 hi += lo_carry;
2320 if (UNLIKELY (hi < lo_carry))
2321 {
2322 err = LONGINT_OVERFLOW;
2323 break;
2324 }
2325 }
2326
2327 *hip = hi;
2328 *lop = lo;
2329
2330 return err;
2331 }
2332
2333 /* Structure and routines for buffering and outputting full lines,
2334 to support parallel operation efficiently. */
2335 static struct lbuf_
2336 {
2337 char *buf;
2338 char *end;
2339 } lbuf;
2340
2341 /* 512 is chosen to give good performance,
2342 and also is the max guaranteed size that
2343 consumers can read atomically through pipes.
2344 Also it's big enough to cater for max line length
2345 even with 128 bit uintmax_t. */
2346 #define FACTOR_PIPE_BUF 512
2347
2348 static void
lbuf_alloc(void)2349 lbuf_alloc (void)
2350 {
2351 if (lbuf.buf)
2352 return;
2353
2354 /* Double to ensure enough space for
2355 previous numbers + next number. */
2356 lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2357 lbuf.end = lbuf.buf;
2358 }
2359
2360 /* Write complete LBUF to standard output. */
2361 static void
lbuf_flush(void)2362 lbuf_flush (void)
2363 {
2364 size_t size = lbuf.end - lbuf.buf;
2365 if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2366 write_error ();
2367 lbuf.end = lbuf.buf;
2368 }
2369
2370 /* Add a character C to LBUF and if it's a newline
2371 and enough bytes are already buffered,
2372 then write atomically to standard output. */
2373 static void
lbuf_putc(char c)2374 lbuf_putc (char c)
2375 {
2376 *lbuf.end++ = c;
2377
2378 if (c == '\n')
2379 {
2380 size_t buffered = lbuf.end - lbuf.buf;
2381
2382 /* Provide immediate output for interactive use. */
2383 static int line_buffered = -1;
2384 if (line_buffered == -1)
2385 line_buffered = isatty (STDIN_FILENO) || isatty (STDOUT_FILENO);
2386 if (line_buffered)
2387 lbuf_flush ();
2388 else if (buffered >= FACTOR_PIPE_BUF)
2389 {
2390 /* Write output in <= PIPE_BUF chunks
2391 so consumers can read atomically. */
2392 char const *tend = lbuf.end;
2393
2394 /* Since a umaxint_t's factors must fit in 512
2395 we're guaranteed to find a newline here. */
2396 char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2397 while (*--tlend != '\n');
2398 tlend++;
2399
2400 lbuf.end = tlend;
2401 lbuf_flush ();
2402
2403 /* Buffer the remainder. */
2404 memcpy (lbuf.buf, tlend, tend - tlend);
2405 lbuf.end = lbuf.buf + (tend - tlend);
2406 }
2407 }
2408 }
2409
2410 /* Buffer an int to the internal LBUF. */
2411 static void
lbuf_putint(uintmax_t i,size_t min_width)2412 lbuf_putint (uintmax_t i, size_t min_width)
2413 {
2414 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2415 char const *umaxstr = umaxtostr (i, buf);
2416 size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2417 size_t z = width;
2418
2419 for (; z < min_width; z++)
2420 *lbuf.end++ = '0';
2421
2422 memcpy (lbuf.end, umaxstr, width);
2423 lbuf.end += width;
2424 }
2425
2426 static void
print_uintmaxes(uintmax_t t1,uintmax_t t0)2427 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2428 {
2429 uintmax_t q, r;
2430
2431 if (t1 == 0)
2432 lbuf_putint (t0, 0);
2433 else
2434 {
2435 /* Use very plain code here since it seems hard to write fast code
2436 without assuming a specific word size. */
2437 q = t1 / 1000000000;
2438 r = t1 % 1000000000;
2439 udiv_qrnnd (t0, r, r, t0, 1000000000);
2440 print_uintmaxes (q, t0);
2441 lbuf_putint (r, 9);
2442 }
2443 }
2444
2445 /* Single-precision factoring */
2446 static void
print_factors_single(uintmax_t t1,uintmax_t t0)2447 print_factors_single (uintmax_t t1, uintmax_t t0)
2448 {
2449 struct factors factors;
2450
2451 print_uintmaxes (t1, t0);
2452 lbuf_putc (':');
2453
2454 factor (t1, t0, &factors);
2455
2456 for (int j = 0; j < factors.nfactors; j++)
2457 for (int k = 0; k < factors.e[j]; k++)
2458 {
2459 lbuf_putc (' ');
2460 print_uintmaxes (0, factors.p[j]);
2461 if (print_exponents && factors.e[j] > 1)
2462 {
2463 lbuf_putc ('^');
2464 lbuf_putint (factors.e[j], 0);
2465 break;
2466 }
2467 }
2468
2469 if (factors.plarge[1])
2470 {
2471 lbuf_putc (' ');
2472 print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2473 }
2474
2475 lbuf_putc ('\n');
2476 }
2477
2478 /* Emit the factors of the indicated number. If we have the option of using
2479 either algorithm, we select on the basis of the length of the number.
2480 For longer numbers, we prefer the MP algorithm even if the native algorithm
2481 has enough digits, because the algorithm is better. The turnover point
2482 depends on the value. */
2483 static bool
print_factors(char const * input)2484 print_factors (char const *input)
2485 {
2486 /* Skip initial spaces and '+'. */
2487 char const *str = input;
2488 while (*str == ' ')
2489 str++;
2490 str += *str == '+';
2491
2492 uintmax_t t1, t0;
2493
2494 /* Try converting the number to one or two words. If it fails, use GMP or
2495 print an error message. The 2nd condition checks that the most
2496 significant bit of the two-word number is clear, in a typesize neutral
2497 way. */
2498 strtol_error err = strto2uintmax (&t1, &t0, str);
2499
2500 switch (err)
2501 {
2502 case LONGINT_OK:
2503 if (((t1 << 1) >> 1) == t1)
2504 {
2505 devmsg ("[using single-precision arithmetic] ");
2506 print_factors_single (t1, t0);
2507 return true;
2508 }
2509 break;
2510
2511 case LONGINT_OVERFLOW:
2512 /* Try GMP. */
2513 break;
2514
2515 default:
2516 error (0, 0, _("%s is not a valid positive integer"), quote (input));
2517 return false;
2518 }
2519
2520 devmsg ("[using arbitrary-precision arithmetic] ");
2521 mpz_t t;
2522 struct mp_factors factors;
2523
2524 mpz_init_set_str (t, str, 10);
2525
2526 mpz_out_str (stdout, 10, t);
2527 putchar (':');
2528 mp_factor (t, &factors);
2529
2530 for (idx_t j = 0; j < factors.nfactors; j++)
2531 for (unsigned long int k = 0; k < factors.e[j]; k++)
2532 {
2533 putchar (' ');
2534 mpz_out_str (stdout, 10, factors.p[j]);
2535 if (print_exponents && factors.e[j] > 1)
2536 {
2537 printf ("^%lu", factors.e[j]);
2538 break;
2539 }
2540 }
2541
2542 mp_factor_clear (&factors);
2543 mpz_clear (t);
2544 putchar ('\n');
2545 fflush (stdout);
2546 return true;
2547 }
2548
2549 void
usage(int status)2550 usage (int status)
2551 {
2552 if (status != EXIT_SUCCESS)
2553 emit_try_help ();
2554 else
2555 {
2556 printf (_("\
2557 Usage: %s [OPTION] [NUMBER]...\n\
2558 "),
2559 program_name);
2560 fputs (_("\
2561 Print the prime factors of each specified integer NUMBER. If none\n\
2562 are specified on the command line, read them from standard input.\n\
2563 \n\
2564 "), stdout);
2565 fputs ("\
2566 -h, --exponents print repeated factors in form p^e unless e is 1\n\
2567 ", stdout);
2568 fputs (HELP_OPTION_DESCRIPTION, stdout);
2569 fputs (VERSION_OPTION_DESCRIPTION, stdout);
2570 emit_ancillary_info (PROGRAM_NAME);
2571 }
2572 exit (status);
2573 }
2574
2575 static bool
do_stdin(void)2576 do_stdin (void)
2577 {
2578 bool ok = true;
2579 token_buffer tokenbuffer;
2580
2581 init_tokenbuffer (&tokenbuffer);
2582
2583 while (true)
2584 {
2585 size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2586 &tokenbuffer);
2587 if (token_length == (size_t) -1)
2588 {
2589 if (ferror (stdin))
2590 error (EXIT_FAILURE, errno, _("error reading input"));
2591 break;
2592 }
2593
2594 ok &= print_factors (tokenbuffer.buffer);
2595 }
2596 free (tokenbuffer.buffer);
2597
2598 return ok;
2599 }
2600
2601 int
main(int argc,char ** argv)2602 main (int argc, char **argv)
2603 {
2604 initialize_main (&argc, &argv);
2605 set_program_name (argv[0]);
2606 setlocale (LC_ALL, "");
2607 bindtextdomain (PACKAGE, LOCALEDIR);
2608 textdomain (PACKAGE);
2609
2610 lbuf_alloc ();
2611 atexit (close_stdout);
2612 atexit (lbuf_flush);
2613
2614 int c;
2615 while ((c = getopt_long (argc, argv, "h", long_options, nullptr)) != -1)
2616 {
2617 switch (c)
2618 {
2619 case 'h': /* NetBSD used -h for this functionality first. */
2620 print_exponents = true;
2621 break;
2622
2623 case DEV_DEBUG_OPTION:
2624 dev_debug = true;
2625 break;
2626
2627 case_GETOPT_HELP_CHAR;
2628
2629 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2630
2631 default:
2632 usage (EXIT_FAILURE);
2633 }
2634 }
2635
2636 #if STAT_SQUFOF
2637 memset (q_freq, 0, sizeof (q_freq));
2638 #endif
2639
2640 bool ok;
2641 if (argc <= optind)
2642 ok = do_stdin ();
2643 else
2644 {
2645 ok = true;
2646 for (int i = optind; i < argc; i++)
2647 if (! print_factors (argv[i]))
2648 ok = false;
2649 }
2650
2651 #if STAT_SQUFOF
2652 if (q_freq[0] > 0)
2653 {
2654 double acc_f;
2655 printf ("q freq. cum. freq.(total: %d)\n", q_freq[0]);
2656 for (int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2657 {
2658 double f = (double) q_freq[i] / q_freq[0];
2659 acc_f += f;
2660 printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2661 100.0 * f, 100.0 * acc_f);
2662 }
2663 }
2664 #endif
2665
2666 return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2667 }
2668