
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"
#if BR_INT128 || BR_UMUL128

#if BR_INT128

/*
 * Compute x*y+v1+v2. Operands are 64-bit, and result is 128-bit, with
 * high word in "hi" and low word in "lo".
 *
 * The 128-bit sum cannot overflow: the maximum possible value is
 * (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1.
 */
#define FMA1(hi, lo, x, y, v1, v2)   do { \
		unsigned __int128 fmaz; \
		fmaz = (unsigned __int128)(x) * (unsigned __int128)(y) \
			+ (unsigned __int128)(v1) + (unsigned __int128)(v2); \
		(hi) = (uint64_t)(fmaz >> 64); \
		(lo) = (uint64_t)fmaz; \
	} while (0)
/*
 * Compute x1*y1+x2*y2+v1+v2. Operands are 64-bit, and result is 128-bit,
 * with high word in "hi" and low word in "lo".
 *
 * Callers should ensure that the two inner products, and the v1 and v2
 * operands, are multiple of 4 (this is not used by this specific definition
 * but may help other implementations).
 *
 * NOTE(review): the result is truncated to 128 bits; this is harmless
 * for the operand ranges used in this file, where one factor of each
 * product is a 62-bit word (each product is then below 2^126, so the
 * full sum fits in 128 bits).
 */
#define FMA2(hi, lo, x1, y1, x2, y2, v1, v2)   do { \
		unsigned __int128 fmaz; \
		fmaz = (unsigned __int128)(x1) * (unsigned __int128)(y1) \
			+ (unsigned __int128)(x2) * (unsigned __int128)(y2) \
			+ (unsigned __int128)(v1) + (unsigned __int128)(v2); \
		(hi) = (uint64_t)(fmaz >> 64); \
		(lo) = (uint64_t)fmaz; \
	} while (0)
#elif BR_UMUL128

#include <intrin.h>

/*
 * Compute x*y+v1+v2. Operands are 64-bit, and result is 128-bit, with
 * high word in "hi" and low word in "lo". This version uses the MSVC
 * _umul128() and _addcarry_u64() intrinsics: the product is computed
 * first, then v1 and v2 are each added with explicit carry propagation
 * into the high word. The total cannot overflow 128 bits, since
 * (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1.
 */
#define FMA1(hi, lo, x, y, v1, v2)   do { \
		uint64_t fmahi, fmalo; \
		unsigned char fmacc; \
		fmalo = _umul128((x), (y), &fmahi); \
		fmacc = _addcarry_u64(0, fmalo, (v1), &fmalo); \
		_addcarry_u64(fmacc, fmahi, 0, &fmahi); \
		fmacc = _addcarry_u64(0, fmalo, (v2), &(lo)); \
		_addcarry_u64(fmacc, fmahi, 0, &(hi)); \
	} while (0)
/*
 * Compute x1*y1+x2*y2+v1+v2, with the result over 128 bits (high word
 * in "hi", low word in "lo").
 *
 * Normally we should use _addcarry_u64() for FMA2 too, but it makes
 * Visual Studio crash. Instead we use this version, which leverages
 * the fact that the vx operands, and the products, are multiple of 4.
 * This is unfortunately slower.
 *
 * How it works: since the two low bits of every term are zero, each
 * term can be shifted right by 2 without losing information; the four
 * shifted terms are each below 2^62, so their sum fits in 64 bits.
 * The low word is the sum shifted back up, and the carry into the
 * high word is recovered from bits 62+ of the truncated sum.
 */
#define FMA2(hi, lo, x1, y1, x2, y2, v1, v2)   do { \
		uint64_t fma1hi, fma1lo; \
		uint64_t fma2hi, fma2lo; \
		uint64_t fmatt; \
		fma1lo = _umul128((x1), (y1), &fma1hi); \
		fma2lo = _umul128((x2), (y2), &fma2hi); \
		fmatt = (fma1lo >> 2) + (fma2lo >> 2) \
			+ ((v1) >> 2) + ((v2) >> 2); \
		(lo) = fmatt << 2; \
		(hi) = fma1hi + fma2hi + (fmatt >> 62); \
	} while (0)
/*
 * The FMA2 macro definition we would prefer to use, but it triggers
 * an internal compiler error in Visual Studio 2015.
 *
#define FMA2(hi, lo, x1, y1, x2, y2, v1, v2)   do { \
		uint64_t fma1hi, fma1lo; \
		uint64_t fma2hi, fma2lo; \
		unsigned char fmacc; \
		fma1lo = _umul128((x1), (y1), &fma1hi); \
		fma2lo = _umul128((x2), (y2), &fma2hi); \
		fmacc = _addcarry_u64(0, fma1lo, (v1), &fma1lo); \
		_addcarry_u64(fmacc, fma1hi, 0, &fma1hi); \
		fmacc = _addcarry_u64(0, fma2lo, (v2), &fma2lo); \
		_addcarry_u64(fmacc, fma2hi, 0, &fma2hi); \
		fmacc = _addcarry_u64(0, fma1lo, fma2lo, &(lo)); \
		_addcarry_u64(fmacc, fma1hi, fma2hi, &(hi)); \
	} while (0)
 */

#endif

/*
 * Values are handled in this file as arrays of 62-bit words (stored
 * in uint64_t, with the top two bits clear). MASK62 has the low 62
 * bits set; MUL62_lo() returns the low 62 bits of the product of its
 * operands.
 */
#define MASK62           ((uint64_t)0x3FFFFFFFFFFFFFFF)
#define MUL62_lo(x, y)   (((uint64_t)(x) * (uint64_t)(y)) & MASK62)
/* 117
* Subtract b from a, and return the final carry. If 'ctl32' is 0, then 118
* a[] is kept unmodified, but the final carry is still computed and 119
* returned. 120
*/ 121
static uint32_t 122
i62_sub(uint64_t *a
, const uint64_t *b
, size_t num
, uint32_t ctl32
) 123
{ 124
uint64_t cc
, mask
; 125
size_t u
; 126 127 cc
= 0; 128 ctl32
= -ctl32
; 129 mask
= (uint64_t)ctl32
| ((uint64_t)ctl32
<< 32); 130
for (u
= 0; u
< num
; u
++) { 131
uint64_t aw
, bw
, dw
; 132 133 aw
= a
[u
]; 134 bw
= b
[u
]; 135 dw
= aw
- bw
- cc
; 136 cc
= dw
>> 63; 137 dw
&= MASK62
; 138 a
[u
] = aw
^ (mask
& (dw
^ aw
)); 139
} 140
return (uint32_t)cc
; 141
} 142 143
/*
 * Montgomery multiplication, over arrays of 62-bit values. The
 * destination array (d) must be distinct from the other operands
 * (x, y and m). All arrays are in little-endian format (least
 * significant word comes first) over 'num' words. The m0i parameter
 * must be equal to -(1/m[0]) mod 2^62; br_i62_modpow_opt() computes
 * it that way before calling this function.
 *
 * All values handed to FMA2() are pre-multiplied by 4 ('<< 2'), and
 * results are scaled back down ('>> 2'), so that the inner products
 * and addends are multiples of 4, as required by the FMA2()
 * implementation used as a Visual Studio workaround (see the
 * comments near that macro definition).
 */
static void
montymul(uint64_t *d, const uint64_t *x, const uint64_t *y,
	const uint64_t *m, size_t num, uint64_t m0i)
{
	uint64_t dh;
	size_t u, num4;

	/*
	 * num4 is the largest integer of the form 4k+1 not greater
	 * than num: indices 1 to num4-1 are handled by the loop
	 * unrolled four times, the remaining indices by a plain loop.
	 */
	num4 = 1 + ((num - 1) & ~(size_t)3);
	memset(d, 0, num * sizeof *d);
	dh = 0;
	for (u = 0; u < num; u ++) {
		size_t v;
		uint64_t f, xu;
		uint64_t r, zh;
		uint64_t hi, lo;

		xu = x[u] << 2;

		/*
		 * Montgomery reduction factor: f is computed (modulo
		 * 2^62) such that d + x[u]*y + f*m is a multiple of
		 * 2^62 (this follows from m0i = -1/m[0] mod 2^62);
		 * the accumulator can thus be divided by 2^62, which
		 * the 'd[v - 1] = lo >> 2' stores below implement by
		 * writing each word one slot down.
		 */
		f = MUL62_lo(d[0] + MUL62_lo(x[u], y[0]), m0i) << 2;

		FMA2(hi, lo, xu, y[0], f, m[0], d[0] << 2, 0);
		r = hi;

		for (v = 1; v < num4; v += 4) {
			FMA2(hi, lo, xu, y[v + 0],
				f, m[v + 0], d[v + 0] << 2, r << 2);
			r = hi + (r >> 62);
			d[v - 1] = lo >> 2;
			FMA2(hi, lo, xu, y[v + 1],
				f, m[v + 1], d[v + 1] << 2, r << 2);
			r = hi + (r >> 62);
			d[v + 0] = lo >> 2;
			FMA2(hi, lo, xu, y[v + 2],
				f, m[v + 2], d[v + 2] << 2, r << 2);
			r = hi + (r >> 62);
			d[v + 1] = lo >> 2;
			FMA2(hi, lo, xu, y[v + 3],
				f, m[v + 3], d[v + 3] << 2, r << 2);
			r = hi + (r >> 62);
			d[v + 2] = lo >> 2;
		}
		for (; v < num; v ++) {
			FMA2(hi, lo, xu, y[v], f, m[v], d[v] << 2, r << 2);
			r = hi + (r >> 62);
			d[v - 1] = lo >> 2;
		}

		/*
		 * Top word: fold the final carry (r) into the running
		 * extra word (dh); keep the low 62 bits as the new top
		 * word, and carry the rest over to the next iteration.
		 */
		zh = dh + r;
		d[num - 1] = zh & MASK62;
		dh = zh >> 62;
	}

	/*
	 * Conditional subtraction of the modulus: applied if dh is
	 * non-zero, or if d >= m. The inner i62_sub() call uses
	 * ctl32 == 0, so it is a dry run that only computes the
	 * borrow; NOT() of that borrow is non-zero exactly when
	 * d >= m. The subtraction itself is constant-time.
	 */
	i62_sub(d, m, num, (uint32_t)dh | NOT(i62_sub(d, m, num, 0)));
}
/*
 * Conversion back from Montgomery representation: x is divided by
 * 2^(62*num) modulo m (one reduction round per word). Conventions
 * are the same as in montymul(): little-endian arrays of 62-bit
 * words, and m0i = -(1/m[0]) mod 2^62.
 */
static void
frommonty(uint64_t *x, const uint64_t *m, size_t num, uint64_t m0i)
{
	size_t u, v;

	for (u = 0; u < num; u ++) {
		uint64_t f, cc;

		/*
		 * Add f*m to x, with f chosen (modulo 2^62) so that
		 * the low word of the result is zero; x can then be
		 * shifted down by one word. The 'v != 0' test delays
		 * the stores by one position to implement that shift
		 * in place. As elsewhere in this file, values are
		 * scaled by 4 on the way into FMA1() and scaled back
		 * ('>> 2') on the way out.
		 */
		f = MUL62_lo(x[0], m0i) << 2;
		cc = 0;
		for (v = 0; v < num; v ++) {
			uint64_t hi, lo;

			FMA1(hi, lo, f, m[v], x[v] << 2, cc);
			cc = hi << 2;
			if (v != 0) {
				x[v - 1] = lo >> 2;
			}
		}
		x[num - 1] = cc >> 2;
	}

	/*
	 * Final conditional subtraction of m, applied only if x >= m:
	 * the inner i62_sub() call is a dry run (ctl32 == 0) that
	 * returns the borrow, and NOT() of it is non-zero exactly
	 * when no borrow occurred, i.e. when x >= m.
	 */
	i62_sub(x, m, num, NOT(i62_sub(x, m, num, 0)));
}
/* see inner.h */
/*
 * Modular exponentiation: x31 is replaced with x^e mod m. The x31
 * and m31 arrays use the i31 format (header word with the announced
 * bit length, then 31-bit limbs, least significant first); e is a
 * big-endian byte string of length elen; m0i31 fulfills the
 * -(1/m0) property modulo 2^31. tmp[] provides twlen 64-bit
 * temporary words. Returned value is 1 on success, 0 if the
 * temporary area is too small. Window lookups and result selection
 * are performed with constant-time masking (no secret-dependent
 * array index or branch).
 */
uint32_t
br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
	const uint32_t *m31, uint32_t m0i31, uint64_t *tmp, size_t twlen)
{
	size_t u, mw31num, mw62num;
	uint64_t *x, *m, *t1, *t2;
	uint64_t m0i;
	uint32_t acc;
	int win_len, acc_len;

	/*
	 * Get modulus size, in words (31-bit words, then the number
	 * of 62-bit words needed to hold the same limbs two by two).
	 */
	mw31num = (m31[0] + 31) >> 5;
	mw62num = (mw31num + 1) >> 1;

	/*
	 * In order to apply this function, we must have enough room to
	 * copy the operand and modulus into the temporary array, along
	 * with at least two temporaries. If there is not enough room,
	 * switch to br_i31_modpow(). We also use br_i31_modpow() if the
	 * modulus length is not at least four words (94 bits or more).
	 */
	if (mw31num < 4 || (mw62num << 2) > twlen) {
		/*
		 * We assume here that we can split an aligned uint64_t
		 * into two properly aligned uint32_t. Since both types
		 * are supposed to have an exact width with no padding,
		 * then this property must hold.
		 */
		size_t txlen;

		txlen = mw31num + 1;
		if (twlen < txlen) {
			/*
			 * br_i31_modpow() needs two arrays of txlen
			 * 32-bit words each, i.e. txlen 64-bit words
			 * in total.
			 */
			return 0;
		}
		br_i31_modpow(x31, e, elen, m31, m0i31,
			(uint32_t *)tmp, (uint32_t *)tmp + txlen);
		return 1;
	}

	/*
	 * Convert x to Montgomery representation: this means that
	 * we replace x with x*2^z mod m, where z is the smallest multiple
	 * of the word size such that 2^z >= m. We want to reuse the 31-bit
	 * functions here (for constant-time operation), but we need z
	 * for a 62-bit word size. Each br_i31_muladd_small() call
	 * multiplies by 2^31, hence two calls per 62-bit word.
	 */
	for (u = 0; u < mw62num; u ++) {
		br_i31_muladd_small(x31, 0, m31);
		br_i31_muladd_small(x31, 0, m31);
	}

	/*
	 * Assemble operands into arrays of 62-bit words. Note that
	 * all the arrays of 62-bit words that we will handle here
	 * are without any leading size word.
	 *
	 * We also adjust tmp and twlen to account for the words used
	 * for these extra arrays.
	 */
	m = tmp;
	x = tmp + mw62num;
	tmp += (mw62num << 1);
	twlen -= (mw62num << 1);
	for (u = 0; u < mw31num; u += 2) {
		size_t v;

		v = u >> 1;
		if ((u + 1) == mw31num) {
			/* Odd limb count: the last 62-bit word gets
			   a single 31-bit limb. */
			m[v] = (uint64_t)m31[u + 1];
			x[v] = (uint64_t)x31[u + 1];
		} else {
			m[v] = (uint64_t)m31[u + 1]
				+ ((uint64_t)m31[u + 2] << 31);
			x[v] = (uint64_t)x31[u + 1]
				+ ((uint64_t)x31[u + 2] << 31);
		}
	}

	/*
	 * Compute window size. We support windows up to 5 bits; for a
	 * window of size k bits, we need 2^k+1 temporaries (for k = 1,
	 * we use special code that uses only 2 temporaries).
	 */
	for (win_len = 5; win_len > 1; win_len --) {
		if ((((uint32_t)1 << win_len) + 1) * mw62num <= twlen) {
			break;
		}
	}

	t1 = tmp;
	t2 = tmp + mw62num;

	/*
	 * Compute m0i, which is equal to -(1/m0) mod 2^62. We were
	 * provided with m0i31, which already fulfills this property
	 * modulo 2^31; the single expression below is then sufficient.
	 */
	m0i = (uint64_t)m0i31;
	m0i = MUL62_lo(m0i, (uint64_t)2 + MUL62_lo(m0i, m[0]));

	/*
	 * Compute window contents. If the window has size one bit only,
	 * then t2 is set to x; otherwise, t2[0] is left untouched, and
	 * t2[k] is set to x^k (for k >= 1).
	 */
	if (win_len == 1) {
		memcpy(t2, x, mw62num * sizeof *x);
	} else {
		uint64_t *base;

		memcpy(t2 + mw62num, x, mw62num * sizeof *x);
		base = t2 + mw62num;
		for (u = 2; u < ((unsigned)1 << win_len); u ++) {
			montymul(base + mw62num, base, x, m, mw62num, m0i);
			base += mw62num;
		}
	}

	/*
	 * Set x to 1, in Montgomery representation. We again use the
	 * 31-bit code.
	 */
	br_i31_zero(x31, m31[0]);
	x31[(m31[0] + 31) >> 5] = 1;
	br_i31_muladd_small(x31, 0, m31);
	if (mw31num & 1) {
		/* One extra 31-bit shift so that the total shift
		   amount is a multiple of 62 bits. */
		br_i31_muladd_small(x31, 0, m31);
	}
	for (u = 0; u < mw31num; u += 2) {
		size_t v;

		v = u >> 1;
		if ((u + 1) == mw31num) {
			x[v] = (uint64_t)x31[u + 1];
		} else {
			x[v] = (uint64_t)x31[u + 1]
				+ ((uint64_t)x31[u + 2] << 31);
		}
	}

	/*
	 * We process bits from most to least significant. At each
	 * loop iteration, we have acc_len bits in acc.
	 */
	acc = 0;
	acc_len = 0;
	while (acc_len > 0 || elen > 0) {
		int i, k;
		uint32_t bits;
		uint64_t mask1, mask2;

		/*
		 * Get the next bits. If fewer than win_len bits remain
		 * in total, the final window is shortened to k bits.
		 */
		k = win_len;
		if (acc_len < win_len) {
			if (elen > 0) {
				acc = (acc << 8) | *e ++;
				elen --;
				acc_len += 8;
			} else {
				k = acc_len;
			}
		}
		bits = (acc >> (acc_len - k)) & (((uint32_t)1 << k) - 1);
		acc_len -= k;

		/*
		 * We could get exactly k bits. Compute k squarings.
		 */
		for (i = 0; i < k; i ++) {
			montymul(t1, x, x, m, mw62num, m0i);
			memcpy(x, t1, mw62num * sizeof *x);
		}

		/*
		 * Window lookup: we want to set t2 to the window
		 * lookup value, assuming the bits are non-zero. If
		 * the window length is 1 bit only, then t2 is
		 * already set; otherwise, we do a constant-time lookup.
		 */
		if (win_len > 1) {
			uint64_t *base;

			memset(t2, 0, mw62num * sizeof *t2);
			base = t2 + mw62num;
			for (u = 1; u < ((uint32_t)1 << k); u ++) {
				uint64_t mask;
				size_t v;

				/* mask is all-ones only for the entry
				   matching the exponent bits; every
				   entry is read regardless. */
				mask = -(uint64_t)EQ(u, bits);
				for (v = 0; v < mw62num; v ++) {
					t2[v] |= mask & base[v];
				}
				base += mw62num;
			}
		}

		/*
		 * Multiply with the looked-up value. We keep the product
		 * only if the exponent bits are not all-zero.
		 */
		montymul(t1, x, t2, m, mw62num, m0i);
		mask1 = -(uint64_t)EQ(bits, 0);
		mask2 = ~mask1;
		for (u = 0; u < mw62num; u ++) {
			x[u] = (mask1 & x[u]) | (mask2 & t1[u]);
		}
	}

	/*
	 * Convert back from Montgomery representation.
	 */
	frommonty(x, m, mw62num, m0i);

	/*
	 * Convert result into 31-bit words (the x31[0] header word is
	 * left as-is).
	 */
	for (u = 0; u < mw31num; u += 2) {
		uint64_t zw;

		zw = x[u >> 1];
		x31[u + 1] = (uint32_t)zw & 0x7FFFFFFF;
		if ((u + 1) < mw31num) {
			x31[u + 2] = (uint32_t)(zw >> 31);
		}
	}
	return 1;
}
#else

/* see inner.h */
/*
 * Fallback version, compiled when neither BR_INT128 nor BR_UMUL128 is
 * available: delegate to the 31-bit implementation. Since tmp[] here
 * counts 64-bit words while br_i31_modpow_opt() expects a count of
 * 32-bit words, twlen is doubled when forwarded.
 */
uint32_t
br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
	const uint32_t *m31, uint32_t m0i31, uint64_t *tmp, size_t twlen)
{
	size_t mwlen;

	/*
	 * mwlen is the modulus length in 31-bit words, plus one extra
	 * word ((m31[0] + 63) >> 5 == ((m31[0] + 31) >> 5) + 1); we
	 * require at least that many 64-bit temporary words.
	 */
	mwlen = (m31[0] + 63) >> 5;
	if (twlen < mwlen) {
		return 0;
	}
	return br_i31_modpow_opt(x31, e, elen, m31, m0i31,
		(uint32_t *)tmp, twlen << 1);
}

#endif
/* see inner.h */
uint32_t
br_i62_modpow_opt_as_i31(uint32_t *x31, const unsigned char *e, size_t elen,
	const uint32_t *m31, uint32_t m0i31, uint32_t *tmp, size_t twlen)
{
	/*
	 * As documented, the 'tmp' argument must be 64-bit aligned even
	 * though its declared type is uint32_t *; this is acceptable
	 * because this function is internal (not part of BearSSL's
	 * public API). The area is reinterpreted as 64-bit words, and
	 * twlen (a count of 32-bit words) is halved accordingly.
	 */
	uint64_t *tmp64;

	tmp64 = (uint64_t *)tmp;
	return br_i62_modpow_opt(x31, e, elen, m31, m0i31,
		tmp64, twlen >> 1);
}