Line data Source code
1 : /*
2 : * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
3 : *
4 : * Modified for OpenBSD by Thomas Pornin and Mike Belopuhov.
5 : *
6 : * Permission is hereby granted, free of charge, to any person obtaining
7 : * a copy of this software and associated documentation files (the
8 : * "Software"), to deal in the Software without restriction, including
9 : * without limitation the rights to use, copy, modify, merge, publish,
10 : * distribute, sublicense, and/or sell copies of the Software, and to
11 : * permit persons to whom the Software is furnished to do so, subject to
12 : * the following conditions:
13 : *
14 : * The above copyright notice and this permission notice shall be
15 : * included in all copies or substantial portions of the Software.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 : * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 : * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 : * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 : * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 : * SOFTWARE.
25 : */
26 :
27 : #include <sys/types.h>
28 : #include <sys/systm.h>
29 : #include <sys/stdint.h>
30 :
31 : #include "aes.h"
32 :
33 : static inline void
34 0 : enc32le(void *dst, uint32_t x)
35 : {
36 : unsigned char *buf = dst;
37 :
38 0 : buf[0] = (unsigned char)x;
39 0 : buf[1] = (unsigned char)(x >> 8);
40 0 : buf[2] = (unsigned char)(x >> 16);
41 0 : buf[3] = (unsigned char)(x >> 24);
42 0 : }
43 :
44 : static inline uint32_t
45 0 : dec32le(const void *src)
46 : {
47 : const unsigned char *buf = src;
48 :
49 0 : return (uint32_t)buf[0]
50 0 : | ((uint32_t)buf[1] << 8)
51 0 : | ((uint32_t)buf[2] << 16)
52 0 : | ((uint32_t)buf[3] << 24);
53 : }
54 :
55 : /*
56 : * This constant-time implementation is "bitsliced": the 128-bit state is
57 : * split over eight 32-bit words q* in the following way:
58 : *
59 : * -- Input block consists in 16 bytes:
60 : * a00 a10 a20 a30 a01 a11 a21 a31 a02 a12 a22 a32 a03 a13 a23 a33
61 : * In the terminology of FIPS 197, this is a 4x4 matrix which is read
62 : * column by column.
63 : *
64 : * -- Each byte is split into eight bits which are distributed over the
65 : * eight words, at the same rank. Thus, for a byte x at rank k, bit 0
66 : * (least significant) of x will be at rank k in q0 (if that bit is b,
67 : * then it contributes "b << k" to the value of q0), bit 1 of x will be
68 : * at rank k in q1, and so on.
69 : *
70 : * -- Ranks given to bits are in "row order" and are either all even, or
71 : * all odd. Two independent AES states are thus interleaved, one using
72 : * the even ranks, the other the odd ranks. Row order means:
73 : * a00 a01 a02 a03 a10 a11 a12 a13 a20 a21 a22 a23 a30 a31 a32 a33
74 : *
75 : * Converting input bytes from two AES blocks to bitslice representation
76 : * is done in the following way:
77 : * -- Decode first block into the four words q0 q2 q4 q6, in that order,
78 : * using little-endian convention.
79 : * -- Decode second block into the four words q1 q3 q5 q7, in that order,
80 : * using little-endian convention.
81 : * -- Call aes_ct_ortho().
82 : *
83 : * Converting back to bytes is done by using the reverse operations. Note
84 : * that aes_ct_ortho() is its own inverse.
85 : */
86 :
87 : /*
88 : * The AES S-box, as a bitsliced constant-time version. The input array
89 : * consists in eight 32-bit words; 32 S-box instances are computed in
90 : * parallel. Bits 0 to 7 of each S-box input (bit 0 is least significant)
91 : * are spread over the words 0 to 7, at the same rank.
92 : */
93 : static void
94 0 : aes_ct_bitslice_Sbox(uint32_t *q)
95 : {
96 : /*
97 : * This S-box implementation is a straightforward translation of
98 : * the circuit described by Boyar and Peralta in "A new
99 : * combinational logic minimization technique with applications
100 : * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
101 : *
102 : * Note that variables x* (input) and s* (output) are numbered
103 : * in "reverse" order (x0 is the high bit, x7 is the low bit).
104 : */
105 :
106 : uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
107 : uint32_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
108 : uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
109 : uint32_t y20, y21;
110 : uint32_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
111 : uint32_t z10, z11, z12, z13, z14, z15, z16, z17;
112 : uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
113 : uint32_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
114 : uint32_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
115 : uint32_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
116 : uint32_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
117 : uint32_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
118 : uint32_t t60, t61, t62, t63, t64, t65, t66, t67;
119 : uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
120 :
121 0 : x0 = q[7];
122 0 : x1 = q[6];
123 0 : x2 = q[5];
124 0 : x3 = q[4];
125 0 : x4 = q[3];
126 0 : x5 = q[2];
127 0 : x6 = q[1];
128 0 : x7 = q[0];
129 :
130 : /*
131 : * Top linear transformation.
132 : */
133 0 : y14 = x3 ^ x5;
134 0 : y13 = x0 ^ x6;
135 0 : y9 = x0 ^ x3;
136 0 : y8 = x0 ^ x5;
137 0 : t0 = x1 ^ x2;
138 0 : y1 = t0 ^ x7;
139 0 : y4 = y1 ^ x3;
140 0 : y12 = y13 ^ y14;
141 0 : y2 = y1 ^ x0;
142 0 : y5 = y1 ^ x6;
143 0 : y3 = y5 ^ y8;
144 0 : t1 = x4 ^ y12;
145 0 : y15 = t1 ^ x5;
146 0 : y20 = t1 ^ x1;
147 0 : y6 = y15 ^ x7;
148 0 : y10 = y15 ^ t0;
149 0 : y11 = y20 ^ y9;
150 0 : y7 = x7 ^ y11;
151 0 : y17 = y10 ^ y11;
152 0 : y19 = y10 ^ y8;
153 0 : y16 = t0 ^ y11;
154 0 : y21 = y13 ^ y16;
155 0 : y18 = x0 ^ y16;
156 :
157 : /*
158 : * Non-linear section.
159 : */
160 0 : t2 = y12 & y15;
161 0 : t3 = y3 & y6;
162 0 : t4 = t3 ^ t2;
163 0 : t5 = y4 & x7;
164 0 : t6 = t5 ^ t2;
165 0 : t7 = y13 & y16;
166 0 : t8 = y5 & y1;
167 0 : t9 = t8 ^ t7;
168 0 : t10 = y2 & y7;
169 0 : t11 = t10 ^ t7;
170 0 : t12 = y9 & y11;
171 0 : t13 = y14 & y17;
172 0 : t14 = t13 ^ t12;
173 0 : t15 = y8 & y10;
174 0 : t16 = t15 ^ t12;
175 0 : t17 = t4 ^ t14;
176 0 : t18 = t6 ^ t16;
177 0 : t19 = t9 ^ t14;
178 0 : t20 = t11 ^ t16;
179 0 : t21 = t17 ^ y20;
180 0 : t22 = t18 ^ y19;
181 0 : t23 = t19 ^ y21;
182 0 : t24 = t20 ^ y18;
183 :
184 0 : t25 = t21 ^ t22;
185 0 : t26 = t21 & t23;
186 0 : t27 = t24 ^ t26;
187 0 : t28 = t25 & t27;
188 0 : t29 = t28 ^ t22;
189 0 : t30 = t23 ^ t24;
190 0 : t31 = t22 ^ t26;
191 0 : t32 = t31 & t30;
192 0 : t33 = t32 ^ t24;
193 0 : t34 = t23 ^ t33;
194 0 : t35 = t27 ^ t33;
195 0 : t36 = t24 & t35;
196 0 : t37 = t36 ^ t34;
197 0 : t38 = t27 ^ t36;
198 0 : t39 = t29 & t38;
199 0 : t40 = t25 ^ t39;
200 :
201 0 : t41 = t40 ^ t37;
202 0 : t42 = t29 ^ t33;
203 0 : t43 = t29 ^ t40;
204 0 : t44 = t33 ^ t37;
205 0 : t45 = t42 ^ t41;
206 0 : z0 = t44 & y15;
207 0 : z1 = t37 & y6;
208 0 : z2 = t33 & x7;
209 0 : z3 = t43 & y16;
210 0 : z4 = t40 & y1;
211 0 : z5 = t29 & y7;
212 0 : z6 = t42 & y11;
213 0 : z7 = t45 & y17;
214 0 : z8 = t41 & y10;
215 0 : z9 = t44 & y12;
216 0 : z10 = t37 & y3;
217 0 : z11 = t33 & y4;
218 0 : z12 = t43 & y13;
219 0 : z13 = t40 & y5;
220 0 : z14 = t29 & y2;
221 0 : z15 = t42 & y9;
222 0 : z16 = t45 & y14;
223 0 : z17 = t41 & y8;
224 :
225 : /*
226 : * Bottom linear transformation.
227 : */
228 0 : t46 = z15 ^ z16;
229 0 : t47 = z10 ^ z11;
230 0 : t48 = z5 ^ z13;
231 0 : t49 = z9 ^ z10;
232 0 : t50 = z2 ^ z12;
233 0 : t51 = z2 ^ z5;
234 0 : t52 = z7 ^ z8;
235 0 : t53 = z0 ^ z3;
236 0 : t54 = z6 ^ z7;
237 0 : t55 = z16 ^ z17;
238 0 : t56 = z12 ^ t48;
239 0 : t57 = t50 ^ t53;
240 0 : t58 = z4 ^ t46;
241 0 : t59 = z3 ^ t54;
242 0 : t60 = t46 ^ t57;
243 0 : t61 = z14 ^ t57;
244 0 : t62 = t52 ^ t58;
245 0 : t63 = t49 ^ t58;
246 0 : t64 = z4 ^ t59;
247 0 : t65 = t61 ^ t62;
248 0 : t66 = z1 ^ t63;
249 0 : s0 = t59 ^ t63;
250 0 : s6 = t56 ^ ~t62;
251 0 : s7 = t48 ^ ~t60;
252 0 : t67 = t64 ^ t65;
253 0 : s3 = t53 ^ t66;
254 0 : s4 = t51 ^ t66;
255 0 : s5 = t47 ^ t65;
256 0 : s1 = t64 ^ ~s3;
257 0 : s2 = t55 ^ ~t67;
258 :
259 0 : q[7] = s0;
260 0 : q[6] = s1;
261 0 : q[5] = s2;
262 0 : q[4] = s3;
263 0 : q[3] = s4;
264 0 : q[2] = s5;
265 0 : q[1] = s6;
266 0 : q[0] = s7;
267 0 : }
268 :
269 : /*
270 : * Perform bytewise orthogonalization of eight 32-bit words. Bytes
271 : * of q0..q7 are spread over all words: for a byte x that occurs
272 : * at rank i in q[j] (byte x uses bits 8*i to 8*i+7 in q[j]), the bit
273 : * of rank k in x (0 <= k <= 7) goes to q[k] at rank 8*i+j.
274 : *
275 : * This operation is an involution.
276 : */
277 : static void
278 0 : aes_ct_ortho(uint32_t *q)
279 : {
280 : #define SWAPN(cl, ch, s, x, y) do { \
281 : uint32_t a, b; \
282 : a = (x); \
283 : b = (y); \
284 : (x) = (a & (uint32_t)cl) | ((b & (uint32_t)cl) << (s)); \
285 : (y) = ((a & (uint32_t)ch) >> (s)) | (b & (uint32_t)ch); \
286 : } while (0)
287 :
288 : #define SWAP2(x, y) SWAPN(0x55555555, 0xAAAAAAAA, 1, x, y)
289 : #define SWAP4(x, y) SWAPN(0x33333333, 0xCCCCCCCC, 2, x, y)
290 : #define SWAP8(x, y) SWAPN(0x0F0F0F0F, 0xF0F0F0F0, 4, x, y)
291 :
292 0 : SWAP2(q[0], q[1]);
293 0 : SWAP2(q[2], q[3]);
294 0 : SWAP2(q[4], q[5]);
295 0 : SWAP2(q[6], q[7]);
296 :
297 0 : SWAP4(q[0], q[2]);
298 0 : SWAP4(q[1], q[3]);
299 0 : SWAP4(q[4], q[6]);
300 0 : SWAP4(q[5], q[7]);
301 :
302 0 : SWAP8(q[0], q[4]);
303 0 : SWAP8(q[1], q[5]);
304 0 : SWAP8(q[2], q[6]);
305 0 : SWAP8(q[3], q[7]);
306 0 : }
307 :
308 : static inline uint32_t
309 0 : sub_word(uint32_t x)
310 : {
311 0 : uint32_t q[8];
312 : int i;
313 :
314 0 : for (i = 0; i < 8; i ++) {
315 0 : q[i] = x;
316 : }
317 0 : aes_ct_ortho(q);
318 0 : aes_ct_bitslice_Sbox(q);
319 0 : aes_ct_ortho(q);
320 0 : return q[0];
321 0 : }
322 :
323 : static const unsigned char Rcon[] = {
324 : 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
325 : };
326 :
327 : /*
328 : * Base key schedule code. The function sub_word() must be defined
329 : * below. Subkeys are produced in little-endian convention (but not
330 : * bitsliced). Key length is expressed in bytes.
331 : */
332 : static unsigned
333 0 : aes_keysched_base(uint32_t *skey, const void *key, size_t key_len)
334 : {
335 : unsigned num_rounds;
336 : int i, j, k, nk, nkf;
337 : uint32_t tmp;
338 :
339 0 : switch (key_len) {
340 : case 16:
341 : num_rounds = 10;
342 0 : break;
343 : case 24:
344 : num_rounds = 12;
345 0 : break;
346 : case 32:
347 : num_rounds = 14;
348 0 : break;
349 : default:
350 0 : return 0;
351 : }
352 0 : nk = (int)(key_len >> 2);
353 0 : nkf = (int)((num_rounds + 1) << 2);
354 0 : for (i = 0; i < nk; i ++) {
355 0 : tmp = dec32le((const unsigned char *)key + (i << 2));
356 0 : skey[i] = tmp;
357 : }
358 0 : tmp = skey[(key_len >> 2) - 1];
359 0 : for (i = nk, j = 0, k = 0; i < nkf; i ++) {
360 0 : if (j == 0) {
361 0 : tmp = (tmp << 24) | (tmp >> 8);
362 0 : tmp = sub_word(tmp) ^ Rcon[k];
363 0 : } else if (nk > 6 && j == 4) {
364 0 : tmp = sub_word(tmp);
365 0 : }
366 0 : tmp ^= skey[i - nk];
367 0 : skey[i] = tmp;
368 0 : if (++ j == nk) {
369 : j = 0;
370 0 : k ++;
371 0 : }
372 : }
373 0 : return num_rounds;
374 0 : }
375 :
376 : /*
377 : * AES key schedule, constant-time version. skey[] is filled with n+1
378 : * 128-bit subkeys, where n is the number of rounds (10 to 14, depending
379 : * on key size). The number of rounds is returned. If the key size is
380 : * invalid (not 16, 24 or 32), then 0 is returned.
381 : */
382 : unsigned
383 0 : aes_ct_keysched(uint32_t *comp_skey, const void *key, size_t key_len)
384 : {
385 0 : uint32_t skey[60];
386 : unsigned u, num_rounds;
387 :
388 0 : num_rounds = aes_keysched_base(skey, key, key_len);
389 0 : for (u = 0; u <= num_rounds; u ++) {
390 0 : uint32_t q[8];
391 :
392 0 : q[0] = q[1] = skey[(u << 2) + 0];
393 0 : q[2] = q[3] = skey[(u << 2) + 1];
394 0 : q[4] = q[5] = skey[(u << 2) + 2];
395 0 : q[6] = q[7] = skey[(u << 2) + 3];
396 0 : aes_ct_ortho(q);
397 0 : comp_skey[(u << 2) + 0] =
398 0 : (q[0] & 0x55555555) | (q[1] & 0xAAAAAAAA);
399 0 : comp_skey[(u << 2) + 1] =
400 0 : (q[2] & 0x55555555) | (q[3] & 0xAAAAAAAA);
401 0 : comp_skey[(u << 2) + 2] =
402 0 : (q[4] & 0x55555555) | (q[5] & 0xAAAAAAAA);
403 0 : comp_skey[(u << 2) + 3] =
404 0 : (q[6] & 0x55555555) | (q[7] & 0xAAAAAAAA);
405 0 : }
406 0 : return num_rounds;
407 0 : }
408 :
409 : /*
410 : * Expand AES subkeys as produced by aes_ct_keysched(), into
411 : * a larger array suitable for aes_ct_bitslice_encrypt() and
412 : * aes_ct_bitslice_decrypt().
413 : */
414 : void
415 0 : aes_ct_skey_expand(uint32_t *skey,
416 : unsigned num_rounds, const uint32_t *comp_skey)
417 : {
418 : unsigned u, v, n;
419 :
420 0 : n = (num_rounds + 1) << 2;
421 0 : for (u = 0, v = 0; u < n; u ++, v += 2) {
422 : uint32_t x, y;
423 :
424 0 : x = y = comp_skey[u];
425 0 : x &= 0x55555555;
426 0 : skey[v + 0] = x | (x << 1);
427 0 : y &= 0xAAAAAAAA;
428 0 : skey[v + 1] = y | (y >> 1);
429 : }
430 0 : }
431 :
432 : static inline void
433 0 : add_round_key(uint32_t *q, const uint32_t *sk)
434 : {
435 0 : q[0] ^= sk[0];
436 0 : q[1] ^= sk[1];
437 0 : q[2] ^= sk[2];
438 0 : q[3] ^= sk[3];
439 0 : q[4] ^= sk[4];
440 0 : q[5] ^= sk[5];
441 0 : q[6] ^= sk[6];
442 0 : q[7] ^= sk[7];
443 0 : }
444 :
445 : static inline void
446 0 : shift_rows(uint32_t *q)
447 : {
448 : int i;
449 :
450 0 : for (i = 0; i < 8; i ++) {
451 : uint32_t x;
452 :
453 0 : x = q[i];
454 0 : q[i] = (x & 0x000000FF)
455 0 : | ((x & 0x0000FC00) >> 2) | ((x & 0x00000300) << 6)
456 0 : | ((x & 0x00F00000) >> 4) | ((x & 0x000F0000) << 4)
457 0 : | ((x & 0xC0000000) >> 6) | ((x & 0x3F000000) << 2);
458 : }
459 0 : }
460 :
461 : static inline uint32_t
462 0 : rotr16(uint32_t x)
463 : {
464 0 : return (x << 16) | (x >> 16);
465 : }
466 :
467 : static inline void
468 0 : mix_columns(uint32_t *q)
469 : {
470 : uint32_t q0, q1, q2, q3, q4, q5, q6, q7;
471 : uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
472 :
473 0 : q0 = q[0];
474 0 : q1 = q[1];
475 0 : q2 = q[2];
476 0 : q3 = q[3];
477 0 : q4 = q[4];
478 0 : q5 = q[5];
479 0 : q6 = q[6];
480 0 : q7 = q[7];
481 0 : r0 = (q0 >> 8) | (q0 << 24);
482 0 : r1 = (q1 >> 8) | (q1 << 24);
483 0 : r2 = (q2 >> 8) | (q2 << 24);
484 0 : r3 = (q3 >> 8) | (q3 << 24);
485 0 : r4 = (q4 >> 8) | (q4 << 24);
486 0 : r5 = (q5 >> 8) | (q5 << 24);
487 0 : r6 = (q6 >> 8) | (q6 << 24);
488 0 : r7 = (q7 >> 8) | (q7 << 24);
489 :
490 0 : q[0] = q7 ^ r7 ^ r0 ^ rotr16(q0 ^ r0);
491 0 : q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr16(q1 ^ r1);
492 0 : q[2] = q1 ^ r1 ^ r2 ^ rotr16(q2 ^ r2);
493 0 : q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr16(q3 ^ r3);
494 0 : q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr16(q4 ^ r4);
495 0 : q[5] = q4 ^ r4 ^ r5 ^ rotr16(q5 ^ r5);
496 0 : q[6] = q5 ^ r5 ^ r6 ^ rotr16(q6 ^ r6);
497 0 : q[7] = q6 ^ r6 ^ r7 ^ rotr16(q7 ^ r7);
498 0 : }
499 :
500 : /*
501 : * Compute AES encryption on bitsliced data. Since input is stored on
502 : * eight 32-bit words, two block encryptions are actually performed
503 : * in parallel.
504 : */
505 : void
506 0 : aes_ct_bitslice_encrypt(unsigned num_rounds,
507 : const uint32_t *skey, uint32_t *q)
508 : {
509 : unsigned u;
510 :
511 0 : add_round_key(q, skey);
512 0 : for (u = 1; u < num_rounds; u ++) {
513 : aes_ct_bitslice_Sbox(q);
514 : shift_rows(q);
515 0 : mix_columns(q);
516 0 : add_round_key(q, skey + (u << 3));
517 : }
518 : aes_ct_bitslice_Sbox(q);
519 : shift_rows(q);
520 0 : add_round_key(q, skey + (num_rounds << 3));
521 0 : }
522 :
523 : /*
524 : * Like aes_ct_bitslice_Sbox(), but for the inverse S-box.
525 : */
526 : void
527 0 : aes_ct_bitslice_invSbox(uint32_t *q)
528 : {
529 : /*
530 : * AES S-box is:
531 : * S(x) = A(I(x)) ^ 0x63
532 : * where I() is inversion in GF(256), and A() is a linear
533 : * transform (0 is formally defined to be its own inverse).
534 : * Since inversion is an involution, the inverse S-box can be
535 : * computed from the S-box as:
536 : * iS(x) = B(S(B(x ^ 0x63)) ^ 0x63)
537 : * where B() is the inverse of A(). Indeed, for any y in GF(256):
538 : * iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y
539 : *
540 : * Note: we reuse the implementation of the forward S-box,
541 : * instead of duplicating it here, so that total code size is
542 : * lower. By merging the B() transforms into the S-box circuit
543 : * we could make faster CBC decryption, but CBC decryption is
544 : * already quite faster than CBC encryption because we can
545 : * process two blocks in parallel.
546 : */
547 : uint32_t q0, q1, q2, q3, q4, q5, q6, q7;
548 :
549 0 : q0 = ~q[0];
550 0 : q1 = ~q[1];
551 0 : q2 = q[2];
552 0 : q3 = q[3];
553 0 : q4 = q[4];
554 0 : q5 = ~q[5];
555 0 : q6 = ~q[6];
556 0 : q7 = q[7];
557 0 : q[7] = q1 ^ q4 ^ q6;
558 0 : q[6] = q0 ^ q3 ^ q5;
559 0 : q[5] = q7 ^ q2 ^ q4;
560 0 : q[4] = q6 ^ q1 ^ q3;
561 0 : q[3] = q5 ^ q0 ^ q2;
562 0 : q[2] = q4 ^ q7 ^ q1;
563 0 : q[1] = q3 ^ q6 ^ q0;
564 0 : q[0] = q2 ^ q5 ^ q7;
565 :
566 0 : aes_ct_bitslice_Sbox(q);
567 :
568 0 : q0 = ~q[0];
569 0 : q1 = ~q[1];
570 0 : q2 = q[2];
571 0 : q3 = q[3];
572 0 : q4 = q[4];
573 0 : q5 = ~q[5];
574 0 : q6 = ~q[6];
575 0 : q7 = q[7];
576 0 : q[7] = q1 ^ q4 ^ q6;
577 0 : q[6] = q0 ^ q3 ^ q5;
578 0 : q[5] = q7 ^ q2 ^ q4;
579 0 : q[4] = q6 ^ q1 ^ q3;
580 0 : q[3] = q5 ^ q0 ^ q2;
581 0 : q[2] = q4 ^ q7 ^ q1;
582 0 : q[1] = q3 ^ q6 ^ q0;
583 0 : q[0] = q2 ^ q5 ^ q7;
584 0 : }
585 :
586 : static inline void
587 0 : inv_shift_rows(uint32_t *q)
588 : {
589 : int i;
590 :
591 0 : for (i = 0; i < 8; i ++) {
592 : uint32_t x;
593 :
594 0 : x = q[i];
595 0 : q[i] = (x & 0x000000FF)
596 0 : | ((x & 0x00003F00) << 2) | ((x & 0x0000C000) >> 6)
597 0 : | ((x & 0x000F0000) << 4) | ((x & 0x00F00000) >> 4)
598 0 : | ((x & 0x03000000) << 6) | ((x & 0xFC000000) >> 2);
599 : }
600 0 : }
601 :
602 : static void
603 0 : inv_mix_columns(uint32_t *q)
604 : {
605 : uint32_t q0, q1, q2, q3, q4, q5, q6, q7;
606 : uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
607 :
608 0 : q0 = q[0];
609 0 : q1 = q[1];
610 0 : q2 = q[2];
611 0 : q3 = q[3];
612 0 : q4 = q[4];
613 0 : q5 = q[5];
614 0 : q6 = q[6];
615 0 : q7 = q[7];
616 0 : r0 = (q0 >> 8) | (q0 << 24);
617 0 : r1 = (q1 >> 8) | (q1 << 24);
618 0 : r2 = (q2 >> 8) | (q2 << 24);
619 0 : r3 = (q3 >> 8) | (q3 << 24);
620 0 : r4 = (q4 >> 8) | (q4 << 24);
621 0 : r5 = (q5 >> 8) | (q5 << 24);
622 0 : r6 = (q6 >> 8) | (q6 << 24);
623 0 : r7 = (q7 >> 8) | (q7 << 24);
624 :
625 0 : q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5);
626 0 : q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6);
627 0 : q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7);
628 0 : q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7);
629 0 : q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6);
630 0 : q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7);
631 0 : q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7);
632 0 : q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7);
633 0 : }
634 :
635 : /*
636 : * Compute AES decryption on bitsliced data. Since input is stored on
637 : * eight 32-bit words, two block decryptions are actually performed
638 : * in parallel.
639 : */
640 : void
641 0 : aes_ct_bitslice_decrypt(unsigned num_rounds,
642 : const uint32_t *skey, uint32_t *q)
643 : {
644 : unsigned u;
645 :
646 0 : add_round_key(q, skey + (num_rounds << 3));
647 0 : for (u = num_rounds - 1; u > 0; u --) {
648 : inv_shift_rows(q);
649 : aes_ct_bitslice_invSbox(q);
650 0 : add_round_key(q, skey + (u << 3));
651 0 : inv_mix_columns(q);
652 : }
653 : inv_shift_rows(q);
654 : aes_ct_bitslice_invSbox(q);
655 0 : add_round_key(q, skey);
656 0 : }
657 :
658 :
659 : int
660 0 : AES_Setkey(AES_CTX *ctx, const uint8_t *key, int len)
661 : {
662 0 : ctx->num_rounds = aes_ct_keysched(ctx->sk, key, len);
663 0 : if (ctx->num_rounds == 0)
664 0 : return -1;
665 0 : aes_ct_skey_expand(ctx->sk_exp, ctx->num_rounds, ctx->sk);
666 0 : return 0;
667 0 : }
668 :
669 : void
670 0 : AES_Encrypt_ECB(AES_CTX *ctx, const uint8_t *src,
671 : uint8_t *dst, size_t num_blocks)
672 : {
673 0 : while (num_blocks > 0) {
674 0 : uint32_t q[8];
675 :
676 0 : q[0] = dec32le(src);
677 0 : q[2] = dec32le(src + 4);
678 0 : q[4] = dec32le(src + 8);
679 0 : q[6] = dec32le(src + 12);
680 0 : if (num_blocks > 1) {
681 0 : q[1] = dec32le(src + 16);
682 0 : q[3] = dec32le(src + 20);
683 0 : q[5] = dec32le(src + 24);
684 0 : q[7] = dec32le(src + 28);
685 0 : } else {
686 0 : q[1] = 0;
687 0 : q[3] = 0;
688 0 : q[5] = 0;
689 0 : q[7] = 0;
690 : }
691 0 : aes_ct_ortho(q);
692 0 : aes_ct_bitslice_encrypt(ctx->num_rounds, ctx->sk_exp, q);
693 0 : aes_ct_ortho(q);
694 0 : enc32le(dst, q[0]);
695 0 : enc32le(dst + 4, q[2]);
696 0 : enc32le(dst + 8, q[4]);
697 0 : enc32le(dst + 12, q[6]);
698 0 : if (num_blocks > 1) {
699 0 : enc32le(dst + 16, q[1]);
700 0 : enc32le(dst + 20, q[3]);
701 0 : enc32le(dst + 24, q[5]);
702 0 : enc32le(dst + 28, q[7]);
703 0 : src += 32;
704 0 : dst += 32;
705 0 : num_blocks -= 2;
706 : } else {
707 0 : break;
708 : }
709 0 : }
710 0 : }
711 :
712 : void
713 0 : AES_Decrypt_ECB(AES_CTX *ctx, const uint8_t *src,
714 : uint8_t *dst, size_t num_blocks)
715 : {
716 0 : while (num_blocks > 0) {
717 0 : uint32_t q[8];
718 :
719 0 : q[0] = dec32le(src);
720 0 : q[2] = dec32le(src + 4);
721 0 : q[4] = dec32le(src + 8);
722 0 : q[6] = dec32le(src + 12);
723 0 : if (num_blocks > 1) {
724 0 : q[1] = dec32le(src + 16);
725 0 : q[3] = dec32le(src + 20);
726 0 : q[5] = dec32le(src + 24);
727 0 : q[7] = dec32le(src + 28);
728 0 : } else {
729 0 : q[1] = 0;
730 0 : q[3] = 0;
731 0 : q[5] = 0;
732 0 : q[7] = 0;
733 : }
734 0 : aes_ct_ortho(q);
735 0 : aes_ct_bitslice_decrypt(ctx->num_rounds, ctx->sk_exp, q);
736 0 : aes_ct_ortho(q);
737 0 : enc32le(dst, q[0]);
738 0 : enc32le(dst + 4, q[2]);
739 0 : enc32le(dst + 8, q[4]);
740 0 : enc32le(dst + 12, q[6]);
741 0 : if (num_blocks > 1) {
742 0 : enc32le(dst + 16, q[1]);
743 0 : enc32le(dst + 20, q[3]);
744 0 : enc32le(dst + 24, q[5]);
745 0 : enc32le(dst + 28, q[7]);
746 0 : src += 32;
747 0 : dst += 32;
748 0 : num_blocks -= 2;
749 : } else {
750 0 : break;
751 : }
752 0 : }
753 0 : }
754 :
755 : void
756 0 : AES_Encrypt(AES_CTX *ctx, const uint8_t *src, uint8_t *dst)
757 : {
758 0 : AES_Encrypt_ECB(ctx, src, dst, 1);
759 0 : }
760 :
761 : void
762 0 : AES_Decrypt(AES_CTX *ctx, const uint8_t *src, uint8_t *dst)
763 : {
764 0 : AES_Decrypt_ECB(ctx, src, dst, 1);
765 0 : }
766 :
767 : int
768 0 : AES_KeySetup_Encrypt(uint32_t *skey, const uint8_t *key, int len)
769 : {
770 : unsigned r, u;
771 0 : uint32_t tkey[60];
772 :
773 0 : r = aes_keysched_base(tkey, key, len);
774 0 : if (r == 0) {
775 0 : return 0;
776 : }
777 0 : for (u = 0; u < ((r + 1) << 2); u ++) {
778 : uint32_t w;
779 :
780 0 : w = tkey[u];
781 0 : skey[u] = (w << 24)
782 0 : | ((w & 0x0000FF00) << 8)
783 0 : | ((w & 0x00FF0000) >> 8)
784 0 : | (w >> 24);
785 : }
786 0 : return r;
787 0 : }
788 :
789 : /*
790 : * Reduce value x modulo polynomial x^8+x^4+x^3+x+1. This works as
791 : * long as x fits on 12 bits at most.
792 : */
793 : static inline uint32_t
794 0 : redgf256(uint32_t x)
795 : {
796 : uint32_t h;
797 :
798 0 : h = x >> 8;
799 0 : return (x ^ h ^ (h << 1) ^ (h << 3) ^ (h << 4)) & 0xFF;
800 : }
801 :
802 : /*
803 : * Multiplication by 0x09 in GF(256).
804 : */
805 : static inline uint32_t
806 0 : mul9(uint32_t x)
807 : {
808 0 : return redgf256(x ^ (x << 3));
809 : }
810 :
811 : /*
812 : * Multiplication by 0x0B in GF(256).
813 : */
814 : static inline uint32_t
815 0 : mulb(uint32_t x)
816 : {
817 0 : return redgf256(x ^ (x << 1) ^ (x << 3));
818 : }
819 :
820 : /*
821 : * Multiplication by 0x0D in GF(256).
822 : */
823 : static inline uint32_t
824 0 : muld(uint32_t x)
825 : {
826 0 : return redgf256(x ^ (x << 2) ^ (x << 3));
827 : }
828 :
829 : /*
830 : * Multiplication by 0x0E in GF(256).
831 : */
832 : static inline uint32_t
833 0 : mule(uint32_t x)
834 : {
835 0 : return redgf256((x << 1) ^ (x << 2) ^ (x << 3));
836 : }
837 :
838 : int
839 0 : AES_KeySetup_Decrypt(uint32_t *skey, const uint8_t *key, int len)
840 : {
841 : unsigned r, u;
842 0 : uint32_t tkey[60];
843 :
844 : /*
845 : * Compute encryption subkeys. We get them in big-endian
846 : * notation.
847 : */
848 0 : r = AES_KeySetup_Encrypt(tkey, key, len);
849 0 : if (r == 0) {
850 0 : return 0;
851 : }
852 :
853 : /*
854 : * Copy the subkeys in reverse order. Also, apply InvMixColumns()
855 : * on the subkeys (except first and last).
856 : */
857 0 : memcpy(skey + (r << 2), tkey, 4 * sizeof(uint32_t));
858 0 : memcpy(skey, tkey + (r << 2), 4 * sizeof(uint32_t));
859 0 : for (u = 4; u < (r << 2); u ++) {
860 : uint32_t sk, sk0, sk1, sk2, sk3;
861 : uint32_t tk, tk0, tk1, tk2, tk3;
862 :
863 0 : sk = tkey[u];
864 0 : sk0 = sk >> 24;
865 0 : sk1 = (sk >> 16) & 0xFF;
866 0 : sk2 = (sk >> 8) & 0xFF;
867 0 : sk3 = sk & 0xFF;
868 0 : tk0 = mule(sk0) ^ mulb(sk1) ^ muld(sk2) ^ mul9(sk3);
869 0 : tk1 = mul9(sk0) ^ mule(sk1) ^ mulb(sk2) ^ muld(sk3);
870 0 : tk2 = muld(sk0) ^ mul9(sk1) ^ mule(sk2) ^ mulb(sk3);
871 0 : tk3 = mulb(sk0) ^ muld(sk1) ^ mul9(sk2) ^ mule(sk3);
872 0 : tk = (tk0 << 24) ^ (tk1 << 16) ^ (tk2 << 8) ^ tk3;
873 0 : skey[((r - (u >> 2)) << 2) + (u & 3)] = tk;
874 : }
875 :
876 0 : return r;
877 0 : }
|