GCC Code Coverage Report

Directory: ./
File:      lib/libcrypto/crypto/../../libssl/src/crypto/modes/gcm128.c
Date:      2016-12-06

            Exec    Total    Coverage
Lines:       229      348      65.8 %
Branches:     73      126      57.9 %

Line Branch Exec Source
1
/* $OpenBSD: gcm128.c,v 1.13 2015/09/10 15:56:25 jsing Exp $ */
2
/* ====================================================================
3
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions
7
 * are met:
8
 *
9
 * 1. Redistributions of source code must retain the above copyright
10
 *    notice, this list of conditions and the following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above copyright
13
 *    notice, this list of conditions and the following disclaimer in
14
 *    the documentation and/or other materials provided with the
15
 *    distribution.
16
 *
17
 * 3. All advertising materials mentioning features or use of this
18
 *    software must display the following acknowledgment:
19
 *    "This product includes software developed by the OpenSSL Project
20
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21
 *
22
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23
 *    endorse or promote products derived from this software without
24
 *    prior written permission. For written permission, please contact
25
 *    openssl-core@openssl.org.
26
 *
27
 * 5. Products derived from this software may not be called "OpenSSL"
28
 *    nor may "OpenSSL" appear in their names without prior written
29
 *    permission of the OpenSSL Project.
30
 *
31
 * 6. Redistributions of any form whatsoever must retain the following
32
 *    acknowledgment:
33
 *    "This product includes software developed by the OpenSSL Project
34
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35
 *
36
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47
 * OF THE POSSIBILITY OF SUCH DAMAGE.
48
 * ====================================================================
49
 */
50
51
#define OPENSSL_FIPSAPI
52
53
#include <openssl/crypto.h>
54
#include "modes_lcl.h"
55
#include <string.h>
56
57
#ifndef MODES_DEBUG
58
# ifndef NDEBUG
59
#  define NDEBUG
60
# endif
61
#endif
62
63
#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64
/* redefine, because alignment is ensured */
65
#undef	GETU32
66
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67
#undef	PUTU32
68
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
69
#endif
70
71
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
72
#define REDUCE1BIT(V)	\
73
	do { \
74
		if (sizeof(size_t)==8) { \
75
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76
			V.lo  = (V.hi<<63)|(V.lo>>1); \
77
			V.hi  = (V.hi>>1 )^T; \
78
		} else { \
79
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80
			V.lo  = (V.hi<<63)|(V.lo>>1); \
81
			V.hi  = (V.hi>>1 )^((u64)T<<32); \
82
		} \
83
	} while(0)
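/*
 * Editor's note (not part of the covered source): REDUCE1BIT performs one
 * "multiply by x" step in GF(2^128) using GCM's bit-reflected
 * representation.  V is shifted right by one bit and, if the bit shifted
 * out was set, the constant 0xE1000000000000000000000000000000 (the GCM
 * polynomial x^128 + x^7 + x^2 + x + 1 in reflected form) is XORed into
 * the high word.  The sizeof(size_t) test is resolved at compile time, so
 * only one of the two branches survives in the generated code.
 */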
84
85
/*
86
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87
 * never be set to 8. 8 is effectively reserved for testing purposes.
88
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89
 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90
 * whole spectrum of possible table driven implementations. Why? In
91
 * non-"Shoup's" case memory access pattern is segmented in such manner,
92
 * that it's trivial to see that cache timing information can reveal
93
 * fair portion of intermediate hash value. Given that ciphertext is
94
 * always available to attacker, it's possible for him to attempt to
95
 * deduce secret parameter H and if successful, tamper with messages
96
 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97
 * not as trivial, but there is no reason to believe that it's resistant
98
 * to cache-timing attack. And the thing about "8-bit" implementation is
99
 * that it consumes 16 (sixteen) times more memory, 4KB per individual
100
 * key + 1KB shared. Well, on pros side it should be twice as fast as
101
 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102
 * was observed to run ~75% faster, closer to 100% for commercial
103
 * compilers... Yet "4-bit" procedure is preferred, because it's
104
 * believed to provide better security-performance balance and adequate
105
 * all-round performance. "All-round" refers to things like:
106
 *
107
 * - shorter setup time effectively improves overall timing for
108
 *   handling short messages;
109
 * - larger table allocation can become unbearable because of VM
110
 *   subsystem penalties (for example on Windows large enough free
111
 *   results in VM working set trimming, meaning that consequent
112
 *   malloc would immediately incur working set expansion);
113
 * - larger table has larger cache footprint, which can affect
114
 *   performance of other code paths (not necessarily even from same
115
 *   thread in Hyper-Threading world);
116
 *
117
 * Value of 1 is not appropriate for performance reasons.
118
 */
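/*
 * Editor's sketch (not part of the covered source): the footprint figures
 * quoted above follow from the table entry size.  Rough arithmetic,
 * assuming the u128 type from modes_lcl.h (two 64-bit words, 16 bytes per
 * entry):
 */
#if 0
	/* TABLE_BITS==4: 2^4 entries per key */
	size_t per_key_4bit = 16  * sizeof(u128);	/*  256 bytes */
	/* TABLE_BITS==8: 2^8 entries per key, plus the shared rem_8bit[256]
	 * reduction table */
	size_t per_key_8bit = 256 * sizeof(u128);	/* 4096 bytes (4KB) */
#endif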
119
#if	TABLE_BITS==8
120
121
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122
{
123
	int  i, j;
124
	u128 V;
125
126
	Htable[0].hi = 0;
127
	Htable[0].lo = 0;
128
	V.hi = H[0];
129
	V.lo = H[1];
130
131
	for (Htable[128]=V, i=64; i>0; i>>=1) {
132
		REDUCE1BIT(V);
133
		Htable[i] = V;
134
	}
135
136
	for (i=2; i<256; i<<=1) {
137
		u128 *Hi = Htable+i, H0 = *Hi;
138
		for (j=1; j<i; ++j) {
139
			Hi[j].hi = H0.hi^Htable[j].hi;
140
			Hi[j].lo = H0.lo^Htable[j].lo;
141
		}
142
	}
143
}
144
145
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146
{
147
	u128 Z = { 0, 0};
148
	const u8 *xi = (const u8 *)Xi+15;
149
	size_t rem, n = *xi;
150
	static const size_t rem_8bit[256] = {
151
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215
216
	while (1) {
217
		Z.hi ^= Htable[n].hi;
218
		Z.lo ^= Htable[n].lo;
219
220
		if ((u8 *)Xi==xi)	break;
221
222
		n = *(--xi);
223
224
		rem  = (size_t)Z.lo&0xff;
225
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
226
		Z.hi = (Z.hi>>8);
227
		if (sizeof(size_t)==8)
228
			Z.hi ^= rem_8bit[rem];
229
		else
230
			Z.hi ^= (u64)rem_8bit[rem]<<32;
231
	}
232
233
	if (BYTE_ORDER == LITTLE_ENDIAN) {
234
#ifdef BSWAP8
235
		Xi[0] = BSWAP8(Z.hi);
236
		Xi[1] = BSWAP8(Z.lo);
237
#else
238
		u8 *p = (u8 *)Xi;
239
		u32 v;
240
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
241
		v = (u32)(Z.hi);	PUTU32(p+4,v);
242
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
243
		v = (u32)(Z.lo);	PUTU32(p+12,v);
244
#endif
245
	}
246
	else {
247
		Xi[0] = Z.hi;
248
		Xi[1] = Z.lo;
249
	}
250
}
251
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
252
253
#elif	TABLE_BITS==4
254
255
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
256
{
257
	u128 V;
258
#if defined(OPENSSL_SMALL_FOOTPRINT)
259
	int  i;
260
#endif
261
262
	Htable[0].hi = 0;
263
	Htable[0].lo = 0;
264
	V.hi = H[0];
265
	V.lo = H[1];
266
267
#if defined(OPENSSL_SMALL_FOOTPRINT)
268
	for (Htable[8]=V, i=4; i>0; i>>=1) {
269
		REDUCE1BIT(V);
270
		Htable[i] = V;
271
	}
272
273
	for (i=2; i<16; i<<=1) {
274
		u128 *Hi = Htable+i;
275
		int   j;
276
		for (V=*Hi, j=1; j<i; ++j) {
277
			Hi[j].hi = V.hi^Htable[j].hi;
278
			Hi[j].lo = V.lo^Htable[j].lo;
279
		}
280
	}
281
#else
282
	Htable[8] = V;
283
	REDUCE1BIT(V);
284
	Htable[4] = V;
285
	REDUCE1BIT(V);
286
	Htable[2] = V;
287
	REDUCE1BIT(V);
288
	Htable[1] = V;
289
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
290
	V=Htable[4];
291
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
292
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
293
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
294
	V=Htable[8];
295
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
296
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302
#endif
303
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
304
	/*
305
	 * ARM assembler expects specific dword order in Htable.
306
	 */
307
	{
308
	int j;
309
310
	if (BYTE_ORDER == LITTLE_ENDIAN)
311
		for (j=0;j<16;++j) {
312
			V = Htable[j];
313
			Htable[j].hi = V.lo;
314
			Htable[j].lo = V.hi;
315
		}
316
	else
317
		for (j=0;j<16;++j) {
318
			V = Htable[j];
319
			Htable[j].hi = V.lo<<32|V.lo>>32;
320
			Htable[j].lo = V.hi<<32|V.hi>>32;
321
		}
322
	}
323
#endif
324
}
325
326
#ifndef GHASH_ASM
327
static const size_t rem_4bit[16] = {
328
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
332
333
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
334
{
335
	u128 Z;
336
	int cnt = 15;
337
	size_t rem, nlo, nhi;
338
339
	nlo  = ((const u8 *)Xi)[15];
340
	nhi  = nlo>>4;
341
	nlo &= 0xf;
342
343
	Z.hi = Htable[nlo].hi;
344
	Z.lo = Htable[nlo].lo;
345
346
	while (1) {
347
		rem  = (size_t)Z.lo&0xf;
348
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
349
		Z.hi = (Z.hi>>4);
350
		if (sizeof(size_t)==8)
351
			Z.hi ^= rem_4bit[rem];
352
		else
353
			Z.hi ^= (u64)rem_4bit[rem]<<32;
354
355
		Z.hi ^= Htable[nhi].hi;
356
		Z.lo ^= Htable[nhi].lo;
357
358
		if (--cnt<0)		break;
359
360
		nlo  = ((const u8 *)Xi)[cnt];
361
		nhi  = nlo>>4;
362
		nlo &= 0xf;
363
364
		rem  = (size_t)Z.lo&0xf;
365
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
366
		Z.hi = (Z.hi>>4);
367
		if (sizeof(size_t)==8)
368
			Z.hi ^= rem_4bit[rem];
369
		else
370
			Z.hi ^= (u64)rem_4bit[rem]<<32;
371
372
		Z.hi ^= Htable[nlo].hi;
373
		Z.lo ^= Htable[nlo].lo;
374
	}
375
376
	if (BYTE_ORDER == LITTLE_ENDIAN) {
377
#ifdef BSWAP8
378
		Xi[0] = BSWAP8(Z.hi);
379
		Xi[1] = BSWAP8(Z.lo);
380
#else
381
		u8 *p = (u8 *)Xi;
382
		u32 v;
383
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
384
		v = (u32)(Z.hi);	PUTU32(p+4,v);
385
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
386
		v = (u32)(Z.lo);	PUTU32(p+12,v);
387
#endif
388
	}
389
	else {
390
		Xi[0] = Z.hi;
391
		Xi[1] = Z.lo;
392
	}
393
}
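/*
 * Editor's note (not part of the covered source): gcm_gmult_4bit walks Xi
 * from its last byte to its first, one nibble at a time.  Each nibble
 * selects a precomputed multiple of H from Htable, which is XORed into the
 * accumulator Z; between nibbles Z is shifted right by four bits and the
 * bits shifted out are folded back in through the rem_4bit reduction
 * table.  This is the "Shoup" table-driven method discussed in the long
 * comment earlier in this file.
 */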
394
395
#if !defined(OPENSSL_SMALL_FOOTPRINT)
396
/*
397
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
398
 * details... Compiler-generated code doesn't seem to give any
399
 * performance improvement, at least not on x86[_64]. It's here
400
 * mostly as reference and a placeholder for possible future
401
 * non-trivial optimization[s]...
402
 */
403
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
404
				const u8 *inp,size_t len)
405
{
406
    u128 Z;
407
    int cnt;
408
    size_t rem, nlo, nhi;
409
410
#if 1
411
    do {
412
	cnt  = 15;
413
	nlo  = ((const u8 *)Xi)[15];
414
	nlo ^= inp[15];
415
	nhi  = nlo>>4;
416
	nlo &= 0xf;
417
418
	Z.hi = Htable[nlo].hi;
419
	Z.lo = Htable[nlo].lo;
420
421
	while (1) {
422
		rem  = (size_t)Z.lo&0xf;
423
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
424
		Z.hi = (Z.hi>>4);
425
		if (sizeof(size_t)==8)
426
			Z.hi ^= rem_4bit[rem];
427
		else
428
			Z.hi ^= (u64)rem_4bit[rem]<<32;
429
430
		Z.hi ^= Htable[nhi].hi;
431
		Z.lo ^= Htable[nhi].lo;
432
433
		if (--cnt<0)		break;
434
435
		nlo  = ((const u8 *)Xi)[cnt];
436
		nlo ^= inp[cnt];
437
		nhi  = nlo>>4;
438
		nlo &= 0xf;
439
440
		rem  = (size_t)Z.lo&0xf;
441
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
442
		Z.hi = (Z.hi>>4);
443
		if (sizeof(size_t)==8)
444
			Z.hi ^= rem_4bit[rem];
445
		else
446
			Z.hi ^= (u64)rem_4bit[rem]<<32;
447
448
		Z.hi ^= Htable[nlo].hi;
449
		Z.lo ^= Htable[nlo].lo;
450
	}
451
#else
452
    /*
453
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
454
     * [should] give ~50% improvement... One could have PACK()-ed
455
     * the rem_8bit even here, but the priority is to minimize
456
     * cache footprint...
457
     */
458
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
459
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
460
    static const unsigned short rem_8bit[256] = {
461
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
462
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
463
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
464
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
465
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
466
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
467
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
468
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
469
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
470
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
471
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
472
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
473
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
474
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
475
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
476
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
477
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
478
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
479
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
480
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
481
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
482
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
483
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
484
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
485
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
486
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
487
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
488
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
489
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
490
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
491
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
492
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
493
    /*
494
     * This pre-processing phase slows down procedure by approximately
495
     * same time as it makes each loop spin faster. In other words
496
     * single block performance is approximately same as straightforward
497
     * "4-bit" implementation, and then it goes only faster...
498
     */
499
    for (cnt=0; cnt<16; ++cnt) {
500
	Z.hi = Htable[cnt].hi;
501
	Z.lo = Htable[cnt].lo;
502
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
503
	Hshr4[cnt].hi = (Z.hi>>4);
504
	Hshl4[cnt]    = (u8)(Z.lo<<4);
505
    }
506
507
    do {
508
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
509
		nlo  = ((const u8 *)Xi)[cnt];
510
		nlo ^= inp[cnt];
511
		nhi  = nlo>>4;
512
		nlo &= 0xf;
513
514
		Z.hi ^= Htable[nlo].hi;
515
		Z.lo ^= Htable[nlo].lo;
516
517
		rem = (size_t)Z.lo&0xff;
518
519
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
520
		Z.hi = (Z.hi>>8);
521
522
		Z.hi ^= Hshr4[nhi].hi;
523
		Z.lo ^= Hshr4[nhi].lo;
524
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
525
	}
526
527
	nlo  = ((const u8 *)Xi)[0];
528
	nlo ^= inp[0];
529
	nhi  = nlo>>4;
530
	nlo &= 0xf;
531
532
	Z.hi ^= Htable[nlo].hi;
533
	Z.lo ^= Htable[nlo].lo;
534
535
	rem = (size_t)Z.lo&0xf;
536
537
	Z.lo = (Z.hi<<60)|(Z.lo>>4);
538
	Z.hi = (Z.hi>>4);
539
540
	Z.hi ^= Htable[nhi].hi;
541
	Z.lo ^= Htable[nhi].lo;
542
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
543
#endif
544
545
	if (BYTE_ORDER == LITTLE_ENDIAN) {
546
#ifdef BSWAP8
547
		Xi[0] = BSWAP8(Z.hi);
548
		Xi[1] = BSWAP8(Z.lo);
549
#else
550
		u8 *p = (u8 *)Xi;
551
		u32 v;
552
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
553
		v = (u32)(Z.hi);	PUTU32(p+4,v);
554
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
555
		v = (u32)(Z.lo);	PUTU32(p+12,v);
556
#endif
557
	}
558
	else {
559
		Xi[0] = Z.hi;
560
		Xi[1] = Z.lo;
561
	}
562
    } while (inp+=16, len-=16);
563
}
564
#endif
565
#else
566
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
567
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
568
#endif
569
570
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
571
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
572
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
573
/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
574
 * trashing effect. In other words idea is to hash data while it's
575
 * still in L1 cache after encryption pass... */
576
#define GHASH_CHUNK       (3*1024)
577
#endif
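/*
 * Editor's sketch (not part of the covered source): GHASH_CHUNK is used by
 * CRYPTO_gcm128_encrypt below roughly as follows, so that a chunk is
 * hashed while its ciphertext is still in L1 cache.  Names here are
 * illustrative only:
 */
#if 0
	while (len >= GHASH_CHUNK) {
		encrypt_ctr32(in, out, GHASH_CHUNK);	/* output still hot in L1 */
		GHASH(ctx, out, GHASH_CHUNK);		/* hash before it is evicted */
		in += GHASH_CHUNK; out += GHASH_CHUNK; len -= GHASH_CHUNK;
	}
#endif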
578
579
#else	/* TABLE_BITS */
580
581
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
582
{
583
	u128 V,Z = { 0,0 };
584
	long X;
585
	int  i,j;
586
	const long *xi = (const long *)Xi;
587
588
	V.hi = H[0];	/* H is in host byte order, no byte swapping */
589
	V.lo = H[1];
590
591
	for (j=0; j<16/sizeof(long); ++j) {
592
		if (BYTE_ORDER == LITTLE_ENDIAN) {
593
			if (sizeof(long)==8) {
594
#ifdef BSWAP8
595
				X = (long)(BSWAP8(xi[j]));
596
#else
597
				const u8 *p = (const u8 *)(xi+j);
598
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
599
#endif
600
			}
601
			else {
602
				const u8 *p = (const u8 *)(xi+j);
603
				X = (long)GETU32(p);
604
			}
605
		}
606
		else
607
			X = xi[j];
608
609
		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
610
			u64 M = (u64)(X>>(8*sizeof(long)-1));
611
			Z.hi ^= V.hi&M;
612
			Z.lo ^= V.lo&M;
613
614
			REDUCE1BIT(V);
615
		}
616
	}
617
618
	if (BYTE_ORDER == LITTLE_ENDIAN) {
619
#ifdef BSWAP8
620
		Xi[0] = BSWAP8(Z.hi);
621
		Xi[1] = BSWAP8(Z.lo);
622
#else
623
		u8 *p = (u8 *)Xi;
624
		u32 v;
625
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
626
		v = (u32)(Z.hi);	PUTU32(p+4,v);
627
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
628
		v = (u32)(Z.lo);	PUTU32(p+12,v);
629
#endif
630
	}
631
	else {
632
		Xi[0] = Z.hi;
633
		Xi[1] = Z.lo;
634
	}
635
}
636
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
637
638
#endif
639
640
#if	TABLE_BITS==4 && defined(GHASH_ASM)
641
# if	!defined(I386_ONLY) && \
642
	(defined(__i386)	|| defined(__i386__)	|| \
643
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
644
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
645
#  define GHASH_ASM_X86_OR_64
646
#  define GCM_FUNCREF_4BIT
647
extern unsigned int OPENSSL_ia32cap_P[2];
648
649
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
650
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
651
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
652
653
#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
654
#   define GHASH_ASM_X86
655
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
656
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
659
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
#  endif
661
# elif defined(__arm__) || defined(__arm)
662
#  include "arm_arch.h"
663
#  if __ARM_ARCH__>=7
664
#   define GHASH_ASM_ARM
665
#   define GCM_FUNCREF_4BIT
666
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
667
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668
#  endif
669
# endif
670
#endif
671
672
#ifdef GCM_FUNCREF_4BIT
673
# undef  GCM_MUL
674
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
675
# ifdef GHASH
676
#  undef  GHASH
677
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
678
# endif
679
#endif
680
681
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
682
28
{
683
28
	memset(ctx,0,sizeof(*ctx));
684
28
	ctx->block = block;
685
28
	ctx->key   = key;
686
687
28
	(*block)(ctx->H.c,ctx->H.c,key);
688
689
	if (BYTE_ORDER == LITTLE_ENDIAN) {
690
		/* H is stored in host byte order */
691
#ifdef BSWAP8
692
28
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
693
28
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
694
#else
695
		u8 *p = ctx->H.c;
696
		u64 hi,lo;
697
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
698
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
699
		ctx->H.u[0] = hi;
700
		ctx->H.u[1] = lo;
701
#endif
702
	}
703
704
#if	TABLE_BITS==8
705
	gcm_init_8bit(ctx->Htable,ctx->H.u);
706
#elif	TABLE_BITS==4
707
# if	defined(GHASH_ASM_X86_OR_64)
708
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
709

28
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
710
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
711
28
		gcm_init_clmul(ctx->Htable,ctx->H.u);
712
28
		ctx->gmult = gcm_gmult_clmul;
713
28
		ctx->ghash = gcm_ghash_clmul;
714
28
		return;
715
	}
716
#  endif
717
	gcm_init_4bit(ctx->Htable,ctx->H.u);
718
#  if	defined(GHASH_ASM_X86)			/* x86 only */
719
#   if	defined(OPENSSL_IA32_SSE2)
720
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
721
#   else
722
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
723
#   endif
724
		ctx->gmult = gcm_gmult_4bit_mmx;
725
		ctx->ghash = gcm_ghash_4bit_mmx;
726
	} else {
727
		ctx->gmult = gcm_gmult_4bit_x86;
728
		ctx->ghash = gcm_ghash_4bit_x86;
729
	}
730
#  else
731
	ctx->gmult = gcm_gmult_4bit;
732
	ctx->ghash = gcm_ghash_4bit;
733
#  endif
734
# elif	defined(GHASH_ASM_ARM)
735
	if (OPENSSL_armcap_P & ARMV7_NEON) {
736
		ctx->gmult = gcm_gmult_neon;
737
		ctx->ghash = gcm_ghash_neon;
738
	} else {
739
		gcm_init_4bit(ctx->Htable,ctx->H.u);
740
		ctx->gmult = gcm_gmult_4bit;
741
		ctx->ghash = gcm_ghash_4bit;
742
	}
743
# else
744
	gcm_init_4bit(ctx->Htable,ctx->H.u);
745
# endif
746
#endif
747
}
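/*
 * Editor's note (not part of the covered source): on x86/x86_64 builds the
 * initialisation above selects an implementation at run time.  When the
 * OPENSSL_ia32cap_P bits for FXSR and PCLMULQDQ are set (as the inline
 * comments indicate), the carry-less-multiplication routines
 * gcm_gmult_clmul/gcm_ghash_clmul are installed in ctx->gmult/ctx->ghash
 * and the function returns early; otherwise it falls back to the 4-bit
 * table code.  Judging by the execution counts, only the CLMUL path was
 * taken in this coverage run.
 */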
748
749
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
750
56
{
751
	unsigned int ctr;
752
#ifdef GCM_FUNCREF_4BIT
753
56
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
754
#endif
755
756
56
	ctx->Yi.u[0]  = 0;
757
56
	ctx->Yi.u[1]  = 0;
758
56
	ctx->Xi.u[0]  = 0;
759
56
	ctx->Xi.u[1]  = 0;
760
56
	ctx->len.u[0] = 0;	/* AAD length */
761
56
	ctx->len.u[1] = 0;	/* message length */
762
56
	ctx->ares = 0;
763
56
	ctx->mres = 0;
764
765
56
	if (len==12) {
766
42
		memcpy(ctx->Yi.c,iv,12);
767
42
		ctx->Yi.c[15]=1;
768
42
		ctr=1;
769
	}
770
	else {
771
		size_t i;
772
14
		u64 len0 = len;
773
774
54
		while (len>=16) {
775
26
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
776
26
			GCM_MUL(ctx,Yi);
777
26
			iv += 16;
778
26
			len -= 16;
779
		}
780
14
		if (len) {
781
12
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
782
12
			GCM_MUL(ctx,Yi);
783
		}
784
14
		len0 <<= 3;
785
		if (BYTE_ORDER == LITTLE_ENDIAN) {
786
#ifdef BSWAP8
787
14
			ctx->Yi.u[1]  ^= BSWAP8(len0);
788
#else
789
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
790
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
791
			ctx->Yi.c[10] ^= (u8)(len0>>40);
792
			ctx->Yi.c[11] ^= (u8)(len0>>32);
793
			ctx->Yi.c[12] ^= (u8)(len0>>24);
794
			ctx->Yi.c[13] ^= (u8)(len0>>16);
795
			ctx->Yi.c[14] ^= (u8)(len0>>8);
796
			ctx->Yi.c[15] ^= (u8)(len0);
797
#endif
798
		}
799
		else
800
			ctx->Yi.u[1]  ^= len0;
801
802
14
		GCM_MUL(ctx,Yi);
803
804
		if (BYTE_ORDER == LITTLE_ENDIAN)
805
#ifdef BSWAP4
806
14
			ctr = BSWAP4(ctx->Yi.d[3]);
807
#else
808
			ctr = GETU32(ctx->Yi.c+12);
809
#endif
810
		else
811
			ctr = ctx->Yi.d[3];
812
	}
813
814
56
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
815
56
	++ctr;
816
	if (BYTE_ORDER == LITTLE_ENDIAN)
817
#ifdef BSWAP4
818
56
		ctx->Yi.d[3] = BSWAP4(ctr);
819
#else
820
		PUTU32(ctx->Yi.c+12,ctr);
821
#endif
822
	else
823
		ctx->Yi.d[3] = ctr;
824
56
}
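/*
 * Editor's note (not part of the covered source): the two branches above
 * implement the GCM specification's IV handling.  A 96-bit (12-byte) IV is
 * used directly as the initial counter block Y0 = IV || 0^31 || 1; any
 * other length is absorbed through GHASH together with its bit length to
 * derive Y0.  EK0, the encryption of Y0, is saved here and later XORed
 * into the tag by CRYPTO_gcm128_finish.
 */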
825
826
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827
36
{
828
	size_t i;
829
	unsigned int n;
830
36
	u64 alen = ctx->len.u[0];
831
#ifdef GCM_FUNCREF_4BIT
832
36
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
833
# ifdef GHASH
834
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835
36
				const u8 *inp,size_t len)	= ctx->ghash;
836
# endif
837
#endif
838
839
36
	if (ctx->len.u[1]) return -2;
840
841
36
	alen += len;
842
36
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843
		return -1;
844
36
	ctx->len.u[0] = alen;
845
846
36
	n = ctx->ares;
847
36
	if (n) {
848
		while (n && len) {
849
			ctx->Xi.c[n] ^= *(aad++);
850
			--len;
851
			n = (n+1)%16;
852
		}
853
		if (n==0) GCM_MUL(ctx,Xi);
854
		else {
855
			ctx->ares = n;
856
			return 0;
857
		}
858
	}
859
860
#ifdef GHASH
861
36
	if ((i = (len&(size_t)-16))) {
862
32
		GHASH(ctx,aad,i);
863
32
		aad += i;
864
32
		len -= i;
865
	}
866
#else
867
	while (len>=16) {
868
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869
		GCM_MUL(ctx,Xi);
870
		aad += 16;
871
		len -= 16;
872
	}
873
#endif
874
36
	if (len) {
875
34
		n = (unsigned int)len;
876
34
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877
	}
878
879
36
	ctx->ares = n;
880
36
	return 0;
881
}
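/*
 * Editor's note (not part of the covered source): AAD must be supplied
 * before any message data; once ctx->len.u[1] (the message length) is
 * non-zero, the function above rejects further AAD with -2.  A partial
 * final AAD block is buffered via ctx->ares and folded into Xi by the
 * first encrypt/decrypt call.
 */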
882
883
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884
		const unsigned char *in, unsigned char *out,
885
		size_t len)
886
16
{
887
	unsigned int n, ctr;
888
	size_t i;
889
16
	u64        mlen  = ctx->len.u[1];
890
16
	block128_f block = ctx->block;
891
16
	void      *key   = ctx->key;
892
#ifdef GCM_FUNCREF_4BIT
893
16
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
894
# ifdef GHASH
895
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
896
16
				const u8 *inp,size_t len)	= ctx->ghash;
897
# endif
898
#endif
899
900
16
	mlen += len;
901
16
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
902
		return -1;
903
16
	ctx->len.u[1] = mlen;
904
905
16
	if (ctx->ares) {
906
		/* First call to encrypt finalizes GHASH(AAD) */
907
9
		GCM_MUL(ctx,Xi);
908
9
		ctx->ares = 0;
909
	}
910
911
	if (BYTE_ORDER == LITTLE_ENDIAN)
912
#ifdef BSWAP4
913
16
		ctr = BSWAP4(ctx->Yi.d[3]);
914
#else
915
		ctr = GETU32(ctx->Yi.c+12);
916
#endif
917
	else
918
		ctr = ctx->Yi.d[3];
919
920
16
	n = ctx->mres;
921
#if !defined(OPENSSL_SMALL_FOOTPRINT)
922
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
923
16
		if (n) {
924
			while (n && len) {
925
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926
				--len;
927
				n = (n+1)%16;
928
			}
929
			if (n==0) GCM_MUL(ctx,Xi);
930
			else {
931
				ctx->mres = n;
932
				return 0;
933
			}
934
		}
935
#ifdef __STRICT_ALIGNMENT
936
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937
			break;
938
#endif
939
#if defined(GHASH) && defined(GHASH_CHUNK)
940
16
		while (len>=GHASH_CHUNK) {
941
		    size_t j=GHASH_CHUNK;
942
943
		    while (j) {
944
		    	size_t *out_t=(size_t *)out;
945
		    	const size_t *in_t=(const size_t *)in;
946
947
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
948
			++ctr;
949
			if (BYTE_ORDER == LITTLE_ENDIAN)
950
#ifdef BSWAP4
951
				ctx->Yi.d[3] = BSWAP4(ctr);
952
#else
953
				PUTU32(ctx->Yi.c+12,ctr);
954
#endif
955
			else
956
				ctx->Yi.d[3] = ctr;
957
			for (i=0; i<16/sizeof(size_t); ++i)
958
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
959
			out += 16;
960
			in  += 16;
961
			j   -= 16;
962
		    }
963
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
964
		    len -= GHASH_CHUNK;
965
		}
966
16
		if ((i = (len&(size_t)-16))) {
967
16
		    size_t j=i;
968
969
92
		    while (len>=16) {
970
60
		    	size_t *out_t=(size_t *)out;
971
60
		    	const size_t *in_t=(const size_t *)in;
972
973
60
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
974
60
			++ctr;
975
			if (BYTE_ORDER == LITTLE_ENDIAN)
976
#ifdef BSWAP4
977
60
				ctx->Yi.d[3] = BSWAP4(ctr);
978
#else
979
				PUTU32(ctx->Yi.c+12,ctr);
980
#endif
981
			else
982
				ctx->Yi.d[3] = ctr;
983
180
			for (i=0; i<16/sizeof(size_t); ++i)
984
120
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
985
60
			out += 16;
986
60
			in  += 16;
987
60
			len -= 16;
988
		    }
989
16
		    GHASH(ctx,out-j,j);
990
		}
991
#else
992
		while (len>=16) {
993
		    	size_t *out_t=(size_t *)out;
994
		    	const size_t *in_t=(const size_t *)in;
995
996
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
997
			++ctr;
998
			if (BYTE_ORDER == LITTLE_ENDIAN)
999
#ifdef BSWAP4
1000
				ctx->Yi.d[3] = BSWAP4(ctr);
1001
#else
1002
				PUTU32(ctx->Yi.c+12,ctr);
1003
#endif
1004
			else
1005
				ctx->Yi.d[3] = ctr;
1006
			for (i=0; i<16/sizeof(size_t); ++i)
1007
				ctx->Xi.t[i] ^=
1008
				out_t[i] = in_t[i]^ctx->EKi.t[i];
1009
			GCM_MUL(ctx,Xi);
1010
			out += 16;
1011
			in  += 16;
1012
			len -= 16;
1013
		}
1014
#endif
1015
16
		if (len) {
1016
9
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1017
9
			++ctr;
1018
			if (BYTE_ORDER == LITTLE_ENDIAN)
1019
#ifdef BSWAP4
1020
9
				ctx->Yi.d[3] = BSWAP4(ctr);
1021
#else
1022
				PUTU32(ctx->Yi.c+12,ctr);
1023
#endif
1024
			else
1025
				ctx->Yi.d[3] = ctr;
1026
126
			while (len--) {
1027
108
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1028
108
				++n;
1029
			}
1030
		}
1031
1032
16
		ctx->mres = n;
1033
16
		return 0;
1034
	} while(0);
1035
#endif
1036
	for (i=0;i<len;++i) {
1037
		if (n==0) {
1038
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1039
			++ctr;
1040
			if (BYTE_ORDER == LITTLE_ENDIAN)
1041
#ifdef BSWAP4
1042
				ctx->Yi.d[3] = BSWAP4(ctr);
1043
#else
1044
				PUTU32(ctx->Yi.c+12,ctr);
1045
#endif
1046
			else
1047
				ctx->Yi.d[3] = ctr;
1048
		}
1049
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1050
		n = (n+1)%16;
1051
		if (n==0)
1052
			GCM_MUL(ctx,Xi);
1053
	}
1054
1055
	ctx->mres = n;
1056
	return 0;
1057
}
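/*
 * Editor's note (not part of the covered source): encryption derives the
 * CTR keystream with ctx->block, XORs it into the plaintext and feeds the
 * resulting ciphertext into GHASH via Xi.  A trailing partial block is
 * remembered in ctx->mres, so the function can be called repeatedly on a
 * stream.  The execution counts show the bulk 16-byte path and the
 * partial-block tail being exercised, while the GHASH_CHUNK loop body
 * appears not to have been reached in this run.
 */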
1058
1059
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1060
		const unsigned char *in, unsigned char *out,
1061
		size_t len)
1062
16
{
1063
	unsigned int n, ctr;
1064
	size_t i;
1065
16
	u64        mlen  = ctx->len.u[1];
1066
16
	block128_f block = ctx->block;
1067
16
	void      *key   = ctx->key;
1068
#ifdef GCM_FUNCREF_4BIT
1069
16
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1070
# ifdef GHASH
1071
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1072
16
				const u8 *inp,size_t len)	= ctx->ghash;
1073
# endif
1074
#endif
1075
1076
16
	mlen += len;
1077
16
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1078
		return -1;
1079
16
	ctx->len.u[1] = mlen;
1080
1081
16
	if (ctx->ares) {
1082
		/* First call to decrypt finalizes GHASH(AAD) */
1083
9
		GCM_MUL(ctx,Xi);
1084
9
		ctx->ares = 0;
1085
	}
1086
1087
	if (BYTE_ORDER == LITTLE_ENDIAN)
1088
#ifdef BSWAP4
1089
16
		ctr = BSWAP4(ctx->Yi.d[3]);
1090
#else
1091
		ctr = GETU32(ctx->Yi.c+12);
1092
#endif
1093
	else
1094
		ctr = ctx->Yi.d[3];
1095
1096
16
	n = ctx->mres;
1097
#if !defined(OPENSSL_SMALL_FOOTPRINT)
1098
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1099
16
		if (n) {
1100
			while (n && len) {
1101
				u8 c = *(in++);
1102
				*(out++) = c^ctx->EKi.c[n];
1103
				ctx->Xi.c[n] ^= c;
1104
				--len;
1105
				n = (n+1)%16;
1106
			}
1107
			if (n==0) GCM_MUL (ctx,Xi);
1108
			else {
1109
				ctx->mres = n;
1110
				return 0;
1111
			}
1112
		}
1113
#ifdef __STRICT_ALIGNMENT
1114
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1115
			break;
1116
#endif
1117
#if defined(GHASH) && defined(GHASH_CHUNK)
1118
16
		while (len>=GHASH_CHUNK) {
1119
		    size_t j=GHASH_CHUNK;
1120
1121
		    GHASH(ctx,in,GHASH_CHUNK);
1122
		    while (j) {
1123
		    	size_t *out_t=(size_t *)out;
1124
		    	const size_t *in_t=(const size_t *)in;
1125
1126
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1127
			++ctr;
1128
			if (BYTE_ORDER == LITTLE_ENDIAN)
1129
#ifdef BSWAP4
1130
				ctx->Yi.d[3] = BSWAP4(ctr);
1131
#else
1132
				PUTU32(ctx->Yi.c+12,ctr);
1133
#endif
1134
			else
1135
				ctx->Yi.d[3] = ctr;
1136
			for (i=0; i<16/sizeof(size_t); ++i)
1137
				out_t[i] = in_t[i]^ctx->EKi.t[i];
1138
			out += 16;
1139
			in  += 16;
1140
			j   -= 16;
1141
		    }
1142
		    len -= GHASH_CHUNK;
1143
		}
1144
16
		if ((i = (len&(size_t)-16))) {
1145
16
		    GHASH(ctx,in,i);
1146
92
		    while (len>=16) {
1147
60
		    	size_t *out_t=(size_t *)out;
1148
60
		    	const size_t *in_t=(const size_t *)in;
1149
1150
60
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1151
60
			++ctr;
1152
			if (BYTE_ORDER == LITTLE_ENDIAN)
1153
#ifdef BSWAP4
1154
60
				ctx->Yi.d[3] = BSWAP4(ctr);
1155
#else
1156
				PUTU32(ctx->Yi.c+12,ctr);
1157
#endif
1158
			else
1159
				ctx->Yi.d[3] = ctr;
1160
180
			for (i=0; i<16/sizeof(size_t); ++i)
1161
120
				out_t[i] = in_t[i]^ctx->EKi.t[i];
1162
60
			out += 16;
1163
60
			in  += 16;
1164
60
			len -= 16;
1165
		    }
1166
		}
1167
#else
1168
		while (len>=16) {
1169
		    	size_t *out_t=(size_t *)out;
1170
		    	const size_t *in_t=(const size_t *)in;
1171
1172
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1173
			++ctr;
1174
			if (BYTE_ORDER == LITTLE_ENDIAN)
1175
#ifdef BSWAP4
1176
				ctx->Yi.d[3] = BSWAP4(ctr);
1177
#else
1178
				PUTU32(ctx->Yi.c+12,ctr);
1179
#endif
1180
			else
1181
				ctx->Yi.d[3] = ctr;
1182
			for (i=0; i<16/sizeof(size_t); ++i) {
1183
				size_t c = in[i];
1184
				out[i] = c^ctx->EKi.t[i];
1185
				ctx->Xi.t[i] ^= c;
1186
			}
1187
			GCM_MUL(ctx,Xi);
1188
			out += 16;
1189
			in  += 16;
1190
			len -= 16;
1191
		}
1192
#endif
1193
16
		if (len) {
1194
9
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1195
9
			++ctr;
1196
			if (BYTE_ORDER == LITTLE_ENDIAN)
1197
#ifdef BSWAP4
1198
9
				ctx->Yi.d[3] = BSWAP4(ctr);
1199
#else
1200
				PUTU32(ctx->Yi.c+12,ctr);
1201
#endif
1202
			else
1203
				ctx->Yi.d[3] = ctr;
1204
126
			while (len--) {
1205
108
				u8 c = in[n];
1206
108
				ctx->Xi.c[n] ^= c;
1207
108
				out[n] = c^ctx->EKi.c[n];
1208
108
				++n;
1209
			}
1210
		}
1211
1212
16
		ctx->mres = n;
1213
16
		return 0;
1214
	} while(0);
1215
#endif
1216
	for (i=0;i<len;++i) {
1217
		u8 c;
1218
		if (n==0) {
1219
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1220
			++ctr;
1221
			if (BYTE_ORDER == LITTLE_ENDIAN)
1222
#ifdef BSWAP4
1223
				ctx->Yi.d[3] = BSWAP4(ctr);
1224
#else
1225
				PUTU32(ctx->Yi.c+12,ctr);
1226
#endif
1227
			else
1228
				ctx->Yi.d[3] = ctr;
1229
		}
1230
		c = in[i];
1231
		out[i] = c^ctx->EKi.c[n];
1232
		ctx->Xi.c[n] ^= c;
1233
		n = (n+1)%16;
1234
		if (n==0)
1235
			GCM_MUL(ctx,Xi);
1236
	}
1237
1238
	ctx->mres = n;
1239
	return 0;
1240
}
1241
1242
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1243
		const unsigned char *in, unsigned char *out,
1244
		size_t len, ctr128_f stream)
1245
6
{
1246
	unsigned int n, ctr;
1247
	size_t i;
1248
6
	u64   mlen = ctx->len.u[1];
1249
6
	void *key  = ctx->key;
1250
#ifdef GCM_FUNCREF_4BIT
1251
6
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1252
# ifdef GHASH
1253
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1254
6
				const u8 *inp,size_t len)	= ctx->ghash;
1255
# endif
1256
#endif
1257
1258
6
	mlen += len;
1259
6
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1260
		return -1;
1261
6
	ctx->len.u[1] = mlen;
1262
1263
6
	if (ctx->ares) {
1264
		/* First call to encrypt finalizes GHASH(AAD) */
1265
6
		GCM_MUL(ctx,Xi);
1266
6
		ctx->ares = 0;
1267
	}
1268
1269
	if (BYTE_ORDER == LITTLE_ENDIAN)
1270
#ifdef BSWAP4
1271
6
		ctr = BSWAP4(ctx->Yi.d[3]);
1272
#else
1273
		ctr = GETU32(ctx->Yi.c+12);
1274
#endif
1275
	else
1276
		ctr = ctx->Yi.d[3];
1277
1278
6
	n = ctx->mres;
1279
6
	if (n) {
1280
		while (n && len) {
1281
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1282
			--len;
1283
			n = (n+1)%16;
1284
		}
1285
		if (n==0) GCM_MUL(ctx,Xi);
1286
		else {
1287
			ctx->mres = n;
1288
			return 0;
1289
		}
1290
	}
1291
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1292
6
	while (len>=GHASH_CHUNK) {
1293
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1294
		ctr += GHASH_CHUNK/16;
1295
		if (BYTE_ORDER == LITTLE_ENDIAN)
1296
#ifdef BSWAP4
1297
			ctx->Yi.d[3] = BSWAP4(ctr);
1298
#else
1299
			PUTU32(ctx->Yi.c+12,ctr);
1300
#endif
1301
		else
1302
			ctx->Yi.d[3] = ctr;
1303
		GHASH(ctx,out,GHASH_CHUNK);
1304
		out += GHASH_CHUNK;
1305
		in  += GHASH_CHUNK;
1306
		len -= GHASH_CHUNK;
1307
	}
1308
#endif
1309
6
	if ((i = (len&(size_t)-16))) {
1310
4
		size_t j=i/16;
1311
1312
4
		(*stream)(in,out,j,key,ctx->Yi.c);
1313
4
		ctr += (unsigned int)j;
1314
		if (BYTE_ORDER == LITTLE_ENDIAN)
1315
#ifdef BSWAP4
1316
4
			ctx->Yi.d[3] = BSWAP4(ctr);
1317
#else
1318
			PUTU32(ctx->Yi.c+12,ctr);
1319
#endif
1320
		else
1321
			ctx->Yi.d[3] = ctr;
1322
4
		in  += i;
1323
4
		len -= i;
1324
#if defined(GHASH)
1325
4
		GHASH(ctx,out,i);
1326
4
		out += i;
1327
#else
1328
		while (j--) {
1329
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1330
			GCM_MUL(ctx,Xi);
1331
			out += 16;
1332
		}
1333
#endif
1334
	}
1335
6
	if (len) {
1336
2
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1337
2
		++ctr;
1338
		if (BYTE_ORDER == LITTLE_ENDIAN)
1339
#ifdef BSWAP4
1340
2
			ctx->Yi.d[3] = BSWAP4(ctr);
1341
#else
1342
			PUTU32(ctx->Yi.c+12,ctr);
1343
#endif
1344
		else
1345
			ctx->Yi.d[3] = ctr;
1346
34
		while (len--) {
1347
30
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1348
30
			++n;
1349
		}
1350
	}
1351
1352
6
	ctx->mres = n;
1353
6
	return 0;
1354
}
1355
1356
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1357
		const unsigned char *in, unsigned char *out,
1358
		size_t len,ctr128_f stream)
1359
10
{
1360
	unsigned int n, ctr;
1361
	size_t i;
1362
10
	u64   mlen = ctx->len.u[1];
1363
10
	void *key  = ctx->key;
1364
#ifdef GCM_FUNCREF_4BIT
1365
10
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1366
# ifdef GHASH
1367
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1368
10
				const u8 *inp,size_t len)	= ctx->ghash;
1369
# endif
1370
#endif
1371
1372
10
	mlen += len;
1373
10
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1374
		return -1;
1375
10
	ctx->len.u[1] = mlen;
1376
1377
10
	if (ctx->ares) {
1378
		/* First call to decrypt finalizes GHASH(AAD) */
1379
10
		GCM_MUL(ctx,Xi);
1380
10
		ctx->ares = 0;
1381
	}
1382
1383
	if (BYTE_ORDER == LITTLE_ENDIAN)
1384
#ifdef BSWAP4
1385
10
		ctr = BSWAP4(ctx->Yi.d[3]);
1386
#else
1387
		ctr = GETU32(ctx->Yi.c+12);
1388
#endif
1389
	else
1390
		ctr = ctx->Yi.d[3];
1391
1392
10
	n = ctx->mres;
1393
10
	if (n) {
1394
		while (n && len) {
1395
			u8 c = *(in++);
1396
			*(out++) = c^ctx->EKi.c[n];
1397
			ctx->Xi.c[n] ^= c;
1398
			--len;
1399
			n = (n+1)%16;
1400
		}
1401
		if (n==0) GCM_MUL (ctx,Xi);
1402
		else {
1403
			ctx->mres = n;
1404
			return 0;
1405
		}
1406
	}
1407
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1408
10
	while (len>=GHASH_CHUNK) {
1409
		GHASH(ctx,in,GHASH_CHUNK);
1410
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1411
		ctr += GHASH_CHUNK/16;
1412
		if (BYTE_ORDER == LITTLE_ENDIAN)
1413
#ifdef BSWAP4
1414
			ctx->Yi.d[3] = BSWAP4(ctr);
1415
#else
1416
			PUTU32(ctx->Yi.c+12,ctr);
1417
#endif
1418
		else
1419
			ctx->Yi.d[3] = ctr;
1420
		out += GHASH_CHUNK;
1421
		in  += GHASH_CHUNK;
1422
		len -= GHASH_CHUNK;
1423
	}
1424
#endif
1425
10
	if ((i = (len&(size_t)-16))) {
1426
6
		size_t j=i/16;
1427
1428
#if defined(GHASH)
1429
6
		GHASH(ctx,in,i);
1430
#else
1431
		while (j--) {
1432
			size_t k;
1433
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1434
			GCM_MUL(ctx,Xi);
1435
			in += 16;
1436
		}
1437
		j   = i/16;
1438
		in -= i;
1439
#endif
1440
6
		(*stream)(in,out,j,key,ctx->Yi.c);
1441
6
		ctr += (unsigned int)j;
1442
		if (BYTE_ORDER == LITTLE_ENDIAN)
1443
#ifdef BSWAP4
1444
6
			ctx->Yi.d[3] = BSWAP4(ctr);
1445
#else
1446
			PUTU32(ctx->Yi.c+12,ctr);
1447
#endif
1448
		else
1449
			ctx->Yi.d[3] = ctr;
1450
6
		out += i;
1451
6
		in  += i;
1452
6
		len -= i;
1453
	}
1454
10
	if (len) {
1455
4
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1456
4
		++ctr;
1457
		if (BYTE_ORDER == LITTLE_ENDIAN)
1458
#ifdef BSWAP4
1459
4
			ctx->Yi.d[3] = BSWAP4(ctr);
1460
#else
1461
			PUTU32(ctx->Yi.c+12,ctr);
1462
#endif
1463
		else
1464
			ctx->Yi.d[3] = ctr;
1465
68
		while (len--) {
1466
60
			u8 c = in[n];
1467
60
			ctx->Xi.c[n] ^= c;
1468
60
			out[n] = c^ctx->EKi.c[n];
1469
60
			++n;
1470
		}
1471
	}
1472
1473
10
	ctx->mres = n;
1474
10
	return 0;
1475
}
1476
1477
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1478
			size_t len)
1479
56
{
1480
56
	u64 alen = ctx->len.u[0]<<3;
1481
56
	u64 clen = ctx->len.u[1]<<3;
1482
#ifdef GCM_FUNCREF_4BIT
1483
56
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1484
#endif
1485
1486
56
	if (ctx->mres || ctx->ares)
1487
24
		GCM_MUL(ctx,Xi);
1488
1489
	if (BYTE_ORDER == LITTLE_ENDIAN) {
1490
#ifdef BSWAP8
1491
56
		alen = BSWAP8(alen);
1492
56
		clen = BSWAP8(clen);
1493
#else
1494
		u8 *p = ctx->len.c;
1495
1496
		ctx->len.u[0] = alen;
1497
		ctx->len.u[1] = clen;
1498
1499
		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1500
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1501
#endif
1502
	}
1503
1504
56
	ctx->Xi.u[0] ^= alen;
1505
56
	ctx->Xi.u[1] ^= clen;
1506
56
	GCM_MUL(ctx,Xi);
1507
1508
56
	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1509
56
	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1510
1511
56
	if (tag && len<=sizeof(ctx->Xi))
1512
40
		return memcmp(ctx->Xi.c,tag,len);
1513
	else
1514
16
		return -1;
1515
}
1516
1517
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1518
16
{
1519
16
	CRYPTO_gcm128_finish(ctx, NULL, 0);
1520
16
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1521
16
}
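/*
 * Editor's note (not part of the covered source): CRYPTO_gcm128_finish
 * completes the GHASH over the encoded AAD and message bit lengths, XORs
 * in EK0 and compares the result against the caller's tag, returning 0 on
 * a match.  CRYPTO_gcm128_tag performs the same finalization but copies
 * the computed tag out instead of comparing, which is what an encrypting
 * caller would typically use.
 */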
1522
1523
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1524
{
1525
	GCM128_CONTEXT *ret;
1526
1527
	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1528
		CRYPTO_gcm128_init(ret,key,block);
1529
1530
	return ret;
1531
}
1532
1533
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1534
{
1535
	if (ctx) {
1536
		explicit_bzero(ctx,sizeof(*ctx));
1537
		free(ctx);
1538
	}
1539
}
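/*
 * Editor's sketch (not part of the covered source): a minimal caller of
 * the API above, assuming libcrypto's AES block cipher is used as the
 * block128_f, as the AES-GCM EVP code does.  Error handling is kept to a
 * minimum; this only illustrates the call sequence.
 */
#if 0
#include <openssl/aes.h>
#include <openssl/modes.h>

static int
gcm_seal_example(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aadlen,
    const unsigned char *pt, unsigned char *ct, size_t ptlen,
    unsigned char tag[16])
{
	AES_KEY aes;
	GCM128_CONTEXT *gcm;
	int ret = -1;

	if (AES_set_encrypt_key(key, 128, &aes) != 0)
		return -1;
	if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(gcm, iv, 12);
	if (CRYPTO_gcm128_aad(gcm, aad, aadlen) == 0 &&
	    CRYPTO_gcm128_encrypt(gcm, pt, ct, ptlen) == 0) {
		CRYPTO_gcm128_tag(gcm, tag, 16);
		ret = 0;
	}
	CRYPTO_gcm128_release(gcm);
	return ret;
}
#endif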