summaryrefslogtreecommitdiff
blob: d168d99222e815433518b305244954b88d874cfc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
https://github.com/Tarsnap/tarsnap/commit/ca40c06f290fb8298dc2e583303d45b58878f37b
https://github.com/Tarsnap/tarsnap/commit/4af6d8350ab53d0f1f3104ce3d9072c2d5f9ef7a

From 4af6d8350ab53d0f1f3104ce3d9072c2d5f9ef7a Mon Sep 17 00:00:00 2001
From: Graham Percival <gperciva@tarsnap.com>
Date: Fri, 1 Apr 2022 16:58:43 -0700
Subject: [PATCH] scrypt: Fix strict aliasing

The original scrypt code treated its data as blobs of bytes, accessing
them in whatever manner was convenient from time to time:

* as 32-bit words or vectors thereof for the purpose of the Salsa20/8 core
* in machine-word-sized chunks (aka. size_t) for block copy and xor operations
* as 32-bit words for the Integerify function.

This worked fine at the time, but newer compilers apply strict aliasing rules
which allow them to assume that e.g. data accessed as a uint32_t is not the
same as data accessed as a size_t, resulting in miscompilation.

Note that in recent versions of scrypt (after 2015-07-18; versions 1.2.0 and
later) such miscompilation should be detected by the built-in runtime testing.

To avoid aliasing problems, the generic scrypt code now operates on uint32_t
throughout while the SSE2-enabled scrypt code operates on __m128i throughout.

Experimentally, we found that memcpy() speeds up blkcpy() in the plain C
case, but slowed it down in the _sse2.c case (probably because memcpy
can make use of vector instructions internally, but cannot assume that
it will always have a multiple of 16 bytes, as we do).
--- a/lib/crypto/crypto_scrypt_smix.c
+++ b/lib/crypto/crypto_scrypt_smix.c
@@ -27,39 +27,32 @@
  * online backup system.
  */
 #include <stdint.h>
+#include <string.h>
 
 #include "sysendian.h"
 
 #include "crypto_scrypt_smix.h"
 
-static void blkcpy(void *, const void *, size_t);
-static void blkxor(void *, const void *, size_t);
+static void blkcpy(uint32_t *, const uint32_t *, size_t);
+static void blkxor(uint32_t *, const uint32_t *, size_t);
 static void salsa20_8(uint32_t[16]);
 static void blockmix_salsa8(const uint32_t *, uint32_t *, uint32_t *, size_t);
-static uint64_t integerify(const void *, size_t);
+static uint64_t integerify(const uint32_t *, size_t);
 
 static void
-blkcpy(void * dest, const void * src, size_t len)
+blkcpy(uint32_t * dest, const uint32_t * src, size_t len)
 {
-	size_t * D = dest;
-	const size_t * S = src;
-	size_t L = len / sizeof(size_t);
-	size_t i;
 
-	for (i = 0; i < L; i++)
-		D[i] = S[i];
+	memcpy(dest, src, len);
 }
 
 static void
-blkxor(void * dest, const void * src, size_t len)
+blkxor(uint32_t * dest, const uint32_t * src, size_t len)
 {
-	size_t * D = dest;
-	const size_t * S = src;
-	size_t L = len / sizeof(size_t);
 	size_t i;
 
-	for (i = 0; i < L; i++)
-		D[i] ^= S[i];
+	for (i = 0; i < len / 4; i++)
+		dest[i] ^= src[i];
 }
 
 /**
@@ -145,9 +138,9 @@ blockmix_salsa8(const uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
  * Return the result of parsing B_{2r-1} as a little-endian integer.
  */
 static uint64_t
-integerify(const void * B, size_t r)
+integerify(const uint32_t * B, size_t r)
 {
-	const uint32_t * X = (const void *)((uintptr_t)(B) + (2 * r - 1) * 64);
+	const uint32_t * X = B + (2 * r - 1) * 16;
 
 	return (((uint64_t)(X[1]) << 32) + X[0]);
 }
--- a/lib/crypto/crypto_scrypt_smix_sse2.c
+++ b/lib/crypto/crypto_scrypt_smix_sse2.c
@@ -36,34 +36,30 @@
 
 #include "crypto_scrypt_smix_sse2.h"
 
-static void blkcpy(void *, const void *, size_t);
-static void blkxor(void *, const void *, size_t);
+static void blkcpy(__m128i *, const __m128i *, size_t);
+static void blkxor(__m128i *, const __m128i *, size_t);
 static void salsa20_8(__m128i[4]);
 static void blockmix_salsa8(const __m128i *, __m128i *, __m128i *, size_t);
-static uint64_t integerify(const void *, size_t);
+static uint64_t integerify(const __m128i *, size_t);
 
 static void
-blkcpy(void * dest, const void * src, size_t len)
+blkcpy(__m128i * dest, const __m128i * src, size_t len)
 {
-	__m128i * D = dest;
-	const __m128i * S = src;
 	size_t L = len / 16;
 	size_t i;
 
 	for (i = 0; i < L; i++)
-		D[i] = S[i];
+		dest[i] = src[i];
 }
 
 static void
-blkxor(void * dest, const void * src, size_t len)
+blkxor(__m128i * dest, const __m128i * src, size_t len)
 {
-	__m128i * D = dest;
-	const __m128i * S = src;
 	size_t L = len / 16;
 	size_t i;
 
 	for (i = 0; i < L; i++)
-		D[i] = _mm_xor_si128(D[i], S[i]);
+		dest[i] = _mm_xor_si128(dest[i], src[i]);
 }
 
 /**
@@ -168,11 +164,18 @@ blockmix_salsa8(const __m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
  * Note that B's layout is permuted compared to the generic implementation.
  */
 static uint64_t
-integerify(const void * B, size_t r)
+integerify(const __m128i * B, size_t r)
 {
-	const uint32_t * X = (const void *)((uintptr_t)(B) + (2 * r - 1) * 64);
+	const __m128i * X = B + (2*r - 1) * 4;
+	uint32_t X0, X13;
 
-	return (((uint64_t)(X[13]) << 32) + X[0]);
+	/* Get the first 32-bit element in X[0]. */
+	X0 = (uint32_t)_mm_cvtsi128_si32(X[0]);
+
+	/* Get the second 32-bit element in X[3]. */
+	X13 = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(X[3], 4));
+
+	return (((uint64_t)(X13) << 32) + X0);
 }
 
 /**