diff --git a/configure.ac b/configure.ac
index 05da6d6f..6159d4ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -408,12 +408,27 @@ if test x$enable_libargon2 = xyes ; then
 	enable_internal_argon2=no
 else
 	AC_MSG_WARN([Argon2 bundled (slow) reference implementation will be used, please consider to use system library with --enable-libargon2.])
+
+	AC_ARG_ENABLE(internal-sse-argon2, AS_HELP_STRING([--enable-internal-sse-argon2],
+		[enable internal SSE implementation of Argon2 PBKDF]),[], [enable_internal_sse_argon2=no])
+
+	if test x$enable_internal_sse_argon2 = xyes ; then
+		AC_MSG_CHECKING(if Argon2 SSE optimization can be used)
+		AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+			#include <emmintrin.h>
+			__m128i testfunc(__m128i *a, __m128i *b) {
+				return _mm_xor_si128(_mm_loadu_si128(a), _mm_loadu_si128(b));
+			}
+		]])],,[enable_internal_sse_argon2=no])
+		AC_MSG_RESULT($enable_internal_sse_argon2)
+	fi
 fi
 
 if test x$enable_internal_argon2 = xyes ; then
 	AC_DEFINE(USE_INTERNAL_ARGON2, 1, [Use internal Argon2])
 fi
 AM_CONDITIONAL(CRYPTO_INTERNAL_ARGON2, test x$enable_internal_argon2 = xyes)
+AM_CONDITIONAL(CRYPTO_INTERNAL_SSE_ARGON2, test x$enable_internal_sse_argon2 = xyes)
 
 dnl Magic for cryptsetup.static build.
 if test x$enable_static_cryptsetup = xyes; then
diff --git a/lib/crypto_backend/argon2/Makemodule.am b/lib/crypto_backend/argon2/Makemodule.am
index 4c25c012..be729460 100644
--- a/lib/crypto_backend/argon2/Makemodule.am
+++ b/lib/crypto_backend/argon2/Makemodule.am
@@ -7,16 +7,22 @@ libargon2_la_SOURCES = \
 	lib/crypto_backend/argon2/blake2/blake2b.c \
 	lib/crypto_backend/argon2/blake2/blake2.h \
 	lib/crypto_backend/argon2/blake2/blake2-impl.h \
-	lib/crypto_backend/argon2/blake2/blamka-round-ref.h \
 	lib/crypto_backend/argon2/argon2.c \
 	lib/crypto_backend/argon2/argon2.h \
 	lib/crypto_backend/argon2/core.c \
 	lib/crypto_backend/argon2/core.h \
 	lib/crypto_backend/argon2/encoding.c \
 	lib/crypto_backend/argon2/encoding.h \
-	lib/crypto_backend/argon2/ref.c \
 	lib/crypto_backend/argon2/thread.c \
 	lib/crypto_backend/argon2/thread.h
 
+if CRYPTO_INTERNAL_SSE_ARGON2
+libargon2_la_SOURCES += lib/crypto_backend/argon2/blake2/blamka-round-opt.h \
+	lib/crypto_backend/argon2/opt.c
+else
+libargon2_la_SOURCES += lib/crypto_backend/argon2/blake2/blamka-round-ref.h \
+	lib/crypto_backend/argon2/ref.c
+endif
+
 EXTRA_DIST += lib/crypto_backend/argon2/LICENSE
 EXTRA_DIST += lib/crypto_backend/argon2/README
diff --git a/lib/crypto_backend/argon2/blake2/blamka-round-opt.h b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
new file mode 100644
index 00000000..2c8942e3
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
@@ -0,0 +1,471 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
+#endif
+
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
+#if !defined(__XOP__)
+#if defined(__SSSE3__)
+#define r16 \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24 \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c) \
+    (-(c) == 32) \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
+        : (-(c) == 24) \
+              ? _mm_shuffle_epi8((x), r24) \
+              : (-(c) == 16) \
+                    ? _mm_shuffle_epi8((x), r16) \
+                    : (-(c) == 63) \
+                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                          _mm_add_epi64((x), (x))) \
+                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                          _mm_slli_epi64((x), 64 - (-(c))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c) \
+    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else
+#endif
+
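+/*
+ * fBlaMka is the BlaMka multiply-add that Argon2 uses in place of the
+ * plain addition in BLAKE2b's G function, computed here on both 64-bit
+ * lanes of an XMM register at once. Per 64-bit word it is equivalent to
+ * the reference implementation's:
+ *
+ *     x + y + 2 * ((x & 0xFFFFFFFF) * (y & 0xFFFFFFFF))
+ */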
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+    const __m128i z = _mm_mul_epu32(x, y);
+    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = fBlaMka(A0, B0); \
+        A1 = fBlaMka(A1, B1); \
+ \
+        D0 = _mm_xor_si128(D0, A0); \
+        D1 = _mm_xor_si128(D1, A1); \
+ \
+        D0 = _mm_roti_epi64(D0, -32); \
+        D1 = _mm_roti_epi64(D1, -32); \
+ \
+        C0 = fBlaMka(C0, D0); \
+        C1 = fBlaMka(C1, D1); \
+ \
+        B0 = _mm_xor_si128(B0, C0); \
+        B1 = _mm_xor_si128(B1, C1); \
+ \
+        B0 = _mm_roti_epi64(B0, -24); \
+        B1 = _mm_roti_epi64(B1, -24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = fBlaMka(A0, B0); \
+        A1 = fBlaMka(A1, B1); \
+ \
+        D0 = _mm_xor_si128(D0, A0); \
+        D1 = _mm_xor_si128(D1, A1); \
+ \
+        D0 = _mm_roti_epi64(D0, -16); \
+        D1 = _mm_roti_epi64(D1, -16); \
+ \
+        C0 = fBlaMka(C0, D0); \
+        C1 = fBlaMka(C1, D1); \
+ \
+        B0 = _mm_xor_si128(B0, C0); \
+        B1 = _mm_xor_si128(B1, C1); \
+ \
+        B0 = _mm_roti_epi64(B0, -63); \
+        B1 = _mm_roti_epi64(B1, -63); \
+    } while ((void)0, 0)
+
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
+        B0 = t0; \
+        B1 = t1; \
+ \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+ \
+        t0 = _mm_alignr_epi8(D1, D0, 8); \
+        t1 = _mm_alignr_epi8(D0, D1, 8); \
+        D0 = t1; \
+        D1 = t0; \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
+        B0 = t0; \
+        B1 = t1; \
+ \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+ \
+        t0 = _mm_alignr_epi8(D0, D1, 8); \
+        t1 = _mm_alignr_epi8(D1, D0, 8); \
+        D0 = t1; \
+        D1 = t0; \
+    } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = D0; \
+        __m128i t1 = B0; \
+        D0 = C0; \
+        C0 = C1; \
+        C1 = D0; \
+        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
+        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
+        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
+        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0, t1; \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+        t0 = B0; \
+        t1 = D0; \
+        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
+        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
+        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
+        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
+    } while ((void)0, 0)
+#endif
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+#else /* __AVX2__ */
+
+#include <immintrin.h>
+
+#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+ \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+ \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+ \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+ \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+ \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+ \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+ \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+ \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+ \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while(0);
+
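+/*
+ * UNDIAGONALIZE_1/_2 invert the lane rotations applied by the matching
+ * DIAGONALIZE_1/_2 above, restoring row order after the second
+ * half-round has processed the diagonals.
+ */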
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+ \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    } while((void)0, 0);
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
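+/*
+ * muladd() is the AVX-512 counterpart of fBlaMka(): the same BlaMka
+ * multiply-add, applied to the eight 64-bit lanes of a ZMM register.
+ */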
+static __m512i muladd(__m512i x, __m512i y)
+{
+    __m512i z = _mm512_mul_epu32(x, y);
+    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 32); \
+        D1 = ror64(D1, 32); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 24); \
+        B1 = ror64(B1, 24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 16); \
+        D1 = ror64(D1, 16); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 63); \
+        B1 = ror64(B1, 63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+    do { \
+        __m512i t0, t1; \
+        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t0; \
+        A1 = t1; \
+    } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+    do { \
+        SWAP_HALVES(A0, A1); \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+    } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+    do { \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        SWAP_HALVES(A0, A1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+    do { \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        SWAP_QUARTERS(A0, A1); \
+        SWAP_QUARTERS(B0, B1); \
+        SWAP_QUARTERS(C0, C1); \
+        SWAP_QUARTERS(D0, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        UNSWAP_QUARTERS(A0, A1); \
+        UNSWAP_QUARTERS(B0, B1); \
+        UNSWAP_QUARTERS(C0, C1); \
+        UNSWAP_QUARTERS(D0, D1); \
+    } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
diff --git a/lib/crypto_backend/argon2/opt.c b/lib/crypto_backend/argon2/opt.c
new file mode 100644
index 00000000..f6c20528
--- /dev/null
+++ b/lib/crypto_backend/argon2/opt.c
@@ -0,0 +1,283 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "core.h"
+
+#include "blake2/blake2.h"
+#include "blake2/blamka-round-opt.h"
+
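+/*
+ * Exactly one fill_block() variant below is compiled, selected by the
+ * instruction set the compiler targets: AVX-512F, AVX2, or SSE2/SSSE3
+ * (the rounds come from blake2/blamka-round-opt.h).
+ */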
+/*
+ * Function fills a new memory block and optionally XORs the old block over the new one.
+ * Memory must be initialized.
+ * @param state Pointer to the just produced block. Content will be updated(!)
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be XORed over. May coincide with @ref_block
+ * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
+ * @pre all block pointers must be valid
+ */
+#if defined(__AVX512F__)
+static void fill_block(__m512i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+            block_XY[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_1(
+            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_2(
+            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
+            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
+    }
+
+    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
+        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    }
+}
+#elif defined(__AVX2__)
+static void fill_block(__m256i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+            block_XY[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
+                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+    }
+
+    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
+        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+    }
+}
+#else
+static void fill_block(__m128i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+            block_XY[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
+                     state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
+                     state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
+                     state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
+                     state[8 * 6 + i], state[8 * 7 + i]);
+    }
+
+    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+        state[i] = _mm_xor_si128(state[i], block_XY[i]);
+        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+    }
+}
+#endif
+
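+/*
+ * Computes the next block of pseudo-random reference addresses for the
+ * data-independent addressing mode (Argon2i, and the first half of the
+ * first pass of Argon2id) by applying the compression function G twice.
+ */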
+static void next_addresses(block *address_block, block *input_block) {
+    /*Temporary zero-initialized blocks*/
+#if defined(__AVX512F__)
+    __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+    __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
+    __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
+    __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
+#endif
+
+    memset(zero_block, 0, sizeof(zero_block));
+    memset(zero2_block, 0, sizeof(zero2_block));
+
+    /*Increasing index counter*/
+    input_block->v[6]++;
+
+    /*First iteration of G*/
+    fill_block(zero_block, input_block, address_block, 0);
+
+    /*Second iteration of G*/
+    fill_block(zero2_block, address_block, address_block, 0);
+}
+
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position) {
+    block *ref_block = NULL, *curr_block = NULL;
+    block address_block, input_block;
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    uint32_t starting_index, i;
+#if defined(__AVX512F__)
+    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i state[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i state[ARGON2_OWORDS_IN_BLOCK];
+#endif
+    int data_independent_addressing;
+
+    if (instance == NULL) {
+        return;
+    }
+
+    data_independent_addressing =
+        (instance->type == Argon2_i) ||
+        (instance->type == Argon2_id && (position.pass == 0) &&
+         (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    if (data_independent_addressing) {
+        init_block_value(&input_block, 0);
+
+        input_block.v[0] = position.pass;
+        input_block.v[1] = position.lane;
+        input_block.v[2] = position.slice;
+        input_block.v[3] = instance->memory_blocks;
+        input_block.v[4] = instance->passes;
+        input_block.v[5] = instance->type;
+    }
+
+    starting_index = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; /* we have already generated the first two blocks */
+
+        /* Don't forget to generate the first block of addresses: */
+        if (data_independent_addressing) {
+            next_addresses(&address_block, &input_block);
+        }
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.lane * instance->lane_length +
+                  position.slice * instance->segment_length + starting_index;
+
+    if (0 == curr_offset % instance->lane_length) {
+        /* Last block in this lane */
+        prev_offset = curr_offset + instance->lane_length - 1;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+
+    for (i = starting_index; i < instance->segment_length;
+         ++i, ++curr_offset, ++prev_offset) {
+        /*1.1 Rotating prev_offset if needed */
+        if (curr_offset % instance->lane_length == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+        if (data_independent_addressing) {
+            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+                next_addresses(&address_block, &input_block);
+            }
+            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block */
+        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+
+        if ((position.pass == 0) && (position.slice == 0)) {
+            /* Can not reference other lanes yet */
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference blocks within the
+         * lane.
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+                                ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        ref_block =
+            instance->memory + instance->lane_length * ref_lane + ref_index;
+        curr_block = instance->memory + curr_offset;
+        if (ARGON2_VERSION_10 == instance->version) {
+            /* version 1.2.1 and earlier: overwrite, not XOR */
+            fill_block(state, ref_block, curr_block, 0);
+        } else {
+            if (0 == position.pass) {
+                fill_block(state, ref_block, curr_block, 0);
+            } else {
+                fill_block(state, ref_block, curr_block, 1);
+            }
+        }
+    }
+}