diff --git a/configure.ac b/configure.ac
index 05da6d6f..6159d4ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -408,12 +408,27 @@ if test x$enable_libargon2 = xyes ; then
 	enable_internal_argon2=no
 else
 	AC_MSG_WARN([Argon2 bundled (slow) reference implementation will be used, please consider to use system library with --enable-libargon2.])
+
+	AC_ARG_ENABLE(internal-sse-argon2, AS_HELP_STRING([--enable-internal-sse-argon2],
+		[enable internal SSE implementation of Argon2 PBKDF]),[], [enable_internal_sse_argon2=no])
+
+	if test x$enable_internal_sse_argon2 = xyes ; then
+		AC_MSG_CHECKING(if Argon2 SSE optimization can be used)
+		AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+			#include <emmintrin.h>
+			__m128i testfunc(__m128i *a, __m128i *b) {
+				return _mm_xor_si128(_mm_loadu_si128(a), _mm_loadu_si128(b));
+			}
+		]])],,[enable_internal_sse_argon2=no])
+		AC_MSG_RESULT($enable_internal_sse_argon2)
+	fi
 fi
 
 if test x$enable_internal_argon2 = xyes ; then
 	AC_DEFINE(USE_INTERNAL_ARGON2, 1, [Use internal Argon2])
 fi
 AM_CONDITIONAL(CRYPTO_INTERNAL_ARGON2, test x$enable_internal_argon2 = xyes)
+AM_CONDITIONAL(CRYPTO_INTERNAL_SSE_ARGON2, test x$enable_internal_sse_argon2 = xyes)
 
 dnl Magic for cryptsetup.static build.
 if test x$enable_static_cryptsetup = xyes; then
diff --git a/lib/crypto_backend/argon2/Makemodule.am b/lib/crypto_backend/argon2/Makemodule.am
index 4c25c012..be729460 100644
--- a/lib/crypto_backend/argon2/Makemodule.am
+++ b/lib/crypto_backend/argon2/Makemodule.am
@@ -7,16 +7,22 @@ libargon2_la_SOURCES = \
 	lib/crypto_backend/argon2/blake2/blake2b.c \
 	lib/crypto_backend/argon2/blake2/blake2.h \
 	lib/crypto_backend/argon2/blake2/blake2-impl.h \
-	lib/crypto_backend/argon2/blake2/blamka-round-ref.h \
 	lib/crypto_backend/argon2/argon2.c \
 	lib/crypto_backend/argon2/argon2.h \
 	lib/crypto_backend/argon2/core.c \
 	lib/crypto_backend/argon2/core.h \
 	lib/crypto_backend/argon2/encoding.c \
 	lib/crypto_backend/argon2/encoding.h \
-	lib/crypto_backend/argon2/ref.c \
 	lib/crypto_backend/argon2/thread.c \
 	lib/crypto_backend/argon2/thread.h
 
+if CRYPTO_INTERNAL_SSE_ARGON2
+libargon2_la_SOURCES += lib/crypto_backend/argon2/blake2/blamka-round-opt.h \
+	lib/crypto_backend/argon2/opt.c
+else
+libargon2_la_SOURCES += lib/crypto_backend/argon2/blake2/blamka-round-ref.h \
+	lib/crypto_backend/argon2/ref.c
+endif
+
 EXTRA_DIST += lib/crypto_backend/argon2/LICENSE
 EXTRA_DIST += lib/crypto_backend/argon2/README
diff --git a/lib/crypto_backend/argon2/blake2/blamka-round-opt.h b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
new file mode 100644
index 00000000..2c8942e3
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
@@ -0,0 +1,471 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
+#endif
+
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
+#if !defined(__XOP__)
+#if defined(__SSSE3__)
+#define r16 \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24 \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c) \
+    (-(c) == 32) \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
+        : (-(c) == 24) \
+              ? _mm_shuffle_epi8((x), r24) \
+              : (-(c) == 16) \
+                    ? _mm_shuffle_epi8((x), r16) \
+                    : (-(c) == 63) \
+                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                          _mm_add_epi64((x), (x))) \
+                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                          _mm_slli_epi64((x), 64 - (-(c))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c) \
+    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else
+#endif
+
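+/*
+ * fBlaMka is the BlaMka multiply-add that Argon2 uses in place of the
+ * plain addition in BLAKE2b's G function, computed here on both 64-bit
+ * lanes of an XMM register at once. Per 64-bit word it is equivalent to
+ * the reference implementation's:
+ *
+ *     x + y + 2 * ((x & 0xFFFFFFFF) * (y & 0xFFFFFFFF))
+ */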
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+    const __m128i z = _mm_mul_epu32(x, y);
+    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = fBlaMka(A0, B0); \
+        A1 = fBlaMka(A1, B1); \
+ \
+        D0 = _mm_xor_si128(D0, A0); \
+        D1 = _mm_xor_si128(D1, A1); \
+ \
+        D0 = _mm_roti_epi64(D0, -32); \
+        D1 = _mm_roti_epi64(D1, -32); \
+ \
+        C0 = fBlaMka(C0, D0); \
+        C1 = fBlaMka(C1, D1); \
+ \
+        B0 = _mm_xor_si128(B0, C0); \
+        B1 = _mm_xor_si128(B1, C1); \
+ \
+        B0 = _mm_roti_epi64(B0, -24); \
+        B1 = _mm_roti_epi64(B1, -24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = fBlaMka(A0, B0); \
+        A1 = fBlaMka(A1, B1); \
+ \
+        D0 = _mm_xor_si128(D0, A0); \
+        D1 = _mm_xor_si128(D1, A1); \
+ \
+        D0 = _mm_roti_epi64(D0, -16); \
+        D1 = _mm_roti_epi64(D1, -16); \
+ \
+        C0 = fBlaMka(C0, D0); \
+        C1 = fBlaMka(C1, D1); \
+ \
+        B0 = _mm_xor_si128(B0, C0); \
+        B1 = _mm_xor_si128(B1, C1); \
+ \
+        B0 = _mm_roti_epi64(B0, -63); \
+        B1 = _mm_roti_epi64(B1, -63); \
+    } while ((void)0, 0)
+
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
+        B0 = t0; \
+        B1 = t1; \
+ \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+ \
+        t0 = _mm_alignr_epi8(D1, D0, 8); \
+        t1 = _mm_alignr_epi8(D0, D1, 8); \
+        D0 = t1; \
+        D1 = t0; \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
+        B0 = t0; \
+        B1 = t1; \
+ \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+ \
+        t0 = _mm_alignr_epi8(D0, D1, 8); \
+        t1 = _mm_alignr_epi8(D1, D0, 8); \
+        D0 = t1; \
+        D1 = t0; \
+    } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0 = D0; \
+        __m128i t1 = B0; \
+        D0 = C0; \
+        C0 = C1; \
+        C1 = D0; \
+        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
+        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
+        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
+        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        __m128i t0, t1; \
+        t0 = C0; \
+        C0 = C1; \
+        C1 = t0; \
+        t0 = B0; \
+        t1 = D0; \
+        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
+        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
+        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
+        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
+    } while ((void)0, 0)
+#endif
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+#else /* __AVX2__ */
+
+#include <immintrin.h>
+
+#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+ \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+ \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+ \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+ \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+ \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+ \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+ \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+ \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+ \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while(0);
+
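+/*
+ * UNDIAGONALIZE_1/_2 invert the lane rotations applied by the matching
+ * DIAGONALIZE_1/_2 above, restoring row order after the second
+ * half-round has processed the diagonals.
+ */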
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+ \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    } while((void)0, 0);
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
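+/*
+ * muladd() is the AVX-512 counterpart of fBlaMka(): the same BlaMka
+ * multiply-add, applied to the eight 64-bit lanes of a ZMM register.
+ */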
+static __m512i muladd(__m512i x, __m512i y)
+{
+    __m512i z = _mm512_mul_epu32(x, y);
+    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 32); \
+        D1 = ror64(D1, 32); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 24); \
+        B1 = ror64(B1, 24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 16); \
+        D1 = ror64(D1, 16); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 63); \
+        B1 = ror64(B1, 63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+    do { \
+        __m512i t0, t1; \
+        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t0; \
+        A1 = t1; \
+    } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+    do { \
+        SWAP_HALVES(A0, A1); \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+    } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+    do { \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        SWAP_HALVES(A0, A1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+    do { \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        SWAP_QUARTERS(A0, A1); \
+        SWAP_QUARTERS(B0, B1); \
+        SWAP_QUARTERS(C0, C1); \
+        SWAP_QUARTERS(D0, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        UNSWAP_QUARTERS(A0, A1); \
+        UNSWAP_QUARTERS(B0, B1); \
+        UNSWAP_QUARTERS(C0, C1); \
+        UNSWAP_QUARTERS(D0, D1); \
+    } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
diff --git a/lib/crypto_backend/argon2/opt.c b/lib/crypto_backend/argon2/opt.c
new file mode 100644
index 00000000..f6c20528
--- /dev/null
+++ b/lib/crypto_backend/argon2/opt.c
@@ -0,0 +1,283 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "core.h"
+
+#include "blake2/blake2.h"
+#include "blake2/blamka-round-opt.h"
+
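+/*
+ * Exactly one fill_block() variant below is compiled, selected by the
+ * instruction set the compiler targets: AVX-512F, AVX2, or SSE2/SSSE3
+ * (the rounds come from blake2/blamka-round-opt.h).
+ */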
+/*
+ * Function fills a new memory block and optionally XORs the old block over the new one.
+ * Memory must be initialized.
+ * @param state Pointer to the just produced block. Content will be updated(!)
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be XORed over. May coincide with @ref_block
+ * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
+ * @pre all block pointers must be valid
+ */
+#if defined(__AVX512F__)
+static void fill_block(__m512i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+            block_XY[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_1(
+            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_2(
+            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
+            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
+    }
+
+    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
+        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    }
+}
+#elif defined(__AVX2__)
+static void fill_block(__m256i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+            block_XY[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
+                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+    }
+
+    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
+        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+    }
+}
+#else
+static void fill_block(__m128i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+            block_XY[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
+                     state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
+                     state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
+                     state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
+                     state[8 * 6 + i], state[8 * 7 + i]);
+    }
+
+    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+        state[i] = _mm_xor_si128(state[i], block_XY[i]);
+        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+    }
+}
+#endif
+
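+/*
+ * Computes the next block of pseudo-random reference addresses for the
+ * data-independent addressing mode (Argon2i, and the first half of the
+ * first pass of Argon2id) by applying the compression function G twice.
+ */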
+static void next_addresses(block *address_block, block *input_block) {
+    /*Temporary zero-initialized blocks*/
+#if defined(__AVX512F__)
+    __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+    __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
+    __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
+    __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
+#endif
+
+    memset(zero_block, 0, sizeof(zero_block));
+    memset(zero2_block, 0, sizeof(zero2_block));
+
+    /*Increasing index counter*/
+    input_block->v[6]++;
+
+    /*First iteration of G*/
+    fill_block(zero_block, input_block, address_block, 0);
+
+    /*Second iteration of G*/
+    fill_block(zero2_block, address_block, address_block, 0);
+}
+
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position) {
+    block *ref_block = NULL, *curr_block = NULL;
+    block address_block, input_block;
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    uint32_t starting_index, i;
+#if defined(__AVX512F__)
+    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i state[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i state[ARGON2_OWORDS_IN_BLOCK];
+#endif
+    int data_independent_addressing;
+
+    if (instance == NULL) {
+        return;
+    }
+
+    data_independent_addressing =
+        (instance->type == Argon2_i) ||
+        (instance->type == Argon2_id && (position.pass == 0) &&
+         (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    if (data_independent_addressing) {
+        init_block_value(&input_block, 0);
+
+        input_block.v[0] = position.pass;
+        input_block.v[1] = position.lane;
+        input_block.v[2] = position.slice;
+        input_block.v[3] = instance->memory_blocks;
+        input_block.v[4] = instance->passes;
+        input_block.v[5] = instance->type;
+    }
+
+    starting_index = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; /* we have already generated the first two blocks */
+
+        /* Don't forget to generate the first block of addresses: */
+        if (data_independent_addressing) {
+            next_addresses(&address_block, &input_block);
+        }
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.lane * instance->lane_length +
+                  position.slice * instance->segment_length + starting_index;
+
+    if (0 == curr_offset % instance->lane_length) {
+        /* Last block in this lane */
+        prev_offset = curr_offset + instance->lane_length - 1;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+
+    for (i = starting_index; i < instance->segment_length;
+         ++i, ++curr_offset, ++prev_offset) {
+        /*1.1 Rotating prev_offset if needed */
+        if (curr_offset % instance->lane_length == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+        if (data_independent_addressing) {
+            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+                next_addresses(&address_block, &input_block);
+            }
+            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block */
+        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+
+        if ((position.pass == 0) && (position.slice == 0)) {
+            /* Can not reference other lanes yet */
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference blocks within the
+         * lane.
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+                                ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        ref_block =
+            instance->memory + instance->lane_length * ref_lane + ref_index;
+        curr_block = instance->memory + curr_offset;
+        if (ARGON2_VERSION_10 == instance->version) {
+            /* version 1.2.1 and earlier: overwrite, not XOR */
+            fill_block(state, ref_block, curr_block, 0);
+        } else {
+            if (0 == position.pass) {
+                fill_block(state, ref_block, curr_block, 0);
+            } else {
+                fill_block(state, ref_block, curr_block, 1);
+            }
+        }
+    }
+}