crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian

The change to encrypt a fifth ChaCha block using scalar instructions
caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests
to start failing on big endian arm64 kernels.  The bug is that the
keystream block produced in 32-bit scalar registers is directly XOR'd
with the data words, which are loaded and stored in native endianness.
Thus in big endian mode the data bytes end up XOR'd with the wrong
bytes.  Fix it by byte-swapping the keystream words in big endian mode.

Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

 arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 021bb9e..bfb80e1 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
    add     v3.4s, v3.4s, v19.4s
      add       a2, a2, w8
      add       a3, a3, w9
+CPU_BE(      rev       a0, a0      )
+CPU_BE(      rev       a1, a1      )
+CPU_BE(      rev       a2, a2      )
+CPU_BE(      rev       a3, a3      )

    ld4r        {v24.4s-v27.4s}, [x0], #16
    ld4r        {v28.4s-v31.4s}, [x0]
@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
    add     v7.4s, v7.4s, v23.4s
      add       a6, a6, w8
      add       a7, a7, w9
+CPU_BE(      rev       a4, a4      )
+CPU_BE(      rev       a5, a5      )
+CPU_BE(      rev       a6, a6      )
+CPU_BE(      rev       a7, a7      )

    // x8[0-3] += s2[0]
    // x9[0-3] += s2[1]
@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
    add     v11.4s, v11.4s, v27.4s
      add       a10, a10, w8
      add       a11, a11, w9
+CPU_BE(      rev       a8, a8      )
+CPU_BE(      rev       a9, a9      )
+CPU_BE(      rev       a10, a10    )
+CPU_BE(      rev       a11, a11    )

    // x12[0-3] += s3[0]
    // x13[0-3] += s3[1]
@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
    add     v15.4s, v15.4s, v31.4s
      add       a14, a14, w8
      add       a15, a15, w9
+CPU_BE(      rev       a12, a12    )
+CPU_BE(      rev       a13, a13    )
+CPU_BE(      rev       a14, a14    )
+CPU_BE(      rev       a15, a15    )

    // interleave 32-bit words in state n, n+1
      ldp       w6, w7, [x2], #64

