295 lines
5.8 KiB
ArmAsm
295 lines
5.8 KiB
ArmAsm
// Copyright (c) 2017 Minio Inc. All rights reserved.
|
|
// Use of this source code is governed by a license that can be
|
|
// found in the LICENSE file.
|
|
|
|
// +build amd64 !gccgo !appengine !nacl
|
|
|
|
#include "textflag.h"
|
|
|
|
DATA ·asmConstants<>+0x00(SB)/8, $0xdbe6d5d5fe4cce2f
|
|
DATA ·asmConstants<>+0x08(SB)/8, $0xa4093822299f31d0
|
|
DATA ·asmConstants<>+0x10(SB)/8, $0x13198a2e03707344
|
|
DATA ·asmConstants<>+0x18(SB)/8, $0x243f6a8885a308d3
|
|
DATA ·asmConstants<>+0x20(SB)/8, $0x3bd39e10cb0ef593
|
|
DATA ·asmConstants<>+0x28(SB)/8, $0xc0acf169b5f18a8c
|
|
DATA ·asmConstants<>+0x30(SB)/8, $0xbe5466cf34e90c6c
|
|
DATA ·asmConstants<>+0x38(SB)/8, $0x452821e638d01377
|
|
GLOBL ·asmConstants<>(SB), (NOPTR+RODATA), $64
|
|
|
|
DATA ·asmZipperMerge<>+0x00(SB)/8, $0xf010e05020c03
|
|
DATA ·asmZipperMerge<>+0x08(SB)/8, $0x70806090d0a040b
|
|
GLOBL ·asmZipperMerge<>(SB), (NOPTR+RODATA), $16
|
|
|
|
#define v00 X0
|
|
#define v01 X1
|
|
#define v10 X2
|
|
#define v11 X3
|
|
#define m00 X4
|
|
#define m01 X5
|
|
#define m10 X6
|
|
#define m11 X7
|
|
|
|
#define t0 X8
|
|
#define t1 X9
|
|
#define t2 X10
|
|
|
|
#define REDUCE_MOD(x0, x1, x2, x3, tmp0, tmp1, y0, y1) \
|
|
MOVQ $0x3FFFFFFFFFFFFFFF, tmp0 \
|
|
ANDQ tmp0, x3 \
|
|
MOVQ x2, y0 \
|
|
MOVQ x3, y1 \
|
|
\
|
|
MOVQ x2, tmp0 \
|
|
MOVQ x3, tmp1 \
|
|
SHLQ $1, tmp1 \
|
|
SHRQ $63, tmp0 \
|
|
MOVQ tmp1, x3 \
|
|
ORQ tmp0, x3 \
|
|
\
|
|
SHLQ $1, x2 \
|
|
\
|
|
MOVQ y0, tmp0 \
|
|
MOVQ y1, tmp1 \
|
|
SHLQ $2, tmp1 \
|
|
SHRQ $62, tmp0 \
|
|
MOVQ tmp1, y1 \
|
|
ORQ tmp0, y1 \
|
|
\
|
|
SHLQ $2, y0 \
|
|
\
|
|
XORQ x0, y0 \
|
|
XORQ x2, y0 \
|
|
XORQ x1, y1 \
|
|
XORQ x3, y1
|
|
|
|
#define UPDATE(msg0, msg1) \
|
|
PADDQ msg0, v10 \
|
|
PADDQ m00, v10 \
|
|
PADDQ msg1, v11 \
|
|
PADDQ m01, v11 \
|
|
\
|
|
MOVO v00, t0 \
|
|
MOVO v01, t1 \
|
|
PSRLQ $32, t0 \
|
|
PSRLQ $32, t1 \
|
|
PMULULQ v10, t0 \
|
|
PMULULQ v11, t1 \
|
|
PXOR t0, m00 \
|
|
PXOR t1, m01 \
|
|
\
|
|
PADDQ m10, v00 \
|
|
PADDQ m11, v01 \
|
|
\
|
|
MOVO v10, t0 \
|
|
MOVO v11, t1 \
|
|
PSRLQ $32, t0 \
|
|
PSRLQ $32, t1 \
|
|
PMULULQ v00, t0 \
|
|
PMULULQ v01, t1 \
|
|
PXOR t0, m10 \
|
|
PXOR t1, m11 \
|
|
\
|
|
MOVO v10, t0 \
|
|
PSHUFB t2, t0 \
|
|
MOVO v11, t1 \
|
|
PSHUFB t2, t1 \
|
|
PADDQ t0, v00 \
|
|
PADDQ t1, v01 \
|
|
\
|
|
MOVO v00, t0 \
|
|
PSHUFB t2, t0 \
|
|
MOVO v01, t1 \
|
|
PSHUFB t2, t1 \
|
|
PADDQ t0, v10 \
|
|
PADDQ t1, v11
|
|
|
|
// func initializeSSE4(state *[16]uint64, key []byte)
|
|
TEXT ·initializeSSE4(SB), NOSPLIT, $0-32
|
|
MOVQ state+0(FP), AX
|
|
MOVQ key_base+8(FP), BX
|
|
MOVQ $·asmConstants<>(SB), CX
|
|
|
|
MOVOU 0(BX), v00
|
|
MOVOU 16(BX), v01
|
|
|
|
PSHUFD $177, v00, v10
|
|
PSHUFD $177, v01, v11
|
|
|
|
MOVOU 0(CX), m00
|
|
MOVOU 16(CX), m01
|
|
MOVOU 32(CX), m10
|
|
MOVOU 48(CX), m11
|
|
|
|
PXOR m00, v00
|
|
PXOR m01, v01
|
|
PXOR m10, v10
|
|
PXOR m11, v11
|
|
|
|
MOVOU v00, 0(AX)
|
|
MOVOU v01, 16(AX)
|
|
MOVOU v10, 32(AX)
|
|
MOVOU v11, 48(AX)
|
|
MOVOU m00, 64(AX)
|
|
MOVOU m01, 80(AX)
|
|
MOVOU m10, 96(AX)
|
|
MOVOU m11, 112(AX)
|
|
RET
|
|
|
|
// func updateSSE4(state *[16]uint64, msg []byte)
|
|
TEXT ·updateSSE4(SB), NOSPLIT, $0-32
|
|
MOVQ state+0(FP), AX
|
|
MOVQ msg_base+8(FP), BX
|
|
MOVQ msg_len+16(FP), CX
|
|
|
|
CMPQ CX, $32
|
|
JB DONE
|
|
|
|
MOVOU 0(AX), v00
|
|
MOVOU 16(AX), v01
|
|
MOVOU 32(AX), v10
|
|
MOVOU 48(AX), v11
|
|
MOVOU 64(AX), m00
|
|
MOVOU 80(AX), m01
|
|
MOVOU 96(AX), m10
|
|
MOVOU 112(AX), m11
|
|
|
|
MOVOU ·asmZipperMerge<>(SB), t2
|
|
|
|
LOOP:
|
|
MOVOU 0(BX), t0
|
|
MOVOU 16(BX), t1
|
|
|
|
UPDATE(t0, t1)
|
|
|
|
ADDQ $32, BX
|
|
SUBQ $32, CX
|
|
JA LOOP
|
|
|
|
MOVOU v00, 0(AX)
|
|
MOVOU v01, 16(AX)
|
|
MOVOU v10, 32(AX)
|
|
MOVOU v11, 48(AX)
|
|
MOVOU m00, 64(AX)
|
|
MOVOU m01, 80(AX)
|
|
MOVOU m10, 96(AX)
|
|
MOVOU m11, 112(AX)
|
|
|
|
DONE:
|
|
RET
|
|
|
|
// func finalizeSSE4(out []byte, state *[16]uint64)
|
|
TEXT ·finalizeSSE4(SB), NOSPLIT, $0-32
|
|
MOVQ state+24(FP), AX
|
|
MOVQ out_base+0(FP), BX
|
|
MOVQ out_len+8(FP), CX
|
|
|
|
MOVOU 0(AX), v00
|
|
MOVOU 16(AX), v01
|
|
MOVOU 32(AX), v10
|
|
MOVOU 48(AX), v11
|
|
MOVOU 64(AX), m00
|
|
MOVOU 80(AX), m01
|
|
MOVOU 96(AX), m10
|
|
MOVOU 112(AX), m11
|
|
|
|
MOVOU ·asmZipperMerge<>(SB), t2
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
CMPQ CX, $8
|
|
JE skipUpdate // Just 4 rounds for 64-bit checksum
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
CMPQ CX, $16
|
|
JE skipUpdate // 6 rounds for 128-bit checksum
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
PSHUFD $177, v01, t0
|
|
PSHUFD $177, v00, t1
|
|
UPDATE(t0, t1)
|
|
|
|
skipUpdate:
|
|
MOVOU v00, 0(AX)
|
|
MOVOU v01, 16(AX)
|
|
MOVOU v10, 32(AX)
|
|
MOVOU v11, 48(AX)
|
|
MOVOU m00, 64(AX)
|
|
MOVOU m01, 80(AX)
|
|
MOVOU m10, 96(AX)
|
|
MOVOU m11, 112(AX)
|
|
|
|
CMPQ CX, $8
|
|
JE hash64
|
|
CMPQ CX, $16
|
|
JE hash128
|
|
|
|
// 256-bit checksum
|
|
PADDQ v00, m00
|
|
PADDQ v10, m10
|
|
PADDQ v01, m01
|
|
PADDQ v11, m11
|
|
|
|
MOVQ m00, R8
|
|
PEXTRQ $1, m00, R9
|
|
MOVQ m10, R10
|
|
PEXTRQ $1, m10, R11
|
|
REDUCE_MOD(R8, R9, R10, R11, R12, R13, R14, R15)
|
|
MOVQ R14, 0(BX)
|
|
MOVQ R15, 8(BX)
|
|
|
|
MOVQ m01, R8
|
|
PEXTRQ $1, m01, R9
|
|
MOVQ m11, R10
|
|
PEXTRQ $1, m11, R11
|
|
REDUCE_MOD(R8, R9, R10, R11, R12, R13, R14, R15)
|
|
MOVQ R14, 16(BX)
|
|
MOVQ R15, 24(BX)
|
|
RET
|
|
|
|
hash128:
|
|
PADDQ v00, v11
|
|
PADDQ m00, m11
|
|
PADDQ v11, m11
|
|
MOVOU m11, 0(BX)
|
|
RET
|
|
|
|
hash64:
|
|
PADDQ v00, v10
|
|
PADDQ m00, m10
|
|
PADDQ v10, m10
|
|
MOVQ m10, DX
|
|
MOVQ DX, 0(BX)
|
|
RET
|