249 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			249 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| // Copyright (c) 2017 Minio Inc. All rights reserved.
 | |
| // Use of this source code is governed by a license that can be
 | |
| // found in the LICENSE file.
 | |
| 
 | |
| // +build amd64,!gccgo,!appengine,!nacl,!noasm
 | |
| 
 | |
| #include "textflag.h"
 | |
| 
 | |
// consAVX2 is a 64-byte read-only table of eight 64-bit constants.
// initializeAVX2 loads bytes 0-31 into Y3 and bytes 32-63 into Y4.
DATA ·consAVX2<>+0(SB)/8, $0xdbe6d5d5fe4cce2f
DATA ·consAVX2<>+8(SB)/8, $0xa4093822299f31d0
DATA ·consAVX2<>+16(SB)/8, $0x13198a2e03707344
DATA ·consAVX2<>+24(SB)/8, $0x243f6a8885a308d3
DATA ·consAVX2<>+32(SB)/8, $0x3bd39e10cb0ef593
DATA ·consAVX2<>+40(SB)/8, $0xc0acf169b5f18a8c
DATA ·consAVX2<>+48(SB)/8, $0xbe5466cf34e90c6c
DATA ·consAVX2<>+56(SB)/8, $0x452821e638d01377
GLOBL ·consAVX2<>(SB), RODATA|NOPTR, $64
 | |
| 
 | |
// zipperMergeAVX2 is the 32-byte VPSHUFB control mask that updateAVX2 and
// finalizeAVX2 load into Y5 for use by the UPDATE macro. The same 16-byte
// pattern is stored twice, once for each 128-bit lane of the YMM register.
DATA ·zipperMergeAVX2<>+0(SB)/8, $0x000f010e05020c03
DATA ·zipperMergeAVX2<>+8(SB)/8, $0x070806090d0a040b
DATA ·zipperMergeAVX2<>+16(SB)/8, $0x000f010e05020c03
DATA ·zipperMergeAVX2<>+24(SB)/8, $0x070806090d0a040b
GLOBL ·zipperMergeAVX2<>(SB), RODATA|NOPTR, $32
 | |
| 
 | |
// REDUCE_MOD folds the four 64-bit words x0..x3 into the two words y0, y1:
//
//   x3 &= 0x3FFFFFFFFFFFFFFF
//   y0  = x0 ^ (x2 << 1) ^ (x2 << 2)
//   y1  = x1 ^ ((x3 << 1) | (x2 >> 63)) ^ ((x3 << 2) | (x2 >> 62))
//
// x0 and x1 are only read. x2 and x3 are left holding their 1-bit-shifted
// values. tmp0 and tmp1 are scratch registers. All eight arguments must be
// distinct general-purpose registers.
#define REDUCE_MOD(x0, x1, x2, x3, tmp0, tmp1, y0, y1) \
	MOVQ $0x3FFFFFFFFFFFFFFF, tmp0 \
	ANDQ tmp0, x3                  \ // clear the two top bits of x3
	MOVQ x2, y0                    \ // unshifted copies feed the <<2 terms
	MOVQ x3, y1                    \
	MOVQ x2, tmp0                  \
	SHRQ $63, tmp0                 \ // bit that x2<<1 carries into x3
	MOVQ x2, tmp1                  \
	SHRQ $62, tmp1                 \ // bits that x2<<2 carries into x3
	SHLQ $1, x3                    \
	ORQ  tmp0, x3                  \ // x3 = (x3 << 1) | (x2 >> 63)
	SHLQ $1, x2                    \
	SHLQ $2, y1                    \
	ORQ  tmp1, y1                  \ // y1 = (x3 << 2) | (x2 >> 62)
	SHLQ $2, y0                    \
	XORQ x2, y0                    \
	XORQ x0, y0                    \
	XORQ x3, y1                    \
	XORQ x1, y1
 | |
| 
 | |
// UPDATE mixes the 32-byte vector msg into the four state vectors Y1..Y4.
// Y5 must hold the VPSHUFB control mask; Y0 is clobbered as scratch. msg may
// be Y0 itself, because it is consumed before Y0 is first overwritten.
//
// Per 64-bit lane (lo32 = low 32 bits, shuf = VPSHUFB with mask Y5):
//   Y2 += msg + Y3
//   Y3 ^= (Y1 >> 32) * lo32(Y2)
//   Y1 += Y4
//   Y4 ^= (Y2 >> 32) * lo32(Y1)
//   Y1 += shuf(Y2)
//   Y2 += shuf(Y1)
//
// VPMULUDQ is emitted as raw bytes, as in the original encoding of this macro.
#define UPDATE(msg) \
	VPADDQ  Y3, Y2, Y2                                \
	VPADDQ  msg, Y2, Y2                               \
	VPSRLQ  $32, Y1, Y0                               \
	BYTE    $0xC5; BYTE $0xFD; BYTE $0xF4; BYTE $0xC2 \ // VPMULUDQ Y2, Y0, Y0
	VPADDQ  Y4, Y1, Y1                                \
	VPXOR   Y0, Y3, Y3                                \
	VPSRLQ  $32, Y2, Y0                               \
	BYTE    $0xC5; BYTE $0xFD; BYTE $0xF4; BYTE $0xC1 \ // VPMULUDQ Y1, Y0, Y0
	VPXOR   Y0, Y4, Y4                                \
	VPSHUFB Y5, Y2, Y0                                \
	VPADDQ  Y0, Y1, Y1                                \
	VPSHUFB Y5, Y1, Y0                                \
	VPADDQ  Y0, Y2, Y2
 | |
| 
 | |
// func initializeAVX2(state *[16]uint64, key []byte)
//
// Fills the 128-byte state from the first 32 bytes of key and the consAVX2
// table (indices are 64-bit words):
//
//   state[0:4]   = key ^ consAVX2[0:4]
//   state[4:8]   = swap32(key) ^ consAVX2[4:8]
//   state[8:12]  = consAVX2[0:4]
//   state[12:16] = consAVX2[4:8]
//
// swap32 exchanges the two 32-bit halves of every 64-bit word. The length of
// key is not checked here; 32 bytes are read unconditionally.
TEXT ·initializeAVX2(SB), NOSPLIT, $0-32
	MOVQ state+0(FP), AX
	MOVQ key_base+8(FP), BX
	LEAQ ·consAVX2<>(SB), CX

	VMOVDQU 0(CX), Y3
	VMOVDQU 32(CX), Y4

	VMOVDQU 0(BX), Y1
	VPSHUFD $177, Y1, Y2 // 177 = 0b10_11_00_01: swap adjacent 32-bit elements

	VPXOR Y3, Y1, Y1
	VPXOR Y4, Y2, Y2

	VMOVDQU Y1, 0(AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)

	VZEROUPPER
	RET
 | |
| 
 | |
// func updateAVX2(state *[16]uint64, msg []byte)
//
// Absorbs msg into state, one 32-byte block per UPDATE round.
//
// Only whole 32-byte blocks are consumed. The length is rounded down to a
// multiple of 32 before the loop, so a trailing partial block is ignored and
// the loop never reads past the end of msg. If msg contains no whole block,
// state is left untouched and no AVX instruction is executed.
//
// Register use inside the loop:
//   AX = state, BX = current block, CX = bytes still to process
//   Y1..Y4 = the four 32-byte quarters of state, Y5 = VPSHUFB mask, Y0 = block
TEXT ·updateAVX2(SB), NOSPLIT, $0-32
	MOVQ state+0(FP), AX
	MOVQ msg_base+8(FP), BX
	MOVQ msg_len+16(FP), CX

	ANDQ $~31, CX // keep whole 32-byte blocks only
	JZ   done     // fewer than 32 bytes: nothing to do

	VMOVDQU 0(AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU 64(AX), Y3
	VMOVDQU 96(AX), Y4

	VMOVDQU ·zipperMergeAVX2<>(SB), Y5

loop:
	VMOVDQU 0(BX), Y0
	UPDATE(Y0)

	ADDQ $32, BX
	SUBQ $32, CX // CX is a multiple of 32, so this reaches exactly zero
	JA   loop

	VMOVDQU Y1, 0(AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	VZEROUPPER

done:
	RET
 | |
| 
 | |
// PERMUTE_UPDATE runs one UPDATE round whose input is derived from Y1: the two
// 128-bit halves of Y1 are exchanged (VPERM2I128 $1) and the 32-bit halves of
// every 64-bit word are swapped (VPSHUFD $177). Clobbers Y0.
#define PERMUTE_UPDATE \
	VPERM2I128 $1, Y1, Y1, Y0 \
	VPSHUFD    $177, Y0, Y0   \
	UPDATE(Y0)

// func finalizeAVX2(out []byte, state *[16]uint64)
//
// Runs the final permute-and-update rounds on state, writes the resulting
// vectors back to state, and then derives the digest into out. len(out)
// selects the digest size:
//
//   len(out) == 8   4 rounds,  8 bytes written
//   len(out) == 16  6 rounds, 16 bytes written
//   otherwise      10 rounds, 32 bytes written
//
// Any length other than 8 or 16 is treated as a 32-byte buffer without
// further checking. state is overwritten.
TEXT ·finalizeAVX2(SB), NOSPLIT, $0-32
	MOVQ out_base+0(FP), BX
	MOVQ out_len+8(FP), CX
	MOVQ state+24(FP), AX

	VMOVDQU ·zipperMergeAVX2<>(SB), Y5

	VMOVDQU 0(AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU 64(AX), Y3
	VMOVDQU 96(AX), Y4

	// Rounds 1-4: performed for every digest size.
	PERMUTE_UPDATE
	PERMUTE_UPDATE
	PERMUTE_UPDATE
	PERMUTE_UPDATE

	CMPQ CX, $8
	JE   store // 64-bit checksum stops after 4 rounds

	// Rounds 5-6.
	PERMUTE_UPDATE
	PERMUTE_UPDATE

	CMPQ CX, $16
	JE   store // 128-bit checksum stops after 6 rounds

	// Rounds 7-10: 256-bit checksum only.
	PERMUTE_UPDATE
	PERMUTE_UPDATE
	PERMUTE_UPDATE
	PERMUTE_UPDATE

store:
	// Spill the vectors so the scalar code below can address single words.
	VMOVDQU Y1, 0(AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	VZEROUPPER

	CMPQ CX, $8
	JE   sum64
	CMPQ CX, $16
	JE   sum128

	// 256-bit checksum: two reductions, each producing 16 output bytes.
	// First half uses state words 0,1,4,5 plus words 8,9,12,13.
	MOVQ 0*8(AX), R8
	ADDQ 8*8(AX), R8
	MOVQ 1*8(AX), R9
	ADDQ 9*8(AX), R9
	MOVQ 4*8(AX), R10
	ADDQ 12*8(AX), R10
	MOVQ 5*8(AX), R11
	ADDQ 13*8(AX), R11

	REDUCE_MOD(R8, R9, R10, R11, R12, R13, R14, R15)
	MOVQ R14, 0(BX)
	MOVQ R15, 8(BX)

	// Second half uses state words 2,3,6,7 plus words 10,11,14,15.
	MOVQ 2*8(AX), R8
	ADDQ 10*8(AX), R8
	MOVQ 3*8(AX), R9
	ADDQ 11*8(AX), R9
	MOVQ 6*8(AX), R10
	ADDQ 14*8(AX), R10
	MOVQ 7*8(AX), R11
	ADDQ 15*8(AX), R11

	REDUCE_MOD(R8, R9, R10, R11, R12, R13, R14, R15)
	MOVQ R14, 16(BX)
	MOVQ R15, 24(BX)
	RET

sum128:
	// out[0:8]  = state[0] + state[6] + state[8] + state[14]
	// out[8:16] = state[1] + state[7] + state[9] + state[15]
	MOVQ 0*8(AX), R8
	ADDQ 6*8(AX), R8
	ADDQ 8*8(AX), R8
	ADDQ 14*8(AX), R8
	MOVQ 1*8(AX), R9
	ADDQ 7*8(AX), R9
	ADDQ 9*8(AX), R9
	ADDQ 15*8(AX), R9
	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)
	RET

sum64:
	// out[0:8] = state[0] + state[4] + state[8] + state[12]
	MOVQ 0*8(AX), DX
	ADDQ 4*8(AX), DX
	ADDQ 8*8(AX), DX
	ADDQ 12*8(AX), DX
	MOVQ DX, 0(BX)
	RET
 | |
| 
 |