// The MIT License (MIT)
//
// Copyright (C) 2016-2017 Vivint, Inc.
// Copyright (c) 2015 Klaus Post
// Copyright (c) 2015 Backblaze
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

/*
The corresponding C implementations:

void addmul(
	uint8_t * restrict lowhigh,
	uint8_t * restrict in,
	uint8_t * restrict out,
	int n
) {
	for(int i = 0; i < n; i++){
		int value = in[i];
		int low = value & 15;
		int high = value >> 4;
		out[i] = out[i] ^ lowhigh[low] ^ lowhigh[high+16];
	}
}
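
For example (illustrative values, not from the original): with in[i] = 0x3A,
low = 0xA and high = 0x3, so the update is
out[i] ^= lowhigh[0x0A] ^ lowhigh[0x03 + 16]. Multiplication by a constant in
GF(2^8) is linear, so c*0x3A = c*0x0A ^ c*0x30, which is exactly what the two
16-entry tables store; this split is what lets PSHUFB act as the table lookup
in the vector code below.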

void addmulSSSE3(
	uint8_t * restrict lowhigh,
	uint8_t * restrict in,
	uint8_t * restrict out,
	int n
) {
	int i = 0;

	__m128i lotbl = _mm_loadu_si128((__m128i*)(&lowhigh[0]));
	__m128i hitbl = _mm_loadu_si128((__m128i*)(&lowhigh[16]));

	__m128i lomask = _mm_set1_epi8(0xF);

	#pragma nounroll
	for(i = 0; i < (n/16)*16; i += 16){
		__m128i input8  = _mm_loadu_si128((__m128i*)(&in[i]));
		__m128i output8 = _mm_loadu_si128((__m128i*)(&out[i]));

		__m128i lo8 = _mm_and_si128(lomask, input8);
		__m128i hi8 = _mm_and_si128(lomask, _mm_srli_epi64(input8, 4)); // simulate the missing _mm_srli_epi8

		output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(lotbl, lo8));
		output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(hitbl, hi8));

		_mm_storeu_si128((__m128i*)(&out[i]), output8);
	}
}
*/
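
// A minimal sketch (an assumption, not part of this file) of the companion Go
// declarations that would link these bodies in; the fifth addmulSSSE3
// parameter is reconstructed from the slow path below, which indexes a full
// 256-entry product table at argument offset 32:
//
//	//go:noescape
//	func addmulSSSE3(lowhigh *[2][16]byte, in, out *byte, n int, multiply *[256]byte)
//
//	//go:noescape
//	func addmulAVX2(lowhigh *[2][16]byte, in, out *byte, n int)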

#include "textflag.h"

DATA nybble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nybble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nybble_mask<>+0x10(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nybble_mask<>+0x18(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nybble_mask<>(SB), (NOPTR+RODATA), $32
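
// nybble_mask is 32 bytes of 0x0F. The SSSE3 path below loads only its first
// 16 bytes with MOVOU, while the AVX2 path builds its mask with VPBROADCASTB
// instead; that the table is sized for a full 32-byte load is an inference,
// not something the file explains.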

#define LOWHIGH DI
#define LOW X8
#define HIGH X9
#define IN SI
#define OUT DX
#define INDEX AX

#define LEN CX
#define LEN16 R8 // LEN16 = (LEN / 16) * 16

#define LOMASK X7 // LOMASK = 15 repeated in every byte
// X0-X5 temps

// func addmulSSSE3(lowhigh *[2][16]byte, in, out *byte, len int, multiply *[256]byte)
TEXT ·addmulSSSE3(SB), 7, $0
	MOVQ _in+8(FP), IN
	MOVQ _out+16(FP), OUT
	MOVQ _len+24(FP), LEN

	MOVQ LEN, LEN16
	ANDQ $-16, LEN16 // round LEN down to a multiple of 16; sets flags for JLE

	JLE start_slow // if LEN16 <= 0, no full 16-byte block: skip to the byte loop

	MOVQ _lowhigh+0(FP), LOWHIGH
	MOVOU (LOWHIGH), LOW
	MOVOU 16(LOWHIGH), HIGH

	MOVOU nybble_mask<>(SB), LOMASK
	XORQ INDEX, INDEX // INDEX = 0

loop16:
	MOVOU (IN)(INDEX*1), X0  // X0 = in[INDEX:INDEX+16]
	MOVOU LOW, X4            // X4 = copy(LOW)
	MOVOU (OUT)(INDEX*1), X2 // X2 = out[INDEX:INDEX+16]
	MOVOU X0, X1             // X1 = copy(X0)
	MOVOU HIGH, X5           // X5 = copy(HIGH)

	PAND LOMASK, X0 // X0 = input & 15 (low nibbles)
	PSRLQ $4, X1    // X1 = input >> 4 per 64-bit lane; stray bits masked next
	PSHUFB X0, X4   // X4 = LOW[X0]

	PAND LOMASK, X1 // X1 = (input >> 4) & 15 (high nibbles)
	PSHUFB X1, X5   // X5 = HIGH[X1]
	PXOR X4, X2     // X2 = out ^ X4
	PXOR X5, X2     // X2 = out ^ X4 ^ X5

	MOVOU X2, 0(OUT)(INDEX*1)

	ADDQ $16, INDEX
	CMPQ LEN16, INDEX // while INDEX < LEN16
	JG loop16

start_slow:
	MOVQ _multiply+32(FP), LOWHIGH // reuse LOWHIGH for the 256-byte product table
	MOVQ LEN16, INDEX              // resume where loop16 stopped
	CMPQ LEN, INDEX
	JLE done

loop1:
	MOVBQZX (IN)(INDEX*1), R9    // R9 = in[INDEX]
	MOVBQZX (LOWHIGH)(R9*1), R10 // R10 = multiply[R9]
	XORB R10B, (OUT)(INDEX*1)    // out[INDEX] ^= R10
	INCQ INDEX
	CMPQ LEN, INDEX // while INDEX < LEN
	JG loop1

done:
	RET

#undef LOWHIGH
#undef LOW
#undef HIGH
#undef IN
#undef OUT
#undef LEN
#undef INDEX
#undef LEN16
#undef LOMASK

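// Note: the AVX2 instructions below are emitted as raw LONG/BYTE encodings,
// with the intended mnemonic kept in the comment. This is the usual
// workaround for Go assemblers that predate AVX2 mnemonic support (an
// inference; the original file does not state the reason).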
// func addmulAVX2(lowhigh *[2][16]byte, in, out *byte, len int)
TEXT ·addmulAVX2(SB), 7, $0
	MOVQ lowhigh+0(FP), SI // SI: &lowhigh
	MOVOU (SI), X6         // X6: low nibble table
	MOVOU 16(SI), X7       // X7: high nibble table

	MOVQ $15, BX // BX: low mask
	MOVQ BX, X5

	MOVQ len+24(FP), R9 // R9: len(in), len(out)

	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; duplicate low table into both lanes
	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; duplicate high table into both lanes
	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; YMM8: low mask in every byte

	SHRQ $5, R9         // R9 = len / 32 (number of 32-byte blocks)
	MOVQ out+16(FP), DX // DX: &out
	MOVQ in+8(FP), SI   // SI: &in
	TESTQ R9, R9
	JZ done_xor_avx2 // nothing to do for len < 32

loopback_xor_avx2:
	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]      ; Y0: input
	LONG $0x226ffec5             // VMOVDQU YMM4, [rdx]      ; Y4: output
	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4     ; Y1: input >> 4
	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8   ; Y0: low nibbles
	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8   ; Y1: high nibbles
	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; Y2: mul low part
	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; Y3: mul high part
	LONG $0xdbefedc5             // VPXOR YMM3, YMM2, YMM3   ; Y3: product
	LONG $0xe4efe5c5             // VPXOR YMM4, YMM3, YMM4   ; Y4: output ^ product
	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4

	ADDQ $32, SI // in += 32
	ADDQ $32, DX // out += 32
	SUBQ $1, R9
	JNZ loopback_xor_avx2

done_xor_avx2:
	// VZEROUPPER: avoid AVX-SSE transition penalties in the caller
	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
	RET
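
/*
For cross-checking (a sketch, not part of the original package), a pure-Go
equivalent of the scalar addmul above:

	func addmulGeneric(lowhigh *[2][16]byte, in, out []byte) {
		for i := range in {
			v := in[i]
			out[i] ^= lowhigh[0][v&15] ^ lowhigh[1][v>>4]
		}
	}
*/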