rclone/vendor/github.com/vivint/infectious/addmul_amd64.s
2020-05-12 15:56:50 +00:00

200 lines
5.7 KiB
ArmAsm

// The MIT License (MIT)
//
// Copyright (C) 2016-2017 Vivint, Inc.
// Copyright (c) 2015 Klaus Post
// Copyright (c) 2015 Backblaze
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/*
The corresponding C implementations:
void addmul(
uint8_t * restrict lowhigh,
uint8_t * restrict in,
uint8_t * restrict out,
int n
) {
for(int i = 0; i < n; i++){
int value = in[i];
int low = value & 15;
int high = value >> 4;
out[i] = out[i] ^ lowhigh[low] ^ lowhigh[high+16];
}
}
void addmulSSSE3(
uint8_t * restrict lowhigh,
uint8_t * restrict in,
uint8_t * restrict out,
int n
) {
int i = 0;
__m128i lotbl = _mm_loadu_si128((__m128i*)(&lowhigh[0]));
__m128i hitbl = _mm_loadu_si128((__m128i*)(&lowhigh[16]));
__m128i lomask = _mm_set1_epi8(0xF);
#pragma nounroll
for(i = 0; i < (n/16)*16; i += 16){
__m128i input8 = _mm_loadu_si128((__m128i*)(&in[i]));
__m128i output8 = _mm_loadu_si128((__m128i*)(&out[i]));
__m128i lo8 = _mm_and_si128(lomask, input8);
__m128i hi8 = _mm_and_si128(lomask, _mm_srli_epi64(input8, 4)); // simulate srli_epi8: shift 64-bit lanes by 4 bits, then mask off bits that crossed byte boundaries
output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(lotbl, lo8));
output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(hitbl, hi8));
_mm_storeu_si128((__m128i*)(&out[i]), output8);
}
}
*/
#include "textflag.h"
// nybble_mask<> is a file-local, read-only constant of 32 bytes, each 0x0F.
// ANDing a vector with it keeps only the low 4 bits (one nibble) of every byte.
// addmulSSSE3 loads the first 16 bytes of it with MOVOU.
DATA nybble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nybble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nybble_mask<>+0x10(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nybble_mask<>+0x18(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nybble_mask<>(SB), (NOPTR+RODATA), $32
// Register aliases used only by addmulSSSE3. Every alias is #undef'd again
// directly after the function body.
#define LOWHIGH DI // table pointer: lowhigh in the vector loop, byte table in the tail loop
#define LOW     X8 // first 16 bytes at lowhigh: lookup table for the low nibble
#define HIGH    X9 // next 16 bytes at lowhigh: lookup table for the high nibble
#define IN      SI // input pointer
#define OUT     DX // output pointer (read-modify-write)
#define INDEX   AX // byte offset into IN and OUT
#define LEN     CX // total byte count
#define LEN16   R8 // LEN16 = (LEN / 16) * 16
#define LOMASK  X7 // LOMASK = repeated 15
// X0-X5 temps

// func addmulSSSE3(lowhigh *[2][16]byte, in, out *byte, len int)
//
// Vector loop, 16 bytes per iteration, for INDEX in [0, LEN16):
//     out[i] ^= LOW[in[i] & 15] ^ HIGH[in[i] >> 4]
// Tail loop, one byte per iteration, for INDEX in [LEN16, LEN):
//     out[i] ^= T[in[i]]   where T is the pointer stored at frame offset 32
//
// NOTE(review): the signature comment above lists four argument words
// (offsets 0, 8, 16, 24), yet the tail loop reads a fifth word at offset 32 and
// indexes it with a full byte value. Presumably the Go declaration carries an
// extra pointer to a 256-entry multiplication table - confirm against the Go
// stub and update the signature comment accordingly.
//
// TEXT flag 7 is NOPROF|DUPOK|NOSPLIT from textflag.h; the frame size is 0.
TEXT ·addmulSSSE3(SB), 7, $0
	MOVQ _in+8(FP), IN
	MOVQ _out+16(FP), OUT
	MOVQ _len+24(FP), LEN

	MOVQ LEN, LEN16
	ANDQ $-16, LEN16               // round LEN down to a multiple of 16; also sets flags

	JLE  start_slow                // if LEN16 <= 0 { skip the vector loop }

	MOVQ  _lohi+0(FP), LOWHIGH
	MOVOU (LOWHIGH), LOW
	MOVOU 16(LOWHIGH), HIGH

	MOVOU nybble_mask<>(SB), LOMASK
	XORQ  INDEX, INDEX             // INDEX = 0

loop16:
	MOVOU (IN)(INDEX*1), X0        // X0 = in[INDEX : INDEX+16]
	MOVOU LOW, X4                  // X4 = copy(LOW); PSHUFB overwrites its destination
	MOVOU (OUT)(INDEX*1), X2       // X2 = out[INDEX : INDEX+16]
	MOVOU X0, X1                   // X1 = copy of the input bytes
	MOVOU HIGH, X5                 // X5 = copy(HIGH)

	PAND   LOMASK, X0              // X0 = input & 15 (low nibbles)
	PSRLQ  $4, X1                  // 64-bit lane shift; bits crossing byte boundaries are masked off below
	PSHUFB X0, X4                  // X4 = LOW[X0]

	PAND   LOMASK, X1              // X1 = input >> 4 (high nibbles)
	PSHUFB X1, X5                  // X5 = HIGH[X1]
	PXOR   X4, X2                  // X2 = out ^ X4 ^ X5
	PXOR   X5, X2

	MOVOU X2, 0(OUT)(INDEX*1)

	ADDQ $16, INDEX
	CMPQ LEN16, INDEX              // loop while INDEX < LEN16
	JG   loop16

start_slow:
	MOVQ _mul+32(FP), LOWHIGH      // LOWHIGH now points at the byte-indexed table (see NOTE above)
	MOVQ LEN16, INDEX
	CMPQ LEN, INDEX
	JLE  done                      // nothing left when LEN <= LEN16

loop1:
	MOVBQZX (IN)(INDEX*1), R9      // R9 := in[index]
	MOVBQZX (LOWHIGH)(R9*1), R10   // R10 := multiply[R9]
	XORB    R10B, (OUT)(INDEX*1)   // out[index] ^= R10
	INCQ    INDEX
	CMPQ    LEN, INDEX             // loop while INDEX < LEN
	JG      loop1

done:
	RET

#undef LOWHIGH
#undef LOW
#undef HIGH
#undef IN
#undef OUT
#undef LEN
#undef INDEX
#undef LEN16
#undef LOMASK
// func addmulAVX2(lowhigh *[2][16]byte, in, out *byte, len int)
//
// For each complete 32-byte block of the input:
//     out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
// where low/high are the two 16-byte tables at lowhigh. Only len/32 blocks are
// processed; the remaining len%32 bytes are not touched by this routine.
//
// The AVX2 instructions are emitted as raw bytes (LONG/WORD/BYTE); the intended
// instruction is given in Intel operand order in the trailing comment.
// TEXT flag 7 is NOPROF|DUPOK|NOSPLIT from textflag.h; the frame size is 0.
TEXT ·addmulAVX2(SB), 7, $0
MOVQ low+0(FP), SI // SI: &lowhigh
MOVOU (SI), X6 // X6: low table (16 bytes)
MOVOU 16(SI), X7 // X7: high table (16 bytes)
MOVQ $15, BX // BX: low-nibble mask value 0x0F
MOVQ BX, X5 // X5: mask in the lowest byte, source for the broadcast below
MOVQ len+24(FP), R9 // R9: len(in), len(out)
// Copy each table into the upper 128 bits of its YMM register, so that both
// 128-bit lanes of the later VPSHUFB see the same table.
LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; Y6: low table in both lanes
LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; Y7: high table in both lanes
LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; Y8: 0x0F in every byte
SHRQ $5, R9 // R9 = len / 32 = number of 32-byte blocks
MOVQ out+16(FP), DX // DX: &out
MOVQ in+8(FP), SI // SI: &in (SI is reused; the lowhigh pointer is no longer needed)
TESTQ R9, R9
JZ done_xor_avx2 // fewer than 32 bytes: nothing to do
loopback_xor_avx2:
LONG $0x066ffec5 // VMOVDQU YMM0, [rsi] ; Y0: 32 input bytes
LONG $0x226ffec5 // VMOVDQU YMM4, [rdx] ; Y4: 32 current output bytes
// The shift works on 64-bit lanes, so bits spill across byte boundaries; the
// VPAND with Y8 on Y1 below discards them.
LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; Y1: input >> 4 (unmasked)
LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; Y0: low nibbles of input
LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; Y1: high nibbles of input
LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; Y2: low[low nibble]
LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; Y3: high[high nibble]
LONG $0xdbefedc5 // VPXOR YMM3, YMM2, YMM3 ; Y3: Y2 ^ Y3
LONG $0xe4efe5c5 // VPXOR YMM4, YMM3, YMM4 ; Y4: out ^ Y3
LONG $0x227ffec5 // VMOVDQU [rdx], YMM4 ; store result back to out
ADDQ $32, SI // in+=32
ADDQ $32, DX // out+=32
SUBQ $1, R9 // one block done; ZF set when none remain
JNZ loopback_xor_avx2
done_xor_avx2:
// VZEROUPPER: clear the upper halves of the YMM registers before returning.
// Reached on both the loop exit and the early-out path.
BYTE $0xc5; BYTE $0xf8; BYTE $0x77
RET