[chore] update dependencies, bump to Go 1.19.1 (#826)

* update dependencies, bump Go version to 1.19

* bump test image Go version

* update golangci-lint

* update gotosocial-drone-build

* sign

* linting, go fmt

* update swagger docs

* update swagger docs

* whitespace

* update contributing.md

* fuckin whoopsie doopsie

* linterino, linteroni

* fix followrequest test not starting processor

* fix other api/client tests not starting processor

* fix remaining tests where processor not started

* bump go-runners version

* don't check last-webfingered-at, processor may have updated this

* update swagger command

* update bun to latest version

* fix embed to work the same as before with new bun

Signed-off-by: kim <grufwub@gmail.com>
Co-authored-by: tsmethurst <tobi.smethurst@protonmail.com>
Author:  kim
Date:    2022-09-28 18:30:40 +01:00 (committed via GitHub)
Parent:  00d38855d4
Commit:  a156188b3e

1135 changed files with 258905 additions and 137146 deletions

vendor/github.com/minio/md5-simd/LICENSE.Golang (generated, vendored, new file, +27)

@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -116,6 +116,8 @@ BenchmarkParallel/8MB-4 2182.48 17252.88 7.91x
These measurements were performed on an AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz.
If only one or two inputs are available, the scalar calculation method is used instead, since it is the faster option at such low lane counts.
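For context, a minimal usage sketch of the md5-simd server/hasher API this README describes (error handling elided; the import path and interfaces match those visible elsewhere in this diff):

```go
package main

import (
	"fmt"

	md5simd "github.com/minio/md5-simd"
)

func main() {
	// One server multiplexes many hashers onto the SIMD lanes:
	// 16-way with AVX512, 8-way with AVX2, scalar otherwise.
	server := md5simd.NewServer()
	defer server.Close()

	h := server.NewHash()
	defer h.Close()

	h.Write([]byte("hello md5-simd"))
	fmt.Printf("%x\n", h.Sum(nil))
}
```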
## Operation

@@ -1,132 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
package md5simd
import (
"encoding/binary"
"math/bits"
)
type digest struct {
s [4]uint32
x [BlockSize]byte
nx int
len uint64
}
func blockGeneric(dig *digest, p []byte) {
// load state
a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
for i := 0; i <= len(p)-BlockSize; i += BlockSize {
// eliminate bounds checks on p
q := p[i:]
q = q[:BlockSize:BlockSize]
// save current state
aa, bb, cc, dd := a, b, c, d
// load input block
x0 := binary.LittleEndian.Uint32(q[4*0x0:])
x1 := binary.LittleEndian.Uint32(q[4*0x1:])
x2 := binary.LittleEndian.Uint32(q[4*0x2:])
x3 := binary.LittleEndian.Uint32(q[4*0x3:])
x4 := binary.LittleEndian.Uint32(q[4*0x4:])
x5 := binary.LittleEndian.Uint32(q[4*0x5:])
x6 := binary.LittleEndian.Uint32(q[4*0x6:])
x7 := binary.LittleEndian.Uint32(q[4*0x7:])
x8 := binary.LittleEndian.Uint32(q[4*0x8:])
x9 := binary.LittleEndian.Uint32(q[4*0x9:])
xa := binary.LittleEndian.Uint32(q[4*0xa:])
xb := binary.LittleEndian.Uint32(q[4*0xb:])
xc := binary.LittleEndian.Uint32(q[4*0xc:])
xd := binary.LittleEndian.Uint32(q[4*0xd:])
xe := binary.LittleEndian.Uint32(q[4*0xe:])
xf := binary.LittleEndian.Uint32(q[4*0xf:])
// round 1
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
// round 2
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
// round 3
a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
// round 4
a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
// add saved state
a += aa
b += bb
c += cc
d += dd
}
// save state
dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
}

@@ -2,70 +2,72 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//+build !noasm,!appengine,gc
// This is the AVX512 implementation of the MD5 block function (16-way parallel)
#define prep(index) \
KMOVQ kmask, ktmp \
KMOVQ kmask, ktmp \
VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
#define ROUND1(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
prep(index) \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXORQ c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
prep(index) \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
#define ROUND1noload(a, b, c, d, const, shift) \
VXORPS c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXORQ c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
#define ROUND2(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VANDNPS c, tmp, tmp \
VPTERNLOGD $0xEC, b, tmp, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPROLD $shift, a, a \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VANDNPD c, tmp, tmp \
VPTERNLOGD $0xEC, b, tmp, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPROLD $shift, a, a \
VPADDD b, a, a
#define ROUND3(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x96, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD b, tmp \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x96, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD b, tmp \
VPADDD b, a, a
#define ROUND4(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x36, b, c, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VXORPS c, ones, tmp \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x36, b, c, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VPXORQ c, ones, tmp \
VPADDD b, a, a
TEXT ·block16(SB),4,$0-40
TEXT ·block16(SB), 4, $0-40
MOVQ state+0(FP), BX
MOVQ base+8(FP), SI
MOVQ ptrs+16(FP), AX
KMOVQ mask+24(FP), K1
MOVQ n+32(FP), DX
MOVQ ·avx512md5consts+0(SB), DI
MOVQ state+0(FP), BX
MOVQ base+8(FP), SI
MOVQ ptrs+16(FP), AX
KMOVQ mask+24(FP), K1
MOVQ n+32(FP), DX
MOVQ ·avx512md5consts+0(SB), DI
#define a Z0
#define b Z1
@@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------
#define dig BX
#define count DX
#define base SI
@@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40
// load source pointers
VMOVUPD 0x00(AX), ptrs
MOVQ $-1, AX
MOVQ $-1, AX
VPBROADCASTQ AX, ones
loop:
@@ -190,7 +191,7 @@ loop:
ROUND3(c,d,a,b, Z31,0x2e,16)
ROUND3(b,c,d,a, Z18,0x2f,23)
VXORPS d, ones, tmp
VPXORQ d, ones, tmp
ROUND4(a,b,c,d, Z16,0x30, 6)
ROUND4(d,a,b,c, Z23,0x31,10)

@@ -1,3 +1,5 @@
//+build !noasm,!appengine,gc
// Copyright (c) 2018 Igneous Systems
// MIT License
//
@@ -70,7 +72,7 @@ TEXT ·block8(SB), 4, $0-40
#define consts DI
#define prepmask \
VXORPS mask, mask, mask \
VPXOR mask, mask, mask \
VPCMPGTD mask, off, mask
#define prep(index) \
@@ -86,14 +88,14 @@ TEXT ·block8(SB), 4, $0-40
#define roll(shift, a) \
VPSLLD $shift, a, rtmp1 \
VPSRLD $32-shift, a, a \
VORPS rtmp1, a, a
VPOR rtmp1, a, a
#define ROUND1(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VPXOR c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp, tmp \
VXORPS d, tmp, tmp \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
prep(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@@ -101,11 +103,11 @@ TEXT ·block8(SB), 4, $0-40
VPADDD b, a, a
#define ROUND1load(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VXORPD c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp, tmp \
VXORPS d, tmp, tmp \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
load(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@@ -115,10 +117,10 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND2(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp2, tmp2 \
VANDNPS c, tmp, tmp \
VPAND b, tmp2, tmp2 \
VANDNPD c, tmp, tmp \
load(index) \
VORPS tmp, tmp2, tmp2 \
VPOR tmp, tmp2, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
@@ -129,8 +131,8 @@ TEXT ·block8(SB), 4, $0-40
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
load(index) \
VXORPS d, tmp, tmp \
VXORPS b, tmp, tmp \
VPXOR d, tmp, tmp \
VPXOR b, tmp, tmp \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD b, tmp \
@@ -139,12 +141,12 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND4(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VORPS b, tmp, tmp \
VXORPS c, tmp, tmp \
VPOR b, tmp, tmp \
VPXOR c, tmp, tmp \
VPADDD tmp, a, a \
load(index) \
roll(shift,a) \
VXORPS c, ones, tmp \
VPXOR c, ones, tmp \
VPADDD b, a, a
// load digest into state registers
@@ -242,7 +244,7 @@ loop:
ROUND3(b,c,d,a, 0,0x2f,23)
load(0)
VXORPS d, ones, tmp
VPXOR d, ones, tmp
ROUND4(a,b,c,d, 7,0x30, 6)
ROUND4(d,a,b,c,14,0x31,10)

@@ -9,14 +9,18 @@ package md5simd
import (
"fmt"
"math"
"sync"
"unsafe"
"github.com/klauspost/cpuid"
"github.com/klauspost/cpuid/v2"
)
var hasAVX512 bool
func init() {
// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
}
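As a minimal illustration of this feature gating (a sketch built only from the cpuid calls visible in this diff, not part of the vendored file):

```go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	// Supports reports true only when all listed flags are present,
	// mirroring the hasAVX512 gate above.
	switch {
	case cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ):
		fmt.Println("16-lane AVX512 path")
	case cpuid.CPU.Supports(cpuid.AVX2):
		fmt.Println("8-lane AVX2 path")
	default:
		fmt.Println("scalar fallback")
	}
}
```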
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
@@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
return inf
}(md5consts[:])
func init() {
hasAVX512 = cpuid.CPU.AVX512F()
}
// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
if hasAVX512 {
blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
} else {
d8a, d8b := digest8{}, digest8{}
for i := range d8a.v0 {
j := i + 8
d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
if !half {
d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}
}
return
}
i8 := [2][8][]byte{}
for i := range i8[0] {
i8[0][i], i8[1][i] = input[i], input[8+i]
}
if half {
blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
} else {
wg := sync.WaitGroup{}
wg.Add(2)
go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
wg.Wait()
}
// Preparing data using copy is slower since copies aren't inlined.
for i := range d8a.v0 {
j := i + 8
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
if !half {
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
}
// Calculate on this goroutine
if half {
for i := range s.i8[0][:] {
s.i8[0][i] = input[i]
}
for i := range s.d8a.v0[:] {
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
}
blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
for i := range s.d8a.v0[:] {
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
return
}
for i := range s.i8[0][:] {
s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
}
for i := range s.d8a.v0[:] {
j := (i + 8) & 15
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}
// Benchmarks appear to be slightly faster when spinning up 2 goroutines instead
// of reusing the current goroutine for one of the blocks.
s.wg.Add(2)
go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
s.wg.Wait()
for i := range s.d8a.v0[:] {
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
for i := range s.d8b.v0[:] {
j := (i + 8) & 15
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
}
}

@@ -10,6 +10,7 @@ import (
"encoding/binary"
"errors"
"fmt"
"sync"
"sync/atomic"
)
@@ -121,6 +122,14 @@ func (d *md5Digest) Close() {
}
}
var sumChPool sync.Pool
func init() {
sumChPool.New = func() interface{} {
return make(chan sumResult, 1)
}
}
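The pool above recycles the one-shot result channels so Sum no longer allocates a fresh channel per call. A self-contained sketch of the same pattern (names are mine, not the vendored ones):

```go
package main

import (
	"fmt"
	"sync"
)

// resultPool holds buffered one-shot channels; each carries exactly one
// value and must be drained before being returned to the pool.
var resultPool = sync.Pool{
	New: func() interface{} { return make(chan int, 1) },
}

func compute(x int) int {
	ch := resultPool.Get().(chan int)
	go func() { ch <- x * x }() // producer sends exactly one value
	v := <-ch                   // drain before Put so the next user never reads a stale value
	resultPool.Put(ch)
	return v
}

func main() {
	fmt.Println(compute(7)) // 49
}
```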
// Sum - Return MD5 sum in bytes
func (d *md5Digest) Sum(in []byte) (result []byte) {
if d.blocksCh == nil {
@@ -148,10 +157,11 @@ func (d *md5Digest) Sum(in []byte) (result []byte) {
if len(trail)%BlockSize != 0 {
panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx))
}
sumCh := make(chan sumResult, 1)
sumCh := sumChPool.Get().(chan sumResult)
d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true)
sum := <-sumCh
sumChPool.Put(sumCh)
return append(in, sum.digest[:]...)
}

@@ -10,8 +10,9 @@ import (
"encoding/binary"
"fmt"
"runtime"
"sync"
"github.com/klauspost/cpuid"
"github.com/klauspost/cpuid/v2"
)
// MD5 initialization constants
@@ -23,6 +24,9 @@ const (
init1 = 0xefcdab89
init2 = 0x98badcfe
init3 = 0x10325476
// Use scalar routine when below this many lanes
useScalarBelow = 3
)
// md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to
@@ -56,11 +60,15 @@ type md5Server struct {
maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
allBufs []byte // Preallocated buffer.
buffers chan []byte // Preallocated buffers, sliced from allBufs.
i8 [2][8][]byte // avx2 temporary vars
d8a, d8b digest8
wg sync.WaitGroup
}
// NewServer - Create new object for parallel processing handling
func NewServer() Server {
if !cpuid.CPU.AVX2() {
if !cpuid.CPU.Supports(cpuid.AVX2) {
return &fallbackServer{}
}
md5srv := &md5Server{}
@@ -152,7 +160,7 @@ func (s *md5Server) process(newClients chan newClient) {
sum := sumResult{}
// Add end block to current digest.
blockGeneric(&dig, block.msg)
blockScalar(&dig.s, block.msg)
binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
@@ -262,6 +270,88 @@ func (s *md5Server) Close() {
// Invoke assembly and send results back
func (s *md5Server) blocks(lanes []blockInput) {
if len(lanes) < useScalarBelow {
// Use scalar routine when below this many lanes
switch len(lanes) {
case 0:
case 1:
lane := lanes[0]
var d digest
a, ok := s.digests[lane.uid]
if ok {
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
} else {
d.s[0] = init0
d.s[1] = init1
d.s[2] = init2
d.s[3] = init3
}
if len(lane.msg) > 0 {
// Update...
blockScalar(&d.s, lane.msg)
}
dig := [Size]byte{}
binary.LittleEndian.PutUint32(dig[0:], d.s[0])
binary.LittleEndian.PutUint32(dig[4:], d.s[1])
binary.LittleEndian.PutUint32(dig[8:], d.s[2])
binary.LittleEndian.PutUint32(dig[12:], d.s[3])
s.digests[lane.uid] = dig
if lane.msg != nil {
s.buffers <- lane.msg
}
lanes[0] = blockInput{}
default:
s.wg.Add(len(lanes))
var results [useScalarBelow]digest
for i := range lanes {
lane := lanes[i]
go func(i int) {
var d digest
defer s.wg.Done()
a, ok := s.digests[lane.uid]
if ok {
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
} else {
d.s[0] = init0
d.s[1] = init1
d.s[2] = init2
d.s[3] = init3
}
if len(lane.msg) == 0 {
results[i] = d
return
}
// Update...
blockScalar(&d.s, lane.msg)
results[i] = d
}(i)
}
s.wg.Wait()
for i, lane := range lanes {
dig := [Size]byte{}
binary.LittleEndian.PutUint32(dig[0:], results[i].s[0])
binary.LittleEndian.PutUint32(dig[4:], results[i].s[1])
binary.LittleEndian.PutUint32(dig[8:], results[i].s[2])
binary.LittleEndian.PutUint32(dig[12:], results[i].s[3])
s.digests[lane.uid] = dig
if lane.msg != nil {
s.buffers <- lane.msg
}
lanes[i] = blockInput{}
}
}
return
}
inputs := [16][]byte{}
for i := range lanes {
inputs[i] = lanes[i].msg

@@ -1,19 +1,21 @@
//+build !noasm,!appengine,gc
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
package md5simd
import (
"sort"
)
// Helper struct for sorting blocks based on length
type lane struct {
len uint
pos uint
}
type digest struct {
s [4]uint32
}
// Helper struct for generating the number of rounds in combination with the mask of valid lanes
type maskRounds struct {
mask uint64
@@ -23,15 +25,22 @@ type maskRounds struct {
func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [8]lane
for c, inpt := range input {
for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
for i := c - 1; i >= 0; i-- {
// swap so largest is at the end...
if sorted[i].len > sorted[i+1].len {
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
continue
}
break
}
}
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create the mask array, recording the number of rounds (each processing one 64-byte block) between mask changes
m, round := uint64(0xff), uint64(0)
for _, s := range sorted {
for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
@@ -45,18 +54,24 @@ func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
}
func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [16]lane
for c, inpt := range input {
for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
for i := c - 1; i >= 0; i-- {
// swap so largest is at the end...
if sorted[i].len > sorted[i+1].len {
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
continue
}
break
}
}
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create the mask array, recording the number of rounds (each processing one 64-byte block) between mask changes
m, round := uint64(0xffff), uint64(0)
for _, s := range sorted {
for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
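To make the mask/rounds encoding concrete, here is a simplified re-implementation sketch (my own, not the vendored function): each entry pairs a bitmask of lanes that still have data with the number of 64-byte block rounds to run before the next lane runs dry.

```go
package main

import (
	"fmt"
	"sort"
)

type maskRounds struct {
	mask   uint64
	rounds uint64
}

// maskAndRounds sorts lanes by input length and emits (mask, rounds)
// pairs describing how many 64-byte blocks each set of lanes processes.
func maskAndRounds(lens [8]uint64) (mr []maskRounds) {
	type lane struct{ len, pos uint64 }
	var sorted [8]lane
	for i, l := range lens {
		sorted[i] = lane{l, uint64(i)}
	}
	sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })

	mask, round := uint64(0xff), uint64(0)
	for _, s := range sorted {
		if s.len > 0 && s.len>>6 > round {
			mr = append(mr, maskRounds{mask, s.len>>6 - round})
			round = s.len >> 6
		}
		mask &^= 1 << s.pos // this lane contributes no further blocks
	}
	return mr
}

func main() {
	// Lanes 0..2 carry 64, 128 and 128 bytes; the rest are idle.
	fmt.Println(maskAndRounds([8]uint64{64, 128, 128}))
	// => [{7 1} {6 1}]: one round on lanes 0-2, then one more on lanes 1-2.
}
```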

@@ -27,6 +27,12 @@ type Hasher interface {
Close()
}
// StdlibHasher returns a Hasher that uses the stdlib for hashing.
// Used hashers are stored in a pool for fast reuse.
func StdlibHasher() Hasher {
return &md5Wrapper{Hash: md5Pool.New().(hash.Hash)}
}
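A hedged usage sketch of this new fallback constructor (my example, not from the vendored docs); it returns the same Hasher interface as server.NewHash(), so callers can swap it in when SIMD is unavailable or unwanted:

```go
package main

import (
	"fmt"

	md5simd "github.com/minio/md5-simd"
)

func main() {
	h := md5simd.StdlibHasher() // plain crypto/md5 under the hood, pooled for reuse
	defer h.Close()
	h.Write([]byte("fallback path"))
	fmt.Printf("%x\n", h.Sum(nil))
}
```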
// md5Wrapper is a wrapper around the builtin hasher.
type md5Wrapper struct {
hash.Hash

vendor/github.com/minio/md5-simd/md5block_amd64.go (generated, vendored, new file, +11)

@@ -0,0 +1,11 @@
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
// +build !appengine
// +build !noasm
// +build gc
package md5simd
// Encode p to digest
//go:noescape
func blockScalar(dig *[4]uint32, p []byte)

vendor/github.com/minio/md5-simd/md5block_amd64.s (generated, vendored, new file, +714)

@@ -0,0 +1,714 @@
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
// +build !appengine
// +build !noasm
// +build gc
// func blockScalar(dig *[4]uint32, p []byte)
TEXT ·blockScalar(SB), $0-32
MOVQ p_len+16(FP), AX
MOVQ dig+0(FP), CX
MOVQ p_base+8(FP), DX
SHRQ $0x06, AX
SHLQ $0x06, AX
LEAQ (DX)(AX*1), AX
CMPQ DX, AX
JEQ end
MOVL (CX), BX
MOVL 4(CX), BP
MOVL 8(CX), SI
MOVL 12(CX), CX
MOVL $0xffffffff, DI
loop:
MOVL (DX), R8
MOVL CX, R9
MOVL BX, R10
MOVL BP, R11
MOVL SI, R12
MOVL CX, R13
// ROUND1
XORL SI, R9
ADDL $0xd76aa478, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 4(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0xe8c7b756, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 8(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0x242070db, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 12(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0xc1bdceee, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 16(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0xf57c0faf, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 20(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0x4787c62a, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 24(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xa8304613, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 28(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0xfd469501, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 32(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0x698098d8, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 36(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0x8b44f7af, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 40(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xffff5bb1, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 44(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0x895cd7be, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 48(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0x6b901122, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 52(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0xfd987193, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 56(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xa679438e, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 60(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0x49b40821, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 4(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
// ROUND2
MOVL CX, R9
MOVL CX, R14
XORL DI, R9
ADDL $0xf61e2562, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 24(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xc040b340, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 44(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0x265e5a51, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL (DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0xe9b6c7aa, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 20(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0xd62f105d, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 40(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0x02441453, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 60(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0xd8a1e681, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 16(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0xe7d3fbc8, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 36(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0x21e1cde6, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 56(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xc33707d6, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 12(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0xf4d50d87, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 32(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0x455a14ed, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 52(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0xa9e3e905, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 8(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xfcefa3f8, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 28(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0x676f02d9, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 48(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0x8d2a4c8a, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 20(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
// ROUND3
MOVL SI, R9
ADDL $0xfffa3942, BX
ADDL R8, BX
MOVL 32(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0x8771f681, CX
ADDL R8, CX
MOVL 44(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0x6d9d6122, SI
ADDL R8, SI
MOVL 56(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xfde5380c, BP
ADDL R8, BP
MOVL 4(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0xa4beea44, BX
ADDL R8, BX
MOVL 16(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0x4bdecfa9, CX
ADDL R8, CX
MOVL 28(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0xf6bb4b60, SI
ADDL R8, SI
MOVL 40(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xbebfbc70, BP
ADDL R8, BP
MOVL 52(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0x289b7ec6, BX
ADDL R8, BX
MOVL (DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0xeaa127fa, CX
ADDL R8, CX
MOVL 12(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0xd4ef3085, SI
ADDL R8, SI
MOVL 24(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0x04881d05, BP
ADDL R8, BP
MOVL 36(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0xd9d4d039, BX
ADDL R8, BX
MOVL 48(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0xe6db99e5, CX
ADDL R8, CX
MOVL 60(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0x1fa27cf8, SI
ADDL R8, SI
MOVL 8(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xc4ac5665, BP
ADDL R8, BP
MOVL (DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
// ROUND4
MOVL DI, R9
XORL CX, R9
ADDL $0xf4292244, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 28(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0x432aff97, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 56(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xab9423a7, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 20(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0xfc93a039, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 48(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0x655b59c3, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 12(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0x8f0ccc92, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 40(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xffeff47d, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 4(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0x85845dd1, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 32(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0x6fa87e4f, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 60(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0xfe2ce6e0, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 24(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xa3014314, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 52(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0x4e0811a1, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 16(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0xf7537e82, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 44(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0xbd3af235, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 8(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0x2ad7d2bb, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 36(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0xeb86d391, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
ROLL $0x15, BP
ADDL SI, BP
ADDL R10, BX
ADDL R11, BP
ADDL R12, SI
ADDL R13, CX
// Prepare next loop
ADDQ $0x40, DX
CMPQ DX, AX
JB loop
// Write output
MOVQ dig+0(FP), AX
MOVL BX, (AX)
MOVL BP, 4(AX)
MOVL SI, 8(AX)
MOVL CX, 12(AX)
end:
RET