mirror of
https://github.com/kasmtech/KasmVNC.git
synced 2025-06-24 11:41:32 +02:00
Sse scaling (#52)
* Add CPUID functions for runtime dispatch * Add SSE2 scaling
This commit is contained in:
parent
dc21d5f97c
commit
0cb2c0ba9f
@ -16,6 +16,7 @@ include(CheckLibraryExists)
|
|||||||
include(CheckTypeSize)
|
include(CheckTypeSize)
|
||||||
include(CheckCSourceCompiles)
|
include(CheckCSourceCompiles)
|
||||||
include(CheckCXXSourceCompiles)
|
include(CheckCXXSourceCompiles)
|
||||||
|
include(CheckCXXCompilerFlag)
|
||||||
include(CheckCSourceRuns)
|
include(CheckCSourceRuns)
|
||||||
|
|
||||||
include(CMakeMacroLibtoolFile)
|
include(CMakeMacroLibtoolFile)
|
||||||
@ -208,6 +209,9 @@ if(ENABLE_PAM)
|
|||||||
endif()
|
endif()
|
||||||
set(HAVE_PAM ${ENABLE_PAM})
|
set(HAVE_PAM ${ENABLE_PAM})
|
||||||
|
|
||||||
|
# Check for SSE2
|
||||||
|
check_cxx_compiler_flag(-msse2 COMPILER_SUPPORTS_SSE2)
|
||||||
|
|
||||||
# Generate config.h and make sure the source finds it
|
# Generate config.h and make sure the source finds it
|
||||||
configure_file(config.h.in config.h)
|
configure_file(config.h.in config.h)
|
||||||
add_definitions(-DHAVE_CONFIG_H)
|
add_definitions(-DHAVE_CONFIG_H)
|
||||||
|
@ -64,6 +64,7 @@ set(RFB_SOURCES
|
|||||||
VNCServerST.cxx
|
VNCServerST.cxx
|
||||||
ZRLEEncoder.cxx
|
ZRLEEncoder.cxx
|
||||||
ZRLEDecoder.cxx
|
ZRLEDecoder.cxx
|
||||||
|
cpuid.cxx
|
||||||
encodings.cxx
|
encodings.cxx
|
||||||
util.cxx
|
util.cxx
|
||||||
xxhash.c)
|
xxhash.c)
|
||||||
@ -97,6 +98,27 @@ if(GNUTLS_FOUND)
|
|||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# SSE2
#
# scale_sse2.cxx holds the SSE2 intrinsics and is the only file built with
# -msse2. When the compiler rejects that flag, scale_dummy.cxx supplies
# empty definitions of the same functions so the library still links.
set(SSE2_SOURCES
  scale_sse2.cxx)

set(SCALE_DUMMY_SOURCES
  scale_dummy.cxx)

if(COMPILER_SUPPORTS_SSE2)
  # The property value must be a single argument: quote it so that a
  # multi-word ${COMPILE_FLAGS} cannot be split into extra key/value pairs.
  set_source_files_properties(${SSE2_SOURCES} PROPERTIES
    COMPILE_FLAGS "${COMPILE_FLAGS} -msse2")
  list(APPEND RFB_SOURCES ${SSE2_SOURCES})
else()
  list(APPEND RFB_SOURCES ${SCALE_DUMMY_SOURCES})
endif()
|
||||||
|
|
||||||
add_library(rfb STATIC ${RFB_SOURCES})
|
add_library(rfb STATIC ${RFB_SOURCES})
|
||||||
|
|
||||||
target_link_libraries(rfb ${RFB_LIBRARIES})
|
target_link_libraries(rfb ${RFB_LIBRARIES})
|
||||||
|
@ -22,10 +22,12 @@
|
|||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include <rfb/cpuid.h>
|
||||||
#include <rfb/EncCache.h>
|
#include <rfb/EncCache.h>
|
||||||
#include <rfb/EncodeManager.h>
|
#include <rfb/EncodeManager.h>
|
||||||
#include <rfb/Encoder.h>
|
#include <rfb/Encoder.h>
|
||||||
#include <rfb/Palette.h>
|
#include <rfb/Palette.h>
|
||||||
|
#include <rfb/scale_sse2.h>
|
||||||
#include <rfb/SConnection.h>
|
#include <rfb/SConnection.h>
|
||||||
#include <rfb/ServerCore.h>
|
#include <rfb/ServerCore.h>
|
||||||
#include <rfb/SMsgWriter.h>
|
#include <rfb/SMsgWriter.h>
|
||||||
@ -972,6 +974,64 @@ PixelBuffer *rfb::progressiveBilinearScale(const PixelBuffer *pb,
|
|||||||
const uint16_t tgtw, const uint16_t tgth,
|
const uint16_t tgtw, const uint16_t tgth,
|
||||||
const float tgtdiff)
|
const float tgtdiff)
|
||||||
{
|
{
|
||||||
|
if (supportsSSE2()) {
|
||||||
|
if (tgtdiff >= 0.5f) {
|
||||||
|
ManagedPixelBuffer *newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth);
|
||||||
|
|
||||||
|
int oldstride, newstride;
|
||||||
|
const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
|
||||||
|
rdr::U8 *newpx = newpb->getBufferRW(newpb->getRect(), &newstride);
|
||||||
|
|
||||||
|
SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride, tgtdiff);
|
||||||
|
return newpb;
|
||||||
|
}
|
||||||
|
|
||||||
|
PixelBuffer *newpb;
|
||||||
|
uint16_t neww, newh, oldw, oldh;
|
||||||
|
bool del = false;
|
||||||
|
|
||||||
|
do {
|
||||||
|
oldw = pb->getRect().width();
|
||||||
|
oldh = pb->getRect().height();
|
||||||
|
neww = oldw / 2;
|
||||||
|
newh = oldh / 2;
|
||||||
|
|
||||||
|
newpb = new ManagedPixelBuffer(pb->getPF(), neww, newh);
|
||||||
|
|
||||||
|
int oldstride, newstride;
|
||||||
|
const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
|
||||||
|
rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(),
|
||||||
|
&newstride);
|
||||||
|
|
||||||
|
SSE2_halve(oldpx, neww, newh, newpx, oldstride, newstride);
|
||||||
|
|
||||||
|
if (del)
|
||||||
|
delete pb;
|
||||||
|
del = true;
|
||||||
|
|
||||||
|
pb = newpb;
|
||||||
|
} while (tgtw * 2 < neww);
|
||||||
|
|
||||||
|
// Final, non-halving step
|
||||||
|
if (tgtw != neww || tgth != newh) {
|
||||||
|
oldw = pb->getRect().width();
|
||||||
|
oldh = pb->getRect().height();
|
||||||
|
|
||||||
|
newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth);
|
||||||
|
|
||||||
|
int oldstride, newstride;
|
||||||
|
const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
|
||||||
|
rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(),
|
||||||
|
&newstride);
|
||||||
|
|
||||||
|
SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride, tgtdiff);
|
||||||
|
if (del)
|
||||||
|
delete pb;
|
||||||
|
}
|
||||||
|
|
||||||
|
return newpb;
|
||||||
|
} // SSE2
|
||||||
|
|
||||||
if (tgtdiff >= 0.5f)
|
if (tgtdiff >= 0.5f)
|
||||||
return bilinearScale(pb, tgtw, tgth, tgtdiff);
|
return bilinearScale(pb, tgtw, tgth, tgtdiff);
|
||||||
|
|
||||||
|
@ -53,6 +53,7 @@
|
|||||||
|
|
||||||
#include <network/GetAPI.h>
|
#include <network/GetAPI.h>
|
||||||
|
|
||||||
|
#include <rfb/cpuid.h>
|
||||||
#include <rfb/ComparingUpdateTracker.h>
|
#include <rfb/ComparingUpdateTracker.h>
|
||||||
#include <rfb/KeyRemapper.h>
|
#include <rfb/KeyRemapper.h>
|
||||||
#include <rfb/ListConnInfo.h>
|
#include <rfb/ListConnInfo.h>
|
||||||
@ -134,6 +135,9 @@ VNCServerST::VNCServerST(const char* name_, SDesktop* desktop_)
|
|||||||
{
|
{
|
||||||
lastUserInputTime = lastDisconnectTime = time(0);
|
lastUserInputTime = lastDisconnectTime = time(0);
|
||||||
slog.debug("creating single-threaded server %s", name.buf);
|
slog.debug("creating single-threaded server %s", name.buf);
|
||||||
|
slog.info("CPU capability: SSE2 %s, AVX512f %s",
|
||||||
|
supportsSSE2() ? "yes" : "no",
|
||||||
|
supportsAVX512f() ? "yes" : "no");
|
||||||
|
|
||||||
DLPRegion.enabled = DLPRegion.percents = false;
|
DLPRegion.enabled = DLPRegion.percents = false;
|
||||||
|
|
||||||
|
70
common/rfb/cpuid.cxx
Normal file
70
common/rfb/cpuid.cxx
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
/* Copyright (C) 2021 Kasm Web
|
||||||
|
*
|
||||||
|
* This is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this software; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
||||||
|
* USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
// Cached CPUID register dumps, each laid out as { EAX, EBX, ECX, EDX }.
//   cpuid    - leaf 1 (standard feature bits)
//   extcpuid - leaf 7, sub-leaf 0 (extended feature bits)
// Both stay zeroed on builds that are not x86 / x86-64, and extcpuid stays
// zeroed when the processor does not implement leaf 7.
static uint32_t cpuid[4] = { 0 };
static uint32_t extcpuid[4] = { 0 };

/*
 * Fills the cached CPUID arrays on first use; later calls return early.
 *
 * Leaf 0 is queried first to learn the highest basic leaf the processor
 * implements. Asking for a leaf above that limit does not fail: the
 * processor answers with the data of a different leaf, which would make
 * the leaf 7 feature bits (e.g. AVX512f) read as garbage on older CPUs.
 */
static void getcpuid() {
	// Leaf 1 EAX is the processor signature, which is expected to be
	// non-zero on real hardware, so it doubles as the "already done" flag.
	if (cpuid[0])
		return;

#if defined(__x86_64__) || defined(__i386__)
	uint32_t eax, ecx;
	uint32_t basic[4] = { 0 };

	eax = 0; // highest supported basic leaf
	ecx = 0;

	__asm__ __volatile__(
		"cpuid\n\t"
		: "=a"(basic[0]), "=b"(basic[1]), "=c"(basic[2]), "=d"(basic[3])
		: "0"(eax), "2"(ecx)
	);

	const uint32_t maxleaf = basic[0];
	if (maxleaf < 1)
		return;

	eax = 1; // normal feature bits
	ecx = 0;

	__asm__ __volatile__(
		"cpuid\n\t"
		: "=a"(cpuid[0]), "=b"(cpuid[1]), "=c"(cpuid[2]), "=d"(cpuid[3])
		: "0"(eax), "2"(ecx)
	);

	// No leaf 7 on this processor: leave extcpuid zeroed so that every
	// extended feature reads as unsupported.
	if (maxleaf < 7)
		return;

	eax = 7; // ext feature bits
	ecx = 0;

	__asm__ __volatile__(
		"cpuid\n\t"
		: "=a"(extcpuid[0]), "=b"(extcpuid[1]), "=c"(extcpuid[2]), "=d"(extcpuid[3])
		: "0"(eax), "2"(ecx)
	);
#endif
}
|
||||||
|
|
||||||
|
namespace rfb {
|
||||||
|
|
||||||
|
bool supportsSSE2() {
|
||||||
|
getcpuid();
|
||||||
|
#if defined(__x86_64__) || defined(__i386__)
|
||||||
|
#define bit_SSE2 (1 << 26)
|
||||||
|
return cpuid[3] & bit_SSE2;
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool supportsAVX512f() {
|
||||||
|
getcpuid();
|
||||||
|
#if defined(__x86_64__) || defined(__i386__)
|
||||||
|
#define bit_AVX512f (1 << 16)
|
||||||
|
return extcpuid[1] & bit_AVX512f;
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
}; // namespace rfb
|
28
common/rfb/cpuid.h
Normal file
28
common/rfb/cpuid.h
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* Copyright (C) 2021 Kasm Web
|
||||||
|
*
|
||||||
|
* This is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this software; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
||||||
|
* USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __RFB_CPUID_H__
|
||||||
|
#define __RFB_CPUID_H__
|
||||||
|
|
||||||
|
namespace rfb {
|
||||||
|
|
||||||
|
// Runtime check: true when CPUID reports SSE2 support. Always false on
// builds that are not x86 / x86-64.
bool supportsSSE2();
|
||||||
|
// Runtime check: true when CPUID reports AVX512f support. Always false on
// builds that are not x86 / x86-64.
bool supportsAVX512f();
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
37
common/rfb/scale_dummy.cxx
Normal file
37
common/rfb/scale_dummy.cxx
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
/* Copyright (C) 2021 Kasm Web
|
||||||
|
*
|
||||||
|
* This is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this software; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
||||||
|
* USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <rfb/scale_sse2.h>
|
||||||
|
|
||||||
|
namespace rfb {
|
||||||
|
|
||||||
|
// Link-time placeholder for the SSE2 halving routine, used when the build
// does not compile scale_sse2.cxx. It deliberately does nothing: newpx is
// left untouched.
void SSE2_halve(const uint8_t *oldpx,
		const uint16_t tgtw, const uint16_t tgth,
		uint8_t *newpx,
		const unsigned oldstride, const unsigned newstride) {
	// Name every argument so the empty body raises no unused warnings
	(void) oldpx;
	(void) tgtw;
	(void) tgth;
	(void) newpx;
	(void) oldstride;
	(void) newstride;
}
|
||||||
|
|
||||||
|
// Link-time placeholder for the SSE2 scaler (the real one handles factors
// between 0.5 and 1.0), used when the build does not compile
// scale_sse2.cxx. It deliberately does nothing: newpx is left untouched.
void SSE2_scale(const uint8_t *oldpx,
		const uint16_t tgtw, const uint16_t tgth,
		uint8_t *newpx,
		const unsigned oldstride, const unsigned newstride,
		const float tgtdiff) {
	// Name every argument so the empty body raises no unused warnings
	(void) oldpx;
	(void) tgtw;
	(void) tgth;
	(void) newpx;
	(void) oldstride;
	(void) newstride;
	(void) tgtdiff;
}
|
||||||
|
|
||||||
|
}; // namespace rfb
|
257
common/rfb/scale_sse2.cxx
Normal file
257
common/rfb/scale_sse2.cxx
Normal file
@ -0,0 +1,257 @@
|
|||||||
|
/* Copyright (C) 2021 Kasm Web
|
||||||
|
*
|
||||||
|
* This is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this software; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
||||||
|
* USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
#include <rfb/scale_sse2.h>
|
||||||
|
|
||||||
|
namespace rfb {
|
||||||
|
|
||||||
|
/*
|
||||||
|
static void print128(const char msg[], const __m128i v) {
|
||||||
|
union {
|
||||||
|
__m128i v;
|
||||||
|
uint8_t c[16];
|
||||||
|
} u;
|
||||||
|
|
||||||
|
u.v = v;
|
||||||
|
|
||||||
|
printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n",
|
||||||
|
msg,
|
||||||
|
u.c[0],
|
||||||
|
u.c[1],
|
||||||
|
u.c[2],
|
||||||
|
u.c[3],
|
||||||
|
u.c[4],
|
||||||
|
u.c[5],
|
||||||
|
u.c[6],
|
||||||
|
u.c[7],
|
||||||
|
u.c[8],
|
||||||
|
u.c[9],
|
||||||
|
u.c[10],
|
||||||
|
u.c[11],
|
||||||
|
u.c[12],
|
||||||
|
u.c[13],
|
||||||
|
u.c[14],
|
||||||
|
u.c[15]);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
 * Shrinks a 4-bytes-per-pixel image to half its width and height by
 * averaging every 2x2 block of source pixels, one byte channel at a time.
 *
 * oldpx      source pixels; tgtw * 2 columns and tgth * 2 rows are read
 * tgtw, tgth destination size in pixels
 * newpx      destination pixels; exactly tgtw pixels are written per row
 * oldstride  source row pitch in pixels
 * newstride  destination row pitch in pixels
 */
void SSE2_halve(const uint8_t *oldpx,
		const uint16_t tgtw, const uint16_t tgth,
		uint8_t *newpx,
		const unsigned oldstride, const unsigned newstride) {
	// Counters and byte offsets are wider than 16 bits so that neither the
	// doubled dimensions nor "x += 4" can wrap around on large images.
	const unsigned srcw = tgtw * 2u, srch = tgth * 2u;
	const __m128i zero = _mm_setzero_si128();
	const __m128i low = _mm_set_epi32(0, 0, -1, -1);
	const __m128i high = _mm_set_epi32(-1, -1, 0, 0);

	for (unsigned y = 0; y < srch; y += 2) {
		const uint8_t * const row0 = oldpx + (uintptr_t) oldstride * y * 4;
		const uint8_t * const row1 = oldpx + (uintptr_t) oldstride * (y + 1) * 4;

		uint8_t * const dst = newpx + (uintptr_t) newstride * (y / 2) * 4;

		unsigned x;

		// Vector path: four source pixels from each row become two
		// destination pixels. Only taken while a full group of four is
		// available, so an odd tgtw never reads or writes past the row.
		for (x = 0; x + 4 <= srcw; x += 4) {
			__m128i lo, hi, a, b, c, d;
			lo = _mm_loadu_si128((const __m128i *) &row0[x * 4]);
			hi = _mm_loadu_si128((const __m128i *) &row1[x * 4]);

			// Widen the bytes to 16 bits: a/c hold pixels 0-1, b/d pixels 2-3
			a = _mm_unpacklo_epi8(lo, zero);
			b = _mm_unpackhi_epi8(lo, zero);
			c = _mm_unpacklo_epi8(hi, zero);
			d = _mm_unpackhi_epi8(hi, zero);

			// Vertical sums
			a = _mm_add_epi16(a, c);
			b = _mm_add_epi16(b, d);

			// Horizontal sum of pixels 0+1 into the low half of a
			c = _mm_srli_si128(a, 8);
			a = _mm_and_si128(a, low);
			a = _mm_add_epi16(a, c);

			// Horizontal sum of pixels 2+3 into the high half of b
			d = _mm_slli_si128(b, 8);
			b = _mm_and_si128(b, high);
			b = _mm_add_epi16(b, d);

			a = _mm_add_epi16(a, b);

			// Divide the four-sample sums by 4 and narrow back to bytes
			a = _mm_srli_epi16(a, 2);
			a = _mm_packus_epi16(a, zero);

			_mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a);
		}

		// Remainder in C: one leftover pair of source columns
		for (; x < srcw; x += 2) {
			for (unsigned i = 0; i < 4; i++) {
				dst[(x / 2) * 4 + i] =
					(row0[x * 4 + i] +
					row0[(x + 1) * 4 + i] +
					row1[x * 4 + i] +
					row1[(x + 1) * 4 + i]) / 4;
			}
		}
	}
}
|
||||||
|
|
||||||
|
/*
 * Bilinear downscale of a 4-bytes-per-pixel image.
 * Handles factors between 0.5 and 1.0.
 *
 * oldpx      source pixels
 * tgtw, tgth destination size in pixels
 * newpx      destination pixels; exactly tgtw pixels are written per row
 * oldstride  source row pitch in pixels
 * newstride  destination row pitch in pixels
 * tgtdiff    scale factor (target size / source size); must be non-zero
 *
 * Each destination pixel blends the source pixels at (lowx, lowy),
 * (lowx + 1, lowy), (lowx, lowy + 1) and (lowx + 1, lowy + 1) using 8-bit
 * fixed-point weights.
 * NOTE(review): the "+ 1" row and column are read unconditionally, so the
 * source must stay readable one row and one column past the last sampled
 * position - confirm against the callers.
 */
void SSE2_scale(const uint8_t *oldpx,
		const uint16_t tgtw, const uint16_t tgth,
		uint8_t *newpx,
		const unsigned oldstride, const unsigned newstride,
		const float tgtdiff) {

	const __m128i zero = _mm_setzero_si128();
	const __m128i low = _mm_set_epi32(0, 0, -1, -1);
	const __m128i high = _mm_set_epi32(-1, -1, 0, 0);
	const float invdiff = 1 / tgtdiff;

	// Counters and byte offsets are wider than 16 bits so they cannot wrap
	for (unsigned y = 0; y < tgth; y++) {
		const float ny = y * invdiff;
		const uint16_t lowy = ny;
		const uint16_t highy = lowy + 1;
		// Vertical weights out of 256: bot for row1, top for row0
		const uint16_t bot = (ny - lowy) * 256;
		const uint16_t top = 256 - bot;
		const uint32_t * const row0 =
			(const uint32_t *) (oldpx + (uintptr_t) oldstride * lowy * 4);
		const uint32_t * const row1 =
			(const uint32_t *) (oldpx + (uintptr_t) oldstride * highy * 4);
		const uint8_t * const brow0 = (const uint8_t *) row0;
		const uint8_t * const brow1 = (const uint8_t *) row1;

		uint8_t * const dst = newpx + (uintptr_t) newstride * y * 4;

		const __m128i vertmul = _mm_set1_epi16(top);
		const __m128i vertmul2 = _mm_set1_epi16(bot);

		unsigned x;

		// Vector path: two destination pixels per iteration. Only taken
		// while a full pair is left, so an odd tgtw never writes past the
		// row or samples columns beyond the last real one.
		for (x = 0; x + 1 < tgtw; x += 2) {
			const float nx[2] = {
				x * invdiff,
				(x + 1) * invdiff,
			};
			const uint16_t lowx[2] = {
				(uint16_t) nx[0],
				(uint16_t) nx[1],
			};
			const uint16_t highx[2] = {
				(uint16_t) (lowx[0] + 1),
				(uint16_t) (lowx[1] + 1),
			};
			// Horizontal weights out of 256
			const uint16_t right[2] = {
				(uint16_t) ((nx[0] - lowx[0]) * 256),
				(uint16_t) ((nx[1] - lowx[1]) * 256),
			};
			const uint16_t left[2] = {
				(uint16_t) (256 - right[0]),
				(uint16_t) (256 - right[1]),
			};

			// Low four lanes weight the left sample, high four the right
			const __m128i horzmul = _mm_set_epi16(
				right[0], right[0], right[0], right[0],
				left[0], left[0], left[0], left[0]
			);
			const __m128i horzmul2 = _mm_set_epi16(
				right[1], right[1], right[1], right[1],
				left[1], left[1], left[1], left[1]
			);

			__m128i lo, hi, a, b, c, d;
			lo = _mm_setr_epi32(row0[lowx[0]],
						row0[highx[0]],
						row0[lowx[1]],
						row0[highx[1]]);
			hi = _mm_setr_epi32(row1[lowx[0]],
						row1[highx[0]],
						row1[lowx[1]],
						row1[highx[1]]);

			// Widen to 16 bits: a/c carry the samples of the first
			// output pixel, b/d those of the second
			a = _mm_unpacklo_epi8(lo, zero);
			b = _mm_unpackhi_epi8(lo, zero);
			c = _mm_unpacklo_epi8(hi, zero);
			d = _mm_unpackhi_epi8(hi, zero);

			// Vertical blend
			a = _mm_mullo_epi16(a, vertmul);
			b = _mm_mullo_epi16(b, vertmul);
			c = _mm_mullo_epi16(c, vertmul2);
			d = _mm_mullo_epi16(d, vertmul2);

			a = _mm_add_epi16(a, c);
			a = _mm_srli_epi16(a, 8);
			b = _mm_add_epi16(b, d);
			b = _mm_srli_epi16(b, 8);

			// Horizontal blend
			a = _mm_mullo_epi16(a, horzmul);
			b = _mm_mullo_epi16(b, horzmul2);

			// First output pixel ends up in the low half of a
			lo = _mm_srli_si128(a, 8);
			a = _mm_and_si128(a, low);
			a = _mm_add_epi16(a, lo);

			// Second output pixel ends up in the high half of b
			hi = _mm_slli_si128(b, 8);
			b = _mm_and_si128(b, high);
			b = _mm_add_epi16(b, hi);

			a = _mm_add_epi16(a, b);
			a = _mm_srli_epi16(a, 8);

			a = _mm_packus_epi16(a, zero);

			_mm_storel_epi64((__m128i *) &dst[x * 4], a);
		}

		// Remainder in C: the last column of an odd-width row. It blends
		// horizontally before vertically, so its rounding may differ
		// slightly from the vector path.
		for (; x < tgtw; x++) {
			const float nx = x * invdiff;
			const uint16_t lowx = nx;
			const uint16_t highx = lowx + 1;
			const uint16_t right = (nx - lowx) * 256;
			const uint16_t left = 256 - right;

			uint32_t val, val2;
			for (unsigned i = 0; i < 4; i++) {
				val = brow0[lowx * 4 + i] * left;
				val += brow0[highx * 4 + i] * right;
				val >>= 8;

				val2 = brow1[lowx * 4 + i] * left;
				val2 += brow1[highx * 4 + i] * right;
				val2 >>= 8;

				dst[x * 4 + i] =
					(val * top + val2 * bot) >> 8;
			}
		}
	}
}
|
||||||
|
|
||||||
|
}; // namespace rfb
|
38
common/rfb/scale_sse2.h
Normal file
38
common/rfb/scale_sse2.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/* Copyright (C) 2021 Kasm Web
|
||||||
|
*
|
||||||
|
* This is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this software; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
||||||
|
* USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __RFB_SCALE_SSE2_H__
|
||||||
|
#define __RFB_SCALE_SSE2_H__
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
namespace rfb {
|
||||||
|
|
||||||
|
// Shrinks a 4-bytes-per-pixel image to half size by averaging 2x2 pixel
// blocks. tgtw/tgth are the destination dimensions in pixels; oldstride
// and newstride are the source and destination row pitches in pixels.
// The definition in scale_dummy.cxx is an empty placeholder.
void SSE2_halve(const uint8_t *oldpx,
|
||||||
|
const uint16_t tgtw, const uint16_t tgth,
|
||||||
|
uint8_t *newpx,
|
||||||
|
const unsigned oldstride, const unsigned newstride);
|
||||||
|
|
||||||
|
// Bilinear downscale of a 4-bytes-per-pixel image; the implementation is
// written for factors between 0.5 and 1.0. tgtw/tgth are the destination
// dimensions in pixels, oldstride/newstride the row pitches in pixels and
// tgtdiff the scale factor (its reciprocal maps destination coordinates
// back to source coordinates).
// The definition in scale_dummy.cxx is an empty placeholder.
void SSE2_scale(const uint8_t *oldpx,
|
||||||
|
const uint16_t tgtw, const uint16_t tgth,
|
||||||
|
uint8_t *newpx,
|
||||||
|
const unsigned oldstride, const unsigned newstride,
|
||||||
|
const float tgtdiff);
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
2
kasmweb
2
kasmweb
@ -1 +1 @@
|
|||||||
Subproject commit ba40cacce068fa35fc706c41605db14c04348170
|
Subproject commit e0bb9f6bcf945da6cb10fd0eb48b63b48bf09bb8
|
Loading…
x
Reference in New Issue
Block a user