KasmVNC/common/rfb/scale_sse2.cxx

/* Copyright (C) 2021 Kasm Web
 *
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this software; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */
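
// SSE2 helpers for downscaling 32-bit pixel data: SSE2_halve does an exact 2x
// reduction, SSE2_scale handles scaling factors between 0.5 and 1.0.
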
#include <emmintrin.h>
#include <rfb/scale_sse2.h>

namespace rfb {
// Debug helper, kept disabled: dumps a __m128i as 16 hex bytes.
/*
static void print128(const char msg[], const __m128i v) {
    union {
        __m128i v;
        uint8_t c[16];
    } u;
    u.v = v;

    printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,"
           "%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n",
           msg,
           u.c[0], u.c[1], u.c[2], u.c[3],
           u.c[4], u.c[5], u.c[6], u.c[7],
           u.c[8], u.c[9], u.c[10], u.c[11],
           u.c[12], u.c[13], u.c[14], u.c[15]);
}
*/
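
// Downscale by exactly 2x in both dimensions: each destination pixel is the
// plain average of a 2x2 block of 32-bit source pixels.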
void SSE2_halve(const uint8_t *oldpx,
                const uint16_t tgtw, const uint16_t tgth,
                uint8_t *newpx,
                const unsigned oldstride, const unsigned newstride) {

    uint16_t x, y;
    const uint16_t srcw = tgtw * 2, srch = tgth * 2;

    const __m128i zero = _mm_setzero_si128();
    const __m128i shift = _mm_set_epi32(0, 0, 0, 2);
    const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
    const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0);

    for (y = 0; y < srch; y += 2) {
        const uint8_t * const row0 = oldpx + oldstride * y * 4;
        const uint8_t * const row1 = oldpx + oldstride * (y + 1) * 4;
        uint8_t * const dst = newpx + newstride * (y / 2) * 4;
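
        // SIMD path: consume four source pixels (16 bytes) from each of the two
        // source rows per iteration and emit two averaged destination pixels.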
        for (x = 0; x < srcw - 3; x += 4) {
            __m128i lo, hi, a, b, c, d;

            lo = _mm_loadu_si128((__m128i *) &row0[x * 4]);
            hi = _mm_loadu_si128((__m128i *) &row1[x * 4]);

            // Widen each 8-bit channel to 16 bits so the sums cannot overflow
            a = _mm_unpacklo_epi8(lo, zero);
            b = _mm_unpackhi_epi8(lo, zero);
            c = _mm_unpacklo_epi8(hi, zero);
            d = _mm_unpackhi_epi8(hi, zero);

            // Vertical sums: top row plus bottom row
            a = _mm_add_epi16(a, c);
            b = _mm_add_epi16(b, d);

            // Horizontal sums: fold each pair of neighbouring pixels together
            c = _mm_srli_si128(a, 8);
            a = _mm_and_si128(a, low);
            a = _mm_add_epi16(a, c);
            d = _mm_slli_si128(b, 8);
            b = _mm_and_si128(b, high);
            b = _mm_add_epi16(b, d);
            a = _mm_add_epi16(a, b);

            // Divide the four-sample sums by 4, pack back to bytes and store
            // two destination pixels (8 bytes)
            a = _mm_srl_epi16(a, shift);
            a = _mm_packus_epi16(a, zero);
            _mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a);
        }
        // Remainder in C
        for (; x < srcw; x += 2) {
            uint8_t i;

            for (i = 0; i < 4; i++) {
                dst[(x / 2) * 4 + i] =
                    (row0[x * 4 + i] +
                     row0[(x + 1) * 4 + i] +
                     row1[x * 4 + i] +
                     row1[(x + 1) * 4 + i]) / 4;
            }
        }
    }
}

// Handles factors between 0.5 and 1.0
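// Bilinear interpolation: every destination pixel is blended from its four
// nearest source pixels, using integer weights scaled by 256 (shifted back
// down by 8 after each blend).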
void SSE2_scale(const uint8_t *oldpx,
                const uint16_t tgtw, const uint16_t tgth,
                uint8_t *newpx,
                const unsigned oldstride, const unsigned newstride,
                const float tgtdiff) {

    uint16_t x, y;
    const __m128i zero = _mm_setzero_si128();
    const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
    const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0);
    const float invdiff = 1 / tgtdiff;

    for (y = 0; y < tgth; y++) {
        // Source rows bracketing this destination row, and their vertical weights
        const float ny = y * invdiff;
        const uint16_t lowy = ny;
        const uint16_t highy = lowy + 1;
        const uint16_t bot = (ny - lowy) * 256;
        const uint16_t top = 256 - bot;

        const uint32_t * const row0 = (uint32_t *) (oldpx + oldstride * lowy * 4);
        const uint32_t * const row1 = (uint32_t *) (oldpx + oldstride * highy * 4);
        const uint8_t * const brow0 = (uint8_t *) row0;
        const uint8_t * const brow1 = (uint8_t *) row1;
        uint8_t * const dst = newpx + newstride * y * 4;

        const __m128i vertmul = _mm_set1_epi16(top);
        const __m128i vertmul2 = _mm_set1_epi16(bot);
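
        // SIMD path: two destination pixels per iteration. Gather each pixel's
        // four source neighbours, blend the two rows vertically, then blend the
        // two columns horizontally.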
        for (x = 0; x < tgtw - 1; x += 2) {
            const float nx[2] = {
                x * invdiff,
                (x + 1) * invdiff,
            };
            const uint16_t lowx[2] = {
                (uint16_t) nx[0],
                (uint16_t) nx[1],
            };
            const uint16_t highx[2] = {
                (uint16_t) (lowx[0] + 1),
                (uint16_t) (lowx[1] + 1),
            };
            const uint16_t right[2] = {
                (uint16_t) ((nx[0] - lowx[0]) * 256),
                (uint16_t) ((nx[1] - lowx[1]) * 256),
            };
            const uint16_t left[2] = {
                (uint16_t) (256 - right[0]),
                (uint16_t) (256 - right[1]),
            };

            // _mm_set_epi16 takes its arguments from the highest word down, so
            // the left weight lands in the low four words and the right weight
            // in the high four, matching the pixel order produced by the unpacks
            const __m128i horzmul = _mm_set_epi16(right[0], right[0], right[0], right[0],
                                                  left[0], left[0], left[0], left[0]);
            const __m128i horzmul2 = _mm_set_epi16(right[1], right[1], right[1], right[1],
                                                   left[1], left[1], left[1], left[1]);
            __m128i lo, hi, a, b, c, d;

            // Gather the four neighbouring source pixels of each destination pixel
            lo = _mm_setr_epi32(row0[lowx[0]],
                                row0[highx[0]],
                                row0[lowx[1]],
                                row0[highx[1]]);
            hi = _mm_setr_epi32(row1[lowx[0]],
                                row1[highx[0]],
                                row1[lowx[1]],
                                row1[highx[1]]);

            // Widen to 16 bits per channel
            a = _mm_unpacklo_epi8(lo, zero);
            b = _mm_unpackhi_epi8(lo, zero);
            c = _mm_unpacklo_epi8(hi, zero);
            d = _mm_unpackhi_epi8(hi, zero);

            // Vertical blend between the two source rows
            a = _mm_mullo_epi16(a, vertmul);
            b = _mm_mullo_epi16(b, vertmul);
            c = _mm_mullo_epi16(c, vertmul2);
            d = _mm_mullo_epi16(d, vertmul2);
            a = _mm_add_epi16(a, c);
            a = _mm_srli_epi16(a, 8);
            b = _mm_add_epi16(b, d);
            b = _mm_srli_epi16(b, 8);

            // Horizontal blend between the left and right source columns
            a = _mm_mullo_epi16(a, horzmul);
            b = _mm_mullo_epi16(b, horzmul2);
            lo = _mm_srli_si128(a, 8);
            a = _mm_and_si128(a, low);
            a = _mm_add_epi16(a, lo);
            hi = _mm_slli_si128(b, 8);
            b = _mm_and_si128(b, high);
            b = _mm_add_epi16(b, hi);
            a = _mm_add_epi16(a, b);
            a = _mm_srli_epi16(a, 8);

            // Pack back to bytes and store two destination pixels
            a = _mm_packus_epi16(a, zero);
            _mm_storel_epi64((__m128i *) &dst[x * 4], a);
        }
        // Remainder in C
        for (; x < tgtw; x++) {
            const float nx = x * invdiff;
            const uint16_t lowx = nx;
            const uint16_t highx = lowx + 1;
            const uint16_t right = (nx - lowx) * 256;
            const uint16_t left = 256 - right;
            uint8_t i;
            uint32_t val, val2;

            for (i = 0; i < 4; i++) {
                val = brow0[lowx * 4 + i] * left;
                val += brow0[highx * 4 + i] * right;
                val >>= 8;

                val2 = brow1[lowx * 4 + i] * left;
                val2 += brow1[highx * 4 + i] * right;
                val2 >>= 8;

                dst[x * 4 + i] =
                    (val * top + val2 * bot) >> 8;
            }
        }
    }
}

}; // namespace rfb