/* Copyright (C) 2021 Kasm Web * * This is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this software; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, * USA. */ #include #include namespace rfb { /* static void print128(const char msg[], const __m128i v) { union { __m128i v; uint8_t c[16]; } u; u.v = v; printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n", msg, u.c[0], u.c[1], u.c[2], u.c[3], u.c[4], u.c[5], u.c[6], u.c[7], u.c[8], u.c[9], u.c[10], u.c[11], u.c[12], u.c[13], u.c[14], u.c[15]); } */ void SSE2_halve(const uint8_t *oldpx, const uint16_t tgtw, const uint16_t tgth, uint8_t *newpx, const unsigned oldstride, const unsigned newstride) { uint16_t x, y; const uint16_t srcw = tgtw * 2, srch = tgth * 2; const __m128i zero = _mm_setzero_si128(); const __m128i shift = _mm_set_epi32(0, 0, 0, 2); const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0); for (y = 0; y < srch; y += 2) { const uint8_t * const row0 = oldpx + oldstride * y * 4; const uint8_t * const row1 = oldpx + oldstride * (y + 1) * 4; uint8_t * const dst = newpx + newstride * (y / 2) * 4; for (x = 0; x < srcw; x += 4) { __m128i lo, hi, a, b, c, d; lo = _mm_loadu_si128((__m128i *) &row0[x * 4]); hi = _mm_loadu_si128((__m128i *) &row1[x * 4]); a = _mm_unpacklo_epi8(lo, zero); b = _mm_unpackhi_epi8(lo, zero); c = _mm_unpacklo_epi8(hi, zero); d = _mm_unpackhi_epi8(hi, zero); a = _mm_add_epi16(a, c); b = _mm_add_epi16(b, d); c = _mm_srli_si128(a, 8); a = _mm_and_si128(a, low); a = _mm_add_epi16(a, c); d = _mm_slli_si128(b, 8); b = _mm_and_si128(b, high); b = _mm_add_epi16(b, d); a = _mm_add_epi16(a, b); a = _mm_srl_epi16(a, shift); a = _mm_packus_epi16(a, zero); _mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a); } for (; x < srcw; x += 2) { // Remainder in C uint8_t i; for (i = 0; i < 4; i++) { dst[(x / 2) * 4 + i] = (row0[x * 4 + i] + row0[(x + 1) * 4 + i] + row1[x * 4 + i] + row1[(x + 1) * 4 + i]) / 4; } } } } // Handles factors between 0.5 and 1.0 void SSE2_scale(const uint8_t *oldpx, const uint16_t tgtw, const uint16_t tgth, uint8_t *newpx, const unsigned oldstride, const unsigned newstride, const float tgtdiff) { uint16_t x, y; const __m128i zero = _mm_setzero_si128(); const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0); const float invdiff = 1 / tgtdiff; for (y = 0; y < tgth; y++) { const float ny = y * invdiff; const uint16_t lowy = ny; const uint16_t highy = lowy + 1; const uint16_t bot = (ny - lowy) * 256; const uint16_t top = 256 - bot; const uint32_t * const row0 = (uint32_t *) (oldpx + oldstride * lowy * 4); const uint32_t * const row1 = (uint32_t *) (oldpx + oldstride * highy * 4); const uint8_t * const brow0 = (uint8_t *) row0; const uint8_t * const brow1 = (uint8_t *) row1; uint8_t * const dst = newpx + newstride * y * 4; const __m128i vertmul = _mm_set1_epi16(top); const __m128i vertmul2 = _mm_set1_epi16(bot); for (x = 0; x < tgtw; x += 2) { const float nx[2] = { x * invdiff, (x + 1) * invdiff, }; const uint16_t lowx[2] = { (uint16_t) nx[0], (uint16_t) nx[1], }; const uint16_t highx[2] = { (uint16_t) (lowx[0] + 1), (uint16_t) (lowx[1] + 1), }; const uint16_t right[2] = { (uint16_t) ((nx[0] - lowx[0]) * 256), (uint16_t) ((nx[1] - lowx[1]) * 256), }; const uint16_t left[2] = { (uint16_t) (256 - right[0]), (uint16_t) (256 - right[1]), }; const __m128i horzmul = _mm_set_epi16( right[0], right[0], right[0], right[0], left[0], left[0], left[0], left[0] ); const __m128i horzmul2 = _mm_set_epi16( right[1], right[1], right[1], right[1], left[1], left[1], left[1], left[1] ); __m128i lo, hi, a, b, c, d; lo = _mm_setr_epi32(row0[lowx[0]], row0[highx[0]], row0[lowx[1]], row0[highx[1]]); hi = _mm_setr_epi32(row1[lowx[0]], row1[highx[0]], row1[lowx[1]], row1[highx[1]]); a = _mm_unpacklo_epi8(lo, zero); b = _mm_unpackhi_epi8(lo, zero); c = _mm_unpacklo_epi8(hi, zero); d = _mm_unpackhi_epi8(hi, zero); a = _mm_mullo_epi16(a, vertmul); b = _mm_mullo_epi16(b, vertmul); c = _mm_mullo_epi16(c, vertmul2); d = _mm_mullo_epi16(d, vertmul2); a = _mm_add_epi16(a, c); a = _mm_srli_epi16(a, 8); b = _mm_add_epi16(b, d); b = _mm_srli_epi16(b, 8); a = _mm_mullo_epi16(a, horzmul); b = _mm_mullo_epi16(b, horzmul2); lo = _mm_srli_si128(a, 8); a = _mm_and_si128(a, low); a = _mm_add_epi16(a, lo); hi = _mm_slli_si128(b, 8); b = _mm_and_si128(b, high); b = _mm_add_epi16(b, hi); a = _mm_add_epi16(a, b); a = _mm_srli_epi16(a, 8); a = _mm_packus_epi16(a, zero); _mm_storel_epi64((__m128i *) &dst[x * 4], a); } for (; x < tgtw; x++) { // Remainder in C const float nx = x * invdiff; const uint16_t lowx = nx; const uint16_t highx = lowx + 1; const uint16_t right = (nx - lowx) * 256; const uint16_t left = 256 - right; uint8_t i; uint32_t val, val2; for (i = 0; i < 4; i++) { val = brow0[lowx * 4 + i] * left; val += brow0[highx * 4 + i] * right; val >>= 8; val2 = brow1[lowx * 4 + i] * left; val2 += brow1[highx * 4 + i] * right; val2 >>= 8; dst[x * 4 + i] = (val * top + val2 * bot) >> 8; } } } } }; // namespace rfb