21 #include <winpr/assert.h>
22 #include <winpr/cast.h>
23 #include <winpr/platform.h>
24 #include <freerdp/config.h>
26 #include "../rfx_types.h"
29 #include "../../core/simd.h"
31 #if defined(SSE_AVX_INTRINSICS_ENABLED)
35 #include <winpr/sysinfo.h>
37 #include <xmmintrin.h>
38 #include <emmintrin.h>
/* Turns every __attribute__((...)) annotation into nothing.
 * NOTE(review): presumably wrapped in a compiler check whose #if/#endif lines
 * are not visible here - confirm. */
41 #define __attribute__(...)
/* Distance, in bytes, between two prefetch hints issued by mm_prefetch_buffer. */
44 #define CACHE_LINE_BYTES 64
/* Attribute list passed to __attribute__ on the static inline helpers below.
 * NOTE(review): two alternative definitions (with and without __artificial__);
 * presumably selected by a preprocessor conditional that is not visible here. */
47 #define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
49 #define ATTRIBUTES __gnu_inline__, __always_inline__
/**
 * Issue prefetch hints with the non-temporal locality hint (_MM_HINT_NTA)
 * over a memory region.
 *
 * One hint is issued per CACHE_LINE_BYTES of the region. The loop bound is
 * counted in whole 16-byte units, so a trailing remainder of
 * num_bytes % sizeof(__m128i) bytes is not taken into account.
 *
 * @param buffer    start of the region to prefetch
 * @param num_bytes size of the region in bytes
 */
52 static __inline
void __attribute__((ATTRIBUTES))
53 mm_prefetch_buffer(
char* WINPR_RESTRICT buffer,
size_t num_bytes)
55 __m128i* buf = (__m128i*)buffer;
/* i indexes __m128i units and advances by one cache line worth of them per step */
57 for (
size_t i = 0; i < (num_bytes /
sizeof(__m128i)); i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
59 _mm_prefetch((
char*)(&buf[i]), _MM_HINT_NTA);
/**
 * Dequantize one run of coefficients in place: every 16-bit value is shifted
 * left by factor bits, eight values per 128-bit vector.
 *
 * The data is accessed with aligned 128-bit loads and stores
 * (_mm_load_si128 / _mm_store_si128), so buffer has to be 16-byte aligned.
 *
 * @param buffer      first value of the run; modified in place
 * @param buffer_size number of INT16 values in the run
 * @param factor      shift count in bits (third argument at the call sites;
 *                    NOTE(review): its declaration is not visible in this excerpt)
 */
66 static __inline
void __attribute__((ATTRIBUTES))
67 rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer,
const size_t buffer_size,
70 __m128i* ptr = (__m128i*)buffer;
71 const __m128i* buf_end = (__m128i*)(buffer + buffer_size);
/* NOTE(review): the loop head and the advance of ptr are not visible here;
 * the post-tested "while" below implies at least one vector is processed. */
78 const __m128i la = _mm_load_si128(ptr);
/* factor is range-checked while being narrowed to the int shift count */
79 const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(
int, factor));
81 _mm_store_si128(ptr, a);
83 }
while (ptr < buf_end);
/**
 * Dequantize a buffer of 4096 coefficients in place.
 *
 * The buffer is handled as ten consecutive runs (3 x 1024, 3 x 256 and
 * 4 x 64 values). Each run is shifted left by (quantVals[k] - 1) bits, with k
 * being the index used in the matching call below.
 *
 * @param buffer    4096 INT16 values, decoded in place
 * @param quantVals quantization values; indices 0..9 are read
 */
86 static void rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
87 const UINT32* WINPR_RESTRICT quantVals)
90 WINPR_ASSERT(quantVals);
92 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* NOTE(review): quantVals[k] - 1 is computed on UINT32, so an entry of 0 would
 * wrap around instead of becoming negative - confirm callers guarantee >= 1. */
/* runs of 1024 values */
93 rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1);
94 rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1);
95 rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1);
/* runs of 256 values */
96 rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1);
97 rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1);
98 rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1);
/* runs of 64 values */
99 rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1);
100 rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1);
101 rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1);
102 rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1);
/**
 * Quantize one run of coefficients in place: every 16-bit value v is replaced
 * by (v + half) >> factor, where half = 1 << (factor - 1) and the shift is
 * arithmetic (sign preserving). The addition is a wrapping 16-bit add.
 *
 * The data is accessed with aligned 128-bit loads and stores, so buffer has to
 * be 16-byte aligned.
 *
 * @param buffer      first value of the run; modified in place
 * @param buffer_size number of INT16 values in the run
 * @param factor      shift count in bits (third argument at the call sites;
 *                    NOTE(review): its declaration and any guard against
 *                    factor == 0 are not visible in this excerpt)
 */
105 static __inline
void __attribute__((ATTRIBUTES))
106 rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer,
const unsigned buffer_size,
109 __m128i* ptr = (__m128i*)buffer;
110 const __m128i* buf_end = (
const __m128i*)(buffer + buffer_size);
/* rounding offset, broadcast to all eight lanes */
115 const __m128i half = _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(INT16, 1 << (factor - 1)));
/* NOTE(review): the loop head and the advance of ptr are not visible here */
119 const __m128i la = _mm_load_si128(ptr);
120 __m128i a = _mm_add_epi16(la, half);
121 a = _mm_srai_epi16(a, factor);
122 _mm_store_si128(ptr, a);
124 }
while (ptr < buf_end);
/**
 * Quantize a buffer of 4096 coefficients in place.
 *
 * The buffer is handled as ten consecutive runs (3 x 1024, 3 x 256 and
 * 4 x 64 values). Each run is quantized with factor
 * (quantization_values[k] - 6), with k being the index used in the matching
 * call below. Afterwards the complete buffer is quantized once more with a
 * fixed factor of 5.
 *
 * @param buffer              4096 INT16 values, quantized in place; must not
 *                            be NULL
 * @param quantization_values ten quantization values, each asserted to lie in
 *                            [6, INT16_MAX + 6]; must not be NULL
 */
127 static void rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
128 const UINT32* WINPR_RESTRICT quantization_values)
130 WINPR_ASSERT(buffer);
131 WINPR_ASSERT(quantization_values);
/* the range check keeps (value - 6) representable as a non-negative INT16 */
132 for (
size_t x = 0; x < 10; x++)
134 WINPR_ASSERT(quantization_values[x] >= 6);
135 WINPR_ASSERT(quantization_values[x] <= INT16_MAX + 6);
138 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* runs of 1024 values */
139 rfx_quantization_encode_block_sse2(
140 buffer, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[8] - 6));
141 rfx_quantization_encode_block_sse2(
142 buffer + 1024, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[7] - 6));
143 rfx_quantization_encode_block_sse2(
144 buffer + 2048, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[9] - 6));
/* runs of 256 values */
145 rfx_quantization_encode_block_sse2(
146 buffer + 3072, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[5] - 6));
147 rfx_quantization_encode_block_sse2(
148 buffer + 3328, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[4] - 6));
149 rfx_quantization_encode_block_sse2(
150 buffer + 3584, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[6] - 6));
/* runs of 64 values */
151 rfx_quantization_encode_block_sse2(
152 buffer + 3840, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[2] - 6));
153 rfx_quantization_encode_block_sse2(
154 buffer + 3904, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[1] - 6));
155 rfx_quantization_encode_block_sse2(
156 buffer + 3968, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[3] - 6));
157 rfx_quantization_encode_block_sse2(
158 buffer + 4032, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[0] - 6));
/* final pass over all 4096 values with the constant factor 5 */
159 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
/**
 * Horizontal step of the inverse DWT for one pair of bands.
 *
 * Every row is handled in two passes of eight values per iteration:
 *  1. even outputs: l[n] - ((h[n-1] + h[n] + 1) >> 1), stored back over l
 *  2. odd outputs:  (h[n] << 1) + ((even[n] + even[n+1]) >> 1)
 * Even and odd outputs are then interleaved and stored to dst, 16 values per
 * iteration.
 *
 * @param l             low band input; overwritten with the even outputs
 * @param h             high band input
 * @param dst           receives the interleaved result
 * @param subband_width row count and per-row value count of l and h; the
 *                      inner loops step by 8
 *
 * NOTE(review): the declarations of l_ptr, h_ptr, first, last, dst1 and dst2
 * as well as the pointer advances are not visible in this excerpt.
 */
162 static __inline
void __attribute__((ATTRIBUTES))
163 rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
164 INT16* WINPR_RESTRICT dst,
size_t subband_width)
168 INT16* dst_ptr = dst;
174 for (
size_t y = 0; y < subband_width; y++)
/* pass 1: even outputs, written back into l */
177 for (
size_t n = 0; n < subband_width; n += 8)
180 __m128i l_n = _mm_load_si128((__m128i*)l_ptr);
181 __m128i h_n = _mm_load_si128((__m128i*)h_ptr);
/* h shifted by one element, hence the unaligned load */
182 __m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - 1));
/* Lane 0 (read from h_ptr - 1) is replaced by lane 1 (the value at h_ptr).
 * NOTE(review): presumably only done for the first group of a row; the
 * guarding condition is not visible here. */
186 first = _mm_extract_epi16(h_n_m, 1);
187 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
190 __m128i tmp_n = _mm_add_epi16(h_n, h_n_m);
191 tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
192 tmp_n = _mm_srai_epi16(tmp_n, 1);
193 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
194 _mm_store_si128((__m128i*)l_ptr, dst_n);
/* rewind both pointers by one row for the second pass */
199 l_ptr -= subband_width;
200 h_ptr -= subband_width;
/* pass 2: odd outputs, interleaved with the even ones into dst */
203 for (
size_t n = 0; n < subband_width; n += 8)
206 __m128i h_n = _mm_load_si128((__m128i*)h_ptr);
207 h_n = _mm_slli_epi16(h_n, 1);
208 __m128i dst_n = _mm_load_si128((__m128i*)(l_ptr));
209 __m128i dst_n_p = _mm_loadu_si128((__m128i*)(l_ptr + 1));
/* last group of the row: lane 7 was read from l_ptr + 8, replace it with lane 6 */
211 if (n == subband_width - 8)
213 last = _mm_extract_epi16(dst_n_p, 6);
214 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
217 __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n);
218 tmp_n = _mm_srai_epi16(tmp_n, 1);
219 tmp_n = _mm_add_epi16(tmp_n, h_n);
/* interleave even (dst_n) and odd (tmp_n) values */
220 dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
221 dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
222 _mm_store_si128((__m128i*)dst_ptr, dst1);
223 _mm_store_si128((__m128i*)(dst_ptr + 8), dst2);
/**
 * Vertical step of the inverse DWT.
 *
 * Works on rows of total_width = 2 * subband_width values, eight values per
 * iteration, in two passes:
 *  1. l row minus ((h row above + h row + 1) >> 1), stored to every second
 *     row of dst (dst_ptr skips one extra row per outer iteration)
 *  2. the rows in between, starting at dst + total_width:
 *     (h row << 1) + ((dst row above + dst row below) >> 1)
 *
 * @param l             low band input
 * @param h             high band input
 * @param dst           output, written row by row
 * @param subband_width number of rows in l and h; a row holds twice as many
 *                      values
 *
 * NOTE(review): the declarations and advances of l_ptr and h_ptr and several
 * if/else lines are not visible in this excerpt.
 */
231 static __inline
void __attribute__((ATTRIBUTES))
232 rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
233 INT16* WINPR_RESTRICT dst,
size_t subband_width)
237 INT16* dst_ptr = dst;
238 const size_t total_width = subband_width + subband_width;
/* pass 1 */
241 for (
size_t n = 0; n < subband_width; n++)
243 for (
size_t x = 0; x < total_width; x += 8)
246 const __m128i l_n = _mm_load_si128((__m128i*)l_ptr);
247 const __m128i h_n = _mm_load_si128((__m128i*)h_ptr);
248 __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
/* NOTE(review): the next addition and the h_n_m variant after it are
 * presumably the two branches of an if/else (first row vs. other rows);
 * the branch lines are not visible here. */
251 tmp_n = _mm_add_epi16(tmp_n, h_n);
/* h value of the row above */
254 const __m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - total_width));
255 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
258 tmp_n = _mm_srai_epi16(tmp_n, 1);
259 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
260 _mm_store_si128((__m128i*)dst_ptr, dst_n);
/* skip the row that pass 2 fills in */
266 dst_ptr += total_width;
/* pass 2 starts at the second output row */
270 dst_ptr = dst + total_width;
273 for (
size_t n = 0; n < subband_width; n++)
275 for (
size_t x = 0; x < total_width; x += 8)
278 __m128i h_n = _mm_load_si128((__m128i*)h_ptr);
/* output row directly above, produced by pass 1 */
279 __m128i dst_n_m = _mm_load_si128((__m128i*)(dst_ptr - total_width));
280 h_n = _mm_slli_epi16(h_n, 1);
281 __m128i tmp_n = dst_n_m;
/* last row: there is no row below, the row above is counted twice.
 * NOTE(review): the dst_n_p lines are presumably the else branch; the
 * "else" itself is not visible here. */
283 if (n == subband_width - 1)
284 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
287 const __m128i dst_n_p = _mm_loadu_si128((__m128i*)(dst_ptr + total_width));
288 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
291 tmp_n = _mm_srai_epi16(tmp_n, 1);
292 const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
293 _mm_store_si128((__m128i*)dst_ptr, dst_n);
/* skip the row already written by pass 1 */
298 dst_ptr += total_width;
/**
 * Inverse 2D DWT of one level.
 *
 * The four bands of the level are read from buffer at multiples of
 * subband_width * subband_width: lh at 1x, hh at 2x and ll at 3x.
 * Two horizontal steps (ll/hl and lh/hh) write their results into idwt, then
 * one vertical step combines both halves and writes the result back to
 * buffer.
 *
 * @param buffer        band data of this level; receives the decoded result
 * @param idwt          scratch buffer for the horizontal results
 * @param subband_width width (and height) of one band
 *
 * NOTE(review): the declarations of hl and l_dst are not visible in this
 * excerpt; presumably hl = buffer and l_dst = idwt - confirm.
 */
302 static __inline
void __attribute__((ATTRIBUTES))
303 rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
304 size_t subband_width)
/* NOTE(review): this prefetches only 4 * subband_width INT16 values although
 * h_dst below already starts at 2 * subband_width * subband_width - confirm
 * the size is intended. */
306 mm_prefetch_buffer((
char*)idwt, 4ULL * subband_width *
sizeof(INT16));
312 INT16* ll = buffer + 3ULL * subband_width * subband_width;
/* horizontal step for the low half */
315 rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
316 INT16* lh = buffer + 1ULL * subband_width * subband_width;
317 INT16* hh = buffer + 2ULL * subband_width * subband_width;
318 INT16* h_dst = idwt + 2ULL * subband_width * subband_width;
/* horizontal step for the high half */
319 rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
/* vertical step, result goes back into buffer */
321 rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
/**
 * Inverse 2D DWT of a 4096-coefficient buffer, executed as three levels with
 * band widths 8, 16 and 32 (in that order). Each level decodes in place,
 * starting at buffer offsets 3840, 3072 and 0 respectively.
 *
 * @param buffer     coefficients, decoded in place; must not be NULL
 * @param dwt_buffer scratch buffer shared by all three levels; must not be
 *                   NULL
 */
324 static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
326 WINPR_ASSERT(buffer);
327 WINPR_ASSERT(dwt_buffer);
329 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* smallest level first; each level's output region is part of the next level's input */
330 rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
331 rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
332 rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
/**
 * Vertical step of the forward DWT.
 *
 * Works on rows of total_width = 2 * subband_width values, eight values per
 * iteration. With src rows 2n, 2n+1 and 2n+2:
 *   h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1
 *   l[n] = src[2n] + ((h[n-1] + h[n]) >> 1)
 * For the last n, row 2n is used in place of row 2n+2.
 *
 * @param src           input rows
 * @param l             receives the low band
 * @param h             receives the high band
 * @param subband_width number of output rows per band
 *
 * NOTE(review): the pointer advances of src, l and h and the declaration of
 * h_n_m are not visible in this excerpt.
 */
335 static __inline
void __attribute__((ATTRIBUTES))
336 rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
337 INT16* WINPR_RESTRICT h,
size_t subband_width)
339 const size_t total_width = subband_width << 1;
341 for (
size_t n = 0; n < subband_width; n++)
343 for (
size_t x = 0; x < total_width; x += 8)
345 __m128i src_2n = _mm_load_si128((__m128i*)src);
346 __m128i src_2n_1 = _mm_load_si128((__m128i*)(src + total_width));
/* default for the last row, where no row 2n+2 is read */
347 __m128i src_2n_2 = src_2n;
349 if (n < subband_width - 1)
350 src_2n_2 = _mm_load_si128((__m128i*)(src + 2ULL * total_width));
/* high band value */
353 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
354 h_n = _mm_srai_epi16(h_n, 1);
355 h_n = _mm_sub_epi16(src_2n_1, h_n);
356 h_n = _mm_srai_epi16(h_n, 1);
357 _mm_store_si128((__m128i*)h, h_n);
/* High band value of the previous row.
 * NOTE(review): presumably only loaded when n > 0; the guard and the
 * declaration of h_n_m are not visible here. */
361 h_n_m = _mm_load_si128((__m128i*)(h - total_width));
/* low band value */
364 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
365 l_n = _mm_srai_epi16(l_n, 1);
366 l_n = _mm_add_epi16(l_n, src_2n);
367 _mm_store_si128((__m128i*)l, l_n);
/**
 * Horizontal step of the forward DWT.
 *
 * Each iteration consumes 16 consecutive src values (17 when the shifted
 * even vector reads src[16]) and produces eight h and eight l values:
 *   h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1
 *   l[n] = src[2n] + ((h[n-1] + h[n]) >> 1)
 *
 * @param src           input values
 * @param l             receives the low band
 * @param h             receives the high band
 * @param subband_width number of rows and number of output values per row
 *
 * NOTE(review): the pointer advances of src, l and h are not visible in this
 * excerpt.
 */
377 static __inline
void __attribute__((ATTRIBUTES))
378 rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
379 INT16* WINPR_RESTRICT h,
size_t subband_width)
381 for (
size_t y = 0; y < subband_width; y++)
383 for (
size_t n = 0; n < subband_width; n += 8)
/* NOTE(review): the assignment targets of the three gathers below are not
 * visible; judging by their use they are src_2n, src_2n_1 and src_2n_2.
 * _mm_set_epi16 takes its arguments from the highest lane down to lane 0. */
/* even samples src[0], src[2], ..., src[14] */
388 _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
/* odd samples src[1], src[3], ..., src[15] */
390 _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
/* even samples shifted by one: src[2] ... src[16]; in the last group of a
 * row src[14] is used in place of src[16] */
392 _mm_set_epi16(((n + 8) == subband_width) ? src[14] : src[16], src[14], src[12],
393 src[10], src[8], src[6], src[4], src[2]);
/* high band value */
395 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
396 h_n = _mm_srai_epi16(h_n, 1);
397 h_n = _mm_sub_epi16(src_2n_1, h_n);
398 h_n = _mm_srai_epi16(h_n, 1);
399 _mm_store_si128((__m128i*)h, h_n);
/* h shifted by one element; reads back the values just stored */
400 __m128i h_n_m = _mm_loadu_si128((__m128i*)(h - 1));
/* Lane 0 (read from h - 1) is replaced by lane 1 (the value at h).
 * NOTE(review): presumably only done for the first group of a row; the
 * guarding condition is not visible here. */
404 int first = _mm_extract_epi16(h_n_m, 1);
405 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
/* low band value */
409 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
410 l_n = _mm_srai_epi16(l_n, 1);
411 l_n = _mm_add_epi16(l_n, src_2n);
412 _mm_store_si128((__m128i*)l, l_n);
/**
 * Forward 2D DWT of one level.
 *
 * A vertical step splits buffer into a low and a high half stored in dwt
 * (the high half at offset 2 * subband_width * subband_width). Two horizontal
 * steps then write the four bands back to buffer at multiples of
 * subband_width * subband_width: lh at 1x, hh at 2x and ll at 3x.
 *
 * @param buffer        input values; receives the four bands
 * @param dwt           scratch buffer for the vertical results
 * @param subband_width width (and height) of one resulting band
 *
 * NOTE(review): the declarations of l_src and hl are not visible in this
 * excerpt; presumably l_src = dwt and hl = buffer - confirm.
 */
420 static __inline
void __attribute__((ATTRIBUTES))
421 rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
422 size_t subband_width)
/* prefetches the first 4 * subband_width INT16 values of dwt */
424 mm_prefetch_buffer((
char*)dwt, 4ULL * subband_width *
sizeof(INT16));
427 INT16* h_src = dwt + 2ULL * subband_width * subband_width;
/* vertical step: buffer -> l_src / h_src */
428 rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
433 INT16* ll = buffer + 3ULL * subband_width * subband_width;
435 INT16* lh = buffer + 1ULL * subband_width * subband_width;
436 INT16* hh = buffer + 2ULL * subband_width * subband_width;
/* horizontal steps: each half is split into two bands inside buffer */
437 rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
438 rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
/**
 * Forward 2D DWT of a 4096-coefficient buffer, executed as three levels with
 * band widths 32, 16 and 8 (in that order). Each level encodes in place,
 * starting at buffer offsets 0, 3072 and 3840 respectively.
 *
 * @param buffer     values to transform in place; must not be NULL
 * @param dwt_buffer scratch buffer shared by all three levels; must not be
 *                   NULL
 */
441 static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
443 WINPR_ASSERT(buffer);
444 WINPR_ASSERT(dwt_buffer);
446 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* each level continues on the region where the previous one stored its ll band */
447 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
448 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
449 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
/**
 * Install the SSE2 implementations of quantization and 2D DWT in an RFX
 * context.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the CPU is queried through
 * IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE), the four
 * related profilers are renamed and the four function pointers of the context
 * are set to the *_sse2 functions of this file.
 *
 * @param context RFX context whose function pointers are replaced
 *
 * NOTE(review): the statement guarded by the feature check (presumably an
 * early return) and the #else / #endif lines are not visible in this excerpt;
 * WINPR_UNUSED(context) presumably belongs to the #else branch.
 */
453 void rfx_init_sse2(RFX_CONTEXT* context)
455 #if defined(SSE_AVX_INTRINSICS_ENABLED)
/* runtime CPU feature check */
456 if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
459 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
"rfx_quantization_decode_sse2")
460 PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
461 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
462 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
/* switch the codec entry points to the SSE2 variants */
463 context->quantization_decode = rfx_quantization_decode_sse2;
464 context->quantization_encode = rfx_quantization_encode_sse2;
465 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
466 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
468 WINPR_UNUSED(context);