21 #include <winpr/platform.h>
22 #include <freerdp/config.h>
24 #include "../rfx_types.h"
27 #if defined(WITH_SSE2)
28 #if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_IX86_AMD64)
33 #if defined(SSE2_ENABLED)
37 #include <winpr/sysinfo.h>
39 #include <xmmintrin.h>
40 #include <emmintrin.h>
#ifdef _MSC_VER
/* MSVC does not accept GCC-style attributes: erase them for that compiler only. */
#define __attribute__(...)
#endif

/* Distance in bytes between two consecutive prefetch hints. */
#define CACHE_LINE_BYTES 64

/* Attribute list attached to the inlined SSE2 helpers. clang gets the list
 * without __artificial__. */
#ifndef __clang__
#define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
#else
#define ATTRIBUTES __gnu_inline__, __always_inline__
#endif
/*
 * Clamp every signed 16-bit lane of _val to the range [_min, _max] and assign
 * the result back to _val. _val is evaluated twice and must be an lvalue.
 * The do/while(0) wrapper lets the macro be used as a single statement,
 * including as the unbraced body of an if/else.
 */
#define mm_between_epi16(_val, _min, _max)                             \
	do                                                                 \
	{                                                                  \
		(_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); \
	} while (0)
60 static __inline
void __attribute__((ATTRIBUTES))
61 mm_prefetch_buffer(
char* WINPR_RESTRICT buffer,
size_t num_bytes)
63 __m128i* buf = (__m128i*)buffer;
65 for (
size_t i = 0; i < (num_bytes /
sizeof(__m128i)); i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
67 _mm_prefetch((
char*)(&buf[i]), _MM_HINT_NTA);
74 static __inline
void __attribute__((ATTRIBUTES))
75 rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer,
const size_t buffer_size,
79 __m128i* ptr = (__m128i*)buffer;
80 __m128i* buf_end = (__m128i*)(buffer + buffer_size);
87 a = _mm_load_si128(ptr);
88 a = _mm_slli_epi16(a, factor);
89 _mm_store_si128(ptr, a);
91 }
while (ptr < buf_end);
94 static void rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
95 const UINT32* WINPR_RESTRICT quantVals)
98 WINPR_ASSERT(quantVals);
100 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
101 rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1);
102 rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1);
103 rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1);
104 rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1);
105 rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1);
106 rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1);
107 rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1);
108 rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1);
109 rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1);
110 rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1);
113 static __inline
void __attribute__((ATTRIBUTES))
114 rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer,
const int buffer_size,
118 __m128i* ptr = (__m128i*)buffer;
119 __m128i* buf_end = (__m128i*)(buffer + buffer_size);
125 half = _mm_set1_epi16(1 << (factor - 1));
129 a = _mm_load_si128(ptr);
130 a = _mm_add_epi16(a, half);
131 a = _mm_srai_epi16(a, factor);
132 _mm_store_si128(ptr, a);
134 }
while (ptr < buf_end);
137 static void rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
138 const UINT32* WINPR_RESTRICT quantization_values)
140 WINPR_ASSERT(buffer);
141 WINPR_ASSERT(quantization_values);
143 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
144 rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6);
145 rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6);
146 rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6);
147 rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6);
148 rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6);
149 rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6);
150 rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6);
151 rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6);
152 rfx_quantization_encode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6);
153 rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6);
154 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
157 static __inline
void __attribute__((ATTRIBUTES))
158 rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
159 INT16* WINPR_RESTRICT dst,
int subband_width)
163 INT16* dst_ptr = dst;
175 for (
int y = 0; y < subband_width; y++)
178 for (
int n = 0; n < subband_width; n += 8)
181 l_n = _mm_load_si128((__m128i*)l_ptr);
182 h_n = _mm_load_si128((__m128i*)h_ptr);
183 h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - 1));
187 first = _mm_extract_epi16(h_n_m, 1);
188 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
191 tmp_n = _mm_add_epi16(h_n, h_n_m);
192 tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
193 tmp_n = _mm_srai_epi16(tmp_n, 1);
194 dst_n = _mm_sub_epi16(l_n, tmp_n);
195 _mm_store_si128((__m128i*)l_ptr, dst_n);
200 l_ptr -= subband_width;
201 h_ptr -= subband_width;
204 for (
int n = 0; n < subband_width; n += 8)
207 h_n = _mm_load_si128((__m128i*)h_ptr);
208 h_n = _mm_slli_epi16(h_n, 1);
209 dst_n = _mm_load_si128((__m128i*)(l_ptr));
210 dst_n_p = _mm_loadu_si128((__m128i*)(l_ptr + 1));
212 if (n == subband_width - 8)
214 last = _mm_extract_epi16(dst_n_p, 6);
215 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
218 tmp_n = _mm_add_epi16(dst_n_p, dst_n);
219 tmp_n = _mm_srai_epi16(tmp_n, 1);
220 tmp_n = _mm_add_epi16(tmp_n, h_n);
221 dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
222 dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
223 _mm_store_si128((__m128i*)dst_ptr, dst1);
224 _mm_store_si128((__m128i*)(dst_ptr + 8), dst2);
232 static __inline
void __attribute__((ATTRIBUTES))
233 rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
234 INT16* WINPR_RESTRICT dst,
int subband_width)
238 INT16* dst_ptr = dst;
246 int total_width = subband_width + subband_width;
249 for (
int n = 0; n < subband_width; n++)
251 for (
int x = 0; x < total_width; x += 8)
254 l_n = _mm_load_si128((__m128i*)l_ptr);
255 h_n = _mm_load_si128((__m128i*)h_ptr);
256 tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
259 tmp_n = _mm_add_epi16(tmp_n, h_n);
262 h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - total_width));
263 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
266 tmp_n = _mm_srai_epi16(tmp_n, 1);
267 dst_n = _mm_sub_epi16(l_n, tmp_n);
268 _mm_store_si128((__m128i*)dst_ptr, dst_n);
274 dst_ptr += total_width;
278 dst_ptr = dst + total_width;
281 for (
int n = 0; n < subband_width; n++)
283 for (
int x = 0; x < total_width; x += 8)
286 h_n = _mm_load_si128((__m128i*)h_ptr);
287 dst_n_m = _mm_load_si128((__m128i*)(dst_ptr - total_width));
288 h_n = _mm_slli_epi16(h_n, 1);
291 if (n == subband_width - 1)
292 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
295 dst_n_p = _mm_loadu_si128((__m128i*)(dst_ptr + total_width));
296 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
299 tmp_n = _mm_srai_epi16(tmp_n, 1);
300 dst_n = _mm_add_epi16(tmp_n, h_n);
301 _mm_store_si128((__m128i*)dst_ptr, dst_n);
306 dst_ptr += total_width;
310 static __inline
void __attribute__((ATTRIBUTES))
311 rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
320 mm_prefetch_buffer((
char*)idwt, 4ULL * subband_width *
sizeof(INT16));
326 ll = buffer + 3ULL * subband_width * subband_width;
329 rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
330 lh = buffer + 1ULL * subband_width * subband_width;
331 hh = buffer + 2ULL * subband_width * subband_width;
332 h_dst = idwt + 2ULL * subband_width * subband_width;
333 rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
335 rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
338 static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
340 WINPR_ASSERT(buffer);
341 WINPR_ASSERT(dwt_buffer);
343 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
344 rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
345 rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
346 rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
349 static __inline
void __attribute__((ATTRIBUTES))
350 rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
351 INT16* WINPR_RESTRICT h,
int subband_width)
360 total_width = subband_width << 1;
362 for (
int n = 0; n < subband_width; n++)
364 for (
int x = 0; x < total_width; x += 8)
366 src_2n = _mm_load_si128((__m128i*)src);
367 src_2n_1 = _mm_load_si128((__m128i*)(src + total_width));
369 if (n < subband_width - 1)
370 src_2n_2 = _mm_load_si128((__m128i*)(src + 2ULL * total_width));
375 h_n = _mm_add_epi16(src_2n, src_2n_2);
376 h_n = _mm_srai_epi16(h_n, 1);
377 h_n = _mm_sub_epi16(src_2n_1, h_n);
378 h_n = _mm_srai_epi16(h_n, 1);
379 _mm_store_si128((__m128i*)h, h_n);
384 h_n_m = _mm_load_si128((__m128i*)(h - total_width));
387 l_n = _mm_add_epi16(h_n_m, h_n);
388 l_n = _mm_srai_epi16(l_n, 1);
389 l_n = _mm_add_epi16(l_n, src_2n);
390 _mm_store_si128((__m128i*)l, l_n);
400 static __inline
void __attribute__((ATTRIBUTES))
401 rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
402 INT16* WINPR_RESTRICT h,
int subband_width)
412 for (
int y = 0; y < subband_width; y++)
414 for (
int n = 0; n < subband_width; n += 8)
419 _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
421 _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
422 src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16], src[14], src[12],
423 src[10], src[8], src[6], src[4], src[2]);
425 h_n = _mm_add_epi16(src_2n, src_2n_2);
426 h_n = _mm_srai_epi16(h_n, 1);
427 h_n = _mm_sub_epi16(src_2n_1, h_n);
428 h_n = _mm_srai_epi16(h_n, 1);
429 _mm_store_si128((__m128i*)h, h_n);
430 h_n_m = _mm_loadu_si128((__m128i*)(h - 1));
434 first = _mm_extract_epi16(h_n_m, 1);
435 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
439 l_n = _mm_add_epi16(h_n_m, h_n);
440 l_n = _mm_srai_epi16(l_n, 1);
441 l_n = _mm_add_epi16(l_n, src_2n);
442 _mm_store_si128((__m128i*)l, l_n);
450 static __inline
void __attribute__((ATTRIBUTES))
451 rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
460 mm_prefetch_buffer((
char*)dwt, 4ULL * subband_width *
sizeof(INT16));
463 h_src = dwt + 2ULL * subband_width * subband_width;
464 rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
469 ll = buffer + 3ULL * subband_width * subband_width;
471 lh = buffer + 1ULL * subband_width * subband_width;
472 hh = buffer + 2ULL * subband_width * subband_width;
473 rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
474 rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
477 static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
479 WINPR_ASSERT(buffer);
480 WINPR_ASSERT(dwt_buffer);
482 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
483 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
484 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
485 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
489 void rfx_init_sse2(RFX_CONTEXT* context)
491 #if defined(SSE2_ENABLED)
492 if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
495 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
"rfx_quantization_decode_sse2")
496 PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
497 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
498 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
499 context->quantization_decode = rfx_quantization_decode_sse2;
500 context->quantization_encode = rfx_quantization_encode_sse2;
501 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
502 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
504 WINPR_UNUSED(context);