FreeRDP
rfx_sse2.c
1 
21 #include <winpr/platform.h>
22 #include <freerdp/config.h>
23 
24 #include "../rfx_types.h"
25 #include "rfx_sse2.h"
26 
/* SSE2_ENABLED is defined only when the build sets WITH_SSE2 and one of the
 * architecture macros listed below is defined as well. */
#if defined(WITH_SSE2) && \
    (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_IX86_AMD64))
#define SSE2_ENABLED
#endif
32 
33 #if defined(SSE2_ENABLED)
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <winpr/sysinfo.h>
38 
39 #include <xmmintrin.h>
40 #include <emmintrin.h>
41 
/* With MSVC, make every __attribute__((...)) annotation expand to nothing. */
#if defined(_MSC_VER)
#define __attribute__(...)
#endif

/* Distance in bytes between two consecutive prefetch hints. */
#define CACHE_LINE_BYTES 64

/* Attribute list applied to the inline helpers of this file.
 * __artificial__ is left out when compiling with clang. */
#if defined(__clang__)
#define ATTRIBUTES __gnu_inline__, __always_inline__
#else
#define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
#endif
53 
/* Per signed 16-bit lane: _val = min(_max, max(_val, _min)).
 * _val is both read and assigned, so it must be an lvalue of type __m128i. */
#define mm_between_epi16(_val, _min, _max)                             \
	do                                                                 \
	{                                                                  \
		(_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); \
	} while (0)
59 
60 static __inline void __attribute__((ATTRIBUTES))
61 mm_prefetch_buffer(char* WINPR_RESTRICT buffer, size_t num_bytes)
62 {
63  __m128i* buf = (__m128i*)buffer;
64 
65  for (size_t i = 0; i < (num_bytes / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
66  {
67  _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
68  }
69 }
70 
71 /* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
72 /* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */
73 
74 static __inline void __attribute__((ATTRIBUTES))
75 rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer, const size_t buffer_size,
76  const UINT32 factor)
77 {
78  __m128i a;
79  __m128i* ptr = (__m128i*)buffer;
80  __m128i* buf_end = (__m128i*)(buffer + buffer_size);
81 
82  if (factor == 0)
83  return;
84 
85  do
86  {
87  a = _mm_load_si128(ptr);
88  a = _mm_slli_epi16(a, factor);
89  _mm_store_si128(ptr, a);
90  ptr++;
91  } while (ptr < buf_end);
92 }
93 
94 static void rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
95  const UINT32* WINPR_RESTRICT quantVals)
96 {
97  WINPR_ASSERT(buffer);
98  WINPR_ASSERT(quantVals);
99 
100  mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
101  rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
102  rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
103  rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
104  rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
105  rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
106  rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
107  rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
108  rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
109  rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
110  rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
111 }
112 
113 static __inline void __attribute__((ATTRIBUTES))
114 rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer, const int buffer_size,
115  const UINT32 factor)
116 {
117  __m128i a;
118  __m128i* ptr = (__m128i*)buffer;
119  __m128i* buf_end = (__m128i*)(buffer + buffer_size);
120  __m128i half;
121 
122  if (factor == 0)
123  return;
124 
125  half = _mm_set1_epi16(1 << (factor - 1));
126 
127  do
128  {
129  a = _mm_load_si128(ptr);
130  a = _mm_add_epi16(a, half);
131  a = _mm_srai_epi16(a, factor);
132  _mm_store_si128(ptr, a);
133  ptr++;
134  } while (ptr < buf_end);
135 }
136 
137 static void rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
138  const UINT32* WINPR_RESTRICT quantization_values)
139 {
140  WINPR_ASSERT(buffer);
141  WINPR_ASSERT(quantization_values);
142 
143  mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
144  rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
145  rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
146  rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
147  rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
148  rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
149  rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
150  rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
151  rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
152  rfx_quantization_encode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
153  rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
154  rfx_quantization_encode_block_sse2(buffer, 4096, 5);
155 }
156 
157 static __inline void __attribute__((ATTRIBUTES))
158 rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
159  INT16* WINPR_RESTRICT dst, int subband_width)
160 {
161  INT16* l_ptr = l;
162  INT16* h_ptr = h;
163  INT16* dst_ptr = dst;
164  int first = 0;
165  int last = 0;
166  __m128i l_n;
167  __m128i h_n;
168  __m128i h_n_m;
169  __m128i tmp_n;
170  __m128i dst_n;
171  __m128i dst_n_p;
172  __m128i dst1;
173  __m128i dst2;
174 
175  for (int y = 0; y < subband_width; y++)
176  {
177  /* Even coefficients */
178  for (int n = 0; n < subband_width; n += 8)
179  {
180  /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
181  l_n = _mm_load_si128((__m128i*)l_ptr);
182  h_n = _mm_load_si128((__m128i*)h_ptr);
183  h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - 1));
184 
185  if (n == 0)
186  {
187  first = _mm_extract_epi16(h_n_m, 1);
188  h_n_m = _mm_insert_epi16(h_n_m, first, 0);
189  }
190 
191  tmp_n = _mm_add_epi16(h_n, h_n_m);
192  tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
193  tmp_n = _mm_srai_epi16(tmp_n, 1);
194  dst_n = _mm_sub_epi16(l_n, tmp_n);
195  _mm_store_si128((__m128i*)l_ptr, dst_n);
196  l_ptr += 8;
197  h_ptr += 8;
198  }
199 
200  l_ptr -= subband_width;
201  h_ptr -= subband_width;
202 
203  /* Odd coefficients */
204  for (int n = 0; n < subband_width; n += 8)
205  {
206  /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
207  h_n = _mm_load_si128((__m128i*)h_ptr);
208  h_n = _mm_slli_epi16(h_n, 1);
209  dst_n = _mm_load_si128((__m128i*)(l_ptr));
210  dst_n_p = _mm_loadu_si128((__m128i*)(l_ptr + 1));
211 
212  if (n == subband_width - 8)
213  {
214  last = _mm_extract_epi16(dst_n_p, 6);
215  dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
216  }
217 
218  tmp_n = _mm_add_epi16(dst_n_p, dst_n);
219  tmp_n = _mm_srai_epi16(tmp_n, 1);
220  tmp_n = _mm_add_epi16(tmp_n, h_n);
221  dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
222  dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
223  _mm_store_si128((__m128i*)dst_ptr, dst1);
224  _mm_store_si128((__m128i*)(dst_ptr + 8), dst2);
225  l_ptr += 8;
226  h_ptr += 8;
227  dst_ptr += 16;
228  }
229  }
230 }
231 
232 static __inline void __attribute__((ATTRIBUTES))
233 rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
234  INT16* WINPR_RESTRICT dst, int subband_width)
235 {
236  INT16* l_ptr = l;
237  INT16* h_ptr = h;
238  INT16* dst_ptr = dst;
239  __m128i l_n;
240  __m128i h_n;
241  __m128i tmp_n;
242  __m128i h_n_m;
243  __m128i dst_n;
244  __m128i dst_n_m;
245  __m128i dst_n_p;
246  int total_width = subband_width + subband_width;
247 
248  /* Even coefficients */
249  for (int n = 0; n < subband_width; n++)
250  {
251  for (int x = 0; x < total_width; x += 8)
252  {
253  /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
254  l_n = _mm_load_si128((__m128i*)l_ptr);
255  h_n = _mm_load_si128((__m128i*)h_ptr);
256  tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
257 
258  if (n == 0)
259  tmp_n = _mm_add_epi16(tmp_n, h_n);
260  else
261  {
262  h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - total_width));
263  tmp_n = _mm_add_epi16(tmp_n, h_n_m);
264  }
265 
266  tmp_n = _mm_srai_epi16(tmp_n, 1);
267  dst_n = _mm_sub_epi16(l_n, tmp_n);
268  _mm_store_si128((__m128i*)dst_ptr, dst_n);
269  l_ptr += 8;
270  h_ptr += 8;
271  dst_ptr += 8;
272  }
273 
274  dst_ptr += total_width;
275  }
276 
277  h_ptr = h;
278  dst_ptr = dst + total_width;
279 
280  /* Odd coefficients */
281  for (int n = 0; n < subband_width; n++)
282  {
283  for (int x = 0; x < total_width; x += 8)
284  {
285  /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
286  h_n = _mm_load_si128((__m128i*)h_ptr);
287  dst_n_m = _mm_load_si128((__m128i*)(dst_ptr - total_width));
288  h_n = _mm_slli_epi16(h_n, 1);
289  tmp_n = dst_n_m;
290 
291  if (n == subband_width - 1)
292  tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
293  else
294  {
295  dst_n_p = _mm_loadu_si128((__m128i*)(dst_ptr + total_width));
296  tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
297  }
298 
299  tmp_n = _mm_srai_epi16(tmp_n, 1);
300  dst_n = _mm_add_epi16(tmp_n, h_n);
301  _mm_store_si128((__m128i*)dst_ptr, dst_n);
302  h_ptr += 8;
303  dst_ptr += 8;
304  }
305 
306  dst_ptr += total_width;
307  }
308 }
309 
310 static __inline void __attribute__((ATTRIBUTES))
311 rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
312  int subband_width)
313 {
314  INT16* hl = NULL;
315  INT16* lh = NULL;
316  INT16* hh = NULL;
317  INT16* ll = NULL;
318  INT16* l_dst = NULL;
319  INT16* h_dst = NULL;
320  mm_prefetch_buffer((char*)idwt, 4ULL * subband_width * sizeof(INT16));
321  /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
322  */
323  /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
324  /* The lower part L uses LL(3) and HL(0). */
325  /* The higher part H uses LH(1) and HH(2). */
326  ll = buffer + 3ULL * subband_width * subband_width;
327  hl = buffer;
328  l_dst = idwt;
329  rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
330  lh = buffer + 1ULL * subband_width * subband_width;
331  hh = buffer + 2ULL * subband_width * subband_width;
332  h_dst = idwt + 2ULL * subband_width * subband_width;
333  rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
334  /* Inverse DWT in vertical direction, results are stored in original buffer. */
335  rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
336 }
337 
338 static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
339 {
340  WINPR_ASSERT(buffer);
341  WINPR_ASSERT(dwt_buffer);
342 
343  mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
344  rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
345  rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
346  rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
347 }
348 
349 static __inline void __attribute__((ATTRIBUTES))
350 rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
351  INT16* WINPR_RESTRICT h, int subband_width)
352 {
353  int total_width = 0;
354  __m128i src_2n;
355  __m128i src_2n_1;
356  __m128i src_2n_2;
357  __m128i h_n;
358  __m128i h_n_m;
359  __m128i l_n;
360  total_width = subband_width << 1;
361 
362  for (int n = 0; n < subband_width; n++)
363  {
364  for (int x = 0; x < total_width; x += 8)
365  {
366  src_2n = _mm_load_si128((__m128i*)src);
367  src_2n_1 = _mm_load_si128((__m128i*)(src + total_width));
368 
369  if (n < subband_width - 1)
370  src_2n_2 = _mm_load_si128((__m128i*)(src + 2ULL * total_width));
371  else
372  src_2n_2 = src_2n;
373 
374  /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
375  h_n = _mm_add_epi16(src_2n, src_2n_2);
376  h_n = _mm_srai_epi16(h_n, 1);
377  h_n = _mm_sub_epi16(src_2n_1, h_n);
378  h_n = _mm_srai_epi16(h_n, 1);
379  _mm_store_si128((__m128i*)h, h_n);
380 
381  if (n == 0)
382  h_n_m = h_n;
383  else
384  h_n_m = _mm_load_si128((__m128i*)(h - total_width));
385 
386  /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
387  l_n = _mm_add_epi16(h_n_m, h_n);
388  l_n = _mm_srai_epi16(l_n, 1);
389  l_n = _mm_add_epi16(l_n, src_2n);
390  _mm_store_si128((__m128i*)l, l_n);
391  src += 8;
392  l += 8;
393  h += 8;
394  }
395 
396  src += total_width;
397  }
398 }
399 
400 static __inline void __attribute__((ATTRIBUTES))
401 rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
402  INT16* WINPR_RESTRICT h, int subband_width)
403 {
404  int first = 0;
405  __m128i src_2n;
406  __m128i src_2n_1;
407  __m128i src_2n_2;
408  __m128i h_n;
409  __m128i h_n_m;
410  __m128i l_n;
411 
412  for (int y = 0; y < subband_width; y++)
413  {
414  for (int n = 0; n < subband_width; n += 8)
415  {
416  /* The following 3 Set operations consumes more than half of the total DWT processing
417  * time! */
418  src_2n =
419  _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
420  src_2n_1 =
421  _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
422  src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16], src[14], src[12],
423  src[10], src[8], src[6], src[4], src[2]);
424  /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
425  h_n = _mm_add_epi16(src_2n, src_2n_2);
426  h_n = _mm_srai_epi16(h_n, 1);
427  h_n = _mm_sub_epi16(src_2n_1, h_n);
428  h_n = _mm_srai_epi16(h_n, 1);
429  _mm_store_si128((__m128i*)h, h_n);
430  h_n_m = _mm_loadu_si128((__m128i*)(h - 1));
431 
432  if (n == 0)
433  {
434  first = _mm_extract_epi16(h_n_m, 1);
435  h_n_m = _mm_insert_epi16(h_n_m, first, 0);
436  }
437 
438  /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
439  l_n = _mm_add_epi16(h_n_m, h_n);
440  l_n = _mm_srai_epi16(l_n, 1);
441  l_n = _mm_add_epi16(l_n, src_2n);
442  _mm_store_si128((__m128i*)l, l_n);
443  src += 16;
444  l += 8;
445  h += 8;
446  }
447  }
448 }
449 
450 static __inline void __attribute__((ATTRIBUTES))
451 rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
452  int subband_width)
453 {
454  INT16* hl = NULL;
455  INT16* lh = NULL;
456  INT16* hh = NULL;
457  INT16* ll = NULL;
458  INT16* l_src = NULL;
459  INT16* h_src = NULL;
460  mm_prefetch_buffer((char*)dwt, 4ULL * subband_width * sizeof(INT16));
461  /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
462  l_src = dwt;
463  h_src = dwt + 2ULL * subband_width * subband_width;
464  rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
465  /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order,
466  * stored in original buffer. */
467  /* The lower part L generates LL(3) and HL(0). */
468  /* The higher part H generates LH(1) and HH(2). */
469  ll = buffer + 3ULL * subband_width * subband_width;
470  hl = buffer;
471  lh = buffer + 1ULL * subband_width * subband_width;
472  hh = buffer + 2ULL * subband_width * subband_width;
473  rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
474  rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
475 }
476 
477 static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
478 {
479  WINPR_ASSERT(buffer);
480  WINPR_ASSERT(dwt_buffer);
481 
482  mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
483  rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
484  rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
485  rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
486 }
487 #endif
488 
489 void rfx_init_sse2(RFX_CONTEXT* context)
490 {
491 #if defined(SSE2_ENABLED)
492  if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
493  return;
494 
495  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2")
496  PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
497  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
498  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
499  context->quantization_decode = rfx_quantization_decode_sse2;
500  context->quantization_encode = rfx_quantization_encode_sse2;
501  context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
502  context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
503 #else
504  WINPR_UNUSED(context);
505 #endif
506 }