FreeRDP
rfx_neon.c
1 /*
2  FreeRDP: A Remote Desktop Protocol Implementation
3  RemoteFX Codec Library - NEON Optimizations
4 
5  Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
6 
7  Licensed under the Apache License, Version 2.0 (the "License");
8  you may not use this file except in compliance with the License.
9  You may obtain a copy of the License at
10 
11  http://www.apache.org/licenses/LICENSE-2.0
12 
13  Unless required by applicable law or agreed to in writing, software
14  distributed under the License is distributed on an "AS IS" BASIS,
15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  See the License for the specific language governing permissions and
17  limitations under the License.
18 */
19 
20 #include <winpr/platform.h>
21 #include <freerdp/config.h>
22 #include <freerdp/log.h>
23 
24 #include "../rfx_types.h"
25 #include "rfx_neon.h"
26 
27 #include "../../core/simd.h"
28 
29 #if defined(NEON_INTRINSICS_ENABLED)
30 
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <arm_neon.h>
35 #include <winpr/sysinfo.h>
36 
37 /* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
38 
39 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40 rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
41 {
42  int16x8_t quantFactors = vdupq_n_s16(factor);
43  int16x8_t* buf = (int16x8_t*)buffer;
44  int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
45 
46  do
47  {
48  int16x8_t val = vld1q_s16((INT16*)buf);
49  val = vshlq_s16(val, quantFactors);
50  vst1q_s16((INT16*)buf, val);
51  buf++;
52  } while (buf < buf_end);
53 }
54 
55 static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
56 {
57  WINPR_ASSERT(buffer);
58  WINPR_ASSERT(quantVals);
59 
60  rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
61  rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
62  rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
63  rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
64  rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
65  rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
66  rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
67  rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
68  rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
69  rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
70 }
71 
72 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73 rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
74  INT16* WINPR_RESTRICT dst, size_t subband_width)
75 {
76  INT16* l_ptr = l;
77  INT16* h_ptr = h;
78  INT16* dst_ptr = dst;
79 
80  for (size_t y = 0; y < subband_width; y++)
81  {
82  /* Even coefficients */
83  for (size_t n = 0; n < subband_width; n += 8)
84  {
85  // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
86  int16x8_t l_n = vld1q_s16(l_ptr);
87  int16x8_t h_n = vld1q_s16(h_ptr);
88  int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
89 
90  if (n == 0)
91  {
92  int16_t first = vgetq_lane_s16(h_n_m, 1);
93  h_n_m = vsetq_lane_s16(first, h_n_m, 0);
94  }
95 
96  int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
97  tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
98  tmp_n = vshrq_n_s16(tmp_n, 1);
99  int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
100  vst1q_s16(l_ptr, dst_n);
101  l_ptr += 8;
102  h_ptr += 8;
103  }
104 
105  l_ptr -= subband_width;
106  h_ptr -= subband_width;
107 
108  /* Odd coefficients */
109  for (size_t n = 0; n < subband_width; n += 8)
110  {
111  // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
112  int16x8_t h_n = vld1q_s16(h_ptr);
113  h_n = vshlq_n_s16(h_n, 1);
114  int16x8x2_t dst_n;
115  dst_n.val[0] = vld1q_s16(l_ptr);
116  int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
117 
118  if (n == subband_width - 8)
119  {
120  int16_t last = vgetq_lane_s16(dst_n_p, 6);
121  dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
122  }
123 
124  dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
125  dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
126  dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
127  vst2q_s16(dst_ptr, dst_n);
128  l_ptr += 8;
129  h_ptr += 8;
130  dst_ptr += 16;
131  }
132  }
133 }
134 
135 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136 rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
137  INT16* WINPR_RESTRICT dst, size_t subband_width)
138 {
139  INT16* l_ptr = l;
140  INT16* h_ptr = h;
141  INT16* dst_ptr = dst;
142  const size_t total_width = subband_width + subband_width;
143 
144  /* Even coefficients */
145  for (size_t n = 0; n < subband_width; n++)
146  {
147  for (size_t x = 0; x < total_width; x += 8)
148  {
149  // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
150  int16x8_t l_n = vld1q_s16(l_ptr);
151  int16x8_t h_n = vld1q_s16(h_ptr);
152  int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
153 
154  if (n == 0)
155  tmp_n = vaddq_s16(tmp_n, h_n);
156  else
157  {
158  int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
159  tmp_n = vaddq_s16(tmp_n, h_n_m);
160  }
161 
162  tmp_n = vshrq_n_s16(tmp_n, 1);
163  int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
164  vst1q_s16(dst_ptr, dst_n);
165  l_ptr += 8;
166  h_ptr += 8;
167  dst_ptr += 8;
168  }
169 
170  dst_ptr += total_width;
171  }
172 
173  h_ptr = h;
174  dst_ptr = dst + total_width;
175 
176  /* Odd coefficients */
177  for (size_t n = 0; n < subband_width; n++)
178  {
179  for (size_t x = 0; x < total_width; x += 8)
180  {
181  // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
182  int16x8_t h_n = vld1q_s16(h_ptr);
183  int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
184  h_n = vshlq_n_s16(h_n, 1);
185  int16x8_t tmp_n = dst_n_m;
186 
187  if (n == subband_width - 1)
188  tmp_n = vaddq_s16(tmp_n, dst_n_m);
189  else
190  {
191  int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
192  tmp_n = vaddq_s16(tmp_n, dst_n_p);
193  }
194 
195  tmp_n = vshrq_n_s16(tmp_n, 1);
196  int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
197  vst1q_s16(dst_ptr, dst_n);
198  h_ptr += 8;
199  dst_ptr += 8;
200  }
201 
202  dst_ptr += total_width;
203  }
204 }
205 
206 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207 rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
208  size_t subband_width)
209 {
210  INT16 *hl, *lh, *hh, *ll;
211  INT16 *l_dst, *h_dst;
212  /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
213  */
214  /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
215  /* The lower part L uses LL(3) and HL(0). */
216  /* The higher part H uses LH(1) and HH(2). */
217  ll = buffer + subband_width * subband_width * 3;
218  hl = buffer;
219  l_dst = idwt;
220  rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
221  lh = buffer + subband_width * subband_width;
222  hh = buffer + subband_width * subband_width * 2;
223  h_dst = idwt + subband_width * subband_width * 2;
224  rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
225  /* Inverse DWT in vertical direction, results are stored in original buffer. */
226  rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
227 }
228 
229 static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
230 {
231  rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
232  rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
233  rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
234 }
235 
236 static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
237  const INT16* restrict pHighBand,
238  size_t nHighStep, INT16* restrict pDstBand,
239  size_t nDstStep, size_t nLowCount,
240  size_t nHighCount, size_t nDstCount)
241 {
242  WINPR_ASSERT(pLowBand);
243  WINPR_ASSERT(pHighBand);
244  WINPR_ASSERT(pDstBand);
245 
246  INT16* l_ptr = pLowBand;
247  const INT16* h_ptr = pHighBand;
248  INT16* dst_ptr = pDstBand;
249  size_t batchSize = (nLowCount + nHighCount) >> 1;
250 
251  for (size_t y = 0; y < nDstCount; y++)
252  {
253  /* Even coefficients */
254  size_t n = 0;
255  for (; n < batchSize; n += 8)
256  {
257  // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
258  int16x8_t l_n = vld1q_s16(l_ptr);
259  int16x8_t h_n = vld1q_s16(h_ptr);
260  int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
261 
262  if (n == 0)
263  {
264  int16_t first = vgetq_lane_s16(h_n_m, 1);
265  h_n_m = vsetq_lane_s16(first, h_n_m, 0);
266  }
267  else if (n == 24)
268  h_n = vsetq_lane_s16(0, h_n, 7);
269 
270  int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
271  tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
272  tmp_n = vshrq_n_s16(tmp_n, 1);
273  int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
274  vst1q_s16(l_ptr, dst_n);
275  l_ptr += 8;
276  h_ptr += 8;
277  }
278  if (n < 32)
279  *l_ptr -= *(h_ptr - 1);
280 
281  l_ptr -= batchSize;
282  h_ptr -= batchSize;
283 
284  /* Odd coefficients */
285  n = 0;
286  for (; n < batchSize; n += 8)
287  {
288  // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
289  int16x8_t h_n = vld1q_s16(h_ptr);
290  h_n = vshlq_n_s16(h_n, 1);
291  int16x8x2_t dst_n;
292  dst_n.val[0] = vld1q_s16(l_ptr);
293  int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
294 
295  if (n == 24)
296  h_n = vsetq_lane_s16(0, h_n, 7);
297 
298  dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
299  dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
300  dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
301  vst2q_s16(dst_ptr, dst_n);
302  l_ptr += 8;
303  h_ptr += 8;
304  dst_ptr += 16;
305  }
306  if (n == 32)
307  {
308  h_ptr -= 1;
309  l_ptr += 1;
310  }
311  else
312  {
313  *dst_ptr = *l_ptr;
314  l_ptr += 1;
315  dst_ptr += 1;
316  }
317  }
318 }
319 
320 static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
321  const INT16* restrict pHighBand, size_t nHighStep,
322  INT16* restrict pDstBand, size_t nDstStep,
323  size_t nLowCount, size_t nHighCount,
324  size_t nDstCount)
325 {
326  WINPR_ASSERT(pLowBand);
327  WINPR_ASSERT(pHighBand);
328  WINPR_ASSERT(pDstBand);
329 
330  const INT16* l_ptr = pLowBand;
331  const INT16* h_ptr = pHighBand;
332  INT16* dst_ptr = pDstBand;
333  size_t batchSize = (nDstCount >> 3) << 3;
334  size_t forceBandSize = (nLowCount + nHighCount) >> 1;
335 
336  /* Even coefficients */
337  for (size_t n = 0; n < forceBandSize; n++)
338  {
339  for (size_t x = 0; x < batchSize; x += 8)
340  {
341  // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
342  int16x8_t l_n = vld1q_s16(l_ptr);
343  int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
344  int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
345 
346  if (n == 0)
347  tmp_n = vaddq_s16(tmp_n, h_n);
348  else if (n < 31)
349  {
350  int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
351  tmp_n = vaddq_s16(tmp_n, h_n_m);
352  }
353 
354  tmp_n = vshrq_n_s16(tmp_n, 1);
355  int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
356  vst1q_s16(dst_ptr, dst_n);
357  l_ptr += 8;
358  h_ptr += 8;
359  dst_ptr += 8;
360  }
361 
362  if (nDstCount > batchSize)
363  {
364  int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
365  int16_t tmp_n = h_n + 1;
366  if (n == 0)
367  tmp_n += h_n;
368  else if (n < 31)
369  tmp_n += *(h_ptr - nHighStep);
370  tmp_n >>= 1;
371  *dst_ptr = *l_ptr - tmp_n;
372  l_ptr += 1;
373  h_ptr += 1;
374  dst_ptr += 1;
375  }
376 
377  dst_ptr += nDstStep;
378  }
379 
380  if (forceBandSize < 32)
381  {
382  for (size_t x = 0; x < batchSize; x += 8)
383  {
384  int16x8_t l_n = vld1q_s16(l_ptr);
385  int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
386  int16x8_t tmp_n = vsubq_s16(l_n, h_n);
387  vst1q_s16(dst_ptr, tmp_n);
388  l_ptr += 8;
389  h_ptr += 8;
390  dst_ptr += 8;
391  }
392 
393  if (nDstCount > batchSize)
394  {
395  *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
396  l_ptr += 1;
397  h_ptr += 1;
398  dst_ptr += 1;
399  }
400  }
401 
402  h_ptr = pHighBand;
403  dst_ptr = pDstBand + nDstStep;
404 
405  /* Odd coefficients */
406  for (size_t n = 0; n < forceBandSize; n++)
407  {
408  for (size_t x = 0; x < batchSize; x += 8)
409  {
410  // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
411  int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
412  if (n == 31)
413  {
414  int16x8_t dst_n_p = vld1q_s16(l_ptr);
415  l_ptr += 8;
416  tmp_n = vaddq_s16(tmp_n, dst_n_p);
417  tmp_n = vshrq_n_s16(tmp_n, 1);
418  }
419  else
420  {
421  int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
422  tmp_n = vaddq_s16(tmp_n, dst_n_p);
423  tmp_n = vshrq_n_s16(tmp_n, 1);
424  int16x8_t h_n = vld1q_s16(h_ptr);
425  h_n = vshlq_n_s16(h_n, 1);
426  tmp_n = vaddq_s16(tmp_n, h_n);
427  }
428  vst1q_s16(dst_ptr, tmp_n);
429  h_ptr += 8;
430  dst_ptr += 8;
431  }
432 
433  if (nDstCount > batchSize)
434  {
435  int16_t tmp_n = *(dst_ptr - nDstStep);
436  if (n == 31)
437  {
438  int16_t dst_n_p = *l_ptr;
439  l_ptr += 1;
440  tmp_n += dst_n_p;
441  tmp_n >>= 1;
442  }
443  else
444  {
445  int16_t dst_n_p = *(dst_ptr + nDstStep);
446  tmp_n += dst_n_p;
447  tmp_n >>= 1;
448  int16_t h_n = *h_ptr;
449  h_n <<= 1;
450  tmp_n += h_n;
451  }
452  *dst_ptr = tmp_n;
453  h_ptr += 1;
454  dst_ptr += 1;
455  }
456 
457  dst_ptr += nDstStep;
458  }
459 }
460 
461 static INLINE size_t prfx_get_band_l_count(size_t level)
462 {
463  return (64 >> level) + 1;
464 }
465 
466 static INLINE size_t prfx_get_band_h_count(size_t level)
467 {
468  if (level == 1)
469  return (64 >> 1) - 1;
470  else
471  return (64 + (1 << (level - 1))) >> level;
472 }
473 
474 static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
475  size_t level)
476 {
477  size_t nDstStepX;
478  size_t nDstStepY;
479  INT16 *HL, *LH;
480  INT16 *HH, *LL;
481  INT16 *L, *H, *LLx;
482 
483  const size_t nBandL = prfx_get_band_l_count(level);
484  const size_t nBandH = prfx_get_band_h_count(level);
485  size_t offset = 0;
486 
487  WINPR_ASSERT(buffer);
488  WINPR_ASSERT(temp);
489 
490  HL = &buffer[offset];
491  offset += (nBandH * nBandL);
492  LH = &buffer[offset];
493  offset += (nBandL * nBandH);
494  HH = &buffer[offset];
495  offset += (nBandH * nBandH);
496  LL = &buffer[offset];
497  nDstStepX = (nBandL + nBandH);
498  nDstStepY = (nBandL + nBandH);
499  offset = 0;
500  L = &temp[offset];
501  offset += (nBandL * nDstStepX);
502  H = &temp[offset];
503  LLx = &buffer[0];
504 
505  /* horizontal (LL + HL -> L) */
506  rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
507 
508  /* horizontal (LH + HH -> H) */
509  rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
510 
511  /* vertical (L + H -> LL) */
512  rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
513  nBandL + nBandH);
514 }
515 
516 static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
517 {
518  WINPR_ASSERT(buffer);
519  WINPR_ASSERT(temp);
520  rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
521  rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
522  rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
523 }
524 #endif // NEON_INTRINSICS_ENABLED
525 
526 void rfx_init_neon(RFX_CONTEXT* context)
527 {
528 #if defined(NEON_INTRINSICS_ENABLED)
529  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
530  {
531  DEBUG_RFX("Using NEON optimizations");
532  PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
533  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
534  "rfx_quantization_decode_NEON");
535  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
536  context->quantization_decode = rfx_quantization_decode_NEON;
537  context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
538  context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
539  }
540 #else
541  WINPR_UNUSED(context);
542 #endif
543 }