FreeRDP
rfx_neon.c
1 /*
2  FreeRDP: A Remote Desktop Protocol Implementation
3  RemoteFX Codec Library - NEON Optimizations
4 
5  Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
6 
7  Licensed under the Apache License, Version 2.0 (the "License");
8  you may not use this file except in compliance with the License.
9  You may obtain a copy of the License at
10 
11  http://www.apache.org/licenses/LICENSE-2.0
12 
13  Unless required by applicable law or agreed to in writing, software
14  distributed under the License is distributed on an "AS IS" BASIS,
15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  See the License for the specific language governing permissions and
17  limitations under the License.
18 */
19 
20 #include <winpr/platform.h>
21 #include <freerdp/config.h>
22 #include <freerdp/log.h>
23 
24 #include "../rfx_types.h"
25 #include "rfx_neon.h"
26 
/* The NEON code paths are compiled only when the WITH_NEON build option is set
 * and one of the ARM architecture macros (_M_ARM / _M_ARM64) is defined. */
#if defined(WITH_NEON) && (defined(_M_ARM) || defined(_M_ARM64))
#define NEON_ENABLED
#endif
32 
33 #if defined(NEON_ENABLED)
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <arm_neon.h>
39 #include <winpr/sysinfo.h>
40 
41 /* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
42 
/*
 * De-quantize one sub-band in place: every coefficient is shifted left by
 * `factor` bits, eight coefficients per NEON operation.
 *
 * buffer       first coefficient of the sub-band
 * buffer_size  number of coefficients; they are processed in groups of eight,
 *              so a count that is not a multiple of 8 is rounded up
 * factor       shift amount in bits, broadcast to all eight lanes
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
	/* An empty band must not be touched at all, and a shift by zero bits
	 * would only rewrite the values that are already there. */
	if ((buffer_size == 0) || (factor == 0))
		return;

	/* vshlq_s16 takes the per-lane shift count as a signed 16 bit vector. */
	const int16x8_t quantFactors = vdupq_n_s16((int16_t)factor);
	INT16* const buf_end = buffer + buffer_size;

	/* Walk the band with a plain INT16 pointer; no cast to a vector pointer
	 * type is needed for vld1q/vst1q. */
	for (INT16* buf = buffer; buf < buf_end; buf += 8)
	{
		const int16x8_t val = vld1q_s16(buf);
		vst1q_s16(buf, vshlq_s16(val, quantFactors));
	}
}
58 
59 static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
60 {
61  WINPR_ASSERT(buffer);
62  WINPR_ASSERT(quantVals);
63 
64  rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
65  rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
66  rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
67  rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
68  rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
69  rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
70  rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
71  rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
72  rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
73  rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
74 }
75 
/*
 * Horizontal inverse DWT for one pair of square sub-bands.
 *
 * For each of the `subband_width` rows, `subband_width` low-pass coefficients
 * from `l` and the same number of high-pass coefficients from `h` are combined
 * into 2 * subband_width interleaved output samples in `dst`:
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *
 * At the row edges the missing neighbour is replaced by the edge value itself:
 * h[-1] is taken as h[0] and the sample after the last even one is taken as
 * that last even sample.
 *
 * The even samples are written back into `l` before the odd pass reads them,
 * so the low band is overwritten. Rows are processed eight coefficients at a
 * time; the edge handling relies on subband_width being a multiple of 8.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;

	for (size_t y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/* Row start: build { h0, h0, h1, ..., h6 } from the vector that is
				 * already loaded instead of loading from h_ptr - 1, which lies in
				 * front of the band on its first row. */
				const int16x8_t first = vdupq_n_s16(vgetq_lane_s16(h_n, 0));
				h_n_m = vextq_s16(first, h_n, 7);
			}
			else
				h_n_m = vld1q_s16(h_ptr - 1);

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Even samples are parked in the low band for the odd pass below. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind to the start of the current row. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			int16x8_t dst_n_p;

			if (n == subband_width - 8)
			{
				/* Row end: build { d1, ..., d7, d7 } from the vector that is already
				 * loaded instead of loading from l_ptr + 1, which reaches one
				 * element past the band on its last row. */
				const int16x8_t last = vdupq_n_s16(vgetq_lane_s16(dst_n.val[0], 7));
				dst_n_p = vextq_s16(dst_n.val[0], last, 1);
			}
			else
				dst_n_p = vld1q_s16(l_ptr + 1);

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* vst2q interleaves even (val[0]) and odd (val[1]) samples. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
138 
/*
 * Vertical inverse DWT for one pair of half-height bands.
 *
 * `l` and `h` each hold `subband_width` rows of 2 * subband_width samples.
 * They are combined into 2 * subband_width output rows in `dst`:
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *
 * where n is a row index. The first row uses h[n] in place of h[n-1] and the
 * last odd row uses dst[2n] in place of dst[2n + 2]. All even rows are written
 * first because the odd rows are computed from them.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	const size_t stride = subband_width + subband_width; /* samples per row */

	/* Even output rows: row 2 * row of dst. */
	for (size_t row = 0; row < subband_width; row++)
	{
		const INT16* low = l + row * stride;
		const INT16* high = h + row * stride;
		INT16* out = dst + 2 * row * stride;

		for (size_t col = 0; col < stride; col += 8)
		{
			const int16x8_t low_cur = vld1q_s16(low + col);
			const int16x8_t high_cur = vld1q_s16(high + col);
			/* The first row has no predecessor, its own high row is used twice. */
			const int16x8_t high_prev =
			    (row == 0) ? high_cur : vld1q_s16(high + col - stride);

			int16x8_t avg = vaddq_s16(high_cur, vdupq_n_s16(1));
			avg = vaddq_s16(avg, high_prev);
			avg = vshrq_n_s16(avg, 1);
			vst1q_s16(out + col, vsubq_s16(low_cur, avg));
		}
	}

	/* Odd output rows: row 2 * row + 1 of dst, between two even rows. */
	for (size_t row = 0; row < subband_width; row++)
	{
		const INT16* high = h + row * stride;
		INT16* out = dst + (2 * row + 1) * stride;
		const int is_last = (row == subband_width - 1);

		for (size_t col = 0; col < stride; col += 8)
		{
			const int16x8_t high_twice = vshlq_n_s16(vld1q_s16(high + col), 1);
			const int16x8_t above = vld1q_s16(out + col - stride);
			/* The last odd row has no even row below it, the one above is used twice. */
			const int16x8_t below = is_last ? above : vld1q_s16(out + col + stride);

			const int16x8_t avg = vshrq_n_s16(vaddq_s16(above, below), 1);
			vst1q_s16(out + col, vaddq_s16(avg, high_twice));
		}
	}
}
209 
210 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
211 rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
212  size_t subband_width)
213 {
214  INT16 *hl, *lh, *hh, *ll;
215  INT16 *l_dst, *h_dst;
216  /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
217  */
218  /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
219  /* The lower part L uses LL(3) and HL(0). */
220  /* The higher part H uses LH(1) and HH(2). */
221  ll = buffer + subband_width * subband_width * 3;
222  hl = buffer;
223  l_dst = idwt;
224  rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
225  lh = buffer + subband_width * subband_width;
226  hh = buffer + subband_width * subband_width * 2;
227  h_dst = idwt + subband_width * subband_width * 2;
228  rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
229  /* Inverse DWT in vertical direction, results are stored in original buffer. */
230  rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
231 }
232 
233 static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
234 {
235  rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
236  rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
237  rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
238 }
239 
/*
 * Horizontal inverse DWT for the extrapolated sub-band layout.
 *
 * Reconstructs nDstCount rows with
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *
 * Per row, batchSize = (nLowCount + nHighCount) / 2 coefficients are handled
 * in vectors of eight. The even samples are written back into pLowBand, so the
 * low band is modified.
 *
 * The constants 24 and 32 special-case a batch size of 32:
 *  - batch size 32: lane 7 of the last high vector is forced to zero, and after
 *    the row the high pointer steps back by one while the low pointer skips one
 *    extra coefficient. Each row produces 64 output samples.
 *  - smaller batch sizes: the low coefficient following the batch is reduced by
 *    the last high coefficient and copied to the output as one extra sample.
 *
 * nLowStep, nHighStep and nDstStep are not used: all three buffers are walked
 * with running pointers.
 */
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	const size_t batchSize = (nLowCount + nHighCount) >> 1;

	for (size_t y = 0; y < nDstCount; y++)
	{
		/* Even coefficients */
		size_t n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/* Row start: h[-1] is taken as h[0]. Build { h0, h0, h1, ..., h6 }
				 * from the vector that is already loaded instead of loading from
				 * h_ptr - 1, which lies in front of the band on its first row. */
				const int16x8_t first = vdupq_n_s16(vgetq_lane_s16(h_n, 0));
				h_n_m = vextq_s16(first, h_n, 7);
			}
			else
			{
				h_n_m = vld1q_s16(h_ptr - 1);

				/* Last vector of a 32 wide batch: lane 7 is not used as a high
				 * coefficient of this row and is treated as zero. */
				if (n == 24)
					h_n = vsetq_lane_s16(0, h_n, 7);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Even samples are parked in the low band for the odd pass below. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}
		/* Batches shorter than 32: adjust the trailing low coefficient. */
		if (n < 32)
			*l_ptr -= *(h_ptr - 1);

		/* Rewind to the start of the current row. */
		l_ptr -= batchSize;
		h_ptr -= batchSize;

		/* Odd coefficients */
		n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

			if (n == 24)
				h_n = vsetq_lane_s16(0, h_n, 7);

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* vst2q interleaves even (val[0]) and odd (val[1]) samples. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
		if (n == 32)
		{
			/* The high row is one coefficient shorter than the batch and the low
			 * row one coefficient longer. */
			h_ptr -= 1;
			l_ptr += 1;
		}
		else
		{
			/* Emit the trailing low coefficient as the last sample of the row. */
			*dst_ptr = *l_ptr;
			l_ptr += 1;
			dst_ptr += 1;
		}
	}
}
323 
/*
 * Vertical inverse DWT for the extrapolated sub-band layout.
 *
 * Combines the rows of pLowBand and pHighBand into pDstBand with
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *
 * where n is a row index. All even rows are produced first because the odd
 * rows are computed from them.
 *
 * nDstCount is the number of samples per row. The largest multiple of eight is
 * handled with NEON; if anything is left, exactly one further sample is handled
 * in scalar code, so nDstCount modulo 8 is expected to be 0 or 1.
 *
 * forceBandSize = (nLowCount + nHighCount) / 2 row pairs are produced. The
 * constants 31 and 32 special-case forceBandSize == 32:
 *  - forceBandSize == 32: even row 31 reads the previous high row instead of
 *    its own and uses only that single high term; odd row 31 is the plain
 *    average of the even row above it and the next low row.
 *  - forceBandSize < 32: one additional even row, low row minus the last high
 *    row, is appended after the regular even rows.
 *
 * The low and high bands are read with running pointers (nLowStep is unused;
 * nHighStep is only used to reach the previous high row). After each written
 * row the destination pointer additionally skips nDstStep samples, which
 * leaves room for the interleaved rows.
 */
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	const INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	const size_t batchSize = (nDstCount >> 3) << 3;
	const size_t forceBandSize = (nLowCount + nHighCount) >> 1;

	/* Even coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

			if (n == 0)
				tmp_n = vaddq_s16(tmp_n, h_n); /* no previous row: h[n] counts twice */
			else if (n < 31)
			{
				int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			vst1q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar version of the loop body for the one sample left over. */
		if (nDstCount > batchSize)
		{
			int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
			int16_t tmp_n = h_n + 1;
			if (n == 0)
				tmp_n += h_n;
			else if (n < 31)
				tmp_n += *(h_ptr - nHighStep);
			tmp_n >>= 1;
			*dst_ptr = *l_ptr - tmp_n;
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip the odd row that is filled in by the second pass. */
		dst_ptr += nDstStep;
	}

	/* Additional even row: low row minus the last high row. */
	if (forceBandSize < 32)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
			int16x8_t tmp_n = vsubq_s16(l_n, h_n);
			vst1q_s16(dst_ptr, tmp_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		if (nDstCount > batchSize)
		{
			*dst_ptr = *l_ptr - *(h_ptr - nHighStep);
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}
	}

	/* Second pass: restart the high band and begin at the first odd row.
	 * l_ptr is intentionally not reset. */
	h_ptr = pHighBand;
	dst_ptr = pDstBand + nDstStep;

	/* Odd coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
			if (n == 31)
			{
				/* The row below is taken from the remaining low row. */
				int16x8_t dst_n_p = vld1q_s16(l_ptr);
				l_ptr += 8;
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
			}
			else
			{
				int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
				int16x8_t h_n = vld1q_s16(h_ptr);
				h_n = vshlq_n_s16(h_n, 1);
				tmp_n = vaddq_s16(tmp_n, h_n);
			}
			vst1q_s16(dst_ptr, tmp_n);
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar version of the loop body for the one sample left over. */
		if (nDstCount > batchSize)
		{
			int16_t tmp_n = *(dst_ptr - nDstStep);
			if (n == 31)
			{
				int16_t dst_n_p = *l_ptr;
				l_ptr += 1;
				tmp_n += dst_n_p;
				tmp_n >>= 1;
			}
			else
			{
				int16_t dst_n_p = *(dst_ptr + nDstStep);
				tmp_n += dst_n_p;
				tmp_n >>= 1;
				/* Doubling by multiplication: left-shifting a negative value is
				 * undefined behaviour in C. */
				const int16_t h_n = (int16_t)(*h_ptr * 2);
				tmp_n += h_n;
			}
			*dst_ptr = tmp_n;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip the even row written by the first pass. */
		dst_ptr += nDstStep;
	}
}
464 
/* Number of low-band coefficients per row/column at the given decomposition
 * level: 64 halved `level` times, plus one. */
static INLINE size_t prfx_get_band_l_count(size_t level)
{
	const size_t halved = 64 >> level;

	return halved + 1;
}
469 
/* Number of high-band coefficients per row/column at the given decomposition
 * level. Level 1 is one short of half of 64; the other levels are 64 divided
 * by 2^level, rounded to nearest. */
static INLINE size_t prfx_get_band_h_count(size_t level)
{
	if (level == 1)
		return (64 >> 1) - 1;

	/* Adding half of the divisor before shifting rounds to nearest. */
	const int rounding = 1 << (level - 1);

	return (size_t)((64 + rounding) >> level);
}
477 
478 static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
479  size_t level)
480 {
481  size_t nDstStepX;
482  size_t nDstStepY;
483  INT16 *HL, *LH;
484  INT16 *HH, *LL;
485  INT16 *L, *H, *LLx;
486 
487  const size_t nBandL = prfx_get_band_l_count(level);
488  const size_t nBandH = prfx_get_band_h_count(level);
489  size_t offset = 0;
490 
491  WINPR_ASSERT(buffer);
492  WINPR_ASSERT(temp);
493 
494  HL = &buffer[offset];
495  offset += (nBandH * nBandL);
496  LH = &buffer[offset];
497  offset += (nBandL * nBandH);
498  HH = &buffer[offset];
499  offset += (nBandH * nBandH);
500  LL = &buffer[offset];
501  nDstStepX = (nBandL + nBandH);
502  nDstStepY = (nBandL + nBandH);
503  offset = 0;
504  L = &temp[offset];
505  offset += (nBandL * nDstStepX);
506  H = &temp[offset];
507  LLx = &buffer[0];
508 
509  /* horizontal (LL + HL -> L) */
510  rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
511 
512  /* horizontal (LH + HH -> H) */
513  rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
514 
515  /* vertical (L + H -> LL) */
516  rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
517  nBandL + nBandH);
518 }
519 
520 static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
521 {
522  WINPR_ASSERT(buffer);
523  WINPR_ASSERT(temp);
524  rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
525  rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
526  rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
527 }
528 #endif // NEON_ENABLED
529 
530 void rfx_init_neon(RFX_CONTEXT* context)
531 {
532 #if defined(NEON_ENABLED)
533  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
534  {
535  DEBUG_RFX("Using NEON optimizations");
536  PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
537  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
538  "rfx_quantization_decode_NEON");
539  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
540  context->quantization_decode = rfx_quantization_decode_NEON;
541  context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
542  context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
543  }
544 #else
545  WINPR_UNUSED(context);
546 #endif
547 }