20 #include <winpr/platform.h>
21 #include <freerdp/config.h>
22 #include <freerdp/log.h>
24 #include "../rfx_types.h"
27 #if defined(WITH_NEON)
28 #if defined(_M_ARM64) || defined(_M_ARM)
33 #if defined(NEON_ENABLED)
39 #include <winpr/sysinfo.h>
/*
 * Shift every 16-bit coefficient of one block left by a common amount, one
 * 8-lane NEON vector at a time, updating the buffer in place.
 *
 * buffer      - coefficients to scale; each vector is loaded from and stored
 *               back to the same address.
 * buffer_size - element count (INT16 units) used to compute the end pointer.
 * factor      - shift count, broadcast into every lane with vdupq_n_s16.
 *
 * NOTE(review): the opening of the loop and the advance of `buf` are not
 * visible in this listing; the trailing `while (buf < buf_end)` looks like the
 * tail of a do/while, which would process at least one vector - confirm.
 */
43 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
44 rfx_quantization_decode_block_NEON(INT16* buffer,
const size_t buffer_size,
const UINT32 factor)
/* Broadcast the shift count so one instruction can shift all eight lanes. */
46 int16x8_t quantFactors = vdupq_n_s16(factor);
47 int16x8_t* buf = (int16x8_t*)buffer;
/* End pointer is derived from the element count, not a byte count. */
48 int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
52 int16x8_t val = vld1q_s16((INT16*)buf);
/* vshlq_s16 shifts each lane by the signed count held in the matching lane of quantFactors. */
53 val = vshlq_s16(val, quantFactors);
54 vst1q_s16((INT16*)buf, val);
56 }
while (buf < buf_end);
/*
 * Apply rfx_quantization_decode_block_NEON to ten consecutive regions of
 * `buffer`, each with its own shift taken from `quantVals`.
 *
 * The regions are three of 1024 elements (offsets 0, 1024, 2048), three of 256
 * (offsets 3072, 3328, 3584) and four of 64 (offsets 3840, 3904, 3968, 4032),
 * covering elements 0..4095 in total.
 *
 * buffer    - coefficient buffer, modified in place.
 * quantVals - array indexed 0..9 below; each region is shifted by
 *             quantVals[i] - 1.
 *
 * NOTE(review): presumably each region is one DWT sub-band of a tile and the
 * index order maps sub-bands to their quantisation values - confirm against
 * the codec specification.
 * NOTE(review): quantVals is UINT32, so `quantVals[i] - 1` wraps if an entry
 * is 0; verify that callers guarantee non-zero values.
 */
59 static void rfx_quantization_decode_NEON(INT16* buffer,
const UINT32* WINPR_RESTRICT quantVals)
62 WINPR_ASSERT(quantVals);
/* 1024-element regions */
64 rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1);
65 rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1);
66 rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1);
/* 256-element regions */
67 rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1);
68 rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1);
69 rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1);
/* 64-element regions */
70 rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1);
71 rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1);
72 rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1);
73 rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1);
/*
 * Horizontal pass of the inverse DWT for one sub-band pair, eight coefficients
 * per NEON vector.
 *
 * For every row there are two inner passes:
 *   1. l[n] is overwritten with l[n] - ((h[n] + h[n-1] + 1) >> 1).
 *   2. That updated l[n] and ((l[n] + l[n+1]) >> 1) + (h[n] << 1) are written
 *      interleaved to dst by vst2q_s16.
 *
 * l             - low-band input; modified in place by the first pass.
 * h             - high-band input.
 * dst           - interleaved output.
 * subband_width - row count and per-row element count; the inner loops step
 *                 by 8, so it is expected to be a multiple of 8.
 *
 * NOTE(review): the declarations of l_ptr / h_ptr / dst_ptr / dst_n, the
 * per-iteration pointer increments and the condition guarding the first-lane
 * fix-up are not visible in this listing.
 */
76 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
77 rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
78 INT16* WINPR_RESTRICT dst,
size_t subband_width)
84 for (
size_t y = 0; y < subband_width; y++)
/* Pass 1: update the low-band samples in place. */
87 for (
size_t n = 0; n < subband_width; n += 8)
90 int16x8_t l_n = vld1q_s16(l_ptr);
91 int16x8_t h_n = vld1q_s16(h_ptr);
/* Same vector shifted by one element: lane i holds h[n + i - 1]. */
92 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
/* Edge fix-up: replace lane 0 with lane 1 (presumably only for the first vector of a row - confirm). */
96 int16_t first = vgetq_lane_s16(h_n_m, 1);
97 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
100 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
101 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
102 tmp_n = vshrq_n_s16(tmp_n, 1);
103 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
/* The result goes back into the low band; pass 2 reads it from there. */
104 vst1q_s16(l_ptr, dst_n);
/* Rewind both band pointers by one row so pass 2 re-reads the same row. */
109 l_ptr -= subband_width;
110 h_ptr -= subband_width;
/* Pass 2: build the second output stream and write both streams interleaved. */
113 for (
size_t n = 0; n < subband_width; n += 8)
116 int16x8_t h_n = vld1q_s16(h_ptr);
117 h_n = vshlq_n_s16(h_n, 1);
119 dst_n.val[0] = vld1q_s16(l_ptr);
/* Lane i holds l[n + i + 1]. */
120 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
/* Last vector of the row: duplicate lane 6 into lane 7 instead of using the element past the row. */
122 if (n == subband_width - 8)
124 int16_t last = vgetq_lane_s16(dst_n_p, 6);
125 dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
128 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
129 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
130 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
/* vst2q_s16 interleaves val[0] and val[1] element by element into dst. */
131 vst2q_s16(dst_ptr, dst_n);
/*
 * Vertical pass of the inverse DWT over rows of width 2 * subband_width,
 * eight coefficients per NEON vector.
 *
 * First loop: each stored vector is l - ((h + <neighbour> + 1) >> 1), where
 * <neighbour> is either h itself or the h row `total_width` elements earlier.
 * Second loop: starts at dst + total_width and stores
 * ((row above + <other row>) >> 1) + (h << 1), where <other row> is the row
 * above again when n is the last index, otherwise the row below.
 *
 * l, h          - inputs read row by row.
 * dst           - output written by both loops; the second loop also reads
 *                 rows stored by the first.
 * subband_width - row count of each loop; total_width is twice this value.
 *
 * NOTE(review): the declarations of l_ptr / h_ptr, the per-vector pointer
 * increments and the if/else headers around the neighbour selection are not
 * visible in this listing. The first branch is presumably taken only for
 * n == 0 - confirm.
 */
139 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
141 INT16* WINPR_RESTRICT dst,
size_t subband_width)
145 INT16* dst_ptr = dst;
146 const size_t total_width = subband_width + subband_width;
/* First loop: rows derived from l and h. */
149 for (
size_t n = 0; n < subband_width; n++)
151 for (
size_t x = 0; x < total_width; x += 8)
154 int16x8_t l_n = vld1q_s16(l_ptr);
155 int16x8_t h_n = vld1q_s16(h_ptr);
/* Start with h + 1 (the rounding term). */
156 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
/* Alternative A: add h again (no previous row available). */
159 tmp_n = vaddq_s16(tmp_n, h_n);
/* Alternative B: add the h row one full output row earlier. */
162 int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
163 tmp_n = vaddq_s16(tmp_n, h_n_m);
166 tmp_n = vshrq_n_s16(tmp_n, 1);
167 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
168 vst1q_s16(dst_ptr, dst_n);
/* Advance the output by an additional full row. */
174 dst_ptr += total_width;
/* Second loop starts one row below the top of dst. */
178 dst_ptr = dst + total_width;
181 for (
size_t n = 0; n < subband_width; n++)
183 for (
size_t x = 0; x < total_width; x += 8)
186 int16x8_t h_n = vld1q_s16(h_ptr);
/* Row directly above the one being written. */
187 int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
188 h_n = vshlq_n_s16(h_n, 1);
189 int16x8_t tmp_n = dst_n_m;
/* Last row: there is no row below, so the row above is used twice. */
191 if (n == subband_width - 1)
192 tmp_n = vaddq_s16(tmp_n, dst_n_m);
/* Otherwise average with the row below. */
195 int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
196 tmp_n = vaddq_s16(tmp_n, dst_n_p);
199 tmp_n = vshrq_n_s16(tmp_n, 1);
200 int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
201 vst1q_s16(dst_ptr, dst_n);
206 dst_ptr += total_width;
/*
 * Run one level of the 2D inverse DWT: two horizontal passes into the scratch
 * buffer `idwt`, then one vertical pass that writes the result back into
 * `buffer`.
 *
 * buffer        - input coefficients; lh starts at subband_width^2, hh at
 *                 2 * subband_width^2 and ll at 3 * subband_width^2. It also
 *                 receives the output of the vertical pass.
 * idwt          - scratch area; h_dst starts at 2 * subband_width^2 within it.
 * subband_width - edge length of each sub-band.
 *
 * NOTE(review): the assignments of `hl` and `l_dst` are not visible in this
 * listing; presumably hl = buffer and l_dst = idwt - confirm.
 */
210 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
211 rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
212 size_t subband_width)
214 INT16 *hl, *lh, *hh, *ll;
215 INT16 *l_dst, *h_dst;
221 ll = buffer + subband_width * subband_width * 3;
/* Horizontal pass over the ll/hl pair. */
224 rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
225 lh = buffer + subband_width * subband_width;
226 hh = buffer + subband_width * subband_width * 2;
227 h_dst = idwt + subband_width * subband_width * 2;
/* Horizontal pass over the lh/hh pair. */
228 rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
/* Vertical pass combines both intermediate halves back into buffer. */
230 rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
/*
 * Run the block-level inverse DWT three times on `buffer`, with sub-band
 * widths 8, 16 and 32, starting at offsets 3840, 3072 and 0 respectively.
 *
 * buffer     - coefficient buffer, transformed in place.
 * dwt_buffer - scratch buffer handed to every level.
 *
 * Each call writes its result back at the offset it was given, so the later,
 * wider calls read regions that include the earlier outputs.
 */
233 static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
235 rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
236 rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
237 rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
/*
 * Horizontal inverse DWT pass for bands whose low and high parts have
 * different lengths, combining 8-lane NEON loops with scalar handling of the
 * leftover element.
 *
 * pLowBand   - low-band input; the first vector loop stores its result back
 *              into it, so it is modified.
 * pHighBand  - high-band input (read only).
 * pDstBand   - interleaved output written by vst2q_s16.
 * nLowCount, nHighCount - band lengths; the vector loops run while
 *              n < (nLowCount + nHighCount) >> 1, in steps of 8.
 * nDstCount  - number of rows processed.
 * nLowStep, nHighStep, nDstStep - NOTE(review): their use is not visible in
 *              this listing; presumably per-row strides - confirm.
 *
 * NOTE(review): the declarations of `n` and `dst_n`, the pointer increments
 * and the conditions guarding the lane fix-ups are not visible here.
 */
240 static INLINE
void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand,
size_t nLowStep,
241 const INT16* restrict pHighBand,
242 size_t nHighStep, INT16* restrict pDstBand,
243 size_t nDstStep,
size_t nLowCount,
244 size_t nHighCount,
size_t nDstCount)
246 WINPR_ASSERT(pLowBand);
247 WINPR_ASSERT(pHighBand);
248 WINPR_ASSERT(pDstBand);
250 INT16* l_ptr = pLowBand;
251 const INT16* h_ptr = pHighBand;
252 INT16* dst_ptr = pDstBand;
/* Number of elements handled by the vector loops in each row. */
253 size_t batchSize = (nLowCount + nHighCount) >> 1;
255 for (
size_t y = 0; y < nDstCount; y++)
/* Pass 1: l[n] -= (h[n] + h[n-1] + 1) >> 1, stored back into the low band. */
259 for (; n < batchSize; n += 8)
262 int16x8_t l_n = vld1q_s16(l_ptr);
263 int16x8_t h_n = vld1q_s16(h_ptr);
264 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
/* Edge fix-up: replace lane 0 with lane 1 (presumably only for the first vector of a row - confirm). */
268 int16_t first = vgetq_lane_s16(h_n_m, 1);
269 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
/* Zero the last high-band lane (presumably only for the final vector, where the high band is shorter - confirm). */
272 h_n = vsetq_lane_s16(0, h_n, 7);
274 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
275 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
276 tmp_n = vshrq_n_s16(tmp_n, 1);
277 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
278 vst1q_s16(l_ptr, dst_n);
/* Scalar step for one low-band element: subtract the preceding high-band value. */
283 *l_ptr -= *(h_ptr - 1);
/* Pass 2: write the updated l[n] and ((l[n] + l[n+1]) >> 1) + (h[n] << 1) interleaved. */
290 for (; n < batchSize; n += 8)
293 int16x8_t h_n = vld1q_s16(h_ptr);
294 h_n = vshlq_n_s16(h_n, 1);
296 dst_n.val[0] = vld1q_s16(l_ptr);
297 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
/* Zero the last high-band lane (guarding condition not visible in this listing). */
300 h_n = vsetq_lane_s16(0, h_n, 7);
302 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
303 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
304 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
305 vst2q_s16(dst_ptr, dst_n);
/*
 * Vertical inverse DWT pass for bands whose low and high parts have different
 * lengths. Each row is processed in 8-lane NEON vectors, followed by a scalar
 * step when nDstCount is not a multiple of 8.
 *
 * pLowBand, pHighBand - inputs, read only.
 * pDstBand            - output; the second loop starts one row (nDstStep)
 *                       below its start and reads rows written earlier.
 * nHighStep, nDstStep - element offsets used to reach the previous or next row.
 * nLowCount, nHighCount - band lengths; both row loops run
 *                       (nLowCount + nHighCount) >> 1 times.
 *
 * NOTE(review): the final parameter (nDstCount, used below) is missing from
 * the visible signature, as are several if/else headers, pointer increments
 * and the arithmetic of the scalar tail in the second loop.
 * NOTE(review): the literal 31 and the `forceBandSize < 32` check hard-code a
 * specific band size; presumably this matches the largest level - confirm.
 */
324 static INLINE
void rfx_idwt_extrapolate_vert_neon(
const INT16* restrict pLowBand,
size_t nLowStep,
325 const INT16* restrict pHighBand,
size_t nHighStep,
326 INT16* restrict pDstBand,
size_t nDstStep,
327 size_t nLowCount,
size_t nHighCount,
330 WINPR_ASSERT(pLowBand);
331 WINPR_ASSERT(pHighBand);
332 WINPR_ASSERT(pDstBand);
334 const INT16* l_ptr = pLowBand;
335 const INT16* h_ptr = pHighBand;
336 INT16* dst_ptr = pDstBand;
/* Largest multiple of 8 not exceeding nDstCount: the part handled by vectors. */
337 size_t batchSize = (nDstCount >> 3) << 3;
338 size_t forceBandSize = (nLowCount + nHighCount) >> 1;
/* First loop: dst = l - ((h + <neighbour> + 1) >> 1). */
341 for (
size_t n = 0; n < forceBandSize; n++)
343 for (
size_t x = 0; x < batchSize; x += 8)
346 int16x8_t l_n = vld1q_s16(l_ptr);
/* For n == 31 the previous high-band row is used in place of the current one. */
347 int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
348 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
/* Alternative A: add h again. */
351 tmp_n = vaddq_s16(tmp_n, h_n);
/* Alternative B: add the previous high-band row. */
354 int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
355 tmp_n = vaddq_s16(tmp_n, h_n_m);
358 tmp_n = vshrq_n_s16(tmp_n, 1);
359 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
360 vst1q_s16(dst_ptr, dst_n);
/* Scalar tail for the element beyond the vectorised part of the row. */
366 if (nDstCount > batchSize)
368 int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
369 int16_t tmp_n = h_n + 1;
373 tmp_n += *(h_ptr - nHighStep);
375 *dst_ptr = *l_ptr - tmp_n;
/* Extra row for smaller bands: dst = l - previous high-band row. */
384 if (forceBandSize < 32)
386 for (
size_t x = 0; x < batchSize; x += 8)
388 int16x8_t l_n = vld1q_s16(l_ptr);
389 int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
390 int16x8_t tmp_n = vsubq_s16(l_n, h_n);
391 vst1q_s16(dst_ptr, tmp_n);
397 if (nDstCount > batchSize)
399 *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
/* Second loop starts one row below the top of the destination. */
407 dst_ptr = pDstBand + nDstStep;
410 for (
size_t n = 0; n < forceBandSize; n++)
412 for (
size_t x = 0; x < batchSize; x += 8)
/* Row directly above the one being written. */
415 int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
/* Variant using the low band as the second operand (selecting condition not visible). */
418 int16x8_t dst_n_p = vld1q_s16(l_ptr);
420 tmp_n = vaddq_s16(tmp_n, dst_n_p);
421 tmp_n = vshrq_n_s16(tmp_n, 1);
/* Variant using the row below, plus the doubled high-band row. */
425 int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
426 tmp_n = vaddq_s16(tmp_n, dst_n_p);
427 tmp_n = vshrq_n_s16(tmp_n, 1);
428 int16x8_t h_n = vld1q_s16(h_ptr);
429 h_n = vshlq_n_s16(h_n, 1);
430 tmp_n = vaddq_s16(tmp_n, h_n);
432 vst1q_s16(dst_ptr, tmp_n);
/* Scalar tail reading the same operands as the vector variants above. */
437 if (nDstCount > batchSize)
439 int16_t tmp_n = *(dst_ptr - nDstStep);
442 int16_t dst_n_p = *l_ptr;
449 int16_t dst_n_p = *(dst_ptr + nDstStep);
452 int16_t h_n = *h_ptr;
/*
 * Return the low-band element count for a decomposition level:
 * (64 >> level) + 1.
 *
 * level - decomposition level. It is used directly as a shift count, so it
 *         must be smaller than the bit width of int.
 */
465 static INLINE
size_t prfx_get_band_l_count(
size_t level)
467 return (64 >> level) + 1;
/*
 * Return the high-band element count for a decomposition level.
 *
 * Two results are visible: the constant (64 >> 1) - 1, i.e. 31, and the
 * general form (64 + (1 << (level - 1))) >> level.
 *
 * NOTE(review): the condition choosing between the two returns is not visible
 * in this listing; presumably the constant applies to level 1 - confirm.
 * The general form shifts by level - 1, so level must be at least 1 there.
 */
470 static INLINE
size_t prfx_get_band_h_count(
size_t level)
473 return (64 >> 1) - 1;
475 return (64 + (1 << (level - 1))) >> level;
/*
 * Decode one level of the extrapolated 2D inverse DWT: locate the four
 * sub-bands inside `buffer`, run two horizontal passes, then one vertical pass.
 *
 * Sub-band layout in `buffer`, in order:
 *   HL (nBandH * nBandL elements), LH (nBandL * nBandH),
 *   HH (nBandH * nBandH), then LL.
 * Both destination strides are nBandL + nBandH.
 *
 * buffer - coefficients for this level.
 * temp   - NOTE(review): presumably scratch space holding the L and H
 *          intermediates; the assignments of L, H and LLx are not visible.
 *
 * NOTE(review): the rest of the parameter list (a `level` argument is used
 * below), the local declarations and the tail of the final call are not
 * visible in this listing.
 */
478 static INLINE
void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
487 const size_t nBandL = prfx_get_band_l_count(level);
488 const size_t nBandH = prfx_get_band_h_count(level);
491 WINPR_ASSERT(buffer);
/* Walk `offset` through the buffer to find the start of each sub-band. */
494 HL = &buffer[offset];
495 offset += (nBandH * nBandL);
496 LH = &buffer[offset];
497 offset += (nBandL * nBandH);
498 HH = &buffer[offset];
499 offset += (nBandH * nBandH);
500 LL = &buffer[offset];
501 nDstStepX = (nBandL + nBandH);
502 nDstStepY = (nBandL + nBandH);
505 offset += (nBandL * nDstStepX);
/* Horizontal pass over LL/HL producing L, with nBandL rows. */
510 rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
/* Horizontal pass over LH/HH producing H, with nBandH rows. */
513 rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
/* Vertical pass combining L and H into LLx. */
516 rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
/*
 * Run the extrapolated inverse DWT for levels 3, 2 and 1, in that order,
 * starting at buffer offsets 3807, 3007 and 0 respectively.
 *
 * buffer - coefficient buffer (asserted non-NULL).
 * temp   - passed unchanged to every level.
 */
520 static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
522 WINPR_ASSERT(buffer);
524 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
525 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
526 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
/*
 * Install the NEON implementations into an RFX context when NEON support is
 * compiled in and the CPU reports it at run time.
 *
 * context - RFX context whose quantization_decode, dwt_2d_decode and
 *           dwt_2d_extrapolate_decode callbacks are replaced, and whose
 *           profilers are renamed to match.
 *
 * NOTE(review): the `#else` / `#endif` lines are not visible in this listing;
 * the trailing WINPR_UNUSED(context) is presumably the branch taken when
 * NEON_ENABLED is not defined - confirm.
 */
530 void rfx_init_neon(RFX_CONTEXT* context)
532 #if defined(NEON_ENABLED)
/* Run-time check: only switch implementations if the processor reports NEON. */
533 if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
535 DEBUG_RFX(
"Using NEON optimizations");
/* The YCbCr profiler is renamed here, but no YCbCr callback is assigned in the visible code. */
536 PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb,
"rfx_decode_YCbCr_to_RGB_NEON");
537 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
538 "rfx_quantization_decode_NEON");
539 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode,
"rfx_dwt_2d_decode_NEON");
540 context->quantization_decode = rfx_quantization_decode_NEON;
541 context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
542 context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
545 WINPR_UNUSED(context);