20 #include <winpr/platform.h>
21 #include <freerdp/config.h>
22 #include <freerdp/log.h>
24 #include "../rfx_types.h"
27 #include "../../core/simd.h"
29 #if defined(NEON_INTRINSICS_ENABLED)
35 #include <winpr/sysinfo.h>
/**
 * Shift a run of INT16 values by a common shift count, processing one
 * int16x8_t (eight lanes) per load/shift/store.
 *
 * @param buffer      values that are loaded, shifted and stored back to the
 *                    same address
 * @param buffer_size element count; buffer + buffer_size is used as the end
 *                    pointer of the loop
 * @param factor      shift count, broadcast to all eight lanes
 */
39 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40 rfx_quantization_decode_block_NEON(INT16* buffer,
const size_t buffer_size,
const UINT32 factor)
/* factor is a UINT32 but is handed to vdupq_n_s16, i.e. it is used as a
 * signed 16 bit per-lane shift count. */
42 int16x8_t quantFactors = vdupq_n_s16(factor);
/* Walk the buffer as a sequence of 8-lane vectors. */
43 int16x8_t* buf = (int16x8_t*)buffer;
44 int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
/* Load eight values, shift each lane by quantFactors, write them back. */
48 int16x8_t val = vld1q_s16((INT16*)buf);
49 val = vshlq_s16(val, quantFactors);
50 vst1q_s16((INT16*)buf, val);
52 }
/* Loop continues until buf reaches buf_end.
 * NOTE(review): presumably buffer_size must be a non-zero multiple of 8
 * for this condition to stop exactly at the end - confirm against callers. */
while (buf < buf_end);
/**
 * Run rfx_quantization_decode_block_NEON over ten consecutive regions of
 * buffer, each with its own shift count taken from quantVals.
 *
 * The regions cover elements 0..4095: three of 1024 elements, three of 256
 * elements and four of 64 elements.
 * NOTE(review): presumably these are the per-subband regions of one decoded
 * tile - confirm against the tile layout.
 *
 * @param buffer    coefficient buffer, modified in place by the block helper
 * @param quantVals ten values; entry i is used as (quantVals[i] - 1)
 */
55 static void rfx_quantization_decode_NEON(INT16* buffer,
const UINT32* WINPR_RESTRICT quantVals)
58 WINPR_ASSERT(quantVals);
/* NOTE(review): quantVals[i] - 1 is evaluated in UINT32; an entry of 0 would
 * wrap around before being narrowed to the shift count - confirm that
 * callers guarantee values >= 1. */
/* 1024-element regions: quantVals indices 8, 7, 9 */
60 rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1);
61 rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1);
62 rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1);
/* 256-element regions: quantVals indices 5, 4, 6 */
63 rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1);
64 rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1);
65 rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1);
/* 64-element regions: quantVals indices 2, 1, 3 and finally 0 */
66 rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1);
67 rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1);
68 rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1);
69 rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1);
/**
 * Horizontal pass over a low band (l) and a high band (h), eight INT16
 * lanes at a time, writing an interleaved result to dst.
 *
 * For each of subband_width rows two sweeps are made:
 *  1. l[n] = l[n] - ((h[n] + h[n-1] + 1) >> 1), stored back into l
 *  2. second = ((l[n] + l[n+1]) >> 1) + (h[n] << 1); l[n] and second are
 *     stored interleaved to dst
 *
 * @param l             low band; overwritten by sweep 1
 * @param h             high band; only read in the visible statements
 * @param dst           destination of the interleaved output
 * @param subband_width row count and row length of the bands; the inner
 *                      loops step by 8
 */
72 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73 rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
74 INT16* WINPR_RESTRICT dst,
size_t subband_width)
/* One iteration per row. */
80 for (
size_t y = 0; y < subband_width; y++)
/* Sweep 1: update the low band in place, eight values per step. */
83 for (
size_t n = 0; n < subband_width; n += 8)
86 int16x8_t l_n = vld1q_s16(l_ptr);
87 int16x8_t h_n = vld1q_s16(h_ptr);
/* Same high-band values shifted by one element: h[n-1]. */
88 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
/* Replace lane 0 of h_n_m with its lane 1.
 * NOTE(review): presumably meant for the first group of a row only, where
 * h_ptr - 1 lies before the row - confirm the guarding condition. */
92 int16_t first = vgetq_lane_s16(h_n_m, 1);
93 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
/* tmp = (h[n] + h[n-1] + 1) >> 1 */
96 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
97 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
98 tmp_n = vshrq_n_s16(tmp_n, 1);
/* l[n] -= tmp, written back to the low band itself. */
99 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
100 vst1q_s16(l_ptr, dst_n);
/* Step both band pointers back by one row length before sweep 2. */
105 l_ptr -= subband_width;
106 h_ptr -= subband_width;
/* Sweep 2: build the second output value and store interleaved to dst. */
109 for (
size_t n = 0; n < subband_width; n += 8)
/* h[n] << 1 */
112 int16x8_t h_n = vld1q_s16(h_ptr);
113 h_n = vshlq_n_s16(h_n, 1);
/* val[0] is the updated low value l[n]; dst_n_p is l[n+1]. */
115 dst_n.val[0] = vld1q_s16(l_ptr);
116 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
/* Last group of eight in the row: lane 7 of dst_n_p would come from
 * l_ptr + 8, so it is replaced with lane 6. */
118 if (n == subband_width - 8)
120 int16_t last = vgetq_lane_s16(dst_n_p, 6);
121 dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
/* val[1] = ((l[n+1] + l[n]) >> 1) + (h[n] << 1) */
124 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
125 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
126 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
/* Two-vector store of val[0] and val[1] to dst_ptr. */
127 vst2q_s16(dst_ptr, dst_n);
/**
 * Vertical pass combining two buffers (l, h) into dst, eight INT16 lanes at
 * a time. Rows are total_width = 2 * subband_width elements long.
 *
 * Two sweeps of subband_width iterations each are made:
 *  1. dst = l - ((h + 1 + X) >> 1), where X is either h itself or the h row
 *     one total_width earlier; after each row dst_ptr advances by
 *     total_width
 *  2. starting at dst + total_width:
 *     dst = ((above + Y) >> 1) + (h << 1), where "above" is the dst row one
 *     total_width earlier and Y is either "above" again or the dst row one
 *     total_width later
 *
 * @param l             first input, read only in the visible statements
 * @param h             second input, read only in the visible statements
 * @param dst           output buffer
 * @param subband_width iteration count of each sweep; half of a row length
 */
135 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136 rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
137 INT16* WINPR_RESTRICT dst,
size_t subband_width)
141 INT16* dst_ptr = dst;
142 const size_t total_width = subband_width + subband_width;
/* Sweep 1 */
145 for (
size_t n = 0; n < subband_width; n++)
147 for (
size_t x = 0; x < total_width; x += 8)
150 int16x8_t l_n = vld1q_s16(l_ptr);
151 int16x8_t h_n = vld1q_s16(h_ptr);
/* tmp = h + 1 */
152 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
/* Second addend, variant A: the current h row again.
 * NOTE(review): presumably used only for the first row, where no previous
 * h row exists - confirm the selecting condition. */
155 tmp_n = vaddq_s16(tmp_n, h_n);
/* Second addend, variant B: the h row one total_width earlier. */
158 int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
159 tmp_n = vaddq_s16(tmp_n, h_n_m);
/* dst = l - (tmp >> 1) */
162 tmp_n = vshrq_n_s16(tmp_n, 1);
163 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
164 vst1q_s16(dst_ptr, dst_n);
/* Move dst_ptr forward by one row length. */
170 dst_ptr += total_width;
/* Sweep 2 starts one row length into dst. */
174 dst_ptr = dst + total_width;
177 for (
size_t n = 0; n < subband_width; n++)
179 for (
size_t x = 0; x < total_width; x += 8)
182 int16x8_t h_n = vld1q_s16(h_ptr);
/* Row written by sweep 1 one total_width before the current position. */
183 int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
184 h_n = vshlq_n_s16(h_n, 1);
185 int16x8_t tmp_n = dst_n_m;
/* Final iteration: add the row above a second time. */
187 if (n == subband_width - 1)
188 tmp_n = vaddq_s16(tmp_n, dst_n_m);
/* Otherwise the row one total_width below is added.
 * NOTE(review): presumably the else branch of the test above - confirm. */
191 int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
192 tmp_n = vaddq_s16(tmp_n, dst_n_p);
/* dst = (tmp >> 1) + (h << 1) */
195 tmp_n = vshrq_n_s16(tmp_n, 1);
196 int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
197 vst1q_s16(dst_ptr, dst_n);
/* Move dst_ptr forward by one row length. */
202 dst_ptr += total_width;
/**
 * Decode one block: two horizontal passes into the scratch buffer idwt,
 * followed by one vertical pass that writes the result back into buffer.
 *
 * Band positions used below, in units of subband_width * subband_width:
 * lh at 1, hh at 2, ll at 3 within buffer; h_dst at 2 within idwt.
 *
 * @param buffer        holds the input bands and receives the final output
 * @param idwt          scratch buffer for the horizontal pass results
 * @param subband_width width (and height) of one band
 */
206 static __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207 rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
208 size_t subband_width)
210 INT16 *hl, *lh, *hh, *ll;
211 INT16 *l_dst, *h_dst;
217 ll = buffer + subband_width * subband_width * 3;
/* Horizontal pass on the (ll, hl) pair into l_dst. */
220 rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
221 lh = buffer + subband_width * subband_width;
222 hh = buffer + subband_width * subband_width * 2;
223 h_dst = idwt + subband_width * subband_width * 2;
/* Horizontal pass on the (lh, hh) pair into h_dst. */
224 rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
/* Vertical pass over both intermediate results; output goes to buffer. */
226 rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
/**
 * Run the block decoder three times over the same buffer, with band widths
 * 8, 16 and 32, starting at element offsets 3840, 3072 and 0 respectively.
 *
 * @param buffer     coefficient buffer, decoded in place
 * @param dwt_buffer scratch buffer shared by all three calls
 */
229 static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
/* Smallest block first; each later call starts earlier in the buffer. */
231 rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
232 rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
233 rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
/**
 * Horizontal pass for the "extrapolate" variant, where the low and high
 * band have different element counts.
 *
 * For each of nDstCount rows, batchSize = (nLowCount + nHighCount) >> 1
 * elements are processed in groups of eight in two sweeps:
 *  1. l[n] = l[n] - ((h[n] + h[n-1] + 1) >> 1), stored back into the low band
 *  2. second = ((l[n] + l[n+1]) >> 1) + (h[n] << 1); l[n] and second are
 *     stored as a two-vector store to the destination
 *
 * @param pLowBand   low band; overwritten by sweep 1
 * @param nLowStep   not referenced by the statements documented here
 * @param pHighBand  high band, read only
 * @param nHighStep  not referenced by the statements documented here
 * @param pDstBand   destination buffer
 * @param nDstStep   not referenced by the statements documented here
 * @param nLowCount  element count of the low band
 * @param nHighCount element count of the high band
 * @param nDstCount  number of rows to process
 */
236 static INLINE
void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand,
size_t nLowStep,
237 const INT16* restrict pHighBand,
238 size_t nHighStep, INT16* restrict pDstBand,
239 size_t nDstStep,
size_t nLowCount,
240 size_t nHighCount,
size_t nDstCount)
242 WINPR_ASSERT(pLowBand);
243 WINPR_ASSERT(pHighBand);
244 WINPR_ASSERT(pDstBand);
246 INT16* l_ptr = pLowBand;
247 const INT16* h_ptr = pHighBand;
248 INT16* dst_ptr = pDstBand;
/* Number of elements per row handled by the vector loops. */
249 size_t batchSize = (nLowCount + nHighCount) >> 1;
/* One iteration per row. */
251 for (
size_t y = 0; y < nDstCount; y++)
/* Sweep 1: update the low band in place. */
255 for (; n < batchSize; n += 8)
258 int16x8_t l_n = vld1q_s16(l_ptr);
259 int16x8_t h_n = vld1q_s16(h_ptr);
/* High band shifted by one element: h[n-1]. */
260 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
/* Replace lane 0 of h_n_m with its lane 1.
 * NOTE(review): presumably for the first group of a row only - confirm. */
264 int16_t first = vgetq_lane_s16(h_n_m, 1);
265 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
/* Force lane 7 of h_n to zero.
 * NOTE(review): presumably for the last group, where the high band has no
 * element at that position - confirm the guarding condition. */
268 h_n = vsetq_lane_s16(0, h_n, 7);
/* l[n] -= (h[n] + h[n-1] + 1) >> 1 */
270 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
271 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
272 tmp_n = vshrq_n_s16(tmp_n, 1);
273 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
274 vst1q_s16(l_ptr, dst_n);
/* Scalar step: subtract the preceding high-band value from the current
 * low-band element. */
279 *l_ptr -= *(h_ptr - 1);
/* Sweep 2: produce the output pairs. */
286 for (; n < batchSize; n += 8)
/* h[n] << 1 */
289 int16x8_t h_n = vld1q_s16(h_ptr);
290 h_n = vshlq_n_s16(h_n, 1);
/* val[0] is the updated low value l[n]; dst_n_p is l[n+1]. */
292 dst_n.val[0] = vld1q_s16(l_ptr);
293 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
/* Force lane 7 of the doubled high values to zero.
 * NOTE(review): presumably for the last group only - confirm. */
296 h_n = vsetq_lane_s16(0, h_n, 7);
/* val[1] = ((l[n+1] + l[n]) >> 1) + (h[n] << 1) */
298 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
299 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
300 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
/* Two-vector store of val[0] and val[1] to dst_ptr. */
301 vst2q_s16(dst_ptr, dst_n);
/**
 * Vertical pass for the "extrapolate" variant.
 *
 * Columns are processed in vector groups of eight up to
 * batchSize = nDstCount rounded down to a multiple of 8; when nDstCount is
 * larger than batchSize a scalar statement handles the remainder.
 * forceBandSize = (nLowCount + nHighCount) >> 1 is the iteration count of
 * both row loops.
 *
 * Sweep 1 writes dst = l - ((h + 1 + X) >> 1) in the vector path, where X
 * is either h itself or the h row nHighStep earlier.
 * If forceBandSize < 32 one further row dst = l - h(previous row) is written.
 * Sweep 2 starts at pDstBand + nDstStep and combines the neighbouring dst
 * rows (or the l row) with (h << 1).
 *
 * NOTE(review): the literals 31 and 32 tie this routine to one specific
 * band size - confirm against the band count helpers before reuse.
 *
 * @param pLowBand   low input, read only
 * @param nLowStep   not referenced by the statements documented here
 * @param pHighBand  high input, read only
 * @param nHighStep  row stride used to address the previous high row
 * @param pDstBand   output buffer
 * @param nDstStep   row stride of the output buffer
 * @param nLowCount  element count of the low band
 * @param nHighCount element count of the high band
 */
320 static INLINE
void rfx_idwt_extrapolate_vert_neon(
const INT16* restrict pLowBand,
size_t nLowStep,
321 const INT16* restrict pHighBand,
size_t nHighStep,
322 INT16* restrict pDstBand,
size_t nDstStep,
323 size_t nLowCount,
size_t nHighCount,
326 WINPR_ASSERT(pLowBand);
327 WINPR_ASSERT(pHighBand);
328 WINPR_ASSERT(pDstBand);
330 const INT16* l_ptr = pLowBand;
331 const INT16* h_ptr = pHighBand;
332 INT16* dst_ptr = pDstBand;
/* nDstCount rounded down to a multiple of 8: the part handled by vectors. */
333 size_t batchSize = (nDstCount >> 3) << 3;
/* Number of rows visited by each sweep. */
334 size_t forceBandSize = (nLowCount + nHighCount) >> 1;
/* Sweep 1 */
337 for (
size_t n = 0; n < forceBandSize; n++)
339 for (
size_t x = 0; x < batchSize; x += 8)
342 int16x8_t l_n = vld1q_s16(l_ptr);
/* For n == 31 the previous high row is read instead of the current one. */
343 int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
/* tmp = h + 1 */
344 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
/* Second addend, variant A: h itself.
 * NOTE(review): presumably for the first row only - confirm. */
347 tmp_n = vaddq_s16(tmp_n, h_n);
/* Second addend, variant B: the high row nHighStep earlier. */
350 int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
351 tmp_n = vaddq_s16(tmp_n, h_n_m);
/* dst = l - (tmp >> 1) */
354 tmp_n = vshrq_n_s16(tmp_n, 1);
355 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
356 vst1q_s16(dst_ptr, dst_n);
/* Scalar remainder when nDstCount is not a multiple of 8. */
362 if (nDstCount > batchSize)
364 int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
365 int16_t tmp_n = h_n + 1;
369 tmp_n += *(h_ptr - nHighStep);
371 *dst_ptr = *l_ptr - tmp_n;
/* Additional row when fewer than 32 rows were processed: dst = l - h of the
 * previous high row, vector part followed by the scalar remainder. */
380 if (forceBandSize < 32)
382 for (
size_t x = 0; x < batchSize; x += 8)
384 int16x8_t l_n = vld1q_s16(l_ptr);
385 int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
386 int16x8_t tmp_n = vsubq_s16(l_n, h_n);
387 vst1q_s16(dst_ptr, tmp_n);
393 if (nDstCount > batchSize)
395 *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
/* Sweep 2 starts one output row into the destination. */
403 dst_ptr = pDstBand + nDstStep;
406 for (
size_t n = 0; n < forceBandSize; n++)
408 for (
size_t x = 0; x < batchSize; x += 8)
/* Output row one nDstStep before the current position. */
411 int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
/* Variant A: average with the row at l_ptr.
 * NOTE(review): which rows take variant A versus B depends on a condition
 * not documented here - confirm. */
414 int16x8_t dst_n_p = vld1q_s16(l_ptr);
416 tmp_n = vaddq_s16(tmp_n, dst_n_p);
417 tmp_n = vshrq_n_s16(tmp_n, 1);
/* Variant B: average with the output row one nDstStep later, then add the
 * doubled high value. */
421 int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
422 tmp_n = vaddq_s16(tmp_n, dst_n_p);
423 tmp_n = vshrq_n_s16(tmp_n, 1);
424 int16x8_t h_n = vld1q_s16(h_ptr);
425 h_n = vshlq_n_s16(h_n, 1);
426 tmp_n = vaddq_s16(tmp_n, h_n);
428 vst1q_s16(dst_ptr, tmp_n);
/* Scalar remainder of sweep 2, reading the same operands as the vector
 * path one element at a time. */
433 if (nDstCount > batchSize)
435 int16_t tmp_n = *(dst_ptr - nDstStep);
438 int16_t dst_n_p = *l_ptr;
445 int16_t dst_n_p = *(dst_ptr + nDstStep);
448 int16_t h_n = *h_ptr;
/**
 * Low band element count for a decomposition level.
 *
 * @param level decomposition level, used as a right-shift amount
 * @return (64 >> level) + 1, e.g. 33, 17 and 9 for levels 1, 2 and 3
 */
461 static INLINE
size_t prfx_get_band_l_count(
size_t level)
463 return (64 >> level) + 1;
/**
 * High band element count for a decomposition level.
 *
 * Two results are possible: the constant (64 >> 1) - 1 = 31, or
 * (64 + (1 << (level - 1))) >> level, which yields 16 for level 2 and 8 for
 * level 3.
 * NOTE(review): presumably the constant is returned for level 1 - confirm
 * the selecting condition. level must be >= 1 for the shift by (level - 1)
 * to be meaningful, since level is unsigned.
 *
 * @param level decomposition level
 * @return number of elements in the high band
 */
466 static INLINE
size_t prfx_get_band_h_count(
size_t level)
469 return (64 >> 1) - 1;
471 return (64 + (1 << (level - 1))) >> level;
/**
 * Decode one level of the "extrapolate" layout: locate the four bands
 * inside buffer, run two horizontal passes and then one vertical pass.
 *
 * Band order inside buffer, as established by the offset arithmetic below:
 * HL (nBandH * nBandL), LH (nBandL * nBandH), HH (nBandH * nBandH), then LL.
 *
 * @param buffer coefficient buffer holding the bands of this level
 * @param temp   scratch buffer
 */
474 static INLINE
void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
/* Band sizes for this level come from the two count helpers. */
483 const size_t nBandL = prfx_get_band_l_count(level);
484 const size_t nBandH = prfx_get_band_h_count(level);
487 WINPR_ASSERT(buffer);
/* Walk through buffer, taking the start address of each band in turn. */
490 HL = &buffer[offset];
491 offset += (nBandH * nBandL);
492 LH = &buffer[offset];
493 offset += (nBandL * nBandH);
494 HH = &buffer[offset];
495 offset += (nBandH * nBandH);
496 LL = &buffer[offset];
/* Both strides equal the combined band length. */
497 nDstStepX = (nBandL + nBandH);
498 nDstStepY = (nBandL + nBandH);
/* Advance offset by nBandL rows of nDstStepX elements. */
501 offset += (nBandL * nDstStepX);
/* Horizontal pass on (LL, HL) for nBandL rows, result in L. */
506 rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
/* Horizontal pass on (LH, HH) for nBandH rows, result in H. */
509 rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
/* Vertical pass over L and H, result in LLx. */
512 rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
/**
 * Decode all three levels of the "extrapolate" layout, level 3 first and
 * level 1 last, each starting at a fixed element offset within buffer.
 *
 * @param buffer coefficient buffer; asserted non-NULL
 * @param temp   scratch buffer forwarded to every level
 */
516 static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
518 WINPR_ASSERT(buffer);
/* NOTE(review): temp is forwarded without a check of its own - confirm
 * callers never pass NULL. */
/* Level 3 at offset 3807, level 2 at offset 3007, level 1 at offset 0. */
520 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
521 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
522 rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
/**
 * Install the NEON implementations into an RFX context.
 *
 * When NEON_INTRINSICS_ENABLED is defined and the processor reports
 * PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, the quantization, DWT and
 * extrapolating DWT decode callbacks of the context are replaced and the
 * matching profilers are renamed.
 *
 * @param context RFX context whose decode callbacks are updated
 */
526 void rfx_init_neon(RFX_CONTEXT* context)
528 #if defined(NEON_INTRINSICS_ENABLED)
/* Runtime check: only switch over if the CPU actually has NEON. */
529 if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
531 DEBUG_RFX(
"Using NEON optimizations");
/* The YCbCr-to-RGB profiler is renamed here although no corresponding
 * callback is assigned in this function. */
532 PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb,
"rfx_decode_YCbCr_to_RGB_NEON");
533 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
534 "rfx_quantization_decode_NEON");
535 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode,
"rfx_dwt_2d_decode_NEON");
/* Route decoding through the NEON routines of this file. */
536 context->quantization_decode = rfx_quantization_decode_NEON;
537 context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
538 context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
/* NOTE(review): presumably the branch taken when NEON support is compiled
 * out; it only marks the parameter as used - confirm. */
541 WINPR_UNUSED(context);