23 #include <freerdp/config.h>
25 #include <winpr/sysinfo.h>
26 #include <winpr/crt.h>
27 #include <freerdp/types.h>
28 #include <freerdp/primitives.h>
30 #include "prim_internal.h"
33 #if defined(NEON_ENABLED)
38 static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
39 int16x4_t Eh, int16x4_t El)
42 const int16x4_t c403 = vdup_n_s16(403);
43 const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
44 const int32x4_t CEl = vmlal_s16(Cl, El, c403);
45 const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
46 const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
47 const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
48 return vqmovun_s16(R);
51 static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
52 int16x4_t Eh, int16x4_t El)
55 const int16x4_t c48 = vdup_n_s16(48);
56 const int16x4_t c120 = vdup_n_s16(120);
57 const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
58 const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
59 const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
60 const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
61 const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
62 const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
63 const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
64 return vqmovun_s16(G);
67 static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
68 int16x4_t Eh, int16x4_t El)
71 const int16x4_t c475 = vdup_n_s16(475);
72 const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
73 const int32x4_t CDl = vmlal_s16(Ch, Dl, c475);
74 const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
75 const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
76 const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
77 return vqmovun_s16(B);
/* Converts eight pixels given as 16-bit Y, D and E vectors into 8-bit channel vectors
 * and places them in the bgrx.val[] slots selected by rPos, gPos, bPos and aPos.
 * NOTE(review): the aPos parameter line, the declaration of bgrx, the store to pRGB and
 * the return statement are not visible in this excerpt - confirm against the full file.
 * D and E are presumably U - 128 and V - 128 (that is what the visible callers pass). */
80 static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E,
81 const uint8_t rPos,
const uint8_t gPos,
const uint8_t bPos,
/* Widen Y to 32 bit and scale by 256 so the chroma terms can be accumulated before
 * the final shift right by 8. */
85 const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256);
86 const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256);
/* Split the chroma differences into upper and lower halves of four lanes each. */
87 const int16x4_t Dh = vget_high_s16(D);
88 const int16x4_t Dl = vget_low_s16(D);
89 const int16x4_t Eh = vget_high_s16(E);
90 const int16x4_t El = vget_low_s16(E);
/* Blue: B = (256 * Y + 475 * D) >> 8 with rounding, saturated to 0..255. */
93 const int16x4_t c475 = vdup_n_s16(475);
94 const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
95 const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
96 const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
97 const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
98 const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
99 bgrx.val[bPos] = vqmovun_s16(B);
/* Green: G = (256 * Y - 48 * D - 120 * E) >> 8 with rounding, saturated to 0..255. */
103 const int16x4_t c48 = vdup_n_s16(48);
104 const int16x4_t c120 = vdup_n_s16(120);
105 const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
106 const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
107 const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
108 const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
109 const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
110 const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
111 const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
112 bgrx.val[gPos] = vqmovun_s16(G);
/* Red: R = (256 * Y + 403 * E) >> 8 with rounding, saturated to 0..255. */
116 const int16x4_t c403 = vdup_n_s16(403);
117 const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
118 const int32x4_t CEl = vmlal_s16(Cl, El, c403);
119 const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
120 const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
121 const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
122 bgrx.val[rPos] = vqmovun_s16(R);
/* The remaining channel slot is filled with 0xFF in every lane. */
126 bgrx.val[aPos] = vdup_n_u8(0xFF);
/* Converts planar YUV input with half-resolution chroma (one U/V row per two luma rows,
 * each chroma sample shared by two horizontally adjacent pixels) to 4-byte pixels.
 * rPos, gPos, bPos and aPos are forwarded to neon_YuvToRgbPixel and select the byte
 * position of each channel inside a destination pixel.
 * NOTE(review): several statements of the loop bodies (pointer advances, the handling of
 * lastY, the scalar stores) are not visible in this excerpt - confirm against the full file. */
133 static INLINE pstatus_t neon_YUV420ToX(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
134 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
135 const prim_size_t* WINPR_RESTRICT roi,
const uint8_t rPos,
136 const uint8_t gPos,
const uint8_t bPos,
const uint8_t aPos)
138 const UINT32 nWidth = roi->width;
139 const UINT32 nHeight = roi->height;
/* Number of trailing pixels per row that do not fill a 16-pixel vector iteration. */
140 const DWORD pad = nWidth % 16;
/* Differences between the line strides and the bytes covered by one row; their use is
 * not visible in this excerpt. */
141 const UINT32 yPad = srcStep[0] - roi->width;
142 const UINT32 uPad = srcStep[1] - roi->width / 2;
143 const UINT32 vPad = srcStep[2] - roi->width / 2;
144 const UINT32 dPad = dstStep - roi->width * 4;
145 const int16x8_t c128 = vdupq_n_s16(128);
/* Two luma rows are processed per iteration; both share chroma row y / 2. */
147 for (UINT32 y = 0; y < nHeight; y += 2)
149 const uint8_t* pY1 = pSrc[0] + y * srcStep[0];
150 const uint8_t* pY2 = pY1 + srcStep[0];
151 const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
152 const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
153 uint8_t* pRGB1 = pDst + y * dstStep;
154 uint8_t* pRGB2 = pRGB1 + dstStep;
/* True when y is the final row, i.e. there is no second row in this pair. */
155 const BOOL lastY = y >= nHeight - 1;
/* Vector part: the full 16-pixel groups of the row. */
158 for (; x < nWidth - pad;)
/* Load eight chroma samples and zip each vector with itself so every sample appears
 * twice in a row, then widen to 16 bit. */
160 const uint8x8_t Uraw = vld1_u8(pU);
161 const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);
162 const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));
163 const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));
164 const uint8x8_t Vraw = vld1_u8(pV);
165 const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);
166 const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));
167 const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));
/* Remove the 128 chroma offset. */
168 const int16x8_t D1 = vsubq_s16(U1, c128);
169 const int16x8_t E1 = vsubq_s16(V1, c128);
170 const int16x8_t D2 = vsubq_s16(U2, c128);
171 const int16x8_t E2 = vsubq_s16(V2, c128);
/* First row, first eight pixels (chroma D1/E1). */
173 const uint8x8_t Y1u = vld1_u8(pY1);
174 const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
175 pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos);
/* First row, next eight pixels (chroma D2/E2). */
180 const uint8x8_t Y1u = vld1_u8(pY1);
181 const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
182 pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos);
/* Second row, first eight pixels. */
190 const uint8x8_t Y2u = vld1_u8(pY2);
191 const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
192 pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos);
/* Second row, next eight pixels. */
196 const uint8x8_t Y2u = vld1_u8(pY2);
197 const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
198 pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos);
/* Scalar tail for the remaining pixels of the row, using the YUV2R/YUV2G/YUV2B helpers
 * that are defined outside this excerpt. */
207 for (; x < nWidth; x++)
212 const BYTE Y = *pY1++;
213 const BYTE r = YUV2R(Y, U, V);
214 const BYTE g = YUV2G(Y, U, V);
215 const BYTE b = YUV2B(Y, U, V);
225 const BYTE Y = *pY2++;
226 const BYTE r = YUV2R(Y, U, V);
227 const BYTE g = YUV2G(Y, U, V);
228 const BYTE b = YUV2B(Y, U, V);
251 return PRIMITIVES_SUCCESS;
/* Selects the channel byte positions for the requested destination pixel format and
 * forwards to neon_YUV420ToX; the final statement hands the call to the generic
 * implementation.
 * NOTE(review): the last parameter, the switch header and the default label are not
 * visible in this excerpt - presumably the generic call is the default case for formats
 * not listed below; confirm against the full file. */
254 static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(
const BYTE* WINPR_RESTRICT pSrc[3],
255 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
256 UINT32 dstStep, UINT32 DstFormat,
/* Arguments after roi are rPos, gPos, bPos, aPos. */
261 case PIXEL_FORMAT_BGRA32:
262 case PIXEL_FORMAT_BGRX32:
263 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
265 case PIXEL_FORMAT_RGBA32:
266 case PIXEL_FORMAT_RGBX32:
267 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
269 case PIXEL_FORMAT_ARGB32:
270 case PIXEL_FORMAT_XRGB32:
271 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
273 case PIXEL_FORMAT_ABGR32:
274 case PIXEL_FORMAT_XBGR32:
275 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Fallback to the generic (non-NEON) primitive. */
278 return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/* Converts planar YUV input with full-resolution chroma to 4-byte pixels, eight pixels
 * per vector iteration plus a scalar tail. rPos, gPos, bPos and aPos are forwarded to
 * neon_YuvToRgbPixel and select the byte position of each channel.
 * NOTE(review): the pointer advances inside the vector loop, the scalar store and the
 * end-of-row padding skips are not visible in this excerpt - confirm against the full file. */
282 static INLINE pstatus_t neon_YUV444ToX(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
283 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
284 const prim_size_t* WINPR_RESTRICT roi,
const uint8_t rPos,
285 const uint8_t gPos,
const uint8_t bPos,
const uint8_t aPos)
287 const UINT32 nWidth = roi->width;
288 const UINT32 nHeight = roi->height;
/* Differences between the line strides and the bytes covered by one row; their use is
 * not visible in this excerpt (presumably added to the pointers after each row). */
289 const UINT32 yPad = srcStep[0] - roi->width;
290 const UINT32 uPad = srcStep[1] - roi->width;
291 const UINT32 vPad = srcStep[2] - roi->width;
292 const UINT32 dPad = dstStep - roi->width * 4;
293 const uint8_t* pY = pSrc[0];
294 const uint8_t* pU = pSrc[1];
295 const uint8_t* pV = pSrc[2];
296 uint8_t* pRGB = pDst;
297 const int16x8_t c128 = vdupq_n_s16(128);
/* Number of trailing pixels per row that do not fill an 8-pixel vector iteration. */
298 const DWORD pad = nWidth % 8;
300 for (UINT32 y = 0; y < nHeight; y++)
/* Vector part: eight pixels per iteration. */
302 for (UINT32 x = 0; x < nWidth - pad; x += 8)
/* Load eight samples of each plane and widen them to 16 bit. */
304 const uint8x8_t Yu = vld1_u8(pY);
305 const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu));
306 const uint8x8_t Uu = vld1_u8(pU);
307 const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu));
308 const uint8x8_t Vu = vld1_u8(pV);
309 const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu));
/* Remove the 128 chroma offset before the colour conversion. */
312 const int16x8_t D = vsubq_s16(U, c128);
313 const int16x8_t E = vsubq_s16(V, c128);
314 pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos);
/* Scalar tail for the remaining pad pixels, using the YUV2R/YUV2G/YUV2B helpers that
 * are defined outside this excerpt. */
320 for (UINT32 x = 0; x < pad; x++)
322 const BYTE Y = *pY++;
323 const BYTE U = *pU++;
324 const BYTE V = *pV++;
325 const BYTE r = YUV2R(Y, U, V);
326 const BYTE g = YUV2G(Y, U, V);
327 const BYTE b = YUV2B(Y, U, V);
341 return PRIMITIVES_SUCCESS;
/* Selects the channel byte positions for the requested destination pixel format and
 * forwards to neon_YUV444ToX; the final statement hands the call to the generic
 * implementation.
 * NOTE(review): the last parameter, the switch header and the default label are not
 * visible in this excerpt - presumably the generic call is the default case for formats
 * not listed below; confirm against the full file. */
344 static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(
const BYTE* WINPR_RESTRICT pSrc[3],
345 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
346 UINT32 dstStep, UINT32 DstFormat,
/* Arguments after roi are rPos, gPos, bPos, aPos. */
351 case PIXEL_FORMAT_BGRA32:
352 case PIXEL_FORMAT_BGRX32:
353 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
355 case PIXEL_FORMAT_RGBA32:
356 case PIXEL_FORMAT_RGBX32:
357 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
359 case PIXEL_FORMAT_ARGB32:
360 case PIXEL_FORMAT_XRGB32:
361 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
363 case PIXEL_FORMAT_ABGR32:
364 case PIXEL_FORMAT_XBGR32:
365 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Fallback to the generic (non-NEON) primitive. */
368 return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/* Copies the luma plane of a half-resolution-chroma source frame into the destination
 * and maps each source chroma row onto two destination rows.
 * NOTE(review): the roi parameter line, the declaration of x and the vector/scalar
 * stores are not visible in this excerpt; presumably each chroma sample is written to a
 * 2x2 block of the destination - confirm against the full file. */
372 static pstatus_t neon_LumaToYUV444(
const BYTE* WINPR_RESTRICT pSrcRaw[3],
const UINT32 srcStep[3],
373 BYTE* WINPR_RESTRICT pDstRaw[3],
const UINT32 dstStep[3],
376 const UINT32 nWidth = roi->right - roi->left;
377 const UINT32 nHeight = roi->bottom - roi->top;
/* Chroma dimensions, rounded up for odd widths/heights. */
378 const UINT32 halfWidth = (nWidth + 1) / 2;
379 const UINT32 halfHeight = (nHeight + 1) / 2;
380 const UINT32 evenY = 0;
/* Source planes offset to the region: luma at full coordinates, chroma at halved ones. */
381 const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
382 pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
383 pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
/* Destination planes offset to the region; all three use full coordinates. */
384 BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
385 pDstRaw[1] + roi->top * dstStep[1] + roi->left,
386 pDstRaw[2] + roi->top * dstStep[2] + roi->left };
/* Luma is copied row by row without modification. */
390 for (UINT32 y = 0; y < nHeight; y++)
392 const BYTE* Ym = pSrc[0] + srcStep[0] * y;
393 BYTE* pY = pDst[0] + dstStep[0] * y;
394 memcpy(pY, Ym, nWidth);
/* Chroma: source row y targets destination rows 2 * y (pU/pV) and 2 * y + 1 (pU1/pV1). */
399 for (UINT32 y = 0; y < halfHeight; y++)
401 const UINT32 val2y = (2 * y + evenY);
402 const BYTE* Um = pSrc[1] + srcStep[1] * y;
403 const BYTE* Vm = pSrc[2] + srcStep[2] * y;
404 BYTE* pU = pDst[1] + dstStep[1] * val2y;
405 BYTE* pV = pDst[2] + dstStep[2] * val2y;
406 BYTE* pU1 = pU + dstStep[1];
407 BYTE* pV1 = pV + dstStep[2];
/* Vector part: 16 chroma samples per iteration. The strict '<' means a final group of
 * exactly 16 samples is left to the scalar loop below. */
410 for (; x + 16 < halfWidth; x += 16)
413 const uint8x16_t u = vld1q_u8(Um);
424 const uint8x16_t v = vld1q_u8(Vm);
/* Scalar tail for the remaining chroma samples. */
436 for (; x < halfWidth; x++)
438 const BYTE u = *Um++;
439 const BYTE v = *Vm++;
451 return PRIMITIVES_SUCCESS;
/* In-place filter over the destination U and V planes: for every 2x2 block the top-left
 * sample is recomputed as 4 * itself minus the other three samples of the block.
 * NOTE(review): the roi parameter line, the bodies of the two range checks and the
 * assignments of the filtered vectors back into u/v are not visible in this excerpt -
 * confirm against the full file. */
454 static pstatus_t neon_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
457 const UINT32 oddY = 1;
458 const UINT32 evenY = 0;
459 const UINT32 nWidth = roi->right - roi->left;
460 const UINT32 nHeight = roi->bottom - roi->top;
461 const UINT32 halfHeight = (nHeight + 1) / 2;
462 const UINT32 halfWidth = (nWidth + 1) / 2;
/* Number of 2x2 blocks per row left to the scalar loop. */
463 const UINT32 halfPad = halfWidth % 16;
/* Iterate over row pairs, in units of half rows starting at roi->top / 2. */
466 for (UINT32 y = roi->top / 2; y < halfHeight + roi->top / 2; y++)
468 const UINT32 val2y = (y * 2 + evenY);
469 const UINT32 val2y1 = val2y + oddY;
/* pU/pV address the even row of the pair, pU1/pV1 the odd row. */
470 BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
471 BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
472 BYTE* pU = pDst[1] + dstStep[1] * val2y;
473 BYTE* pV = pDst[2] + dstStep[2] * val2y;
/* Range check on the odd row; its body is not visible here (presumably skips the pair). */
475 if (val2y1 > nHeight + roi->top)
478 UINT32 x = roi->left / 2;
/* Vector part: eight 2x2 blocks per iteration. */
479 for (; x < halfWidth + roi->left / 2 - halfPad; x += 8)
/* vld2 de-interleaves the even row: val[0] holds even columns, val[1] odd columns. */
483 uint8x8x2_t u = vld2_u8(&pU[2 * x]);
/* 4 * top-left sample, widened to 16 bit. */
485 vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2));
486 const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]);
/* Sum of the two odd-row samples, then add the top-right sample of the even row. */
487 const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]);
488 const int16x8_t us = vreinterpretq_s16_u16(
489 vaddw_u8(usub, u.val[1]));
/* Filtered value, saturated to 0..255. */
490 const int16x8_t un = vsubq_s16(up, us);
491 const uint8x8_t u8 = vqmovun_s16(un);
/* Store the even row back interleaved. */
493 vst2_u8(&pU[2 * x], u);
/* Same computation for the V plane. */
497 uint8x8x2_t v = vld2_u8(&pV[2 * x]);
499 vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2));
500 const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]);
501 const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]);
502 const int16x8_t vs = vreinterpretq_s16_u16(
503 vaddw_u8(vsub, v.val[1]));
504 const int16x8_t vn = vsubq_s16(vp, vs);
505 const uint8x8_t v8 = vqmovun_s16(vn);
507 vst2_u8(&pV[2 * x], v);
/* Scalar tail for the remaining 2x2 blocks of the row pair. */
511 for (; x < halfWidth + roi->left / 2; x++)
513 const UINT32 val2x = (x * 2);
514 const UINT32 val2x1 = val2x + 1;
515 const BYTE inU = pU[val2x];
516 const BYTE inV = pV[val2x];
517 const INT32 up = inU * 4;
518 const INT32 vp = inV * 4;
/* Range check on the odd column; its body is not visible here. */
522 if (val2x1 > nWidth + roi->left)
/* 4 * top-left minus the other three samples of the block. */
525 u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
526 v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
/* CONDITIONAL_CLIP is defined outside this excerpt; it receives the filtered value and
 * the original sample. */
527 pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
528 pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
532 return PRIMITIVES_SUCCESS;
/* Merges an auxiliary frame into the destination U and V planes: rows of source plane 0
 * are copied into odd destination chroma rows, samples of source planes 1 and 2 replace
 * the odd columns of even destination rows, and the result is passed through
 * neon_ChromaFilter.
 * NOTE(review): the roi parameter line, the declarations of uY, vY, pX and x, any bounds
 * checks in the first loop and the scalar stores are not visible in this excerpt -
 * confirm against the full file. */
535 static pstatus_t neon_ChromaV1ToYUV444(
const BYTE* WINPR_RESTRICT pSrcRaw[3],
536 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
537 const UINT32 dstStep[3],
/* Source plane 0 is interpreted in groups of 16 rows. */
540 const UINT32 mod = 16;
543 const UINT32 nWidth = roi->right - roi->left;
544 const UINT32 nHeight = roi->bottom - roi->top;
/* Unlike the other functions in this file, the half sizes here round down. */
545 const UINT32 halfWidth = (nWidth) / 2;
546 const UINT32 halfHeight = (nHeight) / 2;
547 const UINT32 oddY = 1;
548 const UINT32 evenY = 0;
549 const UINT32 oddX = 1;
/* Next multiple of 16 strictly above nHeight (a full 16 is added when nHeight is
 * already a multiple of 16). */
552 const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
553 const UINT32 halfPad = halfWidth % 16;
/* Source planes offset to the region: plane 0 at full coordinates, planes 1/2 halved. */
554 const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
555 pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
556 pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
/* Destination planes offset to the region; all three use full coordinates. */
557 BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
558 pDstRaw[1] + roi->top * dstStep[1] + roi->left,
559 pDstRaw[2] + roi->top * dstStep[2] + roi->left };
/* First pass: copy rows of source plane 0 into odd rows of the destination chroma planes. */
563 for (UINT32 y = 0; y < padHeigth; y++)
565 const BYTE* Ya = pSrc[0] + srcStep[0] * y;
/* The first 8 rows of every 16-row group target the U plane, the rest the V plane. */
568 if ((y) % mod < (mod + 1) / 2)
570 const UINT32 pos = (2 * uY++ + oddY);
575 pX = pDst[1] + dstStep[1] * pos;
579 const UINT32 pos = (2 * vY++ + oddY);
584 pX = pDst[2] + dstStep[2] * pos;
587 memcpy(pX, Ya, nWidth);
/* Second pass: source planes 1 and 2 supply the odd columns of the even destination rows. */
591 for (UINT32 y = 0; y < halfHeight; y++)
593 const UINT32 val2y = (y * 2 + evenY);
594 const BYTE* Ua = pSrc[1] + srcStep[1] * y;
595 const BYTE* Va = pSrc[2] + srcStep[2] * y;
596 BYTE* pU = pDst[1] + dstStep[1] * val2y;
597 BYTE* pV = pDst[2] + dstStep[2] * val2y;
/* Vector part: 16 source samples (32 destination bytes) per iteration. */
600 for (; x < halfWidth - halfPad; x += 16)
/* De-interleave the destination row, replace the odd-column lane with the source
 * samples and store it back interleaved; even columns stay unchanged. */
603 uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
604 u.val[1] = vld1q_u8(&Ua[x]);
605 vst2q_u8(&pU[2 * x], u);
608 uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
609 v.val[1] = vld1q_u8(&Va[x]);
610 vst2q_u8(&pV[2 * x], v);
/* Scalar tail; val2x1 is the odd destination column for source sample x. */
614 for (; x < halfWidth; x++)
616 const UINT32 val2x1 = (x * 2 + oddX);
/* Finish by filtering the merged chroma planes. */
623 return neon_ChromaFilter(pDst, dstStep, roi);
/* Merges an auxiliary frame (version 2 layout) into the destination U and V planes and
 * passes the result through neon_ChromaFilter. Source plane 0 supplies odd columns of
 * every row; source planes 1 and 2 supply columns 4 * x and 4 * x + 2 of odd rows. In
 * each source plane the data for the V destination sits a fixed offset (derived from
 * nTotalWidth) to the right of the data for the U destination.
 * NOTE(review): the roi parameter line, the declarations of x and the scalar stores of
 * the first pass are not visible in this excerpt; nTotalHeight is not used in the visible
 * lines - confirm against the full file. */
626 static pstatus_t neon_ChromaV2ToYUV444(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
627 UINT32 nTotalWidth, UINT32 nTotalHeight,
628 BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
631 const UINT32 nWidth = roi->right - roi->left;
632 const UINT32 nHeight = roi->bottom - roi->top;
/* Half and quarter sizes are rounded up; the *Pad values are the sample counts left to
 * the scalar loops. */
633 const UINT32 halfWidth = (nWidth + 1) / 2;
634 const UINT32 halfPad = halfWidth % 16;
635 const UINT32 halfHeight = (nHeight + 1) / 2;
636 const UINT32 quaterWidth = (nWidth + 3) / 4;
637 const UINT32 quaterPad = quaterWidth % 16;
/* First pass: every row, source plane 0 fills the odd columns of U and V. */
640 for (UINT32 y = 0; y < nHeight; y++)
642 const UINT32 yTop = y + roi->top;
643 const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
/* Data for the V plane starts nTotalWidth / 2 bytes after the data for the U plane. */
644 const BYTE* pYaV = pYaU + nTotalWidth / 2;
645 BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
646 BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
/* Vector part: 16 source samples (32 destination bytes) per iteration. */
649 for (; x < halfWidth - halfPad; x += 16)
/* De-interleave, replace the odd-column lane, store back interleaved. */
652 uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
653 u.val[1] = vld1q_u8(&pYaU[x]);
654 vst2q_u8(&pU[2 * x], u);
657 uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
658 v.val[1] = vld1q_u8(&pYaV[x]);
659 vst2q_u8(&pV[2 * x], v);
/* Scalar tail; odd is the destination column for source sample x. */
663 for (; x < halfWidth; x++)
665 const UINT32 odd = 2 * x + 1;
/* Second pass: odd destination rows, source planes 1 and 2 fill columns 4x and 4x + 2. */
672 for (UINT32 y = 0; y < halfHeight; y++)
674 const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
/* Data for the V plane starts nTotalWidth / 4 bytes after the data for the U plane. */
675 const BYTE* pUaV = pUaU + nTotalWidth / 4;
676 const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
677 const BYTE* pVaV = pVaU + nTotalWidth / 4;
678 BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
679 BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
/* Vector part: 16 source samples (64 destination bytes) per iteration. */
682 for (; x < quaterWidth - quaterPad; x += 16)
/* De-interleave by four, replace lanes 0 and 2, store back; lanes 1 and 3 stay unchanged. */
685 uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
686 u.val[0] = vld1q_u8(&pUaU[x]);
687 u.val[2] = vld1q_u8(&pVaU[x]);
688 vst4q_u8(&pU[4 * x], u);
691 uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
692 v.val[0] = vld1q_u8(&pUaV[x]);
693 v.val[2] = vld1q_u8(&pVaV[x]);
694 vst4q_u8(&pV[4 * x], v);
/* Scalar tail performing the same column replacement one sample at a time. */
698 for (; x < quaterWidth; x++)
700 pU[4 * x + 0] = pUaU[x];
701 pV[4 * x + 0] = pUaV[x];
702 pU[4 * x + 2] = pVaU[x];
703 pV[4 * x + 2] = pVaV[x];
/* Finish by filtering the merged chroma planes. */
707 return neon_ChromaFilter(pDst, dstStep, roi);
/* Entry point for combining a half-resolution-chroma frame into the full-resolution
 * destination: checks the plane pointers and dispatches on the frame type to the luma,
 * chroma v1 or chroma v2 handler.
 * NOTE(review): the roi parameter line, the bodies of the pointer checks, the switch
 * header, the label in front of the luma call and any default branch are not visible in
 * this excerpt - confirm against the full file. */
710 static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
711 const BYTE* WINPR_RESTRICT pSrc[3],
712 const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
713 BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
/* Reject a missing source plane array or any missing source plane. */
716 if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
/* Reject a missing destination plane array or any missing destination plane. */
719 if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
728 return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
730 case AVC444_CHROMAv1:
731 return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
/* Only the v2 handler receives the total frame dimensions. */
733 case AVC444_CHROMAv2:
734 return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
/* Installs the NEON YUV primitives into prims. When NEON_ENABLED is defined, the generic
 * primitives are fetched for use as fallback, the table is initialised via
 * primitives_init_YUV and, if the processor reports NEON support, the three YUV entries
 * are replaced by the NEON implementations.
 * NOTE(review): the braces and the #else / #endif lines are not visible in this excerpt;
 * the second log call presumably belongs to the branch taken when NEON_ENABLED is not
 * defined - confirm against the full file. */
742 void primitives_init_YUV_neon(
primitives_t* WINPR_RESTRICT prims)
744 #if defined(NEON_ENABLED)
745 generic = primitives_get_generic();
746 primitives_init_YUV(prims);
/* Runtime check: only override the entries when the processor reports NEON support. */
748 if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
750 WLog_VRB(PRIM_TAG,
"NEON optimizations");
751 prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
752 prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
753 prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
756 WLog_VRB(PRIM_TAG,
"undefined WITH_NEON");