23 #include <freerdp/config.h>
25 #include <winpr/sysinfo.h>
26 #include <winpr/crt.h>
27 #include <freerdp/types.h>
28 #include <freerdp/primitives.h>
30 #include "prim_internal.h"
33 #if defined(NEON_INTRINSICS_ENABLED)
38 static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
39 int16x4_t Eh, int16x4_t El)
42 const int16x4_t c403 = vdup_n_s16(403);
43 const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
44 const int32x4_t CEl = vmlal_s16(Cl, El, c403);
45 const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
46 const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
47 const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
48 return vqmovun_s16(R);
51 static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
52 int16x4_t Eh, int16x4_t El)
55 const int16x4_t c48 = vdup_n_s16(48);
56 const int16x4_t c120 = vdup_n_s16(120);
57 const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
58 const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
59 const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
60 const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
61 const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
62 const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
63 const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
64 return vqmovun_s16(G);
67 static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
68 int16x4_t Eh, int16x4_t El)
71 const int16x4_t c475 = vdup_n_s16(475);
72 const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
73 const int32x4_t CDl = vmlal_s16(Ch, Dl, c475);
74 const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
75 const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
76 const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
77 return vqmovun_s16(B);
/* Convert eight pixels to 4-byte-per-pixel output.
 *
 * Y holds eight widened luma samples, D and E the two chroma differences
 * (the callers in this file pass U - 128 and V - 128). The eight destination
 * pixels at pRGB are loaded de-interleaved into four byte lanes, and the
 * computed blue/green/red vectors replace the lanes selected by bPos/gPos/rPos.
 *
 * NOTE(review): the end of the parameter list, the store back to pRGB and the
 * return statement are not visible in this excerpt.
 */
80 static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E,
81 const uint8_t rPos,
const uint8_t gPos,
const uint8_t bPos,
/* C = 256 * Y, widened to 32 bit and split into upper (h) and lower (l) four lanes. */
84 const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256);
85 const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256);
/* Chroma differences split the same way. */
86 const int16x4_t Dh = vget_high_s16(D);
87 const int16x4_t Dl = vget_low_s16(D);
88 const int16x4_t Eh = vget_high_s16(E);
89 const int16x4_t El = vget_low_s16(E);
/* Load the existing 8 x 4 bytes de-interleaved, so lanes that are not overwritten keep their value. */
90 uint8x8x4_t bgrx = vld4_u8(pRGB);
/* Blue: (C + 475 * D), rounding shift right by 8, saturating narrow to unsigned 8 bit. */
93 const int16x4_t c475 = vdup_n_s16(475);
94 const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
95 const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
96 const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
97 const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
98 const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
99 bgrx.val[bPos] = vqmovun_s16(B);
/* Green: (C - 48 * D - 120 * E), rounding shift right by 8, saturating narrow to unsigned 8 bit. */
103 const int16x4_t c48 = vdup_n_s16(48);
104 const int16x4_t c120 = vdup_n_s16(120);
105 const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
106 const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
107 const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
108 const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
109 const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
110 const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
111 const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
112 bgrx.val[gPos] = vqmovun_s16(G);
/* Red: (C + 403 * E), rounding shift right by 8, saturating narrow to unsigned 8 bit. */
116 const int16x4_t c403 = vdup_n_s16(403);
117 const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
118 const int32x4_t CEl = vmlal_s16(Cl, El, c403);
119 const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
120 const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
121 const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
122 bgrx.val[rPos] = vqmovun_s16(R);
/* Convert a planar YUV 4:2:0 image to 4-byte-per-pixel output.
 *
 * pSrc[0..2] / srcStep[0..2] are the Y, U and V planes and their row strides,
 * pDst / dstStep the destination buffer and its row stride, roi the image
 * size. rPos/gPos/bPos/aPos are the byte positions of the channels inside one
 * destination pixel and are forwarded to neon_YuvToRgbPixel.
 *
 * Rows are handled in pairs because two luma rows share one chroma row
 * (chroma is addressed with y / 2). Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): several statements (declaration of x, pointer advances, the
 * scalar tail stores, row padding) are not visible in this excerpt.
 */
129 static INLINE pstatus_t neon_YUV420ToX(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
130 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
131 const prim_size_t* WINPR_RESTRICT roi,
const uint8_t rPos,
132 const uint8_t gPos,
const uint8_t bPos,
const uint8_t aPos)
134 const UINT32 nWidth = roi->width;
135 const UINT32 nHeight = roi->height;
/* pad: pixels per row left over after the 16 pixel wide vector loop. */
136 const DWORD pad = nWidth % 16;
/* Bytes between the end of the payload of a row and the start of the next row, per plane. */
137 const UINT32 yPad = srcStep[0] - roi->width;
138 const UINT32 uPad = srcStep[1] - roi->width / 2;
139 const UINT32 vPad = srcStep[2] - roi->width / 2;
140 const UINT32 dPad = dstStep - roi->width * 4;
141 const int16x8_t c128 = vdupq_n_s16(128);
143 for (UINT32 y = 0; y < nHeight; y += 2)
/* Row pointers for the luma row pair, the shared chroma row and the two output rows. */
145 const uint8_t* pY1 = pSrc[0] + y * srcStep[0];
146 const uint8_t* pY2 = pY1 + srcStep[0];
147 const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
148 const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
149 uint8_t* pRGB1 = pDst + y * dstStep;
150 uint8_t* pRGB2 = pRGB1 + dstStep;
/* True when row y is the last row, i.e. row y + 1 lies outside the image
 * (NOTE(review): presumably used to skip the second row; the use is not visible here). */
151 const BOOL lastY = y >= nHeight - 1;
/* Vector loop over the part of the row that is a multiple of 16 pixels. */
154 for (; x < nWidth - pad;)
/* Load 8 U and 8 V samples and zip each with itself, so every chroma sample
 * is duplicated for two horizontally adjacent pixels: *1 covers the first
 * eight pixels, *2 the following eight. */
156 const uint8x8_t Uraw = vld1_u8(pU);
157 const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);
158 const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));
159 const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));
160 const uint8x8_t Vraw = vld1_u8(pV);
161 const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);
162 const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));
163 const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));
/* Centre the chroma samples around zero. */
164 const int16x8_t D1 = vsubq_s16(U1, c128);
165 const int16x8_t E1 = vsubq_s16(V1, c128);
166 const int16x8_t D2 = vsubq_s16(U2, c128);
167 const int16x8_t E2 = vsubq_s16(V2, c128);
/* First row, converted with the first chroma half (D1/E1). */
169 const uint8x8_t Y1u = vld1_u8(pY1);
170 const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
171 pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos);
/* First row, converted with the second chroma half (D2/E2); the advance of
 * pY1 between the two loads is not visible in this excerpt. */
176 const uint8x8_t Y1u = vld1_u8(pY1);
177 const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
178 pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos);
/* Second row, first chroma half. */
186 const uint8x8_t Y2u = vld1_u8(pY2);
187 const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
188 pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos);
/* Second row, second chroma half. */
192 const uint8x8_t Y2u = vld1_u8(pY2);
193 const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
194 pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos);
/* Scalar tail for the remaining pixels of the row, using the YUV2R/G/B helpers. */
203 for (; x < nWidth; x++)
/* First row pixel. */
208 const BYTE Y = *pY1++;
209 const BYTE r = YUV2R(Y, U, V);
210 const BYTE g = YUV2G(Y, U, V);
211 const BYTE b = YUV2B(Y, U, V);
/* Second row pixel, same U/V. */
220 const BYTE Y = *pY2++;
221 const BYTE r = YUV2R(Y, U, V);
222 const BYTE g = YUV2G(Y, U, V);
223 const BYTE b = YUV2B(Y, U, V);
245 return PRIMITIVES_SUCCESS;
/* YUV 4:2:0 to 32 bit RGB entry point.
 *
 * Selects the channel byte positions (R, G, B, A in that argument order) for
 * the requested DstFormat and calls neon_YUV420ToX. The final return forwards
 * the call unchanged to the generic implementation.
 *
 * NOTE(review): the end of the parameter list, the switch header and the label
 * guarding the generic fallback are not visible in this excerpt; the fallback
 * is presumably the default case.
 */
248 static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(
const BYTE* WINPR_RESTRICT pSrc[3],
249 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
250 UINT32 dstStep, UINT32 DstFormat,
/* B at byte 0, G at 1, R at 2, A/X at 3. */
255 case PIXEL_FORMAT_BGRA32:
256 case PIXEL_FORMAT_BGRX32:
257 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
/* R at byte 0, G at 1, B at 2, A/X at 3. */
259 case PIXEL_FORMAT_RGBA32:
260 case PIXEL_FORMAT_RGBX32:
261 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
/* A/X at byte 0, R at 1, G at 2, B at 3. */
263 case PIXEL_FORMAT_ARGB32:
264 case PIXEL_FORMAT_XRGB32:
265 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
/* A/X at byte 0, B at 1, G at 2, R at 3. */
267 case PIXEL_FORMAT_ABGR32:
268 case PIXEL_FORMAT_XBGR32:
269 return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Any other format: hand over to the generic (non NEON) primitive. */
272 return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/* Convert a planar YUV 4:4:4 image to 4-byte-per-pixel output.
 *
 * pSrc[0..2] / srcStep[0..2] are the Y, U and V planes (all at full
 * resolution) and their row strides, pDst / dstStep the destination buffer
 * and its row stride, roi the image size. rPos/gPos/bPos/aPos are the byte
 * positions of the channels inside one destination pixel.
 *
 * Each row is processed eight pixels at a time with NEON, the remainder with
 * the scalar YUV2R/G/B helpers. Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): pointer advances, the scalar stores and the use of the *Pad
 * values are not visible in this excerpt.
 */
276 static INLINE pstatus_t neon_YUV444ToX(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
277 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
278 const prim_size_t* WINPR_RESTRICT roi,
const uint8_t rPos,
279 const uint8_t gPos,
const uint8_t bPos,
const uint8_t aPos)
281 const UINT32 nWidth = roi->width;
282 const UINT32 nHeight = roi->height;
/* Bytes between the end of the payload of a row and the start of the next row, per plane. */
283 const UINT32 yPad = srcStep[0] - roi->width;
284 const UINT32 uPad = srcStep[1] - roi->width;
285 const UINT32 vPad = srcStep[2] - roi->width;
286 const UINT32 dPad = dstStep - roi->width * 4;
/* Running read/write cursors over the three source planes and the destination. */
287 const uint8_t* pY = pSrc[0];
288 const uint8_t* pU = pSrc[1];
289 const uint8_t* pV = pSrc[2];
290 uint8_t* pRGB = pDst;
291 const int16x8_t c128 = vdupq_n_s16(128);
/* pad: pixels per row left over after the 8 pixel wide vector loop. */
292 const DWORD pad = nWidth % 8;
294 for (UINT32 y = 0; y < nHeight; y++)
/* Vector loop: eight pixels per iteration. */
296 for (UINT32 x = 0; x < nWidth - pad; x += 8)
/* Load eight samples from each plane and widen them to 16 bit. */
298 const uint8x8_t Yu = vld1_u8(pY);
299 const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu));
300 const uint8x8_t Uu = vld1_u8(pU);
301 const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu));
302 const uint8x8_t Vu = vld1_u8(pV);
303 const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu));
/* Centre the chroma samples around zero before the conversion. */
306 const int16x8_t D = vsubq_s16(U, c128);
307 const int16x8_t E = vsubq_s16(V, c128);
308 pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos);
/* Scalar tail for the remaining `pad` pixels of the row. */
314 for (UINT32 x = 0; x < pad; x++)
316 const BYTE Y = *pY++;
317 const BYTE U = *pU++;
318 const BYTE V = *pV++;
319 const BYTE r = YUV2R(Y, U, V);
320 const BYTE g = YUV2G(Y, U, V);
321 const BYTE b = YUV2B(Y, U, V);
334 return PRIMITIVES_SUCCESS;
/* YUV 4:4:4 to 32 bit RGB entry point.
 *
 * Selects the channel byte positions (R, G, B, A in that argument order) for
 * the requested DstFormat and calls neon_YUV444ToX. The final return forwards
 * the call unchanged to the generic implementation.
 *
 * NOTE(review): the end of the parameter list, the switch header and the label
 * guarding the generic fallback are not visible in this excerpt; the fallback
 * is presumably the default case.
 */
337 static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(
const BYTE* WINPR_RESTRICT pSrc[3],
338 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
339 UINT32 dstStep, UINT32 DstFormat,
/* B at byte 0, G at 1, R at 2, A/X at 3. */
344 case PIXEL_FORMAT_BGRA32:
345 case PIXEL_FORMAT_BGRX32:
346 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
/* R at byte 0, G at 1, B at 2, A/X at 3. */
348 case PIXEL_FORMAT_RGBA32:
349 case PIXEL_FORMAT_RGBX32:
350 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
/* A/X at byte 0, R at 1, G at 2, B at 3. */
352 case PIXEL_FORMAT_ARGB32:
353 case PIXEL_FORMAT_XRGB32:
354 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
/* A/X at byte 0, B at 1, G at 2, R at 3. */
356 case PIXEL_FORMAT_ABGR32:
357 case PIXEL_FORMAT_XBGR32:
358 return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Any other format: hand over to the generic (non NEON) primitive. */
361 return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/* Copy the main (luma) YUV 4:2:0 frame into the YUV 4:4:4 destination planes.
 *
 * The Y plane is copied row by row. Each half resolution chroma row y is
 * expanded into the two destination rows 2 * y and 2 * y + 1 (pU/pU1 and
 * pV/pV1). roi (left/top/right/bottom) selects the region to process.
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the end of the parameter list, the declaration of x and the
 * statements that store the chroma samples are not visible in this excerpt;
 * presumably every source sample is replicated into a 2x2 destination block.
 */
365 static pstatus_t neon_LumaToYUV444(
const BYTE* WINPR_RESTRICT pSrcRaw[3],
const UINT32 srcStep[3],
366 BYTE* WINPR_RESTRICT pDstRaw[3],
const UINT32 dstStep[3],
369 const UINT32 nWidth = roi->right - roi->left;
370 const UINT32 nHeight = roi->bottom - roi->top;
/* Rounded up so an odd width/height still covers the last column/row. */
371 const UINT32 halfWidth = (nWidth + 1) / 2;
372 const UINT32 halfHeight = (nHeight + 1) / 2;
373 const UINT32 evenY = 0;
/* Plane pointers moved to the top-left corner of the region; the source chroma
 * planes are subsampled, hence top / 2 and left / 2. */
374 const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
375 pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
376 pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
377 BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
378 pDstRaw[1] + roi->top * dstStep[1] + roi->left,
379 pDstRaw[2] + roi->top * dstStep[2] + roi->left };
/* Y plane: plain row-wise copy. */
383 for (UINT32 y = 0; y < nHeight; y++)
385 const BYTE* Ym = pSrc[0] + srcStep[0] * y;
386 BYTE* pY = pDst[0] + dstStep[0] * y;
387 memcpy(pY, Ym, nWidth);
/* U and V planes: one source row feeds two destination rows. */
392 for (UINT32 y = 0; y < halfHeight; y++)
394 const UINT32 val2y = (2 * y + evenY);
395 const BYTE* Um = pSrc[1] + srcStep[1] * y;
396 const BYTE* Vm = pSrc[2] + srcStep[2] * y;
397 BYTE* pU = pDst[1] + dstStep[1] * val2y;
398 BYTE* pV = pDst[2] + dstStep[2] * val2y;
399 BYTE* pU1 = pU + dstStep[1];
400 BYTE* pV1 = pV + dstStep[2];
/* Vector loop, 16 chroma samples per iteration. The strict `x + 16 < halfWidth`
 * condition leaves the final samples (up to 16) to the scalar loop below. */
403 for (; x + 16 < halfWidth; x += 16)
406 const uint8x16_t u = vld1q_u8(Um);
417 const uint8x16_t v = vld1q_u8(Vm);
/* Scalar remainder, one chroma sample per iteration. */
429 for (; x < halfWidth; x++)
431 const BYTE u = *Um++;
432 const BYTE v = *Vm++;
444 return PRIMITIVES_SUCCESS;
/* In-place filter over the U and V planes of the YUV 4:4:4 destination.
 *
 * For every 2x2 block inside the region the top-left sample is recomputed as
 *   4 * topLeft - topRight - bottomLeft - bottomRight
 * The NEON path saturates the result to 8 bit with vqmovun_s16, the scalar
 * path passes it through CONDITIONAL_CLIP together with the original sample.
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the end of the parameter list, the bodies of the two bounds
 * checks and a few statements of the vector path (e.g. the declarations of
 * up/vp and the write of u8/v8 back into the loaded vectors) are not visible
 * in this excerpt.
 */
447 static pstatus_t neon_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
450 const UINT32 oddY = 1;
451 const UINT32 evenY = 0;
452 const UINT32 nWidth = roi->right - roi->left;
453 const UINT32 nHeight = roi->bottom - roi->top;
454 const UINT32 halfHeight = (nHeight + 1) / 2;
455 const UINT32 halfWidth = (nWidth + 1) / 2;
/* Number of 2x2 blocks per row left to the scalar loop (remainder modulo 16,
 * although the vector loop below advances in steps of 8). */
456 const UINT32 halfPad = halfWidth % 16;
/* y counts 2x2 block rows, starting at the block row that contains roi->top. */
459 for (UINT32 y = roi->top / 2; y < halfHeight + roi->top / 2; y++)
461 const UINT32 val2y = (y * 2 + evenY);
462 const UINT32 val2y1 = val2y + oddY;
/* pU/pV: even (upper) row of the block; pU1/pV1: odd (lower) row. */
463 BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
464 BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
465 BYTE* pU = pDst[1] + dstStep[1] * val2y;
466 BYTE* pV = pDst[2] + dstStep[2] * val2y;
/* Bounds check on the odd row (NOTE(review): presumably skips the block row; body not visible). */
468 if (val2y1 > nHeight + roi->top)
/* x counts 2x2 block columns, starting at the block column that contains roi->left. */
471 UINT32 x = roi->left / 2;
/* Vector loop: eight 2x2 blocks per iteration. vld2 de-interleaves the row
 * into even columns (val[0]) and odd columns (val[1]). */
472 for (; x < halfWidth + roi->left / 2 - halfPad; x += 8)
476 uint8x8x2_t u = vld2_u8(&pU[2 * x]);
478 vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2));
479 const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]);
/* Sum of the three neighbours: both samples of the lower row plus the top-right sample. */
480 const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]);
481 const int16x8_t us = vreinterpretq_s16_u16(
482 vaddw_u8(usub, u.val[1]));
483 const int16x8_t un = vsubq_s16(up, us);
484 const uint8x8_t u8 = vqmovun_s16(un);
486 vst2_u8(&pU[2 * x], u);
/* Same computation for the V plane. */
490 uint8x8x2_t v = vld2_u8(&pV[2 * x]);
492 vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2));
493 const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]);
494 const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]);
495 const int16x8_t vs = vreinterpretq_s16_u16(
496 vaddw_u8(vsub, v.val[1]));
497 const int16x8_t vn = vsubq_s16(vp, vs);
498 const uint8x8_t v8 = vqmovun_s16(vn);
500 vst2_u8(&pV[2 * x], v);
/* Scalar remainder: one 2x2 block per iteration. */
504 for (; x < halfWidth + roi->left / 2; x++)
506 const UINT32 val2x = (x * 2);
507 const UINT32 val2x1 = val2x + 1;
508 const BYTE inU = pU[val2x];
509 const BYTE inV = pV[val2x];
510 const INT32 up = inU * 4;
511 const INT32 vp = inV * 4;
/* Bounds check on the odd column (NOTE(review): presumably skips the block; body not visible). */
515 if (val2x1 > nWidth + roi->left)
518 u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
519 v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
520 pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
521 pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
525 return PRIMITIVES_SUCCESS;
/* Merge the auxiliary (chroma v1) YUV 4:2:0 frame into the YUV 4:4:4
 * destination planes and then run neon_ChromaFilter on the result.
 *
 * Step 1: rows of source plane 0 are copied into the odd rows of the
 *         destination U and V planes; within every group of 16 source rows
 *         the first 8 go to U, the others to V.
 * Step 2: source planes 1 and 2 are written to the odd columns of the even
 *         rows of the destination U and V planes.
 *
 * Returns the status of neon_ChromaFilter.
 *
 * NOTE(review): the end of the parameter list, the declarations of uY, vY, pX
 * and x, the else keyword between the two branches of step 1 and the scalar
 * stores of step 2 are not visible in this excerpt.
 */
528 static pstatus_t neon_ChromaV1ToYUV444(
const BYTE* WINPR_RESTRICT pSrcRaw[3],
529 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
530 const UINT32 dstStep[3],
533 const UINT32 mod = 16;
536 const UINT32 nWidth = roi->right - roi->left;
537 const UINT32 nHeight = roi->bottom - roi->top;
/* Rounded down here (no + 1), unlike the other helpers in this file. */
538 const UINT32 halfWidth = (nWidth) / 2;
539 const UINT32 halfHeight = (nHeight) / 2;
540 const UINT32 oddY = 1;
541 const UINT32 evenY = 0;
542 const UINT32 oddX = 1;
/* nHeight advanced to the next multiple of 16; a height that already is a
 * multiple of 16 is increased by a further 16. */
545 const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
546 const UINT32 halfPad = halfWidth % 16;
/* Plane pointers moved to the top-left corner of the region; the source chroma
 * planes are subsampled, hence top / 2 and left / 2. */
547 const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
548 pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
549 pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
550 BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
551 pDstRaw[1] + roi->top * dstStep[1] + roi->left,
552 pDstRaw[2] + roi->top * dstStep[2] + roi->left };
/* Step 1: distribute the rows of source plane 0 to the odd U/V destination rows. */
556 for (UINT32 y = 0; y < padHeigth; y++)
558 const BYTE* Ya = pSrc[0] + srcStep[0] * y;
/* With mod == 16 this is true for the first 8 rows of every 16 row group. */
561 if ((y) % mod < (mod + 1) / 2)
/* Next odd row of the U plane. */
563 const UINT32 pos = (2 * uY++ + oddY);
568 pX = pDst[1] + dstStep[1] * pos;
/* Next odd row of the V plane. */
572 const UINT32 pos = (2 * vY++ + oddY);
577 pX = pDst[2] + dstStep[2] * pos;
580 memcpy(pX, Ya, nWidth);
/* Step 2: source planes 1 and 2 fill the odd columns of the even destination rows. */
584 for (UINT32 y = 0; y < halfHeight; y++)
586 const UINT32 val2y = (y * 2 + evenY);
587 const BYTE* Ua = pSrc[1] + srcStep[1] * y;
588 const BYTE* Va = pSrc[2] + srcStep[2] * y;
589 BYTE* pU = pDst[1] + dstStep[1] * val2y;
590 BYTE* pV = pDst[2] + dstStep[2] * val2y;
/* Vector loop: load 32 destination bytes de-interleaved, replace the odd
 * columns (val[1]) with 16 source bytes and store them interleaved again. */
593 for (; x < halfWidth - halfPad; x += 16)
596 uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
597 u.val[1] = vld1q_u8(&Ua[x]);
598 vst2q_u8(&pU[2 * x], u);
601 uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
602 v.val[1] = vld1q_u8(&Va[x]);
603 vst2q_u8(&pV[2 * x], v);
/* Scalar remainder; val2x1 is the odd destination column. */
607 for (; x < halfWidth; x++)
609 const UINT32 val2x1 = (x * 2 + oddX);
616 return neon_ChromaFilter(pDst, dstStep, roi);
/* Merge the auxiliary (chroma v2) YUV 4:2:0 frame into the YUV 4:4:4
 * destination planes and then run neon_ChromaFilter on the result.
 *
 * Step 1: source plane 0 fills the odd columns of every destination U/V row
 *         of the region; the U values start at the left of the source row,
 *         the V values nTotalWidth / 2 bytes further right.
 * Step 2: source planes 1 and 2 fill columns 4 * x + 0 and 4 * x + 2 of the
 *         odd destination rows; within each source row the values for the
 *         U plane come first, those for the V plane nTotalWidth / 4 bytes
 *         further right.
 *
 * nTotalHeight is not used by any statement visible in this excerpt.
 * Returns the status of neon_ChromaFilter.
 *
 * NOTE(review): the end of the parameter list, the declarations of x and the
 * scalar stores of step 1 are not visible in this excerpt.
 */
619 static pstatus_t neon_ChromaV2ToYUV444(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
620 UINT32 nTotalWidth, UINT32 nTotalHeight,
621 BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
624 const UINT32 nWidth = roi->right - roi->left;
625 const UINT32 nHeight = roi->bottom - roi->top;
/* Half and quarter sizes are rounded up; the *Pad values are the remainders
 * left to the scalar loops after the 16 wide vector loops. */
626 const UINT32 halfWidth = (nWidth + 1) / 2;
627 const UINT32 halfPad = halfWidth % 16;
628 const UINT32 halfHeight = (nHeight + 1) / 2;
629 const UINT32 quaterWidth = (nWidth + 3) / 4;
630 const UINT32 quaterPad = quaterWidth % 16;
/* Step 1: odd columns of all rows, taken from source plane 0. */
633 for (UINT32 y = 0; y < nHeight; y++)
635 const UINT32 yTop = y + roi->top;
636 const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
637 const BYTE* pYaV = pYaU + nTotalWidth / 2;
638 BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
639 BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
/* Vector loop: load 32 destination bytes de-interleaved, replace the odd
 * columns (val[1]) with 16 source bytes and store them interleaved again. */
642 for (; x < halfWidth - halfPad; x += 16)
645 uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
646 u.val[1] = vld1q_u8(&pYaU[x]);
647 vst2q_u8(&pU[2 * x], u);
650 uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
651 v.val[1] = vld1q_u8(&pYaV[x]);
652 vst2q_u8(&pV[2 * x], v);
/* Scalar remainder; odd is the destination column. */
656 for (; x < halfWidth; x++)
658 const UINT32 odd = 2 * x + 1;
/* Step 2: columns 4x and 4x + 2 of the odd rows, taken from source planes 1 and 2. */
665 for (UINT32 y = 0; y < halfHeight; y++)
667 const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
668 const BYTE* pUaV = pUaU + nTotalWidth / 4;
669 const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
670 const BYTE* pVaV = pVaU + nTotalWidth / 4;
671 BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
672 BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
/* Vector loop: load 64 destination bytes de-interleaved by four, replace
 * lanes 0 and 2 and store them interleaved again. */
675 for (; x < quaterWidth - quaterPad; x += 16)
678 uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
679 u.val[0] = vld1q_u8(&pUaU[x]);
680 u.val[2] = vld1q_u8(&pVaU[x]);
681 vst4q_u8(&pU[4 * x], u);
684 uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
685 v.val[0] = vld1q_u8(&pUaV[x]);
686 v.val[2] = vld1q_u8(&pVaV[x]);
687 vst4q_u8(&pV[4 * x], v);
/* Scalar remainder with the same mapping as the vector loop. */
691 for (; x < quaterWidth; x++)
693 pU[4 * x + 0] = pUaU[x];
694 pV[4 * x + 0] = pUaV[x];
695 pU[4 * x + 2] = pVaU[x];
696 pV[4 * x + 2] = pVaV[x];
700 return neon_ChromaFilter(pDst, dstStep, roi);
/* Entry point for combining an AVC444 sub frame into the YUV 4:4:4 buffer.
 *
 * Checks that the source and destination plane arrays and all three planes of
 * each are non-NULL, then dispatches on `type` to neon_LumaToYUV444,
 * neon_ChromaV1ToYUV444 or neon_ChromaV2ToYUV444. Only the chroma v2 handler
 * receives nWidth/nHeight.
 *
 * NOTE(review): the end of the parameter list, the values returned by the
 * NULL checks, the switch header, the case label in front of the
 * neon_LumaToYUV444 call and any default branch are not visible in this
 * excerpt.
 */
703 static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
704 const BYTE* WINPR_RESTRICT pSrc[3],
705 const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
706 BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
/* Reject missing source planes. */
709 if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
/* Reject missing destination planes. */
712 if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
/* Main frame: copy luma and expand chroma. */
721 return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
/* Auxiliary frame, chroma v1 layout. */
723 case AVC444_CHROMAv1:
724 return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
/* Auxiliary frame, chroma v2 layout. */
726 case AVC444_CHROMAv2:
727 return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
/* Register the NEON YUV primitives in `prims`.
 *
 * With NEON_INTRINSICS_ENABLED defined, the generic primitives are fetched
 * (the RGB converters in this file fall back to them), prims is initialised
 * through primitives_init_YUV and, if the processor reports NEON support, the
 * three YUV entries are replaced by the NEON implementations.
 *
 * NOTE(review): the last log call presumably belongs to the #else branch of
 * the conditional; the #else/#endif lines and the end of the function are not
 * visible in this excerpt.
 */
735 void primitives_init_YUV_neon(
primitives_t* WINPR_RESTRICT prims)
737 #if defined(NEON_INTRINSICS_ENABLED)
738 generic = primitives_get_generic();
/* Start with the baseline YUV primitives, then override selectively below. */
739 primitives_init_YUV(prims);
/* Runtime check: only install the NEON variants when the CPU supports them. */
741 if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
743 WLog_VRB(PRIM_TAG,
"NEON optimizations");
744 prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
745 prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
746 prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
749 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or neon intrinsics not available");