FreeRDP
prim_YUV_neon.c
1 
23 #include <freerdp/config.h>
24 
25 #include <winpr/sysinfo.h>
26 #include <winpr/crt.h>
27 #include <freerdp/types.h>
28 #include <freerdp/primitives.h>
29 
30 #include "prim_internal.h"
31 #include "prim_YUV.h"
32 
33 #if defined(NEON_ENABLED)
34 #include <arm_neon.h>
35 
36 static primitives_t* generic = NULL;
37 
38 static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
39  int16x4_t Eh, int16x4_t El)
40 {
41  /* R = (256 * Y + 403 * (V - 128)) >> 8 */
42  const int16x4_t c403 = vdup_n_s16(403);
43  const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
44  const int32x4_t CEl = vmlal_s16(Cl, El, c403);
45  const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
46  const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
47  const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
48  return vqmovun_s16(R);
49 }
50 
51 static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
52  int16x4_t Eh, int16x4_t El)
53 {
54  /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
55  const int16x4_t c48 = vdup_n_s16(48);
56  const int16x4_t c120 = vdup_n_s16(120);
57  const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
58  const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
59  const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
60  const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
61  const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
62  const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
63  const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
64  return vqmovun_s16(G);
65 }
66 
67 static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
68  int16x4_t Eh, int16x4_t El)
69 {
70  /* B = (256L * Y + 475 * (U - 128)) >> 8*/
71  const int16x4_t c475 = vdup_n_s16(475);
72  const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
73  const int32x4_t CDl = vmlal_s16(Ch, Dl, c475);
74  const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
75  const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
76  const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
77  return vqmovun_s16(B);
78 }
79 
80 static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E,
81  const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
82  const uint8_t aPos)
83 {
84  uint8x8x4_t bgrx;
85  const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */
86  const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256); /* Y * 256 */
87  const int16x4_t Dh = vget_high_s16(D);
88  const int16x4_t Dl = vget_low_s16(D);
89  const int16x4_t Eh = vget_high_s16(E);
90  const int16x4_t El = vget_low_s16(E);
91  {
92  /* B = (256L * Y + 475 * (U - 128)) >> 8*/
93  const int16x4_t c475 = vdup_n_s16(475);
94  const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
95  const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
96  const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
97  const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
98  const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
99  bgrx.val[bPos] = vqmovun_s16(B);
100  }
101  {
102  /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
103  const int16x4_t c48 = vdup_n_s16(48);
104  const int16x4_t c120 = vdup_n_s16(120);
105  const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
106  const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
107  const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
108  const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
109  const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
110  const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
111  const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
112  bgrx.val[gPos] = vqmovun_s16(G);
113  }
114  {
115  /* R = (256 * Y + 403 * (V - 128)) >> 8 */
116  const int16x4_t c403 = vdup_n_s16(403);
117  const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
118  const int32x4_t CEl = vmlal_s16(Cl, El, c403);
119  const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
120  const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
121  const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
122  bgrx.val[rPos] = vqmovun_s16(R);
123  }
124  {
125  /* A */
126  bgrx.val[aPos] = vdup_n_u8(0xFF);
127  }
128  vst4_u8(pRGB, bgrx);
129  pRGB += 32;
130  return pRGB;
131 }
132 
133 static INLINE pstatus_t neon_YUV420ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
134  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
135  const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
136  const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
137 {
138  const UINT32 nWidth = roi->width;
139  const UINT32 nHeight = roi->height;
140  const DWORD pad = nWidth % 16;
141  const UINT32 yPad = srcStep[0] - roi->width;
142  const UINT32 uPad = srcStep[1] - roi->width / 2;
143  const UINT32 vPad = srcStep[2] - roi->width / 2;
144  const UINT32 dPad = dstStep - roi->width * 4;
145  const int16x8_t c128 = vdupq_n_s16(128);
146 
147  for (UINT32 y = 0; y < nHeight; y += 2)
148  {
149  const uint8_t* pY1 = pSrc[0] + y * srcStep[0];
150  const uint8_t* pY2 = pY1 + srcStep[0];
151  const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
152  const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
153  uint8_t* pRGB1 = pDst + y * dstStep;
154  uint8_t* pRGB2 = pRGB1 + dstStep;
155  const BOOL lastY = y >= nHeight - 1;
156 
157  UINT32 x = 0;
158  for (; x < nWidth - pad;)
159  {
160  const uint8x8_t Uraw = vld1_u8(pU);
161  const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);
162  const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));
163  const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));
164  const uint8x8_t Vraw = vld1_u8(pV);
165  const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);
166  const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));
167  const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));
168  const int16x8_t D1 = vsubq_s16(U1, c128);
169  const int16x8_t E1 = vsubq_s16(V1, c128);
170  const int16x8_t D2 = vsubq_s16(U2, c128);
171  const int16x8_t E2 = vsubq_s16(V2, c128);
172  {
173  const uint8x8_t Y1u = vld1_u8(pY1);
174  const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
175  pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos);
176  pY1 += 8;
177  x += 8;
178  }
179  {
180  const uint8x8_t Y1u = vld1_u8(pY1);
181  const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
182  pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos);
183  pY1 += 8;
184  x += 8;
185  }
186 
187  if (!lastY)
188  {
189  {
190  const uint8x8_t Y2u = vld1_u8(pY2);
191  const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
192  pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos);
193  pY2 += 8;
194  }
195  {
196  const uint8x8_t Y2u = vld1_u8(pY2);
197  const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
198  pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos);
199  pY2 += 8;
200  }
201  }
202 
203  pU += 8;
204  pV += 8;
205  }
206 
207  for (; x < nWidth; x++)
208  {
209  const BYTE U = *pU;
210  const BYTE V = *pV;
211  {
212  const BYTE Y = *pY1++;
213  const BYTE r = YUV2R(Y, U, V);
214  const BYTE g = YUV2G(Y, U, V);
215  const BYTE b = YUV2B(Y, U, V);
216  pRGB1[aPos] = 0xFF;
217  pRGB1[rPos] = r;
218  pRGB1[gPos] = g;
219  pRGB1[bPos] = b;
220  pRGB1 += 4;
221  }
222 
223  if (!lastY)
224  {
225  const BYTE Y = *pY2++;
226  const BYTE r = YUV2R(Y, U, V);
227  const BYTE g = YUV2G(Y, U, V);
228  const BYTE b = YUV2B(Y, U, V);
229  pRGB2[aPos] = 0xFF;
230  pRGB2[rPos] = r;
231  pRGB2[gPos] = g;
232  pRGB2[bPos] = b;
233  pRGB2 += 4;
234  }
235 
236  if (x % 2)
237  {
238  pU++;
239  pV++;
240  }
241  }
242 
243  pRGB1 += dPad;
244  pRGB2 += dPad;
245  pY1 += yPad;
246  pY2 += yPad;
247  pU += uPad;
248  pV += vPad;
249  }
250 
251  return PRIMITIVES_SUCCESS;
252 }
253 
254 static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
255  const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
256  UINT32 dstStep, UINT32 DstFormat,
257  const prim_size_t* WINPR_RESTRICT roi)
258 {
259  switch (DstFormat)
260  {
261  case PIXEL_FORMAT_BGRA32:
262  case PIXEL_FORMAT_BGRX32:
263  return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
264 
265  case PIXEL_FORMAT_RGBA32:
266  case PIXEL_FORMAT_RGBX32:
267  return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
268 
269  case PIXEL_FORMAT_ARGB32:
270  case PIXEL_FORMAT_XRGB32:
271  return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
272 
273  case PIXEL_FORMAT_ABGR32:
274  case PIXEL_FORMAT_XBGR32:
275  return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
276 
277  default:
278  return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
279  }
280 }
281 
282 static INLINE pstatus_t neon_YUV444ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
283  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
284  const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
285  const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
286 {
287  const UINT32 nWidth = roi->width;
288  const UINT32 nHeight = roi->height;
289  const UINT32 yPad = srcStep[0] - roi->width;
290  const UINT32 uPad = srcStep[1] - roi->width;
291  const UINT32 vPad = srcStep[2] - roi->width;
292  const UINT32 dPad = dstStep - roi->width * 4;
293  const uint8_t* pY = pSrc[0];
294  const uint8_t* pU = pSrc[1];
295  const uint8_t* pV = pSrc[2];
296  uint8_t* pRGB = pDst;
297  const int16x8_t c128 = vdupq_n_s16(128);
298  const DWORD pad = nWidth % 8;
299 
300  for (UINT32 y = 0; y < nHeight; y++)
301  {
302  for (UINT32 x = 0; x < nWidth - pad; x += 8)
303  {
304  const uint8x8_t Yu = vld1_u8(pY);
305  const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu));
306  const uint8x8_t Uu = vld1_u8(pU);
307  const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu));
308  const uint8x8_t Vu = vld1_u8(pV);
309  const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu));
310  /* Do the calculations on Y in 32bit width, the result of 255 * 256 does not fit
311  * a signed 16 bit value. */
312  const int16x8_t D = vsubq_s16(U, c128);
313  const int16x8_t E = vsubq_s16(V, c128);
314  pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos);
315  pY += 8;
316  pU += 8;
317  pV += 8;
318  }
319 
320  for (UINT32 x = 0; x < pad; x++)
321  {
322  const BYTE Y = *pY++;
323  const BYTE U = *pU++;
324  const BYTE V = *pV++;
325  const BYTE r = YUV2R(Y, U, V);
326  const BYTE g = YUV2G(Y, U, V);
327  const BYTE b = YUV2B(Y, U, V);
328  pRGB[aPos] = 0xFF;
329  pRGB[rPos] = r;
330  pRGB[gPos] = g;
331  pRGB[bPos] = b;
332  pRGB += 4;
333  }
334 
335  pRGB += dPad;
336  pY += yPad;
337  pU += uPad;
338  pV += vPad;
339  }
340 
341  return PRIMITIVES_SUCCESS;
342 }
343 
344 static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
345  const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
346  UINT32 dstStep, UINT32 DstFormat,
347  const prim_size_t* WINPR_RESTRICT roi)
348 {
349  switch (DstFormat)
350  {
351  case PIXEL_FORMAT_BGRA32:
352  case PIXEL_FORMAT_BGRX32:
353  return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
354 
355  case PIXEL_FORMAT_RGBA32:
356  case PIXEL_FORMAT_RGBX32:
357  return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
358 
359  case PIXEL_FORMAT_ARGB32:
360  case PIXEL_FORMAT_XRGB32:
361  return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
362 
363  case PIXEL_FORMAT_ABGR32:
364  case PIXEL_FORMAT_XBGR32:
365  return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
366 
367  default:
368  return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
369  }
370 }
371 
372 static pstatus_t neon_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3], const UINT32 srcStep[3],
373  BYTE* WINPR_RESTRICT pDstRaw[3], const UINT32 dstStep[3],
374  const RECTANGLE_16* WINPR_RESTRICT roi)
375 {
376  const UINT32 nWidth = roi->right - roi->left;
377  const UINT32 nHeight = roi->bottom - roi->top;
378  const UINT32 halfWidth = (nWidth + 1) / 2;
379  const UINT32 halfHeight = (nHeight + 1) / 2;
380  const UINT32 evenY = 0;
381  const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
382  pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
383  pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
384  BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
385  pDstRaw[1] + roi->top * dstStep[1] + roi->left,
386  pDstRaw[2] + roi->top * dstStep[2] + roi->left };
387 
388  /* Y data is already here... */
389  /* B1 */
390  for (UINT32 y = 0; y < nHeight; y++)
391  {
392  const BYTE* Ym = pSrc[0] + srcStep[0] * y;
393  BYTE* pY = pDst[0] + dstStep[0] * y;
394  memcpy(pY, Ym, nWidth);
395  }
396 
397  /* The first half of U, V are already here part of this frame. */
398  /* B2 and B3 */
399  for (UINT32 y = 0; y < halfHeight; y++)
400  {
401  const UINT32 val2y = (2 * y + evenY);
402  const BYTE* Um = pSrc[1] + srcStep[1] * y;
403  const BYTE* Vm = pSrc[2] + srcStep[2] * y;
404  BYTE* pU = pDst[1] + dstStep[1] * val2y;
405  BYTE* pV = pDst[2] + dstStep[2] * val2y;
406  BYTE* pU1 = pU + dstStep[1];
407  BYTE* pV1 = pV + dstStep[2];
408 
409  UINT32 x = 0;
410  for (; x + 16 < halfWidth; x += 16)
411  {
412  {
413  const uint8x16_t u = vld1q_u8(Um);
414  uint8x16x2_t u2x;
415  u2x.val[0] = u;
416  u2x.val[1] = u;
417  vst2q_u8(pU, u2x);
418  vst2q_u8(pU1, u2x);
419  Um += 16;
420  pU += 32;
421  pU1 += 32;
422  }
423  {
424  const uint8x16_t v = vld1q_u8(Vm);
425  uint8x16x2_t v2x;
426  v2x.val[0] = v;
427  v2x.val[1] = v;
428  vst2q_u8(pV, v2x);
429  vst2q_u8(pV1, v2x);
430  Vm += 16;
431  pV += 32;
432  pV1 += 32;
433  }
434  }
435 
436  for (; x < halfWidth; x++)
437  {
438  const BYTE u = *Um++;
439  const BYTE v = *Vm++;
440  *pU++ = u;
441  *pU++ = u;
442  *pU1++ = u;
443  *pU1++ = u;
444  *pV++ = v;
445  *pV++ = v;
446  *pV1++ = v;
447  *pV1++ = v;
448  }
449  }
450 
451  return PRIMITIVES_SUCCESS;
452 }
453 
454 static pstatus_t neon_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
455  const RECTANGLE_16* WINPR_RESTRICT roi)
456 {
457  const UINT32 oddY = 1;
458  const UINT32 evenY = 0;
459  const UINT32 nWidth = roi->right - roi->left;
460  const UINT32 nHeight = roi->bottom - roi->top;
461  const UINT32 halfHeight = (nHeight + 1) / 2;
462  const UINT32 halfWidth = (nWidth + 1) / 2;
463  const UINT32 halfPad = halfWidth % 16;
464 
465  /* Filter */
466  for (UINT32 y = roi->top / 2; y < halfHeight + roi->top / 2; y++)
467  {
468  const UINT32 val2y = (y * 2 + evenY);
469  const UINT32 val2y1 = val2y + oddY;
470  BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
471  BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
472  BYTE* pU = pDst[1] + dstStep[1] * val2y;
473  BYTE* pV = pDst[2] + dstStep[2] * val2y;
474 
475  if (val2y1 > nHeight + roi->top)
476  continue;
477 
478  UINT32 x = roi->left / 2;
479  for (; x < halfWidth + roi->left / 2 - halfPad; x += 8)
480  {
481  {
482  /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */
483  uint8x8x2_t u = vld2_u8(&pU[2 * x]);
484  const int16x8_t up =
485  vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */
486  const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]);
487  const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */
488  const int16x8_t us = vreinterpretq_s16_u16(
489  vaddw_u8(usub, u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */
490  const int16x8_t un = vsubq_s16(up, us);
491  const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */
492  u.val[0] = u8;
493  vst2_u8(&pU[2 * x], u);
494  }
495  {
496  /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */
497  uint8x8x2_t v = vld2_u8(&pV[2 * x]);
498  const int16x8_t vp =
499  vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */
500  const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]);
501  const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */
502  const int16x8_t vs = vreinterpretq_s16_u16(
503  vaddw_u8(vsub, v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */
504  const int16x8_t vn = vsubq_s16(vp, vs);
505  const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */
506  v.val[0] = v8;
507  vst2_u8(&pV[2 * x], v);
508  }
509  }
510 
511  for (; x < halfWidth + roi->left / 2; x++)
512  {
513  const UINT32 val2x = (x * 2);
514  const UINT32 val2x1 = val2x + 1;
515  const BYTE inU = pU[val2x];
516  const BYTE inV = pV[val2x];
517  const INT32 up = inU * 4;
518  const INT32 vp = inV * 4;
519  INT32 u2020;
520  INT32 v2020;
521 
522  if (val2x1 > nWidth + roi->left)
523  continue;
524 
525  u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
526  v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
527  pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
528  pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
529  }
530  }
531 
532  return PRIMITIVES_SUCCESS;
533 }
534 
535 static pstatus_t neon_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
536  const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
537  const UINT32 dstStep[3],
538  const RECTANGLE_16* WINPR_RESTRICT roi)
539 {
540  const UINT32 mod = 16;
541  UINT32 uY = 0;
542  UINT32 vY = 0;
543  const UINT32 nWidth = roi->right - roi->left;
544  const UINT32 nHeight = roi->bottom - roi->top;
545  const UINT32 halfWidth = (nWidth) / 2;
546  const UINT32 halfHeight = (nHeight) / 2;
547  const UINT32 oddY = 1;
548  const UINT32 evenY = 0;
549  const UINT32 oddX = 1;
550  /* The auxilary frame is aligned to multiples of 16x16.
551  * We need the padded height for B4 and B5 conversion. */
552  const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
553  const UINT32 halfPad = halfWidth % 16;
554  const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
555  pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
556  pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
557  BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
558  pDstRaw[1] + roi->top * dstStep[1] + roi->left,
559  pDstRaw[2] + roi->top * dstStep[2] + roi->left };
560 
561  /* The second half of U and V is a bit more tricky... */
562  /* B4 and B5 */
563  for (UINT32 y = 0; y < padHeigth; y++)
564  {
565  const BYTE* Ya = pSrc[0] + srcStep[0] * y;
566  BYTE* pX;
567 
568  if ((y) % mod < (mod + 1) / 2)
569  {
570  const UINT32 pos = (2 * uY++ + oddY);
571 
572  if (pos >= nHeight)
573  continue;
574 
575  pX = pDst[1] + dstStep[1] * pos;
576  }
577  else
578  {
579  const UINT32 pos = (2 * vY++ + oddY);
580 
581  if (pos >= nHeight)
582  continue;
583 
584  pX = pDst[2] + dstStep[2] * pos;
585  }
586 
587  memcpy(pX, Ya, nWidth);
588  }
589 
590  /* B6 and B7 */
591  for (UINT32 y = 0; y < halfHeight; y++)
592  {
593  const UINT32 val2y = (y * 2 + evenY);
594  const BYTE* Ua = pSrc[1] + srcStep[1] * y;
595  const BYTE* Va = pSrc[2] + srcStep[2] * y;
596  BYTE* pU = pDst[1] + dstStep[1] * val2y;
597  BYTE* pV = pDst[2] + dstStep[2] * val2y;
598 
599  UINT32 x = 0;
600  for (; x < halfWidth - halfPad; x += 16)
601  {
602  {
603  uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
604  u.val[1] = vld1q_u8(&Ua[x]);
605  vst2q_u8(&pU[2 * x], u);
606  }
607  {
608  uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
609  v.val[1] = vld1q_u8(&Va[x]);
610  vst2q_u8(&pV[2 * x], v);
611  }
612  }
613 
614  for (; x < halfWidth; x++)
615  {
616  const UINT32 val2x1 = (x * 2 + oddX);
617  pU[val2x1] = Ua[x];
618  pV[val2x1] = Va[x];
619  }
620  }
621 
622  /* Filter */
623  return neon_ChromaFilter(pDst, dstStep, roi);
624 }
625 
626 static pstatus_t neon_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
627  UINT32 nTotalWidth, UINT32 nTotalHeight,
628  BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
629  const RECTANGLE_16* WINPR_RESTRICT roi)
630 {
631  const UINT32 nWidth = roi->right - roi->left;
632  const UINT32 nHeight = roi->bottom - roi->top;
633  const UINT32 halfWidth = (nWidth + 1) / 2;
634  const UINT32 halfPad = halfWidth % 16;
635  const UINT32 halfHeight = (nHeight + 1) / 2;
636  const UINT32 quaterWidth = (nWidth + 3) / 4;
637  const UINT32 quaterPad = quaterWidth % 16;
638 
639  /* B4 and B5: odd UV values for width/2, height */
640  for (UINT32 y = 0; y < nHeight; y++)
641  {
642  const UINT32 yTop = y + roi->top;
643  const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
644  const BYTE* pYaV = pYaU + nTotalWidth / 2;
645  BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
646  BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
647 
648  UINT32 x = 0;
649  for (; x < halfWidth - halfPad; x += 16)
650  {
651  {
652  uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
653  u.val[1] = vld1q_u8(&pYaU[x]);
654  vst2q_u8(&pU[2 * x], u);
655  }
656  {
657  uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
658  v.val[1] = vld1q_u8(&pYaV[x]);
659  vst2q_u8(&pV[2 * x], v);
660  }
661  }
662 
663  for (; x < halfWidth; x++)
664  {
665  const UINT32 odd = 2 * x + 1;
666  pU[odd] = pYaU[x];
667  pV[odd] = pYaV[x];
668  }
669  }
670 
671  /* B6 - B9 */
672  for (UINT32 y = 0; y < halfHeight; y++)
673  {
674  const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
675  const BYTE* pUaV = pUaU + nTotalWidth / 4;
676  const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
677  const BYTE* pVaV = pVaU + nTotalWidth / 4;
678  BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
679  BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
680 
681  UINT32 x = 0;
682  for (; x < quaterWidth - quaterPad; x += 16)
683  {
684  {
685  uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
686  u.val[0] = vld1q_u8(&pUaU[x]);
687  u.val[2] = vld1q_u8(&pVaU[x]);
688  vst4q_u8(&pU[4 * x], u);
689  }
690  {
691  uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
692  v.val[0] = vld1q_u8(&pUaV[x]);
693  v.val[2] = vld1q_u8(&pVaV[x]);
694  vst4q_u8(&pV[4 * x], v);
695  }
696  }
697 
698  for (; x < quaterWidth; x++)
699  {
700  pU[4 * x + 0] = pUaU[x];
701  pV[4 * x + 0] = pUaV[x];
702  pU[4 * x + 2] = pVaU[x];
703  pV[4 * x + 2] = pVaV[x];
704  }
705  }
706 
707  return neon_ChromaFilter(pDst, dstStep, roi);
708 }
709 
710 static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
711  const BYTE* WINPR_RESTRICT pSrc[3],
712  const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
713  BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
714  const RECTANGLE_16* WINPR_RESTRICT roi)
715 {
716  if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
717  return -1;
718 
719  if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
720  return -1;
721 
722  if (!roi)
723  return -1;
724 
725  switch (type)
726  {
727  case AVC444_LUMA:
728  return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
729 
730  case AVC444_CHROMAv1:
731  return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
732 
733  case AVC444_CHROMAv2:
734  return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
735 
736  default:
737  return -1;
738  }
739 }
740 #endif
741 
742 void primitives_init_YUV_neon(primitives_t* WINPR_RESTRICT prims)
743 {
744 #if defined(NEON_ENABLED)
745  generic = primitives_get_generic();
746  primitives_init_YUV(prims);
747 
748  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
749  {
750  WLog_VRB(PRIM_TAG, "NEON optimizations");
751  prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
752  prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
753  prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
754  }
755 #else
756  WLog_VRB(PRIM_TAG, "undefined WITH_NEON");
757  WINPR_UNUSED(prims);
758 #endif
759 }