20 #include <freerdp/config.h>
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
26 #include "prim_internal.h"
27 #include "prim_templates.h"
28 #include "prim_colors.h"
31 #if defined(NEON_INTRINSICS_ENABLED)
/*
 * NEON conversion of three planar INT16 source planes, read as Y (pSrc[0]),
 * Cb (pSrc[1]) and Cr (pSrc[2]), into three planar INT16 destination planes
 * R (pDst[0]), G (pDst[1]) and B (pDst[2]). Eight samples are processed per
 * inner-loop iteration and every result is clamped to [0, 255].
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): imax works out to roi->width / 8 rounded down and no tail
 * handling is visible in this excerpt - confirm how widths that are not a
 * multiple of 8 are treated.
 * NOTE(review): the steps are divided by sizeof(int16x8_t); presumably callers
 * pass steps that are multiples of 16 bytes - confirm.
 */
37 neon_yCbCrToRGB_16s16s_P3P3(
const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
38 INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
/* Clamp bounds applied to every output channel. */
42 int16x8_t zero = vdupq_n_s16(0);
43 int16x8_t max = vdupq_n_s16(255);
/*
 * Fixed-point chroma coefficients, broadcast to all eight lanes:
 * 22986 / 2^14 ~ 1.403, -5636 / 2^14 ~ -0.344,
 * -11698 / 2^14 ~ -0.714, 28999 / 2^14 ~ 1.770.
 */
44 int16x8_t r_cr = vdupq_n_s16(22986);
45 int16x8_t g_cb = vdupq_n_s16(-5636);
46 int16x8_t g_cr = vdupq_n_s16(-11698);
47 int16x8_t b_cb = vdupq_n_s16(28999);
/* Offset added to every Y sample before scaling. */
48 int16x8_t c4096 = vdupq_n_s16(4096);
/* View each plane as an array of 8-lane vectors. */
49 const int16x8_t* y_buf = (
const int16x8_t*)pSrc[0];
50 const int16x8_t* cb_buf = (
const int16x8_t*)pSrc[1];
51 const int16x8_t* cr_buf = (
const int16x8_t*)pSrc[2];
52 int16x8_t* r_buf = (int16x8_t*)pDst[0];
53 int16x8_t* g_buf = (int16x8_t*)pDst[1];
54 int16x8_t* b_buf = (int16x8_t*)pDst[2];
/*
 * Row steps expressed in int16x8_t units; presumably used to advance the
 * plane pointers after each row (the advance is not visible in this excerpt).
 */
55 int srcbump = srcStep /
sizeof(int16x8_t);
56 int dstbump = dstStep /
sizeof(int16x8_t);
/* Number of whole 8-sample vectors per row. */
57 int imax = roi->width *
sizeof(INT16) /
sizeof(int16x8_t);
59 for (
int yp = 0; yp < roi->height; ++yp)
61 for (
int i = 0; i < imax; i++)
/* y = (Y + 4096) >> 2 */
81 int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
82 y = vaddq_s16(y, c4096);
83 y = vshrq_n_s16(y, 2);
85 int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
87 int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
/*
 * vqdmulhq_s16 yields the high half of the doubled product, so the
 * following >> 1 leaves (cr * r_cr) >> 16. The sum is then shifted
 * right by 3 and clamped to [0, 255].
 */
89 int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
90 r = vshrq_n_s16(r, 3);
92 r = vminq_s16(vmaxq_s16(r, zero), max);
93 vst1q_s16((INT16*)&r_buf[i], r);
/* Green: y plus both (negative-coefficient) chroma terms, same scaling. */
95 int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
96 g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
97 g = vshrq_n_s16(g, 3);
99 g = vminq_s16(vmaxq_s16(g, zero), max);
100 vst1q_s16((INT16*)&g_buf[i], g);
/* Blue: y plus the Cb term, same scaling. */
102 int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
103 b = vshrq_n_s16(b, 3);
105 b = vminq_s16(vmaxq_s16(b, zero), max);
106 vst1q_s16((INT16*)&b_buf[i], b);
117 return PRIMITIVES_SUCCESS;
/*
 * Converts planar INT16 Y/Cb/Cr (pSrc[0], pSrc[1], pSrc[2]) into 4-channel
 * 8-bit output, with the channel slots selected by rPos, gPos, bPos and aPos.
 * Eight pixels at a time go through the NEON path; the remaining
 * roi->width % 8 pixels of each row go through a scalar loop. The alpha slot
 * is filled with 0xFF in the vector path. Returns PRIMITIVES_SUCCESS.
 *
 * Several lines of the body (the bgrx declarations, the stores to pDst and
 * the pointer advances) are not visible in this excerpt.
 */
120 static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
121 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
122 const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
123 uint8_t gPos, uint8_t bPos, uint8_t aPos)
126 const INT16* pY = pSrc[0];
127 const INT16* pCb = pSrc[1];
128 const INT16* pCr = pSrc[2];
/* Per-row padding beyond the ROI width: source in INT16 units, destination in 4-byte units. */
129 const size_t srcPad = (srcStep - (roi->width *
sizeof(INT16))) /
sizeof(INT16);
130 const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
/* Pixels per row left over after the 8-wide vector loop. */
131 const size_t pad = roi->width % 8;
132 const int16x4_t c4096 = vdup_n_s16(4096);
134 for (UINT32 y = 0; y < roi->height; y++)
136 for (UINT32 x = 0; x < roi->width - pad; x += 8)
/* Widen Y to 32 bits: (Y + 4096) << 16, high and low halves handled separately. */
138 const int16x8_t Y = vld1q_s16(pY);
139 const int16x4_t Yh = vget_high_s16(Y);
140 const int16x4_t Yl = vget_low_s16(Y);
141 const int32x4_t YhAdd = vaddl_s16(Yh, c4096);
142 const int32x4_t YlAdd = vaddl_s16(Yl, c4096);
143 const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
144 const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
145 const int16x8_t Cr = vld1q_s16(pCr);
146 const int16x4_t Crh = vget_high_s16(Cr);
147 const int16x4_t Crl = vget_low_s16(Cr);
148 const int16x8_t Cb = vld1q_s16(pCb);
149 const int16x4_t Cbh = vget_high_s16(Cb);
150 const int16x4_t Cbl = vget_low_s16(Cb);
/* Red: (Cr * 91916 + Yw) >> 21, narrowed to 8 bits with unsigned saturation. */
154 const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916);
155 const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916);
156 const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
157 const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
158 const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
159 const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
160 const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
161 bgrx.val[rPos] = vqmovun_s16(Rs);
/* Green: (Yw - (Cb * 22527 + Cr * 46819)) >> 21, narrowed with saturation. */
165 const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);
166 const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);
167 const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819);
168 const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819);
169 const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
170 const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
171 const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
172 const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
173 const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
174 const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
175 const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
176 const uint8x8_t G = vqmovun_s16(Gs);
/* Blue: (Cb * 115992 + Yw) >> 21, narrowed with saturation. */
181 const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992);
182 const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992);
183 const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
184 const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
185 const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
186 const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
187 const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
188 const uint8x8_t B = vqmovun_s16(Bs);
/* Alpha slot is opaque. */
193 bgrx.val[aPos] = vdup_n_u8(0xFF);
/*
 * Scalar tail for the last `pad` pixels of the row, using 16 fractional bits.
 * NOTE(review): the (INT32) casts below truncate, giving 91915, 46818 and
 * 22526, while the vector path uses 91916, 46819 and 22527 - outputs may
 * differ by one step between the two paths; confirm whether that is intended.
 * NOTE(review): ((*pY++) + 4096) << divisor left-shifts a negative value when
 * the Y sample is below -4096 - confirm the expected input range.
 */
202 for (UINT32 x = 0; x < pad; x++)
204 const INT32 divisor = 16;
205 const INT32 Y = ((*pY++) + 4096) << divisor;
206 const INT32 Cb = (*pCb++);
207 const INT32 Cr = (*pCr++);
208 const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
209 const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
210 const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
211 const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
/* Drop the fractional bits, then the extra >> 5 (matches the >> 21 of the vector path). */
212 INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
213 INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
214 INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
/* CLIP is defined outside this excerpt; presumably clamps to the 8-bit range - confirm. */
216 bgrx[bPos] = CLIP(B);
217 bgrx[gPos] = CLIP(G);
218 bgrx[rPos] = CLIP(R);
232 return PRIMITIVES_SUCCESS;
/*
 * Format dispatcher for the YCbCr -> 4-channel 8-bit conversion.
 * Each supported pair of 32-bit pixel formats is forwarded to
 * neon_yCbCrToRGB_16s8u_P3AC4R_X with the matching channel positions, passed
 * in the order (rPos, gPos, bPos, aPos). Anything else is handed to the
 * generic implementation.
 */
235 static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
236 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
/* B,G,R,A byte order: r=2, g=1, b=0, a=3. */
242 case PIXEL_FORMAT_BGRA32:
243 case PIXEL_FORMAT_BGRX32:
244 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
/* R,G,B,A byte order: r=0, g=1, b=2, a=3. */
246 case PIXEL_FORMAT_RGBA32:
247 case PIXEL_FORMAT_RGBX32:
248 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
/* A,R,G,B byte order: r=1, g=2, b=3, a=0. */
250 case PIXEL_FORMAT_ARGB32:
251 case PIXEL_FORMAT_XRGB32:
252 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
/* A,B,G,R byte order: r=3, g=2, b=1, a=0. */
254 case PIXEL_FORMAT_ABGR32:
255 case PIXEL_FORMAT_XBGR32:
256 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Formats not matched above use the generic (non-NEON) primitive. */
259 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/*
 * Packs three planar INT16 planes, read as R (pSrc[0]), G (pSrc[1]) and
 * B (pSrc[2]), into 4-channel 8-bit output, with the channel slots chosen by
 * rPos, gPos, bPos and aPos. Eight pixels per iteration are narrowed to
 * unsigned 8-bit with saturation; the alpha slot is set to 0xFF. The last
 * roi->width % 8 pixels of each row are handled by a separate loop whose body
 * is not visible in this excerpt. Returns PRIMITIVES_SUCCESS.
 */
264 neon_RGBToRGB_16s8u_P3AC4R_X(
const INT16* WINPR_RESTRICT pSrc[3],
266 BYTE* WINPR_RESTRICT pDst,
269 uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
/* Pixels per row left over after the 8-wide vector loop. */
271 UINT32 pad = roi->width % 8;
273 for (UINT32 y = 0; y < roi->height; y++)
/* Row start pointers: the steps are applied as byte offsets. */
275 const INT16* pr = (
const INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
276 const INT16* pg = (
const INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
277 const INT16* pb = (
const INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
278 BYTE* dst = pDst + y * dstStep;
280 for (UINT32 x = 0; x < roi->width - pad; x += 8)
282 int16x8_t r = vld1q_s16(pr);
283 int16x8_t g = vld1q_s16(pg);
284 int16x8_t b = vld1q_s16(pb);
/* vqmovun_s16 saturates each signed 16-bit lane into the unsigned 8-bit range. */
286 bgrx.val[aPos] = vdup_n_u8(0xFF);
287 bgrx.val[rPos] = vqmovun_s16(r);
288 bgrx.val[gPos] = vqmovun_s16(g);
289 bgrx.val[bPos] = vqmovun_s16(b);
/* Remaining pixels of the row (loop body not visible in this excerpt). */
297 for (UINT32 x = 0; x < pad; x++)
311 return PRIMITIVES_SUCCESS;
/*
 * Format dispatcher for the planar INT16 RGB -> 4-channel 8-bit packing.
 * Each supported pair of 32-bit pixel formats is forwarded to
 * neon_RGBToRGB_16s8u_P3AC4R_X with the matching channel positions, passed in
 * the order (rPos, gPos, bPos, aPos). Anything else is handed to the generic
 * implementation.
 */
315 neon_RGBToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3],
317 BYTE* WINPR_RESTRICT pDst,
/* B,G,R,A byte order: r=2, g=1, b=0, a=3. */
324 case PIXEL_FORMAT_BGRA32:
325 case PIXEL_FORMAT_BGRX32:
326 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
/* R,G,B,A byte order: r=0, g=1, b=2, a=3. */
328 case PIXEL_FORMAT_RGBA32:
329 case PIXEL_FORMAT_RGBX32:
330 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
/* A,R,G,B byte order: r=1, g=2, b=3, a=0. */
332 case PIXEL_FORMAT_ARGB32:
333 case PIXEL_FORMAT_XRGB32:
334 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
/* A,B,G,R byte order: r=3, g=2, b=1, a=0. */
336 case PIXEL_FORMAT_ABGR32:
337 case PIXEL_FORMAT_XBGR32:
338 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Formats not matched above use the generic (non-NEON) primitive. */
341 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/*
 * Registration of the NEON colour primitives. The enclosing function header
 * is not visible in this excerpt; `prims` is presumably its parameter.
 */
349 #if defined(NEON_INTRINSICS_ENABLED)
/* Cache the generic table used as fallback by the dispatchers in this file. */
350 generic = primitives_get_generic();
/* Run the base colour initialisation first; the NEON entries below then override three slots. */
351 primitives_init_colors(prims);
/* Only install the NEON variants when the CPU reports NEON support at run time. */
353 if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
355 WLog_VRB(PRIM_TAG,
"NEON optimizations");
356 prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
357 prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
358 prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
/* Presumably the branch taken when NEON intrinsics are compiled out (the #else line is not visible here). */
361 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or neon intrinsics not available");