FreeRDP
prim_colors_neon.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized Color conversion operations.
3  * vi:ts=4 sw=4:
4  *
5  * Copyright 2011 Stephen Erisman
6  * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
7  * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
8  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License"); you may
11  * not use this file except in compliance with the License. You may obtain
12  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16  * or implied. See the License for the specific language governing
17  * permissions and limitations under the License.
18  */
19 
20 #include <freerdp/config.h>
21 
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
25 
26 #include "prim_internal.h"
27 #include "prim_templates.h"
28 #include "prim_colors.h"
29 
30 /*---------------------------------------------------------------------------*/
31 #if defined(NEON_INTRINSICS_ENABLED)
32 #include <arm_neon.h>
33 
34 static primitives_t* generic = NULL;
35 
36 static pstatus_t
37 neon_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
38  INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
39  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
40 {
41  /* TODO: If necessary, check alignments and call the general version. */
42  int16x8_t zero = vdupq_n_s16(0);
43  int16x8_t max = vdupq_n_s16(255);
44  int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
45  int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
46  int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
47  int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
48  int16x8_t c4096 = vdupq_n_s16(4096);
49  const int16x8_t* y_buf = (const int16x8_t*)pSrc[0];
50  const int16x8_t* cb_buf = (const int16x8_t*)pSrc[1];
51  const int16x8_t* cr_buf = (const int16x8_t*)pSrc[2];
52  int16x8_t* r_buf = (int16x8_t*)pDst[0];
53  int16x8_t* g_buf = (int16x8_t*)pDst[1];
54  int16x8_t* b_buf = (int16x8_t*)pDst[2];
55  int srcbump = srcStep / sizeof(int16x8_t);
56  int dstbump = dstStep / sizeof(int16x8_t);
57  int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
58 
59  for (int yp = 0; yp < roi->height; ++yp)
60  {
61  for (int i = 0; i < imax; i++)
62  {
63  /*
64  In order to use NEON signed 16-bit integer multiplication we need to convert
65  the floating point factors to signed int without losing information.
66  The result of this multiplication is 32 bit and we have a NEON instruction
67  that returns the hi word of the saturated double.
68  Thus we will multiply the factors by the highest possible 2^n, take the
69  upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
70  shift by 1 to reverse the doubling) and correct this result by multiplying it
71  by 2^(16-n).
72  For the given factors in the conversion matrix the best possible n is 14.
73 
74  Example for calculating r:
75  r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
76  r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
77  r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
78  r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
79  */
80  /* y = (y_buf[i] + 4096) >> 2 */
81  int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
82  y = vaddq_s16(y, c4096);
83  y = vshrq_n_s16(y, 2);
84  /* cb = cb_buf[i]; */
85  int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
86  /* cr = cr_buf[i]; */
87  int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
88  /* (y + HIWORD(cr*22986)) >> 3 */
89  int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
90  r = vshrq_n_s16(r, 3);
91  /* r_buf[i] = CLIP(r); */
92  r = vminq_s16(vmaxq_s16(r, zero), max);
93  vst1q_s16((INT16*)&r_buf[i], r);
94  /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
95  int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
96  g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
97  g = vshrq_n_s16(g, 3);
98  /* g_buf[i] = CLIP(g); */
99  g = vminq_s16(vmaxq_s16(g, zero), max);
100  vst1q_s16((INT16*)&g_buf[i], g);
101  /* (y + HIWORD(cb*28999)) >> 3 */
102  int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
103  b = vshrq_n_s16(b, 3);
104  /* b_buf[i] = CLIP(b); */
105  b = vminq_s16(vmaxq_s16(b, zero), max);
106  vst1q_s16((INT16*)&b_buf[i], b);
107  }
108 
109  y_buf += srcbump;
110  cb_buf += srcbump;
111  cr_buf += srcbump;
112  r_buf += dstbump;
113  g_buf += dstbump;
114  b_buf += dstbump;
115  }
116 
117  return PRIMITIVES_SUCCESS;
118 }
119 
120 static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
121  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
122  const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
123  uint8_t gPos, uint8_t bPos, uint8_t aPos)
124 {
125  BYTE* pRGB = pDst;
126  const INT16* pY = pSrc[0];
127  const INT16* pCb = pSrc[1];
128  const INT16* pCr = pSrc[2];
129  const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
130  const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
131  const size_t pad = roi->width % 8;
132  const int16x4_t c4096 = vdup_n_s16(4096);
133 
134  for (UINT32 y = 0; y < roi->height; y++)
135  {
136  for (UINT32 x = 0; x < roi->width - pad; x += 8)
137  {
138  const int16x8_t Y = vld1q_s16(pY);
139  const int16x4_t Yh = vget_high_s16(Y);
140  const int16x4_t Yl = vget_low_s16(Y);
141  const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
142  const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
143  const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
144  const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
145  const int16x8_t Cr = vld1q_s16(pCr);
146  const int16x4_t Crh = vget_high_s16(Cr);
147  const int16x4_t Crl = vget_low_s16(Cr);
148  const int16x8_t Cb = vld1q_s16(pCb);
149  const int16x4_t Cbh = vget_high_s16(Cb);
150  const int16x4_t Cbl = vget_low_s16(Cb);
151  uint8x8x4_t bgrx;
152  {
153  /* R */
154  const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
155  const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
156  const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
157  const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
158  const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
159  const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
160  const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
161  bgrx.val[rPos] = vqmovun_s16(Rs);
162  }
163  {
164  /* G */
165  const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
166  const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
167  const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
168  const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
169  const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
170  const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
171  const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
172  const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
173  const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
174  const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
175  const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
176  const uint8x8_t G = vqmovun_s16(Gs);
177  bgrx.val[gPos] = G;
178  }
179  {
180  /* B */
181  const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
182  const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
183  const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
184  const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
185  const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
186  const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
187  const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
188  const uint8x8_t B = vqmovun_s16(Bs);
189  bgrx.val[bPos] = B;
190  }
191  /* A */
192  {
193  bgrx.val[aPos] = vdup_n_u8(0xFF);
194  }
195  vst4_u8(pRGB, bgrx);
196  pY += 8;
197  pCb += 8;
198  pCr += 8;
199  pRGB += 32;
200  }
201 
202  for (UINT32 x = 0; x < pad; x++)
203  {
204  const INT32 divisor = 16;
205  const INT32 Y = ((*pY++) + 4096) << divisor;
206  const INT32 Cb = (*pCb++);
207  const INT32 Cr = (*pCr++);
208  const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
209  const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
210  const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
211  const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
212  INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
213  INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
214  INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
215  BYTE bgrx[4];
216  bgrx[bPos] = CLIP(B);
217  bgrx[gPos] = CLIP(G);
218  bgrx[rPos] = CLIP(R);
219  bgrx[aPos] = 0xFF;
220  *pRGB++ = bgrx[0];
221  *pRGB++ = bgrx[1];
222  *pRGB++ = bgrx[2];
223  *pRGB++ = bgrx[3];
224  }
225 
226  pY += srcPad;
227  pCb += srcPad;
228  pCr += srcPad;
229  pRGB += dstPad;
230  }
231 
232  return PRIMITIVES_SUCCESS;
233 }
234 
235 static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
236  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
237  UINT32 DstFormat,
238  const prim_size_t* WINPR_RESTRICT roi)
239 {
240  switch (DstFormat)
241  {
242  case PIXEL_FORMAT_BGRA32:
243  case PIXEL_FORMAT_BGRX32:
244  return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
245 
246  case PIXEL_FORMAT_RGBA32:
247  case PIXEL_FORMAT_RGBX32:
248  return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
249 
250  case PIXEL_FORMAT_ARGB32:
251  case PIXEL_FORMAT_XRGB32:
252  return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
253 
254  case PIXEL_FORMAT_ABGR32:
255  case PIXEL_FORMAT_XBGR32:
256  return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
257 
258  default:
259  return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
260  }
261 }
262 
263 static pstatus_t
264 neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
265  UINT32 srcStep, /* bytes between rows in source data */
266  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
267  UINT32 dstStep, /* bytes between rows in dest data */
268  const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
269  uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
270 {
271  UINT32 pad = roi->width % 8;
272 
273  for (UINT32 y = 0; y < roi->height; y++)
274  {
275  const INT16* pr = (const INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
276  const INT16* pg = (const INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
277  const INT16* pb = (const INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
278  BYTE* dst = pDst + y * dstStep;
279 
280  for (UINT32 x = 0; x < roi->width - pad; x += 8)
281  {
282  int16x8_t r = vld1q_s16(pr);
283  int16x8_t g = vld1q_s16(pg);
284  int16x8_t b = vld1q_s16(pb);
285  uint8x8x4_t bgrx;
286  bgrx.val[aPos] = vdup_n_u8(0xFF);
287  bgrx.val[rPos] = vqmovun_s16(r);
288  bgrx.val[gPos] = vqmovun_s16(g);
289  bgrx.val[bPos] = vqmovun_s16(b);
290  vst4_u8(dst, bgrx);
291  pr += 8;
292  pg += 8;
293  pb += 8;
294  dst += 32;
295  }
296 
297  for (UINT32 x = 0; x < pad; x++)
298  {
299  BYTE bgrx[4];
300  bgrx[bPos] = *pb++;
301  bgrx[gPos] = *pg++;
302  bgrx[rPos] = *pr++;
303  bgrx[aPos] = 0xFF;
304  *dst++ = bgrx[0];
305  *dst++ = bgrx[1];
306  *dst++ = bgrx[2];
307  *dst++ = bgrx[3];
308  }
309  }
310 
311  return PRIMITIVES_SUCCESS;
312 }
313 
314 static pstatus_t
315 neon_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
316  UINT32 srcStep, /* bytes between rows in source data */
317  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
318  UINT32 dstStep, /* bytes between rows in dest data */
319  UINT32 DstFormat,
320  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
321 {
322  switch (DstFormat)
323  {
324  case PIXEL_FORMAT_BGRA32:
325  case PIXEL_FORMAT_BGRX32:
326  return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
327 
328  case PIXEL_FORMAT_RGBA32:
329  case PIXEL_FORMAT_RGBX32:
330  return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
331 
332  case PIXEL_FORMAT_ARGB32:
333  case PIXEL_FORMAT_XRGB32:
334  return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
335 
336  case PIXEL_FORMAT_ABGR32:
337  case PIXEL_FORMAT_XBGR32:
338  return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
339 
340  default:
341  return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
342  }
343 }
344 #endif /* NEON_INTRINSICS_ENABLED */
345 
346 /* ------------------------------------------------------------------------- */
347 void primitives_init_colors_neon(primitives_t* prims)
348 {
349 #if defined(NEON_INTRINSICS_ENABLED)
350  generic = primitives_get_generic();
351  primitives_init_colors(prims);
352 
353  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
354  {
355  WLog_VRB(PRIM_TAG, "NEON optimizations");
356  prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
357  prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
358  prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
359  }
360 #else
361  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
362  WINPR_UNUSED(prims);
363 #endif
364 }