FreeRDP
prim_YCoCg_neon.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized YCoCg<->RGB conversion operations.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 
20 #include <freerdp/config.h>
21 
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
25 
26 #include "prim_internal.h"
27 #include "prim_templates.h"
28 #include "prim_YCoCg.h"
29 
30 #if defined(NEON_INTRINSICS_ENABLED)
31 #include <arm_neon.h>
32 
33 static primitives_t* generic = NULL;
34 
35 static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
36  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
37  UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
38  BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
39 {
40  BYTE* dptr = pDst;
41  const BYTE* sptr = pSrc;
42  const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
43  const int8_t cll = shift - 1; /* -1 builds in the /2's */
44  const UINT32 srcPad = srcStep - (width * 4);
45  const UINT32 dstPad = dstStep - (width * formatSize);
46  const UINT32 pad = width % 8;
47  const uint8x8_t aVal = vdup_n_u8(0xFF);
48  const int8x8_t cllv = vdup_n_s8(cll);
49 
50  for (UINT32 y = 0; y < height; y++)
51  {
52  for (UINT32 x = 0; x < width - pad; x += 8)
53  {
54  /* Note: shifts must be done before sign-conversion. */
55  const uint8x8x4_t raw = vld4_u8(sptr);
56  const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
57  const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
58  const int16x8_t Cg = vmovl_s8(CgRaw);
59  const int16x8_t Co = vmovl_s8(CoRaw);
60  const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
61  const int16x8_t T = vsubq_s16(Y, Cg);
62  const int16x8_t R = vaddq_s16(T, Co);
63  const int16x8_t G = vaddq_s16(Y, Cg);
64  const int16x8_t B = vsubq_s16(T, Co);
65  uint8x8x4_t bgrx;
66  bgrx.val[bPos] = vqmovun_s16(B);
67  bgrx.val[gPos] = vqmovun_s16(G);
68  bgrx.val[rPos] = vqmovun_s16(R);
69 
70  if (alpha)
71  bgrx.val[aPos] = raw.val[3];
72  else
73  bgrx.val[aPos] = aVal;
74 
75  vst4_u8(dptr, bgrx);
76  sptr += sizeof(raw);
77  dptr += sizeof(bgrx);
78  }
79 
80  for (UINT32 x = 0; x < pad; x++)
81  {
82  /* Note: shifts must be done before sign-conversion. */
83  const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
84  const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
85  const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
86  const INT16 T = Y - Cg;
87  const INT16 R = T + Co;
88  const INT16 G = Y + Cg;
89  const INT16 B = T - Co;
90  BYTE bgra[4];
91  bgra[bPos] = CLIP(B);
92  bgra[gPos] = CLIP(G);
93  bgra[rPos] = CLIP(R);
94  bgra[aPos] = *sptr++;
95 
96  if (!alpha)
97  bgra[aPos] = 0xFF;
98 
99  *dptr++ = bgra[0];
100  *dptr++ = bgra[1];
101  *dptr++ = bgra[2];
102  *dptr++ = bgra[3];
103  }
104 
105  sptr += srcPad;
106  dptr += dstPad;
107  }
108 
109  return PRIMITIVES_SUCCESS;
110 }
111 
112 static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
113  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
114  UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
115 {
116  switch (DstFormat)
117  {
118  case PIXEL_FORMAT_BGRA32:
119  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
120  shift, 2, 1, 0, 3, withAlpha);
121 
122  case PIXEL_FORMAT_BGRX32:
123  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
124  shift, 2, 1, 0, 3, withAlpha);
125 
126  case PIXEL_FORMAT_RGBA32:
127  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
128  shift, 0, 1, 2, 3, withAlpha);
129 
130  case PIXEL_FORMAT_RGBX32:
131  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
132  shift, 0, 1, 2, 3, withAlpha);
133 
134  case PIXEL_FORMAT_ARGB32:
135  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
136  shift, 1, 2, 3, 0, withAlpha);
137 
138  case PIXEL_FORMAT_XRGB32:
139  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
140  shift, 1, 2, 3, 0, withAlpha);
141 
142  case PIXEL_FORMAT_ABGR32:
143  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
144  shift, 3, 2, 1, 0, withAlpha);
145 
146  case PIXEL_FORMAT_XBGR32:
147  return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
148  shift, 3, 2, 1, 0, withAlpha);
149 
150  default:
151  return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
152  height, shift, withAlpha);
153  }
154 }
155 #endif
156 
157 /* ------------------------------------------------------------------------- */
158 void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
159 {
160 #if defined(NEON_INTRINSICS_ENABLED)
161  generic = primitives_get_generic();
162  primitives_init_YCoCg(prims);
163 
164  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
165  {
166  WLog_VRB(PRIM_TAG, "NEON optimizations");
167  prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
168  }
169 #else
170  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
171  WINPR_UNUSED(prims);
172 #endif
173 }