prim_YCoCg_ssse3.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized YCoCg<->RGB conversion operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_YCoCg.h"

#include "prim_internal.h"
#include "prim_templates.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

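/* Pointer to the generic (plain C) primitives, resolved in
 * primitives_init_YCoCg_ssse3 and used below as the fallback for small
 * widths, unaligned leading/trailing pixels, and unsupported formats. */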
static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;

    WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
    WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
    const size_t sRowBump = srcStep - width * sizeof(UINT32);
    const size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shifting left by "shift" and then dividing by two is the same as
     * shifting left by "shift - 1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);
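    /* For example: shift == 1 gives dataShift == 0 and mask == 0xFF (no
     * scaling), while shift == 3 gives dataShift == 2 and mask == 0xFC. */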

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R|   | 1  1/2 -1/2 |   |y|
     * |G| = | 1   0   1/2 | * |o|
     * |B|   | 1 -1/2 -1/2 |   |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
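    /* A scalar sketch of the same math for one pixel (illustrative only;
     * "halfCo"/"halfCg" stand for the scaled, sign-extended Co and Cg values):
     *     t = y - halfCg;
     *     r = clamp8(t + halfCo);
     *     g = clamp8(y + halfCg);
     *     b = clamp8(t - halfCo);
     * The saturating 16-bit adds/subtracts below compute exactly this for
     * eight pixels at a time. */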

    if ((width < 8) || ((ULONG_PTR)dptr & 0x03))
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        BOOL onStride = FALSE;

        /* Get to a 16-byte destination boundary. */
        if ((ULONG_PTR)dptr & 0x0f)
        {
            pstatus_t status = 0;
            UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
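            /* dptr is known to be 4-byte aligned (checked on entry), so the
             * distance to the next 16-byte boundary is a whole number of
             * 4-byte pixels. */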

            if (startup > width)
                startup = width;

            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), startup, 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += startup * sizeof(UINT32);
            dptr += startup * sizeof(UINT32);
            w -= startup;
        }

        /* Each loop iteration handles eight pixels at a time. */
        onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;

            if (onStride)
            {
                /* The faster path: 16-byte aligned loads. */
                R0 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }
            else
            {
                /* Off-stride: slower LDDQU loads. */
                R0 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }

            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like components together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
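            /* Each _mm_shuffle_epi8 control byte selects one source byte: the
             * low control dword 0x0c080400 gathers source bytes 0, 4, 8 and 12
             * (the four g components) into result bytes 0..3, and so on. */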
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);

            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save the alphas aside. */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);

            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand the Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift the Co's and Cg's left by (shift - 1); the -1 covers the
             * division by two. Note: this must be done before sign conversion.
             * Note also that there is no slli_epi8, so we have to use a 16-bit
             * shift and then mask.
             */
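            /* The 16-bit shift lets the high bits of each low byte spill into
             * the byte above it; ANDing every byte with (0xFF << dataShift)
             * clears exactly those spilled-in bits again. */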
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand the Co's from 8-bit signed to 16-bit signed. */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand the Cg's from 8-bit signed to 16-bit signed. */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save it. */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y - halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + halfCg */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y - halfCg) - halfCo */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack the R's & B's. */
            R0 = _mm_packus_epi16(R3, R5);
            /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
            /* Repack the G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
            /* Now do the interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
            _mm_store_si128((__m128i*)dptr, R4);
            dptr += (128 / 8);
            _mm_store_si128((__m128i*)dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }

        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    size_t sRowBump = srcStep - width * sizeof(UINT32);
    size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shifting left by "shift" and then dividing by two is the same as
     * shifting left by "shift - 1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R|   | 1  1/2 -1/2 |   |y|
     * |G| = | 1   0   1/2 | * |o|
     * |B|   | 1 -1/2 -1/2 |   |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */

    if ((width < 8) || ((ULONG_PTR)dptr & 0x03))
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        BOOL onStride = FALSE;

        /* Get to a 16-byte destination boundary. */
        if ((ULONG_PTR)dptr & 0x0f)
        {
            pstatus_t status = 0;
            UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

            if (startup > width)
                startup = width;

            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), startup, 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += startup * sizeof(UINT32);
            dptr += startup * sizeof(UINT32);
            w -= startup;
        }

        /* Each loop iteration handles eight pixels at a time. */
        onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;

            if (onStride)
            {
                /* The faster path: 16-byte aligned loads. */
                R0 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }
            else
            {
                /* Off-stride: slower LDDQU loads. */
                R0 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }

            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like components together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);

            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save the alphas aside. */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);

            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand the Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift the Co's and Cg's left by (shift - 1); the -1 covers the
             * division by two. Note: this must be done before sign conversion.
             * Note also that there is no slli_epi8, so we have to use a 16-bit
             * shift and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand the Co's from 8-bit signed to 16-bit signed. */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand the Cg's from 8-bit signed to 16-bit signed. */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save it. */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y - halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + halfCg */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y - halfCg) - halfCo */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack the B's & R's. This line is the only difference between
             * the inverted and non-inverted versions. Unfortunately, it would
             * be expensive to check "inverted" every time through this loop.
             */
            R0 = _mm_packus_epi16(R5, R3);
            /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
            /* Repack the G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
            /* Now do the interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
            _mm_store_si128((__m128i*)dptr, R4);
            dptr += (128 / 8);
            _mm_store_si128((__m128i*)dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }

        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
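    /* The two SIMD kernels above differ only in the R/B order chosen when
     * repacking (see the _mm_packus_epi16 calls); any other destination
     * format falls back to the generic C implementation. */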
    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YCoCgRToRGB_8u_AC4R_invert(
                pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
                WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

        case PIXEL_FORMAT_RGBX32:
        case PIXEL_FORMAT_RGBA32:
            return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
                pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
                WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

        default:
            return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                               height, shift, withAlpha);
    }
}

#endif

/* ------------------------------------------------------------------------- */
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    generic = primitives_get_generic();
    primitives_init_YCoCg(prims);
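
    /* Runtime dispatch: only replace the generic entry point when the CPU
     * reports both SSE3 and SSSE3, since the kernels above use LDDQU (SSE3)
     * and PSHUFB (SSSE3). */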
    if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
        prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
    }
#else
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE/AVX intrinsics not available");
    WINPR_UNUSED(prims);
#endif
}