/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized YCoCg<->RGB conversion operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_YCoCg.h"

#include "prim_internal.h"
#include "prim_templates.h"

#if defined(SSE2_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    int sRowBump = srcStep - width * sizeof(UINT32);
    int dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     *     |R|   | 1  1/2 -1/2 |   |y|
     *     |G| = | 1   0   1/2 | * |o|
     *     |B|   | 1 -1/2 -1/2 |   |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
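
    /* Per pixel, the SIMD loop below computes the equivalent of this scalar
     * sketch (with co and cg already scaled up by << (shift - 1)):
     *     t = y - cg;
     *     r = t + co;
     *     g = y + cg;
     *     b = t - co;
     */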

    if ((width < 8) || ((ULONG_PTR)dptr & 0x03))
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                           shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        BOOL onStride = FALSE;

        /* Get to a 16-byte destination boundary. */
        if ((ULONG_PTR)dptr & 0x0f)
        {
            pstatus_t status = 0;
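            /* Pixels are 4 bytes each, so this is how many pixels must be
             * handled by the generic kernel before dptr is 16-byte aligned. */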
            UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

            if (startup > width)
                startup = width;

            status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
                                                 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += startup * sizeof(UINT32);
            dptr += startup * sizeof(UINT32);
            w -= startup;
        }

        /* Each loop handles eight pixels at a time. */
        onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;

            if (onStride)
            {
                /* The faster path, 16-byte aligned load. */
                R0 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }
            else
            {
                /* Off-stride, slower LDDQU load. */
                R0 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }

            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
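            /* Each mask byte is a source byte index for _mm_shuffle_epi8: the
             * low dword 0x0c080400 gathers bytes 0, 4, 8, 12 (the four g's),
             * and the higher dwords do the same for the o's, y's, and a's. */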
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);

            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = _mm_set1_epi32(0xFFFFFFFFU);
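            /* Without a source alpha, substitute fully opaque 0xFF bytes. */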

            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = _mm_set1_epi32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). The -1 covers the division
             * by two. Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = _mm_set1_epi8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
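            /* Interleaving a byte with itself and arithmetic-shifting the
             * resulting word right by 8 sign-extends 8-bit to 16-bit. */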
            /* Expand Cg's from 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
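            /* The saturating adds/subs pin out-of-range intermediates
             * instead of wrapping before the unsigned-saturating pack. */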
            /* Repack R's & B's. */
            R0 = _mm_packus_epi16(R3, R5);
            /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
            _mm_store_si128((__m128i*)dptr, R4);
            dptr += (128 / 8);
            _mm_store_si128((__m128i*)dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
                                                 shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }

        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    int sRowBump = srcStep - width * sizeof(UINT32);
    int dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     *     |R|   | 1  1/2 -1/2 |   |y|
     *     |G| = | 1   0   1/2 | * |o|
     *     |B|   | 1 -1/2 -1/2 |   |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */

    if ((width < 8) || ((ULONG_PTR)dptr & 0x03))
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                           shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        BOOL onStride = FALSE;

        /* Get to a 16-byte destination boundary. */
        if ((ULONG_PTR)dptr & 0x0f)
        {
            pstatus_t status = 0;
            UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

            if (startup > width)
                startup = width;

            status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
                                                 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += startup * sizeof(UINT32);
            dptr += startup * sizeof(UINT32);
            w -= startup;
        }

        /* Each loop handles eight pixels at a time. */
        onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;

            if (onStride)
            {
                /* The faster path, 16-byte aligned load. */
                R0 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_load_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }
            else
            {
                /* Off-stride, slower LDDQU load. */
                R0 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
                R1 = _mm_lddqu_si128((const __m128i*)sptr);
                sptr += (128 / 8);
            }

            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);

            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = _mm_set1_epi32(0xFFFFFFFFU);

            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = _mm_set1_epi32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). The -1 covers the division
             * by two. Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = _mm_set1_epi8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's from 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            /* This line is the only difference between the inverted and
             * non-inverted variants. Unfortunately, it would be expensive
             * to check "inverted" on every pass through this loop.
             */
            R0 = _mm_packus_epi16(R5, R3);
            /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
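            /* Swapping the _mm_packus_epi16 operands swaps which half holds
             * the R's and which the B's, so the interleaves below emit the
             * two channels in the opposite memory order. */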
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
            _mm_store_si128((__m128i*)dptr, R4);
            dptr += (128 / 8);
            _mm_store_si128((__m128i*)dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
                                                 shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }

        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                                    height, shift, withAlpha);

        case PIXEL_FORMAT_RGBX32:
        case PIXEL_FORMAT_RGBA32:
            return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep,
                                                       width, height, shift, withAlpha);

        default:
            return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                               height, shift, withAlpha);
    }
}

#endif
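
/* Callers do not invoke these kernels directly; they go through the
 * primitives function table. A typical call site looks roughly like this
 * (a sketch; the real arguments come from the codec using the primitive):
 *
 *     primitives_t* prims = primitives_get();
 *     prims->YCoCgToRGB_8u_AC4R(src, srcStep, dst, PIXEL_FORMAT_BGRX32,
 *                               dstStep, width, height, shift, withAlpha);
 */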

/* ------------------------------------------------------------------------- */
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
    generic = primitives_get_generic();
    primitives_init_YCoCg(prims);

    if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
        prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
    }
#else
    WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
    WINPR_UNUSED(prims);
#endif
}