FreeRDP
prim_colors_sse2.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized Color conversion operations.
3  * vi:ts=4 sw=4:
4  *
5  * Copyright 2011 Stephen Erisman
6  * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
7  * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
8  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License"); you may
11  * not use this file except in compliance with the License. You may obtain
12  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16  * or implied. See the License for the specific language governing
17  * permissions and limitations under the License.
18  */
19 
20 #include <freerdp/config.h>
21 
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
25 
26 #include "prim_colors.h"
27 
28 #include "prim_internal.h"
29 #include "prim_templates.h"
30 
31 #if defined(SSE2_ENABLED)
32 #include <emmintrin.h>
33 
34 static primitives_t* generic = NULL;
35 
36 #ifdef __GNUC__
37 #define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
38 #else
39 #define GNU_INLINE
40 #endif
41 
42 #define CACHE_LINE_BYTES 64
43 
/* Clamp every signed 16-bit lane of _val into the inclusive range
 * [_min, _max] (both are __m128i vectors of identical lanes).
 * All macro parameters are parenthesized so that expression arguments
 * expand safely (CERT PRE01-C); do/while(0) makes the macro behave as a
 * single statement after an if/else.
 */
#define mm_between_epi16(_val, _min, _max) \
    do \
    { \
        (_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); \
    } while (0)
49 
50 #ifdef DO_PREFETCH
51 /*---------------------------------------------------------------------------*/
/* Prefetch a buffer into cache with a non-temporal hint, touching one
 * cache line (CACHE_LINE_BYTES) at a time.  num_bytes is rounded down to
 * whole 16-byte vectors.
 * NOTE(review): num_bytes is signed but is divided by the unsigned
 * sizeof(__m128i), so a negative value would wrap to a huge count —
 * callers are presumably expected to pass num_bytes >= 0; confirm. */
static inline void GNU_INLINE _mm_prefetch_buffer(char* WINPR_RESTRICT buffer, int num_bytes)
{
    __m128i* buf = (__m128i*)buffer;

    for (unsigned int i = 0; i < (num_bytes / sizeof(__m128i));
         i += (CACHE_LINE_BYTES / sizeof(__m128i)))
    {
        _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
    }
}
62 #endif /* DO_PREFETCH */
63 
64 /*---------------------------------------------------------------------------*/
/* Convert planar 16-bit signed YCbCr (11.5 fixed point) to planar 16-bit
 * signed RGB using SSE2, processing 8 pixels per iteration.
 *
 * pSrc    - Y, Cb, Cr planes (must be 16-byte aligned)
 * srcStep - bytes between source rows (must be a multiple of 128)
 * pDst    - R, G, B planes (must be 16-byte aligned)
 * dstStep - bytes between destination rows (must be a multiple of 128)
 * roi     - region of interest; width must be a multiple of 8
 *
 * Falls back to the generic implementation when any of the alignment or
 * stride requirements above is not met.  Returns PRIMITIVES_SUCCESS (or
 * the generic routine's status on fallback).
 */
static pstatus_t
sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
                            INT16* WINPR_RESTRICT pDst[3], int dstStep,
                            const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
    __m128i zero;
    __m128i max;
    __m128i r_cr;
    __m128i g_cb;
    __m128i g_cr;
    __m128i b_cb;
    __m128i c4096;
    const __m128i* y_buf = NULL;
    const __m128i* cb_buf = NULL;
    const __m128i* cr_buf = NULL;
    __m128i* r_buf = NULL;
    __m128i* g_buf = NULL;
    __m128i* b_buf = NULL;
    int srcbump = 0;
    int dstbump = 0;
    int imax = 0;

    if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
        ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
        ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
        (srcStep & 127) || (dstStep & 127))
    {
        /* We can't maintain 16-byte alignment. */
        return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
    }

    zero = _mm_setzero_si128();
    max = _mm_set1_epi16(255);
    y_buf = (const __m128i*)(pSrc[0]);
    cb_buf = (const __m128i*)(pSrc[1]);
    cr_buf = (const __m128i*)(pSrc[2]);
    r_buf = (__m128i*)(pDst[0]);
    g_buf = (__m128i*)(pDst[1]);
    b_buf = (__m128i*)(pDst[2]);
    /* Conversion-matrix factors scaled by 2^14 (see the comment in the loop). */
    r_cr = _mm_set1_epi16(22986);  /* 1.403 << 14 */
    g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
    g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
    b_cb = _mm_set1_epi16(28999);  /* 1.770 << 14 */
    c4096 = _mm_set1_epi16(4096);
    srcbump = srcStep / sizeof(__m128i);
    dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH

    /* Prefetch Y's, Cb's, and Cr's. */
    for (UINT32 yp = 0; yp < roi->height; yp++)
    {
        for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
             i += (CACHE_LINE_BYTES / sizeof(__m128i)))
        {
            _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
        }

        y_buf += srcbump;
        cb_buf += srcbump;
        cr_buf += srcbump;
    }

    /* Rewind to the start of the planes after prefetching. */
    y_buf = (__m128i*)(pSrc[0]);
    cb_buf = (__m128i*)(pSrc[1]);
    cr_buf = (__m128i*)(pSrc[2]);
#endif /* DO_PREFETCH */
    /* Number of 8-pixel vectors per row. */
    imax = roi->width * sizeof(INT16) / sizeof(__m128i);

    for (UINT32 yp = 0; yp < roi->height; ++yp)
    {
        for (int i = 0; i < imax; i++)
        {
            /* In order to use SSE2 signed 16-bit integer multiplication
             * we need to convert the floating point factors to signed int
             * without losing information.
             * The result of this multiplication is 32 bit and we have two
             * SSE instructions that return either the hi or lo word.
             * Thus we will multiply the factors by the highest possible 2^n,
             * take the upper 16 bits of the signed 32-bit result
             * (_mm_mulhi_epi16) and correct this result by multiplying
             * it by 2^(16-n).
             *
             * For the given factors in the conversion matrix the best
             * possible n is 14.
             *
             * Example for calculating r:
             * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
             * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
             * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5   // simplification
             * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
             */
            /* y = (y_r_buf[i] + 4096) >> 2 */
            __m128i y;
            __m128i cb;
            __m128i cr;
            __m128i r;
            __m128i g;
            __m128i b;
            y = _mm_load_si128(y_buf + i);
            y = _mm_add_epi16(y, c4096);
            y = _mm_srai_epi16(y, 2);
            /* cb = cb_g_buf[i]; */
            cb = _mm_load_si128(cb_buf + i);
            /* cr = cr_b_buf[i]; */
            cr = _mm_load_si128(cr_buf + i);
            /* (y + HIWORD(cr*22986)) >> 3 */
            r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
            r = _mm_srai_epi16(r, 3);
            /* r_buf[i] = CLIP(r); */
            mm_between_epi16(r, zero, max);
            _mm_store_si128(r_buf + i, r);
            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
            g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
            g = _mm_srai_epi16(g, 3);
            /* g_buf[i] = CLIP(g); */
            mm_between_epi16(g, zero, max);
            _mm_store_si128(g_buf + i, g);
            /* (y + HIWORD(cb*28999)) >> 3 */
            b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
            b = _mm_srai_epi16(b, 3);
            /* b_buf[i] = CLIP(b); */
            mm_between_epi16(b, zero, max);
            _mm_store_si128(b_buf + i, b);
        }

        /* Advance sources by the source stride, destinations by the
         * destination stride (both expressed in __m128i units). */
        y_buf += srcbump;
        cb_buf += srcbump;
        cr_buf += srcbump;
        r_buf += dstbump;
        g_buf += dstbump;
        b_buf += dstbump;
    }

    return PRIMITIVES_SUCCESS;
}
203 
204 /*---------------------------------------------------------------------------*/
/* Convert planar 16-bit signed YCbCr (11.5 fixed point) to interleaved
 * 8-bit BGRX (the X/alpha byte is forced to 0xFF) using SSE2.
 * The SIMD loop handles 16 pixels per iteration; the remaining
 * (width % 16) pixels of each row are converted by a scalar tail loop.
 * The dispatcher (sse2_yCbCrToRGB_16s8u_P3AC4R) has already verified
 * 16-byte alignment of all buffers and strides before calling this.
 */
static pstatus_t
sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i max = _mm_set1_epi16(255);
    const __m128i r_cr = _mm_set1_epi16(22986);  /* 1.403 << 14 */
    const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
    const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
    const __m128i b_cb = _mm_set1_epi16(28999);  /* 1.770 << 14 */
    const __m128i c4096 = _mm_set1_epi16(4096);
    const INT16* y_buf = pSrc[0];
    const INT16* cb_buf = pSrc[1];
    const INT16* cr_buf = pSrc[2];
    const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
    const UINT32 step = sizeof(__m128i) / sizeof(INT16); /* 8 INT16 lanes per vector */
    const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
    BYTE* d_buf = pDst;
    const size_t dstPad = (dstStep - roi->width * 4); /* bytes to skip at end of row */
#ifdef DO_PREFETCH

    /* Prefetch Y's, Cb's, and Cr's. */
    for (UINT32 yp = 0; yp < roi->height; yp++)
    {
        for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
        {
            _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
        }

        y_buf += srcStep / sizeof(INT16);
        cb_buf += srcStep / sizeof(INT16);
        cr_buf += srcStep / sizeof(INT16);
    }

    /* Rewind to the start of the planes after prefetching. */
    y_buf = (INT16*)pSrc[0];
    cb_buf = (INT16*)pSrc[1];
    cr_buf = (INT16*)pSrc[2];
#endif /* DO_PREFETCH */

    for (UINT32 yp = 0; yp < roi->height; ++yp)
    {
        for (UINT32 i = 0; i < imax; i += 2)
        {
            /* In order to use SSE2 signed 16-bit integer multiplication
             * we need to convert the floating point factors to signed int
             * without losing information.
             * The result of this multiplication is 32 bit and we have two
             * SSE instructions that return either the hi or lo word.
             * Thus we will multiply the factors by the highest possible 2^n,
             * take the upper 16 bits of the signed 32-bit result
             * (_mm_mulhi_epi16) and correct this result by multiplying
             * it by 2^(16-n).
             *
             * For the given factors in the conversion matrix the best
             * possible n is 14.
             *
             * Example for calculating r:
             * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
             * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
             * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5   // simplification
             * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
             */
            /* y = (y_r_buf[i] + 4096) >> 2 */
            __m128i y1;
            __m128i y2;
            __m128i cb1;
            __m128i cb2;
            __m128i cr1;
            __m128i cr2;
            __m128i r1;
            __m128i r2;
            __m128i g1;
            __m128i g2;
            __m128i b1;
            __m128i b2;
            y1 = _mm_load_si128((const __m128i*)y_buf);
            y_buf += step;
            y1 = _mm_add_epi16(y1, c4096);
            y1 = _mm_srai_epi16(y1, 2);
            /* cb = cb_g_buf[i]; */
            cb1 = _mm_load_si128((const __m128i*)cb_buf);
            cb_buf += step;
            /* cr = cr_b_buf[i]; */
            cr1 = _mm_load_si128((const __m128i*)cr_buf);
            cr_buf += step;
            /* (y + HIWORD(cr*22986)) >> 3 */
            r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
            r1 = _mm_srai_epi16(r1, 3);
            /* r_buf[i] = CLIP(r); */
            mm_between_epi16(r1, zero, max);
            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
            g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
            g1 = _mm_srai_epi16(g1, 3);
            /* g_buf[i] = CLIP(g); */
            mm_between_epi16(g1, zero, max);
            /* (y + HIWORD(cb*28999)) >> 3 */
            b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
            b1 = _mm_srai_epi16(b1, 3);
            /* b_buf[i] = CLIP(b); */
            mm_between_epi16(b1, zero, max);
            /* Second vector of 8 pixels, same computation. */
            y2 = _mm_load_si128((const __m128i*)y_buf);
            y_buf += step;
            y2 = _mm_add_epi16(y2, c4096);
            y2 = _mm_srai_epi16(y2, 2);
            /* cb = cb_g_buf[i]; */
            cb2 = _mm_load_si128((const __m128i*)cb_buf);
            cb_buf += step;
            /* cr = cr_b_buf[i]; */
            cr2 = _mm_load_si128((const __m128i*)cr_buf);
            cr_buf += step;
            /* (y + HIWORD(cr*22986)) >> 3 */
            r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
            r2 = _mm_srai_epi16(r2, 3);
            /* r_buf[i] = CLIP(r); */
            mm_between_epi16(r2, zero, max);
            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
            g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
            g2 = _mm_srai_epi16(g2, 3);
            /* g_buf[i] = CLIP(g); */
            mm_between_epi16(g2, zero, max);
            /* (y + HIWORD(cb*28999)) >> 3 */
            b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
            b2 = _mm_srai_epi16(b2, 3);
            /* b_buf[i] = CLIP(b); */
            mm_between_epi16(b2, zero, max);
            /* Interleave the 16 clipped pixels into BGRX byte order. */
            {
                __m128i R0;
                __m128i R1;
                __m128i R2;
                __m128i R3;
                __m128i R4;
                /* The comments below pretend these are 8-byte registers
                 * rather than 16-byte, for readability.
                 */
                R0 = b1;                       /* R0 = 00B300B200B100B0 */
                R1 = b2;                       /* R1 = 00B700B600B500B4 */
                R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */
                R1 = g1;                       /* R1 = 00G300G200G100G0 */
                R2 = g2;                       /* R2 = 00G700G600G500G4 */
                R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
                R2 = R1;                       /* R2 = G7G6G5G4G3G2G1G0 */
                R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */
                R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */
                R0 = r1;                       /* R0 = 00R300R200R100R0 */
                R3 = r2;                       /* R3 = 00R700R600R500R4 */
                R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */
                R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
                R4 = R3;                       /* R4 = FFFFFFFFFFFFFFFF */
                R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */
                R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */
                R0 = R4;                       /* R0 = R4 */
                R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */
                R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */
                R2 = R3;                       /* R2 = R3 */
                R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
                R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
                _mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
                d_buf += sizeof(__m128i);
                _mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
                d_buf += sizeof(__m128i);
                _mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
                d_buf += sizeof(__m128i);
                _mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
                d_buf += sizeof(__m128i);
            }
        }

        /* Scalar tail: convert the remaining (width % 16) pixels. */
        for (UINT32 i = 0; i < pad; i++)
        {
            const INT32 divisor = 16;
            /* NOTE(review): for Y values near INT16_MAX, (Y + 4096) << 16
             * exceeds INT32 range (signed-overflow UB); presumably the
             * 11.5 fixed-point inputs stay well below that — confirm. */
            const INT32 Y = ((*y_buf++) + 4096) << divisor;
            const INT32 Cb = (*cb_buf++);
            const INT32 Cr = (*cr_buf++);
            const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
            const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
            const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
            const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
            const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
            const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
            const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
            *d_buf++ = CLIP(B);
            *d_buf++ = CLIP(G);
            *d_buf++ = CLIP(R);
            *d_buf++ = 0xFF;
        }

        d_buf += dstPad;
    }

    return PRIMITIVES_SUCCESS;
}
401 
402 /*---------------------------------------------------------------------------*/
/* Convert planar 16-bit signed YCbCr (11.5 fixed point) to interleaved
 * 8-bit RGBX (the X/alpha byte is forced to 0xFF) using SSE2.
 * Identical to sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX except that the R and B
 * channels are interleaved in the opposite order.  16 pixels per SIMD
 * iteration; a scalar tail converts the remaining (width % 16) pixels.
 */
static pstatus_t
sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i max = _mm_set1_epi16(255);
    const __m128i r_cr = _mm_set1_epi16(22986);  /* 1.403 << 14 */
    const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
    const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
    const __m128i b_cb = _mm_set1_epi16(28999);  /* 1.770 << 14 */
    const __m128i c4096 = _mm_set1_epi16(4096);
    const INT16* y_buf = pSrc[0];
    const INT16* cb_buf = pSrc[1];
    const INT16* cr_buf = pSrc[2];
    const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
    const UINT32 step = sizeof(__m128i) / sizeof(INT16); /* 8 INT16 lanes per vector */
    const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
    BYTE* d_buf = pDst;
    const size_t dstPad = (dstStep - roi->width * 4); /* bytes to skip at end of row */
#ifdef DO_PREFETCH

    /* Prefetch Y's, Cb's, and Cr's. */
    for (UINT32 yp = 0; yp < roi->height; yp++)
    {
        for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
        {
            _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
        }

        y_buf += srcStep / sizeof(INT16);
        cb_buf += srcStep / sizeof(INT16);
        cr_buf += srcStep / sizeof(INT16);
    }

    /* Rewind to the start of the planes after prefetching. */
    y_buf = (INT16*)(pSrc[0]);
    cb_buf = (INT16*)(pSrc[1]);
    cr_buf = (INT16*)(pSrc[2]);
#endif /* DO_PREFETCH */

    for (UINT32 yp = 0; yp < roi->height; ++yp)
    {
        for (UINT32 i = 0; i < imax; i += 2)
        {
            /* In order to use SSE2 signed 16-bit integer multiplication
             * we need to convert the floating point factors to signed int
             * without losing information.
             * The result of this multiplication is 32 bit and we have two
             * SSE instructions that return either the hi or lo word.
             * Thus we will multiply the factors by the highest possible 2^n,
             * take the upper 16 bits of the signed 32-bit result
             * (_mm_mulhi_epi16) and correct this result by multiplying
             * it by 2^(16-n).
             *
             * For the given factors in the conversion matrix the best
             * possible n is 14.
             *
             * Example for calculating r:
             * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
             * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
             * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5   // simplification
             * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
             */
            /* y = (y_r_buf[i] + 4096) >> 2 */
            __m128i y1;
            __m128i y2;
            __m128i cb1;
            __m128i cb2;
            __m128i cr1;
            __m128i cr2;
            __m128i r1;
            __m128i r2;
            __m128i g1;
            __m128i g2;
            __m128i b1;
            __m128i b2;
            y1 = _mm_load_si128((const __m128i*)y_buf);
            y_buf += step;
            y1 = _mm_add_epi16(y1, c4096);
            y1 = _mm_srai_epi16(y1, 2);
            /* cb = cb_g_buf[i]; */
            cb1 = _mm_load_si128((const __m128i*)cb_buf);
            cb_buf += step;
            /* cr = cr_b_buf[i]; */
            cr1 = _mm_load_si128((const __m128i*)cr_buf);
            cr_buf += step;
            /* (y + HIWORD(cr*22986)) >> 3 */
            r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
            r1 = _mm_srai_epi16(r1, 3);
            /* r_buf[i] = CLIP(r); */
            mm_between_epi16(r1, zero, max);
            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
            g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
            g1 = _mm_srai_epi16(g1, 3);
            /* g_buf[i] = CLIP(g); */
            mm_between_epi16(g1, zero, max);
            /* (y + HIWORD(cb*28999)) >> 3 */
            b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
            b1 = _mm_srai_epi16(b1, 3);
            /* b_buf[i] = CLIP(b); */
            mm_between_epi16(b1, zero, max);
            /* Second vector of 8 pixels, same computation. */
            y2 = _mm_load_si128((const __m128i*)y_buf);
            y_buf += step;
            y2 = _mm_add_epi16(y2, c4096);
            y2 = _mm_srai_epi16(y2, 2);
            /* cb = cb_g_buf[i]; */
            cb2 = _mm_load_si128((const __m128i*)cb_buf);
            cb_buf += step;
            /* cr = cr_b_buf[i]; */
            cr2 = _mm_load_si128((const __m128i*)cr_buf);
            cr_buf += step;
            /* (y + HIWORD(cr*22986)) >> 3 */
            r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
            r2 = _mm_srai_epi16(r2, 3);
            /* r_buf[i] = CLIP(r); */
            mm_between_epi16(r2, zero, max);
            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
            g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
            g2 = _mm_srai_epi16(g2, 3);
            /* g_buf[i] = CLIP(g); */
            mm_between_epi16(g2, zero, max);
            /* (y + HIWORD(cb*28999)) >> 3 */
            b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
            b2 = _mm_srai_epi16(b2, 3);
            /* b_buf[i] = CLIP(b); */
            mm_between_epi16(b2, zero, max);
            /* Interleave the 16 clipped pixels into RGBX byte order. */
            {
                __m128i R0;
                __m128i R1;
                __m128i R2;
                __m128i R3;
                __m128i R4;
                /* The comments below pretend these are 8-byte registers
                 * rather than 16-byte, for readability.
                 */
                R0 = r1;                       /* R0 = 00R300R200R100R0 */
                R1 = r2;                       /* R1 = 00R700R600R500R4 */
                R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */
                R1 = g1;                       /* R1 = 00G300G200G100G0 */
                R2 = g2;                       /* R2 = 00G700G600G500G4 */
                R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
                R2 = R1;                       /* R2 = G7G6G5G4G3G2G1G0 */
                R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */
                R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */
                R0 = b1;                       /* R0 = 00B300B200B100B0 */
                R3 = b2;                       /* R3 = 00B700B600B500B4 */
                R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */
                R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
                R4 = R3;                       /* R4 = FFFFFFFFFFFFFFFF */
                R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */
                R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */
                R0 = R4;                       /* R0 = R4 */
                R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */
                R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */
                R2 = R3;                       /* R2 = R3 */
                R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
                R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
                _mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
                d_buf += sizeof(__m128i);
                _mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
                d_buf += sizeof(__m128i);
                _mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
                d_buf += sizeof(__m128i);
                _mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
                d_buf += sizeof(__m128i);
            }
        }

        /* Scalar tail: convert the remaining (width % 16) pixels. */
        for (UINT32 i = 0; i < pad; i++)
        {
            const INT32 divisor = 16;
            /* NOTE(review): for Y values near INT16_MAX, (Y + 4096) << 16
             * exceeds INT32 range (signed-overflow UB); presumably the
             * 11.5 fixed-point inputs stay well below that — confirm. */
            const INT32 Y = ((*y_buf++) + 4096) << divisor;
            const INT32 Cb = (*cb_buf++);
            const INT32 Cr = (*cr_buf++);
            const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
            const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
            const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
            const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
            const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
            const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
            const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
            *d_buf++ = CLIP(R);
            *d_buf++ = CLIP(G);
            *d_buf++ = CLIP(B);
            *d_buf++ = 0xFF;
        }

        d_buf += dstPad;
    }

    return PRIMITIVES_SUCCESS;
}
599 
600 static pstatus_t
601 sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
602  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
603  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
604 {
605  if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
606  ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
607  (dstStep & 0x0f))
608  {
609  /* We can't maintain 16-byte alignment. */
610  return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
611  }
612 
613  switch (DstFormat)
614  {
615  case PIXEL_FORMAT_BGRA32:
616  case PIXEL_FORMAT_BGRX32:
617  return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
618 
619  case PIXEL_FORMAT_RGBA32:
620  case PIXEL_FORMAT_RGBX32:
621  return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
622 
623  default:
624  return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
625  }
626 }
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
630 static pstatus_t
631 sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
632  INT16* WINPR_RESTRICT pDst[3], int dstStep,
633  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
634 {
635  __m128i min;
636  __m128i max;
637  __m128i y_r;
638  __m128i y_g;
639  __m128i y_b;
640  __m128i cb_r;
641  __m128i cb_g;
642  __m128i cb_b;
643  __m128i cr_r;
644  __m128i cr_g;
645  __m128i cr_b;
646  const __m128i* r_buf = (const __m128i*)(pSrc[0]);
647  const __m128i* g_buf = (const __m128i*)(pSrc[1]);
648  const __m128i* b_buf = (const __m128i*)(pSrc[2]);
649  __m128i* y_buf = (__m128i*)(pDst[0]);
650  __m128i* cb_buf = (__m128i*)(pDst[1]);
651  __m128i* cr_buf = (__m128i*)(pDst[2]);
652  int srcbump = 0;
653  int dstbump = 0;
654  int imax = 0;
655 
656  if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
657  ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
658  ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
659  (srcStep & 127) || (dstStep & 127))
660  {
661  /* We can't maintain 16-byte alignment. */
662  return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
663  }
664 
665  min = _mm_set1_epi16(-128 * 32);
666  max = _mm_set1_epi16(127 * 32);
667 
668  y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
669  y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
670  y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
671  cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
672  cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
673  cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
674  cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
675  cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
676  cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
677  srcbump = srcStep / sizeof(__m128i);
678  dstbump = dstStep / sizeof(__m128i);
679 #ifdef DO_PREFETCH
680 
681  /* Prefetch RGB's. */
682  for (UINT32 yp = 0; yp < roi->height; yp++)
683  {
684  for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
685  i += (CACHE_LINE_BYTES / sizeof(__m128i)))
686  {
687  _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
688  _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
689  _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
690  }
691 
692  r_buf += srcbump;
693  g_buf += srcbump;
694  b_buf += srcbump;
695  }
696 
697  r_buf = (__m128i*)(pSrc[0]);
698  g_buf = (__m128i*)(pSrc[1]);
699  b_buf = (__m128i*)(pSrc[2]);
700 #endif /* DO_PREFETCH */
701  imax = roi->width * sizeof(INT16) / sizeof(__m128i);
702 
703  for (UINT32 yp = 0; yp < roi->height; ++yp)
704  {
705  for (int i = 0; i < imax; i++)
706  {
707  /* In order to use SSE2 signed 16-bit integer multiplication we
708  * need to convert the floating point factors to signed int
709  * without loosing information. The result of this multiplication
710  * is 32 bit and using SSE2 we get either the product's hi or lo
711  * word. Thus we will multiply the factors by the highest
712  * possible 2^n and take the upper 16 bits of the signed 32-bit
713  * result (_mm_mulhi_epi16). Since the final result needs to
714  * be scaled by << 5 and also in in order to keep the precision
715  * within the upper 16 bits we will also have to scale the RGB
716  * values used in the multiplication by << 5+(16-n).
717  */
718  __m128i r;
719  __m128i g;
720  __m128i b;
721  __m128i y;
722  __m128i cb;
723  __m128i cr;
724  r = _mm_load_si128(r_buf + i);
725  g = _mm_load_si128(g_buf + i);
726  b = _mm_load_si128(b_buf + i);
727  /* r<<6; g<<6; b<<6 */
728  r = _mm_slli_epi16(r, 6);
729  g = _mm_slli_epi16(g, 6);
730  b = _mm_slli_epi16(b, 6);
731  /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
732  y = _mm_mulhi_epi16(r, y_r);
733  y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
734  y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
735  y = _mm_add_epi16(y, min);
736  /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
737  mm_between_epi16(y, min, max);
738  _mm_store_si128(y_buf + i, y);
739  /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
740  cb = _mm_mulhi_epi16(r, cb_r);
741  cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
742  cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
743  /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
744  mm_between_epi16(cb, min, max);
745  _mm_store_si128(cb_buf + i, cb);
746  /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
747  cr = _mm_mulhi_epi16(r, cr_r);
748  cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
749  cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
750  /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
751  mm_between_epi16(cr, min, max);
752  _mm_store_si128(cr_buf + i, cr);
753  }
754 
755  y_buf += srcbump;
756  cb_buf += srcbump;
757  cr_buf += srcbump;
758  r_buf += dstbump;
759  g_buf += dstbump;
760  b_buf += dstbump;
761  }
762 
763  return PRIMITIVES_SUCCESS;
764 }
765 
766 /*---------------------------------------------------------------------------*/
/* Convert planar 16-bit signed R, G, B to interleaved 8-bit BGRX (the X
 * byte is forced to 0xFF) using SSE2, packing 16 pixels per iteration.
 * A scalar tail (with CLIP) converts the remaining (width % 16) pixels.
 * Source planes must be 16-byte aligned (checked by the dispatcher).
 */
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
    const INT16* WINPR_RESTRICT pSrc[3],   /* 16-bit R,G, and B arrays */
    UINT32 srcStep,                        /* bytes between rows in source data */
    BYTE* WINPR_RESTRICT pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
    UINT32 dstStep,                        /* bytes between rows in dest data */
    const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
    const UINT16* pr = (const UINT16*)(pSrc[0]);
    const UINT16* pg = (const UINT16*)(pSrc[1]);
    const UINT16* pb = (const UINT16*)(pSrc[2]);
    const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
    const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); /* constant alpha bytes */
    BYTE* out = NULL;
    UINT32 srcbump = 0;
    UINT32 dstbump = 0;
    out = pDst;
    /* Row padding beyond the ROI, in elements (src) and bytes (dst). */
    srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
    dstbump = (dstStep - (roi->width * sizeof(UINT32)));

    for (UINT32 y = 0; y < roi->height; ++y)
    {
        for (UINT32 x = 0; x < roi->width - pad; x += 16)
        {
            __m128i r;
            __m128i g;
            __m128i b;
            /* The comments below pretend these are 8-byte registers
             * rather than 16-byte, for readability.
             */
            {
                __m128i R0;
                __m128i R1;
                R0 = _mm_load_si128((const __m128i*)pb);
                pb += 8; /* R0 = 00B300B200B100B0 */
                R1 = _mm_load_si128((const __m128i*)pb);
                pb += 8;                      /* R1 = 00B700B600B500B4 */
                b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
            }
            {
                __m128i R0;
                __m128i R1;
                R0 = _mm_load_si128((const __m128i*)pg);
                pg += 8; /* R0 = 00G300G200G100G0 */
                R1 = _mm_load_si128((const __m128i*)pg);
                pg += 8;                      /* R1 = 00G700G600G500G4 */
                g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
            }
            {
                __m128i R0;
                __m128i R1;
                R0 = _mm_load_si128((const __m128i*)pr);
                pr += 8; /* R0 = 00R300R200R100R0 */
                R1 = _mm_load_si128((const __m128i*)pr);
                pr += 8;                      /* R1 = 00R700R600R500R4 */
                r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
            }
            /* Interleave b/g/r/alpha into four BGRX output vectors. */
            {
                __m128i gbHi;
                __m128i gbLo;
                __m128i arHi;
                __m128i arLo;
                {
                    gbLo = _mm_unpacklo_epi8(b, g); /* gbLo = G3B3G2B2G1B1G0B0 */
                    gbHi = _mm_unpackhi_epi8(b, g); /* gbHi = G7B7G6B6G5B5G4B4 */
                    arLo = _mm_unpacklo_epi8(r, a); /* arLo = FFR3FFR2FFR1FFR0 */
                    arHi = _mm_unpackhi_epi8(r, a); /* arHi = FFR7FFR6FFR5FFR4 */
                }
                {
                    const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
                    _mm_store_si128((__m128i*)out, bgrx);
                    out += 16; /* FFR1G1B1FFR0G0B0 */
                }
                {
                    const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
                    _mm_store_si128((__m128i*)out, bgrx);
                    out += 16; /* FFR3G3B3FFR2G2B2 */
                }
                {
                    const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
                    _mm_store_si128((__m128i*)out, bgrx);
                    out += 16; /* FFR5G5B5FFR4G4B4 */
                }
                {
                    const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
                    _mm_store_si128((__m128i*)out, bgrx);
                    out += 16; /* FFR7G7B7FFR6G6B6 */
                }
            }
        }

        /* Scalar tail: clip and store the remaining (width % 16) pixels. */
        for (UINT32 x = 0; x < pad; x++)
        {
            const BYTE R = CLIP(*pr++);
            const BYTE G = CLIP(*pg++);
            const BYTE B = CLIP(*pb++);
            *out++ = B;
            *out++ = G;
            *out++ = R;
            *out++ = 0xFF;
        }

        /* Jump to next row. */
        pr += srcbump;
        pg += srcbump;
        pb += srcbump;
        out += dstbump;
    }

    return PRIMITIVES_SUCCESS;
}
877 
/* Convert 16-bit planar R/G/B (one INT16 array per channel) into 32-bit
 * interleaved RGBX pixels (memory byte order R, G, B, 0xFF).
 * The dispatcher only calls this when all pointers and both strides are
 * 16-byte aligned, so the aligned SSE2 loads/stores below are safe.
 * Widths that are not a multiple of 16 are finished by the scalar tail. */
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
    const INT16* WINPR_RESTRICT pSrc[3],   /* 16-bit R,G, and B arrays */
    UINT32 srcStep,                        /* bytes between rows in source data */
    BYTE* WINPR_RESTRICT pDst,             /* 32-bit interleaved RGBX data */
    UINT32 dstStep,                        /* bytes between rows in dest data */
    const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	/* Trailing pixels per row handled by the scalar tail loop. */
	const UINT32 pad = roi->width % 16;
	/* All-ones vector: supplies the 0xFF filler byte for every pixel. */
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	BYTE* out = NULL;
	UINT32 srcbump = 0;
	UINT32 dstbump = 0;
	out = pDst;
	/* Per-row increments (source in UINT16 elements, dest in bytes)
	 * to skip any stride padding after each row. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (UINT32 y = 0; y < roi->height; ++y)
	{
		/* Vector loop: 16 pixels per iteration. */
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r;
			__m128i g;
			__m128i b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((const __m128i*)pb);
				pb += 8; /* R1 = 00B700B600B500B4 */
				/* Pack with unsigned saturation: 16-bit -> 8-bit. */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((const __m128i*)pg);
				pg += 8; /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((const __m128i*)pr);
				pr += 8; /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* NOTE: variable names are inherited from the BGRX
				 * template; for RGBX output "gb" actually holds R/G
				 * pairs and "ar" holds B/FF pairs. */
				__m128i gbHi;
				__m128i gbLo;
				__m128i arHi;
				__m128i arLo;
				{
					gbLo = _mm_unpacklo_epi8(r, g); /* gbLo = G3R3G2R2G1R1G0R0 */
					gbHi = _mm_unpackhi_epi8(r, g); /* gbHi = G7R7G6R6G5R5G4R4 */
					arLo = _mm_unpacklo_epi8(b, a); /* arLo = FFB3FFB2FFB1FFB0 */
					arHi = _mm_unpackhi_epi8(b, a); /* arHi = FFB7FFB6FFB5FFB4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB1G1R1FFB0G0R0 -> mem R0G0B0FF R1G1B1FF */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB3G3R3FFB2G2R2 -> mem R2G2B2FF R3G3B3FF */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB5G5R5FFB4G4R4 -> mem R4G4B4FF R5G5B5FF */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB7G7R7FFB6G6R6 -> mem R6G6B6FF R7G7B7FF */
				}
			}
		}

		/* Scalar tail: last (width % 16) pixels of the row. */
		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = R;
			*out++ = G;
			*out++ = B;
			*out++ = 0xFF;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
988 
/* Convert 16-bit planar R/G/B (one INT16 array per channel) into 32-bit
 * interleaved XBGR pixels (memory byte order 0xFF, B, G, R).
 * The dispatcher only calls this when all pointers and both strides are
 * 16-byte aligned, so the aligned SSE2 loads/stores below are safe.
 * Widths that are not a multiple of 16 are finished by the scalar tail. */
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
    const INT16* WINPR_RESTRICT pSrc[3],   /* 16-bit R,G, and B arrays */
    UINT32 srcStep,                        /* bytes between rows in source data */
    BYTE* WINPR_RESTRICT pDst,             /* 32-bit interleaved XBGR data */
    UINT32 dstStep,                        /* bytes between rows in dest data */
    const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	/* Trailing pixels per row handled by the scalar tail loop. */
	const UINT32 pad = roi->width % 16;
	/* All-ones vector: supplies the 0xFF filler byte for every pixel. */
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	BYTE* out = NULL;
	UINT32 srcbump = 0;
	UINT32 dstbump = 0;
	out = pDst;
	/* Per-row increments (source in UINT16 elements, dest in bytes)
	 * to skip any stride padding after each row. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (UINT32 y = 0; y < roi->height; ++y)
	{
		/* Vector loop: 16 pixels per iteration. */
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r;
			__m128i g;
			__m128i b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((const __m128i*)pb);
				pb += 8; /* R1 = 00B700B600B500B4 */
				/* Pack with unsigned saturation: 16-bit -> 8-bit. */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((const __m128i*)pg);
				pg += 8; /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((const __m128i*)pr);
				pr += 8; /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* NOTE: variable names are inherited from the BGRX
				 * template; for XBGR output "gb" actually holds FF/B
				 * pairs and "ar" holds G/R pairs. */
				__m128i gbHi;
				__m128i gbLo;
				__m128i arHi;
				__m128i arLo;
				{
					gbLo = _mm_unpacklo_epi8(a, b); /* gbLo = B3FFB2FFB1FFB0FF */
					gbHi = _mm_unpackhi_epi8(a, b); /* gbHi = B7FFB6FFB5FFB4FF */
					arLo = _mm_unpacklo_epi8(g, r); /* arLo = R3G3R2G2R1G1R0G0 */
					arHi = _mm_unpackhi_epi8(g, r); /* arHi = R7G7R6G6R5G5R4G4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R1G1B1FFR0G0B0FF -> mem FFB0G0R0 FFB1G1R1 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R3G3B3FFR2G2B2FF -> mem FFB2G2R2 FFB3G3R3 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R5G5B5FFR4G4B4FF -> mem FFB4G4R4 FFB5G5R5 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R7G7B7FFR6G6B6FF -> mem FFB6G6R6 FFB7G7R7 */
				}
			}
		}

		/* Scalar tail: last (width % 16) pixels of the row. */
		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = 0xFF;
			*out++ = B;
			*out++ = G;
			*out++ = R;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
1099 
/* Convert 16-bit planar R/G/B (one INT16 array per channel) into 32-bit
 * interleaved XRGB pixels (memory byte order 0xFF, R, G, B).
 * The dispatcher only calls this when all pointers and both strides are
 * 16-byte aligned, so the aligned SSE2 loads/stores below are safe.
 * Widths that are not a multiple of 16 are finished by the scalar tail. */
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
    const INT16* WINPR_RESTRICT pSrc[3],   /* 16-bit R,G, and B arrays */
    UINT32 srcStep,                        /* bytes between rows in source data */
    BYTE* WINPR_RESTRICT pDst,             /* 32-bit interleaved XRGB data */
    UINT32 dstStep,                        /* bytes between rows in dest data */
    const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	/* All-ones vector: supplies the 0xFF filler byte for every pixel. */
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	/* Trailing pixels per row handled by the scalar tail loop. */
	const UINT32 pad = roi->width % 16;
	BYTE* out = NULL;
	UINT32 srcbump = 0;
	UINT32 dstbump = 0;
	out = pDst;
	/* Per-row increments (source in UINT16 elements, dest in bytes)
	 * to skip any stride padding after each row. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (UINT32 y = 0; y < roi->height; ++y)
	{
		/* Vector loop: 16 pixels per iteration. */
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r;
			__m128i g;
			__m128i b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((const __m128i*)pb);
				pb += 8; /* R1 = 00B700B600B500B4 */
				/* Pack with unsigned saturation: 16-bit -> 8-bit. */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((const __m128i*)pg);
				pg += 8; /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0;
				__m128i R1;
				R0 = _mm_load_si128((const __m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((const __m128i*)pr);
				pr += 8; /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* NOTE: variable names are inherited from the BGRX
				 * template; for XRGB output "gb" actually holds FF/R
				 * pairs and "ar" holds G/B pairs. */
				__m128i gbHi;
				__m128i gbLo;
				__m128i arHi;
				__m128i arLo;
				{
					gbLo = _mm_unpacklo_epi8(a, r); /* gbLo = R3FFR2FFR1FFR0FF */
					gbHi = _mm_unpackhi_epi8(a, r); /* gbHi = R7FFR6FFR5FFR4FF */
					arLo = _mm_unpacklo_epi8(g, b); /* arLo = B3G3B2G2B1G1B0G0 */
					arHi = _mm_unpackhi_epi8(g, b); /* arHi = B7G7B6G6B5G5B4G4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B1G1R1FFB0G0R0FF -> mem FFR0G0B0 FFR1G1B1 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B3G3R3FFB2G2R2FF -> mem FFR2G2B2 FFR3G3B3 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B5G5R5FFB4G4R4FF -> mem FFR4G4B4 FFR5G5B5 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B7G7R7FFB6G6R6FF -> mem FFR6G6B6 FFR7G7B7 */
				}
			}
		}

		/* Scalar tail: last (width % 16) pixels of the row. */
		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = 0xFF;
			*out++ = R;
			*out++ = G;
			*out++ = B;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
1210 
1211 static pstatus_t
1212 sse2_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
1213  UINT32 srcStep, /* bytes between rows in source data */
1214  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
1215  UINT32 dstStep, /* bytes between rows in dest data */
1216  UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
1217 {
1218  if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
1219  (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
1220  return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1221 
1222  switch (DstFormat)
1223  {
1224  case PIXEL_FORMAT_BGRA32:
1225  case PIXEL_FORMAT_BGRX32:
1226  return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
1227 
1228  case PIXEL_FORMAT_RGBA32:
1229  case PIXEL_FORMAT_RGBX32:
1230  return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
1231 
1232  case PIXEL_FORMAT_ABGR32:
1233  case PIXEL_FORMAT_XBGR32:
1234  return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
1235 
1236  case PIXEL_FORMAT_ARGB32:
1237  case PIXEL_FORMAT_XRGB32:
1238  return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
1239 
1240  default:
1241  return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1242  }
1243 }
1244 #endif
1245 
1246 void primitives_init_colors_sse2(primitives_t* prims)
1247 {
1248 #if defined(SSE2_ENABLED)
1249  generic = primitives_get_generic();
1250  primitives_init_colors(prims);
1251 
1252  if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
1253  {
1254  WLog_VRB(PRIM_TAG, "SSE2 optimizations");
1255  prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
1256  prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
1257  prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
1258  prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
1259  }
1260 
1261 #else
1262  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
1263  WINPR_UNUSED(prims);
1264 #endif
1265 }