FreeRDP
prim_colors_sse2.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized Color conversion operations.
3  * vi:ts=4 sw=4:
4  *
5  * Copyright 2011 Stephen Erisman
6  * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
7  * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
8  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License"); you may
11  * not use this file except in compliance with the License. You may obtain
12  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16  * or implied. See the License for the specific language governing
17  * permissions and limitations under the License.
18  */
19 
20 #include <freerdp/config.h>
21 
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
25 
26 #include "prim_colors.h"
27 
28 #include "prim_internal.h"
29 #include "prim_templates.h"
30 
31 #if defined(SSE_AVX_INTRINSICS_ENABLED)
32 #include <emmintrin.h>
33 
34 static primitives_t* generic = NULL;
35 
36 #ifdef __GNUC__
37 #define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
38 #else
39 #define GNU_INLINE
40 #endif
41 
42 #define CACHE_LINE_BYTES 64
43 
/* YCbCr -> RGB coefficients in fixed point.
 *
 * Row n holds the four coefficients scaled by 2^n (rounded to nearest):
 *   column 0:  1.403  (Cr contribution to R)
 *   column 1: -0.344  (Cb contribution to G)
 *   column 2: -0.714  (Cr contribution to G)
 *   column 3:  1.770  (Cb contribution to B)
 *
 * The SIMD code uses row 14 (values fit in int16_t), the scalar tail
 * loops use row 16.
 */
static const int32_t ycbcr_table[][4] = {
	{ 1, 0, -1, 2 },                                      /* << 0 */
	{ 3, -1, -1, 4 },                                     /* << 1 */
	{ 6, -1, -3, 7 },                                     /* << 2 */
	{ 11, -3, -6, 14 },                                   /* << 3 */
	{ 22, -6, -11, 28 },                                  /* << 4 */
	{ 45, -11, -23, 57 },                                 /* << 5 */
	{ 90, -22, -46, 113 },                                /* << 6 */
	{ 180, -44, -91, 227 },                               /* << 7 */
	{ 359, -88, -183, 453 },                              /* << 8 */
	{ 718, -176, -366, 906 },                             /* << 9 */
	{ 1437, -352, -731, 1812 },                           /* << 10 */
	{ 2873, -705, -1462, 3625 },                          /* << 11 */
	{ 5747, -1409, -2925, 7250 },                         /* << 12 */
	{ 11493, -2818, -5849, 14500 },                       /* << 13 */
	{ 22987, -5636, -11698, 29000 },                      /* << 14 */
	{ 45974, -11272, -23396, 57999 },                     /* << 15 */
	{ 91947, -22544, -46793, 115999 },                    /* << 16 */
	{ 183894, -45089, -93585, 231997 },                   /* << 17 */
	{ 367788, -90178, -187171, 463995 },                  /* << 18 */
	{ 735576, -180355, -374342, 927990 },                 /* << 19 */
	{ 1471152, -360710, -748683, 1855980 },               /* << 20 */
	{ 2942304, -721420, -1497367, 3711959 },              /* << 21 */
	{ 5884609, -1442841, -2994733, 7423918 },             /* << 22 */
	{ 11769217, -2885681, -5989466, 14847836 },           /* << 23 */
	{ 23538434, -5771362, -11978932, 29695672 },          /* << 24 */
	{ 47076868, -11542725, -23957864, 59391345 },         /* << 25 */
	{ 94153736, -23085449, -47915729, 118782689 },        /* << 26 */
	{ 188307472, -46170898, -95831458, 237565379 },       /* << 27 */
	{ 376614945, -92341797, -191662916, 475130757 },      /* << 28 */
	{ 753229890, -184683594, -383325831, 950261514 },     /* << 29 */
	{ 1506459779, -369367187, -766651662, 1900523028 }    /* << 30 */
};
80 
/* Clamp every signed 16-bit lane of val into the inclusive range [min, max]:
 * lanes are first raised to at least min, then capped at max.
 */
static inline __m128i mm_between_epi16_int(__m128i val, __m128i min, __m128i max)
{
	const __m128i raised = _mm_max_epi16(val, min);

	return _mm_min_epi16(max, raised);
}

/* In-place variant: overwrites _val with its clamped value. */
#define mm_between_epi16(_val, _min, _max) (_val) = mm_between_epi16_int((_val), (_min), (_max))
87 
88 #ifdef DO_PREFETCH
89 /*---------------------------------------------------------------------------*/
90 static inline void GNU_INLINE _mm_prefetch_buffer(char* WINPR_RESTRICT buffer, int num_bytes)
91 {
92  __m128i* buf = (__m128i*)buffer;
93 
94  for (unsigned int i = 0; i < (num_bytes / sizeof(__m128i));
95  i += (CACHE_LINE_BYTES / sizeof(__m128i)))
96  {
97  _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
98  }
99 }
100 #endif /* DO_PREFETCH */
101 
102 /*---------------------------------------------------------------------------*/
103 static pstatus_t
104 sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
105  INT16* WINPR_RESTRICT pDst[3], int dstStep,
106  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
107 {
108  if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
109  ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
110  ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
111  (srcStep & 127) || (dstStep & 127))
112  {
113  /* We can't maintain 16-byte alignment. */
114  return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
115  }
116 
117  const __m128i zero = _mm_setzero_si128();
118  const __m128i max = _mm_set1_epi16(255);
119  const __m128i* y_buf = (const __m128i*)(pSrc[0]);
120  const __m128i* cb_buf = (const __m128i*)(pSrc[1]);
121  const __m128i* cr_buf = (const __m128i*)(pSrc[2]);
122  __m128i* r_buf = (__m128i*)(pDst[0]);
123  __m128i* g_buf = (__m128i*)(pDst[1]);
124  __m128i* b_buf = (__m128i*)(pDst[2]);
125  __m128i r_cr =
126  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0])); /* 1.403 << 14 */
127  __m128i g_cb =
128  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1])); /* -0.344 << 14 */
129  __m128i g_cr =
130  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2])); /* -0.714 << 14 */
131  __m128i b_cb =
132  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3])); /* 1.770 << 14 */
133  __m128i c4096 = _mm_set1_epi16(4096);
134  const size_t srcbump = WINPR_ASSERTING_INT_CAST(size_t, srcStep) / sizeof(__m128i);
135  const size_t dstbump = WINPR_ASSERTING_INT_CAST(size_t, dstStep) / sizeof(__m128i);
136 #ifdef DO_PREFETCH
137 
138  /* Prefetch Y's, Cb's, and Cr's. */
139  for (UINT32 yp = 0; yp < roi->height; yp++)
140  {
141  for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
142  i += (CACHE_LINE_BYTES / sizeof(__m128i)))
143  {
144  _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
145  _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
146  _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
147  }
148 
149  y_buf += srcbump;
150  cb_buf += srcbump;
151  cr_buf += srcbump;
152  }
153 
154  y_buf = (__m128i*)(pSrc[0]);
155  cb_buf = (__m128i*)(pSrc[1]);
156  cr_buf = (__m128i*)(pSrc[2]);
157 #endif /* DO_PREFETCH */
158  const size_t imax = roi->width * sizeof(INT16) / sizeof(__m128i);
159 
160  for (UINT32 yp = 0; yp < roi->height; ++yp)
161  {
162  for (size_t i = 0; i < imax; i++)
163  {
164  /* In order to use SSE2 signed 16-bit integer multiplication
165  * we need to convert the floating point factors to signed int
166  * without losing information.
167  * The result of this multiplication is 32 bit and we have two
168  * SSE instructions that return either the hi or lo word.
169  * Thus we will multiply the factors by the highest possible 2^n,
170  * take the upper 16 bits of the signed 32-bit result
171  * (_mm_mulhi_epi16) and correct this result by multiplying
172  * it by 2^(16-n).
173  *
174  * For the given factors in the conversion matrix the best
175  * possible n is 14.
176  *
177  * Example for calculating r:
178  * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
179  * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
180  * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
181  * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
182  */
183  /* y = (y_r_buf[i] + 4096) >> 2 */
184  __m128i y = _mm_load_si128(y_buf + i);
185  y = _mm_add_epi16(y, c4096);
186  y = _mm_srai_epi16(y, 2);
187  /* cb = cb_g_buf[i]; */
188  __m128i cb = _mm_load_si128(cb_buf + i);
189  /* cr = cr_b_buf[i]; */
190  __m128i cr = _mm_load_si128(cr_buf + i);
191  /* (y + HIWORD(cr*22986)) >> 3 */
192  __m128i r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
193  r = _mm_srai_epi16(r, 3);
194  /* r_buf[i] = CLIP(r); */
195  mm_between_epi16(r, zero, max);
196  _mm_store_si128(r_buf + i, r);
197  /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
198  __m128i g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
199  g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
200  g = _mm_srai_epi16(g, 3);
201  /* g_buf[i] = CLIP(g); */
202  mm_between_epi16(g, zero, max);
203  _mm_store_si128(g_buf + i, g);
204  /* (y + HIWORD(cb*28999)) >> 3 */
205  __m128i b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
206  b = _mm_srai_epi16(b, 3);
207  /* b_buf[i] = CLIP(b); */
208  mm_between_epi16(b, zero, max);
209  _mm_store_si128(b_buf + i, b);
210  }
211 
212  y_buf += srcbump;
213  cb_buf += srcbump;
214  cr_buf += srcbump;
215  r_buf += dstbump;
216  g_buf += dstbump;
217  b_buf += dstbump;
218  }
219 
220  return PRIMITIVES_SUCCESS;
221 }
222 
223 /*---------------------------------------------------------------------------*/
224 static pstatus_t
225 sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
226  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
227  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
228 {
229  const __m128i zero = _mm_setzero_si128();
230  const __m128i max = _mm_set1_epi16(255);
231  const __m128i r_cr =
232  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0])); /* 1.403 << 14 */
233  const __m128i g_cb =
234  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1])); /* -0.344 << 14 */
235  const __m128i g_cr =
236  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2])); /* -0.714 << 14 */
237  const __m128i b_cb =
238  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3])); /* 1.770 << 14 */
239  const __m128i c4096 = _mm_set1_epi16(4096);
240  const INT16* y_buf = pSrc[0];
241  const INT16* cb_buf = pSrc[1];
242  const INT16* cr_buf = pSrc[2];
243  const UINT32 pad = roi->width % 16;
244  const UINT32 step = sizeof(__m128i) / sizeof(INT16);
245  const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
246  BYTE* d_buf = pDst;
247  const size_t dstPad = (dstStep - roi->width * 4);
248 #ifdef DO_PREFETCH
249 
250  /* Prefetch Y's, Cb's, and Cr's. */
251  for (UINT32 yp = 0; yp < roi->height; yp++)
252  {
253  for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
254  {
255  _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
256  _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
257  _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
258  }
259 
260  y_buf += srcStep / sizeof(INT16);
261  cb_buf += srcStep / sizeof(INT16);
262  cr_buf += srcStep / sizeof(INT16);
263  }
264 
265  y_buf = (INT16*)pSrc[0];
266  cb_buf = (INT16*)pSrc[1];
267  cr_buf = (INT16*)pSrc[2];
268 #endif /* DO_PREFETCH */
269 
270  for (UINT32 yp = 0; yp < roi->height; ++yp)
271  {
272  for (UINT32 i = 0; i < imax; i += 2)
273  {
274  /* In order to use SSE2 signed 16-bit integer multiplication
275  * we need to convert the floating point factors to signed int
276  * without losing information.
277  * The result of this multiplication is 32 bit and we have two
278  * SSE instructions that return either the hi or lo word.
279  * Thus we will multiply the factors by the highest possible 2^n,
280  * take the upper 16 bits of the signed 32-bit result
281  * (_mm_mulhi_epi16) and correct this result by multiplying
282  * it by 2^(16-n).
283  *
284  * For the given factors in the conversion matrix the best
285  * possible n is 14.
286  *
287  * Example for calculating r:
288  * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
289  * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
290  * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
291  * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
292  */
293  /* y = (y_r_buf[i] + 4096) >> 2 */
294  __m128i y1 = _mm_load_si128((const __m128i*)y_buf);
295  y_buf += step;
296  y1 = _mm_add_epi16(y1, c4096);
297  y1 = _mm_srai_epi16(y1, 2);
298  /* cb = cb_g_buf[i]; */
299  __m128i cb1 = _mm_load_si128((const __m128i*)cb_buf);
300  cb_buf += step;
301  /* cr = cr_b_buf[i]; */
302  __m128i cr1 = _mm_load_si128((const __m128i*)cr_buf);
303  cr_buf += step;
304  /* (y + HIWORD(cr*22986)) >> 3 */
305  __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
306  r1 = _mm_srai_epi16(r1, 3);
307  /* r_buf[i] = CLIP(r); */
308  mm_between_epi16(r1, zero, max);
309  /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
310  __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
311  g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
312  g1 = _mm_srai_epi16(g1, 3);
313  /* g_buf[i] = CLIP(g); */
314  mm_between_epi16(g1, zero, max);
315  /* (y + HIWORD(cb*28999)) >> 3 */
316  __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
317  b1 = _mm_srai_epi16(b1, 3);
318  /* b_buf[i] = CLIP(b); */
319  mm_between_epi16(b1, zero, max);
320  __m128i y2 = _mm_load_si128((const __m128i*)y_buf);
321  y_buf += step;
322  y2 = _mm_add_epi16(y2, c4096);
323  y2 = _mm_srai_epi16(y2, 2);
324  /* cb = cb_g_buf[i]; */
325  __m128i cb2 = _mm_load_si128((const __m128i*)cb_buf);
326  cb_buf += step;
327  /* cr = cr_b_buf[i]; */
328  __m128i cr2 = _mm_load_si128((const __m128i*)cr_buf);
329  cr_buf += step;
330  /* (y + HIWORD(cr*22986)) >> 3 */
331  __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
332  r2 = _mm_srai_epi16(r2, 3);
333  /* r_buf[i] = CLIP(r); */
334  mm_between_epi16(r2, zero, max);
335  /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
336  __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
337  g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
338  g2 = _mm_srai_epi16(g2, 3);
339  /* g_buf[i] = CLIP(g); */
340  mm_between_epi16(g2, zero, max);
341  /* (y + HIWORD(cb*28999)) >> 3 */
342  __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
343  b2 = _mm_srai_epi16(b2, 3);
344  /* b_buf[i] = CLIP(b); */
345  mm_between_epi16(b2, zero, max);
346  {
347  /* The comments below pretend these are 8-byte registers
348  * rather than 16-byte, for readability.
349  */
350  __m128i R0 = b1; /* R0 = 00B300B200B100B0 */
351  __m128i R1 = b2; /* R1 = 00B700B600B500B4 */
352  R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */
353  R1 = g1; /* R1 = 00G300G200G100G0 */
354  __m128i R2 = g2; /* R2 = 00G700G600G500G4 */
355  R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
356  R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
357  R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */
358  R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */
359  R0 = r1; /* R0 = 00R300R200R100R0 */
360  __m128i R3 = r2; /* R3 = 00R700R600R500R4 */
361  R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */
362  R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
363  __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
364  R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */
365  R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */
366  R0 = R4; /* R0 = R4 */
367  R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */
368  R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */
369  R2 = R3; /* R2 = R3 */
370  R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
371  R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
372  _mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
373  d_buf += sizeof(__m128i);
374  _mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
375  d_buf += sizeof(__m128i);
376  _mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
377  d_buf += sizeof(__m128i);
378  _mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
379  d_buf += sizeof(__m128i);
380  }
381  }
382 
383  for (UINT32 i = 0; i < pad; i++)
384  {
385  const INT32 divisor = 16;
386  const INT32 Y = ((*y_buf++) + 4096) << divisor;
387  const INT32 Cb = (*cb_buf++);
388  const INT32 Cr = (*cr_buf++);
389  const INT32 CrR = Cr * ycbcr_table[divisor][0];
390  const INT32 CrG = Cr * ycbcr_table[divisor][1];
391  const INT32 CbG = Cb * ycbcr_table[divisor][2];
392  const INT32 CbB = Cb * ycbcr_table[divisor][3];
393  const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
394  const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
395  const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
396  *d_buf++ = CLIP(B);
397  *d_buf++ = CLIP(G);
398  *d_buf++ = CLIP(R);
399  *d_buf++ = 0xFF;
400  }
401 
402  d_buf += dstPad;
403  }
404 
405  return PRIMITIVES_SUCCESS;
406 }
407 
408 /*---------------------------------------------------------------------------*/
409 static pstatus_t
410 sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
411  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
412  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
413 {
414  const __m128i zero = _mm_setzero_si128();
415  const __m128i max = _mm_set1_epi16(255);
416  const __m128i r_cr =
417  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0])); /* 1.403 << 14 */
418  const __m128i g_cb =
419  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1])); /* -0.344 << 14 */
420  const __m128i g_cr =
421  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2])); /* -0.714 << 14 */
422  const __m128i b_cb =
423  _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3])); /* 1.770 << 14 */
424  const __m128i c4096 = _mm_set1_epi16(4096);
425  const INT16* y_buf = pSrc[0];
426  const INT16* cb_buf = pSrc[1];
427  const INT16* cr_buf = pSrc[2];
428  const UINT32 pad = roi->width % 16;
429  const UINT32 step = sizeof(__m128i) / sizeof(INT16);
430  const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
431  BYTE* d_buf = pDst;
432  const size_t dstPad = (dstStep - roi->width * 4);
433 #ifdef DO_PREFETCH
434 
435  /* Prefetch Y's, Cb's, and Cr's. */
436  for (UINT32 yp = 0; yp < roi->height; yp++)
437  {
438  for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
439  {
440  _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
441  _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
442  _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
443  }
444 
445  y_buf += srcStep / sizeof(INT16);
446  cb_buf += srcStep / sizeof(INT16);
447  cr_buf += srcStep / sizeof(INT16);
448  }
449 
450  y_buf = (INT16*)(pSrc[0]);
451  cb_buf = (INT16*)(pSrc[1]);
452  cr_buf = (INT16*)(pSrc[2]);
453 #endif /* DO_PREFETCH */
454 
455  for (UINT32 yp = 0; yp < roi->height; ++yp)
456  {
457  for (UINT32 i = 0; i < imax; i += 2)
458  {
459  /* In order to use SSE2 signed 16-bit integer multiplication
460  * we need to convert the floating point factors to signed int
461  * without losing information.
462  * The result of this multiplication is 32 bit and we have two
463  * SSE instructions that return either the hi or lo word.
464  * Thus we will multiply the factors by the highest possible 2^n,
465  * take the upper 16 bits of the signed 32-bit result
466  * (_mm_mulhi_epi16) and correct this result by multiplying
467  * it by 2^(16-n).
468  *
469  * For the given factors in the conversion matrix the best
470  * possible n is 14.
471  *
472  * Example for calculating r:
473  * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
474  * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
475  * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
476  * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
477  */
478  /* y = (y_r_buf[i] + 4096) >> 2 */
479  __m128i y1 = _mm_load_si128((const __m128i*)y_buf);
480  y_buf += step;
481  y1 = _mm_add_epi16(y1, c4096);
482  y1 = _mm_srai_epi16(y1, 2);
483  /* cb = cb_g_buf[i]; */
484  __m128i cb1 = _mm_load_si128((const __m128i*)cb_buf);
485  cb_buf += step;
486  /* cr = cr_b_buf[i]; */
487  __m128i cr1 = _mm_load_si128((const __m128i*)cr_buf);
488  cr_buf += step;
489  /* (y + HIWORD(cr*22986)) >> 3 */
490  __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
491  r1 = _mm_srai_epi16(r1, 3);
492  /* r_buf[i] = CLIP(r); */
493  mm_between_epi16(r1, zero, max);
494  /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
495  __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
496  g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
497  g1 = _mm_srai_epi16(g1, 3);
498  /* g_buf[i] = CLIP(g); */
499  mm_between_epi16(g1, zero, max);
500  /* (y + HIWORD(cb*28999)) >> 3 */
501  __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
502  b1 = _mm_srai_epi16(b1, 3);
503  /* b_buf[i] = CLIP(b); */
504  mm_between_epi16(b1, zero, max);
505  __m128i y2 = _mm_load_si128((const __m128i*)y_buf);
506  y_buf += step;
507  y2 = _mm_add_epi16(y2, c4096);
508  y2 = _mm_srai_epi16(y2, 2);
509  /* cb = cb_g_buf[i]; */
510  __m128i cb2 = _mm_load_si128((const __m128i*)cb_buf);
511  cb_buf += step;
512  /* cr = cr_b_buf[i]; */
513  __m128i cr2 = _mm_load_si128((const __m128i*)cr_buf);
514  cr_buf += step;
515  /* (y + HIWORD(cr*22986)) >> 3 */
516  __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
517  r2 = _mm_srai_epi16(r2, 3);
518  /* r_buf[i] = CLIP(r); */
519  mm_between_epi16(r2, zero, max);
520  /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
521  __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
522  g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
523  g2 = _mm_srai_epi16(g2, 3);
524  /* g_buf[i] = CLIP(g); */
525  mm_between_epi16(g2, zero, max);
526  /* (y + HIWORD(cb*28999)) >> 3 */
527  __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
528  b2 = _mm_srai_epi16(b2, 3);
529  /* b_buf[i] = CLIP(b); */
530  mm_between_epi16(b2, zero, max);
531  {
532  /* The comments below pretend these are 8-byte registers
533  * rather than 16-byte, for readability.
534  */
535  __m128i R0 = r1; /* R0 = 00R300R200R100R0 */
536  __m128i R1 = r2; /* R1 = 00R700R600R500R4 */
537  R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */
538  R1 = g1; /* R1 = 00G300G200G100G0 */
539  __m128i R2 = g2; /* R2 = 00G700G600G500G4 */
540  R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
541  R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
542  R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */
543  R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */
544  R0 = b1; /* R0 = 00B300B200B100B0 */
545  __m128i R3 = b2; /* R3 = 00B700B600B500B4 */
546  R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */
547  R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
548  __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
549  R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */
550  R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */
551  R0 = R4; /* R0 = R4 */
552  R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */
553  R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */
554  R2 = R3; /* R2 = R3 */
555  R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
556  R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
557  _mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
558  d_buf += sizeof(__m128i);
559  _mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
560  d_buf += sizeof(__m128i);
561  _mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
562  d_buf += sizeof(__m128i);
563  _mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
564  d_buf += sizeof(__m128i);
565  }
566  }
567 
568  for (UINT32 i = 0; i < pad; i++)
569  {
570  const INT32 divisor = 16;
571  const INT32 Y = ((*y_buf++) + 4096) << divisor;
572  const INT32 Cb = (*cb_buf++);
573  const INT32 Cr = (*cr_buf++);
574  const INT32 CrR = Cr * ycbcr_table[divisor][0];
575  const INT32 CrG = Cr * ycbcr_table[divisor][1];
576  const INT32 CbG = Cb * ycbcr_table[divisor][2];
577  const INT32 CbB = Cb * ycbcr_table[divisor][3];
578  const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
579  const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
580  const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
581  *d_buf++ = CLIP(R);
582  *d_buf++ = CLIP(G);
583  *d_buf++ = CLIP(B);
584  *d_buf++ = 0xFF;
585  }
586 
587  d_buf += dstPad;
588  }
589 
590  return PRIMITIVES_SUCCESS;
591 }
592 
593 static pstatus_t
594 sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
595  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
596  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
597 {
598  if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
599  ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
600  (dstStep & 0x0f))
601  {
602  /* We can't maintain 16-byte alignment. */
603  return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
604  }
605 
606  switch (DstFormat)
607  {
608  case PIXEL_FORMAT_BGRA32:
609  case PIXEL_FORMAT_BGRX32:
610  return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
611 
612  case PIXEL_FORMAT_RGBA32:
613  case PIXEL_FORMAT_RGBX32:
614  return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
615 
616  default:
617  return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
618  }
619 }
620 /* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
621  * numbers. See the general code above.
622  */
623 static pstatus_t
624 sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
625  INT16* WINPR_RESTRICT pDst[3], int dstStep,
626  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
627 {
628  const __m128i* r_buf = (const __m128i*)(pSrc[0]);
629  const __m128i* g_buf = (const __m128i*)(pSrc[1]);
630  const __m128i* b_buf = (const __m128i*)(pSrc[2]);
631  __m128i* y_buf = (__m128i*)(pDst[0]);
632  __m128i* cb_buf = (__m128i*)(pDst[1]);
633  __m128i* cr_buf = (__m128i*)(pDst[2]);
634  int imax = 0;
635 
636  if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
637  ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
638  ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
639  (srcStep & 127) || (dstStep & 127))
640  {
641  /* We can't maintain 16-byte alignment. */
642  return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
643  }
644 
645  const __m128i min = _mm_set1_epi16(-128 * 32);
646  const __m128i max = _mm_set1_epi16(127 * 32);
647 
648  __m128i y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
649  __m128i y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
650  __m128i y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
651  __m128i cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
652  __m128i cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
653  __m128i cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
654  __m128i cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
655  __m128i cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
656  __m128i cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
657  const size_t srcbump = WINPR_ASSERTING_INT_CAST(size_t, srcStep) / sizeof(__m128i);
658  const size_t dstbump = WINPR_ASSERTING_INT_CAST(size_t, dstStep) / sizeof(__m128i);
659 #ifdef DO_PREFETCH
660 
661  /* Prefetch RGB's. */
662  for (UINT32 yp = 0; yp < roi->height; yp++)
663  {
664  for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
665  i += (CACHE_LINE_BYTES / sizeof(__m128i)))
666  {
667  _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
668  _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
669  _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
670  }
671 
672  r_buf += srcbump;
673  g_buf += srcbump;
674  b_buf += srcbump;
675  }
676 
677  r_buf = (__m128i*)(pSrc[0]);
678  g_buf = (__m128i*)(pSrc[1]);
679  b_buf = (__m128i*)(pSrc[2]);
680 #endif /* DO_PREFETCH */
681  imax = roi->width * sizeof(INT16) / sizeof(__m128i);
682 
683  for (UINT32 yp = 0; yp < roi->height; ++yp)
684  {
685  for (int i = 0; i < imax; i++)
686  {
687  /* In order to use SSE2 signed 16-bit integer multiplication we
688  * need to convert the floating point factors to signed int
689  * without losing information. The result of this multiplication
690  * is 32 bit and using SSE2 we get either the product's hi or lo
691  * word. Thus we will multiply the factors by the highest
692  * possible 2^n and take the upper 16 bits of the signed 32-bit
693  * result (_mm_mulhi_epi16). Since the final result needs to
694  * be scaled by << 5 and also in in order to keep the precision
695  * within the upper 16 bits we will also have to scale the RGB
696  * values used in the multiplication by << 5+(16-n).
697  */
698  __m128i r = _mm_load_si128(r_buf + i);
699  __m128i g = _mm_load_si128(g_buf + i);
700  __m128i b = _mm_load_si128(b_buf + i);
701  /* r<<6; g<<6; b<<6 */
702  r = _mm_slli_epi16(r, 6);
703  g = _mm_slli_epi16(g, 6);
704  b = _mm_slli_epi16(b, 6);
705  /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
706  __m128i y = _mm_mulhi_epi16(r, y_r);
707  y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
708  y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
709  y = _mm_add_epi16(y, min);
710  /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
711  mm_between_epi16(y, min, max);
712  _mm_store_si128(y_buf + i, y);
713  /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
714  __m128i cb = _mm_mulhi_epi16(r, cb_r);
715  cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
716  cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
717  /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
718  mm_between_epi16(cb, min, max);
719  _mm_store_si128(cb_buf + i, cb);
720  /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
721  __m128i cr = _mm_mulhi_epi16(r, cr_r);
722  cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
723  cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
724  /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
725  mm_between_epi16(cr, min, max);
726  _mm_store_si128(cr_buf + i, cr);
727  }
728 
729  y_buf += srcbump;
730  cb_buf += srcbump;
731  cr_buf += srcbump;
732  r_buf += dstbump;
733  g_buf += dstbump;
734  b_buf += dstbump;
735  }
736 
737  return PRIMITIVES_SUCCESS;
738 }
739 
740 /*---------------------------------------------------------------------------*/
741 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
742  const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
743  UINT32 srcStep, /* bytes between rows in source data */
744  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
745  UINT32 dstStep, /* bytes between rows in dest data */
746  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
747 {
748  const UINT16* pr = (const UINT16*)(pSrc[0]);
749  const UINT16* pg = (const UINT16*)(pSrc[1]);
750  const UINT16* pb = (const UINT16*)(pSrc[2]);
751  const UINT32 pad = roi->width % 16;
752  const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
753  BYTE* out = NULL;
754  UINT32 srcbump = 0;
755  UINT32 dstbump = 0;
756  out = pDst;
757  srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
758  dstbump = (dstStep - (roi->width * sizeof(UINT32)));
759 
760  for (UINT32 y = 0; y < roi->height; ++y)
761  {
762  for (UINT32 x = 0; x < roi->width - pad; x += 16)
763  {
764  __m128i r;
765  __m128i g;
766  __m128i b;
767  /* The comments below pretend these are 8-byte registers
768  * rather than 16-byte, for readability.
769  */
770  {
771  __m128i R0;
772  __m128i R1;
773  R0 = _mm_load_si128((const __m128i*)pb);
774  pb += 8; /* R0 = 00B300B200B100B0 */
775  R1 = _mm_load_si128((const __m128i*)pb);
776  pb += 8; /* R1 = 00B700B600B500B4 */
777  b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
778  }
779  {
780  __m128i R0;
781  __m128i R1;
782  R0 = _mm_load_si128((const __m128i*)pg);
783  pg += 8; /* R1 = 00G300G200G100G0 */
784  R1 = _mm_load_si128((const __m128i*)pg);
785  pg += 8; /* R2 = 00G700G600G500G4 */
786  g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
787  }
788  {
789  __m128i R0;
790  __m128i R1;
791  R0 = _mm_load_si128((const __m128i*)pr);
792  pr += 8; /* R0 = 00R300R200R100R0 */
793  R1 = _mm_load_si128((const __m128i*)pr);
794  pr += 8; /* R3 = 00R700R600R500R4 */
795  r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
796  }
797  {
798  const __m128i gbLo = _mm_unpacklo_epi8(b, g); /* R0 = G7G6G5G4G3G2G1G0 */
799  const __m128i gbHi = _mm_unpackhi_epi8(b, g); /* R1 = G7B7G6B7G5B5G4B4 */
800  const __m128i arLo = _mm_unpacklo_epi8(r, a); /* R4 = FFR3FFR2FFR1FFR0 */
801  const __m128i arHi = _mm_unpackhi_epi8(r, a); /* R3 = FFR7FFR6FFR5FFR4 */
802 
803  {
804  const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
805  _mm_store_si128((__m128i*)out, bgrx);
806  out += 16; /* FFR1G1B1FFR0G0B0 */
807  }
808  {
809  const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
810  _mm_store_si128((__m128i*)out, bgrx);
811  out += 16; /* FFR3G3B3FFR2G2B2 */
812  }
813  {
814  const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
815  _mm_store_si128((__m128i*)out, bgrx);
816  out += 16; /* FFR5G5B5FFR4G4B4 */
817  }
818  {
819  const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
820  _mm_store_si128((__m128i*)out, bgrx);
821  out += 16; /* FFR7G7B7FFR6G6B6 */
822  }
823  }
824  }
825 
826  for (UINT32 x = 0; x < pad; x++)
827  {
828  const BYTE R = CLIP(*pr++);
829  const BYTE G = CLIP(*pg++);
830  const BYTE B = CLIP(*pb++);
831  *out++ = B;
832  *out++ = G;
833  *out++ = R;
834  *out++ = 0xFF;
835  }
836 
837  /* Jump to next row. */
838  pr += srcbump;
839  pg += srcbump;
840  pb += srcbump;
841  out += dstbump;
842  }
843 
844  return PRIMITIVES_SUCCESS;
845 }
846 
847 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
848  const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
849  UINT32 srcStep, /* bytes between rows in source data */
850  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
851  UINT32 dstStep, /* bytes between rows in dest data */
852  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
853 {
854  const UINT16* pr = (const UINT16*)(pSrc[0]);
855  const UINT16* pg = (const UINT16*)(pSrc[1]);
856  const UINT16* pb = (const UINT16*)(pSrc[2]);
857  const UINT32 pad = roi->width % 16;
858  const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
859  BYTE* out = NULL;
860  UINT32 srcbump = 0;
861  UINT32 dstbump = 0;
862  out = pDst;
863  srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
864  dstbump = (dstStep - (roi->width * sizeof(UINT32)));
865 
866  for (UINT32 y = 0; y < roi->height; ++y)
867  {
868  for (UINT32 x = 0; x < roi->width - pad; x += 16)
869  {
870  __m128i r;
871  __m128i g;
872  __m128i b;
873  /* The comments below pretend these are 8-byte registers
874  * rather than 16-byte, for readability.
875  */
876  {
877  __m128i R0;
878  __m128i R1;
879  R0 = _mm_load_si128((const __m128i*)pb);
880  pb += 8; /* R0 = 00B300B200B100B0 */
881  R1 = _mm_load_si128((const __m128i*)pb);
882  pb += 8; /* R1 = 00B700B600B500B4 */
883  b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
884  }
885  {
886  __m128i R0;
887  __m128i R1;
888  R0 = _mm_load_si128((const __m128i*)pg);
889  pg += 8; /* R1 = 00G300G200G100G0 */
890  R1 = _mm_load_si128((const __m128i*)pg);
891  pg += 8; /* R2 = 00G700G600G500G4 */
892  g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
893  }
894  {
895  __m128i R0;
896  __m128i R1;
897  R0 = _mm_load_si128((const __m128i*)pr);
898  pr += 8; /* R0 = 00R300R200R100R0 */
899  R1 = _mm_load_si128((const __m128i*)pr);
900  pr += 8; /* R3 = 00R700R600R500R4 */
901  r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
902  }
903  {
904  __m128i gbHi;
905  __m128i gbLo;
906  __m128i arHi;
907  __m128i arLo;
908  {
909  gbLo = _mm_unpacklo_epi8(r, g); /* R0 = G7G6G5G4G3G2G1G0 */
910  gbHi = _mm_unpackhi_epi8(r, g); /* R1 = G7B7G6B7G5B5G4B4 */
911  arLo = _mm_unpacklo_epi8(b, a); /* R4 = FFR3FFR2FFR1FFR0 */
912  arHi = _mm_unpackhi_epi8(b, a); /* R3 = FFR7FFR6FFR5FFR4 */
913  }
914  {
915  const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
916  _mm_store_si128((__m128i*)out, bgrx);
917  out += 16; /* FFR1G1B1FFR0G0B0 */
918  }
919  {
920  const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
921  _mm_store_si128((__m128i*)out, bgrx);
922  out += 16; /* FFR3G3B3FFR2G2B2 */
923  }
924  {
925  const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
926  _mm_store_si128((__m128i*)out, bgrx);
927  out += 16; /* FFR5G5B5FFR4G4B4 */
928  }
929  {
930  const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
931  _mm_store_si128((__m128i*)out, bgrx);
932  out += 16; /* FFR7G7B7FFR6G6B6 */
933  }
934  }
935  }
936 
937  for (UINT32 x = 0; x < pad; x++)
938  {
939  const BYTE R = CLIP(*pr++);
940  const BYTE G = CLIP(*pg++);
941  const BYTE B = CLIP(*pb++);
942  *out++ = R;
943  *out++ = G;
944  *out++ = B;
945  *out++ = 0xFF;
946  }
947 
948  /* Jump to next row. */
949  pr += srcbump;
950  pg += srcbump;
951  pb += srcbump;
952  out += dstbump;
953  }
954 
955  return PRIMITIVES_SUCCESS;
956 }
957 
958 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
959  const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
960  UINT32 srcStep, /* bytes between rows in source data */
961  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
962  UINT32 dstStep, /* bytes between rows in dest data */
963  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
964 {
965  const UINT16* pr = (const UINT16*)(pSrc[0]);
966  const UINT16* pg = (const UINT16*)(pSrc[1]);
967  const UINT16* pb = (const UINT16*)(pSrc[2]);
968  const UINT32 pad = roi->width % 16;
969  const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
970  BYTE* out = NULL;
971  UINT32 srcbump = 0;
972  UINT32 dstbump = 0;
973  out = pDst;
974  srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
975  dstbump = (dstStep - (roi->width * sizeof(UINT32)));
976 
977  for (UINT32 y = 0; y < roi->height; ++y)
978  {
979  for (UINT32 x = 0; x < roi->width - pad; x += 16)
980  {
981  __m128i r;
982  __m128i g;
983  __m128i b;
984  /* The comments below pretend these are 8-byte registers
985  * rather than 16-byte, for readability.
986  */
987  {
988  __m128i R0;
989  __m128i R1;
990  R0 = _mm_load_si128((const __m128i*)pb);
991  pb += 8; /* R0 = 00B300B200B100B0 */
992  R1 = _mm_load_si128((const __m128i*)pb);
993  pb += 8; /* R1 = 00B700B600B500B4 */
994  b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
995  }
996  {
997  __m128i R0;
998  __m128i R1;
999  R0 = _mm_load_si128((const __m128i*)pg);
1000  pg += 8; /* R1 = 00G300G200G100G0 */
1001  R1 = _mm_load_si128((const __m128i*)pg);
1002  pg += 8; /* R2 = 00G700G600G500G4 */
1003  g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
1004  }
1005  {
1006  __m128i R0;
1007  __m128i R1;
1008  R0 = _mm_load_si128((const __m128i*)pr);
1009  pr += 8; /* R0 = 00R300R200R100R0 */
1010  R1 = _mm_load_si128((const __m128i*)pr);
1011  pr += 8; /* R3 = 00R700R600R500R4 */
1012  r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
1013  }
1014  {
1015  __m128i gbHi;
1016  __m128i gbLo;
1017  __m128i arHi;
1018  __m128i arLo;
1019  {
1020  gbLo = _mm_unpacklo_epi8(a, b); /* R0 = G7G6G5G4G3G2G1G0 */
1021  gbHi = _mm_unpackhi_epi8(a, b); /* R1 = G7B7G6B7G5B5G4B4 */
1022  arLo = _mm_unpacklo_epi8(g, r); /* R4 = FFR3FFR2FFR1FFR0 */
1023  arHi = _mm_unpackhi_epi8(g, r); /* R3 = FFR7FFR6FFR5FFR4 */
1024  }
1025  {
1026  const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
1027  _mm_store_si128((__m128i*)out, bgrx);
1028  out += 16; /* FFR1G1B1FFR0G0B0 */
1029  }
1030  {
1031  const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
1032  _mm_store_si128((__m128i*)out, bgrx);
1033  out += 16; /* FFR3G3B3FFR2G2B2 */
1034  }
1035  {
1036  const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
1037  _mm_store_si128((__m128i*)out, bgrx);
1038  out += 16; /* FFR5G5B5FFR4G4B4 */
1039  }
1040  {
1041  const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
1042  _mm_store_si128((__m128i*)out, bgrx);
1043  out += 16; /* FFR7G7B7FFR6G6B6 */
1044  }
1045  }
1046  }
1047 
1048  for (UINT32 x = 0; x < pad; x++)
1049  {
1050  const BYTE R = CLIP(*pr++);
1051  const BYTE G = CLIP(*pg++);
1052  const BYTE B = CLIP(*pb++);
1053  *out++ = 0xFF;
1054  *out++ = B;
1055  *out++ = G;
1056  *out++ = R;
1057  }
1058 
1059  /* Jump to next row. */
1060  pr += srcbump;
1061  pg += srcbump;
1062  pb += srcbump;
1063  out += dstbump;
1064  }
1065 
1066  return PRIMITIVES_SUCCESS;
1067 }
1068 
1069 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
1070  const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
1071  UINT32 srcStep, /* bytes between rows in source data */
1072  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
1073  UINT32 dstStep, /* bytes between rows in dest data */
1074  const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
1075 {
1076  const UINT16* pr = (const UINT16*)(pSrc[0]);
1077  const UINT16* pg = (const UINT16*)(pSrc[1]);
1078  const UINT16* pb = (const UINT16*)(pSrc[2]);
1079  const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
1080  const UINT32 pad = roi->width % 16;
1081  BYTE* out = NULL;
1082  UINT32 srcbump = 0;
1083  UINT32 dstbump = 0;
1084  out = pDst;
1085  srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
1086  dstbump = (dstStep - (roi->width * sizeof(UINT32)));
1087 
1088  for (UINT32 y = 0; y < roi->height; ++y)
1089  {
1090  for (UINT32 x = 0; x < roi->width - pad; x += 16)
1091  {
1092  __m128i r;
1093  __m128i g;
1094  __m128i b;
1095  /* The comments below pretend these are 8-byte registers
1096  * rather than 16-byte, for readability.
1097  */
1098  {
1099  __m128i R0;
1100  __m128i R1;
1101  R0 = _mm_load_si128((const __m128i*)pb);
1102  pb += 8; /* R0 = 00B300B200B100B0 */
1103  R1 = _mm_load_si128((const __m128i*)pb);
1104  pb += 8; /* R1 = 00B700B600B500B4 */
1105  b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
1106  }
1107  {
1108  __m128i R0;
1109  __m128i R1;
1110  R0 = _mm_load_si128((const __m128i*)pg);
1111  pg += 8; /* R1 = 00G300G200G100G0 */
1112  R1 = _mm_load_si128((const __m128i*)pg);
1113  pg += 8; /* R2 = 00G700G600G500G4 */
1114  g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
1115  }
1116  {
1117  __m128i R0;
1118  __m128i R1;
1119  R0 = _mm_load_si128((const __m128i*)pr);
1120  pr += 8; /* R0 = 00R300R200R100R0 */
1121  R1 = _mm_load_si128((const __m128i*)pr);
1122  pr += 8; /* R3 = 00R700R600R500R4 */
1123  r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
1124  }
1125  {
1126  __m128i gbHi;
1127  __m128i gbLo;
1128  __m128i arHi;
1129  __m128i arLo;
1130  {
1131  gbLo = _mm_unpacklo_epi8(a, r); /* R0 = G7G6G5G4G3G2G1G0 */
1132  gbHi = _mm_unpackhi_epi8(a, r); /* R1 = G7B7G6B7G5B5G4B4 */
1133  arLo = _mm_unpacklo_epi8(g, b); /* R4 = FFR3FFR2FFR1FFR0 */
1134  arHi = _mm_unpackhi_epi8(g, b); /* R3 = FFR7FFR6FFR5FFR4 */
1135  }
1136  {
1137  const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
1138  _mm_store_si128((__m128i*)out, bgrx);
1139  out += 16; /* FFR1G1B1FFR0G0B0 */
1140  }
1141  {
1142  const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
1143  _mm_store_si128((__m128i*)out, bgrx);
1144  out += 16; /* FFR3G3B3FFR2G2B2 */
1145  }
1146  {
1147  const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
1148  _mm_store_si128((__m128i*)out, bgrx);
1149  out += 16; /* FFR5G5B5FFR4G4B4 */
1150  }
1151  {
1152  const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
1153  _mm_store_si128((__m128i*)out, bgrx);
1154  out += 16; /* FFR7G7B7FFR6G6B6 */
1155  }
1156  }
1157  }
1158 
1159  for (UINT32 x = 0; x < pad; x++)
1160  {
1161  const BYTE R = CLIP(*pr++);
1162  const BYTE G = CLIP(*pg++);
1163  const BYTE B = CLIP(*pb++);
1164  *out++ = 0xFF;
1165  *out++ = R;
1166  *out++ = G;
1167  *out++ = B;
1168  }
1169 
1170  /* Jump to next row. */
1171  pr += srcbump;
1172  pg += srcbump;
1173  pb += srcbump;
1174  out += dstbump;
1175  }
1176 
1177  return PRIMITIVES_SUCCESS;
1178 }
1179 
1180 static pstatus_t
1181 sse2_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
1182  UINT32 srcStep, /* bytes between rows in source data */
1183  BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
1184  UINT32 dstStep, /* bytes between rows in dest data */
1185  UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
1186 {
1187  if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
1188  (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
1189  return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1190 
1191  switch (DstFormat)
1192  {
1193  case PIXEL_FORMAT_BGRA32:
1194  case PIXEL_FORMAT_BGRX32:
1195  return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
1196 
1197  case PIXEL_FORMAT_RGBA32:
1198  case PIXEL_FORMAT_RGBX32:
1199  return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
1200 
1201  case PIXEL_FORMAT_ABGR32:
1202  case PIXEL_FORMAT_XBGR32:
1203  return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
1204 
1205  case PIXEL_FORMAT_ARGB32:
1206  case PIXEL_FORMAT_XRGB32:
1207  return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
1208 
1209  default:
1210  return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1211  }
1212 }
1213 #endif
1214 
1215 void primitives_init_colors_sse2(primitives_t* prims)
1216 {
1217 #if defined(SSE_AVX_INTRINSICS_ENABLED)
1218  generic = primitives_get_generic();
1219  primitives_init_colors(prims);
1220 
1221  if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
1222  {
1223  WLog_VRB(PRIM_TAG, "SSE2 optimizations");
1224  prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
1225  prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
1226  prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
1227  prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
1228  }
1229 
1230 #else
1231  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
1232  WINPR_UNUSED(prims);
1233 #endif
1234 }