/* FreeRDP: prim_YUV_ssse3.c */
#include <winpr/wtypes.h>
#include <freerdp/config.h>

#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>

#include "prim_internal.h"
#include "prim_YUV.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

static primitives_t* generic = NULL;

/****************************************************************************/
/* SSSE3 YUV420 -> RGB conversion */
/****************************************************************************/
static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
                                  __m128i Vraw, UINT8 pos)
{
    const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                             mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
                             mm_set_epu32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
                             mm_set_epu32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
    const __m128i mapUV[] = { mm_set_epu32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
                              mm_set_epu32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
                              mm_set_epu32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
                              mm_set_epu32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
    const __m128i mask[] = { mm_set_epu32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
                             mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                             mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
    const __m128i c128 = _mm_set1_epi16(128);
    __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
                                 mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
    {
        __m128i C;
        __m128i D;
        __m128i E;
        /* Load Y values and expand to 32 bit */
        {
            C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
        }
        /* Load U values and expand to 32 bit */
        {
            const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
            D = _mm_sub_epi16(U, c128);                           /* D = U - 128 */
        }
        /* Load V values and expand to 32 bit */
        {
            const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
            E = _mm_sub_epi16(V, c128);                           /* E = V - 128 */
        }
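        /* The constants below are the conversion coefficients scaled by 256:
         * R = Y + (403/256) * E, G = Y - (48/256) * D - (120/256) * E,
         * B = Y + (475/256) * D, with D = U - 128 and E = V - 128. */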
        /* Get the R value */
        {
            const __m128i c403 = _mm_set1_epi16(403);
            const __m128i e403 =
                _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
            const __m128i Rs = _mm_add_epi32(C, e403);
            const __m128i R32 = _mm_srai_epi32(Rs, 8);
            const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
            const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
            const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
            BGRX = _mm_or_si128(BGRX, packed);
        }
        /* Get the G value */
        {
            const __m128i c48 = _mm_set1_epi16(48);
            const __m128i d48 =
                _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
            const __m128i c120 = _mm_set1_epi16(120);
            const __m128i e120 =
                _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
            const __m128i de = _mm_add_epi32(d48, e120);
            const __m128i Gs = _mm_sub_epi32(C, de);
            const __m128i G32 = _mm_srai_epi32(Gs, 8);
            const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
            const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
            const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
            BGRX = _mm_or_si128(BGRX, packed);
        }
        /* Get the B value */
        {
            const __m128i c475 = _mm_set1_epi16(475);
            const __m128i d475 =
                _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
            const __m128i Bs = _mm_add_epi32(C, d475);
            const __m128i B32 = _mm_srai_epi32(Bs, 8);
            const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
            const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
            const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
            BGRX = _mm_or_si128(BGRX, packed);
        }
    }
    _mm_storeu_si128(dst++, BGRX);
    return dst;
}

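/* YUV420 -> BGRX for one ROI. Each vector iteration converts 16 pixels: the
 * 16 Y samples pair with 8 U and 8 V samples that are duplicated horizontally
 * via pshufb, and rows y and y+1 share the same chroma line (y / 2). The
 * trailing (width % 16) pixels fall back to the scalar YUV2R/YUV2G/YUV2B
 * helpers. */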
static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                        const UINT32* WINPR_RESTRICT srcStep,
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                        const prim_size_t* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->width;
    const UINT32 nHeight = roi->height;
    const UINT32 pad = roi->width % 16;
    const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);

    for (size_t y = 0; y < nHeight; y++)
    {
        __m128i* dst = (__m128i*)(pDst + dstStep * y);
        const BYTE* YData = pSrc[0] + y * srcStep[0];
        const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
        const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];

        for (UINT32 x = 0; x < nWidth - pad; x += 16)
        {
            const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
            const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
            const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
            const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
            const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
            YData += 16;
            UData += 8;
            VData += 8;
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
        }

        for (UINT32 x = 0; x < pad; x++)
        {
            const BYTE Y = *YData++;
            const BYTE U = *UData;
            const BYTE V = *VData;
            const BYTE r = YUV2R(Y, U, V);
            const BYTE g = YUV2G(Y, U, V);
            const BYTE b = YUV2B(Y, U, V);
            dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);

            if (x % 2)
            {
                UData++;
                VData++;
            }
        }
    }

    return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                   BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
                                   const prim_size_t* WINPR_RESTRICT roi)
{
    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);

        default:
            return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
    }
}

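/* YUV444 -> BGRX for one ROI. Same per-pixel kernel as above, but every row
 * has its own full-resolution U and V line, so 16 Y/U/V samples are consumed
 * per iteration. Note the aligned _mm_load_si128 loads: the wrapper below
 * verifies the alignment requirements before selecting this path. */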
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                                  const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                                  UINT32 dstStep,
                                                  const prim_size_t* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->width;
    const UINT32 nHeight = roi->height;
    const UINT32 pad = roi->width % 16;

    for (size_t y = 0; y < nHeight; y++)
    {
        __m128i* dst = (__m128i*)(pDst + dstStep * y);
        const BYTE* YData = pSrc[0] + y * srcStep[0];
        const BYTE* UData = pSrc[1] + y * srcStep[1];
        const BYTE* VData = pSrc[2] + y * srcStep[2];

        for (size_t x = 0; x < nWidth - pad; x += 16)
        {
            __m128i Y = _mm_load_si128((const __m128i*)YData);
            __m128i U = _mm_load_si128((const __m128i*)UData);
            __m128i V = _mm_load_si128((const __m128i*)VData);
            YData += 16;
            UData += 16;
            VData += 16;
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
        }

        for (size_t x = 0; x < pad; x++)
        {
            const BYTE Y = *YData++;
            const BYTE U = *UData++;
            const BYTE V = *VData++;
            const BYTE r = YUV2R(Y, U, V);
            const BYTE g = YUV2G(Y, U, V);
            const BYTE b = YUV2B(Y, U, V);
            dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
        }
    }

    return PRIMITIVES_SUCCESS;
}

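/* Dispatcher: the SSSE3 path requires 16-byte aligned plane pointers and
 * strides (because of the aligned loads above); anything else is delegated to
 * the generic implementation. */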
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                             UINT32 dstStep, UINT32 DstFormat,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
    if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
        srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
        return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

        default:
            return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
    }
}

/****************************************************************************/
/* SSSE3 RGB -> YUV420 conversion **/
/****************************************************************************/

#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
#define BGRX_U_FACTORS \
    _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
#define BGRX_V_FACTORS \
    _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
#define CONST128_FACTORS _mm_set1_epi8(-128)

#define Y_SHIFT 7
#define U_SHIFT 8
#define V_SHIFT 8

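/* The factor macros hold per-byte coefficients in BGRX order, one set per
 * pixel, so a single _mm_maddubs_epi16 + _mm_hadd_epi16 pair yields the
 * weighted sum for each pixel. Y uses a 1/128 scale (Y_SHIFT), U and V use
 * 1/256 (U_SHIFT/V_SHIFT) followed by a +128 offset, implemented as a
 * subtraction of CONST128_FACTORS. */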
/*
TODO:
RGB[AX] can simply be supported using the following factors. And instead of loading the
globals directly the functions below could be passed pointers to the correct vectors
depending on the source picture format.

PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
    27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0
};
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
    -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0
};
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
    64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0
};
*/

/* compute the luma (Y) component from a single rgb source line */

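/* Each _mm_maddubs_epi16 below multiplies the unsigned BGRX bytes with the
 * signed factors and adds adjacent pairs (B*9 + G*92 and R*27 + X*0); the
 * following _mm_hadd_epi16 adds those pairs, giving one 16-bit luma sum per
 * pixel before the final >> Y_SHIFT and byte packing. */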
static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
{
    __m128i x0;
    __m128i x1;
    __m128i x2;
    __m128i x3;
    const __m128i y_factors = BGRX_Y_FACTORS;
    const __m128i* argb = (const __m128i*)src;
    __m128i* ydst = (__m128i*)dst;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* store 16 rgba pixels in 4 128 bit registers */
        x0 = _mm_load_si128(argb++); // 1st 4 pixels
        x1 = _mm_load_si128(argb++); // 2nd 4 pixels
        x2 = _mm_load_si128(argb++); // 3rd 4 pixels
        x3 = _mm_load_si128(argb++); // 4th 4 pixels
        /* multiplications and subtotals */
        x0 = _mm_maddubs_epi16(x0, y_factors);
        x1 = _mm_maddubs_epi16(x1, y_factors);
        x2 = _mm_maddubs_epi16(x2, y_factors);
        x3 = _mm_maddubs_epi16(x3, y_factors);
        /* the total sums */
        x0 = _mm_hadd_epi16(x0, x1);
        x2 = _mm_hadd_epi16(x2, x3);
        /* shift the results */
        x0 = _mm_srli_epi16(x0, Y_SHIFT);
        x2 = _mm_srli_epi16(x2, Y_SHIFT);
        /* pack the 16 words into bytes */
        x0 = _mm_packus_epi16(x0, x2);
        /* save to y plane */
        _mm_storeu_si128(ydst++, x0);
    }
}

/* compute the chrominance (UV) components from two rgb source lines */

static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
                                             const BYTE* WINPR_RESTRICT src2,
                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
                                             UINT32 width)
{
    const __m128i u_factors = BGRX_U_FACTORS;
    const __m128i v_factors = BGRX_V_FACTORS;
    const __m128i vector128 = CONST128_FACTORS;
    __m128i x0;
    __m128i x1;
    __m128i x2;
    __m128i x3;
    __m128i x4;
    __m128i x5;
    const __m128i* rgb1 = (const __m128i*)src1;
    const __m128i* rgb2 = (const __m128i*)src2;
    __m64* udst = (__m64*)dst1;
    __m64* vdst = (__m64*)dst2;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* subsample 16x2 pixels into 16x1 pixels */
        x0 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x0 = _mm_avg_epu8(x0, x4);
        x1 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x1 = _mm_avg_epu8(x1, x4);
        x2 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x2 = _mm_avg_epu8(x2, x4);
        x3 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x3 = _mm_avg_epu8(x3, x4);
        /* subsample these 16x1 pixels into 8x1 pixels */
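        /* _mm_shuffle_ps with selector 0x88 keeps the even pixels of both
         * inputs and 0xdd keeps the odd pixels; averaging the two halves
         * completes the horizontal 2:1 subsampling. */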
        x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
        x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
        x0 = _mm_avg_epu8(x0, x4);
        x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
        x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
        x1 = _mm_avg_epu8(x1, x4);
        /* multiplications and subtotals */
        x2 = _mm_maddubs_epi16(x0, u_factors);
        x3 = _mm_maddubs_epi16(x1, u_factors);
        x4 = _mm_maddubs_epi16(x0, v_factors);
        x5 = _mm_maddubs_epi16(x1, v_factors);
        /* the total sums */
        x0 = _mm_hadd_epi16(x2, x3);
        x1 = _mm_hadd_epi16(x4, x5);
        /* shift the results */
        x0 = _mm_srai_epi16(x0, U_SHIFT);
        x1 = _mm_srai_epi16(x1, V_SHIFT);
        /* pack the 16 words into bytes */
        x0 = _mm_packs_epi16(x0, x1);
        /* add 128 */
        x0 = _mm_sub_epi8(x0, vector128);
        /* the lower 8 bytes go to the u plane */
        _mm_storel_pi(udst++, _mm_castsi128_ps(x0));
        /* the upper 8 bytes go to the v plane */
        _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
    }
}

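/* RGB (BGRX) -> YUV420 for a full ROI. The SSSE3 path needs a width that is a
 * multiple of 16 and a 16-byte aligned source with an aligned stride; other
 * inputs fall back to the generic primitive. Rows are processed in pairs, and
 * for an odd height the last line is fed twice into the UV subsampler. */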
static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                        const UINT32 dstStep[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
    const BYTE* argb = pSrc;
    BYTE* ydst = pDst[0];
    BYTE* udst = pDst[1];
    BYTE* vdst = pDst[2];

    if (roi->height < 1 || roi->width < 1)
    {
        return !PRIMITIVES_SUCCESS;
    }

    if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
    {
        return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
    }

    for (UINT32 y = 0; y < roi->height - 1; y += 2)
    {
        const BYTE* line1 = argb;
        const BYTE* line2 = argb + srcStep;
        ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
        ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
        ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
        argb += 2ULL * srcStep;
        ydst += 2ULL * dstStep[0];
        udst += 1ULL * dstStep[1];
        vdst += 1ULL * dstStep[2];
    }

    if (roi->height & 1)
    {
        /* pass the same last line of an odd height twice for UV */
        ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
        ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
    }

    return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
{
    switch (srcFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);

        default:
            return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
    }
}

/****************************************************************************/
/* SSSE3 RGB -> AVC444-YUV conversion **/
/****************************************************************************/

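/* Double-row worker for the AVC444 v1 packing. For every pair of source rows
 * it roughly produces:
 *   b1      main-view luma (even row and, if present, odd row)
 *   b2 / b3 main-view 2x2 averaged U / V
 *   b4 / b5 auxiliary-view luma area: full-resolution U / V of the odd row
 *   b6 / b7 auxiliary-view chroma: odd-column U / V of the even row
 * following the "YUV420p Stream Combination" rules referenced in the comments
 * below ([MS-RDPEGFX] 3.3.8.3.2). */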
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
{
    const __m128i* argbEven = (const __m128i*)srcEven;
    const __m128i* argbOdd = (const __m128i*)srcOdd;
    const __m128i y_factors = BGRX_Y_FACTORS;
    const __m128i u_factors = BGRX_U_FACTORS;
    const __m128i v_factors = BGRX_V_FACTORS;
    const __m128i vector128 = CONST128_FACTORS;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* store 16 rgba pixels in 4 128 bit registers */
        const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
        const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
        const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
        const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
        const __m128i xo1 = _mm_load_si128(argbOdd++);  // 1st 4 pixels
        const __m128i xo2 = _mm_load_si128(argbOdd++);  // 2nd 4 pixels
        const __m128i xo3 = _mm_load_si128(argbOdd++);  // 3rd 4 pixels
        const __m128i xo4 = _mm_load_si128(argbOdd++);  // 4th 4 pixels
        {
            /* Y: multiplications with subtotals and horizontal sums */
            const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                              _mm_maddubs_epi16(xe2, y_factors)),
                                               Y_SHIFT);
            const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                              _mm_maddubs_epi16(xe4, y_factors)),
                                               Y_SHIFT);
            const __m128i ye = _mm_packus_epi16(ye1, ye2);
            const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                              _mm_maddubs_epi16(xo2, y_factors)),
                                               Y_SHIFT);
            const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                              _mm_maddubs_epi16(xo4, y_factors)),
                                               Y_SHIFT);
            const __m128i yo = _mm_packus_epi16(yo1, yo2);
            /* store y [b1] */
            _mm_storeu_si128((__m128i*)b1Even, ye);
            b1Even += 16;

            if (b1Odd)
            {
                _mm_storeu_si128((__m128i*)b1Odd, yo);
                b1Odd += 16;
            }
        }
        {
            /* We have now
             * 16 even U values in ue
             * 16 odd U values in uo
             *
             * We need to split these according to
             * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
            __m128i ue;
            __m128i uo = { 0 };
            {
                const __m128i ue1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                                  _mm_maddubs_epi16(xe2, u_factors)),
                                   U_SHIFT);
                const __m128i ue2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                                  _mm_maddubs_epi16(xe4, u_factors)),
                                   U_SHIFT);
                ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
            }

            if (b1Odd)
            {
                const __m128i uo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                                  _mm_maddubs_epi16(xo2, u_factors)),
                                   U_SHIFT);
                const __m128i uo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                                  _mm_maddubs_epi16(xo4, u_factors)),
                                   U_SHIFT);
                uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
            }

            /* Now we need the following storage distribution:
             * 2x   2y    -> b2
             * x    2y+1  -> b4
             * 2x+1 2y    -> b6 */
            if (b1Odd) /* b2 */
            {
                const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
                const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
                const __m128i hi = _mm_add_epi16(ueh, uoh);
                const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
                const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
                const __m128i lo = _mm_add_epi16(uel, uol);
                const __m128i added = _mm_hadd_epi16(lo, hi);
                const __m128i avg16 = _mm_srai_epi16(added, 2);
                const __m128i avg = _mm_packus_epi16(avg16, avg16);
                _mm_storel_epi64((__m128i*)b2, avg);
            }
            else
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                const __m128i ud = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)b2, ud);
            }

            b2 += 8;

            if (b1Odd) /* b4 */
            {
                _mm_store_si128((__m128i*)b4, uo);
                b4 += 16;
            }

            {
                /* b6 */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i ude = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)b6, ude);
                b6 += 8;
            }
        }
        {
            /* We have now
             * 16 even V values in ve
             * 16 odd V values in vo
             *
             * We need to split these according to
             * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
            __m128i ve;
            __m128i vo = { 0 };
            {
                const __m128i ve1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                                  _mm_maddubs_epi16(xe2, v_factors)),
                                   V_SHIFT);
                const __m128i ve2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                                  _mm_maddubs_epi16(xe4, v_factors)),
                                   V_SHIFT);
                ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
            }

            if (b1Odd)
            {
                const __m128i vo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                                  _mm_maddubs_epi16(xo2, v_factors)),
                                   V_SHIFT);
                const __m128i vo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                                  _mm_maddubs_epi16(xo4, v_factors)),
                                   V_SHIFT);
                vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
            }

            /* Now we need the following storage distribution:
             * 2x   2y    -> b3
             * x    2y+1  -> b5
             * 2x+1 2y    -> b7 */
            if (b1Odd) /* b3 */
            {
                const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
                const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
                const __m128i hi = _mm_add_epi16(veh, voh);
                const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
                const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
                const __m128i lo = _mm_add_epi16(vel, vol);
                const __m128i added = _mm_hadd_epi16(lo, hi);
                const __m128i avg16 = _mm_srai_epi16(added, 2);
                const __m128i avg = _mm_packus_epi16(avg16, avg16);
                _mm_storel_epi64((__m128i*)b3, avg);
            }
            else
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                const __m128i vd = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)b3, vd);
            }

            b3 += 8;

            if (b1Odd) /* b5 */
            {
                _mm_store_si128((__m128i*)b5, vo);
                b5 += 16;
            }

            {
                /* b7 */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i vde = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)b7, vde);
                b7 += 8;
            }
        }
    }
}

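/* AVC444 v1 frame split. The auxiliary luma plane (pDst2[0]) is filled in
 * blocks of 16 rows: n = (i & ~7) + i places the odd-row U data (b4) in the
 * first 8 rows of a block and the odd-row V data (b5 = b4 + 8 * stride) in the
 * following 8 rows, which is the layout ssse3_ChromaV1ToYUV444 below expects
 * when reassembling the frame. */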
static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                           UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                           const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                           const UINT32 dst2Step[],
                                           const prim_size_t* WINPR_RESTRICT roi)
{
    const BYTE* pMaxSrc = pSrc + 1ULL * (roi->height - 1) * srcStep;

    if (roi->height < 1 || roi->width < 1)
        return !PRIMITIVES_SUCCESS;

    if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
        return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
                                       roi);

    for (size_t y = 0; y < roi->height; y += 2)
    {
        const BOOL last = (y >= (roi->height - 1));
        const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
        const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
        const size_t i = y >> 1;
        const size_t n = (i & (size_t)~7) + i;
        BYTE* b1Even = pDst1[0] + y * dst1Step[0];
        BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
        BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
        BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
        BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
        BYTE* b5 = b4 + 8ULL * dst2Step[0];
        BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
        BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
        ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
                                             roi->width);
    }

    return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                      const UINT32 dst2Step[],
                                      const prim_size_t* WINPR_RESTRICT roi)
{
    switch (srcFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                             dst2Step, roi);

        default:
            return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                           dst2Step, roi);
    }
}

/* Mapping of arguments:
 *
 * b1 [even lines] -> yLumaDstEven
 * b1 [odd lines]  -> yLumaDstOdd
 * b2              -> uLumaDst
 * b3              -> vLumaDst
 * b4              -> yChromaDst1
 * b5              -> yChromaDst2
 * b6              -> uChromaDst1
 * b7              -> uChromaDst2
 * b8              -> vChromaDst1
 * b9              -> vChromaDst2
 */
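/* Rough storage summary per 16-pixel group (see the distribution comments in
 * the body): odd-column U/V of the even source row go to yEvenChromaDst1/2,
 * odd-column U/V of the odd row go to yOddChromaDst1/2, the 2x2 averaged U/V
 * go to uLumaDst/vLumaDst, and the remaining odd-row samples are streamed in
 * 4-byte chunks to u/vChromaDst1 and u/vChromaDst2. */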
static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
{
    const __m128i vector128 = CONST128_FACTORS;
    const __m128i* argbEven = (const __m128i*)srcEven;
    const __m128i* argbOdd = (const __m128i*)srcOdd;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* store 16 rgba pixels in 4 128 bit registers
         * for even and odd rows.
         */
        const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
        const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
        const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
        const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
        const __m128i xo1 = _mm_load_si128(argbOdd++);  /* 1st 4 pixels */
        const __m128i xo2 = _mm_load_si128(argbOdd++);  /* 2nd 4 pixels */
        const __m128i xo3 = _mm_load_si128(argbOdd++);  /* 3rd 4 pixels */
        const __m128i xo4 = _mm_load_si128(argbOdd++);  /* 4th 4 pixels */
        {
            /* Y: multiplications with subtotals and horizontal sums */
            const __m128i y_factors = BGRX_Y_FACTORS;
            const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                              _mm_maddubs_epi16(xe2, y_factors)),
                                               Y_SHIFT);
            const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                              _mm_maddubs_epi16(xe4, y_factors)),
                                               Y_SHIFT);
            const __m128i ye = _mm_packus_epi16(ye1, ye2);
            /* store y [b1] */
            _mm_storeu_si128((__m128i*)yLumaDstEven, ye);
            yLumaDstEven += 16;
        }

        if (yLumaDstOdd)
        {
            const __m128i y_factors = BGRX_Y_FACTORS;
            const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                              _mm_maddubs_epi16(xo2, y_factors)),
                                               Y_SHIFT);
            const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                              _mm_maddubs_epi16(xo4, y_factors)),
                                               Y_SHIFT);
            const __m128i yo = _mm_packus_epi16(yo1, yo2);
            _mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
            yLumaDstOdd += 16;
        }

        {
            /* We have now
             * 16 even U values in ue
             * 16 odd U values in uo
             *
             * We need to split these according to
             * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
            /* U: multiplications with subtotals and horizontal sums */
            __m128i ue;
            __m128i uo;
            __m128i uavg;
            {
                const __m128i u_factors = BGRX_U_FACTORS;
                const __m128i ue1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                                  _mm_maddubs_epi16(xe2, u_factors)),
                                   U_SHIFT);
                const __m128i ue2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                                  _mm_maddubs_epi16(xe4, u_factors)),
                                   U_SHIFT);
                const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
                ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
                uavg = ueavg;
            }
            {
                const __m128i u_factors = BGRX_U_FACTORS;
                const __m128i uo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                                  _mm_maddubs_epi16(xo2, u_factors)),
                                   U_SHIFT);
                const __m128i uo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                                  _mm_maddubs_epi16(xo4, u_factors)),
                                   U_SHIFT);
                const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
                uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
                uavg = _mm_add_epi16(uavg, uoavg);
                uavg = _mm_srai_epi16(uavg, 2);
                uavg = _mm_packs_epi16(uavg, uoavg);
                uavg = _mm_sub_epi8(uavg, vector128);
            }
            /* Now we need the following storage distribution:
             * 2x   2y    -> uLumaDst
             * 2x+1  y    -> yChromaDst1
             * 4x   2y+1  -> uChromaDst1
             * 4x+2 2y+1  -> vChromaDst1 */
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i ude = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
                yEvenChromaDst1 += 8;
            }

            if (yLumaDstOdd)
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i udo /* codespell:ignore udo */ = _mm_shuffle_epi8(uo, mask);
                _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); // codespell:ignore udo
                yOddChromaDst1 += 8;
            }

            if (yLumaDstOdd)
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
                const __m128i ud = _mm_shuffle_epi8(uo, mask);
                int* uDst1 = (int*)uChromaDst1;
                int* vDst1 = (int*)vChromaDst1;
                const int* src = (const int*)&ud;
                _mm_stream_si32(uDst1, src[0]);
                _mm_stream_si32(vDst1, src[1]);
                uChromaDst1 += 4;
                vChromaDst1 += 4;
            }

            if (yLumaDstOdd)
            {
                _mm_storel_epi64((__m128i*)uLumaDst, uavg);
                uLumaDst += 8;
            }
            else
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                const __m128i ud = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)uLumaDst, ud);
                uLumaDst += 8;
            }
        }

        {
            /* V: multiplications with subtotals and horizontal sums */
            __m128i ve;
            __m128i vo;
            __m128i vavg;
            {
                const __m128i v_factors = BGRX_V_FACTORS;
                const __m128i ve1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                                  _mm_maddubs_epi16(xe2, v_factors)),
                                   V_SHIFT);
                const __m128i ve2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                                  _mm_maddubs_epi16(xe4, v_factors)),
                                   V_SHIFT);
                const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
                ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
                vavg = veavg;
            }
            {
                const __m128i v_factors = BGRX_V_FACTORS;
                const __m128i vo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                                  _mm_maddubs_epi16(xo2, v_factors)),
                                   V_SHIFT);
                const __m128i vo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                                  _mm_maddubs_epi16(xo4, v_factors)),
                                   V_SHIFT);
                const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
                vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
                vavg = _mm_add_epi16(vavg, voavg);
                vavg = _mm_srai_epi16(vavg, 2);
                vavg = _mm_packs_epi16(vavg, voavg);
                vavg = _mm_sub_epi8(vavg, vector128);
            }
            /* Now we need the following storage distribution:
             * 2x   2y    -> vLumaDst
             * 2x+1  y    -> yChromaDst2
             * 4x   2y+1  -> uChromaDst2
             * 4x+2 2y+1  -> vChromaDst2 */
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                __m128i vde = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
                yEvenChromaDst2 += 8;
            }

            if (yLumaDstOdd)
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                __m128i vdo = _mm_shuffle_epi8(vo, mask);
                _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
                yOddChromaDst2 += 8;
            }

            if (yLumaDstOdd)
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
                const __m128i vd = _mm_shuffle_epi8(vo, mask);
                int* uDst2 = (int*)uChromaDst2;
                int* vDst2 = (int*)vChromaDst2;
                const int* src = (const int*)&vd;
                _mm_stream_si32(uDst2, src[0]);
                _mm_stream_si32(vDst2, src[1]);
                uChromaDst2 += 4;
                vChromaDst2 += 4;
            }

            if (yLumaDstOdd)
            {
                _mm_storel_epi64((__m128i*)vLumaDst, vavg);
                vLumaDst += 8;
            }
            else
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                __m128i vd = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)vLumaDst, vd);
                vLumaDst += 8;
            }
        }
    }
}

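/* AVC444 v2 frame split: each auxiliary luma row stores odd-column U in its
 * left half and odd-column V in its right half (roi->width / 2 each), while
 * each auxiliary chroma plane row carries two quarter-width fields with the
 * remaining odd-row samples. ssse3_ChromaV2ToYUV444 below reads this layout
 * back. */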
static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                             UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                             const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                             const UINT32 dst2Step[],
                                             const prim_size_t* WINPR_RESTRICT roi)
{
    if (roi->height < 1 || roi->width < 1)
        return !PRIMITIVES_SUCCESS;

    if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
        return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
                                         roi);

    for (size_t y = 0; y < roi->height; y += 2)
    {
        const BYTE* srcEven = (pSrc + y * srcStep);
        const BYTE* srcOdd = (srcEven + srcStep);
        BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
        BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
        BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
        BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
        BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
        BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
        BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
        BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
        BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
        BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
        BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
        BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
        ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
                                               dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
                                               dstOddChromaY1, dstOddChromaY2, dstChromaU1,
                                               dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
    }

    return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                        const UINT32 dst2Step[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
    switch (srcFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                               dst2Step, roi);

        default:
            return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                             dst2Step, roi);
    }
}

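/* AVC444 decode helpers: LumaToYUV444 handles the B1-B3 planes of the main
 * view (copy Y, upsample the half-resolution U/V by duplicating each sample
 * horizontally and each line vertically); the Chroma* functions below fill in
 * the missing samples from the auxiliary view and then run the shared filter. */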
static pstatus_t ssse3_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[],
                                    BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 oddY = 1;
    const UINT32 evenY = 0;
    const UINT32 oddX = 1;
    const UINT32 evenX = 0;
    const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
                            pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
                            pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
    BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
                      pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
                      pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };

    /* Y data is already here... */
    /* B1 */
    for (size_t y = 0; y < nHeight; y++)
    {
        const BYTE* Ym = pSrc[0] + y * srcStep[0];
        BYTE* pY = pDst[0] + y * dstStep[0];
        memcpy(pY, Ym, nWidth);
    }

    /* The first half of U, V are already here part of this frame. */
    /* B2 and B3 */
    for (size_t y = 0; y < halfHeight; y++)
    {
        const size_t val2y = (2 * y + evenY);
        const size_t val2y1 = val2y + oddY;
        const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
        const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
        BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
        BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
        BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
        BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;

        size_t x = 0;
        for (; x < halfWidth - halfPad; x += 16)
        {
            const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
            const __m128i unpackLow =
                _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
                const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
                const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
                _mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
                _mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
                _mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
                _mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
            }
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
                const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
                const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
                _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
                _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
                _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
                _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
            }
        }

        for (; x < halfWidth; x++)
        {
            const size_t val2x = 2 * x + evenX;
            const size_t val2x1 = val2x + oddX;
            pU[val2x] = Um[x];
            pV[val2x] = Vm[x];
            pU[val2x1] = Um[x];
            pV[val2x1] = Vm[x];
            pU1[val2x] = Um[x];
            pV1[val2x] = Vm[x];
            pU1[val2x1] = Um[x];
            pV1[val2x1] = Vm[x];
        }
    }

    return PRIMITIVES_SUCCESS;
}

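/* ssse3_filter reconstructs the even-position chroma sample of a 2x2 block:
 * out(2x, 2y) = 4 * U(2x, 2y) - U(2x+1, 2y) - U(2x, 2y+1) - U(2x+1, 2y+1),
 * i.e. the vector counterpart of the scalar tail in ssse3_ChromaFilter (with
 * unsigned saturation instead of CONDITIONAL_CLIP); odd samples pass through. */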
static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
{
    const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8,
                                      (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0);
    const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
                                     (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
    const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
    const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
    const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
    const __m128i uEven = _mm_shuffle_epi8(u, even);
    const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
    const __m128i uOdd = _mm_shuffle_epi8(u, odd);
    const __m128i u1Even = _mm_shuffle_epi8(u1, even);
    const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
    const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
    const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
    const __m128i result = _mm_sub_epi16(uEven4, tmp2);
    const __m128i packed = _mm_packus_epi16(result, uOdd);
    const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
    _mm_storeu_si128((__m128i*)pSrcDst, interleaved);
}

static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 oddY = 1;
    const UINT32 evenY = 0;
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;

    /* Filter */
    for (size_t y = roi->top; y < halfHeight + roi->top; y++)
    {
        size_t x = roi->left;
        const size_t val2y = (y * 2ULL + evenY);
        const size_t val2y1 = val2y + oddY;
        BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
        BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
        BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
        BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;

        if (val2y1 > nHeight)
            continue;

        for (; x < halfWidth + roi->left - halfPad; x += 16)
        {
            ssse3_filter(&pU[2 * x], &pU1[2 * x]);
            ssse3_filter(&pV[2 * x], &pV1[2 * x]);
        }

        for (; x < halfWidth + roi->left; x++)
        {
            const size_t val2x = (x * 2ULL);
            const size_t val2x1 = val2x + 1ULL;
            const BYTE inU = pU[val2x];
            const BYTE inV = pV[val2x];
            const INT32 up = inU * 4;
            const INT32 vp = inV * 4;
            INT32 u2020 = 0;
            INT32 v2020 = 0;

            if (val2x1 > nWidth)
                continue;

            u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
            v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
            pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
            pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
        }
    }

    return PRIMITIVES_SUCCESS;
}

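/* ChromaV1: merge the auxiliary (v1) frame into the full-resolution chroma
 * planes. B4/B5 copy the odd destination rows of U and V out of the auxiliary
 * luma plane, which is organised in 16-row blocks (8 rows of U followed by
 * 8 rows of V, matching the n = (i & ~7) + i layout used by the encoder).
 * B6/B7 merge the auxiliary chroma planes into the even rows, after which the
 * shared chroma filter is applied. */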
static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 mod = 16;
    UINT32 uY = 0;
    UINT32 vY = 0;
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 oddY = 1;
    const UINT32 evenY = 0;
    const UINT32 oddX = 1;
    /* The auxiliary frame is aligned to multiples of 16x16.
     * We need the padded height for B4 and B5 conversion. */
    const UINT32 padHeight = nHeight + 16 - nHeight % 16;
    const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
                            pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
                            pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
    BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
                      pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
                      pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
    const __m128i zero = _mm_setzero_si128();
    const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                      (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);

    /* The second half of U and V is a bit more tricky... */
    /* B4 and B5 */
    for (size_t y = 0; y < padHeight; y++)
    {
        const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
        BYTE* pX = NULL;

        if ((y) % mod < (mod + 1) / 2)
        {
            const UINT32 pos = (2 * uY++ + oddY);

            if (pos >= nHeight)
                continue;

            pX = pDst[1] + 1ULL * dstStep[1] * pos;
        }
        else
        {
            const UINT32 pos = (2 * vY++ + oddY);

            if (pos >= nHeight)
                continue;

            pX = pDst[2] + 1ULL * dstStep[2] * pos;
        }

        memcpy(pX, Ya, nWidth);
    }

    /* B6 and B7 */
    for (size_t y = 0; y < halfHeight; y++)
    {
        const size_t val2y = (y * 2 + evenY);
        const BYTE* Ua = pSrc[1] + srcStep[1] * y;
        const BYTE* Va = pSrc[2] + srcStep[2] * y;
        BYTE* pU = pDst[1] + dstStep[1] * val2y;
        BYTE* pV = pDst[2] + dstStep[2] * val2y;

        size_t x = 0;
        for (; x < halfWidth - halfPad; x += 16)
        {
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
                const __m128i u2 = _mm_unpackhi_epi8(u, zero);
                const __m128i u1 = _mm_unpacklo_epi8(u, zero);
                _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
                _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
            }
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
                const __m128i u2 = _mm_unpackhi_epi8(u, zero);
                const __m128i u1 = _mm_unpacklo_epi8(u, zero);
                _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
                _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
            }
        }

        for (; x < halfWidth; x++)
        {
            const size_t val2x1 = (x * 2ULL + oddX);
            pU[val2x1] = Ua[x];
            pV[val2x1] = Va[x];
        }
    }

    /* Filter */
    return ssse3_ChromaFilter(pDst, dstStep, roi);
}

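/* ChromaV2: merge the auxiliary (v2) frame. The auxiliary luma plane supplies
 * the odd-column U (left half) and V (right half) for every row; the auxiliary
 * chroma planes are split into quarter-width fields that provide the remaining
 * U/V samples of the odd rows (columns 4x and 4x+2). The shared chroma filter
 * finishes the reconstruction. */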
static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                        UINT32 nTotalWidth, UINT32 nTotalHeight,
                                        BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 quarterWidth = (nWidth + 3) / 4;
    const UINT32 quarterPad = quarterWidth % 16;
    const __m128i zero = _mm_setzero_si128();
    const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                      (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
    const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
                                       0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
    const __m128i shuffle1 =
        _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
                     (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
    const __m128i shuffle2 =
        _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
                     (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);

    /* B4 and B5: odd UV values for width/2, height */
    for (size_t y = 0; y < nHeight; y++)
    {
        const size_t yTop = y + roi->top;
        const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
        const BYTE* pYaV = pYaU + nTotalWidth / 2;
        BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
        BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;

        size_t x = 0;
        for (; x < halfWidth - halfPad; x += 16)
        {
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
                const __m128i u2 = _mm_unpackhi_epi8(zero, u);
                const __m128i u1 = _mm_unpacklo_epi8(zero, u);
                _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
                _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
            }
            {
                const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
                const __m128i v2 = _mm_unpackhi_epi8(zero, v);
                const __m128i v1 = _mm_unpacklo_epi8(zero, v);
                _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
                _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
            }
        }

        for (; x < halfWidth; x++)
        {
            const size_t odd = 2ULL * x + 1;
            pU[odd] = pYaU[x];
            pV[odd] = pYaV[x];
        }
    }

    /* B6 - B9 */
    for (size_t y = 0; y < halfHeight; y++)
    {
        const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
        const BYTE* pUaV = pUaU + nTotalWidth / 4;
        const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
        const BYTE* pVaV = pVaU + nTotalWidth / 4;
        BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
        BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;

        UINT32 x = 0;
        for (; x < quarterWidth - quarterPad; x += 16)
        {
            {
                const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
                const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
                const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
                const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
                const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
                const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
                const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
                const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
                _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
                _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
                _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
                _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
            }
            {
                const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
                const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
                const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
                const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
                const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
                const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
                const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
                const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
                _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
                _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
                _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
                _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
            }
        }

        for (; x < quarterWidth; x++)
        {
            pU[4 * x + 0] = pUaU[x];
            pV[4 * x + 0] = pUaV[x];
            pU[4 * x + 2] = pVaU[x];
            pV[4 * x + 2] = pVaV[x];
        }
    }

    return ssse3_ChromaFilter(pDst, dstStep, roi);
}

static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
                                             const BYTE* WINPR_RESTRICT pSrc[3],
                                             const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
                                             BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                             const RECTANGLE_16* WINPR_RESTRICT roi)
{
    if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
        return -1;

    if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
        return -1;

    if (!roi)
        return -1;

    switch (type)
    {
        case AVC444_LUMA:
            return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);

        case AVC444_CHROMAv1:
            return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);

        case AVC444_CHROMAv2:
            return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);

        default:
            return -1;
    }
}
#endif

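/* Runtime registration: the generic implementations are installed first via
 * primitives_init_YUV, then overridden with the SSSE3 variants when the CPU
 * reports SSE3/SSSE3 support. */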
void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    generic = primitives_get_generic();
    primitives_init_YUV(prims);

    if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
        prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
        prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
        prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
        prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
        prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
        prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
    }
#else
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3 intrinsics not available");
    WINPR_UNUSED(prims);
#endif
}