FreeRDP
prim_YUV_ssse3.c
23 #include <winpr/wtypes.h>
24 #include <freerdp/config.h>
25 
26 #include <winpr/sysinfo.h>
27 #include <winpr/crt.h>
28 #include <freerdp/types.h>
29 #include <freerdp/primitives.h>
30 
31 #include "prim_internal.h"
32 #include "prim_YUV.h"
33 
34 #if defined(SSE2_ENABLED)
35 #include <emmintrin.h>
36 #include <tmmintrin.h>
37 
38 static primitives_t* generic = NULL;
39 
40 /****************************************************************************/
41 /* SSSE3 YUV420 -> RGB conversion */
42 /****************************************************************************/
43 static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
44  __m128i Vraw, UINT8 pos)
45 {
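 /* Convert four pixels (selected by 'pos') from planar Y/U/V to packed BGRX.
  * The shuffle tables below place each Y byte in the second byte of a 32-bit
  * lane (i.e. scale it by 256) and zero-extend U/V to 16 bit; the code then
  * evaluates the fixed-point formulas
  *   R = (Y * 256 + 403 * (V - 128)) >> 8
  *   G = (Y * 256 -  48 * (U - 128) - 120 * (V - 128)) >> 8
  *   B = (Y * 256 + 475 * (U - 128)) >> 8
  * and packs the results into *dst while preserving its existing X/alpha byte. */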
46  /* Visual Studio 2010 doesn't like _mm_set_epi32 in an array initializer list */
47  /* Note: This also applies to Visual Studio 2013 before Update 4 */
48 #if !defined(_MSC_VER) || (_MSC_VER > 1600)
49  const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
50  _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
51  _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
52  _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
53  const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
54  _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
55  _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
56  _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
57  const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
58  _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
59  _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
60 #else
61  /* Note: must be in little-endian format! */
62  const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
63  0x80, 0x80, 0x03, 0x80, 0x80 },
64  { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80,
65  0x80, 0x80, 0x07, 0x80, 0x80 },
66  { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80,
67  0x80, 0x80, 0x0b, 0x80, 0x80 },
68  { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80,
69  0x80, 0x80, 0x0f, 0x80, 0x80 }
70 
71  };
72  const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01,
73  0x80, 0x02, 0x80, 0x03, 0x80 },
74  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05,
75  0x80, 0x06, 0x80, 0x07, 0x80 },
76  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09,
77  0x80, 0x0a, 0x80, 0x0b, 0x80 },
78  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d,
79  0x80, 0x0e, 0x80, 0x0f, 0x80 } };
80  const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02,
81  0x80, 0x80, 0x80, 0x03, 0x80 },
82  { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
83  0x80, 0x80, 0x03, 0x80, 0x80 },
84  { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80,
85  0x80, 0x03, 0x80, 0x80, 0x80 } };
86 #endif
87  const __m128i c128 = _mm_set1_epi16(128);
88  __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
89  _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
90  {
91  __m128i C;
92  __m128i D;
93  __m128i E;
94  /* Load Y values and expand to 32 bit */
95  {
96  C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
97  }
98  /* Load U values and expand to 32 bit */
99  {
100  const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
101  D = _mm_sub_epi16(U, c128); /* D = U - 128 */
102  }
103  /* Load V values and expand to 32 bit */
104  {
105  const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
106  E = _mm_sub_epi16(V, c128); /* E = V - 128 */
107  }
108  /* Get the R value */
109  {
110  const __m128i c403 = _mm_set1_epi16(403);
111  const __m128i e403 =
112  _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
113  const __m128i Rs = _mm_add_epi32(C, e403);
114  const __m128i R32 = _mm_srai_epi32(Rs, 8);
115  const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
116  const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
117  const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
118  BGRX = _mm_or_si128(BGRX, packed);
119  }
120  /* Get the G value */
121  {
122  const __m128i c48 = _mm_set1_epi16(48);
123  const __m128i d48 =
124  _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
125  const __m128i c120 = _mm_set1_epi16(120);
126  const __m128i e120 =
127  _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
128  const __m128i de = _mm_add_epi32(d48, e120);
129  const __m128i Gs = _mm_sub_epi32(C, de);
130  const __m128i G32 = _mm_srai_epi32(Gs, 8);
131  const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
132  const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
133  const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
134  BGRX = _mm_or_si128(BGRX, packed);
135  }
136  /* Get the B value */
137  {
138  const __m128i c475 = _mm_set1_epi16(475);
139  const __m128i d475 =
140  _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
141  const __m128i Bs = _mm_add_epi32(C, d475);
142  const __m128i B32 = _mm_srai_epi32(Bs, 8);
143  const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
144  const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
145  const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
146  BGRX = _mm_or_si128(BGRX, packed);
147  }
148  }
149  _mm_storeu_si128(dst++, BGRX);
150  return dst;
151 }
152 
153 static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
154  const UINT32* WINPR_RESTRICT srcStep,
155  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
156  const prim_size_t* WINPR_RESTRICT roi)
157 {
158  const UINT32 nWidth = roi->width;
159  const UINT32 nHeight = roi->height;
160  const UINT32 pad = roi->width % 16;
161  const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
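 /* 'duplicate' replicates each of the 8 low U/V bytes so that 8 subsampled
  * 4:2:0 chroma samples cover 16 output pixels per iteration; vertically the
  * same chroma row is reused for two consecutive luma rows (the y / 2 indexing
  * of UData and VData below). */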
162 
163  for (size_t y = 0; y < nHeight; y++)
164  {
165  __m128i* dst = (__m128i*)(pDst + dstStep * y);
166  const BYTE* YData = pSrc[0] + y * srcStep[0];
167  const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
168  const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
169 
170  for (UINT32 x = 0; x < nWidth - pad; x += 16)
171  {
172  const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
173  const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
174  const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
175  const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
176  const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
177  YData += 16;
178  UData += 8;
179  VData += 8;
180  dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
181  dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
182  dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
183  dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
184  }
185 
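 /* Convert the remaining (width % 16) pixels with plain scalar code; U and V
  * only advance on every second pixel because of the 4:2:0 subsampling. */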
186  for (UINT32 x = 0; x < pad; x++)
187  {
188  const BYTE Y = *YData++;
189  const BYTE U = *UData;
190  const BYTE V = *VData;
191  const BYTE r = YUV2R(Y, U, V);
192  const BYTE g = YUV2G(Y, U, V);
193  const BYTE b = YUV2B(Y, U, V);
194  dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
195 
196  if (x % 2)
197  {
198  UData++;
199  VData++;
200  }
201  }
202  }
203 
204  return PRIMITIVES_SUCCESS;
205 }
206 
207 static pstatus_t ssse3_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
208  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
209  const prim_size_t* WINPR_RESTRICT roi)
210 {
211  switch (DstFormat)
212  {
213  case PIXEL_FORMAT_BGRX32:
214  case PIXEL_FORMAT_BGRA32:
215  return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
216 
217  default:
218  return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
219  }
220 }
221 
222 static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
223  const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
224  UINT32 dstStep,
225  const prim_size_t* WINPR_RESTRICT roi)
226 {
227  const UINT32 nWidth = roi->width;
228  const UINT32 nHeight = roi->height;
229  const UINT32 pad = roi->width % 16;
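 /* The caller (ssse3_YUV444ToRGB_8u_P3AC4R) has already verified 16-byte
  * alignment of the source planes and strides, so the aligned loads below are
  * safe; the destination is written with unaligned stores inside
  * ssse3_YUV444Pixel. */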
230 
231  for (size_t y = 0; y < nHeight; y++)
232  {
233  __m128i* dst = (__m128i*)(pDst + dstStep * y);
234  const BYTE* YData = pSrc[0] + y * srcStep[0];
235  const BYTE* UData = pSrc[1] + y * srcStep[1];
236  const BYTE* VData = pSrc[2] + y * srcStep[2];
237 
238  for (size_t x = 0; x < nWidth - pad; x += 16)
239  {
240  __m128i Y = _mm_load_si128((const __m128i*)YData);
241  __m128i U = _mm_load_si128((const __m128i*)UData);
242  __m128i V = _mm_load_si128((const __m128i*)VData);
243  YData += 16;
244  UData += 16;
245  VData += 16;
246  dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
247  dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
248  dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
249  dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
250  }
251 
252  for (size_t x = 0; x < pad; x++)
253  {
254  const BYTE Y = *YData++;
255  const BYTE U = *UData++;
256  const BYTE V = *VData++;
257  const BYTE r = YUV2R(Y, U, V);
258  const BYTE g = YUV2G(Y, U, V);
259  const BYTE b = YUV2B(Y, U, V);
260  dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
261  }
262  }
263 
264  return PRIMITIVES_SUCCESS;
265 }
266 
267 static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
268  const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
269  UINT32 dstStep, UINT32 DstFormat,
270  const prim_size_t* WINPR_RESTRICT roi)
271 {
272  if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
273  srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
274  return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
275 
276  switch (DstFormat)
277  {
278  case PIXEL_FORMAT_BGRX32:
279  case PIXEL_FORMAT_BGRA32:
280  return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
281 
282  default:
283  return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
284  }
285 }
286 
287 /****************************************************************************/
288 /* SSSE3 RGB -> YUV420 conversion */
289 /****************************************************************************/
290 
312 #define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
313 #define BGRX_U_FACTORS \
314  _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
315 #define BGRX_V_FACTORS \
316  _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
317 #define CONST128_FACTORS _mm_set1_epi8(-128)
318 
319 #define Y_SHIFT 7
320 #define U_SHIFT 8
321 #define V_SHIFT 8
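/* Each *_FACTORS vector holds one signed 8-bit coefficient per source byte in
 * B, G, R, X order, repeated for four BGRX pixels.  _mm_maddubs_epi16 followed
 * by _mm_hadd_epi16 yields the weighted sum per pixel, which is shifted right
 * by *_SHIFT to remove the fixed-point scale (128 for Y, 256 for U and V).
 * The Y weights (9, 92, 27)/128 for B, G, R roughly match the BT.709 luma
 * coefficients. */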
322 
323 /*
324 TODO:
325 RGB[AX] can simply be supported using the following factors. And instead of loading the
326 globals directly the functions below could be passed pointers to the correct vectors
327 depending on the source picture format.
328 
329 PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
330  27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0
331 };
332 PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
333  -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0
334 };
335 PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
336  64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0
337 };
338 */
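/* A minimal sketch (not part of the current implementation) of how the factor
 * selection described in the TODO above could look.  The helper name and the
 * rgbx_y_factors table are hypothetical; only the BGRX path below exists.
 *
 *   static INLINE __m128i load_y_factors(UINT32 srcFormat)
 *   {
 *       switch (srcFormat)
 *       {
 *           case PIXEL_FORMAT_RGBX32:
 *           case PIXEL_FORMAT_RGBA32:
 *               return _mm_load_si128((const __m128i*)rgbx_y_factors);
 *           default:
 *               return BGRX_Y_FACTORS;
 *       }
 *   }
 */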
339 
340 /* compute the luma (Y) component from a single rgb source line */
341 
342 static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
343 {
344  __m128i x0;
345  __m128i x1;
346  __m128i x2;
347  __m128i x3;
348  const __m128i y_factors = BGRX_Y_FACTORS;
349  const __m128i* argb = (const __m128i*)src;
350  __m128i* ydst = (__m128i*)dst;
351 
352  for (UINT32 x = 0; x < width; x += 16)
353  {
354  /* store 16 rgba pixels in 4 128 bit registers */
355  x0 = _mm_load_si128(argb++); // 1st 4 pixels
356  x1 = _mm_load_si128(argb++); // 2nd 4 pixels
357  x2 = _mm_load_si128(argb++); // 3rd 4 pixels
358  x3 = _mm_load_si128(argb++); // 4th 4 pixels
359  /* multiplications and subtotals */
360  x0 = _mm_maddubs_epi16(x0, y_factors);
361  x1 = _mm_maddubs_epi16(x1, y_factors);
362  x2 = _mm_maddubs_epi16(x2, y_factors);
363  x3 = _mm_maddubs_epi16(x3, y_factors);
364  /* the total sums */
365  x0 = _mm_hadd_epi16(x0, x1);
366  x2 = _mm_hadd_epi16(x2, x3);
367  /* shift the results */
368  x0 = _mm_srli_epi16(x0, Y_SHIFT);
369  x2 = _mm_srli_epi16(x2, Y_SHIFT);
370  /* pack the 16 words into bytes */
371  x0 = _mm_packus_epi16(x0, x2);
372  /* save to y plane */
373  _mm_storeu_si128(ydst++, x0);
374  }
375 }
376 
377 /* compute the chrominance (UV) components from two rgb source lines */
378 
379 static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
380  const BYTE* WINPR_RESTRICT src2,
381  BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
382  UINT32 width)
383 {
384  const __m128i u_factors = BGRX_U_FACTORS;
385  const __m128i v_factors = BGRX_V_FACTORS;
386  const __m128i vector128 = CONST128_FACTORS;
387  __m128i x0;
388  __m128i x1;
389  __m128i x2;
390  __m128i x3;
391  __m128i x4;
392  __m128i x5;
393  const __m128i* rgb1 = (const __m128i*)src1;
394  const __m128i* rgb2 = (const __m128i*)src2;
395  __m64* udst = (__m64*)dst1;
396  __m64* vdst = (__m64*)dst2;
397 
398  for (UINT32 x = 0; x < width; x += 16)
399  {
400  /* subsample 16x2 pixels into 16x1 pixels */
401  x0 = _mm_load_si128(rgb1++);
402  x4 = _mm_load_si128(rgb2++);
403  x0 = _mm_avg_epu8(x0, x4);
404  x1 = _mm_load_si128(rgb1++);
405  x4 = _mm_load_si128(rgb2++);
406  x1 = _mm_avg_epu8(x1, x4);
407  x2 = _mm_load_si128(rgb1++);
408  x4 = _mm_load_si128(rgb2++);
409  x2 = _mm_avg_epu8(x2, x4);
410  x3 = _mm_load_si128(rgb1++);
411  x4 = _mm_load_si128(rgb2++);
412  x3 = _mm_avg_epu8(x3, x4);
413  /* subsample these 16x1 pixels into 8x1 pixels */
419  x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
420  x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
421  x0 = _mm_avg_epu8(x0, x4);
422  x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
423  x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
424  x1 = _mm_avg_epu8(x1, x4);
425  /* multiplications and subtotals */
426  x2 = _mm_maddubs_epi16(x0, u_factors);
427  x3 = _mm_maddubs_epi16(x1, u_factors);
428  x4 = _mm_maddubs_epi16(x0, v_factors);
429  x5 = _mm_maddubs_epi16(x1, v_factors);
430  /* the total sums */
431  x0 = _mm_hadd_epi16(x2, x3);
432  x1 = _mm_hadd_epi16(x4, x5);
433  /* shift the results */
434  x0 = _mm_srai_epi16(x0, U_SHIFT);
435  x1 = _mm_srai_epi16(x1, V_SHIFT);
436  /* pack the 16 words into bytes */
437  x0 = _mm_packs_epi16(x0, x1);
438  /* add 128 */
439  x0 = _mm_sub_epi8(x0, vector128);
440  /* the lower 8 bytes go to the u plane */
441  _mm_storel_pi(udst++, _mm_castsi128_ps(x0));
442  /* the upper 8 bytes go to the v plane */
443  _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
444  }
445 }
446 
447 static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
448  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
449  const UINT32 dstStep[],
450  const prim_size_t* WINPR_RESTRICT roi)
451 {
452  const BYTE* argb = pSrc;
453  BYTE* ydst = pDst[0];
454  BYTE* udst = pDst[1];
455  BYTE* vdst = pDst[2];
456 
457  if (roi->height < 1 || roi->width < 1)
458  {
459  return !PRIMITIVES_SUCCESS;
460  }
461 
462  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
463  {
464  return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
465  }
466 
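 /* Two source rows are consumed per iteration: the row pair is averaged into
  * one U and one V row, and each source row also produces one full Y row.
  * A leftover last row for odd heights is handled after the loop. */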
467  for (UINT32 y = 0; y < roi->height - 1; y += 2)
468  {
469  const BYTE* line1 = argb;
470  const BYTE* line2 = argb + srcStep;
471  ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
472  ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
473  ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
474  argb += 2ULL * srcStep;
475  ydst += 2ULL * dstStep[0];
476  udst += 1ULL * dstStep[1];
477  vdst += 1ULL * dstStep[2];
478  }
479 
480  if (roi->height & 1)
481  {
482  /* for an odd height, pass the last line twice to the UV computation */
483  ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
484  ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
485  }
486 
487  return PRIMITIVES_SUCCESS;
488 }
489 
490 static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
491  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
492  const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
493 {
494  switch (srcFormat)
495  {
496  case PIXEL_FORMAT_BGRX32:
497  case PIXEL_FORMAT_BGRA32:
498  return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
499 
500  default:
501  return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
502  }
503 }
504 
505 /****************************************************************************/
506 /* SSSE3 RGB -> AVC444-YUV conversion */
507 /****************************************************************************/
508 
509 static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
510  const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
511  BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
512  BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
513  BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
514 {
515  const __m128i* argbEven = (const __m128i*)srcEven;
516  const __m128i* argbOdd = (const __m128i*)srcOdd;
517  const __m128i y_factors = BGRX_Y_FACTORS;
518  const __m128i u_factors = BGRX_U_FACTORS;
519  const __m128i v_factors = BGRX_V_FACTORS;
520  const __m128i vector128 = CONST128_FACTORS;
521 
522  for (UINT32 x = 0; x < width; x += 16)
523  {
524  /* store 16 rgba pixels in 4 128 bit registers */
525  const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
526  const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
527  const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
528  const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
529  const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
530  const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
531  const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
532  const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
533  {
534  /* Y: multiplications with subtotals and horizontal sums */
535  const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
536  _mm_maddubs_epi16(xe2, y_factors)),
537  Y_SHIFT);
538  const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
539  _mm_maddubs_epi16(xe4, y_factors)),
540  Y_SHIFT);
541  const __m128i ye = _mm_packus_epi16(ye1, ye2);
542  const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
543  _mm_maddubs_epi16(xo2, y_factors)),
544  Y_SHIFT);
545  const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
546  _mm_maddubs_epi16(xo4, y_factors)),
547  Y_SHIFT);
548  const __m128i yo = _mm_packus_epi16(yo1, yo2);
549  /* store y [b1] */
550  _mm_storeu_si128((__m128i*)b1Even, ye);
551  b1Even += 16;
552 
553  if (b1Odd)
554  {
555  _mm_storeu_si128((__m128i*)b1Odd, yo);
556  b1Odd += 16;
557  }
558  }
559  {
560  /* We now have
561  * 16 even U values in ue
562  * 16 odd U values in uo
563  *
564  * We need to split these according to
565  * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
566  __m128i ue;
567  __m128i uo = { 0 };
568  {
569  const __m128i ue1 =
570  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
571  _mm_maddubs_epi16(xe2, u_factors)),
572  U_SHIFT);
573  const __m128i ue2 =
574  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
575  _mm_maddubs_epi16(xe4, u_factors)),
576  U_SHIFT);
577  ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
578  }
579 
580  if (b1Odd)
581  {
582  const __m128i uo1 =
583  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
584  _mm_maddubs_epi16(xo2, u_factors)),
585  U_SHIFT);
586  const __m128i uo2 =
587  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
588  _mm_maddubs_epi16(xo4, u_factors)),
589  U_SHIFT);
590  uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
591  }
592 
593  /* Now we need the following storage distribution:
594  * 2x 2y -> b2
595  * x 2y+1 -> b4
596  * 2x+1 2y -> b6 */
597  if (b1Odd) /* b2 */
598  {
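 /* Average each 2x2 block of U samples (even + odd row, adjacent columns)
  * and store the eight resulting bytes to b2, the U plane of the first
  * destination frame. */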
599  const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
600  const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
601  const __m128i hi = _mm_add_epi16(ueh, uoh);
602  const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
603  const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
604  const __m128i lo = _mm_add_epi16(uel, uol);
605  const __m128i added = _mm_hadd_epi16(lo, hi);
606  const __m128i avg16 = _mm_srai_epi16(added, 2);
607  const __m128i avg = _mm_packus_epi16(avg16, avg16);
608  _mm_storel_epi64((__m128i*)b2, avg);
609  }
610  else
611  {
612  const __m128i mask =
613  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
614  (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
615  const __m128i ud = _mm_shuffle_epi8(ue, mask);
616  _mm_storel_epi64((__m128i*)b2, ud);
617  }
618 
619  b2 += 8;
620 
621  if (b1Odd) /* b4 */
622  {
623  _mm_store_si128((__m128i*)b4, uo);
624  b4 += 16;
625  }
626 
627  {
628  /* b6 */
629  const __m128i mask =
630  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
631  (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
632  const __m128i ude = _mm_shuffle_epi8(ue, mask);
633  _mm_storel_epi64((__m128i*)b6, ude);
634  b6 += 8;
635  }
636  }
637  {
638  /* We now have
639  * 16 even V values in ve
640  * 16 odd V values in vo
641  *
642  * We need to split these according to
643  * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
644  __m128i ve;
645  __m128i vo = { 0 };
646  {
647  const __m128i ve1 =
648  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
649  _mm_maddubs_epi16(xe2, v_factors)),
650  V_SHIFT);
651  const __m128i ve2 =
652  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
653  _mm_maddubs_epi16(xe4, v_factors)),
654  V_SHIFT);
655  ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
656  }
657 
658  if (b1Odd)
659  {
660  const __m128i vo1 =
661  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
662  _mm_maddubs_epi16(xo2, v_factors)),
663  V_SHIFT);
664  const __m128i vo2 =
665  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
666  _mm_maddubs_epi16(xo4, v_factors)),
667  V_SHIFT);
668  vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
669  }
670 
671  /* Now we need the following storage distribution:
672  * 2x 2y -> b3
673  * x 2y+1 -> b5
674  * 2x+1 2y -> b7 */
675  if (b1Odd) /* b3 */
676  {
677  const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
678  const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
679  const __m128i hi = _mm_add_epi16(veh, voh);
680  const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
681  const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
682  const __m128i lo = _mm_add_epi16(vel, vol);
683  const __m128i added = _mm_hadd_epi16(lo, hi);
684  const __m128i avg16 = _mm_srai_epi16(added, 2);
685  const __m128i avg = _mm_packus_epi16(avg16, avg16);
686  _mm_storel_epi64((__m128i*)b3, avg);
687  }
688  else
689  {
690  const __m128i mask =
691  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
692  (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
693  const __m128i vd = _mm_shuffle_epi8(ve, mask);
694  _mm_storel_epi64((__m128i*)b3, vd);
695  }
696 
697  b3 += 8;
698 
699  if (b1Odd) /* b5 */
700  {
701  _mm_store_si128((__m128i*)b5, vo);
702  b5 += 16;
703  }
704 
705  {
706  /* b7 */
707  const __m128i mask =
708  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
709  (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
710  const __m128i vde = _mm_shuffle_epi8(ve, mask);
711  _mm_storel_epi64((__m128i*)b7, vde);
712  b7 += 8;
713  }
714  }
715  }
716 }
717 
718 static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
719  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
720  const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
721  const UINT32 dst2Step[],
722  const prim_size_t* WINPR_RESTRICT roi)
723 {
724  const BYTE* pMaxSrc = pSrc + 1ULL * (roi->height - 1) * srcStep;
725 
726  if (roi->height < 1 || roi->width < 1)
727  return !PRIMITIVES_SUCCESS;
728 
729  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
730  return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
731  roi);
732 
733  for (size_t y = 0; y < roi->height; y += 2)
734  {
735  const BOOL last = (y >= (roi->height - 1));
736  const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
737  const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
738  const size_t i = y >> 1;
739  const size_t n = (i & ~7) + i;
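 /* i is the half-row index; n = (i & ~7) + i spreads the auxiliary rows into
  * 16-row blocks: the first 8 rows of each block receive b4 (odd-row U), the
  * next 8 receive b5 (odd-row V), since b5 = b4 + 8 * dst2Step[0]. */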
740  BYTE* b1Even = pDst1[0] + y * dst1Step[0];
741  BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
742  BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
743  BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
744  BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
745  BYTE* b5 = b4 + 8ULL * dst2Step[0];
746  BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
747  BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
748  ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
749  roi->width);
750  }
751 
752  return PRIMITIVES_SUCCESS;
753 }
754 
755 static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
756  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
757  const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
758  const UINT32 dst2Step[],
759  const prim_size_t* WINPR_RESTRICT roi)
760 {
761  switch (srcFormat)
762  {
763  case PIXEL_FORMAT_BGRX32:
764  case PIXEL_FORMAT_BGRA32:
765  return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
766  dst2Step, roi);
767 
768  default:
769  return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
770  dst2Step, roi);
771  }
772 }
773 
774 /* Mapping of arguments:
775  *
776  * b1 [even lines] -> yLumaDstEven
777  * b1 [odd lines] -> yLumaDstOdd
778  * b2 -> uLumaDst
779  * b3 -> vLumaDst
780  * b4 -> yChromaDst1
781  * b5 -> yChromaDst2
782  * b6 -> uChromaDst1
783  * b7 -> uChromaDst2
784  * b8 -> vChromaDst1
785  * b9 -> vChromaDst2
786  */
787 static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
788  const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
789  BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
790  BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
791  BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
792  BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
793  BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
794  BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
795 {
796  const __m128i vector128 = CONST128_FACTORS;
797  const __m128i* argbEven = (const __m128i*)srcEven;
798  const __m128i* argbOdd = (const __m128i*)srcOdd;
799 
800  for (UINT32 x = 0; x < width; x += 16)
801  {
802  /* store 16 rgba pixels in 4 128 bit registers
803  * for even and odd rows.
804  */
805  const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
806  const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
807  const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
808  const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
809  const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */
810  const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */
811  const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */
812  const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */
813  {
814  /* Y: multiplications with subtotals and horizontal sums */
815  const __m128i y_factors = BGRX_Y_FACTORS;
816  const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
817  _mm_maddubs_epi16(xe2, y_factors)),
818  Y_SHIFT);
819  const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
820  _mm_maddubs_epi16(xe4, y_factors)),
821  Y_SHIFT);
822  const __m128i ye = _mm_packus_epi16(ye1, ye2);
823  /* store y [b1] */
824  _mm_storeu_si128((__m128i*)yLumaDstEven, ye);
825  yLumaDstEven += 16;
826  }
827 
828  if (yLumaDstOdd)
829  {
830  const __m128i y_factors = BGRX_Y_FACTORS;
831  const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
832  _mm_maddubs_epi16(xo2, y_factors)),
833  Y_SHIFT);
834  const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
835  _mm_maddubs_epi16(xo4, y_factors)),
836  Y_SHIFT);
837  const __m128i yo = _mm_packus_epi16(yo1, yo2);
838  _mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
839  yLumaDstOdd += 16;
840  }
841 
842  {
843  /* We now have
844  * 16 even U values in ue
845  * 16 odd U values in uo
846  *
847  * We need to split these according to
848  * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
849  /* U: multiplications with subtotals and horizontal sums */
850  __m128i ue;
851  __m128i uo;
852  __m128i uavg;
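 /* uavg accumulates the 2x2 block average of U; when an odd source row exists
  * it is stored to uLumaDst as the 4:2:0 U sample of the first destination
  * frame. */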
853  {
854  const __m128i u_factors = BGRX_U_FACTORS;
855  const __m128i ue1 =
856  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
857  _mm_maddubs_epi16(xe2, u_factors)),
858  U_SHIFT);
859  const __m128i ue2 =
860  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
861  _mm_maddubs_epi16(xe4, u_factors)),
862  U_SHIFT);
863  const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
864  ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
865  uavg = ueavg;
866  }
867  {
868  const __m128i u_factors = BGRX_U_FACTORS;
869  const __m128i uo1 =
870  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
871  _mm_maddubs_epi16(xo2, u_factors)),
872  U_SHIFT);
873  const __m128i uo2 =
874  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
875  _mm_maddubs_epi16(xo4, u_factors)),
876  U_SHIFT);
877  const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
878  uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
879  uavg = _mm_add_epi16(uavg, uoavg);
880  uavg = _mm_srai_epi16(uavg, 2);
881  uavg = _mm_packs_epi16(uavg, uoavg);
882  uavg = _mm_sub_epi8(uavg, vector128);
883  }
884  /* Now we need the following storage distribution:
885  * 2x 2y -> uLumaDst
886  * 2x+1 y -> yChromaDst1
887  * 4x 2y+1 -> uChromaDst1
888  * 4x+2 2y+1 -> vChromaDst1 */
889  {
890  const __m128i mask =
891  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
892  (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
893  const __m128i ude = _mm_shuffle_epi8(ue, mask);
894  _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
895  yEvenChromaDst1 += 8;
896  }
897 
898  if (yLumaDstOdd)
899  {
900  const __m128i mask =
901  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
902  (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
903  const __m128i udo = _mm_shuffle_epi8(uo, mask);
904  _mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
905  yOddChromaDst1 += 8;
906  }
907 
908  if (yLumaDstOdd)
909  {
910  const __m128i mask =
911  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
912  (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
913  const __m128i ud = _mm_shuffle_epi8(uo, mask);
914  int* uDst1 = (int*)uChromaDst1;
915  int* vDst1 = (int*)vChromaDst1;
916  const int* src = (const int*)&ud;
917  _mm_stream_si32(uDst1, src[0]);
918  _mm_stream_si32(vDst1, src[1]);
919  uChromaDst1 += 4;
920  vChromaDst1 += 4;
921  }
922 
923  if (yLumaDstOdd)
924  {
925  _mm_storel_epi64((__m128i*)uLumaDst, uavg);
926  uLumaDst += 8;
927  }
928  else
929  {
930  const __m128i mask =
931  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
932  (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
933  const __m128i ud = _mm_shuffle_epi8(ue, mask);
934  _mm_storel_epi64((__m128i*)uLumaDst, ud);
935  uLumaDst += 8;
936  }
937  }
938 
939  {
940  /* V: multiplications with subtotals and horizontal sums */
941  __m128i ve;
942  __m128i vo;
943  __m128i vavg;
944  {
945  const __m128i v_factors = BGRX_V_FACTORS;
946  const __m128i ve1 =
947  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
948  _mm_maddubs_epi16(xe2, v_factors)),
949  V_SHIFT);
950  const __m128i ve2 =
951  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
952  _mm_maddubs_epi16(xe4, v_factors)),
953  V_SHIFT);
954  const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
955  ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
956  vavg = veavg;
957  }
958  {
959  const __m128i v_factors = BGRX_V_FACTORS;
960  const __m128i vo1 =
961  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
962  _mm_maddubs_epi16(xo2, v_factors)),
963  V_SHIFT);
964  const __m128i vo2 =
965  _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
966  _mm_maddubs_epi16(xo4, v_factors)),
967  V_SHIFT);
968  const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
969  vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
970  vavg = _mm_add_epi16(vavg, voavg);
971  vavg = _mm_srai_epi16(vavg, 2);
972  vavg = _mm_packs_epi16(vavg, voavg);
973  vavg = _mm_sub_epi8(vavg, vector128);
974  }
975  /* Now we need the following storage distribution:
976  * 2x 2y -> vLumaDst
977  * 2x+1 y -> yChromaDst2
978  * 4x 2y+1 -> uChromaDst2
979  * 4x+2 2y+1 -> vChromaDst2 */
980  {
981  const __m128i mask =
982  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
983  (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
984  __m128i vde = _mm_shuffle_epi8(ve, mask);
985  _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
986  yEvenChromaDst2 += 8;
987  }
988 
989  if (yLumaDstOdd)
990  {
991  const __m128i mask =
992  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
993  (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
994  __m128i vdo = _mm_shuffle_epi8(vo, mask);
995  _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
996  yOddChromaDst2 += 8;
997  }
998 
999  if (yLumaDstOdd)
1000  {
1001  const __m128i mask =
1002  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1003  (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1004  const __m128i vd = _mm_shuffle_epi8(vo, mask);
1005  int* uDst2 = (int*)uChromaDst2;
1006  int* vDst2 = (int*)vChromaDst2;
1007  const int* src = (const int*)&vd;
1008  _mm_stream_si32(uDst2, src[0]);
1009  _mm_stream_si32(vDst2, src[1]);
1010  uChromaDst2 += 4;
1011  vChromaDst2 += 4;
1012  }
1013 
1014  if (yLumaDstOdd)
1015  {
1016  _mm_storel_epi64((__m128i*)vLumaDst, vavg);
1017  vLumaDst += 8;
1018  }
1019  else
1020  {
1021  const __m128i mask =
1022  _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1023  (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1024  __m128i vd = _mm_shuffle_epi8(ve, mask);
1025  _mm_storel_epi64((__m128i*)vLumaDst, vd);
1026  vLumaDst += 8;
1027  }
1028  }
1029  }
1030 }
1031 
1032 static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1033  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1034  const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1035  const UINT32 dst2Step[],
1036  const prim_size_t* WINPR_RESTRICT roi)
1037 {
1038  if (roi->height < 1 || roi->width < 1)
1039  return !PRIMITIVES_SUCCESS;
1040 
1041  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
1042  return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
1043  roi);
1044 
1045  for (size_t y = 0; y < roi->height; y += 2)
1046  {
1047  const BYTE* srcEven = (pSrc + y * srcStep);
1048  const BYTE* srcOdd = (srcEven + srcStep);
1049  BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
1050  BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
1051  BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
1052  BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
1053  BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
1054  BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
1055  BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
1056  BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
1057  BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
1058  BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
1059  BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
1060  BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
1061  ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
1062  dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
1063  dstOddChromaY1, dstOddChromaY2, dstChromaU1,
1064  dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
1065  }
1066 
1067  return PRIMITIVES_SUCCESS;
1068 }
1069 
1070 static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1071  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1072  const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1073  const UINT32 dst2Step[],
1074  const prim_size_t* WINPR_RESTRICT roi)
1075 {
1076  switch (srcFormat)
1077  {
1078  case PIXEL_FORMAT_BGRX32:
1079  case PIXEL_FORMAT_BGRA32:
1080  return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1081  dst2Step, roi);
1082 
1083  default:
1084  return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1085  dst2Step, roi);
1086  }
1087 }
1088 
1089 static pstatus_t ssse3_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[],
1090  BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[],
1091  const RECTANGLE_16* WINPR_RESTRICT roi)
1092 {
1093  const UINT32 nWidth = roi->right - roi->left;
1094  const UINT32 nHeight = roi->bottom - roi->top;
1095  const UINT32 halfWidth = (nWidth + 1) / 2;
1096  const UINT32 halfPad = halfWidth % 16;
1097  const UINT32 halfHeight = (nHeight + 1) / 2;
1098  const UINT32 oddY = 1;
1099  const UINT32 evenY = 0;
1100  const UINT32 oddX = 1;
1101  const UINT32 evenX = 0;
1102  const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1103  pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1104  pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1105  BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1106  pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1107  pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1108 
1109  /* Y data is already here... */
1110  /* B1 */
1111  for (size_t y = 0; y < nHeight; y++)
1112  {
1113  const BYTE* Ym = pSrc[0] + y * srcStep[0];
1114  BYTE* pY = pDst[0] + y * dstStep[0];
1115  memcpy(pY, Ym, nWidth);
1116  }
1117 
1118  /* The first half of U and V is already part of this frame. */
1119  /* B2 and B3 */
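 /* Each half-resolution U/V sample is duplicated horizontally by the
  * unpackHigh/unpackLow shuffles below, and the expanded row is written to two
  * consecutive destination rows, upscaling the chroma to full resolution. */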
1120  for (size_t y = 0; y < halfHeight; y++)
1121  {
1122  const size_t val2y = (2 * y + evenY);
1123  const size_t val2y1 = val2y + oddY;
1124  const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
1125  const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
1126  BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
1127  BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
1128  BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
1129  BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
1130 
1131  size_t x = 0;
1132  for (; x < halfWidth - halfPad; x += 16)
1133  {
1134  const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
1135  const __m128i unpackLow =
1136  _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
1137  {
1138  const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
1139  const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1140  const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1141  _mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
1142  _mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
1143  _mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
1144  _mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
1145  }
1146  {
1147  const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
1148  const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1149  const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1150  _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
1151  _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
1152  _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
1153  _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
1154  }
1155  }
1156 
1157  for (; x < halfWidth; x++)
1158  {
1159  const size_t val2x = 2 * x + evenX;
1160  const size_t val2x1 = val2x + oddX;
1161  pU[val2x] = Um[x];
1162  pV[val2x] = Vm[x];
1163  pU[val2x1] = Um[x];
1164  pV[val2x1] = Vm[x];
1165  pU1[val2x] = Um[x];
1166  pV1[val2x] = Vm[x];
1167  pU1[val2x1] = Um[x];
1168  pV1[val2x1] = Vm[x];
1169  }
1170  }
1171 
1172  return PRIMITIVES_SUCCESS;
1173 }
1174 
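/* Reconstruct the even chroma samples of one 16-byte span in place:
 * newEven = clamp(4 * even - odd - evenBelow - oddBelow, 0, 255), where
 * pSrcDst is the current row and pSrc2 the row below; odd samples are kept
 * unchanged and re-interleaved with the filtered even ones. */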
1175 static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
1176 {
1177  const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8,
1178  (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0);
1179  const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
1180  (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
1181  const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
1182  const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
1183  const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
1184  const __m128i uEven = _mm_shuffle_epi8(u, even);
1185  const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
1186  const __m128i uOdd = _mm_shuffle_epi8(u, odd);
1187  const __m128i u1Even = _mm_shuffle_epi8(u1, even);
1188  const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
1189  const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
1190  const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
1191  const __m128i result = _mm_sub_epi16(uEven4, tmp2);
1192  const __m128i packed = _mm_packus_epi16(result, uOdd);
1193  const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
1194  _mm_storeu_si128((__m128i*)pSrcDst, interleaved);
1195 }
1196 
1197 static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
1198  const RECTANGLE_16* WINPR_RESTRICT roi)
1199 {
1200  const UINT32 oddY = 1;
1201  const UINT32 evenY = 0;
1202  const UINT32 nWidth = roi->right - roi->left;
1203  const UINT32 nHeight = roi->bottom - roi->top;
1204  const UINT32 halfHeight = (nHeight + 1) / 2;
1205  const UINT32 halfWidth = (nWidth + 1) / 2;
1206  const UINT32 halfPad = halfWidth % 16;
1207 
1208  /* Filter */
1209  for (size_t y = roi->top; y < halfHeight + roi->top; y++)
1210  {
1211  size_t x = roi->left;
1212  const size_t val2y = (y * 2ULL + evenY);
1213  const size_t val2y1 = val2y + oddY;
1214  BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
1215  BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
1216  BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
1217  BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
1218 
1219  if (val2y1 > nHeight)
1220  continue;
1221 
1222  for (; x < halfWidth + roi->left - halfPad; x += 16)
1223  {
1224  ssse3_filter(&pU[2 * x], &pU1[2 * x]);
1225  ssse3_filter(&pV[2 * x], &pV1[2 * x]);
1226  }
1227 
1228  for (; x < halfWidth + roi->left; x++)
1229  {
1230  const size_t val2x = (x * 2ULL);
1231  const size_t val2x1 = val2x + 1ULL;
1232  const BYTE inU = pU[val2x];
1233  const BYTE inV = pV[val2x];
1234  const INT32 up = inU * 4;
1235  const INT32 vp = inV * 4;
1236  INT32 u2020 = 0;
1237  INT32 v2020 = 0;
1238 
1239  if (val2x1 > nWidth)
1240  continue;
1241 
1242  u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
1243  v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
1244  pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
1245  pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
1246  }
1247  }
1248 
1249  return PRIMITIVES_SUCCESS;
1250 }
1251 
1252 static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
1253  const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
1254  const UINT32 dstStep[3],
1255  const RECTANGLE_16* WINPR_RESTRICT roi)
1256 {
1257  const UINT32 mod = 16;
1258  UINT32 uY = 0;
1259  UINT32 vY = 0;
1260  const UINT32 nWidth = roi->right - roi->left;
1261  const UINT32 nHeight = roi->bottom - roi->top;
1262  const UINT32 halfWidth = (nWidth + 1) / 2;
1263  const UINT32 halfPad = halfWidth % 16;
1264  const UINT32 halfHeight = (nHeight + 1) / 2;
1265  const UINT32 oddY = 1;
1266  const UINT32 evenY = 0;
1267  const UINT32 oddX = 1;
1268  /* The auxiliary frame is aligned to multiples of 16x16.
1269  * We need the padded height for B4 and B5 conversion. */
1270  const UINT32 padHeight = nHeight + 16 - nHeight % 16;
1271  const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1272  pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1273  pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1274  BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1275  pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1276  pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1277  const __m128i zero = _mm_setzero_si128();
1278  const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
1279  (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
1280 
1281  /* The second half of U and V is a bit more tricky... */
1282  /* B4 and B5 */
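 /* The auxiliary Y plane stores the odd chroma rows in 16-row blocks: the
  * first 8 rows of each block carry U, the next 8 carry V, mirroring the
  * n = (i & ~7) + i packing used on the encode side. */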
1283  for (size_t y = 0; y < padHeight; y++)
1284  {
1285  const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
1286  BYTE* pX = NULL;
1287 
1288  if ((y) % mod < (mod + 1) / 2)
1289  {
1290  const UINT32 pos = (2 * uY++ + oddY);
1291 
1292  if (pos >= nHeight)
1293  continue;
1294 
1295  pX = pDst[1] + 1ULL * dstStep[1] * pos;
1296  }
1297  else
1298  {
1299  const UINT32 pos = (2 * vY++ + oddY);
1300 
1301  if (pos >= nHeight)
1302  continue;
1303 
1304  pX = pDst[2] + 1ULL * dstStep[2] * pos;
1305  }
1306 
1307  memcpy(pX, Ya, nWidth);
1308  }
1309 
1310  /* B6 and B7 */
1311  for (size_t y = 0; y < halfHeight; y++)
1312  {
1313  const size_t val2y = (y * 2 + evenY);
1314  const BYTE* Ua = pSrc[1] + srcStep[1] * y;
1315  const BYTE* Va = pSrc[2] + srcStep[2] * y;
1316  BYTE* pU = pDst[1] + dstStep[1] * val2y;
1317  BYTE* pV = pDst[2] + dstStep[2] * val2y;
1318 
1319  size_t x = 0;
1320  for (; x < halfWidth - halfPad; x += 16)
1321  {
1322  {
1323  const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
1324  const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1325  const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1326  _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
1327  _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
1328  }
1329  {
1330  const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
1331  const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1332  const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1333  _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
1334  _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
1335  }
1336  }
1337 
1338  for (; x < halfWidth; x++)
1339  {
1340  const size_t val2x1 = (x * 2ULL + oddX);
1341  pU[val2x1] = Ua[x];
1342  pV[val2x1] = Va[x];
1343  }
1344  }
1345 
1346  /* Filter */
1347  return ssse3_ChromaFilter(pDst, dstStep, roi);
1348 }
1349 
1350 static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
1351  UINT32 nTotalWidth, UINT32 nTotalHeight,
1352  BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
1353  const RECTANGLE_16* WINPR_RESTRICT roi)
1354 {
1355  const UINT32 nWidth = roi->right - roi->left;
1356  const UINT32 nHeight = roi->bottom - roi->top;
1357  const UINT32 halfWidth = (nWidth + 1) / 2;
1358  const UINT32 halfPad = halfWidth % 16;
1359  const UINT32 halfHeight = (nHeight + 1) / 2;
1360  const UINT32 quarterWidth = (nWidth + 3) / 4;
1361  const UINT32 quarterPad = quarterWidth % 16;
1362  const __m128i zero = _mm_setzero_si128();
1363  const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
1364  (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
1365  const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
1366  0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
1367  const __m128i shuffle1 =
1368  _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
1369  (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
1370  const __m128i shuffle2 =
1371  _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
1372  (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);
1373 
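 /* In the v2 auxiliary frame the Y plane carries the odd-column U values in
  * its left half and the odd-column V values in its right half; the auxiliary
  * U and V planes are split the same way at a quarter of the total width. */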
1374  /* B4 and B5: odd UV values for width/2, height */
1375  for (size_t y = 0; y < nHeight; y++)
1376  {
1377  const size_t yTop = y + roi->top;
1378  const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
1379  const BYTE* pYaV = pYaU + nTotalWidth / 2;
1380  BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
1381  BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;
1382 
1383  size_t x = 0;
1384  for (; x < halfWidth - halfPad; x += 16)
1385  {
1386  {
1387  const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
1388  const __m128i u2 = _mm_unpackhi_epi8(zero, u);
1389  const __m128i u1 = _mm_unpacklo_epi8(zero, u);
1390  _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
1391  _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
1392  }
1393  {
1394  const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
1395  const __m128i v2 = _mm_unpackhi_epi8(zero, v);
1396  const __m128i v1 = _mm_unpacklo_epi8(zero, v);
1397  _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
1398  _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
1399  }
1400  }
1401 
1402  for (; x < halfWidth; x++)
1403  {
1404  const size_t odd = 2ULL * x + 1;
1405  pU[odd] = pYaU[x];
1406  pV[odd] = pYaV[x];
1407  }
1408  }
1409 
1410  /* B6 - B9 */
1411  for (size_t y = 0; y < halfHeight; y++)
1412  {
1413  const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
1414  const BYTE* pUaV = pUaU + nTotalWidth / 4;
1415  const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
1416  const BYTE* pVaV = pVaU + nTotalWidth / 4;
1417  BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
1418  BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
1419 
1420  UINT32 x = 0;
1421  for (; x < quarterWidth - quarterPad; x += 16)
1422  {
1423  {
1424  const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
1425  const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
1426  const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
1427  const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
1428  const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
1429  const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
1430  const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
1431  const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
1432  _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
1433  _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
1434  _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
1435  _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
1436  }
1437  {
1438  const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
1439  const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
1440  const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
1441  const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
1442  const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
1443  const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
1444  const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
1445  const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
1446  _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
1447  _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
1448  _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
1449  _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
1450  }
1451  }
1452 
1453  for (; x < quarterWidth; x++)
1454  {
1455  pU[4 * x + 0] = pUaU[x];
1456  pV[4 * x + 0] = pUaV[x];
1457  pU[4 * x + 2] = pVaU[x];
1458  pV[4 * x + 2] = pVaV[x];
1459  }
1460  }
1461 
1462  return ssse3_ChromaFilter(pDst, dstStep, roi);
1463 }
1464 
1465 static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
1466  const BYTE* WINPR_RESTRICT pSrc[3],
1467  const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
1468  BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
1469  const RECTANGLE_16* WINPR_RESTRICT roi)
1470 {
1471  if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
1472  return -1;
1473 
1474  if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
1475  return -1;
1476 
1477  if (!roi)
1478  return -1;
1479 
1480  switch (type)
1481  {
1482  case AVC444_LUMA:
1483  return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
1484 
1485  case AVC444_CHROMAv1:
1486  return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
1487 
1488  case AVC444_CHROMAv2:
1489  return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
1490 
1491  default:
1492  return -1;
1493  }
1494 }
1495 #endif
1496 
1497 void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
1498 {
1499 #if defined(SSE2_ENABLED)
1500  generic = primitives_get_generic();
1501  primitives_init_YUV(prims);
1502 
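 /* Install the SSSE3 kernels only when the CPU reports both SSE3 and SSSE3;
  * otherwise the implementations set up by primitives_init_YUV() above remain
  * active. */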
1503  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
1504  IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
1505  {
1506  WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
1507  prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
1508  prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
1509  prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
1510  prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
1511  prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
1512  prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
1513  }
1514 #else
1515  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
1516  WINPR_UNUSED(prims);
1517 #endif
1518 }