#include <winpr/wtypes.h>
#include <freerdp/config.h>

#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>

#include "prim_internal.h"

#if defined(SSE2_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

/* Generic (non-SIMD) primitives, used as fallback whenever the input does not
 * meet the alignment or width requirements of the SSSE3 code paths. */
static primitives_t* generic = NULL;
/* Convert 4 of the 16 loaded Y/U/V samples (selected by pos) into four BGRX
 * pixels, OR them into the destination and return the advanced dst pointer. */
static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
                                  __m128i Vraw, UINT8 pos)
{
    /* Visual Studio 2010 (_MSC_VER 1600) rejects _mm_set_epi32 in array
     * initializers, hence the byte-array fallback below. */
#if !defined(_MSC_VER) || (_MSC_VER > 1600)
    const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                             _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
                             _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
                             _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
    const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
                              _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
                              _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
                              _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
    const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
                             _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                             _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
#else
    /* Byte-array form of the same shuffle masks (bytes listed from low to high). */
    const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
                               0x80, 0x80, 0x03, 0x80, 0x80 },
                             { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80,
                               0x80, 0x80, 0x07, 0x80, 0x80 },
                             { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80,
                               0x80, 0x80, 0x0b, 0x80, 0x80 },
                             { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80,
                               0x80, 0x80, 0x0f, 0x80, 0x80 } };
    const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01,
                                0x80, 0x02, 0x80, 0x03, 0x80 },
                              { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05,
                                0x80, 0x06, 0x80, 0x07, 0x80 },
                              { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09,
                                0x80, 0x0a, 0x80, 0x0b, 0x80 },
                              { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d,
                                0x80, 0x0e, 0x80, 0x0f, 0x80 } };
    const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02,
                               0x80, 0x80, 0x80, 0x03, 0x80 },
                             { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
                               0x80, 0x80, 0x03, 0x80, 0x80 },
                             { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80,
                               0x80, 0x03, 0x80, 0x80, 0x80 } };
#endif
    const __m128i c128 = _mm_set1_epi16(128);
    /* keep the X byte of the existing destination pixels */
    __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
                                 _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
    __m128i C = { 0 };
    __m128i D = { 0 };
    __m128i E = { 0 };
    /* Load Y values and expand to 32 bit (C = 256 * Y) */
    {
        C = _mm_shuffle_epi8(Yraw, mapY[pos]);
    }
    /* Load U values and expand to 16 bit, D = U - 128 */
    {
        const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]);
        D = _mm_sub_epi16(U, c128);
    }
    /* Load V values and expand to 16 bit, E = V - 128 */
    {
        const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]);
        E = _mm_sub_epi16(V, c128);
    }
    /* R = (256 * Y + 403 * (V - 128)) >> 8 */
    {
        const __m128i c403 = _mm_set1_epi16(403);
        const __m128i e403 =
            _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
        const __m128i Rs = _mm_add_epi32(C, e403);
        const __m128i R32 = _mm_srai_epi32(Rs, 8);
        const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
        const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
        const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
        BGRX = _mm_or_si128(BGRX, packed);
    }
    /* G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
    {
        const __m128i c48 = _mm_set1_epi16(48);
        const __m128i d48 =
            _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
        const __m128i c120 = _mm_set1_epi16(120);
        const __m128i e120 =
            _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
        const __m128i de = _mm_add_epi32(d48, e120);
        const __m128i Gs = _mm_sub_epi32(C, de);
        const __m128i G32 = _mm_srai_epi32(Gs, 8);
        const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
        const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
        const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
        BGRX = _mm_or_si128(BGRX, packed);
    }
    /* B = (256 * Y + 475 * (U - 128)) >> 8 */
    {
        const __m128i c475 = _mm_set1_epi16(475);
        const __m128i d475 =
            _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
        const __m128i Bs = _mm_add_epi32(C, d475);
        const __m128i B32 = _mm_srai_epi32(Bs, 8);
        const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
        const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
        const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
        BGRX = _mm_or_si128(BGRX, packed);
    }
    _mm_storeu_si128(dst++, BGRX);
    return dst;
}
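/* YUV420 -> BGRX for a full ROI. Each vector iteration consumes 16 Y samples
 * and 8 U/V samples; the `duplicate` shuffle below repeats every chroma byte
 * twice so the 4:2:0 subsampled U/V rows line up with the 16 luma samples
 * (nearest-neighbour chroma upsampling). */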
static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                        const UINT32* WINPR_RESTRICT srcStep,
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                        const prim_size_t* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->width;
    const UINT32 nHeight = roi->height;
    const UINT32 pad = roi->width % 16;
    const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);

    for (size_t y = 0; y < nHeight; y++)
    {
        __m128i* dst = (__m128i*)(pDst + dstStep * y);
        const BYTE* YData = pSrc[0] + y * srcStep[0];
        const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
        const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];

        for (UINT32 x = 0; x < nWidth - pad; x += 16)
        {
            const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
            const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
            const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
            const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
            const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
            YData += 16;
            UData += 8;
            VData += 8;
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
        }

        for (UINT32 x = 0; x < pad; x++)
        {
            const BYTE Y = *YData++;
            const BYTE U = *UData;
            const BYTE V = *VData;
            const BYTE r = YUV2R(Y, U, V);
            const BYTE g = YUV2G(Y, U, V);
            const BYTE b = YUV2B(Y, U, V);
            dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);

            /* advance the subsampled chroma pointers every other pixel */
            if (x % 2)
            {
                UData++;
                VData++;
            }
        }
    }

    return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                   BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
                                   const prim_size_t* WINPR_RESTRICT roi)
{
    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);

        default:
            return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
    }
}
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                                  const UINT32 srcStep[],
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                                  const prim_size_t* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->width;
    const UINT32 nHeight = roi->height;
    const UINT32 pad = roi->width % 16;

    for (size_t y = 0; y < nHeight; y++)
    {
        __m128i* dst = (__m128i*)(pDst + dstStep * y);
        const BYTE* YData = pSrc[0] + y * srcStep[0];
        const BYTE* UData = pSrc[1] + y * srcStep[1];
        const BYTE* VData = pSrc[2] + y * srcStep[2];

        for (size_t x = 0; x < nWidth - pad; x += 16)
        {
            __m128i Y = _mm_load_si128((const __m128i*)YData);
            __m128i U = _mm_load_si128((const __m128i*)UData);
            __m128i V = _mm_load_si128((const __m128i*)VData);
            YData += 16;
            UData += 16;
            VData += 16;
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
            dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
        }

        for (size_t x = 0; x < pad; x++)
        {
            const BYTE Y = *YData++;
            const BYTE U = *UData++;
            const BYTE V = *VData++;
            const BYTE r = YUV2R(Y, U, V);
            const BYTE g = YUV2G(Y, U, V);
            const BYTE b = YUV2B(Y, U, V);
            dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
        }
    }

    return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                             UINT32 dstStep, UINT32 DstFormat,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
    /* the vector loads below require 16 byte aligned planes and strides */
    if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
        srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
        return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

        default:
            return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
    }
}
/* BGRX (memory byte order B, G, R, X) coefficient vectors for _mm_maddubs_epi16. */
#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
#define BGRX_U_FACTORS \
    _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
#define BGRX_V_FACTORS \
    _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
#define CONST128_FACTORS _mm_set1_epi8(-128)

/* Right-shift amounts matching the coefficient scale: the Y factors sum to 128
 * (1 << 7), the U/V factors are scaled by 256. */
#define Y_SHIFT 7
#define U_SHIFT 8
#define V_SHIFT 8
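/* _mm_maddubs_epi16(pixels, FACTORS) multiplies the unsigned B, G, R, X bytes of
 * two adjacent pixels by the signed coefficients above and sums neighbouring
 * products; a following _mm_hadd_epi16 completes the per-pixel dot product, so
 * luma becomes (9 * B + 92 * G + 27 * R) >> Y_SHIFT per pixel (roughly the
 * BT.709 weights). U and V are computed the same way with their own factor
 * vectors and are biased back into 0..255 by subtracting CONST128_FACTORS. */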
static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst,
                                            UINT32 width)
{
    __m128i x0;
    __m128i x1;
    __m128i x2;
    __m128i x3;
    const __m128i y_factors = BGRX_Y_FACTORS;
    const __m128i* argb = (const __m128i*)src;
    __m128i* ydst = (__m128i*)dst;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* load 16 BGRX pixels into 4 registers */
        x0 = _mm_load_si128(argb++);
        x1 = _mm_load_si128(argb++);
        x2 = _mm_load_si128(argb++);
        x3 = _mm_load_si128(argb++);
        /* multiply by the BGR -> Y coefficients */
        x0 = _mm_maddubs_epi16(x0, y_factors);
        x1 = _mm_maddubs_epi16(x1, y_factors);
        x2 = _mm_maddubs_epi16(x2, y_factors);
        x3 = _mm_maddubs_epi16(x3, y_factors);
        /* finish the per-pixel sums */
        x0 = _mm_hadd_epi16(x0, x1);
        x2 = _mm_hadd_epi16(x2, x3);
        /* rescale and pack down to 16 Y bytes */
        x0 = _mm_srli_epi16(x0, Y_SHIFT);
        x2 = _mm_srli_epi16(x2, Y_SHIFT);
        x0 = _mm_packus_epi16(x0, x2);
        _mm_storeu_si128(ydst++, x0);
    }
}
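/* Chroma (U/V) for one pair of source rows: the two rows are first averaged
 * vertically with _mm_avg_epu8, then neighbouring pixels are averaged
 * horizontally (the _mm_shuffle_ps/_mm_avg_epu8 pair below), yielding the
 * 2x2 subsampled pixels that are converted into the 4:2:0 U and V planes. */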
static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
                                             const BYTE* WINPR_RESTRICT src2,
                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
                                             UINT32 width)
{
    const __m128i u_factors = BGRX_U_FACTORS;
    const __m128i v_factors = BGRX_V_FACTORS;
    const __m128i vector128 = CONST128_FACTORS;
    __m128i x0;
    __m128i x1;
    __m128i x2;
    __m128i x3;
    __m128i x4;
    __m128i x5;
    const __m128i* rgb1 = (const __m128i*)src1;
    const __m128i* rgb2 = (const __m128i*)src2;
    __m64* udst = (__m64*)dst1;
    __m64* vdst = (__m64*)dst2;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* subsample 16x2 pixels into 16x1 pixels (vertical average) */
        x0 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x0 = _mm_avg_epu8(x0, x4);
        x1 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x1 = _mm_avg_epu8(x1, x4);
        x2 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x2 = _mm_avg_epu8(x2, x4);
        x3 = _mm_load_si128(rgb1++);
        x4 = _mm_load_si128(rgb2++);
        x3 = _mm_avg_epu8(x3, x4);
        /* subsample these 16x1 pixels into 8x1 pixels:
         * _mm_shuffle_ps separates even and odd pixels, _mm_avg_epu8 merges them */
        x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
        x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
        x0 = _mm_avg_epu8(x0, x4);
        x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
        x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
        x1 = _mm_avg_epu8(x1, x4);
        /* multiply by the BGR -> U and BGR -> V coefficients */
        x2 = _mm_maddubs_epi16(x0, u_factors);
        x3 = _mm_maddubs_epi16(x1, u_factors);
        x4 = _mm_maddubs_epi16(x0, v_factors);
        x5 = _mm_maddubs_epi16(x1, v_factors);
        /* finish the sums and rescale */
        x0 = _mm_hadd_epi16(x2, x3);
        x1 = _mm_hadd_epi16(x4, x5);
        x0 = _mm_srai_epi16(x0, U_SHIFT);
        x1 = _mm_srai_epi16(x1, V_SHIFT);
        /* pack: low 8 bytes hold U, high 8 bytes hold V */
        x0 = _mm_packs_epi16(x0, x1);
        /* bias the signed result into the unsigned 0..255 range */
        x0 = _mm_sub_epi8(x0, vector128);
        /* store the 8 U values */
        _mm_storel_pi(udst++, _mm_castsi128_ps(x0));
        /* store the 8 V values */
        _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
    }
}
static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                        const UINT32 dstStep[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
    const BYTE* argb = pSrc;
    BYTE* ydst = pDst[0];
    BYTE* udst = pDst[1];
    BYTE* vdst = pDst[2];

    if (roi->height < 1 || roi->width < 1)
    {
        return !PRIMITIVES_SUCCESS;
    }

    if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
    {
        return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
    }

    for (UINT32 y = 0; y < roi->height - 1; y += 2)
    {
        const BYTE* line1 = argb;
        const BYTE* line2 = argb + srcStep;
        ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
        ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
        ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
        argb += 2ULL * srcStep;
        ydst += 2ULL * dstStep[0];
        udst += 1ULL * dstStep[1];
        vdst += 1ULL * dstStep[2];
    }

    if (roi->height & 1)
    {
        /* an odd height leaves a last row without a partner: pass it twice for UV */
        ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
        ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
    }

    return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
{
    switch (srcFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);

        default:
            return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
    }
}
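/* AVC444 (v1) packing: every pair of source rows is converted into a main
 * 4:2:0 view (b1 = luma, b2/b3 = 2x2 averaged U/V) plus an auxiliary view that
 * carries the chroma samples the 4:2:0 subsampling drops (b4/b5 = odd-row U/V,
 * b6/b7 = odd-column U/V of the even rows), matching the AVC444 stream
 * combination described in MS-RDPEGFX. */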
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
{
    const __m128i* argbEven = (const __m128i*)srcEven;
    const __m128i* argbOdd = (const __m128i*)srcOdd;
    const __m128i y_factors = BGRX_Y_FACTORS;
    const __m128i u_factors = BGRX_U_FACTORS;
    const __m128i v_factors = BGRX_V_FACTORS;
    const __m128i vector128 = CONST128_FACTORS;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* load 16 BGRX pixels of the even and of the odd row, 4 registers each */
        const __m128i xe1 = _mm_load_si128(argbEven++);
        const __m128i xe2 = _mm_load_si128(argbEven++);
        const __m128i xe3 = _mm_load_si128(argbEven++);
        const __m128i xe4 = _mm_load_si128(argbEven++);
        const __m128i xo1 = _mm_load_si128(argbOdd++);
        const __m128i xo2 = _mm_load_si128(argbOdd++);
        const __m128i xo3 = _mm_load_si128(argbOdd++);
        const __m128i xo4 = _mm_load_si128(argbOdd++);

        {
            /* Y: even and odd row luma */
            const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                              _mm_maddubs_epi16(xe2, y_factors)),
                                               Y_SHIFT);
            const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                              _mm_maddubs_epi16(xe4, y_factors)),
                                               Y_SHIFT);
            const __m128i ye = _mm_packus_epi16(ye1, ye2);
            const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                              _mm_maddubs_epi16(xo2, y_factors)),
                                               Y_SHIFT);
            const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                              _mm_maddubs_epi16(xo4, y_factors)),
                                               Y_SHIFT);
            const __m128i yo = _mm_packus_epi16(yo1, yo2);
            /* store the even row luma [b1] */
            _mm_storeu_si128((__m128i*)b1Even, ye);
            b1Even += 16;

            if (b1Odd)
            {
                /* store the odd row luma [b1] */
                _mm_storeu_si128((__m128i*)b1Odd, yo);
                b1Odd += 16;
            }
        }

        {
            /* U: 16 even row values in ue, 16 odd row values in uo */
            __m128i ue;
            __m128i uo = { 0 };

            {
                const __m128i ue1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                                  _mm_maddubs_epi16(xe2, u_factors)),
                                   U_SHIFT);
                const __m128i ue2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                                  _mm_maddubs_epi16(xe4, u_factors)),
                                   U_SHIFT);
                ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
            }

            if (b1Odd)
            {
                const __m128i uo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                                  _mm_maddubs_epi16(xo2, u_factors)),
                                   U_SHIFT);
                const __m128i uo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                                  _mm_maddubs_epi16(xo4, u_factors)),
                                   U_SHIFT);
                uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
            }

            /* b2: 2x2 averaged U of the main view */
            if (b1Odd)
            {
                const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
                const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
                const __m128i hi = _mm_add_epi16(ueh, uoh);
                const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
                const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
                const __m128i lo = _mm_add_epi16(uel, uol);
                const __m128i added = _mm_hadd_epi16(lo, hi);
                const __m128i avg16 = _mm_srai_epi16(added, 2);
                const __m128i avg = _mm_packus_epi16(avg16, avg16);
                _mm_storel_epi64((__m128i*)b2, avg);
            }
            else
            {
                /* last (odd height) row: even column U of the even row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                const __m128i ud = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)b2, ud);
            }
            b2 += 8;

            /* b4: odd row U (auxiliary view) */
            if (b1Odd)
            {
                _mm_store_si128((__m128i*)b4, uo);
                b4 += 16;
            }

            /* b6: odd column U of the even row (auxiliary view) */
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i ude = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)b6, ude);
                b6 += 8;
            }
        }

        {
            /* V: 16 even row values in ve, 16 odd row values in vo */
            __m128i ve;
            __m128i vo = { 0 };

            {
                const __m128i ve1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                                  _mm_maddubs_epi16(xe2, v_factors)),
                                   V_SHIFT);
                const __m128i ve2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                                  _mm_maddubs_epi16(xe4, v_factors)),
                                   V_SHIFT);
                ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
            }

            if (b1Odd)
            {
                const __m128i vo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                                  _mm_maddubs_epi16(xo2, v_factors)),
                                   V_SHIFT);
                const __m128i vo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                                  _mm_maddubs_epi16(xo4, v_factors)),
                                   V_SHIFT);
                vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
            }

            /* b3: 2x2 averaged V of the main view */
            if (b1Odd)
            {
                const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
                const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
                const __m128i hi = _mm_add_epi16(veh, voh);
                const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
                const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
                const __m128i lo = _mm_add_epi16(vel, vol);
                const __m128i added = _mm_hadd_epi16(lo, hi);
                const __m128i avg16 = _mm_srai_epi16(added, 2);
                const __m128i avg = _mm_packus_epi16(avg16, avg16);
                _mm_storel_epi64((__m128i*)b3, avg);
            }
            else
            {
                /* last (odd height) row: even column V of the even row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                const __m128i vd = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)b3, vd);
            }
            b3 += 8;

            /* b5: odd row V (auxiliary view) */
            if (b1Odd)
            {
                _mm_store_si128((__m128i*)b5, vo);
                b5 += 16;
            }

            /* b7: odd column V of the even row (auxiliary view) */
            {
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i vde = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)b7, vde);
                b7 += 8;
            }
        }
    }
}
static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                           UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                           const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                           const UINT32 dst2Step[],
                                           const prim_size_t* WINPR_RESTRICT roi)
{
    const BYTE* pMaxSrc = pSrc + 1ULL * (roi->height - 1) * srcStep;

    if (roi->height < 1 || roi->width < 1)
        return !PRIMITIVES_SUCCESS;

    if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
        return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
                                       roi);

    for (size_t y = 0; y < roi->height; y += 2)
    {
        const BOOL last = (y >= (roi->height - 1));
        const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
        const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
        const size_t i = y >> 1;
        const size_t n = (i & ~7) + i;
        BYTE* b1Even = pDst1[0] + y * dst1Step[0];
        BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
        BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
        BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
        BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
        BYTE* b5 = b4 + 8ULL * dst2Step[0];
        BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
        BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
        ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
                                             roi->width);
    }

    return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                      const UINT32 dst2Step[],
                                      const prim_size_t* WINPR_RESTRICT roi)
{
    switch (srcFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                             dst2Step, roi);

        default:
            return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                           dst2Step, roi);
    }
}
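/* AVC444v2 packing: as in v1 the image is split into a main 4:2:0 view
 * (yLumaDst, uLumaDst, vLumaDst) and an auxiliary view, but the leftover
 * chroma is arranged differently: the odd-column U/V samples of both rows go
 * into the auxiliary luma plane (yEvenChromaDst1/2, yOddChromaDst1/2), while
 * the even-column samples of the odd row are packed quarter-width into
 * uChromaDst1/2 and vChromaDst1/2. */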
static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
{
    const __m128i vector128 = CONST128_FACTORS;
    const __m128i* argbEven = (const __m128i*)srcEven;
    const __m128i* argbOdd = (const __m128i*)srcOdd;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* load 16 BGRX pixels of the even and of the odd row, 4 registers each */
        const __m128i xe1 = _mm_load_si128(argbEven++);
        const __m128i xe2 = _mm_load_si128(argbEven++);
        const __m128i xe3 = _mm_load_si128(argbEven++);
        const __m128i xe4 = _mm_load_si128(argbEven++);
        const __m128i xo1 = _mm_load_si128(argbOdd++);
        const __m128i xo2 = _mm_load_si128(argbOdd++);
        const __m128i xo3 = _mm_load_si128(argbOdd++);
        const __m128i xo4 = _mm_load_si128(argbOdd++);

        {
            /* Y: even row luma */
            const __m128i y_factors = BGRX_Y_FACTORS;
            const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                              _mm_maddubs_epi16(xe2, y_factors)),
                                               Y_SHIFT);
            const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                              _mm_maddubs_epi16(xe4, y_factors)),
                                               Y_SHIFT);
            const __m128i ye = _mm_packus_epi16(ye1, ye2);
            _mm_storeu_si128((__m128i*)yLumaDstEven, ye);
            yLumaDstEven += 16;
        }

        if (yLumaDstOdd)
        {
            /* Y: odd row luma */
            const __m128i y_factors = BGRX_Y_FACTORS;
            const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                              _mm_maddubs_epi16(xo2, y_factors)),
                                               Y_SHIFT);
            const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                              _mm_maddubs_epi16(xo4, y_factors)),
                                               Y_SHIFT);
            const __m128i yo = _mm_packus_epi16(yo1, yo2);
            _mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
            yLumaDstOdd += 16;
        }

        {
            /* U: 16 even row values in ue, 16 odd row values in uo, 2x2 average in uavg */
            __m128i ue;
            __m128i uo = { 0 };
            __m128i uavg;
            {
                const __m128i u_factors = BGRX_U_FACTORS;
                const __m128i ue1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                                  _mm_maddubs_epi16(xe2, u_factors)),
                                   U_SHIFT);
                const __m128i ue2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                                  _mm_maddubs_epi16(xe4, u_factors)),
                                   U_SHIFT);
                const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
                ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
                uavg = ueavg;
            }
            {
                const __m128i u_factors = BGRX_U_FACTORS;
                const __m128i uo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                                  _mm_maddubs_epi16(xo2, u_factors)),
                                   U_SHIFT);
                const __m128i uo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                                  _mm_maddubs_epi16(xo4, u_factors)),
                                   U_SHIFT);
                const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
                uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
                uavg = _mm_add_epi16(uavg, uoavg);
                uavg = _mm_srai_epi16(uavg, 2);
                uavg = _mm_packs_epi16(uavg, uoavg);
                uavg = _mm_sub_epi8(uavg, vector128);
            }

            {
                /* auxiliary luma: odd column U of the even row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i ude = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
                yEvenChromaDst1 += 8;
            }

            if (yLumaDstOdd)
            {
                /* auxiliary luma: odd column U of the odd row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                const __m128i udo = _mm_shuffle_epi8(uo, mask);
                _mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
                yOddChromaDst1 += 8;
            }

            if (yLumaDstOdd)
            {
                /* auxiliary chroma: even column U of the odd row, split 4+4 */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
                const __m128i ud = _mm_shuffle_epi8(uo, mask);
                int* uDst1 = (int*)uChromaDst1;
                int* vDst1 = (int*)vChromaDst1;
                const int* src = (const int*)&ud;
                _mm_stream_si32(uDst1, src[0]);
                _mm_stream_si32(vDst1, src[1]);
                uChromaDst1 += 4;
                vChromaDst1 += 4;
            }

            if (yLumaDstOdd)
            {
                /* main view U: 2x2 average */
                _mm_storel_epi64((__m128i*)uLumaDst, uavg);
            }
            else
            {
                /* last (odd height) row: even column U of the even row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                const __m128i ud = _mm_shuffle_epi8(ue, mask);
                _mm_storel_epi64((__m128i*)uLumaDst, ud);
            }
            uLumaDst += 8;
        }

        {
            /* V: 16 even row values in ve, 16 odd row values in vo, 2x2 average in vavg */
            __m128i ve;
            __m128i vo = { 0 };
            __m128i vavg;
            {
                const __m128i v_factors = BGRX_V_FACTORS;
                const __m128i ve1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                                  _mm_maddubs_epi16(xe2, v_factors)),
                                   V_SHIFT);
                const __m128i ve2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                                  _mm_maddubs_epi16(xe4, v_factors)),
                                   V_SHIFT);
                const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
                ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
                vavg = veavg;
            }
            {
                const __m128i v_factors = BGRX_V_FACTORS;
                const __m128i vo1 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                                  _mm_maddubs_epi16(xo2, v_factors)),
                                   V_SHIFT);
                const __m128i vo2 =
                    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                                  _mm_maddubs_epi16(xo4, v_factors)),
                                   V_SHIFT);
                const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
                vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
                vavg = _mm_add_epi16(vavg, voavg);
                vavg = _mm_srai_epi16(vavg, 2);
                vavg = _mm_packs_epi16(vavg, voavg);
                vavg = _mm_sub_epi8(vavg, vector128);
            }

            {
                /* auxiliary luma: odd column V of the even row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                __m128i vde = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
                yEvenChromaDst2 += 8;
            }

            if (yLumaDstOdd)
            {
                /* auxiliary luma: odd column V of the odd row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
                __m128i vdo = _mm_shuffle_epi8(vo, mask);
                _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
                yOddChromaDst2 += 8;
            }

            if (yLumaDstOdd)
            {
                /* auxiliary chroma: even column V of the odd row, split 4+4 */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
                const __m128i vd = _mm_shuffle_epi8(vo, mask);
                int* uDst2 = (int*)uChromaDst2;
                int* vDst2 = (int*)vChromaDst2;
                const int* src = (const int*)&vd;
                _mm_stream_si32(uDst2, src[0]);
                _mm_stream_si32(vDst2, src[1]);
                uChromaDst2 += 4;
                vChromaDst2 += 4;
            }

            if (yLumaDstOdd)
            {
                /* main view V: 2x2 average */
                _mm_storel_epi64((__m128i*)vLumaDst, vavg);
            }
            else
            {
                /* last (odd height) row: even column V of the even row */
                const __m128i mask =
                    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
                __m128i vd = _mm_shuffle_epi8(ve, mask);
                _mm_storel_epi64((__m128i*)vLumaDst, vd);
            }
            vLumaDst += 8;
        }
    }
}
static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                             UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                             const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                             const UINT32 dst2Step[],
                                             const prim_size_t* WINPR_RESTRICT roi)
{
    if (roi->height < 1 || roi->width < 1)
        return !PRIMITIVES_SUCCESS;

    if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
        return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
                                         roi);

    for (size_t y = 0; y < roi->height; y += 2)
    {
        const BYTE* srcEven = (pSrc + y * srcStep);
        const BYTE* srcOdd = (srcEven + srcStep);
        BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
        BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
        BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
        BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
        BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
        BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
        BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
        BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
        BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
        BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
        BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
        BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
        ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
                                               dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
                                               dstOddChromaY1, dstOddChromaY2, dstChromaU1,
                                               dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
    }

    return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                        const UINT32 dst2Step[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
    switch (srcFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                               dst2Step, roi);

        default:
            return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                             dst2Step, roi);
    }
}
static pstatus_t ssse3_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[],
                                    BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 oddY = 1;
    const UINT32 evenY = 0;
    const UINT32 oddX = 1;
    const UINT32 evenX = 0;
    const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
                            pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
                            pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
    BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
                      pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
                      pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };

    /* Y data is copied through unchanged */
    for (size_t y = 0; y < nHeight; y++)
    {
        const BYTE* Ym = pSrc[0] + y * srcStep[0];
        BYTE* pY = pDst[0] + y * dstStep[0];
        memcpy(pY, Ym, nWidth);
    }

    /* The subsampled U/V planes are expanded to every row and column */
    for (size_t y = 0; y < halfHeight; y++)
    {
        const size_t val2y = (2 * y + evenY);
        const size_t val2y1 = val2y + oddY;
        const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
        const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
        BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
        BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
        BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
        BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;

        size_t x = 0;
        for (; x < halfWidth - halfPad; x += 16)
        {
            const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
            const __m128i unpackLow =
                _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
                const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
                const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
                _mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
                _mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
                _mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
                _mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
            }
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
                const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
                const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
                _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
                _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
                _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
                _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
            }
        }

        for (; x < halfWidth; x++)
        {
            const size_t val2x = 2 * x + evenX;
            const size_t val2x1 = val2x + oddX;
            pU[val2x] = Um[x];
            pU[val2x1] = Um[x];
            pV[val2x] = Vm[x];
            pV[val2x1] = Vm[x];
            pU1[val2x] = Um[x];
            pU1[val2x1] = Um[x];
            pV1[val2x] = Vm[x];
            pV1[val2x1] = Vm[x];
        }
    }

    return PRIMITIVES_SUCCESS;
}
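/* One 16-byte step of the chroma reconstruction filter: every even column is
 * replaced by 4 * even - odd - even(next row) - odd(next row), undoing the
 * 2x2 averaging that produced the main-view chroma; odd columns pass through
 * unchanged. The scalar tail in ssse3_ChromaFilter applies the same formula
 * with CONDITIONAL_CLIP. */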
static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
{
    const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8,
                                      (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0);
    const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
                                     (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
    const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
    const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
    const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
    const __m128i uEven = _mm_shuffle_epi8(u, even);       /* even samples of this row, 16 bit */
    const __m128i uEven4 = _mm_slli_epi16(uEven, 2);       /* 4 * even */
    const __m128i uOdd = _mm_shuffle_epi8(u, odd);         /* odd samples of this row */
    const __m128i u1Even = _mm_shuffle_epi8(u1, even);     /* even samples of the next row */
    const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);       /* odd samples of the next row */
    const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
    const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
    const __m128i result = _mm_sub_epi16(uEven4, tmp2);    /* 4 * even - (odd + even' + odd') */
    const __m128i packed = _mm_packus_epi16(result, uOdd); /* saturate, keep odd samples as-is */
    const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
    _mm_storeu_si128((__m128i*)pSrcDst, interleaved);
}
static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 oddY = 1;
    const UINT32 evenY = 0;
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;

    /* Filter */
    for (size_t y = roi->top; y < halfHeight + roi->top; y++)
    {
        size_t x = roi->left;
        const size_t val2y = (y * 2ULL + evenY);
        const size_t val2y1 = val2y + oddY;
        BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
        BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
        BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
        BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;

        if (val2y1 > nHeight)
            continue;

        for (; x < halfWidth + roi->left - halfPad; x += 16)
        {
            ssse3_filter(&pU[2 * x], &pU1[2 * x]);
            ssse3_filter(&pV[2 * x], &pV1[2 * x]);
        }

        for (; x < halfWidth + roi->left; x++)
        {
            const size_t val2x = (x * 2ULL);
            const size_t val2x1 = val2x + 1ULL;
            const BYTE inU = pU[val2x];
            const BYTE inV = pV[val2x];
            const INT32 up = inU * 4;
            const INT32 vp = inV * 4;
            INT32 u2020 = 0;
            INT32 v2020 = 0;

            if (val2x1 > nWidth)
                continue;

            u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
            v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
            pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
            pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
        }
    }

    return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 mod = 16;
    UINT32 uY = 0;
    UINT32 vY = 0;
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 oddY = 1;
    const UINT32 evenY = 0;
    const UINT32 oddX = 1;
    /* The auxiliary frame is aligned to multiples of 16x16 blocks,
     * so iterate over the padded height when unpacking it. */
    const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
    const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
                            pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
                            pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
    BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
                      pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
                      pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
    const __m128i zero = _mm_setzero_si128();
    const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                      (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);

    /* The auxiliary luma plane carries full-width U and V rows in alternating
     * 8-row bands; copy them to the odd destination rows. */
    for (size_t y = 0; y < padHeigth; y++)
    {
        const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
        BYTE* pX = NULL;

        if ((y) % mod < (mod + 1) / 2)
        {
            const UINT32 pos = (2 * uY++ + oddY);

            if (pos >= nHeight)
                continue;

            pX = pDst[1] + 1ULL * dstStep[1] * pos;
        }
        else
        {
            const UINT32 pos = (2 * vY++ + oddY);

            if (pos >= nHeight)
                continue;

            pX = pDst[2] + 1ULL * dstStep[2] * pos;
        }

        memcpy(pX, Ya, nWidth);
    }

    /* The auxiliary U/V planes fill in the remaining columns of the even rows. */
    for (size_t y = 0; y < halfHeight; y++)
    {
        const size_t val2y = (y * 2 + evenY);
        const BYTE* Ua = pSrc[1] + srcStep[1] * y;
        const BYTE* Va = pSrc[2] + srcStep[2] * y;
        BYTE* pU = pDst[1] + dstStep[1] * val2y;
        BYTE* pV = pDst[2] + dstStep[2] * val2y;

        size_t x = 0;
        for (; x < halfWidth - halfPad; x += 16)
        {
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
                const __m128i u2 = _mm_unpackhi_epi8(u, zero);
                const __m128i u1 = _mm_unpacklo_epi8(u, zero);
                _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
                _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
            }
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
                const __m128i u2 = _mm_unpackhi_epi8(u, zero);
                const __m128i u1 = _mm_unpacklo_epi8(u, zero);
                _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
                _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
            }
        }

        for (; x < halfWidth; x++)
        {
            const size_t val2x1 = (x * 2ULL + oddX);
            pU[val2x1] = Ua[x];
            pV[val2x1] = Va[x];
        }
    }

    /* Filter */
    return ssse3_ChromaFilter(pDst, dstStep, roi);
}
static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3],
                                        const UINT32 srcStep[3], UINT32 nTotalWidth,
                                        UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
    const UINT32 nWidth = roi->right - roi->left;
    const UINT32 nHeight = roi->bottom - roi->top;
    const UINT32 halfWidth = (nWidth + 1) / 2;
    const UINT32 halfPad = halfWidth % 16;
    const UINT32 halfHeight = (nHeight + 1) / 2;
    const UINT32 quaterWidth = (nWidth + 3) / 4;
    const UINT32 quaterPad = quaterWidth % 16;
    const __m128i zero = _mm_setzero_si128();
    const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                      (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
    const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
                                       0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
    const __m128i shuffle1 =
        _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
                     (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
    const __m128i shuffle2 =
        _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
                     (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);

    /* pass 1: the odd columns of every row come from the auxiliary luma plane
     * (U in the left half, V in the right half) */
    for (size_t y = 0; y < nHeight; y++)
    {
        const size_t yTop = y + roi->top;
        const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
        const BYTE* pYaV = pYaU + nTotalWidth / 2;
        BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
        BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;

        size_t x = 0;
        for (; x < halfWidth - halfPad; x += 16)
        {
            {
                const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
                const __m128i u2 = _mm_unpackhi_epi8(zero, u);
                const __m128i u1 = _mm_unpacklo_epi8(zero, u);
                _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
                _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
            }
            {
                const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
                const __m128i v2 = _mm_unpackhi_epi8(zero, v);
                const __m128i v1 = _mm_unpacklo_epi8(zero, v);
                _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
                _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
            }
        }

        for (; x < halfWidth; x++)
        {
            const size_t odd = 2ULL * x + 1;
            pU[odd] = pYaU[x];
            pV[odd] = pYaV[x];
        }
    }

    /* pass 2: the even columns of the odd rows come from the quarter-width
     * auxiliary U and V planes */
    for (size_t y = 0; y < halfHeight; y++)
    {
        const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
        const BYTE* pUaV = pUaU + nTotalWidth / 4;
        const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
        const BYTE* pVaV = pVaU + nTotalWidth / 4;
        BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
        BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;

        size_t x = 0;
        for (; x < quaterWidth - quaterPad; x += 16)
        {
            {
                const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
                const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
                const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
                const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
                const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
                const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
                const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
                const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
                _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
                _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
                _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
                _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
            }
            {
                const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
                const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
                const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
                const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
                const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
                const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
                const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
                const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
                _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
                _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
                _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
                _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
            }
        }

        for (; x < quaterWidth; x++)
        {
            pU[4 * x + 0] = pUaU[x];
            pV[4 * x + 0] = pUaV[x];
            pU[4 * x + 2] = pVaU[x];
            pV[4 * x + 2] = pVaV[x];
        }
    }

    return ssse3_ChromaFilter(pDst, dstStep, roi);
}
static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
                                             const BYTE* WINPR_RESTRICT pSrc[3],
                                             const UINT32 srcStep[3], UINT32 nWidth,
                                             UINT32 nHeight, BYTE* WINPR_RESTRICT pDst[3],
                                             const UINT32 dstStep[3],
                                             const RECTANGLE_16* WINPR_RESTRICT roi)
{
    if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
        return -1;
    if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
        return -1;
    if (!roi)
        return -1;
    switch (type)
    {
        case AVC444_LUMA:
            return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
        case AVC444_CHROMAv1:
            return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
        case AVC444_CHROMAv2:
            return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
        default:
            return -1;
    }
}
#endif /* SSE2_ENABLED */

void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
    generic = primitives_get_generic();
    primitives_init_YUV(prims);

    if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
        prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
        prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
        prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
        prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
        prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
        prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
    }
#else
    WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
    WINPR_UNUSED(prims);
#endif
}