#include <winpr/wtypes.h>
#include <freerdp/config.h>

#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>

#include "prim_internal.h"
#include "prim_YUV.h" /* primitives_init_YUV */

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

static primitives_t* generic = NULL;
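
/* Reading aid (derived from the constants used below, not upstream
 * documentation): the YUV -> RGB path works in 8.8 fixed point.  With
 * D = U - 128 and E = V - 128 the conversion implemented by
 * ssse3_YUV444Pixel is
 *
 *   R = (256 * Y            + 403 * E) >> 8
 *   G = (256 * Y -  48 * D  - 120 * E) >> 8
 *   B = (256 * Y + 475 * D           ) >> 8
 *
 * i.e. all coefficients are pre-scaled by 256 so a single arithmetic shift
 * performs the divide, followed by an unsigned saturation to [0, 255].
 * A scalar sketch of the same math (illustration only):
 *
 *   BYTE r = CLIP((256 * y + 403 * (v - 128)) >> 8);
 */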
/* Convert 4 pixels (quarter `pos` of a 16 pixel batch) of 4:4:4 YUV to BGRX.
 * The destination X byte is preserved; only B, G and R are written. */
static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
                                  __m128i Vraw, UINT8 pos)
{
	/* Shuffle tables: mapY expands 4 Y bytes into one 32 bit lane each,
	 * shifted left by 8 (Y * 256); mapUV expands 4 U/V bytes into the upper
	 * four 16 bit lanes; mask[] scatters 4 result bytes into the R, G and B
	 * positions of 4 BGRX pixels. */
	const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
		                     mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
		                     mm_set_epu32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
		                     mm_set_epu32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
	const __m128i mapUV[] = { mm_set_epu32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
		                      mm_set_epu32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
		                      mm_set_epu32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
		                      mm_set_epu32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
	const __m128i mask[] = { mm_set_epu32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
		                     mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
		                     mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
	const __m128i c128 = _mm_set1_epi16(128);
	__m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
	                             mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
	{
		__m128i C;
		__m128i D;
		__m128i E;
		/* Load Y values and expand to 32 bit */
		C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */

		/* Load U values and expand to 16 bit */
		{
			const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
			D = _mm_sub_epi16(U, c128);                           /* D = U - 128 */
		}

		/* Load V values and expand to 16 bit */
		{
			const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
			E = _mm_sub_epi16(V, c128);                           /* E = V - 128 */
		}

		/* R = (256 * Y + 403 * (V - 128)) >> 8 */
		{
			const __m128i c403 = _mm_set1_epi16(403);
			const __m128i e403 =
			    _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
			const __m128i Rs = _mm_add_epi32(C, e403);
			const __m128i R32 = _mm_srai_epi32(Rs, 8);
			const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
			const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
			const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
			BGRX = _mm_or_si128(BGRX, packed);
		}

		/* G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
		{
			const __m128i c48 = _mm_set1_epi16(48);
			const __m128i d48 =
			    _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
			const __m128i c120 = _mm_set1_epi16(120);
			const __m128i e120 =
			    _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
			const __m128i de = _mm_add_epi32(d48, e120);
			const __m128i Gs = _mm_sub_epi32(C, de);
			const __m128i G32 = _mm_srai_epi32(Gs, 8);
			const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
			const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
			const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
			BGRX = _mm_or_si128(BGRX, packed);
		}

		/* B = (256 * Y + 475 * (U - 128)) >> 8 */
		{
			const __m128i c475 = _mm_set1_epi16(475);
			const __m128i d475 =
			    _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
			const __m128i Bs = _mm_add_epi32(C, d475);
			const __m128i B32 = _mm_srai_epi32(Bs, 8);
			const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
			const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
			const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
			BGRX = _mm_or_si128(BGRX, packed);
		}
	}
	_mm_storeu_si128(dst++, BGRX);
	return dst;
}
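
/* 4:2:0 handling sketch (inferred from the indexing below): every U/V sample
 * covers a 2x2 pixel block.  Horizontal upsampling uses the `duplicate`
 * shuffle (u0 u1 ... u7 -> u0 u0 u1 u1 ... u7 u7); vertical upsampling comes
 * from feeding the same U/V row (index y / 2) to two consecutive Y rows. */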
static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                        const UINT32* WINPR_RESTRICT srcStep,
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                        const prim_size_t* WINPR_RESTRICT roi)
{
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const UINT32 pad = roi->width % 16;
	const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);

	for (size_t y = 0; y < nHeight; y++)
	{
		__m128i* dst = (__m128i*)(pDst + dstStep * y);
		const BYTE* YData = pSrc[0] + y * srcStep[0];
		const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
		const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];

		for (UINT32 x = 0; x < nWidth - pad; x += 16)
		{
			const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
			const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
			const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
			const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
			const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
			YData += 16;
			UData += 8;
			VData += 8;
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
		}

		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE Y = *YData++;
			const BYTE U = *UData;
			const BYTE V = *VData;
			const BYTE r = YUV2R(Y, U, V);
			const BYTE g = YUV2G(Y, U, V);
			const BYTE b = YUV2B(Y, U, V);
			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);

			if (x % 2)
			{
				UData++;
				VData++;
			}
		}
	}

	return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                   BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
                                   const prim_size_t* WINPR_RESTRICT roi)
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                                  const UINT32 srcStep[],
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                                  const prim_size_t* WINPR_RESTRICT roi)
{
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const UINT32 pad = roi->width % 16;

	for (size_t y = 0; y < nHeight; y++)
	{
		__m128i* dst = (__m128i*)(pDst + dstStep * y);
		const BYTE* YData = pSrc[0] + y * srcStep[0];
		const BYTE* UData = pSrc[1] + y * srcStep[1];
		const BYTE* VData = pSrc[2] + y * srcStep[2];

		for (size_t x = 0; x < nWidth - pad; x += 16)
		{
			__m128i Y = _mm_load_si128((const __m128i*)YData);
			__m128i U = _mm_load_si128((const __m128i*)UData);
			__m128i V = _mm_load_si128((const __m128i*)VData);
			YData += 16;
			UData += 16;
			VData += 16;
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
		}

		for (size_t x = 0; x < pad; x++)
		{
			const BYTE Y = *YData++;
			const BYTE U = *UData++;
			const BYTE V = *VData++;
			const BYTE r = YUV2R(Y, U, V);
			const BYTE g = YUV2G(Y, U, V);
			const BYTE b = YUV2B(Y, U, V);
			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
		}
	}

	return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                             UINT32 dstStep, UINT32 DstFormat,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
	if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
	    srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
		return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
/* Per-byte analysis factors for _mm_maddubs_epi16 on BGRX input
 * (memory order B, G, R, X):
 *   Y = (  9 B +  92 G +  27 R) >> 7
 *   U = (127 B -  99 G -  29 R) >> 8
 *   V = (-12 B - 116 G + 127 R) >> 8 */
#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
#define BGRX_U_FACTORS \
	_mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
#define BGRX_V_FACTORS \
	_mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
#define CONST128_FACTORS _mm_set1_epi8(-128)

#define Y_SHIFT 7
#define U_SHIFT 8
#define V_SHIFT 8
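
/* Worked example for the factors above (illustration only): for white,
 * B = G = R = 255,
 *   Y = (9 * 255 + 92 * 255 + 27 * 255) >> 7 = (128 * 255) >> 7 = 255
 *   U = (127 * 255 - 99 * 255 - 29 * 255) >> 8 = -255 >> 8 = -1
 * and after the +128 bias (subtracting CONST128_FACTORS) U becomes 127,
 * one LSB off the ideal 128 due to truncation.  The factor sets also keep
 * _mm_maddubs_epi16/_mm_hadd_epi16 overflow-free: the largest per-pixel sum,
 * 128 * 255 = 32640, still fits a signed 16 bit lane. */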
/* Compute one row of Y from 16-pixel-aligned BGRX input */
static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst,
                                            UINT32 width)
{
	const __m128i y_factors = BGRX_Y_FACTORS;
	const __m128i* argb = (const __m128i*)src;
	__m128i* ydst = (__m128i*)dst;

	for (UINT32 x = 0; x < width; x += 16)
	{
		/* load 16 BGRX pixels into 4 128 bit registers */
		__m128i x0 = _mm_load_si128(argb++);
		__m128i x1 = _mm_load_si128(argb++);
		__m128i x2 = _mm_load_si128(argb++);
		__m128i x3 = _mm_load_si128(argb++);

		/* multiply all components with their weight */
		x0 = _mm_maddubs_epi16(x0, y_factors);
		x1 = _mm_maddubs_epi16(x1, y_factors);
		x2 = _mm_maddubs_epi16(x2, y_factors);
		x3 = _mm_maddubs_epi16(x3, y_factors);

		/* combine the subtotals into one 16 bit luma value per pixel */
		x0 = _mm_hadd_epi16(x0, x1);
		x2 = _mm_hadd_epi16(x2, x3);

		/* renormalize */
		x0 = _mm_srli_epi16(x0, Y_SHIFT);
		x2 = _mm_srli_epi16(x2, Y_SHIFT);

		/* pack the 16 words as bytes and store */
		x0 = _mm_packus_epi16(x0, x2);
		_mm_storeu_si128(ydst++, x0);
	}
}
/* Compute one row of subsampled U/V from two BGRX rows (2x2 average) */
static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
                                             const BYTE* WINPR_RESTRICT src2,
                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
                                             UINT32 width)
{
	const __m128i u_factors = BGRX_U_FACTORS;
	const __m128i v_factors = BGRX_V_FACTORS;
	const __m128i vector128 = CONST128_FACTORS;
	const __m128i* rgb1 = (const __m128i*)src1;
	const __m128i* rgb2 = (const __m128i*)src2;
	__m64* udst = (__m64*)dst1;
	__m64* vdst = (__m64*)dst2;

	for (UINT32 x = 0; x < width; x += 16)
	{
		__m128i x5;
		/* subsample vertically: average the two rows pixel by pixel */
		__m128i x0 = _mm_load_si128(rgb1++);
		__m128i x4 = _mm_load_si128(rgb2++);
		x0 = _mm_avg_epu8(x0, x4);
		__m128i x1 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x1 = _mm_avg_epu8(x1, x4);
		__m128i x2 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x2 = _mm_avg_epu8(x2, x4);
		__m128i x3 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x3 = _mm_avg_epu8(x3, x4);

		/* subsample horizontally: gather the even (0x88) and odd (0xdd)
		 * pixels of each register pair and average them */
		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
		x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
		x0 = _mm_avg_epu8(x0, x4);
		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
		x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
		x1 = _mm_avg_epu8(x1, x4);

		/* multiply the 8 remaining pixels with the U and V weights */
		x2 = _mm_maddubs_epi16(x0, u_factors);
		x3 = _mm_maddubs_epi16(x1, u_factors);
		x4 = _mm_maddubs_epi16(x0, v_factors);
		x5 = _mm_maddubs_epi16(x1, v_factors);

		x0 = _mm_hadd_epi16(x2, x3);
		x1 = _mm_hadd_epi16(x4, x5);

		/* renormalize */
		x0 = _mm_srai_epi16(x0, U_SHIFT);
		x1 = _mm_srai_epi16(x1, V_SHIFT);

		/* pack the 8 U and 8 V values into one register, bias to unsigned */
		x0 = _mm_packs_epi16(x0, x1);
		x0 = _mm_sub_epi8(x0, vector128);

		/* store: low half to the U plane, high half to the V plane */
		_mm_storel_pi(udst++, _mm_castsi128_ps(x0));
		_mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
	}
}
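
/* Accuracy note (observation, not upstream documentation): _mm_avg_epu8
 * rounds up, so the cascaded avg(avg(row1, row2)) used above may differ by
 * one LSB from a true (a + b + c + d + 2) / 4 average of the 2x2 block. */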
static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                        const UINT32 dstStep[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
	const BYTE* argb = pSrc;
	BYTE* ydst = pDst[0];
	BYTE* udst = pDst[1];
	BYTE* vdst = pDst[2];

	if (roi->height < 1 || roi->width < 1)
	{
		return !PRIMITIVES_SUCCESS;
	}

	if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
	{
		return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
	}

	for (UINT32 y = 0; y < roi->height - 1; y += 2)
	{
		const BYTE* line1 = argb;
		const BYTE* line2 = argb + srcStep;
		ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
		ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
		ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
		argb += 2ULL * srcStep;
		ydst += 2ULL * dstStep[0];
		udst += 1ULL * dstStep[1];
		vdst += 1ULL * dstStep[2];
	}

	if (roi->height & 1)
	{
		/* pass the same last line as odd and even row */
		ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
		ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
	}

	return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
{
	switch (srcFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);

		default:
			return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
	}
}
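
/* AVC444 v1 layout sketch, inferred from the store pattern below (see
 * MS-RDPEGFX for the normative description): a 4:4:4 frame is split into a
 * YUV420 main view (b1: luma, b2/b3: 2x2 averaged U/V) and an auxiliary view
 * carrying the samples that the subsampling drops.  b4/b5 receive the
 * full-resolution U/V of the odd rows inside the auxiliary luma plane,
 * b6/b7 the odd columns of the even rows. */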
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
{
	const __m128i* argbEven = (const __m128i*)srcEven;
	const __m128i* argbOdd = (const __m128i*)srcOdd;
	const __m128i y_factors = BGRX_Y_FACTORS;
	const __m128i u_factors = BGRX_U_FACTORS;
	const __m128i v_factors = BGRX_V_FACTORS;
	const __m128i vector128 = CONST128_FACTORS;

	for (UINT32 x = 0; x < width; x += 16)
	{
		/* load 16 BGRX pixels of the even and the odd row */
		const __m128i xe1 = _mm_load_si128(argbEven++);
		const __m128i xe2 = _mm_load_si128(argbEven++);
		const __m128i xe3 = _mm_load_si128(argbEven++);
		const __m128i xe4 = _mm_load_si128(argbEven++);
		const __m128i xo1 = _mm_load_si128(argbOdd++);
		const __m128i xo2 = _mm_load_si128(argbOdd++);
		const __m128i xo3 = _mm_load_si128(argbOdd++);
		const __m128i xo4 = _mm_load_si128(argbOdd++);

		{
			/* Y: weighted sum, horizontal add, renormalize, pack */
			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
			                                                  _mm_maddubs_epi16(xe2, y_factors)),
			                                   Y_SHIFT);
			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
			                                                  _mm_maddubs_epi16(xe4, y_factors)),
			                                   Y_SHIFT);
			const __m128i ye = _mm_packus_epi16(ye1, ye2);
			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
			                                                  _mm_maddubs_epi16(xo2, y_factors)),
			                                   Y_SHIFT);
			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
			                                                  _mm_maddubs_epi16(xo4, y_factors)),
			                                   Y_SHIFT);
			const __m128i yo = _mm_packus_epi16(yo1, yo2);
			/* store y [b1] */
			_mm_storeu_si128((__m128i*)b1Even, ye);
			b1Even += 16;

			if (b1Odd)
			{
				_mm_storeu_si128((__m128i*)b1Odd, yo);
				b1Odd += 16;
			}
		}

		{
			/* U: 16 even row values in ue, 16 odd row values in uo */
			__m128i ue;
			__m128i uo = { 0 };

			{
				const __m128i ue1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
				                                  _mm_maddubs_epi16(xe2, u_factors)),
				                   U_SHIFT);
				const __m128i ue2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
				                                  _mm_maddubs_epi16(xe4, u_factors)),
				                   U_SHIFT);
				ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
			}

			if (b1Odd)
			{
				const __m128i uo1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
				                                  _mm_maddubs_epi16(xo2, u_factors)),
				                   U_SHIFT);
				const __m128i uo2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
				                                  _mm_maddubs_epi16(xo4, u_factors)),
				                   U_SHIFT);
				uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
			}

			if (b1Odd) /* b2: 2x2 average of the block */
			{
				const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
				const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
				const __m128i hi = _mm_add_epi16(ueh, uoh);
				const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
				const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
				const __m128i lo = _mm_add_epi16(uel, uol);
				const __m128i added = _mm_hadd_epi16(lo, hi);
				const __m128i avg16 = _mm_srai_epi16(added, 2);
				const __m128i avg = _mm_packus_epi16(avg16, avg16);
				_mm_storel_epi64((__m128i*)b2, avg);
			}
			else /* b2: single row, take the even columns */
			{
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
				const __m128i ud = _mm_shuffle_epi8(ue, mask);
				_mm_storel_epi64((__m128i*)b2, ud);
			}

			b2 += 8;

			if (b1Odd) /* b4: full odd row */
			{
				_mm_store_si128((__m128i*)b4, uo);
				b4 += 16;
			}

			{
				/* b6: odd columns of the even row */
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
				const __m128i ude = _mm_shuffle_epi8(ue, mask);
				_mm_storel_epi64((__m128i*)b6, ude);
				b6 += 8;
			}
		}

		{
			/* V: same flow as U, with v_factors and b3/b5/b7 */
			__m128i ve;
			__m128i vo = { 0 };

			{
				const __m128i ve1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
				                                  _mm_maddubs_epi16(xe2, v_factors)),
				                   V_SHIFT);
				const __m128i ve2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
				                                  _mm_maddubs_epi16(xe4, v_factors)),
				                   V_SHIFT);
				ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
			}

			if (b1Odd)
			{
				const __m128i vo1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
				                                  _mm_maddubs_epi16(xo2, v_factors)),
				                   V_SHIFT);
				const __m128i vo2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
				                                  _mm_maddubs_epi16(xo4, v_factors)),
				                   V_SHIFT);
				vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
			}

			if (b1Odd) /* b3 */
			{
				const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
				const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
				const __m128i hi = _mm_add_epi16(veh, voh);
				const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
				const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
				const __m128i lo = _mm_add_epi16(vel, vol);
				const __m128i added = _mm_hadd_epi16(lo, hi);
				const __m128i avg16 = _mm_srai_epi16(added, 2);
				const __m128i avg = _mm_packus_epi16(avg16, avg16);
				_mm_storel_epi64((__m128i*)b3, avg);
			}
			else /* b3 */
			{
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
				const __m128i vd = _mm_shuffle_epi8(ve, mask);
				_mm_storel_epi64((__m128i*)b3, vd);
			}

			b3 += 8;

			if (b1Odd) /* b5 */
			{
				_mm_store_si128((__m128i*)b5, vo);
				b5 += 16;
			}

			{
				/* b7: odd columns of the even row */
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
				const __m128i vde = _mm_shuffle_epi8(ve, mask);
				_mm_storel_epi64((__m128i*)b7, vde);
				b7 += 8;
			}
		}
	}
}
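
/* Row mapping sketch for the auxiliary luma plane used below: with i = y / 2,
 * n = (i & ~7) + i interleaves blocks of eight rows, so U data (b4) lands in
 * rows 0..7, 16..23, ... and V data (b5 = b4 + 8 lines) in rows 8..15,
 * 24..31, ...  For example i = 0..7 -> n = 0..7, i = 8 -> n = 16,
 * i = 9 -> n = 17. */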
static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                           UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                           const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                           const UINT32 dst2Step[],
                                           const prim_size_t* WINPR_RESTRICT roi)
{
	if (roi->height < 1 || roi->width < 1)
		return !PRIMITIVES_SUCCESS;

	if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
		return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
		                               roi);

	const BYTE* pMaxSrc = pSrc + 1ULL * (roi->height - 1) * srcStep;

	for (size_t y = 0; y < roi->height; y += 2)
	{
		const BOOL last = (y >= (roi->height - 1));
		const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
		const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
		const size_t i = y >> 1;
		const size_t n = (i & (size_t)~7) + i;
		BYTE* b1Even = pDst1[0] + y * dst1Step[0];
		BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
		BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
		BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
		BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
		BYTE* b5 = b4 + 8ULL * dst2Step[0];
		BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
		BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
		ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
		                                     roi->width);
	}

	return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                      const UINT32 dst2Step[],
                                      const prim_size_t* WINPR_RESTRICT roi)
{
	switch (srcFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
			                                 dst2Step, roi);

		default:
			return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
			                               dst2Step, roi);
	}
}
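
/* AVC444 v2 layout sketch, inferred from the store pattern below: the main
 * view again holds YUV420 data.  In the auxiliary view the odd-column U and
 * V samples of every row are stored side by side in the two halves of the
 * auxiliary luma rows, while the even-column U/V samples of the odd rows are
 * split across the auxiliary chroma planes via _mm_stream_si32. */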
static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
{
	const __m128i vector128 = CONST128_FACTORS;
	const __m128i* argbEven = (const __m128i*)srcEven;
	const __m128i* argbOdd = (const __m128i*)srcOdd;

	for (UINT32 x = 0; x < width; x += 16)
	{
		/* load 16 BGRX pixels of the even and the odd row */
		const __m128i xe1 = _mm_load_si128(argbEven++);
		const __m128i xe2 = _mm_load_si128(argbEven++);
		const __m128i xe3 = _mm_load_si128(argbEven++);
		const __m128i xe4 = _mm_load_si128(argbEven++);
		const __m128i xo1 = _mm_load_si128(argbOdd++);
		const __m128i xo2 = _mm_load_si128(argbOdd++);
		const __m128i xo3 = _mm_load_si128(argbOdd++);
		const __m128i xo4 = _mm_load_si128(argbOdd++);

		{
			/* Y: even row */
			const __m128i y_factors = BGRX_Y_FACTORS;
			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
			                                                  _mm_maddubs_epi16(xe2, y_factors)),
			                                   Y_SHIFT);
			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
			                                                  _mm_maddubs_epi16(xe4, y_factors)),
			                                   Y_SHIFT);
			const __m128i ye = _mm_packus_epi16(ye1, ye2);
			_mm_storeu_si128((__m128i*)yLumaDstEven, ye);
			yLumaDstEven += 16;
		}

		if (yLumaDstOdd)
		{
			/* Y: odd row */
			const __m128i y_factors = BGRX_Y_FACTORS;
			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
			                                                  _mm_maddubs_epi16(xo2, y_factors)),
			                                   Y_SHIFT);
			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
			                                                  _mm_maddubs_epi16(xo4, y_factors)),
			                                   Y_SHIFT);
			const __m128i yo = _mm_packus_epi16(yo1, yo2);
			_mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
			yLumaDstOdd += 16;
		}

		{
			/* U: even and odd row, plus the 2x2 average for the main view */
			__m128i ue = { 0 };
			__m128i uo = { 0 };
			__m128i uavg = { 0 };
			{
				const __m128i u_factors = BGRX_U_FACTORS;
				const __m128i ue1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
				                                  _mm_maddubs_epi16(xe2, u_factors)),
				                   U_SHIFT);
				const __m128i ue2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
				                                  _mm_maddubs_epi16(xe4, u_factors)),
				                   U_SHIFT);
				const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
				ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
				uavg = ueavg;
			}

			if (yLumaDstOdd)
			{
				const __m128i u_factors = BGRX_U_FACTORS;
				const __m128i uo1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
				                                  _mm_maddubs_epi16(xo2, u_factors)),
				                   U_SHIFT);
				const __m128i uo2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
				                                  _mm_maddubs_epi16(xo4, u_factors)),
				                   U_SHIFT);
				const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
				uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
				uavg = _mm_add_epi16(uavg, uoavg);
				uavg = _mm_srai_epi16(uavg, 2);
				uavg = _mm_packs_epi16(uavg, uoavg);
				uavg = _mm_sub_epi8(uavg, vector128);
			}

			{
				/* odd columns of the even row -> auxiliary luma, first half */
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
				const __m128i ude = _mm_shuffle_epi8(ue, mask);
				_mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
				yEvenChromaDst1 += 8;
			}

			if (yLumaDstOdd)
			{
				/* odd columns of the odd row -> auxiliary luma, first half */
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
				const __m128i udo = _mm_shuffle_epi8(uo, mask);
				_mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
				yOddChromaDst1 += 8;
			}

			if (yLumaDstOdd)
			{
				/* even columns of the odd row, split over the aux chroma planes */
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
				const __m128i ud = _mm_shuffle_epi8(uo, mask);
				int* uDst1 = (int*)uChromaDst1;
				int* vDst1 = (int*)vChromaDst1;
				const int* src = (const int*)&ud;
				_mm_stream_si32(uDst1, src[0]);
				_mm_stream_si32(vDst1, src[1]);
				uChromaDst1 += 4;
				vChromaDst1 += 4;
			}

			if (yLumaDstOdd)
			{
				_mm_storel_epi64((__m128i*)uLumaDst, uavg);
			}
			else
			{
				/* single row: use the even columns of the even row directly */
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
				const __m128i ud = _mm_shuffle_epi8(ue, mask);
				_mm_storel_epi64((__m128i*)uLumaDst, ud);
			}

			uLumaDst += 8;
		}

		{
			/* V: identical flow with the V factor set */
			__m128i ve = { 0 };
			__m128i vo = { 0 };
			__m128i vavg = { 0 };
			{
				const __m128i v_factors = BGRX_V_FACTORS;
				const __m128i ve1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
				                                  _mm_maddubs_epi16(xe2, v_factors)),
				                   V_SHIFT);
				const __m128i ve2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
				                                  _mm_maddubs_epi16(xe4, v_factors)),
				                   V_SHIFT);
				const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
				ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
				vavg = veavg;
			}

			if (yLumaDstOdd)
			{
				const __m128i v_factors = BGRX_V_FACTORS;
				const __m128i vo1 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
				                                  _mm_maddubs_epi16(xo2, v_factors)),
				                   V_SHIFT);
				const __m128i vo2 =
				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
				                                  _mm_maddubs_epi16(xo4, v_factors)),
				                   V_SHIFT);
				const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
				vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
				vavg = _mm_add_epi16(vavg, voavg);
				vavg = _mm_srai_epi16(vavg, 2);
				vavg = _mm_packs_epi16(vavg, voavg);
				vavg = _mm_sub_epi8(vavg, vector128);
			}

			{
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
				const __m128i vde = _mm_shuffle_epi8(ve, mask);
				_mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
				yEvenChromaDst2 += 8;
			}

			if (yLumaDstOdd)
			{
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
				const __m128i vdo = _mm_shuffle_epi8(vo, mask);
				_mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
				yOddChromaDst2 += 8;
			}

			if (yLumaDstOdd)
			{
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
				const __m128i vd = _mm_shuffle_epi8(vo, mask);
				int* uDst2 = (int*)uChromaDst2;
				int* vDst2 = (int*)vChromaDst2;
				const int* src = (const int*)&vd;
				_mm_stream_si32(uDst2, src[0]);
				_mm_stream_si32(vDst2, src[1]);
				uChromaDst2 += 4;
				vChromaDst2 += 4;
			}

			if (yLumaDstOdd)
			{
				_mm_storel_epi64((__m128i*)vLumaDst, vavg);
			}
			else
			{
				const __m128i mask =
				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
				const __m128i vd = _mm_shuffle_epi8(ve, mask);
				_mm_storel_epi64((__m128i*)vLumaDst, vd);
			}

			vLumaDst += 8;
		}
	}
}
static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                             UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                             const UINT32 dst1Step[],
                                             BYTE* WINPR_RESTRICT pDst2[],
                                             const UINT32 dst2Step[],
                                             const prim_size_t* WINPR_RESTRICT roi)
{
	if (roi->height < 1 || roi->width < 1)
		return !PRIMITIVES_SUCCESS;

	if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
		return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
		                                 dst2Step, roi);

	for (size_t y = 0; y < roi->height; y += 2)
	{
		const BYTE* srcEven = (pSrc + y * srcStep);
		const BYTE* srcOdd = (srcEven + srcStep);
		BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
		BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
		BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
		BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
		BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
		BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
		BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
		BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
		BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
		BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
		BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
		BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
		ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
		                                       dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
		                                       dstOddChromaY1, dstOddChromaY2, dstChromaU1,
		                                       dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
	}

	return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                        const UINT32 dst2Step[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
	switch (srcFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
			                                   dst2Step, roi);

		default:
			return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
			                                 dst2Step, roi);
	}
}
static pstatus_t ssse3_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[],
                                    BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfWidth = (nWidth + 1) / 2;
	const UINT32 halfPad = halfWidth % 16;
	const UINT32 halfHeight = (nHeight + 1) / 2;
	const UINT32 oddY = 1;
	const UINT32 evenY = 0;
	const UINT32 oddX = 1;
	const UINT32 evenX = 0;
	const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
		                    pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
		                    pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
	BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
		              pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
		              pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };

	/* B1: the Y data is already at full resolution, just copy it */
	for (size_t y = 0; y < nHeight; y++)
	{
		const BYTE* Ym = pSrc[0] + y * srcStep[0];
		BYTE* pY = pDst[0] + y * dstStep[0];
		memcpy(pY, Ym, nWidth);
	}

	/* B2 and B3: expand the subsampled U/V of this frame to full resolution */
	for (size_t y = 0; y < halfHeight; y++)
	{
		const size_t val2y = (2 * y + evenY);
		const size_t val2y1 = val2y + oddY;
		const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
		const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
		BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
		BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
		BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
		BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;

		size_t x = 0;

		for (; x < halfWidth - halfPad; x += 16)
		{
			/* double each U/V byte horizontally, duplicate to both rows */
			const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
			const __m128i unpackLow =
			    _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
			{
				const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
				const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
				const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
				_mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
				_mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
				_mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
				_mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
			}
			{
				const __m128i v = _mm_loadu_si128((const __m128i*)&Vm[x]);
				const __m128i vHigh = _mm_shuffle_epi8(v, unpackHigh);
				const __m128i vLow = _mm_shuffle_epi8(v, unpackLow);
				_mm_storeu_si128((__m128i*)&pV[2 * x], vHigh);
				_mm_storeu_si128((__m128i*)&pV[2 * x + 16], vLow);
				_mm_storeu_si128((__m128i*)&pV1[2 * x], vHigh);
				_mm_storeu_si128((__m128i*)&pV1[2 * x + 16], vLow);
			}
		}

		for (; x < halfWidth; x++)
		{
			const size_t val2x = 2 * x + evenX;
			const size_t val2x1 = val2x + oddX;
			pU[val2x] = Um[x];
			pV[val2x] = Vm[x];
			pU[val2x1] = Um[x];
			pV[val2x1] = Vm[x];
			pU1[val2x] = Um[x];
			pV1[val2x] = Vm[x];
			pU1[val2x1] = Um[x];
			pV1[val2x1] = Vm[x];
		}
	}

	return PRIMITIVES_SUCCESS;
}
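
/* The filter below undoes the encoder-side averaging: the main view stores
 * avg ~ (a + b + c + d) / 4 for each 2x2 block, while b, c and d have been
 * restored at full resolution from the auxiliary views, so the even/even
 * sample can be recovered as
 *
 *   a = 4 * avg - b - c - d
 *
 * ssse3_filter computes exactly 4 * even - odd - even' - odd' for 8 sample
 * pairs at a time; CONDITIONAL_CLIP guards against implausible results by
 * falling back towards the unfiltered value. */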
static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
{
	const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80,
	                                  8, (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80,
	                                  0);
	const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
	                                 (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
	const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
	const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
	const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
	const __m128i uEven = _mm_shuffle_epi8(u, even);
	const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
	const __m128i uOdd = _mm_shuffle_epi8(u, odd);
	const __m128i u1Even = _mm_shuffle_epi8(u1, even);
	const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
	const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
	const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
	const __m128i result = _mm_sub_epi16(uEven4, tmp2);
	const __m128i packed = _mm_packus_epi16(result, uOdd);
	const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
	_mm_storeu_si128((__m128i*)pSrcDst, interleaved);
}
static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 oddY = 1;
	const UINT32 evenY = 0;
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfHeight = (nHeight + 1) / 2;
	const UINT32 halfWidth = (nWidth + 1) / 2;
	const UINT32 halfPad = halfWidth % 16;

	/* Filter */
	for (size_t y = roi->top; y < halfHeight + roi->top; y++)
	{
		size_t x = roi->left;
		const size_t val2y = (y * 2ULL + evenY);
		const size_t val2y1 = val2y + oddY;
		BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
		BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
		BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
		BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;

		if (val2y1 > nHeight)
			continue;

		for (; x < halfWidth + roi->left - halfPad; x += 16)
		{
			ssse3_filter(&pU[2 * x], &pU1[2 * x]);
			ssse3_filter(&pV[2 * x], &pV1[2 * x]);
		}

		for (; x < halfWidth + roi->left; x++)
		{
			const size_t val2x = (x * 2ULL);
			const size_t val2x1 = val2x + 1ULL;
			const BYTE inU = pU[val2x];
			const BYTE inV = pV[val2x];
			const INT32 up = inU * 4;
			const INT32 vp = inV * 4;
			INT32 u2020 = 0;
			INT32 v2020 = 0;

			if (val2x1 > nWidth)
				continue;

			u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
			v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
			pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
			pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
		}
	}

	return PRIMITIVES_SUCCESS;
}
static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
                                        const UINT32 srcStep[3],
                                        BYTE* WINPR_RESTRICT pDstRaw[3], const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 mod = 16;
	UINT32 uY = 0;
	UINT32 vY = 0;
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfWidth = (nWidth + 1) / 2;
	const UINT32 halfPad = halfWidth % 16;
	const UINT32 halfHeight = (nHeight + 1) / 2;
	const UINT32 oddY = 1;
	const UINT32 evenY = 0;
	const UINT32 oddX = 1;
	/* The auxiliary frame is aligned to multiples of 16x16;
	 * the padded height is needed to convert the B4 and B5 planes. */
	const UINT32 padHeight = nHeight + 16 - nHeight % 16;
	const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
		                    pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
		                    pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
	BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
		              pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
		              pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
	const __m128i zero = _mm_setzero_si128();
	const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
	                                  0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);

	/* B4 and B5: odd row U/V values, stored as luma rows in the aux frame */
	for (size_t y = 0; y < padHeight; y++)
	{
		const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
		BYTE* pX = NULL;

		if ((y) % mod < (mod + 1) / 2)
		{
			const UINT32 pos = (2 * uY++ + oddY);

			if (pos >= nHeight)
				continue;

			pX = pDst[1] + 1ULL * dstStep[1] * pos;
		}
		else
		{
			const UINT32 pos = (2 * vY++ + oddY);

			if (pos >= nHeight)
				continue;

			pX = pDst[2] + 1ULL * dstStep[2] * pos;
		}

		memcpy(pX, Ya, nWidth);
	}

	/* B6 and B7: interleave the remaining U/V samples into the even rows */
	for (size_t y = 0; y < halfHeight; y++)
	{
		const size_t val2y = (y * 2 + evenY);
		const BYTE* Ua = pSrc[1] + srcStep[1] * y;
		const BYTE* Va = pSrc[2] + srcStep[2] * y;
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
		BYTE* pV = pDst[2] + dstStep[2] * val2y;

		size_t x = 0;

		for (; x < halfWidth - halfPad; x += 16)
		{
			{
				const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
				const __m128i u2 = _mm_unpackhi_epi8(u, zero);
				const __m128i u1 = _mm_unpacklo_epi8(u, zero);
				_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
				_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
			}
			{
				const __m128i v = _mm_loadu_si128((const __m128i*)&Va[x]);
				const __m128i v2 = _mm_unpackhi_epi8(v, zero);
				const __m128i v1 = _mm_unpacklo_epi8(v, zero);
				_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
				_mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
			}
		}

		for (; x < halfWidth; x++)
		{
			const size_t val2x1 = (x * 2ULL + oddX);
			pU[val2x1] = Ua[x];
			pV[val2x1] = Va[x];
		}
	}

	/* Filter */
	return ssse3_ChromaFilter(pDst, dstStep, roi);
}
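
/* For the v2 auxiliary frame the U and V halves share planes, as the pointer
 * arithmetic below shows: each aux luma row holds U data in its left half and
 * V data in its right half (pYaV = pYaU + nTotalWidth / 2), and the aux
 * chroma planes are split into quarters the same way.  This is why
 * ssse3_ChromaV2ToYUV444 needs the total frame dimensions besides the ROI. */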
static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3],
                                        const UINT32 srcStep[3], UINT32 nTotalWidth,
                                        UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfWidth = (nWidth + 1) / 2;
	const UINT32 halfPad = halfWidth % 16;
	const UINT32 halfHeight = (nHeight + 1) / 2;
	const UINT32 quaterWidth = (nWidth + 3) / 4;
	const UINT32 quaterPad = quaterWidth % 16;
	const __m128i zero = _mm_setzero_si128();
	const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
	                                  (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
	const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
	                                   0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
	const __m128i shuffle1 =
	    _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80,
	                 11, (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
	const __m128i shuffle2 =
	    _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
	                 (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);

	/* B4 and B5: odd-column U/V for all rows, from the aux luma plane halves */
	for (size_t y = 0; y < nHeight; y++)
	{
		const size_t yTop = y + roi->top;
		const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
		const BYTE* pYaV = pYaU + nTotalWidth / 2;
		BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
		BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;

		size_t x = 0;

		for (; x < halfWidth - halfPad; x += 16)
		{
			{
				const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
				const __m128i u2 = _mm_unpackhi_epi8(zero, u);
				const __m128i u1 = _mm_unpacklo_epi8(zero, u);
				_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
				_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
			}
			{
				const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
				const __m128i v2 = _mm_unpackhi_epi8(zero, v);
				const __m128i v1 = _mm_unpacklo_epi8(zero, v);
				_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
				_mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
			}
		}

		for (; x < halfWidth; x++)
		{
			const size_t odd = 2ULL * x + 1;
			pU[odd] = pYaU[x];
			pV[odd] = pYaV[x];
		}
	}

	/* B6 - B9: even-column U/V of the odd rows, from the aux chroma quarters */
	for (size_t y = 0; y < halfHeight; y++)
	{
		const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
		const BYTE* pUaV = pUaU + nTotalWidth / 4;
		const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
		const BYTE* pVaV = pVaU + nTotalWidth / 4;
		BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
		BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;

		size_t x = 0;

		for (; x < quaterWidth - quaterPad; x += 16)
		{
			{
				const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
				const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
				const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
				const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
				const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
				const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
				const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
				const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
				_mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
				_mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
				_mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
				_mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
			}
			{
				const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
				const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
				const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
				const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
				const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
				const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
				const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
				const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
				_mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
				_mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
				_mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
				_mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
			}
		}

		for (; x < quaterWidth; x++)
		{
			pU[4 * x + 0] = pUaU[x];
			pV[4 * x + 0] = pUaV[x];
			pU[4 * x + 2] = pVaU[x];
			pV[4 * x + 2] = pVaV[x];
		}
	}

	return ssse3_ChromaFilter(pDst, dstStep, roi);
}
static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
                                             const BYTE* WINPR_RESTRICT pSrc[3],
                                             const UINT32 srcStep[3], UINT32 nWidth,
                                             UINT32 nHeight, BYTE* WINPR_RESTRICT pDst[3],
                                             const UINT32 dstStep[3],
                                             const RECTANGLE_16* WINPR_RESTRICT roi)
{
	if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
		return -1;

	if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
		return -1;

	if (!roi)
		return -1;

	switch (type)
	{
		case AVC444_LUMA:
			return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);

		case AVC444_CHROMAv1:
			return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);

		case AVC444_CHROMAv2:
			return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);

		default:
			return -1;
	}
}
#endif

void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	generic = primitives_get_generic();
	primitives_init_YUV(prims);

	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
		prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
		prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
		prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
		prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
		prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
		prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
	}
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}