#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_colors.h"

#include "prim_internal.h"
#include "prim_templates.h"

#if defined(SSE2_ENABLED)
#include <emmintrin.h>
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))

#define CACHE_LINE_BYTES 64
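
/* Clamp every 16-bit lane of _val to the inclusive range [_min, _max]. */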
#define mm_between_epi16(_val, _min, _max)                        \
	do                                                            \
	{                                                             \
		(_val) = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); \
	} while (0)
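
/* Prefetch a buffer one cache line at a time using non-temporal hints. */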
static inline void GNU_INLINE _mm_prefetch_buffer(char* WINPR_RESTRICT buffer, int num_bytes)
{
	__m128i* buf = (__m128i*)buffer;

	for (unsigned int i = 0; i < (num_bytes / sizeof(__m128i));
	     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
	}
}
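
/* Convert three planar 16-bit YCbCr buffers into three planar 16-bit RGB
 * buffers. Unaligned or oddly-strided inputs fall back to the generic C
 * implementation. */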
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
                                             INT16* WINPR_RESTRICT pDst[3], int dstStep,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
	const __m128i* y_buf = NULL;
	const __m128i* cb_buf = NULL;
	const __m128i* cr_buf = NULL;
	__m128i* r_buf = NULL;
	__m128i* g_buf = NULL;
	__m128i* b_buf = NULL;
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
	    (srcStep & 127) || (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
	}
	zero = _mm_setzero_si128();
	max = _mm_set1_epi16(255);
	y_buf = (const __m128i*)(pSrc[0]);
	cb_buf = (const __m128i*)(pSrc[1]);
	cr_buf = (const __m128i*)(pSrc[2]);
	r_buf = (__m128i*)(pDst[0]);
	g_buf = (__m128i*)(pDst[1]);
	b_buf = (__m128i*)(pDst[2]);
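	/* Fixed-point YCbCr -> RGB coefficients, scaled by 2^14:
	 * 22986 ~ 1.403, -5636 ~ -0.344, -11698 ~ -0.714, 28999 ~ 1.770. */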
	r_cr = _mm_set1_epi16(22986);
	g_cb = _mm_set1_epi16(-5636);
	g_cr = _mm_set1_epi16(-11698);
	b_cb = _mm_set1_epi16(28999);
	c4096 = _mm_set1_epi16(4096);
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);
	for (UINT32 yp = 0; yp < roi->height; yp++)
	{
		for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
	}
	y_buf = (__m128i*)(pSrc[0]);
	cb_buf = (__m128i*)(pSrc[1]);
	cr_buf = (__m128i*)(pSrc[2]);

	imax = roi->width * sizeof(INT16) / sizeof(__m128i);

	for (UINT32 yp = 0; yp < roi->height; ++yp)
	{
		for (int i = 0; i < imax; i++)
		{
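			/* Per 8-pixel vector, in 16-bit fixed point:
			 *   y = (y + 4096) >> 2
			 *   r = clamp((y + HIWORD(cr * 22986)) >> 3, 0, 255)
			 *   g = clamp((y + HIWORD(cb * -5636) + HIWORD(cr * -11698)) >> 3, 0, 255)
			 *   b = clamp((y + HIWORD(cb * 28999)) >> 3, 0, 255)
			 * where HIWORD() is the high 16 bits of the product (_mm_mulhi_epi16). */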
			y = _mm_load_si128(y_buf + i);
			y = _mm_add_epi16(y, c4096);
			y = _mm_srai_epi16(y, 2);
			cb = _mm_load_si128(cb_buf + i);
			cr = _mm_load_si128(cr_buf + i);
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
			r = _mm_srai_epi16(r, 3);
			mm_between_epi16(r, zero, max);
			_mm_store_si128(r_buf + i, r);
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
			g = _mm_srai_epi16(g, 3);
			mm_between_epi16(g, zero, max);
			_mm_store_si128(g_buf + i, g);
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
			b = _mm_srai_epi16(b, 3);
			mm_between_epi16(b, zero, max);
			_mm_store_si128(b_buf + i, b);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
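
/* Convert planar 16-bit YCbCr to interleaved 8-bit BGRX output; the alpha
 * byte of every pixel is forced to 0xFF. */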
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
                                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                   UINT32 dstStep,
                                                   const prim_size_t* WINPR_RESTRICT roi)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i max = _mm_set1_epi16(255);
	const __m128i r_cr = _mm_set1_epi16(22986);
	const __m128i g_cb = _mm_set1_epi16(-5636);
	const __m128i g_cr = _mm_set1_epi16(-11698);
	const __m128i b_cb = _mm_set1_epi16(28999);
	const __m128i c4096 = _mm_set1_epi16(4096);
	const INT16* y_buf = pSrc[0];
	const INT16* cb_buf = pSrc[1];
	const INT16* cr_buf = pSrc[2];
	const UINT32 pad = roi->width % 16;
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
	BYTE* d_buf = pDst;
	const size_t dstPad = (dstStep - roi->width * 4);
	for (UINT32 yp = 0; yp < roi->height; yp++)
	{
		for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
		}

		y_buf += srcStep / sizeof(INT16);
		cb_buf += srcStep / sizeof(INT16);
		cr_buf += srcStep / sizeof(INT16);
	}
	y_buf = (INT16*)pSrc[0];
	cb_buf = (INT16*)pSrc[1];
	cr_buf = (INT16*)pSrc[2];

	for (UINT32 yp = 0; yp < roi->height; ++yp)
	{
		for (UINT32 i = 0; i < imax; i += 2)
		{
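			/* Each iteration converts two 8-pixel vectors (16 pixels) and emits
			 * 64 bytes of interleaved BGRX output. */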
			y1 = _mm_load_si128((const __m128i*)y_buf);
			y_buf += step;
			y1 = _mm_add_epi16(y1, c4096);
			y1 = _mm_srai_epi16(y1, 2);
			cb1 = _mm_load_si128((const __m128i*)cb_buf);
			cb_buf += step;
			cr1 = _mm_load_si128((const __m128i*)cr_buf);
			cr_buf += step;
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
			r1 = _mm_srai_epi16(r1, 3);
			mm_between_epi16(r1, zero, max);
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
			g1 = _mm_srai_epi16(g1, 3);
			mm_between_epi16(g1, zero, max);
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
			b1 = _mm_srai_epi16(b1, 3);
			mm_between_epi16(b1, zero, max);
			y2 = _mm_load_si128((const __m128i*)y_buf);
			y_buf += step;
			y2 = _mm_add_epi16(y2, c4096);
			y2 = _mm_srai_epi16(y2, 2);
			cb2 = _mm_load_si128((const __m128i*)cb_buf);
			cb_buf += step;
			cr2 = _mm_load_si128((const __m128i*)cr_buf);
			cr_buf += step;
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
			r2 = _mm_srai_epi16(r2, 3);
			mm_between_epi16(r2, zero, max);
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
			g2 = _mm_srai_epi16(g2, 3);
			mm_between_epi16(g2, zero, max);
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
			b2 = _mm_srai_epi16(b2, 3);
			mm_between_epi16(b2, zero, max);
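			/* Pack the six 16-bit result vectors down to bytes and interleave
			 * them with an all-0xFF alpha vector into four 16-byte BGRX stores. */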
			R0 = _mm_packus_epi16(R0, R1);
			R1 = _mm_packus_epi16(R1, R2);
			R2 = _mm_unpacklo_epi8(R0, R2);
			R1 = _mm_unpackhi_epi8(R0, R1);
			R0 = _mm_packus_epi16(R0, R3);
			R3 = _mm_set1_epi32(0xFFFFFFFFU);
			R4 = _mm_unpacklo_epi8(R0, R4);
			R3 = _mm_unpackhi_epi8(R0, R3);
			R0 = _mm_unpacklo_epi16(R2, R0);
			R4 = _mm_unpackhi_epi16(R2, R4);
			R2 = _mm_unpacklo_epi16(R1, R2);
			R3 = _mm_unpackhi_epi16(R1, R3);
			_mm_store_si128((__m128i*)d_buf, R0);
			d_buf += sizeof(__m128i);
			_mm_store_si128((__m128i*)d_buf, R4);
			d_buf += sizeof(__m128i);
			_mm_store_si128((__m128i*)d_buf, R2);
			d_buf += sizeof(__m128i);
			_mm_store_si128((__m128i*)d_buf, R3);
			d_buf += sizeof(__m128i);
		for (UINT32 i = 0; i < pad; i++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
			const INT32 Cb = (*cb_buf++);
			const INT32 Cr = (*cr_buf++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			*d_buf++ = CLIP(B);
			*d_buf++ = CLIP(G);
			*d_buf++ = CLIP(R);
			*d_buf++ = 0xFF;
		}

		d_buf += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
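
/* Identical to the BGRX variant above, but the interleave stage produces
 * R,G,B,X byte order. */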
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3],
                                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                   UINT32 dstStep,
                                                   const prim_size_t* WINPR_RESTRICT roi)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i max = _mm_set1_epi16(255);
	const __m128i r_cr = _mm_set1_epi16(22986);
	const __m128i g_cb = _mm_set1_epi16(-5636);
	const __m128i g_cr = _mm_set1_epi16(-11698);
	const __m128i b_cb = _mm_set1_epi16(28999);
	const __m128i c4096 = _mm_set1_epi16(4096);
	const INT16* y_buf = pSrc[0];
	const INT16* cb_buf = pSrc[1];
	const INT16* cr_buf = pSrc[2];
	const UINT32 pad = roi->width % 16;
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
	BYTE* d_buf = pDst;
	const size_t dstPad = (dstStep - roi->width * 4);
	for (UINT32 yp = 0; yp < roi->height; yp++)
	{
		for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
		}

		y_buf += srcStep / sizeof(INT16);
		cb_buf += srcStep / sizeof(INT16);
		cr_buf += srcStep / sizeof(INT16);
	}
	y_buf = (INT16*)(pSrc[0]);
	cb_buf = (INT16*)(pSrc[1]);
	cr_buf = (INT16*)(pSrc[2]);

	for (UINT32 yp = 0; yp < roi->height; ++yp)
	{
		for (UINT32 i = 0; i < imax; i += 2)
		{
			y1 = _mm_load_si128((const __m128i*)y_buf);
			y_buf += step;
			y1 = _mm_add_epi16(y1, c4096);
			y1 = _mm_srai_epi16(y1, 2);
			cb1 = _mm_load_si128((const __m128i*)cb_buf);
			cb_buf += step;
			cr1 = _mm_load_si128((const __m128i*)cr_buf);
			cr_buf += step;
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
			r1 = _mm_srai_epi16(r1, 3);
			mm_between_epi16(r1, zero, max);
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
			g1 = _mm_srai_epi16(g1, 3);
			mm_between_epi16(g1, zero, max);
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
			b1 = _mm_srai_epi16(b1, 3);
			mm_between_epi16(b1, zero, max);
			y2 = _mm_load_si128((const __m128i*)y_buf);
			y_buf += step;
			y2 = _mm_add_epi16(y2, c4096);
			y2 = _mm_srai_epi16(y2, 2);
			cb2 = _mm_load_si128((const __m128i*)cb_buf);
			cb_buf += step;
			cr2 = _mm_load_si128((const __m128i*)cr_buf);
			cr_buf += step;
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
			r2 = _mm_srai_epi16(r2, 3);
			mm_between_epi16(r2, zero, max);
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
			g2 = _mm_srai_epi16(g2, 3);
			mm_between_epi16(g2, zero, max);
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
			b2 = _mm_srai_epi16(b2, 3);
			mm_between_epi16(b2, zero, max);
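			/* The pack/interleave sequence below mirrors the BGRX variant; only
			 * the channel vectors fed into it differ, giving R,G,B,X byte order. */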
			R0 = _mm_packus_epi16(R0, R1);
			R1 = _mm_packus_epi16(R1, R2);
			R2 = _mm_unpacklo_epi8(R0, R2);
			R1 = _mm_unpackhi_epi8(R0, R1);
			R0 = _mm_packus_epi16(R0, R3);
			R3 = _mm_set1_epi32(0xFFFFFFFFU);
			R4 = _mm_unpacklo_epi8(R0, R4);
			R3 = _mm_unpackhi_epi8(R0, R3);
			R0 = _mm_unpacklo_epi16(R2, R0);
			R4 = _mm_unpackhi_epi16(R2, R4);
			R2 = _mm_unpacklo_epi16(R1, R2);
			R3 = _mm_unpackhi_epi16(R1, R3);
			_mm_store_si128((__m128i*)d_buf, R0);
			d_buf += sizeof(__m128i);
			_mm_store_si128((__m128i*)d_buf, R4);
			d_buf += sizeof(__m128i);
			_mm_store_si128((__m128i*)d_buf, R2);
			d_buf += sizeof(__m128i);
			_mm_store_si128((__m128i*)d_buf, R3);
			d_buf += sizeof(__m128i);
		for (UINT32 i = 0; i < pad; i++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
			const INT32 Cb = (*cb_buf++);
			const INT32 Cr = (*cr_buf++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			*d_buf++ = CLIP(R);
			*d_buf++ = CLIP(G);
			*d_buf++ = CLIP(B);
			*d_buf++ = 0xFF;
		}

		d_buf += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
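
/* Dispatch on DstFormat; unaligned buffers or strides and unsupported formats
 * fall back to the generic implementation. */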
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                              BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                              UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
{
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst) & 0x0f) || (srcStep & 0x0f) ||
	    (dstStep & 0x0f))
		return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
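
/* Forward transform: convert three planar 16-bit RGB buffers into three
 * planar 16-bit YCbCr buffers. */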
static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
                                             INT16* WINPR_RESTRICT pDst[3], int dstStep,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
	const __m128i* r_buf = (const __m128i*)(pSrc[0]);
	const __m128i* g_buf = (const __m128i*)(pSrc[1]);
	const __m128i* b_buf = (const __m128i*)(pSrc[2]);
	__m128i* y_buf = (__m128i*)(pDst[0]);
	__m128i* cb_buf = (__m128i*)(pDst[1]);
	__m128i* cr_buf = (__m128i*)(pDst[2]);
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
	    (srcStep & 127) || (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
	}
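
	/* Fixed-point RGB -> YCbCr coefficients, scaled by 2^15; the values are
	 * approximately 0.299/0.587/0.114 (Y), -0.169/-0.332/0.500 (Cb) and
	 * 0.500/-0.419/-0.081 (Cr). */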
	min = _mm_set1_epi16(-128 * 32);
	max = _mm_set1_epi16(127 * 32);

	y_r = _mm_set1_epi16(9798);
	y_g = _mm_set1_epi16(19235);
	y_b = _mm_set1_epi16(3735);
	cb_r = _mm_set1_epi16(-5535);
	cb_g = _mm_set1_epi16(-10868);
	cb_b = _mm_set1_epi16(16403);
	cr_r = _mm_set1_epi16(16377);
	cr_g = _mm_set1_epi16(-13714);
	cr_b = _mm_set1_epi16(-2663);
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);
	for (UINT32 yp = 0; yp < roi->height; yp++)
	{
		for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}

		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	r_buf = (__m128i*)(pSrc[0]);
	g_buf = (__m128i*)(pSrc[1]);
	b_buf = (__m128i*)(pSrc[2]);

	imax = roi->width * sizeof(INT16) / sizeof(__m128i);

	for (UINT32 yp = 0; yp < roi->height; ++yp)
	{
		for (int i = 0; i < imax; i++)
		{
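			/* Per 8-pixel vector: shift R/G/B left by 6 so _mm_mulhi_epi16 keeps
			 * enough precision, accumulate the weighted channels, bias Y by -4096,
			 * then clamp everything to [-4096, 4064] and store. */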
			r = _mm_load_si128(r_buf + i);
			g = _mm_load_si128(g_buf + i);
			b = _mm_load_si128(b_buf + i);

			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);

			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf + i, y);

			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf + i, cb);

			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf + i, cr);
		}

		y_buf += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}

	return PRIMITIVES_SUCCESS;
}
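
/* Expand three planar 16-bit R/G/B buffers into interleaved 8-bit BGRX pixels
 * with the alpha byte forced to 0xFF. The _RGBX, _XBGR and _XRGB variants
 * below differ only in the order the channels are interleaved. */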
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                 UINT32 dstStep,
                                                 const prim_size_t* WINPR_RESTRICT roi)
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16;
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);

	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
	for (UINT32 y = 0; y < roi->height; ++y)
	{
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
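			/* Load 16 pixels from each 16-bit channel plane and pack each
			 * channel down to 16 unsigned bytes. */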
			R0 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			R1 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			b = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			R1 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			g = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			R1 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			r = _mm_packus_epi16(R0, R1);
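
			/* Interleave (B,G) and (R,0xFF) byte pairs, then widen to 32-bit
			 * B,G,R,X pixels and store four 16-byte blocks. */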
			gbLo = _mm_unpacklo_epi8(b, g);
			gbHi = _mm_unpackhi_epi8(b, g);
			arLo = _mm_unpacklo_epi8(r, a);
			arHi = _mm_unpackhi_epi8(r, a);
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
		}

		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
	return PRIMITIVES_SUCCESS;
}
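
/* As above, but interleaving into R,G,B,X byte order. */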
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3],
                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                 UINT32 dstStep,
                                                 const prim_size_t* WINPR_RESTRICT roi)
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16;
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);

	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
	for (UINT32 y = 0; y < roi->height; ++y)
	{
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
			R0 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			R1 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			b = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			R1 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			g = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			R1 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			r = _mm_packus_epi16(R0, R1);
			gbLo = _mm_unpacklo_epi8(r, g);
			gbHi = _mm_unpackhi_epi8(r, g);
			arLo = _mm_unpacklo_epi8(b, a);
			arHi = _mm_unpackhi_epi8(b, a);
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
		}

		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
	return PRIMITIVES_SUCCESS;
}
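
/* As above, but interleaving into X,B,G,R byte order. */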
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(const INT16* WINPR_RESTRICT pSrc[3],
                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                 UINT32 dstStep,
                                                 const prim_size_t* WINPR_RESTRICT roi)
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16;
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);

	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
	for (UINT32 y = 0; y < roi->height; ++y)
	{
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
			R0 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			R1 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			b = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			R1 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			g = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			R1 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			r = _mm_packus_epi16(R0, R1);
			gbLo = _mm_unpacklo_epi8(a, b);
			gbHi = _mm_unpackhi_epi8(a, b);
			arLo = _mm_unpacklo_epi8(g, r);
			arHi = _mm_unpackhi_epi8(g, r);
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
		}

		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
	return PRIMITIVES_SUCCESS;
}
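
/* As above, but interleaving into X,R,G,B byte order. */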
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(const INT16* WINPR_RESTRICT pSrc[3],
                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                 UINT32 dstStep,
                                                 const prim_size_t* WINPR_RESTRICT roi)
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	const UINT32 pad = roi->width % 16;

	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
	for (UINT32 y = 0; y < roi->height; ++y)
	{
		for (UINT32 x = 0; x < roi->width - pad; x += 16)
		{
			R0 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			R1 = _mm_load_si128((const __m128i*)pb);
			pb += 8;
			b = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			R1 = _mm_load_si128((const __m128i*)pg);
			pg += 8;
			g = _mm_packus_epi16(R0, R1);

			R0 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			R1 = _mm_load_si128((const __m128i*)pr);
			pr += 8;
			r = _mm_packus_epi16(R0, R1);
			gbLo = _mm_unpacklo_epi8(a, r);
			gbHi = _mm_unpackhi_epi8(a, r);
			arLo = _mm_unpacklo_epi8(g, b);
			arHi = _mm_unpackhi_epi8(g, b);
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
			{
				const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
				_mm_store_si128((__m128i*)out, bgrx);
				out += 16;
			}
		}

		for (UINT32 x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
	return PRIMITIVES_SUCCESS;
}
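
/* Dispatch on DstFormat; unaligned buffers or strides fall back to the
 * generic implementation. */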
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                            BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                            UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
{
	if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
	    (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
		return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
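
/* Hook the SSE2 kernels into the primitives table when the CPU reports SSE2
 * support; otherwise the generic implementations stay in place. */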
#if defined(SSE2_ENABLED)
	generic = primitives_get_generic();
	primitives_init_colors(prims);

	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		WLog_VRB(PRIM_TAG, "SSE2 optimizations");
		prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	}
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
	WINPR_UNUSED(prims);
#endif