20 #include <freerdp/config.h>
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
26 #include "prim_colors.h"
28 #include "prim_internal.h"
29 #include "prim_templates.h"
31 #if defined(SSE_AVX_INTRINSICS_ENABLED)
32 #include <emmintrin.h>
37 #define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
42 #define CACHE_LINE_BYTES 64
/* Fixed-point YCbCr -> RGB conversion coefficients.
 * Row n holds round(c * 2^n) for the four conversion constants
 *   c = { 1.403 (Cr->R), -0.344 (Cb->G), -0.714 (Cr->G), 1.770 (Cb->B) }.
 * The SIMD paths use row 14 (largest shift whose values still pass the
 * int16_t cast below) and the scalar remainder paths use row 16.
 * NOTE(review): rows 1-5 were missing from the previous revision, which
 * silently shifted every ycbcr_table[14]/[16] lookup; restored here. */
static const int32_t ycbcr_table[][4] = { { 1, 0, -1, 2 },
	                                      { 3, -1, -1, 4 },
	                                      { 6, -1, -3, 7 },
	                                      { 11, -3, -6, 14 },
	                                      { 22, -6, -11, 28 },
	                                      { 45, -11, -23, 57 },
	                                      { 90, -22, -46, 113 },
	                                      { 180, -44, -91, 227 },
	                                      { 359, -88, -183, 453 },
	                                      { 718, -176, -366, 906 },
	                                      { 1437, -352, -731, 1812 },
	                                      { 2873, -705, -1462, 3625 },
	                                      { 5747, -1409, -2925, 7250 },
	                                      { 11493, -2818, -5849, 14500 },
	                                      { 22987, -5636, -11698, 29000 },
	                                      { 45974, -11272, -23396, 57999 },
	                                      { 91947, -22544, -46793, 115999 },
	                                      { 183894, -45089, -93585, 231997 },
	                                      { 367788, -90178, -187171, 463995 },
	                                      { 735576, -180355, -374342, 927990 },
	                                      { 1471152, -360710, -748683, 1855980 },
	                                      { 2942304, -721420, -1497367, 3711959 },
	                                      { 5884609, -1442841, -2994733, 7423918 },
	                                      { 11769217, -2885681, -5989466, 14847836 },
	                                      { 23538434, -5771362, -11978932, 29695672 },
	                                      { 47076868, -11542725, -23957864, 59391345 },
	                                      { 94153736, -23085449, -47915729, 118782689 },
	                                      { 188307472, -46170898, -95831458, 237565379 },
	                                      { 376614945, -92341797, -191662916, 475130757 },
	                                      { 753229890, -184683594, -383325831, 950261514 },
	                                      { 1506459779, -369367187, -766651662, 1900523028 } };
/* Clamp every signed 16-bit lane of `val` to the inclusive range [min, max].
 * Lower-bound first, then upper-bound, matching the original operation order. */
static inline __m128i mm_between_epi16_int(__m128i val, __m128i min, __m128i max)
{
	const __m128i bounded_below = _mm_max_epi16(val, min);
	return _mm_min_epi16(max, bounded_below);
}

/* In-place convenience form: (_val) = clamp((_val), (_min), (_max)). */
#define mm_between_epi16(_val, _min, _max) (_val) = mm_between_epi16_int((_val), (_min), (_max))
90 static inline void GNU_INLINE _mm_prefetch_buffer(
char* WINPR_RESTRICT buffer,
int num_bytes)
92 __m128i* buf = (__m128i*)buffer;
94 for (
unsigned int i = 0; i < (num_bytes /
sizeof(__m128i));
95 i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
97 _mm_prefetch((
char*)(&buf[i]), _MM_HINT_NTA);
104 sse2_yCbCrToRGB_16s16s_P3P3(
const INT16* WINPR_RESTRICT pSrc[3],
int srcStep,
105 INT16* WINPR_RESTRICT pDst[3],
int dstStep,
108 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
109 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
110 ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
111 (srcStep & 127) || (dstStep & 127))
114 return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
117 const __m128i zero = _mm_setzero_si128();
118 const __m128i max = _mm_set1_epi16(255);
119 const __m128i* y_buf = (
const __m128i*)(pSrc[0]);
120 const __m128i* cb_buf = (
const __m128i*)(pSrc[1]);
121 const __m128i* cr_buf = (
const __m128i*)(pSrc[2]);
122 __m128i* r_buf = (__m128i*)(pDst[0]);
123 __m128i* g_buf = (__m128i*)(pDst[1]);
124 __m128i* b_buf = (__m128i*)(pDst[2]);
126 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0]));
128 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1]));
130 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2]));
132 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3]));
133 __m128i c4096 = _mm_set1_epi16(4096);
134 const size_t srcbump = WINPR_ASSERTING_INT_CAST(
size_t, srcStep) /
sizeof(__m128i);
135 const size_t dstbump = WINPR_ASSERTING_INT_CAST(
size_t, dstStep) /
sizeof(__m128i);
139 for (UINT32 yp = 0; yp < roi->height; yp++)
141 for (
int i = 0; i < roi->width *
sizeof(INT16) /
sizeof(__m128i);
142 i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
144 _mm_prefetch((
char*)(&y_buf[i]), _MM_HINT_NTA);
145 _mm_prefetch((
char*)(&cb_buf[i]), _MM_HINT_NTA);
146 _mm_prefetch((
char*)(&cr_buf[i]), _MM_HINT_NTA);
154 y_buf = (__m128i*)(pSrc[0]);
155 cb_buf = (__m128i*)(pSrc[1]);
156 cr_buf = (__m128i*)(pSrc[2]);
158 const size_t imax = roi->width *
sizeof(INT16) /
sizeof(__m128i);
160 for (UINT32 yp = 0; yp < roi->height; ++yp)
162 for (
size_t i = 0; i < imax; i++)
184 __m128i y = _mm_load_si128(y_buf + i);
185 y = _mm_add_epi16(y, c4096);
186 y = _mm_srai_epi16(y, 2);
188 __m128i cb = _mm_load_si128(cb_buf + i);
190 __m128i cr = _mm_load_si128(cr_buf + i);
192 __m128i r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
193 r = _mm_srai_epi16(r, 3);
195 mm_between_epi16(r, zero, max);
196 _mm_store_si128(r_buf + i, r);
198 __m128i g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
199 g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
200 g = _mm_srai_epi16(g, 3);
202 mm_between_epi16(g, zero, max);
203 _mm_store_si128(g_buf + i, g);
205 __m128i b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
206 b = _mm_srai_epi16(b, 3);
208 mm_between_epi16(b, zero, max);
209 _mm_store_si128(b_buf + i, b);
220 return PRIMITIVES_SUCCESS;
225 sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
226 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
229 const __m128i zero = _mm_setzero_si128();
230 const __m128i max = _mm_set1_epi16(255);
232 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0]));
234 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1]));
236 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2]));
238 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3]));
239 const __m128i c4096 = _mm_set1_epi16(4096);
240 const INT16* y_buf = pSrc[0];
241 const INT16* cb_buf = pSrc[1];
242 const INT16* cr_buf = pSrc[2];
243 const UINT32 pad = roi->width % 16;
244 const UINT32 step =
sizeof(__m128i) /
sizeof(INT16);
245 const UINT32 imax = (roi->width - pad) *
sizeof(INT16) /
sizeof(__m128i);
247 const size_t dstPad = (dstStep - roi->width * 4);
251 for (UINT32 yp = 0; yp < roi->height; yp++)
253 for (
int i = 0; i < imax; i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
255 _mm_prefetch((
char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
256 _mm_prefetch((
char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
257 _mm_prefetch((
char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
260 y_buf += srcStep /
sizeof(INT16);
261 cb_buf += srcStep /
sizeof(INT16);
262 cr_buf += srcStep /
sizeof(INT16);
265 y_buf = (INT16*)pSrc[0];
266 cb_buf = (INT16*)pSrc[1];
267 cr_buf = (INT16*)pSrc[2];
270 for (UINT32 yp = 0; yp < roi->height; ++yp)
272 for (UINT32 i = 0; i < imax; i += 2)
294 __m128i y1 = _mm_load_si128((
const __m128i*)y_buf);
296 y1 = _mm_add_epi16(y1, c4096);
297 y1 = _mm_srai_epi16(y1, 2);
299 __m128i cb1 = _mm_load_si128((
const __m128i*)cb_buf);
302 __m128i cr1 = _mm_load_si128((
const __m128i*)cr_buf);
305 __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
306 r1 = _mm_srai_epi16(r1, 3);
308 mm_between_epi16(r1, zero, max);
310 __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
311 g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
312 g1 = _mm_srai_epi16(g1, 3);
314 mm_between_epi16(g1, zero, max);
316 __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
317 b1 = _mm_srai_epi16(b1, 3);
319 mm_between_epi16(b1, zero, max);
320 __m128i y2 = _mm_load_si128((
const __m128i*)y_buf);
322 y2 = _mm_add_epi16(y2, c4096);
323 y2 = _mm_srai_epi16(y2, 2);
325 __m128i cb2 = _mm_load_si128((
const __m128i*)cb_buf);
328 __m128i cr2 = _mm_load_si128((
const __m128i*)cr_buf);
331 __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
332 r2 = _mm_srai_epi16(r2, 3);
334 mm_between_epi16(r2, zero, max);
336 __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
337 g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
338 g2 = _mm_srai_epi16(g2, 3);
340 mm_between_epi16(g2, zero, max);
342 __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
343 b2 = _mm_srai_epi16(b2, 3);
345 mm_between_epi16(b2, zero, max);
352 R0 = _mm_packus_epi16(R0, R1);
355 R1 = _mm_packus_epi16(R1, R2);
357 R2 = _mm_unpacklo_epi8(R0, R2);
358 R1 = _mm_unpackhi_epi8(R0, R1);
361 R0 = _mm_packus_epi16(R0, R3);
362 R3 = mm_set1_epu32(0xFFFFFFFFU);
364 R4 = _mm_unpacklo_epi8(R0, R4);
365 R3 = _mm_unpackhi_epi8(R0, R3);
367 R0 = _mm_unpacklo_epi16(R2, R0);
368 R4 = _mm_unpackhi_epi16(R2, R4);
370 R2 = _mm_unpacklo_epi16(R1, R2);
371 R3 = _mm_unpackhi_epi16(R1, R3);
372 _mm_store_si128((__m128i*)d_buf, R0);
373 d_buf +=
sizeof(__m128i);
374 _mm_store_si128((__m128i*)d_buf, R4);
375 d_buf +=
sizeof(__m128i);
376 _mm_store_si128((__m128i*)d_buf, R2);
377 d_buf +=
sizeof(__m128i);
378 _mm_store_si128((__m128i*)d_buf, R3);
379 d_buf +=
sizeof(__m128i);
383 for (UINT32 i = 0; i < pad; i++)
385 const INT32 divisor = 16;
386 const INT32 Y = ((*y_buf++) + 4096) << divisor;
387 const INT32 Cb = (*cb_buf++);
388 const INT32 Cr = (*cr_buf++);
389 const INT32 CrR = Cr * ycbcr_table[divisor][0];
390 const INT32 CrG = Cr * ycbcr_table[divisor][1];
391 const INT32 CbG = Cb * ycbcr_table[divisor][2];
392 const INT32 CbB = Cb * ycbcr_table[divisor][3];
393 const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
394 const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
395 const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
405 return PRIMITIVES_SUCCESS;
410 sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
411 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
414 const __m128i zero = _mm_setzero_si128();
415 const __m128i max = _mm_set1_epi16(255);
417 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0]));
419 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1]));
421 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2]));
423 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3]));
424 const __m128i c4096 = _mm_set1_epi16(4096);
425 const INT16* y_buf = pSrc[0];
426 const INT16* cb_buf = pSrc[1];
427 const INT16* cr_buf = pSrc[2];
428 const UINT32 pad = roi->width % 16;
429 const UINT32 step =
sizeof(__m128i) /
sizeof(INT16);
430 const UINT32 imax = (roi->width - pad) *
sizeof(INT16) /
sizeof(__m128i);
432 const size_t dstPad = (dstStep - roi->width * 4);
436 for (UINT32 yp = 0; yp < roi->height; yp++)
438 for (
int i = 0; i < imax; i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
440 _mm_prefetch((
char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
441 _mm_prefetch((
char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
442 _mm_prefetch((
char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
445 y_buf += srcStep /
sizeof(INT16);
446 cb_buf += srcStep /
sizeof(INT16);
447 cr_buf += srcStep /
sizeof(INT16);
450 y_buf = (INT16*)(pSrc[0]);
451 cb_buf = (INT16*)(pSrc[1]);
452 cr_buf = (INT16*)(pSrc[2]);
455 for (UINT32 yp = 0; yp < roi->height; ++yp)
457 for (UINT32 i = 0; i < imax; i += 2)
479 __m128i y1 = _mm_load_si128((
const __m128i*)y_buf);
481 y1 = _mm_add_epi16(y1, c4096);
482 y1 = _mm_srai_epi16(y1, 2);
484 __m128i cb1 = _mm_load_si128((
const __m128i*)cb_buf);
487 __m128i cr1 = _mm_load_si128((
const __m128i*)cr_buf);
490 __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
491 r1 = _mm_srai_epi16(r1, 3);
493 mm_between_epi16(r1, zero, max);
495 __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
496 g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
497 g1 = _mm_srai_epi16(g1, 3);
499 mm_between_epi16(g1, zero, max);
501 __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
502 b1 = _mm_srai_epi16(b1, 3);
504 mm_between_epi16(b1, zero, max);
505 __m128i y2 = _mm_load_si128((
const __m128i*)y_buf);
507 y2 = _mm_add_epi16(y2, c4096);
508 y2 = _mm_srai_epi16(y2, 2);
510 __m128i cb2 = _mm_load_si128((
const __m128i*)cb_buf);
513 __m128i cr2 = _mm_load_si128((
const __m128i*)cr_buf);
516 __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
517 r2 = _mm_srai_epi16(r2, 3);
519 mm_between_epi16(r2, zero, max);
521 __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
522 g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
523 g2 = _mm_srai_epi16(g2, 3);
525 mm_between_epi16(g2, zero, max);
527 __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
528 b2 = _mm_srai_epi16(b2, 3);
530 mm_between_epi16(b2, zero, max);
537 R0 = _mm_packus_epi16(R0, R1);
540 R1 = _mm_packus_epi16(R1, R2);
542 R2 = _mm_unpacklo_epi8(R0, R2);
543 R1 = _mm_unpackhi_epi8(R0, R1);
546 R0 = _mm_packus_epi16(R0, R3);
547 R3 = mm_set1_epu32(0xFFFFFFFFU);
549 R4 = _mm_unpacklo_epi8(R0, R4);
550 R3 = _mm_unpackhi_epi8(R0, R3);
552 R0 = _mm_unpacklo_epi16(R2, R0);
553 R4 = _mm_unpackhi_epi16(R2, R4);
555 R2 = _mm_unpacklo_epi16(R1, R2);
556 R3 = _mm_unpackhi_epi16(R1, R3);
557 _mm_store_si128((__m128i*)d_buf, R0);
558 d_buf +=
sizeof(__m128i);
559 _mm_store_si128((__m128i*)d_buf, R4);
560 d_buf +=
sizeof(__m128i);
561 _mm_store_si128((__m128i*)d_buf, R2);
562 d_buf +=
sizeof(__m128i);
563 _mm_store_si128((__m128i*)d_buf, R3);
564 d_buf +=
sizeof(__m128i);
568 for (UINT32 i = 0; i < pad; i++)
570 const INT32 divisor = 16;
571 const INT32 Y = ((*y_buf++) + 4096) << divisor;
572 const INT32 Cb = (*cb_buf++);
573 const INT32 Cr = (*cr_buf++);
574 const INT32 CrR = Cr * ycbcr_table[divisor][0];
575 const INT32 CrG = Cr * ycbcr_table[divisor][1];
576 const INT32 CbG = Cb * ycbcr_table[divisor][2];
577 const INT32 CbB = Cb * ycbcr_table[divisor][3];
578 const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
579 const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
580 const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
590 return PRIMITIVES_SUCCESS;
594 sse2_yCbCrToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
595 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
598 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
599 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
603 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
608 case PIXEL_FORMAT_BGRA32:
609 case PIXEL_FORMAT_BGRX32:
610 return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
612 case PIXEL_FORMAT_RGBA32:
613 case PIXEL_FORMAT_RGBX32:
614 return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
617 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
624 sse2_RGBToYCbCr_16s16s_P3P3(
const INT16* WINPR_RESTRICT pSrc[3],
int srcStep,
625 INT16* WINPR_RESTRICT pDst[3],
int dstStep,
628 const __m128i* r_buf = (
const __m128i*)(pSrc[0]);
629 const __m128i* g_buf = (
const __m128i*)(pSrc[1]);
630 const __m128i* b_buf = (
const __m128i*)(pSrc[2]);
631 __m128i* y_buf = (__m128i*)(pDst[0]);
632 __m128i* cb_buf = (__m128i*)(pDst[1]);
633 __m128i* cr_buf = (__m128i*)(pDst[2]);
636 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
637 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
638 ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
639 (srcStep & 127) || (dstStep & 127))
642 return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
645 const __m128i min = _mm_set1_epi16(-128 * 32);
646 const __m128i max = _mm_set1_epi16(127 * 32);
648 __m128i y_r = _mm_set1_epi16(9798);
649 __m128i y_g = _mm_set1_epi16(19235);
650 __m128i y_b = _mm_set1_epi16(3735);
651 __m128i cb_r = _mm_set1_epi16(-5535);
652 __m128i cb_g = _mm_set1_epi16(-10868);
653 __m128i cb_b = _mm_set1_epi16(16403);
654 __m128i cr_r = _mm_set1_epi16(16377);
655 __m128i cr_g = _mm_set1_epi16(-13714);
656 __m128i cr_b = _mm_set1_epi16(-2663);
657 const size_t srcbump = WINPR_ASSERTING_INT_CAST(
size_t, srcStep) /
sizeof(__m128i);
658 const size_t dstbump = WINPR_ASSERTING_INT_CAST(
size_t, dstStep) /
sizeof(__m128i);
662 for (UINT32 yp = 0; yp < roi->height; yp++)
664 for (
int i = 0; i < roi->width *
sizeof(INT16) /
sizeof(__m128i);
665 i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
667 _mm_prefetch((
char*)(&r_buf[i]), _MM_HINT_NTA);
668 _mm_prefetch((
char*)(&g_buf[i]), _MM_HINT_NTA);
669 _mm_prefetch((
char*)(&b_buf[i]), _MM_HINT_NTA);
677 r_buf = (__m128i*)(pSrc[0]);
678 g_buf = (__m128i*)(pSrc[1]);
679 b_buf = (__m128i*)(pSrc[2]);
681 imax = roi->width *
sizeof(INT16) /
sizeof(__m128i);
683 for (UINT32 yp = 0; yp < roi->height; ++yp)
685 for (
int i = 0; i < imax; i++)
698 __m128i r = _mm_load_si128(r_buf + i);
699 __m128i g = _mm_load_si128(g_buf + i);
700 __m128i b = _mm_load_si128(b_buf + i);
702 r = _mm_slli_epi16(r, 6);
703 g = _mm_slli_epi16(g, 6);
704 b = _mm_slli_epi16(b, 6);
706 __m128i y = _mm_mulhi_epi16(r, y_r);
707 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
708 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
709 y = _mm_add_epi16(y, min);
711 mm_between_epi16(y, min, max);
712 _mm_store_si128(y_buf + i, y);
714 __m128i cb = _mm_mulhi_epi16(r, cb_r);
715 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
716 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
718 mm_between_epi16(cb, min, max);
719 _mm_store_si128(cb_buf + i, cb);
721 __m128i cr = _mm_mulhi_epi16(r, cr_r);
722 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
723 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
725 mm_between_epi16(cr, min, max);
726 _mm_store_si128(cr_buf + i, cr);
737 return PRIMITIVES_SUCCESS;
741 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
742 const INT16* WINPR_RESTRICT pSrc[3],
744 BYTE* WINPR_RESTRICT pDst,
748 const UINT16* pr = (
const UINT16*)(pSrc[0]);
749 const UINT16* pg = (
const UINT16*)(pSrc[1]);
750 const UINT16* pb = (
const UINT16*)(pSrc[2]);
751 const UINT32 pad = roi->width % 16;
752 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
757 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
758 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
760 for (UINT32 y = 0; y < roi->height; ++y)
762 for (UINT32 x = 0; x < roi->width - pad; x += 16)
773 R0 = _mm_load_si128((
const __m128i*)pb);
775 R1 = _mm_load_si128((
const __m128i*)pb);
777 b = _mm_packus_epi16(R0, R1);
782 R0 = _mm_load_si128((
const __m128i*)pg);
784 R1 = _mm_load_si128((
const __m128i*)pg);
786 g = _mm_packus_epi16(R0, R1);
791 R0 = _mm_load_si128((
const __m128i*)pr);
793 R1 = _mm_load_si128((
const __m128i*)pr);
795 r = _mm_packus_epi16(R0, R1);
798 const __m128i gbLo = _mm_unpacklo_epi8(b, g);
799 const __m128i gbHi = _mm_unpackhi_epi8(b, g);
800 const __m128i arLo = _mm_unpacklo_epi8(r, a);
801 const __m128i arHi = _mm_unpackhi_epi8(r, a);
804 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
805 _mm_store_si128((__m128i*)out, bgrx);
809 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
810 _mm_store_si128((__m128i*)out, bgrx);
814 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
815 _mm_store_si128((__m128i*)out, bgrx);
819 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
820 _mm_store_si128((__m128i*)out, bgrx);
826 for (UINT32 x = 0; x < pad; x++)
828 const BYTE R = CLIP(*pr++);
829 const BYTE G = CLIP(*pg++);
830 const BYTE B = CLIP(*pb++);
844 return PRIMITIVES_SUCCESS;
847 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
848 const INT16* WINPR_RESTRICT pSrc[3],
850 BYTE* WINPR_RESTRICT pDst,
854 const UINT16* pr = (
const UINT16*)(pSrc[0]);
855 const UINT16* pg = (
const UINT16*)(pSrc[1]);
856 const UINT16* pb = (
const UINT16*)(pSrc[2]);
857 const UINT32 pad = roi->width % 16;
858 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
863 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
864 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
866 for (UINT32 y = 0; y < roi->height; ++y)
868 for (UINT32 x = 0; x < roi->width - pad; x += 16)
879 R0 = _mm_load_si128((
const __m128i*)pb);
881 R1 = _mm_load_si128((
const __m128i*)pb);
883 b = _mm_packus_epi16(R0, R1);
888 R0 = _mm_load_si128((
const __m128i*)pg);
890 R1 = _mm_load_si128((
const __m128i*)pg);
892 g = _mm_packus_epi16(R0, R1);
897 R0 = _mm_load_si128((
const __m128i*)pr);
899 R1 = _mm_load_si128((
const __m128i*)pr);
901 r = _mm_packus_epi16(R0, R1);
909 gbLo = _mm_unpacklo_epi8(r, g);
910 gbHi = _mm_unpackhi_epi8(r, g);
911 arLo = _mm_unpacklo_epi8(b, a);
912 arHi = _mm_unpackhi_epi8(b, a);
915 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
916 _mm_store_si128((__m128i*)out, bgrx);
920 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
921 _mm_store_si128((__m128i*)out, bgrx);
925 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
926 _mm_store_si128((__m128i*)out, bgrx);
930 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
931 _mm_store_si128((__m128i*)out, bgrx);
937 for (UINT32 x = 0; x < pad; x++)
939 const BYTE R = CLIP(*pr++);
940 const BYTE G = CLIP(*pg++);
941 const BYTE B = CLIP(*pb++);
955 return PRIMITIVES_SUCCESS;
958 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
959 const INT16* WINPR_RESTRICT pSrc[3],
961 BYTE* WINPR_RESTRICT pDst,
965 const UINT16* pr = (
const UINT16*)(pSrc[0]);
966 const UINT16* pg = (
const UINT16*)(pSrc[1]);
967 const UINT16* pb = (
const UINT16*)(pSrc[2]);
968 const UINT32 pad = roi->width % 16;
969 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
974 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
975 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
977 for (UINT32 y = 0; y < roi->height; ++y)
979 for (UINT32 x = 0; x < roi->width - pad; x += 16)
990 R0 = _mm_load_si128((
const __m128i*)pb);
992 R1 = _mm_load_si128((
const __m128i*)pb);
994 b = _mm_packus_epi16(R0, R1);
999 R0 = _mm_load_si128((
const __m128i*)pg);
1001 R1 = _mm_load_si128((
const __m128i*)pg);
1003 g = _mm_packus_epi16(R0, R1);
1008 R0 = _mm_load_si128((
const __m128i*)pr);
1010 R1 = _mm_load_si128((
const __m128i*)pr);
1012 r = _mm_packus_epi16(R0, R1);
1020 gbLo = _mm_unpacklo_epi8(a, b);
1021 gbHi = _mm_unpackhi_epi8(a, b);
1022 arLo = _mm_unpacklo_epi8(g, r);
1023 arHi = _mm_unpackhi_epi8(g, r);
1026 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
1027 _mm_store_si128((__m128i*)out, bgrx);
1031 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
1032 _mm_store_si128((__m128i*)out, bgrx);
1036 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
1037 _mm_store_si128((__m128i*)out, bgrx);
1041 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
1042 _mm_store_si128((__m128i*)out, bgrx);
1048 for (UINT32 x = 0; x < pad; x++)
1050 const BYTE R = CLIP(*pr++);
1051 const BYTE G = CLIP(*pg++);
1052 const BYTE B = CLIP(*pb++);
1066 return PRIMITIVES_SUCCESS;
1069 static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
1070 const INT16* WINPR_RESTRICT pSrc[3],
1072 BYTE* WINPR_RESTRICT pDst,
1076 const UINT16* pr = (
const UINT16*)(pSrc[0]);
1077 const UINT16* pg = (
const UINT16*)(pSrc[1]);
1078 const UINT16* pb = (
const UINT16*)(pSrc[2]);
1079 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
1080 const UINT32 pad = roi->width % 16;
1085 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
1086 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
1088 for (UINT32 y = 0; y < roi->height; ++y)
1090 for (UINT32 x = 0; x < roi->width - pad; x += 16)
1101 R0 = _mm_load_si128((
const __m128i*)pb);
1103 R1 = _mm_load_si128((
const __m128i*)pb);
1105 b = _mm_packus_epi16(R0, R1);
1110 R0 = _mm_load_si128((
const __m128i*)pg);
1112 R1 = _mm_load_si128((
const __m128i*)pg);
1114 g = _mm_packus_epi16(R0, R1);
1119 R0 = _mm_load_si128((
const __m128i*)pr);
1121 R1 = _mm_load_si128((
const __m128i*)pr);
1123 r = _mm_packus_epi16(R0, R1);
1131 gbLo = _mm_unpacklo_epi8(a, r);
1132 gbHi = _mm_unpackhi_epi8(a, r);
1133 arLo = _mm_unpacklo_epi8(g, b);
1134 arHi = _mm_unpackhi_epi8(g, b);
1137 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
1138 _mm_store_si128((__m128i*)out, bgrx);
1142 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
1143 _mm_store_si128((__m128i*)out, bgrx);
1147 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
1148 _mm_store_si128((__m128i*)out, bgrx);
1152 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
1153 _mm_store_si128((__m128i*)out, bgrx);
1159 for (UINT32 x = 0; x < pad; x++)
1161 const BYTE R = CLIP(*pr++);
1162 const BYTE G = CLIP(*pg++);
1163 const BYTE B = CLIP(*pb++);
1177 return PRIMITIVES_SUCCESS;
1181 sse2_RGBToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3],
1183 BYTE* WINPR_RESTRICT pDst,
1185 UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi)
1187 if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
1188 (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
1189 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1193 case PIXEL_FORMAT_BGRA32:
1194 case PIXEL_FORMAT_BGRX32:
1195 return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
1197 case PIXEL_FORMAT_RGBA32:
1198 case PIXEL_FORMAT_RGBX32:
1199 return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
1201 case PIXEL_FORMAT_ABGR32:
1202 case PIXEL_FORMAT_XBGR32:
1203 return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
1205 case PIXEL_FORMAT_ARGB32:
1206 case PIXEL_FORMAT_XRGB32:
1207 return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
1210 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1217 #if defined(SSE_AVX_INTRINSICS_ENABLED)
1218 generic = primitives_get_generic();
1219 primitives_init_colors(prims);
1221 if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
1223 WLog_VRB(PRIM_TAG,
"SSE2 optimizations");
1224 prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
1225 prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
1226 prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
1227 prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
1231 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE2 intrinsics not available");
1232 WINPR_UNUSED(prims);