20 #include <freerdp/config.h>
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
26 #include "prim_YCoCg.h"
28 #include "prim_internal.h"
29 #include "prim_templates.h"
31 #if defined(SSE_AVX_INTRINSICS_ENABLED)
32 #include <emmintrin.h>
33 #include <tmmintrin.h>
/*
 * SSSE3 variant of the YCoCg-R -> 32-bit RGB conversion; the dispatcher in
 * this file selects it for PIXEL_FORMAT_BGRX32 / PIXEL_FORMAT_BGRA32.
 *
 * pSrc/srcStep  - source pixels (4 bytes each) and source row pitch in bytes
 * pDst/dstStep  - destination pixels (4 bytes each) and row pitch in bytes
 * DstFormat     - only forwarded to the generic implementation here
 * width/height  - image size in pixels
 * shift         - chroma shift; the code uses (shift - 1)
 * withAlpha     - forwarded to the generic implementation; presumably also
 *                 selects between the two R7 assignments below (the branch
 *                 itself is not visible in this excerpt)
 *
 * Returns PRIMITIVES_SUCCESS, or the status of a failing generic call
 * (the early-return statements are not visible in this excerpt).
 */
38 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
39 BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
40 UINT32 dstStep, UINT32 width, UINT32 height,
41 UINT8 shift, BOOL withAlpha)
43 const BYTE* sptr = pSrc;
/* Both row pitches must be able to hold `width` 4-byte pixels, otherwise the
 * row bump subtractions below would wrap around. */
46 WINPR_ASSERT(srcStep /
sizeof(UINT32) >= width);
47 WINPR_ASSERT(dstStep /
sizeof(UINT32) >= width);
/* Padding bytes between the end of one pixel row and the start of the next
 * (presumably added to sptr/dptr after each row; that code is not visible). */
48 const size_t sRowBump = srcStep - width *
sizeof(UINT32);
49 const size_t dRowBump = dstStep - width *
sizeof(UINT32);
/* NOTE(review): if shift == 0 then dataShift == -1 and the shift on the next
 * line is undefined behaviour in C - confirm callers never pass shift 0. */
53 int dataShift = shift - 1;
/* Per-byte mask with the low `dataShift` bits cleared. */
54 BYTE mask = (BYTE)(0xFFU << dataShift);
/* Fewer than 8 pixels per row, or a destination that is not 4-byte aligned
 * (and so can never be brought to a 16-byte boundary in whole pixels):
 * let the generic implementation convert the entire image. */
65 if ((width < 8) || (ULONG_PTR)dptr & 0x03)
68 return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
69 DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
70 width, height, shift, withAlpha);
/* One iteration per pixel row. */
73 for (UINT32 h = 0; h < height; h++)
/* Destination not on a 16-byte boundary: convert the leading pixels with the
 * generic code so the aligned stores below are valid. */
79 if ((ULONG_PTR)dptr & 0x0f)
/* Number of 4-byte pixels between dptr and the next 16-byte boundary. */
82 UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
/* Convert `startup` pixels of a single row. */
87 status =
generic->YCoCgToRGB_8u_AC4R(
88 sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
89 WINPR_ASSERTING_INT_CAST(INT32, dstStep), startup, 1, shift, withAlpha);
91 if (status != PRIMITIVES_SUCCESS)
/* Skip past the pixels just converted. */
94 sptr += startup *
sizeof(UINT32);
95 dptr += startup *
sizeof(UINT32);
/* TRUE when the source pointer is 16-byte aligned as well; this presumably
 * selects the aligned loads over the unaligned ones below. */
100 onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
/* Aligned loads of two 16-byte vectors, i.e. eight 4-byte source pixels
 * (the sptr advance between the loads is not visible in this excerpt). */
116 R0 = _mm_load_si128((
const __m128i*)sptr);
118 R1 = _mm_load_si128((
const __m128i*)sptr);
/* Same two loads for a source that is not 16-byte aligned. */
124 R0 = _mm_lddqu_si128((
const __m128i*)sptr);
126 R1 = _mm_lddqu_si128((
const __m128i*)sptr);
/* Byte shuffle that de-interleaves four pixels: 32-bit lane k of the result
 * holds byte k of each of the four pixels in the input vector. */
133 R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
134 R3 = _mm_shuffle_epi8(R0, R2);
135 R4 = _mm_shuffle_epi8(R1, R2);
/* R5: low 64 bits = byte 2 of all 8 pixels, high 64 bits = byte 3 of all 8.
 * R6: low 64 bits = byte 0 of all 8 pixels, high 64 bits = byte 1 of all 8. */
138 R5 = _mm_unpackhi_epi32(R3, R4);
139 R6 = _mm_unpacklo_epi32(R3, R4);
/* R7 supplies the fourth output byte: either the source's byte 3 copied
 * into both halves ... */
145 R7 = _mm_unpackhi_epi64(R5, R5);
/* ... or a constant 0xFF in every byte. */
147 R7 = mm_set1_epu32(0xFFFFFFFFU);
/* Zero-extend byte 2 of each pixel (the luma term of the sums below) to
 * 16 bits. */
151 R1 = mm_set1_epu32(0);
152 R0 = _mm_unpacklo_epi8(R5, R1);
/* Shift bytes 0 and 1 of each pixel left by dataShift. The shift works on
 * 16-bit lanes, so the mask clears the bits that spilled in from the
 * neighbouring byte. */
159 R6 = _mm_slli_epi16(R6, dataShift);
160 R1 = mm_set1_epu8(mask);
161 R6 = _mm_and_si128(R6, R1);
/* Sign-extend the shifted byte-1 values to 16 bits: duplicate each byte into
 * a 16-bit lane, then arithmetic shift right by 8. */
164 R1 = _mm_unpackhi_epi8(R6, R6);
165 R1 = _mm_srai_epi16(R1, 8);
/* Same sign extension for the shifted byte-0 values. */
168 R2 = _mm_unpacklo_epi8(R6, R6);
169 R2 = _mm_srai_epi16(R2, 8);
/* t = byte2 - byte0' (saturating 16-bit arithmetic from here on). */
172 R6 = _mm_subs_epi16(R0, R2);
/* R3 = t + byte1' */
174 R3 = _mm_adds_epi16(R6, R1);
/* R4 = byte2 + byte0' */
177 R4 = _mm_adds_epi16(R0, R2);
/* R5 = t - byte1' */
180 R5 = _mm_subs_epi16(R6, R1);
/* Saturate to unsigned bytes: (t + byte1') in the low half, (t - byte1') in
 * the high half. */
183 R0 = _mm_packus_epi16(R3, R5);
/* (byte2 + byte0') saturated to bytes, duplicated in both halves. */
186 R1 = _mm_packus_epi16(R4, R4);
/* R1: low half = (byte2 + byte0'), high half = fourth-byte values from R7. */
189 R1 = _mm_unpackhi_epi64(R1, R7);
/* Re-interleave the four planes; each output pixel ends up as the bytes
 * [t + byte1', byte2 + byte0', t - byte1', R7 byte]. */
192 R2 = _mm_unpacklo_epi8(R0, R1);
194 R3 = _mm_unpackhi_epi8(R0, R1);
196 R4 = _mm_unpacklo_epi16(R2, R3);
198 R5 = _mm_unpackhi_epi16(R2, R3);
/* Aligned stores of the eight converted pixels (the dptr advance between
 * the stores is not visible in this excerpt). */
200 _mm_store_si128((__m128i*)dptr, R4);
202 _mm_store_si128((__m128i*)dptr, R5);
/* Convert `w` pixels of this row with the generic code - presumably the
 * tail left over by the vector loop; `w` is maintained in lines not visible
 * here. */
210 pstatus_t status = 0;
211 status =
generic->YCoCgToRGB_8u_AC4R(
212 sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
213 WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
215 if (status != PRIMITIVES_SUCCESS)
218 sptr += w *
sizeof(UINT32);
219 dptr += w *
sizeof(UINT32);
226 return PRIMITIVES_SUCCESS;
/*
 * SSSE3 variant of the YCoCg-R -> 32-bit RGB conversion; the dispatcher in
 * this file selects it for PIXEL_FORMAT_RGBX32 / PIXEL_FORMAT_RGBA32.
 * In the visible code the vector arithmetic matches
 * ssse3_YCoCgRToRGB_8u_AC4R_invert except for the operand order of the
 * first _mm_packus_epi16, which swaps output bytes 0 and 2 of every pixel.
 *
 * pSrc/srcStep  - source pixels (4 bytes each) and source row pitch in bytes
 * pDst/dstStep  - destination pixels (4 bytes each) and row pitch in bytes
 * DstFormat     - only forwarded to the generic implementation here
 * width/height  - image size in pixels
 * shift         - chroma shift; the code uses (shift - 1)
 * withAlpha     - forwarded to the generic implementation; presumably also
 *                 selects between the two R7 assignments below (the branch
 *                 itself is not visible in this excerpt)
 *
 * Returns PRIMITIVES_SUCCESS, or the status of a failing generic call
 * (the early-return statements are not visible in this excerpt).
 */
230 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
const BYTE* WINPR_RESTRICT pSrc,
231 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
232 UINT32 DstFormat, UINT32 dstStep, UINT32 width,
233 UINT32 height, UINT8 shift, BOOL withAlpha)
235 const BYTE* sptr = pSrc;
/* Padding bytes between the end of one pixel row and the start of the next.
 * NOTE(review): the _invert variant asserts step / sizeof(UINT32) >= width
 * before this subtraction; no such check is visible here, so a too-small
 * step would wrap around - confirm whether the asserts should be added. */
237 size_t sRowBump = srcStep - width *
sizeof(UINT32);
238 size_t dRowBump = dstStep - width *
sizeof(UINT32);
/* NOTE(review): if shift == 0 then dataShift == -1 and the shift on the next
 * line is undefined behaviour in C - confirm callers never pass shift 0. */
242 int dataShift = shift - 1;
/* Per-byte mask with the low `dataShift` bits cleared. */
243 BYTE mask = (BYTE)(0xFFU << dataShift);
/* Rows narrower than 8 pixels or a destination that is not 4-byte aligned
 * are handled entirely by the generic implementation. */
254 if ((width < 8) || (ULONG_PTR)dptr & 0x03)
257 return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
258 DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
259 width, height, shift, withAlpha);
/* One iteration per pixel row. */
262 for (UINT32 h = 0; h < height; h++)
/* Bring the destination to a 16-byte boundary using the generic code so the
 * aligned stores below are valid. */
268 if ((ULONG_PTR)dptr & 0x0f)
270 pstatus_t status = 0;
/* Number of 4-byte pixels between dptr and the next 16-byte boundary. */
271 UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
/* Convert `startup` pixels of a single row. */
276 status =
generic->YCoCgToRGB_8u_AC4R(
277 sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
278 WINPR_ASSERTING_INT_CAST(INT32, dstStep), startup, 1, shift, withAlpha);
280 if (status != PRIMITIVES_SUCCESS)
/* Skip past the pixels just converted. */
283 sptr += startup *
sizeof(UINT32);
284 dptr += startup *
sizeof(UINT32);
/* TRUE when the source pointer is 16-byte aligned too; this presumably
 * selects the aligned loads over the unaligned ones below. */
289 onStride = (((
const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
/* Aligned loads of two 16-byte vectors, i.e. eight 4-byte source pixels
 * (the sptr advance between the loads is not visible in this excerpt). */
305 R0 = _mm_load_si128((
const __m128i*)sptr);
307 R1 = _mm_load_si128((
const __m128i*)sptr);
/* Same two loads for a source that is not 16-byte aligned. */
313 R0 = _mm_lddqu_si128((
const __m128i*)sptr);
315 R1 = _mm_lddqu_si128((
const __m128i*)sptr);
/* De-interleave: after the shuffle, 32-bit lane k holds byte k of each of
 * the four pixels in the input vector. */
322 R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
323 R3 = _mm_shuffle_epi8(R0, R2);
324 R4 = _mm_shuffle_epi8(R1, R2);
/* R5: low 64 bits = byte 2 of all 8 pixels, high 64 bits = byte 3 of all 8.
 * R6: low 64 bits = byte 0 of all 8 pixels, high 64 bits = byte 1 of all 8. */
327 R5 = _mm_unpackhi_epi32(R3, R4);
328 R6 = _mm_unpacklo_epi32(R3, R4);
/* R7 supplies the fourth output byte: either the source's byte 3 copied
 * into both halves ... */
334 R7 = _mm_unpackhi_epi64(R5, R5);
/* ... or a constant 0xFF in every byte. */
336 R7 = mm_set1_epu32(0xFFFFFFFFU);
/* Zero-extend byte 2 of each pixel to 16 bits. */
340 R1 = mm_set1_epu32(0);
341 R0 = _mm_unpacklo_epi8(R5, R1);
/* Shift bytes 0 and 1 left by dataShift using a 16-bit shift, then mask off
 * the bits that crossed over from the neighbouring byte. */
348 R6 = _mm_slli_epi16(R6, dataShift);
349 R1 = mm_set1_epu8(mask);
350 R6 = _mm_and_si128(R6, R1);
/* Sign-extend the shifted byte-1 values to 16 bits (duplicate each byte,
 * then arithmetic shift right by 8). */
353 R1 = _mm_unpackhi_epi8(R6, R6);
354 R1 = _mm_srai_epi16(R1, 8);
/* Same sign extension for the shifted byte-0 values. */
357 R2 = _mm_unpacklo_epi8(R6, R6);
358 R2 = _mm_srai_epi16(R2, 8);
/* t = byte2 - byte0' (saturating 16-bit arithmetic from here on). */
361 R6 = _mm_subs_epi16(R0, R2);
/* R3 = t + byte1' */
363 R3 = _mm_adds_epi16(R6, R1);
/* R4 = byte2 + byte0' */
366 R4 = _mm_adds_epi16(R0, R2);
/* R5 = t - byte1' */
369 R5 = _mm_subs_epi16(R6, R1);
/* Saturate to unsigned bytes: (t - byte1') in the low half, (t + byte1') in
 * the high half - the reverse of the _invert variant. */
376 R0 = _mm_packus_epi16(R5, R3);
/* (byte2 + byte0') saturated to bytes, duplicated in both halves. */
379 R1 = _mm_packus_epi16(R4, R4);
/* R1: low half = (byte2 + byte0'), high half = fourth-byte values from R7. */
382 R1 = _mm_unpackhi_epi64(R1, R7);
/* Re-interleave the four planes; each output pixel ends up as the bytes
 * [t - byte1', byte2 + byte0', t + byte1', R7 byte]. */
385 R2 = _mm_unpacklo_epi8(R0, R1);
387 R3 = _mm_unpackhi_epi8(R0, R1);
389 R4 = _mm_unpacklo_epi16(R2, R3);
391 R5 = _mm_unpackhi_epi16(R2, R3);
/* Aligned stores of the eight converted pixels (the dptr advance between
 * the stores is not visible in this excerpt). */
393 _mm_store_si128((__m128i*)dptr, R4);
395 _mm_store_si128((__m128i*)dptr, R5);
/* Convert `w` pixels of this row with the generic code - presumably the
 * tail left over by the vector loop; `w` is maintained in lines not visible
 * here, and the end of this call's argument list is also not visible. */
403 pstatus_t status = 0;
404 status =
generic->YCoCgToRGB_8u_AC4R(
405 sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
406 WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
409 if (status != PRIMITIVES_SUCCESS)
/* Skip past the pixels just converted. */
412 sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) *
sizeof(UINT32);
413 dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) *
sizeof(UINT32);
420 return PRIMITIVES_SUCCESS;
/*
 * Entry point installed into the primitives table by
 * primitives_init_YCoCg_ssse3(). Picks an implementation based on the
 * destination pixel format:
 *   - BGRX32 / BGRA32 -> ssse3_YCoCgRToRGB_8u_AC4R_invert
 *   - RGBX32 / RGBA32 -> ssse3_YCoCgRToRGB_8u_AC4R_no_invert
 *   - the final call forwards to the generic implementation (presumably the
 *     default case; the switch header and label are not visible here)
 * The signed steps are converted to UINT32 through WINPR_ASSERTING_INT_CAST
 * before being handed to the SSSE3 helpers. The return value is whatever
 * the selected implementation returns.
 */
424 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
425 BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
426 INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
431 case PIXEL_FORMAT_BGRX32:
432 case PIXEL_FORMAT_BGRA32:
433 return ssse3_YCoCgRToRGB_8u_AC4R_invert(
434 pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
435 WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
437 case PIXEL_FORMAT_RGBX32:
438 case PIXEL_FORMAT_RGBA32:
439 return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
440 pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
441 WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
/* No SSSE3 path for this format: pass the original (signed) steps straight
 * to the generic implementation. */
444 return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
445 height, shift, withAlpha);
/*
 * Registers the SSSE3 YCoCg primitive in `prims`.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the function caches the generic
 * primitives table in `generic`, runs primitives_init_YCoCg(prims), and - if
 * the CPU reports both SSSE3 and SSE3 - replaces prims->YCoCgToRGB_8u_AC4R
 * with the SSSE3 implementation.
 */
452 void primitives_init_YCoCg_ssse3(
primitives_t* WINPR_RESTRICT prims)
454 #if defined(SSE_AVX_INTRINSICS_ENABLED)
/* The SSSE3 code falls back to this table for unaligned/short/tail pixels. */
455 generic = primitives_get_generic();
456 primitives_init_YCoCg(prims);
/* Only install the optimized routine when both CPU features are present. */
458 if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
459 IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
461 WLog_VRB(PRIM_TAG,
"SSE3/SSSE3 optimizations");
462 prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
/* Presumably the branch taken when SSE_AVX_INTRINSICS_ENABLED is not defined
 * (the #else / #endif lines are not visible in this excerpt). */
465 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE2 intrinsics not available");