20 #include <freerdp/config.h>
22 #include <freerdp/types.h>
23 #include <freerdp/primitives.h>
24 #include <winpr/sysinfo.h>
26 #include "prim_YCoCg.h"
28 #include "prim_internal.h"
29 #include "prim_templates.h"
31 #if defined(SSE2_ENABLED)
32 #include <emmintrin.h>
33 #include <tmmintrin.h>
/*
 * SSSE3 path converting 4-byte-per-pixel YCoCg-R data to 4-byte-per-pixel
 * output. The dispatcher ssse3_YCoCgRToRGB_8u_AC4R selects this variant for
 * the PIXEL_FORMAT_BGRX32 and PIXEL_FORMAT_BGRA32 destination formats.
 *
 * pSrc / srcStep   source buffer and its row step in bytes
 * pDst / dstStep   destination buffer and its row step in bytes
 * DstFormat        forwarded unchanged to the generic implementation
 * width / height   image size in pixels
 * shift            chroma shift; the code uses (shift - 1) as the actual count
 * withAlpha        presumably chooses between the source alpha bytes and a
 *                  constant 0xFF (the selecting condition is not visible here)
 *
 * Returns PRIMITIVES_SUCCESS at the end, or the result of the generic
 * implementation when the whole call is delegated to it.
 *
 * NOTE(review): this listing shows only part of the function (braces, loop
 * headers, pointer increments and the declaration of dptr are not visible);
 * comments below describe only what the visible statements do.
 */
38 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
39 BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
40 UINT32 dstStep, UINT32 width, UINT32 height,
41 UINT8 shift, BOOL withAlpha)
43 const BYTE* sptr = pSrc;
/* Bytes left in a row after width 4-byte pixels; presumably the per-row
 * padding skipped at the end of each row (the use is not visible here).
 * NOTE(review): unsigned arithmetic narrowed to int. */
45 int sRowBump = srcStep - width *
sizeof(UINT32);
46 int dRowBump = dstStep - width *
sizeof(UINT32);
/* Effective shift count is one less than the requested shift.
 * NOTE(review): shift == 0 gives dataShift == -1, and shifting by a negative
 * count is undefined behavior in C - confirm callers guarantee shift >= 1. */
50 int dataShift = shift - 1;
/* Per-byte mask that clears the low dataShift bits; applied after the 16-bit
 * lane shift below to drop bits carried in from the neighbouring byte. */
51 BYTE mask = (BYTE)(0xFFU << dataShift);
/* Fewer than 8 pixels per row, or a destination that is not 4-byte aligned:
 * hand the whole image to the generic implementation. */
62 if ((width < 8) || (ULONG_PTR)dptr & 0x03)
65 return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
/* Row loop. */
69 for (UINT32 h = 0; h < height; h++)
/* Destination not yet 16-byte aligned: compute how many 4-byte pixels are
 * needed to reach the next 16-byte boundary and convert those through the
 * generic routine (trailing call arguments are not visible here). */
75 if ((ULONG_PTR)dptr & 0x0f)
78 UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
83 status =
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
86 if (status != PRIMITIVES_SUCCESS)
/* Step both pointers past the pixels just converted. */
89 sptr += startup *
sizeof(UINT32);
90 dptr += startup *
sizeof(UINT32);
/* TRUE when the source pointer is 16-byte aligned; presumably selects
 * between the aligned and unaligned loads below. */
95 onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
/* Aligned 16-byte loads (4 pixels each). Both read through sptr, so sptr is
 * presumably advanced between them on a line not shown. */
111 R0 = _mm_load_si128((
const __m128i*)sptr);
113 R1 = _mm_load_si128((
const __m128i*)sptr);
/* Same loads without the alignment requirement. */
119 R0 = _mm_lddqu_si128((
const __m128i*)sptr);
121 R1 = _mm_lddqu_si128((
const __m128i*)sptr);
/* Byte shuffle that groups like bytes: the result holds byte 0 of the four
 * pixels in its lowest 32 bits, then all byte 1s, byte 2s and byte 3s. */
128 R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
129 R3 = _mm_shuffle_epi8(R0, R2);
130 R4 = _mm_shuffle_epi8(R1, R2);
/* Merge both halves so each plane covers 8 pixels:
 * R5 = byte 2s (low 64 bits) and byte 3s (high 64 bits),
 * R6 = byte 0s (low 64 bits) and byte 1s (high 64 bits). */
133 R5 = _mm_unpackhi_epi32(R3, R4);
134 R6 = _mm_unpacklo_epi32(R3, R4);
/* R7 supplies the output byte 3 values: either the eight source byte 3s
 * duplicated into both halves, or all bits set (0xFF per byte). */
140 R7 = _mm_unpackhi_epi64(R5, R5);
142 R7 = _mm_set1_epi32(0xFFFFFFFFU);
/* Zero-extend the eight byte 2 values to 16-bit lanes. In the arithmetic
 * below this plane plays the role of Y. */
146 R1 = _mm_set1_epi32(0);
147 R0 = _mm_unpacklo_epi8(R5, R1);
/* Shift the byte 0 / byte 1 planes left by dataShift. The shift works on
 * 16-bit lanes, so the per-byte mask removes bits that crossed from the low
 * byte into the high byte of each lane. */
154 R6 = _mm_slli_epi16(R6, dataShift);
155 R1 = _mm_set1_epi8(mask);
156 R6 = _mm_and_si128(R6, R1);
/* Sign-extend the shifted byte 1 values to 16 bits: duplicate each byte
 * into both halves of a lane, then arithmetic shift right by 8. */
159 R1 = _mm_unpackhi_epi8(R6, R6);
160 R1 = _mm_srai_epi16(R1, 8);
/* Same sign extension for the shifted byte 0 values. */
163 R2 = _mm_unpacklo_epi8(R6, R6);
164 R2 = _mm_srai_epi16(R2, 8);
/* T = byte2 - byte0' (saturating 16-bit), shared by two output channels. */
167 R6 = _mm_subs_epi16(R0, R2);
/* T + byte1' */
169 R3 = _mm_adds_epi16(R6, R1);
/* byte2 + byte0' */
172 R4 = _mm_adds_epi16(R0, R2);
/* T - byte1' */
175 R5 = _mm_subs_epi16(R6, R1);
/* Pack back to unsigned bytes with saturation to 0..255:
 * low 8 bytes = (T + byte1'), high 8 bytes = (T - byte1'). */
178 R0 = _mm_packus_epi16(R3, R5);
/* Pack the (byte2 + byte0') results, then pair them with the R7 bytes:
 * low 8 bytes = (byte2 + byte0'), high 8 bytes = alpha / 0xFF. */
181 R1 = _mm_packus_epi16(R4, R4);
184 R1 = _mm_unpackhi_epi64(R1, R7);
/* Re-interleave the planes into pixels. Each output pixel's bytes in memory
 * order are: (T + byte1'), (byte2 + byte0'), (T - byte1'), R7 byte. */
187 R2 = _mm_unpacklo_epi8(R0, R1);
189 R3 = _mm_unpackhi_epi8(R0, R1);
191 R4 = _mm_unpacklo_epi16(R2, R3);
193 R5 = _mm_unpackhi_epi16(R2, R3);
/* Aligned stores of pixels 0-3 and 4-7. Both write through dptr, so dptr is
 * presumably advanced between them on a line not shown. */
195 _mm_store_si128((__m128i*)dptr, R4);
197 _mm_store_si128((__m128i*)dptr, R5);
/* Leftover w pixels of the row go through the generic routine as a
 * single-row call, after which both pointers step past them. */
205 pstatus_t status = 0;
206 status =
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
209 if (status != PRIMITIVES_SUCCESS)
212 sptr += w *
sizeof(UINT32);
213 dptr += w *
sizeof(UINT32);
220 return PRIMITIVES_SUCCESS;
/*
 * SSSE3 path converting 4-byte-per-pixel YCoCg-R data to 4-byte-per-pixel
 * output. The dispatcher ssse3_YCoCgRToRGB_8u_AC4R selects this variant for
 * the PIXEL_FORMAT_RGBX32 and PIXEL_FORMAT_RGBA32 destination formats.
 * The visible statements match ssse3_YCoCgRToRGB_8u_AC4R_invert except for
 * the operand order of _mm_packus_epi16, which swaps output bytes 0 and 2.
 *
 * pSrc / srcStep   source buffer and its row step in bytes
 * pDst / dstStep   destination buffer and its row step in bytes
 * DstFormat        forwarded unchanged to the generic implementation
 * width / height   image size in pixels
 * shift            chroma shift; the code uses (shift - 1) as the actual count
 * withAlpha        forwarded to the generic routine; presumably also chooses
 *                  between the source alpha bytes and a constant 0xFF (the
 *                  selecting condition is not visible here)
 *
 * Returns PRIMITIVES_SUCCESS at the end, or the result of the generic
 * implementation when the whole call is delegated to it.
 *
 * NOTE(review): this listing shows only part of the function (braces, loop
 * headers, pointer increments and the declaration of dptr are not visible);
 * comments below describe only what the visible statements do.
 */
224 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
const BYTE* WINPR_RESTRICT pSrc,
225 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
226 UINT32 DstFormat, UINT32 dstStep, UINT32 width,
227 UINT32 height, UINT8 shift, BOOL withAlpha)
229 const BYTE* sptr = pSrc;
/* Bytes left in a row after width 4-byte pixels; presumably the per-row
 * padding skipped at the end of each row (the use is not visible here).
 * NOTE(review): unsigned arithmetic narrowed to int. */
231 int sRowBump = srcStep - width *
sizeof(UINT32);
232 int dRowBump = dstStep - width *
sizeof(UINT32);
/* Effective shift count is one less than the requested shift.
 * NOTE(review): shift == 0 gives dataShift == -1, and shifting by a negative
 * count is undefined behavior in C - confirm callers guarantee shift >= 1. */
236 int dataShift = shift - 1;
/* Per-byte mask that clears the low dataShift bits; applied after the 16-bit
 * lane shift below to drop bits carried in from the neighbouring byte. */
237 BYTE mask = (BYTE)(0xFFU << dataShift);
/* Fewer than 8 pixels per row, or a destination that is not 4-byte aligned:
 * hand the whole image to the generic implementation. */
248 if ((width < 8) || (ULONG_PTR)dptr & 0x03)
251 return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
/* Row loop. */
255 for (UINT32 h = 0; h < height; h++)
/* Destination not yet 16-byte aligned: convert the 4-byte pixels up to the
 * next 16-byte boundary through the generic routine as a single-row call. */
261 if ((ULONG_PTR)dptr & 0x0f)
263 pstatus_t status = 0;
264 UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
269 status =
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
270 1, shift, withAlpha);
272 if (status != PRIMITIVES_SUCCESS)
/* Step both pointers past the pixels just converted. */
275 sptr += startup *
sizeof(UINT32);
276 dptr += startup *
sizeof(UINT32);
/* TRUE when the source pointer is 16-byte aligned; presumably selects
 * between the aligned and unaligned loads below. */
281 onStride = (((
const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
/* Aligned 16-byte loads (4 pixels each). Both read through sptr, so sptr is
 * presumably advanced between them on a line not shown. */
297 R0 = _mm_load_si128((
const __m128i*)sptr);
299 R1 = _mm_load_si128((
const __m128i*)sptr);
/* Same loads without the alignment requirement. */
305 R0 = _mm_lddqu_si128((
const __m128i*)sptr);
307 R1 = _mm_lddqu_si128((
const __m128i*)sptr);
/* Byte shuffle that groups like bytes: the result holds byte 0 of the four
 * pixels in its lowest 32 bits, then all byte 1s, byte 2s and byte 3s. */
314 R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
315 R3 = _mm_shuffle_epi8(R0, R2);
316 R4 = _mm_shuffle_epi8(R1, R2);
/* Merge both halves so each plane covers 8 pixels:
 * R5 = byte 2s (low 64 bits) and byte 3s (high 64 bits),
 * R6 = byte 0s (low 64 bits) and byte 1s (high 64 bits). */
319 R5 = _mm_unpackhi_epi32(R3, R4);
320 R6 = _mm_unpacklo_epi32(R3, R4);
/* R7 supplies the output byte 3 values: either the eight source byte 3s
 * duplicated into both halves, or all bits set (0xFF per byte). */
326 R7 = _mm_unpackhi_epi64(R5, R5);
328 R7 = _mm_set1_epi32(0xFFFFFFFFU);
/* Zero-extend the eight byte 2 values to 16-bit lanes. In the arithmetic
 * below this plane plays the role of Y. */
332 R1 = _mm_set1_epi32(0);
333 R0 = _mm_unpacklo_epi8(R5, R1);
/* Shift the byte 0 / byte 1 planes left by dataShift. The shift works on
 * 16-bit lanes, so the per-byte mask removes bits that crossed from the low
 * byte into the high byte of each lane. */
340 R6 = _mm_slli_epi16(R6, dataShift);
341 R1 = _mm_set1_epi8(mask);
342 R6 = _mm_and_si128(R6, R1);
/* Sign-extend the shifted byte 1 values to 16 bits: duplicate each byte
 * into both halves of a lane, then arithmetic shift right by 8. */
345 R1 = _mm_unpackhi_epi8(R6, R6);
346 R1 = _mm_srai_epi16(R1, 8);
/* Same sign extension for the shifted byte 0 values. */
349 R2 = _mm_unpacklo_epi8(R6, R6);
350 R2 = _mm_srai_epi16(R2, 8);
/* T = byte2 - byte0' (saturating 16-bit), shared by two output channels. */
353 R6 = _mm_subs_epi16(R0, R2);
/* T + byte1' */
355 R3 = _mm_adds_epi16(R6, R1);
/* byte2 + byte0' */
358 R4 = _mm_adds_epi16(R0, R2);
/* T - byte1' */
361 R5 = _mm_subs_epi16(R6, R1);
/* Pack back to unsigned bytes with saturation to 0..255:
 * low 8 bytes = (T - byte1'), high 8 bytes = (T + byte1'). */
368 R0 = _mm_packus_epi16(R5, R3);
/* Pack the (byte2 + byte0') results, then pair them with the R7 bytes:
 * low 8 bytes = (byte2 + byte0'), high 8 bytes = alpha / 0xFF. */
371 R1 = _mm_packus_epi16(R4, R4);
374 R1 = _mm_unpackhi_epi64(R1, R7);
/* Re-interleave the planes into pixels. Each output pixel's bytes in memory
 * order are: (T - byte1'), (byte2 + byte0'), (T + byte1'), R7 byte. */
377 R2 = _mm_unpacklo_epi8(R0, R1);
379 R3 = _mm_unpackhi_epi8(R0, R1);
381 R4 = _mm_unpacklo_epi16(R2, R3);
383 R5 = _mm_unpackhi_epi16(R2, R3);
/* Aligned stores of pixels 0-3 and 4-7. Both write through dptr, so dptr is
 * presumably advanced between them on a line not shown. */
385 _mm_store_si128((__m128i*)dptr, R4);
387 _mm_store_si128((__m128i*)dptr, R5);
/* Leftover w pixels of the row go through the generic routine as a
 * single-row call, after which both pointers step past them. */
395 pstatus_t status = 0;
396 status =
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
399 if (status != PRIMITIVES_SUCCESS)
402 sptr += w *
sizeof(UINT32);
403 dptr += w *
sizeof(UINT32);
410 return PRIMITIVES_SUCCESS;
/*
 * Entry point installed into primitives_t by primitives_init_YCoCg_ssse3.
 * Routes the conversion by destination pixel format:
 *   BGRX32 / BGRA32 -> ssse3_YCoCgRToRGB_8u_AC4R_invert
 *   RGBX32 / RGBA32 -> ssse3_YCoCgRToRGB_8u_AC4R_no_invert
 *   otherwise       -> generic->YCoCgToRGB_8u_AC4R
 * All arguments are forwarded unchanged and the callee's status is returned.
 *
 * NOTE(review): srcStep/dstStep are INT32 here while both SSSE3 helpers take
 * UINT32, so the values are implicitly converted on the call - confirm that
 * negative steps are never passed.
 * NOTE(review): the end of the parameter list and the switch statement itself
 * are not visible in this listing; the last call is presumably the default
 * case.
 */
414 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
415 BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
416 INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
/* Destination formats served by the "invert" variant. */
421 case PIXEL_FORMAT_BGRX32:
422 case PIXEL_FORMAT_BGRA32:
423 return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
424 height, shift, withAlpha);
/* Destination formats served by the "no invert" variant. */
426 case PIXEL_FORMAT_RGBX32:
427 case PIXEL_FORMAT_RGBA32:
428 return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep,
429 width, height, shift, withAlpha);
/* Any other format is left to the generic implementation. */
432 return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
433 height, shift, withAlpha);
/*
 * Initializes the YCoCg entries of prims and, when the build defines
 * SSE2_ENABLED and the CPU reports both SSSE3 and SSE3, replaces
 * prims->YCoCgToRGB_8u_AC4R with the SSSE3 implementation from this file.
 *
 * prims   primitives table to fill in.
 *
 * NOTE(review): braces and the preprocessor else/endif lines are not visible
 * in this listing; the final log call is presumably the branch taken when
 * SSE2_ENABLED is not defined.
 */
440 void primitives_init_YCoCg_ssse3(
primitives_t* WINPR_RESTRICT prims)
442 #if defined(SSE2_ENABLED)
/* Cache the generic primitives used as fallback by the SSSE3 routines, then
 * run the base YCoCg initialization on prims. */
443 generic = primitives_get_generic();
444 primitives_init_YCoCg(prims);
/* Install the optimized routine only when both CPU feature checks pass. */
446 if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
447 IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
449 WLog_VRB(PRIM_TAG,
"SSE3/SSSE3 optimizations");
450 prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
/* Verbose log only; no function pointer is assigned on this path. */
453 WLog_VRB(PRIM_TAG,
"undefined WITH_SSE2");