23 #include <freerdp/config.h>
25 #include <freerdp/types.h>
26 #include <freerdp/primitives.h>
27 #include <winpr/sysinfo.h>
29 #include "prim_alphaComp.h"
31 #include "prim_internal.h"
34 #if defined(SSE_AVX_INTRINSICS_ENABLED)
35 #include <emmintrin.h>
36 #include <pmmintrin.h>
/*
 * SSE2 variant of alphaComp_argb: blends two 32-bit-per-pixel source images
 * into pDst, one row at a time.
 *
 * Per row, as far as the visible code shows:
 *   - leading pixels are handed to generic->alphaComp_argb (width = leadIn,
 *     height = 1); the switch on the low 4 bits of dptr suggests leadIn is
 *     chosen so that dptr becomes 16-byte aligned -- TODO confirm, the case
 *     bodies are not visible in this excerpt;
 *   - the bulk is processed 16 bytes (4 pixels) per iteration with SSE2 and
 *     written with an aligned store;
 *   - whatever is left in `pixels` is handed to generic->alphaComp_argb.
 *
 * Per channel byte the vector path computes
 *     dst = ((((a1 + 1) * (s1 - s2)) >> 8) + s2) & 0xff
 * where s1/s2 are the channel values of source 1 / source 2 and a1 is the
 * most significant byte of the source-1 pixel (presumably alpha, given the
 * "argb" name).
 *
 * The *Step parameters are byte strides: they are compared against
 * width * sizeof(UINT32) to derive the per-row padding.
 *
 * Returns PRIMITIVES_SUCCESS for empty dimensions and on the final visible
 * return. Statuses from the generic fallbacks are checked; the handling after
 * those checks is not visible here.
 */
40 static pstatus_t sse2_alphaComp_argb(
const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
41 const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
42 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
/* Sources are walked as 32-bit pixels. */
45 const UINT32* sptr1 = (
const UINT32*)pSrc1;
46 const UINT32* sptr2 = (
const UINT32*)pSrc2;
/* Nothing to do for an empty rectangle. */
48 if ((width <= 0) || (height <= 0))
49 return PRIMITIVES_SUCCESS;
/* Delegates the whole operation to the generic implementation; the guarding
 * condition is not visible in this excerpt. */
53 return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
57 UINT32* dptr = (UINT32*)pDst;
/* Bytes of pixel data per row. */
58 const size_t linebytes = width *
sizeof(UINT32);
/* Row padding (stride minus pixel data) expressed in UINT32 units.
 * NOTE(review): the subtraction is unsigned; a step smaller than linebytes
 * would wrap -- confirm callers guarantee step >= linebytes. */
59 const size_t src1Jump = (src1Step - linebytes) /
sizeof(UINT32);
60 const size_t src2Jump = (src2Step - linebytes) /
sizeof(UINT32);
61 const size_t dstJump = (dstStep - linebytes) /
sizeof(UINT32);
/* xmm0 is used below as the second operand when unpacking bytes to 16-bit
 * lanes (mm_set1_epu32 is defined elsewhere; presumably yields all zeros
 * here). xmm1 holds 1 in every 16-bit lane. */
62 __m128i xmm0 = mm_set1_epu32(0);
63 __m128i xmm1 = _mm_set1_epi16(1);
65 for (UINT32 y = 0; y < height; ++y)
/* Pixels still to be processed in the current row. */
67 uint32_t pixels = width;
/* Dispatch on the destination pointer's offset within a 16-byte block. */
72 switch ((ULONG_PTR)dptr & 0x0f)
/* Lead-in: process `leadIn` pixels of a single row with the generic code. */
100 pstatus_t status = 0;
101 status =
generic->alphaComp_argb((
const BYTE*)sptr1, src1Step, (
const BYTE*)sptr2,
102 src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
103 if (status != PRIMITIVES_SUCCESS)
/* `count` is not defined in the visible lines; presumably the number of
 * 4-pixel groups handled by the vector loop. */
114 pixels -= count << 2;
/* Load 4 pixels from each source (LOAD_SI128 is defined elsewhere). */
125 xmm2 = LOAD_SI128(sptr1);
128 xmm3 = LOAD_SI128(sptr2);
/* Upper two pixels: widen each byte to a 16-bit lane. */
131 xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
133 xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* xmm6 = src1 - src2 per channel. */
135 xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* Broadcast the top channel of each source-1 pixel to all four lanes of
 * that pixel. */
137 xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
139 xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* (a1 + 1) * (src1 - src2), then >> 8 and add src2. */
141 xmm4 = _mm_adds_epi16(xmm4, xmm1);
143 xmm4 = _mm_mullo_epi16(xmm4, xmm6);
145 xmm4 = _mm_srai_epi16(xmm4, 8);
147 xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* Lower two pixels: same sequence using xmm5/xmm6/xmm7. */
150 xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
152 xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
154 xmm7 = _mm_subs_epi16(xmm5, xmm6);
156 xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
158 xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
160 xmm5 = _mm_adds_epi16(xmm5, xmm1);
162 xmm5 = _mm_mullo_epi16(xmm5, xmm7);
164 xmm5 = _mm_srai_epi16(xmm5, 8);
166 xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* Keep only the low byte of every 16-bit lane so that the unsigned-saturating
 * pack below does not clamp the values. */
169 xmm3 = _mm_set1_epi16(0x00ffU);
170 xmm4 = _mm_and_si128(xmm4, xmm3);
171 xmm5 = _mm_and_si128(xmm5, xmm3);
/* Repack to bytes (lower pixels first, then upper) and store with an aligned
 * 16-byte write. */
173 xmm5 = _mm_packus_epi16(xmm5, xmm4);
174 _mm_store_si128((__m128i*)dptr, xmm5);
/* Tail: hand the remaining `pixels` of this row to the generic code. */
181 pstatus_t status = 0;
182 status =
generic->alphaComp_argb((
const BYTE*)sptr1, src1Step, (
const BYTE*)sptr2,
183 src2Step, (BYTE*)dptr, dstStep, pixels, 1);
184 if (status != PRIMITIVES_SUCCESS)
198 return PRIMITIVES_SUCCESS;
/*
 * Registers the SSE-accelerated alpha composition routine in `prims`.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the visible code:
 *   1. caches the generic primitives table in `generic` (used by
 *      sse2_alphaComp_argb for its fallbacks),
 *   2. calls primitives_init_alphaComp(prims),
 *   3. if both SSE2 and SSE3 are reported as available, sets
 *      prims->alphaComp_argb to sse2_alphaComp_argb.
 *
 * The final log statement presumably belongs to the branch taken when the
 * intrinsics are not compiled in; the preprocessor lines separating it are
 * not visible in this excerpt.
 */
203 void primitives_init_alphaComp_sse3(
primitives_t* WINPR_RESTRICT prims)
205 #if defined(SSE_AVX_INTRINSICS_ENABLED)
206 generic = primitives_get_generic();
207 primitives_init_alphaComp(prims);
/* Both feature bits must be present before the SSE routine is installed. */
209 if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
210 IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
212 WLog_VRB(PRIM_TAG,
"SSE2/SSE3 optimizations");
213 prims->alphaComp_argb = sse2_alphaComp_argb;
217 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE3 intrinsics not available");