16 #include <freerdp/config.h>
18 #include <freerdp/types.h>
19 #include <freerdp/primitives.h>
20 #include <winpr/sysinfo.h>
22 #include "prim_shift.h"
24 #include "prim_internal.h"
25 #include "prim_templates.h"
27 #if defined(SSE2_ENABLED)
28 #include <emmintrin.h>
29 #include <pmmintrin.h>
/*
 * sse2_lShiftC_16s: left shift of signed 16-bit samples by a constant.
 *
 * Instantiated through SSE3_SCD_ROUTINE (defined outside this excerpt;
 * presumably a source/constant/destination template - confirm against
 * prim_templates.h). Arguments as passed here: generated function name,
 * element type, generic routine to defer to, SSE intrinsic
 * (_mm_slli_epi16 = per-lane 16-bit logical left shift), and the scalar
 * statement used for single elements.
 *
 * The scalar statement shifts in the unsigned domain and narrows back to
 * INT16, which avoids left-shifting a negative signed value.
 */
34 SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
35 *dptr++ = (INT16)((UINT16)*sptr++ << val))
/*
 * sse2_rShiftC_16s: right shift of signed 16-bit samples by a constant.
 *
 * Instantiated through SSE3_SCD_ROUTINE (defined outside this excerpt).
 * The vector path uses _mm_srai_epi16, the arithmetic (sign-filling)
 * per-lane 16-bit right shift. The scalar statement applies `>>` directly
 * to the INT16 sample; for negative values the result of `>>` is
 * implementation-defined in C - presumably arithmetic on all supported
 * compilers so that it matches the vector path (confirm).
 */
37 SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16,
38 *dptr++ = *sptr++ >> val)
/*
 * sse2_lShiftC_16u: left shift of unsigned 16-bit samples by a constant.
 *
 * Instantiated through SSE3_SCD_ROUTINE (defined outside this excerpt).
 * Arguments as passed here: generated function name, element type, generic
 * routine to defer to, SSE intrinsic (_mm_slli_epi16 = per-lane 16-bit
 * logical left shift), and the scalar statement used for single elements.
 *
 * The scalar statement narrows the promoted shift result with (UINT16),
 * matching the element type. Narrowing through a signed 16-bit type would
 * be an implementation-defined conversion for results above INT16_MAX;
 * the unsigned conversion is well-defined modulo 2^16.
 */
40 SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
41 *dptr++ = (UINT16)((UINT16)*sptr++ << val))
/*
 * sse2_rShiftC_16u: right shift of unsigned 16-bit samples by a constant.
 *
 * Instantiated through SSE3_SCD_ROUTINE (defined outside this excerpt).
 * The vector path uses _mm_srli_epi16, the logical (zero-filling) per-lane
 * 16-bit right shift; the scalar statement applies `>>` to the UINT16
 * sample, which is likewise zero-filling.
 */
43 SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
44 *dptr++ = *sptr++ >> val)
/*
 * In-place left shift of `len` signed 16-bit samples at `pSrcDst` by `val`
 * bits, using SSE2 128-bit loads, shifts and stores for the bulk of the data
 * and the generic implementation for everything else.
 *
 * Returns PRIMITIVES_SUCCESS, or whatever status the generic routine
 * reports on the paths that defer to it.
 *
 * NOTE(review): this excerpt omits several lines of the function (the
 * conditions guarding the early returns, the loop headers and the braces).
 * Comments below describe only the statements that are visible.
 */
46 static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
/* log2 of the element size in bytes (INT16 -> 2); used to turn byte-based
 * block sizes into element counts in the shifts further down. */
48 const INT32 shifts = 2;
/* Early exit; the guarding condition is not visible here. */
50 return PRIMITIVES_SUCCESS;
/* Hand the whole request to the generic routine; the guarding condition is
 * not visible here. */
54 return generic->lShiftC_16s_inplace(pSrcDst, val, len);
/* (1 << 1) - 1 == 1: tests the lowest address bit, i.e. whether the buffer
 * is not even 2-byte aligned. Such a pointer can never reach a 16-byte
 * boundary by stepping whole INT16 elements, so the generic routine is
 * used instead. */
56 UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
57 if ((ULONG_PTR)pSrcDst & offBeatMask)
60 return generic->lShiftC_16s_inplace(pSrcDst, val, len);
/* Offset of the buffer inside its 16-byte line, in INT16 elements (0..7). */
63 const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) /
sizeof(INT16);
/* Leading elements processed by the generic routine before the vector
 * code takes over.
 * NOTE(review): rem is counted in elements, so 8 - rem would already reach
 * the next 16-byte boundary; 16 - rem also lands on a boundary but gives
 * eight more elements to the generic routine. Confirm this is intended. */
66 const UINT32 add = 16 - rem;
67 pstatus_t status =
generic->lShiftC_16s_inplace(pSrcDst, val, add);
/* The statement executed on failure is not visible in this excerpt. */
68 if (status != PRIMITIVES_SUCCESS)
/* len >> 6: number of full 64-element blocks (8 vectors x 8 lanes of
 * 16 bits); len keeps what is left over after those blocks. */
75 int count = len >> (8 - shifts);
76 len -= count << (8 - shifts);
/* One 64-element block: eight aligned 128-bit loads ... */
80 const __m128i* src = (
const __m128i*)pSrcDst;
82 __m128i xmm0 = _mm_load_si128(src++);
83 __m128i xmm1 = _mm_load_si128(src++);
84 __m128i xmm2 = _mm_load_si128(src++);
85 __m128i xmm3 = _mm_load_si128(src++);
86 __m128i xmm4 = _mm_load_si128(src++);
87 __m128i xmm5 = _mm_load_si128(src++);
88 __m128i xmm6 = _mm_load_si128(src++);
89 __m128i xmm7 = _mm_load_si128(src);
/* ... every 16-bit lane shifted left by val ... */
91 xmm0 = _mm_slli_epi16(xmm0, val);
92 xmm1 = _mm_slli_epi16(xmm1, val);
93 xmm2 = _mm_slli_epi16(xmm2, val);
94 xmm3 = _mm_slli_epi16(xmm3, val);
95 xmm4 = _mm_slli_epi16(xmm4, val);
96 xmm5 = _mm_slli_epi16(xmm5, val);
97 xmm6 = _mm_slli_epi16(xmm6, val);
98 xmm7 = _mm_slli_epi16(xmm7, val);
/* ... and eight aligned stores back to the same addresses. */
100 __m128i* dst = (__m128i*)pSrcDst;
102 _mm_store_si128(dst++, xmm0);
103 _mm_store_si128(dst++, xmm1);
104 _mm_store_si128(dst++, xmm2);
105 _mm_store_si128(dst++, xmm3);
106 _mm_store_si128(dst++, xmm4);
107 _mm_store_si128(dst++, xmm5);
108 _mm_store_si128(dst++, xmm6);
109 _mm_store_si128(dst++, xmm7);
/* dst now points just past the block; continue from there. */
111 pSrcDst = (INT16*)dst;
/* len >> 3: number of full 8-element (single vector) blocks left; len
 * keeps the final remainder. */
115 count = len >> (5 - shifts);
116 len -= count << (5 - shifts);
/* One vector: load via the project's LOAD_SI128 macro (defined outside
 * this excerpt), shift each lane, aligned store, advance. */
119 const __m128i* src = (
const __m128i*)pSrcDst;
120 __m128i xmm0 = LOAD_SI128(src);
122 xmm0 = _mm_slli_epi16(xmm0, val);
124 __m128i* dst = (__m128i*)pSrcDst;
125 _mm_store_si128(dst++, xmm0);
126 pSrcDst = (INT16*)dst;
/* Remaining tail elements go through the generic routine; the guarding
 * condition is not visible here. */
131 return generic->lShiftC_16s_inplace(pSrcDst, val, len);
133 return PRIMITIVES_SUCCESS;
/*
 * Fills the shift entries of `prims`, replacing them with the SSE routines
 * from this file when the CPU reports both SSE2 and SSE3 support.
 *
 * NOTE(review): braces and the #else / #endif lines of this function are
 * not visible in this excerpt; comments below cover the visible statements
 * only.
 */
143 void primitives_init_shift_sse3(
primitives_t* WINPR_RESTRICT prims)
145 #if defined(SSE2_ENABLED)
/* Keep the generic primitives table; the SSE routines in this file call
 * into it through `generic->...`. */
146 generic = primitives_get_generic();
/* Run the base shift initialisation on prims first (presumably installing
 * the non-SIMD implementations - confirm); selected entries are
 * overwritten below. */
147 primitives_init_shift(prims);
/* Runtime CPU feature check: both SSE2 and SSE3 must be reported. */
149 if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
150 IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
152 WLog_VRB(PRIM_TAG,
"SSE2/SSE3 optimizations");
/* Install the five accelerated shift routines defined in this file. */
153 prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
154 prims->lShiftC_16s = sse2_lShiftC_16s;
155 prims->rShiftC_16s = sse2_rShiftC_16s;
156 prims->lShiftC_16u = sse2_lShiftC_16u;
157 prims->rShiftC_16u = sse2_rShiftC_16u;
/* NOTE(review): presumably the branch taken when SSE2_ENABLED is not
 * defined (the #else is not visible here). The message names WITH_SSE2
 * while the guard above tests SSE2_ENABLED - confirm which is intended. */
161 WLog_VRB(PRIM_TAG,
"undefined WITH_SSE2");