17 #include <freerdp/config.h>
19 #include <freerdp/types.h>
20 #include <freerdp/primitives.h>
21 #include <winpr/sysinfo.h>
25 #include "prim_internal.h"
26 #include "prim_templates.h"
28 #if defined(SSE2_ENABLED)
29 #include <emmintrin.h>
30 #include <pmmintrin.h>
/* Instantiates the function sse3_add_16s by expanding the SSE3_SSD_ROUTINE
 * template macro (defined outside this file view; presumably in
 * prim_templates.h -- confirm).
 *
 * Arguments as passed here:
 *   - sse3_add_16s      name of the generated routine
 *   - INT16             element type
 *   - generic->add_16s  generic implementation handed to the template
 *   - _mm_adds_epi16    SSE2 intrinsic: signed saturating add of eight
 *                       16-bit lanes
 *   - generic->add_16s(sptr1++, sptr2++, dptr++, 1)
 *                       expression that processes exactly one element via
 *                       the generic routine
 *
 * NOTE(review): how the template uses each argument (whole-buffer fallback,
 * per-element head/tail step, ...) is inferred from the names only; verify
 * against the macro definition.
 */
35 SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
36 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
38 static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
39 INT16* WINPR_RESTRICT pSrcDst2, UINT32 len)
42 INT16* dptr1 = pSrcDst1;
43 INT16* dptr2 = pSrcDst2;
46 return generic->add_16s_inplace(pSrcDst1, pSrcDst2, len);
48 UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
49 if ((ULONG_PTR)pSrcDst1 & offBeatMask)
52 return generic->add_16s_inplace(pSrcDst1, pSrcDst2, len);
55 const size_t rem = ((UINT_PTR)dptr1 & 0xf) /
sizeof(INT16);
58 const UINT32 add = 16 - (UINT32)rem;
59 pstatus_t status =
generic->add_16s_inplace(dptr1, dptr2, add);
60 if (status != PRIMITIVES_SUCCESS)
66 size_t count = len >> (7 - shifts);
67 len -= count << (7 - shifts);
68 if (((
const ULONG_PTR)dptr1 & 0x0f) || ((
const ULONG_PTR)dptr2 & 0x0f))
73 const __m128i* vsptr1 = (
const __m128i*)dptr1;
74 const __m128i* vsptr2 = (
const __m128i*)dptr2;
75 __m128i* vdptr1 = (__m128i*)dptr1;
76 __m128i* vdptr2 = (__m128i*)dptr2;
78 __m128i xmm0 = _mm_lddqu_si128(vsptr1++);
79 __m128i xmm1 = _mm_lddqu_si128(vsptr1++);
80 __m128i xmm2 = _mm_lddqu_si128(vsptr1++);
81 __m128i xmm3 = _mm_lddqu_si128(vsptr1++);
82 __m128i xmm4 = _mm_lddqu_si128(vsptr2++);
83 __m128i xmm5 = _mm_lddqu_si128(vsptr2++);
84 __m128i xmm6 = _mm_lddqu_si128(vsptr2++);
85 __m128i xmm7 = _mm_lddqu_si128(vsptr2++);
87 xmm0 = _mm_adds_epi16(xmm0, xmm4);
88 xmm1 = _mm_adds_epi16(xmm1, xmm5);
89 xmm2 = _mm_adds_epi16(xmm2, xmm6);
90 xmm3 = _mm_adds_epi16(xmm3, xmm7);
92 _mm_store_si128(vdptr1++, xmm0);
93 _mm_store_si128(vdptr1++, xmm1);
94 _mm_store_si128(vdptr1++, xmm2);
95 _mm_store_si128(vdptr1++, xmm3);
97 _mm_store_si128(vdptr2++, xmm0);
98 _mm_store_si128(vdptr2++, xmm1);
99 _mm_store_si128(vdptr2++, xmm2);
100 _mm_store_si128(vdptr2++, xmm3);
102 dptr1 = (INT16*)vdptr1;
103 dptr2 = (INT16*)vdptr2;
111 const __m128i* vsptr1 = (
const __m128i*)dptr1;
112 const __m128i* vsptr2 = (
const __m128i*)dptr2;
113 __m128i* vdptr1 = (__m128i*)dptr1;
114 __m128i* vdptr2 = (__m128i*)dptr2;
116 __m128i xmm0 = _mm_load_si128(vsptr1++);
117 __m128i xmm1 = _mm_load_si128(vsptr1++);
118 __m128i xmm2 = _mm_load_si128(vsptr1++);
119 __m128i xmm3 = _mm_load_si128(vsptr1++);
120 __m128i xmm4 = _mm_load_si128(vsptr2++);
121 __m128i xmm5 = _mm_load_si128(vsptr2++);
122 __m128i xmm6 = _mm_load_si128(vsptr2++);
123 __m128i xmm7 = _mm_load_si128(vsptr2++);
125 xmm0 = _mm_adds_epi16(xmm0, xmm4);
126 xmm1 = _mm_adds_epi16(xmm1, xmm5);
127 xmm2 = _mm_adds_epi16(xmm2, xmm6);
128 xmm3 = _mm_adds_epi16(xmm3, xmm7);
130 _mm_store_si128(vdptr1++, xmm0);
131 _mm_store_si128(vdptr1++, xmm1);
132 _mm_store_si128(vdptr1++, xmm2);
133 _mm_store_si128(vdptr1++, xmm3);
135 _mm_store_si128(vdptr2++, xmm0);
136 _mm_store_si128(vdptr2++, xmm1);
137 _mm_store_si128(vdptr2++, xmm2);
138 _mm_store_si128(vdptr2++, xmm3);
140 dptr1 = (INT16*)vdptr1;
141 dptr2 = (INT16*)vdptr2;
145 count = len >> (5 - shifts);
146 len -= count << (5 - shifts);
149 const __m128i* vsptr1 = (
const __m128i*)dptr1;
150 const __m128i* vsptr2 = (
const __m128i*)dptr2;
151 __m128i* vdptr1 = (__m128i*)dptr1;
152 __m128i* vdptr2 = (__m128i*)dptr2;
154 __m128i xmm0 = LOAD_SI128(vsptr1);
155 __m128i xmm1 = LOAD_SI128(vsptr2);
157 xmm0 = _mm_adds_epi16(xmm0, xmm1);
159 _mm_store_si128(vdptr1++, xmm0);
160 _mm_store_si128(vdptr2++, xmm0);
162 dptr1 = (INT16*)vdptr1;
163 dptr2 = (INT16*)vdptr2;
167 return generic->add_16s_inplace(dptr1, dptr2, len);
169 return PRIMITIVES_SUCCESS;
174 void primitives_init_add_sse3(
primitives_t* WINPR_RESTRICT prims)
176 #if defined(SSE2_ENABLED)
177 generic = primitives_get_generic();
178 primitives_init_add(prims);
180 if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
181 IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
183 WLog_VRB(PRIM_TAG,
"SSE2/SSE3 optimizations");
184 prims->add_16s = sse3_add_16s;
185 prims->add_16s_inplace = sse3_add_16s_inplace;
189 WLog_VRB(PRIM_TAG,
"undefined WITH_SSE2");