#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
#define FREERDP_LIB_PRIM_TEMPLATES_H
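/* These are templates for SSE routines that apply one simple SIMD operation
 * across an array of data.  Since the code is identical except for the
 * operation itself, the routines are generated from these macros rather
 * than duplicated.  Naming convention, taken from the parameter lists:
 *   SCD     = Source, Constant, Destination
 *   SCD_PRE = Source, Constant, Destination, with the constant preloaded
 *             into an SSE register
 *   SSD     = Source1, Source2, Destination
 * LOAD_SI128 and mm_set1_epu32, used below, are assumed to be helper
 * wrappers defined elsewhere in the primitives library.
 */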
/* ------------------------------------------------------------------------- */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
    { \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Destination is off an element boundary; do it the slow way. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get the destination to a 16-byte boundary. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 8 128-bit SSE registers per iteration. */ \
        size_t count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            /* Unaligned source: use unaligned loads. */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, (_op_type_)val); \
                xmm1 = _op_(xmm1, (_op_type_)val); \
                xmm2 = _op_(xmm2, (_op_type_)val); \
                xmm3 = _op_(xmm3, (_op_type_)val); \
                xmm4 = _op_(xmm4, (_op_type_)val); \
                xmm5 = _op_(xmm5, (_op_type_)val); \
                xmm6 = _op_(xmm6, (_op_type_)val); \
                xmm7 = _op_(xmm7, (_op_type_)val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, (_op_type_)val); \
                xmm1 = _op_(xmm1, (_op_type_)val); \
                xmm2 = _op_(xmm2, (_op_type_)val); \
                xmm3 = _op_(xmm3, (_op_type_)val); \
                xmm4 = _op_(xmm4, (_op_type_)val); \
                xmm5 = _op_(xmm5, (_op_type_)val); \
                xmm6 = _op_(xmm6, (_op_type_)val); \
                xmm7 = _op_(xmm7, (_op_type_)val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, (_op_type_)val); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
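/* A sketch of an instantiation, with hypothetical names: this would expand
 * to an SSE left-shift-by-constant routine over INT16 data that falls back
 * to a generic C version for short or misaligned buffers.  _op_ receives
 * the constant cast to _op_type_, and _slowWay_ is the per-element
 * statement used for the alignment lead-in and the tail:
 *
 *   SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic_lShiftC_16s,
 *                    _mm_slli_epi16, UINT32,
 *                    *dptr++ = (INT16)(*sptr++ << val))
 */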
/* ------------------------------------------------------------------------- */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen) \
    { \
        size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        size_t count; \
        __m128i xmm0; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, WINPR_ASSERTING_INT_CAST(int32_t, len)); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Destination is off an element boundary; do it the slow way. */ \
            return _fallback_(pSrc, val, pDst, WINPR_ASSERTING_INT_CAST(int32_t, len)); \
        } \
        /* Get the destination to a 16-byte boundary. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers, with the constant preloaded. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = mm_set1_epu32(val); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            _mm_store_si128((__m128i*)dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
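/* A sketch of an instantiation, with hypothetical names: because the
 * constant is broadcast once into xmm0 via mm_set1_epu32, _op_ combines
 * two vectors, e.g. a bitwise AND of each 32-bit element with a constant:
 *
 *   SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic_andC_32u,
 *                        _mm_and_si128, *dptr++ = *sptr++ & val)
 */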
/* ------------------------------------------------------------------------- */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
    { \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Destination is off an element boundary; do it the slow way. */ \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        /* Get the destination to a 16-byte boundary. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            pstatus_t status; \
            status = _slowWay_; \
            if (status != PRIMITIVES_SUCCESS) \
                return status; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
        { \
            /* Unaligned loads from at least one source. */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            /* Aligned loads from both sources. */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
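/* A sketch of an instantiation, with hypothetical names: element-wise
 * saturated addition of two INT16 arrays.  Note that here _slowWay_ must
 * be an expression yielding a pstatus_t, since the alignment lead-in
 * checks its result:
 *
 *   SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic_add_16s,
 *                    _mm_adds_epi16,
 *                    generic_add_16s(sptr1++, sptr2++, dptr++, 1))
 */

#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */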