/*
 * FreeRDP: A Remote Desktop Protocol Implementation
 * Templates for SSE-optimized primitive routines.
 * (NOTE(review): the original boxed banner/license comment was reduced to
 * stray '|' characters during extraction; restore the full header from the
 * upstream file before committing.)
 */
20 #ifndef FREERDP_LIB_PRIM_TEMPLATES_H
21 #define FREERDP_LIB_PRIM_TEMPLATES_H
/*
 * SSE3_SCD_ROUTINE — expands to a static "Source, Constant, Destination"
 * primitive: applies _op_(<128-bit vector>, val) to every element of pSrc
 * and writes the results to pDst, using SSE vectors for the bulk of the
 * work and _fallback_/_slowWay_ for small, misaligned, or leftover spans.
 *
 * NOTE(review): the embedded original line numbers are non-contiguous
 * (47 -> 51, 55 -> 60, 128 -> 136, ...), so the function's opening brace,
 * the declarations of `shifts`/`count`, several if/loop bodies and the
 * closing braces were lost in extraction. The surviving lines are kept
 * byte-identical below; this block is NOT compilable as-is — recover the
 * missing lines from the upstream file.
 */
46 #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
47 static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
51 const _type_* sptr = pSrc; \
52 _type_* dptr = pDst; \
/* Early exits (conditions elided): trivial success, then a scalar \
 * fallback — presumably for lengths too short to vectorize; TODO confirm \
 * against the upstream source. */ \
55 return PRIMITIVES_SUCCESS; \
60 return _fallback_(pSrc, val, pDst, len); \
/* Derive `shifts` from the element width (per-branch assignments elided); \
 * `shifts` later converts between element counts and unrolled-iteration \
 * counts in the `len >> (8 - shifts)` / `len >> (5 - shifts)` math. */ \
62 if (sizeof(_type_) == 1) \
64 else if (sizeof(_type_) == 2) \
66 else if (sizeof(_type_) == 4) \
68 else if (sizeof(_type_) == 8) \
/* If pDst is not even element-aligned, SSE alignment can never be \
 * reached by stepping whole elements — punt to the scalar fallback. */ \
70 offBeatMask = (1 << (shifts - 1)) - 1; \
71 if ((ULONG_PTR)pDst & offBeatMask) \
74 return _fallback_(pSrc, val, pDst, len); \
/* Step one element at a time (via _slowWay_, body elided) until dptr is \
 * 16-byte aligned, so the _mm_store_si128 calls below are legal. */ \
77 while ((ULONG_PTR)dptr & 0x0f) \
81 return PRIMITIVES_SUCCESS; \
/* Main loop: 8 x 128-bit registers (128 bytes) per iteration. `count` is \
 * the number of unrolled iterations; `len` keeps the remainder. */ \
84 count = len >> (8 - shifts); \
85 len -= count << (8 - shifts); \
/* Unaligned-source path: SSE3 _mm_lddqu_si128 loads, aligned stores \
 * (dptr alignment was established above). */ \
86 if ((const ULONG_PTR)sptr & 0x0f) \
90 __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
91 sptr += (16 / sizeof(_type_)); \
92 __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
93 sptr += (16 / sizeof(_type_)); \
94 __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
95 sptr += (16 / sizeof(_type_)); \
96 __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
97 sptr += (16 / sizeof(_type_)); \
98 __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
99 sptr += (16 / sizeof(_type_)); \
100 __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
101 sptr += (16 / sizeof(_type_)); \
102 __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
103 sptr += (16 / sizeof(_type_)); \
104 __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
105 sptr += (16 / sizeof(_type_)); \
/* _op_ here takes (vector, scalar) — e.g. an immediate-shift intrinsic \
 * shape; contrast with SSE3_SCD_PRE_ROUTINE, which splats val first. */ \
106 xmm0 = _op_(xmm0, val); \
107 xmm1 = _op_(xmm1, val); \
108 xmm2 = _op_(xmm2, val); \
109 xmm3 = _op_(xmm3, val); \
110 xmm4 = _op_(xmm4, val); \
111 xmm5 = _op_(xmm5, val); \
112 xmm6 = _op_(xmm6, val); \
113 xmm7 = _op_(xmm7, val); \
114 _mm_store_si128((__m128i*)dptr, xmm0); \
115 dptr += (16 / sizeof(_type_)); \
116 _mm_store_si128((__m128i*)dptr, xmm1); \
117 dptr += (16 / sizeof(_type_)); \
118 _mm_store_si128((__m128i*)dptr, xmm2); \
119 dptr += (16 / sizeof(_type_)); \
120 _mm_store_si128((__m128i*)dptr, xmm3); \
121 dptr += (16 / sizeof(_type_)); \
122 _mm_store_si128((__m128i*)dptr, xmm4); \
123 dptr += (16 / sizeof(_type_)); \
124 _mm_store_si128((__m128i*)dptr, xmm5); \
125 dptr += (16 / sizeof(_type_)); \
126 _mm_store_si128((__m128i*)dptr, xmm6); \
127 dptr += (16 / sizeof(_type_)); \
128 _mm_store_si128((__m128i*)dptr, xmm7); \
129 dptr += (16 / sizeof(_type_)); \
/* Aligned-source path: identical 8-register unroll using the faster \
 * aligned _mm_load_si128 loads. */ \
136 __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
137 sptr += (16 / sizeof(_type_)); \
138 __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
139 sptr += (16 / sizeof(_type_)); \
140 __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
141 sptr += (16 / sizeof(_type_)); \
142 __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
143 sptr += (16 / sizeof(_type_)); \
144 __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
145 sptr += (16 / sizeof(_type_)); \
146 __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
147 sptr += (16 / sizeof(_type_)); \
148 __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
149 sptr += (16 / sizeof(_type_)); \
150 __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
151 sptr += (16 / sizeof(_type_)); \
152 xmm0 = _op_(xmm0, val); \
153 xmm1 = _op_(xmm1, val); \
154 xmm2 = _op_(xmm2, val); \
155 xmm3 = _op_(xmm3, val); \
156 xmm4 = _op_(xmm4, val); \
157 xmm5 = _op_(xmm5, val); \
158 xmm6 = _op_(xmm6, val); \
159 xmm7 = _op_(xmm7, val); \
160 _mm_store_si128((__m128i*)dptr, xmm0); \
161 dptr += (16 / sizeof(_type_)); \
162 _mm_store_si128((__m128i*)dptr, xmm1); \
163 dptr += (16 / sizeof(_type_)); \
164 _mm_store_si128((__m128i*)dptr, xmm2); \
165 dptr += (16 / sizeof(_type_)); \
166 _mm_store_si128((__m128i*)dptr, xmm3); \
167 dptr += (16 / sizeof(_type_)); \
168 _mm_store_si128((__m128i*)dptr, xmm4); \
169 dptr += (16 / sizeof(_type_)); \
170 _mm_store_si128((__m128i*)dptr, xmm5); \
171 dptr += (16 / sizeof(_type_)); \
172 _mm_store_si128((__m128i*)dptr, xmm6); \
173 dptr += (16 / sizeof(_type_)); \
174 _mm_store_si128((__m128i*)dptr, xmm7); \
175 dptr += (16 / sizeof(_type_)); \
/* Tail loop: one 16-byte register per iteration. LOAD_SI128 presumably \
 * dispatches on source alignment — TODO confirm its definition (not \
 * visible in this chunk). */ \
179 count = len >> (5 - shifts); \
180 len -= count << (5 - shifts); \
183 __m128i xmm0 = LOAD_SI128(sptr); \
184 sptr += (16 / sizeof(_type_)); \
185 xmm0 = _op_(xmm0, val); \
186 _mm_store_si128((__m128i*)dptr, xmm0); \
187 dptr += (16 / sizeof(_type_)); \
/* Any elements still remaining are presumably finished via _slowWay_ \
 * (lines elided) before reporting success. */ \
194 return PRIMITIVES_SUCCESS; \
/*
 * SSE3_SCD_PRE_ROUTINE — like SSE3_SCD_ROUTINE, but the constant operand
 * is pre-loaded ("PRE") into a vector register: val is splatted into xmm0
 * once, and _op_ combines two vectors (data, splat) instead of
 * (vector, scalar). Main loop is unrolled 4 registers (64 bytes) deep.
 *
 * NOTE(review): interior lines are elided here too (202 -> 205, 207 -> 212,
 * ...): the opening brace, declarations of `shifts`/`count`/`xmm0`, and
 * several if/loop bodies are missing. Lines below are kept byte-identical;
 * not compilable as-is.
 *
 * NOTE(review): val (of _type_) is splatted with _mm_set1_epi32 regardless
 * of the element width — this replicates val per 32-bit lane only.
 * Presumably correct for the instantiations used; verify for 8/16-bit
 * element types.
 */
201 #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
202 static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
205 UINT32 offBeatMask; \
206 const _type_* sptr = pSrc; \
207 _type_* dptr = pDst; \
/* Scalar fallback for short inputs (guard condition elided). */ \
212 return _fallback_(pSrc, val, pDst, len); \
/* Derive `shifts` from sizeof(_type_) (assignments elided). */ \
214 if (sizeof(_type_) == 1) \
216 else if (sizeof(_type_) == 2) \
218 else if (sizeof(_type_) == 4) \
220 else if (sizeof(_type_) == 8) \
/* Destination not element-aligned: 16-byte alignment is unreachable by \
 * whole-element steps, so fall back entirely. */ \
222 offBeatMask = (1 << (shifts - 1)) - 1; \
223 if ((ULONG_PTR)pDst & offBeatMask) \
226 return _fallback_(pSrc, val, pDst, len); \
/* Step element-wise (via _slowWay_, body elided) to 16-byte-align dptr. */ \
229 while ((ULONG_PTR)dptr & 0x0f) \
233 return PRIMITIVES_SUCCESS; \
/* Main loop: 4 x 128-bit registers (64 bytes) per iteration; splat the \
 * constant once into xmm0 (declaration of xmm0 elided). */ \
236 count = len >> (7 - shifts); \
237 len -= count << (7 - shifts); \
238 xmm0 = _mm_set1_epi32(val); \
/* Unaligned-source path: SSE3 _mm_lddqu_si128 loads, aligned stores. */ \
239 if ((const ULONG_PTR)sptr & 0x0f) \
243 __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
244 sptr += (16 / sizeof(_type_)); \
245 __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
246 sptr += (16 / sizeof(_type_)); \
247 __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
248 sptr += (16 / sizeof(_type_)); \
249 __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
250 sptr += (16 / sizeof(_type_)); \
251 xmm1 = _op_(xmm1, xmm0); \
252 xmm2 = _op_(xmm2, xmm0); \
253 xmm3 = _op_(xmm3, xmm0); \
254 xmm4 = _op_(xmm4, xmm0); \
255 _mm_store_si128((__m128i*)dptr, xmm1); \
256 dptr += (16 / sizeof(_type_)); \
257 _mm_store_si128((__m128i*)dptr, xmm2); \
258 dptr += (16 / sizeof(_type_)); \
259 _mm_store_si128((__m128i*)dptr, xmm3); \
260 dptr += (16 / sizeof(_type_)); \
261 _mm_store_si128((__m128i*)dptr, xmm4); \
262 dptr += (16 / sizeof(_type_)); \
/* Aligned-source path: same 4-register unroll with _mm_load_si128. */ \
269 __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
270 sptr += (16 / sizeof(_type_)); \
271 __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
272 sptr += (16 / sizeof(_type_)); \
273 __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
274 sptr += (16 / sizeof(_type_)); \
275 __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
276 sptr += (16 / sizeof(_type_)); \
277 xmm1 = _op_(xmm1, xmm0); \
278 xmm2 = _op_(xmm2, xmm0); \
279 xmm3 = _op_(xmm3, xmm0); \
280 xmm4 = _op_(xmm4, xmm0); \
281 _mm_store_si128((__m128i*)dptr, xmm1); \
282 dptr += (16 / sizeof(_type_)); \
283 _mm_store_si128((__m128i*)dptr, xmm2); \
284 dptr += (16 / sizeof(_type_)); \
285 _mm_store_si128((__m128i*)dptr, xmm3); \
286 dptr += (16 / sizeof(_type_)); \
287 _mm_store_si128((__m128i*)dptr, xmm4); \
288 dptr += (16 / sizeof(_type_)); \
/* Tail loop: one 16-byte register per iteration via LOAD_SI128 \
 * (definition not visible in this chunk — TODO confirm). */ \
292 count = len >> (5 - shifts); \
293 len -= count << (5 - shifts); \
296 __m128i xmm1 = LOAD_SI128(sptr); \
297 sptr += (16 / sizeof(_type_)); \
298 xmm1 = _op_(xmm1, xmm0); \
299 _mm_store_si128((__m128i*)dptr, xmm1); \
300 dptr += (16 / sizeof(_type_)); \
/* Remaining elements presumably finished via _slowWay_ (lines elided). */ \
307 return PRIMITIVES_SUCCESS; \
/*
 * SSE3_SSD_ROUTINE — expands to a static "Source, Source, Destination"
 * primitive: combines corresponding elements of pSrc1 and pSrc2 with
 * _op_(vec1, vec2) and writes the results to pDst. Vector path requires a
 * 16-byte-alignable destination; uses unaligned SSE3 loads when either
 * source is off 16-byte alignment. Main loop is 4 registers (64 bytes)
 * per source per iteration.
 *
 * NOTE(review): interior lines are elided (314 -> 317, 320 -> 324, ...):
 * the opening brace, declarations of `shifts`/`count`/`status`, branch
 * bodies of the sizeof chain, and closing braces are missing. Lines below
 * are kept byte-identical; not compilable as-is.
 */
313 #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
314 static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
317 UINT32 offBeatMask; \
318 const _type_* sptr1 = pSrc1; \
319 const _type_* sptr2 = pSrc2; \
320 _type_* dptr = pDst; \
/* Scalar fallback for short inputs (guard condition elided). */ \
324 return _fallback_(pSrc1, pSrc2, pDst, len); \
/* Derive `shifts` from sizeof(_type_) (assignments elided). */ \
326 if (sizeof(_type_) == 1) \
328 else if (sizeof(_type_) == 2) \
330 else if (sizeof(_type_) == 4) \
332 else if (sizeof(_type_) == 8) \
/* Destination not even element-aligned: fall back entirely. */ \
334 offBeatMask = (1 << (shifts - 1)) - 1; \
335 if ((ULONG_PTR)pDst & offBeatMask) \
338 return _fallback_(pSrc1, pSrc2, pDst, len); \
/* Step element-wise to 16-byte-align dptr; unlike the SCD variants, this \
 * one checks _slowWay_'s status and presumably propagates failures \
 * (the error-return body is elided — TODO confirm). */ \
341 while ((ULONG_PTR)dptr & 0x0f) \
344 status = _slowWay_; \
345 if (status != PRIMITIVES_SUCCESS) \
348 return PRIMITIVES_SUCCESS; \
/* Main loop: 4 result registers (64 bytes) per iteration. */ \
351 count = len >> (7 - shifts); \
352 len -= count << (7 - shifts); \
/* If either source is off 16-byte alignment, both are read with the \
 * unaligned SSE3 _mm_lddqu_si128. */ \
353 if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
358 __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
359 sptr1 += (16 / sizeof(_type_)); \
360 __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
361 sptr1 += (16 / sizeof(_type_)); \
362 __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
363 sptr1 += (16 / sizeof(_type_)); \
364 __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
365 sptr1 += (16 / sizeof(_type_)); \
366 __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
367 sptr2 += (16 / sizeof(_type_)); \
368 __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
369 sptr2 += (16 / sizeof(_type_)); \
370 __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
371 sptr2 += (16 / sizeof(_type_)); \
372 __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
373 sptr2 += (16 / sizeof(_type_)); \
374 xmm0 = _op_(xmm0, xmm4); \
375 xmm1 = _op_(xmm1, xmm5); \
376 xmm2 = _op_(xmm2, xmm6); \
377 xmm3 = _op_(xmm3, xmm7); \
378 _mm_store_si128((__m128i*)dptr, xmm0); \
379 dptr += (16 / sizeof(_type_)); \
380 _mm_store_si128((__m128i*)dptr, xmm1); \
381 dptr += (16 / sizeof(_type_)); \
382 _mm_store_si128((__m128i*)dptr, xmm2); \
383 dptr += (16 / sizeof(_type_)); \
384 _mm_store_si128((__m128i*)dptr, xmm3); \
385 dptr += (16 / sizeof(_type_)); \
/* Both sources aligned: same unroll with _mm_load_si128. */ \
393 __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
394 sptr1 += (16 / sizeof(_type_)); \
395 __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
396 sptr1 += (16 / sizeof(_type_)); \
397 __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
398 sptr1 += (16 / sizeof(_type_)); \
399 __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
400 sptr1 += (16 / sizeof(_type_)); \
401 __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
402 sptr2 += (16 / sizeof(_type_)); \
403 __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
404 sptr2 += (16 / sizeof(_type_)); \
405 __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
406 sptr2 += (16 / sizeof(_type_)); \
407 __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
408 sptr2 += (16 / sizeof(_type_)); \
409 xmm0 = _op_(xmm0, xmm4); \
410 xmm1 = _op_(xmm1, xmm5); \
411 xmm2 = _op_(xmm2, xmm6); \
412 xmm3 = _op_(xmm3, xmm7); \
413 _mm_store_si128((__m128i*)dptr, xmm0); \
414 dptr += (16 / sizeof(_type_)); \
415 _mm_store_si128((__m128i*)dptr, xmm1); \
416 dptr += (16 / sizeof(_type_)); \
417 _mm_store_si128((__m128i*)dptr, xmm2); \
418 dptr += (16 / sizeof(_type_)); \
419 _mm_store_si128((__m128i*)dptr, xmm3); \
420 dptr += (16 / sizeof(_type_)); \
/* Tail loop: one 16-byte register from each source per iteration; \
 * LOAD_SI128's definition is not visible in this chunk — TODO confirm. */ \
424 count = len >> (5 - shifts); \
425 len -= count << (5 - shifts); \
428 __m128i xmm0 = LOAD_SI128(sptr1); \
429 sptr1 += (16 / sizeof(_type_)); \
430 __m128i xmm1 = LOAD_SI128(sptr2); \
431 sptr2 += (16 / sizeof(_type_)); \
432 xmm0 = _op_(xmm0, xmm1); \
433 _mm_store_si128((__m128i*)dptr, xmm0); \
434 dptr += (16 / sizeof(_type_)); \
/* Remaining elements presumably finished via _slowWay_ (lines elided). */ \
441 return PRIMITIVES_SUCCESS; \