/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#ifdef __GNUC__
#pragma once
#endif

#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
#define FREERDP_LIB_PRIM_TEMPLATES_H

/* These are templates for SSE (potentially NEON) routines that do a
 * simple SSE operation over an array of data. Since so much of this
 * code is shared except for the operation itself, these templates are
 * used rather than duplicating code. The naming convention depends on
 * the parameters: S=Source param; C=Constant; D=Destination.
 * All the macros have parameters for a fallback procedure if the data
 * is too small and an operation "the slow way" for use at 16-byte edges.
 */

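/* For illustration only: an "SCD" template expands to a static function of
 * the shape
 *     static pstatus_t name(const TYPE* pSrc, UINT32 val, TYPE* pDst, UINT32 len);
 * while an "SSD" template expands to
 *     static pstatus_t name(const TYPE* pSrc1, const TYPE* pSrc2, TYPE* pDst, UINT32 len);
 * The including translation unit is expected to provide the SSE intrinsics
 * (e.g. <pmmintrin.h>) and the LOAD_SI128 macro used below; neither is
 * defined in this header.
 */
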
/* SSE3 note: If someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than use LDDQU for unaligned reads.
 */

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It can't easily do that if the value is stored in a variable.
 * So don't save it as an intermediate value.
 */

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
    { \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        int count; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer by elements would skip over the 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 8 128-bit SSE registers. */ \
        count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
        if ((ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, val); \
                xmm1 = _op_(xmm1, val); \
                xmm2 = _op_(xmm2, val); \
                xmm3 = _op_(xmm3, val); \
                xmm4 = _op_(xmm4, val); \
                xmm5 = _op_(xmm5, val); \
                xmm6 = _op_(xmm6, val); \
                xmm7 = _op_(xmm7, val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, val); \
                xmm1 = _op_(xmm1, val); \
                xmm2 = _op_(xmm2, val); \
                xmm3 = _op_(xmm3, val); \
                xmm4 = _op_(xmm4, val); \
                xmm5 = _op_(xmm5, val); \
                xmm6 = _op_(xmm6, val); \
                xmm7 = _op_(xmm7, val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, val); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }

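/* Usage sketch (illustrative, not part of this header): a left-shift-by-constant
 * primitive over 16-bit samples could be generated from this template roughly
 * as follows, with "generic->lShiftC_16s" standing in for whatever scalar
 * fallback the including module provides:
 *
 *     SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
 *                      _mm_slli_epi16, *dptr++ = (INT16)(*sptr++ << val))
 *
 * Here _op_ receives an __m128i and the integer constant val, while _slowWay_
 * is the per-element statement used before the destination is 16-byte aligned
 * and again for the trailing remainder.
 */
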
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        size_t count; \
        __m128i xmm0; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer by elements would skip over the 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = _mm_set1_epi32(val); \
        if ((ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            _mm_store_si128((__m128i*)dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }

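/* Usage sketch (illustrative): the PRE variant broadcasts the constant into
 * xmm0 with _mm_set1_epi32() and passes it to _op_ as a second vector, so it
 * fits operations such as AND-with-constant on 32-bit data, e.g.:
 *
 *     SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic_andC_32u,
 *                          _mm_and_si128, *dptr++ = *sptr++ & val)
 *
 * where generic_andC_32u is a hypothetical scalar fallback with the same
 * (pSrc, val, pDst, len) signature.
 */
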
/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer by elements would skip over the 16-byte boundary. */ \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            pstatus_t status; \
            status = _slowWay_; \
            if (status != PRIMITIVES_SUCCESS) \
                return status; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        if (((ULONG_PTR)sptr1 & 0x0f) || ((ULONG_PTR)sptr2 & 0x0f)) \
        { \
            /* Unaligned loads */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            /* Aligned loads */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }

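/* Usage sketch (illustrative): a two-source primitive such as a bitwise AND
 * of two 32-bit buffers could be generated as:
 *
 *     SSE3_SSD_ROUTINE(sse3_and_32u, UINT32, generic_and_32u,
 *                      _mm_and_si128, generic_and_32u(sptr1++, sptr2++, dptr++, 1))
 *
 * Note that _slowWay_ must be an expression yielding a pstatus_t here (the
 * alignment loop checks its return value), which is why this sketch calls the
 * hypothetical scalar fallback generic_and_32u for a single element rather
 * than using a plain assignment.
 */
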
#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */