FreeRDP
prim_templates.h
/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#ifdef __GNUC__
#pragma once
#endif

#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
#define FREERDP_LIB_PRIM_TEMPLATES_H

/* These are templates for SSE (and potentially NEON) routines that apply a
 * simple SIMD operation over an array of data. Since so much of this code
 * is shared except for the operation itself, these macros are used rather
 * than duplicating the code. The naming convention reflects the parameters:
 * S=Source param; C=Constant; D=Destination.
 * Every macro also takes a fallback procedure, used when the data is too
 * small or misaligned, and a "slow way" operation used at the 16-byte edges.
 */
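
/* For illustration only (not part of the original header): an "SCD" routine
 * generated by SSE3_SCD_ROUTINE below has this general shape, where the
 * routine name and element type are hypothetical:
 *
 *     static pstatus_t example_scd_16s(const INT16* pSrc, UINT32 val,
 *                                      INT16* pDst, UINT32 len);
 */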

/* SSE3 note: If someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than using LDDQU for unaligned reads.
 */
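
/* A minimal sketch of that SSE2-only check (illustrative, not used by the
 * macros in this file), assuming the same parameter names as below: fall
 * back to the plain C routine unless source and destination can reach a
 * 16-byte boundary at the same element index.
 *
 *     if (((ULONG_PTR)pSrc & 0x0f) != ((ULONG_PTR)pDst & 0x0f))
 *         return _fallback_(pSrc, val, pDst, len);
 *
 * Once the destination has been aligned by the per-element "slow way" loop,
 * the source is aligned as well, so the SSE2 _mm_load_si128 suffices.
 */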

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It cannot easily do that if the value is stored in a variable,
 * so don't save it as an intermediate value.
 */

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
    { \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 8 128-bit SSE registers. */ \
        size_t count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, (_op_type_)val); \
                xmm1 = _op_(xmm1, (_op_type_)val); \
                xmm2 = _op_(xmm2, (_op_type_)val); \
                xmm3 = _op_(xmm3, (_op_type_)val); \
                xmm4 = _op_(xmm4, (_op_type_)val); \
                xmm5 = _op_(xmm5, (_op_type_)val); \
                xmm6 = _op_(xmm6, (_op_type_)val); \
                xmm7 = _op_(xmm7, (_op_type_)val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, (_op_type_)val); \
                xmm1 = _op_(xmm1, (_op_type_)val); \
                xmm2 = _op_(xmm2, (_op_type_)val); \
                xmm3 = _op_(xmm3, (_op_type_)val); \
                xmm4 = _op_(xmm4, (_op_type_)val); \
                xmm5 = _op_(xmm5, (_op_type_)val); \
                xmm6 = _op_(xmm6, (_op_type_)val); \
                xmm7 = _op_(xmm7, (_op_type_)val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, (_op_type_)val); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
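
/* Example use of SSE3_SCD_ROUTINE (illustrative only; the routine and
 * fallback names here are assumptions, not definitions from this file):
 * a 16-bit left-shift-by-constant primitive. _mm_slli_epi16 takes an int
 * shift count, so _op_type_ is int, and the "slow way" statement handles a
 * single element at the alignment edges.
 *
 *     SSE3_SCD_ROUTINE(sse3_lShiftC_16s, INT16, general_lShiftC_16s,
 *                      _mm_slli_epi16, int,
 *                      *dptr++ = (INT16)((UINT16)*sptr++ << val))
 *
 * This expands to a static pstatus_t sse3_lShiftC_16s(const INT16* pSrc,
 * UINT32 val, INT16* pDst, UINT32 len) that calls general_lShiftC_16s (a
 * plain C routine with the same signature) for short or misaligned buffers.
 */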

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen) \
    { \
        size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        size_t count; \
        __m128i xmm0; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, WINPR_ASSERTING_INT_CAST(int32_t, len)); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, WINPR_ASSERTING_INT_CAST(int32_t, len)); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = mm_set1_epu32(val); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            _mm_store_si128((__m128i*)dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
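
/* Example use of SSE3_SCD_PRE_ROUTINE (illustrative only; the names are
 * assumptions, not definitions from this file): a 32-bit AND-with-constant
 * primitive. The constant is broadcast into xmm0 once via mm_set1_epu32 and
 * then combined with each loaded vector by _mm_and_si128.
 *
 *     SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
 *                          _mm_and_si128,
 *                          *dptr++ = *sptr++ & val)
 *
 * Here general_andC_32u is assumed to be a C fallback taking
 * (const UINT32*, UINT32, UINT32*, INT32).
 */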

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            pstatus_t status = _slowWay_; \
            if (status != PRIMITIVES_SUCCESS) \
                return status; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
        { \
            /* Unaligned loads */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            /* Aligned loads */ \
            while (count--) \
            { \
                __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
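
/* Example use of SSE3_SSD_ROUTINE (illustrative only; the names are
 * assumptions, not definitions from this file): a saturated 16-bit add of
 * two arrays. Unlike the SCD templates, the "slow way" expression must
 * yield a pstatus_t because the alignment loop checks its status; invoking
 * the fallback for a single element satisfies that.
 *
 *     SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
 *                      _mm_adds_epi16,
 *                      general_add_16s(sptr1++, sptr2++, dptr++, 1))
 */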

#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */