prim_templates.h
/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#pragma once

#include "prim_avxsse.h"

/* These are prototypes for SSE (potentially NEON) routines that do a
 * simple SSE operation over an array of data. Since so much of this
 * code is shared except for the operation itself, these prototypes are
 * used rather than duplicating code. The naming convention depends on
 * the parameters: S=Source param; C=Constant; D=Destination.
 * All the macros have parameters for a fallback procedure if the data
 * is too small and an operation "the slow way" for use at 16-byte edges.
 */

/* SSE3 note: If someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than use LDDQU for unaligned reads.
 */

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It easily can't do that if the value is stored in a variable.
 * So don't save it as an intermediate value.
 */

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
    static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
                            _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
    { \
        size_t len = ulen; \
        INT32 shifts = 0; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        /* Use 8 128-bit SSE registers. */ \
        size_t count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
        \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm2 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm3 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm4 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm5 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm6 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm7 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, (_op_type_)val); \
            xmm1 = _op_(xmm1, (_op_type_)val); \
            xmm2 = _op_(xmm2, (_op_type_)val); \
            xmm3 = _op_(xmm3, (_op_type_)val); \
            xmm4 = _op_(xmm4, (_op_type_)val); \
            xmm5 = _op_(xmm5, (_op_type_)val); \
            xmm6 = _op_(xmm6, (_op_type_)val); \
            xmm7 = _op_(xmm7, (_op_type_)val); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm2); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm3); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm4); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm5); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm6); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm7); \
            dptr += (16 / sizeof(_type_)); \
        } \
        \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, (_op_type_)val); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
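
/* Illustrative usage sketch (not part of this header): an SSE implementation
 * file could instantiate SSE3_SCD_ROUTINE for a constant left shift of 16-bit
 * values roughly as below. The function name sse_lShiftC_16s_ex and the
 * generic->lShiftC_16s fallback reference are assumptions for illustration;
 * note that the macro accepts a _fallback_ argument even though the body
 * above does not reference it.
 */
SSE3_SCD_ROUTINE(sse_lShiftC_16s_ex, INT16, generic->lShiftC_16s, _mm_slli_epi16, INT32,
                 *dptr++ = (INT16)(*sptr++ << val))
/* This expands to
 *   static pstatus_t sse_lShiftC_16s_ex(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
 *                                       INT16* WINPR_RESTRICT pDst, UINT32 ulen)
 * which processes 128 bytes per iteration of the unrolled loop, then 16 bytes
 * at a time, and finally runs the scalar _slowWay_ expression for the tail.
 */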

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
                            _type_* WINPR_RESTRICT pDst, INT32 ilen) \
    { \
        size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
        int shifts = 0; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        __m128i xmm0; \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        /* Use 4 128-bit SSE registers. */ \
        size_t count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = mm_set1_epu32(val); \
        for (size_t x = 0; x < count; x++) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm2 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm3 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm4 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            xmm2 = _op_(xmm2, xmm0); \
            xmm3 = _op_(xmm3, xmm0); \
            xmm4 = _op_(xmm4, xmm0); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm2); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm3); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm4); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        for (size_t x = 0; x < count; x++) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        for (size_t x = 0; x < len; x++) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
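
/* Illustrative usage sketch (not part of this header): because the constant is
 * broadcast once with mm_set1_epu32(), the PRE variant suits operations on
 * 32-bit elements, e.g. ANDing every UINT32 with a constant mask. The name
 * sse_andC_32u_ex and the generic->andC_32u fallback reference are assumptions
 * for illustration.
 */
SSE3_SCD_PRE_ROUTINE(sse_andC_32u_ex, UINT32, generic->andC_32u, _mm_and_si128,
                     *dptr++ = *sptr++ & val)
/* Here _op_ receives two __m128i arguments (the loaded data and the preloaded
 * constant), so any two-operand intrinsic such as _mm_and_si128 or
 * _mm_or_si128 fits.
 */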

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
                            const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
                            UINT32 ulen) \
    { \
        size_t len = ulen; \
        int shifts = 0; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        /* Aligned loads */ \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm2 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm3 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm4 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            __m128i xmm5 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            __m128i xmm6 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            __m128i xmm7 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm4); \
            xmm1 = _op_(xmm1, xmm5); \
            xmm2 = _op_(xmm2, xmm6); \
            xmm3 = _op_(xmm3, xmm7); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm2); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm3); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
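
/* Illustrative usage sketch (not part of this header): a two-source primitive,
 * e.g. a saturating add of INT16 arrays, could be generated as below. The name
 * sse_add_16s_ex and the generic->add_16s fallback reference are assumptions
 * for illustration; the scalar tail mirrors the saturating behaviour of
 * _mm_adds_epi16.
 */
SSE3_SSD_ROUTINE(sse_add_16s_ex, INT16, generic->add_16s, _mm_adds_epi16,
                 {
                     INT32 v = *sptr1++ + *sptr2++;
                     *dptr++ = (INT16)((v > 32767) ? 32767 : ((v < -32768) ? -32768 : v));
                 })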