FreeRDP
prim_set_sse2.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized routines to set a chunk of memory to a constant.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  *
15  */
16 
17 #include <freerdp/config.h>
18 
19 #include <string.h>
20 #include <freerdp/types.h>
21 #include <freerdp/primitives.h>
22 #include <winpr/sysinfo.h>
23 
24 #include "prim_internal.h"
25 #include "prim_set.h"
26 
27 /* ========================================================================= */
28 #if defined(SSE_AVX_INTRINSICS_ENABLED)
29 #include <emmintrin.h>
30 
31 static primitives_t* generic = NULL;
32 
33 static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
34 {
35  BYTE byte = 0;
36  BYTE* dptr = NULL;
37  __m128i xmm0;
38  size_t count = 0;
39 
40  if (len < 16)
41  return generic->set_8u(val, pDst, len);
42 
43  byte = val;
44  dptr = pDst;
45 
46  /* Seek 16-byte alignment. */
47  while ((ULONG_PTR)dptr & 0x0f)
48  {
49  *dptr++ = byte;
50 
51  if (--len == 0)
52  return PRIMITIVES_SUCCESS;
53  }
54 
55  xmm0 = mm_set1_epu8(byte);
56  /* Cover 256-byte chunks via SSE register stores. */
57  count = len >> 8;
58  len -= count << 8;
59 
60  /* Do 256-byte chunks using one XMM register. */
61  while (count--)
62  {
63  _mm_store_si128((__m128i*)dptr, xmm0);
64  dptr += 16;
65  _mm_store_si128((__m128i*)dptr, xmm0);
66  dptr += 16;
67  _mm_store_si128((__m128i*)dptr, xmm0);
68  dptr += 16;
69  _mm_store_si128((__m128i*)dptr, xmm0);
70  dptr += 16;
71  _mm_store_si128((__m128i*)dptr, xmm0);
72  dptr += 16;
73  _mm_store_si128((__m128i*)dptr, xmm0);
74  dptr += 16;
75  _mm_store_si128((__m128i*)dptr, xmm0);
76  dptr += 16;
77  _mm_store_si128((__m128i*)dptr, xmm0);
78  dptr += 16;
79  _mm_store_si128((__m128i*)dptr, xmm0);
80  dptr += 16;
81  _mm_store_si128((__m128i*)dptr, xmm0);
82  dptr += 16;
83  _mm_store_si128((__m128i*)dptr, xmm0);
84  dptr += 16;
85  _mm_store_si128((__m128i*)dptr, xmm0);
86  dptr += 16;
87  _mm_store_si128((__m128i*)dptr, xmm0);
88  dptr += 16;
89  _mm_store_si128((__m128i*)dptr, xmm0);
90  dptr += 16;
91  _mm_store_si128((__m128i*)dptr, xmm0);
92  dptr += 16;
93  _mm_store_si128((__m128i*)dptr, xmm0);
94  dptr += 16;
95  }
96 
97  /* Cover 16-byte chunks via SSE register stores. */
98  count = len >> 4;
99  len -= count << 4;
100 
101  /* Do 16-byte chunks using one XMM register. */
102  while (count--)
103  {
104  _mm_store_si128((__m128i*)dptr, xmm0);
105  dptr += 16;
106  }
107 
108  /* Do leftover bytes. */
109  while (len--)
110  *dptr++ = byte;
111 
112  return PRIMITIVES_SUCCESS;
113 }
114 
115 /* ------------------------------------------------------------------------- */
116 static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
117 {
118  const primitives_t* prim = primitives_get_generic();
119  UINT32* dptr = pDst;
120  __m128i xmm0;
121  size_t count = 0;
122 
123  /* If really short, just do it here. */
124  if (len < 32)
125  {
126  while (len--)
127  *dptr++ = val;
128 
129  return PRIMITIVES_SUCCESS;
130  }
131 
132  /* Assure we can reach 16-byte alignment. */
133  if (((ULONG_PTR)dptr & 0x03) != 0)
134  {
135  return prim->set_32u(val, pDst, len);
136  }
137 
138  /* Seek 16-byte alignment. */
139  while ((ULONG_PTR)dptr & 0x0f)
140  {
141  *dptr++ = val;
142 
143  if (--len == 0)
144  return PRIMITIVES_SUCCESS;
145  }
146 
147  xmm0 = mm_set1_epu32(val);
148  /* Cover 256-byte chunks via SSE register stores. */
149  count = len >> 6;
150  len -= count << 6;
151 
152  /* Do 256-byte chunks using one XMM register. */
153  while (count--)
154  {
155  _mm_store_si128((__m128i*)dptr, xmm0);
156  dptr += 4;
157  _mm_store_si128((__m128i*)dptr, xmm0);
158  dptr += 4;
159  _mm_store_si128((__m128i*)dptr, xmm0);
160  dptr += 4;
161  _mm_store_si128((__m128i*)dptr, xmm0);
162  dptr += 4;
163  _mm_store_si128((__m128i*)dptr, xmm0);
164  dptr += 4;
165  _mm_store_si128((__m128i*)dptr, xmm0);
166  dptr += 4;
167  _mm_store_si128((__m128i*)dptr, xmm0);
168  dptr += 4;
169  _mm_store_si128((__m128i*)dptr, xmm0);
170  dptr += 4;
171  _mm_store_si128((__m128i*)dptr, xmm0);
172  dptr += 4;
173  _mm_store_si128((__m128i*)dptr, xmm0);
174  dptr += 4;
175  _mm_store_si128((__m128i*)dptr, xmm0);
176  dptr += 4;
177  _mm_store_si128((__m128i*)dptr, xmm0);
178  dptr += 4;
179  _mm_store_si128((__m128i*)dptr, xmm0);
180  dptr += 4;
181  _mm_store_si128((__m128i*)dptr, xmm0);
182  dptr += 4;
183  _mm_store_si128((__m128i*)dptr, xmm0);
184  dptr += 4;
185  _mm_store_si128((__m128i*)dptr, xmm0);
186  dptr += 4;
187  }
188 
189  /* Cover 16-byte chunks via SSE register stores. */
190  count = len >> 2;
191  len -= count << 2;
192 
193  /* Do 16-byte chunks using one XMM register. */
194  while (count--)
195  {
196  _mm_store_si128((__m128i*)dptr, xmm0);
197  dptr += 4;
198  }
199 
200  /* Do leftover bytes. */
201  while (len--)
202  *dptr++ = val;
203 
204  return PRIMITIVES_SUCCESS;
205 }
206 
207 /* ------------------------------------------------------------------------- */
208 static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
209 {
210  UINT32 uval = *((UINT32*)&val);
211  return sse2_set_32u(uval, (UINT32*)pDst, len);
212 }
213 #endif
214 
215 /* ------------------------------------------------------------------------- */
216 void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
217 {
218 #if defined(SSE_AVX_INTRINSICS_ENABLED)
219  generic = primitives_get_generic();
220  primitives_init_set(prims);
221  /* Pick tuned versions if possible. */
222 
223  if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
224  {
225  WLog_VRB(PRIM_TAG, "SSE2 optimizations");
226  prims->set_8u = sse2_set_8u;
227  prims->set_32s = sse2_set_32s;
228  prims->set_32u = sse2_set_32u;
229  }
230 
231 #else
232  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
233  WINPR_UNUSED(prims);
234 #endif
235 }