FreeRDP
prim_sign_ssse3.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized sign operations.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  */
15 
16 #include <freerdp/config.h>
17 
18 #include <freerdp/types.h>
19 #include <freerdp/primitives.h>
20 #include <winpr/sysinfo.h>
21 
22 #include "prim_sign.h"
23 
24 #include "prim_internal.h"
25 
26 #if defined(SSE_AVX_INTRINSICS_ENABLED)
27 #include <emmintrin.h>
28 #include <tmmintrin.h>
29 
30 static primitives_t* generic = NULL;
31 
32 /* ------------------------------------------------------------------------- */
33 static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
34  UINT32 len)
35 {
36  const INT16* sptr = pSrc;
37  INT16* dptr = pDst;
38  size_t count = 0;
39 
40  if (len < 16)
41  {
42  return generic->sign_16s(pSrc, pDst, len);
43  }
44 
45  /* Check for 16-byte alignment (eventually). */
46  if ((ULONG_PTR)pDst & 0x01)
47  {
48  return generic->sign_16s(pSrc, pDst, len);
49  }
50 
51  /* Seek 16-byte alignment. */
52  while ((ULONG_PTR)dptr & 0x0f)
53  {
54  INT16 src = *sptr++;
55  *dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));
56 
57  if (--len == 0)
58  return PRIMITIVES_SUCCESS;
59  }
60 
61  /* Do 32-short chunks using 8 XMM registers. */
62  count = len >> 5; /* / 32 */
63  len -= count << 5; /* * 32 */
64 
65  if ((ULONG_PTR)sptr & 0x0f)
66  {
67  /* Unaligned */
68  while (count--)
69  {
70  __m128i xmm0;
71  __m128i xmm1;
72  __m128i xmm2;
73  __m128i xmm3;
74  __m128i xmm4;
75  __m128i xmm5;
76  __m128i xmm6;
77  __m128i xmm7;
78  xmm0 = _mm_set1_epi16(0x0001U);
79  xmm1 = _mm_set1_epi16(0x0001U);
80  xmm2 = _mm_set1_epi16(0x0001U);
81  xmm3 = _mm_set1_epi16(0x0001U);
82  xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
83  sptr += 8;
84  xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
85  sptr += 8;
86  xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
87  sptr += 8;
88  xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
89  sptr += 8;
90  xmm0 = _mm_sign_epi16(xmm0, xmm4);
91  xmm1 = _mm_sign_epi16(xmm1, xmm5);
92  xmm2 = _mm_sign_epi16(xmm2, xmm6);
93  xmm3 = _mm_sign_epi16(xmm3, xmm7);
94  _mm_store_si128((__m128i*)dptr, xmm0);
95  dptr += 8;
96  _mm_store_si128((__m128i*)dptr, xmm1);
97  dptr += 8;
98  _mm_store_si128((__m128i*)dptr, xmm2);
99  dptr += 8;
100  _mm_store_si128((__m128i*)dptr, xmm3);
101  dptr += 8;
102  }
103  }
104  else
105  {
106  /* Aligned */
107  while (count--)
108  {
109  __m128i xmm0;
110  __m128i xmm1;
111  __m128i xmm2;
112  __m128i xmm3;
113  __m128i xmm4;
114  __m128i xmm5;
115  __m128i xmm6;
116  __m128i xmm7;
117  xmm0 = _mm_set1_epi16(0x0001U);
118  xmm1 = _mm_set1_epi16(0x0001U);
119  xmm2 = _mm_set1_epi16(0x0001U);
120  xmm3 = _mm_set1_epi16(0x0001U);
121  xmm4 = _mm_load_si128((const __m128i*)sptr);
122  sptr += 8;
123  xmm5 = _mm_load_si128((const __m128i*)sptr);
124  sptr += 8;
125  xmm6 = _mm_load_si128((const __m128i*)sptr);
126  sptr += 8;
127  xmm7 = _mm_load_si128((const __m128i*)sptr);
128  sptr += 8;
129  xmm0 = _mm_sign_epi16(xmm0, xmm4);
130  xmm1 = _mm_sign_epi16(xmm1, xmm5);
131  xmm2 = _mm_sign_epi16(xmm2, xmm6);
132  xmm3 = _mm_sign_epi16(xmm3, xmm7);
133  _mm_store_si128((__m128i*)dptr, xmm0);
134  dptr += 8;
135  _mm_store_si128((__m128i*)dptr, xmm1);
136  dptr += 8;
137  _mm_store_si128((__m128i*)dptr, xmm2);
138  dptr += 8;
139  _mm_store_si128((__m128i*)dptr, xmm3);
140  dptr += 8;
141  }
142  }
143 
144  /* Do 8-short chunks using two XMM registers. */
145  count = len >> 3;
146  len -= count << 3;
147 
148  while (count--)
149  {
150  __m128i xmm0 = _mm_set1_epi16(0x0001U);
151  __m128i xmm1 = LOAD_SI128(sptr);
152  sptr += 8;
153  xmm0 = _mm_sign_epi16(xmm0, xmm1);
154  _mm_store_si128((__m128i*)dptr, xmm0);
155  dptr += 8;
156  }
157 
158  /* Do leftovers. */
159  while (len--)
160  {
161  INT16 src = *sptr++;
162  *dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
163  }
164 
165  return PRIMITIVES_SUCCESS;
166 }
167 
168 #endif /* SSE_AVX_INTRINSICS_ENABLED */
169 
170 /* ------------------------------------------------------------------------- */
171 void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
172 {
173 #if defined(SSE_AVX_INTRINSICS_ENABLED)
174  generic = primitives_get_generic();
175  primitives_init_sign(prims);
176  /* Pick tuned versions if possible. */
177  /* I didn't spot an IPP version of this. */
178 
179  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
180  IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
181  {
182  WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
183  prims->sign_16s = ssse3_sign_16s;
184  }
185 
186 #else
187  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
188  WINPR_UNUSED(prims);
189 #endif
190 }