FreeRDP
prim_shift_sse3.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Shift operations.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  */
15 
16 #include <freerdp/config.h>
17 
18 #include <freerdp/types.h>
19 #include <freerdp/primitives.h>
20 #include <winpr/sysinfo.h>
21 
22 #include "prim_shift.h"
23 
24 #include "prim_internal.h"
25 #include "prim_templates.h"
26 
27 #if defined(SSE_AVX_INTRINSICS_ENABLED)
28 #include <emmintrin.h>
29 #include <pmmintrin.h>
30 
31 static primitives_t* generic = NULL;
32 
33 /* ------------------------------------------------------------------------- */
/* sse2_lShiftC_16s: constant left shift over INT16 elements.
 * The function body is generated by SSE3_SCD_ROUTINE, which is not defined in
 * this file (this file includes prim_templates.h — presumably it lives there).
 * NOTE(review): judging from the values passed, the arguments are the routine
 * name, the element type, the generic fallback routine, the SSE shift
 * intrinsic, the type the shift count is cast to, and the scalar statement
 * used for elements handled one at a time — confirm against the macro.
 * The scalar statement shifts in UINT16 and masks to 16 bits before casting
 * back to INT16, so the left shift is never applied to a signed value. */
34 SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
35  *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
36 /* ------------------------------------------------------------------------- */
/* sse2_rShiftC_16s: constant right shift over INT16 elements.
 * Generated by SSE3_SCD_ROUTINE (not defined in this file).
 * NOTE(review): argument roles (name, element type, generic fallback, SSE
 * intrinsic, shift-count cast type, scalar per-element statement) are
 * inferred from the values passed — confirm against the macro.
 * The vector path uses _mm_srai_epi16, i.e. an arithmetic (sign-preserving)
 * shift; the scalar statement applies >> directly to the signed element. */
37 SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
38  *dptr++ = *sptr++ >> val)
39 /* ------------------------------------------------------------------------- */
/* sse2_lShiftC_16u: constant left shift over UINT16 elements.
 * Generated by SSE3_SCD_ROUTINE (not defined in this file).
 * NOTE(review): argument roles (name, element type, generic fallback, SSE
 * intrinsic, shift-count cast type, scalar per-element statement) are
 * inferred from the values passed — confirm against the macro.
 * The scalar statement masks the shifted value to 16 bits before storing. */
40 SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
41  *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
42 /* ------------------------------------------------------------------------- */
/* sse2_rShiftC_16u: constant right shift over UINT16 elements.
 * Generated by SSE3_SCD_ROUTINE (not defined in this file).
 * NOTE(review): argument roles (name, element type, generic fallback, SSE
 * intrinsic, shift-count cast type, scalar per-element statement) are
 * inferred from the values passed — confirm against the macro.
 * The vector path uses _mm_srli_epi16, i.e. a logical (zero-filling) shift,
 * unlike the signed variant which uses the arithmetic _mm_srai_epi16. */
43 SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
44  *dptr++ = *sptr++ >> val)
45 
46 static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
47 {
48  const INT32 shifts = 2;
49  if (val == 0)
50  return PRIMITIVES_SUCCESS;
51  if (val >= 16)
52  return -1;
53  if (len < 16) /* pointless if too small */
54  return generic->lShiftC_16s_inplace(pSrcDst, val, len);
55 
56  UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
57  if ((ULONG_PTR)pSrcDst & offBeatMask)
58  {
59  /* Incrementing the pointer skips over 16-byte boundary. */
60  return generic->lShiftC_16s_inplace(pSrcDst, val, len);
61  }
62  /* Get to the 16-byte boundary now. */
63  const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
64  if (rem > 0)
65  {
66  const UINT32 add = 16 - rem;
67  pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
68  if (status != PRIMITIVES_SUCCESS)
69  return status;
70  pSrcDst += add;
71  len -= add;
72  }
73 
74  /* Use 8 128-bit SSE registers. */
75  uint32_t count = len >> (8 - shifts);
76  len -= count << (8 - shifts);
77 
78  while (count--)
79  {
80  const __m128i* src = (const __m128i*)pSrcDst;
81 
82  __m128i xmm0 = _mm_load_si128(src++);
83  __m128i xmm1 = _mm_load_si128(src++);
84  __m128i xmm2 = _mm_load_si128(src++);
85  __m128i xmm3 = _mm_load_si128(src++);
86  __m128i xmm4 = _mm_load_si128(src++);
87  __m128i xmm5 = _mm_load_si128(src++);
88  __m128i xmm6 = _mm_load_si128(src++);
89  __m128i xmm7 = _mm_load_si128(src);
90 
91  xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
92  xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
93  xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
94  xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
95  xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
96  xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
97  xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
98  xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);
99 
100  __m128i* dst = (__m128i*)pSrcDst;
101 
102  _mm_store_si128(dst++, xmm0);
103  _mm_store_si128(dst++, xmm1);
104  _mm_store_si128(dst++, xmm2);
105  _mm_store_si128(dst++, xmm3);
106  _mm_store_si128(dst++, xmm4);
107  _mm_store_si128(dst++, xmm5);
108  _mm_store_si128(dst++, xmm6);
109  _mm_store_si128(dst++, xmm7);
110 
111  pSrcDst = (INT16*)dst;
112  }
113 
114  /* Use a single 128-bit SSE register. */
115  count = len >> (5 - shifts);
116  len -= count << (5 - shifts);
117  while (count--)
118  {
119  const __m128i* src = (const __m128i*)pSrcDst;
120  __m128i xmm0 = LOAD_SI128(src);
121 
122  xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
123 
124  __m128i* dst = (__m128i*)pSrcDst;
125  _mm_store_si128(dst++, xmm0);
126  pSrcDst = (INT16*)dst;
127  }
128 
129  /* Finish off the remainder. */
130  if (len > 0)
131  return generic->lShiftC_16s_inplace(pSrcDst, val, len);
132 
133  return PRIMITIVES_SUCCESS;
134 }
135 #endif
136 
137 /* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
138  * depending on the sign of val. To avoid using the deprecated inplace
139  * routines, a wrapper can use the src for the dest.
140  */
141 
142 /* ------------------------------------------------------------------------- */
143 void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
144 {
145 #if defined(SSE_AVX_INTRINSICS_ENABLED)
146  generic = primitives_get_generic();
147  primitives_init_shift(prims);
148 
149  if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
150  IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
151  {
152  WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
153  prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
154  prims->lShiftC_16s = sse2_lShiftC_16s;
155  prims->rShiftC_16s = sse2_rShiftC_16s;
156  prims->lShiftC_16u = sse2_lShiftC_16u;
157  prims->rShiftC_16u = sse2_rShiftC_16u;
158  }
159 
160 #else
161  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
162  WINPR_UNUSED(prims);
163 #endif
164 }