FreeRDP
prim_copy_avx2.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Copy operations.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  */
15 
16 #include <winpr/sysinfo.h>
17 
18 #include <freerdp/config.h>
19 
20 #include <string.h>
21 #include <freerdp/types.h>
22 #include <freerdp/primitives.h>
23 #include <freerdp/log.h>
24 
25 #include "prim_internal.h"
26 #include "prim_copy.h"
27 #include "../codec/color.h"
28 
29 #include <freerdp/codec/color.h>
30 
31 #if defined(SSE_AVX_INTRINSICS_ENABLED)
32 #define TAG FREERDP_TAG("primitives.copy")
33 
34 #include <emmintrin.h>
35 #include <immintrin.h>
36 
/**
 * Build a 256 bit vector from eight unsigned 32 bit values.
 *
 * The arguments are given from the most significant element (i0) down to the
 * least significant element (i7), i.e. in the same order _mm256_set_epi32
 * expects them. Each value is converted to int32_t with a cast, so values
 * above INT32_MAX can be passed without sign conversion warnings.
 */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
	/* _mm256_setr_epi32 takes the elements lowest first, hence the reversed order. */
	const __m256i packed =
	    _mm256_setr_epi32((int32_t)i7, (int32_t)i6, (int32_t)i5, (int32_t)i4, (int32_t)i3,
	                      (int32_t)i2, (int32_t)i1, (int32_t)i0);
	return packed;
}
43 
/* Forward declaration only.
 * NOTE(review): neither a definition nor a call of this function is visible in
 * this listing - confirm whether the prototype is still required. */
44 static INLINE pstatus_t avx2_image_copy_no_overlap_convert(
45  BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
46  UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
47  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
48  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
49 
50 static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
51  UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
52  UINT32 nHeight,
53  const BYTE* WINPR_RESTRICT pSrcData,
54  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
55  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
56  SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
57 {
58 
59  const SSIZE_T srcByte = 3;
60  const SSIZE_T dstByte = 4;
61 
62  const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
63  0xFF000000, 0xFF000000, 0xFF000000);
64  const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
65  0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
66  const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
67  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
68  const UINT32 rem = nWidth % 8;
69  const SSIZE_T width = nWidth - rem;
70 
71  const size_t align = nSrcStep % 32;
72  const BOOL fast = (align == 0) ? TRUE : (align >= 8 - MIN(8, (size_t)rem) ? TRUE : FALSE);
73  for (SSIZE_T y = 0; y < nHeight; y++)
74  {
75  const BYTE* WINPR_RESTRICT srcLine =
76  &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
77  BYTE* WINPR_RESTRICT dstLine =
78  &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
79 
80  SSIZE_T x = 0;
81 
82  /* Ensure alignment requirements can be met */
83  if (fast)
84  {
85  for (; x < width; x += 8)
86  {
87  const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
88  __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
89  const __m256i s0 = _mm256_loadu_si256(src);
90  __m256i s1 = _mm256_shuffle_epi8(s0, smask);
91 
92  /* _mm256_shuffle_epi8 can not cross 128bit lanes.
93  * manually copy these bytes with extract/insert */
94  const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
95  const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
96  const __m256i bmask =
97  _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF, 0x00000000,
98  0x00000000, 0x00000000, 0x00000000);
99  const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
100 
101  const __m256i s2 = _mm256_loadu_si256(dst);
102  __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
103  _mm256_storeu_si256(dst, d0);
104  }
105  }
106  for (; x < nWidth; x++)
107  {
108  const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
109  BYTE* dst = &dstLine[(x + nXDst) * dstByte];
110  *dst++ = *src++;
111  *dst++ = *src++;
112  *dst++ = *src++;
113  }
114  }
115 
116  return PRIMITIVES_SUCCESS;
117 }
118 
119 static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
120  UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
121  UINT32 nWidth, UINT32 nHeight,
122  const BYTE* WINPR_RESTRICT pSrcData,
123  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
124  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
125  SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
126 {
127 
128  const SSIZE_T srcByte = 4;
129  const SSIZE_T dstByte = 4;
130 
131  const __m256i mask = _mm256_setr_epi8(
132  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
133  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
134  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
135  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
136  const UINT32 rem = nWidth % 8;
137  const SSIZE_T width = nWidth - rem;
138  for (SSIZE_T y = 0; y < nHeight; y++)
139  {
140  const BYTE* WINPR_RESTRICT srcLine =
141  &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
142  BYTE* WINPR_RESTRICT dstLine =
143  &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
144 
145  SSIZE_T x = 0;
146  for (; x < width; x += 8)
147  {
148  const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
149  __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
150  const __m256i s0 = _mm256_loadu_si256(src);
151  const __m256i s1 = _mm256_loadu_si256(dst);
152  __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
153  _mm256_storeu_si256(dst, d0);
154  }
155 
156  for (; x < nWidth; x++)
157  {
158  const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
159  BYTE* dst = &dstLine[(x + nXDst) * dstByte];
160  *dst++ = *src++;
161  *dst++ = *src++;
162  *dst++ = *src++;
163  }
164  }
165 
166  return PRIMITIVES_SUCCESS;
167 }
168 
169 static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
170  BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
171  UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
172  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
173  UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
174  SSIZE_T dstVOffset)
175 {
176  WINPR_ASSERT(pDstData);
177  WINPR_ASSERT(pSrcData);
178 
179  switch (SrcFormat)
180  {
181  case PIXEL_FORMAT_BGR24:
182  switch (DstFormat)
183  {
184  case PIXEL_FORMAT_BGRX32:
185  case PIXEL_FORMAT_BGRA32:
186  return avx2_image_copy_bgr24_bgrx32(
187  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
188  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
189  default:
190  break;
191  }
192  break;
193  case PIXEL_FORMAT_BGRX32:
194  case PIXEL_FORMAT_BGRA32:
195  switch (DstFormat)
196  {
197  case PIXEL_FORMAT_BGRX32:
198  case PIXEL_FORMAT_BGRA32:
199  return avx2_image_copy_bgrx32_bgrx32(
200  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
201  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
202  default:
203  break;
204  }
205  break;
206  case PIXEL_FORMAT_RGBX32:
207  case PIXEL_FORMAT_RGBA32:
208  switch (DstFormat)
209  {
210  case PIXEL_FORMAT_RGBX32:
211  case PIXEL_FORMAT_RGBA32:
212  return avx2_image_copy_bgrx32_bgrx32(
213  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
214  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
215  default:
216  break;
217  }
218  break;
219  default:
220  break;
221  }
222 
223  primitives_t* gen = primitives_get_generic();
224  return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
225  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
226 }
227 
228 static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
229  UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
230  UINT32 nWidth, UINT32 nHeight,
231  const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
232  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
233  const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
234 {
235  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
236  SSIZE_T srcVOffset = 0;
237  SSIZE_T srcVMultiplier = 1;
238  SSIZE_T dstVOffset = 0;
239  SSIZE_T dstVMultiplier = 1;
240 
241  if ((nWidth == 0) || (nHeight == 0))
242  return PRIMITIVES_SUCCESS;
243 
244  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
245  return -1;
246 
247  if (!pDstData || !pSrcData)
248  return -1;
249 
250  if (nDstStep == 0)
251  nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
252 
253  if (nSrcStep == 0)
254  nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
255 
256  if (vSrcVFlip)
257  {
258  srcVOffset = (nHeight - 1ll) * nSrcStep;
259  srcVMultiplier = -1;
260  }
261 
262  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
263  return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
264  nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
265  nXSrc, nYSrc, palette, flags, srcVMultiplier,
266  srcVOffset, dstVMultiplier, dstVOffset);
267  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
268  return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
269  nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
270  nXSrc, nYSrc, palette, srcVMultiplier,
271  srcVOffset, dstVMultiplier, dstVOffset, flags);
272  else
273  {
274  primitives_t* gen = primitives_get_generic();
275  return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
276  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
277  }
278 }
279 #endif
280 
281 /* ------------------------------------------------------------------------- */
282 void primitives_init_copy_avx2(primitives_t* prims)
283 {
284 #if defined(SSE_AVX_INTRINSICS_ENABLED)
285  if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
286  {
287  WLog_VRB(PRIM_TAG, "AVX2 optimizations");
288  prims->copy_no_overlap = avx2_image_copy_no_overlap;
289  }
290 #else
291  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
292  WINPR_UNUSED(prims);
293 #endif
294 }
__copy_no_overlap_t copy_no_overlap
Definition: primitives.h:258