FreeRDP
prim_copy_avx2.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Copy operations.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  */
15 
16 #include <winpr/sysinfo.h>
17 
18 #include <freerdp/config.h>
19 
20 #include <string.h>
21 #include <freerdp/types.h>
22 #include <freerdp/primitives.h>
23 #include <freerdp/log.h>
24 
25 #include "prim_internal.h"
26 #include "prim_copy.h"
27 #include "../codec/color.h"
28 
29 #include <freerdp/codec/color.h>
30 
31 #if defined(SSE2_ENABLED)
32 #define TAG FREERDP_TAG("primitives.copy")
33 
34 #include <emmintrin.h>
35 #include <immintrin.h>
36 
/* Forward declaration only: neither a definition nor a caller of this helper
 * is visible in this listing. The parameter list mirrors the other static copy
 * helpers of this file (format, step and x/y position for destination and
 * source, a palette, and the vertical multiplier/offset pairs).
 * NOTE(review): if the function is not defined elsewhere in this translation
 * unit, the declaration is dead and could be dropped - confirm. */
37 static INLINE pstatus_t avx2_image_copy_no_overlap_convert(
38  BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
39  UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
40  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
41  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
42 
43 static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
44  UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
45  UINT32 nHeight,
46  const BYTE* WINPR_RESTRICT pSrcData,
47  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
48  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
49  SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
50 {
51 
52  const SSIZE_T srcByte = 3;
53  const SSIZE_T dstByte = 4;
54 
55  const __m256i mask = _mm256_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
56  0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
57  const __m256i smask = _mm256_set_epi32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
58  0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
59  const __m256i shelpmask = _mm256_set_epi32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
60  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
61  const UINT32 rem = nWidth % 8;
62  const SSIZE_T width = nWidth - rem;
63 
64  const size_t align = nSrcStep % 32;
65  const BOOL fast = (align == 0) ? TRUE : (align >= 8 - MIN(8, (size_t)rem) ? TRUE : FALSE);
66  for (SSIZE_T y = 0; y < nHeight; y++)
67  {
68  const BYTE* WINPR_RESTRICT srcLine =
69  &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
70  BYTE* WINPR_RESTRICT dstLine =
71  &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
72 
73  SSIZE_T x = 0;
74 
75  /* Ensure alignment requirements can be met */
76  if (fast)
77  {
78  for (; x < width; x += 8)
79  {
80  const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
81  __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
82  const __m256i s0 = _mm256_loadu_si256(src);
83  __m256i s1 = _mm256_shuffle_epi8(s0, smask);
84 
85  /* _mm256_shuffle_epi8 can not cross 128bit lanes.
86  * manually copy these bytes with extract/insert */
87  const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
88  const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
89  const __m256i bmask =
90  _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF, 0x00000000,
91  0x00000000, 0x00000000, 0x00000000);
92  const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
93 
94  const __m256i s2 = _mm256_loadu_si256(dst);
95  __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
96  _mm256_storeu_si256(dst, d0);
97  }
98  }
99  for (; x < nWidth; x++)
100  {
101  const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
102  BYTE* dst = &dstLine[(x + nXDst) * dstByte];
103  *dst++ = *src++;
104  *dst++ = *src++;
105  *dst++ = *src++;
106  }
107  }
108 
109  return PRIMITIVES_SUCCESS;
110 }
111 
112 static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
113  UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
114  UINT32 nWidth, UINT32 nHeight,
115  const BYTE* WINPR_RESTRICT pSrcData,
116  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
117  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
118  SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
119 {
120 
121  const SSIZE_T srcByte = 4;
122  const SSIZE_T dstByte = 4;
123 
124  const __m256i mask = _mm256_setr_epi8(
125  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
126  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
127  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
128  (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
129  const UINT32 rem = nWidth % 8;
130  const SSIZE_T width = nWidth - rem;
131  for (SSIZE_T y = 0; y < nHeight; y++)
132  {
133  const BYTE* WINPR_RESTRICT srcLine =
134  &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
135  BYTE* WINPR_RESTRICT dstLine =
136  &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
137 
138  SSIZE_T x = 0;
139  for (; x < width; x += 8)
140  {
141  const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
142  __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
143  const __m256i s0 = _mm256_loadu_si256(src);
144  const __m256i s1 = _mm256_loadu_si256(dst);
145  __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
146  _mm256_storeu_si256(dst, d0);
147  }
148 
149  for (; x < nWidth; x++)
150  {
151  const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
152  BYTE* dst = &dstLine[(x + nXDst) * dstByte];
153  *dst++ = *src++;
154  *dst++ = *src++;
155  *dst++ = *src++;
156  }
157  }
158 
159  return PRIMITIVES_SUCCESS;
160 }
161 
162 static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
163  BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
164  UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
165  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
166  UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
167  SSIZE_T dstVOffset)
168 {
169  WINPR_ASSERT(pDstData);
170  WINPR_ASSERT(pSrcData);
171 
172  switch (SrcFormat)
173  {
174  case PIXEL_FORMAT_BGR24:
175  switch (DstFormat)
176  {
177  case PIXEL_FORMAT_BGRX32:
178  case PIXEL_FORMAT_BGRA32:
179  return avx2_image_copy_bgr24_bgrx32(
180  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
181  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
182  default:
183  break;
184  }
185  break;
186  case PIXEL_FORMAT_BGRX32:
187  case PIXEL_FORMAT_BGRA32:
188  switch (DstFormat)
189  {
190  case PIXEL_FORMAT_BGRX32:
191  case PIXEL_FORMAT_BGRA32:
192  return avx2_image_copy_bgrx32_bgrx32(
193  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
194  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
195  default:
196  break;
197  }
198  break;
199  case PIXEL_FORMAT_RGBX32:
200  case PIXEL_FORMAT_RGBA32:
201  switch (DstFormat)
202  {
203  case PIXEL_FORMAT_RGBX32:
204  case PIXEL_FORMAT_RGBA32:
205  return avx2_image_copy_bgrx32_bgrx32(
206  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
207  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
208  default:
209  break;
210  }
211  break;
212  default:
213  break;
214  }
215 
216  primitives_t* gen = primitives_get_generic();
217  return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
218  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
219 }
220 
221 static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
222  UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
223  UINT32 nWidth, UINT32 nHeight,
224  const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
225  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
226  const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
227 {
228  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
229  SSIZE_T srcVOffset = 0;
230  SSIZE_T srcVMultiplier = 1;
231  SSIZE_T dstVOffset = 0;
232  SSIZE_T dstVMultiplier = 1;
233 
234  if ((nWidth == 0) || (nHeight == 0))
235  return PRIMITIVES_SUCCESS;
236 
237  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
238  return -1;
239 
240  if (!pDstData || !pSrcData)
241  return -1;
242 
243  if (nDstStep == 0)
244  nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
245 
246  if (nSrcStep == 0)
247  nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
248 
249  if (vSrcVFlip)
250  {
251  srcVOffset = (nHeight - 1ll) * nSrcStep;
252  srcVMultiplier = -1;
253  }
254 
255  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
256  return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
257  nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
258  nXSrc, nYSrc, palette, flags, srcVMultiplier,
259  srcVOffset, dstVMultiplier, dstVOffset);
260  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
261  return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
262  nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
263  nXSrc, nYSrc, palette, srcVMultiplier,
264  srcVOffset, dstVMultiplier, dstVOffset, flags);
265  else
266  {
267  primitives_t* gen = primitives_get_generic();
268  return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
269  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
270  }
271 }
272 #endif
273 
274 /* ------------------------------------------------------------------------- */
275 void primitives_init_copy_avx2(primitives_t* prims)
276 {
277 #if defined(SSE2_ENABLED)
278  if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
279  {
280  WLog_VRB(PRIM_TAG, "AVX2 optimizations");
281  prims->copy_no_overlap = avx2_image_copy_no_overlap;
282  }
283 #else
284  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
285  WINPR_UNUSED(prims);
286 #endif
287 }
__copy_no_overlap_t copy_no_overlap
Definition: primitives.h:258