16 #include <winpr/sysinfo.h>
18 #include <freerdp/config.h>
21 #include <freerdp/types.h>
22 #include <freerdp/primitives.h>
23 #include <freerdp/log.h>
25 #include "prim_internal.h"
26 #include "prim_copy.h"
27 #include "../codec/color.h"
29 #include <freerdp/codec/color.h>
31 #if defined(SSE_AVX_INTRINSICS_ENABLED)
32 #define TAG FREERDP_TAG("primitives.copy")
34 #include <emmintrin.h>
35 #include <immintrin.h>
/*
 * Builds a 256-bit integer vector from eight unsigned 32-bit values.
 *
 * Every argument is cast to int32_t and handed, in the same order, to
 * _mm256_set_epi32, so i0 lands in the most significant 32-bit lane and i7 in
 * the least significant one.
 *
 * NOTE(review): presumably exists so lane constants above INT32_MAX (such as
 * 0xFF000000) can be written without implicit-conversion warnings - confirm.
 */
37 static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
38 uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
40 return _mm256_set_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4,
41 (int32_t)i5, (int32_t)i6, (int32_t)i7);
/*
 * Forward declaration only; the definition of
 * avx2_image_copy_no_overlap_convert is not part of this section.
 *
 * NOTE(review): judging by the name and the format/palette parameters this is
 * presumably the path used when source and destination pixel formats differ -
 * confirm against the definition.
 */
44 static INLINE pstatus_t avx2_image_copy_no_overlap_convert(
45 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
46 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
47 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
48 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
/*
 * Copies a rectangle of 3-byte source pixels into 4-byte destination pixels,
 * eight pixels per AVX2 iteration, while keeping the most significant byte of
 * every existing destination pixel.
 *
 * Row addressing: source row y starts at
 *   pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]
 * and destination row y at
 *   pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset].
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): nHeight is used by the row loop but does not appear in the
 * parameter list shown here, and the declaration of x is not shown either;
 * both are expected to exist in the complete definition - confirm.
 */
50 static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
51 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
53 const BYTE* WINPR_RESTRICT pSrcData,
54 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
55 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
56 SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
/* Bytes per pixel on each side; used to scale x into a byte offset. */
59 const SSIZE_T srcByte = 3;
60 const SSIZE_T dstByte = 4;
/* Selects the top byte of every 32-bit lane: those bytes are taken from the
 * current destination contents in the final blend. */
62 const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
63 0xFF000000, 0xFF000000, 0xFF000000);
/* Byte-shuffle control applied to the loaded source vector. Control bytes of
 * 0xff have the high bit set, which makes the shuffle write zero there. */
64 const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
65 0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
/* Second shuffle control, applied to the broadcast low 128 bits of the source
 * vector; only a few positions are non-0xff, the rest produce zero. */
66 const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
67 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
/* width is nWidth rounded down to a multiple of 8 (the vector loop bound);
 * the remaining rem pixels are left to the per-pixel loop. */
68 const UINT32 rem = nWidth % 8;
69 const SSIZE_T width = nWidth - rem;
/* NOTE(review): the vector loop loads 32 source bytes per step while eight
 * 3-byte pixels only occupy 24. `fast` is derived from nSrcStep % 32 and rem
 * and presumably gates whether that over-read is acceptable; its use is not
 * visible in this section - confirm. */
71 const size_t align = nSrcStep % 32;
72 const BOOL fast = (align == 0) ? TRUE : (align >= 8 - MIN(8, (
size_t)rem) ? TRUE : FALSE);
73 for (SSIZE_T y = 0; y < nHeight; y++)
75 const BYTE* WINPR_RESTRICT srcLine =
76 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
77 BYTE* WINPR_RESTRICT dstLine =
78 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector part: eight pixels per iteration, unaligned load and store. */
85 for (; x < width; x += 8)
87 const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
88 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
89 const __m256i s0 = _mm256_loadu_si256(src);
90 __m256i s1 = _mm256_shuffle_epi8(s0, smask);
/* Replicate the low 128 bits of the source vector into both halves, then
 * shuffle them with shelpmask to produce the bytes blended in below. */
94 const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
95 const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
/* NOTE(review): the left-hand side of this constant is not visible here;
 * it is presumably the definition of bmask used in the next blend - confirm. */
97 _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF, 0x00000000,
98 0x00000000, 0x00000000, 0x00000000);
/* Take bytes from sxx where bmask has the high bit set, otherwise from s1. */
99 const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
/* Read the current destination pixels and keep their top byte per lane;
 * all other bytes come from the converted source data. */
101 const __m256i s2 = _mm256_loadu_si256(dst);
102 __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
103 _mm256_storeu_si256(dst, d0);
/* Per-pixel part for whatever the vector loop did not cover. Only the pointer
 * setup is visible in this section; the copy statements are not shown. */
106 for (; x < nWidth; x++)
108 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
109 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
116 return PRIMITIVES_SUCCESS;
/*
 * Copies a rectangle of 4-byte pixels to 4-byte pixels, eight pixels per AVX2
 * iteration, replacing bytes 0..2 of every destination pixel with the source
 * bytes and leaving byte 3 of the destination untouched.
 *
 * Row addressing: source row y starts at
 *   pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]
 * and destination row y at
 *   pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset].
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the declaration of x is not visible in this section.
 */
119 static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
120 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
121 UINT32 nWidth, UINT32 nHeight,
122 const BYTE* WINPR_RESTRICT pSrcData,
123 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
124 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
125 SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
/* Bytes per pixel on each side; used to scale x into a byte offset. */
128 const SSIZE_T srcByte = 4;
129 const SSIZE_T dstByte = 4;
/* Blend selector in memory order: 0xFF, 0xFF, 0xFF, 0x00 repeated for each of
 * the eight pixels. A set high bit picks the source byte in the blend below,
 * so the fourth byte of every pixel stays as it is in the destination. */
131 const __m256i mask = _mm256_setr_epi8(
132 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
133 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
134 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
135 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
/* width is nWidth rounded down to a multiple of 8 (the vector loop bound);
 * the remaining rem pixels are left to the per-pixel loop. */
136 const UINT32 rem = nWidth % 8;
137 const SSIZE_T width = nWidth - rem;
138 for (SSIZE_T y = 0; y < nHeight; y++)
140 const BYTE* WINPR_RESTRICT srcLine =
141 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
142 BYTE* WINPR_RESTRICT dstLine =
143 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector part: eight pixels (32 bytes) per iteration, unaligned accesses. */
146 for (; x < width; x += 8)
148 const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
149 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
150 const __m256i s0 = _mm256_loadu_si256(src);
151 const __m256i s1 = _mm256_loadu_si256(dst);
/* s1 holds the current destination, s0 the source: masked bytes come from
 * s0, the unmasked fourth byte of each pixel is kept from s1. */
152 __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
153 _mm256_storeu_si256(dst, d0);
/* Per-pixel part for the last rem pixels. Only the pointer setup is visible
 * in this section; the copy statements are not shown. */
156 for (; x < nWidth; x++)
158 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
159 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
166 return PRIMITIVES_SUCCESS;
/*
 * Chooses an AVX2 copy routine for a source/destination pixel-format pair and
 * falls back to gen->copy_no_overlap when no visible routine is returned.
 *
 * Calls visible in this section:
 *  - avx2_image_copy_bgr24_bgrx32 after the labels BGR24 / BGRX32, BGRA32
 *  - avx2_image_copy_bgrx32_bgrx32 after the labels BGRX32, BGRA32 (twice)
 *  - avx2_image_copy_bgrx32_bgrx32 after the labels RGBX32, RGBA32 (twice)
 *
 * NOTE(review): the switch headers are not visible here; the label layout
 * suggests an outer switch on SrcFormat with an inner switch on DstFormat -
 * confirm. The end of the parameter list (after dstVMultiplier) is not
 * visible either, although dstVOffset is used in the calls below.
 */
169 static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
170 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
171 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
172 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
173 UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
/* Both buffers are required. */
176 WINPR_ASSERT(pDstData);
177 WINPR_ASSERT(pSrcData);
181 case PIXEL_FORMAT_BGR24:
184 case PIXEL_FORMAT_BGRX32:
185 case PIXEL_FORMAT_BGRA32:
186 return avx2_image_copy_bgr24_bgrx32(
187 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
188 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
193 case PIXEL_FORMAT_BGRX32:
194 case PIXEL_FORMAT_BGRA32:
197 case PIXEL_FORMAT_BGRX32:
198 case PIXEL_FORMAT_BGRA32:
199 return avx2_image_copy_bgrx32_bgrx32(
200 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
201 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* The RGBX/RGBA pair reuses the same 4-byte routine: it copies bytes 0..2 of
 * each pixel and never reorders channels. */
206 case PIXEL_FORMAT_RGBX32:
207 case PIXEL_FORMAT_RGBA32:
210 case PIXEL_FORMAT_RGBX32:
211 case PIXEL_FORMAT_RGBA32:
212 return avx2_image_copy_bgrx32_bgrx32(
213 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
214 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* Fallback: forwards the caller's arguments, without the vertical
 * multiplier/offset values, to gen->copy_no_overlap. `gen` is not declared in
 * the lines visible here. */
224 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
225 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * Entry point for the AVX2 non-overlapping image copy.
 *
 * Returns PRIMITIVES_SUCCESS immediately when nWidth or nHeight is 0.
 * Otherwise it dispatches as follows:
 *  1. FREERDP_KEEP_DST_ALPHA is set and DstFormat has an alpha channel
 *     -> avx2_image_copy_no_overlap_dst_alpha
 *  2. source and destination formats are equal when alpha is ignored
 *     -> generic_image_copy_no_overlap_memcpy
 *  3. anything else -> gen->copy_no_overlap
 *
 * NOTE(review): several conditions and guard bodies are not visible in this
 * section (see the notes below); `gen` is not declared in the visible lines.
 */
228 static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
229 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
230 UINT32 nWidth, UINT32 nHeight,
231 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
232 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
233 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
/* Vertical-flip request, taken from the flags argument. */
235 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
/* Defaults for row addressing: top-down on both sides, no byte offset. */
236 SSIZE_T srcVOffset = 0;
237 SSIZE_T srcVMultiplier = 1;
238 SSIZE_T dstVOffset = 0;
239 SSIZE_T dstVMultiplier = 1;
/* An empty rectangle is a successful no-op. */
241 if ((nWidth == 0) || (nHeight == 0))
242 return PRIMITIVES_SUCCESS;
/* NOTE(review): the statements executed by the next two guards are not
 * visible here; presumably both return an error status - confirm. */
244 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
247 if (!pDstData || !pSrcData)
/* Step values derived from the width and the format's bytes per pixel.
 * NOTE(review): the conditions guarding these assignments are not visible;
 * presumably they apply only when the caller passed a step of 0 - confirm. */
251 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
254 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
/* Byte offset of the last source row; 1ll widens the arithmetic to long long.
 * NOTE(review): presumably applied only when vSrcVFlip is set - confirm. */
258 srcVOffset = (nHeight - 1ll) * nSrcStep;
262 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
263 return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
264 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
265 nXSrc, nYSrc, palette, flags, srcVMultiplier,
266 srcVOffset, dstVMultiplier, dstVOffset);
267 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
268 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
269 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
270 nXSrc, nYSrc, palette, srcVMultiplier,
271 srcVOffset, dstVMultiplier, dstVOffset, flags);
/* Remaining format combinations are left to gen->copy_no_overlap. */
275 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
276 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
284 #if defined(SSE_AVX_INTRINSICS_ENABLED)
285 if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
287 WLog_VRB(PRIM_TAG,
"AVX2 optimizations");
291 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
__copy_no_overlap_t copy_no_overlap