16 #include <winpr/sysinfo.h>
18 #include <freerdp/config.h>
21 #include <freerdp/types.h>
22 #include <freerdp/primitives.h>
23 #include <freerdp/log.h>
25 #include "prim_internal.h"
26 #include "prim_copy.h"
27 #include "../codec/color.h"
29 #include <freerdp/codec/color.h>
31 #if defined(SSE2_ENABLED)
32 #define TAG FREERDP_TAG("primitives.copy")
34 #include <emmintrin.h>
35 #include <immintrin.h>
/*
 * Forward declaration only; no definition of this helper is visible in
 * this view.
 *
 * The prototype takes a destination buffer with its format, step and
 * x/y offsets, the copy width and height, a source buffer with its format,
 * step and x/y offsets, a palette, and the vertical multiplier/offset
 * pair for source and destination.
 *
 * NOTE(review): judging by the name this is the pixel-format converting
 * path - confirm against the definition.
 */
37 static INLINE pstatus_t avx2_image_copy_no_overlap_convert(
38 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
39 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
40 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
41 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
/*
 * Copy a rectangle of 3-byte source pixels into 4-byte destination pixels
 * while leaving the 4th byte of every destination pixel untouched.
 *
 * Rows are addressed as
 *   base[VMultiplier * (y + nY) * nStep + VOffset]
 * for both source and destination. Within a row, blocks of 8 pixels go
 * through the AVX2 loop and the remaining pixels through a per-pixel loop.
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): nHeight and x are used below but their declarations are
 * not visible in this view - confirm against the complete file.
 */
43 static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
44 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
46 const BYTE* WINPR_RESTRICT pSrcData,
47 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
48 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
49 SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
/* Bytes per pixel on each side; used to turn pixel indices into byte offsets. */
52 const SSIZE_T srcByte = 3;
53 const SSIZE_T dstByte = 4;
/* Blend selector with only the top byte of every 32-bit element set: in the
 * final blend those bytes are taken from the existing destination data. */
55 const __m256i mask = _mm256_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
56 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
/* Byte-shuffle control that spreads packed 3-byte groups into 4-byte slots.
 * 0xff entries produce a zero byte. The all-0xff element and the 0xff low
 * byte of the next element are the positions later filled from sxx. */
57 const __m256i smask = _mm256_set_epi32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
58 0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
/* Second shuffle control, applied to a broadcast of the low 128-bit lane:
 * it places the last four bytes of that lane into the upper-lane positions
 * that smask left empty. */
59 const __m256i shelpmask = _mm256_set_epi32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
60 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
/* width is nWidth rounded down to a multiple of 8 (pixels per vector step). */
61 const UINT32 rem = nWidth % 8;
62 const SSIZE_T width = nWidth - rem;
/* fast is TRUE when nSrcStep is a multiple of 32, or when nSrcStep % 32 is at
 * least 8 - MIN(8, rem).
 * NOTE(review): no use of `fast` is visible in this view. The vector loop
 * loads 32 bytes but consumes 24 per step, so this flag presumably gates
 * that loop - confirm. */
64 const size_t align = nSrcStep % 32;
65 const BOOL fast = (align == 0) ? TRUE : (align >= 8 - MIN(8, (
size_t)rem) ? TRUE : FALSE);
66 for (SSIZE_T y = 0; y < nHeight; y++)
/* Start-of-row pointers for this y. */
68 const BYTE* WINPR_RESTRICT srcLine =
69 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
70 BYTE* WINPR_RESTRICT dstLine =
71 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector loop: 8 pixels per iteration using unaligned loads and stores. */
78 for (; x < width; x += 8)
80 const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
81 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
82 const __m256i s0 = _mm256_loadu_si256(src);
83 __m256i s1 = _mm256_shuffle_epi8(s0, smask);
/* The byte shuffle works within each 128-bit lane, so source bytes that sit
 * in the low lane but belong in the upper half of the output are fetched
 * separately: broadcast the low lane, shuffle it with shelpmask, then blend
 * the selected bytes into s1. */
87 const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
88 const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
/* NOTE(review): this expression supplies the blend selector named bmask in
 * the next statement; the line declaring bmask is not visible in this view. */
90 _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF, 0x00000000,
91 0x00000000, 0x00000000, 0x00000000);
92 const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
/* Read the current destination so that the top byte of every 32-bit element
 * is written back unchanged; the other three bytes come from merged. */
94 const __m256i s2 = _mm256_loadu_si256(dst);
95 __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
96 _mm256_storeu_si256(dst, d0);
/* Per-pixel loop for everything the vector loop did not cover.
 * NOTE(review): only the pointer setup is visible here; the statements that
 * copy the bytes are missing from this view. */
99 for (; x < nWidth; x++)
101 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
102 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
109 return PRIMITIVES_SUCCESS;
/*
 * Copy a rectangle between two 4-byte-per-pixel buffers, taking the first
 * three bytes of every pixel from the source and keeping the 4th byte of
 * the destination as it is.
 *
 * Rows are addressed as
 *   base[VMultiplier * (y + nY) * nStep + VOffset]
 * for both source and destination. Within a row, blocks of 8 pixels go
 * through the AVX2 loop and the remaining pixels through a per-pixel loop.
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the declaration of x is not visible in this view.
 */
112 static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
113 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
114 UINT32 nWidth, UINT32 nHeight,
115 const BYTE* WINPR_RESTRICT pSrcData,
116 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
117 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
118 SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
/* Bytes per pixel on each side; used to turn pixel indices into byte offsets. */
121 const SSIZE_T srcByte = 4;
122 const SSIZE_T dstByte = 4;
/* Blend selector listed from the lowest byte upwards: FF FF FF 00 repeated
 * for each of the 8 pixels, i.e. bytes 0..2 of every pixel are selected and
 * byte 3 is not. */
124 const __m256i mask = _mm256_setr_epi8(
125 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
126 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
127 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
128 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
/* width is nWidth rounded down to a multiple of 8 (pixels per vector step). */
129 const UINT32 rem = nWidth % 8;
130 const SSIZE_T width = nWidth - rem;
131 for (SSIZE_T y = 0; y < nHeight; y++)
/* Start-of-row pointers for this y. */
133 const BYTE* WINPR_RESTRICT srcLine =
134 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
135 BYTE* WINPR_RESTRICT dstLine =
136 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector loop: 8 pixels (32 bytes) per iteration, unaligned loads/stores. */
139 for (; x < width; x += 8)
141 const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
142 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
143 const __m256i s0 = _mm256_loadu_si256(src);
144 const __m256i s1 = _mm256_loadu_si256(dst);
/* Selected bytes come from the source (s0); the unselected 4th byte of each
 * pixel keeps the value already in the destination (s1). */
145 __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
146 _mm256_storeu_si256(dst, d0);
/* Per-pixel loop for everything the vector loop did not cover.
 * NOTE(review): only the pointer setup is visible here; the statements that
 * copy the bytes are missing from this view. */
149 for (; x < nWidth; x++)
151 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
152 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
159 return PRIMITIVES_SUCCESS;
/*
 * Dispatcher for copies that go through the two AVX2 routines above, both
 * of which leave the 4th byte of every destination pixel unchanged.
 *
 * Based on pixel-format case labels it forwards to
 * avx2_image_copy_bgr24_bgrx32 or avx2_image_copy_bgrx32_bgrx32; the final
 * statement hands the original arguments, including flags, to
 * gen->copy_no_overlap.
 *
 * NOTE(review): the switch headers, the end of the parameter list and the
 * declaration of `gen` are not visible in this view, so which labels apply
 * to SrcFormat and which to DstFormat cannot be confirmed from here.
 */
162 static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
163 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
164 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
165 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
166 UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
/* Both buffers must be non-NULL. */
169 WINPR_ASSERT(pDstData);
170 WINPR_ASSERT(pSrcData);
/* BGR24 combined with BGRX32/BGRA32: use the 3-byte to 4-byte routine. */
174 case PIXEL_FORMAT_BGR24:
177 case PIXEL_FORMAT_BGRX32:
178 case PIXEL_FORMAT_BGRA32:
179 return avx2_image_copy_bgr24_bgrx32(
180 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
181 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* BGRX32/BGRA32 on both levels of labels: use the 4-byte to 4-byte routine. */
186 case PIXEL_FORMAT_BGRX32:
187 case PIXEL_FORMAT_BGRA32:
190 case PIXEL_FORMAT_BGRX32:
191 case PIXEL_FORMAT_BGRA32:
192 return avx2_image_copy_bgrx32_bgrx32(
193 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
194 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* RGBX32/RGBA32 on both levels of labels reuse the same 4-byte routine; it
 * copies bytes 0..2 of each pixel by position and does not reorder them. */
199 case PIXEL_FORMAT_RGBX32:
200 case PIXEL_FORMAT_RGBA32:
203 case PIXEL_FORMAT_RGBX32:
204 case PIXEL_FORMAT_RGBA32:
205 return avx2_image_copy_bgrx32_bgrx32(
206 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
207 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* Fallback: delegate to gen->copy_no_overlap with the caller's arguments. */
217 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
218 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * Entry point for the AVX2 non-overlapping image copy.
 *
 * Returns PRIMITIVES_SUCCESS immediately when nWidth or nHeight is 0.
 * Otherwise it picks one of three paths:
 *   - FREERDP_KEEP_DST_ALPHA set in flags and DstFormat has alpha:
 *     avx2_image_copy_no_overlap_dst_alpha
 *   - source and destination formats equal when alpha is ignored:
 *     generic_image_copy_no_overlap_memcpy
 *   - anything else: gen->copy_no_overlap
 *
 * NOTE(review): several guard conditions, the error returns and the
 * declaration of `gen` are not visible in this view.
 */
221 static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
222 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
223 UINT32 nWidth, UINT32 nHeight,
224 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
225 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
226 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
228 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
/* Defaults for the row-address formula used by the copy routines:
 * multiplier 1 and offset 0 on both sides. */
229 SSIZE_T srcVOffset = 0;
230 SSIZE_T srcVMultiplier = 1;
231 SSIZE_T dstVOffset = 0;
232 SSIZE_T dstVMultiplier = 1;
/* Nothing to copy. */
234 if ((nWidth == 0) || (nHeight == 0))
235 return PRIMITIVES_SUCCESS;
/* Argument checks. NOTE(review): the statements executed when these
 * conditions hold are not visible in this view. */
237 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
240 if (!pDstData || !pSrcData)
/* Derive a step from the width and the format's bytes per pixel.
 * NOTE(review): the guarding conditions are not visible; presumably these
 * apply only when the caller passed a step of 0 - confirm. */
244 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
247 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
/* Byte offset of the last source row.
 * NOTE(review): the guarding condition is not visible; presumably tied to
 * vSrcVFlip - confirm. */
251 srcVOffset = (nHeight - 1ll) * nSrcStep;
255 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
256 return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
257 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
258 nXSrc, nYSrc, palette, flags, srcVMultiplier,
259 srcVOffset, dstVMultiplier, dstVOffset);
260 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
261 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
262 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
263 nXSrc, nYSrc, palette, srcVMultiplier,
264 srcVOffset, dstVMultiplier, dstVOffset, flags);
/* No specialised path matched: delegate to gen->copy_no_overlap. */
268 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
269 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
277 #if defined(SSE2_ENABLED)
278 if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
280 WLog_VRB(PRIM_TAG,
"AVX2 optimizations");
284 WLog_VRB(PRIM_TAG,
"undefined WITH_SSE2");
__copy_no_overlap_t copy_no_overlap