16 #include <winpr/sysinfo.h>
18 #include <freerdp/config.h>
21 #include <freerdp/types.h>
22 #include <freerdp/primitives.h>
23 #include <freerdp/log.h>
25 #include "prim_internal.h"
26 #include "prim_copy.h"
27 #include "../codec/color.h"
29 #include <freerdp/codec/color.h>
31 #if defined(SSE2_ENABLED)
32 #define TAG FREERDP_TAG("primitives.copy")
34 #include <emmintrin.h>
35 #include <immintrin.h>
/*
 * Forward declaration only: no definition of this helper appears in the
 * visible part of the file.
 *
 * The parameter list carries the destination and source buffers together with
 * their pixel formats, row strides and x/y origins, the copy extent
 * (nWidth x nHeight), a palette pointer, and one vertical multiplier/offset
 * pair per side (same naming as the row-address computation used by the
 * helpers further down).
 */
37 static INLINE pstatus_t sse_image_copy_no_overlap_convert(
38 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
39 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
40 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
41 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
/*
 * Copy a rectangle of 3-byte source pixels into 4-byte destination pixels,
 * four pixels per SIMD iteration, leaving the fourth byte of every destination
 * pixel untouched.
 *
 * pDstData/nDstStep/nXDst/nYDst  destination buffer, row stride in bytes and
 *                                pixel origin
 * pSrcData/nSrcStep/nXSrc/nYSrc  source buffer, row stride in bytes and pixel
 *                                origin
 * nWidth                         pixels per row to copy
 * src/dstVMultiplier, src/dstVOffset
 *                                factor and byte offset applied when turning a
 *                                row index into a byte address (see the row
 *                                address computation below)
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the row loop reads nHeight and the only visible call site
 * passes nHeight right after nWidth, but no nHeight parameter is present in
 * the parameter list as shown here - looks like a line lost from this view;
 * confirm against the full file.
 */
43 static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
44 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
46 const BYTE* WINPR_RESTRICT pSrcData,
47 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
48 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
49 SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
/* Bytes per pixel on each side: packed 3-byte source, 4-byte destination. */
52 const SSIZE_T srcByte = 3;
53 const SSIZE_T dstByte = 4;
/*
 * Blend selector: only the most significant byte of every 32-bit lane is set,
 * so the blend below takes that byte from its second operand and the three
 * lower bytes from its first operand.
 */
55 const __m128i mask = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
/*
 * Shuffle control: spreads source bytes 0..11 (four 3-byte pixels) over four
 * 32-bit lanes. The 0xff entries have the top bit set, which makes the byte
 * shuffle write zero into the fourth byte of each lane.
 */
56 const __m128i smask = _mm_set_epi32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
/* Pixels per row that do not fill a complete group of four. */
57 const UINT32 rem = nWidth % 4;
/*
 * NOTE(review): `fast` is derived from the source stride modulo 64 and the
 * width remainder; the code that consumes it is not visible in this section.
 */
59 const size_t align = nSrcStep % 64;
60 const BOOL fast = (align == 0) ? TRUE : (align >= 16 - MIN(16, (
size_t)rem) ? TRUE : FALSE);
/* Number of pixels per row covered by the 4-pixel SIMD loop. */
61 const SSIZE_T width = nWidth - rem;
62 for (SSIZE_T y = 0; y < nHeight; y++)
/* Start of the current row in each buffer. */
64 const BYTE* WINPR_RESTRICT srcLine =
65 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
66 BYTE* WINPR_RESTRICT dstLine =
67 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* SIMD part: four pixels per iteration (the declaration of x is not visible here). */
73 for (; x < width; x += 4)
75 const __m128i* src = (
const __m128i*)&srcLine[(x + nXSrc) * srcByte];
76 __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
/*
 * The unaligned load fetches 16 bytes although the shuffle only consumes the
 * low 12 (four 3-byte pixels).
 * NOTE(review): this touches 4 bytes beyond the pixels being converted;
 * presumably `fast` exists to guard that at the end of a row - confirm.
 */
77 const __m128i s0 = _mm_loadu_si128(src);
/* Expand 4 x 3 bytes into 4 x 4 bytes with a zero fourth byte. */
78 const __m128i s1 = _mm_shuffle_epi8(s0, smask);
/* Current destination pixels, needed to keep their fourth byte. */
79 const __m128i s2 = _mm_loadu_si128(dst);
/* Bytes 0..2 of each pixel from the shuffled source, byte 3 from the destination. */
81 __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
82 _mm_storeu_si128(dst, d0);
/* Scalar tail for the remaining `rem` pixels; the per-byte stores are not visible here. */
85 for (; x < nWidth; x++)
87 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
88 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
95 return PRIMITIVES_SUCCESS;
/*
 * Copy a rectangle between two buffers of 4-byte pixels, four pixels per SIMD
 * iteration, replacing the first three bytes of every destination pixel with
 * the source bytes and leaving the fourth destination byte untouched.
 *
 * pDstData/nDstStep/nXDst/nYDst  destination buffer, row stride in bytes and
 *                                pixel origin
 * pSrcData/nSrcStep/nXSrc/nYSrc  source buffer, row stride in bytes and pixel
 *                                origin
 * nWidth                         pixels per row to copy
 * src/dstVMultiplier, src/dstVOffset
 *                                factor and byte offset applied when turning a
 *                                row index into a byte address
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the row loop reads nHeight and both visible call sites pass
 * nHeight right after nWidth, but no nHeight parameter is present in the
 * parameter list as shown here - looks like a line lost from this view;
 * confirm against the full file.
 */
98 static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
99 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
101 const BYTE* WINPR_RESTRICT pSrcData,
102 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
103 SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
104 SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
/* Both sides use four bytes per pixel. */
107 const SSIZE_T srcByte = 4;
108 const SSIZE_T dstByte = 4;
/*
 * Blend selector in memory order: 0xFF for bytes 0..2 of every pixel, 0x00 for
 * byte 3. Where the selector byte has its top bit set the blend below takes
 * the byte from its second operand, otherwise from its first.
 */
110 const __m128i mask = _mm_setr_epi8((
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF,
111 (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF,
112 (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
/* Split each row into a multiple-of-four part and a remainder. */
113 const UINT32 rem = nWidth % 4;
114 const SSIZE_T width = nWidth - rem;
115 for (SSIZE_T y = 0; y < nHeight; y++)
/* Start of the current row in each buffer. */
117 const BYTE* WINPR_RESTRICT srcLine =
118 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
119 BYTE* WINPR_RESTRICT dstLine =
120 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* SIMD part: four pixels (16 bytes) per iteration (the declaration of x is not visible here). */
123 for (; x < width; x += 4)
125 const __m128i* src = (
const __m128i*)&srcLine[(x + nXSrc) * srcByte];
126 __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
/* s0 = source pixels, s1 = current destination pixels. */
127 const __m128i s0 = _mm_loadu_si128(src);
128 const __m128i s1 = _mm_loadu_si128(dst);
/* Bytes 0..2 of each pixel from the source, byte 3 kept from the destination. */
129 __m128i d0 = _mm_blendv_epi8(s1, s0, mask);
130 _mm_storeu_si128(dst, d0);
/* Scalar tail for the remaining `rem` pixels; the per-byte stores are not visible here. */
133 for (; x < nWidth; x++)
135 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
136 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
143 return PRIMITIVES_SUCCESS;
/*
 * Dispatcher for the copy variant that keeps the destination's fourth byte:
 * routes a handful of pixel-format pairs to the SIMD helpers above and hands
 * everything else to gen->copy_no_overlap.
 *
 * Both buffer pointers are asserted to be non-NULL. The remaining arguments
 * are forwarded unchanged to whichever implementation is selected.
 *
 * NOTE(review): the switch statements and the end of the parameter list are
 * not visible in this section, only the case labels are. Presumably the first
 * label of each group is the source format and the nested labels are the
 * destination format - confirm against the full file.
 */
146 static pstatus_t sse_image_copy_no_overlap_dst_alpha(
147 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
148 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
149 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
150 UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
153 WINPR_ASSERT(pDstData);
154 WINPR_ASSERT(pSrcData);
/* BGR24 paired with BGRX32/BGRA32: 3-byte to 4-byte helper. */
158 case PIXEL_FORMAT_BGR24:
161 case PIXEL_FORMAT_BGRX32:
162 case PIXEL_FORMAT_BGRA32:
163 return sse_image_copy_bgr24_bgrx32(
164 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
165 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* BGRX32/BGRA32 paired with BGRX32/BGRA32: 4-byte to 4-byte helper. */
170 case PIXEL_FORMAT_BGRX32:
171 case PIXEL_FORMAT_BGRA32:
174 case PIXEL_FORMAT_BGRX32:
175 case PIXEL_FORMAT_BGRA32:
176 return sse_image_copy_bgrx32_bgrx32(
177 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
178 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/*
 * RGBX32/RGBA32 paired with RGBX32/RGBA32 reuses the same 4-byte helper; the
 * helper copies bytes 0..2 of each pixel without reordering them.
 */
183 case PIXEL_FORMAT_RGBX32:
184 case PIXEL_FORMAT_RGBA32:
187 case PIXEL_FORMAT_RGBX32:
188 case PIXEL_FORMAT_RGBA32:
189 return sse_image_copy_bgrx32_bgrx32(
190 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
191 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* Fallback: delegate to gen->copy_no_overlap (gen is defined outside this section). */
201 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
202 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * Entry point of the SSE copy: validates the arguments, fills in stride and
 * vertical-offset values, then picks one of three implementations.
 *
 *  - FREERDP_KEEP_DST_ALPHA set and DstFormat has alpha:
 *        sse_image_copy_no_overlap_dst_alpha
 *  - source and destination formats equal when alpha is ignored:
 *        generic_image_copy_no_overlap_memcpy
 *  - final return: gen->copy_no_overlap (gen is defined outside this section)
 *
 * Returns PRIMITIVES_SUCCESS immediately for an empty rectangle; otherwise the
 * status of the selected implementation.
 *
 * NOTE(review): several conditions and early-exit results are not visible in
 * this section (see the notes below); confirm against the full file.
 */
205 static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
206 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
207 UINT32 nWidth, UINT32 nHeight,
208 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
209 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
210 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
/* TRUE when the caller requested a vertical flip. */
212 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
/* Defaults for the row-address computation used by the copy helpers. */
213 SSIZE_T srcVOffset = 0;
214 SSIZE_T srcVMultiplier = 1;
215 SSIZE_T dstVOffset = 0;
216 SSIZE_T dstVMultiplier = 1;
/* Nothing to copy. */
218 if ((nWidth == 0) || (nHeight == 0))
219 return PRIMITIVES_SUCCESS;
/* Extent and pointer checks; the statements they guard are not visible here. */
221 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
224 if (!pDstData || !pSrcData)
/*
 * Stride values derived from the width and bytes per pixel.
 * NOTE(review): the guarding conditions are not visible - presumably applied
 * only when the caller passed a stride of 0; confirm.
 */
228 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
231 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
/*
 * Byte offset of the last source row.
 * NOTE(review): presumably set only when vSrcVFlip is TRUE, together with a
 * change of srcVMultiplier that is not visible here; confirm.
 */
235 srcVOffset = (nHeight - 1ll) * nSrcStep;
/* Select the implementation (see the function header comment). */
239 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
240 return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
241 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
242 nXSrc, nYSrc, palette, flags, srcVMultiplier,
243 srcVOffset, dstVMultiplier, dstVOffset);
244 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
245 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
246 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
247 nXSrc, nYSrc, palette, srcVMultiplier,
248 srcVOffset, dstVMultiplier, dstVOffset, flags);
252 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
253 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * Fragment of an initializer whose function header is not visible in this
 * section. When the file is built with SSE2_ENABLED it queries the CPU for
 * SSE4.1 support at run time and emits a verbose log line.
 * NOTE(review): presumably the SSE4.1 branch also installs the copy routine
 * defined above into a primitives table, and the second log line belongs to
 * the #else branch of the build-time check - neither is visible here; confirm.
 */
261 #if defined(SSE2_ENABLED)
262 if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
264 WLog_VRB(PRIM_TAG,
"SSE4.1 optimizations");
268 WLog_VRB(PRIM_TAG,
"undefined WITH_SSE2");
__copy_no_overlap_t copy_no_overlap