FreeRDP
prim_copy_sse4_1.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Copy operations.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  */
15 
16 #include <winpr/sysinfo.h>
17 
18 #include <freerdp/config.h>
19 
20 #include <string.h>
21 #include <freerdp/types.h>
22 #include <freerdp/primitives.h>
23 #include <freerdp/log.h>
24 
25 #include "prim_internal.h"
26 #include "prim_copy.h"
27 #include "../codec/color.h"
28 
29 #include <freerdp/codec/color.h>
30 
31 #if defined(SSE2_ENABLED)
32 #define TAG FREERDP_TAG("primitives.copy")
33 
34 #include <emmintrin.h>
35 #include <immintrin.h>
36 
37 static INLINE pstatus_t sse_image_copy_no_overlap_convert(
38  BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
39  UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
40  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
41  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
42 
43 static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
44  UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
45  UINT32 nHeight,
46  const BYTE* WINPR_RESTRICT pSrcData,
47  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
48  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
49  SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
50 {
51 
52  const SSIZE_T srcByte = 3;
53  const SSIZE_T dstByte = 4;
54 
55  const __m128i mask = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
56  const __m128i smask = _mm_set_epi32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
57  const UINT32 rem = nWidth % 4;
58 
59  const size_t align = nSrcStep % 64;
60  const BOOL fast = (align == 0) ? TRUE : (align >= 16 - MIN(16, (size_t)rem) ? TRUE : FALSE);
61  const SSIZE_T width = nWidth - rem;
62  for (SSIZE_T y = 0; y < nHeight; y++)
63  {
64  const BYTE* WINPR_RESTRICT srcLine =
65  &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
66  BYTE* WINPR_RESTRICT dstLine =
67  &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
68 
69  SSIZE_T x = 0;
70  /* Ensure alignment requirements can be met */
71  if (fast)
72  {
73  for (; x < width; x += 4)
74  {
75  const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
76  __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
77  const __m128i s0 = _mm_loadu_si128(src);
78  const __m128i s1 = _mm_shuffle_epi8(s0, smask);
79  const __m128i s2 = _mm_loadu_si128(dst);
80 
81  __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
82  _mm_storeu_si128(dst, d0);
83  }
84  }
85  for (; x < nWidth; x++)
86  {
87  const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
88  BYTE* dst = &dstLine[(x + nXDst) * dstByte];
89  *dst++ = *src++;
90  *dst++ = *src++;
91  *dst++ = *src++;
92  }
93  }
94 
95  return PRIMITIVES_SUCCESS;
96 }
97 
98 static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
99  UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
100  UINT32 nHeight,
101  const BYTE* WINPR_RESTRICT pSrcData,
102  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
103  SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
104  SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
105 {
106 
107  const SSIZE_T srcByte = 4;
108  const SSIZE_T dstByte = 4;
109 
110  const __m128i mask = _mm_setr_epi8((char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF,
111  (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF,
112  (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
113  const UINT32 rem = nWidth % 4;
114  const SSIZE_T width = nWidth - rem;
115  for (SSIZE_T y = 0; y < nHeight; y++)
116  {
117  const BYTE* WINPR_RESTRICT srcLine =
118  &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
119  BYTE* WINPR_RESTRICT dstLine =
120  &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
121 
122  SSIZE_T x = 0;
123  for (; x < width; x += 4)
124  {
125  const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
126  __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
127  const __m128i s0 = _mm_loadu_si128(src);
128  const __m128i s1 = _mm_loadu_si128(dst);
129  __m128i d0 = _mm_blendv_epi8(s1, s0, mask);
130  _mm_storeu_si128(dst, d0);
131  }
132 
133  for (; x < nWidth; x++)
134  {
135  const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
136  BYTE* dst = &dstLine[(x + nXDst) * dstByte];
137  *dst++ = *src++;
138  *dst++ = *src++;
139  *dst++ = *src++;
140  }
141  }
142 
143  return PRIMITIVES_SUCCESS;
144 }
145 
146 static pstatus_t sse_image_copy_no_overlap_dst_alpha(
147  BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
148  UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
149  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
150  UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
151  SSIZE_T dstVOffset)
152 {
153  WINPR_ASSERT(pDstData);
154  WINPR_ASSERT(pSrcData);
155 
156  switch (SrcFormat)
157  {
158  case PIXEL_FORMAT_BGR24:
159  switch (DstFormat)
160  {
161  case PIXEL_FORMAT_BGRX32:
162  case PIXEL_FORMAT_BGRA32:
163  return sse_image_copy_bgr24_bgrx32(
164  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
165  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
166  default:
167  break;
168  }
169  break;
170  case PIXEL_FORMAT_BGRX32:
171  case PIXEL_FORMAT_BGRA32:
172  switch (DstFormat)
173  {
174  case PIXEL_FORMAT_BGRX32:
175  case PIXEL_FORMAT_BGRA32:
176  return sse_image_copy_bgrx32_bgrx32(
177  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
178  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
179  default:
180  break;
181  }
182  break;
183  case PIXEL_FORMAT_RGBX32:
184  case PIXEL_FORMAT_RGBA32:
185  switch (DstFormat)
186  {
187  case PIXEL_FORMAT_RGBX32:
188  case PIXEL_FORMAT_RGBA32:
189  return sse_image_copy_bgrx32_bgrx32(
190  pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
191  nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
192  default:
193  break;
194  }
195  break;
196  default:
197  break;
198  }
199 
200  primitives_t* gen = primitives_get_generic();
201  return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
202  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
203 }
204 
205 static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
206  UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
207  UINT32 nWidth, UINT32 nHeight,
208  const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
209  UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
210  const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
211 {
212  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
213  SSIZE_T srcVOffset = 0;
214  SSIZE_T srcVMultiplier = 1;
215  SSIZE_T dstVOffset = 0;
216  SSIZE_T dstVMultiplier = 1;
217 
218  if ((nWidth == 0) || (nHeight == 0))
219  return PRIMITIVES_SUCCESS;
220 
221  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
222  return -1;
223 
224  if (!pDstData || !pSrcData)
225  return -1;
226 
227  if (nDstStep == 0)
228  nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
229 
230  if (nSrcStep == 0)
231  nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
232 
233  if (vSrcVFlip)
234  {
235  srcVOffset = (nHeight - 1ll) * nSrcStep;
236  srcVMultiplier = -1;
237  }
238 
239  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
240  return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
241  nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
242  nXSrc, nYSrc, palette, flags, srcVMultiplier,
243  srcVOffset, dstVMultiplier, dstVOffset);
244  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
245  return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
246  nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
247  nXSrc, nYSrc, palette, srcVMultiplier,
248  srcVOffset, dstVMultiplier, dstVOffset, flags);
249  else
250  {
251  primitives_t* gen = primitives_get_generic();
252  return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
253  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
254  }
255 }
256 #endif
257 
258 /* ------------------------------------------------------------------------- */
259 void primitives_init_copy_sse41(primitives_t* prims)
260 {
261 #if defined(SSE2_ENABLED)
262  if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
263  {
264  WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
265  prims->copy_no_overlap = sse_image_copy_no_overlap;
266  }
267 #else
268  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
269  WINPR_UNUSED(prims);
270 #endif
271 }
__copy_no_overlap_t copy_no_overlap
Definition: primitives.h:258