FreeRDP
prim_alphaComp_sse3.c
1 /* FreeRDP: A Remote Desktop Protocol Client
2  * Optimized alpha blending routines.
3  * vi:ts=4 sw=4:
4  *
5  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6  * Licensed under the Apache License, Version 2.0 (the "License"); you may
7  * not use this file except in compliance with the License. You may obtain
8  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12  * or implied. See the License for the specific language governing
13  * permissions and limitations under the License.
14  *
 * Note: this code assumes the second operand is fully opaque,
 * i.e.
17  * newval = alpha1*val1 + (1-alpha1)*val2
18  * rather than
19  * newval = alpha1*val1 + (1-alpha1)*alpha2*val2
20  * The IPP gives other options.
21  */
22 
23 #include <freerdp/config.h>
24 
25 #include <freerdp/types.h>
26 #include <freerdp/primitives.h>
27 #include <winpr/sysinfo.h>
28 
29 #include "prim_alphaComp.h"
30 
31 #include "prim_internal.h"
32 
33 /* ------------------------------------------------------------------------- */
34 #if defined(SSE2_ENABLED)
35 #include <emmintrin.h>
36 #include <pmmintrin.h>
37 
38 static primitives_t* generic = NULL;
39 
40 static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
41  const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
42  BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
43  UINT32 height)
44 {
45  const UINT32* sptr1 = (const UINT32*)pSrc1;
46  const UINT32* sptr2 = (const UINT32*)pSrc2;
47  UINT32* dptr = NULL;
48  int linebytes = 0;
49  int src1Jump = 0;
50  int src2Jump = 0;
51  int dstJump = 0;
52  __m128i xmm0;
53  __m128i xmm1;
54 
55  if ((width <= 0) || (height <= 0))
56  return PRIMITIVES_SUCCESS;
57 
58  if (width < 4) /* pointless if too small */
59  {
60  return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
61  height);
62  }
63 
64  dptr = (UINT32*)pDst;
65  linebytes = width * sizeof(UINT32);
66  src1Jump = (src1Step - linebytes) / sizeof(UINT32);
67  src2Jump = (src2Step - linebytes) / sizeof(UINT32);
68  dstJump = (dstStep - linebytes) / sizeof(UINT32);
69  xmm0 = _mm_set1_epi32(0);
70  xmm1 = _mm_set1_epi16(1);
71 
72  for (UINT32 y = 0; y < height; ++y)
73  {
74  int pixels = width;
75  int count = 0;
76  /* Get to the 16-byte boundary now. */
77  int leadIn = 0;
78 
79  switch ((ULONG_PTR)dptr & 0x0f)
80  {
81  case 0:
82  leadIn = 0;
83  break;
84 
85  case 4:
86  leadIn = 3;
87  break;
88 
89  case 8:
90  leadIn = 2;
91  break;
92 
93  case 12:
94  leadIn = 1;
95  break;
96 
97  default:
98  /* We'll never hit a 16-byte boundary, so do the whole
99  * thing the slow way.
100  */
101  leadIn = width;
102  break;
103  }
104 
105  if (leadIn)
106  {
107  pstatus_t status = 0;
108  status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
109  src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
110  if (status != PRIMITIVES_SUCCESS)
111  return status;
112 
113  sptr1 += leadIn;
114  sptr2 += leadIn;
115  dptr += leadIn;
116  pixels -= leadIn;
117  }
118 
119  /* Use SSE registers to do 4 pixels at a time. */
120  count = pixels >> 2;
121  pixels -= count << 2;
122 
123  while (count--)
124  {
125  __m128i xmm2;
126  __m128i xmm3;
127  __m128i xmm4;
128  __m128i xmm5;
129  __m128i xmm6;
130  __m128i xmm7;
131  /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
132  xmm2 = LOAD_SI128(sptr1);
133  sptr1 += 4;
134  /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
135  xmm3 = LOAD_SI128(sptr2);
136  sptr2 += 4;
137  /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
138  xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
139  /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
140  xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
141  /* subtract */
142  xmm6 = _mm_subs_epi16(xmm4, xmm5);
143  /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
144  xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
145  /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
146  xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
147  /* Add one to alphas */
148  xmm4 = _mm_adds_epi16(xmm4, xmm1);
149  /* Multiply and take low word */
150  xmm4 = _mm_mullo_epi16(xmm4, xmm6);
151  /* Shift 8 right */
152  xmm4 = _mm_srai_epi16(xmm4, 8);
153  /* Add xmm5 */
154  xmm4 = _mm_adds_epi16(xmm4, xmm5);
155  /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
156  /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
157  xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
158  /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
159  xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
160  /* subtract */
161  xmm7 = _mm_subs_epi16(xmm5, xmm6);
162  /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
163  xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
164  /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
165  xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
166  /* Add one to alphas */
167  xmm5 = _mm_adds_epi16(xmm5, xmm1);
168  /* Multiply and take low word */
169  xmm5 = _mm_mullo_epi16(xmm5, xmm7);
170  /* Shift 8 right */
171  xmm5 = _mm_srai_epi16(xmm5, 8);
172  /* Add xmm6 */
173  xmm5 = _mm_adds_epi16(xmm5, xmm6);
174  /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
175  /* Must mask off remainders or pack gets confused */
176  xmm3 = _mm_set1_epi16(0x00ffU);
177  xmm4 = _mm_and_si128(xmm4, xmm3);
178  xmm5 = _mm_and_si128(xmm5, xmm3);
179  /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
180  xmm5 = _mm_packus_epi16(xmm5, xmm4);
181  _mm_store_si128((__m128i*)dptr, xmm5);
182  dptr += 4;
183  }
184 
185  /* Finish off the remainder. */
186  if (pixels)
187  {
188  pstatus_t status = 0;
189  status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
190  src2Step, (BYTE*)dptr, dstStep, pixels, 1);
191  if (status != PRIMITIVES_SUCCESS)
192  return status;
193 
194  sptr1 += pixels;
195  sptr2 += pixels;
196  dptr += pixels;
197  }
198 
199  /* Jump to next row. */
200  sptr1 += src1Jump;
201  sptr2 += src2Jump;
202  dptr += dstJump;
203  }
204 
205  return PRIMITIVES_SUCCESS;
206 }
207 #endif
208 
209 /* ------------------------------------------------------------------------- */
210 void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
211 {
212 #if defined(SSE2_ENABLED)
213  generic = primitives_get_generic();
214  primitives_init_alphaComp(prims);
215 
216  if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
217  IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
218  {
219  WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
220  prims->alphaComp_argb = sse2_alphaComp_argb;
221  }
222 
223 #else
224  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
225  WINPR_UNUSED(prims);
226 #endif
227 }