Merge lp:~schnetter/pocl/main into lp:~pocl/pocl/trunk
- main
- Merge into trunk
Proposed by
Erik Schnetter
Status: | Merged |
---|---|
Merged at revision: | 236 |
Proposed branch: | lp:~schnetter/pocl/main |
Merge into: | lp:~pocl/pocl/trunk |
Diff against target: |
547 lines (+420/-46) 6 files modified
examples/kernel/test_bitselect.cl (+56/-1) lib/kernel/clz.cl (+0/-42) lib/kernel/popcount.cl (+0/-2) lib/kernel/x86_64/abs.cl (+191/-0) lib/kernel/x86_64/copysign.cl (+172/-0) tests/testsuite.at (+1/-1) |
To merge this branch: | bzr merge lp:~schnetter/pocl/main |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Erik Schnetter | Needs Resubmitting | ||
pocl maintainers | Pending | ||
Review via email:
|
Commit message
Description of the change
I have implemented x86-64 optimized (vectorized) versions of copysign and abs.
To post a comment you must log in.
Revision history for this message
![](/+icing/build/overlay/assets/skins/sam/images/close.gif)
Pekka Jääskeläinen (pekka-jaaskelainen) wrote : | # |
Revision history for this message
![](/+icing/build/overlay/assets/skins/sam/images/close.gif)
Erik Schnetter (schnetter) wrote : | # |
This compiler warning is there since the code is type-generic and works for both signed and unsigned types. abs() is indeed a superfluous operation for unsigned types. Since OpenCL does not have templates, we need to use if statements instead.
The code now checks the sign bit explicitly instead of comparing to zero. This silences the warning (until the optimizer catches up). The real solution would be to switch off this warning for this code.
review:
Needs Resubmitting
lp:~schnetter/pocl/main
updated
- 197. By Erik Schnetter
-
Check sign bit explicitly to avoid compiler warning
- 198. By Erik Schnetter
-
Merge from trunk
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'examples/kernel/test_bitselect.cl' |
2 | --- examples/kernel/test_bitselect.cl 2012-01-27 23:38:45 +0000 |
3 | +++ examples/kernel/test_bitselect.cl 2012-04-10 14:24:21 +0000 |
4 | @@ -1,5 +1,8 @@ |
5 | +// TESTING: abs |
6 | // TESTING: bitselect |
7 | // TESTING: clz |
8 | +// TESTING: max |
9 | +// TESTING: min |
10 | // TESTING: popcount |
11 | |
12 | #define IMPLEMENT_BODY_G(NAME, BODY, GTYPE, SGTYPE, UGTYPE, SUGTYPE) \ |
13 | @@ -1144,7 +1147,13 @@ |
14 | gtype v; |
15 | sgtype s[16]; |
16 | } Tvec; |
17 | - Tvec sel, left, right, res_bitselect, res_clz, res_popcount; |
18 | + typedef union { |
19 | + ugtype v; |
20 | + sugtype s[16]; |
21 | + } UTvec; |
22 | + Tvec sel, left, right; |
23 | + UTvec res_abs; |
24 | + Tvec res_bitselect, res_clz, res_max, res_min, res_popcount; |
25 | int vecsize = vec_step(gtype); |
26 | for (int n=0; n<vecsize; ++n) { |
27 | sel.s[n] = randoms[(iter+n ) % nrandoms]; |
28 | @@ -1156,10 +1165,32 @@ |
29 | right.s[n] = (right.s[n] << (bits/2)) | randoms[(iter+n+140) % nrandoms]; |
30 | } |
31 | } |
32 | + res_abs.v = abs(left.v); |
33 | res_bitselect.v = bitselect(left.v, right.v, sel.v); |
34 | res_clz.v = clz(left.v); |
35 | + res_max.v = max(left.v, right.v); |
36 | + res_min.v = min(left.v, right.v); |
37 | res_popcount.v = popcount(left.v); |
38 | bool equal; |
39 | + // abs |
40 | + equal = true; |
41 | + for (int n=0; n<vecsize; ++n) { |
42 | + sgtype signbit = (sgtype)1 << (sgtype)(count_bits(sgtype)-1); |
43 | + // Note: left.s[n] < 0 leads to a compiler warning for unsigned types, |
44 | + // so we check the sign bit explicitly |
45 | + sugtype absval = |
46 | + is_signed(sgtype) ? |
47 | + (left.s[n] & signbit ? -left.s[n] : left.s[n]) : |
48 | + left.s[n]; |
49 | + equal = equal && res_abs.s[n] == absval; |
50 | + } |
51 | + if (!equal) { |
52 | + printf("FAIL: abs type=%s a=0x%08x c=0x%08x\n", |
53 | + typename, |
54 | + (uint)left.s[0], |
55 | + (uint)res_abs.s[0]); |
56 | + return; |
57 | + } |
58 | // bitselect |
59 | equal = true; |
60 | for (int n=0; n<vecsize; ++n) { |
61 | @@ -1190,6 +1221,30 @@ |
62 | (uint)left.s[0], (uint)res_clz.s[0]); |
63 | return; |
64 | } |
65 | + // max |
66 | + equal = true; |
67 | + for (int n=0; n<vecsize; ++n) { |
68 | + equal = equal && res_max.s[n] == (left.s[n] > right.s[n] ? left.s[n] : right.s[n]); |
69 | + } |
70 | + if (!equal) { |
71 | + printf("FAIL: max type=%s a=0x%08x b=0x%08x c=0x%08x\n", |
72 | + typename, |
73 | + (uint)left.s[0], (uint)right.s[0], |
74 | + (uint)res_max.s[0]); |
75 | + return; |
76 | + } |
77 | + // min |
78 | + equal = true; |
79 | + for (int n=0; n<vecsize; ++n) { |
80 | + equal = equal && res_min.s[n] == (left.s[n] < right.s[n] ? left.s[n] : right.s[n]); |
81 | + } |
82 | + if (!equal) { |
83 | + printf("FAIL: min type=%s a=0x%08x b=0x%08x c=0x%08x\n", |
84 | + typename, |
85 | + (uint)left.s[0], (uint)right.s[0], |
86 | + (uint)res_min.s[0]); |
87 | + return; |
88 | + } |
89 | // popcount |
90 | equal = true; |
91 | for (int n=0; n<vecsize; ++n) { |
92 | |
93 | === modified file 'lib/kernel/clz.cl' |
94 | --- lib/kernel/clz.cl 2011-12-05 13:38:02 +0000 |
95 | +++ lib/kernel/clz.cl 2012-04-10 14:24:21 +0000 |
96 | @@ -23,8 +23,6 @@ |
97 | |
98 | #include "templates.h" |
99 | |
100 | -// Intel: LZCNT |
101 | - |
102 | #define __builtin_clzhh __builtin_clz |
103 | #define __builtin_clzh __builtin_clz |
104 | #define __builtin_clzuhh __builtin_clz |
105 | @@ -33,43 +31,3 @@ |
106 | #define __builtin_clzul __builtin_clzl |
107 | |
108 | DEFINE_BUILTIN_G_G(clz) |
109 | - |
110 | -#if 0 |
111 | - |
112 | -/* Count ones */ |
113 | -#define CO(b) \ |
114 | - ({ \ |
115 | - ugtype c = b; \ |
116 | - int bitmask = CHAR_BIT * sizeof(sugtype) - 1; \ |
117 | - c -= ((c >> (sugtype)1) & (ugtype)0x5555555555555555UL); \ |
118 | - c = (((c >> (sugtype)2) & (ugtype)0x3333333333333333UL) + \ |
119 | - (c & (ugtype)0x3333333333333333UL)); \ |
120 | - c = (((c >> (sugtype)4) + c) & (ugtype)0x0f0f0f0f0f0f0f0fUL); \ |
121 | - c += (c >> (sugtype)( 8 & bitmask)); \ |
122 | - c += (c >> (sugtype)(16 & bitmask)); \ |
123 | - c += (c >> (sugtype)(32 & bitmask)); \ |
124 | - c & (ugtype)0xff; \ |
125 | - }) |
126 | - |
127 | -/* Count leading zeros */ |
128 | -#define CLZ(a) \ |
129 | - ({ \ |
130 | - ugtype b = a; \ |
131 | - sugtype bits = CHAR_BIT * sizeof(sugtype); \ |
132 | - int bitmask = CHAR_BIT * sizeof(sugtype) - 1; \ |
133 | - b |= (b >> (sugtype)1); \ |
134 | - b |= (b >> (sugtype)2); \ |
135 | - b |= (b >> (sugtype)4); \ |
136 | - b |= (b >> (sugtype)( 8 & bitmask)); \ |
137 | - b |= (b >> (sugtype)(16 & bitmask)); \ |
138 | - b |= (b >> (sugtype)(32 & bitmask)); \ |
139 | - (ugtype)bits - CO(b); \ |
140 | - }) |
141 | - |
142 | -DEFINE_EXPR_G_G(clz, |
143 | - ({ |
144 | - ugtype lz = CLZ(*(ugtype*)&a); |
145 | - *(gtype*)&lz; |
146 | - })) |
147 | - |
148 | -#endif |
149 | |
150 | === modified file 'lib/kernel/popcount.cl' |
151 | --- lib/kernel/popcount.cl 2011-12-05 16:36:58 +0000 |
152 | +++ lib/kernel/popcount.cl 2012-04-10 14:24:21 +0000 |
153 | @@ -23,8 +23,6 @@ |
154 | |
155 | #include "templates.h" |
156 | |
157 | -// Intel: POPCNT |
158 | - |
159 | #define __builtin_popcounthh __builtin_popcount |
160 | #define __builtin_popcounth __builtin_popcount |
161 | #define __builtin_popcountuhh __builtin_popcount |
162 | |
163 | === added file 'lib/kernel/x86_64/abs.cl' |
164 | --- lib/kernel/x86_64/abs.cl 1970-01-01 00:00:00 +0000 |
165 | +++ lib/kernel/x86_64/abs.cl 2012-04-10 14:24:21 +0000 |
166 | @@ -0,0 +1,191 @@ |
167 | +/* OpenCL built-in library: abs() |
168 | + |
169 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
170 | + |
171 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
172 | + of this software and associated documentation files (the "Software"), to deal |
173 | + in the Software without restriction, including without limitation the rights |
174 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
175 | + copies of the Software, and to permit persons to whom the Software is |
176 | + furnished to do so, subject to the following conditions: |
177 | + |
178 | + The above copyright notice and this permission notice shall be included in |
179 | + all copies or substantial portions of the Software. |
180 | + |
181 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
182 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
183 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
184 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
185 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
186 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
187 | + THE SOFTWARE. |
188 | +*/ |
189 | + |
190 | +#include "../templates.h" |
191 | + |
192 | +#define IMPLEMENT_DIRECT(NAME, TYPE, UTYPE, EXPR) \ |
193 | + UTYPE _cl_overloadable NAME(TYPE a) \ |
194 | + { \ |
195 | + typedef TYPE gtype; \ |
196 | + typedef UTYPE ugtype; \ |
197 | + return EXPR; \ |
198 | + } |
199 | + |
200 | +#define IMPLEMENT_UPCAST(NAME, TYPE, UTYPE, UPTYPE, LO) \ |
201 | + UTYPE _cl_overloadable NAME(TYPE a) \ |
202 | + { \ |
203 | + UPTYPE a1; \ |
204 | + a1.LO = a; \ |
205 | + return NAME(a1).LO; \ |
206 | + } |
207 | + |
208 | +#define IMPLEMENT_SPLIT(NAME, TYPE, UTYPE, LO, HI) \ |
209 | + UTYPE _cl_overloadable NAME(TYPE a) \ |
210 | + { \ |
211 | + return (UTYPE)(NAME(a.LO), NAME(a.HI)); \ |
212 | + } |
213 | + |
214 | + |
215 | + |
216 | +#define IMPLEMENT_ABS_DIRECT_UNSIGNED (a) |
217 | + |
218 | +#define IMPLEMENT_ABS_BUILTIN_INT __builtin_abs(a) |
219 | + |
220 | +#define IMPLEMENT_ABS_DIRECT \ |
221 | + ({ \ |
222 | + a = a<(gtype)0 ? -a : a; \ |
223 | + *(ugtype*)&a; \ |
224 | + }) |
225 | + |
226 | +#define IMPLEMENT_ABS_SSSE3_CHAR16 \ |
227 | + ({ \ |
228 | + __asm__ ("pabsb %[src], %[dst]" : \ |
229 | + [dst] "=x" (a) : \ |
230 | + [src] "x" (a)); \ |
231 | + *(ugtype*)&a; \ |
232 | + }) |
233 | +#define IMPLEMENT_ABS_AVX2_CHAR32 \ |
234 | + ({ \ |
235 | + __asm__ ("pabsb256 %[src], %[dst]" : \ |
236 | + [dst] "=x" (a) : \ |
237 | + [src] "x" (a)); \ |
238 | + *(ugtype)&a; \ |
239 | + }) |
240 | +#define IMPLEMENT_ABS_SSSE3_SHORT8 \ |
241 | + ({ \ |
242 | + __asm__ ("pabsw %[src], %[dst]" : \ |
243 | + [dst] "=x" (a) : \ |
244 | + [src] "x" (a)); \ |
245 | + *(ugtype*)&a; \ |
246 | + }) |
247 | +#define IMPLEMENT_ABS_AVX2_SHORT16 \ |
248 | + ({ \ |
249 | + __asm__ ("pabsw256 %[src], %[dst]" : \ |
250 | + [dst] "=x" (a) : \ |
251 | + [src] "x" (a)); \ |
252 | + *(ugtype*)&a; \ |
253 | + }) |
254 | +#define IMPLEMENT_ABS_SSSE3_INT4 \ |
255 | + ({ \ |
256 | + __asm__ ("pabsd %[src], %[dst]" : \ |
257 | + [dst] "=x" (a) : \ |
258 | + [src] "x" (a)); \ |
259 | + *(ugtype*)&a; \ |
260 | + }) |
261 | +#define IMPLEMENT_ABS_AVX2_INT8 \ |
262 | + ({ \ |
263 | + __asm__ ("pabsd256 %[src], %[dst]" : \ |
264 | + [dst] "=x" (a) : \ |
265 | + [src] "x" (a)); \ |
266 | + *(ugtype*)&a; \ |
267 | + }) |
268 | + |
269 | + |
270 | + |
271 | +IMPLEMENT_DIRECT(abs, char , uchar , IMPLEMENT_ABS_DIRECT) |
272 | +#ifdef __SSSE3__ |
273 | +IMPLEMENT_UPCAST(abs, char2 , uchar2 , char4 , lo) |
274 | +IMPLEMENT_UPCAST(abs, char3 , uchar3 , char4 , s012) |
275 | +IMPLEMENT_UPCAST(abs, char4 , uchar4 , char8 , lo) |
276 | +IMPLEMENT_UPCAST(abs, char8 , uchar8 , char16, lo) |
277 | +IMPLEMENT_DIRECT(abs, char16, uchar16, IMPLEMENT_ABS_SSSE3_CHAR16) |
278 | +#else |
279 | +IMPLEMENT_DIRECT(abs, char2 , uchar2 , IMPLEMENT_ABS_DIRECT) |
280 | +IMPLEMENT_DIRECT(abs, char3 , uchar3 , IMPLEMENT_ABS_DIRECT) |
281 | +IMPLEMENT_DIRECT(abs, char4 , uchar4 , IMPLEMENT_ABS_DIRECT) |
282 | +IMPLEMENT_DIRECT(abs, char8 , uchar8 , IMPLEMENT_ABS_DIRECT) |
283 | +IMPLEMENT_DIRECT(abs, char16, uchar16, IMPLEMENT_ABS_DIRECT) |
284 | +#endif |
285 | + |
286 | +IMPLEMENT_DIRECT(abs, uchar , uchar , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
287 | +IMPLEMENT_DIRECT(abs, uchar2 , uchar2 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
288 | +IMPLEMENT_DIRECT(abs, uchar3 , uchar3 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
289 | +IMPLEMENT_DIRECT(abs, uchar4 , uchar4 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
290 | +IMPLEMENT_DIRECT(abs, uchar8 , uchar8 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
291 | +IMPLEMENT_DIRECT(abs, uchar16, uchar16, IMPLEMENT_ABS_DIRECT_UNSIGNED) |
292 | + |
293 | +IMPLEMENT_DIRECT(abs, short , ushort , IMPLEMENT_ABS_DIRECT) |
294 | +#ifdef __SSSE3__ |
295 | +IMPLEMENT_UPCAST(abs, short2 , ushort2 , short4, lo) |
296 | +IMPLEMENT_UPCAST(abs, short3 , ushort3 , short4, s012) |
297 | +IMPLEMENT_UPCAST(abs, short4 , ushort4 , short8, lo) |
298 | +IMPLEMENT_DIRECT(abs, short8 , ushort8 , IMPLEMENT_ABS_SSSE3_SHORT8) |
299 | +# ifdef __AVX2__ |
300 | +IMPLEMENT_DIRECT(abs, short16, ushort16, IMPLEMENT_ABS_AVX2_SHORT16) |
301 | +# else |
302 | +IMPLEMENT_SPLIT (abs, short16, ushort16, lo, hi) |
303 | +# endif |
304 | +#else |
305 | +IMPLEMENT_DIRECT(abs, short2 , ushort2 , IMPLEMENT_ABS_DIRECT) |
306 | +IMPLEMENT_DIRECT(abs, short3 , ushort3 , IMPLEMENT_ABS_DIRECT) |
307 | +IMPLEMENT_DIRECT(abs, short4 , ushort4 , IMPLEMENT_ABS_DIRECT) |
308 | +IMPLEMENT_DIRECT(abs, short8 , ushort8 , IMPLEMENT_ABS_DIRECT) |
309 | +IMPLEMENT_DIRECT(abs, short16, ushort16, IMPLEMENT_ABS_DIRECT) |
310 | +#endif |
311 | + |
312 | +IMPLEMENT_DIRECT(abs, ushort , ushort , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
313 | +IMPLEMENT_DIRECT(abs, ushort2 , ushort2 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
314 | +IMPLEMENT_DIRECT(abs, ushort3 , ushort3 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
315 | +IMPLEMENT_DIRECT(abs, ushort4 , ushort4 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
316 | +IMPLEMENT_DIRECT(abs, ushort8 , ushort8 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
317 | +IMPLEMENT_DIRECT(abs, ushort16, ushort16, IMPLEMENT_ABS_DIRECT_UNSIGNED) |
318 | + |
319 | +IMPLEMENT_DIRECT(abs, int , uint , IMPLEMENT_ABS_BUILTIN_INT) |
320 | +#ifdef __SSSE3__ |
321 | +IMPLEMENT_UPCAST(abs, int2 , uint2 , int4, lo) |
322 | +IMPLEMENT_UPCAST(abs, int3 , uint3 , int4, s012) |
323 | +IMPLEMENT_DIRECT(abs, int4 , uint4 , IMPLEMENT_ABS_SSSE3_INT4) |
324 | +# ifdef __AVX2__ |
325 | +IMPLEMENT_DIRECT(abs, int8 , uint8 , IMPLEMENT_ABS_AVX2_INT8) |
326 | +# else |
327 | +IMPLEMENT_SPLIT (abs, int8 , uint8 , lo, hi) |
328 | +#endif |
329 | +IMPLEMENT_SPLIT (abs, int16, uint16, lo, hi) |
330 | +#else |
331 | +IMPLEMENT_DIRECT(abs, int2 , uint2 , IMPLEMENT_ABS_DIRECT) |
332 | +IMPLEMENT_DIRECT(abs, int3 , uint3 , IMPLEMENT_ABS_DIRECT) |
333 | +IMPLEMENT_DIRECT(abs, int4 , uint4 , IMPLEMENT_ABS_DIRECT) |
334 | +IMPLEMENT_DIRECT(abs, int8 , uint8 , IMPLEMENT_ABS_DIRECT) |
335 | +IMPLEMENT_DIRECT(abs, int16, uint16, IMPLEMENT_ABS_DIRECT) |
336 | +#endif |
337 | + |
338 | +IMPLEMENT_DIRECT(abs, uint , uint , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
339 | +IMPLEMENT_DIRECT(abs, uint2 , uint2 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
340 | +IMPLEMENT_DIRECT(abs, uint3 , uint3 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
341 | +IMPLEMENT_DIRECT(abs, uint4 , uint4 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
342 | +IMPLEMENT_DIRECT(abs, uint8 , uint8 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
343 | +IMPLEMENT_DIRECT(abs, uint16, uint16, IMPLEMENT_ABS_DIRECT_UNSIGNED) |
344 | + |
345 | +IMPLEMENT_DIRECT(abs, long , ulong , IMPLEMENT_ABS_DIRECT) |
346 | +IMPLEMENT_DIRECT(abs, long2 , ulong2 , IMPLEMENT_ABS_DIRECT) |
347 | +IMPLEMENT_DIRECT(abs, long3 , ulong3 , IMPLEMENT_ABS_DIRECT) |
348 | +IMPLEMENT_DIRECT(abs, long4 , ulong4 , IMPLEMENT_ABS_DIRECT) |
349 | +IMPLEMENT_DIRECT(abs, long8 , ulong8 , IMPLEMENT_ABS_DIRECT) |
350 | +IMPLEMENT_DIRECT(abs, long16, ulong16, IMPLEMENT_ABS_DIRECT) |
351 | + |
352 | +IMPLEMENT_DIRECT(abs, ulong , ulong , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
353 | +IMPLEMENT_DIRECT(abs, ulong2 , ulong2 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
354 | +IMPLEMENT_DIRECT(abs, ulong3 , ulong3 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
355 | +IMPLEMENT_DIRECT(abs, ulong4 , ulong4 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
356 | +IMPLEMENT_DIRECT(abs, ulong8 , ulong8 , IMPLEMENT_ABS_DIRECT_UNSIGNED) |
357 | +IMPLEMENT_DIRECT(abs, ulong16, ulong16, IMPLEMENT_ABS_DIRECT_UNSIGNED) |
358 | |
359 | === added file 'lib/kernel/x86_64/copysign.cl' |
360 | --- lib/kernel/x86_64/copysign.cl 1970-01-01 00:00:00 +0000 |
361 | +++ lib/kernel/x86_64/copysign.cl 2012-04-10 14:24:21 +0000 |
362 | @@ -0,0 +1,172 @@ |
363 | +/* OpenCL built-in library: copysign() |
364 | + |
365 | + Copyright (c) 2012 Universidad Rey Juan Carlos |
366 | + |
367 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
368 | + of this software and associated documentation files (the "Software"), to deal |
369 | + in the Software without restriction, including without limitation the rights |
370 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
371 | + copies of the Software, and to permit persons to whom the Software is |
372 | + furnished to do so, subject to the following conditions: |
373 | + |
374 | + The above copyright notice and this permission notice shall be included in |
375 | + all copies or substantial portions of the Software. |
376 | + |
377 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
378 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
379 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
380 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
381 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
382 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
383 | + THE SOFTWARE. |
384 | +*/ |
385 | + |
386 | +#if 0 |
387 | + |
388 | +#include "../templates.h" |
389 | + |
390 | +// LLVM generates non-optimal code for this implementation |
391 | +DEFINE_EXPR_V_VV(copysign, |
392 | + ({ |
393 | + int bits = CHAR_BIT * sizeof(stype); |
394 | + sjtype sign_mask = (sjtype)1 << (sjtype)(bits - 1); |
395 | + sjtype result = |
396 | + (~sign_mask & *(jtype*)&a) | (sign_mask & *(jtype*)&b); |
397 | + *(vtype*)&result; |
398 | + })) |
399 | + |
400 | +#endif |
401 | + |
402 | + |
403 | + |
404 | +#define IMPLEMENT_DIRECT(NAME, TYPE, EXPR) \ |
405 | + TYPE _cl_overloadable NAME(TYPE a, TYPE b) \ |
406 | + { \ |
407 | + return EXPR; \ |
408 | + } |
409 | + |
410 | +#define IMPLEMENT_UPCAST(NAME, TYPE, UPTYPE, LO) \ |
411 | + TYPE _cl_overloadable NAME(TYPE a, TYPE b) \ |
412 | + { \ |
413 | + UPTYPE a1, b1; \ |
414 | + a1.LO = a; \ |
415 | + b1.LO = b; \ |
416 | + return NAME(a1, b1).LO; \ |
417 | + } |
418 | + |
419 | +#define IMPLEMENT_SPLIT(NAME, TYPE, LO, HI) \ |
420 | + TYPE _cl_overloadable NAME(TYPE a, TYPE b) \ |
421 | + { \ |
422 | + return (TYPE)(NAME(a.LO, b.LO), NAME(a.HI, b.HI)); \ |
423 | + } |
424 | + |
425 | + |
426 | + |
427 | +#define IMPLEMENT_COPYSIGN_DIRECT \ |
428 | + ({ \ |
429 | + int bits = CHAR_BIT * sizeof(stype); \ |
430 | + jtype sign_mask = (jtype)1 << (jtype)(bits - 1); \ |
431 | + jtype result = (~sign_mask & *(jtype*)&a) | (sign_mask & *(jtype*)&b); \ |
432 | + *(vtype*)&result; \ |
433 | + }) |
434 | +#define IMPLEMENT_COPYSIGN_SSE_FLOAT4 \ |
435 | + ({ \ |
436 | + uint4 sign_mask = {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}; \ |
437 | + __asm__ ("andps %[src], %[dst]" : \ |
438 | + [dst] "+x" (a) : \ |
439 | + [src] "xm" (~sign_mask)); \ |
440 | + __asm__ ("andps %[src], %[dst]" : \ |
441 | + [dst] "+x" (b) : \ |
442 | + [src] "xm" (sign_mask)); \ |
443 | + __asm__ ("orps %[src], %[dst]" : \ |
444 | + [dst] "+x" (a) : \ |
445 | + [src] "xm" (b)); \ |
446 | + a; \ |
447 | + }) |
448 | +#define IMPLEMENT_COPYSIGN_AVX_FLOAT8 \ |
449 | + ({ \ |
450 | + uint8 sign_mask = {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, \ |
451 | + 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}; \ |
452 | + __asm__ ("andps256 %[src], %[dst]" : \ |
453 | + [dst] "+x" (a) : \ |
454 | + [src] "xm" (~sign_mask)); \ |
455 | + __asm__ ("andps256 %[src], %[dst]" : \ |
456 | + [dst] "+x" (b) : \ |
457 | + [src] "xm" (sign_mask)); \ |
458 | + __asm__ ("orps256 %[src], %[dst]" : \ |
459 | + [dst] "+x" (a) : \ |
460 | + [src] "xm" (b)); \ |
461 | + a; \ |
462 | + }) |
463 | +#define IMPLEMENT_COPYSIGN_SSE2_DOUBLE2 \ |
464 | + ({ \ |
465 | + ulong2 sign_mask = {0x8000000000000000UL, 0x8000000000000000UL}; \ |
466 | + __asm__ ("andpd %[src], %[dst]" : \ |
467 | + [dst] "+x" (a) : \ |
468 | + [src] "xm" (~sign_mask)); \ |
469 | + __asm__ ("andpd %[src], %[dst]" : \ |
470 | + [dst] "+x" (b) : \ |
471 | + [src] "xm" (sign_mask)); \ |
472 | + __asm__ ("orpd %[src], %[dst]" : \ |
473 | + [dst] "+x" (a) : \ |
474 | + [src] "xm" (b)); \ |
475 | + a; \ |
476 | + }) |
477 | +#define IMPLEMENT_COPYSIGN_AVX_DOUBLE4 \ |
478 | + ({ \ |
479 | + ulong4 sign_mask = {0x8000000000000000UL, 0x8000000000000000UL, \ |
480 | + 0x8000000000000000UL, 0x8000000000000000UL}; \ |
481 | + __asm__ ("andpd256 %[src], %[dst]" : \ |
482 | + [dst] "+x" (a) : \ |
483 | + [src] "xm" (~sign_mask)); \ |
484 | + __asm__ ("andpd256 %[src], %[dst]" : \ |
485 | + [dst] "+x" (b) : \ |
486 | + [src] "xm" (sign_mask)); \ |
487 | + __asm__ ("orpd256 %[src], %[dst]" : \ |
488 | + [dst] "+x" (a) : \ |
489 | + [src] "xm" (b)); \ |
490 | + a; \ |
491 | + }) |
492 | + |
493 | + |
494 | + |
495 | +#ifdef __SSE__ |
496 | +IMPLEMENT_UPCAST(copysign, float , float2, lo) |
497 | +IMPLEMENT_UPCAST(copysign, float2 , float4, lo) |
498 | +IMPLEMENT_UPCAST(copysign, float3 , float4, s012) |
499 | +IMPLEMENT_DIRECT(copysign, float4 , IMPLEMENT_COPYSIGN_SSE_FLOAT4) |
500 | +# ifdef __AVX__ |
501 | +IMPLEMENT_DIRECT(copysign, float8 , IMPLEMENT_COPYSIGN_AVX_FLOAT8) |
502 | +# else |
503 | +IMPLEMENT_SPLIT (copysign, float8 , lo, hi) |
504 | +# endif |
505 | +IMPLEMENT_SPLIT (copysign, float16, lo, hi) |
506 | +#else |
507 | +IMPLEMENT_DIRECT(copysign, float , IMPLEMENT_COPYSIGN_DIRECT) |
508 | +IMPLEMENT_DIRECT(copysign, float2 , IMPLEMENT_COPYSIGN_DIRECT) |
509 | +IMPLEMENT_DIRECT(copysign, float3 , IMPLEMENT_COPYSIGN_DIRECT) |
510 | +IMPLEMENT_DIRECT(copysign, float4 , IMPLEMENT_COPYSIGN_DIRECT) |
511 | +IMPLEMENT_DIRECT(copysign, float8 , IMPLEMENT_COPYSIGN_DIRECT) |
512 | +IMPLEMENT_DIRECT(copysign, float16, IMPLEMENT_COPYSIGN_DIRECT) |
513 | +#endif |
514 | + |
515 | +#ifdef __SSE2__ |
516 | +IMPLEMENT_UPCAST(copysign, double , double2, lo) |
517 | +IMPLEMENT_DIRECT(copysign, double2 , IMPLEMENT_COPYSIGN_SSE2_DOUBLE2) |
518 | +# ifdef __AVX__ |
519 | +IMPLEMENT_UPCAST(copysign, double3 , double4, s012) |
520 | +IMPLEMENT_DIRECT(copysign, double4 , IMPLEMENT_COPYSIGN_AVX_DOUBLE4) |
521 | +# else |
522 | +IMPLEMENT_SPLIT (copysign, double3 , lo, s2) |
523 | +IMPLEMENT_SPLIT (copysign, double4 , lo, hi) |
524 | +# endif |
525 | +IMPLEMENT_SPLIT (copysign, double8 , lo, hi) |
526 | +IMPLEMENT_SPLIT (copysign, double16, lo, hi) |
527 | +#else |
528 | +IMPLEMENT_DIRECT(copysign, double , IMPLEMENT_COPYSIGN_DIRECT) |
529 | +IMPLEMENT_DIRECT(copysign, double2 , IMPLEMENT_COPYSIGN_DIRECT) |
530 | +IMPLEMENT_DIRECT(copysign, double3 , IMPLEMENT_COPYSIGN_DIRECT) |
531 | +IMPLEMENT_DIRECT(copysign, double4 , IMPLEMENT_COPYSIGN_DIRECT) |
532 | +IMPLEMENT_DIRECT(copysign, double8 , IMPLEMENT_COPYSIGN_DIRECT) |
533 | +IMPLEMENT_DIRECT(copysign, double16, IMPLEMENT_COPYSIGN_DIRECT) |
534 | +#endif |
535 | |
536 | === modified file 'tests/testsuite.at' |
537 | --- tests/testsuite.at 2012-03-27 10:48:08 +0000 |
538 | +++ tests/testsuite.at 2012-04-10 14:24:21 +0000 |
539 | @@ -167,7 +167,7 @@ |
540 | |
541 | AT_BANNER([Kernel runtime library]) |
542 | |
543 | -AT_SETUP([Kernel functions bitselect clz popcount]) |
544 | +AT_SETUP([Kernel functions abs bitselect clz max min popcount]) |
545 | AT_DATA([expout], |
546 | [Running test test_bitselect... |
547 | OK |
'make check' fails (11: Kernel functions abs bitselect clz max min popcount FAILED (testsuite.at:175)) due to a warning: /program.cl:126:461386: warning: comparison of unsigned expression < 0 is always false [-Wtautological-compare] ... char() { typedef char gtype; typedef char sgtype; typedef uchar ugtype; typedef uchar sugtype; char const *const typename = "char"; ({ typedef int aisgtype[ (!((sgtype) 0.1 > ...
pocl643LiL/
oid test_bitselect_