Merge lp:~schnetter/pocl/main into lp:~pocl/pocl/trunk

Proposed by Erik Schnetter
Status: Merged
Merged at revision: 236
Proposed branch: lp:~schnetter/pocl/main
Merge into: lp:~pocl/pocl/trunk
Diff against target: 547 lines (+420/-46)
6 files modified
examples/kernel/test_bitselect.cl (+56/-1)
lib/kernel/clz.cl (+0/-42)
lib/kernel/popcount.cl (+0/-2)
lib/kernel/x86_64/abs.cl (+191/-0)
lib/kernel/x86_64/copysign.cl (+172/-0)
tests/testsuite.at (+1/-1)
To merge this branch: bzr merge lp:~schnetter/pocl/main
Reviewer Review Type Date Requested Status
Erik Schnetter Needs Resubmitting
pocl maintaners Pending
Review via email: mp+101128@code.launchpad.net

Description of the change

I have implemented x86-64 optimized (vectorized) versions of copysign and abs.

To post a comment you must log in.
Revision history for this message
Pekka Jääskeläinen (pekka-jaaskelainen) wrote :

'make check' fails (11: Kernel functions abs bitselect clz max min popcount FAILED (testsuite.at:175)) due to a warning:
pocl643LiL//program.cl:126:461386: warning: comparison of unsigned expression < 0 is always false [-Wtautological-compare] ...
oid test_bitselect_char() { typedef char gtype; typedef char sgtype; typedef uchar ugtype; typedef uchar sugtype; char const *const typename = "char"; ({ typedef int aisgtype[(!((sgtype)0.1 > ...

Revision history for this message
Erik Schnetter (schnetter) wrote :

This compiler warning is there since the code is type-generic and works for both signed and unsigned types. abs() is indeed a superfluous operation for unsigned types. Since OpenCL does not have templates, we need to use if statements instead.

The code now checks the sign bit explicitly instead of comparing to zero. This silences the warning (until the optimizer catches up). The real solution would be to switch off this warning for this code.

review: Needs Resubmitting
lp:~schnetter/pocl/main updated
197. By Erik Schnetter

Check sign bit explicitly to avoid compiler warning

198. By Erik Schnetter

Merge from trunk

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'examples/kernel/test_bitselect.cl'
2--- examples/kernel/test_bitselect.cl 2012-01-27 23:38:45 +0000
3+++ examples/kernel/test_bitselect.cl 2012-04-10 14:24:21 +0000
4@@ -1,5 +1,8 @@
5+// TESTING: abs
6 // TESTING: bitselect
7 // TESTING: clz
8+// TESTING: max
9+// TESTING: min
10 // TESTING: popcount
11
12 #define IMPLEMENT_BODY_G(NAME, BODY, GTYPE, SGTYPE, UGTYPE, SUGTYPE) \
13@@ -1144,7 +1147,13 @@
14 gtype v;
15 sgtype s[16];
16 } Tvec;
17- Tvec sel, left, right, res_bitselect, res_clz, res_popcount;
18+ typedef union {
19+ ugtype v;
20+ sugtype s[16];
21+ } UTvec;
22+ Tvec sel, left, right;
23+ UTvec res_abs;
24+ Tvec res_bitselect, res_clz, res_max, res_min, res_popcount;
25 int vecsize = vec_step(gtype);
26 for (int n=0; n<vecsize; ++n) {
27 sel.s[n] = randoms[(iter+n ) % nrandoms];
28@@ -1156,10 +1165,32 @@
29 right.s[n] = (right.s[n] << (bits/2)) | randoms[(iter+n+140) % nrandoms];
30 }
31 }
32+ res_abs.v = abs(left.v);
33 res_bitselect.v = bitselect(left.v, right.v, sel.v);
34 res_clz.v = clz(left.v);
35+ res_max.v = max(left.v, right.v);
36+ res_min.v = min(left.v, right.v);
37 res_popcount.v = popcount(left.v);
38 bool equal;
39+ // abs
40+ equal = true;
41+ for (int n=0; n<vecsize; ++n) {
42+ sgtype signbit = (sgtype)1 << (sgtype)(count_bits(sgtype)-1);
43+ // Note: left.s[n] < 0 leads to a compiler warning for unsigned types,
44+ // so we check the sign bit explicitly
45+ sugtype absval =
46+ is_signed(sgtype) ?
47+ (left.s[n] & signbit ? -left.s[n] : left.s[n]) :
48+ left.s[n];
49+ equal = equal && res_abs.s[n] == absval;
50+ }
51+ if (!equal) {
52+ printf("FAIL: abs type=%s a=0x%08x c=0x%08x\n",
53+ typename,
54+ (uint)left.s[0],
55+ (uint)res_abs.s[0]);
56+ return;
57+ }
58 // bitselect
59 equal = true;
60 for (int n=0; n<vecsize; ++n) {
61@@ -1190,6 +1221,30 @@
62 (uint)left.s[0], (uint)res_clz.s[0]);
63 return;
64 }
65+ // max
66+ equal = true;
67+ for (int n=0; n<vecsize; ++n) {
68+ equal = equal && res_max.s[n] == (left.s[n] > right.s[n] ? left.s[n] : right.s[n]);
69+ }
70+ if (!equal) {
71+ printf("FAIL: max type=%s a=0x%08x b=0x%08x c=0x%08x\n",
72+ typename,
73+ (uint)left.s[0], (uint)right.s[0],
74+ (uint)res_max.s[0]);
75+ return;
76+ }
77+ // min
78+ equal = true;
79+ for (int n=0; n<vecsize; ++n) {
80+ equal = equal && res_min.s[n] == (left.s[n] < right.s[n] ? left.s[n] : right.s[n]);
81+ }
82+ if (!equal) {
83+ printf("FAIL: min type=%s a=0x%08x b=0x%08x c=0x%08x\n",
84+ typename,
85+ (uint)left.s[0], (uint)right.s[0],
86+ (uint)res_min.s[0]);
87+ return;
88+ }
89 // popcount
90 equal = true;
91 for (int n=0; n<vecsize; ++n) {
92
93=== modified file 'lib/kernel/clz.cl'
94--- lib/kernel/clz.cl 2011-12-05 13:38:02 +0000
95+++ lib/kernel/clz.cl 2012-04-10 14:24:21 +0000
96@@ -23,8 +23,6 @@
97
98 #include "templates.h"
99
100-// Intel: LZCNT
101-
102 #define __builtin_clzhh __builtin_clz
103 #define __builtin_clzh __builtin_clz
104 #define __builtin_clzuhh __builtin_clz
105@@ -33,43 +31,3 @@
106 #define __builtin_clzul __builtin_clzl
107
108 DEFINE_BUILTIN_G_G(clz)
109-
110-#if 0
111-
112-/* Count ones */
113-#define CO(b) \
114- ({ \
115- ugtype c = b; \
116- int bitmask = CHAR_BIT * sizeof(sugtype) - 1; \
117- c -= ((c >> (sugtype)1) & (ugtype)0x5555555555555555UL); \
118- c = (((c >> (sugtype)2) & (ugtype)0x3333333333333333UL) + \
119- (c & (ugtype)0x3333333333333333UL)); \
120- c = (((c >> (sugtype)4) + c) & (ugtype)0x0f0f0f0f0f0f0f0fUL); \
121- c += (c >> (sugtype)( 8 & bitmask)); \
122- c += (c >> (sugtype)(16 & bitmask)); \
123- c += (c >> (sugtype)(32 & bitmask)); \
124- c & (ugtype)0xff; \
125- })
126-
127-/* Count leading zeros */
128-#define CLZ(a) \
129- ({ \
130- ugtype b = a; \
131- sugtype bits = CHAR_BIT * sizeof(sugtype); \
132- int bitmask = CHAR_BIT * sizeof(sugtype) - 1; \
133- b |= (b >> (sugtype)1); \
134- b |= (b >> (sugtype)2); \
135- b |= (b >> (sugtype)4); \
136- b |= (b >> (sugtype)( 8 & bitmask)); \
137- b |= (b >> (sugtype)(16 & bitmask)); \
138- b |= (b >> (sugtype)(32 & bitmask)); \
139- (ugtype)bits - CO(b); \
140- })
141-
142-DEFINE_EXPR_G_G(clz,
143- ({
144- ugtype lz = CLZ(*(ugtype*)&a);
145- *(gtype*)&lz;
146- }))
147-
148-#endif
149
150=== modified file 'lib/kernel/popcount.cl'
151--- lib/kernel/popcount.cl 2011-12-05 16:36:58 +0000
152+++ lib/kernel/popcount.cl 2012-04-10 14:24:21 +0000
153@@ -23,8 +23,6 @@
154
155 #include "templates.h"
156
157-// Intel: POPCNT
158-
159 #define __builtin_popcounthh __builtin_popcount
160 #define __builtin_popcounth __builtin_popcount
161 #define __builtin_popcountuhh __builtin_popcount
162
163=== added file 'lib/kernel/x86_64/abs.cl'
164--- lib/kernel/x86_64/abs.cl 1970-01-01 00:00:00 +0000
165+++ lib/kernel/x86_64/abs.cl 2012-04-10 14:24:21 +0000
166@@ -0,0 +1,191 @@
167+/* OpenCL built-in library: abs()
168+
169+ Copyright (c) 2011 Universidad Rey Juan Carlos
170+
171+ Permission is hereby granted, free of charge, to any person obtaining a copy
172+ of this software and associated documentation files (the "Software"), to deal
173+ in the Software without restriction, including without limitation the rights
174+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
175+ copies of the Software, and to permit persons to whom the Software is
176+ furnished to do so, subject to the following conditions:
177+
178+ The above copyright notice and this permission notice shall be included in
179+ all copies or substantial portions of the Software.
180+
181+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
182+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
183+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
184+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
185+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
186+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
187+ THE SOFTWARE.
188+*/
189+
190+#include "../templates.h"
191+
192+#define IMPLEMENT_DIRECT(NAME, TYPE, UTYPE, EXPR) \
193+ UTYPE _cl_overloadable NAME(TYPE a) \
194+ { \
195+ typedef TYPE gtype; \
196+ typedef UTYPE ugtype; \
197+ return EXPR; \
198+ }
199+
200+#define IMPLEMENT_UPCAST(NAME, TYPE, UTYPE, UPTYPE, LO) \
201+ UTYPE _cl_overloadable NAME(TYPE a) \
202+ { \
203+ UPTYPE a1; \
204+ a1.LO = a; \
205+ return NAME(a1).LO; \
206+ }
207+
208+#define IMPLEMENT_SPLIT(NAME, TYPE, UTYPE, LO, HI) \
209+ UTYPE _cl_overloadable NAME(TYPE a) \
210+ { \
211+ return (UTYPE)(NAME(a.LO), NAME(a.HI)); \
212+ }
213+
214+
215+
216+#define IMPLEMENT_ABS_DIRECT_UNSIGNED (a)
217+
218+#define IMPLEMENT_ABS_BUILTIN_INT __builtin_abs(a)
219+
220+#define IMPLEMENT_ABS_DIRECT \
221+ ({ \
222+ a = a<(gtype)0 ? -a : a; \
223+ *(ugtype*)&a; \
224+ })
225+
226+#define IMPLEMENT_ABS_SSSE3_CHAR16 \
227+ ({ \
228+ __asm__ ("pabsb %[src], %[dst]" : \
229+ [dst] "=x" (a) : \
230+ [src] "x" (a)); \
231+ *(ugtype*)&a; \
232+ })
233+#define IMPLEMENT_ABS_AVX2_CHAR32 \
234+ ({ \
235+ __asm__ ("pabsb256 %[src], %[dst]" : \
236+ [dst] "=x" (a) : \
237+ [src] "x" (a)); \
238+ *(ugtype*)&a; /* reinterpret the result bits as the unsigned vector type, like the sibling macros */ \
239+ })
240+#define IMPLEMENT_ABS_SSSE3_SHORT8 \
241+ ({ \
242+ __asm__ ("pabsw %[src], %[dst]" : \
243+ [dst] "=x" (a) : \
244+ [src] "x" (a)); \
245+ *(ugtype*)&a; \
246+ })
247+#define IMPLEMENT_ABS_AVX2_SHORT16 \
248+ ({ \
249+ __asm__ ("pabsw256 %[src], %[dst]" : \
250+ [dst] "=x" (a) : \
251+ [src] "x" (a)); \
252+ *(ugtype*)&a; \
253+ })
254+#define IMPLEMENT_ABS_SSSE3_INT4 \
255+ ({ \
256+ __asm__ ("pabsd %[src], %[dst]" : \
257+ [dst] "=x" (a) : \
258+ [src] "x" (a)); \
259+ *(ugtype*)&a; \
260+ })
261+#define IMPLEMENT_ABS_AVX2_INT8 \
262+ ({ \
263+ __asm__ ("pabsd256 %[src], %[dst]" : \
264+ [dst] "=x" (a) : \
265+ [src] "x" (a)); \
266+ *(ugtype*)&a; \
267+ })
268+
269+
270+
271+IMPLEMENT_DIRECT(abs, char , uchar , IMPLEMENT_ABS_DIRECT)
272+#ifdef __SSSE3__
273+IMPLEMENT_UPCAST(abs, char2 , uchar2 , char4 , lo)
274+IMPLEMENT_UPCAST(abs, char3 , uchar3 , char4 , s012)
275+IMPLEMENT_UPCAST(abs, char4 , uchar4 , char8 , lo)
276+IMPLEMENT_UPCAST(abs, char8 , uchar8 , char16, lo)
277+IMPLEMENT_DIRECT(abs, char16, uchar16, IMPLEMENT_ABS_SSSE3_CHAR16)
278+#else
279+IMPLEMENT_DIRECT(abs, char2 , uchar2 , IMPLEMENT_ABS_DIRECT)
280+IMPLEMENT_DIRECT(abs, char3 , uchar3 , IMPLEMENT_ABS_DIRECT)
281+IMPLEMENT_DIRECT(abs, char4 , uchar4 , IMPLEMENT_ABS_DIRECT)
282+IMPLEMENT_DIRECT(abs, char8 , uchar8 , IMPLEMENT_ABS_DIRECT)
283+IMPLEMENT_DIRECT(abs, char16, uchar16, IMPLEMENT_ABS_DIRECT)
284+#endif
285+
286+IMPLEMENT_DIRECT(abs, uchar , uchar , IMPLEMENT_ABS_DIRECT_UNSIGNED)
287+IMPLEMENT_DIRECT(abs, uchar2 , uchar2 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
288+IMPLEMENT_DIRECT(abs, uchar3 , uchar3 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
289+IMPLEMENT_DIRECT(abs, uchar4 , uchar4 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
290+IMPLEMENT_DIRECT(abs, uchar8 , uchar8 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
291+IMPLEMENT_DIRECT(abs, uchar16, uchar16, IMPLEMENT_ABS_DIRECT_UNSIGNED)
292+
293+IMPLEMENT_DIRECT(abs, short , ushort , IMPLEMENT_ABS_DIRECT)
294+#ifdef __SSSE3__
295+IMPLEMENT_UPCAST(abs, short2 , ushort2 , short4, lo)
296+IMPLEMENT_UPCAST(abs, short3 , ushort3 , short4, s012)
297+IMPLEMENT_UPCAST(abs, short4 , ushort4 , short8, lo)
298+IMPLEMENT_DIRECT(abs, short8 , ushort8 , IMPLEMENT_ABS_SSSE3_SHORT8)
299+# ifdef __AVX2__
300+IMPLEMENT_DIRECT(abs, short16, ushort16, IMPLEMENT_ABS_AVX2_SHORT16)
301+# else
302+IMPLEMENT_SPLIT (abs, short16, ushort16, lo, hi)
303+# endif
304+#else
305+IMPLEMENT_DIRECT(abs, short2 , ushort2 , IMPLEMENT_ABS_DIRECT)
306+IMPLEMENT_DIRECT(abs, short3 , ushort3 , IMPLEMENT_ABS_DIRECT)
307+IMPLEMENT_DIRECT(abs, short4 , ushort4 , IMPLEMENT_ABS_DIRECT)
308+IMPLEMENT_DIRECT(abs, short8 , ushort8 , IMPLEMENT_ABS_DIRECT)
309+IMPLEMENT_DIRECT(abs, short16, ushort16, IMPLEMENT_ABS_DIRECT)
310+#endif
311+
312+IMPLEMENT_DIRECT(abs, ushort , ushort , IMPLEMENT_ABS_DIRECT_UNSIGNED)
313+IMPLEMENT_DIRECT(abs, ushort2 , ushort2 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
314+IMPLEMENT_DIRECT(abs, ushort3 , ushort3 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
315+IMPLEMENT_DIRECT(abs, ushort4 , ushort4 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
316+IMPLEMENT_DIRECT(abs, ushort8 , ushort8 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
317+IMPLEMENT_DIRECT(abs, ushort16, ushort16, IMPLEMENT_ABS_DIRECT_UNSIGNED)
318+
319+IMPLEMENT_DIRECT(abs, int , uint , IMPLEMENT_ABS_BUILTIN_INT)
320+#ifdef __SSSE3__
321+IMPLEMENT_UPCAST(abs, int2 , uint2 , int4, lo)
322+IMPLEMENT_UPCAST(abs, int3 , uint3 , int4, s012)
323+IMPLEMENT_DIRECT(abs, int4 , uint4 , IMPLEMENT_ABS_SSSE3_INT4)
324+# ifdef __AVX2__
325+IMPLEMENT_DIRECT(abs, int8 , uint8 , IMPLEMENT_ABS_AVX2_INT8)
326+# else
327+IMPLEMENT_SPLIT (abs, int8 , uint8 , lo, hi)
328+#endif
329+IMPLEMENT_SPLIT (abs, int16, uint16, lo, hi)
330+#else
331+IMPLEMENT_DIRECT(abs, int2 , uint2 , IMPLEMENT_ABS_DIRECT)
332+IMPLEMENT_DIRECT(abs, int3 , uint3 , IMPLEMENT_ABS_DIRECT)
333+IMPLEMENT_DIRECT(abs, int4 , uint4 , IMPLEMENT_ABS_DIRECT)
334+IMPLEMENT_DIRECT(abs, int8 , uint8 , IMPLEMENT_ABS_DIRECT)
335+IMPLEMENT_DIRECT(abs, int16, uint16, IMPLEMENT_ABS_DIRECT)
336+#endif
337+
338+IMPLEMENT_DIRECT(abs, uint , uint , IMPLEMENT_ABS_DIRECT_UNSIGNED)
339+IMPLEMENT_DIRECT(abs, uint2 , uint2 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
340+IMPLEMENT_DIRECT(abs, uint3 , uint3 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
341+IMPLEMENT_DIRECT(abs, uint4 , uint4 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
342+IMPLEMENT_DIRECT(abs, uint8 , uint8 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
343+IMPLEMENT_DIRECT(abs, uint16, uint16, IMPLEMENT_ABS_DIRECT_UNSIGNED)
344+
345+IMPLEMENT_DIRECT(abs, long , ulong , IMPLEMENT_ABS_DIRECT)
346+IMPLEMENT_DIRECT(abs, long2 , ulong2 , IMPLEMENT_ABS_DIRECT)
347+IMPLEMENT_DIRECT(abs, long3 , ulong3 , IMPLEMENT_ABS_DIRECT)
348+IMPLEMENT_DIRECT(abs, long4 , ulong4 , IMPLEMENT_ABS_DIRECT)
349+IMPLEMENT_DIRECT(abs, long8 , ulong8 , IMPLEMENT_ABS_DIRECT)
350+IMPLEMENT_DIRECT(abs, long16, ulong16, IMPLEMENT_ABS_DIRECT)
351+
352+IMPLEMENT_DIRECT(abs, ulong , ulong , IMPLEMENT_ABS_DIRECT_UNSIGNED)
353+IMPLEMENT_DIRECT(abs, ulong2 , ulong2 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
354+IMPLEMENT_DIRECT(abs, ulong3 , ulong3 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
355+IMPLEMENT_DIRECT(abs, ulong4 , ulong4 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
356+IMPLEMENT_DIRECT(abs, ulong8 , ulong8 , IMPLEMENT_ABS_DIRECT_UNSIGNED)
357+IMPLEMENT_DIRECT(abs, ulong16, ulong16, IMPLEMENT_ABS_DIRECT_UNSIGNED)
358
359=== added file 'lib/kernel/x86_64/copysign.cl'
360--- lib/kernel/x86_64/copysign.cl 1970-01-01 00:00:00 +0000
361+++ lib/kernel/x86_64/copysign.cl 2012-04-10 14:24:21 +0000
362@@ -0,0 +1,172 @@
363+/* OpenCL built-in library: copysign()
364+
365+ Copyright (c) 2012 Universidad Rey Juan Carlos
366+
367+ Permission is hereby granted, free of charge, to any person obtaining a copy
368+ of this software and associated documentation files (the "Software"), to deal
369+ in the Software without restriction, including without limitation the rights
370+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
371+ copies of the Software, and to permit persons to whom the Software is
372+ furnished to do so, subject to the following conditions:
373+
374+ The above copyright notice and this permission notice shall be included in
375+ all copies or substantial portions of the Software.
376+
377+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
378+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
379+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
380+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
381+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
382+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
383+ THE SOFTWARE.
384+*/
385+
386+#if 0
387+
388+#include "../templates.h"
389+
390+// LLVM generates non-optimal code for this implementation
391+DEFINE_EXPR_V_VV(copysign,
392+ ({
393+ int bits = CHAR_BIT * sizeof(stype);
394+ sjtype sign_mask = (sjtype)1 << (sjtype)(bits - 1);
395+ sjtype result =
396+ (~sign_mask & *(jtype*)&a) | (sign_mask & *(jtype*)&b);
397+ *(vtype*)&result;
398+ }))
399+
400+#endif
401+
402+
403+
404+#define IMPLEMENT_DIRECT(NAME, TYPE, EXPR) \
405+ TYPE _cl_overloadable NAME(TYPE a, TYPE b) \
406+ { \
407+ return EXPR; \
408+ }
409+
410+#define IMPLEMENT_UPCAST(NAME, TYPE, UPTYPE, LO) \
411+ TYPE _cl_overloadable NAME(TYPE a, TYPE b) \
412+ { \
413+ UPTYPE a1, b1; \
414+ a1.LO = a; \
415+ b1.LO = b; \
416+ return NAME(a1, b1).LO; \
417+ }
418+
419+#define IMPLEMENT_SPLIT(NAME, TYPE, LO, HI) \
420+ TYPE _cl_overloadable NAME(TYPE a, TYPE b) \
421+ { \
422+ return (TYPE)(NAME(a.LO, b.LO), NAME(a.HI, b.HI)); \
423+ }
424+
425+
426+
427+#define IMPLEMENT_COPYSIGN_DIRECT \
428+ ({ \
429+ int bits = CHAR_BIT * sizeof(stype); \
430+ jtype sign_mask = (jtype)1 << (jtype)(bits - 1); \
431+ jtype result = (~sign_mask & *(jtype*)&a) | (sign_mask & *(jtype*)&b); \
432+ *(vtype*)&result; \
433+ })
434+#define IMPLEMENT_COPYSIGN_SSE_FLOAT4 \
435+ ({ \
436+ uint4 sign_mask = {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}; \
437+ __asm__ ("andps %[src], %[dst]" : \
438+ [dst] "+x" (a) : \
439+ [src] "xm" (~sign_mask)); \
440+ __asm__ ("andps %[src], %[dst]" : \
441+ [dst] "+x" (b) : \
442+ [src] "xm" (sign_mask)); \
443+ __asm__ ("orps %[src], %[dst]" : \
444+ [dst] "+x" (a) : \
445+ [src] "xm" (b)); \
446+ a; \
447+ })
448+#define IMPLEMENT_COPYSIGN_AVX_FLOAT8 \
449+ ({ \
450+ uint8 sign_mask = {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, \
451+ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}; \
452+ __asm__ ("andps256 %[src], %[dst]" : \
453+ [dst] "+x" (a) : \
454+ [src] "xm" (~sign_mask)); \
455+ __asm__ ("andps256 %[src], %[dst]" : \
456+ [dst] "+x" (b) : \
457+ [src] "xm" (sign_mask)); \
458+ __asm__ ("orps256 %[src], %[dst]" : \
459+ [dst] "+x" (a) : \
460+ [src] "xm" (b)); \
461+ a; \
462+ })
463+#define IMPLEMENT_COPYSIGN_SSE2_DOUBLE2 \
464+ ({ \
465+ ulong2 sign_mask = {0x8000000000000000UL, 0x8000000000000000UL}; \
466+ __asm__ ("andpd %[src], %[dst]" : \
467+ [dst] "+x" (a) : \
468+ [src] "xm" (~sign_mask)); \
469+ __asm__ ("andpd %[src], %[dst]" : \
470+ [dst] "+x" (b) : \
471+ [src] "xm" (sign_mask)); \
472+ __asm__ ("orpd %[src], %[dst]" : \
473+ [dst] "+x" (a) : \
474+ [src] "xm" (b)); \
475+ a; \
476+ })
477+#define IMPLEMENT_COPYSIGN_AVX_DOUBLE4 \
478+ ({ \
479+ ulong4 sign_mask = {0x8000000000000000UL, 0x8000000000000000UL, \
480+ 0x8000000000000000UL, 0x8000000000000000UL}; \
481+ __asm__ ("andpd256 %[src], %[dst]" : \
482+ [dst] "+x" (a) : \
483+ [src] "xm" (~sign_mask)); \
484+ __asm__ ("andpd256 %[src], %[dst]" : \
485+ [dst] "+x" (b) : \
486+ [src] "xm" (sign_mask)); \
487+ __asm__ ("orpd256 %[src], %[dst]" : \
488+ [dst] "+x" (a) : \
489+ [src] "xm" (b)); \
490+ a; \
491+ })
492+
493+
494+
495+#ifdef __SSE__
496+IMPLEMENT_UPCAST(copysign, float , float2, lo)
497+IMPLEMENT_UPCAST(copysign, float2 , float4, lo)
498+IMPLEMENT_UPCAST(copysign, float3 , float4, s012)
499+IMPLEMENT_DIRECT(copysign, float4 , IMPLEMENT_COPYSIGN_SSE_FLOAT4)
500+# ifdef __AVX__
501+IMPLEMENT_DIRECT(copysign, float8 , IMPLEMENT_COPYSIGN_AVX_FLOAT8)
502+# else
503+IMPLEMENT_SPLIT (copysign, float8 , lo, hi)
504+# endif
505+IMPLEMENT_SPLIT (copysign, float16, lo, hi)
506+#else
507+IMPLEMENT_DIRECT(copysign, float , IMPLEMENT_COPYSIGN_DIRECT)
508+IMPLEMENT_DIRECT(copysign, float2 , IMPLEMENT_COPYSIGN_DIRECT)
509+IMPLEMENT_DIRECT(copysign, float3 , IMPLEMENT_COPYSIGN_DIRECT)
510+IMPLEMENT_DIRECT(copysign, float4 , IMPLEMENT_COPYSIGN_DIRECT)
511+IMPLEMENT_DIRECT(copysign, float8 , IMPLEMENT_COPYSIGN_DIRECT)
512+IMPLEMENT_DIRECT(copysign, float16, IMPLEMENT_COPYSIGN_DIRECT)
513+#endif
514+
515+#ifdef __SSE2__
516+IMPLEMENT_UPCAST(copysign, double , double2, lo)
517+IMPLEMENT_DIRECT(copysign, double2 , IMPLEMENT_COPYSIGN_SSE2_DOUBLE2)
518+# ifdef __AVX__
519+IMPLEMENT_UPCAST(copysign, double3 , double4, s012)
520+IMPLEMENT_DIRECT(copysign, double4 , IMPLEMENT_COPYSIGN_AVX_DOUBLE4)
521+# else
522+IMPLEMENT_SPLIT (copysign, double3 , lo, s2)
523+IMPLEMENT_SPLIT (copysign, double4 , lo, hi)
524+# endif
525+IMPLEMENT_SPLIT (copysign, double8 , lo, hi)
526+IMPLEMENT_SPLIT (copysign, double16, lo, hi)
527+#else
528+IMPLEMENT_DIRECT(copysign, double , IMPLEMENT_COPYSIGN_DIRECT)
529+IMPLEMENT_DIRECT(copysign, double2 , IMPLEMENT_COPYSIGN_DIRECT)
530+IMPLEMENT_DIRECT(copysign, double3 , IMPLEMENT_COPYSIGN_DIRECT)
531+IMPLEMENT_DIRECT(copysign, double4 , IMPLEMENT_COPYSIGN_DIRECT)
532+IMPLEMENT_DIRECT(copysign, double8 , IMPLEMENT_COPYSIGN_DIRECT)
533+IMPLEMENT_DIRECT(copysign, double16, IMPLEMENT_COPYSIGN_DIRECT)
534+#endif
535
536=== modified file 'tests/testsuite.at'
537--- tests/testsuite.at 2012-03-27 10:48:08 +0000
538+++ tests/testsuite.at 2012-04-10 14:24:21 +0000
539@@ -167,7 +167,7 @@
540
541 AT_BANNER([Kernel runtime library])
542
543-AT_SETUP([Kernel functions bitselect clz popcount])
544+AT_SETUP([Kernel functions abs bitselect clz max min popcount])
545 AT_DATA([expout],
546 [Running test test_bitselect...
547 OK