Merge lp:~schnetter/pocl/main into lp:~pocl/pocl/trunk

Proposed by Erik Schnetter
Status: Merged
Merge reported by: Pekka Jääskeläinen
Merged at revision: not available
Proposed branch: lp:~schnetter/pocl/main
Merge into: lp:~pocl/pocl/trunk
Diff against target: 2953 lines (+1586/-379)
53 files modified
clconfig.h.in (+14/-7)
configure.ac (+13/-0)
examples/kernel/test_bitselect.cl (+1/-1)
examples/kernel/test_fabs.cl (+1/-1)
examples/kernel/test_rotate.cl (+1/-1)
examples/scalarwave/scalarwave.c (+4/-5)
examples/scalarwave/scalarwave.cl (+5/-3)
include/CL/cl.h (+1/-1)
include/_kernel.h (+3/-0)
include/arm/types.h (+1/-1)
include/tce/types.h (+1/-1)
include/types.h (+1/-1)
lib/CL/Makefile.am (+7/-0)
lib/CL/clCreateBuffer.c (+3/-1)
lib/CL/clCreateCommandQueue.c (+2/-0)
lib/CL/clCreateContext.c (+1/-2)
lib/CL/clCreateContextFromType.c (+2/-0)
lib/CL/clCreateKernel.c (+2/-0)
lib/CL/clCreateProgramWithSource.c (+2/-0)
lib/CL/clEnqueueBarrier.c (+31/-0)
lib/CL/clEnqueueCopyBuffer.c (+67/-0)
lib/CL/clEnqueueCopyBufferRect.c (+87/-0)
lib/CL/clEnqueueReadBuffer.c (+1/-1)
lib/CL/clEnqueueReadBufferRect.c (+82/-0)
lib/CL/clEnqueueWriteBuffer.c (+1/-1)
lib/CL/clEnqueueWriteBufferRect.c (+97/-0)
lib/CL/clFinish.c (+1/-1)
lib/CL/clGetContextInfo.c (+12/-20)
lib/CL/clGetDeviceIDs.c (+19/-24)
lib/CL/clGetDeviceInfo.c (+170/-8)
lib/CL/clGetEventProfilingInfo.c (+62/-0)
lib/CL/clGetPlatformIDs.c (+15/-23)
lib/CL/clGetPlatformInfo.c (+12/-14)
lib/CL/clGetProgramBuildInfo.c (+44/-27)
lib/CL/clGetProgramInfo.c (+89/-0)
lib/CL/devices/native/native.c (+6/-2)
lib/CL/devices/native/native.h (+4/-0)
lib/CL/devices/pthread/pthread.c (+106/-3)
lib/CL/devices/pthread/pthread.h (+79/-6)
lib/CL/pocl_cl.h (+25/-0)
lib/kernel/all.cl (+64/-64)
lib/kernel/any.cl (+64/-64)
lib/kernel/arm/vload_half.cl (+109/-0)
lib/kernel/arm/vstore_half.cl (+120/-0)
lib/kernel/as_type.cl (+1/-1)
lib/kernel/convert_type.cl (+7/-7)
lib/kernel/cross.cl (+4/-4)
lib/kernel/dot.cl (+28/-28)
lib/kernel/upsample.cl (+1/-1)
lib/kernel/vload.cl (+5/-5)
lib/kernel/vload_half.cl (+40/-18)
lib/kernel/vstore.cl (+5/-5)
lib/kernel/vstore_half.cl (+63/-27)
To merge this branch: bzr merge lp:~schnetter/pocl/main
Reviewer Review Type Date Requested Status
pocl maintaners Pending
Review via email: mp+86157@code.launchpad.net

Description of the change

I made many improvements to the OpenCL run-time library, mostly to clGet* and buffer read/write/copy routines. The current state of the library is now sufficient to run my application code (although it is still much too slow).

To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'clconfig.h.in'
2--- clconfig.h.in 2011-12-15 04:18:23 +0000
3+++ clconfig.h.in 2011-12-18 04:51:24 +0000
4@@ -1,13 +1,20 @@
5+/* Define to 1 if __fp16 can be converted to float. */
6+#undef CONVERT___FP16_FLOAT
7+
8+/* The size of `double', as computed by sizeof. */
9+#undef SIZEOF_DOUBLE
10+
11 /* The size of `long', as computed by sizeof. */
12 #undef SIZEOF_LONG
13
14-/* The size of `half', as computed by sizeof. */
15+/* The size of `void *', as computed by sizeof. */
16+#undef SIZEOF_VOID_P
17+
18+/* The size of `__fp16', as computed by sizeof. */
19 #undef SIZEOF___FP16
20+
21+
22+
23 /* The OpenCL type `half' is called `__fp16' in C */
24 #define SIZEOF_HALF SIZEOF___FP16
25-
26-/* The size of `double', as computed by sizeof. */
27-#undef SIZEOF_DOUBLE
28-
29-/* The size of `void *', as computed by sizeof. */
30-#undef SIZEOF_VOID_P
31+#define CONVERT_HALF_FLOAT CONVERT___FP16_FLOAT
32
33=== modified file 'configure.ac'
34--- configure.ac 2011-12-15 04:18:23 +0000
35+++ configure.ac 2011-12-18 04:51:24 +0000
36@@ -223,9 +223,22 @@
37 # Checks for typedefs, structures, and compiler characteristics.
38 AC_CHECK_SIZEOF([long])
39 AC_CHECK_SIZEOF([__fp16])
40+AC_MSG_CHECKING([whether __fp16 can be converted to float])
41+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[__fp16 x; float y;]],
42+ [[y=x]])],
43+ [
44+ AC_MSG_RESULT([yes])
45+ AC_DEFINE_UNQUOTED([CONVERT___FP16_FLOAT], 1, [Define to 1 if __fp16 can be converted to float.])
46+ ],
47+ [
48+ AC_MSG_RESULT([no])
49+ AC_DEFINE_UNQUOTED([CONVERT___FP16_FLOAT], 0, [Define to 1 if __fp16 can be converted to float.])
50+ ]
51+ )
52 AC_CHECK_SIZEOF([double])
53 AC_CHECK_SIZEOF([void *])
54 AC_CHECK_ALIGNOF([float16], [typedef float float16 __attribute__((__ext_vector_type__(16)));])
55+AC_CHECK_ALIGNOF([double16], [typedef double double16 __attribute__((__ext_vector_type__(16)));])
56
57 CC="$old_CC"
58
59
60=== modified file 'examples/kernel/test_bitselect.cl'
61--- examples/kernel/test_bitselect.cl 2011-12-14 15:15:44 +0000
62+++ examples/kernel/test_bitselect.cl 2011-12-18 04:51:24 +0000
63@@ -1170,7 +1170,7 @@
64 })
65 )
66
67-void test_bitselect()
68+kernel void test_bitselect()
69 {
70 CALL_FUNC_G(test_bitselect)
71 }
72
73=== modified file 'examples/kernel/test_fabs.cl'
74--- examples/kernel/test_fabs.cl 2011-12-14 15:15:44 +0000
75+++ examples/kernel/test_fabs.cl 2011-12-18 04:51:24 +0000
76@@ -172,7 +172,7 @@
77 })
78 )
79
80-void test_fabs()
81+kernel void test_fabs()
82 {
83 CALL_FUNC_V(test_fabs)
84 }
85
86=== modified file 'examples/kernel/test_rotate.cl'
87--- examples/kernel/test_rotate.cl 2011-12-14 15:15:44 +0000
88+++ examples/kernel/test_rotate.cl 2011-12-18 04:51:24 +0000
89@@ -198,7 +198,7 @@
90 })
91 )
92
93-void test_rotate()
94+kernel void test_rotate()
95 {
96 CALL_FUNC_G(test_rotate)
97 }
98
99=== modified file 'examples/scalarwave/scalarwave.c'
100--- examples/scalarwave/scalarwave.c 2011-12-14 15:45:14 +0000
101+++ examples/scalarwave/scalarwave.c 2011-12-18 04:51:24 +0000
102@@ -11,7 +11,7 @@
103
104
105
106-#define GRID_GRANULARITY 1 // TODO 2
107+#define GRID_GRANULARITY 2
108
109 typedef struct grid_t {
110 cl_double dt; // time step
111@@ -96,7 +96,7 @@
112 if (ierr) return -1;
113
114 size_t const global_work_size[3] =
115- {grid->ni, grid->nj, grid->nk};
116+ {grid->ai, grid->aj, grid->ak};
117 size_t const local_work_size[3] =
118 {GRID_GRANULARITY, GRID_GRANULARITY, GRID_GRANULARITY};
119
120@@ -207,9 +207,8 @@
121 phi = tmp;
122 }
123
124- // TODO: We create the program and allocate the buffers each time,
125- // which is slow. But then, we only want to test correctness, not
126- // performance. (Yet?)
127+ // TODO: We allocate the buffers each time, which is slow. But
128+ // then, we only want to test correctness, not performance. (Yet?)
129 int const ierr =
130 exec_scalarwave_kernel (source, phi, phi_p, phi_p_p, &grid);
131 assert(!ierr);
132
133=== modified file 'examples/scalarwave/scalarwave.cl'
134--- examples/scalarwave/scalarwave.cl 2011-12-14 15:45:14 +0000
135+++ examples/scalarwave/scalarwave.cl 2011-12-18 04:51:24 +0000
136@@ -21,7 +21,7 @@
137 int ni, nj, nk; // used size
138 } grid_t;
139
140-void
141+kernel void
142 scalarwave(global double *restrict const phi,
143 global double const *restrict const phi_p,
144 global double const *restrict const phi_p_p,
145@@ -72,11 +72,13 @@
146 size_t const k = get_global_id(2);
147
148 // If outside the domain, do nothing
149- if (i>=ni || j>=nj || k>=nk) return;
150+ if (__builtin_expect(i>=ni || j>=nj || k>=nk, false)) return;
151
152 size_t const ind3d = di*i + dj*j + dk*k;
153
154- if (i==0 || i==ni-1 || j==0 || j==nj-1 || k==0 || k==nk-1) {
155+ if (__builtin_expect(i==0 || j==0 || k==0 || i==ni-1 || j==nj-1 || k==nk-1,
156+ false))
157+ {
158 // Boundary condition
159
160 phi[ind3d] = 0.0;
161
162=== modified file 'include/CL/cl.h'
163--- include/CL/cl.h 2011-02-08 22:30:08 +0000
164+++ include/CL/cl.h 2011-12-18 04:51:24 +0000
165@@ -814,7 +814,7 @@
166 size_t /* buffer_row_pitch */,
167 size_t /* buffer_slice_pitch */,
168 size_t /* host_row_pitch */,
169- size_t /* host_slice_pitch */,
170+ size_t /* host_slice_pitch */,
171 const void * /* ptr */,
172 cl_uint /* num_events_in_wait_list */,
173 const cl_event * /* event_wait_list */,
174
175=== modified file 'include/_kernel.h'
176--- include/_kernel.h 2011-12-15 19:03:41 +0000
177+++ include/_kernel.h 2011-12-18 04:51:24 +0000
178@@ -32,6 +32,9 @@
179 only. Seems the pragma does not add the macro, so we have the target
180 define the macro and the pragma is conditionally enabled.
181 */
182+#ifdef cl_khr_fp16
183+# pragma OPENCL EXTENSION cl_khr_fp16: enable
184+#endif
185 #ifdef cl_khr_fp64
186 # pragma OPENCL EXTENSION cl_khr_fp64: enable
187 #endif
188
189=== modified file 'include/arm/types.h'
190--- include/arm/types.h 2011-12-14 23:01:01 +0000
191+++ include/arm/types.h 2011-12-18 04:51:24 +0000
192@@ -4,7 +4,7 @@
193
194 #define __EMBEDDED_PROFILE__ 1
195 #undef cles_khr_int64
196-#define cl_khr_fp16 /* ES: is this correct? */
197+#define cl_khr_fp16
198 #undef cl_khr_fp64
199
200 typedef uint size_t;
201
202=== modified file 'include/tce/types.h'
203--- include/tce/types.h 2011-12-14 23:01:01 +0000
204+++ include/tce/types.h 2011-12-18 04:51:24 +0000
205@@ -4,7 +4,7 @@
206
207 #define __EMBEDDED_PROFILE__ 1
208 #undef cles_khr_int64
209-#define cl_khr_fp16 /* ES: is this correct? */
210+#define cl_khr_fp16
211 #undef cl_khr_fp64
212
213 typedef uint size_t;
214
215=== modified file 'include/types.h'
216--- include/types.h 2011-12-14 23:01:01 +0000
217+++ include/types.h 2011-12-18 04:51:24 +0000
218@@ -18,7 +18,7 @@
219 # undef cles_khr_int64
220 #endif
221
222-#if SIZEOF_HALF == 2
223+#if SIZEOF_HALF == 2 /* && CONVERT_HALF_FLOAT */
224 # define cl_khr_fp16
225 #else
226 # undef cl_khr_fp16
227
228=== modified file 'lib/CL/Makefile.am'
229--- lib/CL/Makefile.am 2011-12-07 15:45:30 +0000
230+++ lib/CL/Makefile.am 2011-12-18 04:51:24 +0000
231@@ -31,6 +31,7 @@
232 clReleaseCommandQueue.c \
233 clCreateBuffer.c \
234 clEnqueueReadBuffer.c \
235+ clEnqueueReadBufferRect.c \
236 clReleaseMemObject.c \
237 clCreateProgramWithSource.c \
238 clReleaseProgram.c \
239@@ -46,8 +47,14 @@
240 clCreateContext.c \
241 clGetProgramBuildInfo.c \
242 clEnqueueWriteBuffer.c \
243+ clEnqueueWriteBufferRect.c \
244+ clEnqueueCopyBuffer.c \
245+ clEnqueueCopyBufferRect.c \
246 clFinish.c \
247+ clEnqueueBarrier.c \
248 clGetKernelWorkGroupInfo.c \
249+ clGetProgramInfo.c \
250+ clGetEventProfilingInfo.c \
251 pocl_cl.h \
252 devices/devices.h \
253 devices/devices.c \
254
255=== modified file 'lib/CL/clCreateBuffer.c'
256--- lib/CL/clCreateBuffer.c 2011-12-08 02:45:48 +0000
257+++ lib/CL/clCreateBuffer.c 2011-12-18 04:51:24 +0000
258@@ -69,6 +69,8 @@
259 mem->mem_host_ptr = host_ptr;
260 mem->reference_count = 1;
261 mem->context = context;
262-
263+
264+ if (errcode_ret != NULL)
265+ *errcode_ret = CL_SUCCESS;
266 return mem;
267 }
268
269=== modified file 'lib/CL/clCreateCommandQueue.c'
270--- lib/CL/clCreateCommandQueue.c 2011-10-14 10:31:27 +0000
271+++ lib/CL/clCreateCommandQueue.c 2011-12-18 04:51:24 +0000
272@@ -38,5 +38,7 @@
273 command_queue->reference_count = 1;
274 command_queue->properties = properties;
275
276+ if (errcode_ret != NULL)
277+ *errcode_ret = CL_SUCCESS;
278 return command_queue;
279 }
280
281=== modified file 'lib/CL/clCreateContext.c'
282--- lib/CL/clCreateContext.c 2011-12-07 15:00:07 +0000
283+++ lib/CL/clCreateContext.c 2011-12-18 04:51:24 +0000
284@@ -72,8 +72,7 @@
285 context->properties = properties;
286 context->reference_count = 1;
287
288- if (errcode_ret != NULL)
289+ if (errcode_ret)
290 *errcode_ret = CL_SUCCESS;
291-
292 return context;
293 }
294
295=== modified file 'lib/CL/clCreateContextFromType.c'
296--- lib/CL/clCreateContextFromType.c 2011-12-02 17:27:49 +0000
297+++ lib/CL/clCreateContextFromType.c 2011-12-18 04:51:24 +0000
298@@ -68,5 +68,7 @@
299 context->properties = properties;
300 context->reference_count = 1;
301
302+ if (errcode_ret != NULL)
303+ *errcode_ret = CL_SUCCESS;
304 return context;
305 }
306
307=== modified file 'lib/CL/clCreateKernel.c'
308--- lib/CL/clCreateKernel.c 2011-12-14 18:52:39 +0000
309+++ lib/CL/clCreateKernel.c 2011-12-18 04:51:24 +0000
310@@ -145,5 +145,7 @@
311 program->kernels = kernel;
312 kernel->next = k;
313
314+ if (errcode_ret != NULL)
315+ *errcode_ret = CL_SUCCESS;
316 return kernel;
317 }
318
319=== modified file 'lib/CL/clCreateProgramWithSource.c'
320--- lib/CL/clCreateProgramWithSource.c 2011-10-14 10:31:27 +0000
321+++ lib/CL/clCreateProgramWithSource.c 2011-12-18 04:51:24 +0000
322@@ -92,5 +92,7 @@
323 program->binary = NULL;
324 program->kernels = NULL;
325
326+ if (errcode_ret != NULL)
327+ *errcode_ret = CL_SUCCESS;
328 return program;
329 }
330
331=== added file 'lib/CL/clEnqueueBarrier.c'
332--- lib/CL/clEnqueueBarrier.c 1970-01-01 00:00:00 +0000
333+++ lib/CL/clEnqueueBarrier.c 2011-12-18 04:51:24 +0000
334@@ -0,0 +1,31 @@
335+/* OpenCL runtime library: clEnqueueBarrier()
336+
337+ Copyright (c) 2011 Erik Schnetter
338+
339+ Permission is hereby granted, free of charge, to any person obtaining a copy
340+ of this software and associated documentation files (the "Software"), to deal
341+ in the Software without restriction, including without limitation the rights
342+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
343+ copies of the Software, and to permit persons to whom the Software is
344+ furnished to do so, subject to the following conditions:
345+
346+ The above copyright notice and this permission notice shall be included in
347+ all copies or substantial portions of the Software.
348+
349+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
350+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
351+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
352+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
353+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
354+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
355+ THE SOFTWARE.
356+*/
357+
358+#include "pocl_cl.h"
359+
360+CL_API_ENTRY cl_int CL_API_CALL
361+clEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0
362+{
363+ /* All operations are serialised, so we never have to wait */
364+ return CL_SUCCESS;
365+}
366
367=== added file 'lib/CL/clEnqueueCopyBuffer.c'
368--- lib/CL/clEnqueueCopyBuffer.c 1970-01-01 00:00:00 +0000
369+++ lib/CL/clEnqueueCopyBuffer.c 2011-12-18 04:51:24 +0000
370@@ -0,0 +1,67 @@
371+/* OpenCL runtime library: clEnqueueCopyBuffer()
372+
373+ Copyright (c) 2011 Universidad Rey Juan Carlos
374+
375+ Permission is hereby granted, free of charge, to any person obtaining a copy
376+ of this software and associated documentation files (the "Software"), to deal
377+ in the Software without restriction, including without limitation the rights
378+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
379+ copies of the Software, and to permit persons to whom the Software is
380+ furnished to do so, subject to the following conditions:
381+
382+ The above copyright notice and this permission notice shall be included in
383+ all copies or substantial portions of the Software.
384+
385+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
386+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
387+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
388+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
389+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
390+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
391+ THE SOFTWARE.
392+*/
393+
394+#include "pocl_cl.h"
395+#include <assert.h>
396+
397+CL_API_ENTRY cl_int CL_API_CALL
398+clEnqueueCopyBuffer(cl_command_queue command_queue,
399+ cl_mem src_buffer,
400+ cl_mem dst_buffer,
401+ size_t src_offset,
402+ size_t dst_offset,
403+ size_t cb,
404+ cl_uint num_events_in_wait_list,
405+ const cl_event *event_wait_list,
406+ cl_event *event) CL_API_SUFFIX__VERSION_1_0
407+{
408+ cl_device_id device_id;
409+ unsigned i;
410+
411+ if (command_queue == NULL)
412+ return CL_INVALID_COMMAND_QUEUE;
413+
414+ if ((src_buffer == NULL) || (dst_buffer == NULL))
415+ return CL_INVALID_MEM_OBJECT;
416+
417+ if ((command_queue->context != src_buffer->context) ||
418+ (command_queue->context != dst_buffer->context))
419+ return CL_INVALID_CONTEXT;
420+
421+ if ((src_offset + cb > src_buffer->size) ||
422+ (dst_offset + cb > dst_buffer->size))
423+ return CL_INVALID_VALUE;
424+
425+ device_id = command_queue->device;
426+ for (i = 0; i < command_queue->context->num_devices; ++i)
427+ {
428+ if (command_queue->context->devices[i] == device_id)
429+ break;
430+ }
431+
432+ assert(i < command_queue->context->num_devices);
433+
434+ device_id->copy(device_id->data, src_buffer->device_ptrs[i], dst_buffer->device_ptrs[i], cb);
435+
436+ return CL_SUCCESS;
437+}
438
439=== added file 'lib/CL/clEnqueueCopyBufferRect.c'
440--- lib/CL/clEnqueueCopyBufferRect.c 1970-01-01 00:00:00 +0000
441+++ lib/CL/clEnqueueCopyBufferRect.c 2011-12-18 04:51:24 +0000
442@@ -0,0 +1,87 @@
443+/* OpenCL runtime library: clEnqueueCopyBufferRect()
444+
445+ Copyright (c) 2011 Universidad Rey Juan Carlos
446+
447+ Permission is hereby granted, free of charge, to any person obtaining a copy
448+ of this software and associated documentation files (the "Software"), to deal
449+ in the Software without restriction, including without limitation the rights
450+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
451+ copies of the Software, and to permit persons to whom the Software is
452+ furnished to do so, subject to the following conditions:
453+
454+ The above copyright notice and this permission notice shall be included in
455+ all copies or substantial portions of the Software.
456+
457+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
458+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
459+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
460+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
461+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
462+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
463+ THE SOFTWARE.
464+*/
465+
466+#include "pocl_cl.h"
467+#include <assert.h>
468+
469+CL_API_ENTRY cl_int CL_API_CALL
470+clEnqueueCopyBufferRect(cl_command_queue command_queue,
471+ cl_mem src_buffer,
472+ cl_mem dst_buffer,
473+ const size_t *src_origin,
474+ const size_t *dst_origin,
475+ const size_t *region,
476+ size_t src_row_pitch,
477+ size_t src_slice_pitch,
478+ size_t dst_row_pitch,
479+ size_t dst_slice_pitch,
480+ cl_uint num_events_in_wait_list,
481+ const cl_event *event_wait_list,
482+ cl_event *event) CL_API_SUFFIX__VERSION_1_1
483+{
484+ cl_device_id device_id;
485+ unsigned i;
486+
487+ if (command_queue == NULL)
488+ return CL_INVALID_COMMAND_QUEUE;
489+
490+ if ((src_buffer == NULL) || (dst_buffer == NULL))
491+ return CL_INVALID_MEM_OBJECT;
492+
493+ if ((command_queue->context != src_buffer->context) ||
494+ (command_queue->context != dst_buffer->context))
495+ return CL_INVALID_CONTEXT;
496+
497+ if ((src_origin == NULL) ||
498+ (dst_origin == NULL) ||
499+ (region == NULL))
500+ return CL_INVALID_VALUE;
501+
502+ if ((region[0]*region[1]*region[2] > 0) &&
503+ (src_origin[0] + region[0]-1 +
504+ src_row_pitch * (src_origin[1] + region[1]-1) +
505+ src_slice_pitch * (src_origin[2] + region[2]-1) >= src_buffer->size))
506+ return CL_INVALID_VALUE;
507+ if ((region[0]*region[1]*region[2] > 0) &&
508+ (dst_origin[0] + region[0]-1 +
509+ dst_row_pitch * (dst_origin[1] + region[1]-1) +
510+ dst_slice_pitch * (dst_origin[2] + region[2]-1) >= dst_buffer->size))
511+ return CL_INVALID_VALUE;
512+
513+ device_id = command_queue->device;
514+ for (i = 0; i < command_queue->context->num_devices; ++i)
515+ {
516+ if (command_queue->context->devices[i] == device_id)
517+ break;
518+ }
519+
520+ assert(i < command_queue->context->num_devices);
521+
522+ device_id->copy_rect(device_id->data,
523+ src_buffer->device_ptrs[i], dst_buffer->device_ptrs[i],
524+ src_origin, dst_origin, region,
525+ src_row_pitch, src_slice_pitch,
526+ dst_row_pitch, dst_slice_pitch);
527+
528+ return CL_SUCCESS;
529+}
530
531=== modified file 'lib/CL/clEnqueueReadBuffer.c'
532--- lib/CL/clEnqueueReadBuffer.c 2011-11-30 16:00:41 +0000
533+++ lib/CL/clEnqueueReadBuffer.c 2011-12-18 04:51:24 +0000
534@@ -1,4 +1,4 @@
535-/* OpenCL runtime library: clEnqueueWriteBuffer()
536+/* OpenCL runtime library: clEnqueueReadBuffer()
537
538 Copyright (c) 2011 Universidad Rey Juan Carlos
539
540
541=== added file 'lib/CL/clEnqueueReadBufferRect.c'
542--- lib/CL/clEnqueueReadBufferRect.c 1970-01-01 00:00:00 +0000
543+++ lib/CL/clEnqueueReadBufferRect.c 2011-12-18 04:51:24 +0000
544@@ -0,0 +1,82 @@
545+/* OpenCL runtime library: clEnqueueReadBufferRect()
546+
547+ Copyright (c) 2011 Universidad Rey Juan Carlos
548+
549+ Permission is hereby granted, free of charge, to any person obtaining a copy
550+ of this software and associated documentation files (the "Software"), to deal
551+ in the Software without restriction, including without limitation the rights
552+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
553+ copies of the Software, and to permit persons to whom the Software is
554+ furnished to do so, subject to the following conditions:
555+
556+ The above copyright notice and this permission notice shall be included in
557+ all copies or substantial portions of the Software.
558+
559+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
560+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
561+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
562+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
563+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
564+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
565+ THE SOFTWARE.
566+*/
567+
568+#include "pocl_cl.h"
569+#include <assert.h>
570+
571+CL_API_ENTRY cl_int CL_API_CALL
572+clEnqueueReadBufferRect(cl_command_queue command_queue,
573+ cl_mem buffer,
574+ cl_bool blocking_read,
575+ const size_t *buffer_origin,
576+ const size_t *host_origin,
577+ const size_t *region,
578+ size_t buffer_row_pitch,
579+ size_t buffer_slice_pitch,
580+ size_t host_row_pitch,
581+ size_t host_slice_pitch,
582+ void *ptr,
583+ cl_uint num_events_in_wait_list,
584+ const cl_event *event_wait_list,
585+ cl_event *event) CL_API_SUFFIX__VERSION_1_1
586+{
587+ cl_device_id device_id;
588+ unsigned i;
589+
590+ if (command_queue == NULL)
591+ return CL_INVALID_COMMAND_QUEUE;
592+
593+ if (buffer == NULL)
594+ return CL_INVALID_MEM_OBJECT;
595+
596+ if (command_queue->context != buffer->context)
597+ return CL_INVALID_CONTEXT;
598+
599+ if ((ptr == NULL) ||
600+ (buffer_origin == NULL) ||
601+ (host_origin == NULL) ||
602+ (region == NULL))
603+ return CL_INVALID_VALUE;
604+
605+ if ((region[0]*region[1]*region[2] > 0) &&
606+ (buffer_origin[0] + region[0]-1 +
607+ buffer_row_pitch * (buffer_origin[1] + region[1]-1) +
608+ buffer_slice_pitch * (buffer_origin[2] + region[2]-1) >= buffer->size))
609+ return CL_INVALID_VALUE;
610+
611+ device_id = command_queue->device;
612+ for (i = 0; i < command_queue->context->num_devices; ++i)
613+ {
614+ if (command_queue->context->devices[i] == device_id)
615+ break;
616+ }
617+
618+ assert(i < command_queue->context->num_devices);
619+
620+ device_id->read_rect(device_id->data, ptr, buffer->device_ptrs[i],
621+ buffer_origin, host_origin, region,
622+ buffer_row_pitch, buffer_slice_pitch,
623+ host_row_pitch, host_slice_pitch);
624+
625+ return CL_SUCCESS;
626+}
627
628=== modified file 'lib/CL/clEnqueueWriteBuffer.c'
629--- lib/CL/clEnqueueWriteBuffer.c 2011-11-30 16:00:41 +0000
630+++ lib/CL/clEnqueueWriteBuffer.c 2011-12-18 04:51:24 +0000
631@@ -1,4 +1,4 @@
632-/* OpenCL runtime library: clEnqueueReadBuffer()
633+/* OpenCL runtime library: clEnqueueWriteBuffer()
634
635 Copyright (c) 2011 Universidad Rey Juan Carlos
636
637
638=== added file 'lib/CL/clEnqueueWriteBufferRect.c'
639--- lib/CL/clEnqueueWriteBufferRect.c 1970-01-01 00:00:00 +0000
640+++ lib/CL/clEnqueueWriteBufferRect.c 2011-12-18 04:51:24 +0000
641@@ -0,0 +1,97 @@
642+/* OpenCL runtime library: clEnqueueWriteBufferRect()
643+
644+ Copyright (c) 2011 Universidad Rey Juan Carlos
645+
646+ Permission is hereby granted, free of charge, to any person obtaining a copy
647+ of this software and associated documentation files (the "Software"), to deal
648+ in the Software without restriction, including without limitation the rights
649+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
650+ copies of the Software, and to permit persons to whom the Software is
651+ furnished to do so, subject to the following conditions:
652+
653+ The above copyright notice and this permission notice shall be included in
654+ all copies or substantial portions of the Software.
655+
656+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
657+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
658+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
659+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
660+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
661+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
662+ THE SOFTWARE.
663+*/
664+
665+#include "pocl_cl.h"
666+#include <assert.h>
667+
668+CL_API_ENTRY cl_int CL_API_CALL
669+clEnqueueWriteBufferRect(cl_command_queue command_queue,
670+ cl_mem buffer,
671+ cl_bool blocking_write,
672+ const size_t *buffer_origin,
673+ const size_t *host_origin,
674+ const size_t *region,
675+ size_t buffer_row_pitch,
676+ size_t buffer_slice_pitch,
677+ size_t host_row_pitch,
678+ size_t host_slice_pitch,
679+ const void *ptr,
680+ cl_uint num_events_in_wait_list,
681+ const cl_event *event_wait_list,
682+ cl_event *event) CL_API_SUFFIX__VERSION_1_1
683+{
684+ cl_device_id device_id;
685+ unsigned i;
686+
687+ if (command_queue == NULL)
688+ return CL_INVALID_COMMAND_QUEUE;
689+
690+ if (buffer == NULL)
691+ return CL_INVALID_MEM_OBJECT;
692+
693+ if (command_queue->context != buffer->context)
694+ return CL_INVALID_CONTEXT;
695+
696+ if ((ptr == NULL) ||
697+ (buffer_origin == NULL) ||
698+ (host_origin == NULL) ||
699+ (region == NULL))
700+ return CL_INVALID_VALUE;
701+
702+ if ((region[0]*region[1]*region[2] > 0) &&
703+ (buffer_origin[0] + region[0]-1 +
704+ buffer_row_pitch * (buffer_origin[1] + region[1]-1) +
705+ buffer_slice_pitch * (buffer_origin[2] + region[2]-1) >= buffer->size))
706+ {
707+#warning "TODO"
708+ printf("bo=[%d,%d,%d]\n"
709+ "ho=[%d,%d,%d]\n"
710+ "re=[%d,%d,%d]\n"
711+ "bp=[,%d,%d]\n"
712+ "hp=[,%d,%d]\n"
713+ "bs=[%d]\n",
714+ (int)buffer_origin[0], (int)buffer_origin[1], (int)buffer_origin[2],
715+ (int)host_origin[0], (int)host_origin[1], (int)host_origin[2],
716+ (int)region[0], (int)region[1], (int)region[2],
717+ (int)buffer_row_pitch, (int)buffer_slice_pitch,
718+ (int)host_row_pitch, (int)host_slice_pitch,
719+ (int)buffer->size);
720+ return CL_INVALID_VALUE;
721+ }
722+
723+ device_id = command_queue->device;
724+ for (i = 0; i < command_queue->context->num_devices; ++i)
725+ {
726+ if (command_queue->context->devices[i] == device_id)
727+ break;
728+ }
729+
730+ assert(i < command_queue->context->num_devices);
731+
732+ device_id->write_rect(device_id->data, ptr, buffer->device_ptrs[i],
733+ buffer_origin, host_origin, region,
734+ buffer_row_pitch, buffer_slice_pitch,
735+ host_row_pitch, host_slice_pitch);
736+
737+ return CL_SUCCESS;
738+}
739
740=== modified file 'lib/CL/clFinish.c'
741--- lib/CL/clFinish.c 2011-12-05 22:08:57 +0000
742+++ lib/CL/clFinish.c 2011-12-18 04:51:24 +0000
743@@ -24,7 +24,7 @@
744 #include "pocl_cl.h"
745
746 CL_API_ENTRY cl_int CL_API_CALL
747-clFinish(cl_command_queue command_queue)
748+clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0
749 {
750 /* All operations are serialised, so we never have to wait */
751 return CL_SUCCESS;
752
753=== modified file 'lib/CL/clGetContextInfo.c'
754--- lib/CL/clGetContextInfo.c 2011-10-14 10:31:27 +0000
755+++ lib/CL/clGetContextInfo.c 2011-12-18 04:51:24 +0000
756@@ -33,28 +33,20 @@
757 size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
758 {
759 size_t value_size;
760-
761+
762 switch (param_name) {
763- default:
764- assert(0);
765+
766 case CL_CONTEXT_DEVICES:
767- value_size = context->num_devices * sizeof(cl_device_id);
768- }
769-
770- if (param_value_size_ret != NULL)
771- *param_value_size_ret = value_size;
772-
773- if (param_value != NULL) {
774- if (value_size > param_value_size)
775- return CL_INVALID_VALUE;
776-
777- switch (param_name) {
778- default:
779- assert(0);
780- case CL_CONTEXT_DEVICES:
781- memcpy(param_value, context->devices, value_size);
782+ {
783+ value_size = context->num_devices * sizeof(cl_device_id);
784+ if (param_value != NULL) {
785+ if (param_value_size < value_size)
786+ return CL_INVALID_VALUE;
787+ memcpy(param_value, context->devices, value_size);
788+ }
789+ if (param_value_size_ret != NULL)
790+ *param_value_size_ret = value_size;
791+ return CL_SUCCESS;
792 }
793 }
794-
795- return CL_SUCCESS;
796 }
797
798=== modified file 'lib/CL/clGetDeviceIDs.c'
799--- lib/CL/clGetDeviceIDs.c 2011-12-06 03:00:30 +0000
800+++ lib/CL/clGetDeviceIDs.c 2011-12-18 04:51:24 +0000
801@@ -23,6 +23,7 @@
802
803 #include "pocl_cl.h"
804 #include "devices/devices.h"
805+#include <string.h>
806
807 /* Note: this is a kludge. This will require a thorough re-write when pocl
808 * supports multiple devices
809@@ -34,41 +35,35 @@
810 cl_device_id * devices,
811 cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0
812 {
813- int num = 0;
814+ int num;
815+ int i;
816
817- // TODO: OpenCL API specification allows implementation dependant behaviour
818- // if platform == NULL. Should we just allow for it?
819+ /* TODO: OpenCL API specification allows implementation dependent
820+ behaviour if platform == NULL. Should we just allow it? */
821 if (platform == NULL || ( platform->magic != 42 ))
822 return CL_INVALID_PLATFORM;
823-
824- // Currently - POCL supports only the host device - i.e. a CPU
825+
826+ /* Currently POCL supports only the host device, i.e. a CPU */
827 if ((device_type & CL_DEVICE_TYPE_CPU) ||
828 (device_type & CL_DEVICE_TYPE_DEFAULT))
829 num = 1;
830- else if ((device_type | CL_DEVICE_TYPE_GPU) ||
831- (device_type | CL_DEVICE_TYPE_ACCELERATOR))
832+ else if ((device_type & CL_DEVICE_TYPE_GPU) ||
833+ (device_type & CL_DEVICE_TYPE_ACCELERATOR))
834 num = 0;
835 else
836 return CL_INVALID_DEVICE_TYPE;
837-
838- // no room for any response
839- if (devices == NULL && num_devices == NULL)
840- return CL_INVALID_VALUE;
841-
842- // user forgot to allocate space for response
843- if (num_entries > 0 && devices == NULL )
844- return CL_INVALID_VALUE;
845-
846-
847+
848+ if (devices != NULL) {
849+ if (num < num_entries)
850+ return CL_INVALID_VALUE;
851+
852+ for (i=0; i<num; ++i)
853+ devices[i] = &pocl_devices[i];
854+ }
855+
856 if (num_devices != NULL)
857 *num_devices = num;
858-
859- if (num_entries > 0 && devices!= NULL)
860- {
861- if (num)
862- devices[0] = &pocl_devices[0];
863- }
864-
865+
866 if (num > 0)
867 return CL_SUCCESS;
868 else
869
870=== modified file 'lib/CL/clGetDeviceInfo.c'
871--- lib/CL/clGetDeviceInfo.c 2011-12-02 13:09:02 +0000
872+++ lib/CL/clGetDeviceInfo.c 2011-12-18 04:51:24 +0000
873@@ -24,13 +24,175 @@
874 #include "pocl_cl.h"
875
876 CL_API_ENTRY cl_int CL_API_CALL
877-clGetDeviceInfo(cl_device_id device,
878- cl_device_info param_name,
879- size_t param_value_size,
880- void * param_value,
881- size_t * param_value_size_ret ) CL_API_SUFFIX__VERSION_1_0
882+clGetDeviceInfo(cl_device_id device,
883+ cl_device_info param_name,
884+ size_t param_value_size,
885+ void * param_value,
886+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
887 {
888- // TODO: dig up the info
889- return CL_INVALID_VALUE;
890+ switch (param_name)
891+ {
892+ case CL_DEVICE_TYPE:
893+ {
894+ size_t const value_size = sizeof(cl_device_type);
895+ if (param_value)
896+ {
897+ if (param_value_size < value_size) return CL_INVALID_VALUE;
898+ *(cl_device_type*)param_value = device->type;
899+ }
900+ if (param_value_size_ret)
901+ *param_value_size_ret = value_size;
902+ return CL_SUCCESS;
903+ }
904+
905+ case CL_DEVICE_VENDOR_ID : break;
906+ case CL_DEVICE_MAX_COMPUTE_UNITS : break;
907+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : break;
908+ case CL_DEVICE_MAX_WORK_GROUP_SIZE : break;
909+ case CL_DEVICE_MAX_WORK_ITEM_SIZES : break;
910+
911+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
912+ {
913+ size_t const value_size = sizeof(cl_uint);
914+ if (param_value)
915+ {
916+ if (param_value_size < value_size) return CL_INVALID_VALUE;
917+ *(cl_uint*)param_value = device->preferred_vector_width_char;
918+ }
919+ if (param_value_size_ret)
920+ *param_value_size_ret = value_size;
921+ return CL_SUCCESS;
922+ }
923+
924+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
925+ {
926+ size_t const value_size = sizeof(cl_uint);
927+ if (param_value)
928+ {
929+ if (param_value_size < value_size) return CL_INVALID_VALUE;
930+ *(cl_uint*)param_value = device->preferred_vector_width_short;
931+ }
932+ if (param_value_size_ret)
933+ *param_value_size_ret = value_size;
934+ return CL_SUCCESS;
935+ }
936+
937+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
938+ {
939+ size_t const value_size = sizeof(cl_uint);
940+ if (param_value)
941+ {
942+ if (param_value_size < value_size) return CL_INVALID_VALUE;
943+ *(cl_uint*)param_value = device->preferred_vector_width_int;
944+ }
945+ if (param_value_size_ret)
946+ *param_value_size_ret = value_size;
947+ return CL_SUCCESS;
948+ }
949+
950+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
951+ {
952+ size_t const value_size = sizeof(cl_uint);
953+ if (param_value)
954+ {
955+ if (param_value_size < value_size) return CL_INVALID_VALUE;
956+ *(cl_uint*)param_value = device->preferred_vector_width_long;
957+ }
958+ if (param_value_size_ret)
959+ *param_value_size_ret = value_size;
960+ return CL_SUCCESS;
961+ }
962+
963+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
964+ {
965+ size_t const value_size = sizeof(cl_uint);
966+ if (param_value)
967+ {
968+ if (param_value_size < value_size) return CL_INVALID_VALUE;
969+ *(cl_uint*)param_value = device->preferred_vector_width_float;
970+ }
971+ if (param_value_size_ret)
972+ *param_value_size_ret = value_size;
973+ return CL_SUCCESS;
974+ }
975+
976+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
977+ {
978+ size_t const value_size = sizeof(cl_uint);
979+ if (param_value)
980+ {
981+ if (param_value_size < value_size) return CL_INVALID_VALUE;
982+ *(cl_uint*)param_value = device->preferred_vector_width_double;
983+ }
984+ if (param_value_size_ret)
985+ *param_value_size_ret = value_size;
986+ return CL_SUCCESS;
987+ }
988+
989+ case CL_DEVICE_MAX_CLOCK_FREQUENCY : break;
990+ case CL_DEVICE_ADDRESS_BITS : break;
991+ case CL_DEVICE_MAX_READ_IMAGE_ARGS : break;
992+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS : break;
993+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE : break;
994+ case CL_DEVICE_IMAGE2D_MAX_WIDTH : break;
995+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT : break;
996+ case CL_DEVICE_IMAGE3D_MAX_WIDTH : break;
997+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT : break;
998+ case CL_DEVICE_IMAGE3D_MAX_DEPTH : break;
999+ case CL_DEVICE_IMAGE_SUPPORT : break;
1000+ case CL_DEVICE_MAX_PARAMETER_SIZE : break;
1001+ case CL_DEVICE_MAX_SAMPLERS : break;
1002+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN : break;
1003+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE : break;
1004+ case CL_DEVICE_SINGLE_FP_CONFIG : break;
1005+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE : break;
1006+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE : break;
1007+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE : break;
1008+ case CL_DEVICE_GLOBAL_MEM_SIZE : break;
1009+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE : break;
1010+ case CL_DEVICE_MAX_CONSTANT_ARGS : break;
1011+ case CL_DEVICE_LOCAL_MEM_TYPE : break;
1012+ case CL_DEVICE_LOCAL_MEM_SIZE : break;
1013+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT : break;
1014+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION : break;
1015+ case CL_DEVICE_ENDIAN_LITTLE : break;
1016+ case CL_DEVICE_AVAILABLE : break;
1017+ case CL_DEVICE_COMPILER_AVAILABLE : break;
1018+ case CL_DEVICE_EXECUTION_CAPABILITIES : break;
1019+ case CL_DEVICE_QUEUE_PROPERTIES : break;
1020+
1021+ case CL_DEVICE_NAME:
1022+ {
1023+ size_t const value_size = strlen(device->name) + 1;
1024+ if (param_value)
1025+ {
1026+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1027+ memcpy(param_value, device->name, value_size);
1028+ }
1029+ if (param_value_size_ret)
1030+ *param_value_size_ret = value_size;
1031+ return CL_SUCCESS;
1032+ }
1033+
1034+ case CL_DEVICE_VENDOR : break;
1035+ case CL_DRIVER_VERSION : break;
1036+ case CL_DEVICE_PROFILE : break;
1037+ case CL_DEVICE_VERSION : break;
1038+ case CL_DEVICE_EXTENSIONS : break;
1039+ case CL_DEVICE_PLATFORM : break;
1040+ case CL_DEVICE_DOUBLE_FP_CONFIG : break;
1041+ case CL_DEVICE_HALF_FP_CONFIG : break;
1042+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF : break;
1043+ case CL_DEVICE_HOST_UNIFIED_MEMORY : break;
1044+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR : break;
1045+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT : break;
1046+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT : break;
1047+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG : break;
1048+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT : break;
1049+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE : break;
1050+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF : break;
1051+ case CL_DEVICE_OPENCL_C_VERSION : break;
1052+ }
1053+
1054+ return CL_INVALID_VALUE;
1055 }
1056-
1057
1058=== added file 'lib/CL/clGetEventProfilingInfo.c'
1059--- lib/CL/clGetEventProfilingInfo.c 1970-01-01 00:00:00 +0000
1060+++ lib/CL/clGetEventProfilingInfo.c 2011-12-18 04:51:24 +0000
1061@@ -0,0 +1,62 @@
1062+/* OpenCL runtime library: clGetEventProfilingInfo()
1063+
1064+ Copyright (c) 2011 Erik Schnetter
1065+
1066+ Permission is hereby granted, free of charge, to any person obtaining a copy
1067+ of this software and associated documentation files (the "Software"), to deal
1068+ in the Software without restriction, including without limitation the rights
1069+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1070+ copies of the Software, and to permit persons to whom the Software is
1071+ furnished to do so, subject to the following conditions:
1072+
1073+ The above copyright notice and this permission notice shall be included in
1074+ all copies or substantial portions of the Software.
1075+
1076+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1077+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1078+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1079+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1080+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1081+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1082+ THE SOFTWARE.
1083+*/
1084+
1085+#include "pocl_cl.h"
1086+#include <string.h>
1087+
1088+CL_API_ENTRY cl_int CL_API_CALL
1089+clGetEventProfilingInfo(cl_event event,
1090+ cl_profiling_info param_name,
1091+ size_t param_value_size,
1092+ void *param_value,
1093+ size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
1094+{
1095+ size_t const value_size = sizeof(cl_ulong);
1096+ if (param_value)
1097+ {
1098+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1099+
1100+ switch (param_name)
1101+ {
1102+ case CL_PROFILING_COMMAND_QUEUED:
1103+ *(cl_ulong*)param_value = 0; /* TODO: return correct value */
1104+ break;
1105+ case CL_PROFILING_COMMAND_SUBMIT:
1106+ *(cl_ulong*)param_value = 0; /* TODO: return correct value */
1107+ break;
1108+ case CL_PROFILING_COMMAND_START:
1109+ *(cl_ulong*)param_value = 0; /* TODO: return correct value */
1110+ break;
1111+ case CL_PROFILING_COMMAND_END:
1112+ *(cl_ulong*)param_value = 0; /* TODO: return correct value */
1113+ break;
1114+ default:
1115+ return CL_INVALID_VALUE;
1116+ }
1117+ }
1118+
1119+ if (param_value_size_ret)
1120+ *param_value_size_ret = value_size;
1121+
1122+ return CL_SUCCESS;
1123+}
1124
1125=== modified file 'lib/CL/clGetPlatformIDs.c'
1126--- lib/CL/clGetPlatformIDs.c 2011-12-02 13:09:02 +0000
1127+++ lib/CL/clGetPlatformIDs.c 2011-12-18 04:51:24 +0000
1128@@ -36,27 +36,19 @@
1129 cl_platform_id * platforms,
1130 cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0
1131 {
1132-
1133- // user requests only number of platforms - not types
1134- if (num_entries == 0 && num_platforms != NULL)
1135- {
1136- *num_platforms = 1;
1137- return CL_SUCCESS;
1138- }
1139-
1140- // Bad request - no place to store response
1141- if (num_entries > 0 && platforms == NULL)
1142- return CL_INVALID_VALUE;
1143-
1144- // Check required by spec
1145- if (platforms == NULL && num_platforms == NULL)
1146- return CL_INVALID_VALUE;
1147-
1148- // platform is not used now - just mark this platform as 'valid'
1149- platforms[0] = &(_platforms[0]);
1150-
1151- if (num_platforms != NULL)
1152- *num_platforms = 1;
1153-
1154- return CL_SUCCESS;
1155+ int const num = 1;
1156+ int i;
1157+
1158+ if (platforms != NULL) {
1159+ if (num_entries < num)
1160+ return CL_INVALID_VALUE;
1161+
1162+ for (i=0; i<num; ++i)
1163+ platforms[i] = &_platforms[i];
1164+ }
1165+
1166+ if (num_platforms != NULL)
1167+ *num_platforms = num;
1168+
1169+ return CL_SUCCESS;
1170 }
1171
1172=== modified file 'lib/CL/clGetPlatformInfo.c'
1173--- lib/CL/clGetPlatformInfo.c 2011-12-02 13:09:02 +0000
1174+++ lib/CL/clGetPlatformInfo.c 2011-12-18 04:51:24 +0000
1175@@ -31,14 +31,14 @@
1176 void * param_value,
1177 size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
1178 {
1179- char *ret=0;
1180- int retlen;
1181+ const char *ret;
1182+ int retlen;
1183
1184 if (platform == NULL || (platform->magic != 42))
1185 return CL_INVALID_PLATFORM;
1186
1187 switch (param_name)
1188- {
1189+ {
1190 case CL_PLATFORM_PROFILE:
1191 // TODO: figure this out depending on the native execution host.
1192 // assume FULL_PROFILE for now.
1193@@ -59,23 +59,21 @@
1194 break;
1195 default:
1196 return CL_INVALID_VALUE;
1197- }
1198+ }
1199
1200- // Specs say (section 4.1) to "ignore param_value" should it be NULL
1201- if (param_value == NULL)
1202- return CL_SUCCESS;
1203-
1204 // the OpenCL API docs *seem* to count the trailing NULL
1205 retlen = strlen(ret) + 1;
1206
1207- if (param_value_size < retlen)
1208- return CL_INVALID_VALUE;
1209-
1210- strncpy(param_value, ret, retlen);
1211+ if (param_value != NULL)
1212+ {
1213+ if (param_value_size < retlen)
1214+ return CL_INVALID_VALUE;
1215+
1216+ memcpy(param_value, ret, retlen);
1217+ }
1218
1219 if (param_value_size_ret != NULL)
1220- *param_value_size_ret=retlen;
1221+ *param_value_size_ret = retlen;
1222
1223 return CL_SUCCESS;
1224-
1225 }
1226
1227=== modified file 'lib/CL/clGetProgramBuildInfo.c'
1228--- lib/CL/clGetProgramBuildInfo.c 2011-12-02 13:09:02 +0000
1229+++ lib/CL/clGetProgramBuildInfo.c 2011-12-18 04:51:24 +0000
1230@@ -32,31 +32,48 @@
1231 void * param_value,
1232 size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
1233 {
1234- char *retval = "";
1235- int retlen;
1236-
1237- if (program == NULL)
1238- return CL_INVALID_PROGRAM;
1239-
1240- // currently - just stub-implmement this.
1241- // there doesn't seem to exist an "CL_INTERNAL_ERROR" return code :(
1242- if (param_name != CL_PROGRAM_BUILD_LOG)
1243- return CL_INVALID_OPERATION;
1244-
1245- retlen = strlen(retval) + 1;
1246-
1247- if (param_value == NULL)
1248- return CL_SUCCESS;
1249-
1250- if (param_value_size < retlen)
1251- return CL_INVALID_VALUE;
1252-
1253- strncpy(param_value, retval, retlen);
1254-
1255- if (param_value_size_ret != NULL)
1256- *param_value_size_ret = retlen;
1257-
1258- return CL_SUCCESS;
1259+ const char *retval = ""; /* dummy return value */
1260+
1261+ switch (param_name) {
1262+ case CL_PROGRAM_BUILD_STATUS:
1263+ {
1264+ size_t const value_size = strlen(retval) + 1;
1265+ if (param_value)
1266+ {
1267+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1268+ memcpy(param_value, retval, value_size);
1269+ }
1270+ if (param_value_size_ret)
1271+ *param_value_size_ret = value_size;
1272+ return CL_SUCCESS;
1273+ }
1274+
1275+ case CL_PROGRAM_BUILD_OPTIONS:
1276+ {
1277+ size_t const value_size = strlen(retval) + 1;
1278+ if (param_value)
1279+ {
1280+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1281+ memcpy(param_value, retval, value_size);
1282+ }
1283+ if (param_value_size_ret)
1284+ *param_value_size_ret = value_size;
1285+ return CL_SUCCESS;
1286+ }
1287+
1288+ case CL_PROGRAM_BUILD_LOG:
1289+ {
1290+ size_t const value_size = strlen(retval) + 1;
1291+ if (param_value)
1292+ {
1293+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1294+ memcpy(param_value, retval, value_size);
1295+ }
1296+ if (param_value_size_ret)
1297+ *param_value_size_ret = value_size;
1298+ return CL_SUCCESS;
1299+ }
1300+ }
1301+
1302+ return CL_INVALID_VALUE;
1303 }
1304-
1305-
1306
1307=== added file 'lib/CL/clGetProgramInfo.c'
1308--- lib/CL/clGetProgramInfo.c 1970-01-01 00:00:00 +0000
1309+++ lib/CL/clGetProgramInfo.c 2011-12-18 04:51:24 +0000
1310@@ -0,0 +1,89 @@
1311+/* OpenCL runtime library: clGetProgramInfo()
1312+
1313+ Copyright (c) 2011 Erik Schnetter
1314+
1315+ Permission is hereby granted, free of charge, to any person obtaining a copy
1316+ of this software and associated documentation files (the "Software"), to deal
1317+ in the Software without restriction, including without limitation the rights
1318+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1319+ copies of the Software, and to permit persons to whom the Software is
1320+ furnished to do so, subject to the following conditions:
1321+
1322+ The above copyright notice and this permission notice shall be included in
1323+ all copies or substantial portions of the Software.
1324+
1325+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1326+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1327+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1328+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1329+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1330+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1331+ THE SOFTWARE.
1332+*/
1333+
1334+#include "pocl_cl.h"
1335+#include <string.h>
1336+
1337+CL_API_ENTRY cl_int CL_API_CALL
1338+clGetProgramInfo(cl_program program,
1339+ cl_program_info param_name,
1340+ size_t param_value_size,
1341+ void *param_value,
1342+ size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
1343+{
1344+ switch (param_name)
1345+ {
1346+ case CL_PROGRAM_REFERENCE_COUNT:
1347+ return CL_INVALID_VALUE; /* not yet implemented */
1348+
1349+ case CL_PROGRAM_CONTEXT:
1350+ return CL_INVALID_VALUE; /* not yet implemented */
1351+
1352+ case CL_PROGRAM_NUM_DEVICES:
1353+ return CL_INVALID_VALUE; /* not yet implemented */
1354+
1355+ case CL_PROGRAM_DEVICES:
1356+ return CL_INVALID_VALUE; /* not yet implemented */
1357+
1358+ case CL_PROGRAM_SOURCE:
1359+ {
1360+ size_t const value_size = strlen(program->source) + 1;
1361+ if (param_value)
1362+ {
1363+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1364+ memcpy(param_value, program->source, value_size);
1365+ }
1366+ if (param_value_size_ret)
1367+ *param_value_size_ret = value_size;
1368+ return CL_SUCCESS;
1369+ }
1370+
1371+ case CL_PROGRAM_BINARY_SIZES:
1372+ {
1373+ size_t const value_size = sizeof(size_t);
1374+ if (param_value)
1375+ {
1376+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1377+ *(size_t*)param_value = program->binary_size;
1378+ }
1379+ if (param_value_size_ret)
1380+ *param_value_size_ret = value_size;
1381+ return CL_SUCCESS;
1382+ }
1383+
1384+ case CL_PROGRAM_BINARIES:
1385+ {
1386+ size_t const value_size = sizeof(unsigned char *);
1387+ if (param_value)
1388+ {
1389+ if (param_value_size < value_size) return CL_INVALID_VALUE;
1390+ *(unsigned char **)param_value = program->binary;
1391+ }
1392+ if (param_value_size_ret)
1393+ *param_value_size_ret = value_size;
1394+ return CL_SUCCESS;
1395+ }
1396+ }
1397+
1398+ return CL_INVALID_VALUE;
1399+}
1400
1401=== modified file 'lib/CL/devices/native/native.c'
1402--- lib/CL/devices/native/native.c 2011-12-08 02:45:48 +0000
1403+++ lib/CL/devices/native/native.c 2011-12-18 04:51:24 +0000
1404@@ -27,9 +27,13 @@
1405 #include <stdlib.h>
1406 #include <unistd.h>
1407
1408+#define max(a,b) (((a) > (b)) ? (a) : (b))
1409+
1410 #define COMMAND_LENGTH 256
1411 #define WORKGROUP_STRING_LENGTH 128
1412
1413+#define ALIGNMENT (max(ALIGNOF_FLOAT16, ALIGNOF_DOUBLE16))
1414+
1415 struct data {
1416 /* Currently loaded kernel. */
1417 cl_kernel current_kernel;
1418@@ -60,7 +64,7 @@
1419
1420 if (flags & CL_MEM_COPY_HOST_PTR)
1421 {
1422- if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0)
1423+ if (posix_memalign (&b, ALIGNMENT, size) == 0)
1424 {
1425 memcpy (b, host_ptr, size);
1426 return b;
1427@@ -74,7 +78,7 @@
1428 return host_ptr;
1429 }
1430
1431- if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0)
1432+ if (posix_memalign (&b, ALIGNMENT, size) == 0)
1433 return b;
1434
1435 return NULL;
1436
1437=== modified file 'lib/CL/devices/native/native.h'
1438--- lib/CL/devices/native/native.h 2011-12-08 02:45:48 +0000
1439+++ lib/CL/devices/native/native.h 2011-12-18 04:51:24 +0000
1440@@ -94,7 +94,11 @@
1441 pocl_native_malloc, /* malloc */ \
1442 pocl_native_free, /* free */ \
1443 pocl_native_read, /* read */ \
1444+ NULL, /* read_rect */ \
1445 pocl_native_write, /* write */ \
1446+ NULL, /* write_rect */ \
1447+ NULL, /* copy */ \
1448+ NULL, /* copy_rect */ \
1449 pocl_native_run, /* run */ \
1450 NULL /* data */ \
1451 }
1452
1453=== modified file 'lib/CL/devices/pthread/pthread.c'
1454--- lib/CL/devices/pthread/pthread.c 2011-12-08 02:45:48 +0000
1455+++ lib/CL/devices/pthread/pthread.c 2011-12-18 04:51:24 +0000
1456@@ -29,10 +29,13 @@
1457 #include <unistd.h>
1458
1459 #define min(a,b) (((a) < (b)) ? (a) : (b))
1460+#define max(a,b) (((a) > (b)) ? (a) : (b))
1461
1462 #define COMMAND_LENGTH 256
1463 #define WORKGROUP_STRING_LENGTH 128
1464
1465+#define ALIGNMENT (max(ALIGNOF_FLOAT16, ALIGNOF_DOUBLE16))
1466+
1467 /* The name of the environment variable used to force a certain max thread count
1468 for the thread execution. */
1469 #define THREAD_COUNT_ENV "POCL_MAX_PTHREAD_COUNT"
1470@@ -80,7 +83,7 @@
1471
1472 if (flags & CL_MEM_COPY_HOST_PTR)
1473 {
1474- if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0)
1475+ if (posix_memalign (&b, ALIGNMENT, size) == 0)
1476 {
1477 memcpy (b, host_ptr, size);
1478 return b;
1479@@ -94,7 +97,7 @@
1480 return host_ptr;
1481 }
1482
1483- if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0)
1484+ if (posix_memalign (&b, ALIGNMENT, size) == 0)
1485 return b;
1486
1487 return NULL;
1488@@ -118,7 +121,38 @@
1489 memcpy (host_ptr, device_ptr, cb);
1490 }
1491
1492-void pocl_pthread_write (void *data, const void *host_ptr, void *device_ptr, size_t cb)
1493+void
1494+pocl_pthread_read_rect (void *data,
1495+ void *__restrict__ const host_ptr,
1496+ void *__restrict__ const device_ptr,
1497+ const size_t *__restrict__ const buffer_origin,
1498+ const size_t *__restrict__ const host_origin,
1499+ const size_t *__restrict__ const region,
1500+ size_t const buffer_row_pitch,
1501+ size_t const buffer_slice_pitch,
1502+ size_t const host_row_pitch,
1503+ size_t const host_slice_pitch)
1504+{
1505+ char const *__restrict const adjusted_device_ptr =
1506+ (char const*)device_ptr +
1507+ buffer_origin[0] + buffer_row_pitch * (buffer_origin[1] + buffer_slice_pitch * buffer_origin[2]);
1508+ char *__restrict__ const adjusted_host_ptr =
1509+ (char*)host_ptr +
1510+ host_origin[0] + host_row_pitch * (host_origin[1] + host_slice_pitch * host_origin[2]);
1511+
1512+ size_t j, k;
1513+
1514+ /* TODO: handle overlaping regions */
1515+
1516+ for (k = 0; k < region[2]; ++k)
1517+ for (j = 0; j < region[1]; ++j)
1518+ memcpy (adjusted_host_ptr + host_row_pitch * j + host_slice_pitch * k,
1519+ adjusted_device_ptr + buffer_row_pitch * j + buffer_slice_pitch * k,
1520+ region[0]);
1521+}
1522+
1523+void
1524+pocl_pthread_write (void *data, const void *host_ptr, void *device_ptr, size_t cb)
1525 {
1526 if (host_ptr == device_ptr)
1527 return;
1528@@ -126,6 +160,75 @@
1529 memcpy (device_ptr, host_ptr, cb);
1530 }
1531
1532+void
1533+pocl_pthread_write_rect (void *data,
1534+ const void *__restrict__ const host_ptr,
1535+ void *__restrict__ const device_ptr,
1536+ const size_t *__restrict__ const buffer_origin,
1537+ const size_t *__restrict__ const host_origin,
1538+ const size_t *__restrict__ const region,
1539+ size_t const buffer_row_pitch,
1540+ size_t const buffer_slice_pitch,
1541+ size_t const host_row_pitch,
1542+ size_t const host_slice_pitch)
1543+{
1544+ char *__restrict const adjusted_device_ptr =
1545+ (char*)device_ptr +
1546+ buffer_origin[0] + buffer_row_pitch * (buffer_origin[1] + buffer_slice_pitch * buffer_origin[2]);
1547+ char const *__restrict__ const adjusted_host_ptr =
1548+ (char const*)host_ptr +
1549+ host_origin[0] + host_row_pitch * (host_origin[1] + host_slice_pitch * host_origin[2]);
1550+
1551+ size_t j, k;
1552+
1553+ /* TODO: handle overlaping regions */
1554+
1555+ for (k = 0; k < region[2]; ++k)
1556+ for (j = 0; j < region[1]; ++j)
1557+ memcpy (adjusted_device_ptr + buffer_row_pitch * j + buffer_slice_pitch * k,
1558+ adjusted_host_ptr + host_row_pitch * j + host_slice_pitch * k,
1559+ region[0]);
1560+}
1561+
1562+void
1563+pocl_pthread_copy (void *data, const void *src_ptr, const void *dst_ptr, size_t cb)
1564+{
1565+ if (src_ptr == dst_ptr)
1566+ return;
1567+
1568+ memcpy (dst_ptr, src_ptr, cb);
1569+}
1570+
1571+void
1572+pocl_pthread_copy_rect (void *data,
1573+ const void *__restrict const src_ptr,
1574+ void *__restrict__ const dst_ptr,
1575+ const size_t *__restrict__ const src_origin,
1576+ const size_t *__restrict__ const dst_origin,
1577+ const size_t *__restrict__ const region,
1578+ size_t const src_row_pitch,
1579+ size_t const src_slice_pitch,
1580+ size_t const dst_row_pitch,
1581+ size_t const dst_slice_pitch)
1582+{
1583+ char const *__restrict const adjusted_src_ptr =
1584+ (char const*)src_ptr +
1585+ src_origin[0] + src_row_pitch * (src_origin[1] + src_slice_pitch * src_origin[2]);
1586+ char *__restrict__ const adjusted_dst_ptr =
1587+ (char*)dst_ptr +
1588+ dst_origin[0] + dst_row_pitch * (dst_origin[1] + dst_slice_pitch * dst_origin[2]);
1589+
1590+ size_t j, k;
1591+
1592+ /* TODO: handle overlaping regions */
1593+
1594+ for (k = 0; k < region[2]; ++k)
1595+ for (j = 0; j < region[1]; ++j)
1596+ memcpy (adjusted_dst_ptr + dst_row_pitch * j + dst_slice_pitch * k,
1597+ adjusted_src_ptr + src_row_pitch * j + src_slice_pitch * k,
1598+ region[0]);
1599+}
1600+
1601 //#define DEBUG_MT
1602 //#define DEBUG_MAX_THREAD_COUNT
1603 /**
1604
1605=== modified file 'lib/CL/devices/pthread/pthread.h'
1606--- lib/CL/devices/pthread/pthread.h 2011-12-08 02:45:48 +0000
1607+++ lib/CL/devices/pthread/pthread.h 2011-12-18 04:51:24 +0000
1608@@ -31,13 +31,82 @@
1609 size_t size, void *host_ptr);
1610 void pocl_pthread_free (void *data, cl_mem_flags flags, void *ptr);
1611 void pocl_pthread_read (void *data, void *host_ptr, const void *device_ptr, size_t cb);
1612+void pocl_pthread_read_rect (void *data, void *host_ptr, void *device_ptr,
1613+ const size_t *buffer_origin,
1614+ const size_t *host_origin,
1615+ const size_t *region,
1616+ size_t buffer_row_pitch,
1617+ size_t buffer_slice_pitch,
1618+ size_t host_row_pitch,
1619+ size_t host_slice_pitch);
1620 void pocl_pthread_write (void *data, const void *host_ptr, void *device_ptr, size_t cb);
1621+void pocl_pthread_write_rect (void *data, const void *host_ptr, void *device_ptr,
1622+ const size_t *buffer_origin,
1623+ const size_t *host_origin,
1624+ const size_t *region,
1625+ size_t buffer_row_pitch,
1626+ size_t buffer_slice_pitch,
1627+ size_t host_row_pitch,
1628+ size_t host_slice_pitch);
1629+void pocl_pthread_copy (void *data, const void *src_ptr, const void *dst_ptr, size_t cb);
1630+void pocl_pthread_copy_rect (void *data, const void *src_ptr, void *dst_ptr,
1631+ const size_t *src_origin,
1632+ const size_t *dst_origin,
1633+ const size_t *region,
1634+ size_t src_row_pitch,
1635+ size_t src_slice_pitch,
1636+ size_t dst_row_pitch,
1637+ size_t dst_slice_pitch);
1638 void pocl_pthread_run (void *data, const char *bytecode,
1639 cl_kernel kernel,
1640 struct pocl_context *pc);
1641
1642 extern size_t pocl_pthread_max_work_item_sizes[];
1643
1644+/* Determine preferred vector sizes */
1645+#if defined(__AVX__)
1646+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR 16
1647+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT 8
1648+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT 4
1649+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG 2
1650+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT 4
1651+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE 2
1652+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_CHAR 16
1653+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_SHORT 8
1654+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_INT 4
1655+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_LONG 2
1656+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_FLOAT 8
1657+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_DOUBLE 4
1658+#elif defined(__SSE2__)
1659+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR 16
1660+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT 8
1661+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT 4
1662+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG 2
1663+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT 4
1664+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE 2
1665+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_CHAR 16
1666+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_SHORT 8
1667+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_INT 4
1668+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_LONG 2
1669+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_FLOAT 4
1670+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_DOUBLE 2
1671+#else
1672+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR 1
1673+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT 1
1674+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT 1
1675+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG 1
1676+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT 1
1677+# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE 1
1678+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_CHAR 1
1679+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_SHORT 1
1680+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_INT 1
1681+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_LONG 1
1682+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_FLOAT 1
1683+# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_DOUBLE 1
1684+#endif
1685+/* Half is internally represented as short */
1686+#define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_HALF POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT
1687+
1688 #define POCL_DEVICES_PTHREAD { \
1689 CL_DEVICE_TYPE_CPU, /* type */ \
1690 0, /* vendor_id */ \
1691@@ -45,12 +114,12 @@
1692 3, /* max_work_item_dimensions */ \
1693 pocl_pthread_max_work_item_sizes, /* max_work_item_sizes */ \
1694 CL_INT_MAX, /* max_work_group_size */ \
1695- 0, /* preferred_vector_width_char */ \
1696- 0, /* preferred_vector_width_short */ \
1697- 0, /* preferred_vector_width_int */ \
1698- 0, /* preferred_vector_width_long */ \
1699- 0, /* preferred_vector_width_float */ \
1700- 0, /* preferred_vector_width_double */ \
1701+ POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR , /* preferred_vector_width_char */ \
1702+ POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT , /* preferred_vector_width_short */ \
1703+ POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT , /* preferred_vector_width_int */ \
1704+ POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG , /* preferred_vector_width_long */ \
1705+ POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT , /* preferred_vector_width_float */ \
1706+ POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE, /* preferred_vector_width_double */ \
1707 0, /* max_clock_frequency */ \
1708 0, /* address_bits */ \
1709 0, /* max_mem_alloc_size */ \
1710@@ -94,7 +163,11 @@
1711 pocl_pthread_malloc, /* malloc */ \
1712 pocl_pthread_free, /* free */ \
1713 pocl_pthread_read, /* read */ \
1714+ pocl_pthread_read_rect, /* read_rect */ \
1715 pocl_pthread_write, /* write */ \
1716+ pocl_pthread_write_rect, /* write_rect */ \
1717+ pocl_pthread_copy, /* copy */ \
1718+ pocl_pthread_copy_rect, /* copy_rect */ \
1719 pocl_pthread_run, /* run */ \
1720 NULL /* data */ \
1721 }
1722
1723=== modified file 'lib/CL/pocl_cl.h'
1724--- lib/CL/pocl_cl.h 2011-12-08 14:06:07 +0000
1725+++ lib/CL/pocl_cl.h 2011-12-18 04:51:24 +0000
1726@@ -100,7 +100,32 @@
1727 size_t size, void *host_ptr);
1728 void (*free) (void *data, cl_mem_flags flags, void *ptr);
1729 void (*read) (void *data, void *host_ptr, void *device_ptr, size_t cb);
1730+ void (*read_rect) (void *data, void *host_ptr, void *device_ptr,
1731+ const size_t *buffer_origin,
1732+ const size_t *host_origin,
1733+ const size_t *region,
1734+ size_t buffer_row_pitch,
1735+ size_t buffer_slice_pitch,
1736+ size_t host_row_pitch,
1737+ size_t host_slice_pitch);
1738 void (*write) (void *data, const void *host_ptr, void *device_ptr, size_t cb);
1739+ void (*write_rect) (void *data, const void *host_ptr, void *device_ptr,
1740+ const size_t *buffer_origin,
1741+ const size_t *host_origin,
1742+ const size_t *region,
1743+ size_t buffer_row_pitch,
1744+ size_t buffer_slice_pitch,
1745+ size_t host_row_pitch,
1746+ size_t host_slice_pitch);
1747+ void (*copy) (void *data, const void *src_ptr, const void *dst_ptr, size_t cb);
1748+ void (*copy_rect) (void *data, const void *src_ptr, void *dst_ptr,
1749+ const size_t *src_origin,
1750+ const size_t *dst_origin,
1751+ const size_t *region,
1752+ size_t src_row_pitch,
1753+ size_t src_slice_pitch,
1754+ size_t dst_row_pitch,
1755+ size_t dst_slice_pitch);
1756 void (*run) (void *data, const char *bytecode,
1757 cl_kernel kernel,
1758 struct pocl_context *pc);
1759
1760=== modified file 'lib/kernel/all.cl'
1761--- lib/kernel/all.cl 2011-11-24 15:56:10 +0000
1762+++ lib/kernel/all.cl 2011-12-18 04:51:24 +0000
1763@@ -21,123 +21,123 @@
1764 THE SOFTWARE.
1765 */
1766
1767-int __attribute__((__overloadable__)) all(char a)
1768+int _cl_overloadable all(char a)
1769 {
1770 return a < (char)0;
1771 }
1772
1773-int __attribute__((__overloadable__)) all(char2 a)
1774+int _cl_overloadable all(char2 a)
1775 {
1776 return all(a.lo) && all(a.hi);
1777 }
1778
1779-int __attribute__((__overloadable__)) all(char3 a)
1780+int _cl_overloadable all(char3 a)
1781 {
1782 return all(a.s01) && all(a.s2);
1783 }
1784
1785-int __attribute__((__overloadable__)) all(char4 a)
1786-{
1787- return all(a.lo) && all(a.hi);
1788-}
1789-
1790-int __attribute__((__overloadable__)) all(char8 a)
1791-{
1792- return all(a.lo) && all(a.hi);
1793-}
1794-
1795-int __attribute__((__overloadable__)) all(char16 a)
1796-{
1797- return all(a.lo) && all(a.hi);
1798-}
1799-
1800-int __attribute__((__overloadable__)) all(short a)
1801+int _cl_overloadable all(char4 a)
1802+{
1803+ return all(a.lo) && all(a.hi);
1804+}
1805+
1806+int _cl_overloadable all(char8 a)
1807+{
1808+ return all(a.lo) && all(a.hi);
1809+}
1810+
1811+int _cl_overloadable all(char16 a)
1812+{
1813+ return all(a.lo) && all(a.hi);
1814+}
1815+
1816+int _cl_overloadable all(short a)
1817 {
1818 return a < (short)0;
1819 }
1820
1821-int __attribute__((__overloadable__)) all(short2 a)
1822+int _cl_overloadable all(short2 a)
1823 {
1824 return all(a.lo) && all(a.hi);
1825 }
1826
1827-int __attribute__((__overloadable__)) all(short3 a)
1828+int _cl_overloadable all(short3 a)
1829 {
1830 return all(a.s01) && all(a.s2);
1831 }
1832
1833-int __attribute__((__overloadable__)) all(short4 a)
1834-{
1835- return all(a.lo) && all(a.hi);
1836-}
1837-
1838-int __attribute__((__overloadable__)) all(short8 a)
1839-{
1840- return all(a.lo) && all(a.hi);
1841-}
1842-
1843-int __attribute__((__overloadable__)) all(short16 a)
1844-{
1845- return all(a.lo) && all(a.hi);
1846-}
1847-
1848-int __attribute__((__overloadable__)) all(int a)
1849+int _cl_overloadable all(short4 a)
1850+{
1851+ return all(a.lo) && all(a.hi);
1852+}
1853+
1854+int _cl_overloadable all(short8 a)
1855+{
1856+ return all(a.lo) && all(a.hi);
1857+}
1858+
1859+int _cl_overloadable all(short16 a)
1860+{
1861+ return all(a.lo) && all(a.hi);
1862+}
1863+
1864+int _cl_overloadable all(int a)
1865 {
1866 return a < 0;
1867 }
1868
1869-int __attribute__((__overloadable__)) all(int2 a)
1870+int _cl_overloadable all(int2 a)
1871 {
1872 return all(a.lo) && all(a.hi);
1873 }
1874
1875-int __attribute__((__overloadable__)) all(int3 a)
1876+int _cl_overloadable all(int3 a)
1877 {
1878 return all(a.s01) && all(a.s2);
1879 }
1880
1881-int __attribute__((__overloadable__)) all(int4 a)
1882-{
1883- return all(a.lo) && all(a.hi);
1884-}
1885-
1886-int __attribute__((__overloadable__)) all(int8 a)
1887-{
1888- return all(a.lo) && all(a.hi);
1889-}
1890-
1891-int __attribute__((__overloadable__)) all(int16 a)
1892+int _cl_overloadable all(int4 a)
1893+{
1894+ return all(a.lo) && all(a.hi);
1895+}
1896+
1897+int _cl_overloadable all(int8 a)
1898+{
1899+ return all(a.lo) && all(a.hi);
1900+}
1901+
1902+int _cl_overloadable all(int16 a)
1903 {
1904 return all(a.lo) && all(a.hi);
1905 }
1906
1907 #ifdef cles_khr_int64
1908-int __attribute__((__overloadable__)) all(long a)
1909+int _cl_overloadable all(long a)
1910 {
1911 return a < 0L;
1912 }
1913
1914-int __attribute__((__overloadable__)) all(long2 a)
1915+int _cl_overloadable all(long2 a)
1916 {
1917 return all(a.lo) && all(a.hi);
1918 }
1919
1920-int __attribute__((__overloadable__)) all(long3 a)
1921+int _cl_overloadable all(long3 a)
1922 {
1923 return all(a.s01) && all(a.s2);
1924 }
1925
1926-int __attribute__((__overloadable__)) all(long4 a)
1927-{
1928- return all(a.lo) && all(a.hi);
1929-}
1930-
1931-int __attribute__((__overloadable__)) all(long8 a)
1932-{
1933- return all(a.lo) && all(a.hi);
1934-}
1935-
1936-int __attribute__((__overloadable__)) all(long16 a)
1937+int _cl_overloadable all(long4 a)
1938+{
1939+ return all(a.lo) && all(a.hi);
1940+}
1941+
1942+int _cl_overloadable all(long8 a)
1943+{
1944+ return all(a.lo) && all(a.hi);
1945+}
1946+
1947+int _cl_overloadable all(long16 a)
1948 {
1949 return all(a.lo) && all(a.hi);
1950 }
1951
1952=== modified file 'lib/kernel/any.cl'
1953--- lib/kernel/any.cl 2011-11-24 15:56:10 +0000
1954+++ lib/kernel/any.cl 2011-12-18 04:51:24 +0000
1955@@ -21,123 +21,123 @@
1956 THE SOFTWARE.
1957 */
1958
1959-int __attribute__((__overloadable__)) any(char a)
1960+int _cl_overloadable any(char a)
1961 {
1962 return a < (char)0;
1963 }
1964
1965-int __attribute__((__overloadable__)) any(char2 a)
1966+int _cl_overloadable any(char2 a)
1967 {
1968 return any(a.lo) || any(a.hi);
1969 }
1970
1971-int __attribute__((__overloadable__)) any(char3 a)
1972+int _cl_overloadable any(char3 a)
1973 {
1974 return any(a.s01) || any(a.s2);
1975 }
1976
1977-int __attribute__((__overloadable__)) any(char4 a)
1978-{
1979- return any(a.lo) || any(a.hi);
1980-}
1981-
1982-int __attribute__((__overloadable__)) any(char8 a)
1983-{
1984- return any(a.lo) || any(a.hi);
1985-}
1986-
1987-int __attribute__((__overloadable__)) any(char16 a)
1988-{
1989- return any(a.lo) || any(a.hi);
1990-}
1991-
1992-int __attribute__((__overloadable__)) any(short a)
1993+int _cl_overloadable any(char4 a)
1994+{
1995+ return any(a.lo) || any(a.hi);
1996+}
1997+
1998+int _cl_overloadable any(char8 a)
1999+{
2000+ return any(a.lo) || any(a.hi);
2001+}
2002+
2003+int _cl_overloadable any(char16 a)
2004+{
2005+ return any(a.lo) || any(a.hi);
2006+}
2007+
2008+int _cl_overloadable any(short a)
2009 {
2010 return a < (short)0;
2011 }
2012
2013-int __attribute__((__overloadable__)) any(short2 a)
2014+int _cl_overloadable any(short2 a)
2015 {
2016 return any(a.lo) || any(a.hi);
2017 }
2018
2019-int __attribute__((__overloadable__)) any(short3 a)
2020+int _cl_overloadable any(short3 a)
2021 {
2022 return any(a.s01) || any(a.s2);
2023 }
2024
2025-int __attribute__((__overloadable__)) any(short4 a)
2026-{
2027- return any(a.lo) || any(a.hi);
2028-}
2029-
2030-int __attribute__((__overloadable__)) any(short8 a)
2031-{
2032- return any(a.lo) || any(a.hi);
2033-}
2034-
2035-int __attribute__((__overloadable__)) any(short16 a)
2036-{
2037- return any(a.lo) || any(a.hi);
2038-}
2039-
2040-int __attribute__((__overloadable__)) any(int a)
2041+int _cl_overloadable any(short4 a)
2042+{
2043+ return any(a.lo) || any(a.hi);
2044+}
2045+
2046+int _cl_overloadable any(short8 a)
2047+{
2048+ return any(a.lo) || any(a.hi);
2049+}
2050+
2051+int _cl_overloadable any(short16 a)
2052+{
2053+ return any(a.lo) || any(a.hi);
2054+}
2055+
2056+int _cl_overloadable any(int a)
2057 {
2058 return a < 0;
2059 }
2060
2061-int __attribute__((__overloadable__)) any(int2 a)
2062+int _cl_overloadable any(int2 a)
2063 {
2064 return any(a.lo) || any(a.hi);
2065 }
2066
2067-int __attribute__((__overloadable__)) any(int3 a)
2068+int _cl_overloadable any(int3 a)
2069 {
2070 return any(a.s01) || any(a.s2);
2071 }
2072
2073-int __attribute__((__overloadable__)) any(int4 a)
2074-{
2075- return any(a.lo) || any(a.hi);
2076-}
2077-
2078-int __attribute__((__overloadable__)) any(int8 a)
2079-{
2080- return any(a.lo) || any(a.hi);
2081-}
2082-
2083-int __attribute__((__overloadable__)) any(int16 a)
2084+int _cl_overloadable any(int4 a)
2085+{
2086+ return any(a.lo) || any(a.hi);
2087+}
2088+
2089+int _cl_overloadable any(int8 a)
2090+{
2091+ return any(a.lo) || any(a.hi);
2092+}
2093+
2094+int _cl_overloadable any(int16 a)
2095 {
2096 return any(a.lo) || any(a.hi);
2097 }
2098
2099 #ifdef cles_khr_int64
2100-int __attribute__((__overloadable__)) any(long a)
2101+int _cl_overloadable any(long a)
2102 {
2103 return a < 0L;
2104 }
2105
2106-int __attribute__((__overloadable__)) any(long2 a)
2107+int _cl_overloadable any(long2 a)
2108 {
2109 return any(a.lo) || any(a.hi);
2110 }
2111
2112-int __attribute__((__overloadable__)) any(long3 a)
2113+int _cl_overloadable any(long3 a)
2114 {
2115 return any(a.s01) || any(a.s2);
2116 }
2117
2118-int __attribute__((__overloadable__)) any(long4 a)
2119-{
2120- return any(a.lo) || any(a.hi);
2121-}
2122-
2123-int __attribute__((__overloadable__)) any(long8 a)
2124-{
2125- return any(a.lo) || any(a.hi);
2126-}
2127-
2128-int __attribute__((__overloadable__)) any(long16 a)
2129+int _cl_overloadable any(long4 a)
2130+{
2131+ return any(a.lo) || any(a.hi);
2132+}
2133+
2134+int _cl_overloadable any(long8 a)
2135+{
2136+ return any(a.lo) || any(a.hi);
2137+}
2138+
2139+int _cl_overloadable any(long16 a)
2140 {
2141 return any(a.lo) || any(a.hi);
2142 }
2143
2144=== added file 'lib/kernel/arm/vload_half.cl'
2145--- lib/kernel/arm/vload_half.cl 1970-01-01 00:00:00 +0000
2146+++ lib/kernel/arm/vload_half.cl 2011-12-18 04:51:24 +0000
2147@@ -0,0 +1,109 @@
2148+/* OpenCL built-in library: vload_half()
2149+
2150+ Copyright (c) 2011 Universidad Rey Juan Carlos
2151+
2152+ Permission is hereby granted, free of charge, to any person obtaining a copy
2153+ of this software and associated documentation files (the "Software"), to deal
2154+ in the Software without restriction, including without limitation the rights
2155+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
2156+ copies of the Software, and to permit persons to whom the Software is
2157+ furnished to do so, subject to the following conditions:
2158+
2159+ The above copyright notice and this permission notice shall be included in
2160+ all copies or substantial portions of the Software.
2161+
2162+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2163+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2164+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2165+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2166+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2167+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2168+ THE SOFTWARE.
2169+*/
2170+
2171+
2172+
2173+#define IMPLEMENT_VLOAD_HALF(MOD) \
2174+ \
2175+ float _cl_overloadable \
2176+ vload_half(size_t offset, const MOD half *p) \
2177+ { \
2178+ return p[offset]; \
2179+ } \
2180+ \
2181+ float2 _cl_overloadable \
2182+ vload_half2(size_t offset, const MOD half *p) \
2183+ { \
2184+ return (float2)(vload_half(0, &p[offset*2]), \
2185+ vload_half(0, &p[offset*2+1])); \
2186+ } \
2187+ \
2188+ float3 _cl_overloadable \
2189+ vload_half3(size_t offset, const MOD half *p) \
2190+ { \
2191+ return (float3)(vload_half2(0, &p[offset*3]), \
2192+ vload_half(0, &p[offset*3+2])); \
2193+ } \
2194+ \
2195+ float4 _cl_overloadable \
2196+ vload_half4(size_t offset, const MOD half *p) \
2197+ { \
2198+ return (float4)(vload_half2(0, &p[offset*4]), \
2199+ vload_half2(0, &p[offset*4+2])); \
2200+ } \
2201+ \
2202+ float8 _cl_overloadable \
2203+ vload_half8(size_t offset, const MOD half *p) \
2204+ { \
2205+ return (float8)(vload_half4(0, &p[offset*8]), \
2206+ vload_half4(0, &p[offset*8+4])); \
2207+ } \
2208+ \
2209+ float16 _cl_overloadable \
2210+ vload_half16(size_t offset, const MOD half *p) \
2211+ { \
2212+ return (float16)(vload_half8(0, &p[offset*16]), \
2213+ vload_half8(0, &p[offset*16+8])); \
2214+ } \
2215+ \
2216+ float2 _cl_overloadable \
2217+ vloada_half2(size_t offset, const MOD half *p) \
2218+ { \
2219+ return (float2)(vload_half(0, &p[offset*2]), \
2220+ vload_half(0, &p[offset*2+1])); \
2221+ } \
2222+ \
2223+ float3 _cl_overloadable \
2224+ vloada_half3(size_t offset, const MOD half *p) \
2225+ { \
2226+ return (float3)(vloada_half2(0, &p[offset*4]), \
2227+ vload_half(0, &p[offset*4+2])); \
2228+ } \
2229+ \
2230+ float4 _cl_overloadable \
2231+ vloada_half4(size_t offset, const MOD half *p) \
2232+ { \
2233+ return (float4)(vloada_half2(0, &p[offset*4]), \
2234+ vloada_half2(0, &p[offset*4+2])); \
2235+ } \
2236+ \
2237+ float8 _cl_overloadable \
2238+ vloada_half8(size_t offset, const MOD half *p) \
2239+ { \
2240+ return (float8)(vloada_half4(0, &p[offset*8]), \
2241+ vloada_half4(0, &p[offset*8+4])); \
2242+ } \
2243+ \
2244+ float16 _cl_overloadable \
2245+ vloada_half16(size_t offset, const MOD half *p) \
2246+ { \
2247+ return (float16)(vloada_half8(0, &p[offset*16]), \
2248+ vloada_half8(0, &p[offset*16+8])); \
2249+ }
2250+
2251+
2252+
2253+IMPLEMENT_VLOAD_HALF(__global)
2254+IMPLEMENT_VLOAD_HALF(__local)
2255+IMPLEMENT_VLOAD_HALF(__constant)
2256+/* IMPLEMENT_VLOAD_HALF(__private) */
2257
2258=== added file 'lib/kernel/arm/vstore_half.cl'
2259--- lib/kernel/arm/vstore_half.cl 1970-01-01 00:00:00 +0000
2260+++ lib/kernel/arm/vstore_half.cl 2011-12-18 04:51:24 +0000
2261@@ -0,0 +1,120 @@
2262+/* OpenCL built-in library: vstore_half()
2263+
2264+ Copyright (c) 2011 Universidad Rey Juan Carlos
2265+
2266+ Permission is hereby granted, free of charge, to any person obtaining a copy
2267+ of this software and associated documentation files (the "Software"), to deal
2268+ in the Software without restriction, including without limitation the rights
2269+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
2270+ copies of the Software, and to permit persons to whom the Software is
2271+ furnished to do so, subject to the following conditions:
2272+
2273+ The above copyright notice and this permission notice shall be included in
2274+ all copies or substantial portions of the Software.
2275+
2276+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2277+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2278+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2279+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2280+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2281+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2282+ THE SOFTWARE.
2283+*/
2284+
2285+
2286+
2287+#define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX) \
2288+ \
2289+ void _cl_overloadable \
2290+ vstore_half##SUFFIX(float data, size_t offset, MOD half *p) \
2291+ { \
2292+ p[offset] = data; \
2293+ } \
2294+ \
2295+ void _cl_overloadable \
2296+ vstore_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \
2297+ { \
2298+ vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \
2299+ vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \
2300+ } \
2301+ \
2302+ void _cl_overloadable \
2303+ vstore_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \
2304+ { \
2305+ vstore_half2##SUFFIX(data.lo, 0, &p[offset*3]); \
2306+ vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \
2307+ } \
2308+ \
2309+ void _cl_overloadable \
2310+ vstore_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \
2311+ { \
2312+ vstore_half2##SUFFIX(data.lo, 0, &p[offset*4]); \
2313+ vstore_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \
2314+ } \
2315+ \
2316+ void _cl_overloadable \
2317+ vstore_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \
2318+ { \
2319+ vstore_half4##SUFFIX(data.lo, 0, &p[offset*8]); \
2320+ vstore_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \
2321+ } \
2322+ \
2323+ void _cl_overloadable \
2324+ vstore_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \
2325+ { \
2326+ vstore_half8##SUFFIX(data.lo, 0, &p[offset*16]); \
2327+ vstore_half8##SUFFIX(data.hi, 0, &p[offset*16+8]); \
2328+ } \
2329+ \
2330+ void _cl_overloadable \
2331+ vstorea_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \
2332+ { \
2333+ vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \
2334+ vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \
2335+ } \
2336+ \
2337+ void _cl_overloadable \
2338+ vstorea_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \
2339+ { \
2340+ vstorea_half2##SUFFIX(data.lo, 0, &p[offset*3]); \
2341+ vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \
2342+ } \
2343+ \
2344+ void _cl_overloadable \
2345+ vstorea_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \
2346+ { \
2347+ vstorea_half2##SUFFIX(data.lo, 0, &p[offset*4]); \
2348+ vstorea_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \
2349+ } \
2350+ \
2351+ void _cl_overloadable \
2352+ vstorea_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \
2353+ { \
2354+ vstorea_half4##SUFFIX(data.lo, 0, &p[offset*8]); \
2355+ vstorea_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \
2356+ } \
2357+ \
2358+ void _cl_overloadable \
2359+ vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \
2360+ { \
2361+ vstorea_half8##SUFFIX(data.lo, 0, &p[offset*16]); \
2362+ vstorea_half8##SUFFIX(data.hi, 0, &p[offset*16+8]); \
2363+ }
2364+
2365+
2366+
2367+IMPLEMENT_VSTORE_HALF(__global , )
2368+IMPLEMENT_VSTORE_HALF(__global , _rte)
2369+IMPLEMENT_VSTORE_HALF(__global , _rtz)
2370+IMPLEMENT_VSTORE_HALF(__global , _rtp)
2371+IMPLEMENT_VSTORE_HALF(__global , _rtn)
2372+IMPLEMENT_VSTORE_HALF(__local , )
2373+IMPLEMENT_VSTORE_HALF(__local , _rte)
2374+IMPLEMENT_VSTORE_HALF(__local , _rtz)
2375+IMPLEMENT_VSTORE_HALF(__local , _rtp)
2376+IMPLEMENT_VSTORE_HALF(__local , _rtn)
2377+/* IMPLEMENT_VSTORE_HALF(__private , ) */
2378+/* IMPLEMENT_VSTORE_HALF(__private , _rte) */
2379+/* IMPLEMENT_VSTORE_HALF(__private , _rtz) */
2380+/* IMPLEMENT_VSTORE_HALF(__private , _rtp) */
2381+/* IMPLEMENT_VSTORE_HALF(__private , _rtn) */
2382
2383=== modified file 'lib/kernel/as_type.cl'
2384--- lib/kernel/as_type.cl 2011-11-25 17:02:42 +0000
2385+++ lib/kernel/as_type.cl 2011-12-18 04:51:24 +0000
2386@@ -22,7 +22,7 @@
2387 */
2388
2389 #define DEFINE_AS_TYPE(SRC, DST) \
2390- DST __attribute__ ((__overloadable__)) \
2391+ DST _cl_overloadable \
2392 as_##DST(SRC a) \
2393 { \
2394 return *(DST*)&a; \
2395
2396=== modified file 'lib/kernel/convert_type.cl'
2397--- lib/kernel/convert_type.cl 2011-11-25 16:47:53 +0000
2398+++ lib/kernel/convert_type.cl 2011-12-18 04:51:24 +0000
2399@@ -24,28 +24,28 @@
2400 #include "templates.h"
2401
2402 /* SRC and DST must be scalars */
2403-#define DEFINE_CONVERT_TYPE(SRC, DST) \
2404- DST __attribute__ ((__overloadable__)) convert_##DST(SRC a) \
2405- { \
2406- return (DST)a; \
2407+#define DEFINE_CONVERT_TYPE(SRC, DST) \
2408+ DST _cl_overloadable convert_##DST(SRC a) \
2409+ { \
2410+ return (DST)a; \
2411 }
2412
2413 /* implementing vector SRC and DST in terms of scalars */
2414 #define DEFINE_CONVERT_TYPE_HALF(SRC, DST, HALFDST) \
2415- DST __attribute__ ((__overloadable__)) convert_##DST(SRC a) \
2416+ DST _cl_overloadable convert_##DST(SRC a) \
2417 { \
2418 return (DST)(convert_##HALFDST(a.lo), convert_##HALFDST(a.hi)); \
2419 }
2420
2421 #define DEFINE_CONVERT_TYPE_012(SRC, DST, DST01, DST2) \
2422- DST __attribute__ ((__overloadable__)) convert_##DST(SRC a) \
2423+ DST _cl_overloadable convert_##DST(SRC a) \
2424 { \
2425 return (DST)(convert_##DST01(a.s01), convert_##DST2(a.s2)); \
2426 }
2427
2428 /* SRC and DST may be vectors */
2429 #define DEFINE_CONVERT_TYPE_SAT(SRC, DST, SIZE) \
2430- DST##SIZE __attribute__ ((__overloadable__)) \
2431+ DST##SIZE _cl_overloadable \
2432 convert_##DST##SIZE##_sat(SRC##SIZE a) \
2433 { \
2434 int const src_float = (SRC)0.1f > (SRC)0; \
2435
2436=== modified file 'lib/kernel/cross.cl'
2437--- lib/kernel/cross.cl 2011-11-01 16:33:17 +0000
2438+++ lib/kernel/cross.cl 2011-12-18 04:51:24 +0000
2439@@ -21,12 +21,12 @@
2440 THE SOFTWARE.
2441 */
2442
2443-float4 __attribute__ ((__overloadable__)) cross(float4 a, float4 b)
2444+float4 _cl_overloadable cross(float4 a, float4 b)
2445 {
2446 return (float4)(cross(a.xyz, b.xyz), 0.0f);
2447 }
2448
2449-float3 __attribute__ ((__overloadable__)) cross(float3 a, float3 b)
2450+float3 _cl_overloadable cross(float3 a, float3 b)
2451 {
2452 return (float3)(a.y * b.z - a.z * b.y,
2453 a.z * b.x - a.x * b.z,
2454@@ -34,12 +34,12 @@
2455 }
2456
2457 #ifdef cl_khr_fp64
2458-double4 __attribute__ ((__overloadable__)) cross(double4 a, double4 b)
2459+double4 _cl_overloadable cross(double4 a, double4 b)
2460 {
2461 return (double4)(cross(a.xyz, b.xyz), 0.0f);
2462 }
2463
2464-double3 __attribute__ ((__overloadable__)) cross(double3 a, double3 b)
2465+double3 _cl_overloadable cross(double3 a, double3 b)
2466 {
2467 return (double3)(a.y * b.z - a.z * b.y,
2468 a.z * b.x - a.x * b.z,
2469
2470=== modified file 'lib/kernel/dot.cl'
2471--- lib/kernel/dot.cl 2011-11-01 16:33:17 +0000
2472+++ lib/kernel/dot.cl 2011-12-18 04:51:24 +0000
2473@@ -21,63 +21,63 @@
2474 THE SOFTWARE.
2475 */
2476
2477-float __attribute__ ((__overloadable__)) dot(float a, float b)
2478+float _cl_overloadable dot(float a, float b)
2479 {
2480 return a * b;
2481 }
2482
2483-float __attribute__ ((__overloadable__)) dot(float2 a, float2 b)
2484+float _cl_overloadable dot(float2 a, float2 b)
2485 {
2486 return a.lo * b.lo + a.hi * b.hi;
2487 }
2488
2489-float __attribute__ ((__overloadable__)) dot(float3 a, float3 b)
2490+float _cl_overloadable dot(float3 a, float3 b)
2491 {
2492 return dot(a.s01, b.s01) + a.s2 * b.s2;
2493 }
2494
2495-float __attribute__ ((__overloadable__)) dot(float4 a, float4 b)
2496-{
2497- return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2498-}
2499-
2500-float __attribute__ ((__overloadable__)) dot(float8 a, float8 b)
2501-{
2502- return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2503-}
2504-
2505-float __attribute__ ((__overloadable__)) dot(float16 a, float16 b)
2506+float _cl_overloadable dot(float4 a, float4 b)
2507+{
2508+ return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2509+}
2510+
2511+float _cl_overloadable dot(float8 a, float8 b)
2512+{
2513+ return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2514+}
2515+
2516+float _cl_overloadable dot(float16 a, float16 b)
2517 {
2518 return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2519 }
2520
2521 #ifdef cl_khr_fp64
2522-double __attribute__ ((__overloadable__)) dot(double a, double b)
2523+double _cl_overloadable dot(double a, double b)
2524 {
2525 return a * b;
2526 }
2527
2528-double __attribute__ ((__overloadable__)) dot(double2 a, double2 b)
2529+double _cl_overloadable dot(double2 a, double2 b)
2530 {
2531 return a.lo * b.lo + a.hi * b.hi;
2532 }
2533
2534-double __attribute__ ((__overloadable__)) dot(double3 a, double3 b)
2535+double _cl_overloadable dot(double3 a, double3 b)
2536 {
2537 return dot(a.s01, b.s01) + a.s2 * b.s2;
2538 }
2539
2540-double __attribute__ ((__overloadable__)) dot(double4 a, double4 b)
2541-{
2542- return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2543-}
2544-
2545-double __attribute__ ((__overloadable__)) dot(double8 a, double8 b)
2546-{
2547- return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2548-}
2549-
2550-double __attribute__ ((__overloadable__)) dot(double16 a, double16 b)
2551+double _cl_overloadable dot(double4 a, double4 b)
2552+{
2553+ return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2554+}
2555+
2556+double _cl_overloadable dot(double8 a, double8 b)
2557+{
2558+ return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2559+}
2560+
2561+double _cl_overloadable dot(double16 a, double16 b)
2562 {
2563 return dot(a.lo, b.lo) + dot(a.hi, b.hi);
2564 }
2565
2566=== modified file 'lib/kernel/upsample.cl'
2567--- lib/kernel/upsample.cl 2011-11-24 15:56:10 +0000
2568+++ lib/kernel/upsample.cl 2011-12-18 04:51:24 +0000
2569@@ -25,7 +25,7 @@
2570 // convert_* function calls
2571
2572 #define IMPLEMENT_UPSAMPLE_LG_GUG(GTYPE, SGTYPE, UGTYPE, LGTYPE) \
2573- LGTYPE __attribute__ ((__overloadable__)) \
2574+ LGTYPE _cl_overloadable \
2575 upsample(GTYPE a, UGTYPE b) \
2576 { \
2577 int bits = CHAR_BIT * sizeof(SGTYPE); \
2578
2579=== modified file 'lib/kernel/vload.cl'
2580--- lib/kernel/vload.cl 2011-12-14 23:01:01 +0000
2581+++ lib/kernel/vload.cl 2011-12-18 04:51:24 +0000
2582@@ -27,31 +27,31 @@
2583
2584 #define IMPLEMENT_VLOAD(TYPE, MOD) \
2585 \
2586- TYPE##2 __attribute__ ((__overloadable__)) \
2587+ TYPE##2 _cl_overloadable \
2588 vload2(size_t offset, const MOD TYPE *p) \
2589 { \
2590 return (TYPE##2)(p[offset*2], p[offset*2+1]); \
2591 } \
2592 \
2593- TYPE##3 __attribute__ ((__overloadable__)) \
2594+ TYPE##3 _cl_overloadable \
2595 vload3(size_t offset, const MOD TYPE *p) \
2596 { \
2597 return (TYPE##3)(vload2(0, &p[offset*3]), p[offset*3+2]); \
2598 } \
2599 \
2600- TYPE##4 __attribute__ ((__overloadable__)) \
2601+ TYPE##4 _cl_overloadable \
2602 vload4(size_t offset, const MOD TYPE *p) \
2603 { \
2604 return (TYPE##4)(vload2(0, &p[offset*4]), vload2(0, &p[offset*4+2])); \
2605 } \
2606 \
2607- TYPE##8 __attribute__ ((__overloadable__)) \
2608+ TYPE##8 _cl_overloadable \
2609 vload8(size_t offset, const MOD TYPE *p) \
2610 { \
2611 return (TYPE##8)(vload4(0, &p[offset*8]), vload4(0, &p[offset*8+4])); \
2612 } \
2613 \
2614- TYPE##16 __attribute__ ((__overloadable__)) \
2615+ TYPE##16 _cl_overloadable \
2616 vload16(size_t offset, const MOD TYPE *p) \
2617 { \
2618 return (TYPE##16)(vload8(0, &p[offset*16]), vload8(0, &p[offset*16+8])); \
2619
2620=== modified file 'lib/kernel/vload_half.cl'
2621--- lib/kernel/vload_half.cl 2011-12-14 23:01:01 +0000
2622+++ lib/kernel/vload_half.cl 2011-12-18 04:51:24 +0000
2623@@ -25,78 +25,100 @@
2624
2625 #ifdef cl_khr_fp16
2626
2627-#define IMPLEMENT_VLOAD_HALF(MOD) \
2628- \
2629- float __attribute__ ((__overloadable__)) \
2630- vload_half(size_t offset, const MOD half *p) \
2631- { \
2632- return (float)p[offset]; \
2633- } \
2634- \
2635- float2 __attribute__ ((__overloadable__)) \
2636+/*
2637+ half: 1 sign bit, 5 exponent bits, 10 mantissa bits
2638+ float: 1 sign bit, 8 exponent bits, 23 mantissa bits
2639+ double: 1 sign bit, 10 exponent bits, 53 mantissa bits
2640+*/
2641+
2642+
2643+
2644+#define IMPLEMENT_VLOAD_HALF(MOD) \
2645+ \
2646+ float _cl_overloadable \
2647+ vload_half(size_t offset, const MOD half *p) \
2648+ { \
2649+ /* This conversion always succeeds */ \
2650+ short hval = ((const MOD short*)p)[offset]; \
2651+ short hsign = (hval & (short)0x8000) >> (short)15; \
2652+ short hexp = (hval & (short)0x7c00) >> (short)10; \
2653+ short hmant = hval & (short)0x03ff; \
2654+ bool isdenorm = hexp == (short)0; \
2655+ bool isinfnan = hexp == (short)31; \
2656+ hexp -= (short)15; \
2657+ int fsign = (int)hsign << 31; \
2658+ int fexp = (__builtin_expect(isdenorm, false) ? 0 : \
2659+ __builtin_expect(isinfnan, false) ? 255 : (int)hexp + 127); \
2660+ fexp <<= 23; \
2661+ int fmant = (int)hmant << 13; \
2662+ int fval = fsign | fexp | fmant; \
2663+ return as_float(fval); \
2664+ } \
2665+ \
2666+ float2 _cl_overloadable \
2667 vload_half2(size_t offset, const MOD half *p) \
2668 { \
2669 return (float2)(vload_half(0, &p[offset*2]), \
2670 vload_half(0, &p[offset*2+1])); \
2671 } \
2672 \
2673- float3 __attribute__ ((__overloadable__)) \
2674+ float3 _cl_overloadable \
2675 vload_half3(size_t offset, const MOD half *p) \
2676 { \
2677 return (float3)(vload_half2(0, &p[offset*3]), \
2678 vload_half(0, &p[offset*3+2])); \
2679 } \
2680 \
2681- float4 __attribute__ ((__overloadable__)) \
2682+ float4 _cl_overloadable \
2683 vload_half4(size_t offset, const MOD half *p) \
2684 { \
2685 return (float4)(vload_half2(0, &p[offset*4]), \
2686 vload_half2(0, &p[offset*4+2])); \
2687 } \
2688 \
2689- float8 __attribute__ ((__overloadable__)) \
2690+ float8 _cl_overloadable \
2691 vload_half8(size_t offset, const MOD half *p) \
2692 { \
2693 return (float8)(vload_half4(0, &p[offset*8]), \
2694 vload_half4(0, &p[offset*8+4])); \
2695 } \
2696 \
2697- float16 __attribute__ ((__overloadable__)) \
2698+ float16 _cl_overloadable \
2699 vload_half16(size_t offset, const MOD half *p) \
2700 { \
2701 return (float16)(vload_half8(0, &p[offset*16]), \
2702 vload_half8(0, &p[offset*16+8])); \
2703 } \
2704 \
2705- float2 __attribute__ ((__overloadable__)) \
2706+ float2 _cl_overloadable \
2707 vloada_half2(size_t offset, const MOD half *p) \
2708 { \
2709 return (float2)(vload_half(0, &p[offset*2]), \
2710 vload_half(0, &p[offset*2+1])); \
2711 } \
2712 \
2713- float3 __attribute__ ((__overloadable__)) \
2714+ float3 _cl_overloadable \
2715 vloada_half3(size_t offset, const MOD half *p) \
2716 { \
2717 return (float3)(vloada_half2(0, &p[offset*4]), \
2718 vload_half(0, &p[offset*4+2])); \
2719 } \
2720 \
2721- float4 __attribute__ ((__overloadable__)) \
2722+ float4 _cl_overloadable \
2723 vloada_half4(size_t offset, const MOD half *p) \
2724 { \
2725 return (float4)(vloada_half2(0, &p[offset*4]), \
2726 vloada_half2(0, &p[offset*4+2])); \
2727 } \
2728 \
2729- float8 __attribute__ ((__overloadable__)) \
2730+ float8 _cl_overloadable \
2731 vloada_half8(size_t offset, const MOD half *p) \
2732 { \
2733 return (float8)(vloada_half4(0, &p[offset*8]), \
2734 vloada_half4(0, &p[offset*8+4])); \
2735 } \
2736 \
2737- float16 __attribute__ ((__overloadable__)) \
2738+ float16 _cl_overloadable \
2739 vloada_half16(size_t offset, const MOD half *p) \
2740 { \
2741 return (float16)(vloada_half8(0, &p[offset*16]), \
2742
2743=== modified file 'lib/kernel/vstore.cl'
2744--- lib/kernel/vstore.cl 2011-11-25 17:02:42 +0000
2745+++ lib/kernel/vstore.cl 2011-12-18 04:51:24 +0000
2746@@ -27,35 +27,35 @@
2747
2748 #define IMPLEMENT_VSTORE(TYPE, MOD) \
2749 \
2750- void __attribute__ ((__overloadable__)) \
2751+ void _cl_overloadable \
2752 vstore2(TYPE##2 data, size_t offset, MOD TYPE *p) \
2753 { \
2754 p[offset*2] = data.lo; \
2755 p[offset*2+1] = data.hi; \
2756 } \
2757 \
2758- void __attribute__ ((__overloadable__)) \
2759+ void _cl_overloadable \
2760 vstore3(TYPE##3 data, size_t offset, MOD TYPE *p) \
2761 { \
2762 vstore2(data.lo, 0, &p[offset*3]); \
2763 p[offset*3+2] = data.s2; \
2764 } \
2765 \
2766- void __attribute__ ((__overloadable__)) \
2767+ void _cl_overloadable \
2768 vstore4(TYPE##4 data, size_t offset, MOD TYPE *p) \
2769 { \
2770 vstore2(data.lo, 0, &p[offset*4]); \
2771 vstore2(data.hi, 0, &p[offset*4+2]); \
2772 } \
2773 \
2774- void __attribute__ ((__overloadable__)) \
2775+ void _cl_overloadable \
2776 vstore8(TYPE##8 data, size_t offset, MOD TYPE *p) \
2777 { \
2778 vstore4(data.lo, 0, &p[offset*8]); \
2779 vstore4(data.hi, 0, &p[offset*8+4]); \
2780 } \
2781 \
2782- void __attribute__ ((__overloadable__)) \
2783+ void _cl_overloadable \
2784 vstore16(TYPE##16 data, size_t offset, MOD TYPE *p) \
2785 { \
2786 vstore8(data.lo, 0, &p[offset*16]); \
2787
2788=== modified file 'lib/kernel/vstore_half.cl'
2789--- lib/kernel/vstore_half.cl 2011-12-14 23:01:01 +0000
2790+++ lib/kernel/vstore_half.cl 2011-12-18 04:51:24 +0000
2791@@ -25,78 +25,114 @@
2792
2793 #ifdef cl_khr_fp16
2794
2795+/*
2796+ half: 1 sign bit, 5 exponent bits, 10 mantissa bits
2797+ float: 1 sign bit, 7 exponent bits, 23 mantissa bits
2798+ double: 1 sign bit, 10 exponent bits, 53 mantissa bits
2799+*/
2800+
2801+#define HALF_MAXPLUS 0x1.ffdp15f /* "one more" than HALF_MAX */
2802+#define HALF_MIN 0x1.0p-14f
2803+#define HALF_ZERO ((short)0x0000) /* zero */
2804+#define HALF_INF ((short)0x4000) /* infinity */
2805+#define HALF_SIGN ((short)0x8000) /* sign bit */
2806+
2807+
2808+
2809 #define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX) \
2810 \
2811- void __attribute__ ((__overloadable__)) \
2812+ void _cl_overloadable \
2813 vstore_half##SUFFIX(float data, size_t offset, MOD half *p) \
2814 { \
2815- p[offset] = data; \
2816+ /* IDEA: modify data (e.g. add "1/2") to round correctly */ \
2817+ int fval = as_int(data); \
2818+ int fsign = (fval & 0x80000000) >> 31; \
2819+ int fexp = (fval & 0x7f800000) >> 23; \
2820+ int fmant = fval & 0x007fffff; \
2821+ bool isdenorm = fexp == 0; \
2822+ bool isinfnan = fexp == 255; \
2823+ fexp -= 127; \
2824+ short hsign = (short)fsign << (short)15; \
2825+ short hexp = (__builtin_expect(isdenorm, false) ? (short)0 : \
2826+ __builtin_expect(isinfnan, false) ? (short)31 : \
2827+ (short)fexp + (short)15); \
2828+ /* TODO: this always truncates */ \
2829+ short hmant = (short)(fmant >> 13); \
2830+ short hval; \
2831+ if (__builtin_expect(fabs(data) >= HALF_MAXPLUS, false)) { \
2832+ hval = data > 0.0f ? HALF_INF : HALF_INF | HALF_SIGN; \
2833+ } else if (__builtin_expect(fabs(data) < HALF_MIN, false)) { \
2834+ hval = signbit(data)==0 ? HALF_ZERO : HALF_ZERO | HALF_SIGN; \
2835+ } else { \
2836+ hval = hsign | hexp | hmant; \
2837+ } \
2838+ ((MOD short*)p)[offset] = hval; \
2839 } \
2840 \
2841- void __attribute__ ((__overloadable__)) \
2842+ void _cl_overloadable \
2843 vstore_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \
2844 { \
2845 vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \
2846 vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \
2847 } \
2848 \
2849- void __attribute__ ((__overloadable__)) \
2850+ void _cl_overloadable \
2851 vstore_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \
2852 { \
2853 vstore_half2##SUFFIX(data.lo, 0, &p[offset*3]); \
2854 vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \
2855 } \
2856 \
2857- void __attribute__ ((__overloadable__)) \
2858+ void _cl_overloadable \
2859 vstore_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \
2860 { \
2861 vstore_half2##SUFFIX(data.lo, 0, &p[offset*4]); \
2862 vstore_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \
2863 } \
2864 \
2865- void __attribute__ ((__overloadable__)) \
2866+ void _cl_overloadable \
2867 vstore_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \
2868 { \
2869 vstore_half4##SUFFIX(data.lo, 0, &p[offset*8]); \
2870 vstore_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \
2871 } \
2872 \
2873- void __attribute__ ((__overloadable__)) \
2874+ void _cl_overloadable \
2875 vstore_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \
2876 { \
2877 vstore_half8##SUFFIX(data.lo, 0, &p[offset*16]); \
2878 vstore_half8##SUFFIX(data.hi, 0, &p[offset*16+8]); \
2879 } \
2880 \
2881- void __attribute__ ((__overloadable__)) \
2882+ void _cl_overloadable \
2883 vstorea_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \
2884 { \
2885 vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \
2886 vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \
2887 } \
2888 \
2889- void __attribute__ ((__overloadable__)) \
2890+ void _cl_overloadable \
2891 vstorea_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \
2892 { \
2893 vstorea_half2##SUFFIX(data.lo, 0, &p[offset*3]); \
2894 vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \
2895 } \
2896 \
2897- void __attribute__ ((__overloadable__)) \
2898+ void _cl_overloadable \
2899 vstorea_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \
2900 { \
2901 vstorea_half2##SUFFIX(data.lo, 0, &p[offset*4]); \
2902 vstorea_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \
2903 } \
2904 \
2905- void __attribute__ ((__overloadable__)) \
2906+ void _cl_overloadable \
2907 vstorea_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \
2908 { \
2909 vstorea_half4##SUFFIX(data.lo, 0, &p[offset*8]); \
2910 vstorea_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \
2911 } \
2912 \
2913- void __attribute__ ((__overloadable__)) \
2914+ void _cl_overloadable \
2915 vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \
2916 { \
2917 vstorea_half8##SUFFIX(data.lo, 0, &p[offset*16]); \
2918@@ -105,20 +141,20 @@
2919
2920
2921
2922-IMPLEMENT_VSTORE_HALF(__global , )
2923-IMPLEMENT_VSTORE_HALF(__global , _rte)
2924-IMPLEMENT_VSTORE_HALF(__global , _rtz)
2925-IMPLEMENT_VSTORE_HALF(__global , _rtp)
2926-IMPLEMENT_VSTORE_HALF(__global , _rtn)
2927-IMPLEMENT_VSTORE_HALF(__local , )
2928-IMPLEMENT_VSTORE_HALF(__local , _rte)
2929-IMPLEMENT_VSTORE_HALF(__local , _rtz)
2930-IMPLEMENT_VSTORE_HALF(__local , _rtp)
2931-IMPLEMENT_VSTORE_HALF(__local , _rtn)
2932-/* IMPLEMENT_VSTORE_HALF(__private , ) */
2933-/* IMPLEMENT_VSTORE_HALF(__private , _rte) */
2934-/* IMPLEMENT_VSTORE_HALF(__private , _rtz) */
2935-/* IMPLEMENT_VSTORE_HALF(__private , _rtp) */
2936-/* IMPLEMENT_VSTORE_HALF(__private , _rtn) */
2937+IMPLEMENT_VSTORE_HALF(__global , )
2938+IMPLEMENT_VSTORE_HALF(__global , _rte)
2939+IMPLEMENT_VSTORE_HALF(__global , _rtz)
2940+IMPLEMENT_VSTORE_HALF(__global , _rtp)
2941+IMPLEMENT_VSTORE_HALF(__global , _rtn)
2942+IMPLEMENT_VSTORE_HALF(__local , )
2943+IMPLEMENT_VSTORE_HALF(__local , _rte)
2944+IMPLEMENT_VSTORE_HALF(__local , _rtz)
2945+IMPLEMENT_VSTORE_HALF(__local , _rtp)
2946+IMPLEMENT_VSTORE_HALF(__local , _rtn)
2947+/* IMPLEMENT_VSTORE_HALF(__private, ) */
2948+/* IMPLEMENT_VSTORE_HALF(__private, _rte) */
2949+/* IMPLEMENT_VSTORE_HALF(__private, _rtz) */
2950+/* IMPLEMENT_VSTORE_HALF(__private, _rtp) */
2951+/* IMPLEMENT_VSTORE_HALF(__private, _rtn) */
2952
2953 #endif