Merge lp:~schnetter/pocl/main into lp:~pocl/pocl/trunk
- main
- Merge into trunk
Proposed by
Erik Schnetter
Status: | Merged |
---|---|
Merge reported by: | Pekka Jääskeläinen |
Merged at revision: | not available |
Proposed branch: | lp:~schnetter/pocl/main |
Merge into: | lp:~pocl/pocl/trunk |
Diff against target: |
2953 lines (+1586/-379) 53 files modified
clconfig.h.in (+14/-7) configure.ac (+13/-0) examples/kernel/test_bitselect.cl (+1/-1) examples/kernel/test_fabs.cl (+1/-1) examples/kernel/test_rotate.cl (+1/-1) examples/scalarwave/scalarwave.c (+4/-5) examples/scalarwave/scalarwave.cl (+5/-3) include/CL/cl.h (+1/-1) include/_kernel.h (+3/-0) include/arm/types.h (+1/-1) include/tce/types.h (+1/-1) include/types.h (+1/-1) lib/CL/Makefile.am (+7/-0) lib/CL/clCreateBuffer.c (+3/-1) lib/CL/clCreateCommandQueue.c (+2/-0) lib/CL/clCreateContext.c (+1/-2) lib/CL/clCreateContextFromType.c (+2/-0) lib/CL/clCreateKernel.c (+2/-0) lib/CL/clCreateProgramWithSource.c (+2/-0) lib/CL/clEnqueueBarrier.c (+31/-0) lib/CL/clEnqueueCopyBuffer.c (+67/-0) lib/CL/clEnqueueCopyBufferRect.c (+87/-0) lib/CL/clEnqueueReadBuffer.c (+1/-1) lib/CL/clEnqueueReadBufferRect.c (+82/-0) lib/CL/clEnqueueWriteBuffer.c (+1/-1) lib/CL/clEnqueueWriteBufferRect.c (+97/-0) lib/CL/clFinish.c (+1/-1) lib/CL/clGetContextInfo.c (+12/-20) lib/CL/clGetDeviceIDs.c (+19/-24) lib/CL/clGetDeviceInfo.c (+170/-8) lib/CL/clGetEventProfilingInfo.c (+62/-0) lib/CL/clGetPlatformIDs.c (+15/-23) lib/CL/clGetPlatformInfo.c (+12/-14) lib/CL/clGetProgramBuildInfo.c (+44/-27) lib/CL/clGetProgramInfo.c (+89/-0) lib/CL/devices/native/native.c (+6/-2) lib/CL/devices/native/native.h (+4/-0) lib/CL/devices/pthread/pthread.c (+106/-3) lib/CL/devices/pthread/pthread.h (+79/-6) lib/CL/pocl_cl.h (+25/-0) lib/kernel/all.cl (+64/-64) lib/kernel/any.cl (+64/-64) lib/kernel/arm/vload_half.cl (+109/-0) lib/kernel/arm/vstore_half.cl (+120/-0) lib/kernel/as_type.cl (+1/-1) lib/kernel/convert_type.cl (+7/-7) lib/kernel/cross.cl (+4/-4) lib/kernel/dot.cl (+28/-28) lib/kernel/upsample.cl (+1/-1) lib/kernel/vload.cl (+5/-5) lib/kernel/vload_half.cl (+40/-18) lib/kernel/vstore.cl (+5/-5) lib/kernel/vstore_half.cl (+63/-27) |
To merge this branch: | bzr merge lp:~schnetter/pocl/main |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
pocl maintaners | Pending | ||
Review via email:
|
Commit message
Description of the change
I made many improvements to the OpenCL run-time library, mostly to clGet* and buffer read/write/copy routines. The current state of the library is now sufficient to run my application code (although it is still much too slow).
To post a comment you must log in.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'clconfig.h.in' |
2 | --- clconfig.h.in 2011-12-15 04:18:23 +0000 |
3 | +++ clconfig.h.in 2011-12-18 04:51:24 +0000 |
4 | @@ -1,13 +1,20 @@ |
5 | +/* Define to 1 if __fp16 can be converted to float. */ |
6 | +#undef CONVERT___FP16_FLOAT |
7 | + |
8 | +/* The size of `double', as computed by sizeof. */ |
9 | +#undef SIZEOF_DOUBLE |
10 | + |
11 | /* The size of `long', as computed by sizeof. */ |
12 | #undef SIZEOF_LONG |
13 | |
14 | -/* The size of `half', as computed by sizeof. */ |
15 | +/* The size of `void *', as computed by sizeof. */ |
16 | +#undef SIZEOF_VOID_P |
17 | + |
18 | +/* The size of `__fp16', as computed by sizeof. */ |
19 | #undef SIZEOF___FP16 |
20 | + |
21 | + |
22 | + |
23 | /* The OpenCL type `half' is called `__fp16' in C */ |
24 | #define SIZEOF_HALF SIZEOF___FP16 |
25 | - |
26 | -/* The size of `double', as computed by sizeof. */ |
27 | -#undef SIZEOF_DOUBLE |
28 | - |
29 | -/* The size of `void *', as computed by sizeof. */ |
30 | -#undef SIZEOF_VOID_P |
31 | +#define CONVERT_HALF_FLOAT CONVERT___FP16_FLOAT |
32 | |
33 | === modified file 'configure.ac' |
34 | --- configure.ac 2011-12-15 04:18:23 +0000 |
35 | +++ configure.ac 2011-12-18 04:51:24 +0000 |
36 | @@ -223,9 +223,22 @@ |
37 | # Checks for typedefs, structures, and compiler characteristics. |
38 | AC_CHECK_SIZEOF([long]) |
39 | AC_CHECK_SIZEOF([__fp16]) |
40 | +AC_MSG_CHECKING([whether __fp16 can be converted to float]) |
41 | +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[__fp16 x; float y;]], |
42 | + [[y=x]])], |
43 | + [ |
44 | + AC_MSG_RESULT([yes]) |
45 | + AC_DEFINE_UNQUOTED([CONVERT___FP16_FLOAT], 1, [Define to 1 if __fp16 can be converted to float.]) |
46 | + ], |
47 | + [ |
48 | + AC_MSG_RESULT([no]) |
49 | + AC_DEFINE_UNQUOTED([CONVERT___FP16_FLOAT], 0, [Define to 1 if __fp16 can be converted to float.]) |
50 | + ] |
51 | + ) |
52 | AC_CHECK_SIZEOF([double]) |
53 | AC_CHECK_SIZEOF([void *]) |
54 | AC_CHECK_ALIGNOF([float16], [typedef float float16 __attribute__((__ext_vector_type__(16)));]) |
55 | +AC_CHECK_ALIGNOF([double16], [typedef double double16 __attribute__((__ext_vector_type__(16)));]) |
56 | |
57 | CC="$old_CC" |
58 | |
59 | |
60 | === modified file 'examples/kernel/test_bitselect.cl' |
61 | --- examples/kernel/test_bitselect.cl 2011-12-14 15:15:44 +0000 |
62 | +++ examples/kernel/test_bitselect.cl 2011-12-18 04:51:24 +0000 |
63 | @@ -1170,7 +1170,7 @@ |
64 | }) |
65 | ) |
66 | |
67 | -void test_bitselect() |
68 | +kernel void test_bitselect() |
69 | { |
70 | CALL_FUNC_G(test_bitselect) |
71 | } |
72 | |
73 | === modified file 'examples/kernel/test_fabs.cl' |
74 | --- examples/kernel/test_fabs.cl 2011-12-14 15:15:44 +0000 |
75 | +++ examples/kernel/test_fabs.cl 2011-12-18 04:51:24 +0000 |
76 | @@ -172,7 +172,7 @@ |
77 | }) |
78 | ) |
79 | |
80 | -void test_fabs() |
81 | +kernel void test_fabs() |
82 | { |
83 | CALL_FUNC_V(test_fabs) |
84 | } |
85 | |
86 | === modified file 'examples/kernel/test_rotate.cl' |
87 | --- examples/kernel/test_rotate.cl 2011-12-14 15:15:44 +0000 |
88 | +++ examples/kernel/test_rotate.cl 2011-12-18 04:51:24 +0000 |
89 | @@ -198,7 +198,7 @@ |
90 | }) |
91 | ) |
92 | |
93 | -void test_rotate() |
94 | +kernel void test_rotate() |
95 | { |
96 | CALL_FUNC_G(test_rotate) |
97 | } |
98 | |
99 | === modified file 'examples/scalarwave/scalarwave.c' |
100 | --- examples/scalarwave/scalarwave.c 2011-12-14 15:45:14 +0000 |
101 | +++ examples/scalarwave/scalarwave.c 2011-12-18 04:51:24 +0000 |
102 | @@ -11,7 +11,7 @@ |
103 | |
104 | |
105 | |
106 | -#define GRID_GRANULARITY 1 // TODO 2 |
107 | +#define GRID_GRANULARITY 2 |
108 | |
109 | typedef struct grid_t { |
110 | cl_double dt; // time step |
111 | @@ -96,7 +96,7 @@ |
112 | if (ierr) return -1; |
113 | |
114 | size_t const global_work_size[3] = |
115 | - {grid->ni, grid->nj, grid->nk}; |
116 | + {grid->ai, grid->aj, grid->ak}; |
117 | size_t const local_work_size[3] = |
118 | {GRID_GRANULARITY, GRID_GRANULARITY, GRID_GRANULARITY}; |
119 | |
120 | @@ -207,9 +207,8 @@ |
121 | phi = tmp; |
122 | } |
123 | |
124 | - // TODO: We create the program and allocate the buffers each time, |
125 | - // which is slow. But then, we only want to test correctness, not |
126 | - // performance. (Yet?) |
127 | + // TODO: We allocate the buffers each time, which is slow. But |
128 | + // then, we only want to test correctness, not performance. (Yet?) |
129 | int const ierr = |
130 | exec_scalarwave_kernel (source, phi, phi_p, phi_p_p, &grid); |
131 | assert(!ierr); |
132 | |
133 | === modified file 'examples/scalarwave/scalarwave.cl' |
134 | --- examples/scalarwave/scalarwave.cl 2011-12-14 15:45:14 +0000 |
135 | +++ examples/scalarwave/scalarwave.cl 2011-12-18 04:51:24 +0000 |
136 | @@ -21,7 +21,7 @@ |
137 | int ni, nj, nk; // used size |
138 | } grid_t; |
139 | |
140 | -void |
141 | +kernel void |
142 | scalarwave(global double *restrict const phi, |
143 | global double const *restrict const phi_p, |
144 | global double const *restrict const phi_p_p, |
145 | @@ -72,11 +72,13 @@ |
146 | size_t const k = get_global_id(2); |
147 | |
148 | // If outside the domain, do nothing |
149 | - if (i>=ni || j>=nj || k>=nk) return; |
150 | + if (__builtin_expect(i>=ni || j>=nj || k>=nk, false)) return; |
151 | |
152 | size_t const ind3d = di*i + dj*j + dk*k; |
153 | |
154 | - if (i==0 || i==ni-1 || j==0 || j==nj-1 || k==0 || k==nk-1) { |
155 | + if (__builtin_expect(i==0 || j==0 || k==0 || i==ni-1 || j==nj-1 || k==nk-1, |
156 | + false)) |
157 | + { |
158 | // Boundary condition |
159 | |
160 | phi[ind3d] = 0.0; |
161 | |
162 | === modified file 'include/CL/cl.h' |
163 | --- include/CL/cl.h 2011-02-08 22:30:08 +0000 |
164 | +++ include/CL/cl.h 2011-12-18 04:51:24 +0000 |
165 | @@ -814,7 +814,7 @@ |
166 | size_t /* buffer_row_pitch */, |
167 | size_t /* buffer_slice_pitch */, |
168 | size_t /* host_row_pitch */, |
169 | - size_t /* host_slice_pitch */, |
170 | + size_t /* host_slice_pitch */, |
171 | const void * /* ptr */, |
172 | cl_uint /* num_events_in_wait_list */, |
173 | const cl_event * /* event_wait_list */, |
174 | |
175 | === modified file 'include/_kernel.h' |
176 | --- include/_kernel.h 2011-12-15 19:03:41 +0000 |
177 | +++ include/_kernel.h 2011-12-18 04:51:24 +0000 |
178 | @@ -32,6 +32,9 @@ |
179 | only. Seems the pragma does not add the macro, so we have the target |
180 | define the macro and the pragma is conditionally enabled. |
181 | */ |
182 | +#ifdef cl_khr_fp16 |
183 | +# pragma OPENCL EXTENSION cl_khr_fp16: enable |
184 | +#endif |
185 | #ifdef cl_khr_fp64 |
186 | # pragma OPENCL EXTENSION cl_khr_fp64: enable |
187 | #endif |
188 | |
189 | === modified file 'include/arm/types.h' |
190 | --- include/arm/types.h 2011-12-14 23:01:01 +0000 |
191 | +++ include/arm/types.h 2011-12-18 04:51:24 +0000 |
192 | @@ -4,7 +4,7 @@ |
193 | |
194 | #define __EMBEDDED_PROFILE__ 1 |
195 | #undef cles_khr_int64 |
196 | -#define cl_khr_fp16 /* ES: is this correct? */ |
197 | +#define cl_khr_fp16 |
198 | #undef cl_khr_fp64 |
199 | |
200 | typedef uint size_t; |
201 | |
202 | === modified file 'include/tce/types.h' |
203 | --- include/tce/types.h 2011-12-14 23:01:01 +0000 |
204 | +++ include/tce/types.h 2011-12-18 04:51:24 +0000 |
205 | @@ -4,7 +4,7 @@ |
206 | |
207 | #define __EMBEDDED_PROFILE__ 1 |
208 | #undef cles_khr_int64 |
209 | -#define cl_khr_fp16 /* ES: is this correct? */ |
210 | +#define cl_khr_fp16 |
211 | #undef cl_khr_fp64 |
212 | |
213 | typedef uint size_t; |
214 | |
215 | === modified file 'include/types.h' |
216 | --- include/types.h 2011-12-14 23:01:01 +0000 |
217 | +++ include/types.h 2011-12-18 04:51:24 +0000 |
218 | @@ -18,7 +18,7 @@ |
219 | # undef cles_khr_int64 |
220 | #endif |
221 | |
222 | -#if SIZEOF_HALF == 2 |
223 | +#if SIZEOF_HALF == 2 /* && CONVERT_HALF_FLOAT */ |
224 | # define cl_khr_fp16 |
225 | #else |
226 | # undef cl_khr_fp16 |
227 | |
228 | === modified file 'lib/CL/Makefile.am' |
229 | --- lib/CL/Makefile.am 2011-12-07 15:45:30 +0000 |
230 | +++ lib/CL/Makefile.am 2011-12-18 04:51:24 +0000 |
231 | @@ -31,6 +31,7 @@ |
232 | clReleaseCommandQueue.c \ |
233 | clCreateBuffer.c \ |
234 | clEnqueueReadBuffer.c \ |
235 | + clEnqueueReadBufferRect.c \ |
236 | clReleaseMemObject.c \ |
237 | clCreateProgramWithSource.c \ |
238 | clReleaseProgram.c \ |
239 | @@ -46,8 +47,14 @@ |
240 | clCreateContext.c \ |
241 | clGetProgramBuildInfo.c \ |
242 | clEnqueueWriteBuffer.c \ |
243 | + clEnqueueWriteBufferRect.c \ |
244 | + clEnqueueCopyBuffer.c \ |
245 | + clEnqueueCopyBufferRect.c \ |
246 | clFinish.c \ |
247 | + clEnqueueBarrier.c \ |
248 | clGetKernelWorkGroupInfo.c \ |
249 | + clGetProgramInfo.c \ |
250 | + clGetEventProfilingInfo.c \ |
251 | pocl_cl.h \ |
252 | devices/devices.h \ |
253 | devices/devices.c \ |
254 | |
255 | === modified file 'lib/CL/clCreateBuffer.c' |
256 | --- lib/CL/clCreateBuffer.c 2011-12-08 02:45:48 +0000 |
257 | +++ lib/CL/clCreateBuffer.c 2011-12-18 04:51:24 +0000 |
258 | @@ -69,6 +69,8 @@ |
259 | mem->mem_host_ptr = host_ptr; |
260 | mem->reference_count = 1; |
261 | mem->context = context; |
262 | - |
263 | + |
264 | + if (errcode_ret != NULL) |
265 | + *errcode_ret = CL_SUCCESS; |
266 | return mem; |
267 | } |
268 | |
269 | === modified file 'lib/CL/clCreateCommandQueue.c' |
270 | --- lib/CL/clCreateCommandQueue.c 2011-10-14 10:31:27 +0000 |
271 | +++ lib/CL/clCreateCommandQueue.c 2011-12-18 04:51:24 +0000 |
272 | @@ -38,5 +38,7 @@ |
273 | command_queue->reference_count = 1; |
274 | command_queue->properties = properties; |
275 | |
276 | + if (errcode_ret != NULL) |
277 | + *errcode_ret = CL_SUCCESS; |
278 | return command_queue; |
279 | } |
280 | |
281 | === modified file 'lib/CL/clCreateContext.c' |
282 | --- lib/CL/clCreateContext.c 2011-12-07 15:00:07 +0000 |
283 | +++ lib/CL/clCreateContext.c 2011-12-18 04:51:24 +0000 |
284 | @@ -72,8 +72,7 @@ |
285 | context->properties = properties; |
286 | context->reference_count = 1; |
287 | |
288 | - if (errcode_ret != NULL) |
289 | + if (errcode_ret) |
290 | *errcode_ret = CL_SUCCESS; |
291 | - |
292 | return context; |
293 | } |
294 | |
295 | === modified file 'lib/CL/clCreateContextFromType.c' |
296 | --- lib/CL/clCreateContextFromType.c 2011-12-02 17:27:49 +0000 |
297 | +++ lib/CL/clCreateContextFromType.c 2011-12-18 04:51:24 +0000 |
298 | @@ -68,5 +68,7 @@ |
299 | context->properties = properties; |
300 | context->reference_count = 1; |
301 | |
302 | + if (errcode_ret != NULL) |
303 | + *errcode_ret = CL_SUCCESS; |
304 | return context; |
305 | } |
306 | |
307 | === modified file 'lib/CL/clCreateKernel.c' |
308 | --- lib/CL/clCreateKernel.c 2011-12-14 18:52:39 +0000 |
309 | +++ lib/CL/clCreateKernel.c 2011-12-18 04:51:24 +0000 |
310 | @@ -145,5 +145,7 @@ |
311 | program->kernels = kernel; |
312 | kernel->next = k; |
313 | |
314 | + if (errcode_ret != NULL) |
315 | + *errcode_ret = CL_SUCCESS; |
316 | return kernel; |
317 | } |
318 | |
319 | === modified file 'lib/CL/clCreateProgramWithSource.c' |
320 | --- lib/CL/clCreateProgramWithSource.c 2011-10-14 10:31:27 +0000 |
321 | +++ lib/CL/clCreateProgramWithSource.c 2011-12-18 04:51:24 +0000 |
322 | @@ -92,5 +92,7 @@ |
323 | program->binary = NULL; |
324 | program->kernels = NULL; |
325 | |
326 | + if (errcode_ret != NULL) |
327 | + *errcode_ret = CL_SUCCESS; |
328 | return program; |
329 | } |
330 | |
331 | === added file 'lib/CL/clEnqueueBarrier.c' |
332 | --- lib/CL/clEnqueueBarrier.c 1970-01-01 00:00:00 +0000 |
333 | +++ lib/CL/clEnqueueBarrier.c 2011-12-18 04:51:24 +0000 |
334 | @@ -0,0 +1,31 @@ |
335 | +/* OpenCL runtime library: clEnqueueBarrier() |
336 | + |
337 | + Copyright (c) 2011 Erik Schnetter |
338 | + |
339 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
340 | + of this software and associated documentation files (the "Software"), to deal |
341 | + in the Software without restriction, including without limitation the rights |
342 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
343 | + copies of the Software, and to permit persons to whom the Software is |
344 | + furnished to do so, subject to the following conditions: |
345 | + |
346 | + The above copyright notice and this permission notice shall be included in |
347 | + all copies or substantial portions of the Software. |
348 | + |
349 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
350 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
351 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
352 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
353 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
354 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
355 | + THE SOFTWARE. |
356 | +*/ |
357 | + |
358 | +#include "pocl_cl.h" |
359 | + |
360 | +CL_API_ENTRY cl_int CL_API_CALL |
361 | +clEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 |
362 | +{ |
363 | + /* All operations are serialised, so we never have to wait */ |
364 | + return CL_SUCCESS; |
365 | +} |
366 | |
367 | === added file 'lib/CL/clEnqueueCopyBuffer.c' |
368 | --- lib/CL/clEnqueueCopyBuffer.c 1970-01-01 00:00:00 +0000 |
369 | +++ lib/CL/clEnqueueCopyBuffer.c 2011-12-18 04:51:24 +0000 |
370 | @@ -0,0 +1,67 @@ |
371 | +/* OpenCL runtime library: clEnqueueCopyBuffer() |
372 | + |
373 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
374 | + |
375 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
376 | + of this software and associated documentation files (the "Software"), to deal |
377 | + in the Software without restriction, including without limitation the rights |
378 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
379 | + copies of the Software, and to permit persons to whom the Software is |
380 | + furnished to do so, subject to the following conditions: |
381 | + |
382 | + The above copyright notice and this permission notice shall be included in |
383 | + all copies or substantial portions of the Software. |
384 | + |
385 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
386 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
387 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
388 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
389 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
390 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
391 | + THE SOFTWARE. |
392 | +*/ |
393 | + |
394 | +#include "pocl_cl.h" |
395 | +#include <assert.h> |
396 | + |
397 | +CL_API_ENTRY cl_int CL_API_CALL |
398 | +clEnqueueCopyBuffer(cl_command_queue command_queue, |
399 | + cl_mem src_buffer, |
400 | + cl_mem dst_buffer, |
401 | + size_t src_offset, |
402 | + size_t dst_offset, |
403 | + size_t cb, |
404 | + cl_uint num_events_in_wait_list, |
405 | + const cl_event *event_wait_list, |
406 | + cl_event *event) CL_API_SUFFIX__VERSION_1_0 |
407 | +{ |
408 | + cl_device_id device_id; |
409 | + unsigned i; |
410 | + |
411 | + if (command_queue == NULL) |
412 | + return CL_INVALID_COMMAND_QUEUE; |
413 | + |
414 | + if ((src_buffer == NULL) || (dst_buffer == NULL)) |
415 | + return CL_INVALID_MEM_OBJECT; |
416 | + |
417 | + if ((command_queue->context != src_buffer->context) || |
418 | + (command_queue->context != dst_buffer->context)) |
419 | + return CL_INVALID_CONTEXT; |
420 | + |
421 | + if ((src_offset + cb > src_buffer->size) || |
422 | + (dst_offset + cb > dst_buffer->size)) |
423 | + return CL_INVALID_VALUE; |
424 | + |
425 | + device_id = command_queue->device; |
426 | + for (i = 0; i < command_queue->context->num_devices; ++i) |
427 | + { |
428 | + if (command_queue->context->devices[i] == device_id) |
429 | + break; |
430 | + } |
431 | + |
432 | + assert(i < command_queue->context->num_devices); |
433 | + |
434 | + device_id->copy(device_id->data, src_buffer->device_ptrs[i], dst_buffer->device_ptrs[i], cb); |
435 | + |
436 | + return CL_SUCCESS; |
437 | +} |
438 | |
439 | === added file 'lib/CL/clEnqueueCopyBufferRect.c' |
440 | --- lib/CL/clEnqueueCopyBufferRect.c 1970-01-01 00:00:00 +0000 |
441 | +++ lib/CL/clEnqueueCopyBufferRect.c 2011-12-18 04:51:24 +0000 |
442 | @@ -0,0 +1,87 @@ |
443 | +/* OpenCL runtime library: clEnqueueCopyBufferRect() |
444 | + |
445 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
446 | + |
447 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
448 | + of this software and associated documentation files (the "Software"), to deal |
449 | + in the Software without restriction, including without limitation the rights |
450 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
451 | + copies of the Software, and to permit persons to whom the Software is |
452 | + furnished to do so, subject to the following conditions: |
453 | + |
454 | + The above copyright notice and this permission notice shall be included in |
455 | + all copies or substantial portions of the Software. |
456 | + |
457 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
458 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
459 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
460 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
461 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
462 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
463 | + THE SOFTWARE. |
464 | +*/ |
465 | + |
466 | +#include "pocl_cl.h" |
467 | +#include <assert.h> |
468 | + |
469 | +CL_API_ENTRY cl_int CL_API_CALL |
470 | +clEnqueueCopyBufferRect(cl_command_queue command_queue, |
471 | + cl_mem src_buffer, |
472 | + cl_mem dst_buffer, |
473 | + const size_t *src_origin, |
474 | + const size_t *dst_origin, |
475 | + const size_t *region, |
476 | + size_t src_row_pitch, |
477 | + size_t src_slice_pitch, |
478 | + size_t dst_row_pitch, |
479 | + size_t dst_slice_pitch, |
480 | + cl_uint num_events_in_wait_list, |
481 | + const cl_event *event_wait_list, |
482 | + cl_event *event) CL_API_SUFFIX__VERSION_1_1 |
483 | +{ |
484 | + cl_device_id device_id; |
485 | + unsigned i; |
486 | + |
487 | + if (command_queue == NULL) |
488 | + return CL_INVALID_COMMAND_QUEUE; |
489 | + |
490 | + if ((src_buffer == NULL) || (dst_buffer == NULL)) |
491 | + return CL_INVALID_MEM_OBJECT; |
492 | + |
493 | + if ((command_queue->context != src_buffer->context) || |
494 | + (command_queue->context != dst_buffer->context)) |
495 | + return CL_INVALID_CONTEXT; |
496 | + |
497 | + if ((src_origin == NULL) || |
498 | + (dst_origin == NULL) || |
499 | + (region == NULL)) |
500 | + return CL_INVALID_VALUE; |
501 | + |
502 | + if ((region[0]*region[1]*region[2] > 0) && |
503 | + (src_origin[0] + region[0]-1 + |
504 | + src_row_pitch * (src_origin[1] + region[1]-1) + |
505 | + src_slice_pitch * (src_origin[2] + region[2]-1) >= src_buffer->size)) |
506 | + return CL_INVALID_VALUE; |
507 | + if ((region[0]*region[1]*region[2] > 0) && |
508 | + (dst_origin[0] + region[0]-1 + |
509 | + dst_row_pitch * (dst_origin[1] + region[1]-1) + |
510 | + dst_slice_pitch * (dst_origin[2] + region[2]-1) >= dst_buffer->size)) |
511 | + return CL_INVALID_VALUE; |
512 | + |
513 | + device_id = command_queue->device; |
514 | + for (i = 0; i < command_queue->context->num_devices; ++i) |
515 | + { |
516 | + if (command_queue->context->devices[i] == device_id) |
517 | + break; |
518 | + } |
519 | + |
520 | + assert(i < command_queue->context->num_devices); |
521 | + |
522 | + device_id->copy_rect(device_id->data, |
523 | + src_buffer->device_ptrs[i], dst_buffer->device_ptrs[i], |
524 | + src_origin, dst_origin, region, |
525 | + src_row_pitch, src_slice_pitch, |
526 | + dst_row_pitch, dst_slice_pitch); |
527 | + |
528 | + return CL_SUCCESS; |
529 | +} |
530 | |
531 | === modified file 'lib/CL/clEnqueueReadBuffer.c' |
532 | --- lib/CL/clEnqueueReadBuffer.c 2011-11-30 16:00:41 +0000 |
533 | +++ lib/CL/clEnqueueReadBuffer.c 2011-12-18 04:51:24 +0000 |
534 | @@ -1,4 +1,4 @@ |
535 | -/* OpenCL runtime library: clEnqueueWriteBuffer() |
536 | +/* OpenCL runtime library: clEnqueueReadBuffer() |
537 | |
538 | Copyright (c) 2011 Universidad Rey Juan Carlos |
539 | |
540 | |
541 | === added file 'lib/CL/clEnqueueReadBufferRect.c' |
542 | --- lib/CL/clEnqueueReadBufferRect.c 1970-01-01 00:00:00 +0000 |
543 | +++ lib/CL/clEnqueueReadBufferRect.c 2011-12-18 04:51:24 +0000 |
544 | @@ -0,0 +1,82 @@ |
545 | +/* OpenCL runtime library: clEnqueueReadBufferRect() |
546 | + |
547 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
548 | + |
549 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
550 | + of this software and associated documentation files (the "Software"), to deal |
551 | + in the Software without restriction, including without limitation the rights |
552 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
553 | + copies of the Software, and to permit persons to whom the Software is |
554 | + furnished to do so, subject to the following conditions: |
555 | + |
556 | + The above copyright notice and this permission notice shall be included in |
557 | + all copies or substantial portions of the Software. |
558 | + |
559 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
560 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
561 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
562 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
563 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
564 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
565 | + THE SOFTWARE. |
566 | +*/ |
567 | + |
568 | +#include "pocl_cl.h" |
569 | +#include <assert.h> |
570 | + |
571 | +CL_API_ENTRY cl_int CL_API_CALL |
572 | +clEnqueueReadBufferRect(cl_command_queue command_queue, |
573 | + cl_mem buffer, |
574 | + cl_bool blocking_read, |
575 | + const size_t *buffer_origin, |
576 | + const size_t *host_origin, |
577 | + const size_t *region, |
578 | + size_t buffer_row_pitch, |
579 | + size_t buffer_slice_pitch, |
580 | + size_t host_row_pitch, |
581 | + size_t host_slice_pitch, |
582 | + void *ptr, |
583 | + cl_uint num_events_in_wait_list, |
584 | + const cl_event *event_wait_list, |
585 | + cl_event *event) CL_API_SUFFIX__VERSION_1_1 |
586 | +{ |
587 | + cl_device_id device_id; |
588 | + unsigned i; |
589 | + |
590 | + if (command_queue == NULL) |
591 | + return CL_INVALID_COMMAND_QUEUE; |
592 | + |
593 | + if (buffer == NULL) |
594 | + return CL_INVALID_MEM_OBJECT; |
595 | + |
596 | + if (command_queue->context != buffer->context) |
597 | + return CL_INVALID_CONTEXT; |
598 | + |
599 | + if ((ptr == NULL) || |
600 | + (buffer_origin == NULL) || |
601 | + (host_origin == NULL) || |
602 | + (region == NULL)) |
603 | + return CL_INVALID_VALUE; |
604 | + |
605 | + if ((region[0]*region[1]*region[2] > 0) && |
606 | + (buffer_origin[0] + region[0]-1 + |
607 | + buffer_row_pitch * (buffer_origin[1] + region[1]-1) + |
608 | + buffer_slice_pitch * (buffer_origin[2] + region[2]-1) >= buffer->size)) |
609 | + return CL_INVALID_VALUE; |
610 | + |
611 | + device_id = command_queue->device; |
612 | + for (i = 0; i < command_queue->context->num_devices; ++i) |
613 | + { |
614 | + if (command_queue->context->devices[i] == device_id) |
615 | + break; |
616 | + } |
617 | + |
618 | + assert(i < command_queue->context->num_devices); |
619 | + |
620 | + device_id->read_rect(device_id->data, ptr, buffer->device_ptrs[i], |
621 | + buffer_origin, host_origin, region, |
622 | + buffer_row_pitch, buffer_slice_pitch, |
623 | + host_row_pitch, host_slice_pitch); |
624 | + |
625 | + return CL_SUCCESS; |
626 | +} |
627 | |
628 | === modified file 'lib/CL/clEnqueueWriteBuffer.c' |
629 | --- lib/CL/clEnqueueWriteBuffer.c 2011-11-30 16:00:41 +0000 |
630 | +++ lib/CL/clEnqueueWriteBuffer.c 2011-12-18 04:51:24 +0000 |
631 | @@ -1,4 +1,4 @@ |
632 | -/* OpenCL runtime library: clEnqueueReadBuffer() |
633 | +/* OpenCL runtime library: clEnqueueWriteBuffer() |
634 | |
635 | Copyright (c) 2011 Universidad Rey Juan Carlos |
636 | |
637 | |
638 | === added file 'lib/CL/clEnqueueWriteBufferRect.c' |
639 | --- lib/CL/clEnqueueWriteBufferRect.c 1970-01-01 00:00:00 +0000 |
640 | +++ lib/CL/clEnqueueWriteBufferRect.c 2011-12-18 04:51:24 +0000 |
641 | @@ -0,0 +1,97 @@ |
642 | +/* OpenCL runtime library: clEnqueueWriteBufferRect() |
643 | + |
644 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
645 | + |
646 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
647 | + of this software and associated documentation files (the "Software"), to deal |
648 | + in the Software without restriction, including without limitation the rights |
649 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
650 | + copies of the Software, and to permit persons to whom the Software is |
651 | + furnished to do so, subject to the following conditions: |
652 | + |
653 | + The above copyright notice and this permission notice shall be included in |
654 | + all copies or substantial portions of the Software. |
655 | + |
656 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
657 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
658 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
659 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
660 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
661 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
662 | + THE SOFTWARE. |
663 | +*/ |
664 | + |
665 | +#include "pocl_cl.h" |
666 | +#include <assert.h> |
667 | + |
668 | +CL_API_ENTRY cl_int CL_API_CALL |
669 | +clEnqueueWriteBufferRect(cl_command_queue command_queue, |
670 | + cl_mem buffer, |
671 | + cl_bool blocking_write, |
672 | + const size_t *buffer_origin, |
673 | + const size_t *host_origin, |
674 | + const size_t *region, |
675 | + size_t buffer_row_pitch, |
676 | + size_t buffer_slice_pitch, |
677 | + size_t host_row_pitch, |
678 | + size_t host_slice_pitch, |
679 | + const void *ptr, |
680 | + cl_uint num_events_in_wait_list, |
681 | + const cl_event *event_wait_list, |
682 | + cl_event *event) CL_API_SUFFIX__VERSION_1_1 |
683 | +{ |
684 | + cl_device_id device_id; |
685 | + unsigned i; |
686 | + |
687 | + if (command_queue == NULL) |
688 | + return CL_INVALID_COMMAND_QUEUE; |
689 | + |
690 | + if (buffer == NULL) |
691 | + return CL_INVALID_MEM_OBJECT; |
692 | + |
693 | + if (command_queue->context != buffer->context) |
694 | + return CL_INVALID_CONTEXT; |
695 | + |
696 | + if ((ptr == NULL) || |
697 | + (buffer_origin == NULL) || |
698 | + (host_origin == NULL) || |
699 | + (region == NULL)) |
700 | + return CL_INVALID_VALUE; |
701 | + |
702 | + if ((region[0]*region[1]*region[2] > 0) && |
703 | + (buffer_origin[0] + region[0]-1 + |
704 | + buffer_row_pitch * (buffer_origin[1] + region[1]-1) + |
705 | + buffer_slice_pitch * (buffer_origin[2] + region[2]-1) >= buffer->size)) |
706 | + { |
707 | +#warning "TODO" |
708 | + printf("bo=[%d,%d,%d]\n" |
709 | + "ho=[%d,%d,%d]\n" |
710 | + "re=[%d,%d,%d]\n" |
711 | + "bp=[,%d,%d]\n" |
712 | + "hp=[,%d,%d]\n" |
713 | + "bs=[%d]\n", |
714 | + (int)buffer_origin[0], (int)buffer_origin[1], (int)buffer_origin[2], |
715 | + (int)host_origin[0], (int)host_origin[1], (int)host_origin[2], |
716 | + (int)region[0], (int)region[1], (int)region[2], |
717 | + (int)buffer_row_pitch, (int)buffer_slice_pitch, |
718 | + (int)host_row_pitch, (int)host_slice_pitch, |
719 | + (int)buffer->size); |
720 | + return CL_INVALID_VALUE; |
721 | + } |
722 | + |
723 | + device_id = command_queue->device; |
724 | + for (i = 0; i < command_queue->context->num_devices; ++i) |
725 | + { |
726 | + if (command_queue->context->devices[i] == device_id) |
727 | + break; |
728 | + } |
729 | + |
730 | + assert(i < command_queue->context->num_devices); |
731 | + |
732 | + device_id->write_rect(device_id->data, ptr, buffer->device_ptrs[i], |
733 | + buffer_origin, host_origin, region, |
734 | + buffer_row_pitch, buffer_slice_pitch, |
735 | + host_row_pitch, host_slice_pitch); |
736 | + |
737 | + return CL_SUCCESS; |
738 | +} |
739 | |
740 | === modified file 'lib/CL/clFinish.c' |
741 | --- lib/CL/clFinish.c 2011-12-05 22:08:57 +0000 |
742 | +++ lib/CL/clFinish.c 2011-12-18 04:51:24 +0000 |
743 | @@ -24,7 +24,7 @@ |
744 | #include "pocl_cl.h" |
745 | |
746 | CL_API_ENTRY cl_int CL_API_CALL |
747 | -clFinish(cl_command_queue command_queue) |
748 | +clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 |
749 | { |
750 | /* All operations are serialised, so we never have to wait */ |
751 | return CL_SUCCESS; |
752 | |
753 | === modified file 'lib/CL/clGetContextInfo.c' |
754 | --- lib/CL/clGetContextInfo.c 2011-10-14 10:31:27 +0000 |
755 | +++ lib/CL/clGetContextInfo.c 2011-12-18 04:51:24 +0000 |
756 | @@ -33,28 +33,20 @@ |
757 | size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 |
758 | { |
759 | size_t value_size; |
760 | - |
761 | + |
762 | switch (param_name) { |
763 | - default: |
764 | - assert(0); |
765 | + |
766 | case CL_CONTEXT_DEVICES: |
767 | - value_size = context->num_devices * sizeof(cl_device_id); |
768 | - } |
769 | - |
770 | - if (param_value_size_ret != NULL) |
771 | - *param_value_size_ret = value_size; |
772 | - |
773 | - if (param_value != NULL) { |
774 | - if (value_size > param_value_size) |
775 | - return CL_INVALID_VALUE; |
776 | - |
777 | - switch (param_name) { |
778 | - default: |
779 | - assert(0); |
780 | - case CL_CONTEXT_DEVICES: |
781 | - memcpy(param_value, context->devices, value_size); |
782 | + { |
783 | + value_size = context->num_devices * sizeof(cl_device_id); |
784 | + if (param_value != NULL) { |
785 | + if (param_value_size < value_size) |
786 | + return CL_INVALID_VALUE; |
787 | + memcpy(param_value, context->devices, value_size); |
788 | + } |
789 | + if (param_value_size_ret != NULL) |
790 | + *param_value_size_ret = value_size; |
791 | + return CL_SUCCESS; |
792 | } |
793 | } |
794 | - |
795 | - return CL_SUCCESS; |
796 | } |
797 | |
798 | === modified file 'lib/CL/clGetDeviceIDs.c' |
799 | --- lib/CL/clGetDeviceIDs.c 2011-12-06 03:00:30 +0000 |
800 | +++ lib/CL/clGetDeviceIDs.c 2011-12-18 04:51:24 +0000 |
801 | @@ -23,6 +23,7 @@ |
802 | |
803 | #include "pocl_cl.h" |
804 | #include "devices/devices.h" |
805 | +#include <string.h> |
806 | |
807 | /* Note: this is a kludge. This will require a thorough re-write when pocl |
808 | * supports multiple devices |
809 | @@ -34,41 +35,35 @@ |
810 | cl_device_id * devices, |
811 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0 |
812 | { |
813 | - int num = 0; |
814 | + int num; |
815 | + int i; |
816 | |
817 | - // TODO: OpenCL API specification allows implementation dependant behaviour |
818 | - // if platform == NULL. Should we just allow for it? |
819 | + /* TODO: OpenCL API specification allows implementation dependent |
820 | + behaviour if platform == NULL. Should we just allow it? */ |
821 | if (platform == NULL || ( platform->magic != 42 )) |
822 | return CL_INVALID_PLATFORM; |
823 | - |
824 | - // Currently - POCL supports only the host device - i.e. a CPU |
825 | + |
826 | + /* Currently POCL supports only the host device, i.e. a CPU */ |
827 | if ((device_type & CL_DEVICE_TYPE_CPU) || |
828 | (device_type & CL_DEVICE_TYPE_DEFAULT)) |
829 | num = 1; |
830 | - else if ((device_type | CL_DEVICE_TYPE_GPU) || |
831 | - (device_type | CL_DEVICE_TYPE_ACCELERATOR)) |
832 | + else if ((device_type & CL_DEVICE_TYPE_GPU) || |
833 | + (device_type & CL_DEVICE_TYPE_ACCELERATOR)) |
834 | num = 0; |
835 | else |
836 | return CL_INVALID_DEVICE_TYPE; |
837 | - |
838 | - // no room for any response |
839 | - if (devices == NULL && num_devices == NULL) |
840 | - return CL_INVALID_VALUE; |
841 | - |
842 | - // user forgot to allocate space for response |
843 | - if (num_entries > 0 && devices == NULL ) |
844 | - return CL_INVALID_VALUE; |
845 | - |
846 | - |
847 | + |
848 | + if (devices != NULL) { |
849 | + if (num < num_entries) |
850 | + return CL_INVALID_VALUE; |
851 | + |
852 | + for (i=0; i<num; ++i) |
853 | + devices[i] = &pocl_devices[i]; |
854 | + } |
855 | + |
856 | if (num_devices != NULL) |
857 | *num_devices = num; |
858 | - |
859 | - if (num_entries > 0 && devices!= NULL) |
860 | - { |
861 | - if (num) |
862 | - devices[0] = &pocl_devices[0]; |
863 | - } |
864 | - |
865 | + |
866 | if (num > 0) |
867 | return CL_SUCCESS; |
868 | else |
869 | |
870 | === modified file 'lib/CL/clGetDeviceInfo.c' |
871 | --- lib/CL/clGetDeviceInfo.c 2011-12-02 13:09:02 +0000 |
872 | +++ lib/CL/clGetDeviceInfo.c 2011-12-18 04:51:24 +0000 |
873 | @@ -24,13 +24,175 @@ |
874 | #include "pocl_cl.h" |
875 | |
876 | CL_API_ENTRY cl_int CL_API_CALL |
877 | -clGetDeviceInfo(cl_device_id device, |
878 | - cl_device_info param_name, |
879 | - size_t param_value_size, |
880 | - void * param_value, |
881 | - size_t * param_value_size_ret ) CL_API_SUFFIX__VERSION_1_0 |
882 | +clGetDeviceInfo(cl_device_id device, |
883 | + cl_device_info param_name, |
884 | + size_t param_value_size, |
885 | + void * param_value, |
886 | + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 |
887 | { |
888 | - // TODO: dig up the info |
889 | - return CL_INVALID_VALUE; |
890 | + switch (param_name) |
891 | + { |
892 | + case CL_DEVICE_TYPE: |
893 | + { |
894 | + size_t const value_size = sizeof(cl_device_type); |
895 | + if (param_value) |
896 | + { |
897 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
898 | + *(cl_device_type*)param_value = device->type; |
899 | + } |
900 | + if (param_value_size_ret) |
901 | + *param_value_size_ret = value_size; |
902 | + return CL_SUCCESS; |
903 | + } |
904 | + |
905 | + case CL_DEVICE_VENDOR_ID : break; |
906 | + case CL_DEVICE_MAX_COMPUTE_UNITS : break; |
907 | + case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : break; |
908 | + case CL_DEVICE_MAX_WORK_GROUP_SIZE : break; |
909 | + case CL_DEVICE_MAX_WORK_ITEM_SIZES : break; |
910 | + |
911 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: |
912 | + { |
913 | + size_t const value_size = sizeof(cl_uint); |
914 | + if (param_value) |
915 | + { |
916 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
917 | + *(cl_uint*)param_value = device->preferred_vector_width_char; |
918 | + } |
919 | + if (param_value_size_ret) |
920 | + *param_value_size_ret = value_size; |
921 | + return CL_SUCCESS; |
922 | + } |
923 | + |
924 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: |
925 | + { |
926 | + size_t const value_size = sizeof(cl_uint); |
927 | + if (param_value) |
928 | + { |
929 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
930 | + *(cl_uint*)param_value = device->preferred_vector_width_short; |
931 | + } |
932 | + if (param_value_size_ret) |
933 | + *param_value_size_ret = value_size; |
934 | + return CL_SUCCESS; |
935 | + } |
936 | + |
937 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: |
938 | + { |
939 | + size_t const value_size = sizeof(cl_uint); |
940 | + if (param_value) |
941 | + { |
942 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
943 | + *(cl_uint*)param_value = device->preferred_vector_width_int; |
944 | + } |
945 | + if (param_value_size_ret) |
946 | + *param_value_size_ret = value_size; |
947 | + return CL_SUCCESS; |
948 | + } |
949 | + |
950 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: |
951 | + { |
952 | + size_t const value_size = sizeof(cl_uint); |
953 | + if (param_value) |
954 | + { |
955 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
956 | + *(cl_uint*)param_value = device->preferred_vector_width_long; |
957 | + } |
958 | + if (param_value_size_ret) |
959 | + *param_value_size_ret = value_size; |
960 | + return CL_SUCCESS; |
961 | + } |
962 | + |
963 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: |
964 | + { |
965 | + size_t const value_size = sizeof(cl_uint); |
966 | + if (param_value) |
967 | + { |
968 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
969 | + *(cl_uint*)param_value = device->preferred_vector_width_float; |
970 | + } |
971 | + if (param_value_size_ret) |
972 | + *param_value_size_ret = value_size; |
973 | + return CL_SUCCESS; |
974 | + } |
975 | + |
976 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: |
977 | + { |
978 | + size_t const value_size = sizeof(cl_uint); |
979 | + if (param_value) |
980 | + { |
981 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
982 | + *(cl_uint*)param_value = device->preferred_vector_width_double; |
983 | + } |
984 | + if (param_value_size_ret) |
985 | + *param_value_size_ret = value_size; |
986 | + return CL_SUCCESS; |
987 | + } |
988 | + |
989 | + case CL_DEVICE_MAX_CLOCK_FREQUENCY : break; |
990 | + case CL_DEVICE_ADDRESS_BITS : break; |
991 | + case CL_DEVICE_MAX_READ_IMAGE_ARGS : break; |
992 | + case CL_DEVICE_MAX_WRITE_IMAGE_ARGS : break; |
993 | + case CL_DEVICE_MAX_MEM_ALLOC_SIZE : break; |
994 | + case CL_DEVICE_IMAGE2D_MAX_WIDTH : break; |
995 | + case CL_DEVICE_IMAGE2D_MAX_HEIGHT : break; |
996 | + case CL_DEVICE_IMAGE3D_MAX_WIDTH : break; |
997 | + case CL_DEVICE_IMAGE3D_MAX_HEIGHT : break; |
998 | + case CL_DEVICE_IMAGE3D_MAX_DEPTH : break; |
999 | + case CL_DEVICE_IMAGE_SUPPORT : break; |
1000 | + case CL_DEVICE_MAX_PARAMETER_SIZE : break; |
1001 | + case CL_DEVICE_MAX_SAMPLERS : break; |
1002 | + case CL_DEVICE_MEM_BASE_ADDR_ALIGN : break; |
1003 | + case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE : break; |
1004 | + case CL_DEVICE_SINGLE_FP_CONFIG : break; |
1005 | + case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE : break; |
1006 | + case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE : break; |
1007 | + case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE : break; |
1008 | + case CL_DEVICE_GLOBAL_MEM_SIZE : break; |
1009 | + case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE : break; |
1010 | + case CL_DEVICE_MAX_CONSTANT_ARGS : break; |
1011 | + case CL_DEVICE_LOCAL_MEM_TYPE : break; |
1012 | + case CL_DEVICE_LOCAL_MEM_SIZE : break; |
1013 | + case CL_DEVICE_ERROR_CORRECTION_SUPPORT : break; |
1014 | + case CL_DEVICE_PROFILING_TIMER_RESOLUTION : break; |
1015 | + case CL_DEVICE_ENDIAN_LITTLE : break; |
1016 | + case CL_DEVICE_AVAILABLE : break; |
1017 | + case CL_DEVICE_COMPILER_AVAILABLE : break; |
1018 | + case CL_DEVICE_EXECUTION_CAPABILITIES : break; |
1019 | + case CL_DEVICE_QUEUE_PROPERTIES : break; |
1020 | + |
1021 | + case CL_DEVICE_NAME: |
1022 | + { |
1023 | + size_t const value_size = strlen(device->name) + 1; |
1024 | + if (param_value) |
1025 | + { |
1026 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1027 | + memcpy(param_value, device->name, value_size); |
1028 | + } |
1029 | + if (param_value_size_ret) |
1030 | + *param_value_size_ret = value_size; |
1031 | + return CL_SUCCESS; |
1032 | + } |
1033 | + |
1034 | + case CL_DEVICE_VENDOR : break; |
1035 | + case CL_DRIVER_VERSION : break; |
1036 | + case CL_DEVICE_PROFILE : break; |
1037 | + case CL_DEVICE_VERSION : break; |
1038 | + case CL_DEVICE_EXTENSIONS : break; |
1039 | + case CL_DEVICE_PLATFORM : break; |
1040 | + case CL_DEVICE_DOUBLE_FP_CONFIG : break; |
1041 | + case CL_DEVICE_HALF_FP_CONFIG : break; |
1042 | + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF : break; |
1043 | + case CL_DEVICE_HOST_UNIFIED_MEMORY : break; |
1044 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR : break; |
1045 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT : break; |
1046 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT : break; |
1047 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG : break; |
1048 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT : break; |
1049 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE : break; |
1050 | + case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF : break; |
1051 | + case CL_DEVICE_OPENCL_C_VERSION : break; |
1052 | + } |
1053 | + |
1054 | + return CL_INVALID_VALUE; |
1055 | } |
1056 | - |
1057 | |
1058 | === added file 'lib/CL/clGetEventProfilingInfo.c' |
1059 | --- lib/CL/clGetEventProfilingInfo.c 1970-01-01 00:00:00 +0000 |
1060 | +++ lib/CL/clGetEventProfilingInfo.c 2011-12-18 04:51:24 +0000 |
1061 | @@ -0,0 +1,62 @@ |
1062 | +/* OpenCL runtime library: clGetEventProfilingInfo() |
1063 | + |
1064 | + Copyright (c) 2011 Erik Schnetter |
1065 | + |
1066 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
1067 | + of this software and associated documentation files (the "Software"), to deal |
1068 | + in the Software without restriction, including without limitation the rights |
1069 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
1070 | + copies of the Software, and to permit persons to whom the Software is |
1071 | + furnished to do so, subject to the following conditions: |
1072 | + |
1073 | + The above copyright notice and this permission notice shall be included in |
1074 | + all copies or substantial portions of the Software. |
1075 | + |
1076 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
1077 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
1078 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
1079 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
1080 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
1081 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
1082 | + THE SOFTWARE. |
1083 | +*/ |
1084 | + |
1085 | +#include "pocl_cl.h" |
1086 | +#include <string.h> |
1087 | + |
1088 | +CL_API_ENTRY cl_int CL_API_CALL |
1089 | +clGetEventProfilingInfo(cl_event event, |
1090 | + cl_profiling_info param_name, |
1091 | + size_t param_value_size, |
1092 | + void *param_value, |
1093 | + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 |
1094 | +{ |
1095 | + size_t const value_size = sizeof(cl_ulong); |
1096 | + if (param_value) |
1097 | + { |
1098 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1099 | + |
1100 | + switch (param_name) |
1101 | + { |
1102 | + case CL_PROFILING_COMMAND_QUEUED: |
1103 | + *(cl_ulong*)param_value = 0; /* TODO: return correct value */ |
1104 | + break; |
1105 | + case CL_PROFILING_COMMAND_SUBMIT: |
1106 | + *(cl_ulong*)param_value = 0; /* TODO: return correct value */ |
1107 | + break; |
1108 | + case CL_PROFILING_COMMAND_START: |
1109 | + *(cl_ulong*)param_value = 0; /* TODO: return correct value */ |
1110 | + break; |
1111 | + case CL_PROFILING_COMMAND_END: |
1112 | + *(cl_ulong*)param_value = 0; /* TODO: return correct value */ |
1113 | + break; |
1114 | + default: |
1115 | + return CL_INVALID_VALUE; |
1116 | + } |
1117 | + } |
1118 | + |
1119 | + if (param_value_size_ret) |
1120 | + *param_value_size_ret = value_size; |
1121 | + |
1122 | + return CL_SUCCESS; |
1123 | +} |
1124 | |
1125 | === modified file 'lib/CL/clGetPlatformIDs.c' |
1126 | --- lib/CL/clGetPlatformIDs.c 2011-12-02 13:09:02 +0000 |
1127 | +++ lib/CL/clGetPlatformIDs.c 2011-12-18 04:51:24 +0000 |
1128 | @@ -36,27 +36,19 @@ |
1129 | cl_platform_id * platforms, |
1130 | cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0 |
1131 | { |
1132 | - |
1133 | - // user requests only number of platforms - not types |
1134 | - if (num_entries == 0 && num_platforms != NULL) |
1135 | - { |
1136 | - *num_platforms = 1; |
1137 | - return CL_SUCCESS; |
1138 | - } |
1139 | - |
1140 | - // Bad request - no place to store response |
1141 | - if (num_entries > 0 && platforms == NULL) |
1142 | - return CL_INVALID_VALUE; |
1143 | - |
1144 | - // Check required by spec |
1145 | - if (platforms == NULL && num_platforms == NULL) |
1146 | - return CL_INVALID_VALUE; |
1147 | - |
1148 | - // platform is not used now - just mark this platform as 'valid' |
1149 | - platforms[0] = &(_platforms[0]); |
1150 | - |
1151 | - if (num_platforms != NULL) |
1152 | - *num_platforms = 1; |
1153 | - |
1154 | - return CL_SUCCESS; |
1155 | + int const num = 1; |
1156 | + int i; |
1157 | + |
1158 | + if (platforms != NULL) { |
1159 | + if (num_entries < num) |
1160 | + return CL_INVALID_VALUE; |
1161 | + |
1162 | + for (i=0; i<num; ++i) |
1163 | + platforms[i] = &_platforms[i]; |
1164 | + } |
1165 | + |
1166 | + if (num_platforms != NULL) |
1167 | + *num_platforms = num; |
1168 | + |
1169 | + return CL_SUCCESS; |
1170 | } |
1171 | |
1172 | === modified file 'lib/CL/clGetPlatformInfo.c' |
1173 | --- lib/CL/clGetPlatformInfo.c 2011-12-02 13:09:02 +0000 |
1174 | +++ lib/CL/clGetPlatformInfo.c 2011-12-18 04:51:24 +0000 |
1175 | @@ -31,14 +31,14 @@ |
1176 | void * param_value, |
1177 | size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 |
1178 | { |
1179 | - char *ret=0; |
1180 | - int retlen; |
1181 | + const char *ret; |
1182 | + int retlen; |
1183 | |
1184 | if (platform == NULL || (platform->magic != 42)) |
1185 | return CL_INVALID_PLATFORM; |
1186 | |
1187 | switch (param_name) |
1188 | - { |
1189 | + { |
1190 | case CL_PLATFORM_PROFILE: |
1191 | // TODO: figure this out depending on the native execution host. |
1192 | // assume FULL_PROFILE for now. |
1193 | @@ -59,23 +59,21 @@ |
1194 | break; |
1195 | default: |
1196 | return CL_INVALID_VALUE; |
1197 | - } |
1198 | + } |
1199 | |
1200 | - // Specs say (section 4.1) to "ignore param_value" should it be NULL |
1201 | - if (param_value == NULL) |
1202 | - return CL_SUCCESS; |
1203 | - |
1204 | // the OpenCL API docs *seem* to count the trailing NULL |
1205 | retlen = strlen(ret) + 1; |
1206 | |
1207 | - if (param_value_size < retlen) |
1208 | - return CL_INVALID_VALUE; |
1209 | - |
1210 | - strncpy(param_value, ret, retlen); |
1211 | + if (param_value != NULL) |
1212 | + { |
1213 | + if (param_value_size < retlen) |
1214 | + return CL_INVALID_VALUE; |
1215 | + |
1216 | + memcpy(param_value, ret, retlen); |
1217 | + } |
1218 | |
1219 | if (param_value_size_ret != NULL) |
1220 | - *param_value_size_ret=retlen; |
1221 | + *param_value_size_ret = retlen; |
1222 | |
1223 | return CL_SUCCESS; |
1224 | - |
1225 | } |
1226 | |
1227 | === modified file 'lib/CL/clGetProgramBuildInfo.c' |
1228 | --- lib/CL/clGetProgramBuildInfo.c 2011-12-02 13:09:02 +0000 |
1229 | +++ lib/CL/clGetProgramBuildInfo.c 2011-12-18 04:51:24 +0000 |
1230 | @@ -32,31 +32,48 @@ |
1231 | void * param_value, |
1232 | size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 |
1233 | { |
1234 | - char *retval = ""; |
1235 | - int retlen; |
1236 | - |
1237 | - if (program == NULL) |
1238 | - return CL_INVALID_PROGRAM; |
1239 | - |
1240 | - // currently - just stub-implmement this. |
1241 | - // there doesn't seem to exist an "CL_INTERNAL_ERROR" return code :( |
1242 | - if (param_name != CL_PROGRAM_BUILD_LOG) |
1243 | - return CL_INVALID_OPERATION; |
1244 | - |
1245 | - retlen = strlen(retval) + 1; |
1246 | - |
1247 | - if (param_value == NULL) |
1248 | - return CL_SUCCESS; |
1249 | - |
1250 | - if (param_value_size < retlen) |
1251 | - return CL_INVALID_VALUE; |
1252 | - |
1253 | - strncpy(param_value, retval, retlen); |
1254 | - |
1255 | - if (param_value_size_ret != NULL) |
1256 | - *param_value_size_ret = retlen; |
1257 | - |
1258 | - return CL_SUCCESS; |
1259 | + const char *retval = ""; /* dummy return value */ |
1260 | + |
1261 | + switch (param_name) { |
1262 | + case CL_PROGRAM_BUILD_STATUS: |
1263 | + { |
1264 | + size_t const value_size = strlen(retval) + 1; |
1265 | + if (param_value) |
1266 | + { |
1267 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1268 | + memcpy(param_value, retval, value_size); |
1269 | + } |
1270 | + if (param_value_size_ret) |
1271 | + *param_value_size_ret = value_size; |
1272 | + return CL_SUCCESS; |
1273 | + } |
1274 | + |
1275 | + case CL_PROGRAM_BUILD_OPTIONS: |
1276 | + { |
1277 | + size_t const value_size = strlen(retval) + 1; |
1278 | + if (param_value) |
1279 | + { |
1280 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1281 | + memcpy(param_value, retval, value_size); |
1282 | + } |
1283 | + if (param_value_size_ret) |
1284 | + *param_value_size_ret = value_size; |
1285 | + return CL_SUCCESS; |
1286 | + } |
1287 | + |
1288 | + case CL_PROGRAM_BUILD_LOG: |
1289 | + { |
1290 | + size_t const value_size = strlen(retval) + 1; |
1291 | + if (param_value) |
1292 | + { |
1293 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1294 | + memcpy(param_value, retval, value_size); |
1295 | + } |
1296 | + if (param_value_size_ret) |
1297 | + *param_value_size_ret = value_size; |
1298 | + return CL_SUCCESS; |
1299 | + } |
1300 | + } |
1301 | + |
1302 | + return CL_INVALID_VALUE; |
1303 | } |
1304 | - |
1305 | - |
1306 | |
1307 | === added file 'lib/CL/clGetProgramInfo.c' |
1308 | --- lib/CL/clGetProgramInfo.c 1970-01-01 00:00:00 +0000 |
1309 | +++ lib/CL/clGetProgramInfo.c 2011-12-18 04:51:24 +0000 |
1310 | @@ -0,0 +1,89 @@ |
1311 | +/* OpenCL runtime library: clGetProgramInfo() |
1312 | + |
1313 | + Copyright (c) 2011 Erik Schnetter |
1314 | + |
1315 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
1316 | + of this software and associated documentation files (the "Software"), to deal |
1317 | + in the Software without restriction, including without limitation the rights |
1318 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
1319 | + copies of the Software, and to permit persons to whom the Software is |
1320 | + furnished to do so, subject to the following conditions: |
1321 | + |
1322 | + The above copyright notice and this permission notice shall be included in |
1323 | + all copies or substantial portions of the Software. |
1324 | + |
1325 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
1326 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
1327 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
1328 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
1329 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
1330 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
1331 | + THE SOFTWARE. |
1332 | +*/ |
1333 | + |
1334 | +#include "pocl_cl.h" |
1335 | +#include <string.h> |
1336 | + |
1337 | +CL_API_ENTRY cl_int CL_API_CALL |
1338 | +clGetProgramInfo(cl_program program, |
1339 | + cl_program_info param_name, |
1340 | + size_t param_value_size, |
1341 | + void *param_value, |
1342 | + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 |
1343 | +{ |
1344 | + switch (param_name) |
1345 | + { |
1346 | + case CL_PROGRAM_REFERENCE_COUNT: |
1347 | + return CL_INVALID_VALUE; /* not yet implemented */ |
1348 | + |
1349 | + case CL_PROGRAM_CONTEXT: |
1350 | + return CL_INVALID_VALUE; /* not yet implemented */ |
1351 | + |
1352 | + case CL_PROGRAM_NUM_DEVICES: |
1353 | + return CL_INVALID_VALUE; /* not yet implemented */ |
1354 | + |
1355 | + case CL_PROGRAM_DEVICES: |
1356 | + return CL_INVALID_VALUE; /* not yet implemented */ |
1357 | + |
1358 | + case CL_PROGRAM_SOURCE: |
1359 | + { |
1360 | + size_t const value_size = strlen(program->source) + 1; |
1361 | + if (param_value) |
1362 | + { |
1363 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1364 | + memcpy(param_value, program->source, value_size); |
1365 | + } |
1366 | + if (param_value_size_ret) |
1367 | + *param_value_size_ret = value_size; |
1368 | + return CL_SUCCESS; |
1369 | + } |
1370 | + |
1371 | + case CL_PROGRAM_BINARY_SIZES: |
1372 | + { |
1373 | + size_t const value_size = sizeof(size_t); |
1374 | + if (param_value) |
1375 | + { |
1376 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1377 | + *(size_t*)param_value = program->binary_size; |
1378 | + } |
1379 | + if (param_value_size_ret) |
1380 | + *param_value_size_ret = value_size; |
1381 | + return CL_SUCCESS; |
1382 | + } |
1383 | + |
1384 | + case CL_PROGRAM_BINARIES: |
1385 | + { |
1386 | + size_t const value_size = sizeof(unsigned char *); |
1387 | + if (param_value) |
1388 | + { |
1389 | + if (param_value_size < value_size) return CL_INVALID_VALUE; |
1390 | + *(unsigned char **)param_value = program->binary; |
1391 | + } |
1392 | + if (param_value_size_ret) |
1393 | + *param_value_size_ret = value_size; |
1394 | + return CL_SUCCESS; |
1395 | + } |
1396 | + } |
1397 | + |
1398 | + return CL_INVALID_VALUE; |
1399 | +} |
1400 | |
1401 | === modified file 'lib/CL/devices/native/native.c' |
1402 | --- lib/CL/devices/native/native.c 2011-12-08 02:45:48 +0000 |
1403 | +++ lib/CL/devices/native/native.c 2011-12-18 04:51:24 +0000 |
1404 | @@ -27,9 +27,13 @@ |
1405 | #include <stdlib.h> |
1406 | #include <unistd.h> |
1407 | |
1408 | +#define max(a,b) (((a) > (b)) ? (a) : (b)) |
1409 | + |
1410 | #define COMMAND_LENGTH 256 |
1411 | #define WORKGROUP_STRING_LENGTH 128 |
1412 | |
1413 | +#define ALIGNMENT (max(ALIGNOF_FLOAT16, ALIGNOF_DOUBLE16)) |
1414 | + |
1415 | struct data { |
1416 | /* Currently loaded kernel. */ |
1417 | cl_kernel current_kernel; |
1418 | @@ -60,7 +64,7 @@ |
1419 | |
1420 | if (flags & CL_MEM_COPY_HOST_PTR) |
1421 | { |
1422 | - if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0) |
1423 | + if (posix_memalign (&b, ALIGNMENT, size) == 0) |
1424 | { |
1425 | memcpy (b, host_ptr, size); |
1426 | return b; |
1427 | @@ -74,7 +78,7 @@ |
1428 | return host_ptr; |
1429 | } |
1430 | |
1431 | - if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0) |
1432 | + if (posix_memalign (&b, ALIGNMENT, size) == 0) |
1433 | return b; |
1434 | |
1435 | return NULL; |
1436 | |
1437 | === modified file 'lib/CL/devices/native/native.h' |
1438 | --- lib/CL/devices/native/native.h 2011-12-08 02:45:48 +0000 |
1439 | +++ lib/CL/devices/native/native.h 2011-12-18 04:51:24 +0000 |
1440 | @@ -94,7 +94,11 @@ |
1441 | pocl_native_malloc, /* malloc */ \ |
1442 | pocl_native_free, /* free */ \ |
1443 | pocl_native_read, /* read */ \ |
1444 | + NULL, /* read_rect */ \ |
1445 | pocl_native_write, /* write */ \ |
1446 | + NULL, /* write_rect */ \ |
1447 | + NULL, /* copy */ \ |
1448 | + NULL, /* copy_rect */ \ |
1449 | pocl_native_run, /* run */ \ |
1450 | NULL /* data */ \ |
1451 | } |
1452 | |
1453 | === modified file 'lib/CL/devices/pthread/pthread.c' |
1454 | --- lib/CL/devices/pthread/pthread.c 2011-12-08 02:45:48 +0000 |
1455 | +++ lib/CL/devices/pthread/pthread.c 2011-12-18 04:51:24 +0000 |
1456 | @@ -29,10 +29,13 @@ |
1457 | #include <unistd.h> |
1458 | |
1459 | #define min(a,b) (((a) < (b)) ? (a) : (b)) |
1460 | +#define max(a,b) (((a) > (b)) ? (a) : (b)) |
1461 | |
1462 | #define COMMAND_LENGTH 256 |
1463 | #define WORKGROUP_STRING_LENGTH 128 |
1464 | |
1465 | +#define ALIGNMENT (max(ALIGNOF_FLOAT16, ALIGNOF_DOUBLE16)) |
1466 | + |
1467 | /* The name of the environment variable used to force a certain max thread count |
1468 | for the thread execution. */ |
1469 | #define THREAD_COUNT_ENV "POCL_MAX_PTHREAD_COUNT" |
1470 | @@ -80,7 +83,7 @@ |
1471 | |
1472 | if (flags & CL_MEM_COPY_HOST_PTR) |
1473 | { |
1474 | - if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0) |
1475 | + if (posix_memalign (&b, ALIGNMENT, size) == 0) |
1476 | { |
1477 | memcpy (b, host_ptr, size); |
1478 | return b; |
1479 | @@ -94,7 +97,7 @@ |
1480 | return host_ptr; |
1481 | } |
1482 | |
1483 | - if (posix_memalign (&b, ALIGNOF_FLOAT16, size) == 0) |
1484 | + if (posix_memalign (&b, ALIGNMENT, size) == 0) |
1485 | return b; |
1486 | |
1487 | return NULL; |
1488 | @@ -118,7 +121,38 @@ |
1489 | memcpy (host_ptr, device_ptr, cb); |
1490 | } |
1491 | |
1492 | -void pocl_pthread_write (void *data, const void *host_ptr, void *device_ptr, size_t cb) |
1493 | +void |
1494 | +pocl_pthread_read_rect (void *data, |
1495 | + void *__restrict__ const host_ptr, |
1496 | + void *__restrict__ const device_ptr, |
1497 | + const size_t *__restrict__ const buffer_origin, |
1498 | + const size_t *__restrict__ const host_origin, |
1499 | + const size_t *__restrict__ const region, |
1500 | + size_t const buffer_row_pitch, |
1501 | + size_t const buffer_slice_pitch, |
1502 | + size_t const host_row_pitch, |
1503 | + size_t const host_slice_pitch) |
1504 | +{ |
1505 | + char const *__restrict const adjusted_device_ptr = |
1506 | + (char const*)device_ptr + |
1507 | + buffer_origin[0] + buffer_row_pitch * (buffer_origin[1] + buffer_slice_pitch * buffer_origin[2]); |
1508 | + char *__restrict__ const adjusted_host_ptr = |
1509 | + (char*)host_ptr + |
1510 | + host_origin[0] + host_row_pitch * (host_origin[1] + host_slice_pitch * host_origin[2]); |
1511 | + |
1512 | + size_t j, k; |
1513 | + |
1514 | + /* TODO: handle overlaping regions */ |
1515 | + |
1516 | + for (k = 0; k < region[2]; ++k) |
1517 | + for (j = 0; j < region[1]; ++j) |
1518 | + memcpy (adjusted_host_ptr + host_row_pitch * j + host_slice_pitch * k, |
1519 | + adjusted_device_ptr + buffer_row_pitch * j + buffer_slice_pitch * k, |
1520 | + region[0]); |
1521 | +} |
1522 | + |
1523 | +void |
1524 | +pocl_pthread_write (void *data, const void *host_ptr, void *device_ptr, size_t cb) |
1525 | { |
1526 | if (host_ptr == device_ptr) |
1527 | return; |
1528 | @@ -126,6 +160,75 @@ |
1529 | memcpy (device_ptr, host_ptr, cb); |
1530 | } |
1531 | |
1532 | +void |
1533 | +pocl_pthread_write_rect (void *data, |
1534 | + const void *__restrict__ const host_ptr, |
1535 | + void *__restrict__ const device_ptr, |
1536 | + const size_t *__restrict__ const buffer_origin, |
1537 | + const size_t *__restrict__ const host_origin, |
1538 | + const size_t *__restrict__ const region, |
1539 | + size_t const buffer_row_pitch, |
1540 | + size_t const buffer_slice_pitch, |
1541 | + size_t const host_row_pitch, |
1542 | + size_t const host_slice_pitch) |
1543 | +{ |
1544 | + char *__restrict const adjusted_device_ptr = |
1545 | + (char*)device_ptr + |
1546 | + buffer_origin[0] + buffer_row_pitch * (buffer_origin[1] + buffer_slice_pitch * buffer_origin[2]); |
1547 | + char const *__restrict__ const adjusted_host_ptr = |
1548 | + (char const*)host_ptr + |
1549 | + host_origin[0] + host_row_pitch * (host_origin[1] + host_slice_pitch * host_origin[2]); |
1550 | + |
1551 | + size_t j, k; |
1552 | + |
1553 | + /* TODO: handle overlaping regions */ |
1554 | + |
1555 | + for (k = 0; k < region[2]; ++k) |
1556 | + for (j = 0; j < region[1]; ++j) |
1557 | + memcpy (adjusted_device_ptr + buffer_row_pitch * j + buffer_slice_pitch * k, |
1558 | + adjusted_host_ptr + host_row_pitch * j + host_slice_pitch * k, |
1559 | + region[0]); |
1560 | +} |
1561 | + |
1562 | +void |
1563 | +pocl_pthread_copy (void *data, const void *src_ptr, const void *dst_ptr, size_t cb) |
1564 | +{ |
1565 | + if (src_ptr == dst_ptr) |
1566 | + return; |
1567 | + |
1568 | + memcpy (dst_ptr, src_ptr, cb); |
1569 | +} |
1570 | + |
1571 | +void |
1572 | +pocl_pthread_copy_rect (void *data, |
1573 | + const void *__restrict const src_ptr, |
1574 | + void *__restrict__ const dst_ptr, |
1575 | + const size_t *__restrict__ const src_origin, |
1576 | + const size_t *__restrict__ const dst_origin, |
1577 | + const size_t *__restrict__ const region, |
1578 | + size_t const src_row_pitch, |
1579 | + size_t const src_slice_pitch, |
1580 | + size_t const dst_row_pitch, |
1581 | + size_t const dst_slice_pitch) |
1582 | +{ |
1583 | + char const *__restrict const adjusted_src_ptr = |
1584 | + (char const*)src_ptr + |
1585 | + src_origin[0] + src_row_pitch * (src_origin[1] + src_slice_pitch * src_origin[2]); |
1586 | + char *__restrict__ const adjusted_dst_ptr = |
1587 | + (char*)dst_ptr + |
1588 | + dst_origin[0] + dst_row_pitch * (dst_origin[1] + dst_slice_pitch * dst_origin[2]); |
1589 | + |
1590 | + size_t j, k; |
1591 | + |
1592 | + /* TODO: handle overlaping regions */ |
1593 | + |
1594 | + for (k = 0; k < region[2]; ++k) |
1595 | + for (j = 0; j < region[1]; ++j) |
1596 | + memcpy (adjusted_dst_ptr + dst_row_pitch * j + dst_slice_pitch * k, |
1597 | + adjusted_src_ptr + src_row_pitch * j + src_slice_pitch * k, |
1598 | + region[0]); |
1599 | +} |
1600 | + |
1601 | //#define DEBUG_MT |
1602 | //#define DEBUG_MAX_THREAD_COUNT |
1603 | /** |
1604 | |
1605 | === modified file 'lib/CL/devices/pthread/pthread.h' |
1606 | --- lib/CL/devices/pthread/pthread.h 2011-12-08 02:45:48 +0000 |
1607 | +++ lib/CL/devices/pthread/pthread.h 2011-12-18 04:51:24 +0000 |
1608 | @@ -31,13 +31,82 @@ |
1609 | size_t size, void *host_ptr); |
1610 | void pocl_pthread_free (void *data, cl_mem_flags flags, void *ptr); |
1611 | void pocl_pthread_read (void *data, void *host_ptr, const void *device_ptr, size_t cb); |
1612 | +void pocl_pthread_read_rect (void *data, void *host_ptr, void *device_ptr, |
1613 | + const size_t *buffer_origin, |
1614 | + const size_t *host_origin, |
1615 | + const size_t *region, |
1616 | + size_t buffer_row_pitch, |
1617 | + size_t buffer_slice_pitch, |
1618 | + size_t host_row_pitch, |
1619 | + size_t host_slice_pitch); |
1620 | void pocl_pthread_write (void *data, const void *host_ptr, void *device_ptr, size_t cb); |
1621 | +void pocl_pthread_write_rect (void *data, const void *host_ptr, void *device_ptr, |
1622 | + const size_t *buffer_origin, |
1623 | + const size_t *host_origin, |
1624 | + const size_t *region, |
1625 | + size_t buffer_row_pitch, |
1626 | + size_t buffer_slice_pitch, |
1627 | + size_t host_row_pitch, |
1628 | + size_t host_slice_pitch); |
1629 | +void pocl_pthread_copy (void *data, const void *src_ptr, const void *dst_ptr, size_t cb); |
1630 | +void pocl_pthread_copy_rect (void *data, const void *src_ptr, void *dst_ptr, |
1631 | + const size_t *src_origin, |
1632 | + const size_t *dst_origin, |
1633 | + const size_t *region, |
1634 | + size_t src_row_pitch, |
1635 | + size_t src_slice_pitch, |
1636 | + size_t dst_row_pitch, |
1637 | + size_t dst_slice_pitch); |
1638 | void pocl_pthread_run (void *data, const char *bytecode, |
1639 | cl_kernel kernel, |
1640 | struct pocl_context *pc); |
1641 | |
1642 | extern size_t pocl_pthread_max_work_item_sizes[]; |
1643 | |
1644 | +/* Determine preferred vector sizes */ |
1645 | +#if defined(__AVX__) |
1646 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR 16 |
1647 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT 8 |
1648 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT 4 |
1649 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG 2 |
1650 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT 4 |
1651 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE 2 |
1652 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_CHAR 16 |
1653 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_SHORT 8 |
1654 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_INT 4 |
1655 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_LONG 2 |
1656 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_FLOAT 8 |
1657 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_DOUBLE 4 |
1658 | +#elif defined(__SSE2__) |
1659 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR 16 |
1660 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT 8 |
1661 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT 4 |
1662 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG 2 |
1663 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT 4 |
1664 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE 2 |
1665 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_CHAR 16 |
1666 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_SHORT 8 |
1667 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_INT 4 |
1668 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_LONG 2 |
1669 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_FLOAT 4 |
1670 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_DOUBLE 2 |
1671 | +#else |
1672 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR 1 |
1673 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT 1 |
1674 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT 1 |
1675 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG 1 |
1676 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT 1 |
1677 | +# define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE 1 |
1678 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_CHAR 1 |
1679 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_SHORT 1 |
1680 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_INT 1 |
1681 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_LONG 1 |
1682 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_FLOAT 1 |
1683 | +# define POCL_DEVICES_PTHREAD_NATIVE_VECTOR_WIDTH_DOUBLE 1 |
1684 | +#endif |
1685 | +/* Half is internally represented as short */ |
1686 | +#define POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_HALF POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT |
1687 | + |
1688 | #define POCL_DEVICES_PTHREAD { \ |
1689 | CL_DEVICE_TYPE_CPU, /* type */ \ |
1690 | 0, /* vendor_id */ \ |
1691 | @@ -45,12 +114,12 @@ |
1692 | 3, /* max_work_item_dimensions */ \ |
1693 | pocl_pthread_max_work_item_sizes, /* max_work_item_sizes */ \ |
1694 | CL_INT_MAX, /* max_work_group_size */ \ |
1695 | - 0, /* preferred_vector_width_char */ \ |
1696 | - 0, /* preferred_vector_width_short */ \ |
1697 | - 0, /* preferred_vector_width_int */ \ |
1698 | - 0, /* preferred_vector_width_long */ \ |
1699 | - 0, /* preferred_vector_width_float */ \ |
1700 | - 0, /* preferred_vector_width_double */ \ |
1701 | + POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_CHAR , /* preferred_vector_width_char */ \ |
1702 | + POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_SHORT , /* preferred_vector_width_short */ \ |
1703 | + POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_INT , /* preferred_vector_width_int */ \ |
1704 | + POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_LONG , /* preferred_vector_width_long */ \ |
1705 | + POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_FLOAT , /* preferred_vector_width_float */ \ |
1706 | + POCL_DEVICES_PTHREAD_PREFERRED_VECTOR_WIDTH_DOUBLE, /* preferred_vector_width_double */ \ |
1707 | 0, /* max_clock_frequency */ \ |
1708 | 0, /* address_bits */ \ |
1709 | 0, /* max_mem_alloc_size */ \ |
1710 | @@ -94,7 +163,11 @@ |
1711 | pocl_pthread_malloc, /* malloc */ \ |
1712 | pocl_pthread_free, /* free */ \ |
1713 | pocl_pthread_read, /* read */ \ |
1714 | + pocl_pthread_read_rect, /* read_rect */ \ |
1715 | pocl_pthread_write, /* write */ \ |
1716 | + pocl_pthread_write_rect, /* write_rect */ \ |
1717 | + pocl_pthread_copy, /* copy */ \ |
1718 | + pocl_pthread_copy_rect, /* copy_rect */ \ |
1719 | pocl_pthread_run, /* run */ \ |
1720 | NULL /* data */ \ |
1721 | } |
1722 | |
1723 | === modified file 'lib/CL/pocl_cl.h' |
1724 | --- lib/CL/pocl_cl.h 2011-12-08 14:06:07 +0000 |
1725 | +++ lib/CL/pocl_cl.h 2011-12-18 04:51:24 +0000 |
1726 | @@ -100,7 +100,32 @@ |
1727 | size_t size, void *host_ptr); |
1728 | void (*free) (void *data, cl_mem_flags flags, void *ptr); |
1729 | void (*read) (void *data, void *host_ptr, void *device_ptr, size_t cb); |
1730 | + void (*read_rect) (void *data, void *host_ptr, void *device_ptr, |
1731 | + const size_t *buffer_origin, |
1732 | + const size_t *host_origin, |
1733 | + const size_t *region, |
1734 | + size_t buffer_row_pitch, |
1735 | + size_t buffer_slice_pitch, |
1736 | + size_t host_row_pitch, |
1737 | + size_t host_slice_pitch); |
1738 | void (*write) (void *data, const void *host_ptr, void *device_ptr, size_t cb); |
1739 | + void (*write_rect) (void *data, const void *host_ptr, void *device_ptr, |
1740 | + const size_t *buffer_origin, |
1741 | + const size_t *host_origin, |
1742 | + const size_t *region, |
1743 | + size_t buffer_row_pitch, |
1744 | + size_t buffer_slice_pitch, |
1745 | + size_t host_row_pitch, |
1746 | + size_t host_slice_pitch); |
1747 | + void (*copy) (void *data, const void *src_ptr, const void *dst_ptr, size_t cb); |
1748 | + void (*copy_rect) (void *data, const void *src_ptr, void *dst_ptr, |
1749 | + const size_t *src_origin, |
1750 | + const size_t *dst_origin, |
1751 | + const size_t *region, |
1752 | + size_t src_row_pitch, |
1753 | + size_t src_slice_pitch, |
1754 | + size_t dst_row_pitch, |
1755 | + size_t dst_slice_pitch); |
1756 | void (*run) (void *data, const char *bytecode, |
1757 | cl_kernel kernel, |
1758 | struct pocl_context *pc); |
1759 | |
1760 | === modified file 'lib/kernel/all.cl' |
1761 | --- lib/kernel/all.cl 2011-11-24 15:56:10 +0000 |
1762 | +++ lib/kernel/all.cl 2011-12-18 04:51:24 +0000 |
1763 | @@ -21,123 +21,123 @@ |
1764 | THE SOFTWARE. |
1765 | */ |
1766 | |
1767 | -int __attribute__((__overloadable__)) all(char a) |
1768 | +int _cl_overloadable all(char a) |
1769 | { |
1770 | return a < (char)0; |
1771 | } |
1772 | |
1773 | -int __attribute__((__overloadable__)) all(char2 a) |
1774 | +int _cl_overloadable all(char2 a) |
1775 | { |
1776 | return all(a.lo) && all(a.hi); |
1777 | } |
1778 | |
1779 | -int __attribute__((__overloadable__)) all(char3 a) |
1780 | +int _cl_overloadable all(char3 a) |
1781 | { |
1782 | return all(a.s01) && all(a.s2); |
1783 | } |
1784 | |
1785 | -int __attribute__((__overloadable__)) all(char4 a) |
1786 | -{ |
1787 | - return all(a.lo) && all(a.hi); |
1788 | -} |
1789 | - |
1790 | -int __attribute__((__overloadable__)) all(char8 a) |
1791 | -{ |
1792 | - return all(a.lo) && all(a.hi); |
1793 | -} |
1794 | - |
1795 | -int __attribute__((__overloadable__)) all(char16 a) |
1796 | -{ |
1797 | - return all(a.lo) && all(a.hi); |
1798 | -} |
1799 | - |
1800 | -int __attribute__((__overloadable__)) all(short a) |
1801 | +int _cl_overloadable all(char4 a) |
1802 | +{ |
1803 | + return all(a.lo) && all(a.hi); |
1804 | +} |
1805 | + |
1806 | +int _cl_overloadable all(char8 a) |
1807 | +{ |
1808 | + return all(a.lo) && all(a.hi); |
1809 | +} |
1810 | + |
1811 | +int _cl_overloadable all(char16 a) |
1812 | +{ |
1813 | + return all(a.lo) && all(a.hi); |
1814 | +} |
1815 | + |
1816 | +int _cl_overloadable all(short a) |
1817 | { |
1818 | return a < (short)0; |
1819 | } |
1820 | |
1821 | -int __attribute__((__overloadable__)) all(short2 a) |
1822 | +int _cl_overloadable all(short2 a) |
1823 | { |
1824 | return all(a.lo) && all(a.hi); |
1825 | } |
1826 | |
1827 | -int __attribute__((__overloadable__)) all(short3 a) |
1828 | +int _cl_overloadable all(short3 a) |
1829 | { |
1830 | return all(a.s01) && all(a.s2); |
1831 | } |
1832 | |
1833 | -int __attribute__((__overloadable__)) all(short4 a) |
1834 | -{ |
1835 | - return all(a.lo) && all(a.hi); |
1836 | -} |
1837 | - |
1838 | -int __attribute__((__overloadable__)) all(short8 a) |
1839 | -{ |
1840 | - return all(a.lo) && all(a.hi); |
1841 | -} |
1842 | - |
1843 | -int __attribute__((__overloadable__)) all(short16 a) |
1844 | -{ |
1845 | - return all(a.lo) && all(a.hi); |
1846 | -} |
1847 | - |
1848 | -int __attribute__((__overloadable__)) all(int a) |
1849 | +int _cl_overloadable all(short4 a) |
1850 | +{ |
1851 | + return all(a.lo) && all(a.hi); |
1852 | +} |
1853 | + |
1854 | +int _cl_overloadable all(short8 a) |
1855 | +{ |
1856 | + return all(a.lo) && all(a.hi); |
1857 | +} |
1858 | + |
1859 | +int _cl_overloadable all(short16 a) |
1860 | +{ |
1861 | + return all(a.lo) && all(a.hi); |
1862 | +} |
1863 | + |
1864 | +int _cl_overloadable all(int a) |
1865 | { |
1866 | return a < 0; |
1867 | } |
1868 | |
1869 | -int __attribute__((__overloadable__)) all(int2 a) |
1870 | +int _cl_overloadable all(int2 a) |
1871 | { |
1872 | return all(a.lo) && all(a.hi); |
1873 | } |
1874 | |
1875 | -int __attribute__((__overloadable__)) all(int3 a) |
1876 | +int _cl_overloadable all(int3 a) |
1877 | { |
1878 | return all(a.s01) && all(a.s2); |
1879 | } |
1880 | |
1881 | -int __attribute__((__overloadable__)) all(int4 a) |
1882 | -{ |
1883 | - return all(a.lo) && all(a.hi); |
1884 | -} |
1885 | - |
1886 | -int __attribute__((__overloadable__)) all(int8 a) |
1887 | -{ |
1888 | - return all(a.lo) && all(a.hi); |
1889 | -} |
1890 | - |
1891 | -int __attribute__((__overloadable__)) all(int16 a) |
1892 | +int _cl_overloadable all(int4 a) |
1893 | +{ |
1894 | + return all(a.lo) && all(a.hi); |
1895 | +} |
1896 | + |
1897 | +int _cl_overloadable all(int8 a) |
1898 | +{ |
1899 | + return all(a.lo) && all(a.hi); |
1900 | +} |
1901 | + |
1902 | +int _cl_overloadable all(int16 a) |
1903 | { |
1904 | return all(a.lo) && all(a.hi); |
1905 | } |
1906 | |
1907 | #ifdef cles_khr_int64 |
1908 | -int __attribute__((__overloadable__)) all(long a) |
1909 | +int _cl_overloadable all(long a) |
1910 | { |
1911 | return a < 0L; |
1912 | } |
1913 | |
1914 | -int __attribute__((__overloadable__)) all(long2 a) |
1915 | +int _cl_overloadable all(long2 a) |
1916 | { |
1917 | return all(a.lo) && all(a.hi); |
1918 | } |
1919 | |
1920 | -int __attribute__((__overloadable__)) all(long3 a) |
1921 | +int _cl_overloadable all(long3 a) |
1922 | { |
1923 | return all(a.s01) && all(a.s2); |
1924 | } |
1925 | |
1926 | -int __attribute__((__overloadable__)) all(long4 a) |
1927 | -{ |
1928 | - return all(a.lo) && all(a.hi); |
1929 | -} |
1930 | - |
1931 | -int __attribute__((__overloadable__)) all(long8 a) |
1932 | -{ |
1933 | - return all(a.lo) && all(a.hi); |
1934 | -} |
1935 | - |
1936 | -int __attribute__((__overloadable__)) all(long16 a) |
1937 | +int _cl_overloadable all(long4 a) |
1938 | +{ |
1939 | + return all(a.lo) && all(a.hi); |
1940 | +} |
1941 | + |
1942 | +int _cl_overloadable all(long8 a) |
1943 | +{ |
1944 | + return all(a.lo) && all(a.hi); |
1945 | +} |
1946 | + |
1947 | +int _cl_overloadable all(long16 a) |
1948 | { |
1949 | return all(a.lo) && all(a.hi); |
1950 | } |
1951 | |
1952 | === modified file 'lib/kernel/any.cl' |
1953 | --- lib/kernel/any.cl 2011-11-24 15:56:10 +0000 |
1954 | +++ lib/kernel/any.cl 2011-12-18 04:51:24 +0000 |
1955 | @@ -21,123 +21,123 @@ |
1956 | THE SOFTWARE. |
1957 | */ |
1958 | |
1959 | -int __attribute__((__overloadable__)) any(char a) |
1960 | +int _cl_overloadable any(char a) |
1961 | { |
1962 | return a < (char)0; |
1963 | } |
1964 | |
1965 | -int __attribute__((__overloadable__)) any(char2 a) |
1966 | +int _cl_overloadable any(char2 a) |
1967 | { |
1968 | return any(a.lo) || any(a.hi); |
1969 | } |
1970 | |
1971 | -int __attribute__((__overloadable__)) any(char3 a) |
1972 | +int _cl_overloadable any(char3 a) |
1973 | { |
1974 | return any(a.s01) || any(a.s2); |
1975 | } |
1976 | |
1977 | -int __attribute__((__overloadable__)) any(char4 a) |
1978 | -{ |
1979 | - return any(a.lo) || any(a.hi); |
1980 | -} |
1981 | - |
1982 | -int __attribute__((__overloadable__)) any(char8 a) |
1983 | -{ |
1984 | - return any(a.lo) || any(a.hi); |
1985 | -} |
1986 | - |
1987 | -int __attribute__((__overloadable__)) any(char16 a) |
1988 | -{ |
1989 | - return any(a.lo) || any(a.hi); |
1990 | -} |
1991 | - |
1992 | -int __attribute__((__overloadable__)) any(short a) |
1993 | +int _cl_overloadable any(char4 a) |
1994 | +{ |
1995 | + return any(a.lo) || any(a.hi); |
1996 | +} |
1997 | + |
1998 | +int _cl_overloadable any(char8 a) |
1999 | +{ |
2000 | + return any(a.lo) || any(a.hi); |
2001 | +} |
2002 | + |
2003 | +int _cl_overloadable any(char16 a) |
2004 | +{ |
2005 | + return any(a.lo) || any(a.hi); |
2006 | +} |
2007 | + |
2008 | +int _cl_overloadable any(short a) |
2009 | { |
2010 | return a < (short)0; |
2011 | } |
2012 | |
2013 | -int __attribute__((__overloadable__)) any(short2 a) |
2014 | +int _cl_overloadable any(short2 a) |
2015 | { |
2016 | return any(a.lo) || any(a.hi); |
2017 | } |
2018 | |
2019 | -int __attribute__((__overloadable__)) any(short3 a) |
2020 | +int _cl_overloadable any(short3 a) |
2021 | { |
2022 | return any(a.s01) || any(a.s2); |
2023 | } |
2024 | |
2025 | -int __attribute__((__overloadable__)) any(short4 a) |
2026 | -{ |
2027 | - return any(a.lo) || any(a.hi); |
2028 | -} |
2029 | - |
2030 | -int __attribute__((__overloadable__)) any(short8 a) |
2031 | -{ |
2032 | - return any(a.lo) || any(a.hi); |
2033 | -} |
2034 | - |
2035 | -int __attribute__((__overloadable__)) any(short16 a) |
2036 | -{ |
2037 | - return any(a.lo) || any(a.hi); |
2038 | -} |
2039 | - |
2040 | -int __attribute__((__overloadable__)) any(int a) |
2041 | +int _cl_overloadable any(short4 a) |
2042 | +{ |
2043 | + return any(a.lo) || any(a.hi); |
2044 | +} |
2045 | + |
2046 | +int _cl_overloadable any(short8 a) |
2047 | +{ |
2048 | + return any(a.lo) || any(a.hi); |
2049 | +} |
2050 | + |
2051 | +int _cl_overloadable any(short16 a) |
2052 | +{ |
2053 | + return any(a.lo) || any(a.hi); |
2054 | +} |
2055 | + |
2056 | +int _cl_overloadable any(int a) |
2057 | { |
2058 | return a < 0; |
2059 | } |
2060 | |
2061 | -int __attribute__((__overloadable__)) any(int2 a) |
2062 | +int _cl_overloadable any(int2 a) |
2063 | { |
2064 | return any(a.lo) || any(a.hi); |
2065 | } |
2066 | |
2067 | -int __attribute__((__overloadable__)) any(int3 a) |
2068 | +int _cl_overloadable any(int3 a) |
2069 | { |
2070 | return any(a.s01) || any(a.s2); |
2071 | } |
2072 | |
2073 | -int __attribute__((__overloadable__)) any(int4 a) |
2074 | -{ |
2075 | - return any(a.lo) || any(a.hi); |
2076 | -} |
2077 | - |
2078 | -int __attribute__((__overloadable__)) any(int8 a) |
2079 | -{ |
2080 | - return any(a.lo) || any(a.hi); |
2081 | -} |
2082 | - |
2083 | -int __attribute__((__overloadable__)) any(int16 a) |
2084 | +int _cl_overloadable any(int4 a) |
2085 | +{ |
2086 | + return any(a.lo) || any(a.hi); |
2087 | +} |
2088 | + |
2089 | +int _cl_overloadable any(int8 a) |
2090 | +{ |
2091 | + return any(a.lo) || any(a.hi); |
2092 | +} |
2093 | + |
2094 | +int _cl_overloadable any(int16 a) |
2095 | { |
2096 | return any(a.lo) || any(a.hi); |
2097 | } |
2098 | |
2099 | #ifdef cles_khr_int64 |
2100 | -int __attribute__((__overloadable__)) any(long a) |
2101 | +int _cl_overloadable any(long a) |
2102 | { |
2103 | return a < 0L; |
2104 | } |
2105 | |
2106 | -int __attribute__((__overloadable__)) any(long2 a) |
2107 | +int _cl_overloadable any(long2 a) |
2108 | { |
2109 | return any(a.lo) || any(a.hi); |
2110 | } |
2111 | |
2112 | -int __attribute__((__overloadable__)) any(long3 a) |
2113 | +int _cl_overloadable any(long3 a) |
2114 | { |
2115 | return any(a.s01) || any(a.s2); |
2116 | } |
2117 | |
2118 | -int __attribute__((__overloadable__)) any(long4 a) |
2119 | -{ |
2120 | - return any(a.lo) || any(a.hi); |
2121 | -} |
2122 | - |
2123 | -int __attribute__((__overloadable__)) any(long8 a) |
2124 | -{ |
2125 | - return any(a.lo) || any(a.hi); |
2126 | -} |
2127 | - |
2128 | -int __attribute__((__overloadable__)) any(long16 a) |
2129 | +int _cl_overloadable any(long4 a) |
2130 | +{ |
2131 | + return any(a.lo) || any(a.hi); |
2132 | +} |
2133 | + |
2134 | +int _cl_overloadable any(long8 a) |
2135 | +{ |
2136 | + return any(a.lo) || any(a.hi); |
2137 | +} |
2138 | + |
2139 | +int _cl_overloadable any(long16 a) |
2140 | { |
2141 | return any(a.lo) || any(a.hi); |
2142 | } |
2143 | |
2144 | === added file 'lib/kernel/arm/vload_half.cl' |
2145 | --- lib/kernel/arm/vload_half.cl 1970-01-01 00:00:00 +0000 |
2146 | +++ lib/kernel/arm/vload_half.cl 2011-12-18 04:51:24 +0000 |
2147 | @@ -0,0 +1,109 @@ |
2148 | +/* OpenCL built-in library: vload_half() |
2149 | + |
2150 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
2151 | + |
2152 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
2153 | + of this software and associated documentation files (the "Software"), to deal |
2154 | + in the Software without restriction, including without limitation the rights |
2155 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
2156 | + copies of the Software, and to permit persons to whom the Software is |
2157 | + furnished to do so, subject to the following conditions: |
2158 | + |
2159 | + The above copyright notice and this permission notice shall be included in |
2160 | + all copies or substantial portions of the Software. |
2161 | + |
2162 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
2163 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
2164 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
2165 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
2166 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
2167 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
2168 | + THE SOFTWARE. |
2169 | +*/ |
2170 | + |
2171 | + |
2172 | + |
2173 | +#define IMPLEMENT_VLOAD_HALF(MOD) \ |
2174 | + \ |
2175 | + float _cl_overloadable \ |
2176 | + vload_half(size_t offset, const MOD half *p) \ |
2177 | + { \ |
2178 | + return p[offset]; \ |
2179 | + } \ |
2180 | + \ |
2181 | + float2 _cl_overloadable \ |
2182 | + vload_half2(size_t offset, const MOD half *p) \ |
2183 | + { \ |
2184 | + return (float2)(vload_half(0, &p[offset*2]), \ |
2185 | + vload_half(0, &p[offset*2+1])); \ |
2186 | + } \ |
2187 | + \ |
2188 | + float3 _cl_overloadable \ |
2189 | + vload_half3(size_t offset, const MOD half *p) \ |
2190 | + { \ |
2191 | + return (float3)(vload_half2(0, &p[offset*3]), \ |
2192 | + vload_half(0, &p[offset*3+2])); \ |
2193 | + } \ |
2194 | + \ |
2195 | + float4 _cl_overloadable \ |
2196 | + vload_half4(size_t offset, const MOD half *p) \ |
2197 | + { \ |
2198 | + return (float4)(vload_half2(0, &p[offset*4]), \ |
2199 | + vload_half2(0, &p[offset*4+2])); \ |
2200 | + } \ |
2201 | + \ |
2202 | + float8 _cl_overloadable \ |
2203 | + vload_half8(size_t offset, const MOD half *p) \ |
2204 | + { \ |
2205 | + return (float8)(vload_half4(0, &p[offset*8]), \ |
2206 | + vload_half4(0, &p[offset*8+4])); \ |
2207 | + } \ |
2208 | + \ |
2209 | + float16 _cl_overloadable \ |
2210 | + vload_half16(size_t offset, const MOD half *p) \ |
2211 | + { \ |
2212 | + return (float16)(vload_half8(0, &p[offset*16]), \ |
2213 | + vload_half8(0, &p[offset*16+8])); \ |
2214 | + } \ |
2215 | + \ |
2216 | + float2 _cl_overloadable \ |
2217 | + vloada_half2(size_t offset, const MOD half *p) \ |
2218 | + { \ |
2219 | + return (float2)(vload_half(0, &p[offset*2]), \ |
2220 | + vload_half(0, &p[offset*2+1])); \ |
2221 | + } \ |
2222 | + \ |
2223 | + float3 _cl_overloadable \ |
2224 | + vloada_half3(size_t offset, const MOD half *p) \ |
2225 | + { \ |
2226 | + return (float3)(vloada_half2(0, &p[offset*4]), \ |
2227 | + vload_half(0, &p[offset*4+2])); \ |
2228 | + } \ |
2229 | + \ |
2230 | + float4 _cl_overloadable \ |
2231 | + vloada_half4(size_t offset, const MOD half *p) \ |
2232 | + { \ |
2233 | + return (float4)(vloada_half2(0, &p[offset*4]), \ |
2234 | + vloada_half2(0, &p[offset*4+2])); \ |
2235 | + } \ |
2236 | + \ |
2237 | + float8 _cl_overloadable \ |
2238 | + vloada_half8(size_t offset, const MOD half *p) \ |
2239 | + { \ |
2240 | + return (float8)(vloada_half4(0, &p[offset*8]), \ |
2241 | + vloada_half4(0, &p[offset*8+4])); \ |
2242 | + } \ |
2243 | + \ |
2244 | + float16 _cl_overloadable \ |
2245 | + vloada_half16(size_t offset, const MOD half *p) \ |
2246 | + { \ |
2247 | + return (float16)(vloada_half8(0, &p[offset*16]), \ |
2248 | + vloada_half8(0, &p[offset*16+8])); \ |
2249 | + } |
2250 | + |
2251 | + |
2252 | + |
2253 | +IMPLEMENT_VLOAD_HALF(__global) |
2254 | +IMPLEMENT_VLOAD_HALF(__local) |
2255 | +IMPLEMENT_VLOAD_HALF(__constant) |
2256 | +/* IMPLEMENT_VLOAD_HALF(__private) */ |
2257 | |
2258 | === added file 'lib/kernel/arm/vstore_half.cl' |
2259 | --- lib/kernel/arm/vstore_half.cl 1970-01-01 00:00:00 +0000 |
2260 | +++ lib/kernel/arm/vstore_half.cl 2011-12-18 04:51:24 +0000 |
2261 | @@ -0,0 +1,120 @@ |
2262 | +/* OpenCL built-in library: vstore_half() |
2263 | + |
2264 | + Copyright (c) 2011 Universidad Rey Juan Carlos |
2265 | + |
2266 | + Permission is hereby granted, free of charge, to any person obtaining a copy |
2267 | + of this software and associated documentation files (the "Software"), to deal |
2268 | + in the Software without restriction, including without limitation the rights |
2269 | + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
2270 | + copies of the Software, and to permit persons to whom the Software is |
2271 | + furnished to do so, subject to the following conditions: |
2272 | + |
2273 | + The above copyright notice and this permission notice shall be included in |
2274 | + all copies or substantial portions of the Software. |
2275 | + |
2276 | + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
2277 | + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
2278 | + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
2279 | + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
2280 | + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
2281 | + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
2282 | + THE SOFTWARE. |
2283 | +*/ |
2284 | + |
2285 | + |
2286 | + |
2287 | +#define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX) \ |
2288 | + \ |
2289 | + void _cl_overloadable \ |
2290 | + vstore_half##SUFFIX(float data, size_t offset, MOD half *p) \ |
2291 | + { \ |
2292 | + p[offset] = data; \ |
2293 | + } \ |
2294 | + \ |
2295 | + void _cl_overloadable \ |
2296 | + vstore_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \ |
2297 | + { \ |
2298 | + vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \ |
2299 | + vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \ |
2300 | + } \ |
2301 | + \ |
2302 | + void _cl_overloadable \ |
2303 | + vstore_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \ |
2304 | + { \ |
2305 | + vstore_half2##SUFFIX(data.lo, 0, &p[offset*3]); \ |
2306 | + vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \ |
2307 | + } \ |
2308 | + \ |
2309 | + void _cl_overloadable \ |
2310 | + vstore_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \ |
2311 | + { \ |
2312 | + vstore_half2##SUFFIX(data.lo, 0, &p[offset*4]); \ |
2313 | + vstore_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \ |
2314 | + } \ |
2315 | + \ |
2316 | + void _cl_overloadable \ |
2317 | + vstore_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \ |
2318 | + { \ |
2319 | + vstore_half4##SUFFIX(data.lo, 0, &p[offset*8]); \ |
2320 | + vstore_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \ |
2321 | + } \ |
2322 | + \ |
2323 | + void _cl_overloadable \ |
2324 | + vstore_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \ |
2325 | + { \ |
2326 | + vstore_half8##SUFFIX(data.lo, 0, &p[offset*16]); \ |
2327 | + vstore_half8##SUFFIX(data.hi, 0, &p[offset*16+8]); \ |
2328 | + } \ |
2329 | + \ |
2330 | + void _cl_overloadable \ |
2331 | + vstorea_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \ |
2332 | + { \ |
2333 | + vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \ |
2334 | + vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \ |
2335 | + } \ |
2336 | + \ |
2337 | + void _cl_overloadable \ |
2338 | + vstorea_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \ |
2339 | + { \ |
2340 | + vstorea_half2##SUFFIX(data.lo, 0, &p[offset*3]); \ |
2341 | + vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \ |
2342 | + } \ |
2343 | + \ |
2344 | + void _cl_overloadable \ |
2345 | + vstorea_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \ |
2346 | + { \ |
2347 | + vstorea_half2##SUFFIX(data.lo, 0, &p[offset*4]); \ |
2348 | + vstorea_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \ |
2349 | + } \ |
2350 | + \ |
2351 | + void _cl_overloadable \ |
2352 | + vstorea_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \ |
2353 | + { \ |
2354 | + vstorea_half4##SUFFIX(data.lo, 0, &p[offset*8]); \ |
2355 | + vstorea_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \ |
2356 | + } \ |
2357 | + \ |
2358 | + void _cl_overloadable \ |
2359 | + vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \ |
2360 | + { \ |
2361 | + vstorea_half8##SUFFIX(data.lo, 0, &p[offset*16]); \ |
2362 | + vstorea_half8##SUFFIX(data.hi, 0, &p[offset*16+8]); \ |
2363 | + } |
2364 | + |
2365 | + |
2366 | + |
2367 | +IMPLEMENT_VSTORE_HALF(__global , ) |
2368 | +IMPLEMENT_VSTORE_HALF(__global , _rte) |
2369 | +IMPLEMENT_VSTORE_HALF(__global , _rtz) |
2370 | +IMPLEMENT_VSTORE_HALF(__global , _rtp) |
2371 | +IMPLEMENT_VSTORE_HALF(__global , _rtn) |
2372 | +IMPLEMENT_VSTORE_HALF(__local , ) |
2373 | +IMPLEMENT_VSTORE_HALF(__local , _rte) |
2374 | +IMPLEMENT_VSTORE_HALF(__local , _rtz) |
2375 | +IMPLEMENT_VSTORE_HALF(__local , _rtp) |
2376 | +IMPLEMENT_VSTORE_HALF(__local , _rtn) |
2377 | +/* IMPLEMENT_VSTORE_HALF(__private , ) */ |
2378 | +/* IMPLEMENT_VSTORE_HALF(__private , _rte) */ |
2379 | +/* IMPLEMENT_VSTORE_HALF(__private , _rtz) */ |
2380 | +/* IMPLEMENT_VSTORE_HALF(__private , _rtp) */ |
2381 | +/* IMPLEMENT_VSTORE_HALF(__private , _rtn) */ |
2382 | |
2383 | === modified file 'lib/kernel/as_type.cl' |
2384 | --- lib/kernel/as_type.cl 2011-11-25 17:02:42 +0000 |
2385 | +++ lib/kernel/as_type.cl 2011-12-18 04:51:24 +0000 |
2386 | @@ -22,7 +22,7 @@ |
2387 | */ |
2388 | |
2389 | #define DEFINE_AS_TYPE(SRC, DST) \ |
2390 | - DST __attribute__ ((__overloadable__)) \ |
2391 | + DST _cl_overloadable \ |
2392 | as_##DST(SRC a) \ |
2393 | { \ |
2394 | return *(DST*)&a; \ |
2395 | |
2396 | === modified file 'lib/kernel/convert_type.cl' |
2397 | --- lib/kernel/convert_type.cl 2011-11-25 16:47:53 +0000 |
2398 | +++ lib/kernel/convert_type.cl 2011-12-18 04:51:24 +0000 |
2399 | @@ -24,28 +24,28 @@ |
2400 | #include "templates.h" |
2401 | |
2402 | /* SRC and DST must be scalars */ |
2403 | -#define DEFINE_CONVERT_TYPE(SRC, DST) \ |
2404 | - DST __attribute__ ((__overloadable__)) convert_##DST(SRC a) \ |
2405 | - { \ |
2406 | - return (DST)a; \ |
2407 | +#define DEFINE_CONVERT_TYPE(SRC, DST) \ |
2408 | + DST _cl_overloadable convert_##DST(SRC a) \ |
2409 | + { \ |
2410 | + return (DST)a; \ |
2411 | } |
2412 | |
2413 | /* implementing vector SRC and DST in terms of scalars */ |
2414 | #define DEFINE_CONVERT_TYPE_HALF(SRC, DST, HALFDST) \ |
2415 | - DST __attribute__ ((__overloadable__)) convert_##DST(SRC a) \ |
2416 | + DST _cl_overloadable convert_##DST(SRC a) \ |
2417 | { \ |
2418 | return (DST)(convert_##HALFDST(a.lo), convert_##HALFDST(a.hi)); \ |
2419 | } |
2420 | |
2421 | #define DEFINE_CONVERT_TYPE_012(SRC, DST, DST01, DST2) \ |
2422 | - DST __attribute__ ((__overloadable__)) convert_##DST(SRC a) \ |
2423 | + DST _cl_overloadable convert_##DST(SRC a) \ |
2424 | { \ |
2425 | return (DST)(convert_##DST01(a.s01), convert_##DST2(a.s2)); \ |
2426 | } |
2427 | |
2428 | /* SRC and DST may be vectors */ |
2429 | #define DEFINE_CONVERT_TYPE_SAT(SRC, DST, SIZE) \ |
2430 | - DST##SIZE __attribute__ ((__overloadable__)) \ |
2431 | + DST##SIZE _cl_overloadable \ |
2432 | convert_##DST##SIZE##_sat(SRC##SIZE a) \ |
2433 | { \ |
2434 | int const src_float = (SRC)0.1f > (SRC)0; \ |
2435 | |
2436 | === modified file 'lib/kernel/cross.cl' |
2437 | --- lib/kernel/cross.cl 2011-11-01 16:33:17 +0000 |
2438 | +++ lib/kernel/cross.cl 2011-12-18 04:51:24 +0000 |
2439 | @@ -21,12 +21,12 @@ |
2440 | THE SOFTWARE. |
2441 | */ |
2442 | |
2443 | -float4 __attribute__ ((__overloadable__)) cross(float4 a, float4 b) |
2444 | +float4 _cl_overloadable cross(float4 a, float4 b) |
2445 | { |
2446 | return (float4)(cross(a.xyz, b.xyz), 0.0f); |
2447 | } |
2448 | |
2449 | -float3 __attribute__ ((__overloadable__)) cross(float3 a, float3 b) |
2450 | +float3 _cl_overloadable cross(float3 a, float3 b) |
2451 | { |
2452 | return (float3)(a.y * b.z - a.z * b.y, |
2453 | a.z * b.x - a.x * b.z, |
2454 | @@ -34,12 +34,12 @@ |
2455 | } |
2456 | |
2457 | #ifdef cl_khr_fp64 |
2458 | -double4 __attribute__ ((__overloadable__)) cross(double4 a, double4 b) |
2459 | +double4 _cl_overloadable cross(double4 a, double4 b) |
2460 | { |
2461 | return (double4)(cross(a.xyz, b.xyz), 0.0f); |
2462 | } |
2463 | |
2464 | -double3 __attribute__ ((__overloadable__)) cross(double3 a, double3 b) |
2465 | +double3 _cl_overloadable cross(double3 a, double3 b) |
2466 | { |
2467 | return (double3)(a.y * b.z - a.z * b.y, |
2468 | a.z * b.x - a.x * b.z, |
2469 | |
2470 | === modified file 'lib/kernel/dot.cl' |
2471 | --- lib/kernel/dot.cl 2011-11-01 16:33:17 +0000 |
2472 | +++ lib/kernel/dot.cl 2011-12-18 04:51:24 +0000 |
2473 | @@ -21,63 +21,63 @@ |
2474 | THE SOFTWARE. |
2475 | */ |
2476 | |
2477 | -float __attribute__ ((__overloadable__)) dot(float a, float b) |
2478 | +float _cl_overloadable dot(float a, float b) |
2479 | { |
2480 | return a * b; |
2481 | } |
2482 | |
2483 | -float __attribute__ ((__overloadable__)) dot(float2 a, float2 b) |
2484 | +float _cl_overloadable dot(float2 a, float2 b) |
2485 | { |
2486 | return a.lo * b.lo + a.hi * b.hi; |
2487 | } |
2488 | |
2489 | -float __attribute__ ((__overloadable__)) dot(float3 a, float3 b) |
2490 | +float _cl_overloadable dot(float3 a, float3 b) |
2491 | { |
2492 | return dot(a.s01, b.s01) + a.s2 * b.s2; |
2493 | } |
2494 | |
2495 | -float __attribute__ ((__overloadable__)) dot(float4 a, float4 b) |
2496 | -{ |
2497 | - return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2498 | -} |
2499 | - |
2500 | -float __attribute__ ((__overloadable__)) dot(float8 a, float8 b) |
2501 | -{ |
2502 | - return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2503 | -} |
2504 | - |
2505 | -float __attribute__ ((__overloadable__)) dot(float16 a, float16 b) |
2506 | +float _cl_overloadable dot(float4 a, float4 b) |
2507 | +{ |
2508 | + return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2509 | +} |
2510 | + |
2511 | +float _cl_overloadable dot(float8 a, float8 b) |
2512 | +{ |
2513 | + return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2514 | +} |
2515 | + |
2516 | +float _cl_overloadable dot(float16 a, float16 b) |
2517 | { |
2518 | return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2519 | } |
2520 | |
2521 | #ifdef cl_khr_fp64 |
2522 | -double __attribute__ ((__overloadable__)) dot(double a, double b) |
2523 | +double _cl_overloadable dot(double a, double b) |
2524 | { |
2525 | return a * b; |
2526 | } |
2527 | |
2528 | -double __attribute__ ((__overloadable__)) dot(double2 a, double2 b) |
2529 | +double _cl_overloadable dot(double2 a, double2 b) |
2530 | { |
2531 | return a.lo * b.lo + a.hi * b.hi; |
2532 | } |
2533 | |
2534 | -double __attribute__ ((__overloadable__)) dot(double3 a, double3 b) |
2535 | +double _cl_overloadable dot(double3 a, double3 b) |
2536 | { |
2537 | return dot(a.s01, b.s01) + a.s2 * b.s2; |
2538 | } |
2539 | |
2540 | -double __attribute__ ((__overloadable__)) dot(double4 a, double4 b) |
2541 | -{ |
2542 | - return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2543 | -} |
2544 | - |
2545 | -double __attribute__ ((__overloadable__)) dot(double8 a, double8 b) |
2546 | -{ |
2547 | - return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2548 | -} |
2549 | - |
2550 | -double __attribute__ ((__overloadable__)) dot(double16 a, double16 b) |
2551 | +double _cl_overloadable dot(double4 a, double4 b) |
2552 | +{ |
2553 | + return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2554 | +} |
2555 | + |
2556 | +double _cl_overloadable dot(double8 a, double8 b) |
2557 | +{ |
2558 | + return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2559 | +} |
2560 | + |
2561 | +double _cl_overloadable dot(double16 a, double16 b) |
2562 | { |
2563 | return dot(a.lo, b.lo) + dot(a.hi, b.hi); |
2564 | } |
2565 | |
2566 | === modified file 'lib/kernel/upsample.cl' |
2567 | --- lib/kernel/upsample.cl 2011-11-24 15:56:10 +0000 |
2568 | +++ lib/kernel/upsample.cl 2011-12-18 04:51:24 +0000 |
2569 | @@ -25,7 +25,7 @@ |
2570 | // convert_* function calls |
2571 | |
2572 | #define IMPLEMENT_UPSAMPLE_LG_GUG(GTYPE, SGTYPE, UGTYPE, LGTYPE) \ |
2573 | - LGTYPE __attribute__ ((__overloadable__)) \ |
2574 | + LGTYPE _cl_overloadable \ |
2575 | upsample(GTYPE a, UGTYPE b) \ |
2576 | { \ |
2577 | int bits = CHAR_BIT * sizeof(SGTYPE); \ |
2578 | |
2579 | === modified file 'lib/kernel/vload.cl' |
2580 | --- lib/kernel/vload.cl 2011-12-14 23:01:01 +0000 |
2581 | +++ lib/kernel/vload.cl 2011-12-18 04:51:24 +0000 |
2582 | @@ -27,31 +27,31 @@ |
2583 | |
2584 | #define IMPLEMENT_VLOAD(TYPE, MOD) \ |
2585 | \ |
2586 | - TYPE##2 __attribute__ ((__overloadable__)) \ |
2587 | + TYPE##2 _cl_overloadable \ |
2588 | vload2(size_t offset, const MOD TYPE *p) \ |
2589 | { \ |
2590 | return (TYPE##2)(p[offset*2], p[offset*2+1]); \ |
2591 | } \ |
2592 | \ |
2593 | - TYPE##3 __attribute__ ((__overloadable__)) \ |
2594 | + TYPE##3 _cl_overloadable \ |
2595 | vload3(size_t offset, const MOD TYPE *p) \ |
2596 | { \ |
2597 | return (TYPE##3)(vload2(0, &p[offset*3]), p[offset*3+2]); \ |
2598 | } \ |
2599 | \ |
2600 | - TYPE##4 __attribute__ ((__overloadable__)) \ |
2601 | + TYPE##4 _cl_overloadable \ |
2602 | vload4(size_t offset, const MOD TYPE *p) \ |
2603 | { \ |
2604 | return (TYPE##4)(vload2(0, &p[offset*4]), vload2(0, &p[offset*4+2])); \ |
2605 | } \ |
2606 | \ |
2607 | - TYPE##8 __attribute__ ((__overloadable__)) \ |
2608 | + TYPE##8 _cl_overloadable \ |
2609 | vload8(size_t offset, const MOD TYPE *p) \ |
2610 | { \ |
2611 | return (TYPE##8)(vload4(0, &p[offset*8]), vload4(0, &p[offset*8+4])); \ |
2612 | } \ |
2613 | \ |
2614 | - TYPE##16 __attribute__ ((__overloadable__)) \ |
2615 | + TYPE##16 _cl_overloadable \ |
2616 | vload16(size_t offset, const MOD TYPE *p) \ |
2617 | { \ |
2618 | return (TYPE##16)(vload8(0, &p[offset*16]), vload8(0, &p[offset*16+8])); \ |
2619 | |
2620 | === modified file 'lib/kernel/vload_half.cl' |
2621 | --- lib/kernel/vload_half.cl 2011-12-14 23:01:01 +0000 |
2622 | +++ lib/kernel/vload_half.cl 2011-12-18 04:51:24 +0000 |
2623 | @@ -25,78 +25,100 @@ |
2624 | |
2625 | #ifdef cl_khr_fp16 |
2626 | |
2627 | -#define IMPLEMENT_VLOAD_HALF(MOD) \ |
2628 | - \ |
2629 | - float __attribute__ ((__overloadable__)) \ |
2630 | - vload_half(size_t offset, const MOD half *p) \ |
2631 | - { \ |
2632 | - return (float)p[offset]; \ |
2633 | - } \ |
2634 | - \ |
2635 | - float2 __attribute__ ((__overloadable__)) \ |
2636 | +/* |
2637 | + half: 1 sign bit, 5 exponent bits, 10 mantissa bits |
2638 | + float: 1 sign bit, 8 exponent bits, 23 mantissa bits |
2639 | + double: 1 sign bit, 10 exponent bits, 53 mantissa bits |
2640 | +*/ |
2641 | + |
2642 | + |
2643 | + |
2644 | +#define IMPLEMENT_VLOAD_HALF(MOD) \ |
2645 | + \ |
2646 | + float _cl_overloadable \ |
2647 | + vload_half(size_t offset, const MOD half *p) \ |
2648 | + { \ |
2649 | + /* This conversion always succeeds */ \ |
2650 | + short hval = ((const MOD short*)p)[offset]; \ |
2651 | + short hsign = (hval & (short)0x8000) >> (short)15; \ |
2652 | + short hexp = (hval & (short)0x7c00) >> (short)10; \ |
2653 | + short hmant = hval & (short)0x03ff; \ |
2654 | + bool isdenorm = hexp == (short)0; \ |
2655 | + bool isinfnan = hexp == (short)31; \ |
2656 | + hexp -= (short)15; \ |
2657 | + int fsign = (int)hsign << 31; \ |
2658 | + int fexp = (__builtin_expect(isdenorm, false) ? 0 : \ |
2659 | + __builtin_expect(isinfnan, false) ? 255 : (int)hexp + 127); \ |
2660 | + fexp <<= 23; \ |
2661 | + int fmant = (int)hmant << 13; \ |
2662 | + int fval = fsign | fexp | fmant; \ |
2663 | + return as_float(fval); \ |
2664 | + } \ |
2665 | + \ |
2666 | + float2 _cl_overloadable \ |
2667 | vload_half2(size_t offset, const MOD half *p) \ |
2668 | { \ |
2669 | return (float2)(vload_half(0, &p[offset*2]), \ |
2670 | vload_half(0, &p[offset*2+1])); \ |
2671 | } \ |
2672 | \ |
2673 | - float3 __attribute__ ((__overloadable__)) \ |
2674 | + float3 _cl_overloadable \ |
2675 | vload_half3(size_t offset, const MOD half *p) \ |
2676 | { \ |
2677 | return (float3)(vload_half2(0, &p[offset*3]), \ |
2678 | vload_half(0, &p[offset*3+2])); \ |
2679 | } \ |
2680 | \ |
2681 | - float4 __attribute__ ((__overloadable__)) \ |
2682 | + float4 _cl_overloadable \ |
2683 | vload_half4(size_t offset, const MOD half *p) \ |
2684 | { \ |
2685 | return (float4)(vload_half2(0, &p[offset*4]), \ |
2686 | vload_half2(0, &p[offset*4+2])); \ |
2687 | } \ |
2688 | \ |
2689 | - float8 __attribute__ ((__overloadable__)) \ |
2690 | + float8 _cl_overloadable \ |
2691 | vload_half8(size_t offset, const MOD half *p) \ |
2692 | { \ |
2693 | return (float8)(vload_half4(0, &p[offset*8]), \ |
2694 | vload_half4(0, &p[offset*8+4])); \ |
2695 | } \ |
2696 | \ |
2697 | - float16 __attribute__ ((__overloadable__)) \ |
2698 | + float16 _cl_overloadable \ |
2699 | vload_half16(size_t offset, const MOD half *p) \ |
2700 | { \ |
2701 | return (float16)(vload_half8(0, &p[offset*16]), \ |
2702 | vload_half8(0, &p[offset*16+8])); \ |
2703 | } \ |
2704 | \ |
2705 | - float2 __attribute__ ((__overloadable__)) \ |
2706 | + float2 _cl_overloadable \ |
2707 | vloada_half2(size_t offset, const MOD half *p) \ |
2708 | { \ |
2709 | return (float2)(vload_half(0, &p[offset*2]), \ |
2710 | vload_half(0, &p[offset*2+1])); \ |
2711 | } \ |
2712 | \ |
2713 | - float3 __attribute__ ((__overloadable__)) \ |
2714 | + float3 _cl_overloadable \ |
2715 | vloada_half3(size_t offset, const MOD half *p) \ |
2716 | { \ |
2717 | return (float3)(vloada_half2(0, &p[offset*4]), \ |
2718 | vload_half(0, &p[offset*4+2])); \ |
2719 | } \ |
2720 | \ |
2721 | - float4 __attribute__ ((__overloadable__)) \ |
2722 | + float4 _cl_overloadable \ |
2723 | vloada_half4(size_t offset, const MOD half *p) \ |
2724 | { \ |
2725 | return (float4)(vloada_half2(0, &p[offset*4]), \ |
2726 | vloada_half2(0, &p[offset*4+2])); \ |
2727 | } \ |
2728 | \ |
2729 | - float8 __attribute__ ((__overloadable__)) \ |
2730 | + float8 _cl_overloadable \ |
2731 | vloada_half8(size_t offset, const MOD half *p) \ |
2732 | { \ |
2733 | return (float8)(vloada_half4(0, &p[offset*8]), \ |
2734 | vloada_half4(0, &p[offset*8+4])); \ |
2735 | } \ |
2736 | \ |
2737 | - float16 __attribute__ ((__overloadable__)) \ |
2738 | + float16 _cl_overloadable \ |
2739 | vloada_half16(size_t offset, const MOD half *p) \ |
2740 | { \ |
2741 | return (float16)(vloada_half8(0, &p[offset*16]), \ |
2742 | |
2743 | === modified file 'lib/kernel/vstore.cl' |
2744 | --- lib/kernel/vstore.cl 2011-11-25 17:02:42 +0000 |
2745 | +++ lib/kernel/vstore.cl 2011-12-18 04:51:24 +0000 |
2746 | @@ -27,35 +27,35 @@ |
2747 | |
2748 | #define IMPLEMENT_VSTORE(TYPE, MOD) \ |
2749 | \ |
2750 | - void __attribute__ ((__overloadable__)) \ |
2751 | + void _cl_overloadable \ |
2752 | vstore2(TYPE##2 data, size_t offset, MOD TYPE *p) \ |
2753 | { \ |
2754 | p[offset*2] = data.lo; \ |
2755 | p[offset*2+1] = data.hi; \ |
2756 | } \ |
2757 | \ |
2758 | - void __attribute__ ((__overloadable__)) \ |
2759 | + void _cl_overloadable \ |
2760 | vstore3(TYPE##3 data, size_t offset, MOD TYPE *p) \ |
2761 | { \ |
2762 | vstore2(data.lo, 0, &p[offset*3]); \ |
2763 | p[offset*3+2] = data.s2; \ |
2764 | } \ |
2765 | \ |
2766 | - void __attribute__ ((__overloadable__)) \ |
2767 | + void _cl_overloadable \ |
2768 | vstore4(TYPE##4 data, size_t offset, MOD TYPE *p) \ |
2769 | { \ |
2770 | vstore2(data.lo, 0, &p[offset*4]); \ |
2771 | vstore2(data.hi, 0, &p[offset*4+2]); \ |
2772 | } \ |
2773 | \ |
2774 | - void __attribute__ ((__overloadable__)) \ |
2775 | + void _cl_overloadable \ |
2776 | vstore8(TYPE##8 data, size_t offset, MOD TYPE *p) \ |
2777 | { \ |
2778 | vstore4(data.lo, 0, &p[offset*8]); \ |
2779 | vstore4(data.hi, 0, &p[offset*8+4]); \ |
2780 | } \ |
2781 | \ |
2782 | - void __attribute__ ((__overloadable__)) \ |
2783 | + void _cl_overloadable \ |
2784 | vstore16(TYPE##16 data, size_t offset, MOD TYPE *p) \ |
2785 | { \ |
2786 | vstore8(data.lo, 0, &p[offset*16]); \ |
2787 | |
2788 | === modified file 'lib/kernel/vstore_half.cl' |
2789 | --- lib/kernel/vstore_half.cl 2011-12-14 23:01:01 +0000 |
2790 | +++ lib/kernel/vstore_half.cl 2011-12-18 04:51:24 +0000 |
2791 | @@ -25,78 +25,114 @@ |
2792 | |
2793 | #ifdef cl_khr_fp16 |
2794 | |
2795 | +/* |
2796 | + half: 1 sign bit, 5 exponent bits, 10 mantissa bits |
2797 | + float: 1 sign bit, 7 exponent bits, 23 mantissa bits |
2798 | + double: 1 sign bit, 10 exponent bits, 53 mantissa bits |
2799 | +*/ |
2800 | + |
2801 | +#define HALF_MAXPLUS 0x1.ffdp15f /* "one more" than HALF_MAX */ |
2802 | +#define HALF_MIN 0x1.0p-14f |
2803 | +#define HALF_ZERO ((short)0x0000) /* zero */ |
2804 | +#define HALF_INF ((short)0x4000) /* infinity */ |
2805 | +#define HALF_SIGN ((short)0x8000) /* sign bit */ |
2806 | + |
2807 | + |
2808 | + |
2809 | #define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX) \ |
2810 | \ |
2811 | - void __attribute__ ((__overloadable__)) \ |
2812 | + void _cl_overloadable \ |
2813 | vstore_half##SUFFIX(float data, size_t offset, MOD half *p) \ |
2814 | { \ |
2815 | - p[offset] = data; \ |
2816 | + /* IDEA: modify data (e.g. add "1/2") to round correctly */ \ |
2817 | + int fval = as_int(data); \ |
2818 | + int fsign = (fval & 0x80000000) >> 31; \ |
2819 | + int fexp = (fval & 0x7f800000) >> 23; \ |
2820 | + int fmant = fval & 0x007fffff; \ |
2821 | + bool isdenorm = fexp == 0; \ |
2822 | + bool isinfnan = fexp == 255; \ |
2823 | + fexp -= 127; \ |
2824 | + short hsign = (short)fsign << (short)15; \ |
2825 | + short hexp = (__builtin_expect(isdenorm, false) ? (short)0 : \ |
2826 | + __builtin_expect(isinfnan, false) ? (short)31 : \ |
2827 | + (short)fexp + (short)15); \ |
2828 | + /* TODO: this always truncates */ \ |
2829 | + short hmant = (short)(fmant >> 13); \ |
2830 | + short hval; \ |
2831 | + if (__builtin_expect(fabs(data) >= HALF_MAXPLUS, false)) { \ |
2832 | + hval = data > 0.0f ? HALF_INF : HALF_INF | HALF_SIGN; \ |
2833 | + } else if (__builtin_expect(fabs(data) < HALF_MIN, false)) { \ |
2834 | + hval = signbit(data)==0 ? HALF_ZERO : HALF_ZERO | HALF_SIGN; \ |
2835 | + } else { \ |
2836 | + hval = hsign | hexp | hmant; \ |
2837 | + } \ |
2838 | + ((MOD short*)p)[offset] = hval; \ |
2839 | } \ |
2840 | \ |
2841 | - void __attribute__ ((__overloadable__)) \ |
2842 | + void _cl_overloadable \ |
2843 | vstore_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \ |
2844 | { \ |
2845 | vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \ |
2846 | vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \ |
2847 | } \ |
2848 | \ |
2849 | - void __attribute__ ((__overloadable__)) \ |
2850 | + void _cl_overloadable \ |
2851 | vstore_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \ |
2852 | { \ |
2853 | vstore_half2##SUFFIX(data.lo, 0, &p[offset*3]); \ |
2854 | vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \ |
2855 | } \ |
2856 | \ |
2857 | - void __attribute__ ((__overloadable__)) \ |
2858 | + void _cl_overloadable \ |
2859 | vstore_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \ |
2860 | { \ |
2861 | vstore_half2##SUFFIX(data.lo, 0, &p[offset*4]); \ |
2862 | vstore_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \ |
2863 | } \ |
2864 | \ |
2865 | - void __attribute__ ((__overloadable__)) \ |
2866 | + void _cl_overloadable \ |
2867 | vstore_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \ |
2868 | { \ |
2869 | vstore_half4##SUFFIX(data.lo, 0, &p[offset*8]); \ |
2870 | vstore_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \ |
2871 | } \ |
2872 | \ |
2873 | - void __attribute__ ((__overloadable__)) \ |
2874 | + void _cl_overloadable \ |
2875 | vstore_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \ |
2876 | { \ |
2877 | vstore_half8##SUFFIX(data.lo, 0, &p[offset*16]); \ |
2878 | vstore_half8##SUFFIX(data.hi, 0, &p[offset*16+8]); \ |
2879 | } \ |
2880 | \ |
2881 | - void __attribute__ ((__overloadable__)) \ |
2882 | + void _cl_overloadable \ |
2883 | vstorea_half2##SUFFIX(float2 data, size_t offset, MOD half *p) \ |
2884 | { \ |
2885 | vstore_half##SUFFIX(data.lo, 0, &p[offset*2]); \ |
2886 | vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]); \ |
2887 | } \ |
2888 | \ |
2889 | - void __attribute__ ((__overloadable__)) \ |
2890 | + void _cl_overloadable \ |
2891 | vstorea_half3##SUFFIX(float3 data, size_t offset, MOD half *p) \ |
2892 | { \ |
2893 | vstorea_half2##SUFFIX(data.lo, 0, &p[offset*3]); \ |
2894 | vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]); \ |
2895 | } \ |
2896 | \ |
2897 | - void __attribute__ ((__overloadable__)) \ |
2898 | + void _cl_overloadable \ |
2899 | vstorea_half4##SUFFIX(float4 data, size_t offset, MOD half *p) \ |
2900 | { \ |
2901 | vstorea_half2##SUFFIX(data.lo, 0, &p[offset*4]); \ |
2902 | vstorea_half2##SUFFIX(data.hi, 0, &p[offset*4+2]); \ |
2903 | } \ |
2904 | \ |
2905 | - void __attribute__ ((__overloadable__)) \ |
2906 | + void _cl_overloadable \ |
2907 | vstorea_half8##SUFFIX(float8 data, size_t offset, MOD half *p) \ |
2908 | { \ |
2909 | vstorea_half4##SUFFIX(data.lo, 0, &p[offset*8]); \ |
2910 | vstorea_half4##SUFFIX(data.hi, 0, &p[offset*8+4]); \ |
2911 | } \ |
2912 | \ |
2913 | - void __attribute__ ((__overloadable__)) \ |
2914 | + void _cl_overloadable \ |
2915 | vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p) \ |
2916 | { \ |
2917 | vstorea_half8##SUFFIX(data.lo, 0, &p[offset*16]); \ |
2918 | @@ -105,20 +141,20 @@ |
2919 | |
2920 | |
2921 | |
2922 | -IMPLEMENT_VSTORE_HALF(__global , ) |
2923 | -IMPLEMENT_VSTORE_HALF(__global , _rte) |
2924 | -IMPLEMENT_VSTORE_HALF(__global , _rtz) |
2925 | -IMPLEMENT_VSTORE_HALF(__global , _rtp) |
2926 | -IMPLEMENT_VSTORE_HALF(__global , _rtn) |
2927 | -IMPLEMENT_VSTORE_HALF(__local , ) |
2928 | -IMPLEMENT_VSTORE_HALF(__local , _rte) |
2929 | -IMPLEMENT_VSTORE_HALF(__local , _rtz) |
2930 | -IMPLEMENT_VSTORE_HALF(__local , _rtp) |
2931 | -IMPLEMENT_VSTORE_HALF(__local , _rtn) |
2932 | -/* IMPLEMENT_VSTORE_HALF(__private , ) */ |
2933 | -/* IMPLEMENT_VSTORE_HALF(__private , _rte) */ |
2934 | -/* IMPLEMENT_VSTORE_HALF(__private , _rtz) */ |
2935 | -/* IMPLEMENT_VSTORE_HALF(__private , _rtp) */ |
2936 | -/* IMPLEMENT_VSTORE_HALF(__private , _rtn) */ |
2937 | +IMPLEMENT_VSTORE_HALF(__global , ) |
2938 | +IMPLEMENT_VSTORE_HALF(__global , _rte) |
2939 | +IMPLEMENT_VSTORE_HALF(__global , _rtz) |
2940 | +IMPLEMENT_VSTORE_HALF(__global , _rtp) |
2941 | +IMPLEMENT_VSTORE_HALF(__global , _rtn) |
2942 | +IMPLEMENT_VSTORE_HALF(__local , ) |
2943 | +IMPLEMENT_VSTORE_HALF(__local , _rte) |
2944 | +IMPLEMENT_VSTORE_HALF(__local , _rtz) |
2945 | +IMPLEMENT_VSTORE_HALF(__local , _rtp) |
2946 | +IMPLEMENT_VSTORE_HALF(__local , _rtn) |
2947 | +/* IMPLEMENT_VSTORE_HALF(__private, ) */ |
2948 | +/* IMPLEMENT_VSTORE_HALF(__private, _rte) */ |
2949 | +/* IMPLEMENT_VSTORE_HALF(__private, _rtz) */ |
2950 | +/* IMPLEMENT_VSTORE_HALF(__private, _rtp) */ |
2951 | +/* IMPLEMENT_VSTORE_HALF(__private, _rtn) */ |
2952 | |
2953 | #endif |