--- /dev/null
+; ModuleID = 'void.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; mad(float2, float2, float2): lane-wise widening of the scalar mad.
+; Extracts lanes 0 and 1 of a/b/c, calls the scalar @_Z3madfff per lane,
+; and reassembles the two results into a <2 x float> starting from undef.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; mad(float3, float3, float3): same lane-wise pattern as the float2 overload,
+; extended to three lanes. Each lane i of a/b/c feeds one scalar @_Z3madfff
+; call; results are inserted into a <3 x float> built up from undef.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; mad(float4, float4, float4): four-lane variant of the same pattern.
+; One scalar @_Z3madfff call per lane; results inserted at indices 0..3
+; into a <4 x float> accumulated from undef.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Empty kernel entry point; registered below in !opencl.kernels metadata.
+; noinline keeps it as a distinct symbol even though the body is a bare ret.
+define ptx_kernel void @hop() nounwind readnone noinline {
+entry:
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void ()* @hop}
cl_command_queue_bind_surface(cl_command_queue queue,
cl_kernel k,
char *curbe,
- cl_buffer **local,
- cl_buffer **priv,
- cl_buffer **scratch,
+ cl_buffer *local,
+ cl_buffer *priv,
+ cl_buffer *scratch,
uint32_t local_sz)
{
cl_context ctx = queue->ctx;
- cl_gpgpu *gpgpu = queue->gpgpu;
- cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx);
- cl_buffer *sync_bo = NULL;
+ cl_gpgpu gpgpu = queue->gpgpu;
+ cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
+ cl_buffer sync_bo = NULL;
cl_int err = CL_SUCCESS;
#if 0
cl_context ctx = queue->ctx;
}
#if USE_FULSIM
-extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr*);
-extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr*, FILE*);
-extern void aub_exec_dump_raw_file(cl_buffer*, size_t offset, size_t sz);
+extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
+extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
+extern void aub_exec_dump_raw_file(cl_buffer, size_t offset, size_t sz);
static void
cl_run_fulsim(void)
cl_int err = CL_SUCCESS;
#if USE_FULSIM
- cl_buffer_mgr *bufmgr = NULL;
+ cl_buffer_mgr bufmgr = NULL;
FILE *file = fopen("dump.aub", "wb");
FATAL_IF (file == NULL, "Unable to open file dump.aub");
bufmgr = cl_context_get_bufmgr(queue->ctx);
/* Basically, this is a (kind-of) batch buffer */
struct _cl_command_queue {
- uint64_t magic; /* To identify it as a command queue */
- volatile int ref_n; /* We reference count this object */
- cl_context ctx; /* Its parent context */
- cl_command_queue prev, next; /* We chain the command queues together */
- cl_gpgpu *gpgpu; /* Setup all GEN commands */
- cl_mem perf; /* Where to put the perf counters */
- cl_mem fulsim_out; /* Fulsim will output this buffer */
- cl_buffer *last_batch; /* To synchronize using clFinish */
+ uint64_t magic; /* To identify it as a command queue */
+ volatile int ref_n; /* We reference count this object */
+ cl_context ctx; /* Its parent context */
+ cl_command_queue prev, next; /* We chain the command queues together */
+ cl_gpgpu gpgpu; /* Setup all GEN commands */
+ cl_mem perf; /* Where to put the perf counters */
+ cl_mem fulsim_out; /* Fulsim will output this buffer */
+ cl_buffer last_batch; /* To synchronize using clFinish */
};
/* Allocate and initialize a new command queue. Also insert it in the list of
extern cl_int cl_command_queue_bind_surface(cl_command_queue queue,
cl_kernel k,
char *curbe,
- cl_buffer **local,
- cl_buffer **priv,
- cl_buffer **scratch,
+ cl_buffer *local,
+ cl_buffer *priv,
+ cl_buffer *scratch,
uint32_t local_sz);
#endif /* __CL_COMMAND_QUEUE_H__ */
const size_t *local_wk_sz)
{
cl_context ctx = queue->ctx;
- cl_gpgpu *gpgpu = queue->gpgpu;
+ cl_gpgpu gpgpu = queue->gpgpu;
char *curbe = NULL; /* Does not include per-thread local IDs */
char *final_curbe = NULL; /* Includes them */
- cl_buffer *private_bo = NULL, *scratch_bo = NULL;
- cl_gpgpu_kernel_t kernel;
+ cl_buffer private_bo = NULL, scratch_bo = NULL;
+ cl_gpgpu_kernel kernel;
const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
size_t i, batch_sz = 0u, local_sz = 0u, thread_n = 0u, id_offset = 0u, cst_sz = 0u;
cl_int err = CL_SUCCESS;
goto exit;
}
-struct cl_buffer_mgr*
+cl_buffer_mgr
cl_context_get_bufmgr(cl_context ctx)
{
return cl_driver_get_bufmgr(ctx->drv);
struct _cl_context {
uint64_t magic; /* To identify it as a context */
volatile int ref_n; /* We reference count this object */
- cl_driver *drv; /* Handles HW or simulator */
+ cl_driver drv; /* Handles HW or simulator */
cl_device_id device; /* All information about the GPU device */
cl_command_queue queues; /* All command queues currently allocated */
cl_program programs; /* All programs currently allocated */
const size_t*);
/* Used for allocation */
-extern cl_buffer_mgr* cl_context_get_bufmgr(cl_context ctx);
+extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
#endif /* __CL_CONTEXT_H__ */
**************************************************************************/
/* Encapsulates command buffer / data buffer / kernels */
-typedef struct cl_buffer cl_buffer;
+typedef struct _cl_buffer *cl_buffer;
/* Encapsulates buffer manager */
-typedef struct cl_buffer_mgr cl_buffer_mgr;
+typedef struct _cl_buffer_mgr *cl_buffer_mgr;
/* Encapsulates the driver backend functionalities */
-typedef struct cl_driver cl_driver;
+typedef struct _cl_driver *cl_driver;
/* Encapsulates the gpgpu stream of commands */
-typedef struct cl_gpgpu cl_gpgpu;
+typedef struct _cl_gpgpu *cl_gpgpu;
/**************************************************************************
* Driver
**************************************************************************/
/* Create a new driver */
-typedef cl_driver* (cl_driver_new_cb)(void);
+typedef cl_driver (cl_driver_new_cb)(void);
extern cl_driver_new_cb *cl_driver_new;
/* Delete the driver */
-typedef void (cl_driver_delete_cb)(cl_driver*);
+typedef void (cl_driver_delete_cb)(cl_driver);
extern cl_driver_delete_cb *cl_driver_delete;
/* Get the buffer manager from the driver */
-typedef cl_buffer_mgr* (cl_driver_get_bufmgr_cb)(cl_driver*);
+typedef cl_buffer_mgr (cl_driver_get_bufmgr_cb)(cl_driver);
extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr;
/* Get the Gen version from the driver */
-typedef uint32_t (cl_driver_get_ver_cb)(cl_driver*);
+typedef uint32_t (cl_driver_get_ver_cb)(cl_driver);
extern cl_driver_get_ver_cb *cl_driver_get_ver;
/**************************************************************************
GPGPU_NO_TILE = 0,
GPGPU_TILE_X = 1,
GPGPU_TILE_Y = 2,
-} cl_gpgpu_tiling_t;
+} cl_gpgpu_tiling;
/* Cache control options */
-enum gen6_cache_control {
+typedef enum cl_cache_control {
cc_gtt = 0x0,
cc_l3 = 0x1,
cc_llc = 0x2,
cc_llc_l3 = 0x3
-};
-
+} cl_cache_control;
/* Use this structure to bind kernels in the gpgpu state */
typedef struct cl_gpgpu_kernel {
const char *name; /* kernel name and bo name */
uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
uint32_t cst_sz; /* total size of all constants */
- cl_buffer *bo; /* kernel code in the proper addr space */
+ cl_buffer bo; /* kernel code in the proper addr space */
int32_t barrierID; /* barrierID for _this_ kernel */
uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */
uint32_t thread_n:15; /* For gen7 (automatic barrier management) */
uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */
-} cl_gpgpu_kernel_t;
+} cl_gpgpu_kernel;
/* Create a new gpgpu state */
-typedef cl_gpgpu* (cl_gpgpu_new_cb)(cl_driver*);
+typedef cl_gpgpu (cl_gpgpu_new_cb)(cl_driver);
extern cl_gpgpu_new_cb *cl_gpgpu_new;
/* Delete the gpgpu state */
-typedef void (cl_gpgpu_delete_cb)(cl_gpgpu*);
+typedef void (cl_gpgpu_delete_cb)(cl_gpgpu);
extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
/* Bind a regular unformatted buffer */
-typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu*, int32_t index, cl_buffer *, uint32_t cchint);
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, int32_t index, cl_buffer, uint32_t cchint);
extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
/* Set a 2d texture */
typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu state,
int32_t index,
- struct cl_buffer* obj_bo,
+ cl_buffer obj_bo,
uint32_t format,
int32_t w,
int32_t h,
int pitch,
- cl_gpgpu_tiling_t tiling);
+ cl_gpgpu_tiling tiling);
extern cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D;
/* Configure internal state */
-typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu*, uint32_t max_threads, uint32_t size_cs_entry);
+typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
/* Set the buffer object where to report performance counters */
-typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu*, struct cl_buffer *perf);
+typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
/* Fills current constant buffer with data */
-typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu*, const void* data, uint32_t size);
+typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
/* Setup all indirect states */
-typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu*, cl_gpgpu_kernel_t* kernel, uint32_t ker_n);
+typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel* kernel, uint32_t ker_n);
extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup;
/* Make HW threads use barrierID */
-typedef void (cl_gpgpu_update_barrier_cb)(cl_gpgpu*, uint32_t barrierID, uint32_t thread_n);
+typedef void (cl_gpgpu_update_barrier_cb)(cl_gpgpu, uint32_t barrierID, uint32_t thread_n);
extern cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier;
/* Upload the constant samplers as specified inside the OCL kernel */
extern cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers;
/* Set a sampler */
-typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu*, uint32_t index, uint32_t non_normalized);
+typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu, uint32_t index, uint32_t non_normalized);
extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler;
/* Allocate the batch buffer and return the BO used for the batch buffer */
-typedef void (cl_gpgpu_batch_reset_cb)(cl_gpgpu*, size_t sz);
+typedef void (cl_gpgpu_batch_reset_cb)(cl_gpgpu, size_t sz);
extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset;
/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */
-typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu*);
+typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu);
extern cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start;
/* atomic end with possibly inserted flush */
-typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu*, int32_t flush_mode);
+typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu, int32_t flush_mode);
extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end;
/* Flush the command buffer */
-typedef void (cl_gpgpu_flush_cb)(cl_gpgpu *state);
+typedef void (cl_gpgpu_flush_cb)(cl_gpgpu);
extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
/* Will spawn all threads */
-typedef void (cl_gpgpu_walker_cb)(cl_gpgpu *state,
+typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
* Buffer
**************************************************************************/
/* Allocate a buffer */
-typedef cl_buffer* (cl_buffer_alloc_cb)(cl_buffer_mgr*, const char*, unsigned long, unsigned long);
+typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, unsigned long, unsigned long);
extern cl_buffer_alloc_cb *cl_buffer_alloc;
/* Unref a buffer and destroy it if no more ref */
-typedef void (cl_buffer_unreference_cb)(cl_buffer*);
+typedef void (cl_buffer_unreference_cb)(cl_buffer);
extern cl_buffer_unreference_cb *cl_buffer_unreference;
/* Add one more ref on a buffer */
-typedef void (cl_buffer_reference_cb)(cl_buffer*);
+typedef void (cl_buffer_reference_cb)(cl_buffer);
extern cl_buffer_reference_cb *cl_buffer_reference;
/* Map a buffer */
-typedef int (cl_buffer_map_cb)(cl_buffer*, uint32_t write_enable);
+typedef int (cl_buffer_map_cb)(cl_buffer, uint32_t write_enable);
extern cl_buffer_map_cb *cl_buffer_map;
/* Unmap a buffer */
-typedef int (cl_buffer_unmap_cb)(cl_buffer*);
+typedef int (cl_buffer_unmap_cb)(cl_buffer);
extern cl_buffer_unmap_cb *cl_buffer_unmap;
/* Get the virtual address (when mapped) */
-typedef void* (cl_buffer_get_virtual_cb)(cl_buffer*);
+typedef void* (cl_buffer_get_virtual_cb)(cl_buffer);
extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual;
/* Pin a buffer */
-typedef int (cl_buffer_pin_cb)(cl_buffer*, uint32_t alignment);
+typedef int (cl_buffer_pin_cb)(cl_buffer, uint32_t alignment);
extern cl_buffer_pin_cb *cl_buffer_pin;
/* Unpin a buffer */
-typedef int (cl_buffer_unpin_cb)(cl_buffer*);
+typedef int (cl_buffer_unpin_cb)(cl_buffer);
extern cl_buffer_unpin_cb *cl_buffer_unpin;
/* Fill data in the buffer */
-typedef int (cl_buffer_subdata_cb)(cl_buffer*, unsigned long, unsigned long, const void*);
+typedef int (cl_buffer_subdata_cb)(cl_buffer, unsigned long, unsigned long, const void*);
extern cl_buffer_subdata_cb *cl_buffer_subdata;
/* Emit relocation */
-typedef int (cl_buffer_emit_reloc_cb) (cl_buffer *, uint32_t, cl_buffer*, uint32_t, uint32_t, uint32_t);
+typedef int (cl_buffer_emit_reloc_cb) (cl_buffer, uint32_t, cl_buffer, uint32_t, uint32_t, uint32_t);
extern cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc;
/* Wait for all pending rendering for this buffer to complete */
-typedef int (cl_buffer_wait_rendering_cb) (cl_buffer *);
+typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
#endif /* __CL_BUFFER_H__ */
cl_kernel_setup(cl_kernel k, const struct GenKernel *gen_kernel)
{
cl_context ctx = k->program->ctx;
- cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx);
+ cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
/* Allocate the gen code here */
const uint32_t code_sz = GenKernelGetCodeSize(gen_kernel);
struct _cl_kernel {
uint64_t magic; /* To identify it as a kernel */
volatile int ref_n; /* We reference count this object */
- struct cl_buffer *bo; /* The code itself */
- struct cl_buffer *const_bo; /* Buffer for all __constants values in the OCL program */
+ cl_buffer bo; /* The code itself */
+ cl_buffer const_bo; /* Buffer for all __constants values in the OCL program */
cl_program program; /* Owns this structure (and pointers) */
const struct GenKernel *gen_kernel; /* (Opaque) compiler structure for the OCL kernel */
- uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */
+ uint8_t ref_its_program; /* True only for the user kernel (created by clCreateKernel) */
};
/* Allocate an empty kernel */
cl_int is_tiled,
cl_int *errcode)
{
- cl_buffer_mgr *bufmgr = NULL;
+ cl_buffer_mgr bufmgr = NULL;
cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
size_t alignment = 64;
struct _cl_mem {
uint64_t magic; /* To identify it as a memory object */
volatile int ref_n; /* This object is reference counted */
- cl_buffer *bo; /* Data in GPU memory */
+ cl_buffer bo; /* Data in GPU memory */
cl_mem prev, next; /* We chain the memory buffers together */
cl_context ctx; /* Context it belongs to */
cl_mem_flags flags; /* Flags specified at the creation time */
#define MAX_IF_DESC 32
-/* Device abstraction */
-struct intel_driver;
-
/* Handle GPGPU state (actually "media" state) */
struct intel_gpgpu
{
intel_driver_t *drv;
intel_batchbuffer_t *batch;
- cl_gpgpu_kernel_t *ker;
+ cl_gpgpu_kernel *ker;
struct {
dri_bo *bo;
int32_t w,
int32_t h,
int32_t pitch,
- cl_gpgpu_tiling_t tiling)
+ cl_gpgpu_tiling tiling)
{
assert(index < MAX_SURFACES);
if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
static void
intel_gpgpu_build_idrt(intel_gpgpu_t *state,
- cl_gpgpu_kernel_t *kernel,
+ cl_gpgpu_kernel *kernel,
uint32_t ker_n)
{
gen6_interface_descriptor_t *desc;
intel_gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n)
{
if (n) {
- /*sizeof(gen6_sampler_state_t) == sizeof(gen7_surface_state_t) */
const size_t sz = n * sizeof(gen6_sampler_state_t);
memcpy(state->sampler_state_b.bo->virtual, data, sz);
}
}
static void
-intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel_t *kernel, uint32_t ker_n)
+intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel, uint32_t ker_n)
{
state->ker = kernel;
intel_gpgpu_build_idrt(state, kernel, ker_n);