OpenCL is the Khronos Group's counterpart to DirectCompute: an API for general-purpose programming on a GPU (GPGPU).
The code and explanations provided are from my own personal experience with OpenCL 1.2. I am not a Khronos Group representative, nor do I work for them.
Set-up
cl_kernel kernel;
cl_program program;
cl_context context;
cl_mem inputBuffer;
cl_mem outputBuffer;
cl_command_queue commandQueue;
cl_uint numPlatforms;
cl_platform_id platformID;
cl_int status;
status =
clGetPlatformIDs(0, NULL, &numPlatforms);
if (status !=
CL_SUCCESS) {
return false;
}
if (numPlatforms > 0) {
cl_platform_id *platforms =
new cl_platform_id[numPlatforms];
status =
clGetPlatformIDs(numPlatforms, platforms, NULL);
platformID = platforms[0];
delete [] platforms;
}
cl_uint numDevices;
cl_device_id *devices;
status =
clGetDeviceIDs(platformID,
CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (numdevices > 0) {
devices =
new cl_device_id[numDevices];
status =
clGetDeviceIDs(platformID,
CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
}
context =
clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
commandQueue =
clCreateCommandQueue(context, devices[0], 0, NULL);
Writing The Shader (myshader.cs)
// Each work-item processes one float4 element, selected by its global ID in
// dimension 0. Every component of the input element is multiplied by that
// element's x component and stored in the matching output element.
__kernel void main(__global float4 *in, __global float4 *out) {
    const int gid = get_global_id(0);

    out[gid].x = in[gid].x * in[gid].x;
    out[gid].y = in[gid].y * in[gid].x;
    out[gid].z = in[gid].z * in[gid].x;
    out[gid].w = in[gid].w * in[gid].x;
}
Compiling The Shader
ifstream shaderFile;
char *source;
size_t sourceSize;
shaderFile.
open(
"mycshader.cs", ios::binary | ios::ate);
if (!shaderFile.
is_open()) {
return false;
}
sourceSize = shaderFile.
tellg();
shaderFile.
seekg(0, ios::beg);
source =
new char[sourceSize+1]
shaderFile.
read(source, sourceSize);
source[sourceSize] = 0;
shaderFile.
close();
const char *csource = source;
program =
clCreateProgramWithSource(context, 1, &csource, &sourceSize, NULL);
status =
clBuildProgram(program, 1, devices, NULL, NULL, NULL);
if (status !=
CL_SUCCESS) {
return false;
}
Allocating the Buffers
cl_float16 data;
inputBuffer =
clCreateBuffer(context,
CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR,
sizeof(
cl_float16), &data, NULL);
outputBuffer =
clCreateBuffer(context,
CL_MEM_WRITE_ONLY,
sizeof(
cl_float16), NULL, NULL);
Creating the Kernel
Queuing Reads/Writes/Executes
// NOTE(review): `kernel` must already have been created from `program` and
// had its two buffer arguments bound (clCreateKernel / clSetKernelArg) before
// it is enqueued below. That code is not shown in this document, so confirm
// it exists.

// Sixteen floats, which the kernel views as four float4 elements.
cl_float16 newdata = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f };

// Blocking write (CL_TRUE): the call returns once `newdata` has been copied.
// The source pointer is the address of `newdata`, and the wait list is empty
// (count 0, no events).
status = clEnqueueWriteBuffer(commandQueue, inputBuffer, CL_TRUE, 0,
                              sizeof(cl_float16), &newdata, 0, NULL, NULL);
if (status != CL_SUCCESS) {
    cout << "Error writing to buffer" << endl;
}

// The kernel is launched with work_dim = 1, so only the first entry of each
// array is read: four work-items in total, one per work-group.
const size_t ipt[] = { 4, 4, 4, 4 };
const size_t localipt[] = { 1, 1, 1, 1 };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, ipt, localipt, 0, NULL, NULL);
if (status != CL_SUCCESS) {
    cout << "Error queuing the kernel" << endl;
}

// Blocking read of the results back into the host-side `data` value.
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0,
                             sizeof(cl_float16), &data, 0, NULL, NULL);
if (status == CL_SUCCESS) {
    // `data` now holds the kernel output and can be consumed here.
}
Cleaning up