OpenCL 1.2

Stephen Marz (23 May 2013)

Introduction

OpenCL is Khronos Group's version of DirectCompute to allow General Purpose programming on a GPU (GPGPU).

The code and explanations provided are from my own personal experience with OpenCL 1.2. I am not a Khronos Group representative nor do I work for them.

Getting Started

Programming

Set-up
// The following data types are the minimum for a small shader
cl_kernel kernel;
cl_program program;
cl_context context;
cl_mem inputBuffer;
cl_mem outputBuffer;
cl_command_queue commandQueue;

// OpenCL needs to figure out the Platform it will run on (it can run on the CPU as well)
cl_uint numPlatforms;
cl_platform_id platformID;
cl_int status;

status = clGetPlatformIDs(0, NULL, &numPlatforms); // First, figure how many platforms are available
if (status != CL_SUCCESS) {
   return false; // Could not get the number of platforms
}
//Now that we have the number of platforms, we can enumerate them individual and choose the one we want
if (numPlatforms > 0) {
   // Even though getting the number of platforms may have succeeded, there may not be any that support OpenCL
   cl_platform_id *platforms = new cl_platform_id[numPlatforms];
   status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Notice the arguments now specify how many platforms to enumerate
   platformID = platforms[0]; // This just chooses the first one. You can do some diagnostics on each platform to specifically target one
   delete [] platforms;
}
//Now platformID contains the platform we chose to use, now we basically have to do the exact same thing with the devices
cl_uint numDevices;
cl_device_id *devices;
status = clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); // Get the number of devices that support platform specified and uses the GPU
if (numdevices > 0) {
   devices = new cl_device_id[numDevices];
   status = clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); // Figure out the actual devices that support the platform
}
// Now we have the platform and the device to use.
context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
//A Command queue is a queue that holds what is reading or writing from the shader. This will be used for reading/writing/and executing to the shader
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
Writing The Shader (myshader.cs)
// The CLSL (CL Shader Langauge) is somewhat like C
// We named our kernel "main" and the function must be preceeded by the keyword __kernel
/* Notice that the shader uses the data type float4 and our data is a 4x4 matrix. This means that it will spawn 4 threads to cover all of the data. The get_global_id(0) function below is how you determine which thread is executing. Also, the __global keyword is required for all data that will be transferred between the shader and the host. The other keyword for just shader data would be __local. */
__kernel void main(__global float4 *in, __global float4 *out) {
   const int num = get_global_id(0);
   out[num].x = in[num].x*in[num].x;
   out[num].y = in[num].y*in[num].x;
   out[num].z = in[num].z*in[num].x;
   out[num].w = in[num].w*in[num].x;
}
// All this kernel does is square each element of the matrix
Compiling The Shader
// I am using C++'s iostream here to get the source of the shader
ifstream shaderFile;
char *source;
size_t sourceSize;

shaderFile.open("mycshader.cs", ios::binary | ios::ate);
if (!shaderFile.is_open()) {
   return false; // Can't open the file for some reason
}

sourceSize = shaderFile.tellg();
shaderFile.seekg(0, ios::beg);
source = new char[sourceSize+1]
shaderFile.read(source, sourceSize);
source[sourceSize] = 0;
shaderFile.close();

const char *csource = source; // OpenCL's clCreateProgramWithSource requires the source pointer to be constant
program = clCreateProgramWithSource(context, 1, &csource, &sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
if (status != CL_SUCCESS) {
   return false; // The program couldn't be built. This usually means a syntax error inside of the actual shader file
}
Allocating the Buffers
// The input buffer requires some initial data, so we'll just create an initial 4x4 matrix
cl_float16 data;
// The following function will create a read only input buffer by copying the host's memory
inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float16), &data, NULL);
outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float16), NULL, NULL);
Creating the Kernel
// A kernel in OpenCL is the "atomic" level of an operating program
kernel = clCreateKernel(program, "main", NULL); // We specify the actual main function of the OpenCL shader
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&inputBuffer); // The first (0th) argument to the main function will be the input buffer
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&outputBuffer); // The second (1st) argument to the main function will be the output buffer
Queuing Reads/Writes/Executes
// First we'll enqueue a buffer write to reset the data to a known matrix
cl_float16 newdata = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f };
// Below, notice we use CL_TRUE in the blocking argument. If you didn't, this function would return whether or not the command was executed.
status = clEnqueueWriteBuffer(commandQueue, inputBuffer, CL_TRUE, 0, sizeof(cl_float16), (void*)newdata, NULL, NULL, NULL);
if (status != CL_SUCCESS) {
   cout << "Error writing to buffer" << endl;
}

// Now execute the shader

const size_t ipt[] = { 4, 4, 4, 4 };
const size_t localipt[] = { 1, 1, 1, 1 };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, ipt, localipt, 0, NULL, NULL);
if (status != CL_SUCCESS) {
   cout << "Error queuing the kernel" << endl;
}

// Read the data from the shader
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_float16), (void*)data, 0, NULL, NULL);
if (status == CL_SUCCESS) {
   // Now the variable data (of type cl_float16) contains the data written by the shader
}
Cleaning up
delete [] source;
delete [] devices;

clReleaseKernel
(kernel);
clReleaseProgram(program);
clReleaseMemObject(inputBuffer);
clReleaseMemObject(outputBuffer);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);