I am going to show you how to build and run a very simple OpenCL program that calculates the dot product, since it is my favorite functionality-testing algorithm. I enrolled in a course about GPGPU, and we were told that learning OpenCL starts with learning CUDA first. This is because the CUDA runtime API is much more elegant and easier to understand than the CUDA driver API, which is in fact very similar to OpenCL. Although OpenCL can run on any platform, I prefer CUDA due to its support from NVIDIA (good documentation and library base – cuFFT, cuBLAS, …).
I will start by describing a standard CUDA cycle:
- Allocate Memory on GPU
- Data copy HostToDevice
- Set Grid Dimensions
- Run Kernel to compute whatever
- Data copy DeviceToHost
- Clear memory
On the other hand, let's take a look at a standard OpenCL cycle:
- Select Platform
- Select Device
- Create Context
- Create Command Queue
- Allocate Memory on the GPU
- Load Kernel files
- Create a Program
- Build the Program
- Create Kernel
- Set Kernel Arguments
- Data copy HostToDevice
- Set Grid Dimensions
- Run Kernel to compute whatever
- Data copy DeviceToHost
- Release resources
As you can see, there is plenty of code that basically “does nothing useful at all”. It is more like talking to a child and telling it exactly what to do. Although almost the same must be done in the case of the CUDA driver API, most programmers tend to use the runtime API for its simplicity. The extra OpenCL code is a result of its portability: it can run anywhere, even on the CPU. I will now post the example code, which you can find as a VS2013 project at the bottom of this page. Keep in mind that to fully understand OpenCL you will have to look up the documentation, as many OpenCL parameters accept NULL.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
// C++ INCLUDES #include <stdio.h> #include <string> #include <iostream> #include <fstream> #include <sstream> // OPENCL INCLUDES #include <CL\cl.h> using namespace std; int main(int argc, char **argv){ size_t N = 8192*1024; // AMOUNT OF WORK // ALLOCATE CPU MEMORY cl_float *HostInput1 = new cl_float[N]; cl_float *HostInput2 = new cl_float[N]; cl_float *HostResults = new cl_float[N]; // FEED INPUT ARRAYS ON THE CPU for (size_t i = 0; i < N; i++){ HostInput1[i] = static_cast<cl_float>(i); HostInput2[i] = static_cast<cl_float>(N - i); HostResults[i] = 0.0f; } // OPENCL INIT cl_context Context = NULL; cl_command_queue Quene = NULL; cl_program Program = NULL; cl_kernel DotProduct = NULL; cl_platform_id PlatformID = NULL; cl_device_id Device = NULL; cl_int Status = NULL; cl_uint NumEntries = 1; // SELECT PLATFORM cl_uint NumPlatformAvailable; Status = clGetPlatformIDs(NumEntries, &PlatformID, &NumPlatformAvailable); if (Status != CL_SUCCESS){ cout << "PLATFORM NOT FOUND OR PLATFORM FAILURE" << endl; } // SELECT DEVICE cl_uint NumDevices; Status = clGetDeviceIDs(PlatformID, CL_DEVICE_TYPE_GPU, NumEntries, &Device, &NumDevices); if (Status != CL_SUCCESS){ cout << "DEVICE SEARCH FAILURE" << endl; } // CREATE CONTEXT ON THE SELECTED PLATFORM Context = clCreateContext(0, NumDevices, &Device, NULL, NULL, &Status); if (Status != CL_SUCCESS){ cout << "CONTEXT CREATION FAILED" << endl; } // CREATE COMMAND QUEEN Quene = clCreateCommandQueue(Context, Device, 0, &Status); if (Status != CL_SUCCESS){ cout << " COMMAND QUENE CREATING FAILED" << endl; } // ALLOCATE MEMORY BUFFERS cl_mem DevIn1 = clCreateBuffer(Context, CL_MEM_READ_WRITE, N*sizeof(cl_float), NULL, &Status); cl_mem DevIn2 = clCreateBuffer(Context, CL_MEM_READ_WRITE, N*sizeof(cl_float), NULL, &Status); cl_mem DevOut = clCreateBuffer(Context, CL_MEM_READ_WRITE, N*sizeof(cl_float), NULL, &Status); // LOAD KERNEL AND CREATE PROGRAM OBJECT const char name[] = "Kernel.cl"; ifstream kernelFile(name, ios::in); ostringstream oss; 
if (!kernelFile.is_open()){cout << "MAIN KERNEL FILE CANNOT BE OPENED" << endl;} oss << kernelFile.rdbuf(); string SourceString = oss.str(); const char * KernelSource = SourceString.c_str(); Program = clCreateProgramWithSource(Context, 1, &KernelSource, NULL, &Status); if (Status != CL_SUCCESS){ cout << " PROGRAM CREATION FAILED" << endl; } // BUILD THE PROGRAM Status = clBuildProgram(Program, 0, NULL, NULL, NULL, NULL); if (Status != CL_SUCCESS){ char log[16384]; cout<< "PROGRAM COMPILATION FAILED" << endl; cout<< "------- DETAILED INFORMATION: -------" << endl; clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL); cout << log << endl; cout<<"------ ------ ------ END LOG ------- ------ ------"<<endl; } // CREATE KERNEL DotProduct = clCreateKernel(Program, "DotProduct", &Status); if (Status != CL_SUCCESS){ cout << "KERNEL CREATION FAILED" << endl; } // SET KERNEL ARGUMENTS Status = clSetKernelArg(DotProduct, 0, sizeof(cl_mem), (void*)&DevIn1); if (Status != CL_SUCCESS){ cout << "KERNEL ARGUMENT SET FAILURE" << endl; } Status = clSetKernelArg(DotProduct, 1, sizeof(cl_mem), (void*)&DevIn2); if (Status != CL_SUCCESS){ cout << "KERNEL ARGUMENT SET FAILURE" << endl; } Status = clSetKernelArg(DotProduct, 2, sizeof(cl_mem), (void*)&DevOut); if (Status != CL_SUCCESS){ cout << "KERNEL ARGUMENT SET FAILURE" << endl; } // COPY DATA TO THE GPU Status = clEnqueueWriteBuffer(Quene, DevIn1, CL_FALSE, 0, N*sizeof(cl_float), HostInput1, 0, NULL, NULL); if (Status != CL_SUCCESS){ cout << "HostToDevice COPY FAILURE" << endl; } Status = clEnqueueWriteBuffer(Quene, DevIn2, CL_FALSE, 0, N*sizeof(cl_float), HostInput2, 0, NULL, NULL); if (Status != CL_SUCCESS){ cout << "HostToDevice COPY FAILURE" << endl; } // CHOOSE GRID DIMENSION const cl_uint Dimensionality = 3; size_t global_work_size[Dimensionality] = { N,1,1 }; size_t local_work_size[Dimensionality] = { 1024,1,1 }; // ACTUALLY LAUNCH THE KERNEL Status = clEnqueueNDRangeKernel(Quene, DotProduct, 
Dimensionality, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (Status != CL_SUCCESS){cout<<"STARTING THE DOTPRODUCT KERNEL FAILED"<<endl;} clFinish(Quene); // COPY RESULTS BACK TO HOST Status = clEnqueueReadBuffer(Quene, DevOut, CL_TRUE, 0, N*sizeof(cl_float), HostResults, 0, NULL, NULL); if (Status != CL_SUCCESS){ cout << "DeviceToHost COPY FAILURE" << endl; } // CALCULATE CPU COMPARISON AND SHOW DIFFERENCES cl_float CPURes = 0.0f; cl_float GPURes = 0.0f; for (size_t i = 0; i < N; i++){ CPURes += HostInput1[i] * HostInput2[i]; // CPU MUL GPURes += HostResults[i]; // GPU ADD } cout << "CPU DOT PRODUCT RESULT: " << CPURes << endl; cout << "GPU DOT PRODUCT RESULT: " << GPURes << endl; // CLEAN MEMORY delete HostInput1; delete HostInput2; delete HostResults; clReleaseMemObject(DevIn1); clReleaseMemObject(DevIn2); clReleaseMemObject(DevOut); clReleaseKernel(DotProduct); clReleaseProgram(Program); clReleaseContext(Context); clReleaseCommandQueue(Quene); system("pause"); return 0; } |
And last, the very basic OpenCL Kernel:
1 2 3 4 |
// Element-wise multiply: each work item computes one product of the pair
// at its global index; the host sums the partial products afterwards.
__kernel void DotProduct(__global float *Input1,
                         __global float *Input2,
                         __global float *Output)
{
    const int gid = get_global_id(0);
    Output[gid] = Input1[gid] * Input2[gid];
}
That's it. You can copy and paste the code for yourself, or download the VS2013 project files HERE. Keep in mind that OpenCL has to be supported by your GPU; you can easily check with the free GPU-Z program. In case OpenCL is not available, you should probably update your drivers. If you are still wondering whether to program in OpenCL or CUDA, I suggest you take the green CUDA road :)