123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646 |
- #define BUCKET_WARP_LOG_SIZE 5
- #define BUCKET_WARP_N 1
- #ifdef BUCKET_WG_SIZE_1
- #define BUCKET_THREAD_N BUCKET_WG_SIZE_1
- #else
- #define BUCKET_THREAD_N (BUCKET_WARP_N << BUCKET_WARP_LOG_SIZE)
- #endif
- #define BUCKET_BLOCK_MEMORY (DIVISIONS * BUCKET_WARP_N)
- #define BUCKET_BAND 128
- #define SIZE (1 << 22)
- #define DATA_SIZE (1024)
- #define MAX_SOURCE_SIZE (0x100000)
- #define HISTOGRAM_SIZE (1024 * sizeof(unsigned int))
- #include <fcntl.h>
- #include <float.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <math.h>
- #include <unistd.h>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <CL/cl.h>
- #include "bucketsort.h"
- #include <time.h>
- ////////////////////////////////////////////////////////////////////////////////
- // Forward declarations
- ////////////////////////////////////////////////////////////////////////////////
- void calcPivotPoints(float *histogram, int histosize, int listsize,
- int divisions, float min, float max, float *pivotPoints,
- float histo_width);
- ////////////////////////////////////////////////////////////////////////////////
- // Globals
- ////////////////////////////////////////////////////////////////////////////////
- const int histosize = 1024;
- unsigned int* h_offsets = NULL;
- unsigned int* d_offsets = NULL;
- cl_mem d_offsets_buff;
- int *d_indice = NULL;
- cl_mem d_indice_buff;
- cl_mem d_input_buff;
- cl_mem d_indice_input_buff;
- float *pivotPoints = NULL;
- float *historesult = NULL;
- float *l_pivotpoints = NULL;
- cl_mem l_pivotpoints_buff;
- unsigned int *d_prefixoffsets = NULL;
- unsigned int *d_prefixoffsets_altered = NULL;
- cl_mem d_prefixoffsets_buff;
- cl_mem d_prefixoffsets_input_buff;
- unsigned int *l_offsets = NULL;
- cl_mem l_offsets_buff;
- unsigned int *d_Result1024;
- cl_device_id device_id; // compute device id
- cl_context bucketContext; // compute context
- cl_context histoContext;
- cl_command_queue bucketCommands; // compute command queue
- cl_command_queue histoCommands;
- cl_program bucketProgram; // compute program
- cl_program histoProgram;
- cl_kernel bucketcountKernel; // compute kernel
- cl_kernel histoKernel;
- cl_kernel bucketprefixKernel;
- cl_kernel bucketsortKernel;
- cl_mem histoInput;
- cl_mem histoOutput;
- cl_mem bucketOutput;
- cl_int err;
- cl_uint num_platforms;
- cl_event histoEvent;
- cl_event bucketCountEvent;
- cl_event bucketPrefixEvent;
- cl_event bucketSortEvent;
- double sum = 0;
- ////////////////////////////////////////////////////////////////////////////////
- // Initialize the bucketsort algorithm
- ////////////////////////////////////////////////////////////////////////////////
- void init_bucketsort(int listsize, int platform_id, int device_id, int use_gpu)
- {
- cl_uint num = 0;
- clGetPlatformIDs(0, NULL, &num);
- cl_platform_id platformID[num];
- clGetPlatformIDs(num, platformID, NULL);
-
- // Selector for CPU/GPU
- cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
-
- clGetDeviceIDs(platformID[platform_id],device_type,0,NULL,&num);
-
- cl_device_id devices[num];
- err = clGetDeviceIDs(platformID[platform_id],device_type,num,devices,NULL);
- // int gpu = 1;
- // err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 2, &device_id, NULL);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to create a device group!\n");
- exit(1);
- }
- char name[128];
-
-
- clGetDeviceInfo(devices[device_id],CL_DEVICE_NAME,128,name,NULL);
-
- bucketContext = clCreateContext(0, 1, &devices[device_id], NULL, NULL, &err);
- bucketCommands = clCreateCommandQueue(bucketContext, devices[device_id], CL_QUEUE_PROFILING_ENABLE, &err);
-
- h_offsets = (unsigned int *) malloc(DIVISIONS * sizeof(unsigned int));
- for(int i = 0; i < DIVISIONS; i++){
- h_offsets[i] = 0;
- }
- d_offsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(unsigned int),NULL,NULL);
- pivotPoints = (float *)malloc(DIVISIONS * sizeof(float));
-
- d_indice_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, listsize * sizeof(int),NULL,NULL);
- d_indice_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, listsize * sizeof(int),NULL,NULL);
- d_indice = (int *)malloc(listsize * sizeof(int));
- historesult = (float *)malloc(histosize * sizeof(float));
- l_pivotpoints = (float *)malloc(DIVISIONS*sizeof(float));
- l_pivotpoints_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(float), NULL, NULL);
- l_offsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, DIVISIONS * sizeof(unsigned int), NULL, NULL);
-
- int blocks = ((listsize - 1) / (BUCKET_THREAD_N * BUCKET_BAND)) + 1;
- d_prefixoffsets_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), NULL, NULL);
- d_prefixoffsets = (unsigned int *)malloc(blocks*BUCKET_BLOCK_MEMORY*sizeof(int));
- d_prefixoffsets_altered = (unsigned int *)malloc(blocks*BUCKET_BLOCK_MEMORY*sizeof(int));
- d_prefixoffsets_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), NULL, NULL);
- bucketOutput = clCreateBuffer(bucketContext, CL_MEM_READ_WRITE, (listsize + (DIVISIONS*4))*sizeof(float), NULL, NULL);
- FILE *fp;
- const char fileName[]="./bucketsort_kernels.cl";
- size_t source_size;
- char *source_str;
-
- fp = fopen(fileName, "r");
- if (!fp) {
- fprintf(stderr, "Failed to load bucket kernel.\n");
- exit(1);
- }
-
-
- source_str = (char *)malloc(MAX_SOURCE_SIZE);
- source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
- fclose(fp);
-
- bucketProgram = clCreateProgramWithSource(bucketContext, 1, (const char **) &source_str, (const size_t)&source_size, &err);
- if (!bucketProgram)
- {
- printf("Error: Failed to create bucket compute program!\n");
- exit(1);
- }
-
- err = clBuildProgram(bucketProgram, 0, NULL, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- size_t len;
- char buffer[2048];
-
- printf("Error: Failed to build bucket program executable!\n");
- clGetProgramBuildInfo(bucketProgram, devices[device_id], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
- printf("%s\n", buffer);
- exit(1);
- }
-
- }
- ////////////////////////////////////////////////////////////////////////////////
- // Uninitialize the bucketsort algorithm
- ////////////////////////////////////////////////////////////////////////////////
- void finish_bucketsort()
- {
- clReleaseMemObject(d_offsets_buff);
- clReleaseMemObject(d_indice_buff);
- clReleaseMemObject(l_pivotpoints_buff);
- clReleaseMemObject(l_offsets_buff);
- clReleaseMemObject(d_prefixoffsets_buff);
- clReleaseMemObject(d_input_buff);
- clReleaseMemObject(d_indice_input_buff);
- clReleaseMemObject(bucketOutput);
- clReleaseProgram(bucketProgram);
- clReleaseKernel(bucketcountKernel);
- clReleaseKernel(bucketprefixKernel);
- clReleaseKernel(bucketsortKernel);
- clReleaseCommandQueue(bucketCommands);
- clReleaseContext(bucketContext);
- free(pivotPoints);
- free(h_offsets);
- free(historesult);
- }
- void histogramInit(int listsize, int platform_id, int device_id, int use_gpu) {
- cl_uint num = 0;
- clGetPlatformIDs(0, NULL, &num);
- cl_platform_id platformID[num];
- clGetPlatformIDs(num, platformID, NULL);
-
- // Selector for CPU/GPU
- cl_device_type device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
-
- clGetDeviceIDs(platformID[platform_id],device_type,0,NULL,&num);
-
- num = 2;
- char name[128];
-
- clGetPlatformInfo(platformID[platform_id], CL_PLATFORM_PROFILE,128,name,NULL);
-
-
- cl_device_id devices[num];
- err = clGetDeviceIDs(platformID[1],device_type,num,devices,NULL);
- // int gpu = 1;
- // err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 2, &device_id, NULL);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to create a device group!\n");
- exit(1);
- }
-
- clGetDeviceInfo(devices[device_id],CL_DEVICE_NAME,128,name,NULL);
-
- printf("%s \n", name);
-
- cl_context_properties contextProperties[] =
- {
- CL_CONTEXT_PLATFORM,
- (cl_context_properties)platformID[num],
- 0
- };
-
- histoContext = clCreateContext(0, 1, &devices[device_id], NULL, NULL, &err);
-
- histoCommands = clCreateCommandQueue(histoContext, devices[device_id], CL_QUEUE_PROFILING_ENABLE, &err);
- histoInput = clCreateBuffer(histoContext, CL_MEM_READ_ONLY, listsize*(sizeof(float)), NULL, NULL);
- histoOutput = clCreateBuffer(histoContext, CL_MEM_READ_WRITE, 1024 * sizeof(unsigned int), NULL, NULL);
- FILE *fp;
- const char fileName[]="./histogram1024.cl";
- size_t source_size;
- char *source_str;
-
- fp = fopen(fileName, "r");
- if (!fp) {
- fprintf(stderr, "Failed to load kernel.\n");
- exit(1);
- }
-
-
- source_str = (char *)malloc(MAX_SOURCE_SIZE);
- source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
- fclose(fp);
-
- histoProgram = clCreateProgramWithSource(histoContext, 1, (const char **) &source_str, (const size_t)&source_size, &err);
- if (!histoProgram)
- {
- printf("Error: Failed to create compute program!\n");
- exit(1);
- }
- err = clBuildProgram(histoProgram, 0, NULL, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- size_t len;
- char buffer[2048];
-
- printf("Error: Failed to build program executable!\n");
- clGetProgramBuildInfo(histoProgram, devices[device_id], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
- printf("%s\n", buffer);
- exit(1);
- }
-
- histoKernel = clCreateKernel(histoProgram, "histogram1024Kernel", &err);
- if (!histoKernel || err != CL_SUCCESS)
- {
- printf("Error: Failed to create compute kernel!\n");
- exit(1);
- }
-
- }
- void histogram1024GPU(unsigned int *h_Result, float *d_Data, float minimum, float maximum,int listsize){
- err = clEnqueueWriteBuffer(histoCommands, histoInput, CL_TRUE, 0, listsize*sizeof(float), d_Data, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to source array!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(histoCommands, histoOutput, CL_TRUE, 0, DIVISIONS*sizeof(unsigned int), h_Result, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to source array!\n");
- exit(1);
- }
- err = 0;
- err = clSetKernelArg(histoKernel, 0, sizeof(cl_mem), &histoOutput);
- err = clSetKernelArg(histoKernel, 1, sizeof(cl_mem), &histoInput);
- err = clSetKernelArg(histoKernel, 2, sizeof(float), &minimum);
- err = clSetKernelArg(histoKernel, 3, sizeof(float), &maximum);
- err = clSetKernelArg(histoKernel, 4, sizeof(int), &listsize);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to set kernel arguments! %d\n", err);
- exit(1);
- }
-
- size_t global = 6144;
- size_t local;
- #ifdef HISTO_WG_SIZE_0
- local = HISTO_WG_SIZE_0;
- #else
- local = 96;
- #endif
- err = clEnqueueNDRangeKernel(histoCommands, histoKernel, 1, NULL, &global, &local, 0, NULL, &histoEvent);
- if (err)
- {
- printf("Error: Failed to execute histogram kernel!\n");
- exit(1);
- }
- clWaitForEvents(1 , &histoEvent);
- clFinish(histoCommands);
- err = clEnqueueReadBuffer( histoCommands, histoOutput, CL_TRUE, 0, 1024 * sizeof(unsigned int), h_Result, 0, NULL, NULL );
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read histo output array! %d\n", err);
- exit(1);
- }
- clFinish(histoCommands);
-
- cl_ulong time_start, time_end;
- double total_time;
- clGetEventProfilingInfo(histoEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
- clGetEventProfilingInfo(histoEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
- total_time = time_end - time_start;
- sum+= total_time/1000000.0;
- printf("Histogram Kernel Time: %0.3f \n", total_time/1000000);
- }
- void finish_histogram() {
- clReleaseProgram(histoProgram);
- clReleaseKernel(histoKernel);
- clReleaseCommandQueue(histoCommands);
- clReleaseContext(histoContext);
- clReleaseMemObject((histoInput));
- clReleaseMemObject((histoOutput));
- }
- ////////////////////////////////////////////////////////////////////////////////
- // Given the input array of floats and the min and max of the distribution,
- // sort the elements into float4 aligned buckets of roughly equal size
- ////////////////////////////////////////////////////////////////////////////////
- void bucketSort(float *d_input, float *d_output, int listsize,
- int *sizes, int *nullElements, float minimum, float maximum,
- unsigned int *origOffsets, int platform_id, int device_id, int use_gpu)
- {
- // ////////////////////////////////////////////////////////////////////////////
- // // First pass - Create 1024 bin histogram
- // ////////////////////////////////////////////////////////////////////////////
- histogramInit(listsize, platform_id, device_id, use_gpu);
- histogram1024GPU(h_offsets, d_input, minimum, maximum, listsize);
- finish_histogram();
- for(int i=0; i<histosize; i++) historesult[i] = (float)h_offsets[i];
- // ///////////////////////////////////////////////////////////////////////////
- // // Calculate pivot points (CPU algorithm)
- // ///////////////////////////////////////////////////////////////////////////
- calcPivotPoints(historesult, histosize, listsize, DIVISIONS,
- minimum, maximum, pivotPoints,
- (maximum - minimum)/(float)histosize);
- //
- // ///////////////////////////////////////////////////////////////////////////
- // // Count the bucket sizes in new divisions
- // ///////////////////////////////////////////////////////////////////////////
-
- bucketcountKernel = clCreateKernel(bucketProgram, "bucketcount", &err);
- if (!bucketcountKernel || err != CL_SUCCESS)
- {
- printf("Error: Failed to create bucketsort compute kernel!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, l_pivotpoints_buff, CL_TRUE, 0, DIVISIONS*sizeof(float), pivotPoints, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to l_pivotpoints source array!\n");
- exit(1);
- }
-
- d_input_buff = clCreateBuffer(bucketContext,CL_MEM_READ_WRITE, (listsize + (DIVISIONS*4))*sizeof(float),NULL,NULL);
-
- err = clEnqueueWriteBuffer(bucketCommands, d_input_buff, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_input, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to d_input_buff source array!\n");
- exit(1);
- }
- err = 0;
- err = clSetKernelArg(bucketcountKernel, 0, sizeof(cl_mem), &d_input_buff);
- err = clSetKernelArg(bucketcountKernel, 1, sizeof(cl_mem), &d_indice_buff);
- err = clSetKernelArg(bucketcountKernel, 2, sizeof(cl_mem), &d_prefixoffsets_buff);
- err = clSetKernelArg(bucketcountKernel, 3, sizeof(cl_int), &listsize);
- err = clSetKernelArg(bucketcountKernel, 4, sizeof(cl_mem), &l_pivotpoints_buff);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to set kernel arguments! %d\n", err);
- exit(1);
- }
- int blocks =((listsize -1) / (BUCKET_THREAD_N*BUCKET_BAND)) + 1;
- size_t global[] = {blocks*BUCKET_THREAD_N,1,1};
- size_t local[] = {BUCKET_THREAD_N,1,1};
-
- err = clEnqueueNDRangeKernel(bucketCommands, bucketcountKernel, 3, NULL, global, local, 0, NULL, &bucketCountEvent);
- if (err)
- {
- printf("Error: Failed to execute bucket count kernel!\n");
- exit(1);
- }
- clWaitForEvents(1 , &bucketCountEvent);
- clFinish(bucketCommands);
- err = clEnqueueReadBuffer( bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(unsigned int), d_prefixoffsets, 0, NULL, NULL );
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read prefix output array! %d\n", err);
- exit(1);
- }
- err = clEnqueueReadBuffer( bucketCommands, d_indice_buff, CL_TRUE, 0, listsize * sizeof(int), d_indice, 0, NULL, NULL );
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read indice output array! %d\n", err);
- exit(1);
- }
- clFinish(bucketCommands);
- cl_ulong time_start, time_end;
- double total_time;
- clGetEventProfilingInfo(bucketCountEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
- clGetEventProfilingInfo(bucketCountEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
- total_time = time_end - time_start;
- sum+= total_time/1000000;
- printf("Bucket Count Kernel Time: %0.3f \n", total_time/1000000);
-
- //
- // ///////////////////////////////////////////////////////////////////////////
- // // Prefix scan offsets and align each division to float4 (required by
- // // mergesort)
- // ///////////////////////////////////////////////////////////////////////////
- #ifdef BUCKET_WG_SIZE_0
- size_t localpre[] = {BUCKET_WG_SIZE_0,1,1};
- #else
- size_t localpre[] = {128,1,1};
- #endif
- size_t globalpre[] = {(DIVISIONS),1,1};
-
- bucketprefixKernel = clCreateKernel(bucketProgram, "bucketprefixoffset", &err);
- if (!bucketprefixKernel || err != CL_SUCCESS)
- {
- printf("Error: Failed to create bucket prefix compute kernel!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to prefix offsets source array!\n");
- exit(1);
- }
- err = 0;
- err = clSetKernelArg(bucketprefixKernel, 0, sizeof(cl_mem), &d_prefixoffsets_buff);
- err = clSetKernelArg(bucketprefixKernel, 1, sizeof(cl_mem), &d_offsets_buff);
- err = clSetKernelArg(bucketprefixKernel, 2, sizeof(cl_int), &blocks);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to set kernel arguments! %d\n", err);
- exit(1);
- }
- err = clEnqueueNDRangeKernel(bucketCommands, bucketprefixKernel, 3, NULL, globalpre, localpre, 0, NULL, &bucketPrefixEvent);
- if (err)
- {
- printf("%d Error: Failed to execute bucket prefix kernel!\n", err);
- exit(1);
- }
- clWaitForEvents(1 , &bucketPrefixEvent);
- clFinish(bucketCommands);
- err = clEnqueueReadBuffer( bucketCommands, d_offsets_buff, CL_TRUE, 0, DIVISIONS * sizeof(unsigned int), h_offsets, 0, NULL, NULL );
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read d_offsets output array! %d\n", err);
- exit(1);
- }
- err = clEnqueueReadBuffer( bucketCommands, d_prefixoffsets_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets_altered, 0, NULL, NULL );
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read d_offsets output array! %d\n", err);
- exit(1);
- }
- clFinish(bucketCommands);
- clGetEventProfilingInfo(bucketPrefixEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
- clGetEventProfilingInfo(bucketPrefixEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
- total_time = time_end - time_start;
- sum+= total_time/1000000;
- printf("Bucket Prefix Kernel Time: %0.3f \n", total_time/1000000);
- // // copy the sizes from device to host
- origOffsets[0] = 0;
- for(int i=0; i<DIVISIONS; i++){
- origOffsets[i+1] = h_offsets[i] + origOffsets[i];
- if((h_offsets[i] % 4) != 0){
- nullElements[i] = (h_offsets[i] & ~3) + 4 - h_offsets[i];
- }
- else nullElements[i] = 0;
- }
- for(int i=0; i<DIVISIONS; i++) sizes[i] = (h_offsets[i] + nullElements[i])/4;
- for(int i=0; i<DIVISIONS; i++) {
- if((h_offsets[i] % 4) != 0) h_offsets[i] = (h_offsets[i] & ~3) + 4;
- }
- for(int i=1; i<DIVISIONS; i++) h_offsets[i] = h_offsets[i-1] + h_offsets[i];
- for(int i=DIVISIONS - 1; i>0; i--) h_offsets[i] = h_offsets[i-1];
- h_offsets[0] = 0;
-
- // ///////////////////////////////////////////////////////////////////////////
- // // Finally, sort the lot
- // ///////////////////////////////////////////////////////////////////////////
- bucketsortKernel = clCreateKernel(bucketProgram, "bucketsort", &err);
- if (!bucketsortKernel|| err != CL_SUCCESS)
- {
- printf("Error: Failed to create bucketsort compute kernel!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, l_offsets_buff, CL_TRUE, 0, DIVISIONS * sizeof(unsigned int), h_offsets, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to l_offsets source array!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, d_input_buff, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_input, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to d_input_buff source array!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, d_indice_input_buff, CL_TRUE, 0, listsize*sizeof(int), d_indice, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to d_input_buff source array!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, d_prefixoffsets_input_buff, CL_TRUE, 0, blocks * BUCKET_BLOCK_MEMORY * sizeof(int), d_prefixoffsets_altered, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to prefix offsets source array!\n");
- exit(1);
- }
- err = clEnqueueWriteBuffer(bucketCommands, bucketOutput, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_output, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to write to source array!\n");
- exit(1);
- }
- size_t localfinal[] = {BUCKET_THREAD_N,1,1};
- blocks = ((listsize - 1) / (BUCKET_THREAD_N * BUCKET_BAND)) + 1;
- size_t globalfinal[] = {blocks*BUCKET_THREAD_N,1,1};
- err = 0;
- err = clSetKernelArg(bucketsortKernel, 0, sizeof(cl_mem), &d_input_buff);
- err = clSetKernelArg(bucketsortKernel, 1, sizeof(cl_mem), &d_indice_input_buff);
- err = clSetKernelArg(bucketsortKernel, 2, sizeof(cl_mem), &bucketOutput);
- err = clSetKernelArg(bucketsortKernel, 3, sizeof(cl_int), &listsize);
- err = clSetKernelArg(bucketsortKernel, 4, sizeof(cl_mem), &d_prefixoffsets_input_buff);
- err = clSetKernelArg(bucketsortKernel, 5, sizeof(cl_mem), &l_offsets_buff);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to set kernel arguments! %d\n", err);
- exit(1);
- }
- err = clEnqueueNDRangeKernel(bucketCommands, bucketsortKernel, 3, NULL, globalfinal, localfinal, 0, NULL, &bucketSortEvent);
- if (err)
- {
- printf("%d Error: Failed to execute bucketsort kernel!\n", err);
- }
- err = clEnqueueReadBuffer( bucketCommands, bucketOutput, CL_TRUE, 0, (listsize + (DIVISIONS*4))*sizeof(float), d_output, 0, NULL, NULL );
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read d_output array! %d\n", err);
- }
- clWaitForEvents(1 , &bucketSortEvent);
- clFinish(bucketCommands);
- clGetEventProfilingInfo(bucketSortEvent, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
- clGetEventProfilingInfo(bucketSortEvent, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
- total_time = time_end - time_start;
- sum+= total_time/1000000;
- printf("Bucket Sort Kernel Time: %0.3f \n", total_time/1000000);
- }
- double getBucketTime() {
- return sum;
- }
- ////////////////////////////////////////////////////////////////////////////////
- // Given a histogram of the list, figure out suitable pivotpoints that divide
- // the list into approximately listsize/divisions elements each
- ////////////////////////////////////////////////////////////////////////////////
- void calcPivotPoints(float *histogram, int histosize, int listsize,
- int divisions, float min, float max, float *pivotPoints, float histo_width)
- {
- float elemsPerSlice = listsize/(float)divisions;
- float startsAt = min;
- float endsAt = min + histo_width;
- float we_need = elemsPerSlice;
- int p_idx = 0;
- for(int i=0; i<histosize; i++)
- {
- if(i == histosize - 1){
- if(!(p_idx < divisions)){
- pivotPoints[p_idx++] = startsAt + (we_need/histogram[i]) * histo_width;
- }
- break;
- }
- while(histogram[i] > we_need){
- if(!(p_idx < divisions)){
- printf("i=%d, p_idx = %d, divisions = %d\n", i, p_idx, divisions);
- exit(0);
- }
- pivotPoints[p_idx++] = startsAt + (we_need/histogram[i]) * histo_width;
- startsAt += (we_need/histogram[i]) * histo_width;
- histogram[i] -= we_need;
- we_need = elemsPerSlice;
- }
- // grab what we can from what remains of it
- we_need -= histogram[i];
-
- startsAt = endsAt;
- endsAt += histo_width;
- }
- while(p_idx < divisions){
- pivotPoints[p_idx] = pivotPoints[p_idx-1];
- p_idx++;
- }
- }
|