root / API / api.cu @ 13
View | Annotate | Download (5.8 KB)
| 1 | /* |
|---|---|
| 2 | The idea behind this API is that, the application first will have to create a context of GPU which also includes a pointer |
| 3 | to the buffer that will contain a copy the frame of camera. This buffer can be pinned or pageable depending upon the |
| 4 | arguments passed to "host_flag". Gpu will create its own buffer and copy the contents of context buffer to it. Gpu context |
| 5 | buffer resides on host memory. |
| 6 | |
| 7 | IPLimage(cannot be made pinned memory) -----> gpu_context_t buffer(can be made pinned) ----> Grayscale filter buffer(cudaMalloc). |
| 8 | | |
| 9 | | |
| 10 | |------------------------> Threshold filter buffer(cudaMalloc). |
| 11 | |
| 12 | The scenerio being used right now (the one we dicussed via google doc as shown above) is not using pinned memory. |
| 13 | */ |
| 14 | |
| 15 | #include "assert.h" |
| 16 | |
| 17 | #ifndef _CUDA_H |
| 18 | #define _CUDA_H |
| 19 | #include "cuda.h" |
| 20 | #endif |
| 21 | |
| 22 | #include <stdio.h> |
| 23 | #include <assert.h> |
| 24 | #include <string.h> |
| 25 | |
| 26 | #include "api.h" |
| 27 | |
| 28 | gpu_error_t last_error = GPU_OK; |
| 29 | cudaError_t last_cuda_error = cudaSuccess; |
| 30 | |
| 31 | //////////////////// Necessary Cuda calls /////////////////////////////// |
| 32 | /////////////// This call copies data from global memory /////////////// |
| 33 | void cuda_set_input(gpu_context_t *ctx, unsigned char *idata) |
| 34 | {
|
| 35 | int i = 0, |
| 36 | size = ctx->width * ctx->height; |
| 37 | |
| 38 | switch ( ctx->nchannels ) |
| 39 | {
|
| 40 | case 1: |
| 41 | for( ; i < size; i++) |
| 42 | {
|
| 43 | ctx->output_buffer[i * 4 + 0] = idata[i]; |
| 44 | // FIXME if the image is already gray (1 channel), |
| 45 | // do we need to do that ? |
| 46 | ctx->output_buffer[i * 4 + 1] = idata[i]; |
| 47 | ctx->output_buffer[i * 4 + 2] = idata[i]; |
| 48 | } |
| 49 | |
| 50 | case 3: |
| 51 | for( ; i < size; i++) |
| 52 | {
|
| 53 | ctx->output_buffer[i * 4 + 0] = idata[i * 3 + 0]; |
| 54 | ctx->output_buffer[i * 4 + 1] = idata[i * 3 + 1]; |
| 55 | ctx->output_buffer[i * 4 + 2] = idata[i * 3 + 2]; |
| 56 | } |
| 57 | break; |
| 58 | |
| 59 | default: |
| 60 | // this is because we don't know how to copy this input image to gpu buffer |
| 61 | assert(0); |
| 62 | } |
| 63 | |
| 64 | cudaMemcpy(ctx->output_buffer, ctx->gpu_buffer, size * 4, cudaMemcpyHostToDevice); |
| 65 | checkCudaError(); |
| 66 | } |
| 67 | |
| 68 | ///////////////////////////////////////////////////////////////////////// |
| 69 | |
| 70 | ////// This code will return error occured on GPU in a string format //// |
| 71 | const char *gpu_error() |
| 72 | {
|
| 73 | // reset the error for next call |
| 74 | gpu_error_t error = last_error; |
| 75 | cudaError_t cuda_error = last_cuda_error; |
| 76 | last_error = GPU_OK; |
| 77 | last_cuda_error = cudaSuccess; |
| 78 | |
| 79 | switch (error) |
| 80 | {
|
| 81 | case GPU_OK: |
| 82 | return "OK"; |
| 83 | case GPU_ERR_MEM: |
| 84 | return "Memory allocation"; |
| 85 | case GPU_ERR_CUDA: |
| 86 | return cudaGetErrorString(cuda_error); |
| 87 | } |
| 88 | |
| 89 | return "Unknown"; |
| 90 | } |
| 91 | |
| 92 | //////////// This code will check for any Cuda related error //////////// |
| 93 | gpu_error_t checkCudaError() |
| 94 | {
|
| 95 | last_cuda_error = cudaGetLastError(); |
| 96 | if (last_cuda_error != cudaSuccess) |
| 97 | return GPU_ERR_CUDA; |
| 98 | return GPU_OK; |
| 99 | } |
| 100 | |
| 101 | //////////// This code will create a gpu context ///////////// |
| 102 | gpu_error_t gpu_context_create(gpu_context_t **ctx) |
| 103 | {
|
| 104 | last_error = GPU_OK; |
| 105 | assert(ctx != NULL); |
| 106 | |
| 107 | // create the context and initialize it |
| 108 | *ctx = (gpu_context_t *)malloc( sizeof(gpu_context_t) ); |
| 109 | if (*ctx == NULL) |
| 110 | last_error = GPU_ERR_MEM; |
| 111 | else |
| 112 | memset(*ctx, 0, sizeof(gpu_context_t)); |
| 113 | |
| 114 | return last_error; |
| 115 | |
| 116 | } |
| 117 | |
| 118 | /////////////////////////////// This code will initialize the previously created contex /////////////////////////////////////////////////// |
| 119 | gpu_error_t gpu_context_init(gpu_context_t *ctx, int host_height, int host_width, int host_nchannels, gpu_context_memory_t host_flag) |
| 120 | {
|
| 121 | assert(ctx != NULL); |
| 122 | assert(host_height > 0); |
| 123 | assert(host_width > 0); |
| 124 | assert(host_nchannels == 3); |
| 125 | |
| 126 | ctx->height = host_height; |
| 127 | ctx->width = host_width; |
| 128 | ctx->nchannels = host_nchannels; |
| 129 | ctx->mem_flag = host_flag; |
| 130 | // whatever the source channels is, we always use 4 channels images |
| 131 | ctx->size = ctx->height * ctx->width * 4 * sizeof(unsigned char); |
| 132 | |
| 133 | cudaMalloc( (void **)&ctx->gpu_buffer, ctx->size); |
| 134 | last_error = checkCudaError(); |
| 135 | if(last_error == GPU_OK) |
| 136 | {
|
| 137 | switch (ctx->mem_flag) |
| 138 | {
|
| 139 | case GPU_MEMORY_HOST: |
| 140 | ctx->output_buffer = (unsigned char *)malloc(ctx->size); |
| 141 | if(!(ctx->output_buffer)) |
| 142 | last_error = GPU_ERR_MEM; |
| 143 | break; |
| 144 | |
| 145 | case GPU_MEMORY_PINNED_WRITE_COMBINED: |
| 146 | cudaHostAlloc( (void **)&ctx->output_buffer, ctx->size, cudaHostAllocWriteCombined); |
| 147 | last_error = checkCudaError(); |
| 148 | break; |
| 149 | |
| 150 | case GPU_MEMORY_PINNED: |
| 151 | cudaHostAlloc((void **)&ctx->output_buffer, ctx->size, cudaHostAllocDefault); |
| 152 | last_error = checkCudaError(); |
| 153 | break; |
| 154 | |
| 155 | default: |
| 156 | // should never happen |
| 157 | assert(0); |
| 158 | last_error = GPU_ERR_MEM; |
| 159 | break; |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | return last_error; |
| 164 | } |
| 165 | |
| 166 | ///////////// This code will set the context buffer to the input buffer ////////////// |
| 167 | gpu_error_t gpu_set_input( gpu_context_t *ctx, unsigned char *idata) |
| 168 | {
|
| 169 | assert( ctx || idata ); |
| 170 | |
| 171 | cuda_set_input(ctx, idata); |
| 172 | /*for(int i=0;i < (ctx->width)*(ctx->height);i++) |
| 173 | {
|
| 174 | ctx->output_buffer[i * 4 + 0] = idata[i * 3 + 0]; |
| 175 | ctx->output_buffer[i * 4 + 1] = idata[i * 3 + 1]; |
| 176 | ctx->output_buffer[i * 4 + 2] = idata[i * 3 + 2]; |
| 177 | }*/ |
| 178 | |
| 179 | return GPU_OK; |
| 180 | } |
| 181 | |
| 182 | ///////////// This code will set the input buffer to the context buffer ////////////// |
| 183 | gpu_error_t gpu_get_output(gpu_context_t *ctx, unsigned char **output) |
| 184 | {
|
| 185 | assert( ctx ); |
| 186 | assert( output != NULL ); |
| 187 | |
| 188 | // copy back the gpu buffer to host buffer |
| 189 | cudaMemcpy(ctx->output_buffer, ctx->gpu_buffer, ctx->width * ctx->height * 4, cudaMemcpyDeviceToHost); |
| 190 | last_error = checkCudaError(); |
| 191 | if ( last_error == GPU_OK ) |
| 192 | *output = ctx->output_buffer; |
| 193 | |
| 194 | return last_error; |
| 195 | } |
| 196 | |
| 197 | ///// This code will deallocate all the memory held by context, including the memory on GPU ////// |
| 198 | void gpu_context_free( gpu_context_t *ctx) |
| 199 | {
|
| 200 | assert(ctx); |
| 201 | switch ( ctx->mem_flag ) |
| 202 | {
|
| 203 | case GPU_MEMORY_HOST: |
| 204 | free(ctx->output_buffer); |
| 205 | cudaFreeHost(ctx->gpu_buffer); |
| 206 | break; |
| 207 | |
| 208 | case GPU_MEMORY_PINNED_WRITE_COMBINED: |
| 209 | case GPU_MEMORY_PINNED: |
| 210 | cudaFreeHost(ctx->output_buffer); |
| 211 | cudaFreeHost(ctx->gpu_buffer); |
| 212 | break; |
| 213 | |
| 214 | default: |
| 215 | // should never happen |
| 216 | assert(0); |
| 217 | break; |
| 218 | } |
| 219 | free(ctx); |
| 220 | } |
