root / API / api.cu @ 13

View | Annotate | Download (5.8 KB)

1
/*
2
	The idea behind this API is that, the application first will have to create a context of GPU which also includes a pointer	
3
	to the buffer that will contain a copy the frame of camera. This buffer can be pinned or pageable depending upon the 
4
	arguments passed to "host_flag". Gpu will create its own buffer and copy the contents of context buffer to it. Gpu context 
5
	buffer resides on host memory.
6
7
	IPLimage(cannot be made pinned memory) -----> gpu_context_t buffer(can be made pinned) ----> Grayscale filter buffer(cudaMalloc).
8
									|		
9
									|	
10
									|------------------------> Threshold filter buffer(cudaMalloc).
11
	
12
	The scenerio being used right now (the one we dicussed via google doc as shown above) is not using pinned memory.
13
*/
14
15
#include "assert.h"
16
17
#ifndef _CUDA_H
18
#define _CUDA_H
19
#include "cuda.h"
20
#endif
21
22
#include <stdio.h>
23
#include <assert.h>
24
#include <string.h>
25
26
#include "api.h"
27
28
gpu_error_t last_error = GPU_OK;
29
cudaError_t last_cuda_error = cudaSuccess;
30
31
//////////////////// Necessary Cuda calls ///////////////////////////////
32
/////////////// This call copies data from global memory ///////////////
33
void cuda_set_input(gpu_context_t *ctx, unsigned char *idata)
34
{
35
	int i = 0,
36
		size = ctx->width * ctx->height;
37
38
	switch ( ctx->nchannels )
39
	{
40
		case 1:
41
			for( ; i < size; i++)
42
			{
43
				ctx->output_buffer[i * 4 + 0] = idata[i];
44
				// FIXME if the image is already gray (1 channel),
45
				// do we need to do that ?
46
				ctx->output_buffer[i * 4 + 1] = idata[i];
47
				ctx->output_buffer[i * 4 + 2] = idata[i];
48
			}
49
50
		case 3:
51
			for( ; i < size; i++)
52
			{
53
				ctx->output_buffer[i * 4 + 0] = idata[i * 3 + 0];
54
				ctx->output_buffer[i * 4 + 1] = idata[i * 3 + 1];
55
				ctx->output_buffer[i * 4 + 2] = idata[i * 3 + 2];
56
			}
57
			break;
58
59
		default:
60
			// this is because we don't know how to copy this input image to gpu buffer
61
			assert(0);
62
	}
63
64
	cudaMemcpy(ctx->output_buffer, ctx->gpu_buffer, size * 4, cudaMemcpyHostToDevice);
65
	checkCudaError();
66
}
67
68
/////////////////////////////////////////////////////////////////////////
69
70
////// This code will return error occured on GPU in a string format ////
71
const char *gpu_error()
72
{
73
	// reset the error for next call
74
	gpu_error_t error = last_error;
75
	cudaError_t cuda_error = last_cuda_error;
76
	last_error = GPU_OK;
77
	last_cuda_error = cudaSuccess;
78
79
	switch (error)
80
	{
81
		case GPU_OK:
82
			return "OK";
83
		case GPU_ERR_MEM:
84
			return "Memory allocation";
85
		case GPU_ERR_CUDA:
86
			return cudaGetErrorString(cuda_error);
87
	}
88
89
	return "Unknown";
90
}
91
92
//////////// This code will check for any Cuda related error ////////////
93
gpu_error_t checkCudaError()
94
{
95
	last_cuda_error = cudaGetLastError();
96
	if (last_cuda_error != cudaSuccess)
97
		return GPU_ERR_CUDA;
98
	return GPU_OK;
99
}
100
101
//////////// This code will create a gpu context /////////////
102
gpu_error_t gpu_context_create(gpu_context_t **ctx)
103
{
104
	last_error = GPU_OK;
105
	assert(ctx != NULL);
106
107
	// create the context and initialize it
108
	*ctx = (gpu_context_t *)malloc( sizeof(gpu_context_t) );
109
	if (*ctx == NULL)
110
		last_error = GPU_ERR_MEM;
111
	else
112
		memset(*ctx, 0, sizeof(gpu_context_t));
113
114
	return last_error;
115
116
}
117
118
/////////////////////////////// This code will initialize the previously created contex ///////////////////////////////////////////////////
119
gpu_error_t gpu_context_init(gpu_context_t *ctx, int host_height, int host_width, int host_nchannels, gpu_context_memory_t host_flag)
120
{
121
	assert(ctx != NULL);
122
	assert(host_height > 0);
123
	assert(host_width > 0);
124
	assert(host_nchannels == 3);
125
126
	ctx->height = host_height;
127
	ctx->width = host_width;
128
	ctx->nchannels = host_nchannels;
129
	ctx->mem_flag = host_flag;
130
	// whatever the source channels is, we always use 4 channels images
131
	ctx->size = ctx->height * ctx->width * 4 * sizeof(unsigned char);
132
133
	cudaMalloc( (void **)&ctx->gpu_buffer, ctx->size);
134
	last_error = checkCudaError();
135
	if(last_error == GPU_OK)
136
	{
137
		switch (ctx->mem_flag)
138
		{
139
			case GPU_MEMORY_HOST:
140
				ctx->output_buffer = (unsigned char *)malloc(ctx->size);
141
				if(!(ctx->output_buffer))
142
					last_error = GPU_ERR_MEM;
143
				break;
144
145
			case GPU_MEMORY_PINNED_WRITE_COMBINED:
146
				cudaHostAlloc( (void **)&ctx->output_buffer, ctx->size, cudaHostAllocWriteCombined);
147
				last_error = checkCudaError();
148
				break;
149
150
			case GPU_MEMORY_PINNED:
151
				cudaHostAlloc((void **)&ctx->output_buffer, ctx->size, cudaHostAllocDefault);
152
				last_error = checkCudaError();
153
				break;
154
155
			default:
156
				// should never happen
157
				assert(0);
158
				last_error = GPU_ERR_MEM;
159
				break;
160
		}
161
	}
162
163
	return last_error;
164
}
165
166
///////////// This code will set the context buffer to the input buffer //////////////
167
gpu_error_t gpu_set_input( gpu_context_t *ctx, unsigned char *idata)
168
{
169
	assert( ctx || idata );
170
171
	cuda_set_input(ctx, idata);
172
	/*for(int i=0;i < (ctx->width)*(ctx->height);i++)
173
	{
174
		ctx->output_buffer[i * 4 + 0] = idata[i * 3 + 0];
175
		ctx->output_buffer[i * 4 + 1] = idata[i * 3 + 1];
176
		ctx->output_buffer[i * 4 + 2] = idata[i * 3 + 2];
177
	}*/
178
179
	return GPU_OK;
180
}
181
182
///////////// This code will set the input buffer to the context buffer //////////////
183
gpu_error_t gpu_get_output(gpu_context_t *ctx, unsigned char **output)
184
{
185
	assert( ctx );
186
	assert( output != NULL );
187
188
	// copy back the gpu buffer to host buffer
189
	cudaMemcpy(ctx->output_buffer, ctx->gpu_buffer, ctx->width * ctx->height * 4, cudaMemcpyDeviceToHost);
190
	last_error = checkCudaError();
191
	if ( last_error == GPU_OK )
192
		*output = ctx->output_buffer;
193
194
	return last_error;
195
}
196
197
///// This code will deallocate all the memory held by context, including the memory on GPU //////
198
void gpu_context_free( gpu_context_t *ctx)
199
{
200
	assert(ctx);
201
	switch ( ctx->mem_flag )
202
	{
203
		case GPU_MEMORY_HOST:
204
			free(ctx->output_buffer);
205
			cudaFreeHost(ctx->gpu_buffer);
206
			break;
207
208
		case GPU_MEMORY_PINNED_WRITE_COMBINED:
209
		case GPU_MEMORY_PINNED:
210
			cudaFreeHost(ctx->output_buffer);
211
			cudaFreeHost(ctx->gpu_buffer);
212
			break;
213
214
		default:
215
			// should never happen
216
			assert(0);
217
			break;
218
	}
219
	free(ctx);
220
}