jtootf

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include <stdio.h>
#include <stdlib.h>

/* Threads per block used for the 1-D kernel launch.
 * NOTE(review): 8 is smaller than a 32-thread warp, so most lanes of each warp
 * sit idle. Raise it only together with a correct bounds check in the kernel. */
#define BLOCK_SIZE 8
/* Number of float elements in the test array (78^3 = 474552).
 * Parenthesized so the macro behaves as a single value inside any expression
 * (e.g. `x / ARRAY_SIZE`, `x % ARRAY_SIZE`). */
#define ARRAY_SIZE (78*78*78)

/* Status of the most recent CUDA runtime call stored by the helper macros. */
cudaError_t CUerr;

/*
 * Report the error currently held in CUerr, if any.
 *
 * str - caller-supplied message printed in front of the CUDA error text.
 *       Declared const so string literals can be passed without a
 *       deprecated literal-to-`char *` conversion.
 *
 * Returns 1 (after printing "<str>, <CUDA error string>" to stderr) when
 * CUerr is not cudaSuccess, 0 otherwise.
 */
inline int CUERROR(const char *str){
	if(CUerr == cudaSuccess)
		return 0;
	fprintf(stderr, "%s, %s\n", str, cudaGetErrorString(CUerr));
	return 1;
}
/* Allocate `size` bytes with cudaMalloc and store the pointer in `var`.
 * The status is saved in CUerr; on failure the error is reported through
 * CUERROR and the program exits with status 1. */
#define CUALLOC(var, size)                              \
	do {                                                \
		CUerr = cudaMalloc((void**)&var, size);         \
		if (CUERROR("CUDA: can't allocate memory"))     \
			exit(1);                                    \
	} while (0)
/* Copy `size` bytes from device pointer `src` to host pointer `dest`
 * (cudaMemcpy, device-to-host). The status is saved in CUerr; on failure the
 * error is reported through CUERROR and the program exits with status 1. */
#define CUMOV2HOST(dest, src, size)                                     \
	do {                                                                \
		CUerr = cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost);    \
		if (CUERROR("CUDA: can't copy data to host"))                   \
			exit(1);                                                    \
	} while (0)
/* Copy `size` bytes from host pointer `src` to device pointer `dest`
 * (cudaMemcpy, host-to-device). The status is saved in CUerr; on failure the
 * error is reported through CUERROR and the program exits with status 1. */
#define CUMOV2DEV(dest, src, size)                                      \
	do {                                                                \
		CUerr = cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice);    \
		if (CUERROR("CUDA: can't copy data to device"))                 \
			exit(1);                                                    \
	} while (0)
/* Release the allocation held by `var` with cudaFree, then set `var` to NULL.
 * The cudaFree status is not inspected. */
#define CUFREE(var)         \
	do {                    \
		cudaFree(var);      \
		var = NULL;         \
	} while (0)

/*
 * Write each element's own index, converted to float, into data.
 *
 * Expects a 1-D launch with one thread per element; the global index is
 * computed from blockIdx.x, blockDim.x and threadIdx.x. Threads whose index
 * falls outside [0, ARRAY_SIZE) do nothing, so the grid may be rounded up.
 *
 * data - array the kernel writes to; must hold at least ARRAY_SIZE floats.
 */
__global__ void testKernel(float *data) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    // Valid indices are 0 .. ARRAY_SIZE-1, so ix == ARRAY_SIZE must be
    // rejected too; the previous `>` test let it write one past the end.
    if (ix >= ARRAY_SIZE) return;
    data[ix] = (float)ix;
}

/*
 * Test driver: allocates ARRAY_SIZE floats on host and device, zeroes the
 * device buffer, runs testKernel over it, copies the result back and prints
 * one value per line to stdout.
 *
 * Every CUDA call is checked; any failure prints a message to stderr and
 * exits with status 1. Returns 0 on success. argc/argv are unused.
 */
int main(int argc, char *argv[]) {
	int size = ARRAY_SIZE;
	/* byte count kept in size_t, the type malloc/cudaMalloc/cudaMemcpy take */
	size_t SZ = (size_t)size * sizeof(float);
	int i;
	float *devData = NULL;
	float *outData = (float*)malloc(SZ);
	if(!outData){
		fprintf(stderr, "Can't allocate outData\n");
		exit(1);
	}
	CUALLOC(devData, SZ);
	dim3 blockSize(BLOCK_SIZE);
	/* ceil-div so the grid covers every element when sizes don't divide evenly */
	int BLK = (ARRAY_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
	dim3 gridSize(BLK);
	CUerr = cudaMemset(devData, 0, SZ);
	if(CUERROR("CUDA: can't clear device memory")){
		exit(1);
	}
	testKernel<<<gridSize, blockSize>>>(devData);
	/* a launch returns no status itself; configuration errors surface here */
	CUerr = cudaGetLastError();
	if(CUERROR("CUDA: can't launch kernel")){
		exit(1);
	}
	/* cudaThreadSynchronize() is deprecated; this also reports execution faults */
	CUerr = cudaDeviceSynchronize();
	if(CUERROR("CUDA: kernel execution failed")){
		exit(1);
	}
	CUMOV2HOST(outData, devData, SZ);
	CUFREE(devData);
	for(i=0; i!=size; i++)
		printf("%g\n", outData[i]);
	free(outData);
	return 0;
}