forked from victoroliv2/CCL-GPU
-
Notifications
You must be signed in to change notification settings - Fork 0
/
distance_transform.cu
129 lines (95 loc) · 3.16 KB
/
distance_transform.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>
#include "pgm.h"
#define BLOCK_SIZE 256
__device__ float
distance(float x, float y, float ex, float ey) {
return sqrtf(powf(ex-x,2.0f)+powf(ey-y,2.0f));
}
__global__ void
euclidian_distance_transform(uchar2* img,
float2* dist, int w, int h) {
//each thread process 4 pixels
__shared__ uchar2 img_line [BLOCK_SIZE];
int i = blockIdx.x*blockDim.x + threadIdx.x;
float N = w*h;
int2 ox = {(2*i) % w, (2*i+1) % w};
int2 oy = {(2*i) / w, (2*i+1) / w};
if (2*i < N) {
float2 d = {N, N};
for(int bi=0; bi<gridDim.x; bi++) {
int nbi = (blockIdx.x+bi) % gridDim.x;
int target = nbi*blockDim.x+threadIdx.x;
//32-bit coalesced transaction
img_line[threadIdx.x] = img[target];
__syncthreads();
for (int k=0;k<blockDim.x;k++) {
float2 nd;
//let's back to the original reference system
int2 dx = {(2*target) % w, (2*target+1) % w};
int2 dy = {(2*target) / w, (2*target+1) / w};
//pixel 1 - x
//pixel 2 - y
nd.x = distance(float(ox.x),float(oy.x), float(dx.x),float(dy.x));
nd.y = distance(float(ox.y),float(oy.y), float(dx.y),float(dy.y));
uchar2 v = img_line[threadIdx.x];
if (v.x == 1 && nd.x < d.x) d.x = nd.x;
if (v.y == 1 && nd.y < d.y) d.y = nd.y;
}
}
//128-bit coalesced transaction
dist[i] = d;
}
}
int main(int argc, char* argv[]) {
cudaEvent_t start,stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
//float et;
cudaError_t err;
unsigned char *img;
img = load_ppm(argv[1], &w, &h);
unsigned char *d_img;
cudaMalloc((void**) &d_img, w*h*sizeof(unsigned char));
cudaMemcpy(d_img, img, w*h*sizeof(unsigned char), cudaMemcpyHostToDevice);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(err));
return 1;
}
fprintf(stderr, "GO!");
//-------------------
float* dist = (float*)malloc(w*h*sizeof(float));
float* d_dist;
cudaMalloc((void**) &d_dist, w*h*sizeof(float));
dim3 block (BLOCK_SIZE,1);
dim3 grid ((w*h+(2*BLOCK_SIZE-1))/(2*BLOCK_SIZE));
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(err));
return 1;
}
fprintf(stderr, "GO!");
euclidian_distance_transform
<<<grid, block>>> ((uchar2*)d_img, (float2*)d_dist, w, h);
cudaThreadSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(err));
return 1;
}
fprintf(stderr, "GO!");
cudaMemcpy(dist, d_dist, w*h*sizeof(float), cudaMemcpyDeviceToHost);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(err));
return 1;
}
cudaFree(d_img);
cudaFree(d_dist);
free(dist);
free(img);
return 0;
}