cuda takes 0.16 sec, included all process ex)upload, download, malloc, release..
cpu takes 0.00 sec.
cpu is faster than cuda??
Because processing is simple??
test by this source code..
cpu version
#include < iostream> #include < windows.h> #include < stdlib.h> #include "cuda.h" #include "cuda_runtime.h" #include "device_launch_parameters.h" #define N 50000 //vector size void add( int *a, int *b, int *c ) { int tid = 0; // this is CPU zero, so we start at zero while (tid < N) { c[tid] = a[tid] + b[tid]; tid += 1; // we have one CPU, so we increment by one } } int main( void ) { //for processing time measure unsigned long Atime=0, Btime=0; Atime = GetTickCount(); int a[N], b[N], c[N]; // fill the arrays 'a' and 'b' on the CPU for (int i=0; i< N; i++) { a[i] = -i; b[i] = i * i; } add( a, b, c ); // display the results /* for (int i=0; i< N; i++) { printf( "%d + %d = %d\n", a[i], b[i], c[i] ); } */ Btime = GetTickCount(); printf("%.20lf sec\n", (Btime - Atime)/1000.0 ); return 0; }cuda version
#include < iostream> #include < windows.h> #include < stdlib.h> #include "cuda.h" #include "cuda_runtime.h" #include "device_launch_parameters.h" #define N 50000 //vector size __global__ void add(int *a, int *b, int *c){ int tid = blockIdx.x; //block index that is set by gpu device. if(tid < N) c[tid] = a[tid] + b[tid]; } int main(void) { //for processing time measure unsigned long Atime=0, Btime=0; Atime = GetTickCount(); int a[N], b[N], c[N]; int *dev_a, *dev_b, *dev_c; //gpu mem allocation cudaMalloc( (void**)&dev_a, N*sizeof(int) ); cudaMalloc( (void**)&dev_b, N*sizeof(int) ); cudaMalloc( (void**)&dev_c, N*sizeof(int) ); //value alloc in cpu for(int i=0; i< N; ++i) { a[i] = -i; b[i] = i*i; } //value copy from cpu to gpu cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice); //add kernel call add<<< N, 1>>>(dev_a, dev_b, dev_c); //result value copy gpu -> cpu cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost) ; //result print //for(int i=0; i< N; ++i){ //printf("%d + %d = %d\n", a[i], b[i], c[i] ); //} //memory release cudaFree( dev_a ); cudaFree( dev_b ); cudaFree( dev_c ); Btime = GetTickCount(); printf("%.2lf sec\n", (Btime - Atime)/1000.0 ); }
cude example is introduced in here -> https://bitbucket.org/mrfright/cuda_by_example/src
No comments:
Post a Comment