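The CUDA version takes 0.16 sec, including every step (e.g. device malloc, upload, kernel launch, download, and release).
The CPU version takes 0.00 sec.
Is the CPU really faster than CUDA?
Is that because the processing is so simple?
Both versions were tested with the source code below.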
CPU version
#include < iostream>
#include < windows.h>
#include < stdlib.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 50000 //vector size
void add( int *a, int *b, int *c ) {
    // Sequential element-wise sum: writes a[k] + b[k] into c[k] for every
    // k in [0, N), walking the arrays one element at a time on a single core.
    for (int k = 0; k < N; ++k) {
        c[k] = a[k] + b[k];
    }
}
int main( void ) {
//for processing time measure
unsigned long Atime=0, Btime=0;
Atime = GetTickCount();
int a[N], b[N], c[N];
// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i< N; i++) {
a[i] = -i;
b[i] = i * i;
}
add( a, b, c );
// display the results
/*
for (int i=0; i< N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}
*/
Btime = GetTickCount();
printf("%.20lf sec\n", (Btime - Atime)/1000.0 );
return 0;
}
CUDA version
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 50000 //vector size
// Element-wise vector add on the GPU: c[i] = a[i] + b[i] for i in [0, N).
//
// a, b, c must be device pointers to at least N ints each.
// Launch layout: any 1D grid/block configuration is valid. Each thread starts
// at its global index and advances by the total number of launched threads,
// so every element is covered whether the launch is <<<N, 1>>>, a few large
// blocks, or a single thread. No shared memory is used.
__global__ void add(int *a, int *b, int *c){
    // total threads in the grid; computed in size_t so the product cannot overflow int
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += stride) {
        c[tid] = a[tid] + b[tid];
    }
}
// Stops the program with a readable message when a CUDA runtime call fails.
// 'what' names the call being checked so the failing step is identifiable.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills two N-element vectors on the host, adds them on the GPU and prints the
// elapsed wall-clock time in seconds. The measurement deliberately covers the
// whole round trip: device allocation, upload, kernel, download and release.
int main(void)
{
    // high-resolution timer; GetTickCount only advances every 10-16 ms
    // NOTE(review): the first CUDA runtime call typically also pays for one-time
    // runtime/context initialization, so that start-up cost is part of this figure.
    LARGE_INTEGER freq, start, stop;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&start);

    // static storage keeps the three N-int host arrays (~600 KB in total) off the stack
    static int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    //gpu mem allocation
    checkCuda(cudaMalloc( (void**)&dev_a, bytes ), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc( (void**)&dev_b, bytes ), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc( (void**)&dev_c, bytes ), "cudaMalloc(dev_c)");

    //value alloc in cpu
    for(int i=0; i< N; ++i)
    {
        a[i] = -i;
        // i * i exceeds INT_MAX once i > 46340; multiplying as unsigned makes
        // the result wrap instead of being signed-overflow undefined behaviour
        b[i] = (int)((unsigned int)i * (unsigned int)i);
    }

    //value copy from cpu to gpu
    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

    //add kernel call: N blocks of one thread each, i.e. one thread per element
    // NOTE(review): one thread per block leaves most of the GPU idle; a larger
    // block size is only valid if the kernel indexes with
    // blockIdx.x * blockDim.x + threadIdx.x.
    add<<< N, 1>>>(dev_a, dev_b, dev_c);
    // a launch does not return a status itself; fetch launch errors explicitly
    checkCuda(cudaGetLastError(), "add kernel launch");

    //result value copy gpu -> cpu (blocking copy, so the kernel has finished afterwards)
    checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    //result print
    //for(int i=0; i< N; ++i){
    //printf("%d + %d = %d\n", a[i], b[i], c[i] );
    //}

    //memory release
    checkCuda(cudaFree( dev_a ), "cudaFree(dev_a)");
    checkCuda(cudaFree( dev_b ), "cudaFree(dev_b)");
    checkCuda(cudaFree( dev_c ), "cudaFree(dev_c)");

    QueryPerformanceCounter(&stop);
    // ticks elapsed / ticks per second = seconds
    printf("%.2lf sec\n", (double)(stop.QuadPart - start.QuadPart) / (double)freq.QuadPart );
    return 0;
}
This CUDA example comes from here -> https://bitbucket.org/mrfright/cuda_by_example/src
No comments:
Post a Comment