
Summing two 50000 x 1 vectors: GPU vs. CPU processing-time comparison

For the sum of two 50000 x 1 vectors (50000 x 1 + 50000 x 1):
CUDA takes 0.16 sec, including every step of the process (e.g. upload, download, malloc, release, ...).
The CPU takes 0.00 sec.

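Is the CPU really faster than CUDA?
Is it because the processing is so simple?
Most likely yes: with only one addition per element, the fixed costs of the GPU path (device initialization, memory allocation, and the host-to-device and device-to-host copies) outweigh the computation itself.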

I tested this with the source code below.

CPU version

#include < iostream>
#include < windows.h>
#include < stdlib.h>
 #include "cuda.h"  
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  

#define N 50000 //vector size

/*
 * CPU reference implementation of the element-wise vector sum.
 *
 * Writes c[i] = a[i] + b[i] for every i in [0, N), one element at a time.
 *
 * a, b: input arrays holding at least N ints each
 * c:    output array with room for at least N ints
 */
void add( int *a, int *b, int *c ) {
    // A single CPU thread walks the whole range, starting at index zero.
    for (int tid = 0; tid < N; ++tid) {
        // Add in unsigned arithmetic so that a result outside the int range
        // wraps around instead of being signed-overflow undefined behaviour.
        c[tid] = (int)((unsigned int)a[tid] + (unsigned int)b[tid]);
    }
}

/*
 * CPU benchmark driver: fills two N-element vectors, sums them with add(),
 * prints every result line and finally the elapsed wall-clock time.
 *
 * The timed region spans everything between the two GetTickCount() calls,
 * i.e. it includes filling the inputs and printing all N result lines.
 */
int main( void ) {
    // for processing time measure (GetTickCount counts milliseconds)
    unsigned long Atime=0, Btime=0;
    Atime = GetTickCount();

    // Static storage: three N-element int arrays (about 600 KB for N = 50000)
    // are too large to place comfortably on the thread stack.
    static int a[N], b[N], c[N];

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i< N; i++) {
        a[i] = -i;
        // i * i exceeds INT_MAX once i >= 46341, so multiply as unsigned and
        // convert back: the stored value wraps instead of being undefined.
        b[i] = (int)((unsigned int)i * (unsigned int)i);
    }

    add( a, b, c );

    // display the results
    for (int i=0; i< N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    Btime = GetTickCount();
    printf("%.20lf sec\n",  (Btime - Atime)/1000.0 );

    return 0;
}
CUDA version
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 50000 //vector size

/*
 * GPU kernel for the element-wise vector sum: c[i] = a[i] + b[i], i in [0, N).
 *
 * Launch layout: any 1-D grid and 1-D block size. Each thread starts at its
 * global index and advances by the total number of launched threads, so the
 * result is the same for a <<<N, 1>>> launch (one single-thread block per
 * element) as for a launch with multi-thread blocks.
 * Shared memory: none.
 *
 * a, b: device pointers to at least N ints each (inputs)
 * c:    device pointer to at least N ints (output)
 */
__global__ void add(int *a, int *b, int *c){
    // Total number of threads in the grid; size_t keeps the product from
    // overflowing for large launch configurations.
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    // Combine block and thread index so every thread of a block gets its own
    // element; the loop bound doubles as the out-of-range guard.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < (size_t)N;
         i += stride) {
        // Unsigned addition: an out-of-range sum wraps instead of being
        // signed-overflow undefined behaviour.
        c[i] = (int)((unsigned int)a[i] + (unsigned int)b[i]);
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * GPU benchmark driver: allocates device buffers, uploads two N-element
 * vectors, runs the add kernel, downloads the result, frees the buffers and
 * prints the elapsed wall-clock time.
 *
 * The timed region spans everything between the two GetTickCount() calls, so
 * it covers allocation, both transfer directions and the release of device
 * memory in addition to the kernel itself.
 */
int main(void)
{
    // for processing time measure (GetTickCount counts milliseconds)
    unsigned long Atime=0, Btime=0;
    Atime = GetTickCount();

    // Static storage: three N-element int arrays (about 600 KB for N = 50000)
    // are too large to place comfortably on the thread stack.
    static int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    // gpu mem allocation
    checkCuda(cudaMalloc( (void**)&dev_a, bytes ), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc( (void**)&dev_b, bytes ), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc( (void**)&dev_c, bytes ), "cudaMalloc(dev_c)");

    // value alloc in cpu
    for (int i=0; i< N; ++i) {
        a[i] = -i;
        // i * i exceeds INT_MAX once i >= 46341, so multiply as unsigned and
        // convert back: the stored value wraps instead of being undefined.
        b[i] = (int)((unsigned int)i * (unsigned int)i);
    }

    // value copy from cpu to gpu
    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "upload of a");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "upload of b");

    // add kernel call: N blocks of one thread each, which is the layout a
    // kernel indexing by block index alone requires.
    // NOTE(review): single-thread blocks leave most of the GPU idle; moving to
    // multi-thread blocks is only valid if the kernel derives its index from
    // blockIdx.x * blockDim.x + threadIdx.x.
    add<<< N, 1>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns no status itself; query it explicitly.
    checkCuda(cudaGetLastError(), "add kernel launch");

    // result value copy gpu -> cpu
    checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "download of c");

    // result print
    //for(int i=0; i< N; ++i){
    //    printf("%d + %d = %d\n", a[i], b[i], c[i] );
    //}

    // memory release
    checkCuda(cudaFree( dev_a ), "cudaFree(dev_a)");
    checkCuda(cudaFree( dev_b ), "cudaFree(dev_b)");
    checkCuda(cudaFree( dev_c ), "cudaFree(dev_c)");

    Btime = GetTickCount();
    printf("%.2lf sec\n",  (Btime - Atime)/1000.0 );

    return 0;
}

The CUDA example is taken from here -> https://bitbucket.org/mrfright/cuda_by_example/src

