3/12/2015

Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (2D)

This article explains how to compute the thread index when the blocks and threads are laid out in two dimensions.

Please refer to this page for how to index threads in the 1D case.
->http://feelmare.blogspot.com/2015/03/meaning-of-threadidx-blockidx-blockdim.html

If you launch a kernel like this:

dim3 blocks(2,3);   // grid size:  gridDim.x  = 2, gridDim.y  = 3 -> 6 blocks
dim3 threads(3,2);  // block size: blockDim.x = 3, blockDim.y = 2 -> 6 threads per block
Kernel<<< blocks, threads >>>

The threads are created as shown in the following figure.


36 threads are created (6 blocks x 6 threads per block), and gridDim and blockDim are (2,3) and (3,2) respectively.

The problem now is...
How do we access the 15th thread?
See this figure.



Do you understand?
We have to do an index calculation, because threadIdx.x and threadIdx.y only give the thread's position within its own block.

For more detail, refer to the figure below, which lists the result of the tid calculation for every thread.




#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// No space inside the angle brackets: the header is "stdio.h", not " stdio.h".
#include <stdio.h>

// Length of the host and device arrays. It is larger than the 36 threads the
// demo launches, so the last elements keep their initial value.
#define N 40

// Kernel: each thread stores its own flattened global index in c.
// The launch is read as a 2D grid of 2D blocks; threads whose flattened index
// is N or larger write nothing.
__global__ void increase(int *c){
 // Number of threads along x over the whole grid, i.e. the width of one row.
 const int rowWidth = blockDim.x * gridDim.x;

 // Grid-wide (column, row) position of this thread.
 const int col = blockIdx.x * blockDim.x + threadIdx.x;
 const int row = blockIdx.y * blockDim.y + threadIdx.y;

 // Row-major flattening of (col, row) into one index.
 const int tid = row * rowWidth + col;
 if(tid >= N)
  return;
 c[tid] = tid;
}



// Host driver for the 2D indexing demo: fills an N-element array with -1, lets
// every GPU thread overwrite the element matching its flattened index, then
// prints the array. Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
 int c[N];
 int *dev_c = NULL;
 cudaError_t err = cudaSuccess;
 const char *failed = NULL; // name of the first CUDA step that failed, if any

 // Elements that no thread writes keep this marker value.
 for(int i=0; i< N; ++i)
 {
  c[i] = -1;
 }

 if( (err = cudaMalloc( (void**)&dev_c, N*sizeof(int) )) != cudaSuccess )
  failed = "cudaMalloc";

 if( !failed && (err = cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice)) != cudaSuccess )
  failed = "cudaMemcpy (host to device)";

 if( !failed )
 {
  // 2 x 3 blocks of 3 x 2 threads = 36 threads in total.
  dim3 blocks(2,3);
  dim3 threads(3,2);
  increase<<< blocks, threads>>>(dev_c);

  // A kernel launch returns no status; launch errors must be queried explicitly.
  if( (err = cudaGetLastError()) != cudaSuccess )
   failed = "increase kernel launch";
 }

 // This blocking copy also reports errors raised while the kernel was running.
 if( !failed && (err = cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost )) != cudaSuccess )
  failed = "cudaMemcpy (device to host)";

 cudaFree( dev_c ); // a NULL pointer is accepted, so this is safe on every path

 if( failed )
 {
  fprintf(stderr, "%s failed: %s\n", failed, cudaGetErrorString(err));
  return 1;
 }

 for(int i=0; i< N; ++i)
 {
  printf("c[%d] = %d \n" ,i, c[i] );
 }

 return 0;
}

...


In the source code only 36 threads are created, so the 37th to 40th array elements (c[36] to c[39]) keep their initial value -1.





Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (1D)


When we first study CUDA, thread indexing is very confusing.
So I tried to sort it out.

First, let's get a feel for it by looking at this example.
...
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// No space inside the angle brackets: the header is "stdio.h", not " stdio.h".
#include <stdio.h>

// Length of the host and device arrays; larger than the 12 threads created by
// the <<< 4, 3 >>> launch, so the last elements keep their initial value.
#define N 15

// Kernel: every thread stores its global 1D index in the matching slot of c.
// Threads whose index is N or larger write nothing.
__global__ void increase(int *c){
 // Global index of the first thread of this block.
 const int blockStart = blockIdx.x * blockDim.x;
 const int tid = blockStart + threadIdx.x;

 if(tid >= N)
  return;
 c[tid] = tid;
}

// Host driver for the 1D indexing demo: fills an N-element array with -1, lets
// every GPU thread overwrite the element matching its global index, then prints
// the array. Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
 int c[N];
 int *dev_c = NULL;
 cudaError_t err = cudaSuccess;
 const char *failed = NULL; // name of the first CUDA step that failed, if any

 // Elements that no thread writes keep this marker value.
 for(int i=0; i< N; ++i)
 {
  c[i] = -1;
 }

 if( (err = cudaMalloc( (void**)&dev_c, N*sizeof(int) )) != cudaSuccess )
  failed = "cudaMalloc";

 if( !failed && (err = cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice)) != cudaSuccess )
  failed = "cudaMemcpy (host to device)";

 if( !failed )
 {
  // 4 blocks of 3 threads = 12 threads in total.
  increase<<< 4, 3>>>(dev_c);

  // A kernel launch returns no status; launch errors must be queried explicitly.
  if( (err = cudaGetLastError()) != cudaSuccess )
   failed = "increase kernel launch";
 }

 // This blocking copy also reports errors raised while the kernel was running.
 if( !failed && (err = cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost )) != cudaSuccess )
  failed = "cudaMemcpy (device to host)";

 cudaFree( dev_c ); // a NULL pointer is accepted, so this is safe on every path

 if( failed )
 {
  fprintf(stderr, "%s failed: %s\n", failed, cudaGetErrorString(err));
  return 1;
 }

 for(int i=0; i< N; ++i)
 {
  printf("c[%d] = %d \n" ,i, c[i] );
 }

 return 0;
}
...

The result of this example source is

In the source code, the kernel function (increase) is launched with <<< 4, 3 >>>, which creates 4 blocks of 3 threads each, i.e. 12 threads.
These 12 threads execute in parallel.
So each thread running the kernel needs to know: which thread am I?
threadIdx and blockIdx answer that question.

But we have to calculate the global thread index ourselves, because threadIdx and blockIdx index different spaces (a thread within its block, and a block within the grid).
For example: " int tid = threadIdx.x + blockIdx.x * blockDim.x; "

threadIdx gives the index of the current thread within its block.
blockIdx gives the index of the current block within the grid.

gridDim gives the number of blocks in the grid.
blockDim gives the number of threads in a block.


Are you more confused now?

Here is my final explanation. See this figure.




Next time, I will introduce 2D kernel.
http://study.marearts.com/2015/03/meaning-of-threadidx-blockidx-blockdim_12.html
Thank you.



3/10/2015

error : calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

If you meet this error while studying the Julia set example in chapter 4 of the book "CUDA by Example":

calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

Modify the constructor in the cuComplex structure like this:
cuComplex( float a, float b ) : r(a), i(b)  {}  -->  __device__ cuComplex( float a, float b ) : r(a), i(b)  {}


Hope to see julia set, beautiful~


50000 x 1 vector sum, gpu vs cpu processing time compare

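For the sum of two 50000 x 1 vectors:
CUDA takes 0.16 sec, including the whole process (upload, download, malloc, release, ...).
The CPU takes 0.00 sec.


Is the CPU faster than CUDA?
Yes, because the processing is so simple: one addition per element is far too little work to pay back the fixed GPU costs (context creation, memory allocation and host-device copies).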

Test it with this source code.

cpu version

#include < iostream>
#include < windows.h>
#include < stdlib.h>
 #include "cuda.h"  
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  



#define N 50000 //vector size

// Element-wise vector sum on the CPU: c[i] = a[i] + b[i] for the first N elements.
void add( int *a, int *b, int *c ) {
    // A single CPU thread walks the vectors one element at a time.
    for (int idx = 0; idx < N; ++idx) {
        c[idx] = a[idx] + b[idx];
    }
}

// CPU reference for the timing comparison: fills two N-element vectors, adds
// them with add(), and prints the elapsed wall-clock time in seconds.
int main( void ) {
 //for processing time measure
 // NOTE: GetTickCount only has a resolution of roughly 10-16 ms, so very short
 // runs are reported as 0.
 unsigned long Atime=0, Btime=0;
 Atime = GetTickCount();

    // Static storage: three N-element int arrays (about 600 KB) would use most
    // of the default 1 MB thread stack if they were automatic variables.
    static int a[N], b[N], c[N];

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i< N; i++) {
        a[i] = -i;
        b[i] = 2 * i;   // was i * i, which overflows int once i exceeds 46340
    }

    add( a, b, c );

    // display the results
 /*
    for (int i=0; i< N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }
 */

 Btime = GetTickCount();
 printf("%.20lf sec\n",  (Btime - Atime)/1000.0 );

    return 0;
}
cuda version
// Header names must not contain a space inside the angle brackets.
#include <iostream>
#include <windows.h>  // GetTickCount
#include <stdlib.h>
#include <stdio.h>    // printf (was used without its header)
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 50000 //vector size


// Kernel: element-wise vector sum, c[i] = a[i] + b[i] for i in [0, N).
//
// Works with any 1D launch configuration. Each thread starts at its global
// index and advances by the total number of launched threads until it passes N,
// so every element is handled exactly once however many blocks and threads are
// used. With one thread per block and N blocks this is exactly one element per
// block, as before.
__global__ void add(int *a, int *b, int *c){

 // Total number of threads in the grid = distance between two elements
 // handled by the same thread.
 int stride = blockDim.x * gridDim.x;

 for(int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += stride)
  c[tid] = a[tid] + b[tid];
}


// GPU side of the timing comparison: allocates device buffers, uploads two
// N-element vectors, adds them with the add kernel, downloads the result,
// releases the buffers and prints the elapsed wall-clock time in seconds.
// Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
 //for processing time measure
 // NOTE: GetTickCount only has a resolution of roughly 10-16 ms.
 unsigned long Atime=0, Btime=0;
 Atime = GetTickCount();

 // Static storage: three N-element int arrays (about 600 KB) would use most
 // of the default 1 MB thread stack if they were automatic variables.
 static int a[N], b[N], c[N];
 int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
 cudaError_t err = cudaSuccess;
 const char *failed = NULL; // name of the first CUDA step that failed, if any

 //value alloc in cpu
 for(int i=0; i< N; ++i)
 {
  a[i] = -i;
  b[i] = 2*i; // was i*i, which overflows int once i exceeds 46340
 }

 //gpu mem allocation
 if( (err = cudaMalloc( (void**)&dev_a, N*sizeof(int) )) != cudaSuccess )
  failed = "cudaMalloc(dev_a)";
 if( !failed && (err = cudaMalloc( (void**)&dev_b, N*sizeof(int) )) != cudaSuccess )
  failed = "cudaMalloc(dev_b)";
 if( !failed && (err = cudaMalloc( (void**)&dev_c, N*sizeof(int) )) != cudaSuccess )
  failed = "cudaMalloc(dev_c)";

 //value copy from cpu to gpu
 if( !failed && (err = cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice)) != cudaSuccess )
  failed = "cudaMemcpy(a, host to device)";
 if( !failed && (err = cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice)) != cudaSuccess )
  failed = "cudaMemcpy(b, host to device)";

 //add kernel call
 if( !failed )
 {
  // N blocks of one thread each: one block per vector element.
  add<<< N, 1>>>(dev_a, dev_b, dev_c);

  // A kernel launch returns no status; launch errors must be queried explicitly.
  if( (err = cudaGetLastError()) != cudaSuccess )
   failed = "add kernel launch";
 }

 //result value copy gpu -> cpu
 // This blocking copy also reports errors raised while the kernel was running.
 if( !failed && (err = cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost)) != cudaSuccess )
  failed = "cudaMemcpy(c, device to host)";

 //result print
 //for(int i=0; i< N; ++i){
  //printf("%d + %d = %d\n", a[i], b[i], c[i] );
 //}

 //memory release (a NULL pointer is accepted, so this is safe on every path)
 cudaFree( dev_a );
 cudaFree( dev_b );
 cudaFree( dev_c );

 if( failed )
 {
  fprintf(stderr, "%s failed: %s\n", failed, cudaGetErrorString(err));
  return 1;
 }

 Btime = GetTickCount();
 printf("%.2lf sec\n",  (Btime - Atime)/1000.0 );

 return 0;
}





The CUDA examples are available here -> https://bitbucket.org/mrfright/cuda_by_example/src

3/09/2015

First CUDA tutorial - CUDA toolkit 6.0 + Visual studio 2012 setting and example simple source


This tutorial assumes you have already installed Visual Studio 2012 (or another version).
You also have to install the CUDA toolkit from the NVIDIA homepage -> https://developer.nvidia.com/cuda-downloads
CUDA 6.5 is available now.

Don't worry: for this introduction, the exact version is not important.


After installing the CUDA toolkit and Visual Studio,
there are two ways to start a simple CUDA project.

The first one is a little bit complex:
Make a Win32 console project (empty option),
set the CUDA include and lib directories,
set the build options,
add a CUDA file,
and start coding...

See the this video.
https://www.youtube.com/watch?v=i43dUW4E-fE&feature=youtu.be



The other method is to use the CUDA runtime project template.
This is very easy.
See this video.
https://www.youtube.com/watch?v=j2wV4-IiGh4&feature=youtu.be


This is example source code

// Header names must not contain a space inside the angle brackets.
#include <iostream>
#include <stdio.h>    // printf (was used without its header)
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Kernel: adds the two integers a and b and stores the sum in the int that c
// points to.
__global__ void add(int a, int b, int*c)
{
 const int sum = a + b;
 c[0] = sum;
}

// Minimal CUDA program: computes 2 + 7 on the GPU with a single thread, copies
// the result back and prints it. Returns 0 on success and 1 if any CUDA call
// fails.
int main()
{

 int c = 0;
 int *dev_c = NULL;
 cudaError_t err = cudaSuccess;
 const char *failed = NULL; // name of the first CUDA step that failed, if any

 if( (err = cudaMalloc( (void**)&dev_c, sizeof(int) )) != cudaSuccess )
  failed = "cudaMalloc";

 if( !failed )
 {
  add<<< 1,1>>>(2,7, dev_c);

  // A kernel launch returns no status; launch errors must be queried explicitly.
  if( (err = cudaGetLastError()) != cudaSuccess )
   failed = "add kernel launch";
 }

 // This blocking copy also reports errors raised while the kernel was running.
 if( !failed && (err = cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost)) != cudaSuccess )
  failed = "cudaMemcpy (device to host)";

 cudaFree(dev_c); // a NULL pointer is accepted, so this is safe on every path

 if( failed )
 {
  fprintf(stderr, "%s failed: %s\n", failed, cudaGetErrorString(err));
  return 1;
 }

 printf(" 2+7 = %d \n", c);

 return 0;

}

As I am a CUDA beginner, I don't know which method is better.
But I think the first method is more useful for adding CUDA code to an existing project.

Also refer to this YouTube channel.
https://www.youtube.com/channel/UCBHcMCGaiJhv-ESTcWGJPcw

The CUDACasts playlist will be helpful.

Thank you.

To save mouse drag region to image file on the video frame. example source using opencv

This example source code saves a rectangular region, selected by dragging the mouse on a video frame, to an image file.

The method is easy.
First, enter the video file name including its extension (ex: s.avi).
Then the video starts playing.
Press the 'p' key in the video window to pause when you want to save an image, then drag the mouse over the region.
When the drag ends, the selected region is saved in your folder and the video resumes playing.
The 'ESC' key exits the program.

Thank you.

example video is here
-> https://www.youtube.com/watch?v=ZpO1b-lZb7g



///
// Header names must not contain a space inside the angle brackets.
#include <stdio.h>
#include <cstdlib>   // std::abs(int)
#include <iostream>

// Forward slash: works with every compiler, unlike a backslash in a header path.
#include <opencv2/opencv.hpp>

// MSVC-specific auto-linking of the OpenCV 2.4.9 import libraries
// (debug builds use the libraries with the "d" suffix).
#ifdef _DEBUG
#pragma comment(lib, "opencv_core249d.lib")
#pragma comment(lib, "opencv_highgui249d.lib")
#else
#pragma comment(lib, "opencv_core249.lib")
#pragma comment(lib, "opencv_highgui249.lib")
#endif

using namespace std;
using namespace cv;

// ---- State shared between main() and the mouse callback ----

// Frame currently displayed in the "Demo" window.
Mat image;

// Drag state: where the left button went down and the rectangle dragged so far.
Point origin;
Rect selection;
bool selectObject = false;   // true from left-button down until left-button up

// Playback state: toggled with the P key in main(), cleared on left-button up.
bool pause = false;

// Most recently saved region and its pixels.
Rect PatchRect;
Mat PatchImg;

// Values used to build the output file name (frame_index / fpss).
// frame_index is advanced by the main loop.
// NOTE(review): nothing visible here assigns fpss - confirm where it is set.
unsigned int frame_index = 0;
double fpss;

// Mouse callback for the "Demo" window.
//
// While the video is paused and the left button is held, tracks the dragged
// rectangle in `selection` (clipped to the image). When the button is released
// and the rectangle is larger than 5 x 5 pixels, copies that region of `image`
// into `PatchImg`, shows it in the "Selected Img" window and writes it to
// "<number>.jpg". Releasing the button always ends the drag and un-pauses.
//
// event - OpenCV mouse event code (only left button down/up are handled)
// x, y  - cursor position in image coordinates
// The flags and user-data arguments are unused.
static void onMouse( int event, int x, int y, int, void* )
{
 // Logical AND: both operands are bools (was a bitwise &).
 if( selectObject && pause )
 {
  // Rectangle spanned by the drag origin and the current cursor position.
  selection.x = MIN(x, origin.x);
  selection.y = MIN(y, origin.y);
  selection.width = std::abs(x - origin.x);
  selection.height = std::abs(y - origin.y);
  // Clip to the image so the region can be extracted safely.
  selection &= Rect(0, 0, image.cols, image.rows);
 }

 switch( event )
 {
 case CV_EVENT_LBUTTONDOWN:
  origin = Point(x,y);
  selection = Rect(x,y,0,0);
  selectObject = true;
  break;
 case CV_EVENT_LBUTTONUP:
  if(selectObject && pause)
  {
   if(selection.width > 5 && selection.height > 5 )
   {
    PatchRect = selection;
    image( PatchRect ).copyTo( PatchImg );
    imshow("Selected Img", PatchImg );

    // File number: frame index divided by fpss. When fpss is not a
    // positive value the division would be by zero, so fall back to the
    // raw frame index.
    int fileNumber = (fpss > 0) ? int(frame_index/fpss) : int(frame_index);

    char str[100];
    sprintf_s(str,"%d.jpg", fileNumber);
    imwrite(str, PatchImg);

   }else
    selection = Rect(0,0,0,0); // too small: discard the selection
  }
  selectObject = false;
  pause = false;

  break;
 }
}


int main (void)  
{  


 printf("avi file name?");
 char nstr[255];
 scanf_s("%s", nstr);
 printf("-> %s", nstr);

 VideoCapture cap(nstr); 

 Mat frame;
 namedWindow( "Demo", 0 );
 setMouseCallback( "Demo", onMouse, 0 );
 printf("P key is pause, ESC key is exit.\n");

 for(;;)
 {
  frame_index++;

  if(!pause)
   cap >> frame;
  if( frame.empty() )
   break;
  frame.copyTo(image);


  if( pause && selection.width > 0 && selection.height > 0 )
  {
   rectangle(image, Point(selection.x-1, selection.y-1), Point(selection.x+selection.width+1, selection.y+selection.height+1), CV_RGB(255,0,0) );
  }

  imshow( "Demo", image );

  char k = waitKey(10);

  if( k == 27 )
   break;
  else if(k == 'p' || k=='P' )
   pause=!pause;
 }

 return 0;  
}  


...