## 3/12/2015

### Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (2D)

If you make kernel like that

dim3 blocks(2,3);

and launch it together with `dim3 threads(3,2)` as `increase<<<blocks, threads>>>(...)`, then 36 threads are created in total: gridDim is (2,3) and blockDim is (3,2).

The problem now is the index calculation.
See this figure:

Do you understand?
We have to do an indexing calculation, because threadIdx.x and threadIdx.y only indicate the index within their own block.

For more detail, refer to the figure below, which shows the list of computed tid values.

```cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#define N 40

// Write each thread's flattened global index into c.
// Expects a 2D grid of 2D blocks; out-of-range threads do nothing.
__global__ void increase(int *c){
    // Global 2D coordinates of this thread.
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    // Row-major flattening: one row spans gridDim.x * blockDim.x columns.
    const int tid = row * (gridDim.x * blockDim.x) + col;
    if (tid < N) {
        c[tid] = tid;
    }
}

// Host driver for the 2D-indexing demo: fills c with -1, runs the kernel
// with a 2x3 grid of 3x2 blocks (36 threads), and prints all N=40 slots so
// the four untouched elements still show the sentinel -1.
int main(void)
{
    int c[N];
    int *dev_c;

    cudaMalloc( (void**)&dev_c, N*sizeof(int) );

    // Sentinel value so elements no thread writes are recognizable.
    for(int i=0; i< N; ++i)
    {
        c[i] = -1;
    }

    cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

    dim3 blocks(2,3);
    dim3 threads(3,2);
    // BUG FIX: the original listing declared the launch configuration but
    // never actually launched the kernel, so every element stayed -1.
    increase<<<blocks, threads>>>(dev_c);

    // Blocking copy also synchronizes with the kernel before we read c.
    cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

    for(int i=0; i< N; ++i)
    {
        printf("c[%d] = %d \n" ,i, c[i] );
    }

    cudaFree( dev_c );
    return 0;
}

```
...

In the source code only 36 threads are created, so the 37th, 38th, 39th, and 40th array elements keep their initial value of -1.

### Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (1D)

When we first study CUDA, thread indexing is very confusing.
So I will try to clean it up here.

First, Let's grab a sense of looking at this example
...
```cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#define N 15

// Store each thread's global 1D index into c; threads past N do nothing.
__global__ void increase(int *c){
    // Flatten (block, thread) into a single global index.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < N) {
        c[idx] = idx;
    }
}

// Host driver for the 1D-indexing demo: 4 blocks x 3 threads = 12 threads
// write their indices; elements 12..N-1 keep the sentinel -1.
int main(void)
{
    int c[N];
    int *dev_c;

    // Device storage for N ints.
    cudaMalloc( (void**)&dev_c, N*sizeof(int) );

    // Pre-fill with a sentinel so untouched slots are visible in the output.
    for (int i = 0; i < N; ++i)
        c[i] = -1;

    cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

    // 12 threads total; the kernel itself guards against tid >= N.
    increase<<< 4, 3>>>(dev_c);

    // Blocking copy; also waits for the kernel to finish.
    cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

    for (int i = 0; i < N; ++i)
        printf("c[%d] = %d \n" ,i, c[i] );

    cudaFree( dev_c );
}
```
...

The result of this example source is

In the source code, the kernel function (increase) is launched with <<< 4, 3 >>>, which means 12 threads are created (4 blocks of 3 threads each).
12 threads are executed at the same time.
So each thread in the kernel function needs to know: which thread am I?
The answer comes from threadIdx and blockIdx.

But we have to calculate a global thread index, because threadIdx and blockIdx are indices in different spaces,
like this: " int tid = threadIdx.x + blockIdx.x * blockDim.x; "

blockIdx tells current block index.

gridDim tells the number of blocks in a grid.
blockDim tells the number of threads in a block.

Are you even more confused?

Here is my final explanation — see this figure.

Next time, I will introduce 2D kernel.
Thank you.

## 3/10/2015

### error : calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

If you meet this error while studying the Julia set example in chapter 4 of the "CUDA by Example" book:

calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

modify like this in cuComplex structure
cuComplex( float a, float b ) : r(a), i(b)  {}  -->  __device__ cuComplex( float a, float b ) : r(a), i(b)  {}

Hope to see julia set, beautiful~

### 50000 x 1 vector sum, gpu vs cpu processing time compare

in (50000 x 1 + 50000 x 1) vector sum
cpu takes 0.00 sec.

Is the CPU faster than CUDA??
Perhaps because the computation is so simple??

test by this source code..

cpu version

```cpp
#include <iostream>
#include <windows.h>
#include <stdlib.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 50000 //vector size

/* Element-wise vector sum on a single CPU core: c[i] = a[i] + b[i]
   for every i in [0, N). */
void add( int *a, int *b, int *c ) {
    /* One core, so a plain sequential loop over all N elements. */
    for (int i = 0; i < N; ++i) {
        c[i] = a[i] + b[i];
    }
}

/* CPU timing driver: fills a and b, sums them into c, and reports the
   elapsed wall time. Note GetTickCount has ~10-16 ms resolution, so a
   very short run can legitimately report 0.00 sec. */
int main( void ) {
//for processing time measure
unsigned long Atime=0, Btime=0;
Atime = GetTickCount();

int a[N], b[N], c[N];

// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i< N; i++) {
a[i] = -i;
b[i] = i * i;
}

/* BUG FIX: the original listing never called add(), so nothing was
   actually timed and 'c' stayed uninitialized. */
add( a, b, c );

// display the results
/*
for (int i=0; i< N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}
*/

Btime = GetTickCount();
printf("%.20lf sec\n",  (Btime - Atime)/1000.0 );

return 0;
}
```
cuda version
```cpp
#include <iostream>
#include <windows.h>
#include <stdlib.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 50000 //vector size

// Element-wise vector sum, one block per element: with an <<<N, 1>>>
// launch, blockIdx.x directly identifies the element this block handles.
__global__ void add(int *a, int *b, int *c){
    const int i = blockIdx.x;
    if (i < N) {
        c[i] = a[i] + b[i];
    }
}

// GPU timing driver: copies a and b to the device, sums them with the
// add kernel, copies the result back, and reports the elapsed wall time
// (including allocation and transfer overhead, which dominates here).
int main(void)
{
//for processing time measure
unsigned long Atime=0, Btime=0;
Atime = GetTickCount();

int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;

//gpu mem allocation
cudaMalloc( (void**)&dev_a, N*sizeof(int) );
cudaMalloc( (void**)&dev_b, N*sizeof(int) );
cudaMalloc( (void**)&dev_c, N*sizeof(int) );

//value alloc in cpu
for(int i=0; i< N; ++i)
{
a[i] = -i;
b[i] = i*i;
}

//value copy from cpu to gpu
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);

// BUG FIX: the original listing never launched the kernel, so dev_c was
// copied back uninitialized. One block per element matches the kernel's
// use of blockIdx.x as the element index.
add<<<N, 1>>>(dev_a, dev_b, dev_c);

//result value copy gpu -> cpu (blocking; also waits for the kernel)
cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost) ;

//result print
//for(int i=0; i< N; ++i){
//printf("%d + %d = %d\n", a[i], b[i], c[i] );
//}

//memory release
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );

Btime = GetTickCount();
printf("%.2lf sec\n",  (Btime - Atime)/1000.0 );
}

```

The CUDA examples are introduced here -> https://bitbucket.org/mrfright/cuda_by_example/src

## 3/09/2015

### First CUDA tutorial - CUDA toolkit 6.0 + Visual studio 2012 setting and example simple source

You have already installed Visual studio 2012 or other version.
And, you have to install CUDA toolkit, go to the nvidia homepage -> https://developer.nvidia.com/cuda-downloads
Now cuda 6.5 is available.

Don't worry, In this introduction, version is not important.

After install cuda toolkit and Visual studio.
There are two options to start a simple cuda project.

One is little bit complex.
Make a win32 console project (empty option)
set cuda include and lib directory.
set build option
and coding...

See this video.

Another method is using cuda runtime project.
This is very easy.
See this video.

This is example source code

```cpp
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Single-thread kernel: store the sum of the two scalar arguments in *c.
__global__ void add(int a, int b, int*c)
{
    c[0] = a + b;
}

// Minimal end-to-end CUDA example: compute 2+7 on the GPU and print it.
int main()
{

int c;
int *dev_c;
cudaMalloc( (void**)&dev_c, sizeof(int) );

// BUG FIX: the original listing never launched the kernel, so the value
// printed below was whatever happened to be in freshly allocated device
// memory. One block with one thread is enough for a scalar sum.
add<<<1, 1>>>(2, 7, dev_c);

// Blocking copy; also waits for the kernel to finish.
cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);

printf(" 2+7 = %d \n", c);

cudaFree(dev_c);

return 0;

}
```

As I am cuda beginner, I don't know which method is good.
But I think first method is more useful for adding cuda code into existing project.

And refer to this youtube channel.

cudacast playlist will be efficient.

Thank you.

### To save mouse drag region to image file on the video frame. example source using opencv

This example source code saves an image file of a rectangular region specified by dragging the mouse on a video frame.

The method is easy.
Firstly, enter the file name including the file extension (e.g., s.avi).
Then the video will play.
Press the 'p' key on the video window when you want to save an image.
The video will resume playing after the drag ends, and the dragged region will be saved in your folder.
'ESC' key is for program finish.

Thank you.

example video is here

///
```cpp
#include <stdio.h>
#include <iostream>

#include <opencv2\opencv.hpp>

#ifdef _DEBUG
#pragma comment(lib, "opencv_core249d.lib")
#pragma comment(lib, "opencv_highgui249d.lib")
#else
#pragma comment(lib, "opencv_core249.lib")
#pragma comment(lib, "opencv_highgui249.lib")
#endif

using namespace std;
using namespace cv;

bool selectObject = false;
Rect selection;
Point origin;
Mat image;
bool pause =false;
double fpss;

Rect PatchRect;
Mat PatchImg;

unsigned int frame_index=0;

// Mouse callback: while the video is paused, lets the user drag a
// rectangle on the frame; on button-up the selected patch is shown and
// written to "<seconds>.jpg". Uses the file-scope globals selection,
// origin, image, pause, fpss, PatchRect, PatchImg, frame_index.
static void onMouse( int event, int x, int y, int, void* )
{
// Track the drag rectangle only while paused and a drag is in progress.
// BUG FIX: was bitwise '&'; logical '&&' expresses the intent (and
// short-circuits). For bools the result happens to be the same.
if( selectObject && pause)
{

// Normalize so the rect is valid whichever direction the user drags,
// then clamp it to the image bounds.
selection.x = MIN(x, origin.x);
selection.y = MIN(y, origin.y);
selection.width = std::abs(x - origin.x);
selection.height = std::abs(y - origin.y);
selection &= Rect(0, 0, image.cols, image.rows);
}

switch( event )
{
case CV_EVENT_LBUTTONDOWN:
origin = Point(x,y);
selection = Rect(x,y,0,0);
selectObject = true;
break;
case CV_EVENT_LBUTTONUP:
if(selectObject && pause)
{
// Ignore accidental tiny drags.
if(selection.width > 5 && selection.height > 5 )
{
PatchRect = selection;
image( PatchRect ).copyTo( PatchImg );
imshow("Selected Img", PatchImg );

// BUG FIX: 'str' was a single char, so sprintf_s wrote past it.
// Use a real buffer and pass its size. Also guard against
// fpss == 0 (FPS unavailable) to avoid dividing by zero; in that
// case fall back to the raw frame index for the file name.
char str[64];
int sec = (fpss > 0.0) ? int(frame_index/fpss) : int(frame_index);
sprintf_s(str, sizeof(str), "%d.jpg", sec);
imwrite(str, PatchImg);

}else
selection = Rect(0,0,0,0);
}
selectObject = false;
pause = false;

break;
}
}

int main (void)
{

printf("avi file name?");
char nstr;
scanf_s("%s", nstr);
printf("-> %s", nstr);

VideoCapture cap(nstr);

Mat frame;
namedWindow( "Demo", 0 );
setMouseCallback( "Demo", onMouse, 0 );
printf("P key is pause, ESC key is exit.\n");

for(;;)
{
frame_index++;

if(!pause)
cap >> frame;
if( frame.empty() )
break;
frame.copyTo(image);

if( pause && selection.width > 0 && selection.height > 0 )
{
rectangle(image, Point(selection.x-1, selection.y-1), Point(selection.x+selection.width+1, selection.y+selection.height+1), CV_RGB(255,0,0) );
}

imshow( "Demo", image );

char k = waitKey(10);

if( k == 27 )
break;
else if(k == 'p' || k=='P' )
pause=!pause;
}

return 0;
}

```
...