## 3/24/2015

### Image warping (using opencv findHomography, warpPerspective)

fig 1. Left: set 4 points (Left Top, Right Top, Right Bottom, Left Bottom), right:warped image to (0,0) (300,0), (300,300), (0,300)

Firstly, we have to know Homography matrix for image warping.
A homography matrix is that the converting matrix can transform from A plane to B plane in 3D space.
See more detail about Homography in here
http://en.wikipedia.org/wiki/Homography_%28computer_vision%29

So, as the above equation, H matrix convert A matrix to B matrix.
In here, A is left, B is right 4 points in fig 1.

In OpenCV function, findHomography function gives H matrix.
Input parameter is findHomography(A, B). Do not confuse.

After get H matrix, we can warp image using various transform functions in opencv.
In this example, I use warpPerspective function, because rectangle shape is a trapezoidal model.
Input parameter is warpPerspective(Origin_mage, warped_image, H, cv::Size(cols, rows));

see the test video of this example source code in here

In source code, actually to get homography and warping part is 88 ~ 108 lines.
And 109~142 lines are the part for calculated value confirm.
Left code is for interface and selection point ordering.
http://study.marearts.com/2015/03/any-4-points-odering-by-lefttop.html

```#include < opencv2\opencv.hpp>
#include < string>
#include < stdio.h>

#ifdef _DEBUG
#pragma comment(lib, "opencv_core249d.lib")
#pragma comment(lib, "opencv_imgproc249d.lib")   //MAT processing
#pragma comment(lib, "opencv_highgui249d.lib")
#pragma comment(lib, "opencv_calib3d249d.lib")
#else
#pragma comment(lib, "opencv_core249.lib")
#pragma comment(lib, "opencv_highgui249.lib")
#pragma comment(lib, "opencv_calib3d249.lib")
#endif

using namespace std;
using namespace cv;

static void onMouse( int event, int x, int y, int, void* );
Point2f roi4point[4]={0,};
int roiIndex=0;
bool oksign = false;

Point2f MinDistFind(float x, float y, Point2f* inPoints);
void PointOrderbyConner(Point2f* inPoints, int w, int h );

int main()
{
char fileName[100] = "./chess.jpg";

//origin
Mat GetImg = imread( fileName );
//copy for drawing
Mat RoiImg;

//window
namedWindow( "set roi by 4 points", 0 );

//mouse callback
setMouseCallback( "set roi by 4 points", onMouse, 0 );

//point selection until 4 points setting
while(1)
{

if(oksign == true) //right button click
break;

//draw point
RoiImg = GetImg.clone();
for(int i=0; i< roiIndex; ++i)
circle(RoiImg, roi4point[i], 5,CV_RGB(255,0,255),5);
imshow("set roi by 4 points", RoiImg);

waitKey(10);
}

printf("points ordered by LT, RT, RB, LB \n");
PointOrderbyConner(roi4point, GetImg.size().width,  GetImg.size().height);
for(int i=0; i< 4; ++i)
{
printf("[%d] (%.2lf, %.2lf) \n",i, roi4point[i].x, roi4point[i].y );
}

//drwaring
RoiImg = GetImg.clone();
string TestStr[4]={"LT","RT","RB","LB"};
putText(RoiImg, TestStr[0].c_str(), roi4point[0], CV_FONT_NORMAL, 1, Scalar(0,0,255),3);
circle(RoiImg, roi4point[0], 3,CV_RGB(0,0,255));
int i;
for(i=1; i< roiIndex; ++i)
{
line(RoiImg, roi4point[i-1], roi4point[i], CV_RGB(255,0,0),1 );
circle(RoiImg, roi4point[i], 1,CV_RGB(0,0,255),3);
putText(RoiImg, TestStr[i].c_str(), roi4point[i], CV_FONT_NORMAL, 1, Scalar(0,0,255),3);
}

line(RoiImg, roi4point[0], roi4point[i-1], CV_RGB(255,0,0),1 );
imshow("set roi by 4 points2", RoiImg);

//prepare to get homography matrix
vector< Point2f> P1; //clicked positions
vector< Point2f> P2(4); //user setting positions
for(int i=0; i< 4; ++i)
P1.push_back( roi4point[i] );

//user setting position
P2[0].x = 0; P2[0].y = 0;
P2[1].x = 300; P2[1].y = 0;
P2[2].x = 300; P2[2].y = 300;
P2[3].x = 0; P2[3].y = 300;

//get homography
Mat H = findHomography(P1, P2);

//warping
Mat warped_image;
warpPerspective(GetImg, warped_image, H,cv::Size(GetImg.cols, GetImg.rows));
rectangle(warped_image, Point(0,0), Point(300,300), CV_RGB(255,0,0) );
imshow("warped_image", warped_image);

///////////////////////////
//calculation confirm
cout << "h" << endl << H << endl;
cout << "size rows and cols " << H.rows << " " << H.cols << endl;

Mat A(3,4,CV_64F); //3xN, P1
Mat B(3,4,CV_64F); //3xN, P2
//B = H*A  (P2 = h(P1))

for(int i=0; i< 4; ++i)
{
A.at< double>(0,i) = P1[i].x;
A.at< double>(1,i) = P1[i].y;
A.at< double>(2,i) = 1;

B.at< double>(0,i) = P2[i].x;
B.at< double>(1,i) = P2[i].y;
B.at< double>(2,i) = 1;
}

cout << "a" << endl << A << endl;
cout << "b" << endl << B << endl;
Mat HA = H*A;

for(int i=0; i< 4; ++i)
{
HA.at< double>(0,i) /= HA.at< double>(2,i);
HA.at< double>(1,i) /= HA.at< double>(2,i);
HA.at< double>(2,i) /= HA.at< double>(2,i);
}

cout << "HA" << endl << HA << endl;

waitKey(0);
}

void PointOrderbyConner(Point2f* inPoints, int w, int h )
{

vector< pair< float, float> > s_point;
for(int i=0; i< 4; ++i)
s_point.push_back( make_pair(inPoints[i].x, inPoints[i].y) );

//sort
sort(s_point.begin(), s_point.end(), [](const pair< float, float>& A, const pair< float, float>& B){ return A.second < B.second; } );

if( s_point[0].first < s_point[1].first )
{
inPoints[0].x = s_point[0].first;
inPoints[0].y = s_point[0].second;

inPoints[1].x = s_point[1].first;
inPoints[1].y = s_point[1].second;

}else{
inPoints[0].x = s_point[1].first;
inPoints[0].y = s_point[1].second;

inPoints[1].x = s_point[0].first;
inPoints[1].y = s_point[0].second;
}

if( s_point[2].first > s_point[3].first )
{
inPoints[2].x = s_point[2].first;
inPoints[2].y = s_point[2].second;

inPoints[3].x = s_point[3].first;
inPoints[3].y = s_point[3].second;

}else{
inPoints[2].x = s_point[3].first;
inPoints[2].y = s_point[3].second;

inPoints[3].x = s_point[2].first;
inPoints[3].y = s_point[2].second;
}

}

static void onMouse( int event, int x, int y, int, void* )
{

if( event == CV_EVENT_LBUTTONDOWN && oksign==false)
{
//4 point select
if(roiIndex>=4)
{
roiIndex=0;
for(int i=0; i< 4; ++i)
roi4point[i].x = roi4point[i].y =0;
}

roi4point[roiIndex].x = x;
roi4point[roiIndex].y = y;

//point coordinate print
printf("-(%..2lf,%.2lf), 2:(%.2lf,%.2lf), 3:(%.2lf,%.2lf), 4:(%.2lf,%.2lf)\n",
roi4point[0].x, roi4point[0].y,roi4point[1].x, roi4point[1].y,roi4point[2].x, roi4point[2].y,roi4point[3].x, roi4point[3].y );

roiIndex++;
}

if(event == CV_EVENT_RBUTTONDOWN)
{
//set point.
if(roiIndex == 4)
{
oksign = true;
printf("Warping Start!!!\n");
}
}

}
```

This is matlab source code for confirm.
x1 is clicked 4 point in opencv(I did value copy into matlab), matlabH is calculated by homography2d function. (refer to peter homepage for this function detail http://www.csse.uwa.edu.au/~pk/Research/MatlabFns/index.html)
x2 is calculate exactly when matlabH*x1.

I try in opencv with same values of x1, x2.
opencvH is calculated value from opencv source code.
Value is slightly different. Because scaling, OpenCV H and Matlab H will be same when (3,3) value will be divided by equal to 1.

```clc;
clear all;

x1 =[259 126 1; 566 222 1; 400 473 1; 33 305 1]';
x2 =[0 0 1; 300 0 1; 300 300 1; 0 300 1]';

matlabH = homography2d(x1, x2)

matlab_X2= matlabH*x1;
matlab_X2(:,1) = matlab_X2(:,1)/matlab_X2(3,1);
matlab_X2(:,2) = matlab_X2(:,2)/matlab_X2(3,2);
matlab_X2(:,3) = matlab_X2(:,3)/matlab_X2(3,3);
matlab_X2(:,4) = matlab_X2(:,4)/matlab_X2(3,4);

matlab_X2

opencvH = [1.021877004679779, 1.290191078534245, -427.2302201073777;
-0.6109166533338892, 1.953660547640664, -87.93381578924605;
5.540800373074552e-006, 0.002051557898988468, 1]

opencv_x2 = opencvH * x1;
opencv_x2(:,1) = opencv_x2(:,1)/opencv_x2(3,1);
opencv_x2(:,2) = opencv_x2(:,2)/opencv_x2(3,2);
opencv_x2(:,3) = opencv_x2(:,3)/opencv_x2(3,3);
opencv_x2(:,4) = opencv_x2(:,4)/opencv_x2(3,4);
opencv_x2
```

## 3/16/2015

### any 4 points odering by leftTop, RightTop, RightBottom, LeftBottom

We may wish to know the order in lt, rt, rb, lb, for any of the four points.
This example is that for this case.

And see the source code operation in video

...
```//#include < time.h>
#include < opencv2\opencv.hpp>
//#include < opencv2\gpu\gpu.hpp>
//#include < opencv2\stitching\detail\matchers.hpp >
#include < string>
#include < stdio.h>
//#include < queue>

#ifdef _DEBUG
#pragma comment(lib, "opencv_core249d.lib")
//#pragma comment(lib, "opencv_imgproc249d.lib")   //MAT processing
//#pragma comment(lib, "opencv_gpu249d.lib")
#pragma comment(lib, "opencv_highgui249d.lib")
//#pragma comment(lib, "opencv_objdetect249d.lib")
//#pragma comment(lib, "opencv_calib3d249d.lib")
//#pragma comment(lib, "opencv_nonfree249d.lib")
//#pragma comment(lib, "opencv_features2d249d.lib")
#else
#pragma comment(lib, "opencv_core249.lib")
//#pragma comment(lib, "opencv_imgproc249.lib")
//#pragma comment(lib, "opencv_gpu249.lib")
#pragma comment(lib, "opencv_highgui249.lib")
//#pragma comment(lib, "opencv_objdetect249.lib")
//#pragma comment(lib, "opencv_calib3d249.lib")
//#pragma comment(lib, "opencv_nonfree249.lib")
//#pragma comment(lib, "opencv_features2d249.lib")
#endif

using namespace std;
using namespace cv;

static void onMouse( int event, int x, int y, int, void* );
Point2f roi4point[4]={0,};
int roiIndex=0;
bool oksign = false;

Point2f MinDistFind(float x, float y, Point2f* inPoints);
void PointOrderbyConner(Point2f* inPoints, int w, int h );

int main()
{

//origin
Mat GetImg = imread( fileName );
//copy for drawing
Mat RoiImg;

//window
namedWindow( "set roi by 4 points", 0 );

//mouse callback
setMouseCallback( "set roi by 4 points", onMouse, 0 );

//point selection until 4 points setting
while(1)
{

if(oksign == true) //right button click
break;

//draw point
RoiImg = GetImg.clone();
for(int i=0; i< roiIndex; ++i)
circle(RoiImg, roi4point[i], 5,CV_RGB(255,0,255),5);
imshow("set roi by 4 points", RoiImg);

waitKey(10);
}

printf("points ordered by LT, RT, RB, LB \n");
PointOrderbyConner(roi4point, GetImg.size().width,  GetImg.size().height);
for(int i=0; i< 4; ++i)
{
printf("[%d] (%.2lf, %.2lf) \n",i, roi4point[i].x, roi4point[i].y );
}

//drwaring
RoiImg = GetImg.clone();
string TestStr[4]={"LT","RT","RB","LB"};
putText(RoiImg, TestStr[0].c_str(), roi4point[0], CV_FONT_NORMAL, 1, Scalar(255,255,255));
circle(RoiImg, roi4point[0], 3,CV_RGB(0,0,255));
int i;
for(i=1; i< roiIndex; ++i)
{
line(RoiImg, roi4point[i-1], roi4point[i], CV_RGB(255,0,0),1 );
circle(RoiImg, roi4point[i], 1,CV_RGB(0,0,255),3);
putText(RoiImg, TestStr[i].c_str(), roi4point[i], CV_FONT_NORMAL, 1, Scalar(255,255,255));
}

line(RoiImg, roi4point[0], roi4point[i-1], CV_RGB(255,0,0),1 );
imshow("set roi by 4 points2", RoiImg);

waitKey(0);
}

void PointOrderbyConner(Point2f* inPoints, int w, int h )
{

vector< pair< float, float> > s_point;
for(int i=0; i< 4; ++i)
s_point.push_back( make_pair(inPoints[i].x, inPoints[i].y) );

//sort
sort(s_point.begin(), s_point.end(), [](const pair< float, float>& A, const pair< float, float>& B){ return A.second < B.second; } );

if( s_point[0].first < s_point[1].first )
{
inPoints[0].x = s_point[0].first;
inPoints[0].y = s_point[0].second;

inPoints[1].x = s_point[1].first;
inPoints[1].y = s_point[1].second;

}else{
inPoints[0].x = s_point[1].first;
inPoints[0].y = s_point[1].second;

inPoints[1].x = s_point[0].first;
inPoints[1].y = s_point[0].second;
}

if( s_point[2].first > s_point[3].first )
{
inPoints[2].x = s_point[2].first;
inPoints[2].y = s_point[2].second;

inPoints[3].x = s_point[3].first;
inPoints[3].y = s_point[3].second;

}else{
inPoints[2].x = s_point[3].first;
inPoints[2].y = s_point[3].second;

inPoints[3].x = s_point[2].first;
inPoints[3].y = s_point[2].second;
}

}

static void onMouse( int event, int x, int y, int, void* )
{

if( event == CV_EVENT_LBUTTONDOWN && oksign==false)
{
//4 point select
if(roiIndex>=4)
{
roiIndex=0;
for(int i=0; i< 4; ++i)
roi4point[i].x = roi4point[i].y =0;
}

roi4point[roiIndex].x = x;
roi4point[roiIndex].y = y;

//point coordinate print
printf("-(%..2lf,%.2lf), 2:(%.2lf,%.2lf), 3:(%.2lf,%.2lf), 4:(%.2lf,%.2lf)\n",
roi4point[0].x, roi4point[0].y,roi4point[1].x, roi4point[1].y,roi4point[2].x, roi4point[2].y,roi4point[3].x, roi4point[3].y );

roiIndex++;
}

if(event == CV_EVENT_RBUTTONDOWN)
{
//set point.
if(roiIndex == 4)
{
oksign = true;
printf("Warping Start!!!\n");
}
}

}

```
---

After 4 points selection, click Right button on the mouse, then ordering start.
PointOrderbyConner function is main function for ordering any points to lt, rt, rb, lb.
The logic is..

1. sorted in descending order with respect to the y-coordinate.
2. at 2 coordinates of top in sorted vectors,
finding min x value coordinate in 2 vectors, min is LT, lager is RT.
2. at 2 coordinates of bottom in sorted vectors,
finding min x value coordinate in 2 vectors, min is LB, lager is RB.

sorry, my low English skill.
See source code more detail.

### To process all arrays by reasonably small number of threads in cuda ( the explaination of tid = blockDim.x * gridDim.x )

see the this source code
...
```#define N 10 //(33*1024)

int tid = threadIdx.x + blockIdx.x * gridDim.x;

if(tid < N)
c[tid] = 1;

while( tid < N)
{
c[tid] = 1;
tid += blockDim.x * gridDim.x;
}

}

int main(void)
{
int c[N];
int *dev_c;
cudaMalloc( (void**)&dev_c, N*sizeof(int) );

for(int i=0; i< N; ++i)
{
c[i] = -1;
}

cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

for(int i=0; i< N; ++i)
{
printf("c[%d] = %d \n" ,i, c[i] );
}

cudaFree( dev_c );

}

```
---

Because we have to create reasonably small number of threads, if N is larger than 10 ex) 33*1024.

This source code is example of this case.
arrays are 10, cuda threads are 4.
How to access all 10 arrays only by 4 threads.

see the page about meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda detail.

In this source code,
gridDim.x -> 2     //this means number of block of x
gridDim.y -> 1     //this means number of block of y
blockDim.x -> 2   //this means number of thread of x in a block
blockDim.y -> 1   //this means number of thread of y in a block

In add kernel function, we can access 0, 1, 2, 3 index of thread

int tid = threadIdx.x + blockIdx.x * blockDim.x;
①0+0*2=0
②1+0*2=1
③0+1*2=2
④1+1*2=3

How to access rest of index 4, 5, 6, 7, 8, 9.
There is a calculation in while loop
while(tid
{
c[tid] = 1;
tid += blockDim.x + gridDim.x;
}

** first call of kernel **
#1 loop: 0+2*2=4
#2 loop: 4+2*2=8
#3 loop: 8+2*2=12 ( but this value is false, while out!)

** second call of kernel **
#1 loop: 1+2*2=5
#2 loop: 5+2*2=9
#3 loop: 9+2*2=13 ( but this value is false, while out!)

** third call of kernel **
#1 loop: 2+2*2=6
#2 loop: 6+2*2=10 ( but this value is false, while out!)

** fourth call of kernel **
#1 loop: 3+2*2=7
#2 loop: 7+2*2=11 ( but this value is false, while out!)

So, all index of 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 can access by tid value.

## 3/12/2015

### Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (2D)

If you make kernel like that

dim3 blocks(2,3);

36 threads are made and gridDim and blockDim is (2,3) and (3,2).

problem is now...
See the this figure..

Do you understand?
We have to do indexing calculation, because threadIdx.x, threadIdx.y is only indicate indexing in their block.

For more detail, refer to below figure that represent the index list of tid calculation result.

```#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include < stdio.h>

#define N 40

__global__ void increase(int *c){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int tid = x + y*blockDim.x * gridDim.x;
if(tid < N)
c[tid] = tid;
}

int main(void)
{
int c[N];
int *dev_c;

cudaMalloc( (void**)&dev_c, N*sizeof(int) );

for(int i=0; i< N; ++i)
{
c[i] = -1;
}

cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

dim3 blocks(2,3);

cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

for(int i=0; i< N; ++i)
{
printf("c[%d] = %d \n" ,i, c[i] );
}

cudaFree( dev_c );
}

```
...

In the source code, threads are made only 36. so 37th 38th 39th 40th array have left initial value -1.

### Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (1D)

When we study cuda firstly, thread indexing is very confusing.
So I tried to clean up.

First, Let's grab a sense of looking at this example
...
```#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include < stdio.h>

#define N 15

__global__ void increase(int *c){
int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid < N)
c[tid] = tid;
}

int main(void)
{
int c[N];
int *dev_c;

cudaMalloc( (void**)&dev_c, N*sizeof(int) );

for(int i=0; i< N; ++i)
{
c[i] = -1;
}

cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

increase<<< 4, 3>>>(dev_c);

cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

for(int i=0; i< N; ++i)
{
printf("c[%d] = %d \n" ,i, c[i] );
}

cudaFree( dev_c );
}
```
...

The result of this example source is

In the source code, kernel function(increase) is created by <<< 4, 3 >>>, this means to create 12 threads.
12 threads are executed at the same time.
So, the kernel function need to know what number of thread am I?
The method is threadIdx and blockIdx.

But we have to calculate thread index, because threadIdx and blockIdx is different space index.
like that " int tid = threadIdx.x + blockIdx.x * blockDim.x; "

blockIdx tells current block index.

gridDim tells number of blocks in a grid
blockDim tells number of a threads in a block

Did you more confused?

My final explanation. See the this figure.

Next time, I will introduce 2D kernel.
Thank you.

## 3/10/2015

### error : calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

If you meet this error studying juliaset in CUDA by example book capture 4.

calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

modify like this in cuComplex structure
cuComplex( float a, float b ) : r(a), i(b)  {}  -->  __device__ cuComplex( float a, float b ) : r(a), i(b)  {}

Hope to see julia set, beautiful~

### 50000 x 1 vector sum, gpu vs cpu processing time compare

in (50000 x 1 + 50000 x 1) vector sum
cpu takes 0.00 sec.

cpu is faster than cuda??
Because processing is simple??

test by this source code..

cpu version

```#include < iostream>
#include < windows.h>
#include < stdlib.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 50000 //vector size

void add( int *a, int *b, int *c ) {
int tid = 0;    // this is CPU zero, so we start at zero
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += 1;   // we have one CPU, so we increment by one
}
}

int main( void ) {
//for processing time measure
unsigned long Atime=0, Btime=0;
Atime = GetTickCount();

int a[N], b[N], c[N];

// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i< N; i++) {
a[i] = -i;
b[i] = i * i;
}

// display the results
/*
for (int i=0; i< N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}
*/

Btime = GetTickCount();
printf("%.20lf sec\n",  (Btime - Atime)/1000.0 );

return 0;
}
```
cuda version
```#include < iostream>
#include < windows.h>
#include < stdlib.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 50000 //vector size

__global__ void add(int *a, int *b, int *c){

int tid = blockIdx.x; //block index that is set by gpu device.
if(tid < N)
c[tid] = a[tid] + b[tid];
}

int main(void)
{
//for processing time measure
unsigned long Atime=0, Btime=0;
Atime = GetTickCount();

int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;

//gpu mem allocation
cudaMalloc( (void**)&dev_a, N*sizeof(int) );
cudaMalloc( (void**)&dev_b, N*sizeof(int) );
cudaMalloc( (void**)&dev_c, N*sizeof(int) );

//value alloc in cpu
for(int i=0; i< N; ++i)
{
a[i] = -i;
b[i] = i*i;
}

//value copy from cpu to gpu
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);

//result value copy gpu -> cpu
cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost) ;

//result print
//for(int i=0; i< N; ++i){
//printf("%d + %d = %d\n", a[i], b[i], c[i] );
//}

//memory release
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );

Btime = GetTickCount();
printf("%.2lf sec\n",  (Btime - Atime)/1000.0 );
}

```

cude example is introduced in here -> https://bitbucket.org/mrfright/cuda_by_example/src

## 3/09/2015

### First CUDA tutorial - CUDA toolkit 6.0 + Visual studio 2012 setting and example simple source

You have already installed Visual studio 2012 or other version.
And, you have to install CUDA toolkit, go to the nvidia homepage -> https://developer.nvidia.com/cuda-downloads
Now cuda 6.5 is available.

Don't worry, In this introduction, version is not important.

After install cuda toolkit and Visual studio.
There is two option to start simple cuda project.

One is little bit complex.
Make win32 console project (empty optioni)
set cuda include and lib directory.
set build option
and coding...

See the this video.

Another method is using cuda runtime project.
This is very easy.
See the this video.

This is example source code

```#include < iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void add(int a, int b, int*c)
{
*c = a+b;
}

int main()
{

int c;
int *dev_c;
cudaMalloc( (void**)&dev_c, sizeof(int) );

cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);

printf(" 2+7 = %d \n", c);

cudaFree(dev_c);

return 0;

}
```

As I am cuda beginner, I don't know which method is good.
But I think first method is more useful for adding cuda code into existing project.

And refer to this youtube channel.

cudacast playlist will be efficient.

Thank you.

### To save mouse drag region to image file on the video frame. example source using opencv

This example source code is for saving image file that specified a rectangle region by mouse drag on the video frame.

The method is easy.
Firstly, enter the file name included file extension (ex: s.avi)
Then, video will be play.
Press p key on the video play window, if you want to save image.
Video will play after end of drag and drag region will be saved in your folder.
'ESC' key is for program finish.

Thank you.

example video is here

///
```#include < stdio.h>
#include < iostream>

#include < opencv2\opencv.hpp>

#ifdef _DEBUG
#pragma comment(lib, "opencv_core249d.lib")
#pragma comment(lib, "opencv_highgui249d.lib")
#else
#pragma comment(lib, "opencv_core249.lib")
#pragma comment(lib, "opencv_highgui249.lib")
#endif

using namespace std;
using namespace cv;

bool selectObject = false;
Rect selection;
Point origin;
Mat image;
bool pause =false;
double fpss;

Rect PatchRect;
Mat PatchImg;

unsigned int frame_index=0;

static void onMouse( int event, int x, int y, int, void* )
{
if( selectObject & pause)
{

selection.x = MIN(x, origin.x);
selection.y = MIN(y, origin.y);
selection.width = std::abs(x - origin.x);
selection.height = std::abs(y - origin.y);
selection &= Rect(0, 0, image.cols, image.rows);
}

switch( event )
{
case CV_EVENT_LBUTTONDOWN:
origin = Point(x,y);
selection = Rect(x,y,0,0);
selectObject = true;
break;
case CV_EVENT_LBUTTONUP:
if(selectObject && pause)
{
if(selection.width > 5 && selection.height > 5 )
{
PatchRect = selection;
image( PatchRect ).copyTo( PatchImg );
imshow("Selected Img", PatchImg );

char str[100];
sprintf_s(str,"%d.jpg", int(frame_index/fpss));
imwrite(str, PatchImg);

}else
selection = Rect(0,0,0,0);
}
selectObject = false;
pause = false;

break;
}
}

int main (void)
{

printf("avi file name?");
char nstr[255];
scanf_s("%s", nstr);
printf("-> %s", nstr);

VideoCapture cap(nstr);

Mat frame;
namedWindow( "Demo", 0 );
setMouseCallback( "Demo", onMouse, 0 );
printf("P key is pause, ESC key is exit.\n");

for(;;)
{
frame_index++;

if(!pause)
cap >> frame;
if( frame.empty() )
break;
frame.copyTo(image);

if( pause && selection.width > 0 && selection.height > 0 )
{
rectangle(image, Point(selection.x-1, selection.y-1), Point(selection.x+selection.width+1, selection.y+selection.height+1), CV_RGB(255,0,0) );
}

imshow( "Demo", image );

char k = waitKey(10);

if( k == 27 )
break;
else if(k == 'p' || k=='P' )
pause=!pause;
}

return 0;
}

```
...