3/24/2015

Image warping (using opencv findHomography, warpPerspective)

fig 1. Left: set 4 points (Left Top, Right Top, Right Bottom, Left Bottom), right:warped image to (0,0) (300,0), (300,300), (0,300)


Firstly, we have to know Homography matrix for image warping.
A homography matrix is that the converting matrix can transform from A plane to B plane in 3D space.
See more detail about Homography in here
http://en.wikipedia.org/wiki/Homography_%28computer_vision%29


So, as the above equation, H matrix convert A matrix to B matrix.
In here, A is left, B is right 4 points in fig 1.

In OpenCV function, findHomography function gives H matrix.
Input parameter is findHomography(A, B). Do not confuse.

After get H matrix, we can warp image using various transform functions in opencv.
In this example, I use warpPerspective function, because rectangle shape is a trapezoidal model.
Input parameter is warpPerspective(Origin_mage, warped_image, H, cv::Size(cols, rows));

see the test video of this example source code in here


In source code, actually to get homography and warping part is 88 ~ 108 lines.
And 109~142 lines are the part for calculated value confirm.
Left code is for interface and selection point ordering.
About interface and 4 points ordering refer to this page
http://study.marearts.com/2015/03/any-4-points-odering-by-lefttop.html


#include < opencv2\opencv.hpp>  
#include < string>  
#include < stdio.h>  

#ifdef _DEBUG          
#pragma comment(lib, "opencv_core249d.lib")  
#pragma comment(lib, "opencv_imgproc249d.lib")   //MAT processing  
#pragma comment(lib, "opencv_highgui249d.lib")
#pragma comment(lib, "opencv_calib3d249d.lib") 
#else  
#pragma comment(lib, "opencv_core249.lib")  
#pragma comment(lib, "opencv_highgui249.lib")
#pragma comment(lib, "opencv_calib3d249.lib") 
#endif     


using namespace std;  
using namespace cv;  

static void onMouse( int event, int x, int y, int, void* );
Point2f roi4point[4]={0,};
int roiIndex=0;
bool oksign = false;

Point2f MinDistFind(float x, float y, Point2f* inPoints);
void PointOrderbyConner(Point2f* inPoints, int w, int h );

int main()  
{  
 //image loading
 //char fileName[100] = "./road-ahead.jpg";
 char fileName[100] = "./chess.jpg";

 //origin
 Mat GetImg = imread( fileName );
 //copy for drawing
 Mat RoiImg;
 
 //window
 namedWindow( "set roi by 4 points", 0 );  

 //mouse callback
 setMouseCallback( "set roi by 4 points", onMouse, 0 );  
 
 //point selection until 4 points setting
 while(1)
 {

  if(oksign == true) //right button click
   break;

  //draw point
  RoiImg = GetImg.clone();
  for(int i=0; i< roiIndex; ++i)
   circle(RoiImg, roi4point[i], 5,CV_RGB(255,0,255),5);
  imshow("set roi by 4 points", RoiImg);
    
  waitKey(10);
 }



 printf("points ordered by LT, RT, RB, LB \n");
 PointOrderbyConner(roi4point, GetImg.size().width,  GetImg.size().height);
 for(int i=0; i< 4; ++i)
 {
  printf("[%d] (%.2lf, %.2lf) \n",i, roi4point[i].x, roi4point[i].y );
 }


 //drwaring
 RoiImg = GetImg.clone();
 string TestStr[4]={"LT","RT","RB","LB"};  
 putText(RoiImg, TestStr[0].c_str(), roi4point[0], CV_FONT_NORMAL, 1, Scalar(0,0,255),3);
 circle(RoiImg, roi4point[0], 3,CV_RGB(0,0,255));
 int i;
 for(i=1; i< roiIndex; ++i)
 {
  line(RoiImg, roi4point[i-1], roi4point[i], CV_RGB(255,0,0),1 );
  circle(RoiImg, roi4point[i], 1,CV_RGB(0,0,255),3);  
  putText(RoiImg, TestStr[i].c_str(), roi4point[i], CV_FONT_NORMAL, 1, Scalar(0,0,255),3);
 }

 line(RoiImg, roi4point[0], roi4point[i-1], CV_RGB(255,0,0),1 );
 imshow("set roi by 4 points2", RoiImg);


 //prepare to get homography matrix
 vector< Point2f> P1; //clicked positions
 vector< Point2f> P2(4); //user setting positions
 for(int i=0; i< 4; ++i)
  P1.push_back( roi4point[i] );

 //user setting position
 P2[0].x = 0; P2[0].y = 0; 
 P2[1].x = 300; P2[1].y = 0; 
 P2[2].x = 300; P2[2].y = 300; 
 P2[3].x = 0; P2[3].y = 300; 

 //get homography
 Mat H = findHomography(P1, P2);

 //warping
 Mat warped_image;
 warpPerspective(GetImg, warped_image, H,cv::Size(GetImg.cols, GetImg.rows));
 rectangle(warped_image, Point(0,0), Point(300,300), CV_RGB(255,0,0) );
 imshow("warped_image", warped_image);
 

 ///////////////////////////
 //calculation confirm
 cout << "h" << endl << H << endl;
 cout << "size rows and cols " << H.rows << " " << H.cols << endl;

 Mat A(3,4,CV_64F); //3xN, P1
 Mat B(3,4,CV_64F); //3xN, P2
 //B = H*A  (P2 = h(P1))


 for(int i=0; i< 4; ++i)
 {
  A.at< double>(0,i) = P1[i].x;
  A.at< double>(1,i) = P1[i].y;
  A.at< double>(2,i) = 1;
  

  B.at< double>(0,i) = P2[i].x;
  B.at< double>(1,i) = P2[i].y;
  B.at< double>(2,i) = 1;
 }

 cout << "a" << endl << A << endl;
 cout << "b" << endl << B << endl;
 Mat HA = H*A;
 
for(int i=0; i< 4; ++i)
 {
  HA.at< double>(0,i) /= HA.at< double>(2,i);
  HA.at< double>(1,i) /= HA.at< double>(2,i);
  HA.at< double>(2,i) /= HA.at< double>(2,i);
 }

 cout << "HA" << endl << HA << endl;

 waitKey(0);
}  

void PointOrderbyConner(Point2f* inPoints, int w, int h )
{

 vector< pair< float, float> > s_point;
 for(int i=0; i< 4; ++i)
  s_point.push_back( make_pair(inPoints[i].x, inPoints[i].y) );

 //sort
 sort(s_point.begin(), s_point.end(), [](const pair< float, float>& A, const pair< float, float>& B){ return A.second < B.second; } );

 if( s_point[0].first < s_point[1].first )
 {
  inPoints[0].x = s_point[0].first;
  inPoints[0].y = s_point[0].second;

  inPoints[1].x = s_point[1].first;
  inPoints[1].y = s_point[1].second;

 }else{
  inPoints[0].x = s_point[1].first;
  inPoints[0].y = s_point[1].second;

  inPoints[1].x = s_point[0].first;
  inPoints[1].y = s_point[0].second;
 }

 if( s_point[2].first > s_point[3].first )
 {
  inPoints[2].x = s_point[2].first;
  inPoints[2].y = s_point[2].second;

  inPoints[3].x = s_point[3].first;
  inPoints[3].y = s_point[3].second;

 }else{
  inPoints[2].x = s_point[3].first;
  inPoints[2].y = s_point[3].second;

  inPoints[3].x = s_point[2].first;
  inPoints[3].y = s_point[2].second;
 }

  

}


static void onMouse( int event, int x, int y, int, void* )  
{  
 
 
    if( event == CV_EVENT_LBUTTONDOWN && oksign==false)
 {
  //4 point select
  if(roiIndex>=4)
  {
   roiIndex=0;  
   for(int i=0; i< 4; ++i)
    roi4point[i].x = roi4point[i].y =0;
  }

  roi4point[roiIndex].x = x;
  roi4point[roiIndex].y = y;

  //point coordinate print
  printf("-(%..2lf,%.2lf), 2:(%.2lf,%.2lf), 3:(%.2lf,%.2lf), 4:(%.2lf,%.2lf)\n",  
   roi4point[0].x, roi4point[0].y,roi4point[1].x, roi4point[1].y,roi4point[2].x, roi4point[2].y,roi4point[3].x, roi4point[3].y );  
  
  roiIndex++;
 }

 if(event == CV_EVENT_RBUTTONDOWN)
 {
  //set point.
  if(roiIndex == 4)
  {
   oksign = true;
   printf("Warping Start!!!\n");
  }
 }

 
 
}  


This is matlab source code for confirm.
x1 is clicked 4 point in opencv(I did value copy into matlab), matlabH is calculated by homography2d function. (refer to peter homepage for this function detail http://www.csse.uwa.edu.au/~pk/Research/MatlabFns/index.html)
x2 is calculate exactly when matlabH*x1.

I try in opencv with same values of x1, x2.
opencvH is calculated value from opencv source code.
Value is slightly different. Because scaling, OpenCV H and Matlab H will be same when (3,3) value will be divided by equal to 1.


clc;
clear all;

x1 =[259 126 1; 566 222 1; 400 473 1; 33 305 1]';
x2 =[0 0 1; 300 0 1; 300 300 1; 0 300 1]';

matlabH = homography2d(x1, x2)


matlab_X2= matlabH*x1;
matlab_X2(:,1) = matlab_X2(:,1)/matlab_X2(3,1);
matlab_X2(:,2) = matlab_X2(:,2)/matlab_X2(3,2);
matlab_X2(:,3) = matlab_X2(:,3)/matlab_X2(3,3);
matlab_X2(:,4) = matlab_X2(:,4)/matlab_X2(3,4);

matlab_X2





opencvH = [1.021877004679779, 1.290191078534245, -427.2302201073777;
  -0.6109166533338892, 1.953660547640664, -87.93381578924605;
  5.540800373074552e-006, 0.002051557898988468, 1]

opencv_x2 = opencvH * x1;
opencv_x2(:,1) = opencv_x2(:,1)/opencv_x2(3,1);
opencv_x2(:,2) = opencv_x2(:,2)/opencv_x2(3,2);
opencv_x2(:,3) = opencv_x2(:,3)/opencv_x2(3,3);
opencv_x2(:,4) = opencv_x2(:,4)/opencv_x2(3,4);
opencv_x2



3/16/2015

any 4 points odering by leftTop, RightTop, RightBottom, LeftBottom

We may wish to know the order in lt, rt, rb, lb, for any of the four points.
This example is that for this case.


And see the source code operation in video



...
//#include < time.h>  
#include < opencv2\opencv.hpp>  
//#include < opencv2\gpu\gpu.hpp>  
//#include < opencv2\stitching\detail\matchers.hpp >
#include < string>  
#include < stdio.h>  
//#include < queue>

#ifdef _DEBUG          
#pragma comment(lib, "opencv_core249d.lib")  
//#pragma comment(lib, "opencv_imgproc249d.lib")   //MAT processing  
//#pragma comment(lib, "opencv_gpu249d.lib")  
#pragma comment(lib, "opencv_highgui249d.lib")
//#pragma comment(lib, "opencv_objdetect249d.lib")
//#pragma comment(lib, "opencv_calib3d249d.lib") 
//#pragma comment(lib, "opencv_nonfree249d.lib") 
//#pragma comment(lib, "opencv_features2d249d.lib") 
#else  
#pragma comment(lib, "opencv_core249.lib")  
//#pragma comment(lib, "opencv_imgproc249.lib")  
//#pragma comment(lib, "opencv_gpu249.lib")  
#pragma comment(lib, "opencv_highgui249.lib")
//#pragma comment(lib, "opencv_objdetect249.lib")
//#pragma comment(lib, "opencv_calib3d249.lib") 
//#pragma comment(lib, "opencv_nonfree249.lib") 
//#pragma comment(lib, "opencv_features2d249.lib") 
#endif     


using namespace std;  
using namespace cv;  

static void onMouse( int event, int x, int y, int, void* );
Point2f roi4point[4]={0,};
int roiIndex=0;
bool oksign = false;

Point2f MinDistFind(float x, float y, Point2f* inPoints);
void PointOrderbyConner(Point2f* inPoints, int w, int h );

int main()  
{  
 //image loading
 char fileName[100] = "./road-ahead.jpg";

 //origin
 Mat GetImg = imread( fileName );
 //copy for drawing
 Mat RoiImg;
 
 //window
 namedWindow( "set roi by 4 points", 0 );  

 //mouse callback
 setMouseCallback( "set roi by 4 points", onMouse, 0 );  
 
 //point selection until 4 points setting
 while(1)
 {

  if(oksign == true) //right button click
   break;

  //draw point
  RoiImg = GetImg.clone();
  for(int i=0; i< roiIndex; ++i)
   circle(RoiImg, roi4point[i], 5,CV_RGB(255,0,255),5);
  imshow("set roi by 4 points", RoiImg);
    
  waitKey(10);
 }



 printf("points ordered by LT, RT, RB, LB \n");
 PointOrderbyConner(roi4point, GetImg.size().width,  GetImg.size().height);
 for(int i=0; i< 4; ++i)
 {
  printf("[%d] (%.2lf, %.2lf) \n",i, roi4point[i].x, roi4point[i].y );
 }


 //drwaring
 RoiImg = GetImg.clone();
 string TestStr[4]={"LT","RT","RB","LB"};  
 putText(RoiImg, TestStr[0].c_str(), roi4point[0], CV_FONT_NORMAL, 1, Scalar(255,255,255));
 circle(RoiImg, roi4point[0], 3,CV_RGB(0,0,255));
 int i;
 for(i=1; i< roiIndex; ++i)
 {
  line(RoiImg, roi4point[i-1], roi4point[i], CV_RGB(255,0,0),1 );
  circle(RoiImg, roi4point[i], 1,CV_RGB(0,0,255),3);  
  putText(RoiImg, TestStr[i].c_str(), roi4point[i], CV_FONT_NORMAL, 1, Scalar(255,255,255));
 }

 line(RoiImg, roi4point[0], roi4point[i-1], CV_RGB(255,0,0),1 );
 imshow("set roi by 4 points2", RoiImg);


 waitKey(0);
}  

void PointOrderbyConner(Point2f* inPoints, int w, int h )
{

 vector< pair< float, float> > s_point;
 for(int i=0; i< 4; ++i)
  s_point.push_back( make_pair(inPoints[i].x, inPoints[i].y) );

 //sort
 sort(s_point.begin(), s_point.end(), [](const pair< float, float>& A, const pair< float, float>& B){ return A.second < B.second; } );

 if( s_point[0].first < s_point[1].first )
 {
  inPoints[0].x = s_point[0].first;
  inPoints[0].y = s_point[0].second;

  inPoints[1].x = s_point[1].first;
  inPoints[1].y = s_point[1].second;

 }else{
  inPoints[0].x = s_point[1].first;
  inPoints[0].y = s_point[1].second;

  inPoints[1].x = s_point[0].first;
  inPoints[1].y = s_point[0].second;
 }

 if( s_point[2].first > s_point[3].first )
 {
  inPoints[2].x = s_point[2].first;
  inPoints[2].y = s_point[2].second;

  inPoints[3].x = s_point[3].first;
  inPoints[3].y = s_point[3].second;

 }else{
  inPoints[2].x = s_point[3].first;
  inPoints[2].y = s_point[3].second;

  inPoints[3].x = s_point[2].first;
  inPoints[3].y = s_point[2].second;
 }

  

}


static void onMouse( int event, int x, int y, int, void* )  
{  
 
 
    if( event == CV_EVENT_LBUTTONDOWN && oksign==false)
 {
  //4 point select
  if(roiIndex>=4)
  {
   roiIndex=0;  
   for(int i=0; i< 4; ++i)
    roi4point[i].x = roi4point[i].y =0;
  }

  roi4point[roiIndex].x = x;
  roi4point[roiIndex].y = y;

  //point coordinate print
  printf("-(%..2lf,%.2lf), 2:(%.2lf,%.2lf), 3:(%.2lf,%.2lf), 4:(%.2lf,%.2lf)\n",  
   roi4point[0].x, roi4point[0].y,roi4point[1].x, roi4point[1].y,roi4point[2].x, roi4point[2].y,roi4point[3].x, roi4point[3].y );  
  
  roiIndex++;
 }

 if(event == CV_EVENT_RBUTTONDOWN)
 {
  //set point.
  if(roiIndex == 4)
  {
   oksign = true;
   printf("Warping Start!!!\n");
  }
 }

 
 
}  

---

After 4 points selection, click Right button on the mouse, then ordering start.
PointOrderbyConner function is main function for ordering any points to lt, rt, rb, lb.
The logic is..

1. sorted in descending order with respect to the y-coordinate.
2. at 2 coordinates of top in sorted vectors,
   finding min x value coordinate in 2 vectors, min is LT, lager is RT.
2. at 2 coordinates of bottom in sorted vectors,
   finding min x value coordinate in 2 vectors, min is LB, lager is RB.

sorry, my low English skill.
See source code more detail.


To process all arrays by reasonably small number of threads in cuda ( the explaination of tid = blockDim.x * gridDim.x )

see the this source code
...
#define N 10 //(33*1024)

__global__ void add(int *c){
 int tid = threadIdx.x + blockIdx.x * gridDim.x;

 if(tid < N)
  c[tid] = 1;
 
 
 while( tid < N)
 {
  c[tid] = 1;
  tid += blockDim.x * gridDim.x;
 }
 
}




int main(void)
{
 int c[N];
 int *dev_c;
 cudaMalloc( (void**)&dev_c, N*sizeof(int) );

 for(int i=0; i< N; ++i)
 {
  c[i] = -1;
 }

 cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

 add<<< 2, 2>>>(dev_c);
 cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );


 for(int i=0; i< N; ++i)
 {
  printf("c[%d] = %d \n" ,i, c[i] );
 }

 cudaFree( dev_c );

}


---

Why we do not create 10 threads ex) add<<<2>>> or add<5>>>
Because we have to create reasonably small number of threads, if N is larger than 10 ex) 33*1024.

This source code is example of this case.
arrays are 10, cuda threads are 4.
How to access all 10 arrays only by 4 threads.


see the page about meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda detail.
(1D) -> http://study.marearts.com/2015/03/meaning-of-threadidx-blockidx-blockdim.html


In this source code, 
gridDim.x -> 2     //this means number of block of x
gridDim.y -> 1     //this means number of block of y
blockDim.x -> 2   //this means number of thread of x in a block
blockDim.y -> 1   //this means number of thread of y in a block

Our number of thread are 4, because 2*2(blocks * thread).

In add kernel function, we can access 0, 1, 2, 3 index of thread

int tid = threadIdx.x + blockIdx.x * blockDim.x;
①0+0*2=0
②1+0*2=1
③0+1*2=2
④1+1*2=3 

How to access rest of index 4, 5, 6, 7, 8, 9.
There is a calculation in while loop
while(tid
{
   c[tid] = 1;
   tid += blockDim.x + gridDim.x;
}

** first call of kernel **
#1 loop: 0+2*2=4
#2 loop: 4+2*2=8 
#3 loop: 8+2*2=12 ( but this value is false, while out!)

** second call of kernel **
#1 loop: 1+2*2=5
#2 loop: 5+2*2=9
#3 loop: 9+2*2=13 ( but this value is false, while out!)

** third call of kernel **
#1 loop: 2+2*2=6
#2 loop: 6+2*2=10 ( but this value is false, while out!)

** fourth call of kernel **
#1 loop: 3+2*2=7
#2 loop: 7+2*2=11 ( but this value is false, while out!)

So, all index of 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 can access by tid value.







3/12/2015

Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (2D)

This article explain how to access the thread index when you make block and thread with two dimensions.

please refer to this page about method to access thread 1D.
->http://feelmare.blogspot.com/2015/03/meaning-of-threadidx-blockidx-blockdim.html

If you make kernel like that

dim3 blocks(2,3);
dim3 thread(3,2);
Kernel<<< blocks, threads >>>

The threads are made as follows figure.


36 threads are made and gridDim and blockDim is (2,3) and (3,2).

problem is now...
How to access 15th thread??
See the this figure..



Do you understand?
We have to do indexing calculation, because threadIdx.x, threadIdx.y is only indicate indexing in their block.

For more detail, refer to below figure that represent the index list of tid calculation result.




#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include < stdio.h>



#define N 40

__global__ void increase(int *c){
 int x = threadIdx.x + blockIdx.x * blockDim.x;
 int y = threadIdx.y + blockIdx.y * blockDim.y;
 int tid = x + y*blockDim.x * gridDim.x;
 if(tid < N)
  c[tid] = tid;
}



int main(void)
{
 int c[N];
 int *dev_c;

 cudaMalloc( (void**)&dev_c, N*sizeof(int) );

 for(int i=0; i< N; ++i)
 {
  c[i] = -1;
 }

 cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

 dim3 blocks(2,3);
 dim3 threads(3,2);
 increase<<< blocks, threads>>>(dev_c);

 cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

 for(int i=0; i< N; ++i)
 {
  printf("c[%d] = %d \n" ,i, c[i] );
 }

 cudaFree( dev_c );
}

...


In the source code, threads are made only 36. so 37th 38th 39th 40th array have left initial value -1.





Meaning of threadIdx, blockIdx, blockDim, gridDim in the cuda (1D)


When we study cuda firstly, thread indexing is very confusing.
So I tried to clean up.

First, Let's grab a sense of looking at this example
...
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include < stdio.h>

#define N 15

__global__ void increase(int *c){
 int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
 if(tid < N)
  c[tid] = tid;
}

int main(void)
{
 int c[N];
 int *dev_c;

 cudaMalloc( (void**)&dev_c, N*sizeof(int) );

 for(int i=0; i< N; ++i)
 {
  c[i] = -1;
 }

 cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice);

 increase<<< 4, 3>>>(dev_c);

 cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost );

 for(int i=0; i< N; ++i)
 {
  printf("c[%d] = %d \n" ,i, c[i] );
 }

 cudaFree( dev_c );
}
...

The result of this example source is

In the source code, kernel function(increase) is created by <<< 4, 3 >>>, this means to create 12 threads.
12 threads are executed at the same time.
So, the kernel function need to know what number of thread am I?
The method is threadIdx and blockIdx.

But we have to calculate thread index, because threadIdx and blockIdx is different space index.
like that " int tid = threadIdx.x + blockIdx.x * blockDim.x; "

threadIdx tells current thread index.
blockIdx tells current block index.

gridDim tells number of blocks in a grid
blockDim tells number of a threads in a block


Did you more confused?

My final explanation. See the this figure.




Next time, I will introduce 2D kernel.
http://study.marearts.com/2015/03/meaning-of-threadidx-blockidx-blockdim_12.html
Thank you.



3/10/2015

error : calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

If you meet this error studying juliaset in CUDA by example book capture 4.

calling a __host__ function("cuComplex::cuComplex") from a __device__ function("julia") is not allowed

modify like this in cuComplex structure
cuComplex( float a, float b ) : r(a), i(b)  {}  -->  __device__ cuComplex( float a, float b ) : r(a), i(b)  {}


Hope to see julia set, beautiful~


50000 x 1 vector sum, gpu vs cpu processing time compare

in (50000 x 1 + 50000 x 1) vector sum
cuda takes 0.16 sec, included all process ex)upload, download, malloc, release..
cpu takes 0.00 sec.


cpu is faster than cuda??
Because processing is simple??

test by this source code..

cpu version

#include < iostream>
#include < windows.h>
#include < stdlib.h>
 #include "cuda.h"  
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  



#define N 50000 //vector size

void add( int *a, int *b, int *c ) {
    int tid = 0;    // this is CPU zero, so we start at zero
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 1;   // we have one CPU, so we increment by one
    }
}

int main( void ) {
 //for processing time measure
 unsigned long Atime=0, Btime=0;
 Atime = GetTickCount();

    int a[N], b[N], c[N];

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i< N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    add( a, b, c );

    // display the results
 /*
    for (int i=0; i< N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }
 */

 Btime = GetTickCount();
 printf("%.20lf sec\n",  (Btime - Atime)/1000.0 );

    return 0;
}
cuda version
#include < iostream>
#include < windows.h>
#include < stdlib.h>
 #include "cuda.h"  
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  

#define N 50000 //vector size


__global__ void add(int *a, int *b, int *c){
 
 int tid = blockIdx.x; //block index that is set by gpu device.
 if(tid < N) 
  c[tid] = a[tid] + b[tid];
}


int main(void)
{
 //for processing time measure
 unsigned long Atime=0, Btime=0;
 Atime = GetTickCount();

 int a[N], b[N], c[N];
 int *dev_a, *dev_b, *dev_c;

 //gpu mem allocation
 cudaMalloc( (void**)&dev_a, N*sizeof(int) );
 cudaMalloc( (void**)&dev_b, N*sizeof(int) );
 cudaMalloc( (void**)&dev_c, N*sizeof(int) );

 //value alloc in cpu
 for(int i=0; i< N; ++i)
 {
  a[i] = -i;
  b[i] = i*i;
 }

 //value copy from cpu to gpu
 cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
 cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);

 //add kernel call
 add<<< N, 1>>>(dev_a, dev_b, dev_c);

 //result value copy gpu -> cpu
 cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost) ;


 

 //result print
 //for(int i=0; i< N; ++i){
  //printf("%d + %d = %d\n", a[i], b[i], c[i] );
 //}

 //memory release
 cudaFree( dev_a );
 cudaFree( dev_b );
 cudaFree( dev_c );

 Btime = GetTickCount();
 printf("%.2lf sec\n",  (Btime - Atime)/1000.0 );
}





cude example is introduced in here -> https://bitbucket.org/mrfright/cuda_by_example/src

3/09/2015

First CUDA tutorial - CUDA toolkit 6.0 + Visual studio 2012 setting and example simple source


You have already installed Visual studio 2012 or other version.
And, you have to install CUDA toolkit, go to the nvidia homepage -> https://developer.nvidia.com/cuda-downloads
Now cuda 6.5 is available.

Don't worry, In this introduction, version is not important.


After install cuda toolkit and Visual studio.
There is two option to start simple cuda project.

One is little bit complex.
Make win32 console project (empty optioni)
set cuda include and lib directory.
set build option
add cuda file
and coding...

See the this video.
https://www.youtube.com/watch?v=i43dUW4E-fE&feature=youtu.be



Another method is using cuda runtime project.
This is very easy.
See the this video.
https://www.youtube.com/watch?v=j2wV4-IiGh4&feature=youtu.be


This is example source code

#include < iostream>
#include "cuda.h"  
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  

__global__ void add(int a, int b, int*c)
{
 *c = a+b;
}

int main()
{

 int c;
 int *dev_c;
 cudaMalloc( (void**)&dev_c, sizeof(int) );

 add<<< 1,1>>>(2,7, dev_c);

 cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);

 printf(" 2+7 = %d \n", c);

 cudaFree(dev_c); 

 return 0;

}

As I am cuda beginner, I don't know which method is good.
But I think first method is more useful for adding cuda code into existing project.

And refer to this youtube channel.
https://www.youtube.com/channel/UCBHcMCGaiJhv-ESTcWGJPcw

cudacast playlist will be efficient.

Thank you.

To save mouse drag region to image file on the video frame. example source using opencv

This example source code is for saving image file that specified a rectangle region by mouse drag on the video frame.

The method is easy.
Firstly, enter the file name included file extension (ex: s.avi)
Then, video will be play.
Press p key on the video play window, if you want to save image.
Video will play after end of drag and drag region will be saved in your folder.
'ESC' key is for program finish.

Thank you.

example video is here
-> https://www.youtube.com/watch?v=ZpO1b-lZb7g



///
#include < stdio.h>
#include < iostream>

#include < opencv2\opencv.hpp>

#ifdef _DEBUG        
#pragma comment(lib, "opencv_core249d.lib")
#pragma comment(lib, "opencv_highgui249d.lib")
#else
#pragma comment(lib, "opencv_core249.lib")
#pragma comment(lib, "opencv_highgui249.lib")
#endif 

using namespace std;
using namespace cv;

bool selectObject = false;
Rect selection;
Point origin;
Mat image;
bool pause =false;
double fpss;

Rect PatchRect;
Mat PatchImg;

unsigned int frame_index=0;

static void onMouse( int event, int x, int y, int, void* )
{
 if( selectObject & pause)
 {

  selection.x = MIN(x, origin.x);
  selection.y = MIN(y, origin.y);
  selection.width = std::abs(x - origin.x);
  selection.height = std::abs(y - origin.y);
  selection &= Rect(0, 0, image.cols, image.rows);
 }

 switch( event )
 {
 case CV_EVENT_LBUTTONDOWN:
  origin = Point(x,y);
  selection = Rect(x,y,0,0);
  selectObject = true;
  break;
 case CV_EVENT_LBUTTONUP:
  if(selectObject && pause)
  {
   if(selection.width > 5 && selection.height > 5 )
   {
    PatchRect = selection;
    image( PatchRect ).copyTo( PatchImg );
    imshow("Selected Img", PatchImg );

    
    char str[100];
    sprintf_s(str,"%d.jpg", int(frame_index/fpss));
    imwrite(str, PatchImg);

   }else
    selection = Rect(0,0,0,0);
  }
  selectObject = false;
  pause = false;

  break;
 }
}


int main (void)  
{  


 printf("avi file name?");
 char nstr[255];
 scanf_s("%s", nstr);
 printf("-> %s", nstr);

 VideoCapture cap(nstr); 

 Mat frame;
 namedWindow( "Demo", 0 );
 setMouseCallback( "Demo", onMouse, 0 );
 printf("P key is pause, ESC key is exit.\n");

 for(;;)
 {
  frame_index++;

  if(!pause)
   cap >> frame;
  if( frame.empty() )
   break;
  frame.copyTo(image);


  if( pause && selection.width > 0 && selection.height > 0 )
  {
   rectangle(image, Point(selection.x-1, selection.y-1), Point(selection.x+selection.width+1, selection.y+selection.height+1), CV_RGB(255,0,0) );
  }

  imshow( "Demo", image );

  char k = waitKey(10);

  if( k == 27 )
   break;
  else if(k == 'p' || k=='P' )
   pause=!pause;
 }

 return 0;  
}  


...