...
#include <stdio.h>
#include <stdlib.h>

#define N 10 //(33*1024)

// Evaluates a CUDA runtime call and aborts with file/line context if it
// did not return cudaSuccess, so failures are never silently ignored.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Sets every element of c[0..N-1] to 1.
//
// Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
// global index and advances by the total number of launched threads
// (grid-stride loop), so N may be larger than the thread count.
// c must point to at least N ints of device memory.
__global__ void add(int *c)
{
    // Total number of threads in the launch; this is the loop stride.
    const int stride = blockDim.x * gridDim.x;

    // The global index is blockIdx.x * blockDim.x + threadIdx.x. The block
    // offset must be scaled by the block size (blockDim.x), not by the number
    // of blocks (gridDim.x); the two only coincide when they happen to be
    // equal, as in the <<<2, 2>>> launch below.
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += stride) {
        c[tid] = 1;
    }
}

// Fills a host array with -1, lets the kernel overwrite it with 1 on the
// device using only 4 threads, copies it back and prints every element.
int main(void)
{
    int c[N];
    int *dev_c = NULL;

    CUDA_CHECK(cudaMalloc((void**)&dev_c, N * sizeof(int)));

    // Sentinel value: any element the kernel fails to touch stays -1.
    for (int i = 0; i < N; ++i) {
        c[i] = -1;
    }
    CUDA_CHECK(cudaMemcpy(dev_c, c, N * sizeof(int), cudaMemcpyHostToDevice));

    // 2 blocks * 2 threads = 4 threads for N elements.
    add<<<2, 2>>>(dev_c);
    CUDA_CHECK(cudaGetLastError());  // reports launch-configuration errors

    // Blocking copy on the default stream: it waits for the kernel to finish
    // and reports any error raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; ++i) {
        printf("c[%d] = %d \n", i, c[i]);
    }

    CUDA_CHECK(cudaFree(dev_c));
    return 0;
}
Why do we not simply create 10 threads, e.g. add<<<2,5>>> or add<<<5,2>>>?
Because the number of threads has to stay reasonably small even when N is much larger than 10, e.g. 33*1024.
This source code is an example of that case.
The array has 10 elements, but there are only 4 CUDA threads.
How can all 10 elements be accessed with only 4 threads?
See the page below for the meaning of threadIdx, blockIdx, blockDim and gridDim in CUDA.
(1D) -> http://study.marearts.com/2015/03/meaning-of-threadidx-blockidx-blockdim.html
int tid = threadIdx.x + blockIdx.x * blockDim.x;
①0+0*2=0
②1+0*2=1
③0+1*2=2
④1+1*2=3
In this source code,
gridDim.x -> 2 //this means number of block of x
gridDim.y -> 1 //this means number of block of y
blockDim.x -> 2 //this means number of thread of x in a block
blockDim.y -> 1 //this means number of thread of y in a block
We have 4 threads in total, because 2*2 (blocks * threads per block).
In the add kernel, the initial tid values therefore cover indices 0, 1, 2, 3:
int tid = threadIdx.x + blockIdx.x * blockDim.x;
①0+0*2=0
②1+0*2=1
③0+1*2=2
④1+1*2=3
How do we access the remaining indices 4, 5, 6, 7, 8, 9?
The while loop takes care of this:
while (tid < N)
{
c[tid] = 1;
tid += blockDim.x * gridDim.x;
}
** first thread (initial tid = 0) **
#1 loop: 0+2*2=4
#2 loop: 4+2*2=8
#3 loop: 8+2*2=12 (12 < N is false, so the while loop exits!)
** second thread (initial tid = 1) **
#1 loop: 1+2*2=5
#2 loop: 5+2*2=9
#3 loop: 9+2*2=13 (13 < N is false, so the while loop exits!)
** third thread (initial tid = 2) **
#1 loop: 2+2*2=6
#2 loop: 6+2*2=10 (10 < N is false, so the while loop exits!)
** fourth thread (initial tid = 3) **
#1 loop: 3+2*2=7
#2 loop: 7+2*2=11 (11 < N is false, so the while loop exits!)
So every index 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 is reached by the tid value of one of the 4 threads.
No comments:
Post a Comment