Tuesday, April 8, 2008

CUDA Programming Part 3

Extended Hello World to execute it on N number of blocks..



======================

Main Line code

======================

/* HellowWorld Project
* This project demonstrates the basics on how to setup
* an example GPU Copmuting application.
*
* THis file contains the CPU (host) code.
*/

// Host defines
#define NUM_BLOCKS 32
#define NUM_THREADS 32
#define STR_SIZE 50

// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// CUDA includes
#include <cutil.h> // CUDA Utility Tools

// GPU Kernels declarations - declare as inlcude
#include <HelloWorld_kernel.cu>

//////////////////////
// Program main
//////////////////////
int main( int argc, char** argv)
{

// Host variables
int i,nBytes;
unsigned int timer;
unsigned int num_threads,num_blocks;
char *cpu_odata;
char *string;

// GPU variables
char *gpu_odata;
int str_size;

/////////////////////////////////////////////////////////////////////
// This routine gets the number of GPUs existing in the computer
// For each GPU (device) found, it checks to see if there is a GPU
// that supports CUDA. If no GPU that supports CUDA is found,
// the routine wll exit
/////////////////////////////////////////////////////////////////////
CUT_DEVICE_INIT();

/////////////////////////////////////////////////////////////////////
// Create and start a timer called "timer"
// alls to create ans start times are enveloped in the CUT_SAFE_CALL
// This CUDA Utility Tool checks for errors upon return.
// If an error is found, it prints out and error message, file name,
// and line number in file where the error can be found
/////////////////////////////////////////////////////////////////////
timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));

// Initialize CPU variables and allocate required memory
num_threads = (unsigned int) NUM_THREADS;
num_blocks = (unsigned int) NUM_BLOCKS;
nBytes = num_blocks*num_threads*STR_SIZE*sizeof(char);

// Allocate and initialize CPU output vector
string = (char *) malloc(STR_SIZE);
if(!string) {
printf("Cannot allocate string memory on CPU\n");
exit(-1);
}
cpu_odata = (char *) malloc(nBytes);
if(!cpu_odata) {
printf("Cannot allocate cpu_odata memory on CPU\n");
exit(-1);
}

// Allocate GPU (device) memory and variables
str_size = (int) STR_SIZE;
CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

// Setup kernel execution parameters
dim3 grid(num_blocks,1,1);
dim3 threads(num_threads,1,1);

// Execute the kernel on the GPU
HelloWorld_kernel<<< grid, threads >>>(str_size,num_threads, gpu_odata);
// HelloWorld_kernel<<<num_blocks,num_threads>>>(str_size, gpu_odata);

// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");

// Copy result from GPU to CPU
CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

// Stop the timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

// Delete the timer
CUT_SAFE_CALL(cutDeleteTimer(timer));

// Output results is same as the expected solution
for(i=0;i<num_threads*num_blocks;i++) {
strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
printf("From thread %d: %s\n",i,string);/* HellowWorld Project
* This project demonstrates the basics on how to setup
* an example GPU Copmuting application.
*
* THis file contains the CPU (host) code.
*/

// Host defines
#define NUM_BLOCKS 32
#define NUM_THREADS 32
#define STR_SIZE 50

// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// CUDA includes
#include <cutil.h> // CUDA Utility Tools

// GPU Kernels declarations - declare as inlcude
#include <HelloWorld_kernel.cu>

//////////////////////
// Program main
//////////////////////
int main( int argc, char** argv)
{

// Host variables
int i,nBytes;
unsigned int timer;
unsigned int num_threads,num_blocks;
char *cpu_odata;
char *string;

// GPU variables
char *gpu_odata;
int str_size;

/////////////////////////////////////////////////////////////////////
// This routine gets the number of GPUs existing in the computer
// For each GPU (device) found, it checks to see if there is a GPU
// that supports CUDA. If no GPU that supports CUDA is found,
// the routine wll exit
/////////////////////////////////////////////////////////////////////
CUT_DEVICE_INIT();

/////////////////////////////////////////////////////////////////////
// Create and start a timer called "timer"
// alls to create ans start times are enveloped in the CUT_SAFE_CALL
// This CUDA Utility Tool checks for errors upon return.
// If an error is found, it prints out and error message, file name,
// and line number in file where the error can be found
/////////////////////////////////////////////////////////////////////
timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));

// Initialize CPU variables and allocate required memory
num_threads = (unsigned int) NUM_THREADS;
num_blocks = (unsigned int) NUM_BLOCKS;
nBytes = num_blocks*num_threads*STR_SIZE*sizeof(char);

// Allocate and initialize CPU output vector
string = (char *) malloc(STR_SIZE);
if(!string) {
printf("Cannot allocate string memory on CPU\n");
exit(-1);
}
cpu_odata = (char *) malloc(nBytes);
if(!cpu_odata) {
printf("Cannot allocate cpu_odata memory on CPU\n");
exit(-1);
}

// Allocate GPU (device) memory and variables
str_size = (int) STR_SIZE;
CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

// Setup kernel execution parameters
dim3 grid(num_blocks,1,1);
dim3 threads(num_threads,1,1);

// Execute the kernel on the GPU
HelloWorld_kernel<<< grid, threads >>>(str_size,num_threads, gpu_odata);
// HelloWorld_kernel<<<num_blocks,num_threads>>>(str_size, gpu_odata);

// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");

// Copy result from GPU to CPU
CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

// Stop the timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

// Delete the timer
CUT_SAFE_CALL(cutDeleteTimer(timer));

// Output results is same as the expected solution
for(i=0;i<num_threads*num_blocks;i++) {
strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
printf("From thread %d: %s\n",i,string);
}


//////////////////////////////////////////
// All done - clean up and exit
//////////////////////////////////////////
// Free up CPU memory
free(cpu_odata);

// Free up GPU memory
CUDA_SAFE_CALL(cudaFree(gpu_odata));

// Use CUDA Utility Tool to exit cleanly
CUT_EXIT(argc, argv);
}
}


//////////////////////////////////////////
// All done - clean up and exit
//////////////////////////////////////////
// Free up CPU memory
free(cpu_odata);

// Free up GPU memory
CUDA_SAFE_CALL(cudaFree(gpu_odata));

// Use CUDA Utility Tool to exit cleanly
CUT_EXIT(argc, argv);
}







======================
Kernel
======================


#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_

#include <stdio.h>

///////////////////////////////////////////////////////////
// Simple Hello World kernel
// @param gpu_odata output data in global memory
///////////////////////////////////////////////////////////
__global__ void HelloWorld_kernel(int size,int num_threads, char *gpu_odata)
{
int i,k,x,n,last_num;
int idx,not_done;

// Access thread id and block id
const unsigned int thread_idx = threadIdx.x;
const unsigned int block_idx = blockIdx.x;

// Write data to global memory
idx = 0;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'H';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'e';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'W';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'd';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'F';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'm';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'T';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'h';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'e';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'a';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'd';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';

// Convert thread id to chars
// Determine number of places in thread idx
not_done = 1;
k = 10;
n = 1;
while(not_done == 1) {
x = thread_idx/k;
if (x>0) {
k = k*10;
n +=1;
}
else
not_done = 0;
}

// Parse out the thread index and convert to chars
k = k/10;
last_num = 0;
for(i=n;i>0;i--) {
x = thread_idx/k-last_num;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = '0' + x;
last_num = (thread_idx/k)*10;
k = k/10;
}

gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'i';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'n';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'B';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'c';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'k';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';

// Convert block id to chars
// Determine number of places in thread idx
not_done = 1;
k = 10;
n = 1;
while(not_done == 1) {
x = block_idx/k;
if (x>0) {
k = k*10;
n +=1;
}
else
not_done = 0;
}

// Parse out the block index and convert to chars
k = k/10;
last_num = 0;
for(i=n;i>0;i--) {
x = block_idx/k-last_num;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = '0' + x;
last_num = (block_idx/k)*10;
k = k/10;
}

// Fill out rest of string
for(i=idx;i<size;i++)
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
}

#endif

http://developer.nvidia.com/dev_content/cg/cg_examples/images/sine_wave_perturbation_ogl.jpg

Second Assignment is to generate a 2d sine wave for the RK4 method.

3 comments:

O Extrator said...
This comment has been removed by the author.
Daniel Leal Souza said...

Great example!

I Have two questios about it:

1 - What the the function CUDA_SAFE_CALL really does? and when we have to use it?

2 - Have you use nvcc compiler or gcc with some extra arguments? if gcc, what's the command for compile it?

Anonymous said...

I've recently started learning CUDA, and you put things together well there. A lot of things "clicked" after reading this.

Thanks!