Learn something new every day: CUDA Programming Part 3

Extended Hello World to execute it on N number of blocks..

======================

Main Line code

======================

/* HellowWorld Project
* This project demonstrates the basics on how to setup
* an example GPU Copmuting application.
*
* THis file contains the CPU (host) code.
*/

// Host defines
#define NUM_BLOCKS 32
#define NUM_THREADS 32
#define STR_SIZE 50

// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// CUDA includes
#include <cutil.h> // CUDA Utility Tools

// GPU Kernels declarations - declare as inlcude
#include <HelloWorld_kernel.cu>

//////////////////////
// Program main
//////////////////////
int main( int argc, char** argv)
{

// Host variables
int i,nBytes;
unsigned int timer;
unsigned int num_threads,num_blocks;
char *cpu_odata;
char *string;

// GPU variables
char *gpu_odata;
int str_size;

/////////////////////////////////////////////////////////////////////
// This routine gets the number of GPUs existing in the computer
// For each GPU (device) found, it checks to see if there is a GPU
// that supports CUDA. If no GPU that supports CUDA is found,
// the routine wll exit
/////////////////////////////////////////////////////////////////////
CUT_DEVICE_INIT();

/////////////////////////////////////////////////////////////////////
// Create and start a timer called "timer"
// alls to create ans start times are enveloped in the CUT_SAFE_CALL
// This CUDA Utility Tool checks for errors upon return.
// If an error is found, it prints out and error message, file name,
// and line number in file where the error can be found
/////////////////////////////////////////////////////////////////////
timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));

// Initialize CPU variables and allocate required memory
num_threads = (unsigned int) NUM_THREADS;
num_blocks = (unsigned int) NUM_BLOCKS;
nBytes = num_blocks*num_threads*STR_SIZE*sizeof(char);

// Allocate and initialize CPU output vector
string = (char *) malloc(STR_SIZE);
if(!string) {
printf("Cannot allocate string memory on CPU\n");
exit(-1);
}
cpu_odata = (char *) malloc(nBytes);
if(!cpu_odata) {
printf("Cannot allocate cpu_odata memory on CPU\n");
exit(-1);
}

// Allocate GPU (device) memory and variables
str_size = (int) STR_SIZE;
CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

// Setup kernel execution parameters
dim3 grid(num_blocks,1,1);
dim3 threads(num_threads,1,1);

// Execute the kernel on the GPU
HelloWorld_kernel<<< grid, threads >>>(str_size,num_threads, gpu_odata);
// HelloWorld_kernel<<<num_blocks,num_threads>>>(str_size, gpu_odata);

// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");

// Copy result from GPU to CPU
CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

// Stop the timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

// Delete the timer
CUT_SAFE_CALL(cutDeleteTimer(timer));

// Output results is same as the expected solution
for(i=0;i<num_threads*num_blocks;i++) {
strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
printf("From thread %d: %s\n",i,string);/* HellowWorld Project
* This project demonstrates the basics on how to setup
* an example GPU Copmuting application.
*
* THis file contains the CPU (host) code.
*/

// Host defines
#define NUM_BLOCKS 32
#define NUM_THREADS 32
#define STR_SIZE 50

// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// CUDA includes
#include <cutil.h> // CUDA Utility Tools

// GPU Kernels declarations - declare as inlcude
#include <HelloWorld_kernel.cu>

//////////////////////
// Program main
//////////////////////
int main( int argc, char** argv)
{

// Host variables
int i,nBytes;
unsigned int timer;
unsigned int num_threads,num_blocks;
char *cpu_odata;
char *string;

// GPU variables
char *gpu_odata;
int str_size;

/////////////////////////////////////////////////////////////////////
// This routine gets the number of GPUs existing in the computer
// For each GPU (device) found, it checks to see if there is a GPU
// that supports CUDA. If no GPU that supports CUDA is found,
// the routine wll exit
/////////////////////////////////////////////////////////////////////
CUT_DEVICE_INIT();

/////////////////////////////////////////////////////////////////////
// Create and start a timer called "timer"
// alls to create ans start times are enveloped in the CUT_SAFE_CALL
// This CUDA Utility Tool checks for errors upon return.
// If an error is found, it prints out and error message, file name,
// and line number in file where the error can be found
/////////////////////////////////////////////////////////////////////
timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));

// Initialize CPU variables and allocate required memory
num_threads = (unsigned int) NUM_THREADS;
num_blocks = (unsigned int) NUM_BLOCKS;
nBytes = num_blocks*num_threads*STR_SIZE*sizeof(char);

// Allocate and initialize CPU output vector
string = (char *) malloc(STR_SIZE);
if(!string) {
printf("Cannot allocate string memory on CPU\n");
exit(-1);
}
cpu_odata = (char *) malloc(nBytes);
if(!cpu_odata) {
printf("Cannot allocate cpu_odata memory on CPU\n");
exit(-1);
}

// Allocate GPU (device) memory and variables
str_size = (int) STR_SIZE;
CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

// Setup kernel execution parameters
dim3 grid(num_blocks,1,1);
dim3 threads(num_threads,1,1);

// Execute the kernel on the GPU
HelloWorld_kernel<<< grid, threads >>>(str_size,num_threads, gpu_odata);
// HelloWorld_kernel<<<num_blocks,num_threads>>>(str_size, gpu_odata);

// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");

// Copy result from GPU to CPU
CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

// Stop the timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

// Delete the timer
CUT_SAFE_CALL(cutDeleteTimer(timer));

// Output results is same as the expected solution
for(i=0;i<num_threads*num_blocks;i++) {
strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
printf("From thread %d: %s\n",i,string);
}

//////////////////////////////////////////
// All done - clean up and exit
//////////////////////////////////////////
// Free up CPU memory
free(cpu_odata);

// Free up GPU memory
CUDA_SAFE_CALL(cudaFree(gpu_odata));

// Use CUDA Utility Tool to exit cleanly
CUT_EXIT(argc, argv);
}
}

//////////////////////////////////////////
// All done - clean up and exit
//////////////////////////////////////////
// Free up CPU memory
free(cpu_odata);

// Free up GPU memory
CUDA_SAFE_CALL(cudaFree(gpu_odata));

// Use CUDA Utility Tool to exit cleanly
CUT_EXIT(argc, argv);
}

======================
Kernel
======================

#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_

#include <stdio.h>

///////////////////////////////////////////////////////////
// Simple Hello World kernel
// @param gpu_odata output data in global memory
///////////////////////////////////////////////////////////
__global__ void HelloWorld_kernel(int size,int num_threads, char *gpu_odata)
{
int i,k,x,n,last_num;
int idx,not_done;

// Access thread id and block id
const unsigned int thread_idx = threadIdx.x;
const unsigned int block_idx = blockIdx.x;

// Write data to global memory
idx = 0;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'H';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'e';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'W';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'd';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'F';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'm';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'T';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'h';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'e';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'a';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'd';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';

// Convert thread id to chars
// Determine number of places in thread idx
not_done = 1;
k = 10;
n = 1;
while(not_done == 1) {
x = thread_idx/k;
if (x>0) {
k = k*10;
n +=1;
}
else
not_done = 0;
}

// Parse out the thread index and convert to chars
k = k/10;
last_num = 0;
for(i=n;i>0;i--) {
x = thread_idx/k-last_num;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = '0' + x;
last_num = (thread_idx/k)*10;
k = k/10;
}

gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'i';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'n';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'B';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'c';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'k';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';

// Convert block id to chars
// Determine number of places in thread idx
not_done = 1;
k = 10;
n = 1;
while(not_done == 1) {
x = block_idx/k;
if (x>0) {
k = k*10;
n +=1;
}
else
not_done = 0;
}

// Parse out the block index and convert to chars
k = k/10;
last_num = 0;
for(i=n;i>0;i--) {
x = block_idx/k-last_num;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = '0' + x;
last_num = (block_idx/k)*10;
k = k/10;
}

// Fill out rest of string
for(i=idx;i<size;i++)
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
}

#endif

http://developer.nvidia.com/dev_content/cg/cg_examples/images/sine_wave_perturbation_ogl.jpg

Second Assignment is to generate a 2d sine wave for the RK4 method.

3 comments:

O Extrator said...: This comment has been removed by the author.; January 23, 2010 at 12:32 PM
Daniel Leal Souza said...: Great example!

I Have two questios about it:

1 - What the the function CUDA_SAFE_CALL really does? and when we have to use it?

2 - Have you use nvcc compiler or gcc with some extra arguments? if gcc, what's the command for compile it?; January 23, 2010 at 12:32 PM
Anonymous said...: I've recently started learning CUDA, and you put things together well there. A lot of things "clicked" after reading this.

Thanks!; September 26, 2011 at 2:09 AM

Tuesday, April 8, 2008

CUDA Programming Part 3

3 comments: