Extended Hello World to execute it on N number of blocks..
======================
Main Line code
======================
/* HellowWorld Project
* This project demonstrates the basics on how to setup
* an example GPU Copmuting application.
*
* THis file contains the CPU (host) code.
*/
// Host defines
#define NUM_BLOCKS 32
#define NUM_THREADS 32
#define STR_SIZE 50
// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA includes
#include <cutil.h> // CUDA Utility Tools
// GPU Kernels declarations - declare as inlcude
#include <HelloWorld_kernel.cu>
//////////////////////
// Program main
//////////////////////
int main( int argc, char** argv)
{
// Host variables
int i,nBytes;
unsigned int timer;
unsigned int num_threads,num_blocks;
char *cpu_odata;
char *string;
// GPU variables
char *gpu_odata;
int str_size;
/////////////////////////////////////////////////////////////////////
// This routine gets the number of GPUs existing in the computer
// For each GPU (device) found, it checks to see if there is a GPU
// that supports CUDA. If no GPU that supports CUDA is found,
// the routine wll exit
/////////////////////////////////////////////////////////////////////
CUT_DEVICE_INIT();
/////////////////////////////////////////////////////////////////////
// Create and start a timer called "timer"
// alls to create ans start times are enveloped in the CUT_SAFE_CALL
// This CUDA Utility Tool checks for errors upon return.
// If an error is found, it prints out and error message, file name,
// and line number in file where the error can be found
/////////////////////////////////////////////////////////////////////
timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
// Initialize CPU variables and allocate required memory
num_threads = (unsigned int) NUM_THREADS;
num_blocks = (unsigned int) NUM_BLOCKS;
nBytes = num_blocks*num_threads*STR_SIZE*sizeof(char);
// Allocate and initialize CPU output vector
string = (char *) malloc(STR_SIZE);
if(!string) {
printf("Cannot allocate string memory on CPU\n");
exit(-1);
}
cpu_odata = (char *) malloc(nBytes);
if(!cpu_odata) {
printf("Cannot allocate cpu_odata memory on CPU\n");
exit(-1);
}
// Allocate GPU (device) memory and variables
str_size = (int) STR_SIZE;
CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));
// Setup kernel execution parameters
dim3 grid(num_blocks,1,1);
dim3 threads(num_threads,1,1);
// Execute the kernel on the GPU
HelloWorld_kernel<<< grid, threads >>>(str_size,num_threads, gpu_odata);
// HelloWorld_kernel<<<num_blocks,num_threads>>>(str_size, gpu_odata);
// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// Copy result from GPU to CPU
CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));
// Stop the timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));
// Delete the timer
CUT_SAFE_CALL(cutDeleteTimer(timer));
// Output results is same as the expected solution
for(i=0;i<num_threads*num_blocks;i++) {
strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
printf("From thread %d: %s\n",i,string);/* HellowWorld Project
* This project demonstrates the basics on how to setup
* an example GPU Copmuting application.
*
* THis file contains the CPU (host) code.
*/
// Host defines
#define NUM_BLOCKS 32
#define NUM_THREADS 32
#define STR_SIZE 50
// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA includes
#include <cutil.h> // CUDA Utility Tools
// GPU Kernels declarations - declare as inlcude
#include <HelloWorld_kernel.cu>
//////////////////////
// Program main
//////////////////////
int main( int argc, char** argv)
{
// Host variables
int i,nBytes;
unsigned int timer;
unsigned int num_threads,num_blocks;
char *cpu_odata;
char *string;
// GPU variables
char *gpu_odata;
int str_size;
/////////////////////////////////////////////////////////////////////
// This routine gets the number of GPUs existing in the computer
// For each GPU (device) found, it checks to see if there is a GPU
// that supports CUDA. If no GPU that supports CUDA is found,
// the routine wll exit
/////////////////////////////////////////////////////////////////////
CUT_DEVICE_INIT();
/////////////////////////////////////////////////////////////////////
// Create and start a timer called "timer"
// alls to create ans start times are enveloped in the CUT_SAFE_CALL
// This CUDA Utility Tool checks for errors upon return.
// If an error is found, it prints out and error message, file name,
// and line number in file where the error can be found
/////////////////////////////////////////////////////////////////////
timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
// Initialize CPU variables and allocate required memory
num_threads = (unsigned int) NUM_THREADS;
num_blocks = (unsigned int) NUM_BLOCKS;
nBytes = num_blocks*num_threads*STR_SIZE*sizeof(char);
// Allocate and initialize CPU output vector
string = (char *) malloc(STR_SIZE);
if(!string) {
printf("Cannot allocate string memory on CPU\n");
exit(-1);
}
cpu_odata = (char *) malloc(nBytes);
if(!cpu_odata) {
printf("Cannot allocate cpu_odata memory on CPU\n");
exit(-1);
}
// Allocate GPU (device) memory and variables
str_size = (int) STR_SIZE;
CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));
// Setup kernel execution parameters
dim3 grid(num_blocks,1,1);
dim3 threads(num_threads,1,1);
// Execute the kernel on the GPU
HelloWorld_kernel<<< grid, threads >>>(str_size,num_threads, gpu_odata);
// HelloWorld_kernel<<<num_blocks,num_threads>>>(str_size, gpu_odata);
// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// Copy result from GPU to CPU
CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));
// Stop the timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));
// Delete the timer
CUT_SAFE_CALL(cutDeleteTimer(timer));
// Output results is same as the expected solution
for(i=0;i<num_threads*num_blocks;i++) {
strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
printf("From thread %d: %s\n",i,string);
}
//////////////////////////////////////////
// All done - clean up and exit
//////////////////////////////////////////
// Free up CPU memory
free(cpu_odata);
// Free up GPU memory
CUDA_SAFE_CALL(cudaFree(gpu_odata));
// Use CUDA Utility Tool to exit cleanly
CUT_EXIT(argc, argv);
}
}
//////////////////////////////////////////
// All done - clean up and exit
//////////////////////////////////////////
// Free up CPU memory
free(cpu_odata);
// Free up GPU memory
CUDA_SAFE_CALL(cudaFree(gpu_odata));
// Use CUDA Utility Tool to exit cleanly
CUT_EXIT(argc, argv);
}
======================
Kernel
======================
#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_
#include <stdio.h>
///////////////////////////////////////////////////////////
// Simple Hello World kernel
// @param gpu_odata output data in global memory
///////////////////////////////////////////////////////////
__global__ void HelloWorld_kernel(int size,int num_threads, char *gpu_odata)
{
int i,k,x,n,last_num;
int idx,not_done;
// Access thread id and block id
const unsigned int thread_idx = threadIdx.x;
const unsigned int block_idx = blockIdx.x;
// Write data to global memory
idx = 0;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'H';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'e';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'W';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'd';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'F';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'm';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'T';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'h';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'r';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'e';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'a';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'd';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
// Convert thread id to chars
// Determine number of places in thread idx
not_done = 1;
k = 10;
n = 1;
while(not_done == 1) {
x = thread_idx/k;
if (x>0) {
k = k*10;
n +=1;
}
else
not_done = 0;
}
// Parse out the thread index and convert to chars
k = k/10;
last_num = 0;
for(i=n;i>0;i--) {
x = thread_idx/k-last_num;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = '0' + x;
last_num = (thread_idx/k)*10;
k = k/10;
}
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'i';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'n';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'B';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'l';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'o';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'c';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = 'k';
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
// Convert block id to chars
// Determine number of places in thread idx
not_done = 1;
k = 10;
n = 1;
while(not_done == 1) {
x = block_idx/k;
if (x>0) {
k = k*10;
n +=1;
}
else
not_done = 0;
}
// Parse out the block index and convert to chars
k = k/10;
last_num = 0;
for(i=n;i>0;i--) {
x = block_idx/k-last_num;
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = '0' + x;
last_num = (block_idx/k)*10;
k = k/10;
}
// Fill out rest of string
for(i=idx;i<size;i++)
gpu_odata[block_idx*num_threads*size + thread_idx*size + idx++] = ' ';
}
#endif
http://developer.nvidia.com/dev_content/cg/cg_examples/images/sine_wave_perturbation_ogl.jpg
Second Assignment is to generate a 2d sine wave for the RK4 method.