Search This Blog

Thursday, April 3, 2008

First Hello World program from CUDA

HELLO WORLD (CUDA)


Step 1:

cp -r template HelloWorld

NOTE: the CUDA utility (cutil) documentation is in common/cutil_readme.txt

step 2:

compile using make (for testing)

Step 3:

Replace every occurrence of "template" with "HelloWorld" in the Makefile.

Open HelloWorld.cu and change the include of template_kernel.cu to:

// includes, kernels
#include <HelloWorld_kernel.cu>

NOTES:-

CUT_DEVICE_INIT performs all of the CUDA device initialization.

CUDA_SAFE_CALL wraps calls to CUDA routines and checks them for errors.

Limitation is up to 32 threads


The kernel (HelloWorld_kernel.cu) executes on the GPU.

Everything else (HelloWorld.cu) runs on the CPU.


Step 4:

Compile using make (should come up without any error)

NOTE: this default example uses the GPU's global memory.

Allocating the device buffer with cudaMalloc makes sure the data transfer is optimized.


Step 5:

remove the reference code from the HelloWorld.cu file

NOTE:

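* h_odata - CPU (host) data (named cpu_odata in the final program below)
* d_odata - GPU (device) data (named gpu_odata in the final program below)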

------------------------------------------------
Final Working program
------------------------------------------------

==========================
HelloWorld_Kernel
==========================

/*
* Copyright 2008, Karen Hains, UWA (University of Western Australia).
* All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.
*/

#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_

#include <stdio.h>


///////////////////////////////////////////////////////////
// Device helper: append one character to a row.
// @param row   start of the calling thread's output row
// @param size  capacity of the row in chars
// @param pos   next free position in the row
// @param c     character to store
// @return the updated position; the character is dropped (and pos is
//         returned unchanged) when the row is already full
///////////////////////////////////////////////////////////
__device__ int hw_put_char(char *row, int size, int pos, char c)
{
    if (pos < size)
        row[pos++] = c;
    return pos;
}

///////////////////////////////////////////////////////////
// Device helper: append an unsigned value as decimal digits
// (no padding, no sign; 0 is written as "0").
// @param row   start of the calling thread's output row
// @param size  capacity of the row in chars
// @param pos   next free position in the row
// @param value number to write
// @return the updated position
///////////////////////////////////////////////////////////
__device__ int hw_put_uint(char *row, int size, int pos, unsigned int value)
{
    // Largest power of ten that does not exceed value (1 when value < 10).
    unsigned int divisor = 1;
    while (value / divisor >= 10)
        divisor *= 10;

    // Emit the digits from most significant to least significant.
    for (; divisor > 0; divisor /= 10)
        pos = hw_put_char(row, size, pos, (char)('0' + (value / divisor) % 10));

    return pos;
}

///////////////////////////////////////////////////////////
// Simple Hello World kernel
//
// Each thread writes "Hello World From Thread <t> in Block <b>" into its own
// row of gpu_odata and pads the remainder of the row with spaces.
//
// @param size      number of chars reserved for each thread's row
// @param gpu_odata output data in global memory; row t starts at
//                  gpu_odata[t*size] where t is threadIdx.x
//
// Notes:
// - Rows are selected by threadIdx.x only, so every block writes to the same
//   rows; the host code in this example launches a single block.
// - No terminating NUL is written: a row is exactly `size` chars. The reader
//   must add its own terminator before treating a row as a C string.
// - Text that does not fit in `size` chars is truncated instead of spilling
//   into the next thread's row.
///////////////////////////////////////////////////////////
__global__ void HelloWorld_kernel(int size, char *gpu_odata)
{
    // Access thread id and block id
    const unsigned int thread_idx = threadIdx.x;
    const unsigned int block_idx = blockIdx.x;

    // Nothing can be stored in an empty (or negative-sized) row.
    if (size <= 0)
        return;

    // This thread's row in global memory and the write position inside it
    char *row = gpu_odata + thread_idx*size;
    int idx = 0;

    // "Hello World From Thread "
    idx = hw_put_char(row, size, idx, 'H');
    idx = hw_put_char(row, size, idx, 'e');
    idx = hw_put_char(row, size, idx, 'l');
    idx = hw_put_char(row, size, idx, 'l');
    idx = hw_put_char(row, size, idx, 'o');
    idx = hw_put_char(row, size, idx, ' ');
    idx = hw_put_char(row, size, idx, 'W');
    idx = hw_put_char(row, size, idx, 'o');
    idx = hw_put_char(row, size, idx, 'r');
    idx = hw_put_char(row, size, idx, 'l');
    idx = hw_put_char(row, size, idx, 'd');
    idx = hw_put_char(row, size, idx, ' ');
    idx = hw_put_char(row, size, idx, 'F');
    idx = hw_put_char(row, size, idx, 'r');
    idx = hw_put_char(row, size, idx, 'o');
    idx = hw_put_char(row, size, idx, 'm');
    idx = hw_put_char(row, size, idx, ' ');
    idx = hw_put_char(row, size, idx, 'T');
    idx = hw_put_char(row, size, idx, 'h');
    idx = hw_put_char(row, size, idx, 'r');
    idx = hw_put_char(row, size, idx, 'e');
    idx = hw_put_char(row, size, idx, 'a');
    idx = hw_put_char(row, size, idx, 'd');
    idx = hw_put_char(row, size, idx, ' ');

    // Thread index as decimal digits
    idx = hw_put_uint(row, size, idx, thread_idx);

    // " in Block "
    idx = hw_put_char(row, size, idx, ' ');
    idx = hw_put_char(row, size, idx, 'i');
    idx = hw_put_char(row, size, idx, 'n');
    idx = hw_put_char(row, size, idx, ' ');
    idx = hw_put_char(row, size, idx, 'B');
    idx = hw_put_char(row, size, idx, 'l');
    idx = hw_put_char(row, size, idx, 'o');
    idx = hw_put_char(row, size, idx, 'c');
    idx = hw_put_char(row, size, idx, 'k');
    idx = hw_put_char(row, size, idx, ' ');

    // Block index as decimal digits
    idx = hw_put_uint(row, size, idx, block_idx);

    // Fill out rest of the row with spaces
    while (idx < size)
        row[idx++] = ' ';
}

#endif // #ifndef _HELLOWORLD_KERNEL_H_


==========================

HelloWorld.cu

==========================

/*

* Copyright 2008, Karen Hains, UWA . All rights reserved.

*

* NOTICE TO USER:

*

* This source code is subject to NVIDIA ownership rights under U.S. and

* international Copyright laws. Users and possessors of this source code

* are hereby granted a nonexclusive, royalty-free license to use this code

* in individual and commercial software.

*

* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

* IMPLIED WARRANTY OF ANY KIND.

*/

/* HelloWorld Project
* This project demonstrates the basics of how to set up
* an example GPU Computing application.
*
* This file contains the CPU (host) code.
*/

// Host defines
#define NUM_THREADS 32
#define STR_SIZE 50

// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// CUDA includes
#include <cutil.h> // CUDA Utility Tools

// GPU kernel declarations - pulled in with an include
#include <HelloWorld_kernel.cu>

//////////////////////
// Program main
//////////////////////
// Host entry point: launches HelloWorld_kernel with one block of NUM_THREADS
// threads, copies the per-thread messages back to the CPU and prints them,
// together with the elapsed time measured by a cutil timer.
//
// @param argc  command-line argument count (forwarded to CUT_EXIT)
// @param argv  command-line arguments (forwarded to CUT_EXIT)
int main( int argc, char** argv)
{

    // Host variables
    int i,nBytes;
    unsigned int timer;
    unsigned int num_threads;
    char *cpu_odata;   // host copy of all rows (num_threads * STR_SIZE chars)
    char *string;      // NUL-terminated scratch copy of a single row

    // GPU variables
    char *gpu_odata;   // device buffer with the same layout as cpu_odata
    int str_size;

    /////////////////////////////////////////////////////////////////////
    // This routine gets the number of GPUs existing in the computer.
    // For each GPU (device) found, it checks to see if there is a GPU
    // that supports CUDA. If no GPU that supports CUDA is found,
    // the routine will exit.
    // NOTE(review): some cutil releases appear to declare this macro as
    // CUT_DEVICE_INIT(argc, argv) - confirm against the installed cutil.h
    /////////////////////////////////////////////////////////////////////
    CUT_DEVICE_INIT();

    /////////////////////////////////////////////////////////////////////
    // Create and start a timer called "timer".
    // Calls to create and start timers are enveloped in CUT_SAFE_CALL.
    // This CUDA Utility Tool checks for errors upon return.
    // If an error is found, it prints out an error message, the file name,
    // and the line number in the file where the error can be found.
    /////////////////////////////////////////////////////////////////////
    timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    // Initialize CPU variables and compute the size of the output buffer
    num_threads = (unsigned int) NUM_THREADS;
    nBytes = (int) (num_threads*STR_SIZE*sizeof(char));

    // Allocate the scratch string. The kernel fills each row with exactly
    // STR_SIZE chars and no terminator, so reserve one extra byte for the
    // NUL that printf("%s") needs.
    string = (char *) malloc(STR_SIZE + 1);
    if(!string) {
        printf("Cannot allocate string memory on CPU\n");
        exit(-1);
    }

    // Allocate the CPU output vector
    cpu_odata = (char *) malloc(nBytes);
    if(!cpu_odata) {
        printf("Cannot allocate cpu_odata memory on CPU\n");
        free(string);
        exit(-1);
    }

    // Allocate GPU (device) memory and variables
    str_size = (int) STR_SIZE;
    CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

    // Setup kernel execution parameters: a single block of num_threads threads
    dim3 grid(1,1,1);
    dim3 threads(num_threads,1,1);

    // Execute the kernel on the GPU
    HelloWorld_kernel<<< grid, threads >>>(str_size, gpu_odata);

    // Check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // Copy result from GPU to CPU
    CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

    // Stop the timer
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

    // Delete the timer
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    // Print the message produced by each thread
    for(i=0;i<(int) num_threads;i++) {
        strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
        // strncpy does not terminate when the source holds STR_SIZE chars
        string[STR_SIZE] = '\0';
        printf("From thread %d: %s\n",i,string);
    }


    //////////////////////////////////////////
    // All done - clean up and exit
    //////////////////////////////////////////
    // Free up CPU memory
    free(string);
    free(cpu_odata);

    // Free up GPU memory
    CUDA_SAFE_CALL(cudaFree(gpu_odata));

    // Use CUDA Utility Tool to exit cleanly
    CUT_EXIT(argc, argv);
}

==========================
Makefile

==========================

################################################################################
#
# Build script for the HelloWorld project
#
################################################################################

# Name of the executable to build
EXECUTABLE := HelloWorld

# CUDA source files (compiled by the CUDA compiler)
CUFILES := HelloWorld.cu

# CUDA dependency files (HelloWorld.cu pulls this one in with #include)
CU_DEPS := HelloWorld_kernel.cu

# C/C++ source files (compiled with gcc / c++) - none for this project
CCFILES :=

################################################################################
# Rules and targets

include ../../common/common.mk


Ref : Western Australian Super computer Program, University of Western Australia , 2008, April, 1- 3, 12-2 pm.

4 comments:

Anonymous said...

too many typos... skip this page if you can..

Anonymous said...

Meh. Not that big a deal. Replace &lt; with < and &gt; with >, replace rsize with size, and add argc, argv to the init call. Do all that and it's a pretty decent example.

Anonymous said...

That's &lt; with < and &gt; with >.

Anonymous said...

Who knows where to download XRumer 5.0 Palladium?
Help, please. All recommend this program to effectively advertise on the Internet, this is the best program!