HELLO WORLD (CUDA)
Step 1:
cp -r template HelloWorld
NOTE: The CUDA utility documentation is in common/cutil_readme.txt
step 2:
compile using make (for testing)
Step 3:
Replace every occurrence of "template" with "HelloWorld" in the Makefile.
Open HelloWorld.cu and change the include of template_kernel.cu to:
// includes, kernels
#include <HelloWorld_kernel.cu>
NOTES:-
CUT_DEVICE_INIT performs all of the CUDA initialization.
CUDA_SAFE_CALL is the wrapper to use when calling CUDA routines.
This example is limited to 32 threads (NUM_THREADS is 32).
The kernel (HelloWorld_kernel.cu) executes on the GPU.
HelloWorld.cu and everything else run on the CPU.
Step 4:
Compile using make (should come up without any error)
NOTE: This default example uses the global memory area.
Allocating the buffer with cudaMalloc makes sure the data transfer is optimized.
Step 5:
Remove the reference code from the HelloWorld.cu file.
NOTE:
* h_odata - CPU data
* d_odata - GPU data
------------------------------------------------
Final Working program
------------------------------------------------
==========================
HelloWorld_Kernel
==========================
/*
* Copyright 2008, Karen Hains, UWA (University of Western Australia).
* All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.
*/
#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_
#include <stdio.h>
///////////////////////////////////////////////////////////
// Device helper: store one character in a thread's slot.
// @param slot  start of the calling thread's output slot
// @param size  number of bytes in the slot
// @param pos   next write position inside the slot
// @param c     character to store
// @return      the next write position (unchanged when the slot is full,
//              so nothing is ever written at or beyond slot[size])
///////////////////////////////////////////////////////////
__device__ int hw_put_char(char *slot, int size, int pos, char c)
{
    if (pos < size)
        slot[pos++] = c;
    return pos;
}

///////////////////////////////////////////////////////////
// Device helper: append a NUL-terminated text (the terminator itself is
// not copied). Returns the next write position.
///////////////////////////////////////////////////////////
__device__ int hw_put_text(char *slot, int size, int pos, const char *text)
{
    for (int i = 0; text[i] != '\0'; i++)
        pos = hw_put_char(slot, size, pos, text[i]);
    return pos;
}

///////////////////////////////////////////////////////////
// Device helper: append the decimal representation of value, most
// significant digit first. Returns the next write position.
///////////////////////////////////////////////////////////
__device__ int hw_put_uint(char *slot, int size, int pos, unsigned int value)
{
    // Largest power of ten that does not exceed value (1 for value < 10).
    // Testing "value / divisor >= 10" instead of multiplying first keeps
    // divisor from overflowing for large values.
    unsigned int divisor = 1;
    while (value / divisor >= 10)
        divisor *= 10;
    for (; divisor > 0; divisor /= 10)
        pos = hw_put_char(slot, size, pos, (char)('0' + (value / divisor) % 10));
    return pos;
}

///////////////////////////////////////////////////////////
// Simple Hello World kernel
//
// Each thread writes "Hello World From Thread <t> in Block <b>" into its
// own slot of gpu_odata and pads the rest of the slot with spaces.
// The slot is NOT NUL-terminated; the reader must add a terminator
// before treating it as a C string.
//
// The slot offset is threadIdx.x * size, i.e. it does not depend on
// blockIdx.x: threads with the same threadIdx.x in different blocks write
// to the same slot.
//
// @param size      number of bytes in each thread's slot; text that does
//                  not fit is truncated instead of overrunning the slot
// @param gpu_odata output data in global memory, at least
//                  blockDim.x * size bytes
///////////////////////////////////////////////////////////
__global__ void HelloWorld_kernel(int size, char *gpu_odata)
{
    // Nothing can be written into an empty slot or a null buffer
    if (size <= 0 || gpu_odata == 0)
        return;

    // Access thread id and block id
    const unsigned int thread_idx = threadIdx.x;
    const unsigned int block_idx = blockIdx.x;

    // Start of this thread's slot in global memory
    char *slot = gpu_odata + (size_t)thread_idx * (size_t)size;
    int pos = 0;

    // Write data to global memory
    pos = hw_put_text(slot, size, pos, "Hello World From Thread ");
    // Convert thread id to chars
    pos = hw_put_uint(slot, size, pos, thread_idx);
    pos = hw_put_text(slot, size, pos, " in Block ");
    // Convert block id to chars
    pos = hw_put_uint(slot, size, pos, block_idx);

    // Fill out rest of string
    while (pos < size)
        slot[pos++] = ' ';
}
#endif // #ifndef _HELLOWORLD_KERNEL_H_
==========================
HelloWorld.cu
==========================
/*
* Copyright 2008, Karen Hains, UWA . All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.
*/
/* HelloWorld Project
* This project demonstrates the basics of how to set up
* an example GPU Computing application.
*
* This file contains the CPU (host) code.
*/
// Host defines
#define NUM_THREADS 32
#define STR_SIZE 50
// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA includes
#include <cutil.h> // CUDA Utility Tools
// GPU kernel declarations - pulled in with #include
#include <HelloWorld_kernel.cu>
//////////////////////
// Program main
//
// Launches HelloWorld_kernel with a single block of NUM_THREADS threads,
// copies the NUM_THREADS messages of STR_SIZE bytes each back to the
// host, and prints them together with the measured processing time.
//////////////////////
int main( int argc, char** argv)
{
    // Host variables
    int i;
    size_t nBytes;
    unsigned int timer;
    unsigned int num_threads;
    char *cpu_odata;
    char *string;
    // GPU variables
    char *gpu_odata = NULL;
    int str_size;

    /////////////////////////////////////////////////////////////////////
    // This routine gets the number of GPUs existing in the computer.
    // For each GPU (device) found, it checks to see if there is a GPU
    // that supports CUDA. If no GPU that supports CUDA is found,
    // the routine will exit.
    // NOTE(review): some cutil releases declare this macro as
    // CUT_DEVICE_INIT(argc, argv) - confirm against the installed cutil.h.
    /////////////////////////////////////////////////////////////////////
    CUT_DEVICE_INIT();

    /////////////////////////////////////////////////////////////////////
    // Create and start a timer called "timer".
    // Calls to create and start timers are enveloped in CUT_SAFE_CALL.
    // This CUDA Utility Tool checks for errors upon return.
    // If an error is found, it prints out an error message, file name,
    // and line number in the file where the error can be found.
    /////////////////////////////////////////////////////////////////////
    timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    // Initialize CPU variables and allocate required memory
    num_threads = (unsigned int) NUM_THREADS;
    nBytes = (size_t) num_threads * STR_SIZE * sizeof(char);

    // Allocate the CPU scratch string. One extra byte is reserved for the
    // NUL terminator: the kernel pads each message with spaces and never
    // writes a terminator itself.
    string = (char *) malloc(STR_SIZE + 1);
    if(!string) {
        printf("Cannot allocate string memory on CPU\n");
        exit(-1);
    }
    // Allocate the CPU output vector
    cpu_odata = (char *) malloc(nBytes);
    if(!cpu_odata) {
        printf("Cannot allocate cpu_odata memory on CPU\n");
        free(string);
        exit(-1);
    }

    // Allocate GPU (device) memory and variables
    str_size = (int) STR_SIZE;
    CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

    // Setup kernel execution parameters: one block of num_threads threads
    dim3 grid(1,1,1);
    dim3 threads(num_threads,1,1);

    // Execute the kernel on the GPU
    HelloWorld_kernel<<< grid, threads >>>(str_size, gpu_odata);

    // Check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // Copy result from GPU to CPU
    CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

    // Stop the timer
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

    // Delete the timer
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    // Print the message produced by each thread
    for(i=0;i<(int)num_threads;i++) {
        strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
        // strncpy does not terminate the copy when the source holds no NUL
        // within STR_SIZE bytes, so terminate explicitly before printing.
        string[STR_SIZE] = '\0';
        printf("From thread %d: %s\n",i,string);
    }

    //////////////////////////////////////////
    // All done - clean up and exit
    //////////////////////////////////////////
    // Free up CPU memory
    free(string);
    free(cpu_odata);

    // Free up GPU memory
    CUDA_SAFE_CALL(cudaFree(gpu_odata));

    // Use CUDA Utility Tool to exit cleanly
    CUT_EXIT(argc, argv);
}
==========================
Makefile
==========================
################################################################################
#
# Build script for project
#
################################################################################
# Add source files here
EXECUTABLE := HelloWorld
# CUDA source files (compiled with the CUDA compiler)
CUFILES := HelloWorld.cu
# CUDA dependency files (kernel sources #included by the CUFILES).
# No trailing backslash: a dangling continuation would pull the next
# line into this assignment.
CU_DEPS := HelloWorld_kernel.cu
# C/C++ source files (compiled with gcc / c++) - none in this project
CCFILES :=
################################################################################
# Rules and targets
include ../../common/common.mk
Ref : Western Australian Super computer Program, University of Western Australia , 2008, April, 1- 3, 12-2 pm.
Step 1:
cp -r template HelloWorld
NOTE: The CUDA utility documentation is in common/cutil_readme.txt
step 2:
compile using make (for testing)
Step 3:
Replace every occurrence of "template" with "HelloWorld" in the Makefile.
Open HelloWorld.cu and change the include of template_kernel.cu to:
// includes, kernels
#include <HelloWorld_kernel.cu>
NOTES:-
CUT_DEVICE_INIT performs all of the CUDA initialization.
CUDA_SAFE_CALL is the wrapper to use when calling CUDA routines.
This example is limited to 32 threads (NUM_THREADS is 32).
The kernel (HelloWorld_kernel.cu) executes on the GPU.
HelloWorld.cu and everything else run on the CPU.
Step 4:
Compile using make (should come up without any error)
NOTE: This default example uses the global memory area.
Allocating the buffer with cudaMalloc makes sure the data transfer is optimized.
Step 5:
Remove the reference code from the HelloWorld.cu file.
NOTE:
* h_odata - CPU data
* d_odata - GPU data
------------------------------------------------
Final Working program
------------------------------------------------
==========================
HelloWorld_Kernel
==========================
/*
* Copyright 2008, Karen Hains, UWA (University of Western Australia).
* All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.
*/
#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_
#include <stdio.h>
///////////////////////////////////////////////////////////
// Device helper: store one character in a thread's slot.
// @param slot  start of the calling thread's output slot
// @param size  number of bytes in the slot
// @param pos   next write position inside the slot
// @param c     character to store
// @return      the next write position (unchanged when the slot is full,
//              so nothing is ever written at or beyond slot[size])
///////////////////////////////////////////////////////////
__device__ int hw_put_char(char *slot, int size, int pos, char c)
{
    if (pos < size)
        slot[pos++] = c;
    return pos;
}

///////////////////////////////////////////////////////////
// Device helper: append a NUL-terminated text (the terminator itself is
// not copied). Returns the next write position.
///////////////////////////////////////////////////////////
__device__ int hw_put_text(char *slot, int size, int pos, const char *text)
{
    for (int i = 0; text[i] != '\0'; i++)
        pos = hw_put_char(slot, size, pos, text[i]);
    return pos;
}

///////////////////////////////////////////////////////////
// Device helper: append the decimal representation of value, most
// significant digit first. Returns the next write position.
///////////////////////////////////////////////////////////
__device__ int hw_put_uint(char *slot, int size, int pos, unsigned int value)
{
    // Largest power of ten that does not exceed value (1 for value < 10).
    // Testing "value / divisor >= 10" instead of multiplying first keeps
    // divisor from overflowing for large values.
    unsigned int divisor = 1;
    while (value / divisor >= 10)
        divisor *= 10;
    for (; divisor > 0; divisor /= 10)
        pos = hw_put_char(slot, size, pos, (char)('0' + (value / divisor) % 10));
    return pos;
}

///////////////////////////////////////////////////////////
// Simple Hello World kernel
//
// Each thread writes "Hello World From Thread <t> in Block <b>" into its
// own slot of gpu_odata and pads the rest of the slot with spaces.
// The slot is NOT NUL-terminated; the reader must add a terminator
// before treating it as a C string.
//
// The slot offset is threadIdx.x * size, i.e. it does not depend on
// blockIdx.x: threads with the same threadIdx.x in different blocks write
// to the same slot.
//
// @param size      number of bytes in each thread's slot; text that does
//                  not fit is truncated instead of overrunning the slot
// @param gpu_odata output data in global memory, at least
//                  blockDim.x * size bytes
///////////////////////////////////////////////////////////
__global__ void HelloWorld_kernel(int size, char *gpu_odata)
{
    // Nothing can be written into an empty slot or a null buffer
    if (size <= 0 || gpu_odata == 0)
        return;

    // Access thread id and block id
    const unsigned int thread_idx = threadIdx.x;
    const unsigned int block_idx = blockIdx.x;

    // Start of this thread's slot in global memory
    char *slot = gpu_odata + (size_t)thread_idx * (size_t)size;
    int pos = 0;

    // Write data to global memory
    pos = hw_put_text(slot, size, pos, "Hello World From Thread ");
    // Convert thread id to chars
    pos = hw_put_uint(slot, size, pos, thread_idx);
    pos = hw_put_text(slot, size, pos, " in Block ");
    // Convert block id to chars
    pos = hw_put_uint(slot, size, pos, block_idx);

    // Fill out rest of string
    while (pos < size)
        slot[pos++] = ' ';
}
#endif // #ifndef _HELLOWORLD_KERNEL_H_
==========================
HelloWorld.cu
==========================
/*
* Copyright 2008, Karen Hains, UWA . All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.
*/
/* HelloWorld Project
* This project demonstrates the basics of how to set up
* an example GPU Computing application.
*
* This file contains the CPU (host) code.
*/
// Host defines
#define NUM_THREADS 32
#define STR_SIZE 50
// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA includes
#include <cutil.h> // CUDA Utility Tools
// GPU kernel declarations - pulled in with #include
#include <HelloWorld_kernel.cu>
//////////////////////
// Program main
//
// Launches HelloWorld_kernel with a single block of NUM_THREADS threads,
// copies the NUM_THREADS messages of STR_SIZE bytes each back to the
// host, and prints them together with the measured processing time.
//////////////////////
int main( int argc, char** argv)
{
    // Host variables
    int i;
    size_t nBytes;
    unsigned int timer;
    unsigned int num_threads;
    char *cpu_odata;
    char *string;
    // GPU variables
    char *gpu_odata = NULL;
    int str_size;

    /////////////////////////////////////////////////////////////////////
    // This routine gets the number of GPUs existing in the computer.
    // For each GPU (device) found, it checks to see if there is a GPU
    // that supports CUDA. If no GPU that supports CUDA is found,
    // the routine will exit.
    // NOTE(review): some cutil releases declare this macro as
    // CUT_DEVICE_INIT(argc, argv) - confirm against the installed cutil.h.
    /////////////////////////////////////////////////////////////////////
    CUT_DEVICE_INIT();

    /////////////////////////////////////////////////////////////////////
    // Create and start a timer called "timer".
    // Calls to create and start timers are enveloped in CUT_SAFE_CALL.
    // This CUDA Utility Tool checks for errors upon return.
    // If an error is found, it prints out an error message, file name,
    // and line number in the file where the error can be found.
    /////////////////////////////////////////////////////////////////////
    timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    // Initialize CPU variables and allocate required memory
    num_threads = (unsigned int) NUM_THREADS;
    nBytes = (size_t) num_threads * STR_SIZE * sizeof(char);

    // Allocate the CPU scratch string. One extra byte is reserved for the
    // NUL terminator: the kernel pads each message with spaces and never
    // writes a terminator itself.
    string = (char *) malloc(STR_SIZE + 1);
    if(!string) {
        printf("Cannot allocate string memory on CPU\n");
        exit(-1);
    }
    // Allocate the CPU output vector
    cpu_odata = (char *) malloc(nBytes);
    if(!cpu_odata) {
        printf("Cannot allocate cpu_odata memory on CPU\n");
        free(string);
        exit(-1);
    }

    // Allocate GPU (device) memory and variables
    str_size = (int) STR_SIZE;
    CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

    // Setup kernel execution parameters: one block of num_threads threads
    dim3 grid(1,1,1);
    dim3 threads(num_threads,1,1);

    // Execute the kernel on the GPU
    HelloWorld_kernel<<< grid, threads >>>(str_size, gpu_odata);

    // Check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // Copy result from GPU to CPU
    CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

    // Stop the timer
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

    // Delete the timer
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    // Print the message produced by each thread
    for(i=0;i<(int)num_threads;i++) {
        strncpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
        // strncpy does not terminate the copy when the source holds no NUL
        // within STR_SIZE bytes, so terminate explicitly before printing.
        string[STR_SIZE] = '\0';
        printf("From thread %d: %s\n",i,string);
    }

    //////////////////////////////////////////
    // All done - clean up and exit
    //////////////////////////////////////////
    // Free up CPU memory
    free(string);
    free(cpu_odata);

    // Free up GPU memory
    CUDA_SAFE_CALL(cudaFree(gpu_odata));

    // Use CUDA Utility Tool to exit cleanly
    CUT_EXIT(argc, argv);
}
==========================
Makefile
==========================
################################################################################
#
# Build script for project
#
################################################################################
# Add source files here
EXECUTABLE := HelloWorld
# CUDA source files (compiled with the CUDA compiler)
CUFILES := HelloWorld.cu
# CUDA dependency files (kernel sources #included by the CUFILES).
# No trailing backslash: a dangling continuation would pull the next
# line into this assignment.
CU_DEPS := HelloWorld_kernel.cu
# C/C++ source files (compiled with gcc / c++) - none in this project
CCFILES :=
################################################################################
# Rules and targets
include ../../common/common.mk
Ref : Western Australian Super computer Program, University of Western Australia , 2008, April, 1- 3, 12-2 pm.
4 comments:
too many typos... skip this page if you can..
Meh. Not that big a deal. Replace &lt; with < and &gt; with >, replace rsize with size, and add argc, argv to the init call. Do all that and it's a pretty decent example.
thats & l t ; with < and & g t ; with >
Who knows where to download XRumer 5.0 Palladium?
Help, please. All recommend this program to effectively advertise on the Internet, this is the best program!
Post a Comment