OpenCL - مثال ضرب دو ماتریس با استفاده از CPU و GPU

در این مثال، روش ضرب دو ماتریس با استفاده از CPU و GPU (به کمک OpenCL) و مقایسهٔ زمان اجرای آن‌ها را مشاهده می‌کنید:

// You might need to change this header based on your install:
#include <CL/cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include <time.h>
#include <windows.h>

#define SUCCESS 0
#define FAILURE 1

using namespace std;

#pragma  comment(lib, "OpenCl.lib")

static void check_error(cl_int error, char* name) {
	if (error != CL_SUCCESS) {
		fprintf(stderr, "Non-successful return code %d for %s.  Exiting.\n", error, name);
		exit(1);
	}
}
//برای دیدن کد کامل، ادامه مطلب را ببینید
 
// Matrix dimensions. Every dimension is a multiple of 16 (coef * 16); the
// rest of the program assumes divisibility by 16.
const int coef = 100;

// A is hA x wA.
const int hA = 16 * coef;
const int wA = 16 * coef;

// B is hB x wB; the inner dimensions of the product must agree (hB == wA).
const int hB = wA;
const int wB = 16 * coef;

// C = A * B is hA x wB.
const int hC = hA;
const int wC = wB;

// Operands and result live in static storage, zero-initialized.
float A[hA][wA] = { 0 };
float B[hB][wB] = { 0 };
float C[hC][wC] = { 0 };

//in second
inline double StopWatch(int start0stop1 = 0, bool showMessage = false)
{
	static LARGE_INTEGER swFreq = { 0, 0 }, swStart, swStop;
	static const double TwoPow32 = pow(2.0, 32.0);
	if (!swFreq.LowPart)
		QueryPerformanceFrequency(&swFreq);
	double result = -1;
	if (start0stop1 == 0)	QueryPerformanceCounter(&swStart);
	else {
		QueryPerformanceCounter(&swStop);
		if (swFreq.LowPart == 0 && swFreq.HighPart == 0) return -1;
		else {
			result = (double)((swStop.HighPart - swStart.HighPart)*TwoPow32 + swStop.LowPart - swStart.LowPart);
			if (result < 0) result += TwoPow32;
			result /= (swFreq.LowPart + swFreq.HighPart*TwoPow32);
		}
		if (showMessage) {
			char s[25] = {0};
			sprintf(s, "Time (s): %.4f", result);
			MessageBox(NULL, s, "Elapsed Time", 0);
		}
	}
	return result;
}

void createMatrices()
{
	for (size_t i = 0; i < hA; i++)
		for (size_t j = 0; j < wA; j++)
		{
			A[i][j] = rand() / (float)RAND_MAX;
		}

	for (size_t i = 0; i < hB; i++)
		for (size_t j = 0; j < wB; j++)
		{
			B[i][j] = rand() / (float)RAND_MAX;
		}
}

/* convert the kernel file into a string */
int convertToString(const char *filename, std::string& s)
{
	size_t size;
	char*  str;
	std::fstream f(filename, (std::fstream::in | std::fstream::binary));

	if (f.is_open())
	{
		size_t fileSize;
		f.seekg(0, std::fstream::end);
		size = fileSize = (size_t)f.tellg();
		f.seekg(0, std::fstream::beg);
		str = new char[size + 1];
		if (!str)
		{
			f.close();
			return 0;
		}

		f.read(str, fileSize);
		f.close();
		str[size] = '\0';
		s = str;
		delete[] str;
		return 0;
	}
	cout << "Error: failed to open file\n:" << filename << endl;
	return FAILURE;
}


// CPU implementation of C = A * B on the global matrices, using the plain
// triple loop: every element of C is the dot product of one row of A with one
// column of B.
void cpu_multiply()
{
	for (int row = 0; row < hA; ++row) {
		for (int col = 0; col < wB; ++col) {
			// Accumulate directly into the destination element.
			float &cell = C[row][col];
			cell = 0;
			for (int inner = 0; inner < wA; ++inner) {
				cell += A[row][inner] * B[inner][col];
			}
		}
	}
}

void showResult()
{
	return;
	for (size_t i = 0; i < hA; i++) 
	{
		printf("\n");
		for (size_t j = 0; j < wA; j++)
		{
			printf(" %.2f", C[i][j]);
			//A[i][j] = rand() / (float)RAND_MAX;
		}
	}

	/*for (size_t i = 0; i < hB; i++)
		for (size_t j = 0; j < wB; j++)
		{
			B[i][j] = rand() / (float)RAND_MAX;
		}*/
}
int main(int argc, char const *argv[])
{
	srand(time(0));
	createMatrices();

	StopWatch(0);
	cpu_multiply();
	StopWatch(1, 1);

	showResult();
	printf("\n-------------------------------");
	//////////////////////////////////////////////////////
	//Step 1: Set Up Environment
	cl_int ciErrNum;
	// Use the first platform
	cl_platform_id platform;
	ciErrNum = clGetPlatformIDs(1, &platform, NULL);
	// Use the first device
	cl_device_id device;
	ciErrNum = clGetDeviceIDs( platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
	// Create the context
	cl_context ctx = clCreateContext( cps, 1, &device, NULL, NULL, &ciErrNum);
	// Create the command queue
	cl_command_queue myqueue = clCreateCommandQueue( ctx, device, 0, &ciErrNum);
	
	//////////////////////////////////////////////////////
	//Step 2: Declare Buffers and Move Data
	// We assume that A, B, C are float arrays which
	// have been declared and initialized
	// Allocate space for Matrix A on the device

	//////////////////////////////////////////////////////
	//Step 3: Runtime Kernel Compilation
	const char *filename = "MatMul_Kernel.cl";
	string sourceStr;
	int status = convertToString(filename, sourceStr);
	const char *source = sourceStr.c_str();
	size_t sourceSize[] = { strlen(source) };
	cl_program myprog = clCreateProgramWithSource(ctx, 1, &source, sourceSize, &ciErrNum);

	// Compile the program. Passing NULL for the �device_list’
	// argument targets all devices in the context
	ciErrNum = clBuildProgram(myprog, 0, NULL, NULL, NULL, NULL);
	// Create the kernel
	cl_kernel mykernel = clCreateKernel(
		myprog,
		"simpleMultiply",
		&ciErrNum);

	//////////////////////////////////////////////////////
	StopWatch(0);
	//both of the folowings are true 
#if 0
	cl_mem bufferA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, wA*hA * sizeof(float), NULL, &ciErrNum);
	// Copy Matrix A to the device
	ciErrNum = clEnqueueWriteBuffer(myqueue, bufferA, CL_TRUE, 0, wA*hA * sizeof(float), (void *)A, 0, NULL, NULL);
#else
	cl_mem bufferA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, wA*hA * sizeof(float), (void *)A, &ciErrNum);
#endif

	// Allocate space for Matrix B on the device
	cl_mem bufferB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, wB*hB * sizeof(float), NULL, &ciErrNum);
	// Copy Matrix B to the device
	ciErrNum = clEnqueueWriteBuffer(myqueue, bufferB, CL_TRUE, 0, wB*hB * sizeof(float), (void *)B, 0, NULL, NULL);
	// Allocate space for Matrix C on the device
	cl_mem bufferC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, hA*wB * sizeof(float), NULL, &ciErrNum);

	//Step 4: Run the Program
	// Set the kernel arguments
	clSetKernelArg(mykernel, 0, sizeof(cl_mem), (void *)&bufferC);
	clSetKernelArg(mykernel, 1, sizeof(cl_int), (void *)&wA);
	clSetKernelArg(mykernel, 2, sizeof(cl_int), (void *)&hA);
	clSetKernelArg(mykernel, 3, sizeof(cl_int), (void *)&wB);
	clSetKernelArg(mykernel, 4, sizeof(cl_int), (void *)&hB);
	clSetKernelArg(mykernel, 5, sizeof(cl_mem), (void *)&bufferA);
	clSetKernelArg(mykernel, 6, sizeof(cl_mem), (void *)&bufferB);
	// Set local and global workgroup sizes
	//We assume the matrix dimensions are divisible by 16
	size_t localws[2]  = { 16, 16 };
	size_t globalws[2] = { wC, hC };//global work size
	// Execute the kernel
	ciErrNum = clEnqueueNDRangeKernel(myqueue, mykernel, 2, NULL, globalws, localws, 0, NULL, NULL);
	//////////////////////////////////////////////////////
	//Step 5: Obtain Results to Host
	// Read the output data back to the host
	ciErrNum = clEnqueueReadBuffer(
		myqueue,
		bufferC,
		CL_TRUE,
		0,
		wC*hC*sizeof(float),
		(void *)C,
		0,
		NULL,
		NULL);

	StopWatch(1, 1);
	showResult();
	return 0;
}

کد کرنل - MatMul_Kernel.cl

// Matrix multiply kernel: each work-item computes one element of C = A * B.
//
// outputC:          result matrix, heightA x widthB, row-major.
// widthA, heightA:  dimensions of A.
// widthB, heightB:  dimensions of B (widthA = heightB for a valid
//                   multiplication; heightB itself is not read).
// inputA, inputB:   operand matrices, row-major, read-only.
//
// Global id 0 selects the column and global id 1 the row of the element.
__kernel void simpleMultiply(
	__global float* outputC,
	int widthA,
	int heightA,
	int widthB,
	int heightB,
	__global const float* inputA,
	__global const float* inputB) {
	// Get global position in Y direction
	const int row = (int)get_global_id(1);
	// Get global position in X direction
	const int col = (int)get_global_id(0);

	// Work-items outside the matrix (possible if the global size is rounded
	// up to a multiple of the work-group size) must not touch memory.
	if (row >= heightA || col >= widthB) {
		return;
	}

	// Calculate result of one element of Matrix C
	float sum = 0.0f;
	for (int i = 0; i < widthA; i++) {
		sum += inputA[row*widthA + i] * inputB[i*widthB + col];
	}
	outputC[row*widthB + col] = sum;
}

حسین خسروی

حسین خسروی

وبلاگ دانشگاهی حسین خسروی، عضو هیات علمی دانشگاه صنعتی شاهرود

برنامه نویسی C

مثالهای C

تمرینهای C

اسلایدهای C

شبکه های عصبی

تمرینهای شبکه عصبی

سیستم های چند پردازنده ای

مثالهای MultiCore

تمرینهای MultiCore

بینایی ماشین

تمرینهای بینایی ماشین

برنامه ریزی دروس

عمومی

برنامه نویسی پیشرفته

مثالهای ++C

تمرینهای ++C

ریزپردازنده

اسلایدهای ریزپردازنده

تمرینهای ریزپردازنده

متفرقه

OpenCL - مثال ضرب دو ماتریس با استفاده از CPU و GPU

کد کرنل - MatMul_Kernel.cl

CPU

GPU

Matrix Multiplication

OpenCL Programming

برنامه نویسی OpenCL

ضرب ماتریس با استفاده از چند پردازنده

پردازش موازی

نظرات (۰)

ارسال نظر