Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions src/hamc/MatrixAdd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@
#define ushort unsigned short

__global__ void MatrixAdd(ushort *A, ushort *B, ushort *C,
int height, int width,) {
int ROW = blockIdx.y*blockDim.y + threadIdx.y;
int height, int width,)
int ROW = blockIdx.y*blockDim.y + threadIdx.y;

int COL = blockIdx.x*blockDim.x + threadIdx.x;
int COL = blockIdx.x*blockDim.x + threadIdx.x;

if((ROW < height) && (COL < width)){
int address = ROW*width+COL;
C[address] = A[i] ^ B[i];
}
int index = row * N + col;

if( (ROW < numARows) && (COL < numAColumns) )
{
C[index] = A[index] ^ B[index];

}
}
31 changes: 8 additions & 23 deletions src/tests/MatrixAdd/MatrixAdd_cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,8 @@
#define mat_element(mat, cols, row_idx, col_idx) \
mat[(row_idx * cols) + col_idx]

//initialize the matrix
ushort mat_init(int rows, int cols)
{
if(rows <= 0 || cols <= 0)
{
return NULL;
}
ushort A;
A = (ushort)safe_malloc(sizeof(struct matrix));
A->cols = cols;
A->rows = rows;
A->data = (ushort *)safe_malloc(rows*cols*sizeof(ushort));
return A;
}

//Set the value of matix element at position given by the indices to "val"
void set_matrix_element(ushort A, int row_idx, int col_idx, ushort val)
void set_matrix_element(bin_matrix A, int row_idx, int col_idx, bin_matrix val)
{
if(row_idx < 0 || row_idx >= A->rows || col_idx < 0 || col_idx >= A->cols)
{
Expand All @@ -33,14 +18,14 @@ void set_matrix_element(ushort A, int row_idx, int col_idx, ushort val)
mat_element(A, row_idx, col_idx) = val;
}

ushort matrix_add(ushort *A, ushort *B)
bin_matrix MatrixAdd_cpu(bin_matrix *A, bin_matrix *B)
{
if(A->rows != B->rows || A->cols != B->cols)
{
printf("Incompatible dimensions for matrix addition. \n");
exit(0);
}
ushort temp mat_init(A->rows, A->cols);
// if(A->rows != B->rows || A->cols != B->cols)
// {
// printf("Incompatible dimensions for matrix addition. \n");
// exit(0);
// }
// bin_matrix temp mat_init(A->rows, A->cols);
for(int i = 0; i < A->rows; i++)
{
for(int j = 0; j < A->cols; j++)
Expand Down
5 changes: 2 additions & 3 deletions src/tests/MatrixAdd/MatrixAdd_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ extern "C" {
#define ushort unsigned short

//FUNCTIONS go in here
ushort mat_init(int rows, int cols);
void set_matrix_element(ushort A, int row_idx, int col_idx, ushort val);
ushort add_matrix(ushort *A, ushort *B);
void set_matrix_element(bin_matrix A, int row_idx, int col_idx, bin_matrix val);
bin_matrix MatrixAdd_cpu(bin_matrix *A, bin_matrix *B);


#ifdef __cplusplus
Expand Down
149 changes: 99 additions & 50 deletions src/tests/MatrixAdd/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "../../hamc/MatrixAdd.cu"

#define TILE_WIDTH 16
#define ushort unsigned short

#define CUDA_CHECK(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
Expand All @@ -23,6 +24,28 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
}
}

#define mat_element(mat, row_idx, col_idx) \
mat->data[row_idx * (mat->cols) + col_idx]

typedef struct matrix
{
int rows;
int cols;
ushort *data;

}*bin_matrix;

void* safe_malloc(size_t n)
{
void* p = malloc(n);
if (!p)
{
fprintf(stderr, "Out of memory(%lu bytes)\n",(size_t)n);
exit(EXIT_FAILURE);
}
return p;
}

void printHelp()
{
printf("run this executable with the following flags\n");
Expand All @@ -32,30 +55,85 @@ void printHelp()
printf("\t-s <solution file name>\n");
}

bin_matrix run_cpu(bin_matrix A, bin_matrix B)
{
if (A->rows != B->rows || A->cols != B->cols){
printf("Matrices are incompatible, check dimensions...\n");
exit(0);
}

return MatrixAdd_cpu(A, B);
}

void run_cpu(const char *in, const char*sol)
bin_matrix run_kernel(bin_matrix A, bin_matrix B)
{
int numARows, numAColumns;
ushort *hostA = (ushort *)wbImport(in, &numARows, &numAColumns);
ushort *hostC = (ushort *)malloc(numARows*numAColumns * sizeof(ushort));
if (A->rows != B->rows || A->cols != B->cols){
printf("Matrices are incompatible, check dimensions...\n");
exit(0);
}

ushort *deviceA;
ushort *deviceB;
ushort *deviceC;

/* allocate the memory space on GPU */
wbTime_start(GPU, "Allocating GPU memory.");
cudaMalloc((void **) &deviceA, A->cols * A->rows * sizeof(ushort));
cudaMalloc((void **) &deviceB, B->cols * B->rows * sizeof(ushort));
cudaMalloc((void **) &deviceC, B->cols * A->rows * sizeof(ushort));
wbTime_stop(GPU, "Allocating GPU memory.");


cudaMemcpy(deviceA, A->data, A->cols * A->rows * sizeof(ushort), cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, B->data, B->cols * B->rows * sizeof(ushort), cudaMemcpyHostToDevice);

dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);
int x_blocks = ((B->cols - 1)/TILE_WIDTH) + 1;
int y_blocks = ((A->rows - 1)/TILE_WIDTH) + 1;
dim3 DimGrid(x_blocks, y_blocks, 1);

matrixAdd<<<DimGrid, DimBlock>>>(deviceA, deviceB, deviceC, A->rows, A->cols);

cudaDeviceSynchronize();

wbTime_start(Copy, "Copying output memory to the CPU");
cudaMemcpy(C->data, deviceC, B->cols * A->rows * sizeof(ushort), cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");

wbTime_start(GPU, "Freeing GPU Memory");
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");

matrix_add(hostA, hostC, numARows, numAColumns);
return C;
}


int main(int argc, char *argv[])
{
printf("MatrixAdd test:\n");

// Variable - Matrices (Device)
wbArg_t args;

ushort *hostA; // The A matrix
ushort *hostC; // The output C matrix
ushort *deviceA; // A matrix on device
ushort *deviceB; // B matrix on device (copy of A)
ushort *deviceC; // C matrix on device
bin_matrix A;
bin_matrix B;
bin_matrix C;

// Variable - Matrices (Host)
ushort *hostA;
ushort *hostB;
ushort *hostC;

// Variables - Rows & Cols
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A

int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C
int numCColumns; // number of columns in the matrix C

//Inputs
char *inputFileName;
char *solutionFileName;

Expand Down Expand Up @@ -97,49 +175,20 @@ int main(int argc, char *argv[])

/* allocate host data for matrix */
wbTime_start(Generic, "Importing data and creating memory on host");

hostA = (ushort *)wbImport(inputFileName, &numARows, &numAColumns);
int numBRows = numARows; // number of rows in the matrix B
int numBColumns = numAColumns; // number of columns in the matrix B
int numCRows = numARows; // number of rows in the matrix C
int numCColumns = numAColumns; // number of columns in the matrix C
A = mat_init(numARows, numACols);
A->data = hostA;

hostB = (ushort *)wbImport(inputFileName, &numBRows, &numBColumns);
B = mat_init(numARows, numACols);
B->data = hostB;

hostC = (ushort *)malloc(numCRows*numCColumns * sizeof(ushort));

wbTime_stop(Generic, "Importing data and creating memory on host");


wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);


/* allocate the memory space on GPU */
wbTime_start(GPU, "Allocating GPU memory.");
cudaMalloc((void**) &deviceA, numARows * numAColumns * sizeof(ushort));
cudaMalloc((void**) &deviceB, numBRows * numBColumns * sizeof(ushort));
cudaMalloc((void**) &deviceC, numCRows * numCColumns * sizeof(ushort));
wbTime_stop(GPU, "Allocating GPU memory.");


dim3 dimGrid((numCColumns - 1) / 16 + 1, (numCRows - 1) / TILE_WIDTH + 1, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

/* call CUDA kernel to perform computations */
wbTime_start(Compute, "Performing CUDA computation for RREF");
MatrixAdd_kernel<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC, numARows);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");



wbTime_start(Copy, "Copying output memory to the CPU");
cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(ushort), cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");

/* Free GPU Memory */
wbTime_start(GPU, "Freeing GPU Memory");
cudaFree(deviceA);
cudaFree(deviceC);
wbTime_stop(GPU, "Freeing GPU Memory");

wbSolution(args, hostC, numCRows, numCColumns);

free(hostA);
free(hostB);
free(hostC);
Expand Down
4 changes: 2 additions & 2 deletions src/tests/MatrixAdd/sources.cmake
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
add_lab("ADD_MATRIX_test")
add_lab_solution("ADD_MATRIX_test", ${CMAKE_CURRENT_LIST_DIR}/main.cu)
add_generator("ADD_MATRIX_test" ${CMAKE_CURRENT_LIST_DIR}/dataset_generator.cpp)
add_lab_solution("ADD_MATRIX_test" ${CMAKE_CURRENT_LIST_DIR}/main.cu)
#add_generator("ADD_MATRIX_test", ${CMAKE_CURRENT_LIST_DIR}/dataset_generator.cpp)
2 changes: 2 additions & 0 deletions src/tests/sources.cmake
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
include(${CMAKE_CURRENT_LIST_DIR}/RREFMatrix/sources.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/MatrixAdd/sources.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/InverseMatrix/sources.cmake)