# Run unit tests
ctest --preset default
# Run performance benchmarks
./build-release/benchmark
# Run MNIST demo (optional)
./build-release/mnist_demo
Build the Project
Using CMake Presets (Recommended)
Preset
Purpose
Configuration
default
Development & debugging
Debug mode, enables tests
release
Performance testing
Release mode, O3 optimization
ci
Continuous integration
Strict warnings, test coverage
1
2
3
4
5
6
# List available presets
cmake --list-presets

# Use specific preset
cmake --preset <preset-name>
cmake --build --preset <preset-name>
Manual Build
1
2
3
4
5
6
7
8
9
10
mkdir build && cd build
# Configure
cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON
# Compile (using all available cores)
make -j$(nproc)

# Run tests
ctest --output-on-failure
Build Options
Option
Description
Default
BUILD_TESTS
Build unit tests
ON
BUILD_BENCHMARKS
Build benchmarks
ON
BUILD_MNIST_DEMO
Build MNIST demo
ON
CMAKE_CUDA_ARCHITECTURES
GPU architecture
Native architecture
1
cmake .. -DBUILD_TESTS=ON -DBUILD_BENCHMARKS=ON
Run Tests
Run All Tests
1
ctest --preset default
Run Specific Tests
1
2
3
4
5
6
7
8
# Run GEMM-related tests
./build/tests --gtest_filter="GemmTest*"

# Run Tensor tests
./build/tests --gtest_filter="TensorTest*"

# Run specific test case
./build/tests --gtest_filter="GemmTest.NaiveMatMulCorrectness"
Test Coverage
1
2
3
4
# Generate coverage report (requires gcov/lcov)
cmake --preset ci
cmake --build --preset ci
ctest --preset ci
#include "common.h"
#include "kernels.cuh"

#include <iostream>
#include <vector>

int main() {
    // Select GPU device 0
    CUDA_CHECK(cudaSetDevice(0));

    // Matrix dimensions: C (MxN) = A (MxK) * B (KxN)
    const int M = 1024, N = 1024, K = 1024;

    // Allocate device buffers (RAII wrapper declared in common.h)
    DeviceMemory d_A(M * K * sizeof(float));
    DeviceMemory d_B(K * N * sizeof(float));
    DeviceMemory d_C(M * N * sizeof(float));

    // Create and randomly initialize host inputs
    std::vector<float> h_A(M * K), h_B(K * N);
    random_init(h_A.data(), h_A.size());
    random_init(h_B.data(), h_B.size());

    // Upload inputs to the GPU
    d_A.copy_from_host(h_A.data(), M * K * sizeof(float));
    d_B.copy_from_host(h_B.data(), K * N * sizeof(float));

    // Launch the optimized GEMM kernel
    launch_optimized_gemm(d_A.get(), d_B.get(), d_C.get(), M, N, K);

    // Wait for the kernel to finish before reading results
    CUDA_CHECK(cudaDeviceSynchronize());

    // Download the result matrix back to the host
    std::vector<float> h_C(M * N);
    d_C.copy_to_host(h_C.data(), M * N * sizeof(float));

    std::cout << "✓ GEMM completed! C[0] = " << h_C[0] << std::endl;
    return 0;
}
Compile and Run
1
2
3
4
5
6
7
# Add the file to CMakeLists.txt as an executable target
# Or compile manually:
nvcc -o first_gemm first_gemm.cpp \
  -I./include -L./build -lmini_inference \
  -lcudart -lcublas -std=c++17
./first_gemm
Verify Correctness
1
2
3
4
5
6
7
8
9
#include "common.h"

// Add verification code: compute a CPU reference result and compare.
std::vector<float> h_C_ref(M * N);
cpu_matmul(h_A.data(), h_B.data(), h_C_ref.data(), M, N, K);

float max_error = compare_matrices(h_C.data(), h_C_ref.data(), M * N);
std::cout << "Max error: " << max_error << std::endl; // Should be < 1e-4
MNIST Demo
The MNIST demo shows how to use the inference engine for handwritten-digit recognition.
Prepare Weights File
1
2
3
# Use the Python script to export weights
cd scripts
python export_mnist_weights.py --output ../weights/mnist_model.bin
# Run single test for detailed error
./build/tests --gtest_filter="GemmTest.NaiveMatMulCorrectness" --gtest_also_run_disabled_tests

# Use CUDA memory checker
cuda-memcheck ./build/tests --gtest_filter="GemmTest*"
Next Steps
Congratulations! You’ve completed the quick start. Next you can: