
Table of Contents


Core Classes

DeviceMemory

RAII wrapper for GPU memory with automatic lifecycle management.

Header: common.h

Constructors

DeviceMemory();                          // Default constructor, empty
explicit DeviceMemory(size_t bytes);     // Allocate specified bytes

Member Functions

Function Description
void allocate(size_t bytes) Allocate memory, replacing any previous allocation
void free() Free memory
void copy_from_host(const void* data, size_t bytes) Copy from host
void copy_to_host(void* data, size_t bytes) const Copy to host
void zero() Zero memory
T* get() Get device pointer
size_t size() const Get byte count
bool empty() const Check if empty

Example

#include "common.h"

// Create and allocate memory
DeviceMemory mem(1024 * 1024);  // 1MB

// Copy from host
std::vector<float> host_data(256, 1.0f);
mem.copy_from_host(host_data.data(), 256 * sizeof(float));

// Copy to host
std::vector<float> result(256);
mem.copy_to_host(result.data(), 256 * sizeof(float));

// Zero memory
mem.zero();

// Get raw pointer
float* ptr = mem.get<float>();

Tensor

N-dimensional tensor class with GPU storage and common operations.

Header: tensor.h

Constructors

Tensor();                                                    // Default, empty
explicit Tensor(const std::vector<int>& shape);             // Specify shape
Tensor(const std::vector<int>& shape, const float* data);   // With data

Member Functions

Function Description
Tensor clone() const Deep copy
void reshape(const std::vector<int>& new_shape) Reshape
void fill(float value) Fill with value
void zero() Zero
void copy_from_host(const float* data) Copy from host
void copy_to_host(float* data) const Copy to host
std::vector<float> to_host() const Convert to host vector
MatrixDesc as_matrix() const Get matrix view
const std::vector<int>& shape() const Get shape
size_t size() const Element count
int ndim() const Number of dimensions

MemoryPool

GPU memory pool that reduces cudaMalloc overhead through caching.

Header: memory_pool.h

// Singleton access
static MemoryPool& instance();

// Allocate/deallocate
void* allocate(size_t size);
void deallocate(void* ptr);

// Cache management
void clear_cache();
void release_all();

// Statistics
struct Stats {
    size_t total_allocated;
    size_t cached_size;
    size_t cache_hits;
    size_t cache_misses;
};
Stats get_stats() const;

InferenceEngine

Neural network inference engine supporting multi-layer forward propagation.

Header: inference_engine.h

Function Description
void init(int device_id = 0) Initialize engine
bool load_weights(const std::string& path) Load weights
void forward(const float* input, float* output, int batch) Forward pass
void forward_with_timing(...) Forward with per-layer timing
size_t num_layers() const Number of layers
int input_dim() const Input dimension
int output_dim() const Output dimension

GEMM Kernel API

Basic Kernels

Header: kernels.cuh

// Basic implementations
void launch_naive_matmul(const float* A, const float* B, float* C,
                         int M, int N, int K, cudaStream_t stream = 0);

void launch_tiled_gemm(const float* A, const float* B, float* C,
                       int M, int N, int K, cudaStream_t stream = 0);

void launch_coalesced_gemm(const float* A, const float* B, float* C,
                           int M, int N, int K, cudaStream_t stream = 0);

void launch_double_buffer_gemm(const float* A, const float* B, float* C,
                               int M, int N, int K, cudaStream_t stream = 0);

// Optimized (recommended)
void launch_optimized_gemm(const float* A, const float* B, float* C,
                           int M, int N, int K, cudaStream_t stream = 0);

// Fused GEMM + Bias + ReLU
void launch_fused_gemm(const float* A, const float* B, float* C,
                       const float* bias, int M, int N, int K,
                       bool add_bias, bool apply_relu, cudaStream_t stream = 0);

// cuBLAS wrapper
void launch_cublas_gemm(cublasHandle_t handle, 
                        const float* A, const float* B, float* C,
                        int M, int N, int K, cudaStream_t stream = 0);

Half-Precision GEMM

Header: half_gemm.cuh

// FP16 GEMM (FP16 input/output, FP32 accumulation)
void launch_half_gemm(const half* A, const half* B, half* C,
                      int M, int N, int K, cudaStream_t stream = 0);

// Mixed precision (FP16 input, FP32 output)
void launch_mixed_precision_gemm(const half* A, const half* B, float* C,
                                  int M, int N, int K, cudaStream_t stream = 0);

// Type conversion
void convert_float_to_half(const float* src, half* dst, size_t n, cudaStream_t stream = 0);
void convert_half_to_float(const half* src, float* dst, size_t n, cudaStream_t stream = 0);

Batched GEMM Operations

Header: batch_gemm.h

BatchGemmDesc

Descriptor for batched GEMM operations, holding pointers to multiple matrices.

struct BatchGemmDesc {
    std::vector<const float*> A_ptrs;  // Array of A matrix pointers
    std::vector<const float*> B_ptrs;  // Array of B matrix pointers
    std::vector<float*> C_ptrs;        // Array of C matrix pointers
    int M, N, K;                       // Matrix dimensions (same for all batches)
    int batch_size;                    // Number of matrix pairs

    // Constructor
    BatchGemmDesc(int m, int n, int k, int batch);

    // Add a matrix triplet to the batch
    void add_matrices(const float* A, const float* B, float* C);
};

Example

// Create batch descriptor for 4 GEMMs with M=128, N=64, K=32
// (each computes C(128x64) = A(128x32) * B(32x64))
BatchGemmDesc desc(128, 64, 32, 4);

// Add matrix pointers
for (int i = 0; i < 4; i++) {
    desc.add_matrices(d_A[i], d_B[i], d_C[i]);
}

launch_batched_gemm_streams

Execute multiple GEMM operations in parallel using CUDA streams.

void launch_batched_gemm_streams(
    const BatchGemmDesc& desc,
    GemmKernelType kernel_type = GemmKernelType::REGISTER_BLOCKED,
    cublasHandle_t cublas_handle = nullptr
);

Parameters

Parameter Description
desc Batch descriptor with matrix pointers
kernel_type GEMM kernel to use for each operation
cublas_handle cuBLAS handle (required if kernel_type is CUBLAS)

launch_strided_batched_gemm

Execute batched GEMM on contiguous matrix arrays with fixed strides.

void launch_strided_batched_gemm(
    const float* A, const float* B, float* C,
    int M, int N, int K, int batch_size,
    GemmKernelType kernel_type = GemmKernelType::REGISTER_BLOCKED,
    cublasHandle_t cublas_handle = nullptr
);

Matrix strides are computed as:

  • A: M * K elements between batches
  • B: K * N elements between batches
  • C: M * N elements between batches

launch_cublas_batched_gemm

Optimized cuBLAS batched GEMM for many small matrices.

void launch_cublas_batched_gemm(
    cublasHandle_t handle,
    const float** A_array, const float** B_array, float** C_array,
    int M, int N, int K, int batch_size
);

BatchPerfStats

Performance statistics for batched GEMM operations.

struct BatchPerfStats {
    float total_time_ms;         // Total execution time
    float avg_time_per_gemm_ms;  // Average time per GEMM
    float total_gflops;          // Aggregate throughput
    int batch_size;              // Number of operations

    void compute(int M, int N, int K, int batch);
};

benchmark_batched_gemm

Benchmark batched GEMM performance.

BatchPerfStats benchmark_batched_gemm(
    const BatchGemmDesc& desc,
    GemmKernelType kernel_type,
    int warmup_iters = 3,
    int bench_iters = 10,
    cublasHandle_t cublas_handle = nullptr
);

Utility Classes

Profiler

Performance profiler with detailed statistics and Roofline analysis.

Header: profiler.h

Profiler profiler;

// Profile kernel
auto result = profiler.profile("Optimized GEMM", M, N, K, 
    5,   // warmup iterations
    20,  // benchmark iterations
    [&]() { launch_optimized_gemm(A, B, C, M, N, K); });

printf("Time: %.3f ms\n", result.avg_time_ms);
printf("GFLOPS: %.2f\n", result.gflops);

// Print results
Profiler::print_result(result);

// Compare multiple kernels
std::vector<ProfileResult> results;
results.push_back(result1);
results.push_back(result2);
Profiler::print_comparison(results, "cuBLAS");

ProfileResult:

struct ProfileResult {
    std::string name;
    float avg_time_ms;
    float gflops;
    float memory_bandwidth_gb;
    float arithmetic_intensity;
    int iterations;
};

RooflineAnalyzer

Roofline model analysis for identifying performance bottlenecks.

Header: profiler.h

// Create analyzer with peak performance metrics
RooflineAnalyzer analyzer(
    10000.0f,  // Peak GFLOPS (device-specific)
    900.0f     // Peak bandwidth GB/s
);

// Add profiling results
analyzer.add_point(result1);
analyzer.add_point(result2);

// Print roofline analysis
analyzer.analyze();

Output shows:

  • Arithmetic intensity (FLOPs/Byte)
  • Achieved vs. roofline GFLOPS
  • Efficiency percentage
  • Memory-bound vs. compute-bound classification

AutoTuner

Automatic tuner that selects optimal kernel for given matrix sizes.

Header: autotuner.h

AutoTuner tuner;

// Tune and cache results
auto result = tuner.tune(M, N, K, cublas_handle);
printf("Best kernel: %s\n", kernel_type_name(result.config.kernel_type));

// Execute with best kernel
tuner.execute_best(A, B, C, M, N, K, stream);

TuningResult:

struct TuningResult {
    TuningConfig config;   // Contains kernel_type
    float time_ms;         // Execution time
    float gflops;          // Performance metric
};

struct TuningConfig {
    GemmKernelType kernel_type;  // Selected kernel type
};

Configuration & Logging

Config

Configuration manager supporting files and environment variables.

Header: config.h

// Singleton access
auto& config = Config::instance();

// Load from file
config.load_from_file("config/default.ini");

// Get values (with defaults)
int device = config.get_int("CUDA_DEVICE", 0);
bool enabled = config.get_bool("ENABLE_TENSOR_CORES", true);
std::string preset = config.get("GEMM_PRESET", "medium");

// Set values
config.set("LOG_LEVEL", "DEBUG");

Logger

Thread-safe logging system.

Header: logger.h

Logger::instance().set_level(LogLevel::DEBUG);
Logger::instance().set_file("app.log");

// Log macros
LOG_DEBUG("Debug: value = %d", value);
LOG_INFO("Info message");
LOG_WARN("Warning: %s", msg);
LOG_ERROR("Error occurred");

Log Levels: TRACE, DEBUG, INFO, WARN, ERROR, FATAL, OFF


Quantization

QuantizationParams

Header: quantization.h

// Compute quantization parameters (symmetric)
QuantizationParams params = compute_quant_params(data, n);

// Quantize
std::vector<int8_t> quantized(n);
quantize_tensor(data, quantized.data(), n, params);

// Dequantize
std::vector<float> dequantized(n);
dequantize_tensor(quantized.data(), dequantized.data(), n, params);

Per-Channel Quantization

auto params = compute_per_channel_params(data, rows, cols);
quantize_per_channel(data, quantized.data(), rows, cols, params);

Error Handling

CUDA_CHECK Macro

// Check CUDA calls
CUDA_CHECK(cudaMalloc(&ptr, size));
CUDA_CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaDeviceSynchronize());

// Throws CudaException on failure

CUBLAS_CHECK Macro

// Check cuBLAS calls
CUBLAS_CHECK(cublasSgemm(handle, ...));

// Throws std::runtime_error on failure

Performance Measurement

Utility Functions

Header: common.h

// Initialize array with random values in range [-1.0, 1.0]
void random_init(float* data, size_t n);

// Initialize array with zeros
void zero_init(float* data, size_t n);

// CPU reference implementation for GEMM
void cpu_matmul(const float* A, const float* B, float* C, int M, int N, int K);

// CPU reference with bias and ReLU fusion
void cpu_matmul_bias_relu(const float* A, const float* B, float* C,
                          const float* bias, int M, int N, int K,
                          bool add_bias, bool apply_relu);

// Compare matrices and return max absolute error
float compare_matrices(const float* a, const float* b, size_t n);

benchmark_kernel

#include "kernels.cuh"

PerfStats stats = benchmark_kernel(
    GemmKernelType::REGISTER_BLOCKED,
    A, B, C, M, N, K,
    5,    // warmup
    20,   // iterations
    cublas_handle,
    stream
);

printf("Time: %.3f ms\n", stats.kernel_time_ms);
printf("GFLOPS: %.2f\n", stats.gflops);


*Last Updated: 2025-04-16 Document Version: v1.1.0*

Back to top

MIT License | A learning project for the CUDA community