API Reference
Core Classes
DeviceMemory
RAII wrapper for GPU memory allocation.
#include "common.h"
// Allocate 1MB
DeviceMemory mem(1024 * 1024);
// Copy from host
std::vector<float> host_data(256);
mem.copy_from_host(host_data.data(), 256 * sizeof(float));
// Copy to host
mem.copy_to_host(host_data.data(), 256 * sizeof(float));
// Zero memory
mem.zero();
// Get raw pointer
float* ptr = mem.get();
Tensor
N-dimensional tensor with GPU storage.
#include "tensor.h"
// Create tensor
Tensor t({batch, height, width, channels});
// Create with data (separate tensor to avoid re-declaring t from above)
std::vector<float> data(100);
Tensor t_from_data({10, 10}, data.data());
// Operations
t.fill(1.0f);
t.zero();
Tensor t2 = t.clone();
t.reshape({100});
// Get data
auto host_data = t.to_host();
MemoryPool
Efficient GPU memory allocation with caching.
#include "memory_pool.h"
// Allocate from pool
void* ptr = MemoryPool::instance().allocate(1024);
// Return to pool
MemoryPool::instance().deallocate(ptr);
// Get statistics
auto stats = MemoryPool::instance().get_stats();
printf("Hit rate: %.1f%%\n",
100.0 * stats.cache_hits / (stats.cache_hits + stats.cache_misses));
// Clear cache
MemoryPool::instance().clear_cache();
StreamManager
CUDA stream management for concurrent execution.
#include "stream_manager.h"
// Initialize with 4 streams
StreamManager::instance().init(4);
// Get stream (round-robin)
cudaStream_t stream = StreamManager::instance().get_stream();
// Get specific stream
cudaStream_t stream0 = StreamManager::instance().get_stream(0);
// Synchronize all
StreamManager::instance().sync_all();
InferenceEngine
Neural network inference engine.
#include "inference_engine.h"
InferenceEngine engine;
engine.init(0); // GPU device 0
// Load weights
engine.load_weights("model.bin");
// Forward pass
engine.forward(input_ptr, output_ptr, batch_size);
// With timing
std::vector<float> layer_times;
engine.forward_with_timing(input_ptr, output_ptr, batch_size, layer_times);
engine.cleanup();
GEMM Functions
Kernel Launch Functions
#include "kernels.cuh"
// Naive MatMul
launch_naive_matmul(A, B, C, M, N, K, stream);
// Tiled GEMM
launch_tiled_gemm(A, B, C, M, N, K, stream);
// Coalesced GEMM
launch_coalesced_gemm(A, B, C, M, N, K, stream);
// Double Buffer GEMM
launch_double_buffer_gemm(A, B, C, M, N, K, stream);
// Optimized GEMM (Register Blocked)
launch_optimized_gemm(A, B, C, M, N, K, stream);
// Fused GEMM + Bias + ReLU
launch_fused_gemm(A, B, C, bias, M, N, K, add_bias, apply_relu, stream);
// cuBLAS
launch_cublas_gemm(handle, A, B, C, M, N, K, stream);
Batched GEMM
#include "batch_gemm.h"
// Create batch descriptor
BatchGemmDesc desc(M, N, K, batch_size);
for (int i = 0; i < batch_size; i++) {
desc.add_matrices(A_ptrs[i], B_ptrs[i], C_ptrs[i]);
}
// Launch batched GEMM
launch_batched_gemm_streams(desc, GemmKernelType::REGISTER_BLOCKED);
// Strided batched GEMM
launch_strided_batched_gemm(A, B, C, M, N, K, batch_size);
Half Precision GEMM
#include "half_gemm.cuh"
// FP16 GEMM with FP32 accumulation
launch_half_gemm(A_half, B_half, C, M, N, K, stream);
Quantization
#include "quantization.h"
// Compute quantization parameters
auto params = compute_quant_params(data, n);
// Quantize
std::vector<int8_t> quantized(n);
quantize_tensor(data, quantized.data(), n, params);
// Dequantize
std::vector<float> dequantized(n);
dequantize_tensor(quantized.data(), dequantized.data(), n, params);
// Per-channel quantization
auto ch_params = compute_per_channel_params(data, rows, cols);
quantize_per_channel(data, quantized.data(), rows, cols, ch_params);
// Calibration
QuantizationCalibrator calibrator;
calibrator.observe(batch1, n1);
calibrator.observe(batch2, n2);
auto final_params = calibrator.get_params();
Configuration
#include "config.h"
// Load from file
Config::instance().load_from_file("config.ini");
// Get values
int device = Config::instance().get_int("CUDA_DEVICE", 0);
bool enabled = Config::instance().get_bool("ENABLE_TENSOR_CORES", true);
std::string preset = Config::instance().get("GEMM_PRESET", "medium");
// Set values
Config::instance().set("LOG_LEVEL", "DEBUG");
// GEMM presets (distinct name: `preset` is already a std::string above)
GemmPreset gemm_preset = get_gemm_preset("large");
Logging
#include "logger.h"
// Set log level
Logger::instance().set_level(LogLevel::DEBUG);
// Log to file
Logger::instance().set_file("app.log");
// Log messages
LOG_TRACE("Trace message");
LOG_DEBUG("Debug: value = %d", value);
LOG_INFO("Info message");
LOG_WARN("Warning: %s", msg);
LOG_ERROR("Error occurred");
LOG_FATAL("Fatal error");
Performance Measurement
#include "kernels.cuh"
// GPU Timer
GpuTimer timer;
timer.start();
// ... kernel execution ...
timer.stop();
float ms = timer.elapsed_ms();
// Benchmark kernel
PerfStats stats = benchmark_kernel(
GemmKernelType::REGISTER_BLOCKED,
A, B, C, M, N, K,
5, // warmup iterations
20, // benchmark iterations
cublas_handle
);
printf("GFLOPS: %.2f\n", stats.gflops);
Profiler
#include "profiler.h"
Profiler profiler;
// Profile a kernel
auto result = profiler.profile("MyKernel", M, N, K, warmup, iters, [&]() {
launch_optimized_gemm(A, B, C, M, N, K);
});
printf("Time: %.3f ms, GFLOPS: %.2f\n", result.time_ms, result.gflops);
// Roofline analysis
RooflineAnalyzer roofline(peak_gflops, peak_bandwidth);
roofline.add_point(result);
roofline.analyze();
Auto-Tuner
#include "autotuner.h"
AutoTuner tuner;
// Find best kernel for given dimensions
GemmKernelType best = tuner.find_best_kernel(M, N, K, cublas_handle);
// Get recommended config
GemmConfig config = tuner.get_recommended_config(M, N, K);