API Reference

Complete API reference for the Tiny-LLM inference engine.


Table of Contents


Data Types

ModelConfig

Model configuration structure defining all hyperparameters.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#include <tiny_llm/inference_engine.h>

/// Hyperparameters describing the model architecture; passed to
/// InferenceEngine::load(). The default values match the 7B row of the
/// "Common Configurations" table.
struct ModelConfig {
    int vocab_size = 32000;           // Vocabulary size (number of distinct token IDs)
    int hidden_dim = 4096;            // Hidden dimension
    int num_layers = 32;              // Number of Transformer layers
    int num_heads = 32;               // Number of attention heads
    int num_kv_heads = 32;            // Number of KV heads (grouped-query attention support); equals num_heads by default
    int head_dim = 128;               // Dimension per head (the defaults satisfy hidden_dim / num_heads)
    int intermediate_dim = 11008;     // FFN intermediate dimension
    int max_seq_len = 2048;           // Maximum sequence length
    float rope_theta = 10000.0f;      // RoPE base frequency
    float rms_norm_eps = 1e-5f;       // RMSNorm epsilon
    int eos_token_id = 2;             // End-of-sequence token ID
    int bos_token_id = 1;             // Beginning-of-sequence token ID
};

Common Configurations:

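| Model Size | hidden_dim | num_layers | num_heads | intermediate_dim |
|------------|------------|------------|-----------|------------------|
| 7B         | 4096       | 32         | 32        | 11008            |
| 13B        | 5120       | 40         | 40        | 13824            |
| 70B        | 8192       | 80         | 64        | 28672            |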

GenerationConfig

Text generation configuration controlling sampling behavior.

class="highlight">
1
2
3
4
5
6
7
8
/// Text generation settings passed to InferenceEngine::generate().
/// Value ranges for the sampling fields are listed in the
/// "Sampling Parameters" table.
struct GenerationConfig {
    int max_new_tokens = 256;         // Maximum tokens to generate
    float temperature = 1.0f;         // Sampling temperature (lower = more deterministic)
    int top_k = 50;                   // Top-k sampling cutoff (consider only the top k tokens)
    float top_p = 0.9f;               // Top-p (nucleus) sampling threshold on cumulative probability
    bool do_sample = false;           // Enable sampling (false = greedy)
    float repetition_penalty = 1.0f;  // Penalty for repeated tokens (>1.0 penalizes repeats)
};

Sampling Parameters:

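| Parameter          | Range          | Effect                                             |
|--------------------|----------------|----------------------------------------------------|
| temperature        | 0.0 - 2.0      | Lower = more deterministic                         |
| top_k              | 1 - vocab_size | Consider only the top k tokens                     |
| top_p              | 0.0 - 1.0      | Consider tokens with cumulative probability ≤ p    |
| repetition_penalty | 1.0 - 2.0      | >1.0 penalizes repeated tokens                     |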

QuantizedWeight

INT8 quantized weight with FP16 scales.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/// INT8 quantized weight matrix with FP16 scales: one scale per group of
/// `group_size` rows in each column (see the shape comments below).
/// `data`, `scales`, `rows` and `cols` have no default initializer and must be
/// set by whoever creates the struct.
/// NOTE(review): whether this struct owns `data`/`scales`, and which memory
/// space they point into, is not shown here — confirm before freeing them.
struct QuantizedWeight {
    int8_t* data;                     // INT8 weights [rows, cols]
    half* scales;                     // FP16 scales [ceil(rows/group_size), cols] (matches scaleRows())
    int rows;                         // Input dimension
    int cols;                         // Output dimension
    int group_size = 128;             // Quantization group size
    
    // Helper methods (each comment gives the documented formula for the result)
    int scaleRows() const;            // ceil(rows / group_size)
    int scaleCols() const;            // cols
    size_t weightElements() const;    // rows * cols
    size_t scaleElements() const;     // scaleRows() * cols
    size_t weightBytes() const;       // weightElements() (1 byte per INT8 weight)
    size_t scaleBytes() const;        // scaleElements() * 2 (2 bytes per FP16 scale)
    size_t totalBytes() const;        // weightBytes() + scaleBytes()
    bool isValid() const;             // Validate dimensions
};

GenerationStats

Performance statistics from text generation.

class="highlight">
1
2
3
4
5
6
7
8
/// Performance statistics from text generation, exposed through
/// InferenceEngine::getStats(). Every field defaults to zero.
struct GenerationStats {
    float prefill_time_ms = 0.0f;     // Prefill phase time (ms)
    float decode_time_ms = 0.0f;      // Decode phase time (ms)
    int prompt_tokens = 0;            // Number of prompt tokens
    int tokens_generated = 0;         // Number of generated tokens
    float tokens_per_second = 0.0f;   // Generation throughput (tokens per second)
    size_t peak_memory_bytes = 0;     // Peak GPU memory usage, in bytes
};

Core Classes

InferenceEngine

Main inference engine class. Separate engine instances can run generation concurrently from different threads.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include <tiny_llm/inference_engine.h>

/// Main inference engine: loads a model and generates token sequences.
/// Instances are created through the static load() factory.
class InferenceEngine {
public:
    // Load model from custom binary format.
    // Returns a Result that holds the engine on success or an error message
    // on failure (check isErr() before calling value()).
    static Result<std::unique_ptr<InferenceEngine>> load(
        const std::string& model_path,
        const ModelConfig& config
    );
    
    // Generate completion for prompt.
    // prompt_tokens: token IDs of the prompt; gen_config: generation settings.
    // NOTE(review): whether the returned vector also contains the prompt
    // tokens is not shown here — confirm against the implementation.
    std::vector<int> generate(
        const std::vector<int>& prompt_tokens,
        const GenerationConfig& gen_config
    );
    
    // Get generation statistics (returned by const reference) / reset them
    const GenerationStats& getStats() const;
    void resetStats();
    
    // Standalone sampling functions (stateless).
    // Each takes FP16 logits plus the vocabulary size and returns an int
    // (presumably the chosen token ID — confirm). `seed` defaults to 0.
    // NOTE(review): whether `logits` must be a host or device pointer is not
    // shown here — confirm.
    static int sampleGreedy(
        const half* logits, int vocab_size);
    
    static int sampleTemperature(
        const half* logits, int vocab_size,
        float temperature, unsigned seed = 0);
    
    static int sampleTopK(
        const half* logits, int vocab_size,
        int k, float temperature, unsigned seed = 0);
    
    static int sampleTopP(
        const half* logits, int vocab_size,
        float p, float temperature, unsigned seed = 0);
};

Usage Example:

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
// Configure model
ModelConfig config;
config.vocab_size = 32000;
config.hidden_dim = 4096;
config.num_layers = 32;

// Load model
auto result = InferenceEngine::load("model.bin", config);
if (result.isErr()) {
    std::cerr << "Error: " << result.error() << std::endl;
    return 1;
}
auto engine = std::move(result.value());

// Configure generation
GenerationConfig gen_config;
gen_config.max_new_tokens = 256;
gen_config.temperature = 0.7f;
gen_config.top_p = 0.9f;
gen_config.do_sample = true;

// Generate
std::vector<int> prompt = {1, 15043, 29892};  // "Hello,"
auto output = engine->generate(prompt, gen_config);

// Check performance
const auto& stats = engine->getStats();
std::cout << "Speed: " << stats.tokens_per_second << " tok/s" << std::endl;

KVCacheManager

Efficient key-value cache management for autoregressive generation.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#include <tiny_llm/kv_cache.h>

/// Sizing parameters for KVCacheManager.
/// NOTE(review): `num_heads` is documented as the number of KV heads, so it
/// presumably corresponds to ModelConfig::num_kv_heads rather than
/// ModelConfig::num_heads — confirm when the two differ.
struct KVCacheConfig {
    int num_layers = 32;              // Number of transformer layers
    int num_heads = 32;               // Number of KV heads
    int head_dim = 128;               // Dimension per head
    int max_seq_len = 2048;           // Maximum sequence length
    int max_batch_size = 1;           // Maximum batch size
};

/// Key-value cache manager for autoregressive generation. Sequences are
/// allocated, written per layer with appendKV(), advanced once per step with
/// advanceSeqLen(), and finally released.
class KVCacheManager {
public:
    explicit KVCacheManager(const KVCacheConfig& config);
    ~KVCacheManager();
    
    // Sequence management.
    // allocateSequence() returns the new sequence ID wrapped in a Result;
    // check isErr() before using the value.
    Result<int> allocateSequence(int max_len);
    void releaseSequence(int seq_id);
    bool hasSequence(int seq_id) const;
    
    // Cache access for attention computation.
    // NOTE(review): the pair is presumably (key cache, value cache) for the
    // given layer — confirm the order.
    std::pair<half*, half*> getCache(int seq_id, int layer_idx);
    int getSeqLen(int seq_id) const;
    
    // KV append (write-only, stateless): writes new K/V entries for one layer.
    // The sequence length is advanced separately via advanceSeqLen().
    void appendKV(int seq_id, int layer_idx,
                  const half* new_k, const half* new_v,
                  int num_tokens, cudaStream_t stream = 0);
    
    // Advance sequence length after all layers complete
    void advanceSeqLen(int seq_id, int num_tokens);
    
    // Memory statistics (units presumably bytes — confirm)
    size_t getUsedMemory() const;
    size_t getTotalMemory() const;
    size_t getFreeMemory() const;
    int getActiveSequenceCount() const;
};

Usage Pattern:

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
KVCacheConfig cache_config;
cache_config.num_layers = 32;
cache_config.num_heads = 32;
cache_config.head_dim = 128;
cache_config.max_seq_len = 2048;

KVCacheManager kv_cache(cache_config);

// Allocate sequence
auto seq_result = kv_cache.allocateSequence(1024);
if (seq_result.isErr()) {
    // Handle allocation failure
}
int seq_id = seq_result.value();

// Forward pass through layers
for (int i = 0; i < num_layers; i++) {
    layers[i]->forward(hidden_states, kv_cache, seq_id, position, stream);
}

// Advance sequence length after all layers
kv_cache.advanceSeqLen(seq_id, 1);

// Release when done
kv_cache.releaseSequence(seq_id);

TransformerLayer

Single transformer layer with attention and FFN.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include <tiny_llm/transformer.h>

/// Single transformer layer with attention and FFN.
/// Both forward variants take the hidden states as a non-const pointer
/// (presumably updated in place — confirm) and work against the shared
/// KVCacheManager for the given sequence.
class TransformerLayer {
public:
    // layer_idx: index of this layer; weights/config: the layer's weights and
    // the model hyperparameters.
    TransformerLayer(int layer_idx,
                     const TransformerWeights& weights,
                     const ModelConfig& config);
    
    // Single token forward (decode phase)
    // position: position of the token being processed within the sequence.
    void forward(half* hidden_states,
                 KVCacheManager& kv_cache,
                 int seq_id,
                 int position,
                 cudaStream_t stream = 0);
    
    // Multi-token forward (prefill phase)
    // seq_len: number of tokens processed in this call.
    void forwardPrefill(half* hidden_states,
                        KVCacheManager& kv_cache,
                        int seq_id,
                        int seq_len,
                        cudaStream_t stream = 0);
    
    // Index of this layer, as given to the constructor (presumably — confirm)
    int getLayerIdx() const;
};

CUDA Kernels

W8A16 Matrix Multiplication

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#include <w8a16_matmul.cuh>

namespace tiny_llm::kernels {

// Main W8A16 matmul kernel: FP16 activations [M, K] times INT8 weights [K, N]
// with FP16 group scales, producing FP16 output [M, N].
void w8a16_matmul(
    const half* input,           // [M, K] FP16
    const int8_t* weight,        // [K, N] INT8 (column-major)
    const half* scales,          // [K/group_size, N] FP16
    half* output,                // [M, N] FP16
    int M,                       // Batch size (rows of input/output)
    int N,                       // Output dimension
    int K,                       // Input dimension
    int group_size,              // Quantization group size
    cudaStream_t stream = 0
);

// Weight dequantization (for testing/reference).
// Shapes presumably match w8a16_matmul: weight_int8/weight_fp16 are [K, N]
// and scales is [K/group_size, N] — confirm.
void dequantize_weights(
    const int8_t* weight_int8,
    const half* scales,
    half* weight_fp16,
    int K, int N,
    int group_size,
    cudaStream_t stream = 0
);

// Reference implementation for validation.
// Same arguments as w8a16_matmul, but without a stream parameter.
void w8a16_matmul_reference(
    const half* input,
    const int8_t* weight,
    const half* scales,
    half* output,
    int M, int N, int K,
    int group_size
);

}  // namespace tiny_llm::kernels

Attention Kernels

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include <attention.cuh>

namespace tiny_llm::kernels {

// Decode: single query token against cached KV
void attention_decode(
    const half* query,           // [batch, num_heads, 1, head_dim]
    const half* k_cache,         // [batch, num_heads, seq_len, head_dim]
    const half* v_cache,         // [batch, num_heads, seq_len, head_dim]
    half* output,                // [batch, num_heads, 1, head_dim]
    float scale,                 // 1/sqrt(head_dim)
    int batch_size,
    int num_heads,
    int seq_len,                 // Number of cached positions attended to
    int head_dim,
    cudaStream_t stream = 0
);

// Prefill: all query tokens with causal mask
void attention_prefill(
    const half* query,           // [batch, num_heads, seq_len, head_dim]
    const half* key,             // [batch, num_heads, seq_len, head_dim]
    const half* value,           // [batch, num_heads, seq_len, head_dim]
    half* output,                // [batch, num_heads, seq_len, head_dim]
    float scale,                 // presumably 1/sqrt(head_dim), as in attention_decode — confirm
    int batch_size,
    int num_heads,
    int seq_len,
    int head_dim,
    cudaStream_t stream = 0
);

// Standalone softmax
void softmax(
    const half* input,           // [batch, seq_len]
    half* output,                // presumably [batch, seq_len], same as input — confirm
    int batch_size,
    int seq_len,
    cudaStream_t stream = 0
);

}  // namespace tiny_llm::kernels

RMSNorm

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#include <rmsnorm.cuh>

namespace tiny_llm::kernels {

// RMSNorm: output = x / sqrt(mean(x^2) + eps) * weight
// The default eps (1e-5f) equals the default ModelConfig::rms_norm_eps.
void rmsnorm(
    const half* input,           // [batch, hidden_dim]
    const half* weight,          // [hidden_dim]
    half* output,                // [batch, hidden_dim]
    int batch_size,
    int hidden_dim,
    float eps = 1e-5f,
    cudaStream_t stream = 0
);

// In-place RMSNorm: same formula, with the result written back into x
void rmsnorm_inplace(
    half* x,                     // [batch, hidden_dim] (in-place)
    const half* weight,          // presumably [hidden_dim], as in rmsnorm — confirm
    int batch_size,
    int hidden_dim,
    float eps = 1e-5f,
    cudaStream_t stream = 0
);

}  // namespace tiny_llm::kernels

Elementwise Operations

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#include <elementwise.cuh>

namespace tiny_llm::kernels {

// In-place addition: data[i] += add[i]
void add_inplace(
    half* data,                  // Updated in place
    const half* add,             // Values added to data
    int num_elements,
    cudaStream_t stream = 0
);

// SwiGLU fused: gate[i] = silu(gate[i]) * up[i]
void silu_mul_inplace(
    half* gate,                  // Updated in place with the fused result
    const half* up,
    int num_elements,
    cudaStream_t stream = 0
);

// Embedding lookup: one embedding row per input token
void gather_embeddings(
    const int* tokens,           // [num_tokens]
    const half* embedding,       // [vocab_size, hidden_dim]
    half* output,                // [num_tokens, hidden_dim]
    int num_tokens,
    int hidden_dim,
    int vocab_size,
    cudaStream_t stream = 0
);

}  // namespace tiny_llm::kernels

Error Handling

Result

Rust-inspired Result type for error handling without exceptions.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#include <tiny_llm/result.h>

/// Rust-inspired result type: holds either a value of type T or an error
/// message string, so failures can be reported without exceptions.
template<typename T>
class Result {
public:
    // Constructors: build a success (ok) or failure (err) result
    static Result<T> ok(T value);
    static Result<T> err(std::string message);
    
    // State checks
    bool isOk() const;
    bool isErr() const;
    
    // Value access (throws if error); valueOr() takes a fallback value
    // instead (see the usage example, where an error yields the default).
    T& value();
    const T& value() const;
    T valueOr(T default_value) const;
    
    // Error access (throws if ok)
    const std::string& error() const;
    
    // Monadic operations.
    // map(): f's return value is wrapped in a Result.
    // flatMap(): f's return value is returned as-is (so f presumably returns
    // a Result itself — confirm).
    // NOTE(review): behavior on an error result (presumably the error is
    // propagated without calling f) is not shown here — confirm.
    template<typename F>
    auto map(F&& f) -> Result<decltype(f(value()))>;
    
    template<typename F>
    auto flatMap(F&& f) -> decltype(f(value()));
};

Usage:

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Parses `s` as an integer and reports failure through Result instead of an
// exception.
//
// The whole string must be consumed by the conversion: input with trailing
// characters (e.g. "42abc") is rejected, whereas a bare std::stoi(s) would
// silently return 42. Leading whitespace is still skipped by std::stoi.
//
// Returns Result<int>::ok(value) on success, or
// Result<int>::err("Invalid integer: " + s) when `s` is not a valid integer or
// does not fit in an int.
Result<int> parseInt(const std::string& s) {
    try {
        std::size_t consumed = 0;
        const int parsed = std::stoi(s, &consumed);
        if (consumed != s.size()) {
            // std::stoi stopped early: trailing non-numeric characters.
            return Result<int>::err("Invalid integer: " + s);
        }
        return Result<int>::ok(parsed);
    } catch (const std::exception&) {
        // std::invalid_argument (no digits) or std::out_of_range (overflow).
        return Result<int>::err("Invalid integer: " + s);
    }
}

auto result = parseInt("42");
if (result.isOk()) {
    std::cout << "Value: " << result.value() << std::endl;
} else {
    std::cerr << "Error: " << result.error() << std::endl;
}

// Or with default
int val = parseInt("abc").valueOr(0);  // val = 0

CudaException

CUDA error exception with context.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#include <tiny_llm/cuda_utils.h>

/// CUDA error exception with context: carries the CUDA error code together
/// with the source file and line passed to the constructor (CUDA_CHECK
/// supplies __FILE__ and __LINE__).
class CudaException : public std::exception {
public:
    CudaException(cudaError_t err, const char* file, int line);
    
    // Human-readable description (std::exception interface)
    const char* what() const noexcept override;
    // Accessors for the error code and the recorded source location
    cudaError_t error() const;
    const char* file() const;
    int line() const;
};

// Error checking macro: evaluates `call` exactly once and throws
// CudaException (tagged with the call site's __FILE__ and __LINE__) when the
// result is not cudaSuccess.
//  - `(call)` is parenthesized so that any expression can be passed safely.
//  - The temporary uses a macro-specific name so that an argument mentioning a
//    caller variable named `err` is not shadowed (with `cudaError_t err = call;`
//    the statement CUDA_CHECK(err) would initialize the temporary from itself).
//  - The do { ... } while(0) wrapper makes the macro usable as one statement.
#define CUDA_CHECK(call) \
    do { \
        const cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            throw CudaException(cuda_check_err_, __FILE__, __LINE__); \
        } \
    } while(0)

Utilities

DeviceBuffer

RAII wrapper for GPU memory.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include <tiny_llm/cuda_utils.h>

/// RAII wrapper for GPU memory holding elements of type T.
/// Move-only: copying is deleted, moving is noexcept.
template<typename T>
class DeviceBuffer {
public:
    DeviceBuffer();                        // Empty buffer
    explicit DeviceBuffer(size_t count);   // Allocate count elements
    ~DeviceBuffer();                       // Automatic cleanup
    
    // Non-copyable
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    
    // Movable
    DeviceBuffer(DeviceBuffer&&) noexcept;
    DeviceBuffer& operator=(DeviceBuffer&&) noexcept;
    
    // Data access.
    // NOTE(review): size() is presumably the element count and bytes() the
    // size in bytes (size() * sizeof(T)) — confirm.
    T* data();
    const T* data() const;
    size_t size() const;
    size_t bytes() const;
    
    // Data transfer between host memory and this buffer; `count` is a number
    // of elements of T (presumably — confirm).
    // NOTE(review): whether these calls block or are asynchronous on `stream`
    // is not shown here — confirm before reusing the host buffer.
    void copyFromHost(const T* src, size_t count, cudaStream_t stream = 0);
    void copyToHost(T* dst, size_t count, cudaStream_t stream = 0) const;
};

CudaStream

RAII wrapper for CUDA streams.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
/// RAII wrapper for a CUDA stream.
/// Copy construction is deleted and move construction is noexcept; no
/// assignment operators are listed here.
class CudaStream {
public:
    CudaStream();
    ~CudaStream();
    
    CudaStream(const CudaStream&) = delete;
    CudaStream(CudaStream&&) noexcept;
    
    // Access the underlying handle, explicitly or via implicit conversion, so
    // a CudaStream can be passed wherever a cudaStream_t is expected.
    cudaStream_t get() const;
    operator cudaStream_t() const;
    
    // Presumably blocks until work queued on this stream has finished — confirm
    void synchronize();
};

CudaEvent

CUDA event for timing and synchronization.

class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
/// CUDA event for timing and synchronization.
/// Typical timing use: record a start and an end event on the same stream,
/// synchronize on the end event, then call elapsedMs(start, end).
class CudaEvent {
public:
    CudaEvent();
    ~CudaEvent();
    
    // Record this event on `stream` (default stream 0)
    void record(cudaStream_t stream = 0);
    // Presumably blocks until the recorded event has completed — confirm
    void synchronize();
    
    // Elapsed time between two recorded events, in milliseconds
    static float elapsedMs(const CudaEvent& start, const CudaEvent& end);
    
    // Access the underlying event handle
    cudaEvent_t get() const;
};

Timing Example:

class="highlight">
1
2
3
4
5
6
7
8
9
CudaEvent start, end;

start.record(stream);
kernel<<<grid, block, 0, stream>>>(...);
end.record(stream);

end.synchronize();
float ms = CudaEvent::elapsedMs(start, end);
std::cout << "Kernel time: " << ms << " ms" << std::endl;

StreamPool

Pool of CUDA streams for parallel execution.

class="highlight">
1
2
3
4
5
6
7
8
9
10
/// Pool of CUDA streams for parallel execution.
class StreamPool {
public:
    // Creates a pool with `num_streams` streams (4 by default)
    explicit StreamPool(int num_streams = 4);
    
    cudaStream_t getStream();        // Round-robin
    cudaStream_t getStream(int idx);  // By index
    
    // Presumably synchronizes every stream in the pool — confirm
    void synchronizeAll();
    // Number of streams in the pool
    int numStreams() const;
};


Back to top