Model configuration structure defining all hyperparameters.
Definition (from `tiny_llm/inference_engine.h`):
#include <tiny_llm/inference_engine.h>

// Model configuration structure defining all hyperparameters.
// Defaults correspond to a LLaMA-style 7B architecture (see the
// "Common Configurations" table below for 13B/70B values).
struct ModelConfig {
    int   vocab_size       = 32000;    // Vocabulary size
    int   hidden_dim       = 4096;     // Hidden dimension
    int   num_layers       = 32;       // Number of Transformer layers
    int   num_heads        = 32;       // Number of attention heads
    int   num_kv_heads     = 32;       // Number of KV heads (GQA support)
    int   head_dim         = 128;      // Dimension per head
    int   intermediate_dim = 11008;    // FFN intermediate dimension
    int   max_seq_len      = 2048;     // Maximum sequence length
    float rope_theta       = 10000.0f; // RoPE base frequency
    float rms_norm_eps     = 1e-5f;    // RMSNorm epsilon
    int   eos_token_id     = 2;        // End-of-sequence token ID
    int   bos_token_id     = 1;        // Beginning-of-sequence token ID
};
Common Configurations:

| Model Size | hidden_dim | num_layers | num_heads | intermediate_dim |
|------------|------------|------------|-----------|------------------|
| 7B         | 4096       | 32         | 32        | 11008            |
| 13B        | 5120       | 40         | 40        | 13824            |
| 70B        | 8192       | 80         | 64        | 28672            |
GenerationConfig
Text generation configuration controlling sampling behavior.
Definition:
// Text generation configuration controlling sampling behavior.
// With do_sample == false the sampling knobs (temperature, top_k,
// top_p) are not applied and decoding is greedy.
struct GenerationConfig {
    int   max_new_tokens     = 256;   // Maximum tokens to generate
    float temperature        = 1.0f;  // Sampling temperature
    int   top_k              = 50;    // Top-k sampling cutoff
    float top_p              = 0.9f;  // Top-p (nucleus) sampling threshold
    bool  do_sample          = false; // Enable sampling (false = greedy)
    float repetition_penalty = 1.0f;  // Penalty for repeated tokens
};
// Per-generation performance counters, retrievable via
// InferenceEngine::getStats() and cleared by resetStats().
struct GenerationStats {
    float  prefill_time_ms   = 0.0f; // Prefill phase time (ms)
    float  decode_time_ms    = 0.0f; // Decode phase time (ms)
    int    prompt_tokens     = 0;    // Number of prompt tokens
    int    tokens_generated  = 0;    // Number of generated tokens
    float  tokens_per_second = 0.0f; // Generation throughput
    size_t peak_memory_bytes = 0;    // Peak GPU memory usage
};
Core Classes
InferenceEngine
Main inference engine class. Thread-safe for concurrent generation on different engine instances.
#include <tiny_llm/inference_engine.h>

// Main inference engine class.
// Thread-safe for concurrent generation on *different* engine instances
// (per the surrounding docs); a single instance is not stated to be
// safe for concurrent generate() calls.
class InferenceEngine {
public:
    // Load model from custom binary format.
    // Returns an error Result on failure rather than throwing.
    static Result<std::unique_ptr<InferenceEngine>> load(const std::string& model_path,
                                                         const ModelConfig& config);

    // Generate completion for prompt: takes token IDs in, returns
    // generated token IDs out (tokenization happens elsewhere).
    std::vector<int> generate(const std::vector<int>& prompt_tokens,
                              const GenerationConfig& gen_config);

    // Get generation statistics accumulated by generate().
    const GenerationStats& getStats() const;
    void resetStats();

    // Standalone sampling functions (stateless).
    // `logits` are half-precision, length vocab_size; `seed` makes the
    // stochastic variants reproducible.
    static int sampleGreedy(const half* logits, int vocab_size);
    static int sampleTemperature(const half* logits, int vocab_size,
                                 float temperature, unsigned seed = 0);
    static int sampleTopK(const half* logits, int vocab_size, int k,
                          float temperature, unsigned seed = 0);
    static int sampleTopP(const half* logits, int vocab_size, float p,
                          float temperature, unsigned seed = 0);
};
#include <tiny_llm/kv_cache.h>

// Geometry of the key/value cache; should mirror the corresponding
// fields of ModelConfig (num_heads here is the number of *KV* heads).
struct KVCacheConfig {
    int num_layers     = 32;   // Number of transformer layers
    int num_heads      = 32;   // Number of KV heads
    int head_dim       = 128;  // Dimension per head
    int max_seq_len    = 2048; // Maximum sequence length
    int max_batch_size = 1;    // Maximum batch size
};

// Owns the GPU key/value cache for all active sequences.
class KVCacheManager {
public:
    explicit KVCacheManager(const KVCacheConfig& config);
    ~KVCacheManager();

    // Sequence management: allocate returns a sequence ID (or an error
    // Result if capacity is exhausted); release frees the slot.
    Result<int> allocateSequence(int max_len);
    void releaseSequence(int seq_id);
    bool hasSequence(int seq_id) const;

    // Cache access for attention computation: {K, V} device pointers
    // for one layer of one sequence.
    std::pair<half*, half*> getCache(int seq_id, int layer_idx);
    int getSeqLen(int seq_id) const;

    // KV append (write-only, stateless): copies num_tokens worth of new
    // K/V into the cache on the given CUDA stream. Does NOT advance the
    // sequence length — see advanceSeqLen().
    void appendKV(int seq_id, int layer_idx,
                  const half* new_k, const half* new_v,
                  int num_tokens, cudaStream_t stream = 0);

    // Advance sequence length after all layers complete (call once per
    // step, not once per layer — see the usage example below).
    void advanceSeqLen(int seq_id, int num_tokens);

    // Memory statistics.
    size_t getUsedMemory() const;
    size_t getTotalMemory() const;
    size_t getFreeMemory() const;
    int getActiveSequenceCount() const;
};
// Example: KV-cache lifecycle for a single decode step.
// (`layers`, `hidden_states`, `position`, `stream` are assumed to be
// defined by the surrounding program.)
KVCacheConfig cache_config;
cache_config.num_layers  = 32;
cache_config.num_heads   = 32;
cache_config.head_dim    = 128;
cache_config.max_seq_len = 2048;

KVCacheManager kv_cache(cache_config);

// Allocate sequence
auto seq_result = kv_cache.allocateSequence(1024);
if (seq_result.isErr()) {
    // Handle allocation failure
}
int seq_id = seq_result.value();

// Forward pass through layers — each layer appends its K/V but the
// shared sequence length is not advanced yet.
for (int i = 0; i < num_layers; i++) {
    layers[i]->forward(hidden_states, kv_cache, seq_id, position, stream);
}

// Advance sequence length after all layers (one token decoded).
kv_cache.advanceSeqLen(seq_id, 1);

// Release when done
kv_cache.releaseSequence(seq_id);
#include <tiny_llm/result.h>

// Lightweight ok/err result type used throughout the API instead of
// exceptions for recoverable failures (similar to std::expected).
template <typename T>
class Result {
public:
    // Constructors
    static Result<T> ok(T value);
    static Result<T> err(std::string message);

    // State checks
    bool isOk() const;
    bool isErr() const;

    // Value access (throws if error)
    T& value();
    const T& value() const;
    T valueOr(T default_value) const;  // safe access with fallback

    // Error access (throws if ok)
    const std::string& error() const;

    // Monadic operations
    // map: apply f to the contained value, rewrapping in Result.
    template <typename F>
    auto map(F&& f) -> Result<decltype(f(value()))>;
    // flatMap: f itself returns a Result; avoids Result<Result<U>>.
    template <typename F>
    auto flatMap(F&& f) -> decltype(f(value()));
};
Usage:
class="highlight">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Result<int>parseInt(conststd::string&s){try{returnResult<int>::ok(std::stoi(s));}catch(...){returnResult<int>::err("Invalid integer: "+s);}}autoresult=parseInt("42");if(result.isOk()){std::cout<<"Value: "<<result.value()<<std::endl;}else{std::cerr<<"Error: "<<result.error()<<std::endl;}// Or with defaultintval=parseInt("abc").valueOr(0);// val = 0
// Small pool of CUDA streams for overlapping independent work.
class StreamPool {
public:
    explicit StreamPool(int num_streams = 4);

    cudaStream_t getStream();        // Round-robin
    cudaStream_t getStream(int idx); // By index

    void synchronizeAll();           // Wait for all pooled streams
    int numStreams() const;
};