HPC-AI-Optimization-Lab 1.0.0
High-Performance CUDA Kernels for AI/ML Workloads
Loading...
Searching...
No Matches
tma.cuh
Go to the documentation of this file.
1#pragma once
2
3#include <cuda_runtime.h>
4#include <cuda/pipeline>
5
6namespace hpc::cuda13 {
7
/// Options bundle accepted (by const reference) by tma_copy_2d().
///
/// Only the field names and default values are visible in this header; the
/// remarks on meaning below are inferred from the names and must be confirmed
/// against the implementation.
struct TMAConfig {
    /// Defaults to 1.
    /// NOTE(review): presumably the thread-block-cluster extent along x - confirm.
    int cluster_width = 1;

    /// Defaults to 1.
    /// NOTE(review): presumably the thread-block-cluster extent along y - confirm.
    int cluster_height = 1;

    /// Defaults to 2.
    /// NOTE(review): presumably the number of in-flight pipeline stages - confirm.
    int pipeline_depth = 2;

    /// Defaults to true.
    /// NOTE(review): presumably selects the TMA path over a fallback copy - confirm.
    bool use_tma = true;
};
14
/// Architecture predicate; takes no arguments and returns a bool.
///
/// NOTE(review): only the declaration is visible in this header. Judging by the
/// name, this presumably reports whether the GPU in use is a Hopper-class
/// device - which device is queried, and how errors are handled, must be
/// confirmed against the definition.
bool is_hopper_architecture();
16
17template <typename T, int NUM_CHANNELS = 8>
18void tma_copy_2d(const T* src, T* dst,
19 int rows, int cols,
20 const TMAConfig& config,
21 cudaStream_t stream = nullptr);
22
23template <typename T>
24void tma_copy_2d_fallback(const T* src, T* dst,
25 int rows, int cols,
26 cudaStream_t stream = nullptr);
27
28} // namespace hpc::cuda13