HPC-AI-Optimization-Lab 1.0.0
High-Performance CUDA Kernels for AI/ML Workloads
Loading...
Searching...
No Matches
cluster.cuh
Go to the documentation of this file.
1#pragma once
2
3#include <cuda_runtime.h>
4
5namespace hpc::cuda13 {
6
/// Groups the launch-shape settings that cluster_reduce() and
/// cluster_reduce_fallback() (declared in this header) take by const reference.
/// NOTE(review): nothing in this header reads these fields, so the per-field
/// descriptions below are inferred from the names only — confirm against the
/// implementation.
7struct ClusterConfig {
8 dim3 cluster_dims;  ///< presumably the thread-block-cluster shape — TODO confirm
9 dim3 grid_dims;  ///< presumably the kernel launch grid size — TODO confirm
10 dim3 block_dims;  ///< presumably the threads-per-block shape — TODO confirm
11 bool use_cluster = true;  ///< defaults to true; presumably selects the cluster path over the fallback — TODO confirm
12};
13
/// Declaration only: takes no arguments and returns a bool; the definition is
/// not part of this listing.
/// NOTE(review): by its name this presumably reports whether the GPU in use is
/// a Hopper-class device. Which device is queried, and how query errors are
/// reported, cannot be determined from this header — confirm in the
/// implementation.
14bool is_hopper_architecture();
15
/// Function-template declaration; no definition is visible in this listing, so
/// a definition or explicit instantiations for each T in use must be supplied
/// elsewhere.
///
/// @tparam T      Element type pointed to by both `input` and `output`.
/// @param input   Pointer to const T; not modified through this pointer.
/// @param output  Pointer to mutable T that receives the result.
/// @param n       size_t count — presumably the number of T elements in
///                `input`; TODO confirm.
/// @param config  ClusterConfig passed by const reference.
/// @param stream  Optional; defaults to nullptr (the null stream handle, i.e.
///                CUDA's default stream).
///
/// NOTE(review): the reduction operator, how many elements are written to
/// `output`, whether the pointers must refer to device memory, and whether the
/// call returns before the work completes are not shown here — confirm against
/// the implementation.
16template <typename T>
17void cluster_reduce(const T* input, T* output, size_t n,
18 const ClusterConfig& config,
19 cudaStream_t stream = nullptr);
20
/// Function-template declaration with the same parameter list and return type
/// as cluster_reduce(); no definition is visible in this listing, so a
/// definition or explicit instantiations for each T in use must be supplied
/// elsewhere.
///
/// @tparam T      Element type pointed to by both `input` and `output`.
/// @param input   Pointer to const T; not modified through this pointer.
/// @param output  Pointer to mutable T that receives the result.
/// @param n       size_t count — presumably the number of T elements in
///                `input`; TODO confirm.
/// @param config  ClusterConfig passed by const reference.
/// @param stream  Optional; defaults to nullptr (the null stream handle, i.e.
///                CUDA's default stream).
///
/// NOTE(review): by its name this is presumably the code path used when the
/// cluster-based variant is unavailable or disabled; which `config` fields it
/// honours is not shown here — confirm against the implementation.
21template <typename T>
22void cluster_reduce_fallback(const T* input, T* output, size_t n,
23 const ClusterConfig& config,
24 cudaStream_t stream = nullptr);
25
26} // namespace hpc::cuda13