Usage Examples

Common usage patterns for Mini-ImagePipe.

Table of contents

  1. Basic Pipeline
  2. Batch Processing
  3. Runtime Parameter Updates
  4. Error Handling
    1. Basic Error Checking
    2. Custom Error Callback
  5. Pipeline Configuration
  6. Performance Tips

Basic Pipeline

#include "pipeline.h"
#include "operators/resize.h"
#include "operators/color_convert.h"
#include "operators/gaussian_blur.h"
#include "operators/sobel.h"

using namespace mini_image_pipe;

// Minimal end-to-end example: resize -> grayscale -> blur -> edge detect.
int main() {
    // Configure the pipeline; numStreams controls how many CUDA streams
    // are available for concurrent execution.
    PipelineConfig config;
    config.numStreams = 4;
    Pipeline pipeline(config);

    // Input description. In a real application d_input is a device pointer
    // obtained from cudaMalloc + cudaMemcpy (or a decoder); it is left null
    // here purely for illustration.
    const int width    = 1920;
    const int height   = 1080;
    const int channels = 3;       // RGB input
    void* d_input = nullptr;      // TODO: allocate and upload image data

    // Create the operators. Each operator is shared so it can also be
    // reconfigured later (see "Runtime Parameter Updates").
    auto resize = std::make_shared<ResizeOperator>(320, 240, InterpolationMode::BILINEAR);
    auto gray   = std::make_shared<ColorConvertOperator>(ColorConversionType::RGB_TO_GRAY);
    auto blur   = std::make_shared<GaussianBlurOperator>(GaussianKernelSize::KERNEL_5x5);
    auto sobel  = std::make_shared<SobelOperator>();

    // Register operators; addOperator returns a node id used for wiring.
    int n1 = pipeline.addOperator("Resize", resize);
    int n2 = pipeline.addOperator("Gray",   gray);
    int n3 = pipeline.addOperator("Blur",   blur);
    int n4 = pipeline.addOperator("Sobel",  sobel);

    // Connect: Resize -> Gray -> Blur -> Sobel
    pipeline.connect(n1, n2);
    pipeline.connect(n2, n3);
    pipeline.connect(n3, n4);

    // Feed the source node and run the whole graph.
    pipeline.setInput(n1, d_input, width, height, channels);
    pipeline.execute();

    // Retrieve the final node's output (a device pointer owned by the
    // pipeline; do not free it yourself).
    void* output = pipeline.getOutput(n4);
    (void)output;  // consume the result here (e.g. download with cudaMemcpy)
    return 0;
}

Batch Processing

For processing multiple frames efficiently:

// One device pointer per frame; all frames must share width/height/channels.
std::vector<void*> inputs = {...};  // Array of device pointers
// Filled by executeBatch with one output pointer per input frame.
std::vector<void*> outputs;

Pipeline pipeline;
// ... setup pipeline ...

// Runs every frame through the pipeline, distributing work across the
// configured streams; returns cudaSuccess only if the entire batch succeeded.
cudaError_t err = pipeline.executeBatch(inputs, outputs, width, height, channels);

The batch executor:

  • Processes frames concurrently across multiple streams
  • Reuses allocated buffers between frames
  • Synchronizes only at the end of each batch

Runtime Parameter Updates

Operators can be reconfigured between executions:

// Keep the shared_ptr to the operator so it can be reconfigured after
// the pipeline has been built.
auto resizeOp = std::make_shared<ResizeOperator>(640, 480);
pipeline.addOperator("Resize", resizeOp);

// Later, change target size
resizeOp->setTargetSize(320, 240);
// NOTE(review): reset() presumably invalidates cached intermediate buffers
// so the new output size takes effect — confirm against Pipeline docs.
pipeline.reset();
pipeline.setInput(...);
pipeline.execute();

Error Handling

Basic Error Checking

// Run the graph and, on failure, identify which task(s) failed.
const cudaError_t status = pipeline.execute();
if (status != cudaSuccess) {
    std::cerr << "Pipeline failed: " << cudaGetErrorString(status) << std::endl;
    // Walk the task graph and report every task whose state is FAILED.
    const auto& tasks = pipeline.getTaskGraph().getTasks();
    for (const auto& t : tasks) {
        if (t.state.load() != TaskState::FAILED) {
            continue;
        }
        std::cerr << "Task " << t.name << " failed" << std::endl;
    }
}

Custom Error Callback

pipeline.getScheduler().setErrorCallback([](int taskId, cudaError_t err) {
    std::cerr << "Task " << taskId << " failed with error: " 
              << cudaGetErrorString(err) << std::endl;
});

Pipeline Configuration

PipelineConfig config;
config.numStreams = 4;                        // CUDA streams for parallelism
config.pinnedPoolSize = 64 * 1024 * 1024;     // 64MB pinned memory pool
config.devicePoolSize = 256 * 1024 * 1024;    // 256MB device memory pool
config.enableProfiling = true;                // Enable CUDA profiling
config.maxBatchSize = 16;                     // Maximum frames per batch

Pipeline pipeline(config);

Performance Tips

  1. Match streams to workload: Use 2-4 streams for most workloads. Too many streams can hurt performance due to stream-scheduling and kernel-launch overhead without adding real concurrency.

  2. Reuse pipelines: Create once, execute many times. Buffer reuse significantly reduces allocation overhead.

  3. Batch processing: Use executeBatch() for video streams or image sequences.

  4. Memory pool sizing: Size pools to hold 2-3 frames' worth of intermediate buffers.

  5. Operator fusion: When possible, combine operations to reduce memory bandwidth:

    // Good: Single operation
    auto op = std::make_shared<ComplexOperator>();
       
    // Avoid: Multiple small operations
    auto op1 = std::make_shared<Op1>();
    auto op2 = std::make_shared<Op2>();