cudaFlowCapturer class
#include <taskflow/cuda/cuda_capturer.hpp>
        class to create a cudaFlow graph using stream capture
The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except it allows capturing a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1 and task_2, where task_1 runs before task_2.
    taskflow.emplace([](tf::cudaFlowCapturer& capturer){

      // capture my_kernel_1 through the given stream managed by the capturer
      auto task_1 = capturer.on([&](cudaStream_t stream){
        my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
      });

      // capture my_kernel_2 through the given stream managed by the capturer
      auto task_2 = capturer.on([&](cudaStream_t stream){
        my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
      });

      task_1.precede(task_2);
    });
Similar to tf::cudaFlow, a cudaFlowCapturer is a task created from tf::Taskflow and is run by one worker thread in the executor. Inside a cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm.
Please refer to GPU Tasking (cudaFlowCapturer) for details.
Constructors, destructors, conversion operators
- cudaFlowCapturer() defaulted
- constructs a standalone cudaFlowCapturer
- ~cudaFlowCapturer() defaulted
- destructs the cudaFlowCapturer
- cudaFlowCapturer(cudaFlowCapturer&&) defaulted
- default move constructor
Public functions
- auto operator=(cudaFlowCapturer&&) -> cudaFlowCapturer& defaulted
- default move assignment operator
- auto empty() const -> bool
- queries the emptiness of the graph
- auto num_tasks() const -> size_t
- queries the number of tasks
- void clear()
- clears this cudaFlow capturer
- void dump(std::ostream& os) const
- dumps the cudaFlow graph into a DOT format through an output stream
- void dump_native_graph(std::ostream& os) const
- dumps the native captured graph into a DOT format through an output stream
- template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>auto on(C&& callable) -> cudaTask
- captures a sequence of CUDA operations from the given callable
- template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>void on(cudaTask task, C&& callable)
- updates a capture task to another sequence of CUDA operations
- auto noop() -> cudaTask
- captures a no-operation task
- void noop(cudaTask task)
- updates a task to a no-operation task
- auto memcpy(void* dst, const void* src, size_t count) -> cudaTask
- copies data between host and device asynchronously through a stream
- void memcpy(cudaTask task, void* dst, const void* src, size_t count)
- updates a capture task to a memcpy operation
- template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- captures a copy task of typed data
- template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>void copy(cudaTask task, T* tgt, const T* src, size_t num)
- updates a capture task to a copy operation
- auto memset(void* ptr, int v, size_t n) -> cudaTask
- initializes or sets GPU memory to the given value byte by byte
- void memset(cudaTask task, void* ptr, int value, size_t n)
- updates a capture task to a memset operation
- template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args) -> cudaTask
- captures a kernel
- template<typename F, typename... ArgsT>void kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)
- updates a capture task to a kernel operation
- template<typename C>auto single_task(C c) -> cudaTask
- captures a kernel that runs the given callable with only one thread
- template<typename C>void single_task(cudaTask task, C c)
- updates a capture task to a single-threaded kernel
- template<typename I, typename C>auto for_each(I first, I last, C callable) -> cudaTask
- captures a kernel that applies a callable to each dereferenced element of the data array
- template<typename I, typename C>void for_each(cudaTask task, I first, I last, C callable)
- updates a capture task to a for-each kernel task
- template<typename I, typename C>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- captures a kernel that applies a callable to each index in the range with the step size
- template<typename I, typename C>void for_each_index(cudaTask task, I first, I last, I step, C callable)
- updates a capture task to a for-each-index kernel task
- template<typename I, typename O, typename C>auto transform(I first, I last, O output, C op) -> cudaTask
- captures a kernel that transforms an input range to an output range
- template<typename I, typename O, typename C>void transform(cudaTask task, I first, I last, O output, C op)
- updates a capture task to a transform kernel task
- template<typename I1, typename I2, typename O, typename C>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- captures a kernel that transforms two input ranges to an output range
- template<typename I1, typename I2, typename O, typename C>void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)
- updates a capture task to a transform kernel task
- template<typename OPT, typename... ArgsT>auto make_optimizer(ArgsT && ... args) -> OPT&
- selects a different optimization algorithm
- auto capture() -> cudaGraph_t
- captures the cudaFlow and turns it into a CUDA Graph
- void run(cudaStream_t stream)
- offloads the cudaFlowCapturer onto a GPU asynchronously via a stream
- auto native_graph() -> cudaGraph_t
- acquires a reference to the underlying CUDA graph
- auto native_executable() -> cudaGraphExec_t
- acquires a reference to the underlying CUDA graph executable
Function documentation
tf::cudaFlowCapturer::cudaFlowCapturer() defaulted
constructs a standalone cudaFlowCapturer
A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using tf::cudaFlowCapturer::run.
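For example, one can build and offload a standalone capturer directly on a caller-managed stream (a minimal sketch; my_kernel, grid, block, and my_args are assumed to be defined elsewhere):

    tf::cudaFlowCapturer capturer;

    // capture a kernel through the stream managed by the capturer
    capturer.on([&](cudaStream_t stream){
      my_kernel<<<grid, block, 0, stream>>>(my_args);
    });

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    capturer.run(stream);            // offload the captured graph asynchronously
    cudaStreamSynchronize(stream);   // wait for the run to complete
    cudaStreamDestroy(stream);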
              
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
cudaTask tf::cudaFlowCapturer::on(C&& callable)
captures a sequence of CUDA operations from the given callable
| Template parameters | |
|---|---|
| C | callable type constructible with std::function<void(cudaStream_t)> | 
| Parameters | |
| callable | a callable to capture CUDA operations with the stream | 
This method applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable.
              
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
void tf::cudaFlowCapturer::on(cudaTask task, C&& callable)
updates a capture task to another sequence of CUDA operations
The method is similar to cudaFlowCapturer::on but operates on an existing task.
cudaTask tf::cudaFlowCapturer::noop()
captures a no-operation task
| Returns | a tf::cudaTask handle | 
|---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.
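For example, a sketch of such a phased graph inside a capturer callable, with two tasks per group and a no-op barrier in between (the kernels and launch parameters are illustrative):

    // group A
    auto A1 = capturer.on([&](cudaStream_t s){ kernel_a1<<<g, b, 0, s>>>(args_a1); });
    auto A2 = capturer.on([&](cudaStream_t s){ kernel_a2<<<g, b, 0, s>>>(args_a2); });

    // no-op barrier between the two groups
    auto barrier = capturer.noop();

    // group B
    auto B1 = capturer.on([&](cudaStream_t s){ kernel_b1<<<g, b, 0, s>>>(args_b1); });
    auto B2 = capturer.on([&](cudaStream_t s){ kernel_b2<<<g, b, 0, s>>>(args_b2); });

    // 2*n edges through the barrier instead of n^2 direct edges
    barrier.succeed(A1, A2);
    barrier.precede(B1, B2);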
void tf::cudaFlowCapturer::noop(cudaTask task)
updates a task to a no-operation task
The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task.
cudaTask tf::cudaFlowCapturer::memcpy(void* dst, const void* src, size_t count)
copies data between host and device asynchronously through a stream
| Parameters | |
|---|---|
| dst | destination memory address | 
| src | source memory address | 
| count | size in bytes to copy | 
The method captures a cudaMemcpyAsync operation through an internal stream.
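For example, a sketch that stages data to the device, runs a kernel, and copies the result back (h_data, d_data, N, and my_kernel are assumed to be defined elsewhere):

    auto h2d  = capturer.memcpy(d_data, h_data, N * sizeof(float));
    auto comp = capturer.on([&](cudaStream_t s){
      my_kernel<<<(N + 255) / 256, 256, 0, s>>>(d_data, N);
    });
    auto d2h  = capturer.memcpy(h_data, d_data, N * sizeof(float));

    h2d.precede(comp);
    comp.precede(d2h);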
void tf::cudaFlowCapturer::memcpy(cudaTask task, void* dst, const void* src, size_t count)
updates a capture task to a memcpy operation
The method is similar to cudaFlowCapturer::memcpy but operates on an existing task.
              
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaFlowCapturer::copy(T* tgt, const T* src, size_t num)
captures a copy task of typed data
| Template parameters | |
|---|---|
| T | element type (non-void) | 
| Parameters | |
| tgt | pointer to the target memory block | 
| src | pointer to the source memory block | 
| num | number of elements to copy | 
| Returns | cudaTask handle | 
A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
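For example, a sketch that copies N elements in each direction using the typed interface (h_ints and d_ints are assumed to be int buffers of at least N elements):

    auto to_gpu   = capturer.copy(d_ints, h_ints, N);  // host -> device, N * sizeof(int) bytes
    auto from_gpu = capturer.copy(h_ints, d_ints, N);  // device -> host, N * sizeof(int) bytes
    to_gpu.precede(from_gpu);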
              
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf::cudaFlowCapturer::copy(cudaTask task, T* tgt, const T* src, size_t num)
updates a capture task to a copy operation
The method is similar to cudaFlowCapturer::copy but operates on an existing task.
cudaTask tf::cudaFlowCapturer::memset(void* ptr, int v, size_t n)
initializes or sets GPU memory to the given value byte by byte
| Parameters | |
|---|---|
| ptr | pointer to GPU memory | 
| v | value to set for each byte of the specified memory | 
| n | size in bytes to set | 
The method captures a cudaMemsetAsync operation through an internal stream to fill the first n bytes of the memory area pointed to by ptr with the constant byte value v.
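For example, a sketch that zeroes a device buffer before a kernel reads it (d_data, N, and my_kernel are assumed to be defined elsewhere):

    auto zero = capturer.memset(d_data, 0, N * sizeof(float));
    auto comp = capturer.on([&](cudaStream_t s){
      my_kernel<<<(N + 255) / 256, 256, 0, s>>>(d_data, N);
    });
    zero.precede(comp);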
void tf::cudaFlowCapturer::memset(cudaTask task, void* ptr, int value, size_t n)
updates a capture task to a memset operation
The method is similar to cudaFlowCapturer::memset but operates on an existing task.
              
template<typename F, typename... ArgsT>
cudaTask tf::cudaFlowCapturer::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args)
captures a kernel
| Template parameters | |
|---|---|
| F | kernel function type | 
| ArgsT | kernel function parameters type | 
| Parameters | |
| g | configured grid | 
| b | configured block | 
| s | configured shared memory size in bytes | 
| f | kernel function | 
| args | arguments to forward to the kernel function by copy | 
| Returns | cudaTask handle | 
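For example, a sketch that captures a one-dimensional kernel launch over N elements with 256 threads per block and no dynamic shared memory (my_kernel and d_data are assumed to be defined elsewhere):

    dim3 grid((N + 255) / 256);
    dim3 block(256);
    auto k = capturer.kernel(grid, block, 0, my_kernel, d_data, N);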
              
template<typename F, typename... ArgsT>
void tf::cudaFlowCapturer::kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args)
updates a capture task to a kernel operation
The method is similar to cudaFlowCapturer::kernel but operates on an existing task.
              
template<typename C>
cudaTask tf::cudaFlowCapturer::single_task(C c)
captures a kernel that runs the given callable with only one thread
| Template parameters | |
|---|---|
| C | callable type | 
| Parameters | |
| c | callable to run by a single kernel thread | 
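For example, a sketch that sets a single device flag with one thread (d_flag is an assumed device pointer; device lambdas require compiling with --extended-lambda under nvcc):

    auto init = capturer.single_task([d_flag] __device__ () {
      *d_flag = 1;
    });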
              
template<typename C>
void tf::cudaFlowCapturer::single_task(cudaTask task, C c)
updates a capture task to a single-threaded kernel
This method is similar to cudaFlowCapturer::single_task but operates on an existing task.
              
template<typename I, typename C>
cudaTask tf::cudaFlowCapturer::for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
| Template parameters | |
|---|---|
| I | iterator type | 
| C | callable type | 
| Parameters | |
| first | iterator to the beginning | 
| last | iterator to the end | 
| callable | a callable object to apply to the dereferenced iterator | 
| Returns | cudaTask handle | 
This method is equivalent to the parallel execution of the following loop on a GPU:
    for(auto itr = first; itr != last; itr++) {
      callable(*itr);
    }
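For example, a sketch that doubles every element of a device array of N floats (d_data is an assumed device pointer):

    auto scale = capturer.for_each(d_data, d_data + N, [] __device__ (float& x) {
      x *= 2.0f;
    });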
              
template<typename I, typename C>
void tf::cudaFlowCapturer::for_each(cudaTask task, I first, I last, C callable)
updates a capture task to a for-each kernel task
This method is similar to cudaFlowCapturer::for_each but operates on an existing task.
              
template<typename I, typename C>
cudaTask tf::cudaFlowCapturer::for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the step size
| Template parameters | |
|---|---|
| I | index type | 
| C | callable type | 
| Parameters | |
| first | beginning index | 
| last | last index | 
| step | step size | 
| callable | the callable to apply to each index in the range | 
| Returns | cudaTask handle | 
This method is equivalent to the parallel execution of the following loop on a GPU:
    // step is positive [first, last)
    for(auto i=first; i<last; i+=step) {
      callable(i);
    }

    // step is negative [first, last)
    for(auto i=first; i>last; i+=step) {
      callable(i);
    }
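For example, a sketch that writes each index into a device array (d_data is an assumed device pointer and N an int):

    auto fill = capturer.for_each_index(0, N, 1, [d_data] __device__ (int i) {
      d_data[i] = i;
    });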
              
template<typename I, typename C>
void tf::cudaFlowCapturer::for_each_index(cudaTask task, I first, I last, I step, C callable)
updates a capture task to a for-each-index kernel task
This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task.
              
template<typename I, typename O, typename C>
cudaTask tf::cudaFlowCapturer::transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
| Template parameters | |
|---|---|
| I | input iterator type | 
| O | output iterator type | 
| C | unary operator type | 
| Parameters | |
| first | iterator to the beginning of the input range | 
| last | iterator to the end of the input range | 
| output | iterator to the beginning of the output range | 
| op | unary operator to apply to transform each item in the range | 
| Returns | cudaTask handle | 
This method is equivalent to the parallel execution of the following loop on a GPU:
    while (first != last) {
      *output++ = op(*first++);
    }
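For example, a sketch that adds one to every element of an input range and writes the result to an output range (d_in and d_out are assumed device pointers):

    auto plus_one = capturer.transform(d_in, d_in + N, d_out, [] __device__ (float x) {
      return x + 1.0f;
    });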
              
template<typename I, typename O, typename C>
void tf::cudaFlowCapturer::transform(cudaTask task, I first, I last, O output, C op)
updates a capture task to a transform kernel task
This method is similar to cudaFlowCapturer::transform but operates on an existing task.
              
template<typename I1, typename I2, typename O, typename C>
cudaTask tf::cudaFlowCapturer::transform(I1 first1, I1 last1, I2 first2, O output, C op)
captures a kernel that transforms two input ranges to an output range
| Template parameters | |
|---|---|
| I1 | first input iterator type | 
| I2 | second input iterator type | 
| O | output iterator type | 
| C | binary operator type | 
| Parameters | |
| first1 | iterator to the beginning of the input range | 
| last1 | iterator to the end of the input range | 
| first2 | iterator to the beginning of the second input range | 
| output | iterator to the beginning of the output range | 
| op | binary operator to apply to transform each pair of items in the two input ranges | 
| Returns | cudaTask handle | 
This method is equivalent to the parallel execution of the following loop on a GPU:
    while (first1 != last1) {
      *output++ = op(*first1++, *first2++);
    }
              
template<typename I1, typename I2, typename O, typename C>
void tf::cudaFlowCapturer::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)
updates a capture task to a transform kernel task
This method is similar to cudaFlowCapturer::transform but operates on an existing task.
              
template<typename OPT, typename... ArgsT>
OPT& tf::cudaFlowCapturer::make_optimizer(ArgsT&&... args)
selects a different optimization algorithm
| Template parameters | |
|---|---|
| OPT | optimizer type | 
| ArgsT | arguments types | 
| Parameters | |
| args | arguments to forward to construct the optimizer | 
| Returns | a reference to the optimizer | 
We currently support the following optimization algorithms to capture a user-described cudaFlow:
- tf::cudaFlowSequentialOptimizer
- tf::cudaFlowRoundRobinOptimizer
- tf::cudaFlowLinearOptimizer
By default, tf::cudaFlowCapturer uses tf::cudaFlowRoundRobinOptimizer to transform a user-described graph into a native CUDA graph.
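For example, a sketch that switches the capturer to a different optimizer (the stream count passed to the round-robin optimizer is an assumed constructor argument):

    // sequential capture: no overlap between independent tasks
    capturer.make_optimizer<tf::cudaFlowSequentialOptimizer>();

    // round-robin capture over, e.g., four streams
    capturer.make_optimizer<tf::cudaFlowRoundRobinOptimizer>(4);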
void tf::cudaFlowCapturer::run(cudaStream_t stream)
offloads the cudaFlowCapturer onto a GPU asynchronously via a stream
| Parameters | |
|---|---|
| stream | stream for performing this operation | 
Offloads the present cudaFlowCapturer onto a GPU asynchronously via the given stream.
An offloaded cudaFlowCapturer forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology, but you can still update node parameters.
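For example, a sketch that runs the capturer twice and, between runs, only rebinds the parameters of an existing task (task is assumed to be a cudaTask previously created by capturer.memset; d_data and N are defined elsewhere):

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    capturer.run(stream);                  // first run instantiates the executable graph
    cudaStreamSynchronize(stream);

    capturer.memset(task, d_data, 1, N);   // update parameters of the existing memset task
    capturer.run(stream);                  // re-run with the updated parameters
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);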