#include "notifier.hpp"
#include "observer.hpp"
#include "taskflow.hpp"

Notifier::Waiter* waiter;
TaskQueue<Node*> wsq[NUM_DOMAINS];

Worker* worker {nullptr};
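// (Data members shown above: the Worker's notifier waiter slot and per-domain
//  work-stealing queues, plus the per-thread Worker pointer, which remains
//  nullptr on threads not owned by the executor.)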
size_t M = cuda_num_devices()
template <typename P, typename C>
template <typename Observer, typename... Args>
template <typename Observer>
const size_t _VICTIM_BEG;
const size_t _VICTIM_END;
const size_t _MAX_STEALS;
const size_t _MAX_YIELDS;

size_t _num_topologies {0};
#ifdef TF_ENABLE_CUDA

Notifier _notifier[NUM_DOMAINS];
TaskQueue<Node*> _wsq[NUM_DOMAINS];
size_t _id_offset[NUM_DOMAINS] = {0};
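// (Per-domain executor state shown above: a Notifier for waking sleeping
//  workers, a shared work-stealing queue for tasks scheduled from outside the
//  worker threads, and the id offset of the first worker in each domain.)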
PerThread& _per_thread() const;
bool _wait_for_task(Worker&, Node*&);

void _instantiate_tfprof();
void _flush_tfprof();
void _observer_prologue(Worker&, Node*);
void _observer_epilogue(Worker&, Node*);
void _spawn(size_t, Domain);
void _worker_loop(Worker&);
void _exploit_task(Worker&, Node*&);
void _explore_task(Worker&, Node*&);
void _schedule(Node*);
void _schedule(PassiveVector<Node*>&);
void _invoke(Worker&, Node*);
void _invoke_static_work(Worker&, Node*);
void _invoke_dynamic_work(Worker&, Node*);
void _invoke_dynamic_work_internal(Worker&, Node*, Graph&, bool);
void _invoke_dynamic_work_external(Node*, Graph&, bool);
void _invoke_condition_work(Worker&, Node*);
void _invoke_module_work(Worker&, Node*);
#ifdef TF_ENABLE_CUDA
void _invoke_cudaflow_work(Worker&, Node*);
void _invoke_cudaflow_work_internal(Worker&, Node*);
#endif

void _set_up_topology(Topology*);
void _tear_down_topology(Topology*);
void _increment_topology();
void _decrement_topology();
void _decrement_topology_and_notify();
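// (How these pieces fit together: _spawn() launches the worker threads; each
//  worker drains its own queue in _exploit_task(), steals from peers in
//  _explore_task(), parks in _wait_for_task() when no work is found, and runs
//  ready nodes through _invoke(), which dispatches on the task type.)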
#ifdef TF_ENABLE_CUDA

  _VICTIM_END {N + M - 1},
  _MAX_STEALS {(N + M + 1) << 1},
  _cuda_devices {cuda_num_devices()},
  _notifier {Notifier(N), Notifier(M)} {

  TF_THROW("no cpu workers to execute taskflows");

  TF_THROW("no gpu workers to execute cudaflows");
  for(int i=0; i<NUM_DOMAINS; ++i) {
    _num_actives[i].store(0, std::memory_order_relaxed);
    _num_thieves[i].store(0, std::memory_order_relaxed);

  for(size_t i=0; i<_cuda_devices.size(); ++i) {
    _cuda_devices[i].streams.resize(M);
    cudaScopedDevice ctx(i);
    for(size_t m=0; m<M; ++m) {
      cudaStreamCreate(&(_cuda_devices[i].streams[m])),
      "failed to create a cudaStream for worker ", m, " on device ", i

  _instantiate_tfprof();
  _MAX_STEALS {(N + 1) << 1},
  _notifier {Notifier(N)} {

  TF_THROW("no cpu workers to execute taskflows");

  for(int i=0; i<NUM_DOMAINS; ++i) {
    _num_actives[i].store(0, std::memory_order_relaxed);
    _num_thieves[i].store(0, std::memory_order_relaxed);

  _instantiate_tfprof();
  for(int i=0; i<NUM_DOMAINS; ++i) {
    _notifier[i].notify(true);

  for(auto& t : _threads) {

#ifdef TF_ENABLE_CUDA

  for(size_t i=0; i<_cuda_devices.size(); ++i) {
    cudaScopedDevice ctx(i);
    for(size_t m=0; m<_cuda_devices[i].streams.size(); ++m) {
      cudaStreamDestroy(_cuda_devices[i].streams[m]);
inline void Executor::_instantiate_tfprof() {
  _tfprof = get_env("TF_ENABLE_PROFILER").empty() ?
            nullptr : make_observer<TFProfObserver>().get();
inline void Executor::_flush_tfprof() {
  fpath << get_env("TF_ENABLE_PROFILER") << _tfprof->_uuid << ".tfp";
  return _workers.size();
  return _num_topologies;
inline Executor::PerThread& Executor::_per_thread() const {
  thread_local PerThread pt;
  auto worker = _per_thread().worker;
  return worker ? static_cast<int>(worker->id) : -1;
inline void Executor::_spawn(size_t N, Domain d) {

  auto id = _threads.size();

  for(size_t i=0; i<N; ++i, ++id) {

    _workers[id].id = id;
    _workers[id].vtm = id;
    _workers[id].domain = d;
    _workers[id].executor = this;
    _workers[id].waiter = &_notifier[d]._waiters[i];

    _threads.emplace_back([this] (Worker& w) -> void {

      PerThread& pt = _per_thread();

      if(_wait_for_task(w, t) == false) {
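// _explore_task: repeatedly steal from a randomly chosen victim (or from the
// executor's shared queue) until a task is found or the steal/yield bounds
// _MAX_STEALS/_MAX_YIELDS are exceeded.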
inline void Executor::_explore_task(Worker& w, Node*& t) {

  const auto d = w.domain;

  size_t num_steals = 0;
  size_t num_yields = 0;

  t = (w.id == w.vtm) ? _wsq[d].steal() : _workers[w.vtm].wsq[d].steal();

  if(num_steals++ > _MAX_STEALS) {
    if(num_yields++ > _MAX_YIELDS) {

  w.vtm = rdvtm(w.rdgen);
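// _exploit_task: execute tasks from the worker's own queue; the first worker
// to become active wakes one sleeping peer so stealing can continue.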
inline void Executor::_exploit_task(Worker& w, Node*& t) {

  const auto d = w.domain;

  if(_num_actives[d].fetch_add(1) == 0 && _num_thieves[d] == 0) {
    _notifier[d].notify(false);

  if(t->_parent == nullptr) {
    if(t->_topology->_join_counter.fetch_sub(1) == 1) {
      _tear_down_topology(t->_topology);

  t->_parent->_join_counter.fetch_sub(1);
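// _wait_for_task: the two-phase sleep protocol. Between prepare_wait() and
// commit_wait() the worker re-checks the shared and per-worker queues and
// cancels the wait if new work (or shutdown) shows up, avoiding lost wakeups.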
inline bool Executor::_wait_for_task(Worker& worker, Node*& t) {

  const auto d = worker.domain;

  _explore_task(worker, t);

  if(_num_thieves[d].fetch_sub(1) == 1) {
    _notifier[d].notify(false);

  _notifier[d].prepare_wait(worker.waiter);

  if(!_wsq[d].empty()) {

    _notifier[d].cancel_wait(worker.waiter);

    if(_num_thieves[d].fetch_sub(1) == 1) {
      _notifier[d].notify(false);

    worker.vtm = worker.id;

  _notifier[d].cancel_wait(worker.waiter);
  for(int i=0; i<NUM_DOMAINS; ++i) {
    _notifier[i].notify(true);

  if(_num_thieves[d].fetch_sub(1) == 1) {
    if(_num_actives[d]) {
      _notifier[d].cancel_wait(worker.waiter);

  for(auto& w : _workers) {
    if(!w.wsq[d].empty()) {

      _notifier[d].cancel_wait(worker.waiter);

  _notifier[d].commit_wait(worker.waiter);
template <typename Observer, typename... Args>

  "Observer must be derived from ObserverInterface"

  auto ptr = std::make_shared<Observer>(std::forward<Args>(args)...);

  ptr->set_up(_workers.size());

  _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr));
template <typename Observer>

  "Observer must be derived from ObserverInterface"

  _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr));
  return _observers.size();
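// _schedule: push one ready node to the caller's worker queue if the caller is
// a worker of this executor, otherwise to the executor's shared queue, and
// notify the node's domain when its workers may be sleeping.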
inline void Executor::_schedule(Node* node) {

  const auto d = node->domain();

  auto worker = _per_thread().worker;

  if(worker != nullptr && worker->executor == this) {
    worker->wsq[d].push(node);
    if(worker->domain != d) {
      if(_num_actives[d] == 0 && _num_thieves[d] == 0) {
        _notifier[d].notify(false);

  _notifier[d].notify(false);
inline void Executor::_schedule(PassiveVector<Node*>& nodes) {

  const auto num_nodes = nodes.size();

  auto worker = _per_thread().worker;

  size_t tcount[NUM_DOMAINS] = {0};

  if(worker != nullptr && worker->executor == this) {

    for(size_t i=0; i<num_nodes; ++i) {
      const auto d = nodes[i]->domain();
      worker->wsq[d].push(nodes[i]);

    for(int d=0; d<NUM_DOMAINS; ++d) {
      if(tcount[d] && d != worker->domain) {
        if(_num_actives[d] == 0 && _num_thieves[d] == 0) {
          _notifier[d].notify_n(tcount[d]);

  for(size_t k=0; k<num_nodes; ++k) {
    const auto d = nodes[k]->domain();
    _wsq[d].push(nodes[k]);

  for(int d=0; d<NUM_DOMAINS; ++d) {
    _notifier[d].notify_n(tcount[d]);
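// _invoke: run a single node. Dispatch on the task type, restore the node's
// join counter for the next round (strong dependents only if the node sits on
// a branched path), then decrement successors' counters and schedule those
// that reach zero.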
inline void Executor::_invoke(Worker& worker, Node* node) {

  const auto num_successors = node->num_successors();

  auto& c = (node->_parent) ? node->_parent->_join_counter :
                              node->_topology->_join_counter;

  switch(node->_handle.index()) {

    case Node::STATIC_WORK: {
      _invoke_static_work(worker, node);

    case Node::MODULE_WORK: {
      _invoke_module_work(worker, node);

    case Node::DYNAMIC_WORK: {
      _invoke_dynamic_work(worker, node);

    case Node::CONDITION_WORK: {
      _invoke_condition_work(worker, node);

#ifdef TF_ENABLE_CUDA
    case Node::CUDAFLOW_WORK: {
      _invoke_cudaflow_work(worker, node);

  if(node->_has_state(Node::BRANCHED)) {
    node->_join_counter = node->num_strong_dependents();

  node->_join_counter = node->num_dependents();

  for(size_t i=0; i<num_successors; ++i) {
    if(--(node->_successors[i]->_join_counter) == 0) {
      _schedule(node->_successors[i]);
inline void Executor::_observer_prologue(Worker& worker, Node* node) {
  for(auto& observer : _observers) {
    observer->on_entry(worker.id, TaskView(node));
inline void Executor::_observer_epilogue(Worker& worker, Node* node) {
  for(auto& observer : _observers) {
    observer->on_exit(worker.id, TaskView(node));
inline void Executor::_invoke_static_work(Worker& worker, Node* node) {
  _observer_prologue(worker, node);
  nstd::get<Node::StaticWork>(node->_handle).work();
  _observer_epilogue(worker, node);
inline void Executor::_invoke_dynamic_work(Worker& w, Node* node) {

  _observer_prologue(w, node);

  auto& handle = nstd::get<Node::DynamicWork>(node->_handle);

  handle.subgraph.clear();

  Subflow sf(*this, node, handle.subgraph);

  _invoke_dynamic_work_internal(w, node, handle.subgraph, false);

  _observer_epilogue(w, node);
inline void Executor::_invoke_dynamic_work_external(Node* p, Graph& g, bool detach) {

  auto worker = _per_thread().worker;

  assert(worker && worker->executor == this);

  _invoke_dynamic_work_internal(*worker, p, g, detach);
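// _invoke_dynamic_work_internal: schedule a spawned subflow graph. A detached
// subflow is merged into the parent topology's graph, while a joined subflow
// keeps the spawning worker busy (pop/steal) until the parent's join counter
// drains to zero.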
inline void Executor::_invoke_dynamic_work_internal(
  Worker& w, Node* p, Graph& g, bool detach

  if(g.empty()) return;

  PassiveVector<Node*> src;

  for(auto n : g._nodes) {

    n->_topology = p->_topology;
    n->_set_up_join_counter();

    n->_parent = nullptr;
    n->_set_state(Node::DETACHED);

    if(n->num_dependents() == 0) {

  p->_topology->_taskflow._graph.merge(std::move(g));

  p->_topology->_join_counter.fetch_add(src.size());

  p->_join_counter.fetch_add(src.size());

  while(p->_join_counter != 0) {

    t = w.wsq[w.domain].pop();

    if(t->_parent == nullptr) {
      if(t->_topology->_join_counter.fetch_sub(1) == 1) {
        _tear_down_topology(t->_topology);

    t->_parent->_join_counter.fetch_sub(1);

    t = (w.id == w.vtm) ? _wsq[w.domain].steal() :
                          _workers[w.vtm].wsq[w.domain].steal();

    else if(p->_join_counter != 0) {
      w.vtm = rdvtm(w.rdgen);
inline void Executor::_invoke_condition_work(Worker& worker, Node* node) {

  _observer_prologue(worker, node);

  if(node->_has_state(Node::BRANCHED)) {
    node->_join_counter = node->num_strong_dependents();

  node->_join_counter = node->num_dependents();

  auto id = nstd::get<Node::ConditionWork>(node->_handle).work();

  if(id >= 0 && static_cast<size_t>(id) < node->num_successors()) {
    auto s = node->_successors[id];
    s->_join_counter.store(0);

    node->_parent ? node->_parent->_join_counter.fetch_add(1) :
                    node->_topology->_join_counter.fetch_add(1);

  _observer_epilogue(worker, node);
#ifdef TF_ENABLE_CUDA

inline void Executor::_invoke_cudaflow_work(Worker& worker, Node* node) {
  _observer_prologue(worker, node);
  _invoke_cudaflow_work_internal(worker, node);
  _observer_epilogue(worker, node);
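// _invoke_cudaflow_work_internal: lower the cudaFlow into a native CUDA graph,
// instantiate an executable graph, and launch it on the worker's per-device
// stream until the cudaFlow's repeat predicate is satisfied.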
inline void Executor::_invoke_cudaflow_work_internal(Worker& w, Node* node) {

  assert(w.domain == node->domain());

  auto& h = nstd::get<Node::cudaFlowWork>(node->_handle);

  cudaFlow cf(h.graph, [repeat=1] () mutable { return repeat-- == 0; });

  if(h.graph.empty()) {

  const int d = cf._device;

  cudaScopedDevice ctx(d);

  auto s = cf._stream ? *(cf._stream) :
                        _cuda_devices[d].streams[w.id - _id_offset[w.domain]];

  h.graph._make_native_graph();

  cudaGraphExec_t exec;

    cudaGraphInstantiate(&exec, h.graph._native_handle, nullptr, nullptr, 0),
    "failed to create an executable cudaGraph"

  while(!cf._predicate()) {

    cudaGraphLaunch(exec, s),
    "failed to launch cudaGraph on stream ", s

    cudaStreamSynchronize(s),
    "failed to synchronize stream ", s

    cudaGraphExecDestroy(exec),
    "failed to destroy an executable cudaGraph"

  h.graph.clear_native_graph();
inline void Executor::_invoke_module_work(Worker& w, Node* node) {

  _observer_prologue(w, node);

  auto module = nstd::get<Node::ModuleWork>(node->_handle).module;

  _invoke_dynamic_work_internal(w, node, module->_graph, false);

  _observer_epilogue(w, node);
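// run()/run_n() are thin wrappers over run_until() with a counting predicate
// that becomes true after the requested number of repetitions.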
  return run_n(f, 1, [](){});

template <typename C>

  return run_n(f, 1, std::forward<C>(c));

  return run_n(f, repeat, [](){});

template <typename C>

  return run_until(f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c));

template <typename P>

  return run_until(f, std::forward<P>(pred), [](){});
inline void Executor::_set_up_topology(Topology* tpg) {

  tpg->_sources.clear();
  tpg->_taskflow._graph.clear_detached();

  for(auto node : tpg->_taskflow._graph._nodes) {

    node->_topology = tpg;
    node->_clear_state();

    if(node->num_dependents() == 0) {
      tpg->_sources.push_back(node);

    node->_set_up_join_counter();

  tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);
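// _tear_down_topology: if the topology's stop predicate is not yet satisfied,
// reschedule its source nodes for another round; otherwise invoke its callback,
// fulfill the promise, pop it, and start the next queued topology if any.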
inline void Executor::_tear_down_topology(Topology* tpg) {

  auto& f = tpg->_taskflow;

  if(!tpg->_pred()) {

    assert(tpg->_join_counter == 0);
    tpg->_join_counter = tpg->_sources.size();

    _schedule(tpg->_sources);

  if(tpg->_call != nullptr) {

  if(f._topologies.size() > 1) {

    assert(tpg->_join_counter == 0);

    tpg->_promise.set_value();
    f._topologies.pop_front();

    _decrement_topology();

    tpg = &(f._topologies.front());

    _set_up_topology(tpg);
    _schedule(tpg->_sources);

  assert(f._topologies.size() == 1);

  f._topologies.pop_front();

  _decrement_topology_and_notify();
template <typename P, typename C>

  _increment_topology();

  if(f.empty() || pred()) {
    promise.set_value();
    _decrement_topology_and_notify();
    return promise.get_future();

  bool run_now {false};

  f._topologies.emplace_back(f, std::forward<P>(pred), std::forward<C>(c));
  tpg = &(f._topologies.back());
  future = tpg->_promise.get_future();

  if(f._topologies.size() == 1) {

  _set_up_topology(tpg);
  _schedule(tpg->_sources);
inline void Executor::_increment_topology() {

inline void Executor::_decrement_topology_and_notify() {
  if(--_num_topologies == 0) {
    _topology_cv.notify_all();

inline void Executor::_decrement_topology() {

  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
  TF_THROW("subflow not joinable");

  _executor._invoke_dynamic_work_external(_parent, _graph, false);

  TF_THROW("subflow already joined or detached");

  _executor._invoke_dynamic_work_external(_parent, _graph, true);
Cross-referenced members:

tf::Executor: execution interface for running a taskflow graph (executor.hpp:24)
  Executor(size_t N = std::thread::hardware_concurrency(), size_t M = cuda_num_devices()): constructs the executor with N/M cpu/gpu worker threads (executor.hpp:290)
  ~Executor(): destroys the executor (executor.hpp:358)
  std::future<void> run(Taskflow& taskflow): runs the taskflow once (executor.hpp:1067)
  std::future<void> run_n(Taskflow& taskflow, size_t N): runs the taskflow N times (executor.hpp:1078)
  std::future<void> run_until(Taskflow& taskflow, P&& pred): runs the taskflow multiple times until the predicate becomes true and then invokes a callback ... (executor.hpp:1090)
  void wait_for_all(): waits for all pending graphs to complete (executor.hpp:1257)
  int this_worker_id() const: queries the id of the caller thread in this executor (executor.hpp:427)
  size_t num_workers() const: queries the number of worker threads (can be zero) (executor.hpp:406)
  size_t num_domains() const: queries the number of worker domains (executor.hpp:411)
  size_t num_topologies() const: queries the number of running topologies at the time of this call (executor.hpp:416)
  std::shared_ptr<Observer> make_observer(Args&&... args): constructs an observer to inspect the activities of worker threads (executor.hpp:626)
  void remove_observer(std::shared_ptr<Observer> observer): removes the associated observer (executor.hpp:645)
  size_t num_observers() const: queries the number of observers (executor.hpp:656)

tf::Subflow: building methods of a subflow graph in dynamic tasking (flow_builder.hpp:956)
  void join(): enables the subflow to join its parent task (executor.hpp:1266)
  void detach(): enables the subflow to detach from its parent task (executor.hpp:1276)
  Graph& _graph: associated graph object (flow_builder.hpp:624)

tf::Taskflow: main entry to create a task dependency graph (core/taskflow.hpp:18)
  bool empty() const: queries the emptiness of the taskflow (core/taskflow.hpp:132)

tf::TFProfObserver: observer designed based on the taskflow board format (observer.hpp:262)
  void dump(std::ostream& ostream) const: dumps the timelines in JSON format to an ostream (observer.hpp:395)

std: T hardware_concurrency(T... args)
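For reference, a minimal usage sketch of the public interface summarized above. It assumes the usual taskflow/taskflow.hpp umbrella header and the tf namespace; the task bodies are illustrative only.

```cpp
#include <taskflow/taskflow.hpp>  // assumed umbrella header
#include <iostream>

int main() {

  tf::Executor executor;   // defaults to std::thread::hardware_concurrency() cpu workers
  tf::Taskflow taskflow;

  // two tasks with a dependency: B runs after A
  auto A = taskflow.emplace([] () { std::cout << "A\n"; });
  auto B = taskflow.emplace([] () { std::cout << "B\n"; });
  A.precede(B);

  // attach an observer to inspect worker activity (TFProfObserver shown above)
  auto observer = executor.make_observer<tf::TFProfObserver>();

  executor.run(taskflow).wait();   // run once and block on the returned future
  executor.run_n(taskflow, 2);     // submit two more runs asynchronously
  executor.wait_for_all();         // wait for all pending runs to complete

  observer->dump(std::cout);       // dump the recorded timelines as JSON
  executor.remove_observer(observer);
}
```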