Taskflow  2.6.0
executor.hpp
1 #pragma once
2 
3 #include "tsq.hpp"
4 #include "notifier.hpp"
5 #include "observer.hpp"
6 #include "taskflow.hpp"
7 
8 namespace tf {
9 
10 
11 // ----------------------------------------------------------------------------
12 // Executor Definition
13 // ----------------------------------------------------------------------------
14 
15 
24 class Executor {
25 
26  friend class Subflow;
27 
28  struct Worker {
29  size_t id;
30  size_t vtm;
31  Domain domain;
32  Executor* executor;
33  Notifier::Waiter* waiter;
34  std::mt19937 rdgen { std::random_device{}() };
35  TaskQueue<Node*> wsq[NUM_DOMAINS];
36  };
37 
38  struct PerThread {
39  Worker* worker {nullptr};
40  };
41 
42 #ifdef TF_ENABLE_CUDA
43  struct cudaDevice {
44  std::vector<cudaStream_t> streams;
45  };
46 #endif
47 
48  public:
49 
50 #ifdef TF_ENABLE_CUDA
51 
54  explicit Executor(
55  size_t N = std::thread::hardware_concurrency(),
56  size_t M = cuda_num_devices()
57  );
58 #else
59 
62  explicit Executor(size_t N = std::thread::hardware_concurrency());
63 #endif
64 
68  ~Executor();
69 
77  std::future<void> run(Taskflow& taskflow);
78 
87  template<typename C>
88  std::future<void> run(Taskflow& taskflow, C&& callable);
89 
98  std::future<void> run_n(Taskflow& taskflow, size_t N);
99 
109  template<typename C>
110  std::future<void> run_n(Taskflow& taskflow, size_t N, C&& callable);
111 
121  template<typename P>
122  std::future<void> run_until(Taskflow& taskflow, P&& pred);
123 
134  template<typename P, typename C>
135  std::future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable);
136 
140  void wait_for_all();
141 
145  size_t num_workers() const;
146 
153  size_t num_topologies() const;
154 
161  size_t num_domains() const;
162 
169  int this_worker_id() const;
170 
171  //
172  //@brief runs a given function asynchronously and returns std::future that will
173  // eventually hold the result of that function call
174  //template <typename F, typename... ArgsT>
175  //auto async(F&& f, ArgsT&&... args) {
176 
177  // using R = typename function_traits<F>::return_type;
178 
179  // std::promise<R> p;
180 
181  // auto fu = p.get_future();
182 
183  // auto lambda = [p=std::move(p), f=std::forward<F>(f), args...] () {
184  // f(args...);
185  // };
186 
187  // //std::function<void()> f {[f=std::forward<F>(f), args...] () {
188  // //}};
189 
190  // return fu;
191  //}
192 
205  template <typename Observer, typename... Args>
206  std::shared_ptr<Observer> make_observer(Args&&... args);
207 
211  template <typename Observer>
212  void remove_observer(std::shared_ptr<Observer> observer);
213 
217  size_t num_observers() const;
218 
219  private:
220 
221  const size_t _VICTIM_BEG;
222  const size_t _VICTIM_END;
223  const size_t _MAX_STEALS;
224  const size_t _MAX_YIELDS;
225 
226  std::condition_variable _topology_cv;
227  std::mutex _topology_mutex;
228  std::mutex _wsq_mutex;
229 
230  size_t _num_topologies {0};
231 
232  std::vector<Worker> _workers;
233  std::vector<std::thread> _threads;
234 
235 #ifdef TF_ENABLE_CUDA
236  std::vector<cudaDevice> _cuda_devices;
237 #endif
238 
239  Notifier _notifier[NUM_DOMAINS];
240 
241  TaskQueue<Node*> _wsq[NUM_DOMAINS];
242 
243  size_t _id_offset[NUM_DOMAINS] = {0};
244 
245  std::atomic<size_t> _num_actives[NUM_DOMAINS];
246  std::atomic<size_t> _num_thieves[NUM_DOMAINS];
247  std::atomic<bool> _done {0};
248 
248 
249  std::unordered_set<std::shared_ptr<ObserverInterface>> _observers;
250 
251  TFProfObserver* _tfprof;
252 
253  PerThread& _per_thread() const;
254 
255  bool _wait_for_task(Worker&, Node*&);
256 
257  void _instantiate_tfprof();
258  void _flush_tfprof();
259  void _observer_prologue(Worker&, Node*);
260  void _observer_epilogue(Worker&, Node*);
261  void _spawn(size_t, Domain);
262  void _worker_loop(Worker&);
263  void _exploit_task(Worker&, Node*&);
264  void _explore_task(Worker&, Node*&);
265  void _schedule(Node*);
266  void _schedule(PassiveVector<Node*>&);
267  void _invoke(Worker&, Node*);
268  void _invoke_static_work(Worker&, Node*);
269  void _invoke_dynamic_work(Worker&, Node*);
270  void _invoke_dynamic_work_internal(Worker&, Node*, Graph&, bool);
271  void _invoke_dynamic_work_external(Node*, Graph&, bool);
272  void _invoke_condition_work(Worker&, Node*);
273  void _invoke_module_work(Worker&, Node*);
274 
275 #ifdef TF_ENABLE_CUDA
276  void _invoke_cudaflow_work(Worker&, Node*);
277  void _invoke_cudaflow_work_internal(Worker&, Node*);
278 #endif
279 
280  void _set_up_topology(Topology*);
281  void _tear_down_topology(Topology*);
282  void _increment_topology();
283  void _decrement_topology();
284  void _decrement_topology_and_notify();
285 };
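// A minimal usage sketch of the interface declared above (illustrative only,
// not part of the original header; it assumes the tf::Taskflow task-creation
// API from taskflow.hpp and <iostream> for std::cout):
//
//   tf::Executor executor;                      // one worker per hardware thread
//   tf::Taskflow taskflow;
//
//   auto [A, B] = taskflow.emplace(
//     [] () { std::cout << "A\n"; },
//     [] () { std::cout << "B\n"; }
//   );
//   A.precede(B);                               // A runs before B
//
//   executor.run(taskflow).wait();              // run once and block on the future
//   executor.run_n(taskflow, 4);                // queue four more runs
//   executor.wait_for_all();                    // wait for all pending runs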
286 
287 
288 #ifdef TF_ENABLE_CUDA
289 // Constructor
290 inline Executor::Executor(size_t N, size_t M) :
291  _VICTIM_BEG {0},
292  _VICTIM_END {N + M - 1},
293  _MAX_STEALS {(N + M + 1) << 1},
294  _MAX_YIELDS {100},
295  _workers {N + M},
296  _cuda_devices {cuda_num_devices()},
297  _notifier {Notifier(N), Notifier(M)} {
298 
299  if(N == 0) {
300  TF_THROW("no cpu workers to execute taskflows");
301  }
302 
303  if(M == 0) {
304  TF_THROW("no gpu workers to execute cudaflows");
305  }
306 
307  for(int i=0; i<NUM_DOMAINS; ++i) {
308  _num_actives[i].store(0, std::memory_order_relaxed);
309  _num_thieves[i].store(0, std::memory_order_relaxed);
310  }
311 
312  // create a per-worker stream on each cuda device
313  for(size_t i=0; i<_cuda_devices.size(); ++i) {
314  _cuda_devices[i].streams.resize(M);
315  cudaScopedDevice ctx(i);
316  for(size_t m=0; m<M; ++m) {
317  TF_CHECK_CUDA(
318  cudaStreamCreate(&(_cuda_devices[i].streams[m])),
319  "failed to create a cudaStream for worker ", m, " on device ", i
320  );
321  }
322  }
323 
324  _spawn(N, HOST);
325  _spawn(M, CUDA);
326 
327  // initiate the observer if requested
328  _instantiate_tfprof();
329 }
330 
331 #else
332 // Constructor
333 inline Executor::Executor(size_t N) :
334  _VICTIM_BEG {0},
335  _VICTIM_END {N - 1},
336  _MAX_STEALS {(N + 1) << 1},
337  _MAX_YIELDS {100},
338  _workers {N},
339  _notifier {Notifier(N)} {
340 
341  if(N == 0) {
342  TF_THROW("no cpu workers to execute taskflows");
343  }
344 
345  for(int i=0; i<NUM_DOMAINS; ++i) {
346  _num_actives[i].store(0, std::memory_order_relaxed);
347  _num_thieves[i].store(0, std::memory_order_relaxed);
348  }
349 
350  _spawn(N, HOST);
351 
352  // instantiate the default observer if requested
353  _instantiate_tfprof();
354 }
355 #endif
356 
357 // Destructor
358 inline Executor::~Executor() {
359 
360  // wait for all topologies to complete
361  wait_for_all();
362 
363  // shut down the scheduler
364  _done = true;
365 
366  for(int i=0; i<NUM_DOMAINS; ++i) {
367  _notifier[i].notify(true);
368  }
369 
370  for(auto& t : _threads){
371  t.join();
372  }
373 
374 #ifdef TF_ENABLE_CUDA
375  // clean up the cuda streams
376  for(size_t i=0; i<_cuda_devices.size(); ++i) {
377  cudaScopedDevice ctx(i);
378  for(size_t m=0; m<_cuda_devices[i].streams.size(); ++m) {
379  cudaStreamDestroy(_cuda_devices[i].streams[m]);
380  }
381  }
382 #endif
383 
384  // flush the default observer
385  _flush_tfprof();
386 }
387 
388 // Procedure: _instantiate_tfprof
389 inline void Executor::_instantiate_tfprof() {
390  // TF_OBSERVER_TYPE
391  _tfprof = get_env("TF_ENABLE_PROFILER").empty() ?
392  nullptr : make_observer<TFProfObserver>().get();
393 }
394 
395 // Procedure: _flush_tfprof
396 inline void Executor::_flush_tfprof() {
397  if(_tfprof) {
398  std::ostringstream fpath;
399  fpath << get_env("TF_ENABLE_PROFILER") << _tfprof->_uuid << ".tfp";
400  std::ofstream ofs(fpath.str());
401  _tfprof->dump(ofs);
402  }
403 }
404 
405 // Function: num_workers
406 inline size_t Executor::num_workers() const {
407  return _workers.size();
408 }
409 
410 // Function: num_domains
411 inline size_t Executor::num_domains() const {
412  return NUM_DOMAINS;
413 }
414 
415 // Function: num_topologies
416 inline size_t Executor::num_topologies() const {
417  return _num_topologies;
418 }
419 
420 // Function: _per_thread
421 inline Executor::PerThread& Executor::_per_thread() const {
422  thread_local PerThread pt;
423  return pt;
424 }
425 
426 // Function: this_worker_id
427 inline int Executor::this_worker_id() const {
428  auto worker = _per_thread().worker;
429  return worker ? static_cast<int>(worker->id) : -1;
430 }
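// Illustrative behavior of this_worker_id() (a sketch, not part of the
// original header): the id is in [0, num_workers()) on a worker thread of this
// executor and -1 anywhere else.
//
//   tf::Executor executor;
//   assert(executor.this_worker_id() == -1);    // caller is not a worker
//
//   tf::Taskflow taskflow;
//   taskflow.emplace([&executor] () {
//     assert(executor.this_worker_id() >= 0);   // running on a worker thread
//   });
//   executor.run(taskflow).wait();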
431 
432 // Procedure: _spawn
433 inline void Executor::_spawn(size_t N, Domain d) {
434 
435  auto id = _threads.size();
436 
437  _id_offset[d] = id;
438 
439  for(size_t i=0; i<N; ++i, ++id) {
440 
441  _workers[id].id = id;
442  _workers[id].vtm = id;
443  _workers[id].domain = d;
444  _workers[id].executor = this;
445  _workers[id].waiter = &_notifier[d]._waiters[i];
446 
447  _threads.emplace_back([this] (Worker& w) -> void {
448 
449  PerThread& pt = _per_thread();
450  pt.worker = &w;
451 
452  Node* t = nullptr;
453 
454  // must use 1 as condition instead of !done
455  while(1) {
456 
457  // execute the tasks.
458  _exploit_task(w, t);
459 
460  // wait for tasks
461  if(_wait_for_task(w, t) == false) {
462  break;
463  }
464  }
465 
466  }, std::ref(_workers[id]));
467  }
468 
469 }
470 
471 // Function: _explore_task
472 inline void Executor::_explore_task(Worker& w, Node*& t) {
473 
474  //assert(_workers[w].wsq.empty());
475  assert(!t);
476 
477  const auto d = w.domain;
478 
479  size_t num_steals = 0;
480  size_t num_yields = 0;
481 
482  std::uniform_int_distribution<size_t> rdvtm(_VICTIM_BEG, _VICTIM_END);
483 
484  //while(!_done) {
485  //
486  // size_t vtm = rdvtm(w.rdgen);
487  //
488  // t = (vtm == w.id) ? _wsq[d].steal() : _workers[vtm].wsq[d].steal();
489 
490  // if(t) {
491  // break;
492  // }
493 
494  // if(num_steal++ > _MAX_STEALS) {
495  // std::this_thread::yield();
496  // if(num_yields++ > _MAX_YIELDS) {
497  // break;
498  // }
499  // }
500  //}
501 
502  do {
503  t = (w.id == w.vtm) ? _wsq[d].steal() : _workers[w.vtm].wsq[d].steal();
504 
505  if(t) {
506  break;
507  }
508 
509  if(num_steals++ > _MAX_STEALS) {
510  std::this_thread::yield();
511  if(num_yields++ > _MAX_YIELDS) {
512  break;
513  }
514  }
515 
516  w.vtm = rdvtm(w.rdgen);
517  } while(!_done);
518 
519 }
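// Note on the bounds used above (descriptive comment, derived from the
// constructor): _MAX_STEALS is 2*(num_workers + 1) and _MAX_YIELDS is 100, so
// a thief performs a bounded number of random steal attempts, then yields
// between further attempts, and finally gives up and returns to
// _wait_for_task, where it may go to sleep.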
520 
521 // Procedure: _exploit_task
522 inline void Executor::_exploit_task(Worker& w, Node*& t) {
523 
524  if(t) {
525 
526  const auto d = w.domain;
527 
528  if(_num_actives[d].fetch_add(1) == 0 && _num_thieves[d] == 0) {
529  _notifier[d].notify(false);
530  }
531 
532  while(t) {
533  _invoke(w, t);
534 
535  if(t->_parent == nullptr) {
536  if(t->_topology->_join_counter.fetch_sub(1) == 1) {
537  _tear_down_topology(t->_topology);
538  }
539  }
540  else { // joined subflow
541  t->_parent->_join_counter.fetch_sub(1);
542  }
543 
544  t = w.wsq[d].pop();
545  }
546 
547  --_num_actives[d];
548  }
549 }
550 
551 // Function: _wait_for_task
552 inline bool Executor::_wait_for_task(Worker& worker, Node*& t) {
553 
554  const auto d = worker.domain;
555 
556  wait_for_task:
557 
558  assert(!t);
559 
560  ++_num_thieves[d];
561 
562  explore_task:
563 
564  _explore_task(worker, t);
565 
566  if(t) {
567  if(_num_thieves[d].fetch_sub(1) == 1) {
568  _notifier[d].notify(false);
569  }
570  return true;
571  }
572 
573  _notifier[d].prepare_wait(worker.waiter);
574 
575  //if(auto vtm = _find_vtm(me); vtm != _workers.size()) {
576  if(!_wsq[d].empty()) {
577 
578  _notifier[d].cancel_wait(worker.waiter);
579  //t = (vtm == me) ? _wsq.steal() : _workers[vtm].wsq.steal();
580 
581  t = _wsq[d].steal(); // must steal here
582  if(t) {
583  if(_num_thieves[d].fetch_sub(1) == 1) {
584  _notifier[d].notify(false);
585  }
586  return true;
587  }
588  else {
589  worker.vtm = worker.id;
590  goto explore_task;
591  }
592  }
593 
594  if(_done) {
595  _notifier[d].cancel_wait(worker.waiter);
596  for(int i=0; i<NUM_DOMAINS; ++i) {
597  _notifier[i].notify(true);
598  }
599  --_num_thieves[d];
600  return false;
601  }
602 
603  if(_num_thieves[d].fetch_sub(1) == 1) {
604  if(_num_actives[d]) {
605  _notifier[d].cancel_wait(worker.waiter);
606  goto wait_for_task;
607  }
608  // check every worker's queue in this domain again
609  for(auto& w : _workers) {
610  if(!w.wsq[d].empty()) {
611  worker.vtm = w.id;
612  _notifier[d].cancel_wait(worker.waiter);
613  goto wait_for_task;
614  }
615  }
616  }
617 
618  // now I really need to relinquish myself to others
619  _notifier[d].commit_wait(worker.waiter);
620 
621  return true;
622 }
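// The two-phase sleep protocol used above, shown in isolation (a sketch; the
// helper work_is_available() is hypothetical and stands for the queue and
// _done checks above):
//
//   _notifier[d].prepare_wait(worker.waiter);   // announce the intent to sleep
//   if(work_is_available() || _done) {
//     _notifier[d].cancel_wait(worker.waiter);  // state changed: do not sleep
//     // resume stealing or shut down
//   }
//   else {
//     _notifier[d].commit_wait(worker.waiter);  // sleep until a notify()
//   }
//
// Re-checking shared state between prepare_wait and commit_wait closes the
// race in which a task is pushed and notify() fires just before the worker
// commits to sleeping.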
623 
624 // Function: make_observer
625 template<typename Observer, typename... Args>
626 std::shared_ptr<Observer> Executor::make_observer(Args&&... args) {
627 
628  static_assert(
629  std::is_base_of<ObserverInterface, Observer>::value,
630  "Observer must be derived from ObserverInterface"
631  );
632 
633  // use a local variable to mimic the constructor
634  auto ptr = std::make_shared<Observer>(std::forward<Args>(args)...);
635 
636  ptr->set_up(_workers.size());
637 
638  _observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr));
639 
640  return ptr;
641 }
642 
643 // Procedure: remove_observer
644 template <typename Observer>
645 void Executor::remove_observer(std::shared_ptr<Observer> ptr) {
646 
647  static_assert(
648  std::is_base_of<ObserverInterface, Observer>::value,
649  "Observer must be derived from ObserverInterface"
650  );
651 
652  _observers.erase(std::static_pointer_cast<ObserverInterface>(ptr));
653 }
654 
655 // Function: num_observers
656 inline size_t Executor::num_observers() const {
657  return _observers.size();
658 }
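// Observer usage sketch (illustrative, not part of the original header;
// TFProfObserver is the observer type defined in observer.hpp and used by
// _instantiate_tfprof above):
//
//   tf::Executor executor;
//   auto obs = executor.make_observer<tf::TFProfObserver>();
//   // ... run taskflows; obs records on_entry/on_exit events per worker ...
//   obs->dump(std::cout);                       // dump the timelines in JSON
//   executor.remove_observer(std::move(obs));
//   assert(executor.num_observers() == 0);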
659 
660 // Procedure: _schedule
661 // The main procedure to schedule a given task node.
662 // Each task node has two types of tasks - regular and subflow.
663 inline void Executor::_schedule(Node* node) {
664 
665  //assert(_workers.size() != 0);
666 
667  const auto d = node->domain();
668 
669  // caller is a worker to this pool
670  auto worker = _per_thread().worker;
671 
672  if(worker != nullptr && worker->executor == this) {
673  worker->wsq[d].push(node);
674  if(worker->domain != d) {
675  if(_num_actives[d] == 0 && _num_thieves[d] == 0) {
676  _notifier[d].notify(false);
677  }
678  }
679  return;
680  }
681 
682  // other threads
683  {
684  std::lock_guard<std::mutex> lock(_wsq_mutex);
685  _wsq[d].push(node);
686  }
687 
688  _notifier[d].notify(false);
689 }
690 
691 // Procedure: _schedule
692 // The main procedure to schedule a set of task nodes.
693 // Each task node has two types of tasks - regular and subflow.
694 inline void Executor::_schedule(PassiveVector<Node*>& nodes) {
695 
696  //assert(_workers.size() != 0);
697 
698  // We need to cache the node count to avoid accessing the nodes
699  // vector while the parent topology is removed!
700  const auto num_nodes = nodes.size();
701 
702  if(num_nodes == 0) {
703  return;
704  }
705 
706  // worker thread
707  auto worker = _per_thread().worker;
708 
709  // task counts
710  size_t tcount[NUM_DOMAINS] = {0};
711 
712  if(worker != nullptr && worker->executor == this) {
713  for(size_t i=0; i<num_nodes; ++i) {
714  const auto d = nodes[i]->domain();
715  worker->wsq[d].push(nodes[i]);
716  tcount[d]++;
717  }
718 
719  for(int d=0; d<NUM_DOMAINS; ++d) {
720  if(tcount[d] && d != worker->domain) {
721  if(_num_actives[d] == 0 && _num_thieves[d] == 0) {
722  _notifier[d].notify_n(tcount[d]);
723  }
724  }
725  }
726 
727  return;
728  }
729 
730  // other threads
731  {
732  std::lock_guard<std::mutex> lock(_wsq_mutex);
733  for(size_t k=0; k<num_nodes; ++k) {
734  const auto d = nodes[k]->domain();
735  _wsq[d].push(nodes[k]);
736  tcount[d]++;
737  }
738  }
739 
740  for(int d=0; d<NUM_DOMAINS; ++d) {
741  _notifier[d].notify_n(tcount[d]);
742  }
743 }
744 
745 
746 // Procedure: _invoke
747 inline void Executor::_invoke(Worker& worker, Node* node) {
748 
749  //assert(_workers.size() != 0);
750 
751  // Here we need to fetch the num_successors first to avoid the invalid memory
752  // access caused by topology clear.
753  const auto num_successors = node->num_successors();
754 
755  // acquire the parent flow counter
756  auto& c = (node->_parent) ? node->_parent->_join_counter :
757  node->_topology->_join_counter;
758 
759  // switch is faster than nested if-else due to jump table
760  switch(node->_handle.index()) {
761  // static task
762  case Node::STATIC_WORK:{
763  _invoke_static_work(worker, node);
764  }
765  break;
766 
767  // module task
768  case Node::MODULE_WORK: {
769  _invoke_module_work(worker, node);
770  }
771  break;
772 
773  // dynamic task
774  case Node::DYNAMIC_WORK: {
775  _invoke_dynamic_work(worker, node);
776  }
777  break;
778 
779  // condition task
780  case Node::CONDITION_WORK: {
781  _invoke_condition_work(worker, node);
782  return ;
783  } // no need to add a break here due to the immediate return
784 
785  // cudaflow task
786 #ifdef TF_ENABLE_CUDA
787  case Node::CUDAFLOW_WORK: {
788  _invoke_cudaflow_work(worker, node);
789  }
790  break;
791 #endif
792 
793  // monostate
794  default:
795  break;
796  }
797 
798  // We MUST recover the dependency since subflow may have
799  // a condition node to go back (cyclic).
800  // This must be done before scheduling the successors, otherwise this might cause
801  // a race condition on the _dependents.
802  if(node->_has_state(Node::BRANCHED)) {
803  // If this is a case node, we need to deduct condition predecessors
804  node->_join_counter = node->num_strong_dependents();
805  }
806  else {
807  node->_join_counter = node->num_dependents();
808  }
809 
810  // At this point, the node storage might be destructed.
811  for(size_t i=0; i<num_successors; ++i) {
812  if(--(node->_successors[i]->_join_counter) == 0) {
813  c.fetch_add(1);
814  _schedule(node->_successors[i]);
815  }
816  }
817 }
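// Worked example of the counter protocol above (illustrative, assuming the
// public Taskflow task-creation API): for a diamond A -> {B, C} -> D, the node
// D starts with _join_counter == 2; whichever of B and C finishes last
// decrements it to zero and is the one that schedules D.
//
//   tf::Taskflow tfg;
//   auto [A, B, C, D] = tfg.emplace([](){}, [](){}, [](){}, [](){});
//   A.precede(B, C);                            // A runs before B and C
//   B.precede(D);                               // D runs after B ...
//   C.precede(D);                               // ... and after C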
818 
819 // Procedure: _observer_prologue
820 inline void Executor::_observer_prologue(Worker& worker, Node* node) {
821  for(auto& observer : _observers) {
822  observer->on_entry(worker.id, TaskView(node));
823  }
824 }
825 
826 // Procedure: _observer_epilogue
827 inline void Executor::_observer_epilogue(Worker& worker, Node* node) {
828  for(auto& observer : _observers) {
829  observer->on_exit(worker.id, TaskView(node));
830  }
831 }
832 
833 // Procedure: _invoke_static_work
834 inline void Executor::_invoke_static_work(Worker& worker, Node* node) {
835  _observer_prologue(worker, node);
836  nstd::get<Node::StaticWork>(node->_handle).work();
837  _observer_epilogue(worker, node);
838 }
839 
840 // Procedure: _invoke_dynamic_work
841 inline void Executor::_invoke_dynamic_work(Worker& w, Node* node) {
842 
843  _observer_prologue(w, node);
844 
845  auto& handle = nstd::get<Node::DynamicWork>(node->_handle);
846 
847  handle.subgraph.clear();
848 
849  Subflow sf(*this, node, handle.subgraph);
850 
851  handle.work(sf);
852 
853  if(sf._joinable) {
854  _invoke_dynamic_work_internal(w, node, handle.subgraph, false);
855  }
856 
857  // TODO
858  _observer_epilogue(w, node);
859 }
860 
861 // Procedure: _invoke_dynamic_work_external
862 inline void Executor::_invoke_dynamic_work_external(Node*p, Graph& g, bool detach) {
863 
864  auto worker = _per_thread().worker;
865 
866  assert(worker && worker->executor == this);
867 
868  _invoke_dynamic_work_internal(*worker, p, g, detach);
869 }
870 
871 // Procedure: _invoke_dynamic_work_internal
872 inline void Executor::_invoke_dynamic_work_internal(
873  Worker& w, Node* p, Graph& g, bool detach
874 ) {
875 
876  assert(p);
877 
878  if(g.empty()) return;
879 
880  PassiveVector<Node*> src;
881 
882  for(auto n : g._nodes) {
883 
884  n->_topology = p->_topology;
885  n->_set_up_join_counter();
886 
887  if(detach) {
888  n->_parent = nullptr;
889  n->_set_state(Node::DETACHED);
890  }
891  else {
892  n->_parent = p;
893  }
894 
895  if(n->num_dependents() == 0) {
896  src.push_back(n);
897  }
898  }
899 
900  // detach here
901  if(detach) {
902 
903  {
904  std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mtx);
905  p->_topology->_taskflow._graph.merge(std::move(g));
906  }
907 
908  p->_topology->_join_counter.fetch_add(src.size());
909  _schedule(src);
910  }
911  // join here
912  else {
913  p->_join_counter.fetch_add(src.size());
914  _schedule(src);
915  Node* t = nullptr;
916 
917  std::uniform_int_distribution<size_t> rdvtm(_VICTIM_BEG, _VICTIM_END);
918 
919  while(p->_join_counter != 0) {
920 
921  t = w.wsq[w.domain].pop();
922 
923  exploit:
924 
925  if(t) {
926  _invoke(w, t);
927  if(t->_parent == nullptr) {
928  if(t->_topology->_join_counter.fetch_sub(1) == 1) {
929  _tear_down_topology(t->_topology);
930  }
931  }
932  else { // joined subflow
933  t->_parent->_join_counter.fetch_sub(1);
934  }
935  }
936  else {
937 
938  explore:
939  t = (w.id == w.vtm) ? _wsq[w.domain].steal() :
940  _workers[w.vtm].wsq[w.domain].steal();
941  if(t) {
942  goto exploit;
943  }
944  else if(p->_join_counter != 0){
945  std::this_thread::yield();
946  w.vtm = rdvtm(w.rdgen);
947  goto explore;
948  }
949  else {
950  break;
951  }
952  }
953  }
954  }
955 }
956 
957 // Procedure: _invoke_condition_work
958 inline void Executor::_invoke_condition_work(Worker& worker, Node* node) {
959 
960  _observer_prologue(worker, node);
961 
962  if(node->_has_state(Node::BRANCHED)) {
963  node->_join_counter = node->num_strong_dependents();
964  }
965  else {
966  node->_join_counter = node->num_dependents();
967  }
968 
969  auto id = nstd::get<Node::ConditionWork>(node->_handle).work();
970 
971  if(id >= 0 && static_cast<size_t>(id) < node->num_successors()) {
972  auto s = node->_successors[id];
973  s->_join_counter.store(0);
974 
975  node->_parent ? node->_parent->_join_counter.fetch_add(1) :
976  node->_topology->_join_counter.fetch_add(1);
977  _schedule(s);
978 
979 
980  //if(s->domain() == worker.domain) {
981  // _schedule(s, true);
982  //}
983  //else {
984  // node->_parent ? node->_parent->_join_counter.fetch_add(1) :
985  // node->_topology->_join_counter.fetch_add(1);
986  // _schedule(s, false);
987  //}
988  }
989 
990  _observer_epilogue(worker, node);
991 }
992 
993 #ifdef TF_ENABLE_CUDA
994 // Procedure: _invoke_cudaflow_work
995 inline void Executor::_invoke_cudaflow_work(Worker& worker, Node* node) {
996  _observer_prologue(worker, node);
997  _invoke_cudaflow_work_internal(worker, node);
998  _observer_epilogue(worker, node);
999 }
1000 
1001 // Procedure: _invoke_cudaflow_work_internal
1002 inline void Executor::_invoke_cudaflow_work_internal(Worker& w, Node* node) {
1003 
1004  assert(w.domain == node->domain());
1005 
1006  auto& h = nstd::get<Node::cudaFlowWork>(node->_handle);
1007 
1008  h.graph.clear();
1009 
1010  cudaFlow cf(h.graph, [repeat=1] () mutable { return repeat-- == 0; });
1011 
1012  h.work(cf);
1013 
1014  if(h.graph.empty()) {
1015  return;
1016  }
1017 
1018  // transforms cudaFlow to a native cudaGraph under the specified device
1019  // and launches the graph through a given or an internal device stream
1020  const int d = cf._device;
1021 
1022  cudaScopedDevice ctx(d);
1023 
1024  auto s = cf._stream ? *(cf._stream) :
1025  _cuda_devices[d].streams[w.id - _id_offset[w.domain]];
1026 
1027  h.graph._make_native_graph();
1028 
1029  cudaGraphExec_t exec;
1030 
1031  TF_CHECK_CUDA(
1032  cudaGraphInstantiate(&exec, h.graph._native_handle, nullptr, nullptr, 0),
1033  "failed to create an executable cudaGraph"
1034  );
1035 
1036  while(!cf._predicate()) {
1037  TF_CHECK_CUDA(
1038  cudaGraphLaunch(exec, s), "failed to launch cudaGraph on stream ", s
1039  );
1040 
1041  TF_CHECK_CUDA(
1042  cudaStreamSynchronize(s), "failed to synchronize stream ", s
1043  );
1044  }
1045 
1046  TF_CHECK_CUDA(
1047  cudaGraphExecDestroy(exec), "failed to destroy an executable cudaGraph"
1048  );
1049 
1050  h.graph.clear_native_graph();
1051 }
1052 #endif
1053 
1054 // Procedure: _invoke_module_work
1055 inline void Executor::_invoke_module_work(Worker& w, Node* node) {
1056 
1057  _observer_prologue(w, node);
1058 
1059  auto module = nstd::get<Node::ModuleWork>(node->_handle).module;
1060 
1061  _invoke_dynamic_work_internal(w, node, module->_graph, false);
1062 
1063  _observer_epilogue(w, node);
1064 }
1065 
1066 // Function: run
1067 inline std::future<void> Executor::run(Taskflow& f) {
1068  return run_n(f, 1, [](){});
1069 }
1070 
1071 // Function: run
1072 template <typename C>
1073 std::future<void> Executor::run(Taskflow& f, C&& c) {
1074  return run_n(f, 1, std::forward<C>(c));
1075 }
1076 
1077 // Function: run_n
1078 inline std::future<void> Executor::run_n(Taskflow& f, size_t repeat) {
1079  return run_n(f, repeat, [](){});
1080 }
1081 
1082 // Function: run_n
1083 template <typename C>
1084 std::future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
1085  return run_until(f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c));
1086 }
1087 
1088 // Function: run_until
1089 template<typename P>
1090 std::future<void> Executor::run_until(Taskflow& f, P&& pred) {
1091  return run_until(f, std::forward<P>(pred), [](){});
1092 }
1093 
1094 // Function: _set_up_topology
1095 inline void Executor::_set_up_topology(Topology* tpg) {
1096 
1097  tpg->_sources.clear();
1098  tpg->_taskflow._graph.clear_detached();
1099 
1100  // scan each node in the graph and build up the links
1101  for(auto node : tpg->_taskflow._graph._nodes) {
1102 
1103  node->_topology = tpg;
1104  node->_clear_state();
1105 
1106  if(node->num_dependents() == 0) {
1107  tpg->_sources.push_back(node);
1108  }
1109 
1110  node->_set_up_join_counter();
1111  }
1112 
1113  tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);
1114 }
1115 
1116 // Function: _tear_down_topology
1117 inline void Executor::_tear_down_topology(Topology* tpg) {
1118 
1119  auto &f = tpg->_taskflow;
1120 
1121  //assert(&tpg == &(f._topologies.front()));
1122 
1123  // case 1: we still need to run the topology again
1124  if(! tpg->_pred() ) {
1125  //tpg->_recover_num_sinks();
1126 
1127  assert(tpg->_join_counter == 0);
1128  tpg->_join_counter = tpg->_sources.size();
1129 
1130  _schedule(tpg->_sources);
1131  }
1132  // case 2: the final run of this topology
1133  else {
1134 
1135  if(tpg->_call != nullptr) {
1136  tpg->_call();
1137  }
1138 
1139  f._mtx.lock();
1140 
1141  // if another run was submitted in the meantime (interleaved with this lock)
1142  if(f._topologies.size() > 1) {
1143 
1144  assert(tpg->_join_counter == 0);
1145 
1146  // Set the promise
1147  tpg->_promise.set_value();
1148  f._topologies.pop_front();
1149  f._mtx.unlock();
1150 
1151  // decrement the topology but since this is not the last we don't notify
1152  _decrement_topology();
1153 
1154  tpg = &(f._topologies.front());
1155 
1156  _set_up_topology(tpg);
1157  _schedule(tpg->_sources);
1158 
1159  //f._topologies.front()._bind(f._graph);
1160  //*tpg = &(f._topologies.front());
1161 
1162  //assert(f._topologies.front()._join_counter == 0);
1163 
1164  //f._topologies.front()._join_counter = f._topologies.front()._sources.size();
1165 
1166  //_schedule(f._topologies.front()._sources);
1167  }
1168  else {
1169  assert(f._topologies.size() == 1);
1170 
1171  // We need to back up the promise first because the taskflow might be
1172  // destroyed before this function leaves
1173  auto p {std::move(tpg->_promise)};
1174 
1175  // Back up the callable in case it captures the topology pointer, so that pop_front does not
1176  // release it before _mtx.unlock and _promise.set_value; it is released safely at scope exit.
1177  auto bc{ std::move( tpg->_call ) };
1178 
1179  f._topologies.pop_front();
1180 
1181  f._mtx.unlock();
1182 
1183  // We set the promise at the end in case the taskflow is destroyed before this function leaves.
1184  p.set_value();
1185 
1186  _decrement_topology_and_notify();
1187  }
1188  }
1189 }
1190 
1191 // Function: run_until
1192 template <typename P, typename C>
1193 std::future<void> Executor::run_until(Taskflow& f, P&& pred, C&& c) {
1194 
1195  _increment_topology();
1196 
1197  // Special case of predicate
1198  if(f.empty() || pred()) {
1199  std::promise<void> promise;
1200  promise.set_value();
1201  _decrement_topology_and_notify();
1202  return promise.get_future();
1203  }
1204 
1205  // Multi-threaded execution.
1206  bool run_now {false};
1207  Topology* tpg;
1208  std::future<void> future;
1209 
1210  {
1211  std::lock_guard<std::mutex> lock(f._mtx);
1212 
1213  // create a topology for this run
1214  //tpg = &(f._topologies.emplace_back(f, std::forward<P>(pred), std::forward<C>(c)));
1215  f._topologies.emplace_back(f, std::forward<P>(pred), std::forward<C>(c));
1216  tpg = &(f._topologies.back());
1217  future = tpg->_promise.get_future();
1218 
1219  if(f._topologies.size() == 1) {
1220  run_now = true;
1221  //tpg->_bind(f._graph);
1222  //_schedule(tpg->_sources);
1223  }
1224  }
1225 
1226  // Notice that calling _schedule here may cause the topology to be removed
1227  // before this function leaves.
1228  if(run_now) {
1229  _set_up_topology(tpg);
1230  _schedule(tpg->_sources);
1231  }
1232 
1233  return future;
1234 }
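// Usage sketch for the run variants above (illustrative, not part of the
// original header; assumes <iostream> for std::cout):
//
//   tf::Executor executor;
//   tf::Taskflow taskflow;
//   taskflow.emplace([] () { std::cout << "iteration\n"; });
//
//   executor.run_n(taskflow, 3, [] () { std::cout << "3 runs done\n"; });
//
//   executor.run_until(
//     taskflow,
//     [i=0] () mutable { return i++ == 5; },    // predicate: stop after 5 runs
//     [] () { std::cout << "all runs done\n"; } // callback on completion
//   );
//
//   executor.wait_for_all();                    // block until both requests finish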
1235 
1236 // Procedure: _increment_topology
1237 inline void Executor::_increment_topology() {
1238  std::lock_guard<std::mutex> lock(_topology_mutex);
1239  ++_num_topologies;
1240 }
1241 
1242 // Procedure: _decrement_topology_and_notify
1243 inline void Executor::_decrement_topology_and_notify() {
1244  std::lock_guard<std::mutex> lock(_topology_mutex);
1245  if(--_num_topologies == 0) {
1246  _topology_cv.notify_all();
1247  }
1248 }
1249 
1250 // Procedure: _decrement_topology
1251 inline void Executor::_decrement_topology() {
1252  std::lock_guard<std::mutex> lock(_topology_mutex);
1253  --_num_topologies;
1254 }
1255 
1256 // Procedure: wait_for_all
1257 inline void Executor::wait_for_all() {
1258  std::unique_lock<std::mutex> lock(_topology_mutex);
1259  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
1260 }
1261 
1262 // ----------------------------------------------------------------------------
1263 // Subflow Definition
1264 // ----------------------------------------------------------------------------
1265 
1266 inline void Subflow::join() {
1267 
1268  if(!_joinable) {
1269  TF_THROW("subflow not joinable");
1270  }
1271 
1272  _executor._invoke_dynamic_work_external(_parent, _graph, false);
1273  _joinable = false;
1274 }
1275 
1276 inline void Subflow::detach() {
1277 
1278  if(!_joinable) {
1279  TF_THROW("subflow already joined or detached");
1280  }
1281 
1282  _executor._invoke_dynamic_work_external(_parent, _graph, true);
1283  _joinable = false;
1284 }
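// Subflow usage sketch (illustrative, not part of the original header;
// sf.emplace comes from the FlowBuilder interface in flow_builder.hpp):
//
//   taskflow.emplace([] (tf::Subflow& sf) {
//     auto a = sf.emplace([] () { std::cout << "a\n"; });
//     auto b = sf.emplace([] () { std::cout << "b\n"; });
//     a.precede(b);
//     sf.join();       // the parent task waits for a and b (the default behavior)
//     // sf.detach();  // alternatively, run a and b independently of the parent
//   });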
1285 
1286 
1287 } // end of namespace tf -----------------------------------------------------
1288 
1289 
1290 
1291 
1292 
1293 
1294 
1295 
1296 
1297 
1298 
int this_worker_id() const
queries the id of the caller thread in this executor
Definition: executor.hpp:427
std::future< void > run(Taskflow &taskflow)
runs the taskflow once
Definition: executor.hpp:1067
std::future< void > run_until(Taskflow &taskflow, P &&pred)
runs the taskflow multiple times until the predicate becomes true and then invokes a callback ...
Definition: executor.hpp:1090
~Executor()
destructs the executor
Definition: executor.hpp:358
Graph & _graph
associated graph object
Definition: flow_builder.hpp:624
tf::TFProfObserver
observer designed based on taskflow board format
Definition: observer.hpp:262
void detach()
enables the subflow to detach from its parent task
Definition: executor.hpp:1276
Executor(size_t N=std::thread::hardware_concurrency(), size_t M=cuda_num_devices())
constructs the executor with N/M cpu/gpu worker threads
Definition: executor.hpp:290
void remove_observer(std::shared_ptr< Observer > observer)
removes the associated observer
Definition: executor.hpp:645
tf::Taskflow
main entry to create a task dependency graph
Definition: core/taskflow.hpp:18
bool empty() const
queries the emptiness of the taskflow
Definition: core/taskflow.hpp:132
size_t num_domains() const
queries the number of worker domains
Definition: executor.hpp:411
std::shared_ptr< Observer > make_observer(Args &&... args)
constructs an observer to inspect the activities of worker threads
Definition: executor.hpp:626
void dump(std::ostream &ostream) const
dump the timelines in JSON format to an ostream
Definition: observer.hpp:395
size_t num_observers() const
queries the number of observers
Definition: executor.hpp:656
tf::Executor
execution interface for running a taskflow graph
Definition: executor.hpp:24
size_t num_workers() const
queries the number of worker threads (can be zero)
Definition: executor.hpp:406
size_t num_topologies() const
queries the number of running topologies at the time of this call
Definition: executor.hpp:416
tf::Subflow
building methods of a subflow graph in dynamic tasking
Definition: flow_builder.hpp:956
std::future< void > run_n(Taskflow &taskflow, size_t N)
runs the taskflow for N times
Definition: executor.hpp:1078
void join()
enables the subflow to join its parent task
Definition: executor.hpp:1266
void wait_for_all()
wait for all pending graphs to complete
Definition: executor.hpp:1257