DPC++ Runtime
Runtime libraries for oneAPI DPC++
scheduler.hpp
Go to the documentation of this file.
1 //==-------------- scheduler.hpp - SYCL standard header file ---------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
14 #include <sycl/detail/cg.hpp>
15 
16 #include <cstddef>
17 #include <memory>
18 #include <queue>
19 #include <set>
20 #include <shared_mutex>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <vector>
24 
170 
171 // For testing purposes
172 class MockScheduler;
173 
174 namespace sycl {
176 namespace detail {
177 class queue_impl;
178 class event_impl;
179 class context_impl;
180 class DispatchHostTask;
181 
182 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
183 using EventImplPtr = std::shared_ptr<detail::event_impl>;
184 using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
185 using StreamImplPtr = std::shared_ptr<detail::stream_impl>;
186 
187 using QueueIdT = std::hash<std::shared_ptr<detail::queue_impl>>::result_type;
188 using CommandPtr = std::unique_ptr<Command>;
189 using FusionList = std::unique_ptr<KernelFusionCommand>;
190 using FusionMap = std::unordered_map<QueueIdT, FusionList>;
191 
/// Memory Object Record: the bookkeeping entry kept for a single memory
/// object. The constructor forwards this record, LeafLimit and
/// AllocateDependency to both leaf collections (MReadLeaves and MWriteLeaves)
/// and stores Ctx as the current context (MCurContext).
/// NOTE(review): this listing elides the declarations of MReadLeaves,
/// MWriteLeaves, MCurContext and the host access-mode member (listing lines
/// 208, 211, 214 and 218) - confirm them against the real header.
198 struct MemObjRecord {
199  MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit,
200  LeavesCollection::AllocateDependencyF AllocateDependency)
201  : MReadLeaves{this, LeafLimit, AllocateDependency},
202  MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {}
203 
204  // All allocation commands recorded for the memory object.
205  std::vector<AllocaCommandBase *> MAllocaCommands;
206 
207  // Latest read-only commands working with the memory object (MReadLeaves).
209 
210  // Latest write commands working with the memory object (MWriteLeaves).
212 
213  // The context which has the latest state of the memory object (MCurContext).
215 
216  // The mode this object can be accessed with from the host context.
217  // Valid only if the current context is host.
219 
220  // Indicates that the content of the memory object was/will be modified.
221  // Used when deciding whether a copy back is needed.
222  bool MMemModified = false;
223 };
224 
259 //
260 // +----------+     +----------+     +----------+
261 // |          |     |          |     |          |
262 // | Allocate |<----| Execute  |<----| Execute  |
263 // |          |     |          |     |          |
264 // +----------+     +----------+     +----------+
265 //
293 //                +----------+
294 //                |          |
295 //                |    D     |
296 //                |          |
297 //                +----------+
298 //                 /        \
299 //                /          \
300 //               v            v
301 //      +----------+        +----------+
302 //      |          |        |          |
303 //      |    B     |        |    C     |
304 //      |          |        |          |
305 //      +----------+        +----------+
306 //               \            /
307 //                \          /
308 //                 v        v
309 //                +----------+
310 //                |          |
311 //                |    A     |
312 //                |          |
313 //                +----------+
363 class Scheduler {
364 public:
371  EventImplPtr addCG(std::unique_ptr<detail::CG> CommandGroup,
372  const QueueImplPtr &Queue);
373 
380  EventImplPtr addCopyBack(Requirement *Req);
381 
388  void waitForEvent(const EventImplPtr &Event);
389 
412  bool removeMemoryObject(detail::SYCLMemObjI *MemObj, bool StrictLock = true);
413 
424  EventImplPtr addHostAccessor(Requirement *Req);
425 
430  void releaseHostAccessor(Requirement *Req);
431 
433  static Scheduler &getInstance();
434 
// Non-const overload: returns the DefaultHostQueue member by value, so the
// caller receives its own copy of the shared pointer.
435  QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; }
436 
// Const overload: returns a const reference to the DefaultHostQueue member;
// no copy of the shared pointer is made.
437  const QueueImplPtr &getDefaultHostQueue() const { return DefaultHostQueue; }
438 
439  static MemObjRecord *getMemObjRecord(const Requirement *const Req);
440 
441  void deferMemObjRelease(const std::shared_ptr<detail::SYCLMemObjI> &MemObj);
442 
443  void startFusion(QueueImplPtr Queue);
444 
445  void cancelFusion(QueueImplPtr Queue);
446 
447  EventImplPtr completeFusion(QueueImplPtr Queue, const property_list &);
448 
449  bool isInFusionMode(QueueIdT Queue);
450 
451  Scheduler();
452  ~Scheduler();
453  void releaseResources();
454  bool isDeferredMemObjectsEmpty();
455 
456  void enqueueCommandForCG(EventImplPtr NewEvent,
457  std::vector<Command *> &AuxilaryCmds,
458  BlockingT Blocking = NON_BLOCKING);
459 
460 protected:
461  using RWLockT = std::shared_timed_mutex;
462  using ReadLockT = std::shared_lock<RWLockT>;
463  using WriteLockT = std::unique_lock<RWLockT>;
464 
468 #ifdef _WIN32
469  WriteLockT Lock(MGraphLock, std::defer_lock);
470  while (!Lock.try_lock_for(std::chrono::milliseconds(10))) {
471  // Without the yield, this loop degenerates into a busy-wait that occupies
472  // the whole CPU when multiple command groups are created from multiple
473  // host threads.
474  std::this_thread::yield();
475  }
476 #else
477  WriteLockT Lock(MGraphLock);
478  // On UNIX, running the try_lock loop from the branch above leads to a
479  // deadlock in the implementation of lock and lock_shared, so a single
479  // blocking lock is taken here instead.
480 #endif // _WIN32
481  return Lock;
482  }
483 
// Constructs a ReadLockT (std::shared_lock over RWLockT) on MGraphLock and
// returns it to the caller. Constructing a std::shared_lock from a mutex
// acquires that mutex in shared mode, blocking until it is available.
486  ReadLockT acquireReadLock() { return ReadLockT{MGraphLock}; }
487 
488  void cleanupCommands(const std::vector<Command *> &Cmds);
489 
490  void NotifyHostTaskCompletion(Command *Cmd);
491 
492  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req,
493  ReadLockT &GraphReadLock,
494  std::vector<Command *> &ToCleanUp);
495 
496  static void
497  enqueueUnblockedCommands(const std::vector<EventImplPtr> &CmdsToEnqueue,
498  ReadLockT &GraphReadLock,
499  std::vector<Command *> &ToCleanUp);
500 
501  // May lock graph with read and write modes during execution.
502  void cleanupDeferredMemObjects(BlockingT Blocking);
503 
504  // POD struct to convey some additional information from GraphBuilder::addCG
505  // to the Scheduler to support kernel fusion.
510  };
511 
512  void registerAuxiliaryResources(
513  EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
514  void cleanupAuxiliaryResources(BlockingT Blocking);
515 
522  class GraphBuilder {
523  public:
524  GraphBuilder();
525 
533  GraphBuildResult addCG(std::unique_ptr<detail::CG> CommandGroup,
534  const QueueImplPtr &Queue,
535  std::vector<Command *> &ToEnqueue);
536 
541  Command *addCGUpdateHost(std::unique_ptr<detail::CG> CommandGroup,
542  const QueueImplPtr &HostQueue,
543  std::vector<Command *> &ToEnqueue);
544 
548  Command *addCopyBack(Requirement *Req, std::vector<Command *> &ToEnqueue);
549 
553  Command *addHostAccessor(Requirement *Req,
554  std::vector<Command *> &ToEnqueue);
555 
557  void optimize();
558 
561  void optimize(const EventImplPtr &Event);
562 
563  void cleanupCommand(Command *Cmd, bool AllowUnsubmitted = false);
564 
571  void rescheduleCommand(Command *Cmd, const QueueImplPtr &Queue);
572 
575  MemObjRecord *getMemObjRecord(SYCLMemObjI *MemObject);
576 
579  MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
580  const Requirement *Req,
581  std::vector<Command *> &ToEnqueue);
582 
584  void decrementLeafCountersForRecord(MemObjRecord *Record);
585 
587  void cleanupCommandsForRecord(MemObjRecord *Record);
588 
590  void removeRecordForMemObj(SYCLMemObjI *MemObject);
591 
593  void addNodeToLeaves(MemObjRecord *Record, Command *Cmd,
595  std::vector<Command *> &ToEnqueue);
596 
598  void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
600  std::vector<Command *> &ToCleanUp);
601 
611  Command *connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent,
612  const DepDesc &Dep,
613  std::vector<Command *> &ToCleanUp);
614 
615  void startFusion(QueueImplPtr Queue);
616 
617  void cancelFusion(QueueImplPtr Queue, std::vector<Command *> &ToEnqueue);
618 
619  EventImplPtr completeFusion(QueueImplPtr Queue,
620  std::vector<Command *> &ToEnqueue,
621  const property_list &);
622 
623  bool isInFusionMode(QueueIdT queue);
624 
625  std::vector<SYCLMemObjI *> MMemObjs;
626 
627  private:
637  Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req,
638  const QueueImplPtr &Queue,
639  std::vector<Command *> &ToEnqueue);
640 
641  // Inserts commands required to remap the memory object to its current host
642  // context so that the required access mode becomes valid.
643  Command *remapMemoryObject(MemObjRecord *Record, Requirement *Req,
644  AllocaCommandBase *HostAllocaCmd,
645  std::vector<Command *> &ToEnqueue);
646 
648  insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req,
649  const QueueImplPtr &Queue,
650  std::vector<Command *> &ToEnqueue);
651 
653  std::set<Command *> findDepsForReq(MemObjRecord *Record,
654  const Requirement *Req,
655  const ContextImplPtr &Context);
656 
657  EmptyCommand *addEmptyCmd(Command *Cmd,
658  const std::vector<Requirement *> &Req,
659  const QueueImplPtr &Queue,
660  Command::BlockReason Reason,
661  std::vector<Command *> &ToEnqueue,
662  const bool AddDepsToLeaves = true);
663 
664  void createGraphForCommand(Command *NewCmd, CG &CG, bool isInteropTask,
665  std::vector<Requirement *> &Reqs,
666  const std::vector<detail::EventImplPtr> &Events,
667  QueueImplPtr Queue,
668  std::vector<Command *> &ToEnqueue);
669 
670  protected:
672  DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record);
673 
675  AllocaCommandBase *findAllocaForReq(MemObjRecord *Record,
676  const Requirement *Req,
677  const ContextImplPtr &Context,
678  bool AllowConst = true);
679 
680  friend class Command;
681 
682  private:
683  friend class ::MockScheduler;
684 
689  getOrCreateAllocaForReq(MemObjRecord *Record, const Requirement *Req,
690  const QueueImplPtr &Queue,
691  std::vector<Command *> &ToEnqueue);
692 
693  void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req);
694 
// Looks up the entry keyed by Id in MFusionMap and returns an iterator to it.
// FusionMap is a std::unordered_map, so the result equals MFusionMap.end()
// when no entry with that key exists; callers must compare before use.
695  FusionMap::iterator findFusionList(QueueIdT Id) {
696  return MFusionMap.find(Id);
697  }
698 
699  void removeNodeFromGraph(Command *Node, std::vector<Command *> &ToEnqueue);
700 
703  std::queue<Command *> MCmdsToVisit;
705  std::vector<Command *> MVisitedCmds;
706 
709  FusionMap MFusionMap;
710 
715  void printGraphAsDot(const char *ModeName);
// Points in graph construction that can be selected for printing. Size is the
// last enumerator, so it equals the number of options above it; it is used as
// the length of MPrintOptionsArray.
// NOTE(review): presumably each preceding enumerator is an index into
// MPrintOptionsArray - confirm against the code that reads the array.
716  enum PrintOptions {
717  BeforeAddCG = 0,
718  AfterAddCG,
719  BeforeAddCopyBack,
720  AfterAddCopyBack,
721  BeforeAddHostAcc,
722  AfterAddHostAcc,
723  AfterFusionComplete,
724  AfterFusionCancel,
725  Size
726  };
727  std::array<bool, PrintOptions::Size> MPrintOptionsArray{false};
728  };
729 
803  public:
811  static void waitForEvent(const EventImplPtr &Event,
812  ReadLockT &GraphReadLock,
813  std::vector<Command *> &ToCleanUp,
814  bool LockTheLock = true);
815 
825  static bool enqueueCommand(Command *Cmd, ReadLockT &GraphReadLock,
826  EnqueueResultT &EnqueueResult,
827  std::vector<Command *> &ToCleanUp,
828  Command *RootCommand,
829  BlockingT Blocking = NON_BLOCKING);
830 
843  static bool handleBlockingCmd(Command *Cmd, EnqueueResultT &EnqueueResult,
844  Command *RootCommand, BlockingT Blocking);
845  };
846 
855  void waitForRecordToFinish(MemObjRecord *Record, ReadLockT &GraphReadLock);
856  bool checkLeavesCompletion(MemObjRecord *Record);
857 
860 
861  std::vector<Command *> MDeferredCleanupCommands;
863 
864  std::vector<std::shared_ptr<SYCLMemObjI>> MDeferredMemObjRelease;
866 
867  std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
870 
872 
873  // This thread local flag is a workaround for a problem with managing
874  // auxiliary resources. We would like to release internal buffers used for
875  // reductions in a deferred manner, but marking them individually isn't an
876  // option since all auxiliary resources (buffers, host memory, USM) are passed
877  // to the library as type erased shared pointers. This flag makes it so that
878  // release of every memory object is deferred while it's set, and it should
879  // only be set during release of auxiliary resources.
880  // TODO Remove once ABI breaking changes are allowed.
881  friend class SYCLMemObjT;
882  static thread_local bool ForceDeferredMemObjRelease;
884  ForceDeferredReleaseWrapper() { ForceDeferredMemObjRelease = true; };
885  ~ForceDeferredReleaseWrapper() { ForceDeferredMemObjRelease = false; };
886  };
887 
888  friend class Command;
889  friend class DispatchHostTask;
890  friend class queue_impl;
891  friend class event_impl;
892  friend class ::MockScheduler;
893 
894 private:
895  static void printFusionWarning(const std::string &Message);
896 
897  static KernelFusionCommand *isPartOfActiveFusion(Command *Cmd);
898 };
899 
900 } // namespace detail
901 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
902 } // namespace sycl
sycl::_V1::detail::Scheduler::GraphProcessor
Graph Processor provides interfaces for enqueueing commands and their dependencies to the underlying ...
Definition: scheduler.hpp:802
sycl::_V1::detail::Scheduler::MGraphBuilder
GraphBuilder MGraphBuilder
Definition: scheduler.hpp:858
sycl::_V1::property_list
Objects of the property_list class are containers for the SYCL properties.
Definition: property_list.hpp:24
sycl::_V1::detail::Command
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:99
sycl::_V1::detail::Scheduler::GraphBuildResult::NewCmd
Command * NewCmd
Definition: scheduler.hpp:507
cg.hpp
sycl::_V1::access::mode
mode
Definition: access.hpp:30
sycl_mem_obj_i.hpp
sycl::_V1::detail::ContextImplPtr
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:30
sycl::_V1::detail::QueueIdT
std::hash< std::shared_ptr< detail::queue_impl > >::result_type QueueIdT
Definition: scheduler.hpp:187
sycl::_V1::detail::AccessorImplHost
Definition: accessor_impl.hpp:42
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::detail::Scheduler::WriteLockT
std::unique_lock< RWLockT > WriteLockT
Definition: scheduler.hpp:463
sycl::_V1::detail::EnqueueResultT
Result of command enqueueing.
Definition: commands.hpp:54
sycl::_V1::detail::DispatchHostTask
Definition: commands.cpp:296
leaves_collection.hpp
sycl::_V1::detail::Scheduler::MDeferredMemReleaseMutex
std::mutex MDeferredMemReleaseMutex
Definition: scheduler.hpp:865
sycl::_V1::detail::Scheduler::MDeferredMemObjRelease
std::vector< std::shared_ptr< SYCLMemObjI > > MDeferredMemObjRelease
Definition: scheduler.hpp:864
sycl
Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::detail::EmptyCommand
The empty command does nothing during enqueue.
Definition: commands.hpp:378
sycl::_V1::detail::Scheduler::GraphBuilder
Graph builder class.
Definition: scheduler.hpp:522
sycl::_V1::detail::KernelFusionCommand
The KernelFusionCommand is placed in the execution graph together with the individual kernels of the ...
Definition: commands.hpp:660
sycl::_V1::detail::Scheduler::MAuxiliaryResources
std::unordered_map< EventImplPtr, std::vector< std::shared_ptr< const void > > > MAuxiliaryResources
Definition: scheduler.hpp:868
sycl::_V1::detail::MemObjRecord::MemObjRecord
MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency)
Definition: scheduler.hpp:199
sycl::_V1::detail::Scheduler::RWLockT
std::shared_timed_mutex RWLockT
Definition: scheduler.hpp:461
sycl::_V1::detail::Scheduler::MAuxiliaryResourcesMutex
std::mutex MAuxiliaryResourcesMutex
Definition: scheduler.hpp:869
sycl::_V1::detail::MemObjRecord::MReadLeaves
LeavesCollection MReadLeaves
Definition: scheduler.hpp:208
sycl::_V1::detail::NON_BLOCKING
@ NON_BLOCKING
Definition: commands.hpp:51
sycl::_V1::detail::BlockingT
BlockingT
Definition: commands.hpp:51
sycl::_V1::detail::LeavesCollection
A wrapper for CircularBuffer class along with collection for host accessor's EmptyCommands.
Definition: leaves_collection.hpp:38
sycl::_V1::detail::Command::BlockReason
BlockReason
Definition: commands.hpp:311
sycl::_V1::detail::Scheduler::MGraphLock
RWLockT MGraphLock
Definition: scheduler.hpp:859
sycl::_V1::detail::MemObjRecord
Memory Object Record.
Definition: scheduler.hpp:198
sycl::_V1::detail::event_impl
Definition: event_impl.hpp:36
sycl::_V1::detail::MemObjRecord::MWriteLeaves
LeavesCollection MWriteLeaves
Definition: scheduler.hpp:211
sycl::_V1::queue
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:89
sycl::_V1::detail::Scheduler::ReadLockT
std::shared_lock< RWLockT > ReadLockT
Definition: scheduler.hpp:462
sycl::_V1::detail::Scheduler::ForceDeferredReleaseWrapper::ForceDeferredReleaseWrapper
ForceDeferredReleaseWrapper()
Definition: scheduler.hpp:884
commands.hpp
sycl::_V1::detail::DepDesc
Dependency between two commands.
Definition: commands.hpp:73
sycl::_V1::detail::MemObjRecord::MCurContext
ContextImplPtr MCurContext
Definition: scheduler.hpp:214
sycl::_V1::detail::Scheduler::MDeferredCleanupMutex
std::mutex MDeferredCleanupMutex
Definition: scheduler.hpp:862
sycl::_V1::read_write
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:74
sycl::_V1::detail::EventImplPtr
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:42
sycl::_V1::detail::Scheduler::DefaultHostQueue
QueueImplPtr DefaultHostQueue
Definition: scheduler.hpp:871
sycl::_V1::detail::Scheduler::getDefaultHostQueue
const QueueImplPtr & getDefaultHostQueue() const
Definition: scheduler.hpp:437
sycl::_V1::detail::Scheduler::GraphBuildResult
Definition: scheduler.hpp:506
sycl::_V1::detail::LeavesCollection::AllocateDependencyF
std::function< void(Command *, Command *, MemObjRecord *, EnqueueListT &)> AllocateDependencyF
Definition: leaves_collection.hpp:46
sycl::_V1::detail::queue_impl
Definition: queue_impl.hpp:59
sycl::_V1::detail::Scheduler::acquireWriteLock
WriteLockT acquireWriteLock()
Provides exclusive access to std::shared_timed_mutex object with deadlock avoidance.
Definition: scheduler.hpp:467
sycl::_V1::detail::StreamImplPtr
std::shared_ptr< detail::stream_impl > StreamImplPtr
Definition: commands.hpp:42
sycl::_V1::detail::Scheduler::MDeferredCleanupCommands
std::vector< Command * > MDeferredCleanupCommands
Definition: scheduler.hpp:861
sycl::_V1::detail::FusionMap
std::unordered_map< QueueIdT, FusionList > FusionMap
Definition: scheduler.hpp:190
sycl::_V1::detail::Scheduler::GraphBuildResult::NewEvent
EventImplPtr NewEvent
Definition: scheduler.hpp:508
sycl::_V1::detail::Scheduler::getDefaultHostQueue
QueueImplPtr getDefaultHostQueue()
Definition: scheduler.hpp:435
sycl::_V1::detail::QueueImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: event_impl.hpp:32
sycl::_V1::detail::Scheduler::acquireReadLock
ReadLockT acquireReadLock()
Provides shared access to std::shared_timed_mutex object with deadlock avoidance.
Definition: scheduler.hpp:486
sycl::_V1::detail::SYCLMemObjI
Definition: sycl_mem_obj_i.hpp:28
sycl::_V1::detail::CommandPtr
std::unique_ptr< Command > CommandPtr
Definition: scheduler.hpp:188
sycl::_V1::detail::SYCLMemObjT
Definition: sycl_mem_obj_t.hpp:39
sycl::_V1::detail::Scheduler
DPC++ graph scheduler class.
Definition: scheduler.hpp:363
sycl::_V1::detail::Scheduler::ForceDeferredMemObjRelease
static thread_local bool ForceDeferredMemObjRelease
Definition: scheduler.hpp:882
sycl::_V1::detail::CG
Base class for all types of command groups.
Definition: cg.hpp:52
sycl::_V1::detail::Scheduler::GraphBuilder::MMemObjs
std::vector< SYCLMemObjI * > MMemObjs
Definition: scheduler.hpp:625
sycl::_V1::detail::UpdateHostRequirementCommand
Definition: commands.hpp:641
sycl::_V1::detail::AllocaCommandBase
Base class for memory allocation commands.
Definition: commands.hpp:420
sycl::_V1::detail::FusionList
std::unique_ptr< KernelFusionCommand > FusionList
Definition: scheduler.hpp:189
sycl::_V1::AccessMode
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
Definition: accessor.hpp:2854
sycl::_V1::detail::Scheduler::ForceDeferredReleaseWrapper::~ForceDeferredReleaseWrapper
~ForceDeferredReleaseWrapper()
Definition: scheduler.hpp:885
sycl::_V1::detail::MemObjRecord::MAllocaCommands
std::vector< AllocaCommandBase * > MAllocaCommands
Definition: scheduler.hpp:205
sycl::_V1::detail::Scheduler::ForceDeferredReleaseWrapper
Definition: scheduler.hpp:883
sycl::_V1::detail::Scheduler::GraphBuildResult::ShouldEnqueue
bool ShouldEnqueue
Definition: scheduler.hpp:509