DPC++ Runtime
Runtime libraries for oneAPI DPC++
scheduler.hpp
//==-------------- scheduler.hpp - SYCL standard header file ---------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/detail/cg.hpp>

#include <array>
#include <chrono>
#include <cstddef>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <shared_mutex>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// For testing purposes
class MockScheduler;

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {

class queue_impl;
class event_impl;
class context_impl;
class DispatchHostTask;

using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
using EventImplPtr = std::shared_ptr<detail::event_impl>;
using ContextImplPtr = std::shared_ptr<detail::context_impl>;

// Memory Object Record. Tracks the scheduler's state for a single memory
// object: its allocation commands, leaf commands, and host access mode.
struct MemObjRecord {
  MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit,
               LeavesCollection::AllocateDependencyF AllocateDependency)
      : MReadLeaves{this, LeafLimit, AllocateDependency},
        MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {}

  // Contains all allocation commands for the memory object.
  std::vector<AllocaCommandBase *> MAllocaCommands;

  // Contains the latest read-only commands working with the memory object.
  LeavesCollection MReadLeaves;

  // Contains the latest write commands working with the memory object.
  LeavesCollection MWriteLeaves;

  // The context that has the latest state of the memory object.
  ContextImplPtr MCurContext;

  // The mode this object can be accessed with from the host context.
  // Valid only if the current context is host.
  access::mode MHostAccess = access::mode::read_write;

  // Indicates that the content of the memory object was/will be modified.
  // Used when deciding whether a copy back is needed.
  bool MMemModified = false;
};
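
// Illustrative sketch (not part of this header): the graph builder creates
// one record per memory object, passing a callback used to allocate new
// dependencies between commands. Names below (`Ctx`, `LeafLimit`, the list
// type `LeavesCollection::EnqueueListT`) are assumptions; the callback type
// follows LeavesCollection::AllocateDependencyF:
//
//   LeavesCollection::AllocateDependencyF AllocateDependency =
//       [](Command *Dependant, Command *Dependency, MemObjRecord *Record,
//          LeavesCollection::EnqueueListT &ToEnqueue) {
//         // Link Dependant to Dependency and collect commands to enqueue.
//       };
//   MemObjRecord Record{Ctx, LeafLimit, AllocateDependency};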

// Example of a linear dependency chain of commands, where arrows point from
// a command to the command it depends on:
//
//  +----------+     +----------+     +----------+
//  |          |     |          |     |          |
//  | Allocate |<----| Execute  |<----| Execute  |
//  |          |     |          |     |          |
//  +----------+     +----------+     +----------+
//
// Example of a diamond-shaped dependency graph, where command D depends on
// commands B and C, both of which depend on command A:
//
//                 +----------+
//                 |          |
//                 |    D     |
//                 |          |
//                 +----------+
//                /            \
//               /              \
//              v                v
//   +----------+                +----------+
//   |          |                |          |
//   |    B     |                |    C     |
//   |          |                |          |
//   +----------+                +----------+
//               \              /
//                \            /
//                 v          v
//                 +----------+
//                 |          |
//                 |    A     |
//                 |          |
//                 +----------+
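//
// Illustrative sketch (not part of this header): user code along these lines
// would produce the diamond-shaped graph above (names are hypothetical):
//
//   sycl::queue Q;
//   sycl::range<1> R{1024};
//   sycl::buffer<int> BufA{R}, BufB{R}, BufC{R};
//   Q.submit([&](sycl::handler &CGH) { // command A: writes BufA
//     sycl::accessor A{BufA, CGH, sycl::write_only};
//     CGH.parallel_for(R, [=](sycl::id<1> I) { A[I] = I[0]; });
//   });
//   Q.submit([&](sycl::handler &CGH) { // command B: reads BufA, writes BufB
//     sycl::accessor A{BufA, CGH, sycl::read_only};
//     sycl::accessor B{BufB, CGH, sycl::write_only};
//     CGH.parallel_for(R, [=](sycl::id<1> I) { B[I] = A[I] + 1; });
//   });
//   // Command C is analogous to B but writes BufC; command D reads both
//   // BufB and BufC, and therefore depends on B and C.
//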
// DPC++ graph scheduler class.
class Scheduler {
public:
  // Registers a command group and adds it to the dependency graph.
  // Returns an event that corresponds to the command group.
  EventImplPtr addCG(std::unique_ptr<detail::CG> CommandGroup,
                     const QueueImplPtr &Queue);

  // Registers a command group that copies the latest memory state back to
  // the user-side memory of the requirement.
  EventImplPtr addCopyBack(Requirement *Req);

  // Waits for the event passed and all of its dependencies.
  void waitForEvent(const EventImplPtr &Event);

  // Removes a memory object from the graph and releases its allocations.
  void removeMemoryObject(detail::SYCLMemObjI *MemObj);

  // Removes finished commands from the subgraph of the finished event.
  void cleanupFinishedCommands(const EventImplPtr &FinishedEvent);

  // Adds the commands required to bring the requirement's memory up to date
  // on the host for a host accessor.
  EventImplPtr addHostAccessor(Requirement *Req);

  // Unblocks the commands that were waiting for the host accessor to be
  // released.
  void releaseHostAccessor(Requirement *Req);

  // Returns the single scheduler instance.
  static Scheduler &getInstance();

  // Allocates buffers in the pool for the stream object.
  void allocateStreamBuffers(stream_impl *, size_t, size_t);

  // Deallocates all buffers in the pool for the stream object.
  void deallocateStreamBuffers(stream_impl *);

  QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; }

  const QueueImplPtr &getDefaultHostQueue() const { return DefaultHostQueue; }

  static MemObjRecord *getMemObjRecord(const Requirement *const Req);

  Scheduler();
  ~Scheduler();

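  // Illustrative sketch (not part of this header): the runtime's submission
  // path is expected to use this interface roughly as follows, assuming a
  // prepared command group `CGPtr` and a queue implementation `QImpl`
  // (hypothetical names):
  //
  //   auto &Sched = Scheduler::getInstance();
  //   EventImplPtr Event = Sched.addCG(std::move(CGPtr), QImpl);
  //   // ... later, e.g. when the user waits on the queue ...
  //   Sched.waitForEvent(Event);
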
protected:
  using RWLockT = std::shared_timed_mutex;
  using ReadLockT = std::shared_lock<RWLockT>;
  using WriteLockT = std::unique_lock<RWLockT>;

  // Provides exclusive access to the graph's shared_timed_mutex with
  // deadlock avoidance.
  WriteLockT acquireWriteLock() {
#ifdef _WIN32
    WriteLockT Lock(MGraphLock, std::defer_lock);
    while (!Lock.try_lock_for(std::chrono::milliseconds(10))) {
      // Without the yield this loop spins and occupies a whole CPU core when
      // multiple command groups are created from multiple host threads.
      std::this_thread::yield();
    }
#else
    WriteLockT Lock(MGraphLock);
    // On UNIX, executing try_lock in a loop like the one above can deadlock
    // in the implementation of lock and lock_shared, so a single blocking
    // lock is used here instead.
#endif // _WIN32
    return Lock;
  }

  // Provides shared access to the graph's shared_timed_mutex.
  ReadLockT acquireReadLock() { return ReadLockT{MGraphLock}; }

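  // Illustrative sketch (not part of this header): graph mutation takes the
  // write lock and graph traversal takes the read lock, e.g.:
  //
  //   {
  //     WriteLockT Lock = acquireWriteLock(); // exclusive: add/remove nodes
  //     // ... mutate the dependency graph ...
  //   } // released at scope exit
  //   {
  //     ReadLockT Lock = acquireReadLock();   // shared: walk the graph
  //     // ... traverse the graph / enqueue commands ...
  //   }
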
  void cleanupCommands(const std::vector<Command *> &Cmds);

  void NotifyHostTaskCompletion(Command *Cmd, Command *BlockingCmd);

  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req,
                                         std::vector<Command *> &ToCleanUp);

  // Graph builder class.
  //
  // The graph builder provides means to change an existing graph (e.g. to
  // add or remove edges/nodes).
  class GraphBuilder {
  public:
    GraphBuilder();

    // Registers a command group and adds it to the dependency graph.
    Command *addCG(std::unique_ptr<detail::CG> CommandGroup,
                   const QueueImplPtr &Queue,
                   std::vector<Command *> &ToEnqueue);

    // Registers a command group that updates host memory to the latest
    // state.
    Command *addCGUpdateHost(std::unique_ptr<detail::CG> CommandGroup,
                             const QueueImplPtr &HostQueue,
                             std::vector<Command *> &ToEnqueue);

    // Adds a command that copies the latest memory state back to user-side
    // memory.
    Command *addCopyBack(Requirement *Req, std::vector<Command *> &ToEnqueue);

    // Adds a command that provides host access to a memory object.
    Command *addHostAccessor(Requirement *Req,
                             std::vector<Command *> &ToEnqueue);

    // [Provisional] Optimizes the whole graph.
    void optimize();

    // [Provisional] Optimizes the subgraph that consists of the command
    // associated with the event passed and its dependencies.
    void optimize(const EventImplPtr &Event);

    void cleanupCommand(Command *Cmd);

    // Removes finished non-leaf non-alloca commands from the subgraph of the
    // finished command.
    void cleanupFinishedCommands(
        Command *FinishedCmd,
        std::vector<std::shared_ptr<sycl::detail::stream_impl>> &,
        std::vector<std::shared_ptr<const void>> &);

    // Reschedules the command passed using the queue provided.
    void rescheduleCommand(Command *Cmd, const QueueImplPtr &Queue);

    // Returns the pointer to the record that represents the memory object.
    MemObjRecord *getMemObjRecord(SYCLMemObjI *MemObject);

    // Returns the record for the memory object, creating it (and the
    // commands it requires) if it does not exist yet.
    MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
                                          const Requirement *Req,
                                          std::vector<Command *> &ToEnqueue);

    // Decrements the leaf counters for all leaves of the record.
    void decrementLeafCountersForRecord(MemObjRecord *Record);

    // Removes commands that use the given record from the graph.
    void cleanupCommandsForRecord(
        MemObjRecord *Record,
        std::vector<std::shared_ptr<sycl::detail::stream_impl>> &,
        std::vector<std::shared_ptr<const void>> &);

    // Removes the record for the memory object passed.
    void removeRecordForMemObj(SYCLMemObjI *MemObject);

    // Adds a new command as a leaf of the record for the given access mode.
    void addNodeToLeaves(MemObjRecord *Record, Command *Cmd,
                         access::mode AccessMode,
                         std::vector<Command *> &ToEnqueue);

    // Removes the commands from the leaves of the record for the given
    // access mode.
    void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
                      access::mode AccessMode,
                      std::vector<Command *> &ToCleanUp);

    // Connects the command to a dependency event that was produced in a
    // different context by inserting an auxiliary connection command.
    Command *connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent,
                             const DepDesc &Dep,
                             std::vector<Command *> &ToCleanUp);

    std::vector<SYCLMemObjI *> MMemObjs;

  private:
    // Inserts the commands required to bring the memory state of the record
    // up to date in the context of the queue passed.
    Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req,
                              const QueueImplPtr &Queue,
                              std::vector<Command *> &ToEnqueue);

    // Inserts commands required to remap the memory object to its current
    // host context so that the required access mode becomes valid.
    Command *remapMemoryObject(MemObjRecord *Record, Requirement *Req,
                               AllocaCommandBase *HostAllocaCmd,
                               std::vector<Command *> &ToEnqueue);

    UpdateHostRequirementCommand *
    insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req,
                           const QueueImplPtr &Queue,
                           std::vector<Command *> &ToEnqueue);

    // Finds the dependencies of the requirement within the record.
    std::set<Command *> findDepsForReq(MemObjRecord *Record,
                                       const Requirement *Req,
                                       const ContextImplPtr &Context);

    // Only instantiable for T = Requirement (enforced via enable_if).
    template <typename T>
    typename std::enable_if_t<
        std::is_same<typename std::remove_cv_t<T>, Requirement>::value,
        EmptyCommand *>
    addEmptyCmd(Command *Cmd, const std::vector<T *> &Req,
                const QueueImplPtr &Queue, Command::BlockReason Reason,
                std::vector<Command *> &ToEnqueue,
                const bool AddDepsToLeaves = true);

  protected:
    // Finds the dependency of the command that corresponds to the record.
    DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record);

    // Searches for a suitable alloca command in the memory record.
    AllocaCommandBase *findAllocaForReq(MemObjRecord *Record,
                                        const Requirement *Req,
                                        const ContextImplPtr &Context,
                                        bool AllowConst = true);

    friend class Command;

  private:
    friend class ::MockScheduler;

    // Searches for a suitable alloca command in the memory record; creates
    // one (adding the commands it requires to ToEnqueue) if none is found.
    AllocaCommandBase *
    getOrCreateAllocaForReq(MemObjRecord *Record, const Requirement *Req,
                            const QueueImplPtr &Queue,
                            std::vector<Command *> &ToEnqueue);

    void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req);

    // Used to track commands during graph traversal.
    std::queue<Command *> MCmdsToVisit;
    std::vector<Command *> MVisitedCmds;

    // Prints the contents of the graph to a file in DOT format.
    void printGraphAsDot(const char *ModeName);
    enum PrintOptions {
      BeforeAddCG = 0,
      AfterAddCG,
      BeforeAddCopyBack,
      AfterAddCopyBack,
      BeforeAddHostAcc,
      AfterAddHostAcc,
      Size
    };
    std::array<bool, PrintOptions::Size> MPrintOptionsArray{false};
  };

  // Graph Processor provides interfaces for enqueueing commands and their
  // dependencies to the underlying runtime.
  class GraphProcessor {
  public:
    // Waits for the command associated with the event passed (and all of its
    // dependencies) to be enqueued and completed.
    static void waitForEvent(const EventImplPtr &Event,
                             ReadLockT &GraphReadLock,
                             std::vector<Command *> &ToCleanUp,
                             bool LockTheLock = true);

    // Enqueues the command and all of its dependencies.
    static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
                               std::vector<Command *> &ToCleanUp,
                               BlockingT Blocking = NON_BLOCKING);
  };
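
  // Illustrative sketch (not part of this header): a mutating operation such
  // as Scheduler::addCG is expected to combine the two helper classes above
  // roughly like this (hypothetical, simplified control flow):
  //
  //   std::vector<Command *> ToEnqueue;
  //   Command *NewCmd;
  //   {
  //     WriteLockT Lock = acquireWriteLock(); // phase 1: build the graph
  //     NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), Queue,
  //                                  ToEnqueue);
  //   }
  //   {
  //     ReadLockT Lock = acquireReadLock();   // phase 2: enqueue commands
  //     std::vector<Command *> ToCleanUp;
  //     EnqueueResultT Res;
  //     for (Command *Cmd : ToEnqueue)
  //       GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
  //   }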

  // Waits for the commands connected to the record to finish.
  void waitForRecordToFinish(MemObjRecord *Record, ReadLockT &GraphReadLock);

  GraphBuilder MGraphBuilder;
  RWLockT MGraphLock;

  std::vector<Command *> MDeferredCleanupCommands;
  std::mutex MDeferredCleanupMutex;

  QueueImplPtr DefaultHostQueue;

  friend class Command;
  friend class DispatchHostTask;
  friend class queue_impl;
  friend class event_impl;

  // Stream buffers for a single stream object.
  struct StreamBuffers {
    StreamBuffers(size_t StreamBufferSize, size_t FlushBufferSize)
        // Initialize the stream buffer with zeros; this is needed for two
        // reasons:
        //   1. We don't need to care about the end of line when printing
        //      out streamed data.
        //   2. The offset is properly initialized.
        : Data(StreamBufferSize, 0),
          Buf(Data.data(), range<1>(StreamBufferSize),
              {property::buffer::use_host_ptr()}),
          FlushBuf(range<1>(FlushBufferSize)) {
      // Disable copy back on buffer destruction. The copy is scheduled as a
      // host task which fires up as soon as the kernel has completed
      // execution.
      Buf.set_write_back(false);
      FlushBuf.set_write_back(false);
    }

    // Vector on the host side which is used to initialize the stream buffer.
    std::vector<char> Data;

    // Stream buffer.
    buffer<char, 1> Buf;

    // Global flush buffer.
    buffer<char, 1> FlushBuf;
  };
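
  // Illustrative sketch (not part of this header): the pool below is
  // presumably maintained along these lines (hypothetical, simplified):
  //
  //   void Scheduler::allocateStreamBuffers(stream_impl *Impl,
  //                                         size_t StreamBufferSize,
  //                                         size_t FlushBufferSize) {
  //     std::lock_guard<std::recursive_mutex> Lock(StreamBuffersPoolMutex);
  //     StreamBuffersPool.insert(
  //         {Impl, new StreamBuffers(StreamBufferSize, FlushBufferSize)});
  //   }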

  friend class stream_impl;
  friend void initStream(StreamImplPtr, QueueImplPtr);

  // Protects the stream buffers pool.
  std::recursive_mutex StreamBuffersPoolMutex;

  // We need to store a pointer to the structure with stream buffers because
  // we want to avoid a situation where the buffers are destructed during
  // destruction of the scheduler. The scheduler is a global object and can
  // be destructed after all device runtimes are unloaded, and destruction of
  // the buffers at that stage would lead to a failure. In a correct program
  // there are synchronization points for all kernels, and all allocated
  // resources are released by the scheduler. If the program is not correct
  // and lacks the necessary synchronization points, a warning is issued.
  std::unordered_map<stream_impl *, StreamBuffers *> StreamBuffersPool;
};

} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl