Ginkgo version 1.7.0 (generated from the master branch)
A numerical linear algebra library targeting many-core architectures
executor.hpp
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2023, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_


// <algorithm>, <cctype> and <functional> are needed below for std::for_each,
// std::tolower and std::function, respectively
#include <algorithm>
#include <array>
#include <atomic>
#include <cctype>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>


#include <ginkgo/core/base/device.hpp>
#include <ginkgo/core/base/fwd_decls.hpp>
#include <ginkgo/core/base/machine_topology.hpp>
#include <ginkgo/core/base/memory.hpp>
#include <ginkgo/core/base/scoped_device_id_guard.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/log/logger.hpp>
#include <ginkgo/core/synthesizer/containers.hpp>


namespace gko {

/**
 * How Logger events are propagated to their Executor.
 */
enum class log_propagation_mode {
    /**
     * Events only get reported at loggers attached to the triggering object.
     */
    never,

    /**
     * Events get reported to loggers attached to the triggering object and
     * to propagating loggers (Logger::needs_propagation() returns true)
     * attached to its Executor.
     */
    automatic
};


/**
 * Specify the mode of allocation for CUDA/HIP GPUs: plain device memory
 * (`device`), or unified memory with device-preferred (`unified_global`) or
 * host-preferred (`unified_host`) location.
 */
enum class allocation_mode { device, unified_global, unified_host };

#ifdef NDEBUG

// In release builds, prefer device allocations
constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;

constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// In debug builds, always use unified-memory (UM) allocations
constexpr allocation_mode default_cuda_alloc_mode =
    allocation_mode::unified_global;

#if (GINKGO_HIP_PLATFORM_HCC == 1)

// HIP on AMD GPUs does not support UM, so always prefer device allocations.
constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
constexpr allocation_mode default_hip_alloc_mode =
    allocation_mode::unified_global;

#endif

#endif


} // namespace gko

/**
 * This enum class describes DPC++ (SYCL) queue properties. It is legal to
 * combine several properties with the binary or (|) operator.
 */
enum class dpcpp_queue_property {
    /**
     * The queue executes in order.
     */
    in_order = 1,

    /**
     * The queue enables profiling.
     */
    enable_profiling = 2
};

GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
                                                         dpcpp_queue_property b)
{
    return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
                                             static_cast<int>(b));
}

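// Usage sketch (hedged: combines the two queue properties via the operator|
// defined above; the variable names and device id are examples):
//
//     auto props = dpcpp_queue_property::in_order |
//                  dpcpp_queue_property::enable_profiling;
//     auto dpcpp = gko::DpcppExecutor::create(
//         0, gko::OmpExecutor::create(), "gpu", props);
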

namespace gko {


#define GKO_FORWARD_DECLARE(_type, ...) class _type

GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);

#undef GKO_FORWARD_DECLARE


class ReferenceExecutor;


namespace detail {


template <typename>
class ExecutorBase;


} // namespace detail

/**
 * Operations can be used to define functionalities whose implementations
 * differ among devices.
 *
 * This is done by extending the Operation class and implementing the
 * overloads of the Operation::run() method for all Executor types; the
 * Executor then picks the matching overload when Executor::run() is invoked.
 */
class Operation {
public:
#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
    virtual void run(std::shared_ptr<const _type>) const

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);

#undef GKO_DECLARE_RUN_OVERLOAD

    // ReferenceExecutor overload can be defaulted to OmpExecutor's
    virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;

    /**
     * Returns the operation's name.
     *
     * @return the operation's name
     */
    virtual const char* get_name() const noexcept;
};


namespace detail {


/**
 * The RegisteredOperation class wraps a closure that is executed with the
 * executor passed to its run() method, and exposes the name it was
 * registered with.
 */
template <typename Closure>
class RegisteredOperation : public Operation {
public:
    /**
     * Creates a RegisteredOperation object from a name and a closure.
     *
     * @param name  the name of the operation
     * @param op  the closure to run
     */
    RegisteredOperation(const char* name, Closure op)
        : name_(name), op_(std::move(op))
    {}

    const char* get_name() const noexcept override { return name_; }

    void run(std::shared_ptr<const ReferenceExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const OmpExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const CudaExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const HipExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const DpcppExecutor> exec) const override
    {
        op_(exec);
    }

private:
    const char* name_;
    Closure op_;
};


template <typename Closure>
RegisteredOperation<Closure> make_register_operation(const char* name,
                                                     Closure op)
{
    return RegisteredOperation<Closure>{name, std::move(op)};
}


} // namespace detail


/**
 * Binds a set of device-specific kernels to a name.
 *
 * It also defines a helper function `make_<_name>(...)` which, given the
 * kernel arguments, returns an Operation that dispatches to the kernel
 * implementation in gko::kernels::<backend> matching the executor it is
 * run on.
 *
 * @param _name  the name of the operation
 * @param _kernel  the kernel which is being bound
 */
#define GKO_REGISTER_OPERATION(_name, _kernel)                                \
    template <typename... Args>                                               \
    auto make_##_name(Args&&... args)                                         \
    {                                                                         \
        return ::gko::detail::make_register_operation(                        \
            #_kernel, [&args...](auto exec) {                                 \
                using exec_type = decltype(exec);                             \
                if (std::is_same<                                             \
                        exec_type,                                            \
                        std::shared_ptr<const ::gko::ReferenceExecutor>>::    \
                        value) {                                              \
                    ::gko::kernels::reference::_kernel(                       \
                        std::dynamic_pointer_cast<                            \
                            const ::gko::ReferenceExecutor>(exec),            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::OmpExecutor>>::   \
                               value) {                                       \
                    ::gko::kernels::omp::_kernel(                             \
                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>(  \
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::CudaExecutor>>::  \
                               value) {                                       \
                    ::gko::kernels::cuda::_kernel(                            \
                        std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::HipExecutor>>::   \
                               value) {                                       \
                    ::gko::kernels::hip::_kernel(                             \
                        std::dynamic_pointer_cast<const ::gko::HipExecutor>(  \
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::DpcppExecutor>>:: \
                               value) {                                       \
                    ::gko::kernels::dpcpp::_kernel(                           \
                        std::dynamic_pointer_cast<const ::gko::DpcppExecutor>(\
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else {                                                      \
                    GKO_NOT_IMPLEMENTED;                                      \
                }                                                             \
            });                                                               \
    }                                                                         \
    static_assert(true,                                                       \
                  "This assert is used to counter the false positive extra "  \
                  "semi-colon warnings")

/**
 * Binds a single host kernel to a name and defines a helper function
 * `make_<_name>(...)` which returns an Operation that calls the kernel on
 * the host, regardless of the executor it is run on.
 *
 * @param _name  the name of the operation
 * @param _kernel  the kernel which is being bound
 */
#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \
    template <typename... Args>                                              \
    auto make_##_name(Args&&... args)                                        \
    {                                                                        \
        return ::gko::detail::make_register_operation(                       \
            #_kernel,                                                        \
            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
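
// A hedged usage sketch for GKO_REGISTER_HOST_OPERATION (`compute_stats` is
// a hypothetical host-only function, not part of Ginkgo):
//
//     void compute_stats(int n) { /* runs on the host */ }
//
//     GKO_REGISTER_HOST_OPERATION(compute_stats, compute_stats);
//
//     // runs (and logs) the host kernel via the executor
//     exec->run(make_compute_stats(100));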

#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type

/**
 * The first step in using the Ginkgo library consists of creating an
 * executor. Executors are used to specify the location of the data of linear
 * algebra objects and to determine where operations are executed. Ginkgo
 * provides the OmpExecutor, CudaExecutor, HipExecutor and DpcppExecutor, as
 * well as the ReferenceExecutor used for testing and debugging.
 */
class Executor : public log::EnableLogging<Executor> {
    template <typename T>
    friend class detail::ExecutorBase;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
    friend class ReferenceExecutor;

public:
    virtual ~Executor() = default;

    Executor() = default;
    Executor(Executor&) = delete;
    Executor(Executor&&) = delete;
    Executor& operator=(Executor&) = delete;
    Executor& operator=(Executor&&) = delete;

    /**
     * Runs the specified Operation using this Executor.
     *
     * @param op  the operation to run
     */
    virtual void run(const Operation& op) const = 0;

    /**
     * Runs one of the passed-in functors, depending on the Executor type.
     *
     * @param op_omp  functor to run in case of an OmpExecutor or
     *                ReferenceExecutor
     * @param op_cuda  functor to run in case of a CudaExecutor
     * @param op_hip  functor to run in case of a HipExecutor
     * @param op_dpcpp  functor to run in case of a DpcppExecutor
     */
    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
    {
        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
            op_omp, op_cuda, op_hip, op_dpcpp);
        this->run(op);
    }

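    // Usage sketch (hedged: `exec` is any executor; the lambdas are example
    // closures, one per backend):
    //
    //     exec->run(
    //         [&] { /* OpenMP/reference code */ },
    //         [&] { /* CUDA code */ },
    //         [&] { /* HIP code */ },
    //         [&] { /* DPC++ code */ });
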
    /**
     * Allocates memory in this Executor.
     *
     * @tparam T  datatype to allocate
     * @param num_elems  number of elements of type T to allocate
     * @return pointer to the allocated memory
     */
    template <typename T>
    T* alloc(size_type num_elems) const
    {
        this->template log<log::Logger::allocation_started>(
            this, num_elems * sizeof(T));
        T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
        this->template log<log::Logger::allocation_completed>(
            this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
        return allocated;
    }

    /**
     * Frees memory previously allocated with Executor::alloc().
     *
     * @param ptr  pointer to the allocated memory block
     */
    void free(void* ptr) const noexcept
    {
        this->template log<log::Logger::free_started>(
            this, reinterpret_cast<uintptr>(ptr));
        this->raw_free(ptr);
        this->template log<log::Logger::free_completed>(
            this, reinterpret_cast<uintptr>(ptr));
    }
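
    // Usage sketch (hedged: `omp` and `data` are example names):
    //
    //     auto omp = gko::OmpExecutor::create();
    //     double* data = omp->alloc<double>(100);  // 100 doubles on the host
    //     // ... use data ...
    //     omp->free(data);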

    /**
     * Copies data from another Executor.
     *
     * @tparam T  datatype to copy
     * @param src_exec  executor from which the memory will be copied
     * @param num_elems  number of elements of type T to copy
     * @param src_ptr  pointer to a block of memory containing the data to be
     *                 copied
     * @param dest_ptr  pointer to an allocated block of memory where the data
     *                  will be copied to
     */
    template <typename T>
    void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
                   const T* src_ptr, T* dest_ptr) const
    {
        const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
        const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
        this->template log<log::Logger::copy_started>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_started>(
                src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        }
        try {
            this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
                                dest_ptr);
        } catch (NotSupported&) {
#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
            // Unoptimized fallback: copy through the master executors.
            // Only logged when verbose >= 1 in a debug build.
            std::clog << "No direct copy possible; copying the data through "
                         "the master executors."
                      << std::endl;
#endif
            auto src_master = src_exec->get_master().get();
            if (num_elems > 0 && src_master != src_exec.get()) {
                auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
                src_master->copy_from<T>(src_exec, num_elems, src_ptr,
                                         master_ptr);
                this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
                src_master->free(master_ptr);
            }
        }
        this->template log<log::Logger::copy_completed>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_completed>(
                src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        }
    }

    /**
     * Copies data within this Executor.
     *
     * @tparam T  datatype to copy
     * @param num_elems  number of elements of type T to copy
     * @param src_ptr  source pointer
     * @param dest_ptr  destination pointer
     */
    template <typename T>
    void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
    {
        this->copy_from(this, num_elems, src_ptr, dest_ptr);
    }
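
    // Usage sketch (hedged: assumes a CUDA-enabled build; names are
    // examples):
    //
    //     auto omp = gko::OmpExecutor::create();
    //     auto cuda = gko::CudaExecutor::create(0, omp);
    //     double* host_buf = omp->alloc<double>(10);
    //     double* dev_buf = cuda->alloc<double>(10);
    //     // copy 10 doubles from host memory to device memory
    //     cuda->copy_from(omp, 10, host_buf, dev_buf);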

    /**
     * Retrieves a single element at the given location from executor memory.
     *
     * @tparam T  datatype to copy
     * @param ptr  the pointer to the element to be copied
     * @return the value stored at ptr
     */
    template <typename T>
    T copy_val_to_host(const T* ptr) const
    {
        T out{};
        this->get_master()->copy_from(this, 1, ptr, &out);
        return out;
    }
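
    // Usage sketch (hedged: continues the example above; reads back the
    // first element of the device buffer without a manual host allocation):
    //
    //     double first = cuda->copy_val_to_host(dev_buf);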

    /**
     * Returns the master OmpExecutor of this Executor.
     */
    virtual std::shared_ptr<Executor> get_master() noexcept = 0;

    /**
     * @copydoc get_master
     */
    virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;

    /**
     * Synchronizes the operations launched on this executor.
     */
    virtual void synchronize() const = 0;

    /**
     * Adds a logger to the executor and, if it is a propagating logger,
     * increases the propagation refcount.
     *
     * @param logger  the logger to add
     */
    void add_logger(std::shared_ptr<const log::Logger> logger) override
    {
        this->propagating_logger_refcount_.fetch_add(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::add_logger(logger);
    }

    /**
     * Removes a logger from the executor and updates the propagation
     * refcount accordingly.
     *
     * @param logger  the logger to remove
     */
    void remove_logger(const log::Logger* logger) override
    {
        this->propagating_logger_refcount_.fetch_sub(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::remove_logger(logger);
    }

    using EnableLogging<Executor>::remove_logger;

    /**
     * Sets the logger event propagation mode for the executor.
     *
     * @param mode  the new propagation mode
     */
    void set_log_propagation_mode(log_propagation_mode mode)
    {
        log_propagation_mode_ = mode;
    }

    /**
     * Returns true iff events occurring at an object created on this
     * executor should be logged at propagating loggers attached to this
     * executor.
     */
    bool should_propagate_log() const
    {
        return this->propagating_logger_refcount_.load() > 0 &&
               log_propagation_mode_ == log_propagation_mode::automatic;
    }

    /**
     * Verifies whether the executors share the same memory.
     *
     * @param other  the other Executor to compare against
     * @return whether this and other share the same memory
     */
    bool memory_accessible(const std::shared_ptr<const Executor>& other) const
    {
        return this->verify_memory_from(other.get());
    }
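
    // Usage sketch (hedged: `omp` and `cuda` as created in the examples
    // above; OpenMP and CUDA memory spaces are generally distinct):
    //
    //     if (!cuda->memory_accessible(omp)) {
    //         // an explicit copy_from is required before host access
    //     }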

    virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;

protected:
    /**
     * A struct that holds the device information of this Executor.
     */
    struct exec_info {
        /** The id of the device. */
        int device_id = -1;

        /** The type of the device, relevant only for the DPC++ executor. */
        std::string device_type;

        /** The closest NUMA node to the device. */
        int numa_node = -1;

        /**
         * The number of computing units in the device (e.g. cores on CPUs,
         * multiprocessors on CUDA/HIP devices).
         */
        int num_computing_units = -1;

        /**
         * The number of processing units per computing unit (e.g. threads
         * per core, warps per multiprocessor).
         */
        int num_pu_per_cu = -1;

        /** The sizes of the subgroups supported by the device. */
        std::vector<int> subgroup_sizes{};

        /** The maximum subgroup size (e.g. the warp size). */
        int max_subgroup_size = -1;

        /** The maximum number of workitems in each dimension. */
        std::vector<int> max_workitem_sizes{};

        /** The maximum workgroup size. */
        int max_workgroup_size;

        /** The major version of the device (compute capability on CUDA). */
        int major = -1;

        /** The minor version of the device (compute capability on CUDA). */
        int minor = -1;

        /** The PCI bus id of the device. */
        std::string pci_bus_id = std::string(13, 'x');

        /** The ids of the processing units closest to the device. */
        std::vector<int> closest_pu_ids{};
    };

    /**
     * Returns the executor's device information.
     */
    const exec_info& get_exec_info() const { return this->exec_info_; }

    /**
     * Allocates raw memory in this Executor.
     *
     * @param size  number of bytes to allocate
     * @return raw pointer to the allocated memory
     */
    virtual void* raw_alloc(size_type size) const = 0;

    /**
     * Frees memory previously allocated with raw_alloc().
     *
     * @param ptr  pointer to the allocated memory block
     */
    virtual void raw_free(void* ptr) const noexcept = 0;

    /**
     * Copies raw data from another Executor.
     *
     * @param src_exec  executor from which the memory will be copied
     * @param n_bytes  number of bytes to copy
     * @param src_ptr  source pointer
     * @param dest_ptr  destination pointer
     */
    virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                               const void* src_ptr, void* dest_ptr) const = 0;

/**
 * @internal Declares a raw_copy_to() overload for the specified Executor
 * subclass.
 */
#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \
    virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
                             const void* src_ptr, void* dest_ptr) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);

#undef GKO_ENABLE_RAW_COPY_TO

    /**
     * Verifies whether this executor can access the memory of the source
     * executor.
     *
     * @param src_exec  the source executor to verify against
     */
    virtual bool verify_memory_from(const Executor* src_exec) const = 0;

/**
 * @internal Declares a verify_memory_to() overload for the specified Executor
 * subclass.
 */
#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
    virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);

    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);

#undef GKO_ENABLE_VERIFY_MEMORY_TO

    /**
     * Populates the executor-specific device information from the machine
     * topology.
     *
     * @param mach_topo  the machine topology object
     */
    virtual void populate_exec_info(const machine_topology* mach_topo) = 0;

    /**
     * Returns a modifiable reference to the executor's device information.
     */
    exec_info& get_exec_info() { return this->exec_info_; }

    exec_info exec_info_;

    log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};

    std::atomic<int> propagating_logger_refcount_{};

private:
    /**
     * The LambdaOperation class wraps four functors, one per backend, into
     * an Operation. The first functor is run for the OmpExecutor and the
     * ReferenceExecutor; the others for CudaExecutor, HipExecutor and
     * DpcppExecutor, respectively.
     */
    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    class LambdaOperation : public Operation {
    public:
        /**
         * Creates a LambdaOperation object from the given functors.
         */
        LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : op_omp_(op_omp),
              op_cuda_(op_cuda),
              op_hip_(op_hip),
              op_dpcpp_(op_dpcpp)
        {}

        void run(std::shared_ptr<const OmpExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const ReferenceExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const CudaExecutor>) const override
        {
            op_cuda_();
        }

        void run(std::shared_ptr<const HipExecutor>) const override
        {
            op_hip_();
        }

        void run(std::shared_ptr<const DpcppExecutor>) const override
        {
            op_dpcpp_();
        }

    private:
        ClosureOmp op_omp_;
        ClosureCuda op_cuda_;
        ClosureHip op_hip_;
        ClosureDpcpp op_dpcpp_;
    };
};


/**
 * This is a deleter that uses an executor's free() method to deallocate the
 * data, e.g. when used with a std::unique_ptr.
 *
 * @tparam T  the type of object being deleted
 */
template <typename T>
class executor_deleter {
public:
    using pointer = T*;

    /**
     * Creates a new deleter.
     *
     * @param exec  the executor used to free the data
     */
    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    /**
     * Deletes the object.
     *
     * @param ptr  pointer to the object being deleted
     */
    void operator()(pointer ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};

// a specialization for arrays
template <typename T>
class executor_deleter<T[]> {
public:
    using pointer = T[];

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    void operator()(pointer ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};
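
// Usage sketch (hedged: ties the allocation's lifetime to the deleter; the
// names are examples):
//
//     auto exec = gko::OmpExecutor::create();
//     std::unique_ptr<double[], gko::executor_deleter<double[]>> guard(
//         exec->alloc<double>(100), gko::executor_deleter<double[]>{exec});
//     // guard frees the memory via exec->free() when it goes out of scope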


namespace detail {


template <typename ConcreteExecutor>
class ExecutorBase : public Executor {
    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
    friend class ReferenceExecutor;

public:
    using Executor::run;

    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        auto scope_guard = get_scoped_device_id_guard();
        op.run(self()->shared_from_this());
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                       const void* src_ptr, void* dest_ptr) const override
    {
        src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
    }

    virtual bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(self());
    }

private:
    ConcreteExecutor* self() noexcept
    {
        return static_cast<ConcreteExecutor*>(this);
    }

    const ConcreteExecutor* self() const noexcept
    {
        return static_cast<const ConcreteExecutor*>(this);
    }
};

#undef GKO_DECLARE_EXECUTOR_FRIEND


/**
 * A deprecated mixin that used to control resetting the device on the
 * executor's destruction; device reset is no longer supported.
 */
class EnableDeviceReset {
public:
    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    void set_device_reset(bool device_reset) {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    bool get_device_reset() { return false; }

protected:
    EnableDeviceReset() {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    EnableDeviceReset(bool device_reset) {}
};


} // namespace detail


#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \
    void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
                     const void* src_ptr, void* dest_ptr) const override


#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                     \
    virtual bool verify_memory_to(const dest_* other) const override         \
    {                                                                        \
        return bool_;                                                        \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")


/**
 * This is the Executor subclass which represents the OpenMP device
 * (typically a CPU).
 */
class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
                    public std::enable_shared_from_this<OmpExecutor> {
    friend class detail::ExecutorBase<OmpExecutor>;

public:
    /**
     * Creates a new OmpExecutor.
     */
    static std::shared_ptr<OmpExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
    }
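
    // Usage sketch: the OmpExecutor is typically the first executor created
    // and serves as the master for the device executors.
    //
    //     auto omp = gko::OmpExecutor::create();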

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_num_cores() const
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_threads_per_core() const
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    static int get_num_omp_threads();

    scoped_device_id_guard get_scoped_device_id_guard() const override;

protected:
    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : alloc_{std::move(alloc)}
    {
        this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

    std::shared_ptr<CpuAllocatorBase> alloc_;
};


namespace kernels {
namespace omp {
using DefaultExecutor = OmpExecutor;
} // namespace omp
} // namespace kernels


/**
 * This is a specialization of the OmpExecutor, which runs the reference
 * implementations of the kernels, used for debugging purposes.
 */
class ReferenceExecutor : public OmpExecutor {
public:
    static std::shared_ptr<ReferenceExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<ReferenceExecutor>(
            new ReferenceExecutor(std::move(alloc)));
    }

    scoped_device_id_guard get_scoped_device_id_guard() const override
    {
        return {this, 0};
    }

    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        op.run(std::static_pointer_cast<const ReferenceExecutor>(
            this->shared_from_this()));
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : OmpExecutor{std::move(alloc)}
    {
        this->ReferenceExecutor::populate_exec_info(
            machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology*) override
    {
        this->get_exec_info().device_id = -1;
        this->get_exec_info().num_computing_units = 1;
        this->get_exec_info().num_pu_per_cu = 1;
    }

    bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(this);
    }

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
};


namespace kernels {
namespace reference {
using DefaultExecutor = ReferenceExecutor;
} // namespace reference
} // namespace kernels


/**
 * This is the Executor subclass which represents the CUDA device.
 */
class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
                     public std::enable_shared_from_this<CudaExecutor>,
                     public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<CudaExecutor>;

public:
    /**
     * Creates a new CudaExecutor (deprecated overload).
     *
     * @param device_id  the CUDA device id of this device
     * @param master  an executor on the host used to invoke the device
     *                kernels
     */
    GKO_DEPRECATED(
        "device_reset is deprecated entirely, call cudaDeviceReset directly. "
        "alloc_mode was replaced by the Allocator type "
        "hierarchy.")
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_cuda_alloc_mode,
        CUstream_st* stream = nullptr);

    /**
     * Creates a new CudaExecutor with a custom allocator and device stream.
     *
     * @param device_id  the CUDA device id of this device
     * @param master  an executor on the host used to invoke the device
     *                kernels
     * @param alloc  the allocator to use for device memory
     * @param stream  the stream to execute operations on
     */
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<CudaAllocatorBase> alloc =
            std::make_shared<CudaAllocator>(),
        CUstream_st* stream = nullptr);

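    // Usage sketch (hedged: assumes a CUDA-enabled build and at least one
    // visible device; uses the default CudaAllocator and stream):
    //
    //     auto omp = gko::OmpExecutor::create();
    //     auto cuda = gko::CudaExecutor::create(0, omp);
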
    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    /**
     * Get the CUDA device id of the device associated to this executor.
     */
    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    /**
     * Get the number of devices present on the system.
     */
    static int get_num_devices();

    /**
     * Get the number of warps per SM of this executor.
     */
    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Get the number of multiprocessors of this executor.
     */
    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    /**
     * Get the number of warps of this executor.
     */
    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Get the warp size of this executor.
     */
    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    /**
     * Get the major version of compute capability.
     */
    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    /**
     * Get the minor version of compute capability.
     */
    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    /**
     * Get the cublas handle for this executor.
     */
    cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }

    /**
     * Get the cusparse handle for this executor.
     */
    cusparseContext* get_cusparse_handle() const
    {
        return cusparse_handle_.get();
    }

    /**
     * Get the closest PUs.
     */
    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    /**
     * Get the closest NUMA node.
     */
    int get_closest_numa() const { return this->get_exec_info().numa_node; }

    /**
     * Returns the CUDA stream used by this executor.
     */
    CUstream_st* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    CudaExecutor(int device_id, std::shared_ptr<Executor> master,
                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
        : alloc_{std::move(alloc)}, master_(master), stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->CudaExecutor::populate_exec_info(
            machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<cublasContext> cublas_handle_;
    handle_manager<cusparseContext> cusparse_handle_;
    std::shared_ptr<CudaAllocatorBase> alloc_;
    CUstream_st* stream_;
};


namespace kernels {
namespace cuda {
using DefaultExecutor = CudaExecutor;
} // namespace cuda
} // namespace kernels


/**
 * This is the Executor subclass which represents the HIP enhanced device.
 */
class HipExecutor : public detail::ExecutorBase<HipExecutor>,
                    public std::enable_shared_from_this<HipExecutor>,
                    public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<HipExecutor>;

public:
    /**
     * Creates a new HipExecutor (deprecated overload).
     *
     * @param device_id  the HIP device id of this device
     * @param master  an executor on the host used to invoke the device
     *                kernels
     */
    GKO_DEPRECATED(
        "device_reset is deprecated entirely, call hipDeviceReset directly. "
        "alloc_mode was replaced by the Allocator type "
        "hierarchy.")
    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_hip_alloc_mode,
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<HipAllocatorBase> alloc =
            std::make_shared<HipAllocator>(),
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

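    // Usage sketch (hedged: assumes a HIP-enabled build and at least one
    // visible device):
    //
    //     auto omp = gko::OmpExecutor::create();
    //     auto hip = gko::HipExecutor::create(0, omp);
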
    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    /**
     * Get the HIP device id of the device associated to this executor.
     */
    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    /**
     * Get the number of devices present on the system.
     */
    static int get_num_devices();

    /**
     * Get the number of warps per SM of this executor.
     */
    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Get the number of multiprocessors of this executor.
     */
    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    /**
     * Get the major version of compute capability.
     */
    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    /**
     * Get the minor version of compute capability.
     */
    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    /**
     * Get the number of warps of this executor.
     */
    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Get the warp size of this executor.
     */
    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    /**
     * Get the hipblas handle for this executor.
     */
    hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }

    /**
     * Get the hipsparse handle for this executor.
     */
    hipsparseContext* get_hipsparse_handle() const
    {
        return hipsparse_handle_.get();
    }

    /**
     * Get the closest NUMA node.
     */
    int get_closest_numa() const { return this->get_exec_info().numa_node; }

    /**
     * Get the closest PUs.
     */
    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    /**
     * Returns the HIP stream used by this executor.
     */
    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    HipExecutor(int device_id, std::shared_ptr<Executor> master,
                std::shared_ptr<HipAllocatorBase> alloc,
                GKO_HIP_STREAM_STRUCT* stream)
        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->HipExecutor::populate_exec_info(machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<hipblasContext> hipblas_handle_;
    handle_manager<hipsparseContext> hipsparse_handle_;
    std::shared_ptr<HipAllocatorBase> alloc_;
    GKO_HIP_STREAM_STRUCT* stream_;
};


namespace kernels {
namespace hip {
using DefaultExecutor = HipExecutor;
} // namespace hip
} // namespace kernels


/**
 * This is the Executor subclass which represents a DPC++ (SYCL) enhanced
 * device.
 */
class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
                      public std::enable_shared_from_this<DpcppExecutor> {
    friend class detail::ExecutorBase<DpcppExecutor>;

public:
    /**
     * Creates a new DpcppExecutor.
     *
     * @param device_id  the device id of this device
     * @param master  an executor on the host used to invoke the device
     *                kernels
     * @param device_type  a string representing the type of device to
     *                     consider (e.g. "accelerator", "cpu", "gpu" or
     *                     "all")
     * @param property  the queue property
     */
    static std::shared_ptr<DpcppExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    /**
     * Get the device id of the device associated to this executor.
     */
    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    sycl::queue* get_queue() const { return queue_.get(); }

    /**
     * Get the number of devices of the given type present on the system.
     *
     * @param device_type  a string representing the device type
     */
    static int get_num_devices(std::string device_type);

    /**
     * Get the available subgroup sizes for this device.
     */
    const std::vector<int>& get_subgroup_sizes() const noexcept
    {
        return this->get_exec_info().subgroup_sizes;
    }

    /**
     * Get the number of computing units of this executor.
     */
    int get_num_computing_units() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    /**
     * Get the number of subgroups of this executor.
     */
    int get_num_subgroups() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Get the maximum work item sizes.
     */
    const std::vector<int>& get_max_workitem_sizes() const noexcept
    {
        return this->get_exec_info().max_workitem_sizes;
    }

    /**
     * Get the maximum workgroup size.
     */
    int get_max_workgroup_size() const noexcept
    {
        return this->get_exec_info().max_workgroup_size;
    }

    /**
     * Get the maximum subgroup size.
     */
    int get_max_subgroup_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    /**
     * Get a string representing the device type.
     */
    std::string get_device_type() const noexcept
    {
        return this->get_exec_info().device_type;
    }

protected:
    void set_device_property(
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    DpcppExecutor(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order)
        : master_(master)
    {
        std::for_each(device_type.begin(), device_type.end(),
                      [](char& c) { c = std::tolower(c); });
        this->get_exec_info().device_type = std::string(device_type);
        this->get_exec_info().device_id = device_id;
        this->set_device_property(property);
    }

2057
2058 void populate_exec_info(const machine_topology* mach_topo) override;
2059
2060 void* raw_alloc(size_type size) const override;
2061
2062 void raw_free(void* ptr) const noexcept override;
2063
2064 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2065
2066 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2067
2068 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2069
2070 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2071
2072 bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2073
2074 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2075
2076private:
2077 std::shared_ptr<Executor> master_;
2078
2079 template <typename T>
2080 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2081 queue_manager<sycl::queue> queue_;
2082};


namespace kernels {
namespace dpcpp {
using DefaultExecutor = DpcppExecutor;
} // namespace dpcpp
} // namespace kernels


#undef GKO_OVERRIDE_RAW_COPY_TO


} // namespace gko


#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_