Merge ~bullwinkle-team/ubuntu/+source/rocrand:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocrand:ubuntu/devel

Proposed by Talha Can Havadar
Status: Merged
Merged at revision: a2767349c734c428c327fdbccec96d71247eb0c5
Proposed branch: ~bullwinkle-team/ubuntu/+source/rocrand:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/rocrand:ubuntu/devel
Diff against target: 28178 lines (+14021/-3963)
172 files modified
.github/CODEOWNERS (+3/-3)
.gitlab-ci.yml (+36/-18)
CHANGELOG.md (+67/-0)
CMakeLists.txt (+48/-42)
CONTRIBUTING.md (+2/-2)
LICENSE.md (+1/-1)
README.md (+79/-49)
benchmark/benchmark_curand_device_api.cpp (+68/-53)
benchmark/benchmark_curand_generate.cpp (+133/-63)
benchmark/benchmark_curand_host_api.cpp (+210/-134)
benchmark/benchmark_curand_kernel.cpp (+286/-183)
benchmark/benchmark_curand_utils.hpp (+14/-1)
benchmark/benchmark_rocrand_device_api.cpp (+50/-101)
benchmark/benchmark_rocrand_kernel.cpp (+64/-110)
benchmark/custom_csv_formater.hpp (+3/-10)
cmake/Dependencies.cmake (+96/-91)
cmake/Summary.cmake (+49/-1)
debian/changelog (+27/-0)
debian/control (+18/-12)
debian/librocrand-doc.doc-base (+2/-2)
debian/librocrand-doc.docs (+1/-1)
debian/librocrand-doc.links (+1/-1)
debian/not-installed (+1/-1)
debian/patches/0005-dont-set-rocm-path-in-cmake.patch (+60/-0)
debian/patches/0006-fix-doxygen-settings.patch (+31/-0)
debian/patches/Hide-internal-symbols.patch (+11/-11)
debian/patches/series (+2/-0)
debian/rules (+11/-4)
dev/null (+0/-170)
docs/api-reference/cpp-api.rst (+20/-12)
docs/api-reference/data-type-support.rst (+64/-48)
docs/api-reference/python-api.rst (+4/-4)
docs/conceptual/curand-compatibility.rst (+2/-2)
docs/conceptual/dynamic_ordering_configuration.rst (+88/-38)
docs/conceptual/programmers-guide.rst (+110/-92)
docs/conf.py (+7/-1)
docs/doxygen/Doxyfile (+582/-270)
docs/index.rst (+7/-3)
docs/install/installing.rst (+44/-21)
docs/license.rst (+1/-1)
docs/mainpage.dox (+2/-2)
docs/sphinx/_toc.yml.in (+1/-1)
docs/sphinx/requirements.in (+1/-1)
docs/sphinx/requirements.txt (+143/-8)
install (+0/-10)
library/CMakeLists.txt (+11/-35)
library/include/rocrand/rocrand.h (+123/-109)
library/include/rocrand/rocrand.hpp (+80/-78)
library/include/rocrand/rocrand_common.h (+35/-56)
library/include/rocrand/rocrand_discrete.h (+80/-78)
library/include/rocrand/rocrand_kernel.h (+3/-2)
library/include/rocrand/rocrand_lfsr113.h (+32/-29)
library/include/rocrand/rocrand_log_normal.h (+239/-251)
library/include/rocrand/rocrand_mrg31k3p.h (+27/-23)
library/include/rocrand/rocrand_mrg32k3a.h (+87/-105)
library/include/rocrand/rocrand_mrg32k3a_precomputed.h (+13/-13)
library/include/rocrand/rocrand_mtgp32.h (+30/-26)
library/include/rocrand/rocrand_mtgp32_11213.h (+3/-2)
library/include/rocrand/rocrand_normal.h (+139/-107)
library/include/rocrand/rocrand_philox4x32_10.h (+44/-51)
library/include/rocrand/rocrand_poisson.h (+65/-62)
library/include/rocrand/rocrand_scrambled_sobol32.h (+19/-16)
library/include/rocrand/rocrand_scrambled_sobol32_constants.h (+4/-1)
library/include/rocrand/rocrand_scrambled_sobol32_precomputed.h (+3/-2)
library/include/rocrand/rocrand_scrambled_sobol64.h (+19/-18)
library/include/rocrand/rocrand_scrambled_sobol64_constants.h (+4/-1)
library/include/rocrand/rocrand_scrambled_sobol64_precomputed.h (+3/-2)
library/include/rocrand/rocrand_sobol32.h (+16/-14)
library/include/rocrand/rocrand_sobol32_precomputed.h (+4/-2)
library/include/rocrand/rocrand_sobol64.h (+16/-14)
library/include/rocrand/rocrand_sobol64_precomputed.h (+4/-2)
library/include/rocrand/rocrand_threefry2_impl.h (+7/-4)
library/include/rocrand/rocrand_threefry2x32_20.h (+18/-13)
library/include/rocrand/rocrand_threefry2x64_20.h (+18/-13)
library/include/rocrand/rocrand_threefry4_impl.h (+5/-3)
library/include/rocrand/rocrand_threefry4x32_20.h (+18/-13)
library/include/rocrand/rocrand_threefry4x64_20.h (+18/-13)
library/include/rocrand/rocrand_threefry_common.h (+2/-2)
library/include/rocrand/rocrand_uniform.h (+105/-80)
library/include/rocrand/rocrand_version.h.in (+1/-2)
library/include/rocrand/rocrand_xorwow.h (+29/-25)
library/src/fortran/CMakeLists.txt (+1/-1)
library/src/fortran/README.md (+33/-20)
library/src/fortran/rocrand_m.f90 (+2/-1)
library/src/rng/common.hpp (+7/-4)
library/src/rng/config/lfsr113_config.hpp (+2/-0)
library/src/rng/config/mrg31k3p_config.hpp (+2/-0)
library/src/rng/config/mrg32k3a_config.hpp (+2/-0)
library/src/rng/config/mt19937_config.hpp (+4/-2)
library/src/rng/config/mtgp32_config.hpp (+2/-0)
library/src/rng/config/philox4_32_10_config.hpp (+2/-0)
library/src/rng/config/threefry2_32_20_config.hpp (+2/-0)
library/src/rng/config/threefry2_64_20_config.hpp (+2/-0)
library/src/rng/config/threefry4_32_20_config.hpp (+2/-0)
library/src/rng/config/threefry4_64_20_config.hpp (+2/-0)
library/src/rng/config/xorwow_config.hpp (+2/-0)
library/src/rng/config_types.hpp (+10/-5)
library/src/rng/distribution/discrete.hpp (+2/-3)
library/src/rng/distribution/log_normal.hpp (+2/-2)
library/src/rng/distribution/normal.hpp (+3/-5)
library/src/rng/distribution/poisson.hpp (+5/-4)
library/src/rng/distribution/uniform.hpp (+3/-5)
library/src/rng/distributions.hpp (+3/-1)
library/src/rng/generator_type.hpp (+4/-3)
library/src/rng/lfsr113.hpp (+3/-1)
library/src/rng/mrg.hpp (+3/-1)
library/src/rng/mt19937.hpp (+51/-45)
library/src/rng/mtgp32.hpp (+5/-3)
library/src/rng/philox4x32_10.hpp (+3/-1)
library/src/rng/sobol.hpp (+41/-14)
library/src/rng/threefry.hpp (+3/-1)
library/src/rng/utils/cpp_utils.hpp (+2/-4)
library/src/rng/xorwow.hpp (+3/-1)
library/src/rocrand.cpp (+13/-1)
python/rocrand/setup.py (+1/-3)
rmake.py (+6/-4)
scripts/config-tuning/requirements.txt (+1/-1)
test/cpp_wrapper/CMakeLists.txt (+9/-3)
test/cpp_wrapper/cmake/Dependencies.cmake (+13/-32)
test/internal/test_normal_distribution.cpp (+1/-2)
test/internal/test_rocrand_config_dispatch.cpp (+17/-13)
test/internal/test_rocrand_discrete.cpp (+959/-0)
test/internal/test_rocrand_generator_type.cpp (+13/-1)
test/internal/test_rocrand_host_prng.hpp (+690/-0)
test/internal/test_rocrand_lfsr113_prng.cpp (+30/-1)
test/internal/test_rocrand_log_normal.cpp (+585/-0)
test/internal/test_rocrand_mrg31k3p_prng.cpp (+187/-0)
test/internal/test_rocrand_mrg32k3a_prng.cpp (+187/-0)
test/internal/test_rocrand_mrg_prng.cpp (+1/-75)
test/internal/test_rocrand_mt19937_octo_engine_prng.cpp (+299/-0)
test/internal/test_rocrand_mt19937_prng.cpp (+94/-8)
test/internal/test_rocrand_mtgp32_prng.cpp (+408/-1)
test/internal/test_rocrand_normal.cpp (+815/-0)
test/internal/test_rocrand_philox_prng.cpp (+31/-2)
test/internal/test_rocrand_poisson.cpp (+624/-0)
test/internal/test_rocrand_scrambled_sobol32_qrng.cpp (+494/-0)
test/internal/test_rocrand_scrambled_sobol64_qrng.cpp (+492/-0)
test/internal/test_rocrand_sobol32_qrng.cpp (+442/-0)
test/internal/test_rocrand_sobol64_qrng.cpp (+439/-0)
test/internal/test_rocrand_sobol_qrng.hpp (+4/-4)
test/internal/test_rocrand_threefry2x32_20_prng.cpp (+125/-2)
test/internal/test_rocrand_threefry2x64_20_prng.cpp (+104/-2)
test/internal/test_rocrand_threefry4x32_20_prng.cpp (+130/-2)
test/internal/test_rocrand_threefry4x64_20_prng.cpp (+129/-2)
test/internal/test_rocrand_xorwow_prng.cpp (+2/-2)
test/internal/test_uniform_distribution.cpp (+1220/-1)
test/package/CMakeLists.txt (+9/-2)
test/parity/CMakeLists.txt (+8/-2)
test/test_common.hpp (+20/-1)
test/test_rocrand_cpp_wrapper.cpp (+1/-2)
test/test_rocrand_generate_log_normal.cpp (+76/-36)
test/test_rocrand_generate_normal.cpp (+76/-36)
test/test_rocrand_generate_poisson.cpp (+21/-1)
test/test_rocrand_generate_uniform.cpp (+66/-1)
test/test_rocrand_hipgraphs.cpp (+5/-0)
test/test_rocrand_host.cpp (+6/-5)
test/test_rocrand_kernel_lfsr113.cpp (+24/-19)
test/test_rocrand_kernel_mrg.cpp (+24/-18)
test/test_rocrand_kernel_mtgp32.cpp (+104/-23)
test/test_rocrand_kernel_philox4x32_10.cpp (+120/-140)
test/test_rocrand_kernel_scrambled_sobol32.cpp (+108/-19)
test/test_rocrand_kernel_scrambled_sobol64.cpp (+105/-18)
test/test_rocrand_kernel_sobol32.cpp (+182/-106)
test/test_rocrand_kernel_sobol64.cpp (+203/-104)
test/test_rocrand_kernel_threefry2x32_20.cpp (+24/-18)
test/test_rocrand_kernel_threefry2x64_20.cpp (+26/-20)
test/test_rocrand_kernel_threefry4x32_20.cpp (+24/-18)
test/test_rocrand_kernel_threefry4x64_20.cpp (+24/-18)
test/test_rocrand_kernel_xorwow.cpp (+119/-134)
toolchain-linux.cmake (+11/-5)
toolchain-windows.cmake (+1/-1)
tools/mrg32k3a_precomputed_generator.cpp (+82/-82)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
Ubuntu Sponsors Pending
Review via email: mp+499170@code.launchpad.net

Description of the change

See the build in this ppa: https://launchpad.net/~tchavadar/+archive/ubuntu/mp499170/+packages
  - all archs and -proposed enabled

Because of this particular bug in llvm-toolchain, https://bugs.launchpad.net/ubuntu/+source/llvm-toolchain-21/+bug/2138890 (due to the Python 3.14 migration in the archive), I needed to add the following dependencies to check whether rocrand builds at all:
1. https://launchpad.net/~igorluppi/+archive/ubuntu/lp2138890-default => for llvm-toolchain-21
2. https://launchpad.net/~tchavadar/+archive/ubuntu/lp2138906 => for rocm-llvm
3. https://launchpad.net/~tchavadar/+archive/ubuntu/lp2138659 => for rocr-runtime

Autopkgtest results will be here: https://autopkgtest.ubuntu.com/user/tchavadar/ppa/mp499170

To post a comment you must log in.
Revision history for this message
Talha Can Havadar (tchavadar) wrote (last edit ):

Rebased to fix a merge conflict; see the diff against the branch as it was before the force push (no functional change):

```
 git diff origin/bullwinkle/llvm-21/ubuntu/devel
diff --git a/debian/changelog b/debian/changelog
index 691bc9d..6f5a793 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -21,14 +21,17 @@ rocrand (7.1.0-0ubuntu1) resolute; urgency=medium

  -- Tim Flink <email address hidden> Wed, 21 Jan 2026 12:11:41 -0600

-rocrand (6.4.3-1~exp1ubuntu1) questing; urgency=medium
+rocrand (6.4.3-1) unstable; urgency=medium

- * d/control: update build-depends
- * d/control: update maintainer field
- * d/rules: fix FTBFS by adding -Wl,--gc-sections to flags
+ * Upload to unstable
+ * Update my email address and copyright
+ * Drop inactive maintainers from uploaders
+ * Update symbol tracking with optional symbols
+
+ [ Igor Luppi ]
   * d/rules: use GPU_TARGETS instead of deprecated AMDGPU_TARGETS

- -- Igor Luppi <email address hidden> Tue, 14 Oct 2025 15:45:14 -0300
+ -- Cordell Bloor <email address hidden> Tue, 28 Oct 2025 21:31:30 -0600

 rocrand (6.4.3-1~exp1) experimental; urgency=medium

diff --git a/debian/copyright b/debian/copyright
index de0b4f8..db61ae1 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -79,7 +79,7 @@ Comment:
 Files: debian/*
 Copyright: 2022 Maxime Chambonnet <email address hidden>
            2022 Étienne Mollier <email address hidden>
- 2022-2024, Cordell Bloor <email address hidden>
+ 2022-2025, Cordell Bloor <email address hidden>
            2023-2025, Christian Kastner <email address hidden>
 License: Expat

```

Revision history for this message
Igor Luppi (igorluppi) wrote :
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Please evaluate the reverse dependencies on ppc64el that will also have to be dropped:
--- a/debian/control
+++ b/debian/control
@@ -30,7 +36,7 @@ Rules-Requires-Root: no

 Package: librocrand1
 Section: libs
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture}
 Depends: ${misc:Depends}, ${shlibs:Depends},
 Description: generate pseudo- and quasi-random numbers - library
@@ -45,7 +51,7 @@ Description: generate pseudo- and quasi-random numbers - library

 Package: librocrand-dev
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Depends: librocrand1 (= ${binary:Version}),
          libamdhip64-dev,
          ${misc:Depends},
@@ -63,7 +69,7 @@ Description: generate pseudo- and quasi-random numbers - headers

 Package: librocrand1-tests
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Build-Profiles: <!nocheck>
 Depends: librocrand1 (= ${binary:Version}),${misc:Depends}, ${shlibs:Depends},
 Description: generate pseudo- and quasi-random numbers - test binaries

review: Needs Information
Revision history for this message
Igor Luppi (igorluppi) wrote :

Safe to delete:

for i in librocrand1 librocrand-dev librocrand1-tests; do reverse-depends --arch ppc64el $i -x; done
Reverse-Depends
===============
* librocrand-dev
* librocrand1-tests

No reverse dependencies found
No reverse dependencies found

The only reverse dependencies are within the ROCm stack.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

 Build-Depends: debhelper-compat (= 13),
                cmake,
- hipcc (>= 7.0~),
- libamd-comgr-dev (>= 6.4~),
- libamdhip64-dev (>= 6.4~),
- libhsa-runtime-dev (>= 6.4~),
+ hipcc (>= 6.0.0),
+ libamd-comgr-dev (>= 6.0~),
+ libhsa-runtime-dev (>= 6.0.0~),

Are these new versions correct? They are lower now, even though rocrand itself is moving up to 7.1.0.

review: Needs Information
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

In d/rules:
@@ -15,6 +21,9 @@ CMAKE_FLAGS = \
    -DROCM_SYMLINK_LIBS=OFF \
    -DGPU_TARGETS="$(shell rocm-target-arch --sep ';')" \
    -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
+ --debug-trycompile \
+ -DCMAKE_VERBOSE_MAKEFILE=ON \
+ -DCMAKE_PREFIX_PATH=/usr/lib/llvm-21/lib \
    -DENABLE_INLINE_ASM=1
 ifeq (,$(filter nocheck,$(DEB_BUILD_PROFILES)))
 CMAKE_FLAGS += -DBUILD_TEST=ON

This "--debug-trycompile" option is not mentioned explicitly in d/changelog. Is it a leftover from debugging, or an intended change?

review: Needs Information
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

d/rules:

+ rocm-docs-build -r $(shell dpkg-parsechangelog -S Version | sed 's/-.*//')

I checked other rocm packages, and this is the only one (from my small sample) which uses the -r switch. Without this switch, I understand that rocm-docs-build will fetch the version from the internet, so I'm not even sure if it's able to do that in these other packages (maybe they get a null version, or if not, then the wrong version: upstream is at 7.2 now).

So, I do think passing -r seems correct for two reasons:
- set the correct version
- bypass the internet fetch

I was then checking if there was another way to get the version, as fetching it from d/changelog is usually frowned upon. But I didn't see a clear way to do it. The closest I got was this line in CMakeLists.txt:

./CMakeLists.txt:math(EXPR rocrand_VERSION_NUMBER "${rocRAND_VERSION_MAJOR} * 100000 + ${rocRAND_VERSION_MINOR} * 100 + ${rocRAND_VERSION_PATCH}")

So, this is ok. If you find a way to not use d/changelog in the future, that will be better. And perhaps also check which version is being used in the other packages that just call rocm-docs-build without -r.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Thank you very much for the review Andreas,

About docs versioning: I will check whether there is another easy way to fetch the upstream version of the current package, but I also think the option we have here is a good one, and it works.

- I do think `--debug-trycompile` is a debugging artifact; I am going to try a build without it to verify.

- The version mismatch might have happened during the rebase, while fixing the conflict; I am going to update the versions.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

I checked the reflog for the version mismatch, and it seems it was not caused by the rebase :) Still, I updated the versions, removed `--debug-trycompile`, and made the docs versioning handle possible `+dfsg` suffixes in the version as well (not necessary, but it makes it more generic).

I am going to trigger a new build in the PPA and, once it is published, trigger the autopkgtests again.

Revision history for this message
Talha Can Havadar (tchavadar) wrote (last edit ):

amd64 builds are struggling in LP, so I built this package on a Strix Halo machine we have; see the logs: https://pastebin.ubuntu.com/p/tHNJpnSzbm/

Local build logs with -proposed enabled: https://pastebin.ubuntu.com/p/GBk2kS5zbS/

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+1

review: Approve
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Sponsored:

Uploading rocrand_7.1.0-0ubuntu1.dsc
Uploading rocrand_7.1.0.orig.tar.gz
Uploading rocrand_7.1.0-0ubuntu1.debian.tar.xz
Uploading rocrand_7.1.0-0ubuntu1_source.buildinfo
Uploading rocrand_7.1.0-0ubuntu1_source.changes

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
2index 65006ad..29bce98 100644
3--- a/.github/CODEOWNERS
4+++ b/.github/CODEOWNERS
5@@ -1,10 +1,10 @@
6 # Global owners
7-* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13
8+* @stanleytsang-amd @umfranzw @RobsonRLemos
9
10 # Documentation files
11-docs/* @ROCm/rocm-documentation
12+docs/ @ROCm/rocm-documentation
13 *.md @ROCm/rocm-documentation
14 *.rst @ROCm/rocm-documentation
15
16 # Header directory
17-library/include/* @ROCm/rocm-documentation @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13
18+library/include/ @ROCm/rocm-documentation @stanleytsang-amd @umfranzw @RobsonRLemos
19diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
20index 93e3a4f..a829b55 100644
21--- a/.gitlab-ci.yml
22+++ b/.gitlab-ci.yml
23@@ -1,6 +1,6 @@
24 # MIT License
25 #
26-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
27+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
28 #
29 # Permission is hereby granted, free of charge, to any person obtaining a copy
30 # of this software and associated documentation files (the "Software"), to deal
31@@ -157,12 +157,12 @@ copyright-date:
32 -D BENCHMARK_TUNING_BLOCK_OPTIONS="128, 256"
33 -D BUILD_FORTRAN_WRAPPER=ON
34 -D DEPENDENCIES_FORCE_DOWNLOAD=ON
35- -D BUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF
36 -D BUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
37 -D AMDGPU_TARGETS=${GPU_TARGETS}
38 -D DISABLE_WERROR=OFF
39 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
40 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
41+ -D CMAKE_CXX_STANDARD="$BUILD_VERSION"
42 - cmake --build $CI_PROJECT_DIR/build
43 - if [[ "${BUILD_SHARED_LIBS}" = "ON" ]]; then cmake --build $CI_PROJECT_DIR/build --target package; fi
44
45@@ -183,12 +183,12 @@ copyright-date:
46 -D BUILD_BENCHMARK=ON
47 -D BUILD_FORTRAN_WRAPPER=ON
48 -D DEPENDENCIES_FORCE_DOWNLOAD=ON
49- -D BUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF
50 -D BUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
51 -D NVGPU_TARGETS=${GPU_TARGETS}
52 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
53 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
54 -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda
55+ -D CMAKE_CXX_STANDARD=17
56 - cmake --build $CI_PROJECT_DIR/build
57 - if [[ "${BUILD_SHARED_LIBS}" = "ON" ]]; then cmake --build $CI_PROJECT_DIR/build --target package; fi
58
59@@ -196,15 +196,14 @@ copyright-date:
60 .save-artifacts:
61 artifacts:
62 paths:
63+ - $CI_PROJECT_DIR/build/lib/
64 - $CI_PROJECT_DIR/build/library/
65 - $CI_PROJECT_DIR/build/test/test_*
66 - $CI_PROJECT_DIR/build/**/CTestTestfile.cmake
67 - $CI_PROJECT_DIR/build/benchmark/benchmark_*
68- - $CI_PROJECT_DIR/build/deps/googlebenchmark/
69 - $CI_PROJECT_DIR/build/CMakeCache.txt
70 - $CI_PROJECT_DIR/build/*.deb
71 - $CI_PROJECT_DIR/build/*.zip
72- expire_in: 2 weeks
73
74 build:rocm-cmake-minimum:
75 tags:
76@@ -217,6 +216,7 @@ build:rocm-cmake-minimum:
77 variables:
78 BUILD_SHARED_LIBS: "ON"
79 BUILD_BENCHMARK_TUNING: "ON"
80+ BUILD_VERSION: 17
81
82 build:rocm-hipcc-cmake-minimum:
83 tags:
84@@ -228,6 +228,7 @@ build:rocm-hipcc-cmake-minimum:
85 - .save-artifacts
86 variables:
87 BUILD_SHARED_LIBS: "ON"
88+ BUILD_VERSION: 17
89
90 build:rocm-static-cmake-minimum:
91 tags:
92@@ -238,6 +239,7 @@ build:rocm-static-cmake-minimum:
93 - .rocm:build
94 variables:
95 BUILD_SHARED_LIBS: "OFF"
96+ BUILD_VERSION: 17
97
98 build:rocm-cmake-latest:
99 tags:
100@@ -248,6 +250,10 @@ build:rocm-cmake-latest:
101 - .rocm:build
102 variables:
103 BUILD_SHARED_LIBS: "ON"
104+ parallel:
105+ matrix:
106+ - BUILD_VERSION: 17
107+
108
109 build:nvcc-cmake-minimum:
110 tags:
111@@ -315,7 +321,6 @@ benchmark:benchmark-tuning:
112 -D BUILD_BENCHMARK_TUNING=ON
113 -D BUILD_FORTRAN_WRAPPER=OFF
114 -D DEPENDENCIES_FORCE_DOWNLOAD=ON
115- -D BUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF
116 -D BUILD_SHARED_LIBS=ON
117 -D AMDGPU_TARGETS=${GPU_TARGET}
118 -D DISABLE_WERROR=OFF
119@@ -324,13 +329,13 @@ benchmark:benchmark-tuning:
120 -D BENCHMARK_TUNING_BLOCK_OPTIONS="${BENCHMARK_TUNING_BLOCK_OPTIONS}"
121 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
122 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
123+ -D CMAKE_CXX_STANDARD=17
124 - cmake --build $CI_PROJECT_DIR/build --target benchmark_rocrand_tuning
125 - $CI_PROJECT_DIR/build/benchmark/tuning/benchmark_rocrand_tuning --benchmark_out_format=json --benchmark_out=$CI_PROJECT_DIR/build/rocrand_config_tuning_${GPU_TARGET}_${CI_JOB_ID}.json
126 artifacts:
127 paths:
128 - $CI_PROJECT_DIR/build/library/
129 - $CI_PROJECT_DIR/build/benchmark/
130- - $CI_PROJECT_DIR/build/deps/googlebenchmark/
131 - $CI_PROJECT_DIR/build/CMakeCache.txt
132 - $CI_PROJECT_DIR/build/*.json
133 expire_in: 2 weeks
134@@ -420,17 +425,17 @@ benchmark:nvcc:
135 - .rules:test
136 script:
137 - $SUDO_CMD apt-get update -qq
138- - $SUDO_CMD apt-get install -y -qq python3 python3-pip python3-numpy
139- - $SUDO_CMD apt-get install -y -qq wget
140- - pip3 install setuptools
141+ - $SUDO_CMD apt-get install -y -qq python3 python3-pip python3-venv
142 - export ROCRAND_PATH=$CI_PROJECT_DIR/build/library/
143 # rocRAND Wrapper with Python 3
144- - pip3 --version
145 - cd $CI_PROJECT_DIR/python/rocrand
146- - python3 setup.py test
147- - pip3 install . --user
148+ - python3 -m venv rocrand-venv
149+ - source rocrand-venv/bin/activate
150+ - python3 -m pip install setuptools numpy
151+ - python3 -m pip install .
152 - python3 tests/rocrand_test.py
153- - pip3 uninstall --yes rocrand
154+ - python3 -m pip uninstall --yes rocrand
155+ - deactivate
156
157 test:rocm-python:
158 tags:
159@@ -512,9 +517,12 @@ test:cpp-wrapper:
160 -B $CI_PROJECT_DIR/build_only_install
161 -D BUILD_TEST=OFF
162 -D BUILD_FORTRAN_WRAPPER=OFF
163- -D BUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF
164 -D CMAKE_CXX_COMPILER=${COMPILER}
165- - $SUDO_CMD cmake --build $CI_PROJECT_DIR/build_only_install --target install
166+ -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
167+ -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
168+ -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda
169+ # Preserve $PATH when sudoing
170+ - $SUDO_CMD env PATH="$PATH" cmake --build $CI_PROJECT_DIR/build_only_install --target install
171 - cmake
172 -S $CI_PROJECT_DIR/test/package/
173 -B $CI_PROJECT_DIR/install_test
174@@ -546,6 +554,10 @@ test:doc:
175 extends:
176 - .rules:test
177 - .build:docs
178+ artifacts:
179+ paths:
180+ - $DOCS_DIR/_build/html/
181+ expire_in: 2 weeks
182
183 .test:parity:
184 stage: test
185@@ -601,9 +613,11 @@ test:nvcc-parity:
186 -D BUILD_TEST=ON
187 -D CMAKE_BUILD_TYPE=Release
188 -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe"
189+ -D CMAKE_CXX_FLAGS="-Wno-ignored-attributes"
190 -D CMAKE_INSTALL_PREFIX:PATH="$CI_PROJECT_DIR/build/install"
191 -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}/lib/cmake"
192- -D DISABLE_WERROR=OFF *>&1
193+ -D DISABLE_WERROR=OFF
194+ -D CMAKE_CXX_STANDARD=17 *>&1
195 # Building
196 - cmake --build "$CI_PROJECT_DIR/build" *>&1
197
198@@ -632,10 +646,13 @@ test:windows:
199 -D CMAKE_BUILD_TYPE=Release
200 -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe"
201 -D CMAKE_PREFIX_PATH:FILEPATH="${env:HIP_PATH}/lib/cmake;$CI_PROJECT_DIR/build/install" *>&1
202+ -D CMAKE_CXX_STANDARD=17
203 # Build package test
204 - cmake --build "$CI_PROJECT_DIR/build_install_test"
205 # Copy rocRAND.dll to the package test build directory
206 - cmake -E copy "$CI_PROJECT_DIR/build/install/bin/rocRAND.dll" "$CI_PROJECT_DIR/build_install_test" *>&1
207+ - cmake -E copy "$CI_PROJECT_DIR/build/_deps/googlebenchmark-build/src/benchmark.dll" "$CI_PROJECT_DIR/build_install_test" *>&1
208+ - cmake -E copy "$CI_PROJECT_DIR/build/_deps/googlebenchmark-build/src/benchmark_main.dll" "$CI_PROJECT_DIR/build_install_test" *>&1
209 # Run package test
210 - ctest --test-dir "$CI_PROJECT_DIR/build_install_test" -C $CMAKE_BUILD_TYPE --output-on-failure *>&1
211
212@@ -663,6 +680,7 @@ test:windows:
213 -D CMAKE_CXX_COMPILER=${COMPILER}
214 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
215 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
216+ -D CMAKE_CXX_STANDARD=17
217 - cmake --build ${ROCRAND_STAT_TESTS_DIR}/build
218 - mkdir ${LOGS_DIR}
219 - cd ${ROCRAND_STAT_TESTS_DIR}/build
220@@ -670,7 +688,7 @@ test:windows:
221 artifacts:
222 paths:
223 - ${LOGS_DIR}/*
224- expire_in: never
225+ expire_in: 3 months
226
227 # TestU01 SmallCrush, 10 tests, 15 statistics, takes about 5 seconds
228 statistical-test:crush-small:
229diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
230deleted file mode 100644
231index 65a9a90..0000000
232--- a/.jenkins/common.groovy
233+++ /dev/null
234@@ -1,95 +0,0 @@
235-// This file is for internal AMD use.
236-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
237-
238-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false, boolean codeCoverage=false)
239-{
240- project.paths.construct_build_prefix()
241-
242- project.paths.build_command = './install -c'
243- String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
244- String buildTypeDir = debug ? 'debug' : 'release'
245- String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON'
246- String codeCovFlag = codeCoverage ? '-DCODE_COVERAGE=ON' : ''
247- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
248- //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
249- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
250-
251- def command = """#!/usr/bin/env bash
252- set -x
253- cd ${project.paths.project_build_prefix}
254- mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
255- # gfxTargetParser reads gfxarch and adds target features such as xnack
256- ${auxiliary.gfxTargetParser()}
257- ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} ${codeCovFlag} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
258- make -j\$(nproc)
259- """
260-
261- platform.runCommand(this, command)
262-}
263-
264-def runTestCommand (platform, project)
265-{
266- String sudo = auxiliary.sudo(platform.jenkinsLabel)
267- // String centos = platform.jenkinsLabel.contains('centos') ? '3' : ''
268- // Disable xorwow test for now as it is a known failure with gfx90a.
269- // def testCommand = "ctest${centos} --output-on-failure"
270- def testCommand = "ctest --output-on-failure"
271-
272- def command = """#!/usr/bin/env bash
273- set -x
274- cd ${project.paths.project_build_prefix}/build/release
275- make -j4
276- ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib/ ${testCommand}
277- """
278-
279- platform.runCommand(this, command)
280-}
281-
282-def runPackageCommand(platform, project)
283-{
284- def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
285-
286- platform.runCommand(this, packageHelper[0])
287- platform.archiveArtifacts(this, packageHelper[1])
288-}
289-
290-def runCodeCovTestCommand(platform, project, jobName)
291-{
292- withCredentials([string(credentialsId: 'mathlibs-codecov-token-rocrand', variable: 'CODECOV_TOKEN')])
293- {
294- String prflag = env.CHANGE_ID ? "--pr \"${env.CHANGE_ID}\"" : ''
295-
296- String objectFlags = "-object ./library/librocrand.so"
297-
298- String profdataFile = "./rocRand.profdata"
299- String reportFile = "./code_cov_rocRand.report"
300- String coverageFile = "./code_cov_rocRand.txt"
301- String coverageFilter = "(.*googletest-src.*)|(.*/yaml-cpp-src/.*)|(.*hip/include.*)|(.*/include/llvm/.*)|(.*test/unit.*)|(.*/spdlog/.*)|(.*/msgpack-src/.*)"
302-
303- def command = """#!/usr/bin/env bash
304- set -ex
305- cd ${project.paths.project_build_prefix}/build/release
306- #Remove any preexisting prof files.
307- rm -rf ./test/*.profraw
308-
309- #The `%m` creates a different prof file for each object file.
310- LLVM_PROFILE_FILE=./rocRand_%m.profraw ctest --output-on-failure
311-
312- #this combines them back together.
313- /opt/rocm/llvm/bin/llvm-profdata merge -sparse ./test/*.profraw -o ${profdataFile}
314-
315- #For some reason, with the -object flag, we can't just specify the source directory, so we have to filter out the files we don't want.
316- /opt/rocm/llvm/bin/llvm-cov report ${objectFlags} -instr-profile=${profdataFile} -ignore-filename-regex="${coverageFilter}" > ${reportFile}
317- cat ${reportFile}
318- /opt/rocm/llvm/bin/llvm-cov show -Xdemangler=/opt/rocm/llvm/bin/llvm-cxxfilt ${objectFlags} -instr-profile=${profdataFile} -ignore-filename-regex="${coverageFilter}" > ${coverageFile}
319-
320- #Upload report to codecov
321- curl -Os https://uploader.codecov.io/latest/linux/codecov
322- chmod +x codecov
323- ./codecov -t ${CODECOV_TOKEN} ${prflag} --flags "${platform.gpu}" --sha \$(git rev-parse HEAD) --name "CI: ${jobName}" --file ${coverageFile} -v
324- """
325- platform.runCommand(this, command)
326- }
327-}
328-
329-return this
330diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
331deleted file mode 100644
332index d067fd8..0000000
333--- a/.jenkins/precheckin.groovy
334+++ /dev/null
335@@ -1,78 +0,0 @@
336-#!/usr/bin/env groovy
337-@Library('rocJenkins@pong') _
338-import com.amd.project.*
339-import com.amd.docker.*
340-import java.nio.file.Path;
341-
342-def runCI =
343-{
344- nodeDetails, jobName->
345-
346- def prj = new rocProject('rocRAND', 'PreCheckin')
347-
348- def nodes = new dockerNodes(nodeDetails, jobName, prj)
349-
350- def commonGroovy
351-
352- boolean formatCheck = false
353-
354- def compileCommand =
355- {
356- platform, project->
357-
358- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
359- commonGroovy.runCompileCommand(platform, project, jobName)
360- }
361-
362-
363- def testCommand =
364- {
365- platform, project->
366-
367- commonGroovy.runTestCommand(platform, project)
368- }
369-
370- def packageCommand =
371- {
372- platform, project->
373-
374- commonGroovy.runPackageCommand(platform, project)
375- }
376-
377- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
378-}
379-
380-ci: {
381- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
382-
383- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
384- propertyList = auxiliary.appendPropertyList(propertyList)
385-
386- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
387- jobNameList = auxiliary.appendJobNameList(jobNameList)
388-
389- propertyList.each
390- {
391- jobName, property->
392- if (urlJobName == jobName)
393- properties(auxiliary.addCommonProperties(property))
394- }
395-
396- jobNameList.each
397- {
398- jobName, nodeDetails->
399- if (urlJobName == jobName)
400- stage(jobName) {
401- runCI(nodeDetails, jobName)
402- }
403- }
404-
405- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
406- if(!jobNameList.keySet().contains(urlJobName))
407- {
408- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
409- stage(urlJobName) {
410- runCI([ubuntu16:['gfx906']], urlJobName)
411- }
412- }
413-}
414\ No newline at end of file
415diff --git a/.jenkins/static.groovy b/.jenkins/static.groovy
416deleted file mode 100644
417index cee8d3d..0000000
418--- a/.jenkins/static.groovy
419+++ /dev/null
420@@ -1,77 +0,0 @@
421-#!/usr/bin/env groovy
422-@Library('rocJenkins@pong') _
423-import com.amd.project.*
424-import com.amd.docker.*
425-import java.nio.file.Path;
426-
427-def runCI =
428-{
429- nodeDetails, jobName->
430-
431- def prj = new rocProject('rocRAND', 'static')
432-
433- def nodes = new dockerNodes(nodeDetails, jobName, prj)
434-
435- def commonGroovy
436-
437- boolean formatCheck = false
438-
439- def compileCommand =
440- {
441- platform, project->
442-
443- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
444- commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true)
445- }
446-
447- def testCommand =
448- {
449- platform, project->
450-
451- commonGroovy.runTestCommand(platform, project)
452- }
453-
454- def packageCommand =
455- {
456- platform, project->
457-
458- commonGroovy.runPackageCommand(platform, project)
459- }
460-
461- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
462-}
463-
464-ci: {
465- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
466-
467- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
468- propertyList = auxiliary.appendPropertyList(propertyList)
469-
470- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
471- jobNameList = auxiliary.appendJobNameList(jobNameList)
472-
473- propertyList.each
474- {
475- jobName, property->
476- if (urlJobName == jobName)
477- properties(auxiliary.addCommonProperties(property))
478- }
479-
480- jobNameList.each
481- {
482- jobName, nodeDetails->
483- if (urlJobName == jobName)
484- stage(jobName) {
485- runCI(nodeDetails, jobName)
486- }
487- }
488-
489- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
490- if(!jobNameList.keySet().contains(urlJobName))
491- {
492- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
493- stage(urlJobName) {
494- runCI([ubuntu16:['gfx906']], urlJobName)
495- }
496- }
497-}
498diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
499deleted file mode 100644
500index c41e309..0000000
501--- a/.jenkins/staticanalysis.groovy
502+++ /dev/null
503@@ -1,55 +0,0 @@
504-#!/usr/bin/env groovy
505-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
506-@Library('rocJenkins@pong') _
507-
508-// This is file for internal AMD use.
509-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
510-
511-import com.amd.project.*
512-import com.amd.docker.*
513-import java.nio.file.Path
514-
515-def runCompileCommand(platform, project, jobName, boolean debug=false)
516-{
517- project.paths.construct_build_prefix()
518-}
519-
520-def runCI =
521-{
522- nodeDetails, jobName->
523-
524- def prj = new rocProject('rocRAND', 'StaticAnalysis')
525-
526- // Define test architectures, optional rocm version argument is available
527- def nodes = new dockerNodes(nodeDetails, jobName, prj)
528-
529- boolean formatCheck = false
530- boolean staticAnalysis = true
531-
532- def compileCommand =
533- {
534- platform, project->
535-
536- runCompileCommand(platform, project, jobName, false)
537- }
538-
539- buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
540-}
541-
542-ci: {
543- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
544-
545- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
546-
547- def jobNameList = ["main":([ubuntu22:['any']])]
548- jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocBLAS')
549-
550- jobNameList.each
551- {
552- jobName, nodeDetails->
553- if (urlJobName == jobName)
554- stage(jobName) {
555- runCI(nodeDetails, jobName)
556- }
557- }
558-}
559diff --git a/CHANGELOG.md b/CHANGELOG.md
560index 3b08796..69ebb6c 100644
561--- a/CHANGELOG.md
562+++ b/CHANGELOG.md
563@@ -3,6 +3,67 @@
564 Documentation for rocRAND is available at
565 [https://rocm.docs.amd.com/projects/rocRAND/en/latest/](https://rocm.docs.amd.com/projects/rocRAND/en/latest/)
566
567+## rocRAND 4.1.0 for ROCm 7.1
568+
569+### Resolved issues
570+
571+* Updated error handling for several rocRAND unit tests to accommodate the new `hipGetLastError` behaviour that was introduced in ROCm 7.0.
572+As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
573+
574+### Changed
575+
576+* Changed the `USE_DEVICE_DISPATCH` flag so it can turn device dispatch off by setting it to zero. Device dispatch should be turned off when building for SPIRV.
577+
578+## rocRAND 4.0.0 for ROCm 7.0
579+
580+### Added
581+
582+* gfx950 support
583+* Additional unit tests for `test_log_normal_distribution.cpp`
584+* Additional unit tests for `test_normal_distribution.cpp`
585+* Additional unit tests for `test_rocrand_mtgp32_prng.cpp`
586+* Additional unit tests for `test_rocrand_scrambled_sobol32_qrng.cpp`
587+* Additional unit tests for `test_rocrand_scrambled_sobol64_qrng.cpp`
588+* Additional unit tests for `test_rocrand_sobol32_qrng.cpp`
589+* Additional unit tests for `test_rocrand_sobol64_qrng.cpp`
590+* Additional unit tests for `test_rocrand_threefry2x32_20_prng.cpp`
591+* Additional unit tests for `test_rocrand_threefry2x64_20_prng.cpp`
592+* Additional unit tests for `test_rocrand_threefry4x32_20_prng.cpp`
593+* Additional unit tests for `test_rocrand_threefry4x64_20_prng.cpp`
594+* Additional unit tests for `test_uniform_distribution.cpp`
595+* New unit tests for `include/rocrand/rocrand_discrete.h` in `test_rocrand_discrete.cpp`
596+* New unit tests for `include/rocrand/rocrand_mrg31k3p.h` in `test_rocrand_mrg31k3p_prng.cpp`
597+* New unit tests for `include/rocrand/rocrand_mrg32k3a.h` in `test_rocrand_mrg32k3a_prng.cpp`
598+* New unit tests for `include/rocrand/rocrand_poisson.h` in `test_rocrand_poisson.cpp`
599+
600+### Changed
601+
602+* Changed the return type for `rocrand_generate_poisson` for the `SOBOL64` and `SCRAMBLED_SOBOL64` engines.
603+* Changed the unnecessarily large 64-bit data type for constants used for skipping in `MRG32K3A` to the 32-bit data type.
604+* Updated several `gfx942` auto tuning parameters.
605+* Modified error handling and expanded the error information for the case of double-deallocation of the (scrambled) sobol32 and sobol64 constants and direction vectors.
606+
607+### Removed
608+
609+* Removed inline assembly and the `ENABLE_INLINE_ASM` CMake option. Inline assembly was used to optimize multiplications in the Mrg32k3a and Philox 4x32-10 generators. It is no longer needed because the current HIP compiler is able to produce code with the same or better performance.
610+* Removed instances of the deprecated clang definition `__AMDGCN_WAVEFRONT_SIZE`.
611+* Removed C++14 support. Beginning with this release, only C++17 is supported.
612+* Directly accessing the (scrambled) sobol32 and sobol64 constants and direction vectors is no longer supported. For:
613+ * `h_scrambled_sobol32_constants`, use `rocrand_get_scramble_constants32` instead.
614+ * `h_scrambled_sobol64_constants`, use `rocrand_get_scramble_constants64` instead.
615+ * `rocrand_h_sobol32_direction_vectors`, use `rocrand_get_direction_vectors32` instead.
616+ * `rocrand_h_sobol64_direction_vectors`, use `rocrand_get_direction_vectors64` instead.
617+ * `rocrand_h_scrambled_sobol32_direction_vectors`, use `rocrand_get_direction_vectors32` instead.
618+ * `rocrand_h_scrambled_sobol64_direction_vectors`, use `rocrand_get_direction_vectors64` instead.
619+
620+### Resolved issues
621+
622+* Fixed an issue where `mt19937.hpp` would cause kernel errors during auto tuning.
623+
624+### Upcoming changes
625+
626+* Deprecated the rocRAND Fortran API in favor of hipfort.
627+
628 ## rocRAND 3.3.0 for ROCm 6.4
629
630 ### Added
631@@ -14,9 +75,15 @@ Documentation for rocRAND is available at
632
633 ### Changed
634
635+* Removed a section in `cmake/Dependencies.cmake` that was forcing `DCMAKE_CXX_COMPILER` to be set to either `cl` or `g++` if the compiler was not `GNU`.
636 * `--test|-t` is no longer a required flag for `rtest.py`. Instead, the user can use either `--emulation|-e` or `--test|-t`, but not both.
637 * Removed TBB dependency for multi-core processing of host-side generation.
638
639+### Resolved issues
640+
641+* Fixed an issue where `CMAKE_PREFIX_PATH` was not defined properly in `CMakeLists.txt` and `toolchain-linux.cmake`.
642+* Fixed an issue in `rmake.py` where `cmake_platform_opts` was sometimes a string instead of a list.
643+
644 ## rocRAND 3.2.0 for ROCm 6.3.0
645
646 ### Added
647diff --git a/CMakeLists.txt b/CMakeLists.txt
648index f746b3c..b32acc8 100644
649--- a/CMakeLists.txt
650+++ b/CMakeLists.txt
651@@ -21,6 +21,7 @@
652 # SOFTWARE.
653
654 cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
655+cmake_policy(VERSION 3.16...3.25)
656
657 # Build options
658 include(CMakeDependentOption)
659@@ -29,22 +30,48 @@ option(BUILD_FORTRAN_WRAPPER "Build Fortran wrapper" OFF)
660 option(BUILD_TEST "Build tests (requires googletest)" OFF)
661 option(BUILD_BENCHMARK "Build benchmarks" OFF)
662 cmake_dependent_option(BUILD_BENCHMARK_TUNING
663- "Build extra benchmarks for kernel configuration tuning" OFF "BUILD_BENCHMARK" OFF)
664+"Build extra benchmarks for kernel configuration tuning" OFF "BUILD_BENCHMARK" OFF)
665 option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
666 option(CODE_COVERAGE "Build with code coverage flags (clang only)" OFF)
667 option(DEPENDENCIES_FORCE_DOWNLOAD "Don't search the system for dependencies, always download them" OFF)
668 cmake_dependent_option(RUN_SLOW_TESTS "Run extra tests with CTest. These cover niche functionality and take long time" OFF "BUILD_TEST" OFF)
669
670+
671+if (NOT DEFINED ENV{ROCM_PATH})
672+#Path to ROCm installation
673+ set(ENV{ROCM_PATH} "/opt/rocm")
674+endif()
675+
676 # Install prefix
677-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")
678+set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
679+
680+if(WIN32)
681+ set(CPACK_SOURCE_GENERATOR "ZIP")
682+ set(CPACK_GENERATOR "ZIP")
683+ set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path")
684+ set(INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
685+ set(CPACK_SET_DESTDIR OFF)
686+ set(CPACK_PACKAGE_INSTALL_DIRECTORY "${CMAKE_INSTALL_PREFIX}")
687+ set(CPACK_PACKAGING_INSTALL_PREFIX "")
688+ set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
689+else()
690+ set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
691+ #Adding CMAKE_PREFIX_PATH
692+ if(NOT DEFINED CMAKE_PREFIX_PATH)
693+ list( APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH}/llvm $ENV{ROCM_PATH})
694+ endif()
695+ if(NOT CPACK_PACKAGING_INSTALL_PREFIX)
696+ set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
697+ endif()
698+endif()
699
700 # CMake modules
701 list(APPEND CMAKE_MODULE_PATH
702- ${CMAKE_CURRENT_SOURCE_DIR}/cmake
703- ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules
704- $ENV{ROCM_PATH}/lib/cmake/hip
705- ${HIP_PATH}/cmake $ENV{ROCM_PATH}/hip/cmake # FindHIP.cmake
706- $ENV{ROCM_PATH}/llvm
707+${CMAKE_CURRENT_SOURCE_DIR}/cmake
708+${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules
709+$ENV{ROCM_PATH}/lib/cmake/hip
710+${HIP_PATH}/cmake $ENV{ROCM_PATH}/hip/cmake # FindHIP.cmake
711+$ENV{ROCM_PATH}/llvm
712 )
713
714 #
715@@ -97,11 +124,11 @@ if(GPU_TARGETS STREQUAL "all")
716 if(BUILD_ADDRESS_SANITIZER)
717 # ASAN builds require xnack
718 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
719- TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+"
720+ TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+"
721 )
722 else()
723 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
724- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151"
725+ TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151"
726 )
727 endif()
728
729@@ -115,7 +142,9 @@ include(cmake/VerifyCompiler.cmake)
730 option(DISABLE_WERROR "Disable building with Werror" ON)
731
732 # Build CXX flags
733-set(CMAKE_CXX_STANDARD 11)
734+if (NOT DEFINED CMAKE_CXX_STANDARD)
735+ set(CMAKE_CXX_STANDARD 17)
736+endif()
737 set(CMAKE_CXX_STANDARD_REQUIRED ON)
738 set(CMAKE_CXX_EXTENSIONS OFF)
739 if(DISABLE_WERROR)
740@@ -125,6 +154,13 @@ else()
741 endif()
742 if(CODE_COVERAGE)
743 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
744+ add_definitions(-DCODE_COVERAGE_ENABLED)
745+endif()
746+
747+if (CMAKE_CXX_STANDARD EQUAL 14)
748+ message(WARNING "C++14 will be deprecated in the next major release")
749+elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
750+ message(FATAL_ERROR "Only C++14 and C++17 are supported")
751 endif()
752
753 # HIP on Windows: xhip is required with clang++ to get __half defined
754@@ -142,21 +178,8 @@ if(BUILD_ADDRESS_SANITIZER AND BUILD_SHARED_LIBS)
755 add_link_options(-fuse-ld=lld)
756 endif()
757
758-# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG
759-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
760-if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
761- rocm_wrap_header_dir(
762- "${PROJECT_SOURCE_DIR}/library/include/rocrand"
763- HEADER_LOCATION include/rocrand
764- GUARDS SYMLINK WRAPPER
765- WRAPPER_LOCATIONS include rocrand/include
766- OUTPUT_LOCATIONS library/include library/rocrand/include
767- PATTERNS *.h *.hpp
768- )
769-endif()
770-
771 # Set version variables
772-rocm_setup_version( VERSION "3.3.0" )
773+rocm_setup_version( VERSION "4.1.0" )
774 set ( rocrand_VERSION ${rocRAND_VERSION} )
775 # Old-style version number used within the library's API. rocrand_get_version should be modified.
776 math(EXPR rocrand_VERSION_NUMBER "${rocRAND_VERSION_MAJOR} * 100000 + ${rocRAND_VERSION_MINOR} * 100 + ${rocRAND_VERSION_PATCH}")
777@@ -190,25 +213,8 @@ if (BUILD_BENCHMARK)
778 endif()
779
780 # Package (make package)
781-set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt")
782+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
783 set(CPACK_RPM_PACKAGE_LICENSE "MIT")
784-if(WIN32)
785- set(CPACK_SOURCE_GENERATOR "ZIP")
786- set(CPACK_GENERATOR "ZIP")
787- set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path")
788- set(INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
789- set(CPACK_SET_DESTDIR OFF)
790- set(CPACK_PACKAGE_INSTALL_DIRECTORY "${CMAKE_INSTALL_PREFIX}")
791- set(CPACK_PACKAGING_INSTALL_PREFIX "")
792- set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
793-else()
794- set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
795- #Adding CMAKE_PREFIX_PATH
796- list( APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH}/llvm $ENV{ROCM_PATH} )
797- if(NOT CPACK_PACKAGING_INSTALL_PREFIX)
798- set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
799- endif()
800-endif()
801
802 if( HIP_RUNTIME_LOWER STREQUAL "rocclr" )
803 if(BUILD_ADDRESS_SANITIZER)
804diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
805index ebac71d..04f2af5 100644
806--- a/CONTRIBUTING.md
807+++ b/CONTRIBUTING.md
808@@ -6,7 +6,7 @@
809
810 # Contributing to rocRAND #
811
812-We welcome contributions to rocRAND. Please follow these details to help ensure your contributions will be successfully accepted.
813+We welcome contributions to rocRAND. Please follow these details to help ensure your contributions will be successfully accepted.
814
815 ## Issue Discussion ##
816
817@@ -110,4 +110,4 @@ During code reviews, another developer will take a look through your proposed ch
818 needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas.
819 When a modification request has been completed, the conversation thread about it will be marked as resolved.
820
821-To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
822\ No newline at end of file
823+To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
824diff --git a/LICENSE.txt b/LICENSE.md
825similarity index 94%
826rename from LICENSE.txt
827rename to LICENSE.md
828index e650db9..4d43ac8 100644
829--- a/LICENSE.txt
830+++ b/LICENSE.md
831@@ -1,6 +1,6 @@
832 MIT License
833
834-Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
835+Copyright (C) Advanced Micro Devices, Inc.
836
837 Permission is hereby granted, free of charge, to any person obtaining a copy
838 of this software and associated documentation files (the "Software"), to deal
839diff --git a/README.md b/README.md
840index 6bfdaaf..3503913 100644
841--- a/README.md
842+++ b/README.md
843@@ -1,12 +1,15 @@
844 # rocRAND
845
846+> [!NOTE]
847+> The published rocRAND documentation is available [here](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
848+
849 The rocRAND project provides functions that generate pseudorandom and quasirandom numbers.
850-The rocRAND library is implemented in the [HIP](https://github.com/ROCm/HIP)
851+The rocRAND library is implemented in the [HIP](https://github.com/ROCm/rocm-systems/tree/develop/projects/hip)
852 programming language and optimized for AMD's latest discrete GPUs. It is designed to run on top
853-of AMD's [ROCm](https://rocm.docs.amd.com) runtime, but it also works on CUDA-enabled GPUs.
854+of AMD's [ROCm](https://rocm.docs.amd.com) runtime.
855
856 Prior to ROCm version 5.0, this project included the
857-[hipRAND](https://github.com/ROCm/hipRAND.git) wrapper. As of version 5.0, it was
858+[hipRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand) wrapper. As of version 5.0, it was
859 split into a separate library. As of version 6.0, hipRAND can no longer be built from rocRAND.
860
861 ## Supported random number generators
862@@ -24,28 +27,6 @@ split into a separate library. As of version 6.0, hipRAND can no longer be built
863 * Scrambled Sobol64
864 * ThreeFry
865
866-## Documentation
867-
868-> [!NOTE]
869-> The published rocRAND documentation is available at [rocRAND](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocRAND/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
870-
871-To build documentation locally, use the following code:
872-
873-```sh
874-# Go to the docs directory
875-cd docs
876-
877-# Install Python dependencies
878-python3 -m pip install -r sphinx/requirements.txt
879-
880-# Build the documentation
881-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
882-
883-# E.g. serve the HTML docs locally
884-cd _build/html
885-python3 -m http.server
886-```
887-
888 ## Requirements
889
890 * CMake (3.16 or later)
891@@ -56,11 +37,8 @@ python3 -m http.server
892 * C++ compiler with C++11 support to consume the library.
893 * For AMD platforms:
894 * [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) (1.7 or later)
895- * [HIP-clang](https://github.com/ROCm/HIP/blob/master/INSTALL.md#hip-clang) compiler, which must be
896- set as C++ compiler on ROCm platform.
897-* For CUDA platforms:
898- * [HIP](https://github.com/ROCm/HIP)
899- * Latest CUDA SDK
900+ * [HIP-clang](https://rocm.docs.amd.com/projects/HIP/en/latest/install/install.html) compiler, which must be
901+ set as the C++ compiler for the ROCm platform.
902 * Python 3.6 or higher (HIP on Windows only, only required for install script)
903 * Visual Studio 2019 with clang support (HIP on Windows only)
904 * Strawberry Perl (HIP on Windows only)
905@@ -83,11 +61,18 @@ dependencies, rather than using the system-installed libraries.
906
907 ## Build and install
908
909+> [!NOTE]
910+> The following clone command downloads all components in the [rocm-libraries](https://github.com/ROCm/rocm-libraries) GitHub repository.
911+This is recommended for working with multiple library components, but can take a very long time to
912+download. For a shorter download process that only clones the rocRAND library, see the
913+[rocRAND installation documentation](https://rocm.docs.amd.com/projects/rocRAND/en/latest/install/installing.html)
914+for ROCm 7.0 or later.
915+
916 ```shell
917-git clone https://github.com/ROCm/rocRAND.git
918+git clone https://github.com/ROCm/rocm-libraries.git
919
920 # Go to rocRAND directory, create and go to build directory
921-cd rocRAND; mkdir build; cd build
922+cd rocm-libraries/projects/rocrand; mkdir build; cd build
923
924 # Configure rocRAND, setup options for your system
925 # Build options: BUILD_TEST (off by default), BUILD_BENCHMARK (off by default), BUILD_SHARED_LIBS (on by default)
926@@ -101,14 +86,6 @@ cd rocRAND; mkdir build; cd build
927 #
928 [CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../. -DCMAKE_PREFIX_PATH=/opt/rocm # or cmake-gui ../.
929
930-# To configure rocRAND for NVIDIA platforms, the CXX compiler must be set to a host compiler. The CUDA compiler can
931-# be set explicitly using `-DCMAKE_CUDA_COMPILER=<path-to-nvcc>`.
932-# Additionally, the path to FindHIP.cmake should be passed via CMAKE_MODULE_PATH. By default, this is module is
933-# installed in /opt/rocm/hip/cmake.
934-cmake -DBUILD_BENCHMARK=ON ../. -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake # or cmake-gui ../.
935-# or
936-[CXX=g++] cmake -DBUILD_BENCHMARK=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake ../. # or cmake-gui ../.
937-
938 # Build
939 make -j4
940
941@@ -119,14 +96,19 @@ ctest --output-on-failure
942 [sudo] make install
943 ```
944
945+### SPIR-V
946+
947+rocRAND supports the `amdgcnspirv` target, but it should be built with `USE_DEVICE_DISPATCH`
948+turned off, for example by passing `-DUSE_DEVICE_DISPATCH=0` to CMake.
949+
950 ### HIP on Windows
951
952 We've added initial support for HIP on Windows, which you can install using the `rmake.py` python
953 script:
954
955 ```shell
956-git clone https://github.com/ROCm/rocRAND.git
957-cd rocRAND
958+git clone https://github.com/ROCm/rocm-libraries.git
959+cd rocm-libraries/projects/rocrand
960
961 # the -i option will install rocPRIM to C:\hipSDK by default
962 python rmake.py -i
963@@ -140,15 +122,11 @@ compilers) may cause a build failure; if you encounter errors with the existing
964 other dependencies, you can pass the `DEPENDENCIES_FORCE_DOWNLOAD` flag to CMake, which can
965 help to solve the problem.
966
967-To disable inline assembly optimizations in rocRAND (for both the host library and
968-the device functions provided in `rocrand_kernel.h`), set the CMake option `ENABLE_INLINE_ASM`
969-to `OFF`.
970-
971 ## Running unit tests
972
973 ```shell
974 # Go to rocRAND build directory
975-cd rocRAND; cd build
976+cd rocm-libraries/projects/rocrand; cd build
977
978 # To run all tests
979 ctest
980@@ -161,7 +139,7 @@ ctest
981
982 ```shell
983 # Go to rocRAND build directory
984-cd rocRAND; cd build
985+cd rocm-libraries/projects/rocrand; cd build
986
987 # To run benchmark for the host generate functions:
988 # The benchmarks are registered with Google Benchmark as `device_generate<engine,distribution>`, where
989@@ -225,10 +203,62 @@ been migrated to the new framework.
990 * [Fortran wrappers](./library/src/fortran/).
991 * [Python wrappers](./python/): [rocRAND](./python/rocrand).
992
993+## Building the documentation locally
994+
995+### Requirements
996+
997+#### Doxygen
998+
999+The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues.
1000+
1001+After you have downloaded Doxygen version 1.9.4:
1002+
1003+```shell
1004+# Add doxygen to your PATH
1005+echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc
1006+
1007+# Apply the updated .bashrc
1008+source ~/.bashrc
1009+
1010+# Confirm that you are using version 1.9.4
1011+doxygen --version
1012+```
1013+
1014+#### Python
1015+
1016+The build system uses Python version 3.10. You can try using a newer version, but that might cause issues.
1017+
1018+You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation):
1019+
1020+```shell
1021+# Install Python 3.10
1022+pyenv install 3.10
1023+
1024+# Create a Python 3.10 virtual environment
1025+pyenv virtualenv 3.10 venv_rocrand
1026+
1027+# Activate the virtual environment
1028+pyenv activate venv_rocrand
1029+```
1030+
1031+### Building
1032+
1033+After cloning this repository, and `cd`ing into it:
1034+
1035+```shell
1036+# Install Python dependencies
1037+python3 -m pip install -r docs/sphinx/requirements.txt
1038+
1039+# Build the documentation
1040+python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html
1041+```
1042+
1043+You can then open `docs/_build/html/index.html` in your browser to view the documentation.
1044+
1045 ## Support
1046
1047 Bugs and feature requests can be reported through the
1048-[issue tracker](https://github.com/ROCm/rocRAND/issues).
1049+[issue tracker](https://github.com/ROCm/rocm-libraries/issues).
1050
1051 ## Contributions and license
1052
1053diff --git a/benchmark/benchmark_curand_device_api.cpp b/benchmark/benchmark_curand_device_api.cpp
1054index ec3123e..03a214b 100644
1055--- a/benchmark/benchmark_curand_device_api.cpp
1056+++ b/benchmark/benchmark_curand_device_api.cpp
1057@@ -1,4 +1,4 @@
1058-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
1059+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
1060 //
1061 // Permission is hereby granted, free of charge, to any person obtaining a copy
1062 // of this software and associated documentation files (the "Software"), to deal
1063@@ -38,19 +38,6 @@
1064 #include <utility>
1065 #include <vector>
1066
1067-#define CUDA_CALL(condition) \
1068- do \
1069- { \
1070- cudaError_t error_ = condition; \
1071- if(error_ != cudaSuccess) \
1072- { \
1073- std::cout << "CUDA error: " << error_ << " at " << __FILE__ << ":" << __LINE__ \
1074- << std::endl; \
1075- exit(error_); \
1076- } \
1077- } \
1078- while(0)
1079-
1080 #define CURAND_DEFAULT_MAX_BLOCK_SIZE 256
1081
1082 #ifndef DEFAULT_RAND_N
1083@@ -100,7 +87,7 @@ struct runner
1084
1085 init_kernel<<<blocks, threads>>>(states, seed, offset);
1086
1087- CUDA_CALL(cudaPeekAtLastError());
1088+ CUDA_CALL(cudaGetLastError());
1089 CUDA_CALL(cudaDeviceSynchronize());
1090 }
1091
1092@@ -127,7 +114,7 @@ __global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel
1093 {
1094 const unsigned int state_id = blockIdx.x;
1095 const unsigned int thread_id = threadIdx.x;
1096- unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
1097+ unsigned int index = blockIdx.x * blockDim.x + thread_id;
1098 unsigned int stride = gridDim.x * blockDim.x;
1099
1100 __shared__ curandStateMtgp32_t state;
1101@@ -137,7 +124,13 @@ __global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel
1102 __syncthreads();
1103
1104 const size_t r = size % blockDim.x;
1105- const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
1106+ const size_t size_rounded_down = size - r;
1107+ const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x;
1108+ while(index < size_rounded_down)
1109+ {
1110+ data[index] = generator(&state);
1111+ index += stride;
1112+ }
1113 while(index < size_rounded_up)
1114 {
1115 auto value = generator(&state);
1116@@ -258,12 +251,13 @@ struct runner<curandStateSobol32_t>
1117 {
1118 this->dimensions = dimensions;
1119
1120- const size_t states_size = blocks * threads * dimensions;
1121- CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol32_t)));
1122-
1123 curandDirectionVectors32_t* h_directions;
1124 CURAND_CALL(
1125 curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
1126+
1127+ const size_t states_size = blocks * threads * dimensions;
1128+ CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol32_t)));
1129+
1130 unsigned int* directions;
1131 const size_t size = dimensions * sizeof(unsigned int) * 32;
1132 CUDA_CALL(cudaMalloc(&directions, size));
1133@@ -275,7 +269,7 @@ struct runner<curandStateSobol32_t>
1134 directions,
1135 static_cast<unsigned int>(offset));
1136
1137- CUDA_CALL(cudaPeekAtLastError());
1138+ CUDA_CALL(cudaGetLastError());
1139 CUDA_CALL(cudaDeviceSynchronize());
1140
1141 CUDA_CALL(cudaFree(directions));
1142@@ -316,26 +310,26 @@ struct runner<curandStateScrambledSobol32_t>
1143 {
1144 this->dimensions = dimensions;
1145
1146- const size_t states_size = blocks * threads * dimensions;
1147- CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
1148-
1149 curandDirectionVectors32_t* h_directions;
1150+ unsigned int* h_constants;
1151+
1152 CURAND_CALL(
1153 curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
1154+ CURAND_CALL(curandGetScrambleConstants32(&h_constants));
1155+
1156+ const size_t states_size = blocks * threads * dimensions;
1157+ CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
1158+
1159 unsigned int* directions;
1160- const size_t size = dimensions * sizeof(unsigned int) * 32;
1161- CUDA_CALL(cudaMalloc(&directions, size));
1162- CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
1163+ const size_t directions_size = dimensions * sizeof(unsigned int) * 32;
1164+ CUDA_CALL(cudaMalloc(&directions, directions_size));
1165+ CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
1166
1167- unsigned int* h_scramble_constants;
1168- CURAND_CALL(curandGetScrambleConstants32(&h_scramble_constants));
1169 unsigned int* scramble_constants;
1170 const size_t constants_size = dimensions * sizeof(unsigned int);
1171 CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
1172- CUDA_CALL(cudaMemcpy(scramble_constants,
1173- h_scramble_constants,
1174- constants_size,
1175- cudaMemcpyHostToDevice));
1176+ CUDA_CALL(
1177+ cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
1178
1179 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
1180 init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(
1181@@ -344,7 +338,7 @@ struct runner<curandStateScrambledSobol32_t>
1182 scramble_constants,
1183 static_cast<unsigned int>(offset));
1184
1185- CUDA_CALL(cudaPeekAtLastError());
1186+ CUDA_CALL(cudaGetLastError());
1187 CUDA_CALL(cudaDeviceSynchronize());
1188
1189 CUDA_CALL(cudaFree(directions));
1190@@ -386,12 +380,13 @@ struct runner<curandStateSobol64_t>
1191 {
1192 this->dimensions = dimensions;
1193
1194- const size_t states_size = blocks * threads * dimensions;
1195- CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol64_t)));
1196-
1197 curandDirectionVectors64_t* h_directions;
1198 CURAND_CALL(
1199 curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
1200+
1201+ const size_t states_size = blocks * threads * dimensions;
1202+ CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol64_t)));
1203+
1204 unsigned long long int* directions;
1205 const size_t size = dimensions * sizeof(unsigned long long) * 64;
1206 CUDA_CALL(cudaMalloc(&directions, size));
1207@@ -400,7 +395,7 @@ struct runner<curandStateSobol64_t>
1208 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
1209 init_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
1210
1211- CUDA_CALL(cudaPeekAtLastError());
1212+ CUDA_CALL(cudaGetLastError());
1213 CUDA_CALL(cudaDeviceSynchronize());
1214
1215 CUDA_CALL(cudaFree(directions));
1216@@ -441,26 +436,26 @@ struct runner<curandStateScrambledSobol64_t>
1217 {
1218 this->dimensions = dimensions;
1219
1220- const size_t states_size = blocks * threads * dimensions;
1221- CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
1222-
1223 curandDirectionVectors64_t* h_directions;
1224+ unsigned long long* h_constants;
1225+
1226 CURAND_CALL(
1227 curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
1228+ CURAND_CALL(curandGetScrambleConstants64(&h_constants));
1229+
1230+ const size_t states_size = blocks * threads * dimensions;
1231+ CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
1232+
1233 unsigned long long* directions;
1234- const size_t size = dimensions * sizeof(unsigned long long) * 64;
1235- CUDA_CALL(cudaMalloc(&directions, size));
1236- CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
1237+ const size_t directions_size = dimensions * sizeof(unsigned long long) * 64;
1238+ CUDA_CALL(cudaMalloc(&directions, directions_size));
1239+ CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
1240
1241- unsigned long long* h_scramble_constants;
1242- CURAND_CALL(curandGetScrambleConstants64(&h_scramble_constants));
1243 unsigned long long* scramble_constants;
1244 const size_t constants_size = dimensions * sizeof(unsigned long long);
1245 CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
1246- CUDA_CALL(cudaMemcpy(scramble_constants,
1247- h_scramble_constants,
1248- constants_size,
1249- cudaMemcpyHostToDevice));
1250+ CUDA_CALL(
1251+ cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
1252
1253 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
1254 init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
1255@@ -468,7 +463,7 @@ struct runner<curandStateScrambledSobol64_t>
1256 scramble_constants,
1257 offset);
1258
1259- CUDA_CALL(cudaPeekAtLastError());
1260+ CUDA_CALL(cudaGetLastError());
1261 CUDA_CALL(cudaDeviceSynchronize());
1262
1263 CUDA_CALL(cudaFree(directions));
1264@@ -722,7 +717,7 @@ void run_benchmark(benchmark::State& state,
1265 for(size_t i = 0; i < 5; i++)
1266 {
1267 r.generate(blocks, threads, stream, data, size, generator);
1268- CUDA_CALL(cudaPeekAtLastError());
1269+ CUDA_CALL(cudaGetLastError());
1270 CUDA_CALL(cudaDeviceSynchronize());
1271 }
1272
1273@@ -819,6 +814,14 @@ void add_benchmarks(const benchmark_context& ctx,
1274
1275 int main(int argc, char* argv[])
1276 {
1277+ // get parameters before they are passed into
1278+ // benchmark::Initialize()
1279+ std::string outFormat = "";
1280+ std::string filter = "";
1281+ std::string consoleFormat = "";
1282+
1283+ getFormats(argc, argv, outFormat, filter, consoleFormat);
1284+
1285 benchmark::Initialize(&argc, argv);
1286
1287 cli::Parser parser(argc, argv);
1288@@ -884,8 +887,20 @@ int main(int argc, char* argv[])
1289 b->Unit(benchmark::kMillisecond);
1290 }
1291
1292+ benchmark::BenchmarkReporter* console_reporter = getConsoleReporter(consoleFormat);
1293+ benchmark::BenchmarkReporter* out_file_reporter = getOutFileReporter(outFormat);
1294+
1295+ std::string spec = (filter == "" || filter == "all") ? "." : filter;
1296+
1297 // Run benchmarks
1298- benchmark::RunSpecifiedBenchmarks();
1299+ if(outFormat == "") // default case
1300+ {
1301+ benchmark::RunSpecifiedBenchmarks(console_reporter, spec);
1302+ }
1303+ else
1304+ {
1305+ benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, spec);
1306+ }
1307 CUDA_CALL(cudaStreamDestroy(stream));
1308
1309 return 0;
1310diff --git a/benchmark/benchmark_curand_generate.cpp b/benchmark/benchmark_curand_generate.cpp
1311index c3c450f..f85338b 100644
1312--- a/benchmark/benchmark_curand_generate.cpp
1313+++ b/benchmark/benchmark_curand_generate.cpp
1314@@ -1,4 +1,4 @@
1315-// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
1316+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
1317 //
1318 // Permission is hereby granted, free of charge, to any person obtaining a copy
1319 // of this software and associated documentation files (the "Software"), to deal
1320@@ -52,11 +52,17 @@ template<typename T>
1321 void run_benchmark(const cli::Parser& parser,
1322 const rng_type_t rng_type,
1323 cudaStream_t stream,
1324- generate_func_type<T> generate_func)
1325+ generate_func_type<T> generate_func,
1326+ const std::string& distribution,
1327+ const std::string& engine,
1328+ const double lambda = 0.f)
1329 {
1330- const size_t size = parser.get<size_t>("size");
1331- const size_t trials = parser.get<size_t>("trials");
1332- const size_t offset = parser.get<size_t>("offset");
1333+ const size_t size0 = parser.get<size_t>("size");
1334+ const size_t trials = parser.get<size_t>("trials");
1335+ const size_t dimensions = parser.get<size_t>("dimensions");
1336+ const size_t offset = parser.get<size_t>("offset");
1337+ const size_t size = (size0 / dimensions) * dimensions;
1338+ const std::string format = parser.get<std::string>("format");
1339
1340 T * data;
1341 CUDA_CALL(cudaMalloc(&data, size * sizeof(T)));
1342@@ -64,7 +70,6 @@ void run_benchmark(const cli::Parser& parser,
1343 curandGenerator_t generator;
1344 CURAND_CALL(curandCreateGenerator(&generator, rng_type));
1345
1346- const size_t dimensions = parser.get<size_t>("dimensions");
1347 curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
1348 if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
1349 {
1350@@ -90,7 +95,6 @@ void run_benchmark(const cli::Parser& parser,
1351 cudaEvent_t start, stop;
1352 CUDA_CALL(cudaEventCreate(&start));
1353 CUDA_CALL(cudaEventCreate(&stop));
1354-
1355 CUDA_CALL(cudaEventRecord(start, stream));
1356 for (size_t i = 0; i < trials; i++)
1357 {
1358@@ -98,21 +102,40 @@ void run_benchmark(const cli::Parser& parser,
1359 }
1360 CUDA_CALL(cudaEventRecord(stop, stream));
1361 CUDA_CALL(cudaEventSynchronize(stop));
1362-
1363 float elapsed;
1364 CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
1365-
1366 CUDA_CALL(cudaEventDestroy(stop));
1367 CUDA_CALL(cudaEventDestroy(start));
1368
1369- std::cout << std::fixed << std::setprecision(3) << " "
1370- << "Throughput = " << std::setw(8)
1371- << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30))
1372- << " GB/s, Samples = " << std::setw(8)
1373- << (trials * size) / (elapsed / 1e3 * (1 << 30))
1374- << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials
1375- << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size
1376- << std::endl;
1377+ if(format.compare("csv") == 0)
1378+ {
1379+ std::cout << std::fixed << std::setprecision(3) << engine << "," << distribution << ","
1380+ << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30)) << ","
1381+ << (trials * size) / (elapsed / 1e3 * (1 << 30)) << "," << elapsed / trials << ","
1382+ << elapsed << "," << size << ",";
1383+ if(distribution.compare("poisson") == 0 || distribution.compare("discrete-poisson") == 0)
1384+ {
1385+ std::cout << lambda;
1386+ }
1387+ std::cout << std::endl;
1388+ }
1389+ else
1390+ {
1391+ if(format.compare("console") != 0)
1392+ {
1393+ std::cout << "Unknown format specified (must be either console or csv). Defaulting to "
1394+ "console output."
1395+ << std::endl;
1396+ }
1397+ std::cout << std::fixed << std::setprecision(3) << " "
1398+ << "Throughput = " << std::setw(8)
1399+ << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30))
1400+ << " GB/s, Samples = " << std::setw(8)
1401+ << (trials * size) / (elapsed / 1e3 * (1 << 30))
1402+ << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials
1403+ << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size
1404+ << std::endl;
1405+ }
1406
1407 CURAND_CALL(curandDestroyGenerator(generator));
1408 CUDA_CALL(cudaFree(data));
1409@@ -121,94 +144,123 @@ void run_benchmark(const cli::Parser& parser,
1410 void run_benchmarks(const cli::Parser& parser,
1411 const rng_type_t rng_type,
1412 const std::string& distribution,
1413+ const std::string& engine,
1414 cudaStream_t stream)
1415 {
1416+ const std::string format = parser.get<std::string>("format");
1417 if (distribution == "uniform-uint")
1418 {
1419- if (rng_type != CURAND_RNG_QUASI_SOBOL64 &&
1420- rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
1421+ if(rng_type != CURAND_RNG_QUASI_SOBOL64 && rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
1422 {
1423- run_benchmark<unsigned int>(parser,
1424- rng_type,
1425- stream,
1426- [](curandGenerator_t gen, unsigned int* data, size_t size)
1427- { return curandGenerate(gen, data, size); });
1428+ run_benchmark<unsigned int>(
1429+ parser,
1430+ rng_type,
1431+ stream,
1432+ [](curandGenerator_t gen, unsigned int* data, size_t size)
1433+ { return curandGenerate(gen, data, size); },
1434+ distribution,
1435+ engine);
1436 }
1437 }
1438 if (distribution == "uniform-long-long")
1439 {
1440- if (rng_type == CURAND_RNG_QUASI_SOBOL64 ||
1441- rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
1442+ if(rng_type == CURAND_RNG_QUASI_SOBOL64 || rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
1443 {
1444 run_benchmark<unsigned long long>(
1445 parser,
1446 rng_type,
1447 stream,
1448 [](curandGenerator_t gen, unsigned long long* data, size_t size)
1449- { return curandGenerateLongLong(gen, data, size); });
1450+ { return curandGenerateLongLong(gen, data, size); },
1451+ distribution,
1452+ engine);
1453 }
1454 }
1455 if (distribution == "uniform-float")
1456 {
1457- run_benchmark<float>(parser,
1458- rng_type,
1459- stream,
1460- [](curandGenerator_t gen, float* data, size_t size)
1461- { return curandGenerateUniform(gen, data, size); });
1462+ run_benchmark<float>(
1463+ parser,
1464+ rng_type,
1465+ stream,
1466+ [](curandGenerator_t gen, float* data, size_t size)
1467+ { return curandGenerateUniform(gen, data, size); },
1468+ distribution,
1469+ engine);
1470 }
1471 if (distribution == "uniform-double")
1472 {
1473- run_benchmark<double>(parser,
1474- rng_type,
1475- stream,
1476- [](curandGenerator_t gen, double* data, size_t size)
1477- { return curandGenerateUniformDouble(gen, data, size); });
1478+ run_benchmark<double>(
1479+ parser,
1480+ rng_type,
1481+ stream,
1482+ [](curandGenerator_t gen, double* data, size_t size)
1483+ { return curandGenerateUniformDouble(gen, data, size); },
1484+ distribution,
1485+ engine);
1486 }
1487 if (distribution == "normal-float")
1488 {
1489- run_benchmark<float>(parser,
1490- rng_type,
1491- stream,
1492- [](curandGenerator_t gen, float* data, size_t size)
1493- { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); });
1494+ run_benchmark<float>(
1495+ parser,
1496+ rng_type,
1497+ stream,
1498+ [](curandGenerator_t gen, float* data, size_t size)
1499+ { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); },
1500+ distribution,
1501+ engine);
1502 }
1503 if (distribution == "normal-double")
1504 {
1505- run_benchmark<double>(parser,
1506- rng_type,
1507- stream,
1508- [](curandGenerator_t gen, double* data, size_t size)
1509- { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); });
1510+ run_benchmark<double>(
1511+ parser,
1512+ rng_type,
1513+ stream,
1514+ [](curandGenerator_t gen, double* data, size_t size)
1515+ { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); },
1516+ distribution,
1517+ engine);
1518 }
1519 if (distribution == "log-normal-float")
1520 {
1521- run_benchmark<float>(parser,
1522- rng_type,
1523- stream,
1524- [](curandGenerator_t gen, float* data, size_t size)
1525- { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); });
1526+ run_benchmark<float>(
1527+ parser,
1528+ rng_type,
1529+ stream,
1530+ [](curandGenerator_t gen, float* data, size_t size)
1531+ { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); },
1532+ distribution,
1533+ engine);
1534 }
1535 if (distribution == "log-normal-double")
1536 {
1537- run_benchmark<double>(parser,
1538- rng_type,
1539- stream,
1540- [](curandGenerator_t gen, double* data, size_t size)
1541- { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); });
1542+ run_benchmark<double>(
1543+ parser,
1544+ rng_type,
1545+ stream,
1546+ [](curandGenerator_t gen, double* data, size_t size)
1547+ { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); },
1548+ distribution,
1549+ engine);
1550 }
1551 if (distribution == "poisson")
1552 {
1553 const auto lambdas = parser.get<std::vector<double>>("lambda");
1554 for (double lambda : lambdas)
1555 {
1556- std::cout << " " << "lambda "
1557- << std::fixed << std::setprecision(1) << lambda << std::endl;
1558+ if(format.compare("console") == 0)
1559+ {
1560+ std::cout << " "
1561+ << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl;
1562+ }
1563 run_benchmark<unsigned int>(
1564 parser,
1565 rng_type,
1566 stream,
1567 [lambda](curandGenerator_t gen, unsigned int* data, size_t size)
1568- { return curandGeneratePoisson(gen, data, size, lambda); });
1569+ { return curandGeneratePoisson(gen, data, size, lambda); },
1570+ distribution,
1571+ engine,
1572+ lambda);
1573 }
1574 }
1575 }
1576@@ -265,6 +317,10 @@ int main(int argc, char *argv[])
1577 parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
1578 parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
1579 parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
1580+ parser.set_optional<std::string>("format",
1581+ "format",
1582+ {"console"},
1583+ "output format: console or csv");
1584 parser.run_and_exit_if_error();
1585
1586 std::vector<std::string> engines;
1587@@ -310,6 +366,7 @@ int main(int argc, char *argv[])
1588 cudaDeviceProp props;
1589 CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
1590
1591+ std::cout << "benchmark_curand_generate" << std::endl;
1592 std::cout << "cuRAND: " << version << " ";
1593 std::cout << "Runtime: " << runtime_version << " ";
1594 std::cout << "Device: " << props.name;
1595@@ -318,6 +375,17 @@ int main(int argc, char *argv[])
1596 cudaStream_t stream;
1597 CUDA_CALL(cudaStreamCreate(&stream));
1598
1599+ std::string format = parser.get<std::string>("format");
1600+ bool console_output = format.compare("console") == 0 ? true : false;
1601+
1602+ if(!console_output)
1603+ {
1604+ std::cout
1605+ << "Engine,Distribution,Throughput,Samples,AvgTime (1 Trial),Time(all),Size,Lambda"
1606+ << std::endl;
1607+ std::cout << ",,GB/s,GSample/s,ms),ms),values," << std::endl;
1608+ }
1609+
1610 for (auto engine : engines)
1611 {
1612 rng_type_t rng_type = CURAND_RNG_PSEUDO_XORWOW;
1613@@ -345,12 +413,14 @@ int main(int argc, char *argv[])
1614 exit(1);
1615 }
1616
1617- std::cout << engine << ":" << std::endl;
1618+ if(console_output)
1619+ std::cout << engine << ":" << std::endl;
1620
1621 for (auto distribution : distributions)
1622 {
1623- std::cout << " " << distribution << ":" << std::endl;
1624- run_benchmarks(parser, rng_type, distribution, stream);
1625+ if(console_output)
1626+ std::cout << " " << distribution << ":" << std::endl;
1627+ run_benchmarks(parser, rng_type, distribution, engine, stream);
1628 }
1629 std::cout << std::endl;
1630 }
1631diff --git a/benchmark/benchmark_curand_host_api.cpp b/benchmark/benchmark_curand_host_api.cpp
1632index ddb121f..e1a099d 100644
1633--- a/benchmark/benchmark_curand_host_api.cpp
1634+++ b/benchmark/benchmark_curand_host_api.cpp
1635@@ -1,4 +1,4 @@
1636-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
1637+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
1638 //
1639 // Permission is hereby granted, free of charge, to any person obtaining a copy
1640 // of this software and associated documentation files (the "Software"), to deal
1641@@ -21,22 +21,11 @@
1642 #include "benchmark_curand_utils.hpp"
1643 #include "cmdparser.hpp"
1644
1645+#include <benchmark/benchmark.h>
1646+
1647 #include <cuda_runtime.h>
1648 #include <curand.h>
1649
1650-#define CUDA_CALL(condition) \
1651- do \
1652- { \
1653- cudaError_t error_ = condition; \
1654- if(error_ != cudaSuccess) \
1655- { \
1656- std::cout << "CUDA error: " << error_ << " at " << __FILE__ << ":" << __LINE__ \
1657- << std::endl; \
1658- exit(error_); \
1659- } \
1660- } \
1661- while(0)
1662-
1663 #ifndef DEFAULT_RAND_N
1664 const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
1665 #endif
1666@@ -48,29 +37,36 @@ using generate_func_type = std::function<curandStatus_t(curandGenerator_t, T*, s
1667
1668 template<typename T>
1669 void run_benchmark(benchmark::State& state,
1670- const rng_type_t rng_type,
1671 generate_func_type<T> generate_func,
1672 const size_t size,
1673+ const bool byte_size,
1674 const size_t trials,
1675- const size_t offset,
1676 const size_t dimensions,
1677+ const size_t offset,
1678+ const rng_type_t rng_type,
1679+ const curandOrdering ordering,
1680 const bool benchmark_host,
1681 cudaStream_t stream)
1682 {
1683+ const size_t binary_div = byte_size ? sizeof(T) : 1;
1684+ const size_t rounded_size = (size / binary_div / dimensions) * dimensions;
1685+
1686 T* data;
1687 curandGenerator_t generator;
1688
1689 if(benchmark_host)
1690 {
1691- data = new T[size];
1692+ data = new T[rounded_size];
1693 CURAND_CALL(curandCreateGeneratorHost(&generator, rng_type));
1694 }
1695 else
1696 {
1697- CUDA_CALL(cudaMalloc(&data, size * sizeof(T)));
1698+ CUDA_CALL(cudaMalloc(&data, rounded_size * sizeof(T)));
1699 CURAND_CALL(curandCreateGenerator(&generator, rng_type));
1700 }
1701
1702+ CURAND_CALL(curandSetGeneratorOrdering(generator, ordering));
1703+
1704 curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
1705 if(status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
1706 {
1707@@ -88,7 +84,7 @@ void run_benchmark(benchmark::State& state,
1708 // Warm-up
1709 for(size_t i = 0; i < 15; i++)
1710 {
1711- CURAND_CALL(generate_func(generator, data, size));
1712+ CURAND_CALL(generate_func(generator, data, rounded_size));
1713 }
1714 CUDA_CALL(cudaDeviceSynchronize());
1715
1716@@ -96,28 +92,27 @@ void run_benchmark(benchmark::State& state,
1717 cudaEvent_t start, stop;
1718 CUDA_CALL(cudaEventCreate(&start));
1719 CUDA_CALL(cudaEventCreate(&stop));
1720-
1721 for(auto _ : state)
1722 {
1723 CUDA_CALL(cudaEventRecord(start, stream));
1724 for(size_t i = 0; i < trials; i++)
1725 {
1726- CURAND_CALL(generate_func(generator, data, size));
1727+ CURAND_CALL(generate_func(generator, data, rounded_size));
1728 }
1729 CUDA_CALL(cudaEventRecord(stop, stream));
1730 CUDA_CALL(cudaEventSynchronize(stop));
1731
1732 float elapsed = 0.0f;
1733 CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
1734+
1735 state.SetIterationTime(elapsed / 1000.f);
1736 }
1737
1738- state.SetBytesProcessed(trials * state.iterations() * size * sizeof(T));
1739- state.SetItemsProcessed(trials * state.iterations() * size);
1740+ state.SetBytesProcessed(trials * state.iterations() * rounded_size * sizeof(T));
1741+ state.SetItemsProcessed(trials * state.iterations() * rounded_size);
1742
1743 CUDA_CALL(cudaEventDestroy(stop));
1744 CUDA_CALL(cudaEventDestroy(start));
1745-
1746 CURAND_CALL(curandDestroyGenerator(generator));
1747
1748 if(benchmark_host)
1749@@ -133,6 +128,10 @@ void run_benchmark(benchmark::State& state,
1750 void configure_parser(cli::Parser& parser)
1751 {
1752 parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
1753+ parser.set_optional<bool>("byte-size",
1754+ "byte-size",
1755+ false,
1756+ "--size is interpreted as the number of generated bytes");
1757 parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
1758 parser.set_optional<size_t>("offset", "offset", 0, "offset of generated pseudo-random values");
1759 parser.set_optional<size_t>("dimensions",
1760@@ -152,6 +151,14 @@ void configure_parser(cli::Parser& parser)
1761
1762 int main(int argc, char* argv[])
1763 {
1764+ // get parameters before they are passed into benchmark::Initialize()
1765+ std::string outFormat = "";
1766+ std::string filter = "";
1767+ std::string consoleFormat = "";
1768+
1769+ getFormats(argc, argv, outFormat, filter, consoleFormat);
1770+
1771+ // Parse argv
1772 benchmark::Initialize(&argc, argv);
1773
1774 // Parse arguments from command line
1775@@ -165,6 +172,7 @@ int main(int argc, char* argv[])
1776 add_common_benchmark_curand_info();
1777
1778 const size_t size = parser.get<size_t>("size");
1779+ const bool byte_size = parser.get<bool>("byte-size");
1780 const size_t trials = parser.get<size_t>("trials");
1781 const size_t offset = parser.get<size_t>("offset");
1782 const size_t dimensions = parser.get<size_t>("dimensions");
1783@@ -172,12 +180,13 @@ int main(int argc, char* argv[])
1784 const bool benchmark_host = parser.get<bool>("host");
1785
1786 benchmark::AddCustomContext("size", std::to_string(size));
1787+ benchmark::AddCustomContext("byte-size", std::to_string(byte_size));
1788 benchmark::AddCustomContext("trials", std::to_string(trials));
1789 benchmark::AddCustomContext("offset", std::to_string(offset));
1790 benchmark::AddCustomContext("dimensions", std::to_string(dimensions));
1791 benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host));
1792
1793- const std::vector<rng_type_t> engine_types{
1794+ const std::vector<rng_type_t> benchmarked_engine_types{
1795 CURAND_RNG_PSEUDO_MT19937,
1796 CURAND_RNG_PSEUDO_MTGP32,
1797 CURAND_RNG_PSEUDO_MRG32K3A,
1798@@ -189,147 +198,214 @@ int main(int argc, char* argv[])
1799 CURAND_RNG_PSEUDO_XORWOW,
1800 };
1801
1802- const std::string benchmark_name_prefix = "device_generate";
1803- std::vector<benchmark::internal::Benchmark*> benchmarks = {};
1804+ const std::map<curandOrdering, std::string> ordering_name_map{
1805+ {CURAND_ORDERING_PSEUDO_DEFAULT, "default"},
1806+ { CURAND_ORDERING_PSEUDO_LEGACY, "legacy"},
1807+ { CURAND_ORDERING_PSEUDO_BEST, "best"},
1808+ {CURAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"},
1809+ { CURAND_ORDERING_PSEUDO_SEEDED, "seeded"},
1810+ { CURAND_ORDERING_QUASI_DEFAULT, "default"},
1811+ };
1812
1813+ const std::map<rng_type_t, std::vector<curandOrdering>> benchmarked_orderings{
1814+ // clang-format off
1815+ { CURAND_RNG_PSEUDO_MTGP32,
1816+ {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC}},
1817+ { CURAND_RNG_PSEUDO_MT19937, {CURAND_ORDERING_PSEUDO_DEFAULT}},
1818+ { CURAND_RNG_PSEUDO_XORWOW,
1819+ {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC} },
1820+ { CURAND_RNG_PSEUDO_MRG32K3A,
1821+ {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC}},
1822+ { CURAND_RNG_PSEUDO_PHILOX4_32_10,
1823+ {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC}},
1824+ { CURAND_RNG_QUASI_SOBOL32, {CURAND_ORDERING_QUASI_DEFAULT}},
1825+ {CURAND_RNG_QUASI_SCRAMBLED_SOBOL32, {CURAND_ORDERING_QUASI_DEFAULT}},
1826+ { CURAND_RNG_QUASI_SOBOL64, {CURAND_ORDERING_QUASI_DEFAULT}},
1827+ {CURAND_RNG_QUASI_SCRAMBLED_SOBOL64, {CURAND_ORDERING_QUASI_DEFAULT}},
1828+ // clang-format on
1829+ };
1830+
1831+ const std::string benchmark_name_prefix = "device_generate";
1832 // Add benchmarks
1833- for(const rng_type_t engine_type : engine_types)
1834+ std::vector<benchmark::internal::Benchmark*> benchmarks = {};
1835+ for(const rng_type_t engine_type : benchmarked_engine_types)
1836 {
1837- const std::string benchmark_name_engine
1838- = benchmark_name_prefix + "<" + engine_name(engine_type) + ",";
1839+ const std::string name = engine_name(engine_type);
1840+ for(const curandOrdering ordering : benchmarked_orderings.at(engine_type))
1841+ {
1842+ const std::string name_engine_prefix
1843+ = benchmark_name_prefix + "<" + name + "," + ordering_name_map.at(ordering) + ",";
1844+
1845+ if(engine_type != CURAND_RNG_QUASI_SOBOL64
1846+ && engine_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
1847+
1848+ {
1849+ benchmarks.emplace_back(benchmark::RegisterBenchmark(
1850+ (name_engine_prefix + "uniform-uint>").c_str(),
1851+ &run_benchmark<unsigned int>,
1852+ [](curandGenerator_t gen, unsigned int* data, size_t size_gen)
1853+ { return curandGenerate(gen, data, size_gen); },
1854+ size,
1855+ byte_size,
1856+ trials,
1857+ dimensions,
1858+ offset,
1859+ engine_type,
1860+ ordering,
1861+ benchmark_host,
1862+ stream));
1863+ }
1864+ else
1865+ {
1866+ benchmarks.emplace_back(benchmark::RegisterBenchmark(
1867+ (name_engine_prefix + "uniform-long-long>").c_str(),
1868+ &run_benchmark<unsigned long long>,
1869+ [](curandGenerator_t gen, unsigned long long* data, size_t size)
1870+ { return curandGenerateLongLong(gen, data, size); },
1871+ size,
1872+ byte_size,
1873+ trials,
1874+ dimensions,
1875+ offset,
1876+ engine_type,
1877+ ordering,
1878+ benchmark_host,
1879+ stream));
1880+ }
1881+
1882+ benchmarks.emplace_back(
1883+ benchmark::RegisterBenchmark((name_engine_prefix + "uniform-float>").c_str(),
1884+ &run_benchmark<float>,
1885+ [](curandGenerator_t gen, float* data, size_t size_gen)
1886+ { return curandGenerateUniform(gen, data, size_gen); },
1887+ size,
1888+ byte_size,
1889+ trials,
1890+ dimensions,
1891+ offset,
1892+ engine_type,
1893+ ordering,
1894+ benchmark_host,
1895+ stream));
1896
1897- if(engine_type != CURAND_RNG_QUASI_SOBOL64
1898- && engine_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
1899 benchmarks.emplace_back(benchmark::RegisterBenchmark(
1900- (benchmark_name_engine + "uniform-uint>").c_str(),
1901- &run_benchmark<unsigned int>,
1902- engine_type,
1903- [](curandGenerator_t gen, unsigned int* data, size_t size)
1904- { return curandGenerate(gen, data, size); },
1905+ (name_engine_prefix + "uniform-double>").c_str(),
1906+ &run_benchmark<double>,
1907+ [](curandGenerator_t gen, double* data, size_t size_gen)
1908+ { return curandGenerateUniformDouble(gen, data, size_gen); },
1909 size,
1910+ byte_size,
1911 trials,
1912- offset,
1913 dimensions,
1914+ offset,
1915+ engine_type,
1916+ ordering,
1917 benchmark_host,
1918 stream));
1919- else
1920+
1921 benchmarks.emplace_back(benchmark::RegisterBenchmark(
1922- (benchmark_name_engine + "uniform-long-long>").c_str(),
1923- &run_benchmark<unsigned long long>,
1924- engine_type,
1925- [](curandGenerator_t gen, unsigned long long* data, size_t size)
1926- { return curandGenerateLongLong(gen, data, size); },
1927+ (name_engine_prefix + "normal-float>").c_str(),
1928+ &run_benchmark<float>,
1929+ [](curandGenerator_t gen, float* data, size_t size_gen)
1930+ { return curandGenerateNormal(gen, data, size_gen, 0.0f, 1.0f); },
1931 size,
1932+ byte_size,
1933 trials,
1934- offset,
1935 dimensions,
1936+ offset,
1937+ engine_type,
1938+ ordering,
1939 benchmark_host,
1940 stream));
1941
1942- benchmarks.emplace_back(
1943- benchmark::RegisterBenchmark((benchmark_name_engine + "uniform-float>").c_str(),
1944- &run_benchmark<float>,
1945- engine_type,
1946- [](curandGenerator_t gen, float* data, size_t size)
1947- { return curandGenerateUniform(gen, data, size); },
1948- size,
1949- trials,
1950- offset,
1951- dimensions,
1952- benchmark_host,
1953- stream));
1954-
1955- benchmarks.emplace_back(
1956- benchmark::RegisterBenchmark((benchmark_name_engine + "uniform-double>").c_str(),
1957- &run_benchmark<double>,
1958- engine_type,
1959- [](curandGenerator_t gen, double* data, size_t size)
1960- { return curandGenerateUniformDouble(gen, data, size); },
1961- size,
1962- trials,
1963- offset,
1964- dimensions,
1965- benchmark_host,
1966- stream));
1967-
1968- benchmarks.emplace_back(benchmark::RegisterBenchmark(
1969- (benchmark_name_engine + "normal-float>").c_str(),
1970- &run_benchmark<float>,
1971- engine_type,
1972- [](curandGenerator_t gen, float* data, size_t size)
1973- { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); },
1974- size,
1975- trials,
1976- offset,
1977- dimensions,
1978- benchmark_host,
1979- stream));
1980-
1981- benchmarks.emplace_back(benchmark::RegisterBenchmark(
1982- (benchmark_name_engine + "normal-double>").c_str(),
1983- &run_benchmark<double>,
1984- engine_type,
1985- [](curandGenerator_t gen, double* data, size_t size)
1986- { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); },
1987- size,
1988- trials,
1989- offset,
1990- dimensions,
1991- benchmark_host,
1992- stream));
1993-
1994- benchmarks.emplace_back(benchmark::RegisterBenchmark(
1995- (benchmark_name_engine + "log-normal-float>").c_str(),
1996- &run_benchmark<float>,
1997- engine_type,
1998- [](curandGenerator_t gen, float* data, size_t size)
1999- { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); },
2000- size,
2001- trials,
2002- offset,
2003- dimensions,
2004- benchmark_host,
2005- stream));
2006-
2007- benchmarks.emplace_back(benchmark::RegisterBenchmark(
2008- (benchmark_name_engine + "log-normal-double>").c_str(),
2009- &run_benchmark<double>,
2010- engine_type,
2011- [](curandGenerator_t gen, double* data, size_t size)
2012- { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); },
2013- size,
2014- trials,
2015- offset,
2016- dimensions,
2017- benchmark_host,
2018- stream));
2019-
2020- for(auto lambda : poisson_lambdas)
2021- {
2022- const std::string poisson_dis_name
2023- = std::string("poisson(lambda=") + std::to_string(lambda) + ")>";
2024-
2025 benchmarks.emplace_back(benchmark::RegisterBenchmark(
2026- (benchmark_name_engine + poisson_dis_name).c_str(),
2027- &run_benchmark<unsigned int>,
2028+ (name_engine_prefix + "normal-double>").c_str(),
2029+ &run_benchmark<double>,
2030+ [](curandGenerator_t gen, double* data, size_t size_gen)
2031+ { return curandGenerateNormalDouble(gen, data, size_gen, 0.0, 1.0); },
2032+ size,
2033+ byte_size,
2034+ trials,
2035+ dimensions,
2036+ offset,
2037 engine_type,
2038- [lambda](curandGenerator_t gen, unsigned int* data, size_t size)
2039- { return curandGeneratePoisson(gen, data, size, lambda); },
2040+ ordering,
2041+ benchmark_host,
2042+ stream));
2043+
2044+ benchmarks.emplace_back(benchmark::RegisterBenchmark(
2045+ (name_engine_prefix + "log-normal-float>").c_str(),
2046+ &run_benchmark<float>,
2047+ [](curandGenerator_t gen, float* data, size_t size_gen)
2048+ { return curandGenerateLogNormal(gen, data, size_gen, 0.0f, 1.0f); },
2049 size,
2050+ byte_size,
2051 trials,
2052+ dimensions,
2053 offset,
2054+ engine_type,
2055+ ordering,
2056+ benchmark_host,
2057+ stream));
2058+
2059+ benchmarks.emplace_back(benchmark::RegisterBenchmark(
2060+ (name_engine_prefix + "log-normal-double>").c_str(),
2061+ &run_benchmark<double>,
2062+ [](curandGenerator_t gen, double* data, size_t size_gen)
2063+ { return curandGenerateLogNormalDouble(gen, data, size_gen, 0.0, 1.0); },
2064+ size,
2065+ byte_size,
2066+ trials,
2067 dimensions,
2068+ offset,
2069+ engine_type,
2070+ ordering,
2071 benchmark_host,
2072 stream));
2073+
2074+ for(auto lambda : poisson_lambdas)
2075+ {
2076+ const std::string poisson_dis_name
2077+ = std::string("poisson(lambda=") + std::to_string(lambda) + ")>";
2078+ benchmarks.emplace_back(benchmark::RegisterBenchmark(
2079+ (name_engine_prefix + poisson_dis_name).c_str(),
2080+ &run_benchmark<unsigned int>,
2081+ [lambda](curandGenerator_t gen, unsigned int* data, size_t size_gen)
2082+ { return curandGeneratePoisson(gen, data, size_gen, lambda); },
2083+ size,
2084+ byte_size,
2085+ trials,
2086+ dimensions,
2087+ offset,
2088+ engine_type,
2089+ ordering,
2090+ benchmark_host,
2091+ stream));
2092+ }
2093 }
2094 }
2095- // Use manual timing
2096+
2097 for(auto& b : benchmarks)
2098 {
2099 b->UseManualTime();
2100 b->Unit(benchmark::kMillisecond);
2101 }
2102- benchmark::RunSpecifiedBenchmarks();
2103+
2104+ benchmark::BenchmarkReporter* console_reporter = getConsoleReporter(consoleFormat);
2105+ benchmark::BenchmarkReporter* out_file_reporter = getOutFileReporter(outFormat);
2106+
2107+ std::string spec = (filter == "" || filter == "all") ? "." : filter;
2108+
2109+ // Run benchmarks
2110+ if(outFormat == "") // default case
2111+ {
2112+ benchmark::RunSpecifiedBenchmarks(console_reporter, spec);
2113+ }
2114+ else
2115+ {
2116+ benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, spec);
2117+ }
2118+
2119 CUDA_CALL(cudaStreamDestroy(stream));
2120
2121 return 0;
2122diff --git a/benchmark/benchmark_curand_kernel.cpp b/benchmark/benchmark_curand_kernel.cpp
2123index 6c205ff..df3e792 100644
2124--- a/benchmark/benchmark_curand_kernel.cpp
2125+++ b/benchmark/benchmark_curand_kernel.cpp
2126@@ -1,4 +1,4 @@
2127-// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
2128+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2129 //
2130 // Permission is hereby granted, free of charge, to any person obtaining a copy
2131 // of this software and associated documentation files (the "Software"), to deal
2132@@ -36,7 +36,7 @@
2133 #include <curand_mtgp32_host.h>
2134 #include <curand_mtgp32dc_p_11213.h>
2135
2136-#define CUPRAND_DEFAULT_MAX_BLOCK_SIZE 256
2137+#define CURAND_DEFAULT_MAX_BLOCK_SIZE 256
2138
2139 #define CUDA_CALL(x) do { \
2140 cudaError_t error = (x);\
2141@@ -63,8 +63,8 @@ size_t next_power2(size_t x)
2142
2143 template<typename GeneratorState>
2144 __global__
2145-__launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE)
2146-void init_kernel(GeneratorState * states,
2147+__launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
2148+void init_kernel(GeneratorState* states,
2149 const unsigned long long seed,
2150 const unsigned long long offset)
2151 {
2152@@ -76,12 +76,12 @@ void init_kernel(GeneratorState * states,
2153
2154 template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
2155 __global__
2156-__launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE)
2157-void generate_kernel(GeneratorState * states,
2158- T * data,
2159- const size_t size,
2160- GenerateFunc generate_func,
2161- const Extra extra)
2162+__launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
2163+void generate_kernel(GeneratorState* states,
2164+ T* data,
2165+ const size_t size,
2166+ GenerateFunc generate_func,
2167+ const Extra extra)
2168 {
2169 const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
2170 const unsigned int stride = gridDim.x * blockDim.x;
2171@@ -112,7 +112,7 @@ struct runner
2172
2173 init_kernel<<<blocks, threads>>>(states, seed, offset);
2174
2175- CUDA_CALL(cudaPeekAtLastError());
2176+ CUDA_CALL(cudaGetLastError());
2177 CUDA_CALL(cudaDeviceSynchronize());
2178 }
2179
2180@@ -127,12 +127,13 @@ struct runner
2181 }
2182
2183 template<typename T, typename GenerateFunc, typename Extra>
2184- void generate(const size_t blocks,
2185- const size_t threads,
2186- T * data,
2187- const size_t size,
2188+ void generate(const size_t blocks,
2189+ const size_t threads,
2190+ cudaStream_t stream,
2191+ T* data,
2192+ const size_t size,
2193 const GenerateFunc& generate_func,
2194- const Extra extra)
2195+ const Extra extra)
2196 {
2197 generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
2198 }
2199@@ -140,12 +141,12 @@ struct runner
2200
2201 template<typename T, typename GenerateFunc, typename Extra>
2202 __global__
2203-__launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE)
2204-void generate_kernel(curandStateMtgp32_t * states,
2205- T * data,
2206- const size_t size,
2207- GenerateFunc generate_func,
2208- const Extra extra)
2209+__launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
2210+void generate_kernel(curandStateMtgp32_t* states,
2211+ T* data,
2212+ const size_t size,
2213+ GenerateFunc generate_func,
2214+ const Extra extra)
2215 {
2216 const unsigned int state_id = blockIdx.x;
2217 const unsigned int thread_id = threadIdx.x;
2218@@ -159,7 +160,13 @@ void generate_kernel(curandStateMtgp32_t * states,
2219 __syncthreads();
2220
2221 const size_t r = size%blockDim.x;
2222- const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
2223+ const size_t size_rounded_down = size - r;
2224+ const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x;
2225+ while(index < size_rounded_down)
2226+ {
2227+ data[index] = generate_func(&state, extra);
2228+ index += stride;
2229+ }
2230 while(index < size_rounded_up)
2231 {
2232 auto value = generate_func(&state, extra);
2233@@ -207,18 +214,23 @@ struct runner<curandStateMtgp32_t>
2234 template<typename T, typename GenerateFunc, typename Extra>
2235 void generate(const size_t blocks,
2236 const size_t /* threads */,
2237- T * data,
2238- const size_t size,
2239+ cudaStream_t stream,
2240+ T* data,
2241+ const size_t size,
2242 const GenerateFunc& generate_func,
2243- const Extra extra)
2244+ const Extra extra)
2245 {
2246- generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
2247+ generate_kernel<<<std::min((size_t)200, blocks), 256, 0, stream>>>(states,
2248+ data,
2249+ size,
2250+ generate_func,
2251+ extra);
2252 }
2253 };
2254
2255 template<typename GeneratorState, typename SobolType>
2256-__global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel(
2257- GeneratorState* states, SobolType* directions, SobolType offset)
2258+__global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
2259+void init_sobol_kernel(GeneratorState* states, SobolType* directions, SobolType offset)
2260 {
2261 const unsigned int dimension = blockIdx.y;
2262 const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
2263@@ -228,8 +240,11 @@ __global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_ker
2264 }
2265
2266 template<typename GeneratorState, typename SobolType>
2267-__global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel(
2268- GeneratorState* states, SobolType* directions, SobolType* scramble_constants, SobolType offset)
2269+__global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
2270+void init_scrambled_sobol_kernel(GeneratorState* states,
2271+ SobolType* directions,
2272+ SobolType* scramble_constants,
2273+ SobolType offset)
2274 {
2275 const unsigned int dimension = blockIdx.y;
2276 const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
2277@@ -243,12 +258,12 @@ __global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled
2278
2279 // generate_kernel for the sobol generators
2280 template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
2281-__global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel(
2282- GeneratorState* states,
2283- T* data,
2284- const size_t size,
2285- GenerateFunc generate_func,
2286- const Extra extra)
2287+__global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
2288+void generate_sobol_kernel(GeneratorState* states,
2289+ T* data,
2290+ const size_t size,
2291+ GenerateFunc generate_func,
2292+ const Extra extra)
2293 {
2294 const unsigned int dimension = blockIdx.y;
2295 const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
2296@@ -282,11 +297,13 @@ struct runner<curandStateSobol32_t>
2297 {
2298 this->dimensions = dimensions;
2299
2300+ curandDirectionVectors32_t* h_directions;
2301+ CURAND_CALL(
2302+ curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
2303+
2304 const size_t states_size = blocks * threads * dimensions;
2305 CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol32_t)));
2306
2307- curandDirectionVectors32_t * h_directions;
2308- CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
2309 unsigned int* directions;
2310 const size_t size = dimensions * sizeof(unsigned int) * 32;
2311 CUDA_CALL(cudaMalloc(&directions, size));
2312@@ -298,7 +315,7 @@ struct runner<curandStateSobol32_t>
2313 directions,
2314 static_cast<unsigned int>(offset));
2315
2316- CUDA_CALL(cudaPeekAtLastError());
2317+ CUDA_CALL(cudaGetLastError());
2318 CUDA_CALL(cudaDeviceSynchronize());
2319
2320 CUDA_CALL(cudaFree(directions));
2321@@ -315,19 +332,20 @@ struct runner<curandStateSobol32_t>
2322 }
2323
2324 template<typename T, typename GenerateFunc, typename Extra>
2325- void generate(const size_t blocks,
2326- const size_t threads,
2327- T * data,
2328- const size_t size,
2329+ void generate(const size_t blocks,
2330+ const size_t threads,
2331+ cudaStream_t stream,
2332+ T* data,
2333+ const size_t size,
2334 const GenerateFunc& generate_func,
2335- const Extra extra)
2336+ const Extra extra)
2337 {
2338 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
2339- generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
2340- data,
2341- size / dimensions,
2342- generate_func,
2343- extra);
2344+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads, 0, stream>>>(states,
2345+ data,
2346+ size / dimensions,
2347+ generate_func,
2348+ extra);
2349 }
2350 };
2351
2352@@ -345,26 +363,26 @@ struct runner<curandStateScrambledSobol32_t>
2353 {
2354 this->dimensions = dimensions;
2355
2356- const size_t states_size = blocks * threads * dimensions;
2357- CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
2358-
2359 curandDirectionVectors32_t* h_directions;
2360+ unsigned int* h_constants;
2361+
2362 CURAND_CALL(
2363 curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
2364+ CURAND_CALL(curandGetScrambleConstants32(&h_constants));
2365+
2366+ const size_t states_size = blocks * threads * dimensions;
2367+ CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
2368+
2369 unsigned int* directions;
2370- const size_t size = dimensions * sizeof(unsigned int) * 32;
2371- CUDA_CALL(cudaMalloc(&directions, size));
2372- CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
2373+ const size_t directions_size = dimensions * sizeof(unsigned int) * 32;
2374+ CUDA_CALL(cudaMalloc(&directions, directions_size));
2375+ CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
2376
2377- unsigned int* h_scramble_constants;
2378- CURAND_CALL(curandGetScrambleConstants32(&h_scramble_constants));
2379 unsigned int* scramble_constants;
2380 const size_t constants_size = dimensions * sizeof(unsigned int);
2381 CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
2382- CUDA_CALL(cudaMemcpy(scramble_constants,
2383- h_scramble_constants,
2384- constants_size,
2385- cudaMemcpyHostToDevice));
2386+ CUDA_CALL(
2387+ cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
2388
2389 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
2390 init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(
2391@@ -373,7 +391,7 @@ struct runner<curandStateScrambledSobol32_t>
2392 scramble_constants,
2393 static_cast<unsigned int>(offset));
2394
2395- CUDA_CALL(cudaPeekAtLastError());
2396+ CUDA_CALL(cudaGetLastError());
2397 CUDA_CALL(cudaDeviceSynchronize());
2398
2399 CUDA_CALL(cudaFree(directions));
2400@@ -393,17 +411,18 @@ struct runner<curandStateScrambledSobol32_t>
2401 template<typename T, typename GenerateFunc, typename Extra>
2402 void generate(const size_t blocks,
2403 const size_t threads,
2404+ cudaStream_t stream,
2405 T* data,
2406 const size_t size,
2407 const GenerateFunc& generate_func,
2408 const Extra extra)
2409 {
2410 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
2411- generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
2412- data,
2413- size / dimensions,
2414- generate_func,
2415- extra);
2416+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads, 0, stream>>>(states,
2417+ data,
2418+ size / dimensions,
2419+ generate_func,
2420+ extra);
2421 }
2422 };
2423
2424@@ -421,11 +440,13 @@ struct runner<curandStateSobol64_t>
2425 {
2426 this->dimensions = dimensions;
2427
2428+ curandDirectionVectors64_t* h_directions;
2429+ CURAND_CALL(
2430+ curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
2431+
2432 const size_t states_size = blocks * threads * dimensions;
2433 CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol64_t)));
2434
2435- curandDirectionVectors64_t * h_directions;
2436- CURAND_CALL(curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
2437 unsigned long long int* directions;
2438 const size_t size = dimensions * sizeof(unsigned long long) * 64;
2439 CUDA_CALL(cudaMalloc(&directions, size));
2440@@ -434,7 +455,7 @@ struct runner<curandStateSobol64_t>
2441 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
2442 init_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
2443
2444- CUDA_CALL(cudaPeekAtLastError());
2445+ CUDA_CALL(cudaGetLastError());
2446 CUDA_CALL(cudaDeviceSynchronize());
2447
2448 CUDA_CALL(cudaFree(directions));
2449@@ -453,6 +474,7 @@ struct runner<curandStateSobol64_t>
2450 template<typename T, typename GenerateFunc, typename Extra>
2451 void generate(const size_t blocks,
2452 const size_t threads,
2453+ cudaStream_t stream,
2454 T* data,
2455 const size_t size,
2456 const GenerateFunc& generate_func,
2457@@ -481,26 +503,26 @@ struct runner<curandStateScrambledSobol64_t>
2458 {
2459 this->dimensions = dimensions;
2460
2461- const size_t states_size = blocks * threads * dimensions;
2462- CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
2463-
2464 curandDirectionVectors64_t* h_directions;
2465+ unsigned long long* h_constants;
2466+
2467 CURAND_CALL(
2468 curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
2469+ CURAND_CALL(curandGetScrambleConstants64(&h_constants));
2470+
2471+ const size_t states_size = blocks * threads * dimensions;
2472+ CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
2473+
2474 unsigned long long* directions;
2475- const size_t size = dimensions * sizeof(unsigned long long) * 64;
2476- CUDA_CALL(cudaMalloc(&directions, size));
2477- CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
2478+ const size_t directions_size = dimensions * sizeof(unsigned long long) * 64;
2479+ CUDA_CALL(cudaMalloc(&directions, directions_size));
2480+ CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
2481
2482- unsigned long long* h_scramble_constants;
2483- CURAND_CALL(curandGetScrambleConstants64(&h_scramble_constants));
2484 unsigned long long* scramble_constants;
2485 const size_t constants_size = dimensions * sizeof(unsigned long long);
2486 CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
2487- CUDA_CALL(cudaMemcpy(scramble_constants,
2488- h_scramble_constants,
2489- constants_size,
2490- cudaMemcpyHostToDevice));
2491+ CUDA_CALL(
2492+ cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
2493
2494 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
2495 init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
2496@@ -508,7 +530,7 @@ struct runner<curandStateScrambledSobol64_t>
2497 scramble_constants,
2498 offset);
2499
2500- CUDA_CALL(cudaPeekAtLastError());
2501+ CUDA_CALL(cudaGetLastError());
2502 CUDA_CALL(cudaDeviceSynchronize());
2503
2504 CUDA_CALL(cudaFree(directions));
2505@@ -526,12 +548,13 @@ struct runner<curandStateScrambledSobol64_t>
2506 }
2507
2508 template<typename T, typename GenerateFunc, typename Extra>
2509- void generate(const size_t blocks,
2510- const size_t threads,
2511- T * data,
2512- const size_t size,
2513+ void generate(const size_t blocks,
2514+ const size_t threads,
2515+ cudaStream_t stream,
2516+ T* data,
2517+ const size_t size,
2518 const GenerateFunc& generate_func,
2519- const Extra extra)
2520+ const Extra extra)
2521 {
2522 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
2523 generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
2524@@ -543,9 +566,13 @@ struct runner<curandStateScrambledSobol64_t>
2525 };
2526
2527 template<typename T, typename GeneratorState, typename GenerateFunc, typename Extra>
2528-void run_benchmark(const cli::Parser& parser,
2529+void run_benchmark(const cli::Parser& parser,
2530+ cudaStream_t stream,
2531 const GenerateFunc& generate_func,
2532- const Extra extra)
2533+ const Extra extra,
2534+ const std::string& distribution,
2535+ const std::string& engine,
2536+ const double lambda = 0.f)
2537 {
2538 const size_t size = parser.get<size_t>("size");
2539 const size_t dimensions = parser.get<size_t>("dimensions");
2540@@ -554,6 +581,8 @@ void run_benchmark(const cli::Parser& parser,
2541 const size_t blocks = parser.get<size_t>("blocks");
2542 const size_t threads = parser.get<size_t>("threads");
2543
2544+ const std::string format = parser.get<std::string>("format");
2545+
2546 T * data;
2547 CUDA_CALL(cudaMalloc(&data, size * sizeof(T)));
2548
2549@@ -562,129 +591,170 @@ void run_benchmark(const cli::Parser& parser,
2550 // Warm-up
2551 for (size_t i = 0; i < 5; i++)
2552 {
2553- r.generate(blocks, threads, data, size, generate_func, extra);
2554- CUDA_CALL(cudaPeekAtLastError());
2555+ r.generate(blocks, threads, stream, data, size, generate_func, extra);
2556+ CUDA_CALL(cudaGetLastError());
2557 CUDA_CALL(cudaDeviceSynchronize());
2558 }
2559 CUDA_CALL(cudaDeviceSynchronize());
2560
2561 // Measurement
2562- auto start = std::chrono::high_resolution_clock::now();
2563+ cudaEvent_t start, stop;
2564+ CUDA_CALL(cudaEventCreate(&start));
2565+ CUDA_CALL(cudaEventCreate(&stop));
2566+ CUDA_CALL(cudaEventRecord(start, stream));
2567 for (size_t i = 0; i < trials; i++)
2568 {
2569- r.generate(blocks, threads, data, size, generate_func, extra);
2570+ r.generate(blocks, threads, stream, data, size, generate_func, extra);
2571+ }
2572+ CUDA_CALL(cudaEventRecord(stop, stream));
2573+ CUDA_CALL(cudaEventSynchronize(stop));
2574+ float elapsed;
2575+ CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
2576+ CUDA_CALL(cudaEventDestroy(start));
2577+ CUDA_CALL(cudaEventDestroy(stop));
2578+
2579+ if(format.compare("csv") == 0)
2580+ {
2581+ std::cout << std::fixed << std::setprecision(3) << engine << "," << distribution << ","
2582+ << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30)) << ","
2583+ << (trials * size) / (elapsed / 1e3 * (1 << 30)) << "," << elapsed / trials << ","
2584+ << elapsed << "," << size << ",";
2585+ if(distribution.compare("poisson") == 0 || distribution.compare("discrete-poisson") == 0)
2586+ {
2587+ std::cout << lambda;
2588+ }
2589+ std::cout << std::endl;
2590+ }
2591+ else
2592+ {
2593+ if(format.compare("console") != 0)
2594+ {
2595+ std::cout << "Unknown format specified (must be either console or csv). Defaulting to "
2596+ "console output."
2597+ << std::endl;
2598+ }
2599+ std::cout << std::fixed << std::setprecision(3) << " "
2600+ << "Throughput = " << std::setw(8)
2601+ << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30))
2602+ << " GB/s, Samples = " << std::setw(8)
2603+ << (trials * size) / (elapsed / 1e3 * (1 << 30))
2604+ << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials
2605+ << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size
2606+ << std::endl;
2607 }
2608- CUDA_CALL(cudaPeekAtLastError());
2609- CUDA_CALL(cudaDeviceSynchronize());
2610- auto end = std::chrono::high_resolution_clock::now();
2611- std::chrono::duration<double, std::milli> elapsed = end - start;
2612-
2613- std::cout << std::fixed << std::setprecision(3)
2614- << " "
2615- << "Throughput = "
2616- << std::setw(8) << (trials * size * sizeof(T)) /
2617- (elapsed.count() / 1e3 * (1 << 30))
2618- << " GB/s, Samples = "
2619- << std::setw(8) << (trials * size) /
2620- (elapsed.count() / 1e3 * (1 << 30))
2621- << " GSample/s, AvgTime (1 trial) = "
2622- << std::setw(8) << elapsed.count() / trials
2623- << " ms, Time (all) = "
2624- << std::setw(8) << elapsed.count()
2625- << " ms, Size = " << size
2626- << std::endl;
2627
2628 CUDA_CALL(cudaFree(data));
2629 }
2630
2631 template<typename GeneratorState>
2632 void run_benchmarks(const cli::Parser& parser,
2633- const std::string& distribution)
2634+ const std::string& distribution,
2635+ const std::string& engine,
2636+ cudaStream_t stream)
2637 {
2638+ const std::string format = parser.get<std::string>("format");
2639 if (distribution == "uniform-uint")
2640 {
2641- if (!std::is_same<GeneratorState, curandStateSobol64_t>::value &&
2642- !std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
2643- {
2644- run_benchmark<unsigned int, GeneratorState>(parser,
2645- [] __device__ (GeneratorState * state, int) {
2646- return curand(state);
2647- }, 0
2648- );
2649- }
2650+ run_benchmark<unsigned int, GeneratorState>(
2651+ parser,
2652+ stream,
2653+ [] __device__ ( GeneratorState* state, int) { return curand(state); },
2654+ 0,
2655+ distribution,
2656+ engine);
2657 }
2658 if (distribution == "uniform-long-long")
2659 {
2660 if (std::is_same<GeneratorState, curandStateSobol64_t>::value ||
2661 std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
2662 {
2663- run_benchmark<unsigned long long, GeneratorState>(parser,
2664- [] __device__ (GeneratorState * state, int) {
2665- return curand(state);
2666- }, 0
2667- );
2668+ run_benchmark<unsigned long long, GeneratorState>(
2669+ parser,
2670+ stream,
2671+ [] __device__ ( GeneratorState* state, int) { return curand(state); },
2672+ 0,
2673+ distribution,
2674+ engine);
2675 }
2676 }
2677 if (distribution == "uniform-float")
2678 {
2679- run_benchmark<float, GeneratorState>(parser,
2680- [] __device__ (GeneratorState * state, int) {
2681- return curand_uniform(state);
2682- }, 0
2683- );
2684+ run_benchmark<float, GeneratorState>(
2685+ parser,
2686+ stream,
2687+ [] __device__ ( GeneratorState* state, int) { return curand_uniform(state); },
2688+ 0,
2689+ distribution,
2690+ engine);
2691 }
2692 if (distribution == "uniform-double")
2693 {
2694- run_benchmark<double, GeneratorState>(parser,
2695- [] __device__ (GeneratorState * state, int) {
2696- return curand_uniform_double(state);
2697- }, 0
2698- );
2699+ run_benchmark<double, GeneratorState>(
2700+ parser,
2701+ stream,
2702+ [] __device__ ( GeneratorState* state, int) { return curand_uniform_double(state); },
2703+ 0,
2704+ distribution,
2705+ engine);
2706 }
2707 if (distribution == "normal-float")
2708 {
2709- run_benchmark<float, GeneratorState>(parser,
2710- [] __device__ (GeneratorState * state, int) {
2711- return curand_normal(state);
2712- }, 0
2713- );
2714+ run_benchmark<float, GeneratorState>(
2715+ parser,
2716+ stream,
2717+ [] __device__ ( GeneratorState* state, int) { return curand_normal(state); },
2718+ 0,
2719+ distribution,
2720+ engine);
2721 }
2722 if (distribution == "normal-double")
2723 {
2724- run_benchmark<double, GeneratorState>(parser,
2725- [] __device__ (GeneratorState * state, int) {
2726- return curand_normal_double(state);
2727- }, 0
2728- );
2729+ run_benchmark<double, GeneratorState>(
2730+ parser,
2731+ stream,
2732+ [] __device__ ( GeneratorState* state, int) { return curand_normal_double(state); },
2733+ 0,
2734+ distribution,
2735+ engine);
2736 }
2737 if (distribution == "log-normal-float")
2738 {
2739- run_benchmark<float, GeneratorState>(parser,
2740- [] __device__ (GeneratorState * state, int) {
2741- return curand_log_normal(state, 0.0f, 1.0f);
2742- }, 0
2743- );
2744+ run_benchmark<float, GeneratorState>(
2745+ parser,
2746+ stream,
2747+ [] __device__ ( GeneratorState* state, int) { return curand_log_normal(state, 0.0f, 1.0f); },
2748+ 0,
2749+ distribution,
2750+ engine);
2751 }
2752 if (distribution == "log-normal-double")
2753 {
2754- run_benchmark<double, GeneratorState>(parser,
2755- [] __device__ (GeneratorState * state, int) {
2756- return curand_log_normal_double(state, 0.0, 1.0);
2757- }, 0
2758- );
2759+ run_benchmark<double, GeneratorState>(
2760+ parser,
2761+ stream,
2762+ [] __device__ ( GeneratorState* state, int) { return curand_log_normal_double(state, 0.0, 1.0); },
2763+ 0,
2764+ distribution,
2765+ engine);
2766 }
2767 if (distribution == "poisson")
2768 {
2769 const auto lambdas = parser.get<std::vector<double>>("lambda");
2770 for (double lambda : lambdas)
2771 {
2772- std::cout << " " << "lambda "
2773- << std::fixed << std::setprecision(1) << lambda << std::endl;
2774- run_benchmark<unsigned int, GeneratorState>(parser,
2775- [] __device__ (GeneratorState * state, double lambda) {
2776- return curand_poisson(state, lambda);
2777- }, lambda
2778- );
2779+ if(format.compare("console") == 0)
2780+ {
2781+ std::cout << " "
2782+ << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl;
2783+ }
2784+ run_benchmark<unsigned int, GeneratorState>(
2785+ parser,
2786+ stream,
2787+ [] __device__ ( GeneratorState* state, double lambda) { return curand_poisson(state, lambda); },
2788+ lambda,
2789+ distribution,
2790+ engine,
2791+ lambda);
2792 }
2793 }
2794 if (distribution == "discrete-poisson")
2795@@ -692,15 +762,22 @@ void run_benchmarks(const cli::Parser& parser,
2796 const auto lambdas = parser.get<std::vector<double>>("lambda");
2797 for (double lambda : lambdas)
2798 {
2799- std::cout << " " << "lambda "
2800- << std::fixed << std::setprecision(1) << lambda << std::endl;
2801+ if(format.compare("console") == 0)
2802+ {
2803+ std::cout << " "
2804+ << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl;
2805+ }
2806 curandDiscreteDistribution_t discrete_distribution;
2807 CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution));
2808- run_benchmark<unsigned int, GeneratorState>(parser,
2809- [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) {
2810- return curand_discrete(state, discrete_distribution);
2811- }, discrete_distribution
2812- );
2813+ run_benchmark<unsigned int, GeneratorState>(
2814+ parser,
2815+ stream,
2816+ [] __device__ ( GeneratorState* state, curandDiscreteDistribution_t discrete_distribution)
2817+ { return curand_discrete(state, discrete_distribution); },
2818+ discrete_distribution,
2819+ distribution,
2820+ engine,
2821+ lambda);
2822 CURAND_CALL(curandDestroyDistribution(discrete_distribution));
2823 }
2824 }
2825@@ -758,6 +835,10 @@ int main(int argc, char *argv[])
2826 parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
2827 parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
2828 parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
2829+ parser.set_optional<std::string>("format",
2830+ "format",
2831+ {"console"},
2832+ "output format: console or csv");
2833 parser.run_and_exit_if_error();
2834
2835 std::vector<std::string> engines;
2836@@ -803,53 +884,75 @@ int main(int argc, char *argv[])
2837 cudaDeviceProp props;
2838 CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
2839
2840+ std::cout << "benchmark_curand_kernel" << std::endl;
2841 std::cout << "cuRAND: " << version << " ";
2842 std::cout << "Runtime: " << runtime_version << " ";
2843 std::cout << "Device: " << props.name;
2844 std::cout << std::endl << std::endl;
2845
2846+ cudaStream_t stream;
2847+ CUDA_CALL(cudaStreamCreate(&stream));
2848+
2849+ std::string format = parser.get<std::string>("format");
2850+ bool console_output = format.compare("console") == 0 ? true : false;
2851+
2852+ if(!console_output)
2853+ {
2854+ std::cout
2855+ << "Engine,Distribution,Throughput,Samples,AvgTime (1 Trial),Time(all),Size,Lambda"
2856+ << std::endl;
2857+ std::cout << ",,GB/s,GSample/s,ms),ms),values," << std::endl;
2858+ }
2859 for (auto engine : engines)
2860 {
2861- std::cout << engine << ":" << std::endl;
2862+ if(console_output)
2863+ {
2864+ std::cout << engine << ":" << std::endl;
2865+ }
2866 for (auto distribution : distributions)
2867 {
2868- std::cout << " " << distribution << ":" << std::endl;
2869+ if(console_output)
2870+ {
2871+ std::cout << engine << ":" << std::endl;
2872+ }
2873 const std::string plot_name = engine + "-" + distribution;
2874 if (engine == "xorwow")
2875 {
2876- run_benchmarks<curandStateXORWOW_t>(parser, distribution);
2877+ run_benchmarks<curandStateXORWOW_t>(parser, distribution, engine, stream);
2878 }
2879 else if (engine == "mrg32k3a")
2880 {
2881- run_benchmarks<curandStateMRG32k3a_t>(parser, distribution);
2882+ run_benchmarks<curandStateMRG32k3a_t>(parser, distribution, engine, stream);
2883 }
2884 else if (engine == "philox")
2885 {
2886- run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution);
2887+ run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution, engine, stream);
2888 }
2889 else if (engine == "sobol32")
2890 {
2891- run_benchmarks<curandStateSobol32_t>(parser, distribution);
2892+ run_benchmarks<curandStateSobol32_t>(parser, distribution, engine, stream);
2893 }
2894 else if(engine == "scrambled_sobol32")
2895 {
2896- run_benchmarks<curandStateScrambledSobol32_t>(parser, distribution);
2897+ run_benchmarks<curandStateScrambledSobol32_t>(parser, distribution, engine, stream);
2898 }
2899 else if (engine == "sobol64")
2900 {
2901- run_benchmarks<curandStateSobol64_t>(parser, distribution);
2902+ run_benchmarks<curandStateSobol64_t>(parser, distribution, engine, stream);
2903 }
2904 else if(engine == "scrambled_sobol64")
2905 {
2906- run_benchmarks<curandStateScrambledSobol64_t>(parser, distribution);
2907+ run_benchmarks<curandStateScrambledSobol64_t>(parser, distribution, engine, stream);
2908 }
2909 else if (engine == "mtgp32")
2910 {
2911- run_benchmarks<curandStateMtgp32_t>(parser, distribution);
2912+ run_benchmarks<curandStateMtgp32_t>(parser, distribution, engine, stream);
2913 }
2914 }
2915 std::cout << std::endl;
2916 }
2917
2918+ CUDA_CALL(cudaStreamDestroy(stream));
2919+
2920 return 0;
2921 }
2922diff --git a/benchmark/benchmark_curand_utils.hpp b/benchmark/benchmark_curand_utils.hpp
2923index 3a0ff37..8eef353 100644
2924--- a/benchmark/benchmark_curand_utils.hpp
2925+++ b/benchmark/benchmark_curand_utils.hpp
2926@@ -1,4 +1,4 @@
2927-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
2928+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
2929 //
2930 // Permission is hereby granted, free of charge, to any person obtaining a copy
2931 // of this software and associated documentation files (the "Software"), to deal
2932@@ -42,6 +42,19 @@
2933 } \
2934 while(0)
2935
2936+#define CUDA_CALL(condition) \
2937+ do \
2938+ { \
2939+ cudaError_t error_ = condition; \
2940+ if(error_ != cudaSuccess) \
2941+ { \
2942+ std::cout << "CUDA error: " << error_ << " at " << __FILE__ << ":" << __LINE__ \
2943+ << std::endl; \
2944+ exit(error_); \
2945+ } \
2946+ } \
2947+ while(0)
2948+
2949 inline void add_common_benchmark_curand_info()
2950 {
2951 int version;
2952diff --git a/benchmark/benchmark_rocrand_device_api.cpp b/benchmark/benchmark_rocrand_device_api.cpp
2953index b38430e..cdf1237 100644
2954--- a/benchmark/benchmark_rocrand_device_api.cpp
2955+++ b/benchmark/benchmark_rocrand_device_api.cpp
2956@@ -1,4 +1,4 @@
2957-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
2958+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
2959 //
2960 // Permission is hereby granted, free of charge, to any person obtaining a copy
2961 // of this software and associated documentation files (the "Software"), to deal
2962@@ -85,14 +85,7 @@ struct runner
2963 const size_t states_size = blocks * threads;
2964 HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState)));
2965
2966- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel),
2967- dim3(blocks),
2968- dim3(threads),
2969- 0,
2970- 0,
2971- states,
2972- seed,
2973- offset);
2974+ init_kernel<<<dim3(blocks), dim3(threads)>>>(states, seed, offset);
2975
2976 HIP_CHECK(hipGetLastError());
2977 HIP_CHECK(hipDeviceSynchronize());
2978@@ -111,15 +104,7 @@ struct runner
2979 const size_t size,
2980 const Generator& generator)
2981 {
2982- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel),
2983- dim3(blocks),
2984- dim3(threads),
2985- 0,
2986- stream,
2987- states,
2988- data,
2989- size,
2990- generator);
2991+ generate_kernel<<<dim3(blocks), dim3(threads), 0, stream>>>(states, data, size, generator);
2992 }
2993 };
2994
2995@@ -185,15 +170,10 @@ struct runner<rocrand_state_mtgp32>
2996 const size_t size,
2997 const Generator& generator)
2998 {
2999- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel),
3000- dim3(std::min((size_t)200, blocks)),
3001- dim3(256),
3002- 0,
3003- stream,
3004- states,
3005- data,
3006- size,
3007- generator);
3008+ generate_kernel<<<dim3(std::min((size_t)200, blocks)), dim3(256), 0, stream>>>(states,
3009+ data,
3010+ size,
3011+ generator);
3012 }
3013 };
3014
3015@@ -341,14 +321,10 @@ struct runner<rocrand_state_sobol32>
3016 HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
3017
3018 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3019- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
3020- dim3(blocks_x, dimensions),
3021- dim3(threads),
3022- 0,
3023- 0,
3024- states,
3025- directions,
3026- static_cast<unsigned int>(offset));
3027+ init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
3028+ states,
3029+ directions,
3030+ static_cast<unsigned int>(offset));
3031
3032 HIP_CHECK(hipGetLastError());
3033 HIP_CHECK(hipDeviceSynchronize());
3034@@ -370,15 +346,11 @@ struct runner<rocrand_state_sobol32>
3035 const Generator& generator)
3036 {
3037 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3038- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3039- dim3(blocks_x, dimensions),
3040- dim3(threads),
3041- 0,
3042- stream,
3043- states,
3044- data,
3045- size / dimensions,
3046- generator);
3047+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3048+ states,
3049+ data,
3050+ size / dimensions,
3051+ generator);
3052 }
3053 };
3054
3055@@ -419,15 +391,11 @@ struct runner<rocrand_state_scrambled_sobol32>
3056 hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
3057
3058 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3059- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
3060- dim3(blocks_x, dimensions),
3061- dim3(threads),
3062- 0,
3063- 0,
3064- states,
3065- directions,
3066- scramble_constants,
3067- static_cast<unsigned int>(offset));
3068+ init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
3069+ states,
3070+ directions,
3071+ scramble_constants,
3072+ static_cast<unsigned int>(offset));
3073
3074 HIP_CHECK(hipGetLastError());
3075 HIP_CHECK(hipDeviceSynchronize());
3076@@ -450,15 +418,11 @@ struct runner<rocrand_state_scrambled_sobol32>
3077 const Generator& generator)
3078 {
3079 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3080- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3081- dim3(blocks_x, dimensions),
3082- dim3(threads),
3083- 0,
3084- stream,
3085- states,
3086- data,
3087- size / dimensions,
3088- generator);
3089+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3090+ states,
3091+ data,
3092+ size / dimensions,
3093+ generator);
3094 }
3095 };
3096
3097@@ -477,7 +441,8 @@ struct runner<rocrand_state_sobol64>
3098 this->dimensions = dimensions;
3099
3100 const unsigned long long* h_directions;
3101- rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
3102+ ROCRAND_CHECK(
3103+ rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
3104
3105 const size_t states_size = blocks * threads * dimensions;
3106 HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64)));
3107@@ -488,14 +453,9 @@ struct runner<rocrand_state_sobol64>
3108 HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
3109
3110 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3111- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
3112- dim3(blocks_x, dimensions),
3113- dim3(threads),
3114- 0,
3115- 0,
3116- states,
3117- directions,
3118- offset);
3119+ init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(states,
3120+ directions,
3121+ offset);
3122
3123 HIP_CHECK(hipGetLastError());
3124 HIP_CHECK(hipDeviceSynchronize());
3125@@ -517,15 +477,11 @@ struct runner<rocrand_state_sobol64>
3126 const Generator& generator)
3127 {
3128 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3129- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3130- dim3(blocks_x, dimensions),
3131- dim3(threads),
3132- 0,
3133- stream,
3134- states,
3135- data,
3136- size / dimensions,
3137- generator);
3138+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3139+ states,
3140+ data,
3141+ size / dimensions,
3142+ generator);
3143 }
3144 };
3145
3146@@ -546,9 +502,10 @@ struct runner<rocrand_state_scrambled_sobol64>
3147 const unsigned long long* h_directions;
3148 const unsigned long long* h_constants;
3149
3150- rocrand_get_direction_vectors64(&h_directions,
3151- ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6);
3152- rocrand_get_scramble_constants64(&h_constants);
3153+ ROCRAND_CHECK(
3154+ rocrand_get_direction_vectors64(&h_directions,
3155+ ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6));
3156+ ROCRAND_CHECK(rocrand_get_scramble_constants64(&h_constants));
3157
3158 const size_t states_size = blocks * threads * dimensions;
3159 HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol64)));
3160@@ -565,15 +522,11 @@ struct runner<rocrand_state_scrambled_sobol64>
3161 hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
3162
3163 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3164- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
3165- dim3(blocks_x, dimensions),
3166- dim3(threads),
3167- 0,
3168- 0,
3169- states,
3170- directions,
3171- scramble_constants,
3172- offset);
3173+ init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
3174+ states,
3175+ directions,
3176+ scramble_constants,
3177+ offset);
3178
3179 HIP_CHECK(hipGetLastError());
3180 HIP_CHECK(hipDeviceSynchronize());
3181@@ -596,15 +549,11 @@ struct runner<rocrand_state_scrambled_sobol64>
3182 const Generator& generator)
3183 {
3184 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3185- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3186- dim3(blocks_x, dimensions),
3187- dim3(threads),
3188- 0,
3189- stream,
3190- states,
3191- data,
3192- size / dimensions,
3193- generator);
3194+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3195+ states,
3196+ data,
3197+ size / dimensions,
3198+ generator);
3199 }
3200 };
3201
3202diff --git a/benchmark/benchmark_rocrand_kernel.cpp b/benchmark/benchmark_rocrand_kernel.cpp
3203index 1e4c389..8e50094 100644
3204--- a/benchmark/benchmark_rocrand_kernel.cpp
3205+++ b/benchmark/benchmark_rocrand_kernel.cpp
3206@@ -1,4 +1,4 @@
3207-// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
3208+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
3209 //
3210 // Permission is hereby granted, free of charge, to any person obtaining a copy
3211 // of this software and associated documentation files (the "Software"), to deal
3212@@ -114,11 +114,7 @@ struct runner
3213 const size_t states_size = blocks * threads;
3214 HIP_CHECK(hipMalloc(&states, states_size * sizeof(GeneratorState)));
3215
3216- hipLaunchKernelGGL(
3217- HIP_KERNEL_NAME(init_kernel),
3218- dim3(blocks), dim3(threads), 0, 0,
3219- states, seed, offset
3220- );
3221+ init_kernel<<<dim3(blocks), dim3(threads)>>>(states, seed, offset);
3222
3223 HIP_CHECK(hipGetLastError());
3224 HIP_CHECK(hipDeviceSynchronize());
3225@@ -143,11 +139,11 @@ struct runner
3226 const GenerateFunc& generate_func,
3227 const Extra extra)
3228 {
3229- hipLaunchKernelGGL(
3230- HIP_KERNEL_NAME(generate_kernel),
3231- dim3(blocks), dim3(threads), 0, stream,
3232- states, data, size, generate_func, extra
3233- );
3234+ generate_kernel<<<dim3(blocks), dim3(threads), 0, stream>>>(states,
3235+ data,
3236+ size,
3237+ generate_func,
3238+ extra);
3239 }
3240 };
3241
3242@@ -222,11 +218,12 @@ struct runner<rocrand_state_mtgp32>
3243 const GenerateFunc& generate_func,
3244 const Extra extra)
3245 {
3246- hipLaunchKernelGGL(
3247- HIP_KERNEL_NAME(generate_kernel),
3248- dim3(std::min((size_t)200, blocks)), dim3(256), 0, stream,
3249- states, data, size, generate_func, extra
3250- );
3251+ generate_kernel<<<dim3(std::min((size_t)200, blocks)), dim3(256), 0, stream>>>(
3252+ states,
3253+ data,
3254+ size,
3255+ generate_func,
3256+ extra);
3257 }
3258 };
3259
3260@@ -255,16 +252,11 @@ struct runner<rocrand_state_lfsr113>
3261 const size_t states_size = blocks * threads;
3262 HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113)));
3263
3264- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel),
3265- dim3(blocks),
3266- dim3(threads),
3267- 0,
3268- 0,
3269- states,
3270- uint4{ROCRAND_LFSR113_DEFAULT_SEED_X,
3271- ROCRAND_LFSR113_DEFAULT_SEED_Y,
3272- ROCRAND_LFSR113_DEFAULT_SEED_Z,
3273- ROCRAND_LFSR113_DEFAULT_SEED_W});
3274+ init_kernel<<<dim3(blocks), dim3(threads)>>>(states,
3275+ uint4{ROCRAND_LFSR113_DEFAULT_SEED_X,
3276+ ROCRAND_LFSR113_DEFAULT_SEED_Y,
3277+ ROCRAND_LFSR113_DEFAULT_SEED_Z,
3278+ ROCRAND_LFSR113_DEFAULT_SEED_W});
3279
3280 HIP_CHECK(hipGetLastError());
3281 HIP_CHECK(hipDeviceSynchronize());
3282@@ -289,16 +281,11 @@ struct runner<rocrand_state_lfsr113>
3283 const GenerateFunc& generate_func,
3284 const Extra extra)
3285 {
3286- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel),
3287- dim3(blocks),
3288- dim3(threads),
3289- 0,
3290- stream,
3291- states,
3292- data,
3293- size,
3294- generate_func,
3295- extra);
3296+ generate_kernel<<<dim3(blocks), dim3(threads), 0, stream>>>(states,
3297+ data,
3298+ size,
3299+ generate_func,
3300+ extra);
3301 }
3302 };
3303
3304@@ -381,14 +368,10 @@ struct runner<rocrand_state_sobol32>
3305 HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
3306
3307 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3308- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
3309- dim3(blocks_x, dimensions),
3310- dim3(threads),
3311- 0,
3312- 0,
3313- states,
3314- directions,
3315- static_cast<unsigned int>(offset));
3316+ init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
3317+ states,
3318+ directions,
3319+ static_cast<unsigned int>(offset));
3320
3321 HIP_CHECK(hipGetLastError());
3322 HIP_CHECK(hipDeviceSynchronize());
3323@@ -416,16 +399,12 @@ struct runner<rocrand_state_sobol32>
3324 const Extra extra)
3325 {
3326 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3327- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3328- dim3(blocks_x, dimensions),
3329- dim3(threads),
3330- 0,
3331- stream,
3332- states,
3333- data,
3334- size / dimensions,
3335- generate_func,
3336- extra);
3337+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3338+ states,
3339+ data,
3340+ size / dimensions,
3341+ generate_func,
3342+ extra);
3343 }
3344 };
3345
3346@@ -466,15 +445,11 @@ struct runner<rocrand_state_scrambled_sobol32>
3347 hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
3348
3349 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3350- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
3351- dim3(blocks_x, dimensions),
3352- dim3(threads),
3353- 0,
3354- 0,
3355- states,
3356- directions,
3357- scramble_constants,
3358- static_cast<unsigned int>(offset));
3359+ init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
3360+ states,
3361+ directions,
3362+ scramble_constants,
3363+ static_cast<unsigned int>(offset));
3364
3365 HIP_CHECK(hipGetLastError());
3366 HIP_CHECK(hipDeviceSynchronize());
3367@@ -503,16 +478,12 @@ struct runner<rocrand_state_scrambled_sobol32>
3368 const Extra extra)
3369 {
3370 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3371- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3372- dim3(blocks_x, dimensions),
3373- dim3(threads),
3374- 0,
3375- stream,
3376- states,
3377- data,
3378- size / dimensions,
3379- generate_func,
3380- extra);
3381+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3382+ states,
3383+ data,
3384+ size / dimensions,
3385+ generate_func,
3386+ extra);
3387 }
3388 };
3389
3390@@ -542,14 +513,9 @@ struct runner<rocrand_state_sobol64>
3391 HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
3392
3393 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3394- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
3395- dim3(blocks_x, dimensions),
3396- dim3(threads),
3397- 0,
3398- 0,
3399- states,
3400- directions,
3401- offset);
3402+ init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(states,
3403+ directions,
3404+ offset);
3405
3406 HIP_CHECK(hipGetLastError());
3407 HIP_CHECK(hipDeviceSynchronize());
3408@@ -577,16 +543,12 @@ struct runner<rocrand_state_sobol64>
3409 const Extra extra)
3410 {
3411 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3412- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3413- dim3(blocks_x, dimensions),
3414- dim3(threads),
3415- 0,
3416- stream,
3417- states,
3418- data,
3419- size / dimensions,
3420- generate_func,
3421- extra);
3422+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3423+ states,
3424+ data,
3425+ size / dimensions,
3426+ generate_func,
3427+ extra);
3428 }
3429 };
3430
3431@@ -626,15 +588,11 @@ struct runner<rocrand_state_scrambled_sobol64>
3432 hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
3433
3434 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3435- hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
3436- dim3(blocks_x, dimensions),
3437- dim3(threads),
3438- 0,
3439- 0,
3440- states,
3441- directions,
3442- scramble_constants,
3443- offset);
3444+ init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
3445+ states,
3446+ directions,
3447+ scramble_constants,
3448+ offset);
3449
3450 HIP_CHECK(hipGetLastError());
3451 HIP_CHECK(hipDeviceSynchronize());
3452@@ -663,16 +621,12 @@ struct runner<rocrand_state_scrambled_sobol64>
3453 const Extra extra)
3454 {
3455 const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
3456- hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
3457- dim3(blocks_x, dimensions),
3458- dim3(threads),
3459- 0,
3460- stream,
3461- states,
3462- data,
3463- size / dimensions,
3464- generate_func,
3465- extra);
3466+ generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
3467+ states,
3468+ data,
3469+ size / dimensions,
3470+ generate_func,
3471+ extra);
3472 }
3473 };
3474
3475diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp
3476index 75d05fa..d31550e 100644
3477--- a/benchmark/custom_csv_formater.hpp
3478+++ b/benchmark/custom_csv_formater.hpp
3479@@ -1,4 +1,4 @@
3480-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
3481+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
3482 //
3483 // Permission is hereby granted, free of charge, to any person obtaining a copy
3484 // of this software and associated documentation files (the "Software"), to deal
3485@@ -155,17 +155,11 @@ inline void customCSVReporter::PrintRunData(const Run& run)
3486 std::ostream& Err = GetErrorStream();
3487
3488 //get the name of the engine and distribution:
3489-
3490 std::string temp = run.benchmark_name();
3491-
3492 std::string deviceName = std::string(temp.begin(), temp.begin() + temp.find("<"));
3493-
3494 temp.erase(0, temp.find("<") + 1);
3495-
3496 std::string engineName = std::string(temp.begin(), temp.begin() + temp.find(","));
3497-
3498 temp.erase(0, engineName.size() + 1);
3499-
3500 std::string mode = "default";
3501
3502 if(deviceName != "device_kernel")
3503@@ -174,7 +168,6 @@ inline void customCSVReporter::PrintRunData(const Run& run)
3504 temp.erase(0, temp.find(",") + 1);
3505 }
3506 std::string disName = std::string(temp.begin(), temp.begin() + temp.find(">"));
3507-
3508 std::string lambda = "";
3509
3510 size_t ePos = disName.find("=");
3511@@ -186,11 +179,11 @@ inline void customCSVReporter::PrintRunData(const Run& run)
3512
3513 Out << engineName << "," << disName << "," << mode << ",";
3514 Out << CsvEscape(run.benchmark_name()) << ",";
3515- if(run.error_occurred)
3516+ if(run.skipped)
3517 {
3518 Err << std::string(elements.size() - 3, ',');
3519 Err << "true,";
3520- Err << CsvEscape(run.error_message) << "\n";
3521+ Err << CsvEscape(run.skip_message) << "\n";
3522 return;
3523 }
3524
3525diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
3526index 09b7286..329e7b6 100644
3527--- a/cmake/Dependencies.cmake
3528+++ b/cmake/Dependencies.cmake
3529@@ -1,6 +1,6 @@
3530 # MIT License
3531 #
3532-# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
3533+# Copyright (c) 2018-2025 Advanced Micro Devices, Inc. All rights reserved.
3534 #
3535 # Permission is hereby granted, free of charge, to any person obtaining a copy
3536 # of this software and associated documentation files (the "Software"), to deal
3537@@ -20,106 +20,50 @@
3538 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3539 # SOFTWARE.
3540
3541-cmake_minimum_required(VERSION 3.16)
3542+# Dependencies
3543
3544-# find_package() uses upper-case <PACKAGENAME>_ROOT variables.
3545-# altough we use GTEST_ROOT for our purposes, it is actually even benefecial for
3546-# find_package() to look for it there (that's where we are going to put it anyway)
3547-if(POLICY CMP0144)
3548- cmake_policy(SET CMP0144 NEW)
3549+# Save global state
3550+# NOTE1: the reason we don't scope global state meddling using add_subdirectory
3551+# is because CMake < 3.24 lacks CMAKE_FIND_PACKAGE_TARGETS_GLOBAL which
3552+# would promote IMPORTED targets of find_package(CONFIG) to be visible
3553+# by other parts of the build. So we save and restore global state.
3554+#
3555+# NOTE2: We disable the ROCMChecks.cmake warning noting that we meddle with
3556+# global state. This is a consequence of abusing the CMake CXX language
3557+# which HIP piggybacks on top of. This kind of HIP support has one chance
3558+# at observing the global flags, at the find_package(HIP) invocation.
3559+# The device compiler won't be able to pick up changes after that, hence
3560+# the warning.
3561+set(USER_CXX_FLAGS ${CMAKE_CXX_FLAGS})
3562+if(DEFINED BUILD_SHARED_LIBS)
3563+ set(USER_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
3564 endif()
3565-
3566-# Dependencies
3567+set(USER_ROCM_WARN_TOOLCHAIN_VAR ${ROCM_WARN_TOOLCHAIN_VAR})
3568+
3569+# Change variables before configuring dependencies
3570+set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
3571+# Turn off warnings and errors for all warnings in dependencies
3572+separate_arguments(CXX_FLAGS_LIST NATIVE_COMMAND ${CMAKE_CXX_FLAGS})
3573+list(REMOVE_ITEM CXX_FLAGS_LIST /WX -Werror -Werror=pedantic -pedantic-errors)
3574+if(MSVC)
3575+ list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "/[Ww]([0-4]?)(all)?") # Remove MSVC warning flags
3576+ list(APPEND CXX_FLAGS_LIST /w)
3577+else()
3578+ list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "-W(all|extra|everything)") # Remove GCC/LLVM flags
3579+ list(APPEND CXX_FLAGS_LIST -w)
3580+endif()
3581+list(JOIN CXX_FLAGS_LIST " " CMAKE_CXX_FLAGS)
3582+# Don't build client dependencies as shared
3583+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Global flag to cause add_library() to create shared libraries if on." FORCE)
3584
3585 # HIP dependency is handled earlier in the project cmake file
3586 # when VerifyCompiler.cmake is included.
3587
3588-# For downloading, building, and installing required dependencies
3589-include(cmake/DownloadProject.cmake)
3590-
3591 # Fortran Wrapper
3592 if(BUILD_FORTRAN_WRAPPER)
3593 enable_language(Fortran)
3594 endif()
3595
3596-# Test dependencies
3597-if(BUILD_TEST)
3598- # NOTE: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake
3599- #
3600- # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND
3601- #
3602- # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main
3603- #
3604- # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets
3605- # will be duplicately defined.
3606- if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3607- # Google Test (https://github.com/google/googletest)
3608- find_package(GTest QUIET)
3609- endif()
3610-
3611- if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest)
3612- message(STATUS "GTest not found or force download GTest on. Downloading and building GTest.")
3613- set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/gtest CACHE PATH "")
3614- if(DEFINED CMAKE_CXX_COMPILER)
3615- set(CXX_COMPILER_OPTION "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
3616- endif()
3617- download_project(
3618- PROJ googletest
3619- GIT_REPOSITORY https://github.com/google/googletest.git
3620- GIT_TAG release-1.11.0
3621- INSTALL_DIR ${GTEST_ROOT}
3622- CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${CXX_COMPILER_OPTION} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
3623- LOG_DOWNLOAD TRUE
3624- LOG_CONFIGURE TRUE
3625- LOG_BUILD TRUE
3626- LOG_INSTALL TRUE
3627- BUILD_PROJECT TRUE
3628- UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository
3629- )
3630- find_package(GTest CONFIG REQUIRED PATHS ${GTEST_ROOT} NO_DEFAULT_PATH)
3631- endif()
3632-endif()
3633-
3634-
3635-# Benchmark dependencies
3636-if(BUILD_BENCHMARK)
3637- if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3638- # Google Benchmark (https://github.com/google/benchmark.git)
3639- find_package(benchmark QUIET)
3640- endif()
3641-
3642- if(NOT benchmark_FOUND)
3643- message(STATUS "Google Benchmark not found or force download Google Benchmark on. Downloading and building Google Benchmark.")
3644- if(CMAKE_CONFIGURATION_TYPES)
3645- message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.")
3646- endif()
3647- set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/googlebenchmark CACHE PATH "")
3648- if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
3649- # hip-clang cannot compile googlebenchmark for some reason
3650- if(WIN32)
3651- set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=cl")
3652- else()
3653- set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++")
3654- endif()
3655- endif()
3656-
3657- download_project(
3658- PROJ googlebenchmark
3659- GIT_REPOSITORY https://github.com/google/benchmark.git
3660- GIT_TAG v1.6.1
3661- INSTALL_DIR ${GOOGLEBENCHMARK_ROOT}
3662- CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_CXX_STANDARD=14 ${COMPILER_OVERRIDE}
3663- LOG_DOWNLOAD TRUE
3664- LOG_CONFIGURE TRUE
3665- LOG_BUILD TRUE
3666- LOG_INSTALL TRUE
3667- BUILD_PROJECT TRUE
3668- UPDATE_DISCONNECTED TRUE
3669- )
3670- endif()
3671- find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT} NO_DEFAULT_PATH)
3672-endif()
3673-
3674 set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
3675
3676 # Find or download/install rocm-cmake project
3677@@ -167,4 +111,65 @@ include(ROCMInstallSymlinks)
3678 include(ROCMCheckTargetIds)
3679 include(ROCMUtilities)
3680 include(ROCMClients)
3681-include(ROCMHeaderWrapper)
3682+
3683+# For downloading and building required dependencies
3684+include(FetchContent)
3685+# Test dependencies
3686+if(BUILD_TEST)
3687+ # Google Test (https://github.com/google/googletest)
3688+ # NOTE: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake
3689+ #
3690+ # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND
3691+ #
3692+ # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main
3693+ #
3694+# NOTE2: Once GTest has been found in MODULE mode, find_package cannot be invoked in CONFIG mode,
3695+# because the targets would be defined twice.
3696+ if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3697+ find_package(GTest QUIET)
3698+ endif()
3699+
3700+ if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest)
3701+ message(STATUS "Google Test not found or force download on. Fetching...")
3702+ option(BUILD_GTEST "Builds the googletest subproject" ON)
3703+ option(BUILD_GMOCK "Builds the googlemock subproject" OFF)
3704+ option(INSTALL_GTEST "Enable installation of googletest" ON)
3705+ FetchContent_Declare(
3706+ googletest
3707+ GIT_REPOSITORY https://github.com/google/googletest.git
3708+ GIT_TAG v1.15.2
3709+ )
3710+ FetchContent_MakeAvailable(googletest)
3711+ endif()
3712+endif()
3713+
3714+# Benchmark dependencies
3715+if(BUILD_BENCHMARK)
3716+ # Google Benchmark (https://github.com/google/benchmark)
3717+ if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3718+ find_package(benchmark 1.9.1 QUIET)
3719+ endif()
3720+
3721+ if(NOT TARGET benchmark::benchmark)
3722+ message(STATUS "Google Benchmark not found or force download on. Fetching...")
3723+ option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library" OFF)
3724+ option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark" OFF)
3725+ FetchContent_Declare(
3726+ googlebenchmark
3727+ GIT_REPOSITORY https://github.com/google/benchmark.git
3728+ GIT_TAG v1.9.1
3729+ )
3730+ set(HAVE_STD_REGEX ON)
3731+ set(RUN_HAVE_STD_REGEX 1)
3732+ FetchContent_MakeAvailable(googlebenchmark)
3733+ endif()
3734+endif()
3735+
3736+# Restore user global state
3737+set(CMAKE_CXX_FLAGS ${USER_CXX_FLAGS})
3738+if(DEFINED USER_BUILD_SHARED_LIBS)
3739+ set(BUILD_SHARED_LIBS ${USER_BUILD_SHARED_LIBS})
3740+else()
3741+ unset(BUILD_SHARED_LIBS CACHE )
3742+endif()
3743+set(ROCM_WARN_TOOLCHAIN_VAR ${USER_ROCM_WARN_TOOLCHAIN_VAR} CACHE BOOL "")
3744diff --git a/cmake/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject.CMakeLists.cmake.in
3745deleted file mode 100644
3746index d6e544f..0000000
3747--- a/cmake/DownloadProject.CMakeLists.cmake.in
3748+++ /dev/null
3749@@ -1,14 +0,0 @@
3750-# Distributed under the OSI-approved MIT License. See accompanying
3751-# file LICENSE or https://github.com/Crascit/DownloadProject for details.
3752-
3753-cmake_minimum_required(VERSION 2.8.2)
3754-
3755-project(${DL_ARGS_PROJ}-download NONE)
3756-
3757-include(ExternalProject)
3758-ExternalProject_Add(${DL_ARGS_PROJ}-download
3759- ${DL_ARGS_UNPARSED_ARGUMENTS}
3760- SOURCE_DIR "${DL_ARGS_SOURCE_DIR}"
3761- BUILD_IN_SOURCE TRUE
3762- TEST_COMMAND ""
3763-)
3764\ No newline at end of file
3765diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject.cmake
3766deleted file mode 100644
3767index 110bbd5..0000000
3768--- a/cmake/DownloadProject.cmake
3769+++ /dev/null
3770@@ -1,170 +0,0 @@
3771-# Distributed under the OSI-approved MIT License. See accompanying
3772-# file LICENSE or https://github.com/Crascit/DownloadProject for details.
3773-#
3774-# MODULE: DownloadProject
3775-#
3776-# PROVIDES:
3777-# download_project( PROJ projectName
3778-# [PREFIX prefixDir]
3779-# [DOWNLOAD_DIR downloadDir]
3780-# [SOURCE_DIR srcDir]
3781-# [BINARY_DIR binDir]
3782-# [QUIET]
3783-# ...
3784-# )
3785-#
3786-# Provides the ability to download and unpack a tarball, zip file, git repository,
3787-# etc. at configure time (i.e. when the cmake command is run). How the downloaded
3788-# and unpacked contents are used is up to the caller, but the motivating case is
3789-# to download source code which can then be included directly in the build with
3790-# add_subdirectory() after the call to download_project(). Source and build
3791-# directories are set up with this in mind.
3792-#
3793-# The PROJ argument is required. The projectName value will be used to construct
3794-# the following variables upon exit (obviously replace projectName with its actual
3795-# value):
3796-#
3797-# projectName_SOURCE_DIR
3798-# projectName_BINARY_DIR
3799-#
3800-# The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically
3801-# need to be provided. They can be specified if you want the downloaded source
3802-# and build directories to be located in a specific place. The contents of
3803-# projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the
3804-# locations used whether you provide SOURCE_DIR/BINARY_DIR or not.
3805-#
3806-# The DOWNLOAD_DIR argument does not normally need to be set. It controls the
3807-# location of the temporary CMake build used to perform the download.
3808-#
3809-# The PREFIX argument can be provided to change the base location of the default
3810-# values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments
3811-# are provided, then PREFIX will have no effect. The default value for PREFIX is
3812-# CMAKE_BINARY_DIR.
3813-#
3814-# The QUIET option can be given if you do not want to show the output associated
3815-# with downloading the specified project.
3816-#
3817-# In addition to the above, any other options are passed through unmodified to
3818-# ExternalProject_Add() to perform the actual download, patch and update steps.
3819-#
3820-# Only those ExternalProject_Add() arguments which relate to downloading, patching
3821-# and updating of the project sources are intended to be used. Also note that at
3822-# least one set of download-related arguments are required.
3823-#
3824-# If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
3825-# prevent a check at the remote end for changes every time CMake is run
3826-# after the first successful download. See the documentation of the ExternalProject
3827-# module for more information. It is likely you will want to use this option if it
3828-# is available to you. Note, however, that the ExternalProject implementation contains
3829-# bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
3830-# using the URL download method or when specifying a SOURCE_DIR with no download
3831-# method. Fixes for these have been created, the last of which is scheduled for
3832-# inclusion in CMake 3.8.0. Details can be found here:
3833-#
3834-# https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
3835-# https://gitlab.kitware.com/cmake/cmake/issues/16428
3836-#
3837-# If you experience build errors related to the update step, consider avoiding
3838-# the use of UPDATE_DISCONNECTED.
3839-#
3840-# EXAMPLE USAGE:
3841-#
3842-# include(DownloadProject)
3843-# download_project(PROJ googletest
3844-# GIT_REPOSITORY https://github.com/google/googletest.git
3845-# GIT_TAG master
3846-# UPDATE_DISCONNECTED 1
3847-# QUIET
3848-# )
3849-#
3850-# add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
3851-#
3852-#========================================================================================
3853-
3854-
3855-set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")
3856-
3857-include(CMakeParseArguments)
3858-
3859-function(download_project)
3860-
3861- set(options QUIET)
3862- set(oneValueArgs
3863- PROJ
3864- PREFIX
3865- DOWNLOAD_DIR
3866- SOURCE_DIR
3867- BINARY_DIR
3868- )
3869- set(multiValueArgs "")
3870-
3871- cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
3872-
3873- # Hide output if requested
3874- if (DL_ARGS_QUIET)
3875- set(OUTPUT_QUIET "OUTPUT_QUIET")
3876- else()
3877- unset(OUTPUT_QUIET)
3878- message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
3879- endif()
3880-
3881- # Set up where we will put our temporary CMakeLists.txt file and also
3882- # the base point below which the default source and binary dirs will be.
3883- # The prefix must always be an absolute path.
3884- if (NOT DL_ARGS_PREFIX)
3885- set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
3886- else()
3887- get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
3888- BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
3889- endif()
3890- if (NOT DL_ARGS_DOWNLOAD_DIR)
3891- set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
3892- endif()
3893-
3894- # Ensure the caller can know where to find the source and build directories
3895- if (NOT DL_ARGS_SOURCE_DIR)
3896- set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
3897- endif()
3898- if (NOT DL_ARGS_BINARY_DIR)
3899- set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
3900- endif()
3901- set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
3902- set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)
3903-
3904- # The way that CLion manages multiple configurations, it causes a copy of
3905- # the CMakeCache.txt to be copied across due to it not expecting there to
3906- # be a project within a project. This causes the hard-coded paths in the
3907- # cache to be copied and builds to fail. To mitigate this, we simply
3908- # remove the cache if it exists before we configure the new project. It
3909- # is safe to do so because it will be re-generated. Since this is only
3910- # executed at the configure step, it should not cause additional builds or
3911- # downloads.
3912- file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")
3913-
3914- # Create and build a separate CMake project to carry out the download.
3915- # If we've already previously done these steps, they will not cause
3916- # anything to be updated, so extra rebuilds of the project won't occur.
3917- # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project
3918- # has this set to something not findable on the PATH.
3919- configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
3920- "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")
3921- execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
3922- -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
3923- .
3924- RESULT_VARIABLE result
3925- ${OUTPUT_QUIET}
3926- WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
3927- )
3928- if(result)
3929- message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}")
3930- endif()
3931- execute_process(COMMAND ${CMAKE_COMMAND} --build .
3932- RESULT_VARIABLE result
3933- ${OUTPUT_QUIET}
3934- WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
3935- )
3936- if(result)
3937- message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}")
3938- endif()
3939-
3940-endfunction()
3941\ No newline at end of file
3942diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
3943index c243e87..fe0d238 100644
3944--- a/cmake/Summary.cmake
3945+++ b/cmake/Summary.cmake
3946@@ -1,4 +1,41 @@
3947-function (print_configuration_summary)
3948+function(print_configuration_summary)
3949+ find_package(Git)
3950+ if(GIT_FOUND)
3951+ execute_process(
3952+ COMMAND ${GIT_EXECUTABLE} show --format=%H --no-patch
3953+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3954+ OUTPUT_VARIABLE COMMIT_HASH
3955+ OUTPUT_STRIP_TRAILING_WHITESPACE
3956+ )
3957+ execute_process(
3958+ COMMAND ${GIT_EXECUTABLE} show --format=%s --no-patch
3959+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3960+ OUTPUT_VARIABLE COMMIT_SUBJECT
3961+ OUTPUT_STRIP_TRAILING_WHITESPACE
3962+ )
3963+ endif()
3964+
3965+ execute_process(
3966+ COMMAND ${CMAKE_CXX_COMPILER} --version
3967+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3968+ OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERBOSE_DETAILS
3969+ OUTPUT_STRIP_TRAILING_WHITESPACE
3970+ )
3971+
3972+ find_program(UNAME_EXECUTABLE uname)
3973+ if(UNAME_EXECUTABLE)
3974+ execute_process(
3975+ COMMAND ${UNAME_EXECUTABLE} -a
3976+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3977+ OUTPUT_VARIABLE LINUX_KERNEL_DETAILS
3978+ OUTPUT_STRIP_TRAILING_WHITESPACE
3979+ )
3980+ endif()
3981+
3982+ string(REPLACE "\n" ";" CMAKE_CXX_COMPILER_VERBOSE_DETAILS "${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
3983+ list(TRANSFORM CMAKE_CXX_COMPILER_VERBOSE_DETAILS PREPEND "-- ")
3984+ string(REPLACE ";" "\n" CMAKE_CXX_COMPILER_VERBOSE_DETAILS "${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
3985+
3986 message(STATUS "")
3987 message(STATUS "******** Summary ********")
3988 message(STATUS "General:")
3989@@ -35,4 +72,15 @@ function (print_configuration_summary)
3990 endif()
3991 message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}")
3992 message(STATUS " DEPENDENCIES_FORCE_DOWNLOAD: ${DEPENDENCIES_FORCE_DOWNLOAD}")
3993+ message(STATUS "")
3994+ message(STATUS "Detailed:")
3995+ message(STATUS " C++ compiler details : \n${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
3996+if(GIT_FOUND)
3997+ message(STATUS " Commit : ${COMMIT_HASH}")
3998+ message(STATUS " ${COMMIT_SUBJECT}")
3999+endif()
4000+if(UNAME_EXECUTABLE)
4001+ message(STATUS " Unix name : ${LINUX_KERNEL_DETAILS}")
4002+endif()
4003+
4004 endfunction()
4005diff --git a/debian/changelog b/debian/changelog
4006index d80e590..8bc43b8 100644
4007--- a/debian/changelog
4008+++ b/debian/changelog
4009@@ -1,3 +1,30 @@
4010+rocrand (7.1.0-0ubuntu1) resolute; urgency=medium
4011+
4012+ [ Talha Can Havadar ]
4013+ * d/control: update rocm dependency version to >= 7.1.0
4014+
4015+ [ Tim Flink ]
4016+ * new upstream 7.1.0 release
4017+ - Better test coverage
4018+ - Bugfixes
4019+ - Deprecated the rocRAND Fortran API in favor of hipfort
4020+ - Removed C++14 support. Beginning with this release, only C++17
4021+ is supported
4022+ * rebased patches
4023+ * removed ppc64el builds
4024+ * build docs with the rocm-docs-build script from rocm-docs-core
4025+ * new patches:
4026+ - 0005-dont-set-rocm-path-in-cmake.patch
4027+ - 0006-fix-doxygen-settings.patch
4028+
4029+ [ Igor Luppi ]
4030+ * d/control: update build-depends
4031+ * d/control: update maintainer field
4032+ * d/rules: fix FTBFS by adding -Wl,--gc-sections to flags
4033+ * d/rules: use GPU_TARGETS instead of deprecated AMDGPU_TARGETS
4034+
4035+ -- Talha Can Havadar <talha.can.havadar@canonical.com> Thu, 05 Feb 2026 11:00:51 +0300
4036+
4037 rocrand (6.4.3-1) unstable; urgency=medium
4038
4039 * Upload to unstable
4040diff --git a/debian/control b/debian/control
4041index 23379c0..52499ee 100644
4042--- a/debian/control
4043+++ b/debian/control
4044@@ -5,19 +5,25 @@ Priority: optional
4045 Standards-Version: 4.7.2
4046 Vcs-Git: https://salsa.debian.org/rocm-team/rocrand.git
4047 Vcs-Browser: https://salsa.debian.org/rocm-team/rocrand
4048-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
4049-Uploaders: Étienne Mollier <emollier@debian.org>,
4050- Cordell Bloor <cgmb@debian.org>,
4051+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
4052+XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
4053+Uploaders: Maxime Chambonnet <maxzor@maxzor.eu>,
4054+ Étienne Mollier <emollier@debian.org>,
4055+ Cordell Bloor <cgmb@slerp.xyz>,
4056 Christian Kastner <ckk@debian.org>,
4057 Build-Depends: debhelper-compat (= 13),
4058 cmake,
4059- hipcc (>= 7.0~),
4060- libamd-comgr-dev (>= 6.4~),
4061- libamdhip64-dev (>= 6.4~),
4062- libhsa-runtime-dev (>= 6.4~),
4063- rocm-cmake,
4064+ hipcc (>= 7.1.0~),
4065+ libamd-comgr-dev (>= 7.1.0~),
4066+ libhsa-runtime-dev (>= 7.1.0~),
4067+ rocm-cmake (>= 7.1.0~),
4068 libgtest-dev <!nocheck>,
4069- pkg-rocm-tools (>= 0.9.0~exp3),
4070+ pkg-rocm-tools (>= 0.9.5),
4071+ libstdc++-dev,
4072+ lld-21,
4073+ clang-tools-21,
4074+ llvm-21,
4075+ rocm-device-libs-21,
4076 Build-Depends-Indep: dh-sequence-sphinxdoc <!nodoc>,
4077 python3-doxysphinx <!nodoc>,
4078 python3-rocm-docs (>= 1.20.0-1~) <!nodoc>,
4079@@ -30,7 +36,7 @@ Rules-Requires-Root: no
4080
4081 Package: librocrand1
4082 Section: libs
4083-Architecture: amd64 arm64 ppc64el
4084+Architecture: amd64 arm64
4085 XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture}
4086 Depends: ${misc:Depends}, ${shlibs:Depends},
4087 Description: generate pseudo- and quasi-random numbers - library
4088@@ -45,7 +51,7 @@ Description: generate pseudo- and quasi-random numbers - library
4089
4090 Package: librocrand-dev
4091 Section: libdevel
4092-Architecture: amd64 arm64 ppc64el
4093+Architecture: amd64 arm64
4094 Depends: librocrand1 (= ${binary:Version}),
4095 libamdhip64-dev,
4096 ${misc:Depends},
4097@@ -63,7 +69,7 @@ Description: generate pseudo- and quasi-random numbers - headers
4098
4099 Package: librocrand1-tests
4100 Section: libdevel
4101-Architecture: amd64 arm64 ppc64el
4102+Architecture: amd64 arm64
4103 Build-Profiles: <!nocheck>
4104 Depends: librocrand1 (= ${binary:Version}),${misc:Depends}, ${shlibs:Depends},
4105 Description: generate pseudo- and quasi-random numbers - test binaries
4106diff --git a/debian/librocrand-doc.doc-base b/debian/librocrand-doc.doc-base
4107index 600e35d..77970d0 100644
4108--- a/debian/librocrand-doc.doc-base
4109+++ b/debian/librocrand-doc.doc-base
4110@@ -5,7 +5,7 @@ Abstract: Documentation describing the design and use of the rocRAND library.
4111 Section: Programming
4112
4113 Format: HTML
4114-Index: /usr/share/doc/librocrand-dev/rocrand-html/index.html
4115-Files: /usr/share/doc/librocrand-dev/rocrand-html/*.html
4116+Index: /usr/share/doc/librocrand-dev/html/index.html
4117+Files: /usr/share/doc/librocrand-dev/html/*.html
4118
4119
4120diff --git a/debian/librocrand-doc.docs b/debian/librocrand-doc.docs
4121index f280757..6d28621 100644
4122--- a/debian/librocrand-doc.docs
4123+++ b/debian/librocrand-doc.docs
4124@@ -1 +1 @@
4125-rocrand-html/
4126+build/html
4127diff --git a/debian/librocrand-doc.links b/debian/librocrand-doc.links
4128index 7c25cb4..a20ad3b 100644
4129--- a/debian/librocrand-doc.links
4130+++ b/debian/librocrand-doc.links
4131@@ -1 +1 @@
4132-usr/share/javascript/jquery/jquery.js usr/share/doc/librocrand-dev/rocrand-html/doxygen/html/jquery.js
4133+usr/share/javascript/jquery/jquery.js usr/share/doc/librocrand-dev/html/doxygen/html/jquery.js
4134diff --git a/debian/not-installed b/debian/not-installed
4135index 70399f3..930f73c 100644
4136--- a/debian/not-installed
4137+++ b/debian/not-installed
4138@@ -1,2 +1,2 @@
4139-usr/share/doc/rocrand/LICENSE.txt
4140+usr/share/doc/rocrand/LICENSE.md
4141 usr/bin/rocRAND/CTestTestfile.cmake
4142diff --git a/debian/patches/0005-dont-set-rocm-path-in-cmake.patch b/debian/patches/0005-dont-set-rocm-path-in-cmake.patch
4143new file mode 100644
4144index 0000000..76ae6ba
4145--- /dev/null
4146+++ b/debian/patches/0005-dont-set-rocm-path-in-cmake.patch
4147@@ -0,0 +1,60 @@
4148+From: Tim Flink <Tim.Flink@amd.com>
4149+Date: Wed, 21 Jan 2026 08:27:17 -0600
4150+Subject: remove setting ROCM_PATH env var
4151+
4152+Setting the ROCM_PATH environment variable interferes with the system
4153+LLVM based compiler's ability to find device libs which are required
4154+for the compilation process.
4155+
4156+This patch removes all setting of the ROCM_PATH environment variable
4157+and any logic which uses that environment variable.
4158+
4159+This isn't relevant to how the upstream project is normally built
4160+and, as such, does not need to be forwarded upstream.
4161+
4162+Forwarded: not-needed
4163+---
4164+Index: bullwinkle-rocrand/CMakeLists.txt
4165+===================================================================
4166+--- bullwinkle-rocrand.orig/CMakeLists.txt
4167++++ bullwinkle-rocrand/CMakeLists.txt
4168+@@ -37,13 +37,14 @@ option(DEPENDENCIES_FORCE_DOWNLOAD "Don'
4169+ cmake_dependent_option(RUN_SLOW_TESTS "Run extra tests with CTest. These cover niche functionality and take long time" OFF "BUILD_TEST" OFF)
4170+
4171+
4172+-if (NOT DEFINED ENV{ROCM_PATH})
4173+-#Path to ROCm installation
4174+- set(ENV{ROCM_PATH} "/opt/rocm")
4175+-endif()
4176+-
4177+-# Install prefix
4178+-set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
4179++# setting ROCM_PATH causes problems with hipcc finding device libs
4180++#if (NOT DEFINED ENV{ROCM_PATH})
4181++##Path to ROCm installation
4182++# set(ENV{ROCM_PATH} "/opt/rocm")
4183++#endif()
4184++#
4185++## Install prefix
4186++#set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
4187+
4188+ if(WIN32)
4189+ set(CPACK_SOURCE_GENERATOR "ZIP")
4190+@@ -55,11 +56,12 @@ if(WIN32)
4191+ set(CPACK_PACKAGING_INSTALL_PREFIX "")
4192+ set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
4193+ else()
4194+- set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
4195+- #Adding CMAKE_PREFIX_PATH
4196+- if(NOT DEFINED CMAKE_PREFIX_PATH)
4197+- list( APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH}/llvm $ENV{ROCM_PATH})
4198+- endif()
4199++ # setting ROCM_PATH causes problems with hipcc finding device libs
4200++ # set(CMAKE_INSTALL_PREFIX "$ENV{ROCM_PATH}" CACHE PATH "Install path prefix, prepended onto install directories")
4201++ # #Adding CMAKE_PREFIX_PATH
4202++ # if(NOT DEFINED CMAKE_PREFIX_PATH)
4203++ # list( APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH}/llvm $ENV{ROCM_PATH})
4204++ # endif()
4205+ if(NOT CPACK_PACKAGING_INSTALL_PREFIX)
4206+ set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
4207+ endif()
4208diff --git a/debian/patches/0006-fix-doxygen-settings.patch b/debian/patches/0006-fix-doxygen-settings.patch
4209new file mode 100644
4210index 0000000..9db17eb
4211--- /dev/null
4212+++ b/debian/patches/0006-fix-doxygen-settings.patch
4213@@ -0,0 +1,31 @@
4214+From: Tim Flink <Tim.Flink@amd.com>
4215+Date: Wed, 21 Jan 2026 11:08:37 -0600
4216+Subject: fix invalid doxygen settings
4217+
4218+Doxygen will fail with the upstream settings. This patch makes minimal
4219+changes to the Doxygen settings such that the docs will build.
4220+
4221+Forwarded: https://github.com/ROCm/rocm-libraries/pull/3994
4222+---
4223+Index: bullwinkle-rocrand/docs/doxygen/Doxyfile
4224+===================================================================
4225+--- bullwinkle-rocrand.orig/docs/doxygen/Doxyfile
4226++++ bullwinkle-rocrand/docs/doxygen/Doxyfile
4227+@@ -1674,7 +1674,7 @@ DISABLE_INDEX = NO
4228+ # The default value is: NO.
4229+ # This tag requires that the tag GENERATE_HTML is set to YES.
4230+
4231+-GENERATE_TREEVIEW = NONE
4232++GENERATE_TREEVIEW = NO
4233+
4234+ # When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
4235+ # FULL_SIDEBAR option determines if the side bar is limited to only the treeview
4236+@@ -1971,7 +1971,7 @@ COMPACT_LATEX = NO
4237+ # The default value is: a4.
4238+ # This tag requires that the tag GENERATE_LATEX is set to YES.
4239+
4240+-PAPER_TYPE = a4wide
4241++PAPER_TYPE = a4
4242+
4243+ # The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
4244+ # that should be included in the LaTeX output. The package can be specified just
4245diff --git a/debian/patches/Hide-internal-symbols.patch b/debian/patches/Hide-internal-symbols.patch
4246index 8149956..15fab61 100644
4247--- a/debian/patches/Hide-internal-symbols.patch
4248+++ b/debian/patches/Hide-internal-symbols.patch
4249@@ -7,22 +7,22 @@ Subject: Hide internal symbols
4250 library/src/rng/system.hpp | 1 +
4251 2 files changed, 2 insertions(+)
4252
4253-diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp
4254-index 65aff0b..5611188 100644
4255---- a/library/src/rng/sobol.hpp
4256-+++ b/library/src/rng/sobol.hpp
4257-@@ -110,6 +110,7 @@ template<unsigned int OutputPerThread,
4258+Index: rocrand/library/src/rng/sobol.hpp
4259+===================================================================
4260+--- rocrand.orig/library/src/rng/sobol.hpp
4261++++ rocrand/library/src/rng/sobol.hpp
4262+@@ -111,6 +111,7 @@ template<unsigned int OutputPerThread,
4263 class T,
4264 class Distribution,
4265 int block_size>
4266 +__attribute__((visibility("hidden")))
4267- __global__ __launch_bounds__(block_size) void generate_sobol_kernel(
4268+ __global__ __launch_bounds__(block_size)
4269+ void generate_sobol_kernel(
4270 T*, const size_t, const Constant*, const Constant*, const unsigned int, Distribution)
4271- {}
4272-diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp
4273-index fee2704..6d6550c 100644
4274---- a/library/src/rng/system.hpp
4275-+++ b/library/src/rng/system.hpp
4276+Index: rocrand/library/src/rng/system.hpp
4277+===================================================================
4278+--- rocrand.orig/library/src/rng/system.hpp
4279++++ rocrand/library/src/rng/system.hpp
4280 @@ -257,6 +257,7 @@ namespace detail
4281 {
4282
4283diff --git a/debian/patches/series b/debian/patches/series
4284index e104293..649b46f 100644
4285--- a/debian/patches/series
4286+++ b/debian/patches/series
4287@@ -1,3 +1,5 @@
4288 Extend-docs-conf.py-for-offline-build.patch
4289 Hide-internal-symbols.patch
4290 Use-local-mathjax.patch
4291+0005-dont-set-rocm-path-in-cmake.patch
4292+0006-fix-doxygen-settings.patch
4293diff --git a/debian/rules b/debian/rules
4294index 2c5bd98..0071864 100755
4295--- a/debian/rules
4296+++ b/debian/rules
4297@@ -2,6 +2,12 @@
4298 export CXX=hipcc
4299 export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto
4300 export DEB_CXXFLAGS_MAINT_PREPEND = -gz
4301+# -Wl,--gc-sections:
4302+# Fixes linker errors ("relocation refers to a discarded section") when using gtest.
4303+# The linker's section garbage collection incorrectly discards needed code due to
4304+# a toolchain mismatch (ROCm's clang vs. the system's GCC). This flag forces the
4305+# linker to correctly re-evaluate dependencies, keeping the required sections.
4306+export DEB_LDFLAGS_MAINT_PREPEND = -Wl,--gc-sections
4307 export VERBOSE=1
4308 #export AMD_LOG_LEVEL=4
4309
4310@@ -15,6 +21,8 @@ CMAKE_FLAGS = \
4311 -DROCM_SYMLINK_LIBS=OFF \
4312 -DGPU_TARGETS="$(shell rocm-target-arch --sep ';')" \
4313 -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
4314+ -DCMAKE_VERBOSE_MAKEFILE=ON \
4315+ -DCMAKE_PREFIX_PATH=/usr/lib/llvm-21/lib \
4316 -DENABLE_INLINE_ASM=1
4317 ifeq (,$(filter nocheck,$(DEB_BUILD_PROFILES)))
4318 CMAKE_FLAGS += -DBUILD_TEST=ON
4319@@ -47,10 +55,9 @@ override_dh_auto_build-indep:
4320 ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
4321 http_proxy='127.0.0.1:9' \
4322 https_proxy='127.0.0.1:9' \
4323- ROCM_DOCS_REMOTE_DETAILS=http://localhost,local \
4324- latest_version= \
4325- release_candidate= \
4326- sphinx-build -v -b html docs rocrand-html
4327+ rocm-docs-build -r $(shell dpkg-parsechangelog -S Version | sed 's/[+-].*//')
4328+ rm -rf build/html/_static/fonts
4329+ rm -rf build/html/_static/vendor
4330 endif
4331
4332 override_dh_auto_test-indep:
4333diff --git a/docs/api-reference/cpp-api.rst b/docs/api-reference/cpp-api.rst
4334index 6db52ae..2460d72 100644
4335--- a/docs/api-reference/cpp-api.rst
4336+++ b/docs/api-reference/cpp-api.rst
4337@@ -1,6 +1,6 @@
4338 .. meta::
4339- :description: rocRAND documentation and API reference library
4340- :keywords: rocRAND, ROCm, API, documentation
4341+ :description: rocRAND C/C++ API reference
4342+ :keywords: rocRAND, ROCm, API, documentation, C, C++
4343
4344 .. _cpp-api:
4345
4346@@ -11,14 +11,15 @@ C/C++ API reference
4347 This chapter describes the rocRAND C and C++ API.
4348
4349 API index
4350-===========
4351+=========
4352
4353-To search an API, refer to the API :ref:`genindex`.
4354+To search the API, refer to the API :ref:`genindex`.
4355
4356 Device functions
4357 ================
4358
4359-To use the device API, include the file ``rocrand_kernel.h`` in files that define kernels that use rocRAND device functions. The typical usage of device functions consists of the following operations in the device kernel definition:
4360+To use the device API, include the file ``rocrand_kernel.h`` in files that define kernels that use rocRAND device functions.
4361+Follow these steps to use the device functions in the device kernel definition:
4362
4363 1. Create a new generator state object of the desired generator type.
4364
4365@@ -28,9 +29,10 @@ To use the device API, include the file ``rocrand_kernel.h`` in files that defin
4366
4367 4. Use the results.
4368
4369-Since the rocRAND device functions are invoked from inside the user kernel, the generated numbers can be used right away in the kernel without the need to copy them to the host memory.
4370+The rocRAND device functions are invoked from inside the user kernel.
4371+This means the generated numbers can be used immediately in the kernel without copying them to the host memory.
4372
4373-In the below example, random number generation is using the XORWOW generator.
4374+In the following example, random number generation uses the XORWOW generator.
4375
4376 .. code-block:: cpp
4377
4378@@ -62,7 +64,9 @@ In the below example, random number generation is using the XORWOW generator.
4379 C host API
4380 ==========
4381
4382-The C host API allows encapsulation of the internal generator state. Random numbers may be produced either on the host or device, depending on the created generator object. The typical sequence of operations for device generation consists of the following steps:
4383+The C host API allows encapsulation of the internal generator state.
4384+Random numbers can be produced either on the host or device, depending on the created generator object.
4385+The typical sequence of operations for device generation consists of the following steps:
4386
4387 1. Allocate memory on the device with ``hipMalloc``.
4388
4389@@ -76,9 +80,12 @@ The C host API allows encapsulation of the internal generator state. Random numb
4390
4391 6. Clean up with ``rocrand_destroy_generator`` and ``hipFree``.
4392
4393-To generate random numbers on the host, the memory allocation in step one should be made using a host memory allocation call. In step two ``rocrand_create_generator_host`` should be called instead. In the last step, the appropriate memory release should be made using the ``rocrand_destroy_generator``. All other calls work identically whether you are generating random numbers on the device or on the host CPU.
4394+To generate random numbers on the host, allocate the memory in the first step
4395+using a host memory allocation call. In step two, call ``rocrand_create_generator_host`` instead.
4396+In the last step, release the appropriate memory using ``rocrand_destroy_generator``.
4397+All other calls work identically whether you are generating random numbers on the device or on the host CPU.
4398
4399-In the example below, the C host API is used to generate 10 random floats using GPU capabilities.
4400+The example below uses the C host API to generate ten random floats using GPU capabilities.
4401
4402 .. code-block:: c
4403
4404@@ -118,9 +125,10 @@ In the example below, the C host API is used to generate 10 random floats using
4405 C++ host API wrapper
4406 ====================
4407
4408-The C++ host API wrapper provides resource management and an object-oriented interface for random number generation facilities.
4409+The C++ host API wrapper provides resource management and an object-oriented interface for random number
4410+generation facilities.
4411
4412-In the example below C++ host API wrapper is used to produce a random number using the default generation parameters.
4413+The example below uses the C++ host API wrapper to produce a random number using the default generation parameters.
4414
4415 .. code-block:: cpp
4416
4417diff --git a/docs/api-reference/data-type-support.rst b/docs/api-reference/data-type-support.rst
4418index faba618..20ae4fe 100644
4419--- a/docs/api-reference/data-type-support.rst
4420+++ b/docs/api-reference/data-type-support.rst
4421@@ -1,18 +1,24 @@
4422 .. meta::
4423- :description: rocRAND documentation and API reference library
4424- :keywords: rocRAND, ROCm, API, documentation, cuRAND
4425+ :description: Data type support for rocRAND documentation
4426+ :keywords: rocRAND, ROCm, API, documentation, cuRAND, data types
4427
4428 .. _data-type-support:
4429
4430 rocRAND data type support
4431 ******************************************
4432
4433+This topic discusses the various data types supported by rocRAND and provides a comparison
4434+with the data type support in NVIDIA CUDA cuRAND.
4435+
4436 Host API
4437 ========
4438
4439+This section covers the data types supported for the host API.
4440+
4441 Generator types
4442 ---------------
4443- .. list-table:: Supported generators on the host
4444+
4445+.. list-table:: Supported generators on the host
4446 :header-rows: 1
4447 :name: host-supported-generators
4448
4449@@ -81,22 +87,24 @@ Generator types
4450 - ✅
4451 - ✅
4452
4453-Only Sobol64, Scrambled Sobol64, ThreeFry 2x64-20 and ThreeFry 4x64-20 support generation of 64 bit :code:`unsigned long long int` integers, the other generators generate 32 bit :code:`unsigned int` integers.
4454+Only Sobol64, Scrambled Sobol64, ThreeFry 2x64-20, and ThreeFry 4x64-20 support the generation of 64-bit :code:`unsigned long long int` integers.
4455+The other generators generate 32-bit :code:`unsigned int` integers.
4456
4457 Seed types
4458 ----------
4459
4460-All generators can be seeded with :code:`unsigned long long`, however LFSR113 can additionally be seeded using an :code:`uint4`.
4461+All generators can be seeded with :code:`unsigned long long`. However, LFSR113 can also be seeded using a :code:`uint4`.
4462
4463 Output types
4464 ------------
4465
4466-The generators produce pseudo-random numbers chosen from a given distribution. The following distributions and corresponding output types are supported for the host API:
4467+The generators produce pseudo-random numbers chosen from a given distribution.
4468+The following distributions and corresponding output types are supported for the host API:
4469
4470 Uniform distribution
4471-""""""""""""""""""""
4472+^^^^^^^^^^^^^^^^^^^^^^
4473
4474- .. list-table:: Supported types for uniform distributions on the host
4475+.. list-table:: Supported types for uniform distributions on the host
4476 :header-rows: 1
4477 :name: host-types-uniform-distribution
4478
4479@@ -122,7 +130,7 @@ Uniform distribution
4480 - ✅
4481 *
4482 - :code:`unsigned long long`
4483- - 64 bit [#]_
4484+ - 64 bit (see note)
4485 - ✅
4486 - ✅
4487 *
4488@@ -141,14 +149,19 @@ Uniform distribution
4489 - ✅
4490 - ✅
4491
4492-Uniform distributions of integral types return a number between 0 and 2^(size in bits) - 1, whereas floating-point types return a number between 0.0 and 1.0, excluding 1.0.
4493+Uniform distributions of integral types return a number between 0 and 2^(size in bits) - 1,
4494+whereas floating-point types return a number between 0.0 and 1.0, excluding 1.0.
4495+
4496+.. note::
4497+
4498+ The generation of 64-bit :code:`unsigned long long` integers is only supported by 64-bit generators
4499+ (Scrambled Sobol 64, Sobol64, Threefry 2x64-20, and Threefry 4x64-20).
4500
4501 Poisson distribution
4502-"""""""""""""""""""""
4503+^^^^^^^^^^^^^^^^^^^^^^
4504
4505- .. list-table:: Supported types for the poisson distribution on the host
4506+.. list-table:: Supported types for the Poisson distribution on the host
4507 :header-rows: 1
4508- :name: host-types-poisson-distribution
4509
4510 *
4511 - Type
4512@@ -162,9 +175,9 @@ Poisson distribution
4513 - ✅
4514
4515 Normal distribution
4516-"""""""""""""""""""""
4517+^^^^^^^^^^^^^^^^^^^^^^
4518
4519- .. list-table:: Supported types for normal distributions on the host
4520+.. list-table:: Supported types for normal distributions on the host
4521 :header-rows: 1
4522 :name: host-types-normal-distribution
4523
4524@@ -190,9 +203,9 @@ Normal distribution
4525 - ✅
4526
4527 Log-normal distributions
4528-""""""""""""""""""""""""
4529+^^^^^^^^^^^^^^^^^^^^^^^^
4530
4531- .. list-table:: Supported types for log-normal distributions on the host
4532+.. list-table:: Supported types for log-normal distributions on the host
4533 :header-rows: 1
4534 :name: host-types-log-normal-distribution
4535
4536@@ -220,9 +233,13 @@ Log-normal distributions
4537 Device API
4538 ==========
4539
4540+This section covers the supported data types for the device API.
4541+
4542+
4543 Generator types
4544 ---------------
4545- .. list-table:: Supported generators on the device
4546+
4547+.. list-table:: Supported generators on the device
4548 :header-rows: 1
4549 :name: device-supported-generators
4550
4551@@ -294,40 +311,42 @@ Generator types
4552 Seed types
4553 ----------
4554
4555-All generators can be seeded with :code:`unsigned long long`, however LFSR113 can additionally be seeded using an :code:`uint4`.
4556+All generators can be seeded with :code:`unsigned long long`. However, LFSR113 can also be seeded using a :code:`uint4`.
4557
4558 Output types
4559 ------------
4560
4561-The generators produce pseudo-random numbers chosen from a given distribution. The following distributions and corresponding output types are supported for the device API, however not all generators support all types:
4562+The generators produce pseudo-random numbers chosen from a given distribution.
4563+The following distributions and corresponding output types are supported for the device API.
4564+However, not all generators support all types.
4565
4566
4567 Uniform distribution
4568-""""""""""""""""""""
4569+^^^^^^^^^^^^^^^^^^^^^^
4570
4571- .. list-table:: Supported types for uniform distributions on the device
4572+.. list-table:: Supported types for uniform distributions on the device
4573 :header-rows: 1
4574 :name: device-types-uniform-distribution
4575
4576 *
4577 - Type
4578 - rocRAND support
4579- - supported rocRAND generators
4580+ - Supported rocRAND generators
4581 - cuRAND support
4582 *
4583 - :code:`unsigned int`
4584 - ✅
4585- - all native 32-bit generators
4586+ - All native 32-bit generators
4587 - ✅
4588 *
4589 - :code:`unsigned long long int`
4590 - ✅
4591- - all native 64-bit generators
4592+ - All native 64-bit generators
4593 - ✅
4594 *
4595 - :code:`float`
4596 - ✅
4597- - all generators
4598+ - All generators
4599 - ✅
4600 *
4601 - :code:`float2`
4602@@ -342,7 +361,7 @@ Uniform distribution
4603 *
4604 - :code:`double`
4605 - ✅
4606- - all generators
4607+ - All generators
4608 - ✅
4609 *
4610 - :code:`double2`
4611@@ -357,21 +376,21 @@ Uniform distribution
4612
4613
4614 Normal distribution
4615-""""""""""""""""""""
4616+^^^^^^^^^^^^^^^^^^^^^^
4617
4618- .. list-table:: Supported types for normal distributions on the device
4619+.. list-table:: Supported types for normal distributions on the device
4620 :header-rows: 1
4621 :name: device-types-normal-distribution
4622
4623 *
4624 - Type
4625 - rocRAND support
4626- - supported rocRAND generators
4627+ - Supported rocRAND generators
4628 - cuRAND support
4629 *
4630 - :code:`float`
4631 - ✅
4632- - all generators
4633+ - All generators
4634 - ✅
4635 *
4636 - :code:`float2`
4637@@ -386,7 +405,7 @@ Normal distribution
4638 *
4639 - :code:`double`
4640 - ✅
4641- - all generators
4642+ - All generators
4643 - ✅
4644 *
4645 - :code:`double2`
4646@@ -400,21 +419,21 @@ Normal distribution
4647 - ❌
4648
4649 Log-normal distributions
4650-""""""""""""""""""""""""
4651+^^^^^^^^^^^^^^^^^^^^^^^^
4652
4653- .. list-table:: Supported types for log-normal distributions on the device
4654+.. list-table:: Supported types for log-normal distributions on the device
4655 :header-rows: 1
4656 :name: device-types-log-normal-distribution
4657
4658 *
4659 - Type
4660 - rocRAND support
4661- - supported rocRAND generators
4662+ - Supported rocRAND generators
4663 - cuRAND support
4664 *
4665 - :code:`float`
4666 - ✅
4667- - all generators
4668+ - All generators
4669 - ✅
4670 *
4671 - :code:`float2`
4672@@ -429,7 +448,7 @@ Log-normal distributions
4673 *
4674 - :code:`double`
4675 - ✅
4676- - all generators
4677+ - All generators
4678 - ✅
4679 *
4680 - :code:`double2`
4681@@ -443,16 +462,16 @@ Log-normal distributions
4682 - ❌
4683
4684 Poisson distributions
4685-"""""""""""""""""""""
4686+^^^^^^^^^^^^^^^^^^^^^^
4687
4688- .. list-table:: Supported types for poisson distributions on the device
4689+.. list-table:: Supported types for Poisson distributions on the device
4690 :header-rows: 1
4691 :name: device-types-poisson-distribution
4692
4693 *
4694 - Type
4695 - rocRAND support
4696- - supported rocRAND generators
4697+ - Supported rocRAND generators
4698 - cuRAND support
4699 *
4700 - :code:`unsigned int`
4701@@ -462,7 +481,7 @@ Poisson distributions
4702 *
4703 - :code:`unsigned long long int`
4704 - ✅
4705- - Sobol64, Scrambled sobol64
4706+ - Sobol64, Scrambled Sobol64
4707 - ❌
4708 *
4709 - :code:`uint4`
4710@@ -471,27 +490,24 @@ Poisson distributions
4711 - ✅
4712
4713 Discrete distributions
4714-""""""""""""""""""""""
4715+^^^^^^^^^^^^^^^^^^^^^^
4716
4717- .. list-table:: Supported types for discrete distributions on the device
4718+.. list-table:: Supported types for discrete distributions on the device
4719 :header-rows: 1
4720 :name: device-types-discrete-distribution
4721
4722 *
4723 - Type
4724 - rocRAND support
4725- - supported rocRAND generators
4726+ - Supported rocRAND generators
4727 - cuRAND support
4728 *
4729 - :code:`unsigned int`
4730 - ✅
4731- - all generators
4732+ - All generators
4733 - ✅
4734 *
4735 - :code:`uint4`
4736 - ✅
4737 - Philox 4x32-10
4738 - ✅ - only Philox - 4x32-10
4739-
4740-.. rubric:: Footnotes
4741-.. [#] Generation of 64 bit :code:`unsigned long long` integers is only supported by 64 bit generators (Scrambled Sobol 64, Sobol64, Threefry 2x64-20 and Threefry 4x64-20).
4742\ No newline at end of file
4743diff --git a/docs/api-reference/python-api.rst b/docs/api-reference/python-api.rst
4744index 189f9b0..ff7104c 100644
4745--- a/docs/api-reference/python-api.rst
4746+++ b/docs/api-reference/python-api.rst
4747@@ -1,6 +1,6 @@
4748 .. meta::
4749- :description: rocRAND documentation and API reference library
4750- :keywords: rocRAND, ROCm, API, documentation
4751+ :description: rocRAND Python API reference
4752+ :keywords: rocRAND, ROCm, API, documentation, Python
4753
4754 .. _python-api:
4755
4756@@ -13,7 +13,7 @@ This chapter describes the rocRAND Python module API.
4757 API index
4758 ------------
4759
4760-To search an API, refer to the API :ref:`genindex`.
4761+To search the API, see the API :ref:`genindex`.
4762
4763 .. default-domain:: py
4764 .. py:currentmodule:: rocrand
4765@@ -51,4 +51,4 @@ Utilities
4766
4767 .. autofunction:: rocrand.get_version
4768
4769-To search an API, refer to the :ref:`genindex` for all rocRAND APIs.
4770+To search the API, see the :ref:`genindex` for all rocRAND APIs.
4771diff --git a/docs/conceptual/curand-compatibility.rst b/docs/conceptual/curand-compatibility.rst
4772index d95e7a7..b75b48b 100644
4773--- a/docs/conceptual/curand-compatibility.rst
4774+++ b/docs/conceptual/curand-compatibility.rst
4775@@ -1,5 +1,5 @@
4776 .. meta::
4777- :description: rocRAND documentation and API reference library
4778+ :description: rocRAND compatibility with cuRAND
4779 :keywords: rocRAND, ROCm, API, documentation, cuRAND
4780
4781 .. _curand-compatibility:
4782@@ -10,7 +10,7 @@ cuRAND compatibility
4783
4784 The following table shows which rocRAND generators produce the exact same sequence as the equivalent NVIDIA CUDA cuRAND generator when using legacy ordering, given the same seed, number of dimensions, and offset.
4785
4786-.. table:: cuRAND Compatibility
4787+.. table:: cuRAND compatibility
4788 :widths: auto
4789
4790 ================= =====================
4791diff --git a/docs/conceptual/dynamic_ordering_configuration.rst b/docs/conceptual/dynamic_ordering_configuration.rst
4792index bba95db..c1f62c0 100644
4793--- a/docs/conceptual/dynamic_ordering_configuration.rst
4794+++ b/docs/conceptual/dynamic_ordering_configuration.rst
4795@@ -1,6 +1,6 @@
4796 .. meta::
4797- :description: rocRAND documentation and API reference library
4798- :keywords: rocRAND, ROCm, API, documentation
4799+ :description: rocRAND documentation for dynamic ordering configuration
4800+ :keywords: rocRAND, ROCm, API, documentation, dynamic ordering
4801
4802 .. _dynamic-ordering-configuration:
4803
4804@@ -8,85 +8,135 @@
4805 Kernel configurations for dynamic ordering
4806 =============================================================
4807
4808-Overview
4809-========
4810+When dynamic ordering (``ROCRAND_ORDERING_PSEUDO_DYNAMIC``) is set, rocRAND selects the number of blocks and threads
4811+to launch on the GPU to accommodate the specific GPU model best.
4812+Consequently, the number of allocated generators and the sequence of the generated numbers can also vary.
4813
4814-When dynamic ordering (``ROCRAND_ORDERING_PSEUDO_DYNAMIC``) is set, the number of blocks and threads launched on the GPU is selected such that it best accommodates the specific GPU model. As a consequence, the number of allocated generators and thereby the sequence of the generated numbers can also vary.
4815-
4816-The tuning, i.e. the selection of the most performant configuration for each GPU architecture can be performed in an automated manner. The necessary tools and benchmarks for the tuning are provided in the rocRAND repository. In the following, the process of the tuning is described.
4817+The tuning, which is the selection of the most performant configuration for each GPU architecture,
4818+can be performed in an automated manner. The necessary tools and benchmarks for the tuning are provided
4819+in the rocRAND repository. The following sections provide additional details about the tuning process.
4820
4821 .. _tuning-benchmark-build:
4822
4823 Building the tuning benchmarks
4824 ==============================
4825
4826-The principle of the tuning is very simple: the random number generation kernel is run for a list of kernel block size / kernel grid size combinations, and the fastest combination is selected as the dynamic ordering configuration for the particular device. rocRAND provides an executable target that runs the benchmarks with all these combinations: `benchmark_rocrand_tuning`. This target is disabled by default, and can be enabled and built by the following snippet.
4827+The principle behind the tuning is straightforward. The random number generation kernel is run
4828+for a list of kernel block size and kernel grid size combinations. The fastest combination
4829+is then selected as the dynamic ordering configuration for that particular device.
4830+rocRAND provides an executable target named ``benchmark_rocrand_tuning`` that runs the benchmarks with all these
4831+combinations.
4832+
4833+This target is disabled by default, but it can be enabled and built using the following snippet.
4834+Use the ``GPU_TARGETS`` variable to specify a comma-separated list of GPU architectures to build the benchmarks for.
4835+To determine the architecture of the installed GPU(s), run the ``rocminfo`` command
4836+and look for ``gfx`` in the "ISA Info" section.
4837
4838-Use the `GPU_TARGETS` variable to specify the comma-separated list of GPU architectures to build the benchmarks for. To acquire the architecture of the GPU(s) installed, run `rocminfo`, and look for `gfx` in the "ISA Info" section. ::
4839+.. code-block:: shell
4840
4841- $ cd rocRAND
4842- $ cmake -S . -B ./build
4843- -D BUILD_BENCHMARK=ON
4844- -D BUILD_BENCHMARK_TUNING=ON
4845- -D CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++
4846- -D GPU_TARGETS=gfx908
4847- $ cmake --build build --target benchmark_rocrand_tuning
4848+ cd rocm-libraries/projects/rocrand
4849+ cmake -S . -B ./build \
4850+ -D BUILD_BENCHMARK=ON \
4851+ -D BUILD_BENCHMARK_TUNING=ON \
4852+ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ \
4853+ -D GPU_TARGETS=gfx908
4854+ cmake --build build --target benchmark_rocrand_tuning
4855
4856-Additionally, the following CMake cache variables control the generation of the benchmarked matrix:
4857+The following CMake cache variables control the generation of the benchmarked matrix:
4858
4859 ========================================== ===============================================================
4860 Variable name Explanation
4861 ========================================== ===============================================================
4862 ``BENCHMARK_TUNING_THREAD_OPTIONS`` Comma-separated list of benchmarked block sizes
4863 ``BENCHMARK_TUNING_BLOCK_OPTIONS`` Comma-separated list of benchmarked grid sizes
4864-``BENCHMARK_TUNING_MIN_GRID_SIZE`` Configurations with fewer total number of threads are omitted
4865+``BENCHMARK_TUNING_MIN_GRID_SIZE`` Configurations with fewer total threads are omitted
4866 ========================================== ===============================================================
4867
4868-Note, that currently the benchmark tuning is only supported for AMD GPUs.
4869+.. note::
4870+
4871+ The benchmark tuning is only supported for AMD GPUs.
4872
4873 Using the number of multiprocessors as candidates
4874 -------------------------------------------------
4875
4876-Multiples of the number of multiprocessors of the GPU at hand are good candidates for ``BENCHMARK_TUNING_BLOCK_OPTIONS``. Running `rocRAND/scripts/config-tuning/get_tuned_grid_sizes.py` executes `rocminfo` to acquire the number of multiprocessors, and prints a comma-separated list of grid size candidates to the standard output.
4877+Multiples of the number of multiprocessors on the GPU being benchmarked are
4878+good candidate values for ``BENCHMARK_TUNING_BLOCK_OPTIONS``.
4879+The ``rocm-libraries/projects/rocrand/scripts/config-tuning/get_tuned_grid_sizes.py`` executable
4880+runs ``rocminfo`` to acquire the number of multiprocessors and prints a comma-separated list
4881+of grid size candidates to the standard output.
4882
4883 .. _tuning-benchmark-run:
4884
4885 Running the tuning benchmarks
4886 =============================
4887
4888-When the `benchmark_rocrand_tuning` target is built, the benchmarks can be run and the results can be collected for further processing. Since the benchmarks run for a longer time period, it is crucial that the GPU in use is thermally stable, i.e. the cooling must be adequate enough to keep the GPU at the preset clock rates without throttling. Additionally, make sure that no other workload is dispatched on the GPU concurrently. Otherwise the resulting dynamic ordering configs might not be the optimal ones. The full benchmark suite can be run with the following command: ::
4889+After building the ``benchmark_rocrand_tuning`` target, you can run the benchmarks
4890+and collect the results for further processing.
4891+The benchmarks can run for a long time, so it is crucial that the GPU in use is thermally stable.
4892+For instance, there must be adequate cooling to keep the GPU at the preset clock rates without throttling.
4893+Additionally, ensure that no other workload is concurrently dispatched to the GPU.
4894+Otherwise, the resulting dynamic ordering configurations might not be the optimal ones.
4895+Run the full benchmark suite using the following command:
4896
4897- $ cd ./build/benchmark/tuning
4898- $ ./benchmark_rocrand_tuning --benchmark_out_format=json --benchmark_out=rocrand_tuning_gfx908.json
4899+.. code-block:: shell
4900
4901-This executes the benchmarks and saves the benchmark results into the JSON file at `rocrand_tuning_gfx908.json`. If only a subset of the benchmarks needs to be run, e.g. for a single generator, the `--benchmark_filter=<regex>` option can be used. For example: `--benchmark_filter=".*philox.*"`.
4902+ cd ./build/benchmark/tuning
4903+ ./benchmark_rocrand_tuning --benchmark_out_format=json --benchmark_out=rocrand_tuning_gfx908.json
4904+
4905+This executes the benchmarks and saves the benchmark results to the ``rocrand_tuning_gfx908.json`` JSON file.
4906+To only run a subset of the benchmarks, such as for a single generator, use the ``--benchmark_filter=<regex>`` option,
4907+for example, ``--benchmark_filter=".*philox.*"``.
4908
4909 .. _tuning-benchmark-process:
4910
4911 Processing the benchmark results
4912 ================================
4913
4914-Once the benchmark results in JSON format from all architectures are present, the best configs are selected using the `rocRAND/scripts/config-tuning/select_best_config.py` script. Make sure that the prerequisite libraries are installed, by running ``pip install -r rocRAND/scripts/config-tuning/requirements.txt``.
4915+After the benchmark results from all architectures in JSON format are available, the best configurations
4916+are selected using the ``rocm-libraries/projects/rocrand/scripts/config-tuning/select_best_config.py`` script.
4917+Ensure the prerequisite libraries are installed by running the following command:
4918+
4919+.. code-block:: shell
4920
4921-Each rocRAND generator is capable of generating a multitude of output types and distributions. However, a single configuration is selected for each GPU architecture, which applies uniformly to all types and distributions. It is possible that the configuration that performs the best for one distribution is not the fastest for another. `select_best_config.py` selects the configuration that performs best **on average**. If, under the selected configuration, any type/distribution performs worse than ``ROCRAND_ORDERING_PSEUDO_DEFAULT``, a warning is printed to the standard output. The eventual decision about applying the configuration or not have to be made by the library's maintainers.
4922+ pip install -r rocm-libraries/projects/rocrand/scripts/config-tuning/requirements.txt
4923
4924-The main output of running `select_best_config.py` is a number of C++ header files that contain the definitions of the dynamic ordering config for the benchmarked architectures. These files are intended to be copied to the `rocRAND/library/src/rng/config` directory of the source tree to be checked in to the version control. The directory, to which the header files are written, can be specified with the `--out-dir` option.
4925+Each rocRAND generator can generate a multitude of output types and distributions.
4926+However, a single configuration is selected for each GPU architecture, which applies uniformly to all types
4927+and distributions. It's possible that the best performing configuration for one distribution
4928+isn't the fastest for another. ``select_best_config.py`` selects the configuration that performs best **on average**.
4929+If any type or distribution performs worse than ``ROCRAND_ORDERING_PSEUDO_DEFAULT`` under the selected configuration,
4930+a warning is printed to the standard output.
4931+The eventual decision about whether to apply the configuration is made by the library's maintainers.
4932
4933-To help humans comprehend the results, `select_best_config.py` can generate colorized diagrams to visually compare the performance of the configuration candidates. This can be invoked by passing the optional `--plot-out` argument, e.g. `--plot-out rocrand-tuning.svg`. This generates an SVG image for each GPU architecture the script has processed.
4934+The ``select_best_config.py`` script produces a set of C++ header files as output
4935+that contain the definitions of the dynamic ordering configuration for the benchmarked architectures.
4936+These files are intended to be copied to the ``rocm-libraries/projects/rocrand/library/src/rng/config`` directory of the source tree
4937+and checked in to the version control system. The directory where the header files are written to
4938+can be specified using the ``--out-dir`` option.
4939
4940-To put it all together, a potential invocation of the `select_best_config.py` script: ::
4941+For more readable results, ``select_best_config.py`` can generate colorized diagrams to visually
4942+compare the performance of the configuration candidates. To select this option, use the
4943+optional ``--plot-out`` argument, for example, ``--plot-out rocrand-tuning.svg``.
4944+This generates an SVG image for each GPU architecture processed by the script.
4945
4946- $ ./rocRAND/scripts/config-tuning/select_best_config.py --plot-out ./rocrand-tuning.svg --out-dir ./rocRAND/library/src/rng/config/ ./rocRAND/build/benchmark/tuning/rocrand_tuning_gfx908.json ./rocRAND/build/benchmark/tuning/rocrand_tuning_gfx1030.json
4947+The following invocation of the ``select_best_config.py`` script demonstrates all these options:
4948+
4949+.. code-block:: shell
4950+
4951+ ./rocm-libraries/projects/rocrand/scripts/config-tuning/select_best_config.py --plot-out ./rocrand-tuning.svg --out-dir ./rocm-libraries/projects/rocrand/library/src/rng/config/ ./rocm-libraries/projects/rocrand/build/benchmark/tuning/rocrand_tuning_gfx908.json ./rocm-libraries/projects/rocrand/build/benchmark/tuning/rocrand_tuning_gfx1030.json
4952
4953 Adding support for a new GPU architecture
4954 =========================================
4955
4956-The intended audience of this section is the developer, who is adding support to rocRAND for a new GPU architecture.
4957+This section is intended for developers who want to add rocRAND support for a new GPU architecture.
4958+To add support, follow this checklist:
4959+
4960+#. Update the hard-coded list of recognized architectures in the ``library/src/rng/config_types.hpp`` file. The following symbols must be updated accordingly:
4961+
4962+ * Enum class ``target_arch``: Lists the recognized architectures as an enumeration.
4963+ * Function ``get_device_arch``: Identifies the device architecture being compiled for in the device code.
4964+ * Function ``parse_gcn_arch``: Translates from the name of the architecture to the ``target_arch`` enum in the host code.
4965
4966-1. The list of the recognized architectures are hard-coded in source file `library/src/rng/config_types.hpp`. The following symbols have to be updated accordingly:
4967- * Enum class ``target_arch`` - lists the recognized architectures as an enumeration.
4968- * Function ``get_device_arch`` - recognizes the device that we compile to in device code.
4969- * Function ``parse_gcn_arch`` - dispatches from the name of the architecture to the ``target_arch`` enum in host code.
4970-2. The tuning benchmarks has to be compiled and run for the new architecture. See :ref:`tuning-benchmark-build` and :ref:`tuning-benchmark-run`.
4971-3. The benchmark results have to be processed by the provided `select_best_config.py` script. See :ref:`tuning-benchmark-process`.
4972-4. The resulting header files have to be merged with the ones that are checked in the version control in directory `rocRAND/library/src/rng/config`.
4973+#. Compile and run the tuning benchmarks for the new architecture. See :ref:`tuning-benchmark-build` and :ref:`tuning-benchmark-run`.
4974+#. Process the benchmark results using the ``select_best_config.py`` script. See :ref:`tuning-benchmark-process`.
4975+#. Add the resulting header files to version control in the ``rocm-libraries/projects/rocrand/library/src/rng/config`` directory.
4976diff --git a/docs/conceptual/programmers-guide.rst b/docs/conceptual/programmers-guide.rst
4977index 4ebc971..0dcce5c 100644
4978--- a/docs/conceptual/programmers-guide.rst
4979+++ b/docs/conceptual/programmers-guide.rst
4980@@ -1,163 +1,181 @@
4981 .. meta::
4982- :description: rocRAND documentation and API reference library
4983- :keywords: rocRAND, ROCm, API, documentation
4984+ :description: Programming guide for rocRAND
4985+ :keywords: rocRAND, ROCm, API, documentation, programming, generator types
4986
4987 .. _programmers-guide:
4988
4989-==================
4990-Programmer's guide
4991-==================
4992+*******************************************************************
4993+rocRAND programming guide
4994+*******************************************************************
4995+
4996+This topic discusses some issues to consider when using rocRAND in your application.
4997
4998 Generator types
4999-===============
5000+===============================
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches