Merge ~bullwinkle-team/ubuntu/+source/rocfft:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocfft:ubuntu/devel

Proposed by Bruno Bernardo de Moura
Status: Merged
Approved by: Andreas Hasenack
Approved revision: e915243b73be522caf7d38de60e5b0b4fd449e3d
Merged at revision: e915243b73be522caf7d38de60e5b0b4fd449e3d
Proposed branch: ~bullwinkle-team/ubuntu/+source/rocfft:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/rocfft:ubuntu/devel
Diff against target: 28364 lines (+10558/-6672)
153 files modified
.github/CODEOWNERS (+2/-2)
CHANGELOG.md (+41/-0)
CMakeLists.txt (+29/-48)
LICENSE.md (+19/-5)
README.md (+17/-10)
clients/CMakeLists.txt (+2/-2)
clients/bench/CMakeLists.txt (+4/-27)
clients/bench/bench.cpp (+36/-140)
clients/bench/bench.h (+197/-0)
clients/bench/dyna-bench.cpp (+23/-90)
clients/samples/fixed-16/CMakeLists.txt (+1/-1)
clients/samples/fixed-large/CMakeLists.txt (+1/-1)
clients/samples/mpi/CMakeLists.txt (+2/-1)
clients/samples/mpi/rocfft_mpi_example.cpp (+13/-8)
clients/samples/multi_gpu/CMakeLists.txt (+2/-1)
clients/samples/rocfft/CMakeLists.txt (+9/-1)
clients/samples/rocfft/rocfft_example_callback.cpp (+8/-1)
clients/tests/CMakeLists.txt (+109/-28)
clients/tests/accuracy_test.cpp (+1/-1)
clients/tests/accuracy_test_3D.cpp (+19/-14)
clients/tests/accuracy_test_callback.cpp (+21/-16)
clients/tests/accuracy_test_checkstride.cpp (+4/-0)
clients/tests/accuracy_tests_range.h (+10/-2)
clients/tests/buffer_hash_test.cpp (+22/-1)
clients/tests/callback_change_type.cpp (+22/-17)
clients/tests/default_callbacks_test.cpp (+51/-0)
clients/tests/gtest_main.cpp (+102/-84)
clients/tests/hermitian_test.cpp (+19/-0)
clients/tests/hipGraph_test.cpp (+9/-2)
clients/tests/multi_device_test.cpp (+126/-13)
clients/tests/multithread_test.cpp (+37/-0)
clients/tests/unit_test.cpp (+184/-15)
cmake/sqlite.cmake (+16/-15)
debian/changelog (+24/-0)
debian/control (+13/-13)
debian/gbp.conf (+1/-1)
debian/patches/Extend-docs-conf.py-for-offline-build.patch (+2/-4)
debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch (+21/-0)
debian/patches/series (+1/-1)
debian/rules (+6/-1)
debian/tests/control (+1/-1)
designdocs/codegen.rst (+1/-1)
designdocs/design.rst (+5/-5)
dev/null (+0/-1019)
docs/conf.py (+1/-2)
docs/doxygen/Doxyfile (+1/-1)
docs/how-to/enabling-logging.rst (+134/-0)
docs/how-to/load-store-callbacks.rst (+5/-0)
docs/how-to/working-with-rocfft.rst (+3/-3)
docs/index.rst (+7/-2)
docs/install/building-installing-rocfft.rst (+3/-2)
docs/license.md (+2/-0)
docs/sphinx/_toc.yml.in (+3/-1)
docs/sphinx/requirements.in (+1/-1)
docs/sphinx/requirements.txt (+135/-4)
docs/what-is-rocfft.rst (+1/-1)
install.sh (+0/-9)
library/solution_map/gfx908_rocfft_solution_map.dat (+0/-12)
library/solution_map/gfx90a_rocfft_solution_map.dat (+0/-12)
library/src/CMakeLists.txt (+46/-37)
library/src/assignment_policy.cpp (+5/-2)
library/src/compute_scheme.cpp (+7/-1)
library/src/device/CMakeLists.txt (+14/-60)
library/src/device/generator.py (+10/-6)
library/src/device/generator/CMakeLists.txt (+1/-0)
library/src/device/generator/fftgenerator.h (+0/-386)
library/src/device/generator/generator.cpp (+0/-31)
library/src/device/generator/generator.h (+24/-55)
library/src/device/generator/stockham_gen.cpp (+507/-45)
library/src/device/generator/stockham_gen.h (+40/-0)
library/src/device/generator/stockham_gen_2d.h (+180/-132)
library/src/device/generator/stockham_gen_base.h (+83/-85)
library/src/device/generator/stockham_gen_cr.h (+26/-18)
library/src/device/generator/stockham_gen_rc.h (+4/-6)
library/src/device/generator/stockham_gen_rr.h (+18/-185)
library/src/device/generator/stockham_pp_gen_cc.h (+1083/-0)
library/src/device/generator/stockham_pp_gen_rr.h (+588/-0)
library/src/device/kernel-generator-embed.h (+1/-0)
library/src/device/kernel-generator.py (+243/-878)
library/src/device/kernels/common.h (+5/-75)
library/src/device/kernels/configs/config_2d_single.py (+180/-0)
library/src/device/kernels/configs/config_lds.py (+2/-30)
library/src/device/kernels/configs/config_pp_3d.py (+33/-0)
library/src/device/kernels/configs/config_sbcc.py (+94/-0)
library/src/device/kernels/configs/config_sbcr.py (+42/-0)
library/src/device/kernels/configs/config_sbrc.py (+52/-0)
library/src/device/kernels/configs/config_sbrr.py (+508/-0)
library/src/device/kernels/device_enum.h (+106/-0)
library/src/device/solution-shipping.py (+3/-4)
library/src/fuse_shim.cpp (+1/-6)
library/src/include/compute_scheme.h (+3/-0)
library/src/include/enum_printer.h (+1/-1)
library/src/include/function_map_key.h (+125/-12)
library/src/include/function_pool.h (+242/-86)
library/src/include/kernel_launch.h (+0/-1066)
library/src/include/node_factory.h (+32/-15)
library/src/include/plan.h (+30/-2)
library/src/include/rocfft_mpi.h (+59/-2)
library/src/include/rtc_kernel.h (+1/-1)
library/src/include/rtc_stockham_gen.h (+44/-42)
library/src/include/tree_node.h (+330/-69)
library/src/include/tree_node_1D.h (+43/-4)
library/src/include/tree_node_2D.h (+1/-1)
library/src/include/tree_node_3D.h (+22/-7)
library/src/include/tree_node_bluestein.h (+5/-4)
library/src/include/tree_node_real.h (+2/-6)
library/src/node_factory.cpp (+214/-106)
library/src/plan.cpp (+919/-230)
library/src/powX.cpp (+18/-22)
library/src/rocfft_aot_helper.cpp (+30/-28)
library/src/rocfft_kernel_config_search.cpp (+80/-61)
library/src/rocfft_ostream.cpp (+12/-0)
library/src/rtc_bluestein_gen.cpp (+2/-0)
library/src/rtc_bluestein_kernel.cpp (+1/-3)
library/src/rtc_cache.cpp (+19/-8)
library/src/rtc_chirp_gen.cpp (+1/-0)
library/src/rtc_kernel.cpp (+14/-14)
library/src/rtc_realcomplex_gen.cpp (+3/-0)
library/src/rtc_realcomplex_kernel.cpp (+1/-0)
library/src/rtc_stockham_gen.cpp (+139/-64)
library/src/rtc_stockham_kernel.cpp (+40/-35)
library/src/rtc_transpose_gen.cpp (+1/-0)
library/src/rtc_transpose_kernel.cpp (+1/-0)
library/src/rtc_twiddle_gen.cpp (+2/-1)
library/src/transform.cpp (+4/-11)
library/src/tree_node.cpp (+252/-119)
library/src/tree_node_1D.cpp (+91/-91)
library/src/tree_node_2D.cpp (+2/-3)
library/src/tree_node_3D.cpp (+189/-146)
library/src/tree_node_bluestein.cpp (+19/-11)
library/src/tree_node_real.cpp (+29/-25)
library/src/twiddles.cpp (+6/-1)
rtest.xml (+3/-0)
scripts/perf/perflib/bench.py (+7/-4)
scripts/perf/perflib/utils.py (+91/-3)
scripts/perf/rocfft-perf (+70/-17)
scripts/perf/suites.py (+52/-38)
scripts/rocfft_mpi_test.py (+140/-0)
scripts/rocfftslurmtest.py (+256/-0)
scripts/rocslurm/__init__.py (+201/-0)
shared/accuracy_test.h (+41/-33)
shared/device_properties.h (+1/-0)
shared/environment.h (+2/-2)
shared/fft_params.h (+390/-73)
shared/gpubuf.h (+14/-10)
shared/hostbuf.h (+62/-34)
shared/mpi_worker.h (+408/-298)
shared/params_gen.h (+46/-17)
shared/printbuffer.h (+62/-19)
shared/rocfft_params.h (+161/-111)
shared/sys_mem.h (+81/-29)
shared/test_params.h (+2/-0)
toolchain-windows.cmake (+4/-3)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
Ubuntu Sponsors Pending
Review via email: mp+499354@code.launchpad.net

Description of the change

New upstream version 7.1.0

To post a comment you must log in.
Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

#TBD bug needs updating in d/changelog

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Question in line about bullwinkle.sources

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Just adding a comment because launchpad OOPSed in my previous two, because they included diff comments.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Hello Andreas, I added these salsa-ci changes just to use salsa with our PPA during development; they can indeed be dropped safely. Bruno, can you drop the relevant commits and rebase?

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Just force-pushed with lease; the salsa-ci related changes should be gone now.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Fixed the TBD in changelog as well

Revision history for this message
Igor Luppi (igorluppi) wrote (last edit ):

Where are the inline comments?
src:hiprand was accepted, this one is finally ready to archive.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Click on "show diff comments" comment text box.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

And then scroll... Yes, this UI is bad. In future PRs I'll refrain from doing that, and will just copy & paste the diff here in this box when I have a comment.

Revision history for this message
Igor Luppi (igorluppi) wrote :

Due to the fixup/rebase, 'show diff comments' is not working :/

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

I can see 2 inline comments on the first revision in the LP review, and they are addressed.

they were about TBD and salsa-ci

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Recovering them here:

b) bullwinkle.sources
It's the first time I see something like this. I see the corresponding change in d/salsa-ci.yml, though. Is this a pattern for adding external repositories to salsa ci? Is this the way to do it? I was hoping for something in salsa-ci.yml directly, and not just a reference to a sources file elsewhere in the packaging. Perhaps even an add-apt-repository command.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

a) TBD in changelog
TBD needs updating

That was it, now checking updates.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

--- a/debian/control
+++ b/debian/control
@@ -36,7 +36,7 @@ Rules-Requires-Root: no

 Package: librocfft0
 Section: libs
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Multi-Arch: same
 XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture}
 Depends: ${misc:Depends}, ${shlibs:Depends}
@@ -53,7 +53,7 @@ Description: ROCm library for computing Fast Fourier Transforms - library

 Package: librocfft-dev
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
 Recommends: libamdhip64-dev
 Suggests: librocfft-doc
@@ -70,7 +70,7 @@ Description: ROCm library for computing Fast Fourier Transforms - headers

 Package: librocfft0-tests
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Build-Profiles: <!nocheck>
 Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
 Description: ROCm library for computing Fast Fourier Transforms - tests

ppc64el is being dropped, what is the impact?

review: Needs Information
Revision history for this message
Talha Can Havadar (tchavadar) wrote (last edit ):

seems like no rdepends:

```
root@pretty-antelope:~# ./reverse-depends-source.sh rocfft
+ '[' -z rocfft ']'
+ SOURCE_PKG=rocfft
++ apt-cache showsrc rocfft
++ grep -m1 '^Binary:'
++ sed 's/^Binary: //'
++ tr , '\n'
++ tr -d ' '
+ BINARIES=$'librocfft0\nlibrocfft-dev\nlibrocfft0-tests\nlibrocfft-doc'
+ '[' -z $'librocfft0\nlibrocfft-dev\nlibrocfft0-tests\nlibrocfft-doc' ']'
+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft0 ==='
=== reverse-depends for: librocfft0 ===
+ reverse-depends -x -a ppc64el librocfft0
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft0
No reverse dependencies found
+ echo ''

+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft-dev ==='
=== reverse-depends for: librocfft-dev ===
+ reverse-depends -x -a ppc64el librocfft-dev
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft-dev
b'<p>Unknown package</p>'
+ echo ''

+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft0-tests ==='
=== reverse-depends for: librocfft0-tests ===
+ reverse-depends -x -a ppc64el librocfft0-tests
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft0-tests
No reverse dependencies found
+ echo ''

+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft-doc ==='
=== reverse-depends for: librocfft-doc ===
+ reverse-depends -x -a ppc64el librocfft-doc
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft-doc
No reverse dependencies found
+ echo ''

root@pretty-antelope:~# reverse-depends --arch ppc64el -x rocfft
b'<p>Unknown package</p>'
root@pretty-antelope:~# reverse-depends --arch ppc64el -x src:rocfft
No reverse dependencies found
```

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

I see librocfft0 was removed[1] as requested by bug LP: #2134241, together with many other packages. Maybe the reverse dependencies were in that list as well.

But right now, the state is as you described, so +1.

1. https://launchpad.net/ubuntu/resolute/ppc64el/librocfft0

review: Approve
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Sponsored:

Uploading rocfft_7.1.0-0ubuntu1.dsc
Uploading rocfft_7.1.0.orig.tar.gz
Uploading rocfft_7.1.0-0ubuntu1.debian.tar.xz
Uploading rocfft_7.1.0-0ubuntu1_source.buildinfo
Uploading rocfft_7.1.0-0ubuntu1_source.changes

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
2old mode 100755
3new mode 100644
4index eff670f..056a93f
5--- a/.github/CODEOWNERS
6+++ b/.github/CODEOWNERS
7@@ -1,8 +1,8 @@
8-* @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts
9+* @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd
10 # Documentation files
11 docs/ @ROCm/rocm-documentation
12 *.md @ROCm/rocm-documentation
13 *.rst @ROCm/rocm-documentation
14 .readthedocs.yaml @ROCm/rocm-documentation
15 # Header directory for Doxygen documentation
16-library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts
17+library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd
18diff --git a/.jenkins/application.groovy b/.jenkins/application.groovy
19deleted file mode 100644
20index f57030b..0000000
21--- a/.jenkins/application.groovy
22+++ /dev/null
23@@ -1,182 +0,0 @@
24-#!/usr/bin/env groovy
25-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
26-@Library('rocJenkins@pong') _
27-
28-// This is file for internal AMD use.
29-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
30-
31-import com.amd.project.*
32-import com.amd.docker.*
33-import java.nio.file.Path
34-
35-def runCI =
36-{
37- nodeDetails, jobName->
38-
39- def prj = new rocProject('rocFFT-internal', 'application')
40-
41- prj.defaults.ccache = true
42- prj.timeout.compile = 600
43- prj.timeout.test = 600
44- prj.libraryDependencies = ['rocFFT', 'hipFFT']
45-
46- // Define test architectures, optional rocm version argument is available
47- def nodes = new dockerNodes(nodeDetails, jobName, prj)
48-
49- boolean formatCheck = false
50-
51- def commonGroovy
52-
53- def compileCommand =
54- {
55- platform, project->
56- def getDependenciesCommand = ""
57- if (project.installLibraryDependenciesFromCI)
58- {
59- project.libraryDependencies.each
60- { libraryName ->
61- getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
62- }
63- }
64-
65- def command = """#!/usr/bin/env bash
66- set -ex
67- cd ${project.paths.project_build_prefix}
68- ${getDependenciesCommand}
69- git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git
70- cd Gromacs
71-
72- mkdir build_tmpi
73- cd build_tmpi
74- cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install ..
75- make
76- make install
77- cd ..
78-
79- mkdir build_mpi
80- cd build_mpi
81- cmake -DCMAKE_HIP_ARCHITECTURES=gfx908 -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install ..
82- make
83- make install
84- cd ..
85- """
86- platform.runCommand(this, command)
87- }
88-
89- def testCommand =
90- {
91- platform, project->
92-
93- def command = """#!/usr/bin/env bash
94- set -ex
95- cd ${project.paths.project_build_prefix}
96- cd Gromacs
97-
98- source gromacs-install/bin/GMXRC
99- gmx --version
100-
101- export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib
102- echo \$LD_LIBRARY_PATH
103-
104- git clone https://github.com/jychang48/benchmark-gromacs.git
105- cd benchmark-gromacs
106-
107- export GMX_MAXBACKUP=-1
108-
109- echo "* Threaded MPI ******************************************************************************************************"
110-
111- #ADH_DODEC
112- cd adh_dodec
113- tar zxf adh_dodec.tar.gz
114- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU
115- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs
116- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs
117- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs
118-
119- # STMV
120- cd ..
121- cd stmv/
122- tar zxf stmv.tar.gz
123- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200 # 1 GPU
124- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs
125- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400 # 4 GPUs
126- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs
127-
128- # CELLULOSE_NVE
129- cd ..
130- cd cellulose_nve/
131- tar zxf cellulose_nve.tar.gz
132- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU
133- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs
134- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs
135- gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs
136-
137- echo "* MPI ***************************************************************************************************************"
138-
139- # ADH_DODEC
140- cd ..
141- cd adh_dodec/
142- tar zxf adh_dodec.tar.gz
143- mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU
144- mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs
145- mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs
146- mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs
147-
148- # STMV
149- cd ..
150- cd stmv/
151- tar zxf stmv.tar.gz
152- mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr # 1 GPU
153- mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs
154- mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs
155- mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs
156-
157- # CELLULOSE_NVE
158- cd ..
159- cd cellulose_nve/
160- tar zxf cellulose_nve.tar.gz
161- mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU
162- mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs
163- mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs
164- mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs
165- """
166- platform.runCommand(this, command)
167- }
168-
169- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)
170-}
171-
172-ci: {
173- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
174-
175- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]]
176- propertyList = auxiliary.appendPropertyList(propertyList)
177-
178- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])]
179- jobNameList = auxiliary.appendJobNameList(jobNameList)
180-
181- propertyList.each
182- {
183- jobName, property->
184- if (urlJobName == jobName)
185- properties(auxiliary.addCommonProperties(property))
186- }
187-
188- jobNameList.each
189- {
190- jobName, nodeDetails->
191- if (urlJobName == jobName)
192- stage(jobName) {
193- runCI(nodeDetails, jobName)
194- }
195- }
196-
197- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
198- if(!jobNameList.keySet().contains(urlJobName))
199- {
200- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
201- stage(urlJobName) {
202- runCI([ubuntu18:['8gfx90a']], urlJobName)
203- }
204- }
205-}
206diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
207deleted file mode 100644
208index fb1397c..0000000
209--- a/.jenkins/common.groovy
210+++ /dev/null
211@@ -1,136 +0,0 @@
212-// This file is for internal AMD use.
213-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
214-
215-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false, boolean buildMPI=false)
216-{
217- project.paths.construct_build_prefix()
218-
219- def getDependenciesCommand = ""
220- if (project.installLibraryDependenciesFromCI)
221- {
222- project.libraryDependencies.each
223- { libraryName ->
224- getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
225- }
226- }
227-
228- String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
229- String warningArgs = '-DWERROR=ON'
230- String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON'
231- String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
232- String buildTypeDir = debug ? 'debug' : 'release'
233- String buildMPIArgs = buildMPI ? '-DCMAKE_PREFIX_PATH=/usr/local/openmpi -DROCFFT_MPI_ENABLE=ON' : ''
234- String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : ''
235- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
236- //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
237- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
238- String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
239-
240- def command = """#!/usr/bin/env bash
241- set -x
242- cd ${project.paths.project_build_prefix}
243- ${getDependenciesCommand}
244- set -e
245-
246- mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
247- ${auxiliary.gfxTargetParser()}
248- ${cmake} ${buildMPIArgs} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../..
249-
250- make -j\$(nproc)
251- sudo make install
252- """
253- platform.runCommand(this, command)
254-}
255-
256-
257-def runCompileClientCommand(platform, project, jobName, boolean debug=false)
258-{
259- project.paths.construct_build_prefix()
260-
261- String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
262- String warningArgs = '-DWERROR=ON'
263- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
264- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
265- String buildTypeArgClients = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
266- String cmakePrefixPathArg = "-DCMAKE_PREFIX_PATH=${project.paths.project_build_prefix}"
267-
268- def command = """#!/usr/bin/env bash
269- set -ex
270- cd ${project.paths.project_build_prefix}/clients
271- mkdir -p build && cd build
272- ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArgClients} ${clientArgs} ${warningArgs} ${cmakePrefixPathArg} ${amdgpuTargets} ../
273- make -j\$(nproc)
274- """
275- platform.runCommand(this, command)
276-}
277-
278-def runTestCommand (platform, project, boolean debug=false, gfilter='', extraArgs='')
279-{
280- String testBinaryName = 'rocfft-test'
281- String directory = debug ? 'debug' : 'release'
282-
283- String gfilterArg = ''
284- if (gfilter)
285- {
286- gfilterArg = "--gtest_filter=${gfilter}"
287- }
288-
289- def command = """#!/usr/bin/env bash
290- set -ex
291- cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging
292- ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} --gtest_color=yes --R 80 --nrand 10 ${extraArgs}
293- """
294- platform.runCommand(this, command)
295-}
296-
297-def runPackageCommand(platform, project, jobName, boolean debug=false)
298-{
299- String directory = debug ? 'debug' : 'release'
300- def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false)
301- platform.runCommand(this, packageHelper[0])
302- platform.archiveArtifacts(this, packageHelper[1])
303-
304- //trim temp files
305- def command = """#!/usr/bin/env bash
306- set -ex
307- cd ${project.paths.project_build_prefix}/build/${directory}/
308- rm -rf _CPack_Packages/
309- find -name '*.o' -delete
310- """
311- platform.runCommand(this, command)
312-}
313-
314-def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble)
315-{
316- project.paths.construct_build_prefix()
317-
318- // Don't build clients, since we're just testing if the library can build
319- String clientArgs = ''
320- String warningArgs = '-DWERROR=ON'
321- String buildTypeArg = '-DCMAKE_BUILD_TYPE=Release'
322- String buildTypeDir = 'release'
323-
324- String genPatternArgs = "-DGENERATOR_PATTERN=${genPattern}"
325- String manualSmallArgs = (genSmall != null) ? "-DGENERATOR_MANUAL_SMALL_SIZE=${genSmall}" : ''
326- String manualLargeArgs = (genLarge != null) ? "-DGENERATOR_MANUAL_LARGE_SIZE=${genLarge}" : ''
327- String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : ''
328- String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}"
329-
330- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
331- //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
332- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
333- String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
334-
335- def command = """#!/usr/bin/env bash
336- set -ex
337-
338- cd ${project.paths.project_build_prefix}
339- rm -rf build/${buildTypeDir}
340- mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
341- ${auxiliary.gfxTargetParser()}
342- ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../..
343- make -j\$(nproc)
344- """
345- platform.runCommand(this, command)
346-}
347-return this
348diff --git a/.jenkins/debug.groovy b/.jenkins/debug.groovy
349deleted file mode 100644
350index d0f903c..0000000
351--- a/.jenkins/debug.groovy
352+++ /dev/null
353@@ -1,89 +0,0 @@
354-#!/usr/bin/env groovy
355-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
356-@Library('rocJenkins@pong') _
357-
358-// This is file for internal AMD use.
359-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
360-
361-import com.amd.project.*
362-import com.amd.docker.*
363-import java.nio.file.Path
364-
365-def runCI =
366-{
367- nodeDetails, jobName->
368-
369- def prj = new rocProject('rocFFT-internal', 'Debug')
370-
371- prj.defaults.ccache = true
372- prj.timeout.compile = 600
373- prj.timeout.test = 600
374- prj.libraryDependencies = ['rocRAND','hipRAND']
375-
376- // Define test architectures, optional rocm version argument is available
377- def nodes = new dockerNodes(nodeDetails, jobName, prj)
378-
379- boolean formatCheck = false
380-
381- def commonGroovy
382-
383- def compileCommand =
384- {
385- platform, project->
386-
387- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
388- commonGroovy.runCompileCommand(platform, project, jobName, true)
389- }
390-
391- def testCommand =
392- {
393- platform, project->
394-
395- commonGroovy.runTestCommand(platform, project, true)
396- }
397-
398- def packageCommand =
399- {
400- platform, project->
401-
402- commonGroovy.runPackageCommand(platform, project, jobName, true)
403- }
404-
405-
406- buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
407-}
408-
409-ci: {
410- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
411-
412- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
413- propertyList = auxiliary.appendPropertyList(propertyList)
414-
415- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])]
416- jobNameList = auxiliary.appendJobNameList(jobNameList)
417-
418- propertyList.each
419- {
420- jobName, property->
421- if (urlJobName == jobName)
422- properties(auxiliary.addCommonProperties(property))
423- }
424-
425- jobNameList.each
426- {
427- jobName, nodeDetails->
428- if (urlJobName == jobName)
429- stage(jobName) {
430- runCI(nodeDetails, jobName)
431- }
432- }
433-
434- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
435- if(!jobNameList.keySet().contains(urlJobName))
436- {
437- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
438- stage(urlJobName) {
439- runCI([ubuntu16:['any']], urlJobName)
440- }
441- }
442-}
443diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy
444deleted file mode 100644
445index ee508db..0000000
446--- a/.jenkins/extended.groovy
447+++ /dev/null
448@@ -1,120 +0,0 @@
449-#!/usr/bin/env groovy
450-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
451-@Library('rocJenkins@pong') _
452-
453-// This is file for internal AMD use.
454-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
455-
456-import com.amd.project.*
457-import com.amd.docker.*
458-import java.nio.file.Path
459-
460-def runBitwiseReproTest (platform, project, boolean debug=false, gfilter='', reprodb='', int repeat=1)
461-{
462- String testBinaryName = 'rocfft-test'
463- String directory = debug ? 'debug' : 'release'
464-
465- String gfilterArg = ''
466- if (gfilter)
467- {
468- gfilterArg = "--gtest_filter=${gfilter}"
469- }
470-
471- String reproDbArg = ''
472- if (reprodb)
473- {
474- reproDbArg = "--repro-db=${reprodb}"
475- }
476-
477- String repeatArg = ''
478- if (repeat > 1)
479- {
480- repeatArg = "--gtest_repeat=${repeat}"
481- }
482-
483- def command = """#!/usr/bin/env bash
484- set -ex
485- cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging
486- ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} ${reproDbArg} ${repeatArg} --gtest_color=yes --R 80 --nrand 10
487- """
488- platform.runCommand(this, command)
489-}
490-
491-def runCI =
492-{
493- nodeDetails, jobName->
494-
495- def prj = new rocProject('rocFFT-internal', 'Extended')
496-
497- prj.defaults.ccache = true
498- prj.timeout.compile = 600
499- prj.timeout.test = 600
500- prj.libraryDependencies = ['rocRAND','hipRAND']
501-
502- // Define test architectures, optional rocm version argument is available
503- def nodes = new dockerNodes(nodeDetails, jobName, prj)
504-
505- boolean formatCheck = false
506-
507- def commonGroovy
508-
509- def compileCommand =
510- {
511- platform, project->
512-
513- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
514- commonGroovy.runCompileCommand(platform, project, jobName)
515- commonGroovy.runCompileClientCommand(platform, project, jobName, false)
516- }
517-
518- def testCommand =
519- {
520- platform, project->
521-
522- runBitwiseReproTest(platform, project, false, "*pow2_1D/bitwise_repro_test*", 'bitwise_repro.db', 2)
523- }
524-
525- def packageCommand =
526- {
527- platform, project->
528-
529- commonGroovy.runPackageCommand(platform, project, jobName)
530- }
531-
532- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
533-}
534-
535-ci: {
536- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
537-
538- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
539- propertyList = auxiliary.appendPropertyList(propertyList)
540-
541- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
542- jobNameList = auxiliary.appendJobNameList(jobNameList)
543-
544- propertyList.each
545- {
546- jobName, property->
547- if (urlJobName == jobName)
548- properties(auxiliary.addCommonProperties(property))
549- }
550-
551- jobNameList.each
552- {
553- jobName, nodeDetails->
554- if (urlJobName == jobName)
555- stage(jobName) {
556- runCI(nodeDetails, jobName)
557- }
558- }
559-
560- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
561- if(!jobNameList.keySet().contains(urlJobName))
562- {
563- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
564- stage(urlJobName) {
565- runCI([ubuntu18:['gfx906']], urlJobName)
566- }
567- }
568-}
569diff --git a/.jenkins/multigpu.groovy b/.jenkins/multigpu.groovy
570deleted file mode 100644
571index 68e625f..0000000
572--- a/.jenkins/multigpu.groovy
573+++ /dev/null
574@@ -1,93 +0,0 @@
575-#!/usr/bin/env groovy
576-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
577-@Library('rocJenkins@pong') _
578-
579-// This is file for internal AMD use.
580-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
581-
582-import com.amd.project.*
583-import com.amd.docker.*
584-import java.nio.file.Path
585-
586-def runCI =
587-{
588- nodeDetails, jobName->
589-
590- def prj = new rocProject('rocFFT-internal', 'multigpu')
591-
592- prj.defaults.ccache = true
593- prj.timeout.compile = 600
594- prj.timeout.test = 600
595- prj.libraryDependencies = ['rocRAND','hipRAND']
596-
597- // Define test architectures, optional rocm version argument is available
598- def nodes = new dockerNodes(nodeDetails, jobName, prj)
599-
600- boolean formatCheck = false
601-
602- def commonGroovy
603-
604- def compileCommand =
605- {
606- platform, project->
607-
608- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
609- // build with MPI enabled
610- commonGroovy.runCompileCommand(platform, project, jobName, false, false, true)
611- commonGroovy.runCompileClientCommand(platform, project, jobName, false)
612- }
613-
614- def testCommand =
615- {
616- platform, project->
617-
618- //run single-process multi-GPU tests
619- commonGroovy.runTestCommand(platform, project, false, "*multi_gpu*")
620- // run MPI tests across 4 ranks
621- commonGroovy.runTestCommand(platform, project, false, "*multi_gpu*", '--mp_lib mpi --mp_ranks 4 --mp_launch "/usr/local/openmpi/bin/mpirun --np 4 ./rocfft_mpi_worker"')
622- }
623-
624- def packageCommand =
625- {
626- platform, project->
627-
628- // don't package anything - we're not distributing MPI-enabled rocFFT so we don't want to expose any MPI-enabled packages anywhere that other builds can mistakenly pick up
629- }
630-
631- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
632-}
633-
634-ci: {
635- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
636-
637- def propertyList = ["main":[pipelineTriggers([cron('0 1 * * 0')])]]
638- propertyList = auxiliary.appendPropertyList(propertyList)
639-
640- def jobNameList = ["main":([ubuntu20:['8gfx90a']])]
641- jobNameList = auxiliary.appendJobNameList(jobNameList)
642-
643- propertyList.each
644- {
645- jobName, property->
646- if (urlJobName == jobName)
647- properties(auxiliary.addCommonProperties(property))
648- }
649-
650- jobNameList.each
651- {
652- jobName, nodeDetails->
653- if (urlJobName == jobName)
654- stage(jobName) {
655- runCI(nodeDetails, jobName)
656- }
657- }
658-
659- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
660- if(!jobNameList.keySet().contains(urlJobName))
661- {
662- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
663- stage(urlJobName) {
664- runCI([ubuntu20:['8gfx90a']], urlJobName)
665- }
666- }
667-}
668diff --git a/.jenkins/performance.groovy b/.jenkins/performance.groovy
669deleted file mode 100644
670index d488f59..0000000
671--- a/.jenkins/performance.groovy
672+++ /dev/null
673@@ -1,231 +0,0 @@
674-#!/usr/bin/env groovy
675-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
676-@Library('rocJenkins@pong') _
677-
678-// This is file for internal AMD use.
679-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
680-
681-import com.amd.project.*
682-import com.amd.docker.*
683-import java.nio.file.Path
684-
685-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false)
686-{
687- def reference = (env.BRANCH_NAME ==~ /PR-\d+/) ? 'develop' : 'master'
688-
689- project.paths.construct_build_prefix()
690-
691- def getDependenciesCommand = ""
692- if (project.installLibraryDependenciesFromCI)
693- {
694- project.libraryDependencies.each
695- { libraryName ->
696- getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
697- }
698- }
699-
700- dir("${project.paths.project_build_prefix}/ref-repo") {
701- git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git'
702- }
703-
704- String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
705- String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_BENCH=OFF'
706- String warningArgs = '-DWERROR=ON'
707- String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
708- String buildTypeDir = debug ? 'debug' : 'release'
709- String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
710- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
711-
712- def command = """#!/usr/bin/env bash
713- set -x
714- cd ${project.paths.project_build_prefix}
715- ${getDependenciesCommand}
716- set -e
717- mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
718- ${auxiliary.gfxTargetParser()}
719- ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg} ${clientArgs} ${warningArgs} ${rtcBuildCache} ../..
720- make -j\$(nproc)
721- popd
722- cd ref-repo
723- mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
724- ${auxiliary.gfxTargetParser()}
725- ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg} ${noclientArgs} ${warningArgs} ${rtcBuildCache} ../..
726- make -j\$(nproc)
727- """
728- platform.runCommand(this, command)
729-}
730-
731-def runTestCommand (platform, project, boolean debug=false)
732-{
733- String sudo = auxiliary.sudo(platform.jenkinsLabel)
734- String directory = debug ? 'debug' : 'release'
735-
736- def dataTypes = ['single', 'double']
737- for (def dataType in dataTypes)
738- {
739- def command = """#!/usr/bin/env bash
740- set -ex
741- pwd
742- cd ${project.paths.project_build_prefix}
743- export ROCFFT_RTC_CACHE_PATH="\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
744- ./scripts/perf/rocfft-perf run --bench ./build/${directory}/clients/staging/dyna-rocfft-bench --lib ./ref-repo/build/${directory}/library/src/librocfft.so --lib ./build/${directory}/library/src/librocfft.so --out ./${dataType}_ref --out ./${dataType}_change --device 0 --precision ${dataType} --suite benchmarks
745- ls ${dataType}_change
746- ls ${dataType}_ref
747- mkdir ${dataType}_results
748- ./scripts/perf/rocfft-perf post ./${dataType}_results ./${dataType}_ref ./${dataType}_change
749- ls ${dataType}_change/*.mdat
750- ./scripts/perf/rocfft-perf html ./${dataType}_results ./${dataType}_ref ./${dataType}_change
751- mv ${dataType}_results/figs.html ${dataType}_results/figs_${platform.gpu}.html
752- """
753- platform.runCommand(this, command)
754-
755- archiveArtifacts "${project.paths.project_build_prefix}/${dataType}_results/*.html"
756- publishHTML([allowMissing: false,
757- alwaysLinkToLastBuild: false,
758- keepAll: false,
759- reportDir: "${project.paths.project_build_prefix}/${dataType}_results",
760- reportFiles: "figs_${platform.gpu}.html",
761- reportName: "${dataType}-precision-${platform.gpu}",
762- reportTitles: "${dataType}-precision-${platform.gpu}"])
763- }
764-
765-
766- withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')])
767- {
768- platform.runCommand(
769- this,
770- """
771- cd ${project.paths.build_prefix}
772- git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.2.0
773- cd rocPTS
774- python3 -m pip install build
775- python3 -m build
776- python3 -m pip install .
777- """
778- )
779- }
780- writeFile(
781- file: project.paths.project_build_prefix + "/record_pts.py",
782- text: libraryResource("com/amd/scripts/record_pts.py"))
783- def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : ""
784- def command = """#!/usr/bin/env bash
785- set -ex
786- cd ${project.paths.project_build_prefix}
787- ${setupBranch}
788- git checkout \$BRANCH_NAME
789- benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d)
790- mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref
791- cp -uf ./*_change/* \${benchmark_folder}/all_change
792- cp -uf ./*_ref/* \${benchmark_folder}/all_ref
793- python3 ./record_pts.py \
794- --dataset-path \$PWD/\${benchmark_folder} \
795- --reference-dataset all_ref \
796- --new-dataset all_change \
797- --new-build . \
798- --reference-build ./ref-repo\
799- -v 5.5 \
800- -l pts_rocfft_benchmark_data-v1.0.0
801- """
802- withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')])
803- {
804- platform.runCommand(this, command)
805- }
806-}
807-
808-def runCI =
809-{
810- nodeDetails, jobName->
811-
812- def prj = new rocProject('rocFFT-internal', 'Performance')
813-
814- prj.defaults.ccache = true
815- prj.timeout.compile = 600
816- prj.timeout.test = 600
817- prj.libraryDependencies = ['rocRAND','hipRAND']
818-
819- // Define test architectures, optional rocm version argument is available
820- def nodes = new dockerNodes(nodeDetails, jobName, prj)
821-
822- boolean formatCheck = false
823-
824- def commonGroovy
825- def gpus = []
826- def dataTypes = ['single', 'double']
827-
828- def compileCommand =
829- {
830- platform, project->
831-
832- gpus.add(platform.gpu)
833- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
834- runCompileCommand(platform, project, jobName)
835- }
836-
837- def testCommand =
838- {
839- platform, project->
840-
841- runTestCommand(platform, project)
842- }
843-
844- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)
845- def commentString = "Performance reports: \n" + "Commit hashes: \n"
846- for (parentHash in prj.gitParentHashes) {
847- commentString += "${parentHash} \n"
848- }
849- for (gpu in gpus) {
850- for (dataType in dataTypes) {
851- commentString += "[${gpu} ${dataType} report](${JOB_URL}/${dataType}-precision-${gpu})\n"
852- }
853- }
854-
855- if (env.BRANCH_NAME ==~ /PR-\d+/)
856- {
857- boolean commentExists = false
858- for (prComment in pullRequest.comments) {
859- if (prComment.body.contains("Performance reports:"))
860- {
861- commentExists = true
862- prComment.body = commentString
863- }
864- }
865- if (!commentExists) {
866- def comment = pullRequest.comment(commentString)
867- }
868- }
869-}
870-
871-ci: {
872- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
873-
874- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
875- propertyList = auxiliary.appendPropertyList(propertyList)
876-
877- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900','gfx906']])]
878- jobNameList = auxiliary.appendJobNameList(jobNameList)
879-
880- propertyList.each
881- {
882- jobName, property->
883- if (urlJobName == jobName)
884- properties(auxiliary.addCommonProperties(property))
885- }
886-
887- jobNameList.each
888- {
889- jobName, nodeDetails->
890- if (urlJobName == jobName)
891- stage(jobName) {
892- runCI(nodeDetails, jobName)
893- }
894- }
895-
896- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
897- if(!jobNameList.keySet().contains(urlJobName))
898- {
899- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
900- stage(urlJobName) {
901- runCI([ubuntu18:['gfx906']], urlJobName)
902- }
903- }
904-}
905diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
906deleted file mode 100644
907index b516520..0000000
908--- a/.jenkins/staticanalysis.groovy
909+++ /dev/null
910@@ -1,110 +0,0 @@
911-#!/usr/bin/env groovy
912-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
913-@Library('rocJenkins@pong') _
914-
915-// This is file for internal AMD use.
916-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
917-
918-import com.amd.project.*
919-import com.amd.docker.*
920-import java.nio.file.Path
921-
922-def runCompileCommand(platform, project, jobName, boolean debug=false)
923-{
924- project.paths.construct_build_prefix()
925-
926- def yapfCommand = """#!/usr/bin/env bash
927- set -x
928- cd ${project.paths.project_build_prefix}
929- yapf --version
930- find . -iname '*.py' \
931- | grep -v 'build/' \
932- | xargs -n 1 -P 1 -I{} -t sh -c 'yapf --style pep8 {} | diff - {}'
933- """
934-
935- platform.runCommand(this, yapfCommand)
936-}
937-
938-def runCI =
939-{
940- nodeDetails, jobName->
941-
942- def prj = new rocProject('rocFFT-internal', 'StaticAnalysis')
943- prj.libraryDependencies = ['rocRAND','hipRAND']
944-
945- // Define test architectures, optional rocm version argument is available
946- def nodes = new dockerNodes(nodeDetails, jobName, prj)
947-
948- boolean formatCheck = true
949- boolean staticAnalysis = true
950-
951- def compileCommand =
952- {
953- platform, project->
954-
955- runCompileCommand(platform, project, jobName, false)
956- }
957-
958- buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
959-
960-
961- def kernelSubsetPrj = new rocProject('rocFFT-internal', 'BuildKernelSubset')
962-
963- def nodesForPrj2 = new dockerNodes(nodeDetails, jobName, kernelSubsetPrj)
964-
965- def commonGroovy
966-
967- def compileSubsetCommand =
968- {
969- platform, project->
970-
971- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
972-
973- // build pattern pow2,pow7 no manual small and large, dp only
974- commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'pow2,pow7', null, null, true)
975-
976- // build large sizes, dp only
977- commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'large', null, null, true)
978-
979- // build 2D sizes, dp only
980- commonGroovy.runSubsetBuildCommand(platform, project, jobName, '2D', null, null, true)
981-
982- // put an extra unsupported size(10) in manual large to see if it will be filtered correctly
983- commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', null, '10,50,100,200,336', true)
984-
985- // put an extra unsupported size(23) in manual small to see if it will be filtered correctly
986- commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23,1024', '10,50,100,200,336', true)
987-
988- // all the manual sizes are not supported
989- //commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23', '10', true)
990- }
991-
992- buildProject(kernelSubsetPrj , formatCheck, nodesForPrj2.dockerArray, compileSubsetCommand, null, null)
993-}
994-
995-ci: {
996- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
997-
998- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])],
999- "rocm-docker":[]]
1000- propertyList = auxiliary.appendPropertyList(propertyList)
1001-
1002- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]]
1003- jobNameList = auxiliary.appendJobNameList(jobNameList)
1004-
1005- propertyList.each
1006- {
1007- jobName, property->
1008- if (urlJobName == jobName)
1009- properties(auxiliary.addCommonProperties(property))
1010- }
1011-
1012- jobNameList.each
1013- {
1014- jobName, nodeDetails->
1015- if (urlJobName == jobName)
1016- stage(jobName) {
1017- runCI(nodeDetails, jobName)
1018- }
1019- }
1020-}
1021diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy
1022deleted file mode 100644
1023index dcc8dd5..0000000
1024--- a/.jenkins/staticlibrary.groovy
1025+++ /dev/null
1026@@ -1,88 +0,0 @@
1027-#!/usr/bin/env groovy
1028-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
1029-@Library('rocJenkins@pong') _
1030-
1031-// This is file for internal AMD use.
1032-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
1033-
1034-import com.amd.project.*
1035-import com.amd.docker.*
1036-import java.nio.file.Path
1037-
1038-def runCI =
1039-{
1040- nodeDetails, jobName->
1041-
1042- def prj = new rocProject('rocFFT-internal', 'StaticLibrary')
1043-
1044- prj.defaults.ccache = true
1045- prj.timeout.compile = 600
1046- prj.timeout.test = 600
1047- prj.libraryDependencies = ['rocRAND','hipRAND']
1048-
1049- // Define test architectures, optional rocm version argument is available
1050- def nodes = new dockerNodes(nodeDetails, jobName, prj)
1051-
1052- boolean formatCheck = false
1053-
1054- def commonGroovy
1055-
1056- def compileCommand =
1057- {
1058- platform, project->
1059-
1060- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
1061- commonGroovy.runCompileCommand(platform, project, jobName, false, true)
1062- }
1063-
1064- def testCommand =
1065- {
1066- platform, project->
1067-
1068- commonGroovy.runTestCommand(platform, project)
1069- }
1070-
1071- def packageCommand =
1072- {
1073- platform, project->
1074-
1075- commonGroovy.runPackageCommand(platform, project, jobName)
1076- }
1077-
1078- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
1079-}
1080-
1081-ci: {
1082- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
1083-
1084- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
1085- propertyList = auxiliary.appendPropertyList(propertyList)
1086-
1087- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900']])]
1088- jobNameList = auxiliary.appendJobNameList(jobNameList)
1089-
1090- propertyList.each
1091- {
1092- jobName, property->
1093- if (urlJobName == jobName)
1094- properties(auxiliary.addCommonProperties(property))
1095- }
1096-
1097- jobNameList.each
1098- {
1099- jobName, nodeDetails->
1100- if (urlJobName == jobName)
1101- stage(jobName) {
1102- runCI(nodeDetails, jobName)
1103- }
1104- }
1105-
1106- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
1107- if(!jobNameList.keySet().contains(urlJobName))
1108- {
1109- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
1110- stage(urlJobName) {
1111- runCI([ubuntu16:['gfx906']], urlJobName)
1112- }
1113- }
1114-}
1115diff --git a/CHANGELOG.md b/CHANGELOG.md
1116index 5c4bc51..659b4c7 100644
1117--- a/CHANGELOG.md
1118+++ b/CHANGELOG.md
1119@@ -3,6 +3,47 @@
1120 Documentation for rocFFT is available at
1121 [https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/).
1122
1123+## rocFFT 1.0.35 for ROCm 7.1.0
1124+
1125+### Optimized
1126+
1127+* Implemented single-kernel plans for some 2D problem sizes, on devices with at least 160KiB of LDS.
1128+* Improved performance of unit-strided, complex-interleaved, forward/inverse FFTs for lengths:
1129+ - (64,64,128)
1130+ - (64,64,52)
1131+ - (60,60,60)
1132+ - (32,32,128)
1133+ - (32,32,64)
1134+ - (64,32,128)
1135+* Improved performance of 3D MPI pencil decompositions by using sub-communicators for global transpose operations.
1136+
1137+## rocFFT 1.0.34 for ROCm 7.0.0
1138+
1139+### Added
1140+
1141+* Added gfx950 support.
1142+
1143+### Removed
1144+
1145+* Removed rocfft-rider legacy compatibility from clients.
1146+* Removed support for the gfx940 and gfx941 targets from the client programs.
1147+
1148+### Optimized
1149+
1150+* Removed unnecessary HIP event/stream allocation and synchronization during MPI transforms.
1151+* Implemented single-precision 1D kernels for lengths:
1152+ - 4704
1153+ - 5488
1154+ - 6144
1155+ - 6561
1156+ - 8192
1157+* Implemented single-kernel plans for some large 1D problem sizes, on devices with at least 160KiB of LDS.
1158+
1159+### Resolved issues
1160+
1161+* Fixed kernel faults on multi-device transforms that gather to a single device, when the input/output bricks are not
1162+ contiguous.
1163+
1164 ## rocFFT 1.0.32 for ROCm 6.4.0
1165
1166 ### Changed
1167diff --git a/CMakeLists.txt b/CMakeLists.txt
1168index 75e5c36..c724aff 100644
1169--- a/CMakeLists.txt
1170+++ b/CMakeLists.txt
1171@@ -1,5 +1,5 @@
1172 # #############################################################################
1173-# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
1174+# Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved.
1175 #
1176 # Permission is hereby granted, free of charge, to any person obtaining a copy
1177 # of this software and associated documentation files (the "Software"), to deal
1178@@ -52,34 +52,24 @@ project( rocfft LANGUAGES CXX C )
1179 # This finds the rocm-cmake project, and installs it if not found
1180 # rocm-cmake contains common cmake code for rocm projects to help setup and install
1181 set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
1182-find_package( ROCM 0.7.3 CONFIG QUIET PATHS ${ROCM_PATH} /opt/rocm )
1183-if( NOT ROCM_FOUND )
1184- set( rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download" )
1185- file( DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip
1186- ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log)
1187-
1188- list(GET status 0 status_code)
1189- list(GET status 1 status_string)
1190-
1191- if(NOT status_code EQUAL 0)
1192- message(FATAL_ERROR "error: downloading
1193- 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed
1194- status_code: ${status_code}
1195- status_string: ${status_string}
1196- log: ${log}
1197- ")
1198- endif()
1199+find_package( ROCmCMakeBuildTools PATHS ${ROCM_PATH} /opt/rocm )
1200+if( NOT ROCmCMakeBuildTools_FOUND )
1201+ include( FetchContent )
1202+
1203+ FetchContent_Declare( rocm_cmake_local
1204+ GIT_REPOSITORY https://github.com/ROCm/rocm-cmake
1205+ GIT_TAG rocm-6.4.1
1206+ GIT_SHALLOW ON
1207+ )
1208
1209- message(STATUS "downloading... done")
1210+ FetchContent_MakeAvailable( rocm_cmake_local )
1211
1212- execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzvf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
1213- WORKING_DIRECTORY ${PROJECT_EXTERN_DIR} )
1214 execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
1215- WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
1216- execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
1217- WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
1218+ WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} )
1219+ execute_process( COMMAND ${CMAKE_COMMAND} --build ${rocm_cmake_local_SOURCE_DIR} --target install
1220+ WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} )
1221
1222- find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
1223+ find_package( ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
1224 endif( )
1225 include( ROCMSetupVersion )
1226 include( ROCMCreatePackage )
1227@@ -90,8 +80,12 @@ include( ROCMCheckTargetIds )
1228 include( ROCMClients )
1229 include( ROCMHeaderWrapper )
1230
1231+if( ROCM_PATH )
1232+ list( APPEND CMAKE_BUILD_RPATH ${ROCM_PATH}/lib )
1233+endif()
1234+
1235 # Using standardized versioning from rocm-cmake
1236-set ( VERSION_STRING "1.0.32" )
1237+set ( VERSION_STRING "1.0.35" )
1238 rocm_setup_version( VERSION ${VERSION_STRING} )
1239
1240 # Append our library helper cmake path and the cmake path for hip (for
1241@@ -119,25 +113,10 @@ option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_of
1242 # Provide ability to disable hipRAND dependency
1243 option(USE_HIPRAND "Use hipRAND to provide device-side input generation" ON)
1244
1245-if( USE_HIPRAND )
1246- add_compile_definitions(USE_HIPRAND)
1247-endif( )
1248-
1249 # Split up function pool compilation across N files to parallelize its build
1250 set(ROCFFT_FUNCTION_POOL_N 8 CACHE STRING "Number of files to split function_pool into for compilation")
1251
1252-# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG
1253-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
1254-if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
1255- rocm_wrap_header_dir(
1256- ${CMAKE_SOURCE_DIR}/library/include
1257- PATTERNS "*.h"
1258- GUARDS SYMLINK WRAPPER
1259- WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR}
1260- )
1261-endif()
1262-
1263-set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wsign-compare )
1264+set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wsign-compare -Wno-deprecated-declarations )
1265 if( WERROR )
1266 set( WARNING_FLAGS ${WARNING_FLAGS} -Werror )
1267 endif( )
1268@@ -148,9 +127,8 @@ set(DEFAULT_GPUS
1269 gfx906
1270 gfx908
1271 gfx90a
1272- gfx940
1273- gfx941
1274 gfx942
1275+ gfx950
1276 gfx1030
1277 gfx1100
1278 gfx1101
1279@@ -166,8 +144,6 @@ if(BUILD_ADDRESS_SANITIZER)
1280 SET(DEFAULT_GPUS
1281 gfx908:xnack+
1282 gfx90a:xnack+
1283- gfx940:xnack+
1284- gfx941:xnack+
1285 gfx942:xnack+)
1286 add_link_options(-fuse-ld=lld)
1287 set(ROCFFT_KERNEL_CACHE_ENABLE off)
1288@@ -193,7 +169,8 @@ rocm_check_target_ids(AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}")
1289 set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for")
1290
1291 # HIP is required - library and clients use HIP to access the device
1292-find_package( HIP REQUIRED CONFIG )
1293+find_package( hip REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hip/ )
1294+find_package( hiprtc REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hiprtc/ )
1295
1296 # The nvidia backend can be used to compile for CUDA devices.
1297 # Specify the CUDA prefix in the CUDA_PREFIX variable.
1298@@ -287,7 +264,9 @@ endif( )
1299 if(WIN32)
1300 set(CPACK_SOURCE_GENERATOR "ZIP")
1301 set(CPACK_GENERATOR "ZIP")
1302- set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE)
1303+ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
1304+ set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE)
1305+ endif()
1306 set(INSTALL_PREFIX "C:/hipSDK")
1307 set(CPACK_SET_DESTDIR OFF)
1308 set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK")
1309@@ -321,3 +300,5 @@ rocm_create_package(
1310 LDCONFIG
1311 LDCONFIG_DIR ${ROCFFT_CONFIG_DIR}
1312 )
1313+
1314+option(BUILD_CODE_COVERAGE "Build with code coverage flags (clang only)" OFF)
1315diff --git a/LICENSE.md b/LICENSE.md
1316index 4c2aab7..d631884 100644
1317--- a/LICENSE.md
1318+++ b/LICENSE.md
1319@@ -1,12 +1,26 @@
1320-# License
1321+MIT License
1322
1323-Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved.
1324+Copyright (C) Advanced Micro Devices, Inc.
1325
1326-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1327+Permission is hereby granted, free of charge, to any person obtaining a copy
1328+of this software and associated documentation files (the "Software"), to deal
1329+in the Software without restriction, including without limitation the rights
1330+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1331+copies of the Software, and to permit persons to whom the Software is
1332+furnished to do so, subject to the following conditions:
1333
1334-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
1335+The above copyright notice and this permission notice shall be included in all
1336+copies or substantial portions of the Software.
1337
1338-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1339+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1340+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1341+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1342+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1343+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1344+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1345+SOFTWARE.
1346+
1347+---
1348
1349 This product includes software from copyright holders as shown below, and distributed under their license terms as specified.
1350
1351diff --git a/README.md b/README.md
1352index fe133cb..8f943be 100644
1353--- a/README.md
1354+++ b/README.md
1355@@ -1,19 +1,18 @@
1356 # rocFFT
1357
1358 rocFFT is a software library for computing fast Fourier transforms (FFTs) written in the HIP
1359-programming language. It's part of AMD's software ecosystem based on
1360-[ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD and
1361-NVIDIA GPUs.
1362+programming language. It's part of the AMD software ecosystem based on
1363+[ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD GPUs.
1364
1365 ## Documentation
1366
1367 > [!NOTE]
1368-> The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocFFT/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1369+> The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the projects/rocfft/docs folder of the rocm-libraries repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1370
1371 To build our documentation locally, use the following code:
1372
1373 ```Bash
1374-cd docs
1375+cd projects/rocfft/docs
1376
1377 pip3 install -r sphinx/requirements.txt
1378
1379@@ -36,7 +35,7 @@ You can install rocFFT using pre-built packages or building from source.
1380 * Building from source:
1381
1382 rocFFT is compiled with AMD's clang++ and uses CMake. You can specify several options to customize your
1383- build. The following commands build a shared library for supported AMD GPUs:
1384+ build. The following commands build a shared library for supported AMD GPUs. Run these commands from the `rocm-libraries/projects/rocfft` directory:
1385
1386 ```bash
1387 mkdir build && cd build
1388@@ -61,6 +60,7 @@ You can install rocFFT using pre-built packages or building from source.
1389 | `rocfft-bench` | `-DBUILD_CLIENTS_BENCH=on` | hipRAND |
1390 | `rocfft-test` | `-DBUILD_CLIENTS_TESTS=on` | hipRAND, FFTW, GoogleTest |
1391 | samples | `-DBUILD_CLIENTS_SAMPLES=on` | None |
1392+ | coverage | `-DBUILD_CODE_COVERAGE=ON` | clang, llvm-cov |
1393
1394 Clients are not built by default. To build them, use `-DBUILD_CLIENTS=on`. The build process
1395 downloads and builds GoogleTest and FFTW if they are not already installed.
1396@@ -80,7 +80,14 @@ You can install rocFFT using pre-built packages or building from source.
1397 sudo apt install libgtest-dev libfftw3-dev libboost-dev
1398 ```
1399
1400- We use version 1.11 of GoogleTest.
1401+ rocFFT uses version 1.11 of GoogleTest.
1402+
1403+ You can generate a test coverage report with the following:
1404+ ```bash
1405+ cmake -DCMAKE_CXX_COMPILER=amdclang++ -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CODE_COVERAGE=ON <optional: -DCOVERAGE_TEST_OPTIONS="cmdline args to pass to rocfft-test (default: --smoketest)"> ..
1406+ make -j coverage
1407+ ```
1408+ The above will output the coverage report to the terminal and also save an html coverage report to `$PWD/coverage-report`.
1409
1410 ## Examples
1411
1412@@ -91,9 +98,9 @@ You can find additional examples in the `clients/samples` subdirectory.
1413
1414 ## Support
1415
1416-You can report bugs and feature requests through the GitHub
1417-[issue tracker](https://github.com/ROCm/rocFFT/issues).
1418+You can report bugs and feature requests through the rocm-libraries GitHub
1419+[issue tracker](https://github.com/ROCm/rocm-libraries/issues).
1420
1421 ## Contribute
1422
1423-If you want to contribute to rocFFT, you must follow our [contribution guidelines](https://github.com/ROCm/rocFFT/blob/develop/.github/CONTRIBUTING.md).
1424+If you want to contribute to rocFFT, you must follow the [contribution guidelines](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/.github/CONTRIBUTING.md).
1425diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
1426index 1f42485..44f361d 100644
1427--- a/clients/CMakeLists.txt
1428+++ b/clients/CMakeLists.txt
1429@@ -53,8 +53,8 @@ set(CMAKE_CXX_STANDARD 17)
1430
1431 list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
1432
1433-if( NOT ROCM_FOUND )
1434- find_package( ROCM 0.7.3 REQUIRED )
1435+if( NOT ROCmCMakeBuildTools_FOUND )
1436+ find_package( ROCmCMakeBuildTools REQUIRED )
1437 endif()
1438
1439 include( ROCMInstallTargets )
1440diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
1441index 60980fc..f792c0f 100644
1442--- a/clients/bench/CMakeLists.txt
1443+++ b/clients/bench/CMakeLists.txt
1444@@ -53,11 +53,11 @@ if( NOT TARGET rocfft )
1445 endif( )
1446
1447 if( NOT HIP_FOUND )
1448- find_package( HIP REQUIRED )
1449+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
1450 endif()
1451
1452-if( NOT ROCM_FOUND )
1453- find_package( ROCM 0.7.3 REQUIRED )
1454+if( NOT ROCmCMakeBuildTools_FOUND )
1455+ find_package( ROCmCMakeBuildTools REQUIRED )
1456 endif()
1457
1458 if( USE_HIPRAND AND NOT hiprand_FOUND )
1459@@ -105,6 +105,7 @@ foreach( bench ${bench_list})
1460 PRIVATE
1461 hip::hiprand
1462 )
1463+ target_compile_definitions( ${bench} PRIVATE USE_HIPRAND )
1464 endif()
1465
1466 # We need to include both rocfft.h and rocfft-export.h
1467@@ -155,30 +156,6 @@ foreach( bench ${bench_list})
1468 ${BENCH_OUT_DIR} )
1469
1470 rocm_install(TARGETS ${bench} COMPONENT benchmarks)
1471-
1472- # install compatibility for old name of bench program - symlink on
1473- # unix, hardlink on windows (since privilege is required to create
1474- # symlinks there)
1475- string(REPLACE bench rider bench_legacy ${bench})
1476- if( WIN32 )
1477- set( BENCH_LINK_COMMAND create_hardlink )
1478- set( BENCH_NEW_NAME ${BENCH_OUT_DIR}/$<TARGET_FILE_BASE_NAME:${bench}>${CMAKE_EXECUTABLE_SUFFIX} )
1479- set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy}${CMAKE_EXECUTABLE_SUFFIX} )
1480- else()
1481- set( BENCH_LINK_COMMAND create_symlink )
1482- set( BENCH_NEW_NAME $<TARGET_FILE_BASE_NAME:${bench}> )
1483- set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy} )
1484- endif()
1485- add_custom_command(
1486- TARGET ${bench}
1487- POST_BUILD
1488- COMMAND ${CMAKE_COMMAND} -E ${BENCH_LINK_COMMAND} ${BENCH_NEW_NAME} ${BENCH_OLD_NAME}
1489- )
1490- install(
1491- FILES ${BENCH_OLD_NAME}
1492- DESTINATION ${CMAKE_INSTALL_BINDIR}
1493- COMPONENT benchmarks
1494- )
1495 endforeach()
1496
1497 # Link dyna-rocfft-bench to the experimental filesystem library if
1498diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
1499index 96f4b8c..1d30106 100644
1500--- a/clients/bench/bench.cpp
1501+++ b/clients/bench/bench.cpp
1502@@ -87,7 +87,11 @@ int main(int argc, char* argv[])
1503 "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
1504 "forward\n3) real inverse")
1505 ->default_val(fft_transform_type_complex_forward);
1506-
1507+ non_token
1508+ ->add_option("--auto_allocation",
1509+ params.auto_allocate,
1510+ "rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
1511+ ->default_val("default");
1512 non_token
1513 ->add_option(
1514 "--precision", params.precision, "Transform precision: single (default), double, half")
1515@@ -201,7 +205,10 @@ int main(int argc, char* argv[])
1516 params.mp_lib = fft_params::fft_mp_lib_none;
1517
1518 int localDeviceCount = 0;
1519- (void)hipGetDeviceCount(&localDeviceCount);
1520+ if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
1521+ {
1522+ throw std::runtime_error("hipGetDeviceCount failed");
1523+ }
1524
1525 // start with all-ones in grids
1526 std::vector<unsigned int> input_grid(params.length.size() + 1, 1);
1527@@ -333,119 +340,49 @@ int main(int argc, char* argv[])
1528 LIB_V_THROW(rocfft_status_failure, "Plan creation failed");
1529
1530 // GPU input buffer:
1531- auto ibuffer_sizes = params.ibuffer_sizes();
1532- std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
1533- std::vector<void*> pibuffer(ibuffer_sizes.size());
1534- for(unsigned int i = 0; i < ibuffer.size(); ++i)
1535- {
1536- try
1537- {
1538- HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed");
1539- }
1540- catch(rocfft_hip_runtime_error)
1541- {
1542- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1543- }
1544- pibuffer[i] = ibuffer[i].data();
1545- }
1546-
1547+ std::vector<gpubuf> ibuffer;
1548+ std::vector<void*> pibuffer;
1549 // CPU-side input buffer
1550 std::vector<hostbuf> ibuffer_cpu;
1551
1552 auto is_host_gen = (params.igen == fft_input_generator_host
1553 || params.igen == fft_input_random_generator_host);
1554
1555-#ifdef USE_HIPRAND
1556- if(!is_host_gen)
1557- {
1558- // Input data:
1559- params.compute_input(ibuffer);
1560+ auto ibricks = get_input_bricks(params);
1561+ auto obricks = get_output_bricks(params);
1562
1563- if(verbose > 1)
1564- {
1565- // Copy input to CPU
1566- try
1567- {
1568- ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
1569- }
1570- catch(rocfft_hip_runtime_error)
1571- {
1572- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1573- }
1574- for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
1575- {
1576- try
1577- {
1578- HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(),
1579- ibuffer[idx].data(),
1580- ibuffer_sizes[idx],
1581- hipMemcpyDeviceToHost),
1582- "hipMemcpy failed");
1583- }
1584- catch(rocfft_hip_runtime_error)
1585- {
1586- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1587- }
1588- }
1589+ std::vector<gpubuf> obuffer_data;
1590+ std::vector<gpubuf>* obuffer = nullptr;
1591+ alloc_bench_bricks(
1592+ params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen);
1593
1594- std::cout << "GPU input:\n";
1595- params.print_ibuffer(ibuffer_cpu);
1596- }
1597- }
1598-#endif
1599- if(is_host_gen)
1600+ pibuffer.resize(ibuffer.size());
1601+ for(unsigned int i = 0; i < ibuffer.size(); ++i)
1602 {
1603- // Input data:
1604- ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
1605- params.compute_input(ibuffer_cpu);
1606+ pibuffer[i] = ibuffer[i].data();
1607+ }
1608
1609- if(verbose > 1)
1610+ // print input if requested
1611+ if(verbose > 1)
1612+ {
1613+ if(is_host_gen)
1614 {
1615- std::cout << "GPU input:\n";
1616+ // data is already on host
1617 params.print_ibuffer(ibuffer_cpu);
1618 }
1619-
1620- for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
1621+ else
1622 {
1623- try
1624- {
1625- HIP_V_THROW(hipMemcpy(pibuffer[idx],
1626- ibuffer_cpu[idx].data(),
1627- ibuffer_cpu[idx].size(),
1628- hipMemcpyHostToDevice),
1629- "hipMemcpy failed");
1630- }
1631- catch(rocfft_hip_runtime_error)
1632- {
1633- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1634- }
1635+ print_device_buffer(params, ibuffer, true);
1636 }
1637 }
1638
1639- // GPU output buffer:
1640- std::vector<gpubuf> obuffer_data;
1641- std::vector<gpubuf>* obuffer = &obuffer_data;
1642- if(params.placement == fft_placement_inplace)
1643- {
1644- obuffer = &ibuffer;
1645- }
1646- else
1647- {
1648- auto obuffer_sizes = params.obuffer_sizes();
1649- obuffer_data.resize(obuffer_sizes.size());
1650- for(unsigned int i = 0; i < obuffer_data.size(); ++i)
1651- {
1652- HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed");
1653- }
1654- }
1655 std::vector<void*> pobuffer(obuffer->size());
1656 for(unsigned int i = 0; i < obuffer->size(); ++i)
1657 {
1658 pobuffer[i] = obuffer->at(i).data();
1659 }
1660
1661- // Scatter input out to other devices and adjust I/O buffers to match requested transform
1662- params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
1663+ init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
1664
1665 // Execute a warm-up call
1666 params.execute(pibuffer.data(), pobuffer.data());
1667@@ -459,34 +396,13 @@ int main(int argc, char* argv[])
1668 for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial)
1669 {
1670 // Create input at every iteration to avoid overflow
1671- if(params.ifields.empty())
1672+ if(is_host_gen)
1673 {
1674-#ifdef USE_HIPRAND
1675- // Compute input on default device
1676- if(!is_host_gen)
1677- params.compute_input(ibuffer);
1678-#endif
1679- if(is_host_gen)
1680- {
1681- for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
1682- {
1683- try
1684- {
1685- HIP_V_THROW(hipMemcpy(pibuffer[idx],
1686- ibuffer_cpu[idx].data(),
1687- ibuffer_cpu[idx].size(),
1688- hipMemcpyHostToDevice),
1689- "hipMemcpy failed");
1690- }
1691- catch(rocfft_hip_runtime_error)
1692- {
1693- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1694- }
1695- }
1696- }
1697-
1698- // Scatter input out to other devices if this is a multi-GPU test
1699- params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
1700+ copy_host_input_to_dev(ibuffer_cpu, ibuffer);
1701+ }
1702+ else
1703+ {
1704+ init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
1705 }
1706
1707 HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");
1708@@ -503,27 +419,7 @@ int main(int argc, char* argv[])
1709 // Print result after FFT transform
1710 if(verbose > 2)
1711 {
1712- // Gather data to default GPU if this is a multi-GPU test
1713- params.multi_gpu_finalize(*obuffer, pobuffer);
1714-
1715- auto output = allocate_host_buffer(params.precision, params.otype, params.osize);
1716- for(unsigned int idx = 0; idx < output.size(); ++idx)
1717- {
1718- try
1719- {
1720- HIP_V_THROW(hipMemcpy(output[idx].data(),
1721- pobuffer.at(idx),
1722- output[idx].size(),
1723- hipMemcpyDeviceToHost),
1724- "hipMemcpy failed");
1725- }
1726- catch(rocfft_hip_runtime_error)
1727- {
1728- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1729- }
1730- }
1731- std::cout << "GPU output:\n";
1732- params.print_obuffer(output);
1733+ print_device_buffer(params, *obuffer, false);
1734 }
1735 }
1736
1737diff --git a/clients/bench/bench.h b/clients/bench/bench.h
1738index 51804ac..de06f93 100644
1739--- a/clients/bench/bench.h
1740+++ b/clients/bench/bench.h
1741@@ -21,6 +21,9 @@
1742 #ifndef ROCFFT_BENCH_H
1743 #define ROCFFT_BENCH_H
1744
1745+#include "../../shared/fft_params.h"
1746+#include "../../shared/rocfft_hip.h"
1747+
1748 #include "rocfft/rocfft.h"
1749 #include <hip/hip_runtime_api.h>
1750 #include <vector>
1751@@ -92,4 +95,198 @@ inline void lib_V_Throw(rocfft_status res,
1752 #define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__)
1753 #define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__)
1754
1755+// return input bricks for params, or one big brick covering the
1756+// input field if no bricks are specified
1757+template <typename Tparams>
1758+std::vector<fft_params::fft_brick> get_input_bricks(const Tparams& params)
1759+{
1760+ std::vector<fft_params::fft_brick> bricks;
1761+ if(!params.ifields.empty())
1762+ bricks = params.ifields[0].bricks;
1763+ else
1764+ {
1765+ auto len = params.ilength();
1766+
1767+ // just make one big brick covering the whole input field
1768+ bricks.resize(1);
1769+ bricks.front().lower.resize(len.size() + 1);
1770+ bricks.front().upper.resize(len.size() + 1);
1771+ bricks.front().stride.resize(len.size() + 1);
1772+
1773+ bricks.front().upper.front() = params.nbatch;
1774+ std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1);
1775+
1776+ bricks.front().stride.front() = params.idist;
1777+ std::copy(params.istride.begin(), params.istride.end(), bricks.front().stride.begin() + 1);
1778+ }
1779+ return bricks;
1780+}
1781+
1782+// return output bricks for params, or one big brick covering the
1783+// output field if no bricks are specified
1784+template <typename Tparams>
1785+std::vector<fft_params::fft_brick> get_output_bricks(const Tparams& params)
1786+{
1787+ std::vector<fft_params::fft_brick> bricks;
1788+ if(!params.ofields.empty())
1789+ bricks = params.ofields[0].bricks;
1790+ else
1791+ {
1792+ auto len = params.olength();
1793+
1794+ // just make one big brick covering the whole output field
1795+ bricks.resize(1);
1796+ bricks.front().lower.resize(len.size() + 1);
1797+ bricks.front().upper.resize(len.size() + 1);
1798+ bricks.front().stride.resize(len.size() + 1);
1799+
1800+ bricks.front().upper.front() = params.nbatch;
1801+ std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1);
1802+
1803+ bricks.front().stride.front() = params.odist;
1804+ std::copy(params.ostride.begin(), params.ostride.end(), bricks.front().stride.begin() + 1);
1805+ }
1806+ return bricks;
1807+}
1808+
1809+// Allocate input/output buffers for a bench run.
1810+template <typename Tparams>
1811+void alloc_bench_bricks(const Tparams& params,
1812+ const std::vector<fft_params::fft_brick>& ibricks,
1813+ const std::vector<fft_params::fft_brick>& obricks,
1814+ std::vector<gpubuf>& ibuffers,
1815+ std::vector<gpubuf>& obuffer_data,
1816+ std::vector<gpubuf>*& obuffers,
1817+ std::vector<hostbuf>& host_buffers,
1818+ bool is_host_gen)
1819+{
1820+ auto alloc_buffers = [&params, &host_buffers](const std::vector<fft_params::fft_brick>& bricks,
1821+ fft_array_type type,
1822+ std::vector<gpubuf>& output,
1823+ bool is_host_gen) {
1824+ auto elem_size = var_size<size_t>(params.precision, type);
1825+ const bool is_planar
1826+ = type == fft_array_type_complex_planar || type == fft_array_type_hermitian_planar;
1827+ // alloc 2x buffers, each half size for planar
1828+ if(is_planar)
1829+ elem_size /= 2;
1830+
1831+ for(const auto& b : bricks)
1832+ {
1833+ rocfft_scoped_device dev(b.device);
1834+
1835+ size_t brick_size_bytes = compute_ptrdiff(b.length(), b.stride, 0, 0) * elem_size;
1836+ output.emplace_back();
1837+ if(output.back().alloc(brick_size_bytes) != hipSuccess)
1838+ throw std::runtime_error("hipMalloc failed");
1839+ if(is_planar)
1840+ {
1841+ output.emplace_back();
1842+ if(output.back().alloc(brick_size_bytes) != hipSuccess)
1843+ throw std::runtime_error("hipMalloc failed");
1844+ }
1845+ if(is_host_gen)
1846+ {
1847+ host_buffers.emplace_back();
1848+ host_buffers.back().alloc(brick_size_bytes);
1849+ if(is_planar)
1850+ {
1851+ host_buffers.emplace_back();
1852+ host_buffers.back().alloc(brick_size_bytes);
1853+ }
1854+ }
1855+ }
1856+ };
1857+
1858+ // If brick shape differs, inplace is only allowed for single
1859+ // bricks. e.g. in-place real-complex
1860+ if(params.placement == fft_placement_inplace)
1861+ {
1862+ if(ibricks.size() != 1 && obricks.size() != 1 && ibricks != obricks)
1863+ throw std::runtime_error(
1864+ "in-place transform to different brick shapes only allowed for single bricks");
1865+
1866+ // allocate the larger of the two bricks
1867+ auto isize_bytes = compute_ptrdiff(ibricks.front().length(), ibricks.front().stride, 0, 0)
1868+ * var_size<size_t>(params.precision, params.itype);
1869+ auto osize_bytes = compute_ptrdiff(obricks.front().length(), obricks.front().stride, 0, 0)
1870+ * var_size<size_t>(params.precision, params.otype);
1871+
1872+ alloc_buffers(isize_bytes > osize_bytes ? ibricks : obricks,
1873+ isize_bytes > osize_bytes ? params.itype : params.otype,
1874+ ibuffers,
1875+ is_host_gen);
1876+ obuffers = &ibuffers;
1877+ }
1878+ else
1879+ {
1880+ alloc_buffers(ibricks, params.itype, ibuffers, is_host_gen);
1881+ alloc_buffers(obricks, params.otype, obuffer_data, false);
1882+ obuffers = &obuffer_data;
1883+ }
1884+}
1885+
1886+void copy_host_input_to_dev(std::vector<hostbuf>& host_buffers, std::vector<gpubuf>& buffers)
1887+{
1888+ for(size_t i = 0; i < buffers.size(); ++i)
1889+ {
1890+ if(hipMemcpy(buffers[i].data(),
1891+ host_buffers[i].data(),
1892+ host_buffers[i].size(),
1893+ hipMemcpyHostToDevice)
1894+ != hipSuccess)
1895+ throw std::runtime_error("hipMemcpy failure");
1896+ }
1897+}
1898+
1899+template <typename Tparams>
1900+void init_bench_input(const Tparams& params,
1901+ const std::vector<fft_params::fft_brick>& bricks,
1902+ std::vector<gpubuf>& buffers,
1903+ std::vector<hostbuf>& host_buffers,
1904+ bool is_host_gen)
1905+{
1906+ auto elem_size = var_size<size_t>(params.precision, params.itype);
1907+ if(is_host_gen)
1908+ {
1909+ std::vector<void*> ptrs;
1910+ ptrs.reserve(host_buffers.size());
1911+ for(auto& buf : host_buffers)
1912+ ptrs.push_back(buf.data());
1913+
1914+ init_local_input<Tparams, hostbuf>(0, params, bricks, elem_size, ptrs);
1915+ copy_host_input_to_dev(host_buffers, buffers);
1916+ }
1917+ else
1918+ {
1919+#ifdef USE_HIPRAND
1920+ std::vector<void*> ptrs;
1921+ ptrs.reserve(buffers.size());
1922+ for(auto& buf : buffers)
1923+ ptrs.push_back(buf.data());
1924+
1925+ init_local_input<Tparams, gpubuf>(0, params, bricks, elem_size, ptrs);
1926+#endif
1927+ }
1928+}
1929+
1930+template <typename Tparams>
1931+void print_device_buffer(const Tparams& params, std::vector<gpubuf>& buffer, bool input)
1932+{
1933+ // copy data back to host
1934+ std::vector<hostbuf> print_buffer;
1935+ for(auto& buf : buffer)
1936+ {
1937+ print_buffer.emplace_back();
1938+ print_buffer.back().alloc(buf.size());
1939+ if(hipMemcpy(print_buffer.back().data(), buf.data(), buf.size(), hipMemcpyDeviceToHost)
1940+ != hipSuccess)
1941+ throw std::runtime_error("hipMemcpy failed");
1942+ }
1943+ if(input)
1944+ params.print_ibuffer(print_buffer);
1945+ else
1946+ params.print_obuffer(print_buffer);
1947+}
1948+
1949 #endif // ROCFFT_BENCH_H
1950diff --git a/clients/bench/dyna-bench.cpp b/clients/bench/dyna-bench.cpp
1951index 7e50435..b121887 100644
1952--- a/clients/bench/dyna-bench.cpp
1953+++ b/clients/bench/dyna-bench.cpp
1954@@ -458,7 +458,10 @@ int main(int argc, char* argv[])
1955 params.mp_lib = fft_params::fft_mp_lib_none;
1956
1957 int localDeviceCount = 0;
1958- (void)hipGetDeviceCount(&localDeviceCount);
1959+ if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
1960+ {
1961+ throw std::runtime_error("hipGetDeviceCount failed");
1962+ }
1963
1964 // start with all-ones in grids
1965 std::vector<unsigned int> input_grid(params.length.size() + 1, 1);
1966@@ -574,112 +577,42 @@ int main(int argc, char* argv[])
1967 }
1968
1969 // GPU input buffer:
1970- auto ibuffer_sizes = params.ibuffer_sizes();
1971- std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
1972- std::vector<void*> pibuffer(ibuffer_sizes.size());
1973- for(unsigned int i = 0; i < ibuffer.size(); ++i)
1974- {
1975- try
1976- {
1977- HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed");
1978- }
1979- catch(rocfft_hip_runtime_error)
1980- {
1981- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
1982- }
1983- pibuffer[i] = ibuffer[i].data();
1984- }
1985-
1986+ std::vector<gpubuf> ibuffer;
1987+ std::vector<void*> pibuffer;
1988 // CPU-side input buffer
1989 std::vector<hostbuf> ibuffer_cpu;
1990
1991 auto is_host_gen = (params.igen == fft_input_generator_host
1992 || params.igen == fft_input_random_generator_host);
1993
1994-#ifdef USE_HIPRAND
1995- if(!is_host_gen)
1996- {
1997- // Input data:
1998- params.compute_input(ibuffer);
1999+ auto ibricks = get_input_bricks(params);
2000+ auto obricks = get_output_bricks(params);
2001
2002- if(verbose > 1)
2003- {
2004- // Copy input to CPU
2005- ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
2006- for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
2007- {
2008- try
2009- {
2010- HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(),
2011- ibuffer[idx].data(),
2012- ibuffer_sizes[idx],
2013- hipMemcpyDeviceToHost),
2014- "hipMemcpy failed");
2015- }
2016- catch(rocfft_hip_runtime_error)
2017- {
2018- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
2019- }
2020- }
2021+ std::vector<gpubuf> obuffer_data;
2022+ std::vector<gpubuf>* obuffer = nullptr;
2023+ alloc_bench_bricks(
2024+ params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen);
2025+ init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
2026
2027- std::cout << "GPU input:\n";
2028- params.print_ibuffer(ibuffer_cpu);
2029- }
2030- }
2031-#endif
2032- if(is_host_gen)
2033+ for(unsigned int i = 0; i < ibuffer.size(); ++i)
2034 {
2035- // Input data:
2036- ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
2037- params.compute_input(ibuffer_cpu);
2038+ pibuffer.push_back(ibuffer[i].data());
2039+ }
2040
2041- if(verbose > 1)
2042+ // print input if requested
2043+ if(verbose > 1)
2044+ {
2045+ if(is_host_gen)
2046 {
2047- std::cout << "GPU input:\n";
2048+ // data is already on host
2049 params.print_ibuffer(ibuffer_cpu);
2050 }
2051-
2052- for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
2053+ else
2054 {
2055- try
2056- {
2057- HIP_V_THROW(hipMemcpy(pibuffer[idx],
2058- ibuffer_cpu[idx].data(),
2059- ibuffer_cpu[idx].size(),
2060- hipMemcpyHostToDevice),
2061- "hipMemcpy failed");
2062- }
2063- catch(rocfft_hip_runtime_error)
2064- {
2065- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
2066- }
2067+ print_device_buffer(params, ibuffer, true);
2068 }
2069 }
2070
2071- // GPU output buffer:
2072- std::vector<gpubuf> obuffer_data;
2073- std::vector<gpubuf>* obuffer = &obuffer_data;
2074- if(params.placement == fft_placement_inplace)
2075- {
2076- obuffer = &ibuffer;
2077- }
2078- else
2079- {
2080- auto obuffer_sizes = params.obuffer_sizes();
2081- obuffer_data.resize(obuffer_sizes.size());
2082- for(unsigned int i = 0; i < obuffer_data.size(); ++i)
2083- {
2084- try
2085- {
2086- HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]),
2087- "Creating output Buffer failed");
2088- }
2089- catch(rocfft_hip_runtime_error)
2090- {
2091- return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
2092- }
2093- }
2094- }
2095 std::vector<void*> pobuffer(obuffer->size());
2096 for(unsigned int i = 0; i < obuffer->size(); ++i)
2097 {
2098diff --git a/clients/samples/fixed-16/CMakeLists.txt b/clients/samples/fixed-16/CMakeLists.txt
2099index feec728..d74f03b 100644
2100--- a/clients/samples/fixed-16/CMakeLists.txt
2101+++ b/clients/samples/fixed-16/CMakeLists.txt
2102@@ -49,7 +49,7 @@ if( NOT TARGET rocfft )
2103 endif( )
2104
2105 if( NOT HIP_FOUND )
2106- find_package( HIP REQUIRED )
2107+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
2108 endif()
2109
2110 set( sample_list fixed-16-float fixed-16-double fixed-16-half )
2111diff --git a/clients/samples/fixed-large/CMakeLists.txt b/clients/samples/fixed-large/CMakeLists.txt
2112index bf1c2e7..ddea865 100644
2113--- a/clients/samples/fixed-large/CMakeLists.txt
2114+++ b/clients/samples/fixed-large/CMakeLists.txt
2115@@ -49,7 +49,7 @@ if( NOT TARGET rocfft )
2116 endif( )
2117
2118 if( NOT HIP_FOUND )
2119- find_package( HIP REQUIRED )
2120+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
2121 endif()
2122
2123 set( sample_list fixed-large-float fixed-large-double )
2124diff --git a/clients/samples/mpi/CMakeLists.txt b/clients/samples/mpi/CMakeLists.txt
2125index 836e652..e4122bc 100644
2126--- a/clients/samples/mpi/CMakeLists.txt
2127+++ b/clients/samples/mpi/CMakeLists.txt
2128@@ -49,7 +49,7 @@ if( NOT TARGET rocfft )
2129 endif( )
2130
2131 if( NOT HIP_FOUND )
2132- find_package( HIP REQUIRED )
2133+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
2134 endif()
2135
2136 if( NOT MPI_FOUND )
2137@@ -97,6 +97,7 @@ foreach( sample ${sample_list} )
2138 PRIVATE
2139 hip::hiprand
2140 )
2141+ target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
2142 endif()
2143
2144 target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
2145diff --git a/clients/samples/mpi/rocfft_mpi_example.cpp b/clients/samples/mpi/rocfft_mpi_example.cpp
2146index 14f06dc..18fbd12 100644
2147--- a/clients/samples/mpi/rocfft_mpi_example.cpp
2148+++ b/clients/samples/mpi/rocfft_mpi_example.cpp
2149@@ -126,6 +126,11 @@ int main(int argc, char** argv)
2150 if(fftrc != rocfft_status_success)
2151 throw std::runtime_error("failed to create description");
2152
2153+ // This example is unbatched, so the batch stride is not used
2154+ // for anything. For batched examples, this would be
2155+ // distance in elements between consecutive batches.
2156+ const size_t batch_stride = 0;
2157+
2158 if(mpi_rank == 0)
2159 {
2160 std::cout << "input data decomposition:\n";
2161@@ -135,14 +140,14 @@ int main(int argc, char** argv)
2162 rocfft_field infield = nullptr;
2163 rocfft_field_create(&infield);
2164
2165- std::vector<size_t> inbrick_stride = {1, length[1]};
2166+ std::vector<size_t> inbrick_stride = {1, length[1], batch_stride};
2167 const size_t inbrick_length1 = length[1] / (size_t)mpi_size
2168 + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0);
2169 const size_t inbrick_lower1
2170 = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size);
2171 const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1;
2172- std::vector<size_t> inbrick_lower = {0, inbrick_lower1};
2173- std::vector<size_t> inbrick_upper = {length[0], inbrick_upper1};
2174+ std::vector<size_t> inbrick_lower = {0, inbrick_lower1, 0};
2175+ std::vector<size_t> inbrick_upper = {length[0], inbrick_upper1, 1};
2176
2177 rocfft_brick inbrick = nullptr;
2178 rocfft_brick_create(&inbrick,
2179@@ -219,15 +224,15 @@ int main(int argc, char** argv)
2180 std::vector<void*> gpu_out = {nullptr};
2181 std::vector<size_t> outbrick_lower;
2182 std::vector<size_t> outbrick_upper;
2183- std::vector<size_t> outbrick_stride = {1, length[1]};
2184+ std::vector<size_t> outbrick_stride = {1, length[1], batch_stride};
2185 {
2186 const size_t outbrick_length1 = length[1] / (size_t)mpi_size
2187 + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0);
2188 const size_t outbrick_lower1
2189 = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size);
2190 const size_t outbrick_upper1 = outbrick_lower1 + outbrick_length1;
2191- outbrick_lower = {0, outbrick_lower1};
2192- outbrick_upper = {length[0], outbrick_upper1};
2193+ outbrick_lower = {0, outbrick_lower1, 0};
2194+ outbrick_upper = {length[0], outbrick_upper1, 1};
2195
2196 const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex<double>);
2197 for(int irank = 0; irank < mpi_size; ++irank)
2198@@ -254,8 +259,8 @@ int main(int argc, char** argv)
2199 rocfft_field_create(&outfield);
2200
2201 rocfft_brick outbrick = nullptr;
2202- outbrick_lower = {0, outbrick_lower1};
2203- outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1};
2204+ outbrick_lower = {0, outbrick_lower1, 0};
2205+ outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1, 1};
2206 rocfft_brick_create(&outbrick,
2207 outbrick_lower.data(),
2208 outbrick_upper.data(),
2209diff --git a/clients/samples/multi_gpu/CMakeLists.txt b/clients/samples/multi_gpu/CMakeLists.txt
2210index 41bc5e0..f4b1b27 100644
2211--- a/clients/samples/multi_gpu/CMakeLists.txt
2212+++ b/clients/samples/multi_gpu/CMakeLists.txt
2213@@ -49,7 +49,7 @@ if( NOT TARGET rocfft )
2214 endif( )
2215
2216 if( NOT HIP_FOUND )
2217- find_package( HIP REQUIRED )
2218+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
2219 endif()
2220
2221 if( USE_HIPRAND AND NOT hiprand_FOUND )
2222@@ -79,6 +79,7 @@ foreach( sample ${sample_list} )
2223 PRIVATE
2224 hip::hiprand
2225 )
2226+ target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
2227 endif()
2228
2229 target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
2230diff --git a/clients/samples/rocfft/CMakeLists.txt b/clients/samples/rocfft/CMakeLists.txt
2231index d883915..01d47d6 100644
2232--- a/clients/samples/rocfft/CMakeLists.txt
2233+++ b/clients/samples/rocfft/CMakeLists.txt
2234@@ -49,7 +49,7 @@ if( NOT TARGET rocfft )
2235 endif( )
2236
2237 if( NOT HIP_FOUND )
2238- find_package( HIP REQUIRED )
2239+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
2240 endif()
2241
2242 if( USE_HIPRAND AND NOT hiprand_FOUND )
2243@@ -80,6 +80,7 @@ foreach( sample ${sample_list} )
2244 PRIVATE
2245 hip::hiprand
2246 )
2247+ target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
2248 endif()
2249
2250 target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
2251@@ -116,3 +117,10 @@ foreach( sample ${sample_list} )
2252 target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
2253
2254 endforeach( )
2255+
2256+# callback functions need to be built as relocatable device code
2257+# (causes failure at link stage on Windows)
2258+if (NOT WIN32)
2259+ target_compile_options( rocfft_example_callback PRIVATE -fgpu-rdc )
2260+ target_link_options( rocfft_example_callback PRIVATE -fgpu-rdc )
2261+endif()
2262diff --git a/clients/samples/rocfft/rocfft_example_callback.cpp b/clients/samples/rocfft/rocfft_example_callback.cpp
2263index a6c2dd3..ccc65e5 100644
2264--- a/clients/samples/rocfft/rocfft_example_callback.cpp
2265+++ b/clients/samples/rocfft/rocfft_example_callback.cpp
2266@@ -20,11 +20,12 @@
2267 * THE SOFTWARE.
2268 *******************************************************************************/
2269
2270+#include <iostream>
2271+#ifndef WIN32
2272 #include "rocfft/rocfft.h"
2273 #include <hip/hip_complex.h>
2274 #include <hip/hip_runtime.h>
2275 #include <hip/hip_vector_types.h>
2276-#include <iostream>
2277 #include <math.h>
2278 #include <stdexcept>
2279 #include <vector>
2280@@ -46,9 +47,14 @@ __device__ double2 load_callback(double2* input, size_t offset, void* cbdata, vo
2281 make_hipDoubleComplex(data->scale, data->scale));
2282 }
2283 __device__ auto load_callback_dev = load_callback;
2284+#endif
2285
2286 int main()
2287 {
2288+#ifdef WIN32
2289+ std::cout << "This sample is temporarily disabled on Windows" << std::endl;
2290+ return EXIT_SUCCESS;
2291+#else
2292
2293 const size_t N = 8;
2294
2295@@ -189,4 +195,5 @@ int main()
2296 throw std::runtime_error("rocfft_cleanup failed.");
2297
2298 return 0;
2299+#endif
2300 }
2301diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
2302index 83ed823..81037c0 100644
2303--- a/clients/tests/CMakeLists.txt
2304+++ b/clients/tests/CMakeLists.txt
2305@@ -53,11 +53,11 @@ if( NOT TARGET rocfft )
2306 endif( )
2307
2308 if( NOT HIP_FOUND )
2309- find_package( HIP REQUIRED )
2310+ find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
2311 endif()
2312
2313-if( NOT ROCM_FOUND )
2314- find_package( ROCM 0.7.3 REQUIRED )
2315+if( NOT ROCmCMakeBuildTools_FOUND )
2316+ find_package( ROCmCMakeBuildTools REQUIRED )
2317 endif()
2318
2319 if( USE_HIPRAND AND NOT hiprand_FOUND )
2320@@ -94,6 +94,19 @@ set( rocfft-test_source
2321 add_executable( rocfft-test ${rocfft-test_source} ${rocfft-test_includes} )
2322 add_executable( rtc_helper_crash rtc_helper_crash.cpp )
2323
2324+# rocFFT device code builds with -O3 by default. rocfft-test
2325+# contains device code for callback functions, so ensure the device
2326+# code is built with the same optimization level to minimize the chance
2327+# of a mismatch
2328+target_compile_options( rocfft-test PRIVATE -Xarch_device -O3 )
2329+
2330+# callback functions need to be built as relocatable device code
2331+# (causes failure at link stage on Windows)
2332+if (NOT WIN32)
2333+ target_compile_options( rocfft-test PRIVATE -fgpu-rdc )
2334+ target_link_options( rocfft-test PRIVATE -fgpu-rdc )
2335+endif()
2336+
2337 find_package( Boost REQUIRED )
2338 set( Boost_DEBUG ON )
2339 set( Boost_DETAILED_FAILURE_MSG ON )
2340@@ -183,15 +196,38 @@ set( rocfft-test_link_libs
2341 ${FFTW_LIBRARIES}
2342 )
2343
2344-include( ../cmake/build-gtest.cmake )
2345+option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON )
2346+if( BUILD_CLIENTS_TESTS_OPENMP )
2347+ # Attempt to find a config version, which provides openmp_LIB_DIR (the config-mode lookup below is currently commented out).
2348+ #find_package( OpenMP CONFIG PATHS "${HIP_CLANG_ROOT}/lib/cmake" )
2349+ if( NOT OPENMP_FOUND OR NOT DEFINED openmp_LIB_DIR )
2350+ # Fall back to module mode; rpaths then use the plain lib directory.
2351+ find_package( OpenMP REQUIRED )
2352+ set( BUILD_RPATH "${HIP_CLANG_ROOT}/lib" )
2353+ set( INSTALL_RPATH "$ORIGIN/../llvm/lib" )
2354+ else()
2355+ set( BUILD_RPATH "${HIP_CLANG_ROOT}/${openmp_LIB_DIR}" )
2356+ set( INSTALL_RPATH "$ORIGIN/../llvm/${openmp_LIB_DIR}" )
2357+ endif()
2358+ list( APPEND rocfft-test_link_libs OpenMP::OpenMP_CXX )
2359+
2360+ set_target_properties( rocfft-test PROPERTIES
2361+ BUILD_RPATH "${BUILD_RPATH}"
2362+ )
2363+ set_target_properties( rocfft-test PROPERTIES
2364+ INSTALL_RPATH "${INSTALL_RPATH}"
2365+ )
2366+endif()
2367+
2368+find_package( GTest QUIET )
2369
2370-if( BUILD_GTEST OR NOT GTEST_FOUND )
2371+if( GTest_FOUND )
2372+ target_link_libraries( rocfft-test PRIVATE GTest::gtest )
2373+else()
2374+ include( ../cmake/build-gtest.cmake )
2375 add_dependencies( rocfft-test gtest )
2376 list( APPEND rocfft-test_include_dirs ${GTEST_INCLUDE_DIRS} )
2377 list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
2378-else()
2379- list( APPEND rocfft-test_include_dirs $<BUILD_INTERFACE:${GTEST_INCLUDE_DIRS}> )
2380- list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
2381 endif()
2382
2383 target_compile_options( rocfft-test PRIVATE ${WARNING_FLAGS} -Wno-cpp )
2384@@ -223,6 +259,7 @@ if ( USE_HIPRAND )
2385 PRIVATE
2386 hip::hiprand
2387 )
2388+ target_compile_definitions( rocfft-test PRIVATE USE_HIPRAND )
2389 endif()
2390
2391 if( ROCFFT_MPI_ENABLE )
2392@@ -256,7 +293,9 @@ if( USE_CUDA )
2393 )
2394 target_compile_definitions( rocfft-test PRIVATE __HIP_PLATFORM_NVCC__ )
2395 endif( )
2396-target_link_libraries( rocfft-test PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
2397+target_link_libraries( rocfft-test PRIVATE
2398+ ${ROCFFT_CLIENTS_HOST_LINK_LIBS}
2399+ ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
2400
2401 include( ../../cmake/sqlite.cmake )
2402 target_link_libraries( rocfft-test PUBLIC ${ROCFFT_SQLITE_LIB} )
2403@@ -264,21 +303,6 @@ target_include_directories( rocfft-test PRIVATE ${sqlite_local_SOURCE_DIR} )
2404
2405 set_property( TARGET rocfft-test APPEND PROPERTY LINK_LIBRARIES ${ROCFFT_SQLITE_LIB} )
2406
2407-option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON )
2408-
2409-if( BUILD_CLIENTS_TESTS_OPENMP )
2410- if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
2411- target_compile_options( rocfft-test PRIVATE -fopenmp )
2412- target_link_libraries( rocfft-test PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib )
2413- target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include )
2414- else()
2415- if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
2416- target_compile_options( rocfft-test PRIVATE -fopenmp=libomp )
2417- target_link_options( rocfft-test PRIVATE -fopenmp=libomp )
2418- endif()
2419- endif()
2420-endif()
2421-
2422 if(FFTW_MULTITHREAD)
2423 target_compile_options( rocfft-test PRIVATE -DFFTW_MULTITHREAD )
2424 endif( )
2425@@ -330,7 +354,18 @@ if( ROCFFT_MPI_ENABLE )
2426 # normal and dynamic-loading MPI worker processes
2427 foreach(worker rocfft_mpi_worker dyna_rocfft_mpi_worker)
2428 add_executable( ${worker} rocfft_mpi_worker.cpp )
2429- target_compile_options( ${worker} PRIVATE -fopenmp )
2430+
2431+ if( BUILD_CLIENTS_TESTS_OPENMP )
2432+ set_target_properties( ${worker} PROPERTIES
2433+ BUILD_RPATH "${BUILD_RPATH}"
2434+ )
2435+ set_target_properties( ${worker} PROPERTIES
2436+ INSTALL_RPATH "${INSTALL_RPATH}"
2437+ )
2438+ endif()
2439+ if( BUILD_FFTW OR NOT FFTW_FOUND )
2440+ add_dependencies( ${worker} fftw_double fftw_single )
2441+ endif()
2442 target_include_directories( ${worker}
2443 PRIVATE
2444 ${CMAKE_BINARY_DIR}/include
2445@@ -341,8 +376,8 @@ if( ROCFFT_MPI_ENABLE )
2446 target_compile_options( ${worker} PRIVATE ${WARNING_FLAGS} )
2447
2448 if ( ROCFFT_CRAY_MPI_ENABLE )
2449- target_link_libraries( ${worker}
2450- -fopenmp
2451+ target_link_libraries( ${worker}
2452+ OpenMP::OpenMP_CXX
2453 hip::hiprand
2454 hip::device
2455 MPI::MPI_CXX
2456@@ -355,7 +390,7 @@ if( ROCFFT_MPI_ENABLE )
2457 ${MPI_LIBDIR}/../../../../gtl/lib )
2458 else()
2459 target_link_libraries( ${worker}
2460- -fopenmp
2461+ OpenMP::OpenMP_CXX
2462 hip::hiprand
2463 hip::device
2464 MPI::MPI_CXX
2465@@ -382,3 +417,49 @@ if( ROCFFT_MPI_ENABLE )
2466 )
2467
2468 endif()
2469+
2470+set(COVERAGE_TEST_OPTIONS "--smoketest;--gtest_filter=-*call*" CACHE STRING "Command line arguments for rocfft-test when generating a code coverage report")
2471+
2472+if(BUILD_CODE_COVERAGE)
2473+ # Coverage won't work in a standalone build of the tests, as we can't
2474+ # guarantee the library was built with coverage enabled
2475+ if( NOT TARGET rocfft )
2476+ message( FATAL_ERROR "BUILD_CODE_COVERAGE requires building from the root of rocFFT" )
2477+ endif()
2478+
2479+ add_custom_target(
2480+ code_cov_tests
2481+ DEPENDS rocfft-test rocfft_rtc_helper
2482+ COMMAND ${CMAKE_COMMAND} -E rm -rf ./coverage-report
2483+ COMMAND ${CMAKE_COMMAND} -E make_directory ./coverage-report/profraw
2484+ COMMAND ${CMAKE_COMMAND} -E env LLVM_PROFILE_FILE="./coverage-report/profraw/rocfft-coverage_%p.profraw" GTEST_LISTENER=NO_PASS_LINE_IN_LOG $<TARGET_FILE:rocfft-test> --precompile=rocfft-test-precompile.db ${COVERAGE_TEST_OPTIONS}
2485+ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
2486+ )
2487+
2488+ find_program(
2489+ LLVM_PROFDATA
2490+ llvm-profdata
2491+ REQUIRED
2492+ HINTS ${ROCM_PATH}/llvm/bin
2493+ PATHS /opt/rocm/llvm/bin
2494+ )
2495+
2496+ find_program(
2497+ LLVM_COV
2498+ llvm-cov
2499+ REQUIRED
2500+ HINTS ${ROCM_PATH}/llvm/bin
2501+ PATHS /opt/rocm/llvm/bin
2502+ )
2503+
2504+ add_custom_target(
2505+ coverage
2506+ DEPENDS code_cov_tests
2507+ COMMAND ${LLVM_PROFDATA} merge -sparse ./coverage-report/profraw/rocfft-coverage_*.profraw -o ./coverage-report/rocfft.profdata
2508+ COMMAND ${LLVM_COV} report -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata
2509+ COMMAND ${LLVM_COV} show -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata -format=html -output-dir=coverage-report
2510+ COMMAND ${LLVM_COV} export -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata -format=lcov > ./coverage-report/coverage.info
2511+ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
2512+ )
2513+
2514+endif()
2515diff --git a/clients/tests/accuracy_test.cpp b/clients/tests/accuracy_test.cpp
2516index a3758a6..addaca1 100644
2517--- a/clients/tests/accuracy_test.cpp
2518+++ b/clients/tests/accuracy_test.cpp
2519@@ -230,7 +230,7 @@ __host__ __device__ static void store_callback_round_trip_inverse(
2520 Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
2521 {
2522 auto testdata = static_cast<callback_test_data*>(cbdata);
2523- // add scalar to each element
2524+ // divide each element by scalar
2525 if(output == testdata->base)
2526 {
2527 output[offset] = element / testdata->scalar;
2528diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
2529index 37c80ca..d4e3e43 100644
2530--- a/clients/tests/accuracy_test_3D.cpp
2531+++ b/clients/tests/accuracy_test_3D.cpp
2532@@ -255,17 +255,22 @@ INSTANTIATE_TEST_SUITE_P(partial_pass_3D,
2533 false)),
2534 accuracy_test::TestName);
2535
2536-INSTANTIATE_TEST_SUITE_P(partial_pass_3D_callback,
2537- accuracy_test,
2538- ::testing::ValuesIn(param_generator(test_prob,
2539- partial_pass_adhoc_3D,
2540- precision_range_sp_dp,
2541- partial_pass_batch_range_3D,
2542- stride_range,
2543- stride_range,
2544- ioffset_range_zero,
2545- ooffset_range_zero,
2546- place_range,
2547- false,
2548- true)),
2549- accuracy_test::TestName);
2550+INSTANTIATE_TEST_SUITE_P(
2551+#ifdef WIN32
2552+ DISABLED_partial_pass_3D_callback,
2553+#else
2554+ partial_pass_3D_callback,
2555+#endif
2556+ accuracy_test,
2557+ ::testing::ValuesIn(param_generator(test_prob,
2558+ partial_pass_adhoc_3D,
2559+ precision_range_sp_dp,
2560+ partial_pass_batch_range_3D,
2561+ stride_range,
2562+ stride_range,
2563+ ioffset_range_zero,
2564+ ooffset_range_zero,
2565+ place_range,
2566+ false,
2567+ true)),
2568+ accuracy_test::TestName);
2569diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
2570index 631a974..57c3244 100644
2571--- a/clients/tests/accuracy_test_callback.cpp
2572+++ b/clients/tests/accuracy_test_callback.cpp
2573@@ -104,22 +104,27 @@ const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}};
2574 auto forward_transform_types
2575 = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
2576
2577-INSTANTIATE_TEST_SUITE_P(callback,
2578- accuracy_test,
2579- ::testing::ValuesIn(param_generator_base(test_prob,
2580- forward_transform_types,
2581- callback_sizes,
2582- precision_range_sp_dp,
2583- batch_range,
2584- generate_types,
2585- stride_range,
2586- stride_range,
2587- ioffset_range_zero,
2588- ooffset_range_zero,
2589- place_range,
2590- false,
2591- true)),
2592- accuracy_test::TestName);
2593+INSTANTIATE_TEST_SUITE_P(
2594+#ifdef WIN32
2595+ DISABLED_callback_no_offset,
2596+#else
2597+ callback,
2598+#endif
2599+ accuracy_test,
2600+ ::testing::ValuesIn(param_generator_base(test_prob,
2601+ forward_transform_types,
2602+ callback_sizes,
2603+ precision_range_sp_dp,
2604+ batch_range,
2605+ generate_types,
2606+ stride_range,
2607+ stride_range,
2608+ ioffset_range_zero,
2609+ ooffset_range_zero,
2610+ place_range,
2611+ false,
2612+ true)),
2613+ accuracy_test::TestName);
2614
2615 INSTANTIATE_TEST_SUITE_P(DISABLED_callback,
2616 accuracy_test,
2617diff --git a/clients/tests/accuracy_test_checkstride.cpp b/clients/tests/accuracy_test_checkstride.cpp
2618index 0f6b7c6..898497c 100644
2619--- a/clients/tests/accuracy_test_checkstride.cpp
2620+++ b/clients/tests/accuracy_test_checkstride.cpp
2621@@ -70,7 +70,11 @@ inline auto param_checkstride()
2622 for(const auto& types :
2623 generate_types(trans_type, {fft_placement_notinplace}, true))
2624 {
2625+#ifdef WIN32
2626+ for(bool callback : {false})
2627+#else
2628 for(bool callback : {true, false})
2629+#endif
2630 {
2631 // callbacks don't work for planar
2632 bool is_planar = std::get<2>(types) == fft_array_type_complex_planar
2633diff --git a/clients/tests/accuracy_tests_range.h b/clients/tests/accuracy_tests_range.h
2634index 103a97a..6b71b27 100644
2635--- a/clients/tests/accuracy_tests_range.h
2636+++ b/clients/tests/accuracy_tests_range.h
2637@@ -167,7 +167,15 @@ const static std::vector<size_t> inner_batch_3D_batch_range = {3, 2, 1};
2638 // partial pass test problems
2639 //-----------------------------------------------------------------------
2640 //-----------------------------------------------------------------------
2641-const static std::vector<std::vector<size_t>> partial_pass_adhoc_3D = {{64, 64, 64}};
2642-const static std::vector<size_t> partial_pass_batch_range_3D = {1, 5, 10, 20, 50};
2643+const static std::vector<std::vector<size_t>> partial_pass_adhoc_3D = {
2644+ {64, 64, 128},
2645+ {64, 64, 64},
2646+ {64, 64, 52},
2647+ {60, 60, 60},
2648+ {32, 32, 128},
2649+ {32, 32, 64},
2650+ {64, 32, 128},
2651+};
2652+const static std::vector<size_t> partial_pass_batch_range_3D = {1, 5, 10, 20, 50};
2653
2654 #endif // ACCURACY_TESTS_RANGE_H
2655\ No newline at end of file
2656diff --git a/clients/tests/buffer_hash_test.cpp b/clients/tests/buffer_hash_test.cpp
2657index 8d4831d..47095e2 100644
2658--- a/clients/tests/buffer_hash_test.cpp
2659+++ b/clients/tests/buffer_hash_test.cpp
2660@@ -19,6 +19,7 @@
2661 // THE SOFTWARE.
2662
2663 #include "../../shared/fft_hash.h"
2664+#include "../../shared/params_gen.h"
2665 #include "../../shared/rocfft_params.h"
2666 #include <algorithm>
2667 #include <chrono>
2668@@ -29,7 +30,7 @@
2669
2670 static void set_params(const fft_precision precision, fft_params& param)
2671 {
2672- std::vector<size_t> blengths = {16777216};
2673+ std::vector<size_t> blengths = {131072};
2674
2675 std::vector<size_t> unit_strides = {1};
2676
2677@@ -357,6 +358,12 @@ static void run_test(const rocfft_params& params)
2678
2679 TEST(rocfft_UnitTest, buffer_hashing_half)
2680 {
2681+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2682+ > unittest_prob)
2683+ {
2684+ GTEST_SKIP();
2685+ }
2686+
2687 rocfft_params params;
2688 set_params(fft_precision_half, params);
2689
2690@@ -372,6 +379,13 @@ TEST(rocfft_UnitTest, buffer_hashing_half)
2691
2692 TEST(rocfft_UnitTest, buffer_hashing_single)
2693 {
2694+
2695+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2696+ > unittest_prob)
2697+ {
2698+ GTEST_SKIP();
2699+ }
2700+
2701 rocfft_params params;
2702 set_params(fft_precision_single, params);
2703
2704@@ -387,6 +401,13 @@ TEST(rocfft_UnitTest, buffer_hashing_single)
2705
2706 TEST(rocfft_UnitTest, buffer_hashing_double)
2707 {
2708+
2709+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2710+ > unittest_prob)
2711+ {
2712+ GTEST_SKIP();
2713+ }
2714+
2715 rocfft_params params;
2716 set_params(fft_precision_double, params);
2717
2718diff --git a/clients/tests/callback_change_type.cpp b/clients/tests/callback_change_type.cpp
2719index 4a06a0f..24d4863 100644
2720--- a/clients/tests/callback_change_type.cpp
2721+++ b/clients/tests/callback_change_type.cpp
2722@@ -70,23 +70,28 @@ std::vector<std::vector<size_t>> callback_type_sizes = {{4}, {60}, {122}, {220},
2723 // the input can't be any smaller than what rocFFT thinks it is,
2724 // because the overwrite will fail.
2725 const static std::vector<std::vector<size_t>> stride_range = {{1}};
2726-INSTANTIATE_TEST_SUITE_P(callback,
2727- change_type,
2728- ::testing::ValuesIn(param_generator_base(
2729- test_prob,
2730- {fft_transform_type_complex_forward, fft_transform_type_real_forward},
2731- callback_type_sizes,
2732- {fft_precision_single},
2733- {1},
2734- generate_types,
2735- stride_range,
2736- stride_range,
2737- {{0, 0}},
2738- {{0, 0}},
2739- {fft_placement_notinplace},
2740- false,
2741- false)),
2742- accuracy_test::TestName);
2743+INSTANTIATE_TEST_SUITE_P(
2744+#ifdef WIN32
2745+ DISABLED_callback,
2746+#else
2747+ callback,
2748+#endif
2749+ change_type,
2750+ ::testing::ValuesIn(param_generator_base(test_prob,
2751+ {fft_transform_type_complex_forward,
2752+ fft_transform_type_real_forward},
2753+ callback_type_sizes,
2754+ {fft_precision_single},
2755+ {1},
2756+ generate_types,
2757+ stride_range,
2758+ stride_range,
2759+ {{0, 0}},
2760+ {{0, 0}},
2761+ {fft_placement_notinplace},
2762+ false,
2763+ false)),
2764+ accuracy_test::TestName);
2765
2766 // run an out-of-place transform that casts input from short to float
2767 TEST_P(change_type, short_to_float)
2768diff --git a/clients/tests/default_callbacks_test.cpp b/clients/tests/default_callbacks_test.cpp
2769index 7c782aa..7ade249 100644
2770--- a/clients/tests/default_callbacks_test.cpp
2771+++ b/clients/tests/default_callbacks_test.cpp
2772@@ -28,6 +28,7 @@
2773
2774 #include "../../shared/fftw_transform.h"
2775 #include "../../shared/hip_object_wrapper.h"
2776+#include "../../shared/params_gen.h"
2777 #include "../../shared/rocfft_params.h"
2778 #include "rocfft/rocfft.h"
2779
2780@@ -409,8 +410,15 @@ struct Test_Callback
2781 // complex data inputs and having only a load callback set.
2782 // -------------------------------------------------------------------
2783
2784+#ifndef WIN32
2785 TEST(rocfft_UnitTest, default_load_callback_complex_single)
2786 {
2787+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2788+ > unittest_prob)
2789+ {
2790+ GTEST_SKIP();
2791+ }
2792+
2793 TEST_CALLBACK_CHECK_ASAN;
2794 Test_Callback test(256,
2795 1,
2796@@ -422,6 +430,12 @@ TEST(rocfft_UnitTest, default_load_callback_complex_single)
2797
2798 TEST(rocfft_UnitTest, default_load_callback_complex_double)
2799 {
2800+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2801+ > unittest_prob)
2802+ {
2803+ GTEST_SKIP();
2804+ }
2805+
2806 TEST_CALLBACK_CHECK_ASAN;
2807 Test_Callback test(512,
2808 1,
2809@@ -433,6 +447,12 @@ TEST(rocfft_UnitTest, default_load_callback_complex_double)
2810
2811 TEST(rocfft_UnitTest, default_load_callback_real_single)
2812 {
2813+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2814+ > unittest_prob)
2815+ {
2816+ GTEST_SKIP();
2817+ }
2818+
2819 TEST_CALLBACK_CHECK_ASAN;
2820 Test_Callback test(1024,
2821 1,
2822@@ -444,6 +464,12 @@ TEST(rocfft_UnitTest, default_load_callback_real_single)
2823
2824 TEST(rocfft_UnitTest, default_load_callback_real_double)
2825 {
2826+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2827+ > unittest_prob)
2828+ {
2829+ GTEST_SKIP();
2830+ }
2831+
2832 TEST_CALLBACK_CHECK_ASAN;
2833 Test_Callback test(2048,
2834 1,
2835@@ -460,6 +486,12 @@ TEST(rocfft_UnitTest, default_load_callback_real_double)
2836
2837 TEST(rocfft_UnitTest, default_store_callback_complex_single)
2838 {
2839+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2840+ > unittest_prob)
2841+ {
2842+ GTEST_SKIP();
2843+ }
2844+
2845 TEST_CALLBACK_CHECK_ASAN;
2846 Test_Callback test(256,
2847 1,
2848@@ -471,6 +503,12 @@ TEST(rocfft_UnitTest, default_store_callback_complex_single)
2849
2850 TEST(rocfft_UnitTest, default_store_callback_complex_double)
2851 {
2852+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2853+ > unittest_prob)
2854+ {
2855+ GTEST_SKIP();
2856+ }
2857+
2858 TEST_CALLBACK_CHECK_ASAN;
2859 Test_Callback test(512,
2860 1,
2861@@ -482,6 +520,12 @@ TEST(rocfft_UnitTest, default_store_callback_complex_double)
2862
2863 TEST(rocfft_UnitTest, default_store_callback_real_single)
2864 {
2865+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2866+ > unittest_prob)
2867+ {
2868+ GTEST_SKIP();
2869+ }
2870+
2871 TEST_CALLBACK_CHECK_ASAN;
2872 Test_Callback test(1024,
2873 1,
2874@@ -493,6 +537,12 @@ TEST(rocfft_UnitTest, default_store_callback_real_single)
2875
2876 TEST(rocfft_UnitTest, default_store_callback_real_double)
2877 {
2878+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
2879+ > unittest_prob)
2880+ {
2881+ GTEST_SKIP();
2882+ }
2883+
2884 TEST_CALLBACK_CHECK_ASAN;
2885 Test_Callback test(2048,
2886 1,
2887@@ -501,3 +551,4 @@ TEST(rocfft_UnitTest, default_store_callback_real_double)
2888 DefaultCallbackType::STORE,
2889 8);
2890 }
2891+#endif
2892diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
2893index 3d3de51..aff9dc1 100644
2894--- a/clients/tests/gtest_main.cpp
2895+++ b/clients/tests/gtest_main.cpp
2896@@ -53,11 +53,14 @@
2897 int verbose;
2898
2899 // User-defined random seed
2900-size_t random_seed;
2901+size_t random_seed;
2902+std::random_device default_seed_dev;
2903 // Overall probability of running conventional tests
2904 double test_prob;
2905 // Probability of running tests from the emulation suite
2906 double emulation_prob;
2907+// Probability of running unit tests
2908+double unittest_prob;
2909 // Modifier for probability of running tests with complex interleaved data
2910 double complex_interleaved_prob_factor;
2911 // Modifier for probability of running tests with real data
2912@@ -81,6 +84,9 @@ size_t ramgb;
2913 // Device memory limitation for tests (GiB):
2914 size_t vramgb;
2915
2916+// Number of hip devices to use.
2917+int ngpus{};
2918+
2919 // Allow skipping tests if there is a runtime error
2920 bool skip_runtime_fails;
2921 // But count the number of failures
2922@@ -166,11 +172,8 @@ void init_gtest_flags()
2923 std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests));
2924 }
2925
2926-void precompile_test_kernels(const std::string& precompile_file)
2927+std::vector<std::string> tokens_to_run()
2928 {
2929- std::cout << "precompiling test kernels...\n";
2930- WorkQueue<std::string> tokenQueue;
2931-
2932 init_gtest_flags();
2933
2934 std::vector<std::string> tokens;
2935@@ -218,6 +221,15 @@ void precompile_test_kernels(const std::string& precompile_file)
2936 }
2937 }
2938 }
2939+ return tokens;
2940+}
2941+
2942+void precompile_test_kernels(const std::string& precompile_file)
2943+{
2944+ std::cout << "precompiling test kernels...\n";
2945+ WorkQueue<std::string> tokenQueue;
2946+
2947+ auto tokens = tokens_to_run();
2948
2949 std::random_device dev;
2950 std::mt19937 dist(dev());
2951@@ -306,24 +318,25 @@ int main(int argc, char* argv[])
2952 " HP - hermitian planar\n"
2953 "\n"
2954 "Usage"};
2955-
2956- // Override CLI11 help to print after later CLI11 options that are defined, and allow gtest's
2957- // help.
2958- // After removing the stage-1 options, individual options are set to null (even if set), but we
2959- // can still capture the behaviour by using a flag.
2960-
2961- for(auto opt : app.get_options())
2962- {
2963- app.remove_option(opt);
2964- }
2965+ // Override CLI11 help to print it along gtest's help
2966+ app.set_help_flag("");
2967+ const auto opt_help = app.add_flag("-h, --help", "Produces this help message");
2968 app.add_option("-v, --verbose", verbose, "Print out detailed information for the tests")
2969 ->default_val(0);
2970 app.add_option("--nrand", n_random_tests, "Number of extra randomized tests")->default_val(0);
2971+
2972+ app.add_option("--ngpus", ngpus, "Number of GPUs to use per rank")
2973+ ->default_val(-1)
2974+ ->check(CLI::NonNegativeNumber);
2975+ app.add_option("--gpus", n_random_tests, "Number of extra randomized tests")->default_val(0);
2976 app.add_option("--test_prob", test_prob, "Probability of running individual tests")
2977 ->default_val(1.0)
2978 ->check(CLI::Range(0.0, 1.0));
2979+ app.add_option("--unittest_prob", unittest_prob, "Probability of running individual unit tests")
2980+ ->default_val(1.0)
2981+ ->check(CLI::Range(0.0, 1.0));
2982 app.add_option(
2983- "--emulation_prob", test_prob, "Probability of running individual emulation tests")
2984+ "--emulation_prob", emulation_prob, "Probability of running individual emulation tests")
2985 ->default_val(1.0)
2986 ->check(CLI::Range(0.0, 1.0));
2987 app.add_option("--real_prob",
2988@@ -411,7 +424,7 @@ int main(int argc, char* argv[])
2989 if(mp_lib == fft_params::fft_mp_lib_none)
2990 {
2991 std::cout << "--mp_launch requires an mp library (see mp_lib in --help).\n";
2992- std::exit(-1);
2993+ std::exit(EXIT_FAILURE);
2994 }
2995 })
2996 ->needs("--mp_lib");
2997@@ -420,8 +433,9 @@ int main(int argc, char* argv[])
2998 ->each([&](const std::string&) {
2999 // The objective is to have an test that takes about 5 minutes, so just set the
3000 // probability per test to a small value to achieve this result.
3001- test_prob = 0.001;
3002- emulation_prob = 0.01;
3003+ test_prob = 0.0005;
3004+ emulation_prob = 0.005;
3005+ unittest_prob = 0.2;
3006 n_random_tests = 10;
3007 });
3008
3009@@ -429,53 +443,8 @@ int main(int argc, char* argv[])
3010 manual_params.run_callbacks = true;
3011 });
3012
3013- {
3014- // We explicitly scope opt_seed so that the object falls out of scope before the final
3015- // parsing of the command line arguments. Otherwise, the second parsing would mark the
3016- // option as not having been specified, which can get rather confusing.
3017-
3018- auto opt_seed = app.add_option(
3019- "--seed", random_seed, "Random seed; if unset, use an actual random seed");
3020-
3021- // Try parsing initial args that will be used to configure tests.
3022- // Allow extras to pass on gtest and rocFFT arguments without error.
3023- app.allow_extras();
3024- try
3025- {
3026- app.parse(argc, argv);
3027- }
3028- catch(const CLI::ParseError& e)
3029- {
3030- return app.exit(e);
3031- }
3032-
3033- if(!*opt_seed)
3034- {
3035- std::cout << "Generating random seed: ";
3036- std::random_device dev;
3037- random_seed = dev();
3038- std::cout << random_seed << "\n";
3039- }
3040- }
3041-
3042- app.set_help_flag("");
3043- auto opt_help = app.add_flag("-h, --help", "Produces this help message");
3044-
3045- std::vector<std::string> remaining_args = app.remaining();
3046- // Google test ignores the first element, so add something there so that it parses all of hte
3047- // arguments that we want it to parse.:
3048- remaining_args.insert(remaining_args.begin(), argv0);
3049- // NB: If we initialize gtest first, then it removes all of its own command-line
3050- // arguments and sets argc and argv correctly;
3051- std::vector<char*> carg;
3052- for(std::string& s : remaining_args)
3053- {
3054- carg.push_back(&s[0]);
3055- }
3056- carg.push_back(NULL);
3057- decltype(argc) cargc = carg.size() - 1;
3058- ::testing::InitGoogleTest(&cargc, carg.data());
3059-
3060+ app.add_option("--seed", random_seed, "Random seed; if unset, use an actual random seed")
3061+ ->default_val(default_seed_dev());
3062 // Filename for fftw and fftwf wisdom.
3063 std::string fftw_wisdom_filename;
3064
3065@@ -488,6 +457,9 @@ int main(int argc, char* argv[])
3066 // Full path to bitwise repro database file
3067 std::string repro_db_path;
3068
3069+ // Bool option to just print tokens and exit
3070+ bool printtokens{false};
3071+
3072 // Declare the supported options. Some option pointers are declared to track passed opts.
3073 app.add_flag("--version", "Print queryable version information from the rocfft library")
3074 ->each([](const std::string&) {
3075@@ -515,6 +487,11 @@ int main(int argc, char* argv[])
3076 "forward\n3) real inverse")
3077 ->default_val(fft_transform_type_complex_forward);
3078 non_token
3079+ ->add_option("--auto_allocation",
3080+ manual_params.auto_allocate,
3081+ "rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
3082+ ->default_val("default");
3083+ non_token
3084 ->add_option("--precision",
3085 manual_params.precision,
3086 "Transform precision: single (default), double, half")
3087@@ -576,6 +553,7 @@ int main(int argc, char* argv[])
3088 precompile_file,
3089 "Precompile kernels to a file for all test cases before running tests")
3090 ->default_val("");
3091+ app.add_flag("--printtokens", printtokens, "Print test tokens to scheduled to be run and exit");
3092 // Default value is set in fft_params.h based on if device-side PRNG was enabled.
3093 app.add_option("-g, --inputGen",
3094 manual_params.igen,
3095@@ -584,36 +562,57 @@ int main(int argc, char* argv[])
3096 "2) linearly-spaced sequence (device)\n"
3097 "3) linearly-spaced sequence (host)");
3098
3099- // Parse rest of args and catch any errors here
3100+ // Try parsing initial args that will be used to configure tests
3101+ // Allow extras to pass on gtest arguments without error
3102+ app.allow_extras();
3103 try
3104 {
3105- app.parse(cargc, carg.data());
3106+ app.parse(argc, argv);
3107 }
3108 catch(const CLI::ParseError& e)
3109 {
3110 return app.exit(e);
3111 }
3112
3113+ // extract remaining arguments for subsequent gtest initialization
3114+ std::vector<std::string> remaining_args = app.remaining();
3115+ std::string gtest_help_opt = "--help";
3116+ // NB: If we initialize gtest first, then it removes all of its own command-line
3117+ // arguments and sets argc and argv correctly;
3118+ std::vector<char*> gtest_argv;
3119+ gtest_argv.insert(gtest_argv.begin(), argv[0]);
3120+ for(std::string& s : remaining_args)
3121+ {
3122+ gtest_argv.push_back(&s[0]);
3123+ }
3124+ if(*opt_help)
3125+ {
3126+ // make sure gtest prints its help as well
3127+ gtest_argv.push_back(&gtest_help_opt[0]);
3128+ }
3129+ gtest_argv.push_back(NULL);
3130+ decltype(argc) gtest_argc = gtest_argv.size() - 1;
3131+ ::testing::InitGoogleTest(&gtest_argc, gtest_argv.data()); // gtest-relevant args are removed
3132+
3133 if(*opt_help)
3134 {
3135 std::cout << app.help() << "\n";
3136 return EXIT_SUCCESS;
3137 }
3138-
3139- // Ensure there are no leftover options used by neither gtest nor CLI11
3140- const auto leftover_args = app.remaining();
3141- if(!leftover_args.empty())
3142+ // no help was used, gtest_argc is expected to be 1 at this point. If not, some of the
3143+ // used options were not recognized at all
3144+ if(gtest_argc > 1)
3145 {
3146 std::cout << "Unrecognised option(s) found:\n ";
3147- for(auto i : leftover_args)
3148- std::cout << i << " ";
3149+ for(auto i = 1; i < gtest_argc; i++)
3150+ std::cout << gtest_argv[i] << " ";
3151 std::cout << "\nRun with --help for more information.\n";
3152 return EXIT_FAILURE;
3153 }
3154
3155 std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon
3156- << "\tdouble epsilon: " << double_epsilon << "\n";
3157- std::cout << "Random seed: " << random_seed << "\n";
3158+ << "\tdouble epsilon: " << double_epsilon << std::endl;
3159+ std::cout << "Random seed: " << random_seed << std::endl;
3160
3161 // If precompiling, tell rocFFT to use the specified cache file
3162 // to write kernels to
3163@@ -628,9 +627,11 @@ int main(int argc, char* argv[])
3164 }
3165
3166 rocfft_setup();
3167- char v[256];
3168- rocfft_get_version_string(v, 256);
3169- std::cout << "rocFFT version: " << v << "\n";
3170+ {
3171+ char v[256];
3172+ rocfft_get_version_string(v, 256);
3173+ std::cout << "rocFFT version: " << v << std::endl;
3174+ }
3175
3176 #ifdef FFTW_MULTITHREAD
3177 fftw_init_threads();
3178@@ -641,6 +642,7 @@ int main(int argc, char* argv[])
3179
3180 // Set host memory limit from command-line options
3181 host_memory::singleton().set_limit_gbytes(ramgb);
3182+ std::cout << "Host memory limit: " << ramgb << " GiB" << std::endl;
3183
3184 if(use_fftw_wisdom)
3185 {
3186@@ -693,7 +695,7 @@ int main(int argc, char* argv[])
3187
3188 if(!test_token.empty())
3189 {
3190- std::cout << "Reading fft params from token:\n" << test_token << "\n";
3191+ std::cout << "Reading fft params from token:\n" << test_token << std::endl;
3192
3193 try
3194 {
3195@@ -701,8 +703,8 @@ int main(int argc, char* argv[])
3196 }
3197 catch(...)
3198 {
3199- std::cout << "Unable to parse token.\n";
3200- return 1;
3201+ std::cout << "Unable to parse token." << std::endl;
3202+ return EXIT_FAILURE;
3203 }
3204 }
3205 else
3206@@ -729,6 +731,17 @@ int main(int argc, char* argv[])
3207 if(!precompile_file.empty())
3208 precompile_test_kernels(precompile_file);
3209
3210+ if(printtokens)
3211+ {
3212+ std::cout << "Tokens:" << std::endl;
3213+ const auto tokens = tokens_to_run();
3214+ for(const auto& token : tokens)
3215+ {
3216+ std::cout << token << std::endl;
3217+ }
3218+ return EXIT_SUCCESS;
3219+ }
3220+
3221 auto retval = RUN_ALL_TESTS();
3222
3223 if(use_fftw_wisdom)
3224@@ -749,7 +762,8 @@ int main(int argc, char* argv[])
3225 const auto test_minutes
3226 = std::chrono::duration_cast<std::chrono::minutes>(test_duration - test_hours);
3227 std::cout << "Test suite took " << test_hours.count() << " hours " << test_minutes.count()
3228- << " minutes\n\n";
3229+ << " minutes\n"
3230+ << std::endl;
3231
3232 std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n";
3233 std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << "\n";
3234@@ -758,8 +772,12 @@ int main(int argc, char* argv[])
3235 std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n";
3236 std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << "\n";
3237 std::cout << "Number of runtime issues: " << n_hip_failures << "\n";
3238+ std::cout << "Number of successful tests: "
3239+ << ::testing::UnitTest::GetInstance()->successful_test_count() << "\n";
3240+ std::cout << "Number of skipped tests: "
3241+ << ::testing::UnitTest::GetInstance()->skipped_test_count() << "\n";
3242
3243- std::cout << "\nRandom seed: " << random_seed << "\n";
3244+ std::cout << "\nRandom seed: " << random_seed << std::endl;
3245
3246 return retval;
3247 }
3248diff --git a/clients/tests/hermitian_test.cpp b/clients/tests/hermitian_test.cpp
3249index 8a17f6d..6c72b25 100644
3250--- a/clients/tests/hermitian_test.cpp
3251+++ b/clients/tests/hermitian_test.cpp
3252@@ -20,6 +20,7 @@
3253
3254 #include "../../shared/accuracy_test.h"
3255 #include "../../shared/gpubuf.h"
3256+#include "../../shared/params_gen.h"
3257 #include "../../shared/rocfft_params.h"
3258 #include "../samples/rocfft/examplekernels.h"
3259 #include "../samples/rocfft/exampleutils.h"
3260@@ -171,12 +172,24 @@ void run_1D_hermitian_test(size_t length)
3261 // test a case that's small enough that it only needs one kernel
3262 TEST(rocfft_UnitTest, 1D_hermitian_single_small)
3263 {
3264+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3265+ > unittest_prob)
3266+ {
3267+ GTEST_SKIP();
3268+ }
3269+
3270 run_1D_hermitian_test(8);
3271 }
3272
3273 // test a case that's big enough that it needs multiple kernels
3274 TEST(rocfft_UnitTest, 1D_hermitian_single_large)
3275 {
3276+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3277+ > unittest_prob)
3278+ {
3279+ GTEST_SKIP();
3280+ }
3281+
3282 run_1D_hermitian_test(8192);
3283 }
3284
3285@@ -198,6 +211,12 @@ std::string str(T begin, T end)
3286 // Test that the GPU Hermitian symmetrizer code produces the correct results.
3287 TEST(rocfft_UnitTest, gpu_symmetrizer)
3288 {
3289+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3290+ > unittest_prob)
3291+ {
3292+ GTEST_SKIP();
3293+ }
3294+
3295 std::vector<std::vector<size_t>> lengths = {{4, 4, 3},
3296 {5},
3297 {8},
3298diff --git a/clients/tests/hipGraph_test.cpp b/clients/tests/hipGraph_test.cpp
3299index 34f7ed2..da0caa7 100644
3300--- a/clients/tests/hipGraph_test.cpp
3301+++ b/clients/tests/hipGraph_test.cpp
3302@@ -22,6 +22,7 @@
3303 #include "../../shared/arithmetic.h"
3304 #include "../../shared/gpubuf.h"
3305 #include "../../shared/hip_object_wrapper.h"
3306+#include "../../shared/params_gen.h"
3307 #include "../../shared/rocfft_against_fftw.h"
3308 #include "../../shared/rocfft_params.h"
3309 #include "rocfft/rocfft.h"
3310@@ -271,6 +272,12 @@ static void compare_data(const std::vector<rocfft_complex<float>>& original_host
3311
3312 TEST(rocfft_UnitTest, hipGraph_execution)
3313 {
3314+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3315+ > unittest_prob)
3316+ {
3317+ GTEST_SKIP();
3318+ }
3319+
3320 hipGraph_t graph = nullptr;
3321 hipGraphExec_t graph_exec = nullptr;
3322
3323@@ -375,7 +382,7 @@ TEST(rocfft_UnitTest, hipGraph_execution)
3324 ASSERT_EQ(hipGraphLaunch(graph_exec, stream), hipSuccess);
3325
3326 ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess);
3327- ASSERT_EQ(hipStreamDestroy(stream), hipSuccess);
3328+ stream.free();
3329
3330 // check for correctness of the output data
3331 compare_data(host_mem_in, device_mem_out);
3332@@ -386,5 +393,5 @@ TEST(rocfft_UnitTest, hipGraph_execution)
3333 fill(host_mem_counter_modified.begin(), host_mem_counter_modified.end(), num_graph_launches);
3334 compare_data_exact_match<size_t>(other_stream, host_mem_counter_modified, device_mem_counter);
3335
3336- ASSERT_EQ(hipStreamDestroy(other_stream), hipSuccess);
3337+ other_stream.free();
3338 }
3339diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
3340index 9c4b476..3cc3958 100644
3341--- a/clients/tests/multi_device_test.cpp
3342+++ b/clients/tests/multi_device_test.cpp
3343@@ -49,20 +49,31 @@ enum SplitType
3344 PENCIL_3D,
3345 };
3346
3347-std::vector<fft_params> param_generator_multi_gpu(const SplitType type)
3348+std::vector<fft_params> param_generator_multi_gpu(const SplitType type, const int ngpus)
3349 {
3350 int localDeviceCount = 0;
3351- (void)hipGetDeviceCount(&localDeviceCount);
3352+ if(ngpus <= 0)
3353+ {
3354+ // Use the command-line option as a priority
3355+ if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
3356+ {
3357+ throw std::runtime_error("hipGetDeviceCount failed");
3358+ }
3359+
3360+ // Limit local device testing to 16 GPUs, as we have some
3361+ // bottlenecks with larger device counts that unreasonably slow
3362+ // down plan creation
3363+ localDeviceCount = std::min<int>(16, localDeviceCount);
3364+ }
3365+ else
3366+ {
3367+ localDeviceCount = ngpus;
3368+ }
3369
3370 // need multiple devices or multiprocessing to test anything
3371 if(localDeviceCount < 2 && mp_lib == fft_params::fft_mp_lib_none)
3372 return {};
3373
3374- // limit local device testing to 16 GPUs, as we have some
3375- // bottlenecks with larger device counts that unreasonably slow
3376- // down plan creation
3377- localDeviceCount = std::min<int>(16, localDeviceCount);
3378-
3379 auto params_complex = param_generator_complex(test_prob,
3380 multi_gpu_sizes,
3381 precision_range_sp_dp,
3382@@ -165,30 +176,30 @@ std::vector<fft_params> param_generator_multi_gpu(const SplitType type)
3383 // split both input and output on slowest FFT dim
3384 INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_dim,
3385 accuracy_test,
3386- ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT)),
3387+ ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT, ngpus)),
3388 accuracy_test::TestName);
3389
3390 // split slowest FFT dim only on input, or only on output
3391 INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_input_dim,
3392 accuracy_test,
3393- ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN)),
3394+ ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN, ngpus)),
3395 accuracy_test::TestName);
3396 INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_output_dim,
3397 accuracy_test,
3398- ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT)),
3399+ ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT, ngpus)),
3400 accuracy_test::TestName);
3401
3402 // split input on slowest FFT and output on fastest, to minimize data
3403 // movement (only makes sense for rank-2 and higher FFTs)
3404 INSTANTIATE_TEST_SUITE_P(multi_gpu_slowin_fastout,
3405 accuracy_test,
3406- ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT)),
3407+ ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT, ngpus)),
3408 accuracy_test::TestName);
3409
3410 // 3D pencil decompositions
3411 INSTANTIATE_TEST_SUITE_P(multi_gpu_3d_pencils,
3412 accuracy_test,
3413- ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D)),
3414+ ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D, ngpus)),
3415 accuracy_test::TestName);
3416
3417 TEST(multi_gpu_validate, catch_validation_errors)
3418@@ -204,7 +215,7 @@ TEST(multi_gpu_validate, catch_validation_errors)
3419 for(auto type : all_split_types)
3420 {
3421 // gather all of the multi-GPU test cases
3422- auto params = param_generator_multi_gpu(type);
3423+ auto params = param_generator_multi_gpu(type, ngpus);
3424
3425 for(size_t i = 0; i < params.size(); ++i)
3426 {
3427@@ -284,3 +295,105 @@ TEST(multi_gpu_validate, catch_validation_errors)
3428 }
3429 }
3430 }
3431+
3432+static const auto multi_gpu_tokens = {
3433+ // clang-format off
3434+
3435+ // input bricks are not contiguous
3436+ "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25920_162_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25920_162_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6480_81_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6480_81_1_rank_1_dev_1",
3437+ // output bricks are not contiguous
3438+ "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25600_160_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25600_160_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6560_82_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6560_82_1_rank_1_dev_1",
3439+ // neither input nor output bricks are contiguous
3440+ "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25920_162_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25920_162_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6560_82_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6560_82_1_rank_1_dev_1",
3441+ // 1D multi-process batched in-place transform using 1 device per rank
3442+ "complex_forward_len_256_double_ip_batch_4_ifield_brick_lower_0_0_upper_4_128_stride_128_1_dev_0_brick_lower_0_128_upper_4_256_stride_128_1_rank_1_dev_1_ofield_brick_lower_0_0_upper_4_128_stride_128_1_dev_0_brick_lower_0_128_upper_4_256_stride_128_1_rank_1_dev_1",
3443+ // 2D multi-process out-of-place transform using 2 MPI ranks each with 2 GPUs
3444+ "complex_forward_len_128_256_single_op_batch_1_ifield_brick_lower_0_0_0_upper_1_128_64_stride_8192_64_1_dev_0_brick_lower_0_0_64_upper_1_128_128_stride_8192_64_1_rank_1_dev_1_brick_lower_0_0_128_upper_1_128_192_stride_8192_64_1_rank_0_dev_2_brick_lower_0_0_192_upper_1_128_256_stride_8192_64_1_rank_1_dev_3_ofield_brick_lower_0_0_0_upper_1_128_64_stride_8192_64_1_dev_0_brick_lower_0_0_64_upper_1_128_128_stride_8192_64_1_rank_1_dev_1_brick_lower_0_0_128_upper_1_128_192_stride_8192_64_1_rank_0_dev_2_brick_lower_0_0_192_upper_1_128_256_stride_8192_64_1_rank_1_dev_3",
3445+ // 3D multi-process out-of-place transform using 2 MPI ranks each with 2 GPUs
3446+ "complex_forward_len_256_256_256_double_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_64_256_256_stride_4194304_65536_256_1_dev_0_brick_lower_0_64_0_0_upper_1_128_256_256_stride_4194304_65536_256_1_rank_0_dev_1_brick_lower_0_128_0_0_upper_1_192_256_256_stride_4194304_65536_256_1_rank_1_dev_2_brick_lower_0_192_0_0_upper_1_256_256_256_stride_4194304_65536_256_1_rank_1_dev_3_ofield_brick_lower_0_0_0_0_upper_1_256_256_64_stride_4194304_16384_64_1_dev_0_brick_lower_0_0_0_64_upper_1_256_256_128_stride_4194304_16384_64_1_rank_0_dev_1_brick_lower_0_0_0_128_upper_1_256_256_192_stride_4194304_16384_64_1_rank_1_dev_2_brick_lower_0_0_0_192_upper_1_256_256_256_stride_4194304_16384_64_1_rank_1_dev_3",
3447+ // 3D multi-process batched in-place transform using 2 MPI ranks each with 2 GPUs
3448+ "complex_forward_len_128_300_256_single_op_batch_4_ifield_brick_lower_0_0_0_0_upper_4_32_300_256_stride_2457600_76800_256_1_dev_0_brick_lower_0_32_0_0_upper_4_64_300_256_stride_2457600_76800_256_1_rank_1_dev_1_brick_lower_0_64_0_0_upper_4_96_300_256_stride_2457600_76800_256_1_rank_0_dev_2_brick_lower_0_96_0_0_upper_4_128_300_256_stride_2457600_76800_256_1_rank_1_dev_3_ofield_brick_lower_0_0_0_0_upper_4_128_300_64_stride_2457600_19200_64_1_dev_0_brick_lower_0_0_0_64_upper_4_128_300_128_stride_2457600_19200_64_1_rank_1_dev_1_brick_lower_0_0_0_128_upper_4_128_300_192_stride_2457600_19200_64_1_rank_0_dev_2_brick_lower_0_0_0_192_upper_4_128_300_256_stride_2457600_19200_64_1_rank_1_dev_3 ",
3449+
3450+ // clang-format on
3451+};
3452+
3453+std::vector<fft_params> param_generator_multi_gpu_adhoc()
3454+{
3455+ int localDeviceCount = 0;
3456+ if(ngpus <= 0)
3457+ {
3458+ // Use the command-line option as a priority
3459+ if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
3460+ {
3461+ throw std::runtime_error("hipGetDeviceCount failed");
3462+ }
3463+
3464+ // Limit local device testing to 16 GPUs, as we have some
3465+ // bottlenecks with larger device counts that unreasonably slow
3466+ // down plan creation
3467+ localDeviceCount = std::min<int>(16, localDeviceCount);
3468+ }
3469+ else
3470+ {
3471+ localDeviceCount = ngpus;
3472+ }
3473+
3474+ auto all_params = param_generator_token(test_prob, multi_gpu_tokens);
3475+
3476+ // check if fields use more bricks than we can support
3477+ auto too_many_bricks = [=](const std::vector<fft_params::fft_field>& fields, size_t maxBricks) {
3478+ for(const auto& f : fields)
3479+ {
3480+ if(f.bricks.size() > maxBricks)
3481+ return true;
3482+
3483+ // also remove a test case if it uses a numbered device
3484+ // that isn't available
3485+ if(std::any_of(f.bricks.begin(), f.bricks.end(), [=](const fft_params::fft_brick& b) {
3486+ return b.device >= localDeviceCount;
3487+ }))
3488+ return true;
3489+ }
3490+ return false;
3491+ };
3492+
3493+ // remove test cases where we don't have enough ranks/devices for
3494+ // the number of bricks
3495+ all_params.erase(std::remove_if(all_params.begin(),
3496+ all_params.end(),
3497+ [=](const fft_params& params) {
3498+ size_t maxBricks = mp_lib == fft_params::fft_mp_lib_mpi
3499+ ? mp_ranks
3500+ : localDeviceCount;
3501+ return too_many_bricks(params.ifields, maxBricks)
3502+ || too_many_bricks(params.ofields, maxBricks);
3503+ }),
3504+ all_params.end());
3505+
3506+ // set all bricks in a field to rank-0, to change an MPI test
3507+ // case to single-proc
3508+ auto set_rank_0 = [](std::vector<fft_params::fft_field>& fields) {
3509+ for(auto& f : fields)
3510+ {
3511+ for(auto& b : f.bricks)
3512+ b.rank = 0;
3513+ }
3514+ };
3515+
3516+ // modify the remaining test cases to use the current multi-GPU lib
3517+ for(auto& params : all_params)
3518+ {
3519+ params.mp_lib = mp_lib;
3520+ if(mp_lib == fft_params::fft_mp_lib_none)
3521+ {
3522+ set_rank_0(params.ifields);
3523+ set_rank_0(params.ofields);
3524+ }
3525+ }
3526+ return all_params;
3527+}
3528+
3529+INSTANTIATE_TEST_SUITE_P(multi_gpu_adhoc_token,
3530+ accuracy_test,
3531+ ::testing::ValuesIn(param_generator_multi_gpu_adhoc()),
3532+ accuracy_test::TestName);
3533diff --git a/clients/tests/multithread_test.cpp b/clients/tests/multithread_test.cpp
3534index 5ec9a5f..3b6cd5e 100644
3535--- a/clients/tests/multithread_test.cpp
3536+++ b/clients/tests/multithread_test.cpp
3537@@ -21,6 +21,7 @@
3538 #include "../../shared/accuracy_test.h"
3539 #include "../../shared/gpubuf.h"
3540 #include "../../shared/hip_object_wrapper.h"
3541+#include "../../shared/params_gen.h"
3542 #include "../../shared/rocfft_against_fftw.h"
3543 #include "../../shared/rocfft_params.h"
3544 #include "rocfft/rocfft.h"
3545@@ -322,30 +323,66 @@ static void multistream_transform(size_t N, size_t dim, size_t num_streams)
3546 // fitting into e.g. 8 GB of GPU memory
3547 TEST(DISABLED_rocfft_UnitTest, simple_multithread_1D)
3548 {
3549+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3550+ > unittest_prob)
3551+ {
3552+ GTEST_SKIP();
3553+ }
3554+
3555 multithread_transform(1048576, 1, 64);
3556 }
3557
3558 TEST(DISABLED_rocfft_UnitTest, simple_multithread_2D)
3559 {
3560+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3561+ > unittest_prob)
3562+ {
3563+ GTEST_SKIP();
3564+ }
3565+
3566 multithread_transform(1024, 2, 64);
3567 }
3568
3569 TEST(DISABLED_rocfft_UnitTest, simple_multithread_3D)
3570 {
3571+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3572+ > unittest_prob)
3573+ {
3574+ GTEST_SKIP();
3575+ }
3576+
3577 multithread_transform(128, 3, 40);
3578 }
3579
3580 TEST(rocfft_UnitTest, simple_multistream_1D)
3581 {
3582+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3583+ > unittest_prob)
3584+ {
3585+ GTEST_SKIP();
3586+ }
3587+
3588 multistream_transform(1048576, 1, 32);
3589 }
3590
3591 TEST(rocfft_UnitTest, simple_multistream_2D)
3592 {
3593+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3594+ > unittest_prob)
3595+ {
3596+ GTEST_SKIP();
3597+ }
3598+
3599 multistream_transform(1024, 2, 32);
3600 }
3601
3602 TEST(rocfft_UnitTest, simple_multistream_3D)
3603 {
3604+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3605+ > unittest_prob)
3606+ {
3607+ GTEST_SKIP();
3608+ }
3609+
3610 multistream_transform(128, 3, 32);
3611 }
3612diff --git a/clients/tests/unit_test.cpp b/clients/tests/unit_test.cpp
3613index 1b9b970..6ee83a2 100644
3614--- a/clients/tests/unit_test.cpp
3615+++ b/clients/tests/unit_test.cpp
3616@@ -23,10 +23,13 @@
3617 #include "../../shared/concurrency.h"
3618 #include "../../shared/environment.h"
3619 #include "../../shared/gpubuf.h"
3620+#include "../../shared/params_gen.h"
3621+#include "../../shared/precision_type.h"
3622 #include "../../shared/rocfft_complex.h"
3623 #include "hip/hip_runtime_api.h"
3624 #include <boost/scope_exit.hpp>
3625 #include <condition_variable>
3626+#include <cstdio>
3627 #include <cstdlib>
3628 #include <fstream>
3629 #include <gtest/gtest.h>
3630@@ -58,6 +61,12 @@ namespace fs = std::filesystem;
3631
3632 TEST(rocfft_UnitTest, plan_description)
3633 {
3634+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3635+ > unittest_prob)
3636+ {
3637+ GTEST_SKIP();
3638+ }
3639+
3640 rocfft_plan_description desc = nullptr;
3641 ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_create(&desc));
3642
3643@@ -106,6 +115,12 @@ TEST(rocfft_UnitTest, plan_description_reuse)
3644 // check that a plan description can be reused between different
3645 // plans, with different layout parameters for each.
3646
3647+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3648+ > unittest_prob)
3649+ {
3650+ GTEST_SKIP();
3651+ }
3652+
3653 // allocate plan description once
3654 rocfft_plan_description desc = nullptr;
3655 ASSERT_EQ(rocfft_plan_description_create(&desc), rocfft_status_success);
3656@@ -191,9 +206,92 @@ TEST(rocfft_UnitTest, plan_description_reuse)
3657 ASSERT_EQ(rocfft_plan_description_destroy(desc), rocfft_status_success);
3658 }
3659
3660+// run a transform with all log levels enabled
3661+TEST(rocfft_UnitTest, log_levels)
3662+{
3663+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3664+ > unittest_prob)
3665+ {
3666+ GTEST_SKIP();
3667+ }
3668+
3669+ // clean up environment and temporary file when we exit
3670+ BOOST_SCOPE_EXIT_ALL(=)
3671+ {
3672+ rocfft_cleanup();
3673+ // re-init logs with default logging
3674+ rocfft_setup();
3675+ };
3676+ rocfft_cleanup();
3677+
3678+ // enumerate all known log levels and direct all of the logs to nowhere
3679+ EnvironmentSetTemp layer("ROCFFT_LAYER", std::to_string(0xffffffff).c_str());
3680+#ifdef WIN32
3681+ static const char* log_output = "NUL";
3682+#else
3683+ static const char* log_output = "/dev/null";
3684+#endif
3685+ EnvironmentSetTemp log_trace_path("ROCFFT_LOG_TRACE_PATH", log_output);
3686+ EnvironmentSetTemp log_bench_path("ROCFFT_LOG_BENCH_PATH", log_output);
3687+ EnvironmentSetTemp log_profile_path("ROCFFT_LOG_PROFILE_PATH", log_output);
3688+ EnvironmentSetTemp log_plan_path("ROCFFT_LOG_PLAN_PATH", log_output);
3689+ EnvironmentSetTemp log_kernelio_path("ROCFFT_LOG_KERNELIO_PATH", log_output);
3690+ EnvironmentSetTemp log_rtc_path("ROCFFT_LOG_RTC_PATH", log_output);
3691+ EnvironmentSetTemp log_tuning_path("ROCFFT_LOG_TUNING_PATH", log_output);
3692+ EnvironmentSetTemp log_graph_path("ROCFFT_LOG_GRAPH_PATH", log_output);
3693+
3694+ rocfft_setup();
3695+
3696+ // Test single-kernel Bluestein and a multi-kernel plan
3697+ //
3698+ // TODO: add fused L1D Bluestein case like 8191, as that does weird
3699+ // things with buffers
3700+ for(const size_t length : {
3701+ 37,
3702+ 64,
3703+ 32768,
3704+ })
3705+ {
3706+ for(const auto type : {rocfft_transform_type_complex_forward,
3707+ rocfft_transform_type_real_forward,
3708+ rocfft_transform_type_real_inverse})
3709+ {
3710+ for(const auto precision :
3711+ {rocfft_precision_single, rocfft_precision_double, rocfft_precision_half})
3712+ {
3713+ rocfft_plan plan = nullptr;
3714+ ASSERT_EQ(
3715+ rocfft_plan_create(
3716+ &plan, rocfft_placement_inplace, type, precision, 1, &length, 1, nullptr),
3717+ rocfft_status_success);
3718+
3719+ // assume transform uses complex, will overallocate for real
3720+ // transforms but we only care about logging
3721+ gpubuf data_dev;
3722+ ASSERT_EQ(
3723+ data_dev.alloc(element_size(precision, rocfft_array_type_complex_interleaved)
3724+ * length),
3725+ hipSuccess);
3726+
3727+ void* data_dev_ptr = data_dev.data();
3728+ ASSERT_EQ(rocfft_execute(plan, &data_dev_ptr, nullptr, nullptr),
3729+ rocfft_status_success);
3730+
3731+ rocfft_plan_destroy(plan);
3732+ }
3733+ }
3734+ }
3735+}
3736+
3737 // Check whether logs can be emitted from multiple threads properly
3738 TEST(rocfft_UnitTest, log_multithreading)
3739 {
3740+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3741+ > unittest_prob)
3742+ {
3743+ GTEST_SKIP();
3744+ }
3745+
3746 static const int NUM_THREADS = 10;
3747 static const int NUM_ITERS_PER_THREAD = 50;
3748 static const char* TRACE_FILE = "trace.log";
3749@@ -321,18 +419,36 @@ void workmem_test(workmem_sizer sizer,
3750 // - library should allocate
3751 TEST(rocfft_UnitTest, workmem_missing)
3752 {
3753+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3754+ > unittest_prob)
3755+ {
3756+ GTEST_SKIP();
3757+ }
3758+
3759 workmem_test([](size_t) { return 0; }, rocfft_status_success);
3760 }
3761
3762 // check what happens if work memory is required but not enough is provided
3763 TEST(rocfft_UnitTest, workmem_small)
3764 {
3765+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3766+ > unittest_prob)
3767+ {
3768+ GTEST_SKIP();
3769+ }
3770+
3771 workmem_test([](size_t requested) { return requested / 2; }, rocfft_status_invalid_work_buffer);
3772 }
3773
3774 // hard to imagine this being a problem, but try giving too much as well
3775 TEST(rocfft_UnitTest, workmem_big)
3776 {
3777+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3778+ > unittest_prob)
3779+ {
3780+ GTEST_SKIP();
3781+ }
3782+
3783 workmem_test([](size_t requested) { return requested * 2; }, rocfft_status_success);
3784 }
3785
3786@@ -341,13 +457,25 @@ TEST(rocfft_UnitTest, workmem_big)
3787 // allocates
3788 TEST(rocfft_UnitTest, workmem_null)
3789 {
3790+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3791+ > unittest_prob)
3792+ {
3793+ GTEST_SKIP();
3794+ }
3795+
3796 workmem_test([](size_t requested) { return requested; }, rocfft_status_success, true);
3797 }
3798
3799 static const size_t RTC_PROBLEM_SIZE = 2304;
3800-// runtime compilation cache tests
3801-TEST(rocfft_UnitTest, rtc_cache)
3802+// runtime compilation cache tests main loop
3803+void rtc_cache_main()
3804 {
3805+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3806+ > unittest_prob)
3807+ {
3808+ GTEST_SKIP();
3809+ }
3810+
3811 // PRECONDITIONS
3812
3813 // - set cache location to custom path, requires uninitializing
3814@@ -494,9 +622,27 @@ TEST(rocfft_UnitTest, rtc_cache)
3815 ASSERT_TRUE(fft_kernel_was_compiled());
3816 }
3817
3818+// run the main body of rtc cache tests twice to uncover potential
3819+// problems with thread reuse between iterations
3820+TEST(rocfft_UnitTest, rtc_cache_iter_1)
3821+{
3822+ rtc_cache_main();
3823+}
3824+
3825+TEST(rocfft_UnitTest, rtc_cache_iter_2)
3826+{
3827+ rtc_cache_main();
3828+}
3829+
3830 // make sure cache API functions tolerate null pointers without crashing
3831 TEST(rocfft_UnitTest, rtc_cache_null)
3832 {
3833+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3834+ > unittest_prob)
3835+ {
3836+ GTEST_SKIP();
3837+ }
3838+
3839 void* buf = nullptr;
3840 size_t buf_len = 0;
3841 ASSERT_EQ(rocfft_cache_serialize(nullptr, &buf_len), rocfft_status_invalid_arg_value);
3842@@ -509,6 +655,12 @@ TEST(rocfft_UnitTest, rtc_cache_null)
3843 // make sure RTC gracefully handles a helper process that crashes
3844 TEST(rocfft_UnitTest, rtc_helper_crash)
3845 {
3846+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3847+ > unittest_prob)
3848+ {
3849+ GTEST_SKIP();
3850+ }
3851+
3852 #ifdef WIN32
3853 char filename[MAX_PATH];
3854 GetModuleFileNameA(NULL, filename, MAX_PATH);
3855@@ -574,6 +726,12 @@ TEST(rocfft_UnitTest, rtc_helper_crash)
3856
3857 TEST(rocfft_UnitTest, rtc_test_harness)
3858 {
3859+ if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
3860+ > unittest_prob)
3861+ {
3862+ GTEST_SKIP();
3863+ }
3864+
3865 // check that hipcc is available since this test requires it
3866 //
3867 // NOTE: using system() for launching subprocesses for simplicity
3868@@ -597,8 +755,17 @@ TEST(rocfft_UnitTest, rtc_test_harness)
3869
3870 // extra scope to control lifetime of env vars
3871 {
3872- // rtc test harness writes to system's temp directory
3873- auto tmp_path = fs::temp_directory_path();
3874+ // create a temporary directory to hold all of the temp files
3875+ // that get created
3876+ const fs::path tmp_path = std::tmpnam(nullptr);
3877+ try
3878+ {
3879+ fs::create_directory(tmp_path);
3880+ }
3881+ catch(fs::filesystem_error& e)
3882+ {
3883+ GTEST_SKIP() << "unable to create temp dir for test harnesses: " << e.what();
3884+ }
3885
3886 // activate writing of rtc test harnesses
3887 EnvironmentSetTemp env_harness("ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS", "1");
3888@@ -613,17 +780,6 @@ TEST(rocfft_UnitTest, rtc_test_harness)
3889
3890 rocfft_setup();
3891
3892- // ensure stale files from previous runs of this test won't cause
3893- // problems - clean up any rocfft_kernel_harness_*.cpp files that
3894- // might be left behind
3895- for(const auto& entry : std::filesystem::directory_iterator{tmp_path})
3896- {
3897- auto filename = entry.path().filename();
3898- if(filename.string().compare(0, 22, "rocfft_kernel_harness_") == 0
3899- && filename.extension().string() == ".cpp")
3900- fs::remove(entry);
3901- }
3902-
3903 // construct a few different types of plans to try to get all
3904 // different kernels compiled
3905
3906@@ -704,6 +860,19 @@ TEST(rocfft_UnitTest, rtc_test_harness)
3907
3908 // check that all compiles succeeded
3909 for(const auto& file : files)
3910+ {
3911 ASSERT_EQ(file.second, 0);
3912+ }
3913+
3914+ // clean up temporary files
3915+ try
3916+ {
3917+ fs::remove_all(tmp_path);
3918+ }
3919+ catch(fs::filesystem_error&)
3920+ {
3921+ // this should work, but ignore errors as the build
3922+ // status is what matters for this test
3923+ }
3924 }
3925 }
3926diff --git a/cmake/sqlite.cmake b/cmake/sqlite.cmake
3927index 098be8f..2dc2a28 100644
3928--- a/cmake/sqlite.cmake
3929+++ b/cmake/sqlite.cmake
3930@@ -1,4 +1,4 @@
3931-# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
3932+# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
3933 #
3934 # Permission is hereby granted, free of charge, to any person obtaining a copy
3935 # of this software and associated documentation files (the "Software"), to deal
3936@@ -20,35 +20,36 @@
3937
3938 include( ExternalProject )
3939
3940-# SQLite 3.36.0 enabled the backup API by default, which we need
3941-# for cache serialization. We also want to use a static SQLite,
3942-# and distro static libraries aren't typically built
3943-# position-independent.
3944+# SQLite is used for rtc_cache. Require a safe baseline (>= 3.50.2).
3945+# Note: the backup API we rely on has been enabled by default since 3.36.0.
3946 option( SQLITE_USE_SYSTEM_PACKAGE "Use SQLite3 from find_package" OFF )
3947
3948 if( SQLITE_USE_SYSTEM_PACKAGE )
3949- find_package(SQLite3 3.36 REQUIRED)
3950+ # Require a safe baseline (fixes truncation/memory-corruption issues).
3951+ find_package(SQLite3 3.50.2 REQUIRED)
3952 list(APPEND static_depends PACKAGE SQLite3)
3953 set(ROCFFT_SQLITE_LIB SQLite::SQLite3)
3954 else()
3955 include( FetchContent )
3956
3957- if(DEFINED ENV{SQLITE_3_43_2_SRC_URL})
3958- set(SQLITE_3_43_2_SRC_URL_INIT $ENV{SQLITE_3_43_2_SRC_URL})
3959+ # embed SQLite amalgamation (version 3.50.2 -> serial 3500200).
3960+ # allow override via environment variable for mirrors/airgapped builds.
3961+ if(DEFINED ENV{SQLITE_3_50_2_SRC_URL})
3962+ set(SQLITE_3_50_2_SRC_URL_INIT $ENV{SQLITE_3_50_2_SRC_URL})
3963 else()
3964- set(SQLITE_3_43_2_SRC_URL_INIT https://www.sqlite.org/2023/sqlite-amalgamation-3430200.zip)
3965+ set(SQLITE_3_50_2_SRC_URL_INIT https://www.sqlite.org/2025/sqlite-amalgamation-3500200.zip)
3966 endif()
3967- set(SQLITE_3_43_2_SRC_URL ${SQLITE_3_43_2_SRC_URL_INIT} CACHE STRING "Location of SQLite source code")
3968- set(SQLITE_SRC_3_43_2_SHA3_256 af02b88cc922e7506c6659737560c0756deee24e4e7741d4b315af341edd8b40 CACHE STRING "SHA3-256 hash of SQLite source code")
3969+ set(SQLITE_3_50_2_SRC_URL ${SQLITE_3_50_2_SRC_URL_INIT} CACHE STRING "Location of SQLite source code")
3970+ set(SQLITE_SRC_3_50_2_SHA3_256 75c118e727ee6a9a3d2c0e7c577500b0c16a848d109027f087b915b671f61f8a CACHE STRING "SHA3-256 hash of SQLite source code")
3971
3972- # embed SQLite
3973+ # use extract timestamp for fetched files instead of timestamps in the archive
3974 if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
3975- # use extract timestamp for fetched files instead of timestamps in the archive
3976 cmake_policy(SET CMP0135 NEW)
3977 endif()
3978+
3979 FetchContent_Declare(sqlite_local
3980- URL ${SQLITE_3_43_2_SRC_URL}
3981- URL_HASH SHA3_256=${SQLITE_SRC_3_43_2_SHA3_256}
3982+ URL ${SQLITE_3_50_2_SRC_URL}
3983+ URL_HASH SHA3_256=${SQLITE_SRC_3_50_2_SHA3_256}
3984 )
3985 FetchContent_MakeAvailable(sqlite_local)
3986
3987diff --git a/debian/changelog b/debian/changelog
3988index 4aea252..4c9901e 100644
3989--- a/debian/changelog
3990+++ b/debian/changelog
3991@@ -1,3 +1,27 @@
3992+rocfft (7.1.0-0ubuntu1) resolute; urgency=medium
3993+
3994+ [Talha Can Havadar]
3995+ * New upstream version 7.1.0 (LP: #2139240)
3996+ * d/control: remove ppc64el builds (LP: #2134241)
3997+ * d/control: update b-d to use upstream llvm based toolchain
3998+ * d/gbp.conf: update debian branch to ubuntu/devel
3999+ * d/patches/fix-hiprtc-link.patch: is already upstreamed
4000+ * d/p/Extend-docs-conf.py-for-offline-build.patch: fix the hunk
4001+ * d/p/do-not-strictly-depend-on-sqlite3-v3.50.2.patch: SQLite 3.50.2 is
4002+ not a hard requirement, so do not fail the build when it is unavailable
4003+ * d/librocfft-dev.examples: removed from upstream
4004+ * Changes by Igor Luppi made on top of 6.4.3-1~exp1 listed below to keep
4005+ history intact
4006+
4007+ [Igor Luppi]
4008+ * d/rules: use GPU_TARGETS instead of deprecated AMDGPU_TARGETS
4009+ * d/control: remove libamdhip64-dev since hipcc-rocm is used
4010+ * d/control: update build-depends
4011+ * d/control: update maintainer field
4012+ * d/rules: fix FTBFS by adding -Wl,--gc-sections to flags
4013+
4014+ -- Talha Can Havadar <talha.can.havadar@canonical.com> Fri, 23 Jan 2026 10:09:10 +0100
4015+
4016 rocfft (6.4.3-1) unstable; urgency=medium
4017
4018 [ Cordell Bloor ]
4019diff --git a/debian/control b/debian/control
4020index a58bcfd..56faddd 100644
4021--- a/debian/control
4022+++ b/debian/control
4023@@ -5,25 +5,25 @@ Priority: optional
4024 Standards-Version: 4.7.2
4025 Vcs-Git: https://salsa.debian.org/rocm-team/rocfft.git
4026 Vcs-Browser: https://salsa.debian.org/rocm-team/rocfft
4027-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
4028-Uploaders: Cordell Bloor <cgmb@debian.org>,
4029+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
4030+XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
4031+Uploaders: Maxime Chambonnet <maxzor@maxzor.eu>,
4032+ Cordell Bloor <cgmb@debian.org>,
4033 Christian Kastner <ckk@debian.org>,
4034 Kari Pahula <kaol@debian.org>
4035 Build-Depends: debhelper-compat (= 13),
4036 cmake,
4037- hipcc (>= 5.6.1~),
4038-# end
4039- libamd-comgr-dev (>= 6.0~),
4040-# ckk 2024-03-02: temporary until hipcc question is resolved:
4041- libamdhip64-dev (>= 5.6.1~),
4042+ hipcc (>= 7.1.0~),
4043+ libamd-comgr-dev (>= 7.1.0~),
4044 libboost-program-options-dev,
4045 libfftw3-dev,
4046 libgtest-dev <!nocheck>,
4047- libhsa-runtime-dev (>= 5.7.1~),
4048+ libhiprand-dev (>= 7.1.0~),
4049+ libhsa-runtime-dev (>= 7.1.0~),
4050 libsqlite3-dev,
4051- pkg-rocm-tools (>= 0.9.0~exp3),
4052+ pkg-rocm-tools (>= 0.9.0),
4053 python3-dev,
4054- rocm-cmake (>= 5.3.0)
4055+ rocm-cmake (>= 7.1.0)
4056 Build-Depends-Indep: dh-sequence-sphinxdoc <!nodoc>,
4057 libjs-jquery <!nodoc>,
4058 libjs-mathjax <!nodoc>,
4059@@ -36,7 +36,7 @@ Rules-Requires-Root: no
4060
4061 Package: librocfft0
4062 Section: libs
4063-Architecture: amd64 arm64 ppc64el
4064+Architecture: amd64 arm64
4065 Multi-Arch: same
4066 XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture}
4067 Depends: ${misc:Depends}, ${shlibs:Depends}
4068@@ -53,7 +53,7 @@ Description: ROCm library for computing Fast Fourier Transforms - library
4069
4070 Package: librocfft-dev
4071 Section: libdevel
4072-Architecture: amd64 arm64 ppc64el
4073+Architecture: amd64 arm64
4074 Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
4075 Recommends: libamdhip64-dev
4076 Suggests: librocfft-doc
4077@@ -70,7 +70,7 @@ Description: ROCm library for computing Fast Fourier Transforms - headers
4078
4079 Package: librocfft0-tests
4080 Section: libdevel
4081-Architecture: amd64 arm64 ppc64el
4082+Architecture: amd64 arm64
4083 Build-Profiles: <!nocheck>
4084 Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
4085 Description: ROCm library for computing Fast Fourier Transforms - tests
4086diff --git a/debian/gbp.conf b/debian/gbp.conf
4087index 357a581..2f75208 100644
4088--- a/debian/gbp.conf
4089+++ b/debian/gbp.conf
4090@@ -1,5 +1,5 @@
4091 [DEFAULT]
4092-debian-branch = debian/unstable
4093+debian-branch = ubuntu/devel
4094 upstream-branch = upstream/latest
4095 pristine-tar = True
4096
4097diff --git a/debian/librocfft-dev.examples b/debian/librocfft-dev.examples
4098deleted file mode 100644
4099index a8f30e1..0000000
4100--- a/debian/librocfft-dev.examples
4101+++ /dev/null
4102@@ -1 +0,0 @@
4103-docs/samples/*.cpp
4104diff --git a/debian/patches/Extend-docs-conf.py-for-offline-build.patch b/debian/patches/Extend-docs-conf.py-for-offline-build.patch
4105index be118d7..791856a 100644
4106--- a/debian/patches/Extend-docs-conf.py-for-offline-build.patch
4107+++ b/debian/patches/Extend-docs-conf.py-for-offline-build.patch
4108@@ -8,16 +8,14 @@ would cause the build to fail.
4109 docs/conf.py | 2 ++
4110 1 file changed, 2 insertions(+)
4111
4112-diff --git a/docs/conf.py b/docs/conf.py
4113-index ff2d0d2..f8ff415 100644
4114 --- a/docs/conf.py
4115 +++ b/docs/conf.py
4116-@@ -27,6 +27,8 @@ docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
4117+@@ -27,6 +27,8 @@
4118 docs_core.setup()
4119
4120 external_projects_current_project = "rocfft"
4121 +external_projects_remote_repository = ""
4122 +external_projects_remote_branch = ""
4123- external_projects = []
4124
4125 for sphinx_var in ROCmDocs.SPHINX_VARS:
4126+ globals()[sphinx_var] = getattr(docs_core, sphinx_var)
4127diff --git a/debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch b/debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch
4128new file mode 100644
4129index 0000000..fc6e543
4130--- /dev/null
4131+++ b/debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch
4132@@ -0,0 +1,21 @@
4133+From: Talha Can Havadar <havadartalha@gmail.com>
4134+Date: Wed, 26 Nov 2025 15:34:26 +0100
4135+Subject: do not strictly depend on sqlite3 v3.50.2
4136+
4137+---
4138+ cmake/sqlite.cmake | 2 +-
4139+ 1 file changed, 1 insertion(+), 1 deletion(-)
4140+
4141+diff --git a/cmake/sqlite.cmake b/cmake/sqlite.cmake
4142+index 2dc2a28..a44e4ef 100644
4143+--- a/cmake/sqlite.cmake
4144++++ b/cmake/sqlite.cmake
4145+@@ -26,7 +26,7 @@ option( SQLITE_USE_SYSTEM_PACKAGE "Use SQLite3 from find_package" OFF )
4146+
4147+ if( SQLITE_USE_SYSTEM_PACKAGE )
4148+ # Require a safe baseline (fixes truncation/memory-corruption issues).
4149+- find_package(SQLite3 3.50.2 REQUIRED)
4150++ find_package(SQLite3 REQUIRED)
4151+ list(APPEND static_depends PACKAGE SQLite3)
4152+ set(ROCFFT_SQLITE_LIB SQLite::SQLite3)
4153+ else()
4154diff --git a/debian/patches/fix-hiprtc-link.patch b/debian/patches/fix-hiprtc-link.patch
4155deleted file mode 100644
4156index 3ff2d1f..0000000
4157--- a/debian/patches/fix-hiprtc-link.patch
4158+++ /dev/null
4159@@ -1,29 +0,0 @@
4160-From: Cordell Bloor <cgmb@slerp.xyz>
4161-Date: Tue, 23 May 2023 18:57:14 -0600
4162-Subject: fix hiprtc link
4163-
4164-The upstream build code linking hiprtc doesn't work with the Debian
4165-package for hiprtc. The hip rtc library is correctly linked by
4166-hip::host, so there's no need to handle it separately. That said, some
4167-other packages for HIP (such as those provided by Spack), have patched
4168-out the automatic linking of hiprtc in hip::host, so upstream will
4169-probably want to find an alternative solution.
4170-
4171-Forwarded: not-needed
4172----
4173- library/src/CMakeLists.txt | 2 +-
4174- 1 file changed, 1 insertion(+), 1 deletion(-)
4175-
4176-diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
4177-index 63c584b..c02f2fb 100644
4178---- a/library/src/CMakeLists.txt
4179-+++ b/library/src/CMakeLists.txt
4180-@@ -56,7 +56,7 @@ else()
4181- if( WIN32 )
4182- set( ROCFFT_RTC_LINK_LIBS "${HIP_PATH}/lib/hiprtc.lib" )
4183- else()
4184-- set( ROCFFT_RTC_LINK_LIBS -L${ROCM_PATH}/lib -lhiprtc -ldl )
4185-+ set( ROCFFT_RTC_LINK_LIBS hip::host -ldl )
4186- endif()
4187- endif()
4188-
4189diff --git a/debian/patches/series b/debian/patches/series
4190index 8c51f36..581e5fe 100644
4191--- a/debian/patches/series
4192+++ b/debian/patches/series
4193@@ -1,5 +1,5 @@
4194-fix-hiprtc-link.patch
4195 use-local-mathjax.patch
4196 Extend-docs-conf.py-for-offline-build.patch
4197 Add-version-script-to-control-exposed-symbols.patch
4198 Force-host-code-resultion.patch
4199+do-not-strictly-depend-on-sqlite3-v3.50.2.patch
4200diff --git a/debian/rules b/debian/rules
4201index d92552b..6cbc80a 100755
4202--- a/debian/rules
4203+++ b/debian/rules
4204@@ -2,6 +2,12 @@
4205 export CXX=hipcc
4206 export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto
4207 export DEB_CXXFLAGS_MAINT_PREPEND = -gz
4208+# -Wl,--gc-sections:
4209+# Fixes linker errors ("relocation refers to a discarded section") when using gtest.
4210+# The linker's section garbage collection incorrectly discards needed code due to
4211+# a toolchain mismatch (ROCm's clang vs. the system's GCC). This flag forces the
4212+# linker to correctly re-evaluate dependencies, keeping the required sections.
4213+export DEB_LDFLAGS_MAINT_PREPEND = -Wl,--gc-sections
4214 export VERBOSE=1
4215
4216 # filter incompatible options from affecting device code
4217@@ -13,7 +19,6 @@ VERSION_STRING = $(shell sed -nr 's/^set.*VERSION_STRING \"([.0-9]+)\".*/\1/p' C
4218
4219 CMAKE_FLAGS = \
4220 -DCMAKE_BUILD_TYPE=Release \
4221- -DCMAKE_SKIP_RPATH=ON \
4222 -DGPU_TARGETS="$(shell rocm-target-arch --sep ';')" \
4223 -DROCFFT_KERNEL_CACHE_ENABLE=OFF \
4224 -DROCM_SYMLINK_LIBS=OFF \
4225diff --git a/debian/tests/control b/debian/tests/control
4226index bd8c7a5..6cc0da2 100644
4227--- a/debian/tests/control
4228+++ b/debian/tests/control
4229@@ -2,4 +2,4 @@ Test-Command: rocm-test-launcher /usr/libexec/rocm/librocfft0-tests/run-tests
4230 Features: test-name=librocfft0-tests
4231 Depends: librocfft0-tests, pkg-rocm-tools
4232 Restrictions: allow-stderr, skippable
4233-Architecture: amd64 arm64 ppc64el
4234+Architecture: amd64 arm64
4235diff --git a/docs/design/bluestein.rst b/designdocs/bluestein.rst
4236similarity index 100%
4237rename from docs/design/bluestein.rst
4238rename to designdocs/bluestein.rst
4239diff --git a/docs/design/buffer_assignment.rst b/designdocs/buffer_assignment.rst
4240similarity index 100%
4241rename from docs/design/buffer_assignment.rst
4242rename to designdocs/buffer_assignment.rst
4243diff --git a/docs/design/codegen.rst b/designdocs/codegen.rst
4244similarity index 99%
4245rename from docs/design/codegen.rst
4246rename to designdocs/codegen.rst
4247index 46b8255..28fdde0 100644
4248--- a/docs/design/codegen.rst
4249+++ b/designdocs/codegen.rst
4250@@ -308,7 +308,7 @@ tiling. Different twiddle table strategies should extend the
4251 ``multiply`` methods.
4252
4253 Twiddle tables may also require additional templates and arguments.
4254-See :ref:`Stockham tiling implementation`.
4255+See the Stockham tiling implementation section.
4256
4257 Copyright and disclaimer
4258 ========================
4259diff --git a/docs/design/design.rst b/designdocs/design.rst
4260similarity index 66%
4261rename from docs/design/design.rst
4262rename to designdocs/design.rst
4263index 7cb5f7a..7e97b33 100644
4264--- a/docs/design/design.rst
4265+++ b/designdocs/design.rst
4266@@ -10,9 +10,9 @@ Design Documents
4267
4268 The Design Documents contain proposals for features of the rocFFT library. They are intended as development proposals for engineering and contributors to the Open Source library.
4269
4270-The current proposals include the following:
4271+The current proposals in this directory include the following:
4272
4273- * :ref:`codegen`
4274- * :ref:`runtime_compilation`
4275- * :ref:`buffer_assignment`
4276- * :ref:`bluestein`
4277+ * codegen
4278+ * runtime_compilation
4279+ * buffer_assignment
4280+ * bluestein
4281diff --git a/docs/design/images/bluestein_fig1.png b/designdocs/images/bluestein_fig1.png
4282similarity index 100%
4283rename from docs/design/images/bluestein_fig1.png
4284rename to designdocs/images/bluestein_fig1.png
4285Binary files a/docs/design/images/bluestein_fig1.png and b/designdocs/images/bluestein_fig1.png differ
4286diff --git a/docs/design/images/bluestein_fig2.png b/designdocs/images/bluestein_fig2.png
4287similarity index 100%
4288rename from docs/design/images/bluestein_fig2.png
4289rename to designdocs/images/bluestein_fig2.png
4290Binary files a/docs/design/images/bluestein_fig2.png and b/designdocs/images/bluestein_fig2.png differ
4291diff --git a/docs/design/images/bluestein_fig3.png b/designdocs/images/bluestein_fig3.png
4292similarity index 100%
4293rename from docs/design/images/bluestein_fig3.png
4294rename to designdocs/images/bluestein_fig3.png
4295Binary files a/docs/design/images/bluestein_fig3.png and b/designdocs/images/bluestein_fig3.png differ
4296diff --git a/docs/design/images/bluestein_fig4.png b/designdocs/images/bluestein_fig4.png
4297similarity index 100%
4298rename from docs/design/images/bluestein_fig4.png
4299rename to designdocs/images/bluestein_fig4.png
4300Binary files a/docs/design/images/bluestein_fig4.png and b/designdocs/images/bluestein_fig4.png differ
4301diff --git a/docs/design/runtime_compilation.rst b/designdocs/runtime_compilation.rst
4302similarity index 100%
4303rename from docs/design/runtime_compilation.rst
4304rename to designdocs/runtime_compilation.rst
4305diff --git a/docs/conf.py b/docs/conf.py
4306index c83a6fb..304d660 100644
4307--- a/docs/conf.py
4308+++ b/docs/conf.py
4309@@ -13,7 +13,7 @@ left_nav_title = f"rocFFT {version_number} Documentation"
4310 # for PDF output on Read the Docs
4311 project = "rocFFT Documentation"
4312 author = "Advanced Micro Devices, Inc."
4313-copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
4314+copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
4315 version = version_number
4316 release = version_number
4317
4318@@ -24,7 +24,6 @@ docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
4319 docs_core.setup()
4320
4321 external_projects_current_project = "rocfft"
4322-external_projects = []
4323
4324 for sphinx_var in ROCmDocs.SPHINX_VARS:
4325 globals()[sphinx_var] = getattr(docs_core, sphinx_var)
4326diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
4327index c47b8fc..3fbe0d4 100644
4328--- a/docs/doxygen/Doxyfile
4329+++ b/docs/doxygen/Doxyfile
4330@@ -38,7 +38,7 @@ PROJECT_NAME = "rocFFT"
4331 # could be handy for archiving the generated documentation or if some version
4332 # control system is used.
4333
4334-PROJECT_NUMBER = v1.0.32
4335+PROJECT_NUMBER = v1.0.35
4336
4337 # Using the PROJECT_BRIEF tag one can provide an optional one line description
4338 # for a project that appears at the top of each page and should give viewer a
4339diff --git a/docs/how-to/enabling-logging.rst b/docs/how-to/enabling-logging.rst
4340new file mode 100644
4341index 0000000..9d9f59a
4342--- /dev/null
4343+++ b/docs/how-to/enabling-logging.rst
4344@@ -0,0 +1,134 @@
4345+.. meta::
4346+ :description: Enabling logging in rocFFT
4347+ :keywords: rocFFT, ROCm, API, documentation, logging
4348+
4349+
4350+.. _enabling-logging:
4351+
4352+********************************************************************
4353+Enabling logging in rocFFT
4354+********************************************************************
4355+
4356+rocFFT can write a variety of log messages to aid troubleshooting. Here are the different
4357+logs that rocFFT supports.
4358+
4359+* **Trace logging**: Logs the library entry points (for example, ``rocfft_plan_create`` or ``rocfft_execute``) and their parameter
4360+ values when they are called. Error messages during plan creation and execution are also logged here.
4361+* **Benchmark logging**: Logs the ``rocfft-bench`` command line when a plan is created.
4362+ You can use this command to rerun the same transform later.
4363+* **Profile logging**: Logs a message for each kernel launched during plan execution.
4364+ This message contains the following elements:
4365+
4366+ * Kernel duration
4367+ * The size of the user data buffers seen by the kernel
4368+ * Estimates for the observed memory bandwidth and bandwidth efficiency
4369+
4370+ .. note::
4371+
4372+ To provide the kernel duration, rocFFT must use ``hipEvents`` and wait for each kernel to complete.
4373+ This might interfere with time measurement at higher levels, for example, for ``rocfft-bench``.
4374+
4375+* **Plan logging**: Logs the plan details when a transform is executed, including the following:
4376+
4377+ * Each TreeNode in the plan
4378+ * The work buffer size required by the plan
4379+ * The kernel grid and block dimensions
4380+ * The kernel maximum occupancy (estimated by HIP)
4381+
4382+* **Kernel I/O logging**: Logs the kernel details during plan execution, including the input to each
4383+ kernel (the data provided by the user) and the final output of the transform.
4384+
4385+ .. note::
4386+
4387+ The amount of data logged can become very large, particularly for 2D and 3D transforms, so
4388+ logging it to a file instead of stderr is usually a good idea. See the next section for more details.
4389+
4390+ Writing the data involves extra ``hipMemcpy`` operations and serializing the
4391+ data to the log can also take a significant amount of time. Both of these factors affect performance.
4392+
4393+* **Runtime compilation logging**: Logs details about runtime compilation during plan creation,
4394+ including the following:
4395+
4396+ * The source code
4397+ * Messages indicating a kernel was found in a cache, and did not need to be compiled at runtime
4398+ * Compilation errors (if any)
4399+ * Duration measurements indicating the time it took to generate source code for the kernel and compile the kernel
4400+
4401+ The source code for the kernels is delimited by lines containing the strings ``ROCFFT_RTC_BEGIN``
4402+ and ``ROCFFT_RTC_END``. This lets you isolate the source code for each kernel if a
4403+ single log contains code for multiple kernels.
4404+
4405+ .. note::
4406+
4407+ All non-code messages (except for compile errors) are written as C++ comments, so
4408+ you can pass the whole file to clang-format to inspect the source code.
4409+
4410+ The source code details for the runtime compilation can be very large, so consider writing
4411+ this log to a file instead of stderr.
4412+
4413+* **Tuning logging**: Logs details about any kernels that are tried and rejected while tuning is running.
4414+ It also logs messages when tuned solutions are used during plan building.
4415+* **Graph logging**: Logs the graph of subplans during multi-GPU or multi-process plan execution.
4416+ Subplans include FFT plans, transpose plans (to reshape data for communication), and communication steps.
4417+ This is written as Graphviz data. The view of the global graph might differ slightly depending on
4418+ which node logs it. This is because the current node has more visibility into subplans that run locally
4419+ than those that run on other nodes.
4420+
4421+Configuring the logging output
4422+==============================
4423+
4424+The logging output can be controlled using the ``ROCFFT_LAYER`` environment variable.
4425+``ROCFFT_LAYER`` is a numerical bitmask, where zero or more bits can be set to enable one or more logging layers.
4426+The log output is written to stderr by default.
4427+
4428+The following table maps the different logging layers to a ``ROCFFT_LAYER`` bit field value.
4429+To determine what value to set for ``ROCFFT_LAYER``, add up the values of all the layers you want to see.
4430+For example, to see the output for trace, profile, and plan logging, set ``ROCFFT_LAYER`` to ``13``
4431+(``1`` + ``4`` + ``8``).
4432+
4433+.. csv-table::
4434+ :header: "Log type","ROCFFT_LAYER bit field value"
4435+ :widths: 20, 20
4436+
4437+ "Trace logging","1"
4438+ "Benchmark logging","2"
4439+ "Profile logging","4"
4440+ "Plan logging","8"
4441+ "Kernel I/O logging","16"
4442+ "Runtime compilation logging","32"
4443+ "Tuning logging","64"
4444+ "Graph logging","128"
4445+
4446+Logging to a file
4447+=============================
4448+
4449+By default, messages are written to stderr, but they can be redirected to
4450+output files using the environment variables described in this section.
4451+Each type of log can be redirected separately using a unique environment variable.
4452+The corresponding log must be enabled using the ``ROCFFT_LAYER`` variable
4453+before any details can be logged to the file.
4454+For example, to redirect the trace log to a file, trace logging must
4455+also be enabled in the ``ROCFFT_LAYER`` bit field.
4456+
4457+.. note::
4458+
4459+ Some log types, such as kernel I/O logging and runtime compilation logging, can generate
4460+ a large number of log entries, so redirecting their output to a file is recommended.
4461+
4462+The following table lists the environment variable to redirect logging for each
4463+log type. Set this variable to a valid file path to redirect the output of the corresponding log type.
4464+For example, to send the trace logging output to a file, enable the trace log, then set the
4465+``ROCFFT_LOG_TRACE_PATH`` variable to the name of the destination file.
4466+
4467+.. csv-table::
4468+ :header: "Log type","File redirection variable"
4469+ :widths: 20, 30
4470+
4471+ "Trace logging","``ROCFFT_LOG_TRACE_PATH``"
4472+ "Benchmark logging","``ROCFFT_LOG_BENCH_PATH``"
4473+ "Profile logging","``ROCFFT_LOG_PROFILE_PATH``"
4474+ "Plan logging","``ROCFFT_LOG_PLAN_PATH``"
4475+ "Kernel I/O logging","``ROCFFT_LOG_KERNELIO_PATH``"
4476+ "Runtime compilation logging","``ROCFFT_LOG_RTC_PATH``"
4477+ "Tuning logging","``ROCFFT_LOG_TUNING_PATH``"
4478+ "Graph logging","``ROCFFT_LOG_GRAPH_PATH``"
4479diff --git a/docs/how-to/load-store-callbacks.rst b/docs/how-to/load-store-callbacks.rst
4480index 9d37995..a8ee6ed 100644
4481--- a/docs/how-to/load-store-callbacks.rst
4482+++ b/docs/how-to/load-store-callbacks.rst
4483@@ -17,6 +17,11 @@ to the library using
4484 :cpp:func:`rocfft_execution_info_set_load_callback` and
4485 :cpp:func:`rocfft_execution_info_set_store_callback`.
4486
4487+.. note::
4488+
4489+ Callback functions must be built as relocatable device code by
4490+ passing the ``-fgpu-rdc`` option to the compiler and linker.
4491+
4492 Device functions supplied as callbacks must load and store element
4493 data types appropriate for the transform being executed.
4494
4495diff --git a/docs/how-to/working-with-rocfft.rst b/docs/how-to/working-with-rocfft.rst
4496index 26f8e57..7d764b8 100644
4497--- a/docs/how-to/working-with-rocfft.rst
4498+++ b/docs/how-to/working-with-rocfft.rst
4499@@ -181,7 +181,7 @@ You must allocate, initialize, and specify the input/output buffers that hold th
4500 For larger transforms, temporary work buffers might be needed. Because the library tries to minimize its own allocation of
4501 memory regions on the device, it expects you to manage the work buffers. The size of the buffer that is needed can be queried using
4502 :cpp:func:`rocfft_plan_get_work_buffer_size`. After allocation, it can be passed to the library using
4503-:cpp:func:`rocfft_execution_info_set_work_buffer`. The `GitHub repository <https://github.com/ROCm/rocFFT/tree/develop/clients/samples>`_
4504+:cpp:func:`rocfft_execution_info_set_work_buffer`. The `GitHub repository <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/clients/samples>`_
4505 provides some samples and examples.
4506
4507 Transform and array types
4508@@ -316,8 +316,8 @@ reside on a different device and have its own layout parameters.
4509
4510 The rocFFT APIs for declaring fields and bricks are currently experimental and
4511 subject to change in future releases. To submit feedback, questions, and comments
4512- about these interfaces, use the `rocFFT issue tracker
4513- <https://github.com/ROCmSoftwarePlatform/rocFFT/issues>`_.
4514+ about these interfaces, use the `rocm-libraries issue tracker
4515+ <https://github.com/ROCm/rocm-libraries/issues>`_.
4516
4517 The workflow for using fields is as follows:
4518
4519diff --git a/docs/index.rst b/docs/index.rst
4520index 71ef011..82f089d 100644
4521--- a/docs/index.rst
4522+++ b/docs/index.rst
4523@@ -15,7 +15,11 @@ The rocFFT library calculates discrete Fourier transforms for one, two, and thre
4524 supporting various data types for real and complex values.
4525 To learn more, see :doc:`What is rocFFT? <./what-is-rocfft>`
4526
4527-The rocFFT public repository is located at `<https://github.com/ROCm/rocFFT>`_.
4528+The rocFFT public repository is located at `<https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft>`_.
4529+
4530+.. note::
4531+
4532+ The rocFFT repository for ROCm 6.4.3 and earlier is located at `<https://github.com/ROCm/rocFFT>`_.
4533
4534 .. grid:: 2
4535 :gutter: 3
4536@@ -38,10 +42,11 @@ The rocFFT public repository is located at `<https://github.com/ROCm/rocFFT>`_.
4537 * :doc:`Load and store callbacks <./how-to/load-store-callbacks>`
4538 * :doc:`Use runtime compilation <./how-to/runtime-compilation>`
4539 * :doc:`Distribute transforms <./how-to/distributed-transforms>`
4540+ * :doc:`Enable logging <./how-to/enabling-logging>`
4541
4542 .. grid-item-card:: Samples
4543
4544- * `rocFFT GitHub client examples <https://github.com/ROCm/rocFFT/tree/develop/clients/samples>`_
4545+ * `rocFFT GitHub client examples <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/clients/samples>`_
4546
4547 .. grid-item-card:: API reference
4548
4549diff --git a/docs/install/building-installing-rocfft.rst b/docs/install/building-installing-rocfft.rst
4550index 44fb458..5d5f2a1 100644
4551--- a/docs/install/building-installing-rocfft.rst
4552+++ b/docs/install/building-installing-rocfft.rst
4553@@ -32,7 +32,8 @@ You can use the GitHub releases tab to download the source code. This might prov
4554 than the prebuilt packages.
4555
4556 rocFFT uses the AMD clang++ compiler and CMake. You can specify several options to customize your build.
4557-Use the following commands to build a shared library for the supported AMD GPUs:
4558+Use the following commands to build a shared library for the supported AMD GPUs.
4559+Run these commands from the ``rocm-libraries/projects/rocfft`` directory:
4560
4561 .. code-block:: shell
4562
4563@@ -78,7 +79,7 @@ rocFFT uses version 1.11 of GoogleTest.
4564
4565 You can build the clients separately from the main library.
4566 For example, to build all the clients with an existing rocFFT library, invoke CMake from
4567-within the ``rocFFT-src/clients`` folder using these commands:
4568+within the ``rocm-libraries/projects/rocfft/rocFFT-src/clients`` folder using these commands:
4569
4570 .. code-block:: shell
4571
4572diff --git a/docs/license.md b/docs/license.md
4573index 7eeeef5..c1c707c 100644
4574--- a/docs/license.md
4575+++ b/docs/license.md
4576@@ -5,5 +5,7 @@ myst:
4577 "keywords": "rocFFT, FFT, ROCm, API, documentation, license"
4578 ---
4579
4580+# License
4581+
4582 ```{include} ../LICENSE.md
4583 ```
4584diff --git a/docs/samples/README.md b/docs/samples/README.md
4585deleted file mode 100644
4586index ea0f091..0000000
4587--- a/docs/samples/README.md
4588+++ /dev/null
4589@@ -1,12 +0,0 @@
4590-# Samples to demo using rocfft
4591-
4592-## `complex_1d`
4593-
4594-You may need to add the directories for amdclang++ and rocFFT to your
4595-`CMAKE_PREFIX_PATH`, and ensure that `amdclang++` is in your `PATH`.
4596-
4597-``` bash
4598-$ mkdir build && cd build
4599-$ cmake -DCMAKE_CXX_COMPILER=amdclang++ ..
4600-$ make
4601-```
4602diff --git a/docs/samples/complex_1d.cpp b/docs/samples/complex_1d.cpp
4603deleted file mode 100644
4604index e815f15..0000000
4605--- a/docs/samples/complex_1d.cpp
4606+++ /dev/null
4607@@ -1,178 +0,0 @@
4608-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
4609-//
4610-// Permission is hereby granted, free of charge, to any person obtaining a copy
4611-// of this software and associated documentation files (the "Software"), to deal
4612-// in the Software without restriction, including without limitation the rights
4613-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4614-// copies of the Software, and to permit persons to whom the Software is
4615-// furnished to do so, subject to the following conditions:
4616-//
4617-// The above copyright notice and this permission notice shall be included in
4618-// all copies or substantial portions of the Software.
4619-//
4620-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4621-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4622-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4623-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4624-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4625-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
4626-// THE SOFTWARE.
4627-
4628-#include <cassert>
4629-#include <complex>
4630-#include <iostream>
4631-#include <vector>
4632-
4633-#include <hip/hip_runtime_api.h>
4634-
4635-#include <rocfft/rocfft.h>
4636-
4637-int main(int argc, char* argv[])
4638-{
4639- std::cout << "rocFFT complex 1d FFT example\n";
4640-
4641- // The problem size
4642- const size_t Nx = (argc < 2) ? 8 : atoi(argv[1]);
4643- const bool inplace = (argc < 3) ? false : atoi(argv[2]);
4644- std::cout << "Nx: " << Nx << "\tin-place: " << inplace << std::endl;
4645-
4646- // Initialize data on the host:
4647- std::cout << "Input:\n";
4648- std::vector<std::complex<float>> cx(Nx);
4649- for(size_t i = 0; i < Nx; i++)
4650- {
4651- cx[i] = std::complex<float>(i, 0);
4652- }
4653- for(size_t i = 0; i < Nx; i++)
4654- {
4655- std::cout << cx[i] << " ";
4656- }
4657- std::cout << std::endl;
4658-
4659- // Create HIP device object and copy data:
4660- float2* x = NULL;
4661- hipMalloc(&x, cx.size() * sizeof(decltype(cx)::value_type));
4662- float2* y = inplace ? (float2*)x : NULL;
4663- if(!inplace)
4664- {
4665- hipMalloc(&y, cx.size() * sizeof(decltype(cx)::value_type));
4666- }
4667- hipMemcpy(x, cx.data(), cx.size() * sizeof(decltype(cx)::value_type), hipMemcpyHostToDevice);
4668-
4669- rocfft_setup();
4670-
4671- rocfft_status status = rocfft_status_success;
4672-
4673- // Create forward plan
4674- rocfft_plan forward = NULL;
4675- status = rocfft_plan_create(&forward,
4676- inplace ? rocfft_placement_inplace : rocfft_placement_notinplace,
4677- rocfft_transform_type_complex_forward,
4678- rocfft_precision_single,
4679- 1, // Dimensions
4680- &Nx, // lengths
4681- 1, // Number of transforms
4682- NULL); // Description
4683- assert(status == rocfft_status_success);
4684-
4685- // We may need work memory, which is passed via rocfft_execution_info
4686- rocfft_execution_info forwardinfo = NULL;
4687- status = rocfft_execution_info_create(&forwardinfo);
4688- assert(status == rocfft_status_success);
4689- size_t fbuffersize = 0;
4690- status = rocfft_plan_get_work_buffer_size(forward, &fbuffersize);
4691- assert(status == rocfft_status_success);
4692- void* fbuffer = NULL;
4693- if(fbuffersize > 0)
4694- {
4695- hipMalloc(&fbuffer, fbuffersize);
4696- status = rocfft_execution_info_set_work_buffer(forwardinfo, fbuffer, fbuffersize);
4697- assert(status == rocfft_status_success);
4698- }
4699-
4700- // Create backward plan
4701- rocfft_plan backward = NULL;
4702- status = rocfft_plan_create(&backward,
4703- inplace ? rocfft_placement_inplace : rocfft_placement_notinplace,
4704- rocfft_transform_type_complex_inverse,
4705- rocfft_precision_single,
4706- 1, // Dimensions
4707- &Nx, // lengths
4708- 1, // Number of transforms
4709- NULL); // Description
4710- assert(status == rocfft_status_success);
4711-
4712- // Execution info for the backward transform:
4713- rocfft_execution_info backwardinfo = NULL;
4714- status = rocfft_execution_info_create(&backwardinfo);
4715- assert(status == rocfft_status_success);
4716- size_t bbuffersize = 0;
4717- status = rocfft_plan_get_work_buffer_size(backward, &bbuffersize);
4718- assert(status == rocfft_status_success);
4719- void* bbuffer = NULL;
4720- if(bbuffersize > 0)
4721- {
4722- hipMalloc(&bbuffer, bbuffersize);
4723- status = rocfft_execution_info_set_work_buffer(backwardinfo, bbuffer, bbuffersize);
4724- assert(status == rocfft_status_success);
4725- }
4726-
4727- // Execute the forward transform
4728- status = rocfft_execute(forward,
4729- (void**)&x, // in_buffer
4730- (void**)&y, // out_buffer
4731- forwardinfo); // execution info
4732- assert(status == rocfft_status_success);
4733-
4734- // Copy result back to host
4735- std::vector<std::complex<float>> cy(Nx);
4736- hipMemcpy(cy.data(), y, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost);
4737-
4738- std::cout << "Transformed:\n";
4739- for(size_t i = 0; i < cy.size(); i++)
4740- {
4741- std::cout << cy[i] << " ";
4742- }
4743- std::cout << std::endl;
4744-
4745- // Execute the backward transform
4746- rocfft_execute(backward,
4747- (void**)&y, // in_buffer
4748- (void**)&x, // out_buffer
4749- backwardinfo); // execution info
4750-
4751- std::cout << "Transformed back:\n";
4752- hipMemcpy(cy.data(), x, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost);
4753- for(size_t i = 0; i < cy.size(); i++)
4754- {
4755- std::cout << cy[i] << " ";
4756- }
4757- std::cout << std::endl;
4758-
4759- const float overN = 1.0f / Nx;
4760- float error = 0.0f;
4761- for(size_t i = 0; i < cx.size(); i++)
4762- {
4763- float diff = std::max(std::abs(cx[i].real() - cy[i].real() * overN),
4764- std::abs(cx[i].imag() - cy[i].imag() * overN));
4765- if(diff > error)
4766- {
4767- error = diff;
4768- }
4769- }
4770- std::cout << "Maximum error: " << error << "\n";
4771-
4772- hipFree(x);
4773- if(!inplace)
4774- {
4775- hipFree(y);
4776- }
4777- hipFree(fbuffer);
4778- hipFree(bbuffer);
4779-
4780- // Destroy plans
4781- rocfft_plan_destroy(forward);
4782- rocfft_plan_destroy(backward);
4783-
4784- rocfft_cleanup();
4785-}
4786diff --git a/docs/samples/complex_2d.cpp b/docs/samples/complex_2d.cpp
4787deleted file mode 100644
4788index 9da818b..0000000
4789--- a/docs/samples/complex_2d.cpp
4790+++ /dev/null
4791@@ -1,198 +0,0 @@
4792-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
4793-//
4794-// Permission is hereby granted, free of charge, to any person obtaining a copy
4795-// of this software and associated documentation files (the "Software"), to deal
4796-// in the Software without restriction, including without limitation the rights
4797-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4798-// copies of the Software, and to permit persons to whom the Software is
4799-// furnished to do so, subject to the following conditions:
4800-//
4801-// The above copyright notice and this permission notice shall be included in
4802-// all copies or substantial portions of the Software.
4803-//
4804-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4805-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4806-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4807-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4808-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4809-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
4810-// THE SOFTWARE.
4811-
4812-#include <cassert>
4813-#include <complex>
4814-#include <iostream>
4815-#include <vector>
4816-
4817-#include <hip/hip_runtime_api.h>
4818-
4819-#include <rocfft/rocfft.h>
4820-
4821-int main(int argc, char* argv[])
4822-{
4823- std::cout << "rocFFT complex 2d FFT example\n";
4824-
4825- // The problem size
4826- const size_t Nx = (argc < 2) ? 8 : atoi(argv[1]);
4827- const size_t Ny = (argc < 3) ? 8 : atoi(argv[2]);
4828- const bool inplace = (argc < 4) ? false : atoi(argv[3]);
4829- std::cout << "Nx: " << Nx << "\tNy: " << Ny << "\tin-place: " << inplace << std::endl;
4830-
4831- // Initialize data on the host
4832- std::cout << "Input:\n";
4833- std::vector<std::complex<float>> cx(Nx * Ny);
4834- for(size_t i = 0; i < Nx; i++)
4835- {
4836- for(size_t j = 0; j < Ny; j++)
4837- {
4838- cx[i * Ny + j] = std::complex<float>(i + j, 0.0);
4839- }
4840- }
4841- for(size_t i = 0; i < Nx; i++)
4842- {
4843- for(size_t j = 0; j < Ny; j++)
4844- {
4845- std::cout << cx[i * Ny + j] << " ";
4846- }
4847- std::cout << "\n";
4848- }
4849- std::cout << "\n";
4850-
4851- rocfft_setup();
4852-
4853- // Create HIP device object and copy data:
4854- float2* x = NULL;
4855- hipMalloc(&x, cx.size() * sizeof(decltype(cx)::value_type));
4856- float2* y = inplace ? (float2*)x : NULL;
4857- if(!inplace)
4858- {
4859- hipMalloc(&y, cx.size() * sizeof(decltype(cx)::value_type));
4860- }
4861- hipMemcpy(x, cx.data(), cx.size() * sizeof(decltype(cx)::value_type), hipMemcpyHostToDevice);
4862-
4863- // Length are in reverse order because rocfft is column-major.
4864- const size_t lengths[2] = {Ny, Nx};
4865-
4866- rocfft_status status = rocfft_status_success;
4867-
4868- // Create plans
4869- rocfft_plan forward = NULL;
4870- status = rocfft_plan_create(&forward,
4871- inplace ? rocfft_placement_inplace : rocfft_placement_notinplace,
4872- rocfft_transform_type_complex_forward,
4873- rocfft_precision_single,
4874- 2, // Dimensions
4875- lengths, // lengths
4876- 1, // Number of transforms
4877- NULL); // Description
4878- assert(status == rocfft_status_success);
4879-
4880- // We may need work memory, which is passed via rocfft_execution_info
4881- rocfft_execution_info forwardinfo = NULL;
4882- status = rocfft_execution_info_create(&forwardinfo);
4883- assert(status == rocfft_status_success);
4884- size_t fbuffersize = 0;
4885- status = rocfft_plan_get_work_buffer_size(forward, &fbuffersize);
4886- assert(status == rocfft_status_success);
4887- void* fbuffer = NULL;
4888- if(fbuffersize > 0)
4889- {
4890- hipMalloc(&fbuffer, fbuffersize);
4891- status = rocfft_execution_info_set_work_buffer(forwardinfo, fbuffer, fbuffersize);
4892- assert(status == rocfft_status_success);
4893- }
4894-
4895- // Create plans
4896- rocfft_plan backward = NULL;
4897- status = rocfft_plan_create(&backward,
4898- inplace ? rocfft_placement_inplace : rocfft_placement_notinplace,
4899- rocfft_transform_type_complex_inverse,
4900- rocfft_precision_single,
4901- 2, // Dimensions
4902- lengths, // lengths
4903- 1, // Number of transforms
4904- NULL); // Description
4905- assert(status == rocfft_status_success);
4906-
4907- // Execution info for the backward transform:
4908- rocfft_execution_info backwardinfo = NULL;
4909- status = rocfft_execution_info_create(&backwardinfo);
4910- assert(status == rocfft_status_success);
4911- size_t bbuffersize = 0;
4912- status = rocfft_plan_get_work_buffer_size(backward, &bbuffersize);
4913- assert(status == rocfft_status_success);
4914- void* bbuffer = NULL;
4915- if(bbuffersize > 0)
4916- {
4917- hipMalloc(&bbuffer, bbuffersize);
4918- status = rocfft_execution_info_set_work_buffer(backwardinfo, bbuffer, bbuffersize);
4919- assert(status == rocfft_status_success);
4920- }
4921-
4922- // Execute the forward transform
4923- status = rocfft_execute(forward,
4924- (void**)&x, // in_buffer
4925- (void**)&y, // out_buffer
4926- forwardinfo); // execution info
4927- assert(status == rocfft_status_success);
4928-
4929- // Copy result back to host
4930- std::vector<std::complex<float>> cy(cx.size());
4931- hipMemcpy(cy.data(), y, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost);
4932-
4933- std::cout << "Transformed:\n";
4934- for(size_t i = 0; i < Nx; i++)
4935- {
4936- for(size_t j = 0; j < Ny; j++)
4937- {
4938- std::cout << cy[i * Ny + j] << " ";
4939- }
4940- std::cout << "\n";
4941- }
4942- std::cout << "\n";
4943-
4944- // Execute the backward transform
4945- status = rocfft_execute(backward,
4946- (void**)&y, // in_buffer
4947- (void**)&x, // out_buffer
4948- backwardinfo); // execution info
4949- assert(status == rocfft_status_success);
4950-
4951- hipMemcpy(cy.data(), x, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost);
4952- std::cout << "Transformed back:\n";
4953- for(size_t i = 0; i < Nx; i++)
4954- {
4955- for(size_t j = 0; j < Ny; j++)
4956- {
4957- std::cout << cy[i * Ny + j] << " ";
4958- }
4959- std::cout << "\n";
4960- }
4961- std::cout << "\n";
4962-
4963- const float overN = 1.0f / cx.size();
4964- float error = 0.0f;
4965- for(size_t i = 0; i < cx.size(); i++)
4966- {
4967- float diff = std::max(std::abs(cx[i].real() - cy[i].real() * overN),
4968- std::abs(cx[i].imag() - cy[i].imag() * overN));
4969- if(diff > error)
4970- {
4971- error = diff;
4972- }
4973- }
4974- std::cout << "Maximum error: " << error << "\n";
4975-
4976- hipFree(x);
4977- if(!inplace)
4978- {
4979- hipFree(y);
4980- }
4981- hipFree(fbuffer);
4982- hipFree(bbuffer);
4983-
4984- // Destroy plans
4985- rocfft_plan_destroy(forward);
4986- rocfft_plan_destroy(backward);
4987-
4988- rocfft_cleanup();
4989-}
4990diff --git a/docs/samples/complex_3d.cpp b/docs/samples/complex_3d.cpp
4991deleted file mode 100644
4992index b547d7e..0000000
4993--- a/docs/samples/complex_3d.cpp
4994+++ /dev/null
4995@@ -1,218 +0,0 @@
4996-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
4997-//
4998-// Permission is hereby granted, free of charge, to any person obtaining a copy
4999-// of this software and associated documentation files (the "Software"), to deal
5000-// in the Software without restriction, including without limitation the rights
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches