Merge ~bullwinkle-team/ubuntu/+source/rocfft:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocfft:ubuntu/devel
- Git
- lp:~bullwinkle-team/ubuntu/+source/rocfft
- bullwinkle/llvm-21/ubuntu/devel
- Merge into ubuntu/devel
| Status: | Merged | ||||||||
|---|---|---|---|---|---|---|---|---|---|
| Approved by: | Andreas Hasenack | ||||||||
| Approved revision: | e915243b73be522caf7d38de60e5b0b4fd449e3d | ||||||||
| Merged at revision: | e915243b73be522caf7d38de60e5b0b4fd449e3d | ||||||||
| Proposed branch: | ~bullwinkle-team/ubuntu/+source/rocfft:bullwinkle/llvm-21/ubuntu/devel | ||||||||
| Merge into: | ubuntu/+source/rocfft:ubuntu/devel | ||||||||
| Diff against target: |
28364 lines (+10558/-6672) 153 files modified
.github/CODEOWNERS (+2/-2) CHANGELOG.md (+41/-0) CMakeLists.txt (+29/-48) LICENSE.md (+19/-5) README.md (+17/-10) clients/CMakeLists.txt (+2/-2) clients/bench/CMakeLists.txt (+4/-27) clients/bench/bench.cpp (+36/-140) clients/bench/bench.h (+197/-0) clients/bench/dyna-bench.cpp (+23/-90) clients/samples/fixed-16/CMakeLists.txt (+1/-1) clients/samples/fixed-large/CMakeLists.txt (+1/-1) clients/samples/mpi/CMakeLists.txt (+2/-1) clients/samples/mpi/rocfft_mpi_example.cpp (+13/-8) clients/samples/multi_gpu/CMakeLists.txt (+2/-1) clients/samples/rocfft/CMakeLists.txt (+9/-1) clients/samples/rocfft/rocfft_example_callback.cpp (+8/-1) clients/tests/CMakeLists.txt (+109/-28) clients/tests/accuracy_test.cpp (+1/-1) clients/tests/accuracy_test_3D.cpp (+19/-14) clients/tests/accuracy_test_callback.cpp (+21/-16) clients/tests/accuracy_test_checkstride.cpp (+4/-0) clients/tests/accuracy_tests_range.h (+10/-2) clients/tests/buffer_hash_test.cpp (+22/-1) clients/tests/callback_change_type.cpp (+22/-17) clients/tests/default_callbacks_test.cpp (+51/-0) clients/tests/gtest_main.cpp (+102/-84) clients/tests/hermitian_test.cpp (+19/-0) clients/tests/hipGraph_test.cpp (+9/-2) clients/tests/multi_device_test.cpp (+126/-13) clients/tests/multithread_test.cpp (+37/-0) clients/tests/unit_test.cpp (+184/-15) cmake/sqlite.cmake (+16/-15) debian/changelog (+24/-0) debian/control (+13/-13) debian/gbp.conf (+1/-1) debian/patches/Extend-docs-conf.py-for-offline-build.patch (+2/-4) debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch (+21/-0) debian/patches/series (+1/-1) debian/rules (+6/-1) debian/tests/control (+1/-1) designdocs/codegen.rst (+1/-1) designdocs/design.rst (+5/-5) dev/null (+0/-1019) docs/conf.py (+1/-2) docs/doxygen/Doxyfile (+1/-1) docs/how-to/enabling-logging.rst (+134/-0) docs/how-to/load-store-callbacks.rst (+5/-0) docs/how-to/working-with-rocfft.rst (+3/-3) docs/index.rst (+7/-2) docs/install/building-installing-rocfft.rst (+3/-2) docs/license.md (+2/-0) 
docs/sphinx/_toc.yml.in (+3/-1) docs/sphinx/requirements.in (+1/-1) docs/sphinx/requirements.txt (+135/-4) docs/what-is-rocfft.rst (+1/-1) install.sh (+0/-9) library/solution_map/gfx908_rocfft_solution_map.dat (+0/-12) library/solution_map/gfx90a_rocfft_solution_map.dat (+0/-12) library/src/CMakeLists.txt (+46/-37) library/src/assignment_policy.cpp (+5/-2) library/src/compute_scheme.cpp (+7/-1) library/src/device/CMakeLists.txt (+14/-60) library/src/device/generator.py (+10/-6) library/src/device/generator/CMakeLists.txt (+1/-0) library/src/device/generator/fftgenerator.h (+0/-386) library/src/device/generator/generator.cpp (+0/-31) library/src/device/generator/generator.h (+24/-55) library/src/device/generator/stockham_gen.cpp (+507/-45) library/src/device/generator/stockham_gen.h (+40/-0) library/src/device/generator/stockham_gen_2d.h (+180/-132) library/src/device/generator/stockham_gen_base.h (+83/-85) library/src/device/generator/stockham_gen_cr.h (+26/-18) library/src/device/generator/stockham_gen_rc.h (+4/-6) library/src/device/generator/stockham_gen_rr.h (+18/-185) library/src/device/generator/stockham_pp_gen_cc.h (+1083/-0) library/src/device/generator/stockham_pp_gen_rr.h (+588/-0) library/src/device/kernel-generator-embed.h (+1/-0) library/src/device/kernel-generator.py (+243/-878) library/src/device/kernels/common.h (+5/-75) library/src/device/kernels/configs/config_2d_single.py (+180/-0) library/src/device/kernels/configs/config_lds.py (+2/-30) library/src/device/kernels/configs/config_pp_3d.py (+33/-0) library/src/device/kernels/configs/config_sbcc.py (+94/-0) library/src/device/kernels/configs/config_sbcr.py (+42/-0) library/src/device/kernels/configs/config_sbrc.py (+52/-0) library/src/device/kernels/configs/config_sbrr.py (+508/-0) library/src/device/kernels/device_enum.h (+106/-0) library/src/device/solution-shipping.py (+3/-4) library/src/fuse_shim.cpp (+1/-6) library/src/include/compute_scheme.h (+3/-0) library/src/include/enum_printer.h (+1/-1) 
library/src/include/function_map_key.h (+125/-12) library/src/include/function_pool.h (+242/-86) library/src/include/kernel_launch.h (+0/-1066) library/src/include/node_factory.h (+32/-15) library/src/include/plan.h (+30/-2) library/src/include/rocfft_mpi.h (+59/-2) library/src/include/rtc_kernel.h (+1/-1) library/src/include/rtc_stockham_gen.h (+44/-42) library/src/include/tree_node.h (+330/-69) library/src/include/tree_node_1D.h (+43/-4) library/src/include/tree_node_2D.h (+1/-1) library/src/include/tree_node_3D.h (+22/-7) library/src/include/tree_node_bluestein.h (+5/-4) library/src/include/tree_node_real.h (+2/-6) library/src/node_factory.cpp (+214/-106) library/src/plan.cpp (+919/-230) library/src/powX.cpp (+18/-22) library/src/rocfft_aot_helper.cpp (+30/-28) library/src/rocfft_kernel_config_search.cpp (+80/-61) library/src/rocfft_ostream.cpp (+12/-0) library/src/rtc_bluestein_gen.cpp (+2/-0) library/src/rtc_bluestein_kernel.cpp (+1/-3) library/src/rtc_cache.cpp (+19/-8) library/src/rtc_chirp_gen.cpp (+1/-0) library/src/rtc_kernel.cpp (+14/-14) library/src/rtc_realcomplex_gen.cpp (+3/-0) library/src/rtc_realcomplex_kernel.cpp (+1/-0) library/src/rtc_stockham_gen.cpp (+139/-64) library/src/rtc_stockham_kernel.cpp (+40/-35) library/src/rtc_transpose_gen.cpp (+1/-0) library/src/rtc_transpose_kernel.cpp (+1/-0) library/src/rtc_twiddle_gen.cpp (+2/-1) library/src/transform.cpp (+4/-11) library/src/tree_node.cpp (+252/-119) library/src/tree_node_1D.cpp (+91/-91) library/src/tree_node_2D.cpp (+2/-3) library/src/tree_node_3D.cpp (+189/-146) library/src/tree_node_bluestein.cpp (+19/-11) library/src/tree_node_real.cpp (+29/-25) library/src/twiddles.cpp (+6/-1) rtest.xml (+3/-0) scripts/perf/perflib/bench.py (+7/-4) scripts/perf/perflib/utils.py (+91/-3) scripts/perf/rocfft-perf (+70/-17) scripts/perf/suites.py (+52/-38) scripts/rocfft_mpi_test.py (+140/-0) scripts/rocfftslurmtest.py (+256/-0) scripts/rocslurm/__init__.py (+201/-0) shared/accuracy_test.h (+41/-33) 
shared/device_properties.h (+1/-0) shared/environment.h (+2/-2) shared/fft_params.h (+390/-73) shared/gpubuf.h (+14/-10) shared/hostbuf.h (+62/-34) shared/mpi_worker.h (+408/-298) shared/params_gen.h (+46/-17) shared/printbuffer.h (+62/-19) shared/rocfft_params.h (+161/-111) shared/sys_mem.h (+81/-29) shared/test_params.h (+2/-0) toolchain-windows.cmake (+4/-3) |
||||||||
| Related bugs: |
|
| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Andreas Hasenack | Approve | ||
| Ubuntu Sponsors | Pending | ||
|
Review via email:
|
|||
Commit message
Description of the change
New upstream version 7.1.0
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
| Andreas Hasenack (ahasenack) wrote : | # |
#TBD bug needs updating in d/changelog
| Andreas Hasenack (ahasenack) wrote : | # |
Question in line about bullwinkle.sources
| Andreas Hasenack (ahasenack) wrote : | # |
Just adding a comment because launchpad OOPSed in my previous two, because they included diff comments.
| Talha Can Havadar (tchavadar) wrote : | # |
Hello Andreas, I added these salsa-ci changes just to use salsa with our PPA during development; they can indeed be dropped safely. Bruno, can you drop the relevant commits and rebase?
| Talha Can Havadar (tchavadar) wrote : | # |
Just force-pushed with lease; the salsa-ci related changes should be gone now.
| Talha Can Havadar (tchavadar) wrote : | # |
Fixed the TBD in changelog as well
| Igor Luppi (igorluppi) wrote (last edit ): | # |
Where are the inline comments?
src:hiprand was accepted, this one is finally ready to archive.
| Andreas Hasenack (ahasenack) wrote : | # |
Click on "show diff comments" comment text box.
| Andreas Hasenack (ahasenack) wrote : | # |
And then scroll... Yes, this UI is bad. In future PRs I'll refrain from doing that, and will just copy & paste the diff here in this box when I have a comment.
| Igor Luppi (igorluppi) wrote : | # |
Due to the fixup/rebase, the 'show diff comments' option is not working :/
| Talha Can Havadar (tchavadar) wrote : | # |
I can see 2 inline comments on the first revision in the LP review, and they are addressed.
They were about TBD and salsa-ci.
| Andreas Hasenack (ahasenack) wrote : | # |
Recovering them here:
b) bullwinkle.sources
It's the first time I see something like this. I see the corresponding change in d/salsa-ci.yml, though. Is this a pattern for adding external repositories to salsa ci? Is this the way to do it? I was hoping for something in salsa-ci.yml directly, and not just a reference to a sources file elsewhere in the packaging. Perhaps even an add-apt-repository command.
| Andreas Hasenack (ahasenack) wrote : | # |
a) TBD in changelog
TBD needs updating
That was it, now checking updates.
| Andreas Hasenack (ahasenack) wrote : | # |
--- a/debian/control
+++ b/debian/control
@@ -36,7 +36,7 @@ Rules-Requires-
Package: librocfft0
Section: libs
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
Multi-Arch: same
XB-X-ROCm-
Depends: ${misc:Depends}, ${shlibs:Depends}
@@ -53,7 +53,7 @@ Description: ROCm library for computing Fast Fourier Transforms - library
Package: librocfft-dev
Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
Recommends: libamdhip64-dev
Suggests: librocfft-doc
@@ -70,7 +70,7 @@ Description: ROCm library for computing Fast Fourier Transforms - headers
Package: librocfft0-tests
Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
Build-Profiles: <!nocheck>
Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
Description: ROCm library for computing Fast Fourier Transforms - tests
ppc64el is being dropped, what is the impact?
| Talha Can Havadar (tchavadar) wrote (last edit ): | # |
seems like no rdepends:
```
root@pretty-
+ '[' -z rocfft ']'
+ SOURCE_PKG=rocfft
++ apt-cache showsrc rocfft
++ grep -m1 '^Binary:'
++ sed 's/^Binary: //'
++ tr , '\n'
++ tr -d ' '
+ BINARIES=
+ '[' -z $'librocfft0\
+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft0 ==='
=== reverse-depends for: librocfft0 ===
+ reverse-depends -x -a ppc64el librocfft0
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft0
No reverse dependencies found
+ echo ''
+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft-dev ==='
=== reverse-depends for: librocfft-dev ===
+ reverse-depends -x -a ppc64el librocfft-dev
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft-dev
b'<p>Unknown package</p>'
+ echo ''
+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft0-tests ==='
=== reverse-depends for: librocfft0-tests ===
+ reverse-depends -x -a ppc64el librocfft0-tests
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft0-tests
No reverse dependencies found
+ echo ''
+ for pkg in $BINARIES
+ echo '=== reverse-depends for: librocfft-doc ==='
=== reverse-depends for: librocfft-doc ===
+ reverse-depends -x -a ppc64el librocfft-doc
No reverse dependencies found
+ reverse-depends -a ppc64el -x -b librocfft-doc
No reverse dependencies found
+ echo ''
root@pretty-
b'<p>Unknown package</p>'
root@pretty-
No reverse dependencies found
```
| Andreas Hasenack (ahasenack) wrote : | # |
I see librocfft0 was removed[1] as requested by bug LP: #2134241, together with many other packages. Maybe the reverse dependencies were in that list as well.
But right now, the state is as you described, so +1.
| Andreas Hasenack (ahasenack) wrote : | # |
Sponsored:
Uploading rocfft_
Uploading rocfft_
Uploading rocfft_
Uploading rocfft_
Uploading rocfft_
Preview Diff
| 1 | diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS |
| 2 | old mode 100755 |
| 3 | new mode 100644 |
| 4 | index eff670f..056a93f |
| 5 | --- a/.github/CODEOWNERS |
| 6 | +++ b/.github/CODEOWNERS |
| 7 | @@ -1,8 +1,8 @@ |
| 8 | -* @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts |
| 9 | +* @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd |
| 10 | # Documentation files |
| 11 | docs/ @ROCm/rocm-documentation |
| 12 | *.md @ROCm/rocm-documentation |
| 13 | *.rst @ROCm/rocm-documentation |
| 14 | .readthedocs.yaml @ROCm/rocm-documentation |
| 15 | # Header directory for Doxygen documentation |
| 16 | -library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts |
| 17 | +library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd |
| 18 | diff --git a/.jenkins/application.groovy b/.jenkins/application.groovy |
| 19 | deleted file mode 100644 |
| 20 | index f57030b..0000000 |
| 21 | --- a/.jenkins/application.groovy |
| 22 | +++ /dev/null |
| 23 | @@ -1,182 +0,0 @@ |
| 24 | -#!/usr/bin/env groovy |
| 25 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 26 | -@Library('rocJenkins@pong') _ |
| 27 | - |
| 28 | -// This is file for internal AMD use. |
| 29 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 30 | - |
| 31 | -import com.amd.project.* |
| 32 | -import com.amd.docker.* |
| 33 | -import java.nio.file.Path |
| 34 | - |
| 35 | -def runCI = |
| 36 | -{ |
| 37 | - nodeDetails, jobName-> |
| 38 | - |
| 39 | - def prj = new rocProject('rocFFT-internal', 'application') |
| 40 | - |
| 41 | - prj.defaults.ccache = true |
| 42 | - prj.timeout.compile = 600 |
| 43 | - prj.timeout.test = 600 |
| 44 | - prj.libraryDependencies = ['rocFFT', 'hipFFT'] |
| 45 | - |
| 46 | - // Define test architectures, optional rocm version argument is available |
| 47 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 48 | - |
| 49 | - boolean formatCheck = false |
| 50 | - |
| 51 | - def commonGroovy |
| 52 | - |
| 53 | - def compileCommand = |
| 54 | - { |
| 55 | - platform, project-> |
| 56 | - def getDependenciesCommand = "" |
| 57 | - if (project.installLibraryDependenciesFromCI) |
| 58 | - { |
| 59 | - project.libraryDependencies.each |
| 60 | - { libraryName -> |
| 61 | - getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) |
| 62 | - } |
| 63 | - } |
| 64 | - |
| 65 | - def command = """#!/usr/bin/env bash |
| 66 | - set -ex |
| 67 | - cd ${project.paths.project_build_prefix} |
| 68 | - ${getDependenciesCommand} |
| 69 | - git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git |
| 70 | - cd Gromacs |
| 71 | - |
| 72 | - mkdir build_tmpi |
| 73 | - cd build_tmpi |
| 74 | - cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. |
| 75 | - make |
| 76 | - make install |
| 77 | - cd .. |
| 78 | - |
| 79 | - mkdir build_mpi |
| 80 | - cd build_mpi |
| 81 | - cmake -DCMAKE_HIP_ARCHITECTURES=gfx908 -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. |
| 82 | - make |
| 83 | - make install |
| 84 | - cd .. |
| 85 | - """ |
| 86 | - platform.runCommand(this, command) |
| 87 | - } |
| 88 | - |
| 89 | - def testCommand = |
| 90 | - { |
| 91 | - platform, project-> |
| 92 | - |
| 93 | - def command = """#!/usr/bin/env bash |
| 94 | - set -ex |
| 95 | - cd ${project.paths.project_build_prefix} |
| 96 | - cd Gromacs |
| 97 | - |
| 98 | - source gromacs-install/bin/GMXRC |
| 99 | - gmx --version |
| 100 | - |
| 101 | - export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib |
| 102 | - echo \$LD_LIBRARY_PATH |
| 103 | - |
| 104 | - git clone https://github.com/jychang48/benchmark-gromacs.git |
| 105 | - cd benchmark-gromacs |
| 106 | - |
| 107 | - export GMX_MAXBACKUP=-1 |
| 108 | - |
| 109 | - echo "* Threaded MPI ******************************************************************************************************" |
| 110 | - |
| 111 | - #ADH_DODEC |
| 112 | - cd adh_dodec |
| 113 | - tar zxf adh_dodec.tar.gz |
| 114 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU |
| 115 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs |
| 116 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs |
| 117 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs |
| 118 | - |
| 119 | - # STMV |
| 120 | - cd .. |
| 121 | - cd stmv/ |
| 122 | - tar zxf stmv.tar.gz |
| 123 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200 # 1 GPU |
| 124 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs |
| 125 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400 # 4 GPUs |
| 126 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs |
| 127 | - |
| 128 | - # CELLULOSE_NVE |
| 129 | - cd .. |
| 130 | - cd cellulose_nve/ |
| 131 | - tar zxf cellulose_nve.tar.gz |
| 132 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU |
| 133 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs |
| 134 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs |
| 135 | - gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs |
| 136 | - |
| 137 | - echo "* MPI ***************************************************************************************************************" |
| 138 | - |
| 139 | - # ADH_DODEC |
| 140 | - cd .. |
| 141 | - cd adh_dodec/ |
| 142 | - tar zxf adh_dodec.tar.gz |
| 143 | - mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU |
| 144 | - mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs |
| 145 | - mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs |
| 146 | - mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs |
| 147 | - |
| 148 | - # STMV |
| 149 | - cd .. |
| 150 | - cd stmv/ |
| 151 | - tar zxf stmv.tar.gz |
| 152 | - mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr # 1 GPU |
| 153 | - mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs |
| 154 | - mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs |
| 155 | - mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs |
| 156 | - |
| 157 | - # CELLULOSE_NVE |
| 158 | - cd .. |
| 159 | - cd cellulose_nve/ |
| 160 | - tar zxf cellulose_nve.tar.gz |
| 161 | - mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU |
| 162 | - mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs |
| 163 | - mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs |
| 164 | - mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs |
| 165 | - """ |
| 166 | - platform.runCommand(this, command) |
| 167 | - } |
| 168 | - |
| 169 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) |
| 170 | -} |
| 171 | - |
| 172 | -ci: { |
| 173 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 174 | - |
| 175 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]] |
| 176 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 177 | - |
| 178 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])] |
| 179 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 180 | - |
| 181 | - propertyList.each |
| 182 | - { |
| 183 | - jobName, property-> |
| 184 | - if (urlJobName == jobName) |
| 185 | - properties(auxiliary.addCommonProperties(property)) |
| 186 | - } |
| 187 | - |
| 188 | - jobNameList.each |
| 189 | - { |
| 190 | - jobName, nodeDetails-> |
| 191 | - if (urlJobName == jobName) |
| 192 | - stage(jobName) { |
| 193 | - runCI(nodeDetails, jobName) |
| 194 | - } |
| 195 | - } |
| 196 | - |
| 197 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 198 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 199 | - { |
| 200 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 201 | - stage(urlJobName) { |
| 202 | - runCI([ubuntu18:['8gfx90a']], urlJobName) |
| 203 | - } |
| 204 | - } |
| 205 | -} |
| 206 | diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy |
| 207 | deleted file mode 100644 |
| 208 | index fb1397c..0000000 |
| 209 | --- a/.jenkins/common.groovy |
| 210 | +++ /dev/null |
| 211 | @@ -1,136 +0,0 @@ |
| 212 | -// This file is for internal AMD use. |
| 213 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 214 | - |
| 215 | -def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false, boolean buildMPI=false) |
| 216 | -{ |
| 217 | - project.paths.construct_build_prefix() |
| 218 | - |
| 219 | - def getDependenciesCommand = "" |
| 220 | - if (project.installLibraryDependenciesFromCI) |
| 221 | - { |
| 222 | - project.libraryDependencies.each |
| 223 | - { libraryName -> |
| 224 | - getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) |
| 225 | - } |
| 226 | - } |
| 227 | - |
| 228 | - String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON' |
| 229 | - String warningArgs = '-DWERROR=ON' |
| 230 | - String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON' |
| 231 | - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' |
| 232 | - String buildTypeDir = debug ? 'debug' : 'release' |
| 233 | - String buildMPIArgs = buildMPI ? '-DCMAKE_PREFIX_PATH=/usr/local/openmpi -DROCFFT_MPI_ENABLE=ON' : '' |
| 234 | - String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : '' |
| 235 | - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' |
| 236 | - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library |
| 237 | - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' |
| 238 | - String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db" |
| 239 | - |
| 240 | - def command = """#!/usr/bin/env bash |
| 241 | - set -x |
| 242 | - cd ${project.paths.project_build_prefix} |
| 243 | - ${getDependenciesCommand} |
| 244 | - set -e |
| 245 | - |
| 246 | - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} |
| 247 | - ${auxiliary.gfxTargetParser()} |
| 248 | - ${cmake} ${buildMPIArgs} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../.. |
| 249 | - |
| 250 | - make -j\$(nproc) |
| 251 | - sudo make install |
| 252 | - """ |
| 253 | - platform.runCommand(this, command) |
| 254 | -} |
| 255 | - |
| 256 | - |
| 257 | -def runCompileClientCommand(platform, project, jobName, boolean debug=false) |
| 258 | -{ |
| 259 | - project.paths.construct_build_prefix() |
| 260 | - |
| 261 | - String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON' |
| 262 | - String warningArgs = '-DWERROR=ON' |
| 263 | - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' |
| 264 | - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' |
| 265 | - String buildTypeArgClients = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' |
| 266 | - String cmakePrefixPathArg = "-DCMAKE_PREFIX_PATH=${project.paths.project_build_prefix}" |
| 267 | - |
| 268 | - def command = """#!/usr/bin/env bash |
| 269 | - set -ex |
| 270 | - cd ${project.paths.project_build_prefix}/clients |
| 271 | - mkdir -p build && cd build |
| 272 | - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArgClients} ${clientArgs} ${warningArgs} ${cmakePrefixPathArg} ${amdgpuTargets} ../ |
| 273 | - make -j\$(nproc) |
| 274 | - """ |
| 275 | - platform.runCommand(this, command) |
| 276 | -} |
| 277 | - |
| 278 | -def runTestCommand (platform, project, boolean debug=false, gfilter='', extraArgs='') |
| 279 | -{ |
| 280 | - String testBinaryName = 'rocfft-test' |
| 281 | - String directory = debug ? 'debug' : 'release' |
| 282 | - |
| 283 | - String gfilterArg = '' |
| 284 | - if (gfilter) |
| 285 | - { |
| 286 | - gfilterArg = "--gtest_filter=${gfilter}" |
| 287 | - } |
| 288 | - |
| 289 | - def command = """#!/usr/bin/env bash |
| 290 | - set -ex |
| 291 | - cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging |
| 292 | - ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} --gtest_color=yes --R 80 --nrand 10 ${extraArgs} |
| 293 | - """ |
| 294 | - platform.runCommand(this, command) |
| 295 | -} |
| 296 | - |
| 297 | -def runPackageCommand(platform, project, jobName, boolean debug=false) |
| 298 | -{ |
| 299 | - String directory = debug ? 'debug' : 'release' |
| 300 | - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false) |
| 301 | - platform.runCommand(this, packageHelper[0]) |
| 302 | - platform.archiveArtifacts(this, packageHelper[1]) |
| 303 | - |
| 304 | - //trim temp files |
| 305 | - def command = """#!/usr/bin/env bash |
| 306 | - set -ex |
| 307 | - cd ${project.paths.project_build_prefix}/build/${directory}/ |
| 308 | - rm -rf _CPack_Packages/ |
| 309 | - find -name '*.o' -delete |
| 310 | - """ |
| 311 | - platform.runCommand(this, command) |
| 312 | -} |
| 313 | - |
| 314 | -def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble) |
| 315 | -{ |
| 316 | - project.paths.construct_build_prefix() |
| 317 | - |
| 318 | - // Don't build clients, since we're just testing if the library can build |
| 319 | - String clientArgs = '' |
| 320 | - String warningArgs = '-DWERROR=ON' |
| 321 | - String buildTypeArg = '-DCMAKE_BUILD_TYPE=Release' |
| 322 | - String buildTypeDir = 'release' |
| 323 | - |
| 324 | - String genPatternArgs = "-DGENERATOR_PATTERN=${genPattern}" |
| 325 | - String manualSmallArgs = (genSmall != null) ? "-DGENERATOR_MANUAL_SMALL_SIZE=${genSmall}" : '' |
| 326 | - String manualLargeArgs = (genLarge != null) ? "-DGENERATOR_MANUAL_LARGE_SIZE=${genLarge}" : '' |
| 327 | - String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : '' |
| 328 | - String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}" |
| 329 | - |
| 330 | - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' |
| 331 | - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library |
| 332 | - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' |
| 333 | - String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db" |
| 334 | - |
| 335 | - def command = """#!/usr/bin/env bash |
| 336 | - set -ex |
| 337 | - |
| 338 | - cd ${project.paths.project_build_prefix} |
| 339 | - rm -rf build/${buildTypeDir} |
| 340 | - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} |
| 341 | - ${auxiliary.gfxTargetParser()} |
| 342 | - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../.. |
| 343 | - make -j\$(nproc) |
| 344 | - """ |
| 345 | - platform.runCommand(this, command) |
| 346 | -} |
| 347 | -return this |
| 348 | diff --git a/.jenkins/debug.groovy b/.jenkins/debug.groovy |
| 349 | deleted file mode 100644 |
| 350 | index d0f903c..0000000 |
| 351 | --- a/.jenkins/debug.groovy |
| 352 | +++ /dev/null |
| 353 | @@ -1,89 +0,0 @@ |
| 354 | -#!/usr/bin/env groovy |
| 355 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 356 | -@Library('rocJenkins@pong') _ |
| 357 | - |
| 358 | -// This is file for internal AMD use. |
| 359 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 360 | - |
| 361 | -import com.amd.project.* |
| 362 | -import com.amd.docker.* |
| 363 | -import java.nio.file.Path |
| 364 | - |
| 365 | -def runCI = |
| 366 | -{ |
| 367 | - nodeDetails, jobName-> |
| 368 | - |
| 369 | - def prj = new rocProject('rocFFT-internal', 'Debug') |
| 370 | - |
| 371 | - prj.defaults.ccache = true |
| 372 | - prj.timeout.compile = 600 |
| 373 | - prj.timeout.test = 600 |
| 374 | - prj.libraryDependencies = ['rocRAND','hipRAND'] |
| 375 | - |
| 376 | - // Define test architectures, optional rocm version argument is available |
| 377 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 378 | - |
| 379 | - boolean formatCheck = false |
| 380 | - |
| 381 | - def commonGroovy |
| 382 | - |
| 383 | - def compileCommand = |
| 384 | - { |
| 385 | - platform, project-> |
| 386 | - |
| 387 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 388 | - commonGroovy.runCompileCommand(platform, project, jobName, true) |
| 389 | - } |
| 390 | - |
| 391 | - def testCommand = |
| 392 | - { |
| 393 | - platform, project-> |
| 394 | - |
| 395 | - commonGroovy.runTestCommand(platform, project, true) |
| 396 | - } |
| 397 | - |
| 398 | - def packageCommand = |
| 399 | - { |
| 400 | - platform, project-> |
| 401 | - |
| 402 | - commonGroovy.runPackageCommand(platform, project, jobName, true) |
| 403 | - } |
| 404 | - |
| 405 | - |
| 406 | - buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 407 | -} |
| 408 | - |
| 409 | -ci: { |
| 410 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 411 | - |
| 412 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] |
| 413 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 414 | - |
| 415 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] |
| 416 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 417 | - |
| 418 | - propertyList.each |
| 419 | - { |
| 420 | - jobName, property-> |
| 421 | - if (urlJobName == jobName) |
| 422 | - properties(auxiliary.addCommonProperties(property)) |
| 423 | - } |
| 424 | - |
| 425 | - jobNameList.each |
| 426 | - { |
| 427 | - jobName, nodeDetails-> |
| 428 | - if (urlJobName == jobName) |
| 429 | - stage(jobName) { |
| 430 | - runCI(nodeDetails, jobName) |
| 431 | - } |
| 432 | - } |
| 433 | - |
| 434 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 435 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 436 | - { |
| 437 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 438 | - stage(urlJobName) { |
| 439 | - runCI([ubuntu16:['any']], urlJobName) |
| 440 | - } |
| 441 | - } |
| 442 | -} |
| 443 | diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy |
| 444 | deleted file mode 100644 |
| 445 | index ee508db..0000000 |
| 446 | --- a/.jenkins/extended.groovy |
| 447 | +++ /dev/null |
| 448 | @@ -1,120 +0,0 @@ |
| 449 | -#!/usr/bin/env groovy |
| 450 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 451 | -@Library('rocJenkins@pong') _ |
| 452 | - |
| 453 | -// This is file for internal AMD use. |
| 454 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 455 | - |
| 456 | -import com.amd.project.* |
| 457 | -import com.amd.docker.* |
| 458 | -import java.nio.file.Path |
| 459 | - |
| 460 | -def runBitwiseReproTest (platform, project, boolean debug=false, gfilter='', reprodb='', int repeat=1) |
| 461 | -{ |
| 462 | - String testBinaryName = 'rocfft-test' |
| 463 | - String directory = debug ? 'debug' : 'release' |
| 464 | - |
| 465 | - String gfilterArg = '' |
| 466 | - if (gfilter) |
| 467 | - { |
| 468 | - gfilterArg = "--gtest_filter=${gfilter}" |
| 469 | - } |
| 470 | - |
| 471 | - String reproDbArg = '' |
| 472 | - if (reprodb) |
| 473 | - { |
| 474 | - reproDbArg = "--repro-db=${reprodb}" |
| 475 | - } |
| 476 | - |
| 477 | - String repeatArg = '' |
| 478 | - if (repeat > 1) |
| 479 | - { |
| 480 | - repeatArg = "--gtest_repeat=${repeat}" |
| 481 | - } |
| 482 | - |
| 483 | - def command = """#!/usr/bin/env bash |
| 484 | - set -ex |
| 485 | - cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging |
| 486 | - ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} ${reproDbArg} ${repeatArg} --gtest_color=yes --R 80 --nrand 10 |
| 487 | - """ |
| 488 | - platform.runCommand(this, command) |
| 489 | -} |
| 490 | - |
| 491 | -def runCI = |
| 492 | -{ |
| 493 | - nodeDetails, jobName-> |
| 494 | - |
| 495 | - def prj = new rocProject('rocFFT-internal', 'Extended') |
| 496 | - |
| 497 | - prj.defaults.ccache = true |
| 498 | - prj.timeout.compile = 600 |
| 499 | - prj.timeout.test = 600 |
| 500 | - prj.libraryDependencies = ['rocRAND','hipRAND'] |
| 501 | - |
| 502 | - // Define test architectures, optional rocm version argument is available |
| 503 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 504 | - |
| 505 | - boolean formatCheck = false |
| 506 | - |
| 507 | - def commonGroovy |
| 508 | - |
| 509 | - def compileCommand = |
| 510 | - { |
| 511 | - platform, project-> |
| 512 | - |
| 513 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 514 | - commonGroovy.runCompileCommand(platform, project, jobName) |
| 515 | - commonGroovy.runCompileClientCommand(platform, project, jobName, false) |
| 516 | - } |
| 517 | - |
| 518 | - def testCommand = |
| 519 | - { |
| 520 | - platform, project-> |
| 521 | - |
| 522 | - runBitwiseReproTest(platform, project, false, "*pow2_1D/bitwise_repro_test*", 'bitwise_repro.db', 2) |
| 523 | - } |
| 524 | - |
| 525 | - def packageCommand = |
| 526 | - { |
| 527 | - platform, project-> |
| 528 | - |
| 529 | - commonGroovy.runPackageCommand(platform, project, jobName) |
| 530 | - } |
| 531 | - |
| 532 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 533 | -} |
| 534 | - |
| 535 | -ci: { |
| 536 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 537 | - |
| 538 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] |
| 539 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 540 | - |
| 541 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] |
| 542 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 543 | - |
| 544 | - propertyList.each |
| 545 | - { |
| 546 | - jobName, property-> |
| 547 | - if (urlJobName == jobName) |
| 548 | - properties(auxiliary.addCommonProperties(property)) |
| 549 | - } |
| 550 | - |
| 551 | - jobNameList.each |
| 552 | - { |
| 553 | - jobName, nodeDetails-> |
| 554 | - if (urlJobName == jobName) |
| 555 | - stage(jobName) { |
| 556 | - runCI(nodeDetails, jobName) |
| 557 | - } |
| 558 | - } |
| 559 | - |
| 560 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 561 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 562 | - { |
| 563 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 564 | - stage(urlJobName) { |
| 565 | - runCI([ubuntu18:['gfx906']], urlJobName) |
| 566 | - } |
| 567 | - } |
| 568 | -} |
| 569 | diff --git a/.jenkins/multigpu.groovy b/.jenkins/multigpu.groovy |
| 570 | deleted file mode 100644 |
| 571 | index 68e625f..0000000 |
| 572 | --- a/.jenkins/multigpu.groovy |
| 573 | +++ /dev/null |
| 574 | @@ -1,93 +0,0 @@ |
| 575 | -#!/usr/bin/env groovy |
| 576 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 577 | -@Library('rocJenkins@pong') _ |
| 578 | - |
| 579 | -// This is file for internal AMD use. |
| 580 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 581 | - |
| 582 | -import com.amd.project.* |
| 583 | -import com.amd.docker.* |
| 584 | -import java.nio.file.Path |
| 585 | - |
| 586 | -def runCI = |
| 587 | -{ |
| 588 | - nodeDetails, jobName-> |
| 589 | - |
| 590 | - def prj = new rocProject('rocFFT-internal', 'multigpu') |
| 591 | - |
| 592 | - prj.defaults.ccache = true |
| 593 | - prj.timeout.compile = 600 |
| 594 | - prj.timeout.test = 600 |
| 595 | - prj.libraryDependencies = ['rocRAND','hipRAND'] |
| 596 | - |
| 597 | - // Define test architectures, optional rocm version argument is available |
| 598 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 599 | - |
| 600 | - boolean formatCheck = false |
| 601 | - |
| 602 | - def commonGroovy |
| 603 | - |
| 604 | - def compileCommand = |
| 605 | - { |
| 606 | - platform, project-> |
| 607 | - |
| 608 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 609 | - // build with MPI enabled |
| 610 | - commonGroovy.runCompileCommand(platform, project, jobName, false, false, true) |
| 611 | - commonGroovy.runCompileClientCommand(platform, project, jobName, false) |
| 612 | - } |
| 613 | - |
| 614 | - def testCommand = |
| 615 | - { |
| 616 | - platform, project-> |
| 617 | - |
| 618 | - //run single-process multi-GPU tests |
| 619 | - commonGroovy.runTestCommand(platform, project, false, "*multi_gpu*") |
| 620 | - // run MPI tests across 4 ranks |
| 621 | - commonGroovy.runTestCommand(platform, project, false, "*multi_gpu*", '--mp_lib mpi --mp_ranks 4 --mp_launch "/usr/local/openmpi/bin/mpirun --np 4 ./rocfft_mpi_worker"') |
| 622 | - } |
| 623 | - |
| 624 | - def packageCommand = |
| 625 | - { |
| 626 | - platform, project-> |
| 627 | - |
| 628 | - // don't package anything - we're not distributing MPI-enabled rocFFT so we don't want to expose any MPI-enabled packages anywhere that other builds can mistakenly pick up |
| 629 | - } |
| 630 | - |
| 631 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 632 | -} |
| 633 | - |
| 634 | -ci: { |
| 635 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 636 | - |
| 637 | - def propertyList = ["main":[pipelineTriggers([cron('0 1 * * 0')])]] |
| 638 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 639 | - |
| 640 | - def jobNameList = ["main":([ubuntu20:['8gfx90a']])] |
| 641 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 642 | - |
| 643 | - propertyList.each |
| 644 | - { |
| 645 | - jobName, property-> |
| 646 | - if (urlJobName == jobName) |
| 647 | - properties(auxiliary.addCommonProperties(property)) |
| 648 | - } |
| 649 | - |
| 650 | - jobNameList.each |
| 651 | - { |
| 652 | - jobName, nodeDetails-> |
| 653 | - if (urlJobName == jobName) |
| 654 | - stage(jobName) { |
| 655 | - runCI(nodeDetails, jobName) |
| 656 | - } |
| 657 | - } |
| 658 | - |
| 659 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 660 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 661 | - { |
| 662 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 663 | - stage(urlJobName) { |
| 664 | - runCI([ubuntu20:['8gfx90a']], urlJobName) |
| 665 | - } |
| 666 | - } |
| 667 | -} |
| 668 | diff --git a/.jenkins/performance.groovy b/.jenkins/performance.groovy |
| 669 | deleted file mode 100644 |
| 670 | index d488f59..0000000 |
| 671 | --- a/.jenkins/performance.groovy |
| 672 | +++ /dev/null |
| 673 | @@ -1,231 +0,0 @@ |
| 674 | -#!/usr/bin/env groovy |
| 675 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 676 | -@Library('rocJenkins@pong') _ |
| 677 | - |
| 678 | -// This is file for internal AMD use. |
| 679 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 680 | - |
| 681 | -import com.amd.project.* |
| 682 | -import com.amd.docker.* |
| 683 | -import java.nio.file.Path |
| 684 | - |
| 685 | -def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false) |
| 686 | -{ |
| 687 | - def reference = (env.BRANCH_NAME ==~ /PR-\d+/) ? 'develop' : 'master' |
| 688 | - |
| 689 | - project.paths.construct_build_prefix() |
| 690 | - |
| 691 | - def getDependenciesCommand = "" |
| 692 | - if (project.installLibraryDependenciesFromCI) |
| 693 | - { |
| 694 | - project.libraryDependencies.each |
| 695 | - { libraryName -> |
| 696 | - getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) |
| 697 | - } |
| 698 | - } |
| 699 | - |
| 700 | - dir("${project.paths.project_build_prefix}/ref-repo") { |
| 701 | - git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git' |
| 702 | - } |
| 703 | - |
| 704 | - String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON' |
| 705 | - String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_BENCH=OFF' |
| 706 | - String warningArgs = '-DWERROR=ON' |
| 707 | - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' |
| 708 | - String buildTypeDir = debug ? 'debug' : 'release' |
| 709 | - String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db" |
| 710 | - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' |
| 711 | - |
| 712 | - def command = """#!/usr/bin/env bash |
| 713 | - set -x |
| 714 | - cd ${project.paths.project_build_prefix} |
| 715 | - ${getDependenciesCommand} |
| 716 | - set -e |
| 717 | - mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir} |
| 718 | - ${auxiliary.gfxTargetParser()} |
| 719 | - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg} ${clientArgs} ${warningArgs} ${rtcBuildCache} ../.. |
| 720 | - make -j\$(nproc) |
| 721 | - popd |
| 722 | - cd ref-repo |
| 723 | - mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir} |
| 724 | - ${auxiliary.gfxTargetParser()} |
| 725 | - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg} ${noclientArgs} ${warningArgs} ${rtcBuildCache} ../.. |
| 726 | - make -j\$(nproc) |
| 727 | - """ |
| 728 | - platform.runCommand(this, command) |
| 729 | -} |
| 730 | - |
| 731 | -def runTestCommand (platform, project, boolean debug=false) |
| 732 | -{ |
| 733 | - String sudo = auxiliary.sudo(platform.jenkinsLabel) |
| 734 | - String directory = debug ? 'debug' : 'release' |
| 735 | - |
| 736 | - def dataTypes = ['single', 'double'] |
| 737 | - for (def dataType in dataTypes) |
| 738 | - { |
| 739 | - def command = """#!/usr/bin/env bash |
| 740 | - set -ex |
| 741 | - pwd |
| 742 | - cd ${project.paths.project_build_prefix} |
| 743 | - export ROCFFT_RTC_CACHE_PATH="\$JENKINS_HOME_LOCAL/rocfft_build_cache.db" |
| 744 | - ./scripts/perf/rocfft-perf run --bench ./build/${directory}/clients/staging/dyna-rocfft-bench --lib ./ref-repo/build/${directory}/library/src/librocfft.so --lib ./build/${directory}/library/src/librocfft.so --out ./${dataType}_ref --out ./${dataType}_change --device 0 --precision ${dataType} --suite benchmarks |
| 745 | - ls ${dataType}_change |
| 746 | - ls ${dataType}_ref |
| 747 | - mkdir ${dataType}_results |
| 748 | - ./scripts/perf/rocfft-perf post ./${dataType}_results ./${dataType}_ref ./${dataType}_change |
| 749 | - ls ${dataType}_change/*.mdat |
| 750 | - ./scripts/perf/rocfft-perf html ./${dataType}_results ./${dataType}_ref ./${dataType}_change |
| 751 | - mv ${dataType}_results/figs.html ${dataType}_results/figs_${platform.gpu}.html |
| 752 | - """ |
| 753 | - platform.runCommand(this, command) |
| 754 | - |
| 755 | - archiveArtifacts "${project.paths.project_build_prefix}/${dataType}_results/*.html" |
| 756 | - publishHTML([allowMissing: false, |
| 757 | - alwaysLinkToLastBuild: false, |
| 758 | - keepAll: false, |
| 759 | - reportDir: "${project.paths.project_build_prefix}/${dataType}_results", |
| 760 | - reportFiles: "figs_${platform.gpu}.html", |
| 761 | - reportName: "${dataType}-precision-${platform.gpu}", |
| 762 | - reportTitles: "${dataType}-precision-${platform.gpu}"]) |
| 763 | - } |
| 764 | - |
| 765 | - |
| 766 | - withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')]) |
| 767 | - { |
| 768 | - platform.runCommand( |
| 769 | - this, |
| 770 | - """ |
| 771 | - cd ${project.paths.build_prefix} |
| 772 | - git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.2.0 |
| 773 | - cd rocPTS |
| 774 | - python3 -m pip install build |
| 775 | - python3 -m build |
| 776 | - python3 -m pip install . |
| 777 | - """ |
| 778 | - ) |
| 779 | - } |
| 780 | - writeFile( |
| 781 | - file: project.paths.project_build_prefix + "/record_pts.py", |
| 782 | - text: libraryResource("com/amd/scripts/record_pts.py")) |
| 783 | - def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : "" |
| 784 | - def command = """#!/usr/bin/env bash |
| 785 | - set -ex |
| 786 | - cd ${project.paths.project_build_prefix} |
| 787 | - ${setupBranch} |
| 788 | - git checkout \$BRANCH_NAME |
| 789 | - benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d) |
| 790 | - mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref |
| 791 | - cp -uf ./*_change/* \${benchmark_folder}/all_change |
| 792 | - cp -uf ./*_ref/* \${benchmark_folder}/all_ref |
| 793 | - python3 ./record_pts.py \ |
| 794 | - --dataset-path \$PWD/\${benchmark_folder} \ |
| 795 | - --reference-dataset all_ref \ |
| 796 | - --new-dataset all_change \ |
| 797 | - --new-build . \ |
| 798 | - --reference-build ./ref-repo\ |
| 799 | - -v 5.5 \ |
| 800 | - -l pts_rocfft_benchmark_data-v1.0.0 |
| 801 | - """ |
| 802 | - withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')]) |
| 803 | - { |
| 804 | - platform.runCommand(this, command) |
| 805 | - } |
| 806 | -} |
| 807 | - |
| 808 | -def runCI = |
| 809 | -{ |
| 810 | - nodeDetails, jobName-> |
| 811 | - |
| 812 | - def prj = new rocProject('rocFFT-internal', 'Performance') |
| 813 | - |
| 814 | - prj.defaults.ccache = true |
| 815 | - prj.timeout.compile = 600 |
| 816 | - prj.timeout.test = 600 |
| 817 | - prj.libraryDependencies = ['rocRAND','hipRAND'] |
| 818 | - |
| 819 | - // Define test architectures, optional rocm version argument is available |
| 820 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 821 | - |
| 822 | - boolean formatCheck = false |
| 823 | - |
| 824 | - def commonGroovy |
| 825 | - def gpus = [] |
| 826 | - def dataTypes = ['single', 'double'] |
| 827 | - |
| 828 | - def compileCommand = |
| 829 | - { |
| 830 | - platform, project-> |
| 831 | - |
| 832 | - gpus.add(platform.gpu) |
| 833 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 834 | - runCompileCommand(platform, project, jobName) |
| 835 | - } |
| 836 | - |
| 837 | - def testCommand = |
| 838 | - { |
| 839 | - platform, project-> |
| 840 | - |
| 841 | - runTestCommand(platform, project) |
| 842 | - } |
| 843 | - |
| 844 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) |
| 845 | - def commentString = "Performance reports: \n" + "Commit hashes: \n" |
| 846 | - for (parentHash in prj.gitParentHashes) { |
| 847 | - commentString += "${parentHash} \n" |
| 848 | - } |
| 849 | - for (gpu in gpus) { |
| 850 | - for (dataType in dataTypes) { |
| 851 | - commentString += "[${gpu} ${dataType} report](${JOB_URL}/${dataType}-precision-${gpu})\n" |
| 852 | - } |
| 853 | - } |
| 854 | - |
| 855 | - if (env.BRANCH_NAME ==~ /PR-\d+/) |
| 856 | - { |
| 857 | - boolean commentExists = false |
| 858 | - for (prComment in pullRequest.comments) { |
| 859 | - if (prComment.body.contains("Performance reports:")) |
| 860 | - { |
| 861 | - commentExists = true |
| 862 | - prComment.body = commentString |
| 863 | - } |
| 864 | - } |
| 865 | - if (!commentExists) { |
| 866 | - def comment = pullRequest.comment(commentString) |
| 867 | - } |
| 868 | - } |
| 869 | -} |
| 870 | - |
| 871 | -ci: { |
| 872 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 873 | - |
| 874 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] |
| 875 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 876 | - |
| 877 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900','gfx906']])] |
| 878 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 879 | - |
| 880 | - propertyList.each |
| 881 | - { |
| 882 | - jobName, property-> |
| 883 | - if (urlJobName == jobName) |
| 884 | - properties(auxiliary.addCommonProperties(property)) |
| 885 | - } |
| 886 | - |
| 887 | - jobNameList.each |
| 888 | - { |
| 889 | - jobName, nodeDetails-> |
| 890 | - if (urlJobName == jobName) |
| 891 | - stage(jobName) { |
| 892 | - runCI(nodeDetails, jobName) |
| 893 | - } |
| 894 | - } |
| 895 | - |
| 896 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 897 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 898 | - { |
| 899 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 900 | - stage(urlJobName) { |
| 901 | - runCI([ubuntu18:['gfx906']], urlJobName) |
| 902 | - } |
| 903 | - } |
| 904 | -} |
| 905 | diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy |
| 906 | deleted file mode 100644 |
| 907 | index b516520..0000000 |
| 908 | --- a/.jenkins/staticanalysis.groovy |
| 909 | +++ /dev/null |
| 910 | @@ -1,110 +0,0 @@ |
| 911 | -#!/usr/bin/env groovy |
| 912 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 913 | -@Library('rocJenkins@pong') _ |
| 914 | - |
| 915 | -// This is file for internal AMD use. |
| 916 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 917 | - |
| 918 | -import com.amd.project.* |
| 919 | -import com.amd.docker.* |
| 920 | -import java.nio.file.Path |
| 921 | - |
| 922 | -def runCompileCommand(platform, project, jobName, boolean debug=false) |
| 923 | -{ |
| 924 | - project.paths.construct_build_prefix() |
| 925 | - |
| 926 | - def yapfCommand = """#!/usr/bin/env bash |
| 927 | - set -x |
| 928 | - cd ${project.paths.project_build_prefix} |
| 929 | - yapf --version |
| 930 | - find . -iname '*.py' \ |
| 931 | - | grep -v 'build/' \ |
| 932 | - | xargs -n 1 -P 1 -I{} -t sh -c 'yapf --style pep8 {} | diff - {}' |
| 933 | - """ |
| 934 | - |
| 935 | - platform.runCommand(this, yapfCommand) |
| 936 | -} |
| 937 | - |
| 938 | -def runCI = |
| 939 | -{ |
| 940 | - nodeDetails, jobName-> |
| 941 | - |
| 942 | - def prj = new rocProject('rocFFT-internal', 'StaticAnalysis') |
| 943 | - prj.libraryDependencies = ['rocRAND','hipRAND'] |
| 944 | - |
| 945 | - // Define test architectures, optional rocm version argument is available |
| 946 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 947 | - |
| 948 | - boolean formatCheck = true |
| 949 | - boolean staticAnalysis = true |
| 950 | - |
| 951 | - def compileCommand = |
| 952 | - { |
| 953 | - platform, project-> |
| 954 | - |
| 955 | - runCompileCommand(platform, project, jobName, false) |
| 956 | - } |
| 957 | - |
| 958 | - buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) |
| 959 | - |
| 960 | - |
| 961 | - def kernelSubsetPrj = new rocProject('rocFFT-internal', 'BuildKernelSubset') |
| 962 | - |
| 963 | - def nodesForPrj2 = new dockerNodes(nodeDetails, jobName, kernelSubsetPrj) |
| 964 | - |
| 965 | - def commonGroovy |
| 966 | - |
| 967 | - def compileSubsetCommand = |
| 968 | - { |
| 969 | - platform, project-> |
| 970 | - |
| 971 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 972 | - |
| 973 | - // build pattern pow2,pow7 no manual small and large, dp only |
| 974 | - commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'pow2,pow7', null, null, true) |
| 975 | - |
| 976 | - // build large sizes, dp only |
| 977 | - commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'large', null, null, true) |
| 978 | - |
| 979 | - // build 2D sizes, dp only |
| 980 | - commonGroovy.runSubsetBuildCommand(platform, project, jobName, '2D', null, null, true) |
| 981 | - |
| 982 | - // put an extra unsupported size(10) in manual large to see if it will be filtered correctly |
| 983 | - commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', null, '10,50,100,200,336', true) |
| 984 | - |
| 985 | - // put an extra unsupported size(23) in manual small to see if it will be filtered correctly |
| 986 | - commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23,1024', '10,50,100,200,336', true) |
| 987 | - |
| 988 | - // all the manual sizes are not supported |
| 989 | - //commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23', '10', true) |
| 990 | - } |
| 991 | - |
| 992 | - buildProject(kernelSubsetPrj , formatCheck, nodesForPrj2.dockerArray, compileSubsetCommand, null, null) |
| 993 | -} |
| 994 | - |
| 995 | -ci: { |
| 996 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 997 | - |
| 998 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], |
| 999 | - "rocm-docker":[]] |
| 1000 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 1001 | - |
| 1002 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]] |
| 1003 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 1004 | - |
| 1005 | - propertyList.each |
| 1006 | - { |
| 1007 | - jobName, property-> |
| 1008 | - if (urlJobName == jobName) |
| 1009 | - properties(auxiliary.addCommonProperties(property)) |
| 1010 | - } |
| 1011 | - |
| 1012 | - jobNameList.each |
| 1013 | - { |
| 1014 | - jobName, nodeDetails-> |
| 1015 | - if (urlJobName == jobName) |
| 1016 | - stage(jobName) { |
| 1017 | - runCI(nodeDetails, jobName) |
| 1018 | - } |
| 1019 | - } |
| 1020 | -} |
| 1021 | diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy |
| 1022 | deleted file mode 100644 |
| 1023 | index dcc8dd5..0000000 |
| 1024 | --- a/.jenkins/staticlibrary.groovy |
| 1025 | +++ /dev/null |
| 1026 | @@ -1,88 +0,0 @@ |
| 1027 | -#!/usr/bin/env groovy |
| 1028 | -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ |
| 1029 | -@Library('rocJenkins@pong') _ |
| 1030 | - |
| 1031 | -// This is file for internal AMD use. |
| 1032 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 1033 | - |
| 1034 | -import com.amd.project.* |
| 1035 | -import com.amd.docker.* |
| 1036 | -import java.nio.file.Path |
| 1037 | - |
| 1038 | -def runCI = |
| 1039 | -{ |
| 1040 | - nodeDetails, jobName-> |
| 1041 | - |
| 1042 | - def prj = new rocProject('rocFFT-internal', 'StaticLibrary') |
| 1043 | - |
| 1044 | - prj.defaults.ccache = true |
| 1045 | - prj.timeout.compile = 600 |
| 1046 | - prj.timeout.test = 600 |
| 1047 | - prj.libraryDependencies = ['rocRAND','hipRAND'] |
| 1048 | - |
| 1049 | - // Define test architectures, optional rocm version argument is available |
| 1050 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 1051 | - |
| 1052 | - boolean formatCheck = false |
| 1053 | - |
| 1054 | - def commonGroovy |
| 1055 | - |
| 1056 | - def compileCommand = |
| 1057 | - { |
| 1058 | - platform, project-> |
| 1059 | - |
| 1060 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 1061 | - commonGroovy.runCompileCommand(platform, project, jobName, false, true) |
| 1062 | - } |
| 1063 | - |
| 1064 | - def testCommand = |
| 1065 | - { |
| 1066 | - platform, project-> |
| 1067 | - |
| 1068 | - commonGroovy.runTestCommand(platform, project) |
| 1069 | - } |
| 1070 | - |
| 1071 | - def packageCommand = |
| 1072 | - { |
| 1073 | - platform, project-> |
| 1074 | - |
| 1075 | - commonGroovy.runPackageCommand(platform, project, jobName) |
| 1076 | - } |
| 1077 | - |
| 1078 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 1079 | -} |
| 1080 | - |
| 1081 | -ci: { |
| 1082 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 1083 | - |
| 1084 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] |
| 1085 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 1086 | - |
| 1087 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900']])] |
| 1088 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 1089 | - |
| 1090 | - propertyList.each |
| 1091 | - { |
| 1092 | - jobName, property-> |
| 1093 | - if (urlJobName == jobName) |
| 1094 | - properties(auxiliary.addCommonProperties(property)) |
| 1095 | - } |
| 1096 | - |
| 1097 | - jobNameList.each |
| 1098 | - { |
| 1099 | - jobName, nodeDetails-> |
| 1100 | - if (urlJobName == jobName) |
| 1101 | - stage(jobName) { |
| 1102 | - runCI(nodeDetails, jobName) |
| 1103 | - } |
| 1104 | - } |
| 1105 | - |
| 1106 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 1107 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 1108 | - { |
| 1109 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 1110 | - stage(urlJobName) { |
| 1111 | - runCI([ubuntu16:['gfx906']], urlJobName) |
| 1112 | - } |
| 1113 | - } |
| 1114 | -} |
| 1115 | diff --git a/CHANGELOG.md b/CHANGELOG.md |
| 1116 | index 5c4bc51..659b4c7 100644 |
| 1117 | --- a/CHANGELOG.md |
| 1118 | +++ b/CHANGELOG.md |
| 1119 | @@ -3,6 +3,47 @@ |
| 1120 | Documentation for rocFFT is available at |
| 1121 | [https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/). |
| 1122 | |
| 1123 | +## rocFFT 1.0.35 for ROCm 7.1.0 |
| 1124 | + |
| 1125 | +### Optimized |
| 1126 | + |
| 1127 | +* Implemented single-kernel plans for some 2D problem sizes, on devices with at least 160KiB of LDS. |
| 1128 | +* Improved performance of unit-strided, complex-interleaved, forward/inverse FFTs for lengths: |
| 1129 | + - (64,64,128) |
| 1130 | + - (64,64,52) |
| 1131 | + - (60,60,60) |
| 1132 | + - (32,32,128) |
| 1133 | + - (32,32,64) |
| 1134 | + - (64,32,128) |
| 1135 | +* Improved performance of 3D MPI pencil decompositions by using sub-communicators for global transpose operations. |
| 1136 | + |
| 1137 | +## rocFFT 1.0.34 for ROCm 7.0.0 |
| 1138 | + |
| 1139 | +### Added |
| 1140 | + |
| 1141 | +* Added gfx950 support. |
| 1142 | + |
| 1143 | +### Removed |
| 1144 | + |
| 1145 | +* Removed rocfft-rider legacy compatibility from clients. |
| 1146 | +* Removed support for the gfx940 and gfx941 targets from the client programs. |
| 1147 | + |
| 1148 | +### Optimized |
| 1149 | + |
| 1150 | +* Removed unnecessary HIP event/stream allocation and synchronization during MPI transforms. |
| 1151 | +* Implemented single-precision 1D kernels for lengths: |
| 1152 | + - 4704 |
| 1153 | + - 5488 |
| 1154 | + - 6144 |
| 1155 | + - 6561 |
| 1156 | + - 8192 |
| 1157 | +* Implemented single-kernel plans for some large 1D problem sizes, on devices with at least 160KiB of LDS. |
| 1158 | + |
| 1159 | +### Resolved issues |
| 1160 | + |
| 1161 | +* Fixed kernel faults on multi-device transforms that gather to a single device, when the input/output bricks are not |
| 1162 | + contiguous. |
| 1163 | + |
| 1164 | ## rocFFT 1.0.32 for ROCm 6.4.0 |
| 1165 | |
| 1166 | ### Changed |
| 1167 | diff --git a/CMakeLists.txt b/CMakeLists.txt |
| 1168 | index 75e5c36..c724aff 100644 |
| 1169 | --- a/CMakeLists.txt |
| 1170 | +++ b/CMakeLists.txt |
| 1171 | @@ -1,5 +1,5 @@ |
| 1172 | # ############################################################################# |
| 1173 | -# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. |
| 1174 | +# Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1175 | # |
| 1176 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1177 | # of this software and associated documentation files (the "Software"), to deal |
| 1178 | @@ -52,34 +52,24 @@ project( rocfft LANGUAGES CXX C ) |
| 1179 | # This finds the rocm-cmake project, and installs it if not found |
| 1180 | # rocm-cmake contains common cmake code for rocm projects to help setup and install |
| 1181 | set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern ) |
| 1182 | -find_package( ROCM 0.7.3 CONFIG QUIET PATHS ${ROCM_PATH} /opt/rocm ) |
| 1183 | -if( NOT ROCM_FOUND ) |
| 1184 | - set( rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download" ) |
| 1185 | - file( DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip |
| 1186 | - ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log) |
| 1187 | - |
| 1188 | - list(GET status 0 status_code) |
| 1189 | - list(GET status 1 status_string) |
| 1190 | - |
| 1191 | - if(NOT status_code EQUAL 0) |
| 1192 | - message(FATAL_ERROR "error: downloading |
| 1193 | - 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed |
| 1194 | - status_code: ${status_code} |
| 1195 | - status_string: ${status_string} |
| 1196 | - log: ${log} |
| 1197 | - ") |
| 1198 | - endif() |
| 1199 | +find_package( ROCmCMakeBuildTools PATHS ${ROCM_PATH} /opt/rocm ) |
| 1200 | +if( NOT ROCmCMakeBuildTools_FOUND ) |
| 1201 | + include( FetchContent ) |
| 1202 | + |
| 1203 | + FetchContent_Declare( rocm_cmake_local |
| 1204 | + GIT_REPOSITORY https://github.com/ROCm/rocm-cmake |
| 1205 | + GIT_TAG rocm-6.4.1 |
| 1206 | + GIT_SHALLOW ON |
| 1207 | + ) |
| 1208 | |
| 1209 | - message(STATUS "downloading... done") |
| 1210 | + FetchContent_MakeAvailable( rocm_cmake_local ) |
| 1211 | |
| 1212 | - execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzvf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip |
| 1213 | - WORKING_DIRECTORY ${PROJECT_EXTERN_DIR} ) |
| 1214 | execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake . |
| 1215 | - WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} ) |
| 1216 | - execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install |
| 1217 | - WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) |
| 1218 | + WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} ) |
| 1219 | + execute_process( COMMAND ${CMAKE_COMMAND} --build ${rocm_cmake_local_SOURCE_DIR} --target install |
| 1220 | + WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} ) |
| 1221 | |
| 1222 | - find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake ) |
| 1223 | + find_package( ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake ) |
| 1224 | endif( ) |
| 1225 | include( ROCMSetupVersion ) |
| 1226 | include( ROCMCreatePackage ) |
| 1227 | @@ -90,8 +80,12 @@ include( ROCMCheckTargetIds ) |
| 1228 | include( ROCMClients ) |
| 1229 | include( ROCMHeaderWrapper ) |
| 1230 | |
| 1231 | +if( ROCM_PATH ) |
| 1232 | + list( APPEND CMAKE_BUILD_RPATH ${ROCM_PATH}/lib ) |
| 1233 | +endif() |
| 1234 | + |
| 1235 | # Using standardized versioning from rocm-cmake |
| 1236 | -set ( VERSION_STRING "1.0.32" ) |
| 1237 | +set ( VERSION_STRING "1.0.35" ) |
| 1238 | rocm_setup_version( VERSION ${VERSION_STRING} ) |
| 1239 | |
| 1240 | # Append our library helper cmake path and the cmake path for hip (for |
| 1241 | @@ -119,25 +113,10 @@ option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_of |
| 1242 | # Provide ability to disable hipRAND dependency |
| 1243 | option(USE_HIPRAND "Use hipRAND to provide device-side input generation" ON) |
| 1244 | |
| 1245 | -if( USE_HIPRAND ) |
| 1246 | - add_compile_definitions(USE_HIPRAND) |
| 1247 | -endif( ) |
| 1248 | - |
| 1249 | # Split up function pool compilation across N files to parallelize its build |
| 1250 | set(ROCFFT_FUNCTION_POOL_N 8 CACHE STRING "Number of files to split function_pool into for compilation") |
| 1251 | |
| 1252 | -# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG |
| 1253 | -option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF) |
| 1254 | -if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) |
| 1255 | - rocm_wrap_header_dir( |
| 1256 | - ${CMAKE_SOURCE_DIR}/library/include |
| 1257 | - PATTERNS "*.h" |
| 1258 | - GUARDS SYMLINK WRAPPER |
| 1259 | - WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR} |
| 1260 | - ) |
| 1261 | -endif() |
| 1262 | - |
| 1263 | -set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wsign-compare ) |
| 1264 | +set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wsign-compare -Wno-deprecated-declarations ) |
| 1265 | if( WERROR ) |
| 1266 | set( WARNING_FLAGS ${WARNING_FLAGS} -Werror ) |
| 1267 | endif( ) |
| 1268 | @@ -148,9 +127,8 @@ set(DEFAULT_GPUS |
| 1269 | gfx906 |
| 1270 | gfx908 |
| 1271 | gfx90a |
| 1272 | - gfx940 |
| 1273 | - gfx941 |
| 1274 | gfx942 |
| 1275 | + gfx950 |
| 1276 | gfx1030 |
| 1277 | gfx1100 |
| 1278 | gfx1101 |
| 1279 | @@ -166,8 +144,6 @@ if(BUILD_ADDRESS_SANITIZER) |
| 1280 | SET(DEFAULT_GPUS |
| 1281 | gfx908:xnack+ |
| 1282 | gfx90a:xnack+ |
| 1283 | - gfx940:xnack+ |
| 1284 | - gfx941:xnack+ |
| 1285 | gfx942:xnack+) |
| 1286 | add_link_options(-fuse-ld=lld) |
| 1287 | set(ROCFFT_KERNEL_CACHE_ENABLE off) |
| 1288 | @@ -193,7 +169,8 @@ rocm_check_target_ids(AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}") |
| 1289 | set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for") |
| 1290 | |
| 1291 | # HIP is required - library and clients use HIP to access the device |
| 1292 | -find_package( HIP REQUIRED CONFIG ) |
| 1293 | +find_package( hip REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hip/ ) |
| 1294 | +find_package( hiprtc REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hiprtc/ ) |
| 1295 | |
| 1296 | # The nvidia backend can be used to compile for CUDA devices. |
| 1297 | # Specify the CUDA prefix in the CUDA_PREFIX variable. |
| 1298 | @@ -287,7 +264,9 @@ endif( ) |
| 1299 | if(WIN32) |
| 1300 | set(CPACK_SOURCE_GENERATOR "ZIP") |
| 1301 | set(CPACK_GENERATOR "ZIP") |
| 1302 | - set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) |
| 1303 | + if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) |
| 1304 | + set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) |
| 1305 | + endif() |
| 1306 | set(INSTALL_PREFIX "C:/hipSDK") |
| 1307 | set(CPACK_SET_DESTDIR OFF) |
| 1308 | set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK") |
| 1309 | @@ -321,3 +300,5 @@ rocm_create_package( |
| 1310 | LDCONFIG |
| 1311 | LDCONFIG_DIR ${ROCFFT_CONFIG_DIR} |
| 1312 | ) |
| 1313 | + |
| 1314 | +option(BUILD_CODE_COVERAGE "Build with code coverage flags (clang only)" OFF) |
| 1315 | diff --git a/LICENSE.md b/LICENSE.md |
| 1316 | index 4c2aab7..d631884 100644 |
| 1317 | --- a/LICENSE.md |
| 1318 | +++ b/LICENSE.md |
| 1319 | @@ -1,12 +1,26 @@ |
| 1320 | -# License |
| 1321 | +MIT License |
| 1322 | |
| 1323 | -Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1324 | +Copyright (C) Advanced Micro Devices, Inc. |
| 1325 | |
| 1326 | -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: |
| 1327 | +Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1328 | +of this software and associated documentation files (the "Software"), to deal |
| 1329 | +in the Software without restriction, including without limitation the rights |
| 1330 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 1331 | +copies of the Software, and to permit persons to whom the Software is |
| 1332 | +furnished to do so, subject to the following conditions: |
| 1333 | |
| 1334 | -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 1335 | +The above copyright notice and this permission notice shall be included in all |
| 1336 | +copies or substantial portions of the Software. |
| 1337 | |
| 1338 | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 1339 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 1340 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 1341 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 1342 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 1343 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 1344 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 1345 | +SOFTWARE. |
| 1346 | + |
| 1347 | +--- |
| 1348 | |
| 1349 | This product includes software from copyright holders as shown below, and distributed under their license terms as specified. |
| 1350 | |
| 1351 | diff --git a/README.md b/README.md |
| 1352 | index fe133cb..8f943be 100644 |
| 1353 | --- a/README.md |
| 1354 | +++ b/README.md |
| 1355 | @@ -1,19 +1,18 @@ |
| 1356 | # rocFFT |
| 1357 | |
| 1358 | rocFFT is a software library for computing fast Fourier transforms (FFTs) written in the HIP |
| 1359 | -programming language. It's part of AMD's software ecosystem based on |
| 1360 | -[ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD and |
| 1361 | -NVIDIA GPUs. |
| 1362 | +programming language. It's part of the AMD software ecosystem based on |
| 1363 | +[ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD GPUs. |
| 1364 | |
| 1365 | ## Documentation |
| 1366 | |
| 1367 | > [!NOTE] |
| 1368 | -> The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocFFT/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1369 | +> The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the projects/rocfft/docs folder of the rocm-libraries repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1370 | |
| 1371 | To build our documentation locally, use the following code: |
| 1372 | |
| 1373 | ```Bash |
| 1374 | -cd docs |
| 1375 | +cd projects/rocfft/docs |
| 1376 | |
| 1377 | pip3 install -r sphinx/requirements.txt |
| 1378 | |
| 1379 | @@ -36,7 +35,7 @@ You can install rocFFT using pre-built packages or building from source. |
| 1380 | * Building from source: |
| 1381 | |
| 1382 | rocFFT is compiled with AMD's clang++ and uses CMake. You can specify several options to customize your |
| 1383 | - build. The following commands build a shared library for supported AMD GPUs: |
| 1384 | + build. The following commands build a shared library for supported AMD GPUs. Run these commands from the `rocm-libraries/projects/rocfft` directory: |
| 1385 | |
| 1386 | ```bash |
| 1387 | mkdir build && cd build |
| 1388 | @@ -61,6 +60,7 @@ You can install rocFFT using pre-built packages or building from source. |
| 1389 | | `rocfft-bench` | `-DBUILD_CLIENTS_BENCH=on` | hipRAND | |
| 1390 | | `rocfft-test` | `-DBUILD_CLIENTS_TESTS=on` | hipRAND, FFTW, GoogleTest | |
| 1391 | | samples | `-DBUILD_CLIENTS_SAMPLES=on` | None | |
| 1392 | + | coverage | `-DBUILD_CODE_COVERAGE=ON` | clang, llvm-cov | |
| 1393 | |
| 1394 | Clients are not built by default. To build them, use `-DBUILD_CLIENTS=on`. The build process |
| 1395 | downloads and builds GoogleTest and FFTW if they are not already installed. |
| 1396 | @@ -80,7 +80,14 @@ You can install rocFFT using pre-built packages or building from source. |
| 1397 | sudo apt install libgtest-dev libfftw3-dev libboost-dev |
| 1398 | ``` |
| 1399 | |
| 1400 | - We use version 1.11 of GoogleTest. |
| 1401 | + rocFFT uses version 1.11 of GoogleTest. |
| 1402 | + |
| 1403 | + You can generate a test coverage report with the following: |
| 1404 | + ```bash |
| 1405 | + cmake -DCMAKE_CXX_COMPILER=amdclang++ -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CODE_COVERAGE=ON <optional: -DCOVERAGE_TEST_OPTIONS="cmdline args to pass to rocfft-test (default: --smoketest)"> .. |
| 1406 | + make -j coverage |
| 1407 | + ``` |
| 1408 | + The above will output the coverage report to the terminal and also save an html coverage report to `$PWD/coverage-report`. |
| 1409 | |
| 1410 | ## Examples |
| 1411 | |
| 1412 | @@ -91,9 +98,9 @@ You can find additional examples in the `clients/samples` subdirectory. |
| 1413 | |
| 1414 | ## Support |
| 1415 | |
| 1416 | -You can report bugs and feature requests through the GitHub |
| 1417 | -[issue tracker](https://github.com/ROCm/rocFFT/issues). |
| 1418 | +You can report bugs and feature requests through the rocm-libraries GitHub |
| 1419 | +[issue tracker](https://github.com/ROCm/rocm-libraries/issues). |
| 1420 | |
| 1421 | ## Contribute |
| 1422 | |
| 1423 | -If you want to contribute to rocFFT, you must follow our [contribution guidelines](https://github.com/ROCm/rocFFT/blob/develop/.github/CONTRIBUTING.md). |
| 1424 | +If you want to contribute to rocFFT, you must follow the [contribution guidelines](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/.github/CONTRIBUTING.md). |
| 1425 | diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt |
| 1426 | index 1f42485..44f361d 100644 |
| 1427 | --- a/clients/CMakeLists.txt |
| 1428 | +++ b/clients/CMakeLists.txt |
| 1429 | @@ -53,8 +53,8 @@ set(CMAKE_CXX_STANDARD 17) |
| 1430 | |
| 1431 | list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) |
| 1432 | |
| 1433 | -if( NOT ROCM_FOUND ) |
| 1434 | - find_package( ROCM 0.7.3 REQUIRED ) |
| 1435 | +if( NOT ROCmCMakeBuildTools_FOUND ) |
| 1436 | + find_package( ROCmCMakeBuildTools REQUIRED ) |
| 1437 | endif() |
| 1438 | |
| 1439 | include( ROCMInstallTargets ) |
| 1440 | diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt |
| 1441 | index 60980fc..f792c0f 100644 |
| 1442 | --- a/clients/bench/CMakeLists.txt |
| 1443 | +++ b/clients/bench/CMakeLists.txt |
| 1444 | @@ -53,11 +53,11 @@ if( NOT TARGET rocfft ) |
| 1445 | endif( ) |
| 1446 | |
| 1447 | if( NOT HIP_FOUND ) |
| 1448 | - find_package( HIP REQUIRED ) |
| 1449 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 1450 | endif() |
| 1451 | |
| 1452 | -if( NOT ROCM_FOUND ) |
| 1453 | - find_package( ROCM 0.7.3 REQUIRED ) |
| 1454 | +if( NOT ROCmCMakeBuildTools_FOUND ) |
| 1455 | + find_package( ROCmCMakeBuildTools REQUIRED ) |
| 1456 | endif() |
| 1457 | |
| 1458 | if( USE_HIPRAND AND NOT hiprand_FOUND ) |
| 1459 | @@ -105,6 +105,7 @@ foreach( bench ${bench_list}) |
| 1460 | PRIVATE |
| 1461 | hip::hiprand |
| 1462 | ) |
| 1463 | + target_compile_definitions( ${bench} PRIVATE USE_HIPRAND ) |
| 1464 | endif() |
| 1465 | |
| 1466 | # We need to include both rocfft.h and rocfft-export.h |
| 1467 | @@ -155,30 +156,6 @@ foreach( bench ${bench_list}) |
| 1468 | ${BENCH_OUT_DIR} ) |
| 1469 | |
| 1470 | rocm_install(TARGETS ${bench} COMPONENT benchmarks) |
| 1471 | - |
| 1472 | - # install compatibility for old name of bench program - symlink on |
| 1473 | - # unix, hardlink on windows (since privilege is required to create |
| 1474 | - # symlinks there) |
| 1475 | - string(REPLACE bench rider bench_legacy ${bench}) |
| 1476 | - if( WIN32 ) |
| 1477 | - set( BENCH_LINK_COMMAND create_hardlink ) |
| 1478 | - set( BENCH_NEW_NAME ${BENCH_OUT_DIR}/$<TARGET_FILE_BASE_NAME:${bench}>${CMAKE_EXECUTABLE_SUFFIX} ) |
| 1479 | - set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy}${CMAKE_EXECUTABLE_SUFFIX} ) |
| 1480 | - else() |
| 1481 | - set( BENCH_LINK_COMMAND create_symlink ) |
| 1482 | - set( BENCH_NEW_NAME $<TARGET_FILE_BASE_NAME:${bench}> ) |
| 1483 | - set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy} ) |
| 1484 | - endif() |
| 1485 | - add_custom_command( |
| 1486 | - TARGET ${bench} |
| 1487 | - POST_BUILD |
| 1488 | - COMMAND ${CMAKE_COMMAND} -E ${BENCH_LINK_COMMAND} ${BENCH_NEW_NAME} ${BENCH_OLD_NAME} |
| 1489 | - ) |
| 1490 | - install( |
| 1491 | - FILES ${BENCH_OLD_NAME} |
| 1492 | - DESTINATION ${CMAKE_INSTALL_BINDIR} |
| 1493 | - COMPONENT benchmarks |
| 1494 | - ) |
| 1495 | endforeach() |
| 1496 | |
| 1497 | # Link dyna-rocfft-bench to the experimental filesystem library if |
| 1498 | diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp |
| 1499 | index 96f4b8c..1d30106 100644 |
| 1500 | --- a/clients/bench/bench.cpp |
| 1501 | +++ b/clients/bench/bench.cpp |
| 1502 | @@ -87,7 +87,11 @@ int main(int argc, char* argv[]) |
| 1503 | "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " |
| 1504 | "forward\n3) real inverse") |
| 1505 | ->default_val(fft_transform_type_complex_forward); |
| 1506 | - |
| 1507 | + non_token |
| 1508 | + ->add_option("--auto_allocation", |
| 1509 | + params.auto_allocate, |
| 1510 | + "rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") |
| 1511 | + ->default_val("default"); |
| 1512 | non_token |
| 1513 | ->add_option( |
| 1514 | "--precision", params.precision, "Transform precision: single (default), double, half") |
| 1515 | @@ -201,7 +205,10 @@ int main(int argc, char* argv[]) |
| 1516 | params.mp_lib = fft_params::fft_mp_lib_none; |
| 1517 | |
| 1518 | int localDeviceCount = 0; |
| 1519 | - (void)hipGetDeviceCount(&localDeviceCount); |
| 1520 | + if(hipGetDeviceCount(&localDeviceCount) != hipSuccess) |
| 1521 | + { |
| 1522 | + throw std::runtime_error("hipGetDeviceCount failed"); |
| 1523 | + } |
| 1524 | |
| 1525 | // start with all-ones in grids |
| 1526 | std::vector<unsigned int> input_grid(params.length.size() + 1, 1); |
| 1527 | @@ -333,119 +340,49 @@ int main(int argc, char* argv[]) |
| 1528 | LIB_V_THROW(rocfft_status_failure, "Plan creation failed"); |
| 1529 | |
| 1530 | // GPU input buffer: |
| 1531 | - auto ibuffer_sizes = params.ibuffer_sizes(); |
| 1532 | - std::vector<gpubuf> ibuffer(ibuffer_sizes.size()); |
| 1533 | - std::vector<void*> pibuffer(ibuffer_sizes.size()); |
| 1534 | - for(unsigned int i = 0; i < ibuffer.size(); ++i) |
| 1535 | - { |
| 1536 | - try |
| 1537 | - { |
| 1538 | - HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed"); |
| 1539 | - } |
| 1540 | - catch(rocfft_hip_runtime_error) |
| 1541 | - { |
| 1542 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1543 | - } |
| 1544 | - pibuffer[i] = ibuffer[i].data(); |
| 1545 | - } |
| 1546 | - |
| 1547 | + std::vector<gpubuf> ibuffer; |
| 1548 | + std::vector<void*> pibuffer; |
| 1549 | // CPU-side input buffer |
| 1550 | std::vector<hostbuf> ibuffer_cpu; |
| 1551 | |
| 1552 | auto is_host_gen = (params.igen == fft_input_generator_host |
| 1553 | || params.igen == fft_input_random_generator_host); |
| 1554 | |
| 1555 | -#ifdef USE_HIPRAND |
| 1556 | - if(!is_host_gen) |
| 1557 | - { |
| 1558 | - // Input data: |
| 1559 | - params.compute_input(ibuffer); |
| 1560 | + auto ibricks = get_input_bricks(params); |
| 1561 | + auto obricks = get_output_bricks(params); |
| 1562 | |
| 1563 | - if(verbose > 1) |
| 1564 | - { |
| 1565 | - // Copy input to CPU |
| 1566 | - try |
| 1567 | - { |
| 1568 | - ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); |
| 1569 | - } |
| 1570 | - catch(rocfft_hip_runtime_error) |
| 1571 | - { |
| 1572 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1573 | - } |
| 1574 | - for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) |
| 1575 | - { |
| 1576 | - try |
| 1577 | - { |
| 1578 | - HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(), |
| 1579 | - ibuffer[idx].data(), |
| 1580 | - ibuffer_sizes[idx], |
| 1581 | - hipMemcpyDeviceToHost), |
| 1582 | - "hipMemcpy failed"); |
| 1583 | - } |
| 1584 | - catch(rocfft_hip_runtime_error) |
| 1585 | - { |
| 1586 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1587 | - } |
| 1588 | - } |
| 1589 | + std::vector<gpubuf> obuffer_data; |
| 1590 | + std::vector<gpubuf>* obuffer = nullptr; |
| 1591 | + alloc_bench_bricks( |
| 1592 | + params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen); |
| 1593 | |
| 1594 | - std::cout << "GPU input:\n"; |
| 1595 | - params.print_ibuffer(ibuffer_cpu); |
| 1596 | - } |
| 1597 | - } |
| 1598 | -#endif |
| 1599 | - if(is_host_gen) |
| 1600 | + pibuffer.resize(ibuffer.size()); |
| 1601 | + for(unsigned int i = 0; i < ibuffer.size(); ++i) |
| 1602 | { |
| 1603 | - // Input data: |
| 1604 | - ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); |
| 1605 | - params.compute_input(ibuffer_cpu); |
| 1606 | + pibuffer[i] = ibuffer[i].data(); |
| 1607 | + } |
| 1608 | |
| 1609 | - if(verbose > 1) |
| 1610 | + // print input if requested |
| 1611 | + if(verbose > 1) |
| 1612 | + { |
| 1613 | + if(is_host_gen) |
| 1614 | { |
| 1615 | - std::cout << "GPU input:\n"; |
| 1616 | + // data is already on host |
| 1617 | params.print_ibuffer(ibuffer_cpu); |
| 1618 | } |
| 1619 | - |
| 1620 | - for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) |
| 1621 | + else |
| 1622 | { |
| 1623 | - try |
| 1624 | - { |
| 1625 | - HIP_V_THROW(hipMemcpy(pibuffer[idx], |
| 1626 | - ibuffer_cpu[idx].data(), |
| 1627 | - ibuffer_cpu[idx].size(), |
| 1628 | - hipMemcpyHostToDevice), |
| 1629 | - "hipMemcpy failed"); |
| 1630 | - } |
| 1631 | - catch(rocfft_hip_runtime_error) |
| 1632 | - { |
| 1633 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1634 | - } |
| 1635 | + print_device_buffer(params, ibuffer, true); |
| 1636 | } |
| 1637 | } |
| 1638 | |
| 1639 | - // GPU output buffer: |
| 1640 | - std::vector<gpubuf> obuffer_data; |
| 1641 | - std::vector<gpubuf>* obuffer = &obuffer_data; |
| 1642 | - if(params.placement == fft_placement_inplace) |
| 1643 | - { |
| 1644 | - obuffer = &ibuffer; |
| 1645 | - } |
| 1646 | - else |
| 1647 | - { |
| 1648 | - auto obuffer_sizes = params.obuffer_sizes(); |
| 1649 | - obuffer_data.resize(obuffer_sizes.size()); |
| 1650 | - for(unsigned int i = 0; i < obuffer_data.size(); ++i) |
| 1651 | - { |
| 1652 | - HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed"); |
| 1653 | - } |
| 1654 | - } |
| 1655 | std::vector<void*> pobuffer(obuffer->size()); |
| 1656 | for(unsigned int i = 0; i < obuffer->size(); ++i) |
| 1657 | { |
| 1658 | pobuffer[i] = obuffer->at(i).data(); |
| 1659 | } |
| 1660 | |
| 1661 | - // Scatter input out to other devices and adjust I/O buffers to match requested transform |
| 1662 | - params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); |
| 1663 | + init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen); |
| 1664 | |
| 1665 | // Execute a warm-up call |
| 1666 | params.execute(pibuffer.data(), pobuffer.data()); |
| 1667 | @@ -459,34 +396,13 @@ int main(int argc, char* argv[]) |
| 1668 | for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial) |
| 1669 | { |
| 1670 | // Create input at every iteration to avoid overflow |
| 1671 | - if(params.ifields.empty()) |
| 1672 | + if(is_host_gen) |
| 1673 | { |
| 1674 | -#ifdef USE_HIPRAND |
| 1675 | - // Compute input on default device |
| 1676 | - if(!is_host_gen) |
| 1677 | - params.compute_input(ibuffer); |
| 1678 | -#endif |
| 1679 | - if(is_host_gen) |
| 1680 | - { |
| 1681 | - for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) |
| 1682 | - { |
| 1683 | - try |
| 1684 | - { |
| 1685 | - HIP_V_THROW(hipMemcpy(pibuffer[idx], |
| 1686 | - ibuffer_cpu[idx].data(), |
| 1687 | - ibuffer_cpu[idx].size(), |
| 1688 | - hipMemcpyHostToDevice), |
| 1689 | - "hipMemcpy failed"); |
| 1690 | - } |
| 1691 | - catch(rocfft_hip_runtime_error) |
| 1692 | - { |
| 1693 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1694 | - } |
| 1695 | - } |
| 1696 | - } |
| 1697 | - |
| 1698 | - // Scatter input out to other devices if this is a multi-GPU test |
| 1699 | - params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); |
| 1700 | + copy_host_input_to_dev(ibuffer_cpu, ibuffer); |
| 1701 | + } |
| 1702 | + else |
| 1703 | + { |
| 1704 | + init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen); |
| 1705 | } |
| 1706 | |
| 1707 | HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed"); |
| 1708 | @@ -503,27 +419,7 @@ int main(int argc, char* argv[]) |
| 1709 | // Print result after FFT transform |
| 1710 | if(verbose > 2) |
| 1711 | { |
| 1712 | - // Gather data to default GPU if this is a multi-GPU test |
| 1713 | - params.multi_gpu_finalize(*obuffer, pobuffer); |
| 1714 | - |
| 1715 | - auto output = allocate_host_buffer(params.precision, params.otype, params.osize); |
| 1716 | - for(unsigned int idx = 0; idx < output.size(); ++idx) |
| 1717 | - { |
| 1718 | - try |
| 1719 | - { |
| 1720 | - HIP_V_THROW(hipMemcpy(output[idx].data(), |
| 1721 | - pobuffer.at(idx), |
| 1722 | - output[idx].size(), |
| 1723 | - hipMemcpyDeviceToHost), |
| 1724 | - "hipMemcpy failed"); |
| 1725 | - } |
| 1726 | - catch(rocfft_hip_runtime_error) |
| 1727 | - { |
| 1728 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1729 | - } |
| 1730 | - } |
| 1731 | - std::cout << "GPU output:\n"; |
| 1732 | - params.print_obuffer(output); |
| 1733 | + print_device_buffer(params, *obuffer, false); |
| 1734 | } |
| 1735 | } |
| 1736 | |
| 1737 | diff --git a/clients/bench/bench.h b/clients/bench/bench.h |
| 1738 | index 51804ac..de06f93 100644 |
| 1739 | --- a/clients/bench/bench.h |
| 1740 | +++ b/clients/bench/bench.h |
| 1741 | @@ -21,6 +21,9 @@ |
| 1742 | #ifndef ROCFFT_BENCH_H |
| 1743 | #define ROCFFT_BENCH_H |
| 1744 | |
| 1745 | +#include "../../shared/fft_params.h" |
| 1746 | +#include "../../shared/rocfft_hip.h" |
| 1747 | + |
| 1748 | #include "rocfft/rocfft.h" |
| 1749 | #include <hip/hip_runtime_api.h> |
| 1750 | #include <vector> |
| 1751 | @@ -92,4 +95,198 @@ inline void lib_V_Throw(rocfft_status res, |
| 1752 | #define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__) |
| 1753 | #define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__) |
| 1754 | |
| 1755 | +// return input bricks for params, or one big brick covering the |
| 1756 | +// input field if no bricks are specified |
| 1757 | +template <typename Tparams> |
| 1758 | +std::vector<fft_params::fft_brick> get_input_bricks(const Tparams& params) |
| 1759 | +{ |
| 1760 | + std::vector<fft_params::fft_brick> bricks; |
| 1761 | + if(!params.ifields.empty()) |
| 1762 | + bricks = params.ifields[0].bricks; |
| 1763 | + else |
| 1764 | + { |
| 1765 | + auto len = params.ilength(); |
| 1766 | + |
| 1767 | + // just make one big brick covering the whole input field |
| 1768 | + bricks.resize(1); |
| 1769 | + bricks.front().lower.resize(len.size() + 1); |
| 1770 | + bricks.front().upper.resize(len.size() + 1); |
| 1771 | + bricks.front().stride.resize(len.size() + 1); |
| 1772 | + |
| 1773 | + bricks.front().upper.front() = params.nbatch; |
| 1774 | + std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1); |
| 1775 | + |
| 1776 | + bricks.front().stride.front() = params.idist; |
| 1777 | + std::copy(params.istride.begin(), params.istride.end(), bricks.front().stride.begin() + 1); |
| 1778 | + } |
| 1779 | + return bricks; |
| 1780 | +} |
| 1781 | + |
| 1782 | +// return output bricks for params, or one big brick covering the |
| 1783 | +// output field if no bricks are specified |
| 1784 | +template <typename Tparams> |
| 1785 | +std::vector<fft_params::fft_brick> get_output_bricks(const Tparams& params) |
| 1786 | +{ |
| 1787 | + std::vector<fft_params::fft_brick> bricks; |
| 1788 | + if(!params.ofields.empty()) |
| 1789 | + bricks = params.ofields[0].bricks; |
| 1790 | + else |
| 1791 | + { |
| 1792 | + auto len = params.olength(); |
| 1793 | + |
| 1794 | + // just make one big brick covering the whole output field |
| 1795 | + bricks.resize(1); |
| 1796 | + bricks.front().lower.resize(len.size() + 1); |
| 1797 | + bricks.front().upper.resize(len.size() + 1); |
| 1798 | + bricks.front().stride.resize(len.size() + 1); |
| 1799 | + |
| 1800 | + bricks.front().upper.front() = params.nbatch; |
| 1801 | + std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1); |
| 1802 | + |
| 1803 | + bricks.front().stride.front() = params.odist; |
| 1804 | + std::copy(params.ostride.begin(), params.ostride.end(), bricks.front().stride.begin() + 1); |
| 1805 | + } |
| 1806 | + return bricks; |
| 1807 | +} |
| 1808 | + |
| 1809 | +// Allocate input/output buffers for a bench run. |
| 1810 | +template <typename Tparams> |
| 1811 | +void alloc_bench_bricks(const Tparams& params, |
| 1812 | + const std::vector<fft_params::fft_brick>& ibricks, |
| 1813 | + const std::vector<fft_params::fft_brick>& obricks, |
| 1814 | + std::vector<gpubuf>& ibuffers, |
| 1815 | + std::vector<gpubuf>& obuffer_data, |
| 1816 | + std::vector<gpubuf>*& obuffers, |
| 1817 | + std::vector<hostbuf>& host_buffers, |
| 1818 | + bool is_host_gen) |
| 1819 | +{ |
| 1820 | + auto alloc_buffers = [&params, &host_buffers](const std::vector<fft_params::fft_brick>& bricks, |
| 1821 | + fft_array_type type, |
| 1822 | + std::vector<gpubuf>& output, |
| 1823 | + bool is_host_gen) { |
| 1824 | + auto elem_size = var_size<size_t>(params.precision, type); |
| 1825 | + const bool is_planar |
| 1826 | + = type == fft_array_type_complex_planar || type == fft_array_type_hermitian_planar; |
| 1827 | + // alloc 2x buffers, each half size for planar |
| 1828 | + if(is_planar) |
| 1829 | + elem_size /= 2; |
| 1830 | + |
| 1831 | + for(const auto& b : bricks) |
| 1832 | + { |
| 1833 | + rocfft_scoped_device dev(b.device); |
| 1834 | + |
| 1835 | + size_t brick_size_bytes = compute_ptrdiff(b.length(), b.stride, 0, 0) * elem_size; |
| 1836 | + output.emplace_back(); |
| 1837 | + if(output.back().alloc(brick_size_bytes) != hipSuccess) |
| 1838 | + throw std::runtime_error("hipMalloc failed"); |
| 1839 | + if(is_planar) |
| 1840 | + { |
| 1841 | + output.emplace_back(); |
| 1842 | + if(output.back().alloc(brick_size_bytes) != hipSuccess) |
| 1843 | + throw std::runtime_error("hipMalloc failed"); |
| 1844 | + } |
| 1845 | + if(is_host_gen) |
| 1846 | + { |
| 1847 | + host_buffers.emplace_back(); |
| 1848 | + host_buffers.back().alloc(brick_size_bytes); |
| 1849 | + if(is_planar) |
| 1850 | + { |
| 1851 | + host_buffers.emplace_back(); |
| 1852 | + host_buffers.back().alloc(brick_size_bytes); |
| 1853 | + } |
| 1854 | + } |
| 1855 | + } |
| 1856 | + }; |
| 1857 | + |
| 1858 | + // If brick shape differs, inplace is only allowed for single |
| 1859 | + // bricks. e.g. in-place real-complex |
| 1860 | + if(params.placement == fft_placement_inplace) |
| 1861 | + { |
| 1862 | + if(ibricks.size() != 1 && obricks.size() != 1 && ibricks != obricks) |
| 1863 | + throw std::runtime_error( |
| 1864 | + "in-place transform to different brick shapes only allowed for single bricks"); |
| 1865 | + |
| 1866 | + // allocate the larger of the two bricks |
| 1867 | + auto isize_bytes = compute_ptrdiff(ibricks.front().length(), ibricks.front().stride, 0, 0) |
| 1868 | + * var_size<size_t>(params.precision, params.itype); |
| 1869 | + auto osize_bytes = compute_ptrdiff(obricks.front().length(), obricks.front().stride, 0, 0) |
| 1870 | + * var_size<size_t>(params.precision, params.otype); |
| 1871 | + |
| 1872 | + alloc_buffers(isize_bytes > osize_bytes ? ibricks : obricks, |
| 1873 | + isize_bytes > osize_bytes ? params.itype : params.otype, |
| 1874 | + ibuffers, |
| 1875 | + is_host_gen); |
| 1876 | + obuffers = &ibuffers; |
| 1877 | + } |
| 1878 | + else |
| 1879 | + { |
| 1880 | + alloc_buffers(ibricks, params.itype, ibuffers, is_host_gen); |
| 1881 | + alloc_buffers(obricks, params.otype, obuffer_data, false); |
| 1882 | + obuffers = &obuffer_data; |
| 1883 | + } |
| 1884 | +} |
| 1885 | + |
| 1886 | +void copy_host_input_to_dev(std::vector<hostbuf>& host_buffers, std::vector<gpubuf>& buffers) |
| 1887 | +{ |
| 1888 | + for(size_t i = 0; i < buffers.size(); ++i) |
| 1889 | + { |
| 1890 | + if(hipMemcpy(buffers[i].data(), |
| 1891 | + host_buffers[i].data(), |
| 1892 | + host_buffers[i].size(), |
| 1893 | + hipMemcpyHostToDevice) |
| 1894 | + != hipSuccess) |
| 1895 | + throw std::runtime_error("hipMemcpy failure"); |
| 1896 | + } |
| 1897 | +} |
| 1898 | + |
| 1899 | +template <typename Tparams> |
| 1900 | +void init_bench_input(const Tparams& params, |
| 1901 | + const std::vector<fft_params::fft_brick>& bricks, |
| 1902 | + std::vector<gpubuf>& buffers, |
| 1903 | + std::vector<hostbuf>& host_buffers, |
| 1904 | + bool is_host_gen) |
| 1905 | +{ |
| 1906 | + auto elem_size = var_size<size_t>(params.precision, params.itype); |
| 1907 | + if(is_host_gen) |
| 1908 | + { |
| 1909 | + std::vector<void*> ptrs; |
| 1910 | + ptrs.reserve(host_buffers.size()); |
| 1911 | + for(auto& buf : host_buffers) |
| 1912 | + ptrs.push_back(buf.data()); |
| 1913 | + |
| 1914 | + init_local_input<Tparams, hostbuf>(0, params, bricks, elem_size, ptrs); |
| 1915 | + copy_host_input_to_dev(host_buffers, buffers); |
| 1916 | + } |
| 1917 | + else |
| 1918 | + { |
| 1919 | +#ifdef USE_HIPRAND |
| 1920 | + std::vector<void*> ptrs; |
| 1921 | + ptrs.reserve(buffers.size()); |
| 1922 | + for(auto& buf : buffers) |
| 1923 | + ptrs.push_back(buf.data()); |
| 1924 | + |
| 1925 | + init_local_input<Tparams, gpubuf>(0, params, bricks, elem_size, ptrs); |
| 1926 | +#endif |
| 1927 | + } |
| 1928 | +} |
| 1929 | + |
| 1930 | +template <typename Tparams> |
| 1931 | +void print_device_buffer(const Tparams& params, std::vector<gpubuf>& buffer, bool input) |
| 1932 | +{ |
| 1933 | + // copy data back to host |
| 1934 | + std::vector<hostbuf> print_buffer; |
| 1935 | + for(auto& buf : buffer) |
| 1936 | + { |
| 1937 | + print_buffer.emplace_back(); |
| 1938 | + print_buffer.back().alloc(buf.size()); |
| 1939 | + if(hipMemcpy(print_buffer.back().data(), buf.data(), buf.size(), hipMemcpyDeviceToHost) |
| 1940 | + != hipSuccess) |
| 1941 | + throw std::runtime_error("hipMemcpy failed"); |
| 1942 | + } |
| 1943 | + if(input) |
| 1944 | + params.print_ibuffer(print_buffer); |
| 1945 | + else |
| 1946 | + params.print_obuffer(print_buffer); |
| 1947 | +} |
| 1948 | + |
| 1949 | #endif // ROCFFT_BENCH_H |
| 1950 | diff --git a/clients/bench/dyna-bench.cpp b/clients/bench/dyna-bench.cpp |
| 1951 | index 7e50435..b121887 100644 |
| 1952 | --- a/clients/bench/dyna-bench.cpp |
| 1953 | +++ b/clients/bench/dyna-bench.cpp |
| 1954 | @@ -458,7 +458,10 @@ int main(int argc, char* argv[]) |
| 1955 | params.mp_lib = fft_params::fft_mp_lib_none; |
| 1956 | |
| 1957 | int localDeviceCount = 0; |
| 1958 | - (void)hipGetDeviceCount(&localDeviceCount); |
| 1959 | + if(hipGetDeviceCount(&localDeviceCount) != hipSuccess) |
| 1960 | + { |
| 1961 | + throw std::runtime_error("hipGetDeviceCount failed"); |
| 1962 | + } |
| 1963 | |
| 1964 | // start with all-ones in grids |
| 1965 | std::vector<unsigned int> input_grid(params.length.size() + 1, 1); |
| 1966 | @@ -574,112 +577,42 @@ int main(int argc, char* argv[]) |
| 1967 | } |
| 1968 | |
| 1969 | // GPU input buffer: |
| 1970 | - auto ibuffer_sizes = params.ibuffer_sizes(); |
| 1971 | - std::vector<gpubuf> ibuffer(ibuffer_sizes.size()); |
| 1972 | - std::vector<void*> pibuffer(ibuffer_sizes.size()); |
| 1973 | - for(unsigned int i = 0; i < ibuffer.size(); ++i) |
| 1974 | - { |
| 1975 | - try |
| 1976 | - { |
| 1977 | - HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed"); |
| 1978 | - } |
| 1979 | - catch(rocfft_hip_runtime_error) |
| 1980 | - { |
| 1981 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 1982 | - } |
| 1983 | - pibuffer[i] = ibuffer[i].data(); |
| 1984 | - } |
| 1985 | - |
| 1986 | + std::vector<gpubuf> ibuffer; |
| 1987 | + std::vector<void*> pibuffer; |
| 1988 | // CPU-side input buffer |
| 1989 | std::vector<hostbuf> ibuffer_cpu; |
| 1990 | |
| 1991 | auto is_host_gen = (params.igen == fft_input_generator_host |
| 1992 | || params.igen == fft_input_random_generator_host); |
| 1993 | |
| 1994 | -#ifdef USE_HIPRAND |
| 1995 | - if(!is_host_gen) |
| 1996 | - { |
| 1997 | - // Input data: |
| 1998 | - params.compute_input(ibuffer); |
| 1999 | + auto ibricks = get_input_bricks(params); |
| 2000 | + auto obricks = get_output_bricks(params); |
| 2001 | |
| 2002 | - if(verbose > 1) |
| 2003 | - { |
| 2004 | - // Copy input to CPU |
| 2005 | - ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); |
| 2006 | - for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) |
| 2007 | - { |
| 2008 | - try |
| 2009 | - { |
| 2010 | - HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(), |
| 2011 | - ibuffer[idx].data(), |
| 2012 | - ibuffer_sizes[idx], |
| 2013 | - hipMemcpyDeviceToHost), |
| 2014 | - "hipMemcpy failed"); |
| 2015 | - } |
| 2016 | - catch(rocfft_hip_runtime_error) |
| 2017 | - { |
| 2018 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 2019 | - } |
| 2020 | - } |
| 2021 | + std::vector<gpubuf> obuffer_data; |
| 2022 | + std::vector<gpubuf>* obuffer = nullptr; |
| 2023 | + alloc_bench_bricks( |
| 2024 | + params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen); |
| 2025 | + init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen); |
| 2026 | |
| 2027 | - std::cout << "GPU input:\n"; |
| 2028 | - params.print_ibuffer(ibuffer_cpu); |
| 2029 | - } |
| 2030 | - } |
| 2031 | -#endif |
| 2032 | - if(is_host_gen) |
| 2033 | + for(unsigned int i = 0; i < ibuffer.size(); ++i) |
| 2034 | { |
| 2035 | - // Input data: |
| 2036 | - ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); |
| 2037 | - params.compute_input(ibuffer_cpu); |
| 2038 | + pibuffer.push_back(ibuffer[i].data()); |
| 2039 | + } |
| 2040 | |
| 2041 | - if(verbose > 1) |
| 2042 | + // print input if requested |
| 2043 | + if(verbose > 1) |
| 2044 | + { |
| 2045 | + if(is_host_gen) |
| 2046 | { |
| 2047 | - std::cout << "GPU input:\n"; |
| 2048 | + // data is already on host |
| 2049 | params.print_ibuffer(ibuffer_cpu); |
| 2050 | } |
| 2051 | - |
| 2052 | - for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) |
| 2053 | + else |
| 2054 | { |
| 2055 | - try |
| 2056 | - { |
| 2057 | - HIP_V_THROW(hipMemcpy(pibuffer[idx], |
| 2058 | - ibuffer_cpu[idx].data(), |
| 2059 | - ibuffer_cpu[idx].size(), |
| 2060 | - hipMemcpyHostToDevice), |
| 2061 | - "hipMemcpy failed"); |
| 2062 | - } |
| 2063 | - catch(rocfft_hip_runtime_error) |
| 2064 | - { |
| 2065 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 2066 | - } |
| 2067 | + print_device_buffer(params, ibuffer, true); |
| 2068 | } |
| 2069 | } |
| 2070 | |
| 2071 | - // GPU output buffer: |
| 2072 | - std::vector<gpubuf> obuffer_data; |
| 2073 | - std::vector<gpubuf>* obuffer = &obuffer_data; |
| 2074 | - if(params.placement == fft_placement_inplace) |
| 2075 | - { |
| 2076 | - obuffer = &ibuffer; |
| 2077 | - } |
| 2078 | - else |
| 2079 | - { |
| 2080 | - auto obuffer_sizes = params.obuffer_sizes(); |
| 2081 | - obuffer_data.resize(obuffer_sizes.size()); |
| 2082 | - for(unsigned int i = 0; i < obuffer_data.size(); ++i) |
| 2083 | - { |
| 2084 | - try |
| 2085 | - { |
| 2086 | - HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), |
| 2087 | - "Creating output Buffer failed"); |
| 2088 | - } |
| 2089 | - catch(rocfft_hip_runtime_error) |
| 2090 | - { |
| 2091 | - return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; |
| 2092 | - } |
| 2093 | - } |
| 2094 | - } |
| 2095 | std::vector<void*> pobuffer(obuffer->size()); |
| 2096 | for(unsigned int i = 0; i < obuffer->size(); ++i) |
| 2097 | { |
| 2098 | diff --git a/clients/samples/fixed-16/CMakeLists.txt b/clients/samples/fixed-16/CMakeLists.txt |
| 2099 | index feec728..d74f03b 100644 |
| 2100 | --- a/clients/samples/fixed-16/CMakeLists.txt |
| 2101 | +++ b/clients/samples/fixed-16/CMakeLists.txt |
| 2102 | @@ -49,7 +49,7 @@ if( NOT TARGET rocfft ) |
| 2103 | endif( ) |
| 2104 | |
| 2105 | if( NOT HIP_FOUND ) |
| 2106 | - find_package( HIP REQUIRED ) |
| 2107 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 2108 | endif() |
| 2109 | |
| 2110 | set( sample_list fixed-16-float fixed-16-double fixed-16-half ) |
| 2111 | diff --git a/clients/samples/fixed-large/CMakeLists.txt b/clients/samples/fixed-large/CMakeLists.txt |
| 2112 | index bf1c2e7..ddea865 100644 |
| 2113 | --- a/clients/samples/fixed-large/CMakeLists.txt |
| 2114 | +++ b/clients/samples/fixed-large/CMakeLists.txt |
| 2115 | @@ -49,7 +49,7 @@ if( NOT TARGET rocfft ) |
| 2116 | endif( ) |
| 2117 | |
| 2118 | if( NOT HIP_FOUND ) |
| 2119 | - find_package( HIP REQUIRED ) |
| 2120 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 2121 | endif() |
| 2122 | |
| 2123 | set( sample_list fixed-large-float fixed-large-double ) |
| 2124 | diff --git a/clients/samples/mpi/CMakeLists.txt b/clients/samples/mpi/CMakeLists.txt |
| 2125 | index 836e652..e4122bc 100644 |
| 2126 | --- a/clients/samples/mpi/CMakeLists.txt |
| 2127 | +++ b/clients/samples/mpi/CMakeLists.txt |
| 2128 | @@ -49,7 +49,7 @@ if( NOT TARGET rocfft ) |
| 2129 | endif( ) |
| 2130 | |
| 2131 | if( NOT HIP_FOUND ) |
| 2132 | - find_package( HIP REQUIRED ) |
| 2133 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 2134 | endif() |
| 2135 | |
| 2136 | if( NOT MPI_FOUND ) |
| 2137 | @@ -97,6 +97,7 @@ foreach( sample ${sample_list} ) |
| 2138 | PRIVATE |
| 2139 | hip::hiprand |
| 2140 | ) |
| 2141 | + target_compile_definitions( ${sample} PRIVATE USE_HIPRAND ) |
| 2142 | endif() |
| 2143 | |
| 2144 | target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) |
| 2145 | diff --git a/clients/samples/mpi/rocfft_mpi_example.cpp b/clients/samples/mpi/rocfft_mpi_example.cpp |
| 2146 | index 14f06dc..18fbd12 100644 |
| 2147 | --- a/clients/samples/mpi/rocfft_mpi_example.cpp |
| 2148 | +++ b/clients/samples/mpi/rocfft_mpi_example.cpp |
| 2149 | @@ -126,6 +126,11 @@ int main(int argc, char** argv) |
| 2150 | if(fftrc != rocfft_status_success) |
| 2151 | throw std::runtime_error("failed to create description"); |
| 2152 | |
| 2153 | + // This example is unbatched, so the batch stride is not used |
| 2154 | + // for anything. For batched examples, this would be |
| 2155 | + // distance in elements between consecutive batches. |
| 2156 | + const size_t batch_stride = 0; |
| 2157 | + |
| 2158 | if(mpi_rank == 0) |
| 2159 | { |
| 2160 | std::cout << "input data decomposition:\n"; |
| 2161 | @@ -135,14 +140,14 @@ int main(int argc, char** argv) |
| 2162 | rocfft_field infield = nullptr; |
| 2163 | rocfft_field_create(&infield); |
| 2164 | |
| 2165 | - std::vector<size_t> inbrick_stride = {1, length[1]}; |
| 2166 | + std::vector<size_t> inbrick_stride = {1, length[1], batch_stride}; |
| 2167 | const size_t inbrick_length1 = length[1] / (size_t)mpi_size |
| 2168 | + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0); |
| 2169 | const size_t inbrick_lower1 |
| 2170 | = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size); |
| 2171 | const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1; |
| 2172 | - std::vector<size_t> inbrick_lower = {0, inbrick_lower1}; |
| 2173 | - std::vector<size_t> inbrick_upper = {length[0], inbrick_upper1}; |
| 2174 | + std::vector<size_t> inbrick_lower = {0, inbrick_lower1, 0}; |
| 2175 | + std::vector<size_t> inbrick_upper = {length[0], inbrick_upper1, 1}; |
| 2176 | |
| 2177 | rocfft_brick inbrick = nullptr; |
| 2178 | rocfft_brick_create(&inbrick, |
| 2179 | @@ -219,15 +224,15 @@ int main(int argc, char** argv) |
| 2180 | std::vector<void*> gpu_out = {nullptr}; |
| 2181 | std::vector<size_t> outbrick_lower; |
| 2182 | std::vector<size_t> outbrick_upper; |
| 2183 | - std::vector<size_t> outbrick_stride = {1, length[1]}; |
| 2184 | + std::vector<size_t> outbrick_stride = {1, length[1], batch_stride}; |
| 2185 | { |
| 2186 | const size_t outbrick_length1 = length[1] / (size_t)mpi_size |
| 2187 | + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0); |
| 2188 | const size_t outbrick_lower1 |
| 2189 | = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size); |
| 2190 | const size_t outbrick_upper1 = outbrick_lower1 + outbrick_length1; |
| 2191 | - outbrick_lower = {0, outbrick_lower1}; |
| 2192 | - outbrick_upper = {length[0], outbrick_upper1}; |
| 2193 | + outbrick_lower = {0, outbrick_lower1, 0}; |
| 2194 | + outbrick_upper = {length[0], outbrick_upper1, 1}; |
| 2195 | |
| 2196 | const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex<double>); |
| 2197 | for(int irank = 0; irank < mpi_size; ++irank) |
| 2198 | @@ -254,8 +259,8 @@ int main(int argc, char** argv) |
| 2199 | rocfft_field_create(&outfield); |
| 2200 | |
| 2201 | rocfft_brick outbrick = nullptr; |
| 2202 | - outbrick_lower = {0, outbrick_lower1}; |
| 2203 | - outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1}; |
| 2204 | + outbrick_lower = {0, outbrick_lower1, 0}; |
| 2205 | + outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1, 1}; |
| 2206 | rocfft_brick_create(&outbrick, |
| 2207 | outbrick_lower.data(), |
| 2208 | outbrick_upper.data(), |
| 2209 | diff --git a/clients/samples/multi_gpu/CMakeLists.txt b/clients/samples/multi_gpu/CMakeLists.txt |
| 2210 | index 41bc5e0..f4b1b27 100644 |
| 2211 | --- a/clients/samples/multi_gpu/CMakeLists.txt |
| 2212 | +++ b/clients/samples/multi_gpu/CMakeLists.txt |
| 2213 | @@ -49,7 +49,7 @@ if( NOT TARGET rocfft ) |
| 2214 | endif( ) |
| 2215 | |
| 2216 | if( NOT HIP_FOUND ) |
| 2217 | - find_package( HIP REQUIRED ) |
| 2218 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 2219 | endif() |
| 2220 | |
| 2221 | if( USE_HIPRAND AND NOT hiprand_FOUND ) |
| 2222 | @@ -79,6 +79,7 @@ foreach( sample ${sample_list} ) |
| 2223 | PRIVATE |
| 2224 | hip::hiprand |
| 2225 | ) |
| 2226 | + target_compile_definitions( ${sample} PRIVATE USE_HIPRAND ) |
| 2227 | endif() |
| 2228 | |
| 2229 | target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) |
| 2230 | diff --git a/clients/samples/rocfft/CMakeLists.txt b/clients/samples/rocfft/CMakeLists.txt |
| 2231 | index d883915..01d47d6 100644 |
| 2232 | --- a/clients/samples/rocfft/CMakeLists.txt |
| 2233 | +++ b/clients/samples/rocfft/CMakeLists.txt |
| 2234 | @@ -49,7 +49,7 @@ if( NOT TARGET rocfft ) |
| 2235 | endif( ) |
| 2236 | |
| 2237 | if( NOT HIP_FOUND ) |
| 2238 | - find_package( HIP REQUIRED ) |
| 2239 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 2240 | endif() |
| 2241 | |
| 2242 | if( USE_HIPRAND AND NOT hiprand_FOUND ) |
| 2243 | @@ -80,6 +80,7 @@ foreach( sample ${sample_list} ) |
| 2244 | PRIVATE |
| 2245 | hip::hiprand |
| 2246 | ) |
| 2247 | + target_compile_definitions( ${sample} PRIVATE USE_HIPRAND ) |
| 2248 | endif() |
| 2249 | |
| 2250 | target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) |
| 2251 | @@ -116,3 +117,10 @@ foreach( sample ${sample_list} ) |
| 2252 | target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) |
| 2253 | |
| 2254 | endforeach( ) |
| 2255 | + |
| 2256 | +# callback functions need to be built as relocatable device code |
| 2257 | +# (causes failure at link stage on Windows) |
| 2258 | +if (NOT WIN32) |
| 2259 | + target_compile_options( rocfft_example_callback PRIVATE -fgpu-rdc ) |
| 2260 | + target_link_options( rocfft_example_callback PRIVATE -fgpu-rdc ) |
| 2261 | +endif() |
| 2262 | diff --git a/clients/samples/rocfft/rocfft_example_callback.cpp b/clients/samples/rocfft/rocfft_example_callback.cpp |
| 2263 | index a6c2dd3..ccc65e5 100644 |
| 2264 | --- a/clients/samples/rocfft/rocfft_example_callback.cpp |
| 2265 | +++ b/clients/samples/rocfft/rocfft_example_callback.cpp |
| 2266 | @@ -20,11 +20,12 @@ |
| 2267 | * THE SOFTWARE. |
| 2268 | *******************************************************************************/ |
| 2269 | |
| 2270 | +#include <iostream> |
| 2271 | +#ifndef WIN32 |
| 2272 | #include "rocfft/rocfft.h" |
| 2273 | #include <hip/hip_complex.h> |
| 2274 | #include <hip/hip_runtime.h> |
| 2275 | #include <hip/hip_vector_types.h> |
| 2276 | -#include <iostream> |
| 2277 | #include <math.h> |
| 2278 | #include <stdexcept> |
| 2279 | #include <vector> |
| 2280 | @@ -46,9 +47,14 @@ __device__ double2 load_callback(double2* input, size_t offset, void* cbdata, vo |
| 2281 | make_hipDoubleComplex(data->scale, data->scale)); |
| 2282 | } |
| 2283 | __device__ auto load_callback_dev = load_callback; |
| 2284 | +#endif |
| 2285 | |
| 2286 | int main() |
| 2287 | { |
| 2288 | +#ifdef WIN32 |
| 2289 | + std::cout << "This sample is temporarily disabled on Windows" << std::endl; |
| 2290 | + return EXIT_SUCCESS; |
| 2291 | +#else |
| 2292 | |
| 2293 | const size_t N = 8; |
| 2294 | |
| 2295 | @@ -189,4 +195,5 @@ int main() |
| 2296 | throw std::runtime_error("rocfft_cleanup failed."); |
| 2297 | |
| 2298 | return 0; |
| 2299 | +#endif |
| 2300 | } |
| 2301 | diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt |
| 2302 | index 83ed823..81037c0 100644 |
| 2303 | --- a/clients/tests/CMakeLists.txt |
| 2304 | +++ b/clients/tests/CMakeLists.txt |
| 2305 | @@ -53,11 +53,11 @@ if( NOT TARGET rocfft ) |
| 2306 | endif( ) |
| 2307 | |
| 2308 | if( NOT HIP_FOUND ) |
| 2309 | - find_package( HIP REQUIRED ) |
| 2310 | + find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) |
| 2311 | endif() |
| 2312 | |
| 2313 | -if( NOT ROCM_FOUND ) |
| 2314 | - find_package( ROCM 0.7.3 REQUIRED ) |
| 2315 | +if( NOT ROCmCMakeBuildTools_FOUND ) |
| 2316 | + find_package( ROCmCMakeBuildTools REQUIRED ) |
| 2317 | endif() |
| 2318 | |
| 2319 | if( USE_HIPRAND AND NOT hiprand_FOUND ) |
| 2320 | @@ -94,6 +94,19 @@ set( rocfft-test_source |
| 2321 | add_executable( rocfft-test ${rocfft-test_source} ${rocfft-test_includes} ) |
| 2322 | add_executable( rtc_helper_crash rtc_helper_crash.cpp ) |
| 2323 | |
| 2324 | +# rocFFT device code builds with -O3 by default. rocfft-test |
| 2325 | +# contains device code for callback functions, so ensure the device |
| 2326 | +# code is built with the same optimization level to minimize chance |
| 2327 | +# of a mismatch |
| 2328 | +target_compile_options( rocfft-test PRIVATE -Xarch_device -O3 ) |
| 2329 | + |
| 2330 | +# callback functions need to be built as relocatable device code |
| 2331 | +# (causes failure at link stage on Windows) |
| 2332 | +if (NOT WIN32) |
| 2333 | + target_compile_options( rocfft-test PRIVATE -fgpu-rdc ) |
| 2334 | + target_link_options( rocfft-test PRIVATE -fgpu-rdc ) |
| 2335 | +endif() |
| 2336 | + |
| 2337 | find_package( Boost REQUIRED ) |
| 2338 | set( Boost_DEBUG ON ) |
| 2339 | set( Boost_DETAILED_FAILURE_MSG ON ) |
| 2340 | @@ -183,15 +196,38 @@ set( rocfft-test_link_libs |
| 2341 | ${FFTW_LIBRARIES} |
| 2342 | ) |
| 2343 | |
| 2344 | -include( ../cmake/build-gtest.cmake ) |
| 2345 | +option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON ) |
| 2346 | +if( BUILD_CLIENTS_TESTS_OPENMP ) |
| 2347 | + # Attempt to find a config version, which provides openmp_LIB_DIR. |
| 2348 | + #find_package( OpenMP CONFIG PATHS "${HIP_CLANG_ROOT}/lib/cmake" ) |
| 2349 | + if( NOT OPENMP_FOUND OR NOT DEFINED ${openmp_LIB_DIR} ) |
| 2350 | + # Fall-back to module mode. |
| 2351 | + find_package( OpenMP REQUIRED ) |
| 2352 | + set( BUILD_RPATH "${HIP_CLANG_ROOT}/lib" ) |
| 2353 | + set( INSTALL_RPATH "$ORIGIN/../llvm/lib" ) |
| 2354 | + else() |
| 2355 | + set( BUILD_RPATH "${HIP_CLANG_ROOT}/${openmp_LIB_DIR}" ) |
| 2356 | + set( INSTALL_RPATH "$ORIGIN/../llvm/${openmp_LIB_DIR}" ) |
| 2357 | + endif() |
| 2358 | + list( APPEND rocfft-test_link_libs OpenMP::OpenMP_CXX ) |
| 2359 | + |
| 2360 | + set_target_properties( rocfft-test PROPERTIES |
| 2361 | + BUILD_RPATH "{$BUILD_RPATH}" |
| 2362 | + ) |
| 2363 | + set_target_properties( rocfft-test PROPERTIES |
| 2364 | + INSTALL_RPATH "${INSTALL_RPATH}" |
| 2365 | + ) |
| 2366 | +endif() |
| 2367 | + |
| 2368 | +find_package( GTest QUIET ) |
| 2369 | |
| 2370 | -if( BUILD_GTEST OR NOT GTEST_FOUND ) |
| 2371 | +if( GTest_FOUND ) |
| 2372 | + target_link_libraries( rocfft-test PRIVATE GTest::gtest ) |
| 2373 | +else() |
| 2374 | + include( ../cmake/build-gtest.cmake ) |
| 2375 | add_dependencies( rocfft-test gtest ) |
| 2376 | list( APPEND rocfft-test_include_dirs ${GTEST_INCLUDE_DIRS} ) |
| 2377 | list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} ) |
| 2378 | -else() |
| 2379 | - list( APPEND rocfft-test_include_dirs $<BUILD_INTERFACE:${GTEST_INCLUDE_DIRS}> ) |
| 2380 | - list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} ) |
| 2381 | endif() |
| 2382 | |
| 2383 | target_compile_options( rocfft-test PRIVATE ${WARNING_FLAGS} -Wno-cpp ) |
| 2384 | @@ -223,6 +259,7 @@ if ( USE_HIPRAND ) |
| 2385 | PRIVATE |
| 2386 | hip::hiprand |
| 2387 | ) |
| 2388 | + target_compile_definitions( rocfft-test PRIVATE USE_HIPRAND ) |
| 2389 | endif() |
| 2390 | |
| 2391 | if( ROCFFT_MPI_ENABLE ) |
| 2392 | @@ -256,7 +293,9 @@ if( USE_CUDA ) |
| 2393 | ) |
| 2394 | target_compile_definitions( rocfft-test PRIVATE __HIP_PLATFORM_NVCC__ ) |
| 2395 | endif( ) |
| 2396 | -target_link_libraries( rocfft-test PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) |
| 2397 | +target_link_libraries( rocfft-test PRIVATE |
| 2398 | + ${ROCFFT_CLIENTS_HOST_LINK_LIBS} |
| 2399 | + ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) |
| 2400 | |
| 2401 | include( ../../cmake/sqlite.cmake ) |
| 2402 | target_link_libraries( rocfft-test PUBLIC ${ROCFFT_SQLITE_LIB} ) |
| 2403 | @@ -264,21 +303,6 @@ target_include_directories( rocfft-test PRIVATE ${sqlite_local_SOURCE_DIR} ) |
| 2404 | |
| 2405 | set_property( TARGET rocfft-test APPEND PROPERTY LINK_LIBRARIES ${ROCFFT_SQLITE_LIB} ) |
| 2406 | |
| 2407 | -option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON ) |
| 2408 | - |
| 2409 | -if( BUILD_CLIENTS_TESTS_OPENMP ) |
| 2410 | - if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) |
| 2411 | - target_compile_options( rocfft-test PRIVATE -fopenmp ) |
| 2412 | - target_link_libraries( rocfft-test PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib ) |
| 2413 | - target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include ) |
| 2414 | - else() |
| 2415 | - if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") |
| 2416 | - target_compile_options( rocfft-test PRIVATE -fopenmp=libomp ) |
| 2417 | - target_link_options( rocfft-test PRIVATE -fopenmp=libomp ) |
| 2418 | - endif() |
| 2419 | - endif() |
| 2420 | -endif() |
| 2421 | - |
| 2422 | if(FFTW_MULTITHREAD) |
| 2423 | target_compile_options( rocfft-test PRIVATE -DFFTW_MULTITHREAD ) |
| 2424 | endif( ) |
| 2425 | @@ -330,7 +354,18 @@ if( ROCFFT_MPI_ENABLE ) |
| 2426 | # normal and dynamic-loading MPI worker processes |
| 2427 | foreach(worker rocfft_mpi_worker dyna_rocfft_mpi_worker) |
| 2428 | add_executable( ${worker} rocfft_mpi_worker.cpp ) |
| 2429 | - target_compile_options( ${worker} PRIVATE -fopenmp ) |
| 2430 | + |
| 2431 | + if( BUILD_CLIENTS_TESTS_OPENMP ) |
| 2432 | + set_target_properties( ${worker} PROPERTIES |
| 2433 | + BUILD_RPATH "${BUILD_RPATH}" |
| 2434 | + ) |
| 2435 | + set_target_properties( ${worker} PROPERTIES |
| 2436 | + INSTALL_RPATH "${INSTALL_RPATH}" |
| 2437 | + ) |
| 2438 | + endif() |
| 2439 | + if( BUILD_FFTW OR NOT FFTW_FOUND ) |
| 2440 | + add_dependencies( ${worker} fftw_double fftw_single ) |
| 2441 | + endif() |
| 2442 | target_include_directories( ${worker} |
| 2443 | PRIVATE |
| 2444 | ${CMAKE_BINARY_DIR}/include |
| 2445 | @@ -341,8 +376,8 @@ if( ROCFFT_MPI_ENABLE ) |
| 2446 | target_compile_options( ${worker} PRIVATE ${WARNING_FLAGS} ) |
| 2447 | |
| 2448 | if ( ROCFFT_CRAY_MPI_ENABLE ) |
| 2449 | - target_link_libraries( ${worker} |
| 2450 | - -fopenmp |
| 2451 | + target_link_libraries( ${worker} |
| 2452 | + OpenMP::OpenMP_CXX |
| 2453 | hip::hiprand |
| 2454 | hip::device |
| 2455 | MPI::MPI_CXX |
| 2456 | @@ -355,7 +390,7 @@ if( ROCFFT_MPI_ENABLE ) |
| 2457 | ${MPI_LIBDIR}/../../../../gtl/lib ) |
| 2458 | else() |
| 2459 | target_link_libraries( ${worker} |
| 2460 | - -fopenmp |
| 2461 | + OpenMP::OpenMP_CXX |
| 2462 | hip::hiprand |
| 2463 | hip::device |
| 2464 | MPI::MPI_CXX |
| 2465 | @@ -382,3 +417,49 @@ if( ROCFFT_MPI_ENABLE ) |
| 2466 | ) |
| 2467 | |
| 2468 | endif() |
| 2469 | + |
| 2470 | +set(COVERAGE_TEST_OPTIONS "--smoketest;--gtest_filter=-*call*" CACHE STRING "Command line arguments for rocfft-test when generating a code coverage report") |
| 2471 | + |
| 2472 | +if(BUILD_CODE_COVERAGE) |
| 2473 | + # Coverage won't work in a standalone build of the tests, as we can't |
| 2474 | + # guarantee the library was built with coverage enabled |
| 2475 | + if( NOT TARGET rocfft ) |
| 2476 | + message( FATAL_ERROR "BUILD_CODE_COVERAGE requires building from the root of rocFFT" ) |
| 2477 | + endif() |
| 2478 | + |
| 2479 | + add_custom_target( |
| 2480 | + code_cov_tests |
| 2481 | + DEPENDS rocfft-test rocfft_rtc_helper |
| 2482 | + COMMAND ${CMAKE_COMMAND} -E rm -rf ./coverage-report |
| 2483 | + COMMAND ${CMAKE_COMMAND} -E make_directory ./coverage-report/profraw |
| 2484 | + COMMAND ${CMAKE_COMMAND} -E env LLVM_PROFILE_FILE="./coverage-report/profraw/rocfft-coverage_%p.profraw" GTEST_LISTENER=NO_PASS_LINE_IN_LOG $<TARGET_FILE:rocfft-test> --precompile=rocfft-test-precompile.db ${COVERAGE_TEST_OPTIONS} |
| 2485 | + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} |
| 2486 | + ) |
| 2487 | + |
| 2488 | + find_program( |
| 2489 | + LLVM_PROFDATA |
| 2490 | + llvm-profdata |
| 2491 | + REQUIRED |
| 2492 | + HINTS ${ROCM_PATH}/llvm/bin |
| 2493 | + PATHS /opt/rocm/llvm/bin |
| 2494 | + ) |
| 2495 | + |
| 2496 | + find_program( |
| 2497 | + LLVM_COV |
| 2498 | + llvm-cov |
| 2499 | + REQUIRED |
| 2500 | + HINTS ${ROCM_PATH}/llvm/bin |
| 2501 | + PATHS /opt/rocm/llvm/bin |
| 2502 | + ) |
| 2503 | + |
| 2504 | + add_custom_target( |
| 2505 | + coverage |
| 2506 | + DEPENDS code_cov_tests |
| 2507 | + COMMAND ${LLVM_PROFDATA} merge -sparse ./coverage-report/profraw/rocfft-coverage_*.profraw -o ./coverage-report/rocfft.profdata |
| 2508 | + COMMAND ${LLVM_COV} report -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata |
| 2509 | + COMMAND ${LLVM_COV} show -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata -format=html -output-dir=coverage-report |
| 2510 | + COMMAND ${LLVM_COV} export -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata -format=lcov > ./coverage-report/coverage.info |
| 2511 | + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} |
| 2512 | + ) |
| 2513 | + |
| 2514 | +endif() |
| 2515 | diff --git a/clients/tests/accuracy_test.cpp b/clients/tests/accuracy_test.cpp |
| 2516 | index a3758a6..addaca1 100644 |
| 2517 | --- a/clients/tests/accuracy_test.cpp |
| 2518 | +++ b/clients/tests/accuracy_test.cpp |
| 2519 | @@ -230,7 +230,7 @@ __host__ __device__ static void store_callback_round_trip_inverse( |
| 2520 | Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem) |
| 2521 | { |
| 2522 | auto testdata = static_cast<callback_test_data*>(cbdata); |
| 2523 | - // add scalar to each element |
| 2524 | + // divide each element by scalar |
| 2525 | if(output == testdata->base) |
| 2526 | { |
| 2527 | output[offset] = element / testdata->scalar; |
| 2528 | diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp |
| 2529 | index 37c80ca..d4e3e43 100644 |
| 2530 | --- a/clients/tests/accuracy_test_3D.cpp |
| 2531 | +++ b/clients/tests/accuracy_test_3D.cpp |
| 2532 | @@ -255,17 +255,22 @@ INSTANTIATE_TEST_SUITE_P(partial_pass_3D, |
| 2533 | false)), |
| 2534 | accuracy_test::TestName); |
| 2535 | |
| 2536 | -INSTANTIATE_TEST_SUITE_P(partial_pass_3D_callback, |
| 2537 | - accuracy_test, |
| 2538 | - ::testing::ValuesIn(param_generator(test_prob, |
| 2539 | - partial_pass_adhoc_3D, |
| 2540 | - precision_range_sp_dp, |
| 2541 | - partial_pass_batch_range_3D, |
| 2542 | - stride_range, |
| 2543 | - stride_range, |
| 2544 | - ioffset_range_zero, |
| 2545 | - ooffset_range_zero, |
| 2546 | - place_range, |
| 2547 | - false, |
| 2548 | - true)), |
| 2549 | - accuracy_test::TestName); |
| 2550 | +INSTANTIATE_TEST_SUITE_P( |
| 2551 | +#ifdef WIN32 |
| 2552 | + DISABLED_partial_pass_3D_callback, |
| 2553 | +#else |
| 2554 | + partial_pass_3D_callback, |
| 2555 | +#endif |
| 2556 | + accuracy_test, |
| 2557 | + ::testing::ValuesIn(param_generator(test_prob, |
| 2558 | + partial_pass_adhoc_3D, |
| 2559 | + precision_range_sp_dp, |
| 2560 | + partial_pass_batch_range_3D, |
| 2561 | + stride_range, |
| 2562 | + stride_range, |
| 2563 | + ioffset_range_zero, |
| 2564 | + ooffset_range_zero, |
| 2565 | + place_range, |
| 2566 | + false, |
| 2567 | + true)), |
| 2568 | + accuracy_test::TestName); |
| 2569 | diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp |
| 2570 | index 631a974..57c3244 100644 |
| 2571 | --- a/clients/tests/accuracy_test_callback.cpp |
| 2572 | +++ b/clients/tests/accuracy_test_callback.cpp |
| 2573 | @@ -104,22 +104,27 @@ const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}}; |
| 2574 | auto forward_transform_types |
| 2575 | = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; |
| 2576 | |
| 2577 | -INSTANTIATE_TEST_SUITE_P(callback, |
| 2578 | - accuracy_test, |
| 2579 | - ::testing::ValuesIn(param_generator_base(test_prob, |
| 2580 | - forward_transform_types, |
| 2581 | - callback_sizes, |
| 2582 | - precision_range_sp_dp, |
| 2583 | - batch_range, |
| 2584 | - generate_types, |
| 2585 | - stride_range, |
| 2586 | - stride_range, |
| 2587 | - ioffset_range_zero, |
| 2588 | - ooffset_range_zero, |
| 2589 | - place_range, |
| 2590 | - false, |
| 2591 | - true)), |
| 2592 | - accuracy_test::TestName); |
| 2593 | +INSTANTIATE_TEST_SUITE_P( |
| 2594 | +#ifdef WIN32 |
| 2595 | + DISABLED_callback_no_offset, |
| 2596 | +#else |
| 2597 | + callback, |
| 2598 | +#endif |
| 2599 | + accuracy_test, |
| 2600 | + ::testing::ValuesIn(param_generator_base(test_prob, |
| 2601 | + forward_transform_types, |
| 2602 | + callback_sizes, |
| 2603 | + precision_range_sp_dp, |
| 2604 | + batch_range, |
| 2605 | + generate_types, |
| 2606 | + stride_range, |
| 2607 | + stride_range, |
| 2608 | + ioffset_range_zero, |
| 2609 | + ooffset_range_zero, |
| 2610 | + place_range, |
| 2611 | + false, |
| 2612 | + true)), |
| 2613 | + accuracy_test::TestName); |
| 2614 | |
| 2615 | INSTANTIATE_TEST_SUITE_P(DISABLED_callback, |
| 2616 | accuracy_test, |
| 2617 | diff --git a/clients/tests/accuracy_test_checkstride.cpp b/clients/tests/accuracy_test_checkstride.cpp |
| 2618 | index 0f6b7c6..898497c 100644 |
| 2619 | --- a/clients/tests/accuracy_test_checkstride.cpp |
| 2620 | +++ b/clients/tests/accuracy_test_checkstride.cpp |
| 2621 | @@ -70,7 +70,11 @@ inline auto param_checkstride() |
| 2622 | for(const auto& types : |
| 2623 | generate_types(trans_type, {fft_placement_notinplace}, true)) |
| 2624 | { |
| 2625 | +#ifdef WIN32 |
| 2626 | + for(bool callback : {false}) |
| 2627 | +#else |
| 2628 | for(bool callback : {true, false}) |
| 2629 | +#endif |
| 2630 | { |
| 2631 | // callbacks don't work for planar |
| 2632 | bool is_planar = std::get<2>(types) == fft_array_type_complex_planar |
| 2633 | diff --git a/clients/tests/accuracy_tests_range.h b/clients/tests/accuracy_tests_range.h |
| 2634 | index 103a97a..6b71b27 100644 |
| 2635 | --- a/clients/tests/accuracy_tests_range.h |
| 2636 | +++ b/clients/tests/accuracy_tests_range.h |
| 2637 | @@ -167,7 +167,15 @@ const static std::vector<size_t> inner_batch_3D_batch_range = {3, 2, 1}; |
| 2638 | // partial pass test problems |
| 2639 | //----------------------------------------------------------------------- |
| 2640 | //----------------------------------------------------------------------- |
| 2641 | -const static std::vector<std::vector<size_t>> partial_pass_adhoc_3D = {{64, 64, 64}}; |
| 2642 | -const static std::vector<size_t> partial_pass_batch_range_3D = {1, 5, 10, 20, 50}; |
| 2643 | +const static std::vector<std::vector<size_t>> partial_pass_adhoc_3D = { |
| 2644 | + {64, 64, 128}, |
| 2645 | + {64, 64, 64}, |
| 2646 | + {64, 64, 52}, |
| 2647 | + {60, 60, 60}, |
| 2648 | + {32, 32, 128}, |
| 2649 | + {32, 32, 64}, |
| 2650 | + {64, 32, 128}, |
| 2651 | +}; |
| 2652 | +const static std::vector<size_t> partial_pass_batch_range_3D = {1, 5, 10, 20, 50}; |
| 2653 | |
| 2654 | #endif // ACCURACY_TESTS_RANGE_H |
| 2655 | \ No newline at end of file |
| 2656 | diff --git a/clients/tests/buffer_hash_test.cpp b/clients/tests/buffer_hash_test.cpp |
| 2657 | index 8d4831d..47095e2 100644 |
| 2658 | --- a/clients/tests/buffer_hash_test.cpp |
| 2659 | +++ b/clients/tests/buffer_hash_test.cpp |
| 2660 | @@ -19,6 +19,7 @@ |
| 2661 | // THE SOFTWARE. |
| 2662 | |
| 2663 | #include "../../shared/fft_hash.h" |
| 2664 | +#include "../../shared/params_gen.h" |
| 2665 | #include "../../shared/rocfft_params.h" |
| 2666 | #include <algorithm> |
| 2667 | #include <chrono> |
| 2668 | @@ -29,7 +30,7 @@ |
| 2669 | |
| 2670 | static void set_params(const fft_precision precision, fft_params& param) |
| 2671 | { |
| 2672 | - std::vector<size_t> blengths = {16777216}; |
| 2673 | + std::vector<size_t> blengths = {131072}; |
| 2674 | |
| 2675 | std::vector<size_t> unit_strides = {1}; |
| 2676 | |
| 2677 | @@ -357,6 +358,12 @@ static void run_test(const rocfft_params& params) |
| 2678 | |
| 2679 | TEST(rocfft_UnitTest, buffer_hashing_half) |
| 2680 | { |
| 2681 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2682 | + > unittest_prob) |
| 2683 | + { |
| 2684 | + GTEST_SKIP(); |
| 2685 | + } |
| 2686 | + |
| 2687 | rocfft_params params; |
| 2688 | set_params(fft_precision_half, params); |
| 2689 | |
| 2690 | @@ -372,6 +379,13 @@ TEST(rocfft_UnitTest, buffer_hashing_half) |
| 2691 | |
| 2692 | TEST(rocfft_UnitTest, buffer_hashing_single) |
| 2693 | { |
| 2694 | + |
| 2695 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2696 | + > unittest_prob) |
| 2697 | + { |
| 2698 | + GTEST_SKIP(); |
| 2699 | + } |
| 2700 | + |
| 2701 | rocfft_params params; |
| 2702 | set_params(fft_precision_single, params); |
| 2703 | |
| 2704 | @@ -387,6 +401,13 @@ TEST(rocfft_UnitTest, buffer_hashing_single) |
| 2705 | |
| 2706 | TEST(rocfft_UnitTest, buffer_hashing_double) |
| 2707 | { |
| 2708 | + |
| 2709 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2710 | + > unittest_prob) |
| 2711 | + { |
| 2712 | + GTEST_SKIP(); |
| 2713 | + } |
| 2714 | + |
| 2715 | rocfft_params params; |
| 2716 | set_params(fft_precision_double, params); |
| 2717 | |
| 2718 | diff --git a/clients/tests/callback_change_type.cpp b/clients/tests/callback_change_type.cpp |
| 2719 | index 4a06a0f..24d4863 100644 |
| 2720 | --- a/clients/tests/callback_change_type.cpp |
| 2721 | +++ b/clients/tests/callback_change_type.cpp |
| 2722 | @@ -70,23 +70,28 @@ std::vector<std::vector<size_t>> callback_type_sizes = {{4}, {60}, {122}, {220}, |
| 2723 | // the input can't be any smaller than what rocFFT thinks it is, |
| 2724 | // because the overwrite will fail. |
| 2725 | const static std::vector<std::vector<size_t>> stride_range = {{1}}; |
| 2726 | -INSTANTIATE_TEST_SUITE_P(callback, |
| 2727 | - change_type, |
| 2728 | - ::testing::ValuesIn(param_generator_base( |
| 2729 | - test_prob, |
| 2730 | - {fft_transform_type_complex_forward, fft_transform_type_real_forward}, |
| 2731 | - callback_type_sizes, |
| 2732 | - {fft_precision_single}, |
| 2733 | - {1}, |
| 2734 | - generate_types, |
| 2735 | - stride_range, |
| 2736 | - stride_range, |
| 2737 | - {{0, 0}}, |
| 2738 | - {{0, 0}}, |
| 2739 | - {fft_placement_notinplace}, |
| 2740 | - false, |
| 2741 | - false)), |
| 2742 | - accuracy_test::TestName); |
| 2743 | +INSTANTIATE_TEST_SUITE_P( |
| 2744 | +#ifdef WIN32 |
| 2745 | + DISABLED_callback, |
| 2746 | +#else |
| 2747 | + callback, |
| 2748 | +#endif |
| 2749 | + change_type, |
| 2750 | + ::testing::ValuesIn(param_generator_base(test_prob, |
| 2751 | + {fft_transform_type_complex_forward, |
| 2752 | + fft_transform_type_real_forward}, |
| 2753 | + callback_type_sizes, |
| 2754 | + {fft_precision_single}, |
| 2755 | + {1}, |
| 2756 | + generate_types, |
| 2757 | + stride_range, |
| 2758 | + stride_range, |
| 2759 | + {{0, 0}}, |
| 2760 | + {{0, 0}}, |
| 2761 | + {fft_placement_notinplace}, |
| 2762 | + false, |
| 2763 | + false)), |
| 2764 | + accuracy_test::TestName); |
| 2765 | |
| 2766 | // run an out-of-place transform that casts input from short to float |
| 2767 | TEST_P(change_type, short_to_float) |
| 2768 | diff --git a/clients/tests/default_callbacks_test.cpp b/clients/tests/default_callbacks_test.cpp |
| 2769 | index 7c782aa..7ade249 100644 |
| 2770 | --- a/clients/tests/default_callbacks_test.cpp |
| 2771 | +++ b/clients/tests/default_callbacks_test.cpp |
| 2772 | @@ -28,6 +28,7 @@ |
| 2773 | |
| 2774 | #include "../../shared/fftw_transform.h" |
| 2775 | #include "../../shared/hip_object_wrapper.h" |
| 2776 | +#include "../../shared/params_gen.h" |
| 2777 | #include "../../shared/rocfft_params.h" |
| 2778 | #include "rocfft/rocfft.h" |
| 2779 | |
| 2780 | @@ -409,8 +410,15 @@ struct Test_Callback |
| 2781 | // complex data inputs and having only a load callback set. |
| 2782 | // ------------------------------------------------------------------- |
| 2783 | |
| 2784 | +#ifndef WIN32 |
| 2785 | TEST(rocfft_UnitTest, default_load_callback_complex_single) |
| 2786 | { |
| 2787 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2788 | + > unittest_prob) |
| 2789 | + { |
| 2790 | + GTEST_SKIP(); |
| 2791 | + } |
| 2792 | + |
| 2793 | TEST_CALLBACK_CHECK_ASAN; |
| 2794 | Test_Callback test(256, |
| 2795 | 1, |
| 2796 | @@ -422,6 +430,12 @@ TEST(rocfft_UnitTest, default_load_callback_complex_single) |
| 2797 | |
| 2798 | TEST(rocfft_UnitTest, default_load_callback_complex_double) |
| 2799 | { |
| 2800 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2801 | + > unittest_prob) |
| 2802 | + { |
| 2803 | + GTEST_SKIP(); |
| 2804 | + } |
| 2805 | + |
| 2806 | TEST_CALLBACK_CHECK_ASAN; |
| 2807 | Test_Callback test(512, |
| 2808 | 1, |
| 2809 | @@ -433,6 +447,12 @@ TEST(rocfft_UnitTest, default_load_callback_complex_double) |
| 2810 | |
| 2811 | TEST(rocfft_UnitTest, default_load_callback_real_single) |
| 2812 | { |
| 2813 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2814 | + > unittest_prob) |
| 2815 | + { |
| 2816 | + GTEST_SKIP(); |
| 2817 | + } |
| 2818 | + |
| 2819 | TEST_CALLBACK_CHECK_ASAN; |
| 2820 | Test_Callback test(1024, |
| 2821 | 1, |
| 2822 | @@ -444,6 +464,12 @@ TEST(rocfft_UnitTest, default_load_callback_real_single) |
| 2823 | |
| 2824 | TEST(rocfft_UnitTest, default_load_callback_real_double) |
| 2825 | { |
| 2826 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2827 | + > unittest_prob) |
| 2828 | + { |
| 2829 | + GTEST_SKIP(); |
| 2830 | + } |
| 2831 | + |
| 2832 | TEST_CALLBACK_CHECK_ASAN; |
| 2833 | Test_Callback test(2048, |
| 2834 | 1, |
| 2835 | @@ -460,6 +486,12 @@ TEST(rocfft_UnitTest, default_load_callback_real_double) |
| 2836 | |
| 2837 | TEST(rocfft_UnitTest, default_store_callback_complex_single) |
| 2838 | { |
| 2839 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2840 | + > unittest_prob) |
| 2841 | + { |
| 2842 | + GTEST_SKIP(); |
| 2843 | + } |
| 2844 | + |
| 2845 | TEST_CALLBACK_CHECK_ASAN; |
| 2846 | Test_Callback test(256, |
| 2847 | 1, |
| 2848 | @@ -471,6 +503,12 @@ TEST(rocfft_UnitTest, default_store_callback_complex_single) |
| 2849 | |
| 2850 | TEST(rocfft_UnitTest, default_store_callback_complex_double) |
| 2851 | { |
| 2852 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2853 | + > unittest_prob) |
| 2854 | + { |
| 2855 | + GTEST_SKIP(); |
| 2856 | + } |
| 2857 | + |
| 2858 | TEST_CALLBACK_CHECK_ASAN; |
| 2859 | Test_Callback test(512, |
| 2860 | 1, |
| 2861 | @@ -482,6 +520,12 @@ TEST(rocfft_UnitTest, default_store_callback_complex_double) |
| 2862 | |
| 2863 | TEST(rocfft_UnitTest, default_store_callback_real_single) |
| 2864 | { |
| 2865 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2866 | + > unittest_prob) |
| 2867 | + { |
| 2868 | + GTEST_SKIP(); |
| 2869 | + } |
| 2870 | + |
| 2871 | TEST_CALLBACK_CHECK_ASAN; |
| 2872 | Test_Callback test(1024, |
| 2873 | 1, |
| 2874 | @@ -493,6 +537,12 @@ TEST(rocfft_UnitTest, default_store_callback_real_single) |
| 2875 | |
| 2876 | TEST(rocfft_UnitTest, default_store_callback_real_double) |
| 2877 | { |
| 2878 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 2879 | + > unittest_prob) |
| 2880 | + { |
| 2881 | + GTEST_SKIP(); |
| 2882 | + } |
| 2883 | + |
| 2884 | TEST_CALLBACK_CHECK_ASAN; |
| 2885 | Test_Callback test(2048, |
| 2886 | 1, |
| 2887 | @@ -501,3 +551,4 @@ TEST(rocfft_UnitTest, default_store_callback_real_double) |
| 2888 | DefaultCallbackType::STORE, |
| 2889 | 8); |
| 2890 | } |
| 2891 | +#endif |
| 2892 | diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp |
| 2893 | index 3d3de51..aff9dc1 100644 |
| 2894 | --- a/clients/tests/gtest_main.cpp |
| 2895 | +++ b/clients/tests/gtest_main.cpp |
| 2896 | @@ -53,11 +53,14 @@ |
| 2897 | int verbose; |
| 2898 | |
| 2899 | // User-defined random seed |
| 2900 | -size_t random_seed; |
| 2901 | +size_t random_seed; |
| 2902 | +std::random_device default_seed_dev; |
| 2903 | // Overall probability of running conventional tests |
| 2904 | double test_prob; |
| 2905 | // Probability of running tests from the emulation suite |
| 2906 | double emulation_prob; |
| 2907 | +// Probability of running unit tests |
| 2908 | +double unittest_prob; |
| 2909 | // Modifier for probability of running tests with complex interleaved data |
| 2910 | double complex_interleaved_prob_factor; |
| 2911 | // Modifier for probability of running tests with real data |
| 2912 | @@ -81,6 +84,9 @@ size_t ramgb; |
| 2913 | // Device memory limitation for tests (GiB): |
| 2914 | size_t vramgb; |
| 2915 | |
| 2916 | +// Number of hip devices to use. |
| 2917 | +int ngpus{}; |
| 2918 | + |
| 2919 | // Allow skipping tests if there is a runtime error |
| 2920 | bool skip_runtime_fails; |
| 2921 | // But count the number of failures |
| 2922 | @@ -166,11 +172,8 @@ void init_gtest_flags() |
| 2923 | std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests)); |
| 2924 | } |
| 2925 | |
| 2926 | -void precompile_test_kernels(const std::string& precompile_file) |
| 2927 | +std::vector<std::string> tokens_to_run() |
| 2928 | { |
| 2929 | - std::cout << "precompiling test kernels...\n"; |
| 2930 | - WorkQueue<std::string> tokenQueue; |
| 2931 | - |
| 2932 | init_gtest_flags(); |
| 2933 | |
| 2934 | std::vector<std::string> tokens; |
| 2935 | @@ -218,6 +221,15 @@ void precompile_test_kernels(const std::string& precompile_file) |
| 2936 | } |
| 2937 | } |
| 2938 | } |
| 2939 | + return tokens; |
| 2940 | +} |
| 2941 | + |
| 2942 | +void precompile_test_kernels(const std::string& precompile_file) |
| 2943 | +{ |
| 2944 | + std::cout << "precompiling test kernels...\n"; |
| 2945 | + WorkQueue<std::string> tokenQueue; |
| 2946 | + |
| 2947 | + auto tokens = tokens_to_run(); |
| 2948 | |
| 2949 | std::random_device dev; |
| 2950 | std::mt19937 dist(dev()); |
| 2951 | @@ -306,24 +318,25 @@ int main(int argc, char* argv[]) |
| 2952 | " HP - hermitian planar\n" |
| 2953 | "\n" |
| 2954 | "Usage"}; |
| 2955 | - |
| 2956 | - // Override CLI11 help to print after later CLI11 options that are defined, and allow gtest's |
| 2957 | - // help. |
| 2958 | - // After removing the stage-1 options, individual options are set to null (even if set), but we |
| 2959 | - // can still capture the behaviour by using a flag. |
| 2960 | - |
| 2961 | - for(auto opt : app.get_options()) |
| 2962 | - { |
| 2963 | - app.remove_option(opt); |
| 2964 | - } |
| 2965 | + // Override CLI11 help to print it along gtest's help |
| 2966 | + app.set_help_flag(""); |
| 2967 | + const auto opt_help = app.add_flag("-h, --help", "Produces this help message"); |
| 2968 | app.add_option("-v, --verbose", verbose, "Print out detailed information for the tests") |
| 2969 | ->default_val(0); |
| 2970 | app.add_option("--nrand", n_random_tests, "Number of extra randomized tests")->default_val(0); |
| 2971 | + |
| 2972 | + app.add_option("--ngpus", ngpus, "Number of GPUs to use per rank") |
| 2973 | + ->default_val(-1) |
| 2974 | + ->check(CLI::NonNegativeNumber); |
| 2975 | + app.add_option("--gpus", n_random_tests, "Number of extra randomized tests")->default_val(0); |
| 2976 | app.add_option("--test_prob", test_prob, "Probability of running individual tests") |
| 2977 | ->default_val(1.0) |
| 2978 | ->check(CLI::Range(0.0, 1.0)); |
| 2979 | + app.add_option("--unittest_prob", unittest_prob, "Probability of running individual unit tests") |
| 2980 | + ->default_val(1.0) |
| 2981 | + ->check(CLI::Range(0.0, 1.0)); |
| 2982 | app.add_option( |
| 2983 | - "--emulation_prob", test_prob, "Probability of running individual emulation tests") |
| 2984 | + "--emulation_prob", emulation_prob, "Probability of running individual emulation tests") |
| 2985 | ->default_val(1.0) |
| 2986 | ->check(CLI::Range(0.0, 1.0)); |
| 2987 | app.add_option("--real_prob", |
| 2988 | @@ -411,7 +424,7 @@ int main(int argc, char* argv[]) |
| 2989 | if(mp_lib == fft_params::fft_mp_lib_none) |
| 2990 | { |
| 2991 | std::cout << "--mp_launch requires an mp library (see mp_lib in --help).\n"; |
| 2992 | - std::exit(-1); |
| 2993 | + std::exit(EXIT_FAILURE); |
| 2994 | } |
| 2995 | }) |
| 2996 | ->needs("--mp_lib"); |
| 2997 | @@ -420,8 +433,9 @@ int main(int argc, char* argv[]) |
| 2998 | ->each([&](const std::string&) { |
| 2999 | // The objective is to have an test that takes about 5 minutes, so just set the |
| 3000 | // probability per test to a small value to achieve this result. |
| 3001 | - test_prob = 0.001; |
| 3002 | - emulation_prob = 0.01; |
| 3003 | + test_prob = 0.0005; |
| 3004 | + emulation_prob = 0.005; |
| 3005 | + unittest_prob = 0.2; |
| 3006 | n_random_tests = 10; |
| 3007 | }); |
| 3008 | |
| 3009 | @@ -429,53 +443,8 @@ int main(int argc, char* argv[]) |
| 3010 | manual_params.run_callbacks = true; |
| 3011 | }); |
| 3012 | |
| 3013 | - { |
| 3014 | - // We explicitly scope opt_seed so that the object falls out of scope before the final |
| 3015 | - // parsing of the command line arguments. Otherwise, the second parsing would mark the |
| 3016 | - // option as not having been specified, which can get rather confusing. |
| 3017 | - |
| 3018 | - auto opt_seed = app.add_option( |
| 3019 | - "--seed", random_seed, "Random seed; if unset, use an actual random seed"); |
| 3020 | - |
| 3021 | - // Try parsing initial args that will be used to configure tests. |
| 3022 | - // Allow extras to pass on gtest and rocFFT arguments without error. |
| 3023 | - app.allow_extras(); |
| 3024 | - try |
| 3025 | - { |
| 3026 | - app.parse(argc, argv); |
| 3027 | - } |
| 3028 | - catch(const CLI::ParseError& e) |
| 3029 | - { |
| 3030 | - return app.exit(e); |
| 3031 | - } |
| 3032 | - |
| 3033 | - if(!*opt_seed) |
| 3034 | - { |
| 3035 | - std::cout << "Generating random seed: "; |
| 3036 | - std::random_device dev; |
| 3037 | - random_seed = dev(); |
| 3038 | - std::cout << random_seed << "\n"; |
| 3039 | - } |
| 3040 | - } |
| 3041 | - |
| 3042 | - app.set_help_flag(""); |
| 3043 | - auto opt_help = app.add_flag("-h, --help", "Produces this help message"); |
| 3044 | - |
| 3045 | - std::vector<std::string> remaining_args = app.remaining(); |
| 3046 | - // Google test ignores the first element, so add something there so that it parses all of hte |
| 3047 | - // arguments that we want it to parse.: |
| 3048 | - remaining_args.insert(remaining_args.begin(), argv0); |
| 3049 | - // NB: If we initialize gtest first, then it removes all of its own command-line |
| 3050 | - // arguments and sets argc and argv correctly; |
| 3051 | - std::vector<char*> carg; |
| 3052 | - for(std::string& s : remaining_args) |
| 3053 | - { |
| 3054 | - carg.push_back(&s[0]); |
| 3055 | - } |
| 3056 | - carg.push_back(NULL); |
| 3057 | - decltype(argc) cargc = carg.size() - 1; |
| 3058 | - ::testing::InitGoogleTest(&cargc, carg.data()); |
| 3059 | - |
| 3060 | + app.add_option("--seed", random_seed, "Random seed; if unset, use an actual random seed") |
| 3061 | + ->default_val(default_seed_dev()); |
| 3062 | // Filename for fftw and fftwf wisdom. |
| 3063 | std::string fftw_wisdom_filename; |
| 3064 | |
| 3065 | @@ -488,6 +457,9 @@ int main(int argc, char* argv[]) |
| 3066 | // Full path to bitwise repro database file |
| 3067 | std::string repro_db_path; |
| 3068 | |
| 3069 | + // Bool option to just print tokens and exit |
| 3070 | + bool printtokens{false}; |
| 3071 | + |
| 3072 | // Declare the supported options. Some option pointers are declared to track passed opts. |
| 3073 | app.add_flag("--version", "Print queryable version information from the rocfft library") |
| 3074 | ->each([](const std::string&) { |
| 3075 | @@ -515,6 +487,11 @@ int main(int argc, char* argv[]) |
| 3076 | "forward\n3) real inverse") |
| 3077 | ->default_val(fft_transform_type_complex_forward); |
| 3078 | non_token |
| 3079 | + ->add_option("--auto_allocation", |
| 3080 | + manual_params.auto_allocate, |
| 3081 | + "rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") |
| 3082 | + ->default_val("default"); |
| 3083 | + non_token |
| 3084 | ->add_option("--precision", |
| 3085 | manual_params.precision, |
| 3086 | "Transform precision: single (default), double, half") |
| 3087 | @@ -576,6 +553,7 @@ int main(int argc, char* argv[]) |
| 3088 | precompile_file, |
| 3089 | "Precompile kernels to a file for all test cases before running tests") |
| 3090 | ->default_val(""); |
| 3091 | + app.add_flag("--printtokens", printtokens, "Print test tokens to scheduled to be run and exit"); |
| 3092 | // Default value is set in fft_params.h based on if device-side PRNG was enabled. |
| 3093 | app.add_option("-g, --inputGen", |
| 3094 | manual_params.igen, |
| 3095 | @@ -584,36 +562,57 @@ int main(int argc, char* argv[]) |
| 3096 | "2) linearly-spaced sequence (device)\n" |
| 3097 | "3) linearly-spaced sequence (host)"); |
| 3098 | |
| 3099 | - // Parse rest of args and catch any errors here |
| 3100 | + // Try parsing initial args that will be used to configure tests |
| 3101 | + // Allow extras to pass on gtest arguments without error |
| 3102 | + app.allow_extras(); |
| 3103 | try |
| 3104 | { |
| 3105 | - app.parse(cargc, carg.data()); |
| 3106 | + app.parse(argc, argv); |
| 3107 | } |
| 3108 | catch(const CLI::ParseError& e) |
| 3109 | { |
| 3110 | return app.exit(e); |
| 3111 | } |
| 3112 | |
| 3113 | + // extract remaining arguments for subsequent gtest initialization |
| 3114 | + std::vector<std::string> remaining_args = app.remaining(); |
| 3115 | + std::string gtest_help_opt = "--help"; |
| 3116 | + // NB: If we initialize gtest first, then it removes all of its own command-line |
| 3117 | + // arguments and sets argc and argv correctly; |
| 3118 | + std::vector<char*> gtest_argv; |
| 3119 | + gtest_argv.insert(gtest_argv.begin(), argv[0]); |
| 3120 | + for(std::string& s : remaining_args) |
| 3121 | + { |
| 3122 | + gtest_argv.push_back(&s[0]); |
| 3123 | + } |
| 3124 | + if(*opt_help) |
| 3125 | + { |
| 3126 | + // make sure gtest prints its help as well |
| 3127 | + gtest_argv.push_back(&gtest_help_opt[0]); |
| 3128 | + } |
| 3129 | + gtest_argv.push_back(NULL); |
| 3130 | + decltype(argc) gtest_argc = gtest_argv.size() - 1; |
| 3131 | + ::testing::InitGoogleTest(&gtest_argc, gtest_argv.data()); // gtest-relevant args are removed |
| 3132 | + |
| 3133 | if(*opt_help) |
| 3134 | { |
| 3135 | std::cout << app.help() << "\n"; |
| 3136 | return EXIT_SUCCESS; |
| 3137 | } |
| 3138 | - |
| 3139 | - // Ensure there are no leftover options used by neither gtest nor CLI11 |
| 3140 | - const auto leftover_args = app.remaining(); |
| 3141 | - if(!leftover_args.empty()) |
| 3142 | + // no help was used, gtest_argc is expected to be 1 at this point. If not, some of the |
| 3143 | + // used options were not recognized at all |
| 3144 | + if(gtest_argc > 1) |
| 3145 | { |
| 3146 | std::cout << "Unrecognised option(s) found:\n "; |
| 3147 | - for(auto i : leftover_args) |
| 3148 | - std::cout << i << " "; |
| 3149 | + for(auto i = 1; i < gtest_argc; i++) |
| 3150 | + std::cout << gtest_argv[i] << " "; |
| 3151 | std::cout << "\nRun with --help for more information.\n"; |
| 3152 | return EXIT_FAILURE; |
| 3153 | } |
| 3154 | |
| 3155 | std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon |
| 3156 | - << "\tdouble epsilon: " << double_epsilon << "\n"; |
| 3157 | - std::cout << "Random seed: " << random_seed << "\n"; |
| 3158 | + << "\tdouble epsilon: " << double_epsilon << std::endl; |
| 3159 | + std::cout << "Random seed: " << random_seed << std::endl; |
| 3160 | |
| 3161 | // If precompiling, tell rocFFT to use the specified cache file |
| 3162 | // to write kernels to |
| 3163 | @@ -628,9 +627,11 @@ int main(int argc, char* argv[]) |
| 3164 | } |
| 3165 | |
| 3166 | rocfft_setup(); |
| 3167 | - char v[256]; |
| 3168 | - rocfft_get_version_string(v, 256); |
| 3169 | - std::cout << "rocFFT version: " << v << "\n"; |
| 3170 | + { |
| 3171 | + char v[256]; |
| 3172 | + rocfft_get_version_string(v, 256); |
| 3173 | + std::cout << "rocFFT version: " << v << std::endl; |
| 3174 | + } |
| 3175 | |
| 3176 | #ifdef FFTW_MULTITHREAD |
| 3177 | fftw_init_threads(); |
| 3178 | @@ -641,6 +642,7 @@ int main(int argc, char* argv[]) |
| 3179 | |
| 3180 | // Set host memory limit from command-line options |
| 3181 | host_memory::singleton().set_limit_gbytes(ramgb); |
| 3182 | + std::cout << "Host memory limit: " << ramgb << " GiB" << std::endl; |
| 3183 | |
| 3184 | if(use_fftw_wisdom) |
| 3185 | { |
| 3186 | @@ -693,7 +695,7 @@ int main(int argc, char* argv[]) |
| 3187 | |
| 3188 | if(!test_token.empty()) |
| 3189 | { |
| 3190 | - std::cout << "Reading fft params from token:\n" << test_token << "\n"; |
| 3191 | + std::cout << "Reading fft params from token:\n" << test_token << std::endl; |
| 3192 | |
| 3193 | try |
| 3194 | { |
| 3195 | @@ -701,8 +703,8 @@ int main(int argc, char* argv[]) |
| 3196 | } |
| 3197 | catch(...) |
| 3198 | { |
| 3199 | - std::cout << "Unable to parse token.\n"; |
| 3200 | - return 1; |
| 3201 | + std::cout << "Unable to parse token." << std::endl; |
| 3202 | + return EXIT_FAILURE; |
| 3203 | } |
| 3204 | } |
| 3205 | else |
| 3206 | @@ -729,6 +731,17 @@ int main(int argc, char* argv[]) |
| 3207 | if(!precompile_file.empty()) |
| 3208 | precompile_test_kernels(precompile_file); |
| 3209 | |
| 3210 | + if(printtokens) |
| 3211 | + { |
| 3212 | + std::cout << "Tokens:" << std::endl; |
| 3213 | + const auto tokens = tokens_to_run(); |
| 3214 | + for(const auto& token : tokens) |
| 3215 | + { |
| 3216 | + std::cout << token << std::endl; |
| 3217 | + } |
| 3218 | + return EXIT_SUCCESS; |
| 3219 | + } |
| 3220 | + |
| 3221 | auto retval = RUN_ALL_TESTS(); |
| 3222 | |
| 3223 | if(use_fftw_wisdom) |
| 3224 | @@ -749,7 +762,8 @@ int main(int argc, char* argv[]) |
| 3225 | const auto test_minutes |
| 3226 | = std::chrono::duration_cast<std::chrono::minutes>(test_duration - test_hours); |
| 3227 | std::cout << "Test suite took " << test_hours.count() << " hours " << test_minutes.count() |
| 3228 | - << " minutes\n\n"; |
| 3229 | + << " minutes\n" |
| 3230 | + << std::endl; |
| 3231 | |
| 3232 | std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n"; |
| 3233 | std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << "\n"; |
| 3234 | @@ -758,8 +772,12 @@ int main(int argc, char* argv[]) |
| 3235 | std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n"; |
| 3236 | std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << "\n"; |
| 3237 | std::cout << "Number of runtime issues: " << n_hip_failures << "\n"; |
| 3238 | + std::cout << "Number of successful tests: " |
| 3239 | + << ::testing::UnitTest::GetInstance()->successful_test_count() << "\n"; |
| 3240 | + std::cout << "Number of skipped tests: " |
| 3241 | + << ::testing::UnitTest::GetInstance()->skipped_test_count() << "\n"; |
| 3242 | |
| 3243 | - std::cout << "\nRandom seed: " << random_seed << "\n"; |
| 3244 | + std::cout << "\nRandom seed: " << random_seed << std::endl; |
| 3245 | |
| 3246 | return retval; |
| 3247 | } |
| 3248 | diff --git a/clients/tests/hermitian_test.cpp b/clients/tests/hermitian_test.cpp |
| 3249 | index 8a17f6d..6c72b25 100644 |
| 3250 | --- a/clients/tests/hermitian_test.cpp |
| 3251 | +++ b/clients/tests/hermitian_test.cpp |
| 3252 | @@ -20,6 +20,7 @@ |
| 3253 | |
| 3254 | #include "../../shared/accuracy_test.h" |
| 3255 | #include "../../shared/gpubuf.h" |
| 3256 | +#include "../../shared/params_gen.h" |
| 3257 | #include "../../shared/rocfft_params.h" |
| 3258 | #include "../samples/rocfft/examplekernels.h" |
| 3259 | #include "../samples/rocfft/exampleutils.h" |
| 3260 | @@ -171,12 +172,24 @@ void run_1D_hermitian_test(size_t length) |
| 3261 | // test a case that's small enough that it only needs one kernel |
| 3262 | TEST(rocfft_UnitTest, 1D_hermitian_single_small) |
| 3263 | { |
| 3264 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3265 | + > unittest_prob) |
| 3266 | + { |
| 3267 | + GTEST_SKIP(); |
| 3268 | + } |
| 3269 | + |
| 3270 | run_1D_hermitian_test(8); |
| 3271 | } |
| 3272 | |
| 3273 | // test a case that's big enough that it needs multiple kernels |
| 3274 | TEST(rocfft_UnitTest, 1D_hermitian_single_large) |
| 3275 | { |
| 3276 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3277 | + > unittest_prob) |
| 3278 | + { |
| 3279 | + GTEST_SKIP(); |
| 3280 | + } |
| 3281 | + |
| 3282 | run_1D_hermitian_test(8192); |
| 3283 | } |
| 3284 | |
| 3285 | @@ -198,6 +211,12 @@ std::string str(T begin, T end) |
| 3286 | // Test that the GPU Hermitian symmetrizer code produces the correct results. |
| 3287 | TEST(rocfft_UnitTest, gpu_symmetrizer) |
| 3288 | { |
| 3289 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3290 | + > unittest_prob) |
| 3291 | + { |
| 3292 | + GTEST_SKIP(); |
| 3293 | + } |
| 3294 | + |
| 3295 | std::vector<std::vector<size_t>> lengths = {{4, 4, 3}, |
| 3296 | {5}, |
| 3297 | {8}, |
| 3298 | diff --git a/clients/tests/hipGraph_test.cpp b/clients/tests/hipGraph_test.cpp |
| 3299 | index 34f7ed2..da0caa7 100644 |
| 3300 | --- a/clients/tests/hipGraph_test.cpp |
| 3301 | +++ b/clients/tests/hipGraph_test.cpp |
| 3302 | @@ -22,6 +22,7 @@ |
| 3303 | #include "../../shared/arithmetic.h" |
| 3304 | #include "../../shared/gpubuf.h" |
| 3305 | #include "../../shared/hip_object_wrapper.h" |
| 3306 | +#include "../../shared/params_gen.h" |
| 3307 | #include "../../shared/rocfft_against_fftw.h" |
| 3308 | #include "../../shared/rocfft_params.h" |
| 3309 | #include "rocfft/rocfft.h" |
| 3310 | @@ -271,6 +272,12 @@ static void compare_data(const std::vector<rocfft_complex<float>>& original_host |
| 3311 | |
| 3312 | TEST(rocfft_UnitTest, hipGraph_execution) |
| 3313 | { |
| 3314 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3315 | + > unittest_prob) |
| 3316 | + { |
| 3317 | + GTEST_SKIP(); |
| 3318 | + } |
| 3319 | + |
| 3320 | hipGraph_t graph = nullptr; |
| 3321 | hipGraphExec_t graph_exec = nullptr; |
| 3322 | |
| 3323 | @@ -375,7 +382,7 @@ TEST(rocfft_UnitTest, hipGraph_execution) |
| 3324 | ASSERT_EQ(hipGraphLaunch(graph_exec, stream), hipSuccess); |
| 3325 | |
| 3326 | ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess); |
| 3327 | - ASSERT_EQ(hipStreamDestroy(stream), hipSuccess); |
| 3328 | + stream.free(); |
| 3329 | |
| 3330 | // check for correctness of the output data |
| 3331 | compare_data(host_mem_in, device_mem_out); |
| 3332 | @@ -386,5 +393,5 @@ TEST(rocfft_UnitTest, hipGraph_execution) |
| 3333 | fill(host_mem_counter_modified.begin(), host_mem_counter_modified.end(), num_graph_launches); |
| 3334 | compare_data_exact_match<size_t>(other_stream, host_mem_counter_modified, device_mem_counter); |
| 3335 | |
| 3336 | - ASSERT_EQ(hipStreamDestroy(other_stream), hipSuccess); |
| 3337 | + other_stream.free(); |
| 3338 | } |
| 3339 | diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp |
| 3340 | index 9c4b476..3cc3958 100644 |
| 3341 | --- a/clients/tests/multi_device_test.cpp |
| 3342 | +++ b/clients/tests/multi_device_test.cpp |
| 3343 | @@ -49,20 +49,31 @@ enum SplitType |
| 3344 | PENCIL_3D, |
| 3345 | }; |
| 3346 | |
| 3347 | -std::vector<fft_params> param_generator_multi_gpu(const SplitType type) |
| 3348 | +std::vector<fft_params> param_generator_multi_gpu(const SplitType type, const int ngpus) |
| 3349 | { |
| 3350 | int localDeviceCount = 0; |
| 3351 | - (void)hipGetDeviceCount(&localDeviceCount); |
| 3352 | + if(ngpus <= 0) |
| 3353 | + { |
| 3354 | + // Use the command-line option as a priority |
| 3355 | + if(hipGetDeviceCount(&localDeviceCount) != hipSuccess) |
| 3356 | + { |
| 3357 | + throw std::runtime_error("hipGetDeviceCount failed"); |
| 3358 | + } |
| 3359 | + |
| 3360 | + // Limit local device testing to 16 GPUs, as we have some |
| 3361 | + // bottlenecks with larger device counts that unreasonably slow |
| 3362 | + // down plan creation |
| 3363 | + localDeviceCount = std::min<int>(16, localDeviceCount); |
| 3364 | + } |
| 3365 | + else |
| 3366 | + { |
| 3367 | + localDeviceCount = ngpus; |
| 3368 | + } |
| 3369 | |
| 3370 | // need multiple devices or multiprocessing to test anything |
| 3371 | if(localDeviceCount < 2 && mp_lib == fft_params::fft_mp_lib_none) |
| 3372 | return {}; |
| 3373 | |
| 3374 | - // limit local device testing to 16 GPUs, as we have some |
| 3375 | - // bottlenecks with larger device counts that unreasonably slow |
| 3376 | - // down plan creation |
| 3377 | - localDeviceCount = std::min<int>(16, localDeviceCount); |
| 3378 | - |
| 3379 | auto params_complex = param_generator_complex(test_prob, |
| 3380 | multi_gpu_sizes, |
| 3381 | precision_range_sp_dp, |
| 3382 | @@ -165,30 +176,30 @@ std::vector<fft_params> param_generator_multi_gpu(const SplitType type) |
| 3383 | // split both input and output on slowest FFT dim |
| 3384 | INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_dim, |
| 3385 | accuracy_test, |
| 3386 | - ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT)), |
| 3387 | + ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT, ngpus)), |
| 3388 | accuracy_test::TestName); |
| 3389 | |
| 3390 | // split slowest FFT dim only on input, or only on output |
| 3391 | INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_input_dim, |
| 3392 | accuracy_test, |
| 3393 | - ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN)), |
| 3394 | + ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN, ngpus)), |
| 3395 | accuracy_test::TestName); |
| 3396 | INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_output_dim, |
| 3397 | accuracy_test, |
| 3398 | - ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT)), |
| 3399 | + ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT, ngpus)), |
| 3400 | accuracy_test::TestName); |
| 3401 | |
| 3402 | // split input on slowest FFT and output on fastest, to minimize data |
| 3403 | // movement (only makes sense for rank-2 and higher FFTs) |
| 3404 | INSTANTIATE_TEST_SUITE_P(multi_gpu_slowin_fastout, |
| 3405 | accuracy_test, |
| 3406 | - ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT)), |
| 3407 | + ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT, ngpus)), |
| 3408 | accuracy_test::TestName); |
| 3409 | |
| 3410 | // 3D pencil decompositions |
| 3411 | INSTANTIATE_TEST_SUITE_P(multi_gpu_3d_pencils, |
| 3412 | accuracy_test, |
| 3413 | - ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D)), |
| 3414 | + ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D, ngpus)), |
| 3415 | accuracy_test::TestName); |
| 3416 | |
| 3417 | TEST(multi_gpu_validate, catch_validation_errors) |
| 3418 | @@ -204,7 +215,7 @@ TEST(multi_gpu_validate, catch_validation_errors) |
| 3419 | for(auto type : all_split_types) |
| 3420 | { |
| 3421 | // gather all of the multi-GPU test cases |
| 3422 | - auto params = param_generator_multi_gpu(type); |
| 3423 | + auto params = param_generator_multi_gpu(type, ngpus); |
| 3424 | |
| 3425 | for(size_t i = 0; i < params.size(); ++i) |
| 3426 | { |
| 3427 | @@ -284,3 +295,105 @@ TEST(multi_gpu_validate, catch_validation_errors) |
| 3428 | } |
| 3429 | } |
| 3430 | } |
| 3431 | + |
| 3432 | +static const auto multi_gpu_tokens = { |
| 3433 | + // clang-format off |
| 3434 | + |
| 3435 | + // input bricks are not contiguous |
| 3436 | + "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25920_162_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25920_162_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6480_81_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6480_81_1_rank_1_dev_1", |
| 3437 | + // output bricks are not contiguous |
| 3438 | + "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25600_160_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25600_160_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6560_82_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6560_82_1_rank_1_dev_1", |
| 3439 | + // neither input nor output bricks are contiguous |
| 3440 | + "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25920_162_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25920_162_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6560_82_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6560_82_1_rank_1_dev_1", |
| 3441 | + // 1D multi-process batched in-place transform using 1 device per rank |
| 3442 | + "complex_forward_len_256_double_ip_batch_4_ifield_brick_lower_0_0_upper_4_128_stride_128_1_dev_0_brick_lower_0_128_upper_4_256_stride_128_1_rank_1_dev_1_ofield_brick_lower_0_0_upper_4_128_stride_128_1_dev_0_brick_lower_0_128_upper_4_256_stride_128_1_rank_1_dev_1", |
| 3443 | + // 2D multi-process out-of-place transform using 2 MPI ranks each with 2 GPUs |
| 3444 | + "complex_forward_len_128_256_single_op_batch_1_ifield_brick_lower_0_0_0_upper_1_128_64_stride_8192_64_1_dev_0_brick_lower_0_0_64_upper_1_128_128_stride_8192_64_1_rank_1_dev_1_brick_lower_0_0_128_upper_1_128_192_stride_8192_64_1_rank_0_dev_2_brick_lower_0_0_192_upper_1_128_256_stride_8192_64_1_rank_1_dev_3_ofield_brick_lower_0_0_0_upper_1_128_64_stride_8192_64_1_dev_0_brick_lower_0_0_64_upper_1_128_128_stride_8192_64_1_rank_1_dev_1_brick_lower_0_0_128_upper_1_128_192_stride_8192_64_1_rank_0_dev_2_brick_lower_0_0_192_upper_1_128_256_stride_8192_64_1_rank_1_dev_3", |
| 3445 | + // 3D multi-process out-of-place transform using 2 MPI ranks each with 2 GPUs |
| 3446 | + "complex_forward_len_256_256_256_double_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_64_256_256_stride_4194304_65536_256_1_dev_0_brick_lower_0_64_0_0_upper_1_128_256_256_stride_4194304_65536_256_1_rank_0_dev_1_brick_lower_0_128_0_0_upper_1_192_256_256_stride_4194304_65536_256_1_rank_1_dev_2_brick_lower_0_192_0_0_upper_1_256_256_256_stride_4194304_65536_256_1_rank_1_dev_3_ofield_brick_lower_0_0_0_0_upper_1_256_256_64_stride_4194304_16384_64_1_dev_0_brick_lower_0_0_0_64_upper_1_256_256_128_stride_4194304_16384_64_1_rank_0_dev_1_brick_lower_0_0_0_128_upper_1_256_256_192_stride_4194304_16384_64_1_rank_1_dev_2_brick_lower_0_0_0_192_upper_1_256_256_256_stride_4194304_16384_64_1_rank_1_dev_3", |
| 3447 | + // 3D multi-process batched in-place transform using 2 MPI ranks each with 2 GPUs |
| 3448 | + "complex_forward_len_128_300_256_single_op_batch_4_ifield_brick_lower_0_0_0_0_upper_4_32_300_256_stride_2457600_76800_256_1_dev_0_brick_lower_0_32_0_0_upper_4_64_300_256_stride_2457600_76800_256_1_rank_1_dev_1_brick_lower_0_64_0_0_upper_4_96_300_256_stride_2457600_76800_256_1_rank_0_dev_2_brick_lower_0_96_0_0_upper_4_128_300_256_stride_2457600_76800_256_1_rank_1_dev_3_ofield_brick_lower_0_0_0_0_upper_4_128_300_64_stride_2457600_19200_64_1_dev_0_brick_lower_0_0_0_64_upper_4_128_300_128_stride_2457600_19200_64_1_rank_1_dev_1_brick_lower_0_0_0_128_upper_4_128_300_192_stride_2457600_19200_64_1_rank_0_dev_2_brick_lower_0_0_0_192_upper_4_128_300_256_stride_2457600_19200_64_1_rank_1_dev_3 ", |
| 3449 | + |
| 3450 | + // clang-format on |
| 3451 | +}; |
| 3452 | + |
| 3453 | +std::vector<fft_params> param_generator_multi_gpu_adhoc() |
| 3454 | +{ |
| 3455 | + int localDeviceCount = 0; |
| 3456 | + if(ngpus <= 0) |
| 3457 | + { |
| 3458 | + // Use the command-line option as a priority |
| 3459 | + if(hipGetDeviceCount(&localDeviceCount) != hipSuccess) |
| 3460 | + { |
| 3461 | + throw std::runtime_error("hipGetDeviceCount failed"); |
| 3462 | + } |
| 3463 | + |
| 3464 | + // Limit local device testing to 16 GPUs, as we have some |
| 3465 | + // bottlenecks with larger device counts that unreasonably slow |
| 3466 | + // down plan creation |
| 3467 | + localDeviceCount = std::min<int>(16, localDeviceCount); |
| 3468 | + } |
| 3469 | + else |
| 3470 | + { |
| 3471 | + localDeviceCount = ngpus; |
| 3472 | + } |
| 3473 | + |
| 3474 | + auto all_params = param_generator_token(test_prob, multi_gpu_tokens); |
| 3475 | + |
| 3476 | + // check if fields use more bricks than we can support |
| 3477 | + auto too_many_bricks = [=](const std::vector<fft_params::fft_field>& fields, size_t maxBricks) { |
| 3478 | + for(const auto& f : fields) |
| 3479 | + { |
| 3480 | + if(f.bricks.size() > maxBricks) |
| 3481 | + return true; |
| 3482 | + |
| 3483 | + // also remove a test case if it uses a numbered device |
| 3484 | + // that isn't available |
| 3485 | + if(std::any_of(f.bricks.begin(), f.bricks.end(), [=](const fft_params::fft_brick& b) { |
| 3486 | + return b.device >= localDeviceCount; |
| 3487 | + })) |
| 3488 | + return true; |
| 3489 | + } |
| 3490 | + return false; |
| 3491 | + }; |
| 3492 | + |
| 3493 | + // remove test cases where we don't have enough ranks/devices for |
| 3494 | + // the number of bricks |
| 3495 | + all_params.erase(std::remove_if(all_params.begin(), |
| 3496 | + all_params.end(), |
| 3497 | + [=](const fft_params& params) { |
| 3498 | + size_t maxBricks = mp_lib == fft_params::fft_mp_lib_mpi |
| 3499 | + ? mp_ranks |
| 3500 | + : localDeviceCount; |
| 3501 | + return too_many_bricks(params.ifields, maxBricks) |
| 3502 | + || too_many_bricks(params.ofields, maxBricks); |
| 3503 | + }), |
| 3504 | + all_params.end()); |
| 3505 | + |
| 3506 | + // set all bricks in a field to rank-0, to change an MPI test |
| 3507 | + // case to single-proc |
| 3508 | + auto set_rank_0 = [](std::vector<fft_params::fft_field>& fields) { |
| 3509 | + for(auto& f : fields) |
| 3510 | + { |
| 3511 | + for(auto& b : f.bricks) |
| 3512 | + b.rank = 0; |
| 3513 | + } |
| 3514 | + }; |
| 3515 | + |
| 3516 | + // modify the remaining test cases to use the current multi-GPU lib |
| 3517 | + for(auto& params : all_params) |
| 3518 | + { |
| 3519 | + params.mp_lib = mp_lib; |
| 3520 | + if(mp_lib == fft_params::fft_mp_lib_none) |
| 3521 | + { |
| 3522 | + set_rank_0(params.ifields); |
| 3523 | + set_rank_0(params.ofields); |
| 3524 | + } |
| 3525 | + } |
| 3526 | + return all_params; |
| 3527 | +} |
| 3528 | + |
| 3529 | +INSTANTIATE_TEST_SUITE_P(multi_gpu_adhoc_token, |
| 3530 | + accuracy_test, |
| 3531 | + ::testing::ValuesIn(param_generator_multi_gpu_adhoc()), |
| 3532 | + accuracy_test::TestName); |
| 3533 | diff --git a/clients/tests/multithread_test.cpp b/clients/tests/multithread_test.cpp |
| 3534 | index 5ec9a5f..3b6cd5e 100644 |
| 3535 | --- a/clients/tests/multithread_test.cpp |
| 3536 | +++ b/clients/tests/multithread_test.cpp |
| 3537 | @@ -21,6 +21,7 @@ |
| 3538 | #include "../../shared/accuracy_test.h" |
| 3539 | #include "../../shared/gpubuf.h" |
| 3540 | #include "../../shared/hip_object_wrapper.h" |
| 3541 | +#include "../../shared/params_gen.h" |
| 3542 | #include "../../shared/rocfft_against_fftw.h" |
| 3543 | #include "../../shared/rocfft_params.h" |
| 3544 | #include "rocfft/rocfft.h" |
| 3545 | @@ -322,30 +323,66 @@ static void multistream_transform(size_t N, size_t dim, size_t num_streams) |
| 3546 | // fitting into e.g. 8 GB of GPU memory |
| 3547 | TEST(DISABLED_rocfft_UnitTest, simple_multithread_1D) |
| 3548 | { |
| 3549 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3550 | + > unittest_prob) |
| 3551 | + { |
| 3552 | + GTEST_SKIP(); |
| 3553 | + } |
| 3554 | + |
| 3555 | multithread_transform(1048576, 1, 64); |
| 3556 | } |
| 3557 | |
| 3558 | TEST(DISABLED_rocfft_UnitTest, simple_multithread_2D) |
| 3559 | { |
| 3560 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3561 | + > unittest_prob) |
| 3562 | + { |
| 3563 | + GTEST_SKIP(); |
| 3564 | + } |
| 3565 | + |
| 3566 | multithread_transform(1024, 2, 64); |
| 3567 | } |
| 3568 | |
| 3569 | TEST(DISABLED_rocfft_UnitTest, simple_multithread_3D) |
| 3570 | { |
| 3571 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3572 | + > unittest_prob) |
| 3573 | + { |
| 3574 | + GTEST_SKIP(); |
| 3575 | + } |
| 3576 | + |
| 3577 | multithread_transform(128, 3, 40); |
| 3578 | } |
| 3579 | |
| 3580 | TEST(rocfft_UnitTest, simple_multistream_1D) |
| 3581 | { |
| 3582 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3583 | + > unittest_prob) |
| 3584 | + { |
| 3585 | + GTEST_SKIP(); |
| 3586 | + } |
| 3587 | + |
| 3588 | multistream_transform(1048576, 1, 32); |
| 3589 | } |
| 3590 | |
| 3591 | TEST(rocfft_UnitTest, simple_multistream_2D) |
| 3592 | { |
| 3593 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3594 | + > unittest_prob) |
| 3595 | + { |
| 3596 | + GTEST_SKIP(); |
| 3597 | + } |
| 3598 | + |
| 3599 | multistream_transform(1024, 2, 32); |
| 3600 | } |
| 3601 | |
| 3602 | TEST(rocfft_UnitTest, simple_multistream_3D) |
| 3603 | { |
| 3604 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3605 | + > unittest_prob) |
| 3606 | + { |
| 3607 | + GTEST_SKIP(); |
| 3608 | + } |
| 3609 | + |
| 3610 | multistream_transform(128, 3, 32); |
| 3611 | } |
| 3612 | diff --git a/clients/tests/unit_test.cpp b/clients/tests/unit_test.cpp |
| 3613 | index 1b9b970..6ee83a2 100644 |
| 3614 | --- a/clients/tests/unit_test.cpp |
| 3615 | +++ b/clients/tests/unit_test.cpp |
| 3616 | @@ -23,10 +23,13 @@ |
| 3617 | #include "../../shared/concurrency.h" |
| 3618 | #include "../../shared/environment.h" |
| 3619 | #include "../../shared/gpubuf.h" |
| 3620 | +#include "../../shared/params_gen.h" |
| 3621 | +#include "../../shared/precision_type.h" |
| 3622 | #include "../../shared/rocfft_complex.h" |
| 3623 | #include "hip/hip_runtime_api.h" |
| 3624 | #include <boost/scope_exit.hpp> |
| 3625 | #include <condition_variable> |
| 3626 | +#include <cstdio> |
| 3627 | #include <cstdlib> |
| 3628 | #include <fstream> |
| 3629 | #include <gtest/gtest.h> |
| 3630 | @@ -58,6 +61,12 @@ namespace fs = std::filesystem; |
| 3631 | |
| 3632 | TEST(rocfft_UnitTest, plan_description) |
| 3633 | { |
| 3634 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3635 | + > unittest_prob) |
| 3636 | + { |
| 3637 | + GTEST_SKIP(); |
| 3638 | + } |
| 3639 | + |
| 3640 | rocfft_plan_description desc = nullptr; |
| 3641 | ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_create(&desc)); |
| 3642 | |
| 3643 | @@ -106,6 +115,12 @@ TEST(rocfft_UnitTest, plan_description_reuse) |
| 3644 | // check that a plan description can be reused between different |
| 3645 | // plans, with different layout parameters for each. |
| 3646 | |
| 3647 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3648 | + > unittest_prob) |
| 3649 | + { |
| 3650 | + GTEST_SKIP(); |
| 3651 | + } |
| 3652 | + |
| 3653 | // allocate plan description once |
| 3654 | rocfft_plan_description desc = nullptr; |
| 3655 | ASSERT_EQ(rocfft_plan_description_create(&desc), rocfft_status_success); |
| 3656 | @@ -191,9 +206,92 @@ TEST(rocfft_UnitTest, plan_description_reuse) |
| 3657 | ASSERT_EQ(rocfft_plan_description_destroy(desc), rocfft_status_success); |
| 3658 | } |
| 3659 | |
| 3660 | +// run a transform with all log levels enabled |
| 3661 | +TEST(rocfft_UnitTest, log_levels) |
| 3662 | +{ |
| 3663 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3664 | + > unittest_prob) |
| 3665 | + { |
| 3666 | + GTEST_SKIP(); |
| 3667 | + } |
| 3668 | + |
| 3669 | + // clean up environment and temporary file when we exit |
| 3670 | + BOOST_SCOPE_EXIT_ALL(=) |
| 3671 | + { |
| 3672 | + rocfft_cleanup(); |
| 3673 | + // re-init logs with default logging |
| 3674 | + rocfft_setup(); |
| 3675 | + }; |
| 3676 | + rocfft_cleanup(); |
| 3677 | + |
| 3678 | + // enumerate all known log levels and direct all of the logs to nowhere |
| 3679 | + EnvironmentSetTemp layer("ROCFFT_LAYER", std::to_string(0xffffffff).c_str()); |
| 3680 | +#ifdef WIN32 |
| 3681 | + static const char* log_output = "NUL"; |
| 3682 | +#else |
| 3683 | + static const char* log_output = "/dev/null"; |
| 3684 | +#endif |
| 3685 | + EnvironmentSetTemp log_trace_path("ROCFFT_LOG_TRACE_PATH", log_output); |
| 3686 | + EnvironmentSetTemp log_bench_path("ROCFFT_LOG_BENCH_PATH", log_output); |
| 3687 | + EnvironmentSetTemp log_profile_path("ROCFFT_LOG_PROFILE_PATH", log_output); |
| 3688 | + EnvironmentSetTemp log_plan_path("ROCFFT_LOG_PLAN_PATH", log_output); |
| 3689 | + EnvironmentSetTemp log_kernelio_path("ROCFFT_LOG_KERNELIO_PATH", log_output); |
| 3690 | + EnvironmentSetTemp log_rtc_path("ROCFFT_LOG_RTC_PATH", log_output); |
| 3691 | + EnvironmentSetTemp log_tuning_path("ROCFFT_LOG_TUNING_PATH", log_output); |
| 3692 | + EnvironmentSetTemp log_graph_path("ROCFFT_LOG_GRAPH_PATH", log_output); |
| 3693 | + |
| 3694 | + rocfft_setup(); |
| 3695 | + |
| 3696 | + // Test single-kernel Bluestein and a multi-kernel plan |
| 3697 | + // |
| 3698 | + // TODO: add fused L1D Bluestein case like 8191, as that does weird |
| 3699 | + // things with buffers |
| 3700 | + for(const size_t length : { |
| 3701 | + 37, |
| 3702 | + 64, |
| 3703 | + 32768, |
| 3704 | + }) |
| 3705 | + { |
| 3706 | + for(const auto type : {rocfft_transform_type_complex_forward, |
| 3707 | + rocfft_transform_type_real_forward, |
| 3708 | + rocfft_transform_type_real_inverse}) |
| 3709 | + { |
| 3710 | + for(const auto precision : |
| 3711 | + {rocfft_precision_single, rocfft_precision_double, rocfft_precision_half}) |
| 3712 | + { |
| 3713 | + rocfft_plan plan = nullptr; |
| 3714 | + ASSERT_EQ( |
| 3715 | + rocfft_plan_create( |
| 3716 | + &plan, rocfft_placement_inplace, type, precision, 1, &length, 1, nullptr), |
| 3717 | + rocfft_status_success); |
| 3718 | + |
| 3719 | + // assume transform uses complex, will overallocate for real |
| 3720 | + // transforms but we only care about logging |
| 3721 | + gpubuf data_dev; |
| 3722 | + ASSERT_EQ( |
| 3723 | + data_dev.alloc(element_size(precision, rocfft_array_type_complex_interleaved) |
| 3724 | + * length), |
| 3725 | + hipSuccess); |
| 3726 | + |
| 3727 | + void* data_dev_ptr = data_dev.data(); |
| 3728 | + ASSERT_EQ(rocfft_execute(plan, &data_dev_ptr, nullptr, nullptr), |
| 3729 | + rocfft_status_success); |
| 3730 | + |
| 3731 | + rocfft_plan_destroy(plan); |
| 3732 | + } |
| 3733 | + } |
| 3734 | + } |
| 3735 | +} |
| 3736 | + |
| 3737 | // Check whether logs can be emitted from multiple threads properly |
| 3738 | TEST(rocfft_UnitTest, log_multithreading) |
| 3739 | { |
| 3740 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3741 | + > unittest_prob) |
| 3742 | + { |
| 3743 | + GTEST_SKIP(); |
| 3744 | + } |
| 3745 | + |
| 3746 | static const int NUM_THREADS = 10; |
| 3747 | static const int NUM_ITERS_PER_THREAD = 50; |
| 3748 | static const char* TRACE_FILE = "trace.log"; |
| 3749 | @@ -321,18 +419,36 @@ void workmem_test(workmem_sizer sizer, |
| 3750 | // - library should allocate |
| 3751 | TEST(rocfft_UnitTest, workmem_missing) |
| 3752 | { |
| 3753 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3754 | + > unittest_prob) |
| 3755 | + { |
| 3756 | + GTEST_SKIP(); |
| 3757 | + } |
| 3758 | + |
| 3759 | workmem_test([](size_t) { return 0; }, rocfft_status_success); |
| 3760 | } |
| 3761 | |
| 3762 | // check what happens if work memory is required but not enough is provided |
| 3763 | TEST(rocfft_UnitTest, workmem_small) |
| 3764 | { |
| 3765 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3766 | + > unittest_prob) |
| 3767 | + { |
| 3768 | + GTEST_SKIP(); |
| 3769 | + } |
| 3770 | + |
| 3771 | workmem_test([](size_t requested) { return requested / 2; }, rocfft_status_invalid_work_buffer); |
| 3772 | } |
| 3773 | |
| 3774 | // hard to imagine this being a problem, but try giving too much as well |
| 3775 | TEST(rocfft_UnitTest, workmem_big) |
| 3776 | { |
| 3777 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3778 | + > unittest_prob) |
| 3779 | + { |
| 3780 | + GTEST_SKIP(); |
| 3781 | + } |
| 3782 | + |
| 3783 | workmem_test([](size_t requested) { return requested * 2; }, rocfft_status_success); |
| 3784 | } |
| 3785 | |
| 3786 | @@ -341,13 +457,25 @@ TEST(rocfft_UnitTest, workmem_big) |
| 3787 | // allocates |
| 3788 | TEST(rocfft_UnitTest, workmem_null) |
| 3789 | { |
| 3790 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3791 | + > unittest_prob) |
| 3792 | + { |
| 3793 | + GTEST_SKIP(); |
| 3794 | + } |
| 3795 | + |
| 3796 | workmem_test([](size_t requested) { return requested; }, rocfft_status_success, true); |
| 3797 | } |
| 3798 | |
| 3799 | static const size_t RTC_PROBLEM_SIZE = 2304; |
| 3800 | -// runtime compilation cache tests |
| 3801 | -TEST(rocfft_UnitTest, rtc_cache) |
| 3802 | +// runtime compilation cache tests main loop |
| 3803 | +void rtc_cache_main() |
| 3804 | { |
| 3805 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3806 | + > unittest_prob) |
| 3807 | + { |
| 3808 | + GTEST_SKIP(); |
| 3809 | + } |
| 3810 | + |
| 3811 | // PRECONDITIONS |
| 3812 | |
| 3813 | // - set cache location to custom path, requires uninitializing |
| 3814 | @@ -494,9 +622,27 @@ TEST(rocfft_UnitTest, rtc_cache) |
| 3815 | ASSERT_TRUE(fft_kernel_was_compiled()); |
| 3816 | } |
| 3817 | |
| 3818 | +// run the main body of rtc cache tests twice to uncover potential |
| 3819 | +// problems with thread reuse between iterations |
| 3820 | +TEST(rocfft_UnitTest, rtc_cache_iter_1) |
| 3821 | +{ |
| 3822 | + rtc_cache_main(); |
| 3823 | +} |
| 3824 | + |
| 3825 | +TEST(rocfft_UnitTest, rtc_cache_iter_2) |
| 3826 | +{ |
| 3827 | + rtc_cache_main(); |
| 3828 | +} |
| 3829 | + |
| 3830 | // make sure cache API functions tolerate null pointers without crashing |
| 3831 | TEST(rocfft_UnitTest, rtc_cache_null) |
| 3832 | { |
| 3833 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3834 | + > unittest_prob) |
| 3835 | + { |
| 3836 | + GTEST_SKIP(); |
| 3837 | + } |
| 3838 | + |
| 3839 | void* buf = nullptr; |
| 3840 | size_t buf_len = 0; |
| 3841 | ASSERT_EQ(rocfft_cache_serialize(nullptr, &buf_len), rocfft_status_invalid_arg_value); |
| 3842 | @@ -509,6 +655,12 @@ TEST(rocfft_UnitTest, rtc_cache_null) |
| 3843 | // make sure RTC gracefully handles a helper process that crashes |
| 3844 | TEST(rocfft_UnitTest, rtc_helper_crash) |
| 3845 | { |
| 3846 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3847 | + > unittest_prob) |
| 3848 | + { |
| 3849 | + GTEST_SKIP(); |
| 3850 | + } |
| 3851 | + |
| 3852 | #ifdef WIN32 |
| 3853 | char filename[MAX_PATH]; |
| 3854 | GetModuleFileNameA(NULL, filename, MAX_PATH); |
| 3855 | @@ -574,6 +726,12 @@ TEST(rocfft_UnitTest, rtc_helper_crash) |
| 3856 | |
| 3857 | TEST(rocfft_UnitTest, rtc_test_harness) |
| 3858 | { |
| 3859 | + if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) |
| 3860 | + > unittest_prob) |
| 3861 | + { |
| 3862 | + GTEST_SKIP(); |
| 3863 | + } |
| 3864 | + |
| 3865 | // check that hipcc is available since this test requires it |
| 3866 | // |
| 3867 | // NOTE: using system() for launching subprocesses for simplicity |
| 3868 | @@ -597,8 +755,17 @@ TEST(rocfft_UnitTest, rtc_test_harness) |
| 3869 | |
| 3870 | // extra scope to control lifetime of env vars |
| 3871 | { |
| 3872 | - // rtc test harness writes to system's temp directory |
| 3873 | - auto tmp_path = fs::temp_directory_path(); |
| 3874 | + // create a temporary directory to hold all of the temp files |
| 3875 | + // that get created |
| 3876 | + const fs::path tmp_path = std::tmpnam(nullptr); |
| 3877 | + try |
| 3878 | + { |
| 3879 | + fs::create_directory(tmp_path); |
| 3880 | + } |
| 3881 | + catch(fs::filesystem_error& e) |
| 3882 | + { |
| 3883 | + GTEST_SKIP() << "unable to create temp dir for test harnesses: " << e.what(); |
| 3884 | + } |
| 3885 | |
| 3886 | // activate writing of rtc test harnesses |
| 3887 | EnvironmentSetTemp env_harness("ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS", "1"); |
| 3888 | @@ -613,17 +780,6 @@ TEST(rocfft_UnitTest, rtc_test_harness) |
| 3889 | |
| 3890 | rocfft_setup(); |
| 3891 | |
| 3892 | - // ensure stale files from previous runs of this test won't cause |
| 3893 | - // problems - clean up any rocfft_kernel_harness_*.cpp files that |
| 3894 | - // might be left behind |
| 3895 | - for(const auto& entry : std::filesystem::directory_iterator{tmp_path}) |
| 3896 | - { |
| 3897 | - auto filename = entry.path().filename(); |
| 3898 | - if(filename.string().compare(0, 22, "rocfft_kernel_harness_") == 0 |
| 3899 | - && filename.extension().string() == ".cpp") |
| 3900 | - fs::remove(entry); |
| 3901 | - } |
| 3902 | - |
| 3903 | // construct a few different types of plans to try to get all |
| 3904 | // different kernels compiled |
| 3905 | |
| 3906 | @@ -704,6 +860,19 @@ TEST(rocfft_UnitTest, rtc_test_harness) |
| 3907 | |
| 3908 | // check that all compiles succeeded |
| 3909 | for(const auto& file : files) |
| 3910 | + { |
| 3911 | ASSERT_EQ(file.second, 0); |
| 3912 | + } |
| 3913 | + |
| 3914 | + // clean up temporary files |
| 3915 | + try |
| 3916 | + { |
| 3917 | + fs::remove_all(tmp_path); |
| 3918 | + } |
| 3919 | + catch(fs::filesystem_error&) |
| 3920 | + { |
| 3921 | + // this should work, but ignore errors as the build |
| 3922 | + // status is what matters for this test |
| 3923 | + } |
| 3924 | } |
| 3925 | } |
| 3926 | diff --git a/cmake/sqlite.cmake b/cmake/sqlite.cmake |
| 3927 | index 098be8f..2dc2a28 100644 |
| 3928 | --- a/cmake/sqlite.cmake |
| 3929 | +++ b/cmake/sqlite.cmake |
| 3930 | @@ -1,4 +1,4 @@ |
| 3931 | -# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. |
| 3932 | +# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. |
| 3933 | # |
| 3934 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3935 | # of this software and associated documentation files (the "Software"), to deal |
| 3936 | @@ -20,35 +20,36 @@ |
| 3937 | |
| 3938 | include( ExternalProject ) |
| 3939 | |
| 3940 | -# SQLite 3.36.0 enabled the backup API by default, which we need |
| 3941 | -# for cache serialization. We also want to use a static SQLite, |
| 3942 | -# and distro static libraries aren't typically built |
| 3943 | -# position-independent. |
| 3944 | +# SQLite is used for rtc_cache. Require a safe baseline (>= 3.50.2). |
| 3945 | +# Note: the backup API we rely on has been enabled by default since 3.36.0. |
| 3946 | option( SQLITE_USE_SYSTEM_PACKAGE "Use SQLite3 from find_package" OFF ) |
| 3947 | |
| 3948 | if( SQLITE_USE_SYSTEM_PACKAGE ) |
| 3949 | - find_package(SQLite3 3.36 REQUIRED) |
| 3950 | + # Require a safe baseline (fixes truncation/memory-corruption issues). |
| 3951 | + find_package(SQLite3 3.50.2 REQUIRED) |
| 3952 | list(APPEND static_depends PACKAGE SQLite3) |
| 3953 | set(ROCFFT_SQLITE_LIB SQLite::SQLite3) |
| 3954 | else() |
| 3955 | include( FetchContent ) |
| 3956 | |
| 3957 | - if(DEFINED ENV{SQLITE_3_43_2_SRC_URL}) |
| 3958 | - set(SQLITE_3_43_2_SRC_URL_INIT $ENV{SQLITE_3_43_2_SRC_URL}) |
| 3959 | + # embed SQLite amalgamation (version 3.50.2 -> serial 3500200). |
| 3960 | + # allow override via environment variable for mirrors/airgapped builds. |
| 3961 | + if(DEFINED ENV{SQLITE_3_50_2_SRC_URL}) |
| 3962 | + set(SQLITE_3_50_2_SRC_URL_INIT $ENV{SQLITE_3_50_2_SRC_URL}) |
| 3963 | else() |
| 3964 | - set(SQLITE_3_43_2_SRC_URL_INIT https://www.sqlite.org/2023/sqlite-amalgamation-3430200.zip) |
| 3965 | + set(SQLITE_3_50_2_SRC_URL_INIT https://www.sqlite.org/2025/sqlite-amalgamation-3500200.zip) |
| 3966 | endif() |
| 3967 | - set(SQLITE_3_43_2_SRC_URL ${SQLITE_3_43_2_SRC_URL_INIT} CACHE STRING "Location of SQLite source code") |
| 3968 | - set(SQLITE_SRC_3_43_2_SHA3_256 af02b88cc922e7506c6659737560c0756deee24e4e7741d4b315af341edd8b40 CACHE STRING "SHA3-256 hash of SQLite source code") |
| 3969 | + set(SQLITE_3_50_2_SRC_URL ${SQLITE_3_50_2_SRC_URL_INIT} CACHE STRING "Location of SQLite source code") |
| 3970 | + set(SQLITE_SRC_3_50_2_SHA3_256 75c118e727ee6a9a3d2c0e7c577500b0c16a848d109027f087b915b671f61f8a CACHE STRING "SHA3-256 hash of SQLite source code") |
| 3971 | |
| 3972 | - # embed SQLite |
| 3973 | + # use extract timestamp for fetched files instead of timestamps in the archive |
| 3974 | if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24) |
| 3975 | - # use extract timestamp for fetched files instead of timestamps in the archive |
| 3976 | cmake_policy(SET CMP0135 NEW) |
| 3977 | endif() |
| 3978 | + |
| 3979 | FetchContent_Declare(sqlite_local |
| 3980 | - URL ${SQLITE_3_43_2_SRC_URL} |
| 3981 | - URL_HASH SHA3_256=${SQLITE_SRC_3_43_2_SHA3_256} |
| 3982 | + URL ${SQLITE_3_50_2_SRC_URL} |
| 3983 | + URL_HASH SHA3_256=${SQLITE_SRC_3_50_2_SHA3_256} |
| 3984 | ) |
| 3985 | FetchContent_MakeAvailable(sqlite_local) |
| 3986 | |
| 3987 | diff --git a/debian/changelog b/debian/changelog |
| 3988 | index 4aea252..4c9901e 100644 |
| 3989 | --- a/debian/changelog |
| 3990 | +++ b/debian/changelog |
| 3991 | @@ -1,3 +1,27 @@ |
| 3992 | +rocfft (7.1.0-0ubuntu1) resolute; urgency=medium |
| 3993 | + |
| 3994 | + [Talha Can Havadar] |
| 3995 | + * New upstream version 7.1.0 (LP: #2139240) |
| 3996 | + * d/control: remove ppc64el builds (LP: #2134241) |
| 3997 | + * d/control: update b-d to use upstream llvm based toolchain |
| 3998 | + * d/gbp.conf: update debian branch to ubuntu/devel |
| 3999 | + * d/patches/fix-hiprtc-link.patch: is already upstreamed |
| 4000 | + * d/p/Extend-docs-conf.py-for-offline-build.patch: fix the hunk |
| 4001 | + * d/p/do-not-strictly-depend-on-sqlite3-v3.50.2.patch: no real hard |
| 4002 | + requirement for build to fail |
| 4003 | + * d/librocfft-dev.examples: removed from upstream |
| 4004 | + * Changes by Igor Luppi made on top of 6.4.3-1~exp1 listed below to keep |
| 4005 | + history intact |
| 4006 | + |
| 4007 | + [Igor Luppi] |
| 4008 | + * d/rules: use GPU_TARGETS instead of deprecated AMDGPU_TARGETS |
| 4009 | + * d/control: remove libamdhip64-dev since hipcc-rocm is used |
| 4010 | + * d/control: update build-depends |
| 4011 | + * d/control: update maintainer field |
| 4012 | + * d/rules: fix FTBFS by adding -Wl,--gc-sections to flags |
| 4013 | + |
| 4014 | + -- Talha Can Havadar <talha.can.havadar@canonical.com> Fri, 23 Jan 2026 10:09:10 +0100 |
| 4015 | + |
| 4016 | rocfft (6.4.3-1) unstable; urgency=medium |
| 4017 | |
| 4018 | [ Cordell Bloor ] |
| 4019 | diff --git a/debian/control b/debian/control |
| 4020 | index a58bcfd..56faddd 100644 |
| 4021 | --- a/debian/control |
| 4022 | +++ b/debian/control |
| 4023 | @@ -5,25 +5,25 @@ Priority: optional |
| 4024 | Standards-Version: 4.7.2 |
| 4025 | Vcs-Git: https://salsa.debian.org/rocm-team/rocfft.git |
| 4026 | Vcs-Browser: https://salsa.debian.org/rocm-team/rocfft |
| 4027 | -Maintainer: Debian ROCm Team <debian-ai@lists.debian.org> |
| 4028 | -Uploaders: Cordell Bloor <cgmb@debian.org>, |
| 4029 | +Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com> |
| 4030 | +XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org> |
| 4031 | +Uploaders: Maxime Chambonnet <maxzor@maxzor.eu>, |
| 4032 | + Cordell Bloor <cgmb@debian.org>, |
| 4033 | Christian Kastner <ckk@debian.org>, |
| 4034 | Kari Pahula <kaol@debian.org> |
| 4035 | Build-Depends: debhelper-compat (= 13), |
| 4036 | cmake, |
| 4037 | - hipcc (>= 5.6.1~), |
| 4038 | -# end |
| 4039 | - libamd-comgr-dev (>= 6.0~), |
| 4040 | -# ckk 2024-03-02: temporary until hipcc question is resolved: |
| 4041 | - libamdhip64-dev (>= 5.6.1~), |
| 4042 | + hipcc (>= 7.1.0~), |
| 4043 | + libamd-comgr-dev (>= 7.1.0~), |
| 4044 | libboost-program-options-dev, |
| 4045 | libfftw3-dev, |
| 4046 | libgtest-dev <!nocheck>, |
| 4047 | - libhsa-runtime-dev (>= 5.7.1~), |
| 4048 | + libhiprand-dev (>= 7.1.0~), |
| 4049 | + libhsa-runtime-dev (>= 7.1.0~), |
| 4050 | libsqlite3-dev, |
| 4051 | - pkg-rocm-tools (>= 0.9.0~exp3), |
| 4052 | + pkg-rocm-tools (>= 0.9.0), |
| 4053 | python3-dev, |
| 4054 | - rocm-cmake (>= 5.3.0) |
| 4055 | + rocm-cmake (>= 7.1.0) |
| 4056 | Build-Depends-Indep: dh-sequence-sphinxdoc <!nodoc>, |
| 4057 | libjs-jquery <!nodoc>, |
| 4058 | libjs-mathjax <!nodoc>, |
| 4059 | @@ -36,7 +36,7 @@ Rules-Requires-Root: no |
| 4060 | |
| 4061 | Package: librocfft0 |
| 4062 | Section: libs |
| 4063 | -Architecture: amd64 arm64 ppc64el |
| 4064 | +Architecture: amd64 arm64 |
| 4065 | Multi-Arch: same |
| 4066 | XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture} |
| 4067 | Depends: ${misc:Depends}, ${shlibs:Depends} |
| 4068 | @@ -53,7 +53,7 @@ Description: ROCm library for computing Fast Fourier Transforms - library |
| 4069 | |
| 4070 | Package: librocfft-dev |
| 4071 | Section: libdevel |
| 4072 | -Architecture: amd64 arm64 ppc64el |
| 4073 | +Architecture: amd64 arm64 |
| 4074 | Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} |
| 4075 | Recommends: libamdhip64-dev |
| 4076 | Suggests: librocfft-doc |
| 4077 | @@ -70,7 +70,7 @@ Description: ROCm library for computing Fast Fourier Transforms - headers |
| 4078 | |
| 4079 | Package: librocfft0-tests |
| 4080 | Section: libdevel |
| 4081 | -Architecture: amd64 arm64 ppc64el |
| 4082 | +Architecture: amd64 arm64 |
| 4083 | Build-Profiles: <!nocheck> |
| 4084 | Depends: librocfft0 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} |
| 4085 | Description: ROCm library for computing Fast Fourier Transforms - tests |
| 4086 | diff --git a/debian/gbp.conf b/debian/gbp.conf |
| 4087 | index 357a581..2f75208 100644 |
| 4088 | --- a/debian/gbp.conf |
| 4089 | +++ b/debian/gbp.conf |
| 4090 | @@ -1,5 +1,5 @@ |
| 4091 | [DEFAULT] |
| 4092 | -debian-branch = debian/unstable |
| 4093 | +debian-branch = ubuntu/devel |
| 4094 | upstream-branch = upstream/latest |
| 4095 | pristine-tar = True |
| 4096 | |
| 4097 | diff --git a/debian/librocfft-dev.examples b/debian/librocfft-dev.examples |
| 4098 | deleted file mode 100644 |
| 4099 | index a8f30e1..0000000 |
| 4100 | --- a/debian/librocfft-dev.examples |
| 4101 | +++ /dev/null |
| 4102 | @@ -1 +0,0 @@ |
| 4103 | -docs/samples/*.cpp |
| 4104 | diff --git a/debian/patches/Extend-docs-conf.py-for-offline-build.patch b/debian/patches/Extend-docs-conf.py-for-offline-build.patch |
| 4105 | index be118d7..791856a 100644 |
| 4106 | --- a/debian/patches/Extend-docs-conf.py-for-offline-build.patch |
| 4107 | +++ b/debian/patches/Extend-docs-conf.py-for-offline-build.patch |
| 4108 | @@ -8,16 +8,14 @@ would cause the build to fail. |
| 4109 | docs/conf.py | 2 ++ |
| 4110 | 1 file changed, 2 insertions(+) |
| 4111 | |
| 4112 | -diff --git a/docs/conf.py b/docs/conf.py |
| 4113 | -index ff2d0d2..f8ff415 100644 |
| 4114 | --- a/docs/conf.py |
| 4115 | +++ b/docs/conf.py |
| 4116 | -@@ -27,6 +27,8 @@ docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") |
| 4117 | +@@ -27,6 +27,8 @@ |
| 4118 | docs_core.setup() |
| 4119 | |
| 4120 | external_projects_current_project = "rocfft" |
| 4121 | +external_projects_remote_repository = "" |
| 4122 | +external_projects_remote_branch = "" |
| 4123 | - external_projects = [] |
| 4124 | |
| 4125 | for sphinx_var in ROCmDocs.SPHINX_VARS: |
| 4126 | + globals()[sphinx_var] = getattr(docs_core, sphinx_var) |
| 4127 | diff --git a/debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch b/debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch |
| 4128 | new file mode 100644 |
| 4129 | index 0000000..fc6e543 |
| 4130 | --- /dev/null |
| 4131 | +++ b/debian/patches/do-not-strictly-depend-on-sqlite3-v3.50.2.patch |
| 4132 | @@ -0,0 +1,21 @@ |
| 4133 | +From: Talha Can Havadar <havadartalha@gmail.com> |
| 4134 | +Date: Wed, 26 Nov 2025 15:34:26 +0100 |
| 4135 | +Subject: do not strictly depend on sqlite3 v3.50.2 |
| 4136 | + |
| 4137 | +--- |
| 4138 | + cmake/sqlite.cmake | 2 +- |
| 4139 | + 1 file changed, 1 insertion(+), 1 deletion(-) |
| 4140 | + |
| 4141 | +diff --git a/cmake/sqlite.cmake b/cmake/sqlite.cmake |
| 4142 | +index 2dc2a28..a44e4ef 100644 |
| 4143 | +--- a/cmake/sqlite.cmake |
| 4144 | ++++ b/cmake/sqlite.cmake |
| 4145 | +@@ -26,7 +26,7 @@ option( SQLITE_USE_SYSTEM_PACKAGE "Use SQLite3 from find_package" OFF ) |
| 4146 | + |
| 4147 | + if( SQLITE_USE_SYSTEM_PACKAGE ) |
| 4148 | + # Require a safe baseline (fixes truncation/memory-corruption issues). |
| 4149 | +- find_package(SQLite3 3.50.2 REQUIRED) |
| 4150 | ++ find_package(SQLite3 REQUIRED) |
| 4151 | + list(APPEND static_depends PACKAGE SQLite3) |
| 4152 | + set(ROCFFT_SQLITE_LIB SQLite::SQLite3) |
| 4153 | + else() |
| 4154 | diff --git a/debian/patches/fix-hiprtc-link.patch b/debian/patches/fix-hiprtc-link.patch |
| 4155 | deleted file mode 100644 |
| 4156 | index 3ff2d1f..0000000 |
| 4157 | --- a/debian/patches/fix-hiprtc-link.patch |
| 4158 | +++ /dev/null |
| 4159 | @@ -1,29 +0,0 @@ |
| 4160 | -From: Cordell Bloor <cgmb@slerp.xyz> |
| 4161 | -Date: Tue, 23 May 2023 18:57:14 -0600 |
| 4162 | -Subject: fix hiprtc link |
| 4163 | - |
| 4164 | -The upstream build code linking hiprtc doesn't work with the Debian |
| 4165 | -package for hiprtc. The hip rtc library is correctly linked by |
| 4166 | -hip::host, so there's no need to handle it separately. That said, some |
| 4167 | -other packages for HIP (such as those provided by Spack), have patched |
| 4168 | -out the automatic linking of hiprtc in hip::host, so upstream will |
| 4169 | -probably want to find an alternative solution. |
| 4170 | - |
| 4171 | -Forwarded: not-needed |
| 4172 | ---- |
| 4173 | - library/src/CMakeLists.txt | 2 +- |
| 4174 | - 1 file changed, 1 insertion(+), 1 deletion(-) |
| 4175 | - |
| 4176 | -diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt |
| 4177 | -index 63c584b..c02f2fb 100644 |
| 4178 | ---- a/library/src/CMakeLists.txt |
| 4179 | -+++ b/library/src/CMakeLists.txt |
| 4180 | -@@ -56,7 +56,7 @@ else() |
| 4181 | - if( WIN32 ) |
| 4182 | - set( ROCFFT_RTC_LINK_LIBS "${HIP_PATH}/lib/hiprtc.lib" ) |
| 4183 | - else() |
| 4184 | -- set( ROCFFT_RTC_LINK_LIBS -L${ROCM_PATH}/lib -lhiprtc -ldl ) |
| 4185 | -+ set( ROCFFT_RTC_LINK_LIBS hip::host -ldl ) |
| 4186 | - endif() |
| 4187 | - endif() |
| 4188 | - |
| 4189 | diff --git a/debian/patches/series b/debian/patches/series |
| 4190 | index 8c51f36..581e5fe 100644 |
| 4191 | --- a/debian/patches/series |
| 4192 | +++ b/debian/patches/series |
| 4193 | @@ -1,5 +1,5 @@ |
| 4194 | -fix-hiprtc-link.patch |
| 4195 | use-local-mathjax.patch |
| 4196 | Extend-docs-conf.py-for-offline-build.patch |
| 4197 | Add-version-script-to-control-exposed-symbols.patch |
| 4198 | Force-host-code-resultion.patch |
| 4199 | +do-not-strictly-depend-on-sqlite3-v3.50.2.patch |
| 4200 | diff --git a/debian/rules b/debian/rules |
| 4201 | index d92552b..6cbc80a 100755 |
| 4202 | --- a/debian/rules |
| 4203 | +++ b/debian/rules |
| 4204 | @@ -2,6 +2,12 @@ |
| 4205 | export CXX=hipcc |
| 4206 | export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto |
| 4207 | export DEB_CXXFLAGS_MAINT_PREPEND = -gz |
| 4208 | +# -Wl,--gc-sections: |
| 4209 | +# Fixes linker errors ("relocation refers to a discarded section") when using gtest. |
| 4210 | +# The linker's section garbage collection incorrectly discards needed code due to |
| 4211 | +# a toolchain mismatch (ROCm's clang vs. the system's GCC). This flag forces the |
| 4212 | +# linker to correctly re-evaluate dependencies, keeping the required sections. |
| 4213 | +export DEB_LDFLAGS_MAINT_PREPEND = -Wl,--gc-sections |
| 4214 | export VERBOSE=1 |
| 4215 | |
| 4216 | # filter incompatible options from affecting device code |
| 4217 | @@ -13,7 +19,6 @@ VERSION_STRING = $(shell sed -nr 's/^set.*VERSION_STRING \"([.0-9]+)\".*/\1/p' C |
| 4218 | |
| 4219 | CMAKE_FLAGS = \ |
| 4220 | -DCMAKE_BUILD_TYPE=Release \ |
| 4221 | - -DCMAKE_SKIP_RPATH=ON \ |
| 4222 | -DGPU_TARGETS="$(shell rocm-target-arch --sep ';')" \ |
| 4223 | -DROCFFT_KERNEL_CACHE_ENABLE=OFF \ |
| 4224 | -DROCM_SYMLINK_LIBS=OFF \ |
| 4225 | diff --git a/debian/tests/control b/debian/tests/control |
| 4226 | index bd8c7a5..6cc0da2 100644 |
| 4227 | --- a/debian/tests/control |
| 4228 | +++ b/debian/tests/control |
| 4229 | @@ -2,4 +2,4 @@ Test-Command: rocm-test-launcher /usr/libexec/rocm/librocfft0-tests/run-tests |
| 4230 | Features: test-name=librocfft0-tests |
| 4231 | Depends: librocfft0-tests, pkg-rocm-tools |
| 4232 | Restrictions: allow-stderr, skippable |
| 4233 | -Architecture: amd64 arm64 ppc64el |
| 4234 | +Architecture: amd64 arm64 |
| 4235 | diff --git a/docs/design/bluestein.rst b/designdocs/bluestein.rst |
| 4236 | similarity index 100% |
| 4237 | rename from docs/design/bluestein.rst |
| 4238 | rename to designdocs/bluestein.rst |
| 4239 | diff --git a/docs/design/buffer_assignment.rst b/designdocs/buffer_assignment.rst |
| 4240 | similarity index 100% |
| 4241 | rename from docs/design/buffer_assignment.rst |
| 4242 | rename to designdocs/buffer_assignment.rst |
| 4243 | diff --git a/docs/design/codegen.rst b/designdocs/codegen.rst |
| 4244 | similarity index 99% |
| 4245 | rename from docs/design/codegen.rst |
| 4246 | rename to designdocs/codegen.rst |
| 4247 | index 46b8255..28fdde0 100644 |
| 4248 | --- a/docs/design/codegen.rst |
| 4249 | +++ b/designdocs/codegen.rst |
| 4250 | @@ -308,7 +308,7 @@ tiling. Different twiddle table strategies should extend the |
| 4251 | ``multiply`` methods. |
| 4252 | |
| 4253 | Twiddle tables may also require additional templates and arguments. |
| 4254 | -See :ref:`Stockham tiling implementation`. |
| 4255 | +See the Stockham tiling implementation section. |
| 4256 | |
| 4257 | Copyright and disclaimer |
| 4258 | ======================== |
| 4259 | diff --git a/docs/design/design.rst b/designdocs/design.rst |
| 4260 | similarity index 66% |
| 4261 | rename from docs/design/design.rst |
| 4262 | rename to designdocs/design.rst |
| 4263 | index 7cb5f7a..7e97b33 100644 |
| 4264 | --- a/docs/design/design.rst |
| 4265 | +++ b/designdocs/design.rst |
| 4266 | @@ -10,9 +10,9 @@ Design Documents |
| 4267 | |
| 4268 | The Design Documents contain proposals for features of the rocFFT library. They are intended as development proposals for engineering and contributors to the Open Source library. |
| 4269 | |
| 4270 | -The current proposals include the following: |
| 4271 | +The current proposals in this directory include the following: |
| 4272 | |
| 4273 | - * :ref:`codegen` |
| 4274 | - * :ref:`runtime_compilation` |
| 4275 | - * :ref:`buffer_assignment` |
| 4276 | - * :ref:`bluestein` |
| 4277 | + * codegen |
| 4278 | + * runtime_compilation |
| 4279 | + * buffer_assignment |
| 4280 | + * bluestein |
| 4281 | diff --git a/docs/design/images/bluestein_fig1.png b/designdocs/images/bluestein_fig1.png |
| 4282 | similarity index 100% |
| 4283 | rename from docs/design/images/bluestein_fig1.png |
| 4284 | rename to designdocs/images/bluestein_fig1.png |
| 4285 | Binary files a/docs/design/images/bluestein_fig1.png and b/designdocs/images/bluestein_fig1.png differ |
| 4286 | diff --git a/docs/design/images/bluestein_fig2.png b/designdocs/images/bluestein_fig2.png |
| 4287 | similarity index 100% |
| 4288 | rename from docs/design/images/bluestein_fig2.png |
| 4289 | rename to designdocs/images/bluestein_fig2.png |
| 4290 | Binary files a/docs/design/images/bluestein_fig2.png and b/designdocs/images/bluestein_fig2.png differ |
| 4291 | diff --git a/docs/design/images/bluestein_fig3.png b/designdocs/images/bluestein_fig3.png |
| 4292 | similarity index 100% |
| 4293 | rename from docs/design/images/bluestein_fig3.png |
| 4294 | rename to designdocs/images/bluestein_fig3.png |
| 4295 | Binary files a/docs/design/images/bluestein_fig3.png and b/designdocs/images/bluestein_fig3.png differ |
| 4296 | diff --git a/docs/design/images/bluestein_fig4.png b/designdocs/images/bluestein_fig4.png |
| 4297 | similarity index 100% |
| 4298 | rename from docs/design/images/bluestein_fig4.png |
| 4299 | rename to designdocs/images/bluestein_fig4.png |
| 4300 | Binary files a/docs/design/images/bluestein_fig4.png and b/designdocs/images/bluestein_fig4.png differ |
| 4301 | diff --git a/docs/design/runtime_compilation.rst b/designdocs/runtime_compilation.rst |
| 4302 | similarity index 100% |
| 4303 | rename from docs/design/runtime_compilation.rst |
| 4304 | rename to designdocs/runtime_compilation.rst |
| 4305 | diff --git a/docs/conf.py b/docs/conf.py |
| 4306 | index c83a6fb..304d660 100644 |
| 4307 | --- a/docs/conf.py |
| 4308 | +++ b/docs/conf.py |
| 4309 | @@ -13,7 +13,7 @@ left_nav_title = f"rocFFT {version_number} Documentation" |
| 4310 | # for PDF output on Read the Docs |
| 4311 | project = "rocFFT Documentation" |
| 4312 | author = "Advanced Micro Devices, Inc." |
| 4313 | -copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." |
| 4314 | +copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." |
| 4315 | version = version_number |
| 4316 | release = version_number |
| 4317 | |
| 4318 | @@ -24,7 +24,6 @@ docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") |
| 4319 | docs_core.setup() |
| 4320 | |
| 4321 | external_projects_current_project = "rocfft" |
| 4322 | -external_projects = [] |
| 4323 | |
| 4324 | for sphinx_var in ROCmDocs.SPHINX_VARS: |
| 4325 | globals()[sphinx_var] = getattr(docs_core, sphinx_var) |
| 4326 | diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile |
| 4327 | index c47b8fc..3fbe0d4 100644 |
| 4328 | --- a/docs/doxygen/Doxyfile |
| 4329 | +++ b/docs/doxygen/Doxyfile |
| 4330 | @@ -38,7 +38,7 @@ PROJECT_NAME = "rocFFT" |
| 4331 | # could be handy for archiving the generated documentation or if some version |
| 4332 | # control system is used. |
| 4333 | |
| 4334 | -PROJECT_NUMBER = v1.0.32 |
| 4335 | +PROJECT_NUMBER = v1.0.35 |
| 4336 | |
| 4337 | # Using the PROJECT_BRIEF tag one can provide an optional one line description |
| 4338 | # for a project that appears at the top of each page and should give viewer a |
| 4339 | diff --git a/docs/how-to/enabling-logging.rst b/docs/how-to/enabling-logging.rst |
| 4340 | new file mode 100644 |
| 4341 | index 0000000..9d9f59a |
| 4342 | --- /dev/null |
| 4343 | +++ b/docs/how-to/enabling-logging.rst |
| 4344 | @@ -0,0 +1,134 @@ |
| 4345 | +.. meta:: |
| 4346 | + :description: Enabling logging in rocFFT |
| 4347 | + :keywords: rocFFT, ROCm, API, documentation, logging |
| 4348 | + |
| 4349 | + |
| 4350 | +.. _enabling-logging: |
| 4351 | + |
| 4352 | +******************************************************************** |
| 4353 | +Enabling logging in rocFFT |
| 4354 | +******************************************************************** |
| 4355 | + |
| 4356 | +rocFFT can write a variety of log messages to aid troubleshooting. Here are the different |
| 4357 | +logs that rocFFT supports. |
| 4358 | + |
| 4359 | +* **Trace logging**: Logs the library entry points (for example, ``rocfft_plan_create`` or ``rocfft_execute``) and their parameter |
| 4360 | + values when they are called. Error messages during plan creation and execution are also logged here. |
| 4361 | +* **Benchmark logging**: Logs the ``rocfft-bench`` command line when a plan is created. |
| 4362 | + You can use this command to rerun the same transform later. |
| 4363 | +* **Profile logging**: Logs a message for each kernel launched during plan execution. |
| 4364 | + This message contains the following elements: |
| 4365 | + |
| 4366 | + * Kernel duration |
| 4367 | + * The size of the user data buffers seen by the kernel |
| 4368 | + * Estimates for the observed memory bandwidth and bandwidth efficiency |
| 4369 | + |
| 4370 | + .. note:: |
| 4371 | + |
| 4372 | + To provide the kernel duration, rocFFT must use ``hipEvents`` and wait for each kernel to complete. |
| 4373 | + This might interfere with time measurement at higher levels, for example, for ``rocfft-bench``. |
| 4374 | + |
| 4375 | +* **Plan logging**: Logs the plan details when a transform is executed, including the following: |
| 4376 | + |
| 4377 | + * Each TreeNode in the plan |
| 4378 | + * The work buffer size required by the plan |
| 4379 | + * The kernel grid and block dimensions |
| 4380 | + * The kernel maximum occupancy (estimated by HIP) |
| 4381 | + |
| 4382 | +* **Kernel I/O logging**: Logs the kernel details during plan execution, including the input to each |
| 4383 | + kernel (the data provided by the user) and the final output of the transform. |
| 4384 | + |
| 4385 | + .. note:: |
| 4386 | + |
| 4387 | + The amount of data logged can become very large, particularly for 2D and 3D transforms, so |
| 4388 | + logging it to a file instead of stderr is usually a good idea. See the next section for more details. |
| 4389 | + |
| 4390 | + Writing the data involves extra ``hipMemcpy`` operations, and serializing the |
| 4391 | + data to the log can also take a significant amount of time. Both of these factors affect performance. |
| 4392 | + |
| 4393 | +* **Runtime compilation logging**: Logs details about runtime compilation during plan creation, |
| 4394 | + including the following: |
| 4395 | + |
| 4396 | + * The source code |
| 4397 | + * Messages indicating that a kernel was found in a cache and did not need to be compiled at runtime |
| 4398 | + * Compilation errors (if any) |
| 4399 | + * Duration measurements indicating the time it took to generate source code for the kernel and compile the kernel |
| 4400 | + |
| 4401 | + The source code for the kernels is delimited by lines containing the strings ``ROCFFT_RTC_BEGIN`` |
| 4402 | + and ``ROCFFT_RTC_END``. This lets you isolate the source code for each kernel if a |
| 4403 | + single log contains code for multiple kernels. |
| 4404 | + |
| 4405 | + .. note:: |
| 4406 | + |
| 4407 | + All non-code messages (except for compile errors) are written as C++ comments, so |
| 4408 | + you can pass the whole file to clang-format to inspect the source code. |
| 4409 | + |
| 4410 | + The source code details for the runtime compilation can be very large, so consider writing |
| 4411 | + this log to a file instead of stderr. |
| 4412 | + |
| 4413 | +* **Tuning logging**: Logs details about any kernels that are tried and rejected while tuning is running. |
| 4414 | + It also logs messages when tuned solutions are used during plan building. |
| 4415 | +* **Graph logging**: Logs the graph of subplans during multi-GPU or multi-process plan execution. |
| 4416 | + Subplans include FFT plans, transpose plans (to reshape data for communication), and communication steps. |
| 4417 | + This is written as Graphviz data. The view of the global graph might differ slightly from |
| 4418 | + node to node, because each node has more visibility into the subplans that run locally |
| 4419 | + than into those that run on other nodes. |
| 4420 | + |
| 4421 | +Configuring the logging output |
| 4422 | +============================== |
| 4423 | + |
| 4424 | +The logging output can be controlled using the ``ROCFFT_LAYER`` environment variable. |
| 4425 | +``ROCFFT_LAYER`` is a numerical bitmask, where zero or more bits can be set to enable one or more logging layers. |
| 4426 | +The log output is written to stderr by default. |
| 4427 | + |
| 4428 | +The following table maps the different logging layers to a ``ROCFFT_LAYER`` bit field value. |
| 4429 | +To determine what value to set for ``ROCFFT_LAYER``, add up the values of all the layers you want to see. |
| 4430 | +For example, to see the output for trace, profile, and plan logging, set ``ROCFFT_LAYER`` to ``13`` |
| 4431 | +(``1`` + ``4`` + ``8``). |
| 4432 | + |
| 4433 | +.. csv-table:: |
| 4434 | + :header: "Log type","ROCFFT_LAYER bit field value" |
| 4435 | + :widths: 20, 20 |
| 4436 | + |
| 4437 | + "Trace logging","1" |
| 4438 | + "Benchmark logging","2" |
| 4439 | + "Profile logging","4" |
| 4440 | + "Plan logging","8" |
| 4441 | + "Kernel I/O logging","16" |
| 4442 | + "Runtime compilation logging","32" |
| 4443 | + "Tuning logging","64" |
| 4444 | + "Graph logging","128" |
| 4445 | + |
| 4446 | +Logging to a file |
| 4447 | +============================= |
| 4448 | + |
| 4449 | +By default, messages are written to stderr, but they can be redirected to |
| 4450 | +output files using the environment variables described in this section. |
| 4451 | +Each type of log can be redirected separately using a unique environment variable. |
| 4452 | +The corresponding log must be enabled using the ``ROCFFT_LAYER`` variable |
| 4453 | +before any details can be logged to the file. |
| 4454 | +For example, to redirect the trace log to a file, trace logging must |
| 4455 | +also be enabled in the ``ROCFFT_LAYER`` bit field. |
| 4456 | + |
| 4457 | +.. note:: |
| 4458 | + |
| 4459 | + Some log types, such as kernel I/O logging and runtime compilation logging, can generate |
| 4460 | + a large number of log entries, so redirecting their output to a file is recommended. |
| 4461 | + |
| 4462 | +The following table lists the environment variable to redirect logging for each |
| 4463 | +log type. Set this variable to a valid file path to redirect the output of the corresponding log type. |
| 4464 | +For example, to send the trace logging output to a file, enable the trace log, then set the |
| 4465 | +``ROCFFT_LOG_TRACE_PATH`` variable to the name of the destination file. |
| 4466 | + |
| 4467 | +.. csv-table:: |
| 4468 | + :header: "Log type","File redirection variable" |
| 4469 | + :widths: 20, 30 |
| 4470 | + |
| 4471 | + "Trace logging","``ROCFFT_LOG_TRACE_PATH``" |
| 4472 | + "Benchmark logging","``ROCFFT_LOG_BENCH_PATH``" |
| 4473 | + "Profile logging","``ROCFFT_LOG_PROFILE_PATH``" |
| 4474 | + "Plan logging","``ROCFFT_LOG_PLAN_PATH``" |
| 4475 | + "Kernel I/O logging","``ROCFFT_LOG_KERNELIO_PATH``" |
| 4476 | + "Runtime compilation logging","``ROCFFT_LOG_RTC_PATH``" |
| 4477 | + "Tuning logging","``ROCFFT_LOG_TUNING_PATH``" |
| 4478 | + "Graph logging","``ROCFFT_LOG_GRAPH_PATH``" |
| 4479 | diff --git a/docs/how-to/load-store-callbacks.rst b/docs/how-to/load-store-callbacks.rst |
| 4480 | index 9d37995..a8ee6ed 100644 |
| 4481 | --- a/docs/how-to/load-store-callbacks.rst |
| 4482 | +++ b/docs/how-to/load-store-callbacks.rst |
| 4483 | @@ -17,6 +17,11 @@ to the library using |
| 4484 | :cpp:func:`rocfft_execution_info_set_load_callback` and |
| 4485 | :cpp:func:`rocfft_execution_info_set_store_callback`. |
| 4486 | |
| 4487 | +.. note:: |
| 4488 | + |
| 4489 | + Callback functions must be built as relocatable device code by |
| 4490 | + passing the ``-fgpu-rdc`` option to the compiler and linker. |
| 4491 | + |
| 4492 | Device functions supplied as callbacks must load and store element |
| 4493 | data types appropriate for the transform being executed. |
| 4494 | |
| 4495 | diff --git a/docs/how-to/working-with-rocfft.rst b/docs/how-to/working-with-rocfft.rst |
| 4496 | index 26f8e57..7d764b8 100644 |
| 4497 | --- a/docs/how-to/working-with-rocfft.rst |
| 4498 | +++ b/docs/how-to/working-with-rocfft.rst |
| 4499 | @@ -181,7 +181,7 @@ You must allocate, initialize, and specify the input/output buffers that hold th |
| 4500 | For larger transforms, temporary work buffers might be needed. Because the library tries to minimize its own allocation of |
| 4501 | memory regions on the device, it expects you to manage the work buffers. The size of the buffer that is needed can be queried using |
| 4502 | :cpp:func:`rocfft_plan_get_work_buffer_size`. After allocation, it can be passed to the library using |
| 4503 | -:cpp:func:`rocfft_execution_info_set_work_buffer`. The `GitHub repository <https://github.com/ROCm/rocFFT/tree/develop/clients/samples>`_ |
| 4504 | +:cpp:func:`rocfft_execution_info_set_work_buffer`. The `GitHub repository <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/clients/samples>`_ |
| 4505 | provides some samples and examples. |
| 4506 | |
| 4507 | Transform and array types |
| 4508 | @@ -316,8 +316,8 @@ reside on a different device and have its own layout parameters. |
| 4509 | |
| 4510 | The rocFFT APIs for declaring fields and bricks are currently experimental and |
| 4511 | subject to change in future releases. To submit feedback, questions, and comments |
| 4512 | - about these interfaces, use the `rocFFT issue tracker |
| 4513 | - <https://github.com/ROCmSoftwarePlatform/rocFFT/issues>`_. |
| 4514 | + about these interfaces, use the `rocm-libraries issue tracker |
| 4515 | + <https://github.com/ROCm/rocm-libraries/issues>`_. |
| 4516 | |
| 4517 | The workflow for using fields is as follows: |
| 4518 | |
| 4519 | diff --git a/docs/index.rst b/docs/index.rst |
| 4520 | index 71ef011..82f089d 100644 |
| 4521 | --- a/docs/index.rst |
| 4522 | +++ b/docs/index.rst |
| 4523 | @@ -15,7 +15,11 @@ The rocFFT library calculates discrete Fourier transforms for one, two, and thre |
| 4524 | supporting various data types for real and complex values. |
| 4525 | To learn more, see :doc:`What is rocFFT? <./what-is-rocfft>` |
| 4526 | |
| 4527 | -The rocFFT public repository is located at `<https://github.com/ROCm/rocFFT>`_. |
| 4528 | +The rocFFT public repository is located at `<https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft>`_. |
| 4529 | + |
| 4530 | +.. note:: |
| 4531 | + |
| 4532 | + The rocFFT repository for ROCm 6.4.3 and earlier is located at `<https://github.com/ROCm/rocFFT>`_. |
| 4533 | |
| 4534 | .. grid:: 2 |
| 4535 | :gutter: 3 |
| 4536 | @@ -38,10 +42,11 @@ The rocFFT public repository is located at `<https://github.com/ROCm/rocFFT>`_. |
| 4537 | * :doc:`Load and store callbacks <./how-to/load-store-callbacks>` |
| 4538 | * :doc:`Use runtime compilation <./how-to/runtime-compilation>` |
| 4539 | * :doc:`Distribute transforms <./how-to/distributed-transforms>` |
| 4540 | + * :doc:`Enable logging <./how-to/enabling-logging>` |
| 4541 | |
| 4542 | .. grid-item-card:: Samples |
| 4543 | |
| 4544 | - * `rocFFT GitHub client examples <https://github.com/ROCm/rocFFT/tree/develop/clients/samples>`_ |
| 4545 | + * `rocFFT GitHub client examples <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/clients/samples>`_ |
| 4546 | |
| 4547 | .. grid-item-card:: API reference |
| 4548 | |
| 4549 | diff --git a/docs/install/building-installing-rocfft.rst b/docs/install/building-installing-rocfft.rst |
| 4550 | index 44fb458..5d5f2a1 100644 |
| 4551 | --- a/docs/install/building-installing-rocfft.rst |
| 4552 | +++ b/docs/install/building-installing-rocfft.rst |
| 4553 | @@ -32,7 +32,8 @@ You can use the GitHub releases tab to download the source code. This might prov |
| 4554 | than the prebuilt packages. |
| 4555 | |
| 4556 | rocFFT uses the AMD clang++ compiler and CMake. You can specify several options to customize your build. |
| 4557 | -Use the following commands to build a shared library for the supported AMD GPUs: |
| 4558 | +Use the following commands to build a shared library for the supported AMD GPUs. |
| 4559 | +Run these commands from the ``rocm-libraries/projects/rocfft`` directory: |
| 4560 | |
| 4561 | .. code-block:: shell |
| 4562 | |
| 4563 | @@ -78,7 +79,7 @@ rocFFT uses version 1.11 of GoogleTest. |
| 4564 | |
| 4565 | You can build the clients separately from the main library. |
| 4566 | For example, to build all the clients with an existing rocFFT library, invoke CMake from |
| 4567 | -within the ``rocFFT-src/clients`` folder using these commands: |
| 4568 | +within the ``rocm-libraries/projects/rocfft/rocFFT-src/clients`` folder using these commands: |
| 4569 | |
| 4570 | .. code-block:: shell |
| 4571 | |
| 4572 | diff --git a/docs/license.md b/docs/license.md |
| 4573 | index 7eeeef5..c1c707c 100644 |
| 4574 | --- a/docs/license.md |
| 4575 | +++ b/docs/license.md |
| 4576 | @@ -5,5 +5,7 @@ myst: |
| 4577 | "keywords": "rocFFT, FFT, ROCm, API, documentation, license" |
| 4578 | --- |
| 4579 | |
| 4580 | +# License |
| 4581 | + |
| 4582 | ```{include} ../LICENSE.md |
| 4583 | ``` |
| 4584 | diff --git a/docs/samples/README.md b/docs/samples/README.md |
| 4585 | deleted file mode 100644 |
| 4586 | index ea0f091..0000000 |
| 4587 | --- a/docs/samples/README.md |
| 4588 | +++ /dev/null |
| 4589 | @@ -1,12 +0,0 @@ |
| 4590 | -# Samples to demo using rocfft |
| 4591 | - |
| 4592 | -## `complex_1d` |
| 4593 | - |
| 4594 | -You may need to add the directories for amdclang++ and rocFFT to your |
| 4595 | -`CMAKE_PREFIX_PATH`, and ensure that `amdclang++` is in your `PATH`. |
| 4596 | - |
| 4597 | -``` bash |
| 4598 | -$ mkdir build && cd build |
| 4599 | -$ cmake -DCMAKE_CXX_COMPILER=amdclang++ .. |
| 4600 | -$ make |
| 4601 | -``` |
| 4602 | diff --git a/docs/samples/complex_1d.cpp b/docs/samples/complex_1d.cpp |
| 4603 | deleted file mode 100644 |
| 4604 | index e815f15..0000000 |
| 4605 | --- a/docs/samples/complex_1d.cpp |
| 4606 | +++ /dev/null |
| 4607 | @@ -1,178 +0,0 @@ |
| 4608 | -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. |
| 4609 | -// |
| 4610 | -// Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4611 | -// of this software and associated documentation files (the "Software"), to deal |
| 4612 | -// in the Software without restriction, including without limitation the rights |
| 4613 | -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 4614 | -// copies of the Software, and to permit persons to whom the Software is |
| 4615 | -// furnished to do so, subject to the following conditions: |
| 4616 | -// |
| 4617 | -// The above copyright notice and this permission notice shall be included in |
| 4618 | -// all copies or substantial portions of the Software. |
| 4619 | -// |
| 4620 | -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 4621 | -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 4622 | -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 4623 | -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 4624 | -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 4625 | -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 4626 | -// THE SOFTWARE. |
| 4627 | - |
| 4628 | -#include <cassert> |
| 4629 | -#include <complex> |
| 4630 | -#include <iostream> |
| 4631 | -#include <vector> |
| 4632 | - |
| 4633 | -#include <hip/hip_runtime_api.h> |
| 4634 | - |
| 4635 | -#include <rocfft/rocfft.h> |
| 4636 | - |
| 4637 | -int main(int argc, char* argv[]) |
| 4638 | -{ |
| 4639 | - std::cout << "rocFFT complex 1d FFT example\n"; |
| 4640 | - |
| 4641 | - // The problem size |
| 4642 | - const size_t Nx = (argc < 2) ? 8 : atoi(argv[1]); |
| 4643 | - const bool inplace = (argc < 3) ? false : atoi(argv[2]); |
| 4644 | - std::cout << "Nx: " << Nx << "\tin-place: " << inplace << std::endl; |
| 4645 | - |
| 4646 | - // Initialize data on the host: |
| 4647 | - std::cout << "Input:\n"; |
| 4648 | - std::vector<std::complex<float>> cx(Nx); |
| 4649 | - for(size_t i = 0; i < Nx; i++) |
| 4650 | - { |
| 4651 | - cx[i] = std::complex<float>(i, 0); |
| 4652 | - } |
| 4653 | - for(size_t i = 0; i < Nx; i++) |
| 4654 | - { |
| 4655 | - std::cout << cx[i] << " "; |
| 4656 | - } |
| 4657 | - std::cout << std::endl; |
| 4658 | - |
| 4659 | - // Create HIP device object and copy data: |
| 4660 | - float2* x = NULL; |
| 4661 | - hipMalloc(&x, cx.size() * sizeof(decltype(cx)::value_type)); |
| 4662 | - float2* y = inplace ? (float2*)x : NULL; |
| 4663 | - if(!inplace) |
| 4664 | - { |
| 4665 | - hipMalloc(&y, cx.size() * sizeof(decltype(cx)::value_type)); |
| 4666 | - } |
| 4667 | - hipMemcpy(x, cx.data(), cx.size() * sizeof(decltype(cx)::value_type), hipMemcpyHostToDevice); |
| 4668 | - |
| 4669 | - rocfft_setup(); |
| 4670 | - |
| 4671 | - rocfft_status status = rocfft_status_success; |
| 4672 | - |
| 4673 | - // Create forward plan |
| 4674 | - rocfft_plan forward = NULL; |
| 4675 | - status = rocfft_plan_create(&forward, |
| 4676 | - inplace ? rocfft_placement_inplace : rocfft_placement_notinplace, |
| 4677 | - rocfft_transform_type_complex_forward, |
| 4678 | - rocfft_precision_single, |
| 4679 | - 1, // Dimensions |
| 4680 | - &Nx, // lengths |
| 4681 | - 1, // Number of transforms |
| 4682 | - NULL); // Description |
| 4683 | - assert(status == rocfft_status_success); |
| 4684 | - |
| 4685 | - // We may need work memory, which is passed via rocfft_execution_info |
| 4686 | - rocfft_execution_info forwardinfo = NULL; |
| 4687 | - status = rocfft_execution_info_create(&forwardinfo); |
| 4688 | - assert(status == rocfft_status_success); |
| 4689 | - size_t fbuffersize = 0; |
| 4690 | - status = rocfft_plan_get_work_buffer_size(forward, &fbuffersize); |
| 4691 | - assert(status == rocfft_status_success); |
| 4692 | - void* fbuffer = NULL; |
| 4693 | - if(fbuffersize > 0) |
| 4694 | - { |
| 4695 | - hipMalloc(&fbuffer, fbuffersize); |
| 4696 | - status = rocfft_execution_info_set_work_buffer(forwardinfo, fbuffer, fbuffersize); |
| 4697 | - assert(status == rocfft_status_success); |
| 4698 | - } |
| 4699 | - |
| 4700 | - // Create backward plan |
| 4701 | - rocfft_plan backward = NULL; |
| 4702 | - status = rocfft_plan_create(&backward, |
| 4703 | - inplace ? rocfft_placement_inplace : rocfft_placement_notinplace, |
| 4704 | - rocfft_transform_type_complex_inverse, |
| 4705 | - rocfft_precision_single, |
| 4706 | - 1, // Dimensions |
| 4707 | - &Nx, // lengths |
| 4708 | - 1, // Number of transforms |
| 4709 | - NULL); // Description |
| 4710 | - assert(status == rocfft_status_success); |
| 4711 | - |
| 4712 | - // Execution info for the backward transform: |
| 4713 | - rocfft_execution_info backwardinfo = NULL; |
| 4714 | - status = rocfft_execution_info_create(&backwardinfo); |
| 4715 | - assert(status == rocfft_status_success); |
| 4716 | - size_t bbuffersize = 0; |
| 4717 | - status = rocfft_plan_get_work_buffer_size(backward, &bbuffersize); |
| 4718 | - assert(status == rocfft_status_success); |
| 4719 | - void* bbuffer = NULL; |
| 4720 | - if(bbuffersize > 0) |
| 4721 | - { |
| 4722 | - hipMalloc(&bbuffer, bbuffersize); |
| 4723 | - status = rocfft_execution_info_set_work_buffer(backwardinfo, bbuffer, bbuffersize); |
| 4724 | - assert(status == rocfft_status_success); |
| 4725 | - } |
| 4726 | - |
| 4727 | - // Execute the forward transform |
| 4728 | - status = rocfft_execute(forward, |
| 4729 | - (void**)&x, // in_buffer |
| 4730 | - (void**)&y, // out_buffer |
| 4731 | - forwardinfo); // execution info |
| 4732 | - assert(status == rocfft_status_success); |
| 4733 | - |
| 4734 | - // Copy result back to host |
| 4735 | - std::vector<std::complex<float>> cy(Nx); |
| 4736 | - hipMemcpy(cy.data(), y, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost); |
| 4737 | - |
| 4738 | - std::cout << "Transformed:\n"; |
| 4739 | - for(size_t i = 0; i < cy.size(); i++) |
| 4740 | - { |
| 4741 | - std::cout << cy[i] << " "; |
| 4742 | - } |
| 4743 | - std::cout << std::endl; |
| 4744 | - |
| 4745 | - // Execute the backward transform |
| 4746 | - rocfft_execute(backward, |
| 4747 | - (void**)&y, // in_buffer |
| 4748 | - (void**)&x, // out_buffer |
| 4749 | - backwardinfo); // execution info |
| 4750 | - |
| 4751 | - std::cout << "Transformed back:\n"; |
| 4752 | - hipMemcpy(cy.data(), x, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost); |
| 4753 | - for(size_t i = 0; i < cy.size(); i++) |
| 4754 | - { |
| 4755 | - std::cout << cy[i] << " "; |
| 4756 | - } |
| 4757 | - std::cout << std::endl; |
| 4758 | - |
| 4759 | - const float overN = 1.0f / Nx; |
| 4760 | - float error = 0.0f; |
| 4761 | - for(size_t i = 0; i < cx.size(); i++) |
| 4762 | - { |
| 4763 | - float diff = std::max(std::abs(cx[i].real() - cy[i].real() * overN), |
| 4764 | - std::abs(cx[i].imag() - cy[i].imag() * overN)); |
| 4765 | - if(diff > error) |
| 4766 | - { |
| 4767 | - error = diff; |
| 4768 | - } |
| 4769 | - } |
| 4770 | - std::cout << "Maximum error: " << error << "\n"; |
| 4771 | - |
| 4772 | - hipFree(x); |
| 4773 | - if(!inplace) |
| 4774 | - { |
| 4775 | - hipFree(y); |
| 4776 | - } |
| 4777 | - hipFree(fbuffer); |
| 4778 | - hipFree(bbuffer); |
| 4779 | - |
| 4780 | - // Destroy plans |
| 4781 | - rocfft_plan_destroy(forward); |
| 4782 | - rocfft_plan_destroy(backward); |
| 4783 | - |
| 4784 | - rocfft_cleanup(); |
| 4785 | -} |
| 4786 | diff --git a/docs/samples/complex_2d.cpp b/docs/samples/complex_2d.cpp |
| 4787 | deleted file mode 100644 |
| 4788 | index 9da818b..0000000 |
| 4789 | --- a/docs/samples/complex_2d.cpp |
| 4790 | +++ /dev/null |
| 4791 | @@ -1,198 +0,0 @@ |
| 4792 | -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. |
| 4793 | -// |
| 4794 | -// Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4795 | -// of this software and associated documentation files (the "Software"), to deal |
| 4796 | -// in the Software without restriction, including without limitation the rights |
| 4797 | -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 4798 | -// copies of the Software, and to permit persons to whom the Software is |
| 4799 | -// furnished to do so, subject to the following conditions: |
| 4800 | -// |
| 4801 | -// The above copyright notice and this permission notice shall be included in |
| 4802 | -// all copies or substantial portions of the Software. |
| 4803 | -// |
| 4804 | -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 4805 | -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 4806 | -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 4807 | -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 4808 | -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 4809 | -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 4810 | -// THE SOFTWARE. |
| 4811 | - |
| 4812 | -#include <cassert> |
| 4813 | -#include <complex> |
| 4814 | -#include <iostream> |
| 4815 | -#include <vector> |
| 4816 | - |
| 4817 | -#include <hip/hip_runtime_api.h> |
| 4818 | - |
| 4819 | -#include <rocfft/rocfft.h> |
| 4820 | - |
| 4821 | -int main(int argc, char* argv[]) |
| 4822 | -{ |
| 4823 | - std::cout << "rocFFT complex 2d FFT example\n"; |
| 4824 | - |
| 4825 | - // The problem size |
| 4826 | - const size_t Nx = (argc < 2) ? 8 : atoi(argv[1]); |
| 4827 | - const size_t Ny = (argc < 3) ? 8 : atoi(argv[2]); |
| 4828 | - const bool inplace = (argc < 4) ? false : atoi(argv[3]); |
| 4829 | - std::cout << "Nx: " << Nx << "\tNy: " << Ny << "\tin-place: " << inplace << std::endl; |
| 4830 | - |
| 4831 | - // Initialize data on the host |
| 4832 | - std::cout << "Input:\n"; |
| 4833 | - std::vector<std::complex<float>> cx(Nx * Ny); |
| 4834 | - for(size_t i = 0; i < Nx; i++) |
| 4835 | - { |
| 4836 | - for(size_t j = 0; j < Ny; j++) |
| 4837 | - { |
| 4838 | - cx[i * Ny + j] = std::complex<float>(i + j, 0.0); |
| 4839 | - } |
| 4840 | - } |
| 4841 | - for(size_t i = 0; i < Nx; i++) |
| 4842 | - { |
| 4843 | - for(size_t j = 0; j < Ny; j++) |
| 4844 | - { |
| 4845 | - std::cout << cx[i * Ny + j] << " "; |
| 4846 | - } |
| 4847 | - std::cout << "\n"; |
| 4848 | - } |
| 4849 | - std::cout << "\n"; |
| 4850 | - |
| 4851 | - rocfft_setup(); |
| 4852 | - |
| 4853 | - // Create HIP device object and copy data: |
| 4854 | - float2* x = NULL; |
| 4855 | - hipMalloc(&x, cx.size() * sizeof(decltype(cx)::value_type)); |
| 4856 | - float2* y = inplace ? (float2*)x : NULL; |
| 4857 | - if(!inplace) |
| 4858 | - { |
| 4859 | - hipMalloc(&y, cx.size() * sizeof(decltype(cx)::value_type)); |
| 4860 | - } |
| 4861 | - hipMemcpy(x, cx.data(), cx.size() * sizeof(decltype(cx)::value_type), hipMemcpyHostToDevice); |
| 4862 | - |
| 4863 | - // Length are in reverse order because rocfft is column-major. |
| 4864 | - const size_t lengths[2] = {Ny, Nx}; |
| 4865 | - |
| 4866 | - rocfft_status status = rocfft_status_success; |
| 4867 | - |
| 4868 | - // Create plans |
| 4869 | - rocfft_plan forward = NULL; |
| 4870 | - status = rocfft_plan_create(&forward, |
| 4871 | - inplace ? rocfft_placement_inplace : rocfft_placement_notinplace, |
| 4872 | - rocfft_transform_type_complex_forward, |
| 4873 | - rocfft_precision_single, |
| 4874 | - 2, // Dimensions |
| 4875 | - lengths, // lengths |
| 4876 | - 1, // Number of transforms |
| 4877 | - NULL); // Description |
| 4878 | - assert(status == rocfft_status_success); |
| 4879 | - |
| 4880 | - // We may need work memory, which is passed via rocfft_execution_info |
| 4881 | - rocfft_execution_info forwardinfo = NULL; |
| 4882 | - status = rocfft_execution_info_create(&forwardinfo); |
| 4883 | - assert(status == rocfft_status_success); |
| 4884 | - size_t fbuffersize = 0; |
| 4885 | - status = rocfft_plan_get_work_buffer_size(forward, &fbuffersize); |
| 4886 | - assert(status == rocfft_status_success); |
| 4887 | - void* fbuffer = NULL; |
| 4888 | - if(fbuffersize > 0) |
| 4889 | - { |
| 4890 | - hipMalloc(&fbuffer, fbuffersize); |
| 4891 | - status = rocfft_execution_info_set_work_buffer(forwardinfo, fbuffer, fbuffersize); |
| 4892 | - assert(status == rocfft_status_success); |
| 4893 | - } |
| 4894 | - |
| 4895 | - // Create plans |
| 4896 | - rocfft_plan backward = NULL; |
| 4897 | - status = rocfft_plan_create(&backward, |
| 4898 | - inplace ? rocfft_placement_inplace : rocfft_placement_notinplace, |
| 4899 | - rocfft_transform_type_complex_inverse, |
| 4900 | - rocfft_precision_single, |
| 4901 | - 2, // Dimensions |
| 4902 | - lengths, // lengths |
| 4903 | - 1, // Number of transforms |
| 4904 | - NULL); // Description |
| 4905 | - assert(status == rocfft_status_success); |
| 4906 | - |
| 4907 | - // Execution info for the backward transform: |
| 4908 | - rocfft_execution_info backwardinfo = NULL; |
| 4909 | - status = rocfft_execution_info_create(&backwardinfo); |
| 4910 | - assert(status == rocfft_status_success); |
| 4911 | - size_t bbuffersize = 0; |
| 4912 | - status = rocfft_plan_get_work_buffer_size(backward, &bbuffersize); |
| 4913 | - assert(status == rocfft_status_success); |
| 4914 | - void* bbuffer = NULL; |
| 4915 | - if(bbuffersize > 0) |
| 4916 | - { |
| 4917 | - hipMalloc(&bbuffer, bbuffersize); |
| 4918 | - status = rocfft_execution_info_set_work_buffer(backwardinfo, bbuffer, bbuffersize); |
| 4919 | - assert(status == rocfft_status_success); |
| 4920 | - } |
| 4921 | - |
| 4922 | - // Execute the forward transform |
| 4923 | - status = rocfft_execute(forward, |
| 4924 | - (void**)&x, // in_buffer |
| 4925 | - (void**)&y, // out_buffer |
| 4926 | - forwardinfo); // execution info |
| 4927 | - assert(status == rocfft_status_success); |
| 4928 | - |
| 4929 | - // Copy result back to host |
| 4930 | - std::vector<std::complex<float>> cy(cx.size()); |
| 4931 | - hipMemcpy(cy.data(), y, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost); |
| 4932 | - |
| 4933 | - std::cout << "Transformed:\n"; |
| 4934 | - for(size_t i = 0; i < Nx; i++) |
| 4935 | - { |
| 4936 | - for(size_t j = 0; j < Ny; j++) |
| 4937 | - { |
| 4938 | - std::cout << cy[i * Ny + j] << " "; |
| 4939 | - } |
| 4940 | - std::cout << "\n"; |
| 4941 | - } |
| 4942 | - std::cout << "\n"; |
| 4943 | - |
| 4944 | - // Execute the backward transform |
| 4945 | - status = rocfft_execute(backward, |
| 4946 | - (void**)&y, // in_buffer |
| 4947 | - (void**)&x, // out_buffer |
| 4948 | - backwardinfo); // execution info |
| 4949 | - assert(status == rocfft_status_success); |
| 4950 | - |
| 4951 | - hipMemcpy(cy.data(), x, cy.size() * sizeof(decltype(cy)::value_type), hipMemcpyDeviceToHost); |
| 4952 | - std::cout << "Transformed back:\n"; |
| 4953 | - for(size_t i = 0; i < Nx; i++) |
| 4954 | - { |
| 4955 | - for(size_t j = 0; j < Ny; j++) |
| 4956 | - { |
| 4957 | - std::cout << cy[i * Ny + j] << " "; |
| 4958 | - } |
| 4959 | - std::cout << "\n"; |
| 4960 | - } |
| 4961 | - std::cout << "\n"; |
| 4962 | - |
| 4963 | - const float overN = 1.0f / cx.size(); |
| 4964 | - float error = 0.0f; |
| 4965 | - for(size_t i = 0; i < cx.size(); i++) |
| 4966 | - { |
| 4967 | - float diff = std::max(std::abs(cx[i].real() - cy[i].real() * overN), |
| 4968 | - std::abs(cx[i].imag() - cy[i].imag() * overN)); |
| 4969 | - if(diff > error) |
| 4970 | - { |
| 4971 | - error = diff; |
| 4972 | - } |
| 4973 | - } |
| 4974 | - std::cout << "Maximum error: " << error << "\n"; |
| 4975 | - |
| 4976 | - hipFree(x); |
| 4977 | - if(!inplace) |
| 4978 | - { |
| 4979 | - hipFree(y); |
| 4980 | - } |
| 4981 | - hipFree(fbuffer); |
| 4982 | - hipFree(bbuffer); |
| 4983 | - |
| 4984 | - // Destroy plans |
| 4985 | - rocfft_plan_destroy(forward); |
| 4986 | - rocfft_plan_destroy(backward); |
| 4987 | - |
| 4988 | - rocfft_cleanup(); |
| 4989 | -} |
| 4990 | diff --git a/docs/samples/complex_3d.cpp b/docs/samples/complex_3d.cpp |
| 4991 | deleted file mode 100644 |
| 4992 | index b547d7e..0000000 |
| 4993 | --- a/docs/samples/complex_3d.cpp |
| 4994 | +++ /dev/null |
| 4995 | @@ -1,218 +0,0 @@ |
| 4996 | -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. |
| 4997 | -// |
| 4998 | -// Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4999 | -// of this software and associated documentation files (the "Software"), to deal |
| 5000 | -// in the Software without restriction, including without limitation the rights |

Build in ppa: https://launchpad.net/~bruno-bdmoura/+archive/ubuntu/lp-2139240-rocfft
(~proposed and target archs)