Merge ~bullwinkle-team/ubuntu/+source/hipcub:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/hipcub:ubuntu/devel
- Git
- lp:~bullwinkle-team/ubuntu/+source/hipcub
- bullwinkle/llvm-21/ubuntu/devel
- Merge into ubuntu/devel
| Status: | Merged | ||||
|---|---|---|---|---|---|
| Approved by: | Andreas Hasenack | ||||
| Approved revision: | a79958c79245f1e5eed0d43cec34ec1c57e4b67d | ||||
| Merged at revision: | a79958c79245f1e5eed0d43cec34ec1c57e4b67d | ||||
| Proposed branch: | ~bullwinkle-team/ubuntu/+source/hipcub:bullwinkle/llvm-21/ubuntu/devel | ||||
| Merge into: | ubuntu/+source/hipcub:ubuntu/devel | ||||
| Diff against target: |
38063 lines (+18301/-6495) 314 files modified
.github/CODEOWNERS (+1/-1) .gitignore (+1/-1) .gitlab-ci.yml (+167/-17) CHANGELOG.md (+101/-0) CMakeLists.txt (+114/-47) LICENSE.txt (+2/-1) README.md (+119/-59) benchmark/CMakeLists.txt (+3/-2) benchmark/benchmark_block_adjacent_difference.cpp (+3/-4) benchmark/benchmark_block_discontinuity.cpp (+5/-6) benchmark/benchmark_block_exchange.cpp (+3/-3) benchmark/benchmark_block_histogram.cpp (+1/-1) benchmark/benchmark_block_merge_sort.cpp (+3/-3) benchmark/benchmark_block_radix_rank.cpp (+4/-4) benchmark/benchmark_block_radix_sort.cpp (+3/-3) benchmark/benchmark_block_reduce.cpp (+2/-2) benchmark/benchmark_block_run_length_decode.cpp (+3/-3) benchmark/benchmark_block_scan.cpp (+1/-1) benchmark/benchmark_block_shuffle.cpp (+1/-1) benchmark/benchmark_device_batch_copy.cpp (+4/-4) benchmark/benchmark_device_batch_memcpy.cpp (+5/-4) benchmark/benchmark_device_for.cpp (+4/-4) benchmark/benchmark_device_histogram.cpp (+2/-2) benchmark/benchmark_device_memory.cpp (+43/-33) benchmark/benchmark_device_merge.cpp (+364/-0) benchmark/benchmark_device_merge_sort.cpp (+2/-2) benchmark/benchmark_device_partition.cpp (+1/-1) benchmark/benchmark_device_radix_sort.cpp (+1/-1) benchmark/benchmark_device_reduce.cpp (+5/-2) benchmark/benchmark_device_reduce_by_key.cpp (+1/-1) benchmark/benchmark_device_run_length_encode.cpp (+1/-1) benchmark/benchmark_device_scan.cpp (+1/-1) benchmark/benchmark_device_segmented_radix_sort.cpp (+26/-26) benchmark/benchmark_device_segmented_reduce.cpp (+1/-1) benchmark/benchmark_device_segmented_sort.cpp (+22/-22) benchmark/benchmark_device_select.cpp (+7/-8) benchmark/benchmark_device_spmv.cpp (+7/-1) benchmark/benchmark_utils.hpp (+12/-4) benchmark/benchmark_warp_exchange.cpp (+1/-1) benchmark/benchmark_warp_load.cpp (+1/-1) benchmark/benchmark_warp_merge_sort.cpp (+4/-4) benchmark/benchmark_warp_reduce.cpp (+1/-1) benchmark/benchmark_warp_scan.cpp (+70/-52) benchmark/benchmark_warp_store.cpp (+1/-1) benchmark/common_benchmark_header.hpp (+9/-9) 
cmake/Dependencies.cmake (+242/-32) cmake/GenerateResourceSpec.cmake (+8/-7) cmake/ROCmCMakeBuildToolsDependency.cmake (+0/-1) cmake/SetupNVCC.cmake (+1/-1) cmake/Summary.cmake (+51/-2) debian/changelog (+25/-0) debian/control (+7/-6) debian/patches/0001-install-cmake-config-to-share.patch (+4/-4) debian/patches/series (+0/-1) debian/rules (+6/-0) debian/tests/control (+1/-1) dev/null (+0/-88) docs/conf.py (+5/-1) docs/doxygen/Doxyfile (+471/-206) docs/index.rst (+1/-1) docs/install/hipCUB-install-on-Windows.rst (+2/-1) docs/install/hipCUB-install-overview.rst (+10/-1) docs/install/hipCUB-install-with-cmake.rst (+2/-1) docs/install/hipCUB-prerequisites.rst (+3/-3) docs/sphinx/requirements.in (+1/-1) docs/sphinx/requirements.txt (+157/-7) examples/CMakeLists.txt (+1/-1) examples/block/example_block_radix_sort.cu (+5/-5) examples/block/example_block_reduce.cu (+4/-4) examples/block/example_block_scan.cu (+7/-6) examples/device/example_device_partition_flagged.cpp (+5/-5) examples/device/example_device_partition_if.cpp (+4/-4) examples/device/example_device_radix_sort.cpp (+1/-1) examples/device/example_device_reduce.cpp (+3/-3) examples/device/example_device_scan.cpp (+3/-3) examples/device/example_device_select_flagged.cpp (+5/-5) examples/device/example_device_select_if.cpp (+4/-4) examples/device/example_device_select_unique.cpp (+4/-4) examples/device/example_device_sort_find_non_trivial_runs.cpp (+7/-7) examples/example_utils.hpp (+1/-1) examples/mersenne.h (+3/-3) hipcub/CMakeLists.txt (+0/-20) hipcub/include/hipcub/agent/single_pass_scan_operators.hpp (+30/-0) hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp (+101/-0) hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_copy.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_for.hpp (+130/-79) hipcub/include/hipcub/backend/cub/device/device_histogram.hpp (+2/-2) 
hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_merge.hpp (+107/-0) hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_partition.hpp (+77/-68) hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_reduce.hpp (+70/-14) hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_scan.hpp (+103/-58) hipcub/include/hipcub/backend/cub/device/device_segmented_radix_sort.hpp (+2/-3) hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_select.hpp (+2/-2) hipcub/include/hipcub/backend/cub/device/device_spmv.hpp (+91/-79) hipcub/include/hipcub/backend/cub/device/device_transform.hpp (+206/-0) hipcub/include/hipcub/backend/cub/grid/grid_barrier.hpp (+7/-8) hipcub/include/hipcub/backend/cub/hipcub.hpp (+49/-46) hipcub/include/hipcub/backend/cub/iterator/tex_obj_input_iterator.hpp (+2/-2) hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp (+42/-0) hipcub/include/hipcub/backend/cub/util_allocator.hpp (+2/-2) hipcub/include/hipcub/backend/cub/util_device.hpp (+2/-2) hipcub/include/hipcub/backend/cub/util_macro.hpp (+14/-8) hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp (+17/-16) hipcub/include/hipcub/backend/rocprim/agent/single_pass_scan_operators.hpp (+484/-0) hipcub/include/hipcub/backend/rocprim/block/block_adjacent_difference.hpp (+2/-120) hipcub/include/hipcub/backend/rocprim/block/block_discontinuity.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/block_exchange.hpp (+3/-3) hipcub/include/hipcub/backend/rocprim/block/block_histogram.hpp (+3/-3) hipcub/include/hipcub/backend/rocprim/block/block_load.hpp (+3/-3) 
hipcub/include/hipcub/backend/rocprim/block/block_load_func.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp (+14/-12) hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp (+4/-4) hipcub/include/hipcub/backend/rocprim/block/block_raking_layout.hpp (+3/-4) hipcub/include/hipcub/backend/rocprim/block/block_reduce.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp (+4/-3) hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp (+28/-2) hipcub/include/hipcub/backend/rocprim/block/block_shuffle.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/block_store.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/block_store_func.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp (+11/-11) hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_for.hpp (+272/-107) hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp (+9/-3) hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp (+109/-0) hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp (+77/-68) hipcub/include/hipcub/backend/rocprim/device/device_radix_sort.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp (+83/-9) hipcub/include/hipcub/backend/rocprim/device/device_run_length_encode.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp (+146/-114) hipcub/include/hipcub/backend/rocprim/device/device_segmented_radix_sort.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp (+194/-186) 
hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_select.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp (+44/-36) hipcub/include/hipcub/backend/rocprim/device/device_transform.hpp (+249/-0) hipcub/include/hipcub/backend/rocprim/grid/grid_barrier.hpp (+19/-12) hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp (+19/-17) hipcub/include/hipcub/backend/rocprim/grid/grid_queue.hpp (+4/-11) hipcub/include/hipcub/backend/rocprim/hipcub.hpp (+4/-2) hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_output_iterator.hpp (+1/-3) hipcub/include/hipcub/backend/rocprim/iterator/constant_input_iterator.hpp (+5/-3) hipcub/include/hipcub/backend/rocprim/iterator/counting_input_iterator.hpp (+5/-3) hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp (+13/-4) hipcub/include/hipcub/backend/rocprim/iterator/iterator_wrapper.hpp (+3/-3) hipcub/include/hipcub/backend/rocprim/iterator/tex_obj_input_iterator.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/iterator/transform_input_iterator.hpp (+6/-4) hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp (+55/-83) hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp (+335/-30) hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp (+142/-38) hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp (+153/-136) hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp (+6/-5) hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp (+83/-78) hipcub/include/hipcub/backend/rocprim/tuple.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/util_allocator.hpp (+13/-14) hipcub/include/hipcub/backend/rocprim/util_macro.hpp (+6/-0) hipcub/include/hipcub/backend/rocprim/util_mdspan.hpp (+230/-0) hipcub/include/hipcub/backend/rocprim/util_ptx.hpp (+112/-104) 
hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp (+26/-10) hipcub/include/hipcub/backend/rocprim/util_type.hpp (+293/-101) hipcub/include/hipcub/backend/rocprim/warp/specializations/warp_exchange_shfl.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/warp/specializations/warp_exchange_smem.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/warp/warp_exchange.hpp (+3/-3) hipcub/include/hipcub/backend/rocprim/warp/warp_load.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/warp/warp_merge_sort.hpp (+13/-12) hipcub/include/hipcub/backend/rocprim/warp/warp_reduce.hpp (+2/-2) hipcub/include/hipcub/backend/rocprim/warp/warp_scan.hpp (+22/-2) hipcub/include/hipcub/backend/rocprim/warp/warp_store.hpp (+2/-2) hipcub/include/hipcub/block/block_adjacent_difference.hpp (+3/-3) hipcub/include/hipcub/block/block_discontinuity.hpp (+3/-3) hipcub/include/hipcub/block/block_exchange.hpp (+3/-3) hipcub/include/hipcub/block/block_histogram.hpp (+3/-3) hipcub/include/hipcub/block/block_load.hpp (+3/-3) hipcub/include/hipcub/block/block_merge_sort.hpp (+3/-3) hipcub/include/hipcub/block/block_radix_rank.hpp (+6/-6) hipcub/include/hipcub/block/block_radix_sort.hpp (+3/-3) hipcub/include/hipcub/block/block_raking_layout.hpp (+3/-3) hipcub/include/hipcub/block/block_reduce.hpp (+3/-3) hipcub/include/hipcub/block/block_run_length_decode.hpp (+3/-3) hipcub/include/hipcub/block/block_scan.hpp (+3/-3) hipcub/include/hipcub/block/block_shuffle.hpp (+3/-3) hipcub/include/hipcub/block/block_store.hpp (+3/-3) hipcub/include/hipcub/block/radix_rank_sort_operations.hpp (+3/-3) hipcub/include/hipcub/config.hpp (+40/-5) hipcub/include/hipcub/device/device_adjacent_difference.hpp (+3/-3) hipcub/include/hipcub/device/device_copy.hpp (+3/-3) hipcub/include/hipcub/device/device_for.hpp (+3/-3) hipcub/include/hipcub/device/device_histogram.hpp (+3/-3) hipcub/include/hipcub/device/device_memcpy.hpp (+3/-3) hipcub/include/hipcub/device/device_merge.hpp (+7/-9) 
hipcub/include/hipcub/device/device_merge_sort.hpp (+3/-3) hipcub/include/hipcub/device/device_partition.hpp (+3/-3) hipcub/include/hipcub/device/device_radix_sort.hpp (+3/-3) hipcub/include/hipcub/device/device_reduce.hpp (+3/-3) hipcub/include/hipcub/device/device_run_length_encode.hpp (+3/-3) hipcub/include/hipcub/device/device_scan.hpp (+3/-3) hipcub/include/hipcub/device/device_segmented_radix_sort.hpp (+3/-3) hipcub/include/hipcub/device/device_segmented_reduce.hpp (+3/-3) hipcub/include/hipcub/device/device_segmented_sort.hpp (+3/-3) hipcub/include/hipcub/device/device_select.hpp (+3/-3) hipcub/include/hipcub/device/device_spmv.hpp (+3/-3) hipcub/include/hipcub/device/device_transform.hpp (+38/-0) hipcub/include/hipcub/grid/grid_barrier.hpp (+10/-10) hipcub/include/hipcub/grid/grid_even_share.hpp (+10/-10) hipcub/include/hipcub/grid/grid_mapping.hpp (+10/-10) hipcub/include/hipcub/grid/grid_queue.hpp (+10/-10) hipcub/include/hipcub/hipcub.hpp (+4/-6) hipcub/include/hipcub/hipcub_version.hpp.in (+3/-3) hipcub/include/hipcub/iterator/arg_index_input_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/cache_modified_input_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/cache_modified_output_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/constant_input_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/counting_input_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/discard_output_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/tex_obj_input_iterator.hpp (+3/-3) hipcub/include/hipcub/iterator/transform_input_iterator.hpp (+3/-3) hipcub/include/hipcub/thread/thread_load.hpp (+3/-3) hipcub/include/hipcub/thread/thread_operators.hpp (+3/-4) hipcub/include/hipcub/thread/thread_reduce.hpp (+3/-3) hipcub/include/hipcub/thread/thread_scan.hpp (+3/-3) hipcub/include/hipcub/thread/thread_search.hpp (+3/-3) hipcub/include/hipcub/thread/thread_sort.hpp (+3/-3) hipcub/include/hipcub/thread/thread_store.hpp (+3/-3) hipcub/include/hipcub/tuple.hpp (+3/-3) 
hipcub/include/hipcub/util_allocator.hpp (+3/-3) hipcub/include/hipcub/util_device.hpp (+3/-3) hipcub/include/hipcub/util_macro.hpp (+3/-3) hipcub/include/hipcub/util_ptx.hpp (+3/-3) hipcub/include/hipcub/util_temporary_storage.hpp (+3/-3) hipcub/include/hipcub/util_type.hpp (+3/-3) hipcub/include/hipcub/warp/warp_exchange.hpp (+3/-3) hipcub/include/hipcub/warp/warp_load.hpp (+3/-3) hipcub/include/hipcub/warp/warp_merge_sort.hpp (+3/-3) hipcub/include/hipcub/warp/warp_reduce.hpp (+3/-3) hipcub/include/hipcub/warp/warp_scan.hpp (+3/-3) hipcub/include/hipcub/warp/warp_store.hpp (+3/-3) rmake.py (+5/-1) test/CMakeLists.txt (+6/-2) test/extra/CMakeLists.txt (+14/-29) test/extra/test_hipcub_package.cpp (+11/-8) test/hipcub/CMakeLists.txt (+23/-3) test/hipcub/bfloat16.hpp (+12/-1) test/hipcub/common_test_header.hpp (+42/-24) test/hipcub/experimental/sparse_matrix.hpp (+19/-14) test/hipcub/half.hpp (+16/-6) test/hipcub/single_index_iterator.hpp (+1/-1) test/hipcub/test_hipcub_block_adjacent_difference.cpp (+23/-588) test/hipcub/test_hipcub_block_discontinuity.cpp (+186/-190) test/hipcub/test_hipcub_block_exchange.cpp (+1374/-258) test/hipcub/test_hipcub_block_histogram.cpp (+28/-29) test/hipcub/test_hipcub_block_load_store.cpp (+1/-1) test/hipcub/test_hipcub_block_load_store.hpp (+5/-3) test/hipcub/test_hipcub_block_load_store.kernels.hpp (+24/-24) test/hipcub/test_hipcub_block_merge_sort.cpp (+783/-126) test/hipcub/test_hipcub_block_radix_rank.cpp (+519/-48) test/hipcub/test_hipcub_block_radix_sort.cpp (+290/-198) test/hipcub/test_hipcub_block_reduce.cpp (+458/-143) test/hipcub/test_hipcub_block_run_length_decode.cpp (+89/-113) test/hipcub/test_hipcub_block_scan.cpp (+2047/-237) test/hipcub/test_hipcub_block_shuffle.cpp (+342/-177) test/hipcub/test_hipcub_caching_device_allocator.cpp (+4/-2) test/hipcub/test_hipcub_device_adjacent_difference.cpp (+199/-124) test/hipcub/test_hipcub_device_copy.cpp (+11/-8) test/hipcub/test_hipcub_device_for.cpp (+887/-77) 
test/hipcub/test_hipcub_device_histogram.cpp (+82/-124) test/hipcub/test_hipcub_device_memcpy.cpp (+11/-8) test/hipcub/test_hipcub_device_merge.cpp (+567/-0) test/hipcub/test_hipcub_device_merge_sort.cpp (+61/-124) test/hipcub/test_hipcub_device_partition.cpp (+461/-52) test/hipcub/test_hipcub_device_radix_sort.hpp (+33/-69) test/hipcub/test_hipcub_device_reduce.cpp (+484/-106) test/hipcub/test_hipcub_device_reduce_by_key.cpp (+10/-20) test/hipcub/test_hipcub_device_run_length_encode.cpp (+79/-18) test/hipcub/test_hipcub_device_scan.cpp (+240/-102) test/hipcub/test_hipcub_device_segmented_radix_sort.hpp (+1/-1) test/hipcub/test_hipcub_device_segmented_reduce.cpp (+210/-84) test/hipcub/test_hipcub_device_segmented_sort.hpp (+1/-1) test/hipcub/test_hipcub_device_select.cpp (+50/-100) test/hipcub/test_hipcub_device_spmv.cpp (+19/-19) test/hipcub/test_hipcub_device_transform.cpp (+284/-0) test/hipcub/test_hipcub_grid.cpp (+29/-11) test/hipcub/test_hipcub_iterators.cpp (+38/-48) test/hipcub/test_hipcub_no_half_operators.cpp (+33/-0) test/hipcub/test_hipcub_single_pass_scan_operators.cpp (+370/-0) test/hipcub/test_hipcub_thread.cpp (+272/-52) test/hipcub/test_hipcub_thread_operators.cpp (+17/-21) test/hipcub/test_hipcub_thread_sort.cpp (+7/-8) test/hipcub/test_hipcub_util_device.cpp (+9/-5) test/hipcub/test_hipcub_util_ptx.cpp (+21/-22) test/hipcub/test_hipcub_vector.cpp (+17/-18) test/hipcub/test_hipcub_warp_exchange.cpp (+1/-1) test/hipcub/test_hipcub_warp_load.cpp (+1/-1) test/hipcub/test_hipcub_warp_merge_sort.cpp (+6/-7) test/hipcub/test_hipcub_warp_reduce.cpp (+4/-5) test/hipcub/test_hipcub_warp_scan.cpp (+361/-5) test/hipcub/test_hipcub_warp_store.cpp (+1/-1) test/hipcub/test_utils.hpp (+52/-15) test/hipcub/test_utils_assertions.hpp (+16/-0) test/hipcub/test_utils_data_generation.hpp (+4/-8) test/hipcub/test_utils_hipgraphs.hpp (+58/-47) test/hipcub/test_utils_sort_comparator.hpp (+17/-11) test/hipcub/test_utils_thread_operators.hpp (+22/-95) 
toolchain-windows.cmake (+1/-1) |
||||
| Related bugs: |
|
| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Andreas Hasenack | Approve | ||
| Ubuntu Sponsors | Pending | ||
|
Review via email:
|
|||
Commit message
Description of the change
Update to new upstream version 7.1.0
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
| Andreas Hasenack (ahasenack) wrote : | # |
Questions inline (I hope LP doesn't crash)
| Andreas Hasenack (ahasenack) wrote : | # |
LP crashed:
```
Launchpad encountered an internal error during the following operation: emailing a code review comment. It was logged with id OOPS-54cc5a00e1a774048fc490b8026e60b1. Sorry for the inconvenience.
```
You will have to check the GUI here to see the diff comments.
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
Related to the first question:
You're right, I initially misunderstood the `all` label on the Architecture field. It is now rolled back.
Now, related to the second question:
As can be seen below, the reverse dependencies of libhipcub-tests are all packages within the ROCm stack itself. Since the ROCm stack is planned to drop the ppc64el architecture for packages belonging to dependency layers 1 and 2, this won't interfere with any other packages in the archive.
```
$ apt rdepends libhipcub-tests
libhipcub-tests
Reverse Depends:
Depends: rocm-tests
```
| Andreas Hasenack (ahasenack) wrote : | # |
Sponsored:
Uploading hipcub_
Uploading hipcub_
Uploading hipcub_
Uploading hipcub_
Uploading hipcub_
Preview Diff
| 1 | diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS |
| 2 | index 89abd08..63d89af 100755 |
| 3 | --- a/.github/CODEOWNERS |
| 4 | +++ b/.github/CODEOWNERS |
| 5 | @@ -1,4 +1,4 @@ |
| 6 | -* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13 |
| 7 | +* @stanleytsang-amd @umfranzw @RobsonRLemos |
| 8 | # Documentation files |
| 9 | docs/* @ROCm/rocm-documentation |
| 10 | *.md @ROCm/rocm-documentation |
| 11 | diff --git a/.gitignore b/.gitignore |
| 12 | index a09b2d6..16d3be3 100644 |
| 13 | --- a/.gitignore |
| 14 | +++ b/.gitignore |
| 15 | @@ -1,6 +1,6 @@ |
| 16 | |
| 17 | ### Build dirs ### |
| 18 | -build/ |
| 19 | +build*/ |
| 20 | |
| 21 | # Created by https://www.gitignore.io/api/c++,cmake |
| 22 | |
| 23 | diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml |
| 24 | index e8481a9..386ba44 100644 |
| 25 | --- a/.gitlab-ci.yml |
| 26 | +++ b/.gitlab-ci.yml |
| 27 | @@ -1,6 +1,6 @@ |
| 28 | # MIT License |
| 29 | # |
| 30 | -# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 31 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 32 | # |
| 33 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 34 | # of this software and associated documentation files (the "Software"), to deal |
| 35 | @@ -30,6 +30,8 @@ include: |
| 36 | - /deps-format.yaml |
| 37 | - /deps-rocm.yaml |
| 38 | - /deps-nvcc.yaml |
| 39 | + - /deps-vcpkg.yaml |
| 40 | + - /deps-windows.yaml |
| 41 | - /deps-compiler-acceleration.yaml |
| 42 | - /gpus-rocm.yaml |
| 43 | - /gpus-nvcc.yaml |
| 44 | @@ -41,6 +43,10 @@ stages: |
| 45 | - test |
| 46 | - benchmark |
| 47 | |
| 48 | +workflow: |
| 49 | + rules: |
| 50 | + - if: $CI_MERGE_REQUEST_LABELS !~ /CI Skip/ |
| 51 | + |
| 52 | clang-format: |
| 53 | extends: |
| 54 | - .lint:clang-format |
| 55 | @@ -88,7 +94,7 @@ copyright-date: |
| 56 | -D GPU_TARGETS="$GPU_TARGETS" |
| 57 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 58 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 59 | - -D CMAKE_CXX_STANDARD=14 |
| 60 | + -D CMAKE_CXX_STANDARD=17 |
| 61 | -B $CI_PROJECT_DIR/rocPRIM/build |
| 62 | -S $CI_PROJECT_DIR/rocPRIM |
| 63 | - cd $CI_PROJECT_DIR/rocPRIM/build |
| 64 | @@ -113,7 +119,6 @@ build:rocm: |
| 65 | -D BUILD_TEST=ON |
| 66 | -D BUILD_EXAMPLE=ON |
| 67 | -D GPU_TARGETS="$GPU_TARGETS" |
| 68 | - -D GPU_TEST_TARGETS="$GPU_TARGETS" |
| 69 | -D ROCM_SYMLINK_LIBS=OFF |
| 70 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 71 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 72 | @@ -137,7 +142,7 @@ build:rocm: |
| 73 | expire_in: 2 weeks |
| 74 | parallel: |
| 75 | matrix: |
| 76 | - - BUILD_VERSION: [14, 17] |
| 77 | + - BUILD_VERSION: 17 |
| 78 | |
| 79 | build:rocm-benchmark: |
| 80 | extends: |
| 81 | @@ -158,7 +163,7 @@ build:rocm-benchmark: |
| 82 | -D GPU_TARGETS="$GPU_TARGETS" |
| 83 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 84 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 85 | - -D CMAKE_CXX_STANDARD=14 |
| 86 | + -D CMAKE_CXX_STANDARD=17 |
| 87 | -B $CI_PROJECT_DIR/build |
| 88 | -S $CI_PROJECT_DIR |
| 89 | - cmake --build $CI_PROJECT_DIR/build |
| 90 | @@ -170,19 +175,106 @@ build:rocm-benchmark: |
| 91 | - $CI_PROJECT_DIR/build/CMakeCache.txt |
| 92 | expire_in: 2 weeks |
| 93 | |
| 94 | -test:rocm: |
| 95 | +.rocm-windows-rocprim: |
| 96 | + variables: |
| 97 | + ROCPRIM_GIT_BRANCH: "develop_stream" |
| 98 | + script: |
| 99 | + # Install rocPRIM from git |
| 100 | + - $BRANCH_NAME = $env:ROCPRIM_GIT_BRANCH |
| 101 | + - if ($env:CI_COMMIT_BRANCH -eq "develop" -or $env:CI_COMMIT_BRANCH -eq "master") { $BRANCH_NAME = $env:CI_COMMIT_BRANCH } |
| 102 | + - git clone -b "$BRANCH_NAME" --depth 1 https://gitlab-ci-token:${CI_JOB_TOKEN}@${ROCPRIM_GIT_URL} $CI_PROJECT_DIR/rocPRIM |
| 103 | + - cmake |
| 104 | + -G Ninja |
| 105 | + -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" |
| 106 | + -D CMAKE_INSTALL_PREFIX=$CI_PROJECT_DIR/rocPRIM_install |
| 107 | + -D ONLY_INSTALL=ON |
| 108 | + -B $CI_PROJECT_DIR/rocPRIM/build |
| 109 | + -S $CI_PROJECT_DIR/rocPRIM |
| 110 | + - cmake --build $CI_PROJECT_DIR/rocPRIM/build --target install |
| 111 | + |
| 112 | +build:rocm-windows: |
| 113 | + extends: |
| 114 | + - .rocm-windows-rocprim |
| 115 | + - .rules:build |
| 116 | + - .gpus:rocm-windows |
| 117 | + - .deps:rocm-windows |
| 118 | + - .deps:visual-studio-devshell |
| 119 | + stage: build |
| 120 | + needs: [] |
| 121 | + script: |
| 122 | + - !reference [".rocm-windows-rocprim", script] |
| 123 | + - cmake |
| 124 | + -G Ninja |
| 125 | + -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" |
| 126 | + -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-error=pass-failed" |
| 127 | + -D CMAKE_PREFIX_PATH:PATH="$CI_PROJECT_DIR/rocPRIM_install;${env:HIP_PATH}" |
| 128 | + -D CMAKE_BUILD_TYPE=Release |
| 129 | + -D BUILD_TEST=ON |
| 130 | + -D BUILD_EXAMPLE=ON |
| 131 | + -D GPU_TARGETS=$GPU_TARGET |
| 132 | + -D CMAKE_CXX_STANDARD="$BUILD_VERSION" |
| 133 | + -B $CI_PROJECT_DIR/build |
| 134 | + -S $CI_PROJECT_DIR |
| 135 | + - cmake --build $CI_PROJECT_DIR/build |
| 136 | + artifacts: |
| 137 | + paths: |
| 138 | + - $CI_PROJECT_DIR/build/test/hipcub/test_* |
| 139 | + - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake |
| 140 | + - $CI_PROJECT_DIR/build/test/hipcub/CTestTestfile.cmake |
| 141 | + - $CI_PROJECT_DIR/build/gtest/ |
| 142 | + - $CI_PROJECT_DIR/build/CMakeCache.txt |
| 143 | + - $CI_PROJECT_DIR/build/CTestTestfile.cmake |
| 144 | + - $CI_PROJECT_DIR/build/.ninja_log |
| 145 | + expire_in: 1 day |
| 146 | + parallel: |
| 147 | + matrix: |
| 148 | + - BUILD_VERSION: 17 |
| 149 | + |
| 150 | +build:rocm-windows-benchmark: |
| 151 | + extends: |
| 152 | + - .rocm-windows-rocprim |
| 153 | + - .rules:build |
| 154 | + - .gpus:rocm-windows |
| 155 | + - .deps:rocm-windows |
| 156 | + - .deps:visual-studio-devshell |
| 157 | + stage: build |
| 158 | + needs: [] |
| 159 | + script: |
| 160 | + - !reference [".rocm-windows-rocprim", script] |
| 161 | + - cmake |
| 162 | + -G Ninja |
| 163 | + -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" |
| 164 | + -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror" |
| 165 | + -D CMAKE_PREFIX_PATH:PATH="$CI_PROJECT_DIR/rocPRIM_install;${env:HIP_PATH}" |
| 166 | + -D CMAKE_BUILD_TYPE=Release |
| 167 | + -D BUILD_BENCHMARK=ON |
| 168 | + -D GPU_TARGETS=$GPU_TARGET |
| 169 | + -D CMAKE_CXX_STANDARD=17 |
| 170 | + -B $CI_PROJECT_DIR/build |
| 171 | + -S $CI_PROJECT_DIR |
| 172 | + - cmake --build $CI_PROJECT_DIR/build |
| 173 | + artifacts: |
| 174 | + paths: |
| 175 | + - $CI_PROJECT_DIR/build/benchmark/* |
| 176 | + - $CI_PROJECT_DIR/build/deps/googlebenchmark/ |
| 177 | + - $CI_PROJECT_DIR/build/.ninja_log |
| 178 | + - $CI_PROJECT_DIR/build/CMakeCache.txt |
| 179 | + expire_in: 1 day |
| 180 | + |
| 181 | +.test:rocm: |
| 182 | stage: test |
| 183 | needs: |
| 184 | - build:rocm |
| 185 | + tags: |
| 186 | + - rocm |
| 187 | + - $GPU |
| 188 | extends: |
| 189 | - .rocm |
| 190 | - - .gpus:rocm |
| 191 | - - .rules:test |
| 192 | script: |
| 193 | - cd $CI_PROJECT_DIR/build |
| 194 | - cmake |
| 195 | -D CMAKE_PREFIX_PATH=/opt/rocm |
| 196 | - -D CMAKE_CXX_STANDARD=14 |
| 197 | + -D CMAKE_CXX_STANDARD=17 |
| 198 | -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake |
| 199 | - cat ./resources.json |
| 200 | # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. |
| 201 | @@ -191,10 +283,45 @@ test:rocm: |
| 202 | - HSA_ENABLE_SDMA=0 ctest |
| 203 | --output-on-failure |
| 204 | --repeat-until-fail 2 |
| 205 | - --tests-regex "$GPU_TARGET" |
| 206 | --resource-spec-file ./resources.json |
| 207 | --parallel $PARALLEL_JOBS |
| 208 | |
| 209 | +test:rocm-any-gpu: |
| 210 | + variables: |
| 211 | + GPU: "" |
| 212 | + PARALLEL_JOBS: 1 |
| 213 | + extends: |
| 214 | + - .test:rocm |
| 215 | + rules: |
| 216 | + - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/ |
| 217 | + |
| 218 | +test:rocm-label-arch: |
| 219 | + extends: |
| 220 | + - .gpus:rocm |
| 221 | + - .test:rocm |
| 222 | + - .rules:arch-labels |
| 223 | + |
| 224 | +test:rocm-all-gpus: |
| 225 | + variables: |
| 226 | + SHOULD_BE_UNDRAFTED: "true" |
| 227 | + extends: |
| 228 | + - .gpus:rocm |
| 229 | + - .test:rocm |
| 230 | + - .rules:test |
| 231 | + |
| 232 | +test:rocm-windows: |
| 233 | + stage: test |
| 234 | + extends: |
| 235 | + - .deps:rocm-windows |
| 236 | + - .gpus:rocm-gpus-windows |
| 237 | + - .deps:visual-studio-devshell |
| 238 | + - .rules:test |
| 239 | + needs: |
| 240 | + - job: build:rocm-windows |
| 241 | + script: |
| 242 | + - cd $CI_PROJECT_DIR/build |
| 243 | + - ctest --output-on-failure |
| 244 | + |
| 245 | .benchmark: |
| 246 | stage: benchmark |
| 247 | variables: |
| 248 | @@ -238,7 +365,7 @@ benchmark:rocm: |
| 249 | -G Ninja |
| 250 | -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror" |
| 251 | "$GPU_TARGETS_ARG" |
| 252 | - -D CMAKE_CXX_STANDARD=14 |
| 253 | + -D CMAKE_CXX_STANDARD=17 |
| 254 | -S $CI_PROJECT_DIR/test/extra |
| 255 | -B $CI_PROJECT_DIR/build/package_test |
| 256 | - cmake --build $CI_PROJECT_DIR/build/package_test |
| 257 | @@ -259,7 +386,7 @@ benchmark:rocm: |
| 258 | - cmake |
| 259 | -G Ninja |
| 260 | -D BUILD_TEST=OFF |
| 261 | - -D CMAKE_CXX_STANDARD=14 |
| 262 | + -D CMAKE_CXX_STANDARD=17 |
| 263 | -S $CI_PROJECT_DIR |
| 264 | -B $CI_PROJECT_DIR/build_only_install |
| 265 | # Preserve $PATH when sudoing |
| 266 | @@ -322,6 +449,7 @@ build:nvcc: |
| 267 | -D CMAKE_BUILD_TYPE=Release |
| 268 | -D BUILD_TEST=ON |
| 269 | -D BUILD_EXAMPLE=ON |
| 270 | + "$(if [ "$SCHEDULE" == "nightly" ]; then echo "-D BUILD_COMPUTE_SANITIZER=ON"; fi)" |
| 271 | -D NVGPU_TARGETS="$GPU_TARGETS" |
| 272 | -D ROCM_SYMLINK_LIBS=OFF |
| 273 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 274 | @@ -347,7 +475,7 @@ build:nvcc: |
| 275 | expire_in: 2 weeks |
| 276 | parallel: |
| 277 | matrix: |
| 278 | - - BUILD_VERSION: [14, 17] |
| 279 | + - BUILD_VERSION: 17 |
| 280 | |
| 281 | build:nvcc-benchmark: |
| 282 | stage: build |
| 283 | @@ -367,7 +495,7 @@ build:nvcc-benchmark: |
| 284 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 285 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 286 | -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda |
| 287 | - -D CMAKE_CXX_STANDARD=14 |
| 288 | + -D CMAKE_CXX_STANDARD=17 |
| 289 | -B $CI_PROJECT_DIR/build |
| 290 | -S $CI_PROJECT_DIR |
| 291 | - cmake --build $CI_PROJECT_DIR/build |
| 292 | @@ -379,14 +507,15 @@ build:nvcc-benchmark: |
| 293 | - $CI_PROJECT_DIR/build/CMakeCache.txt |
| 294 | expire_in: 2 weeks |
| 295 | |
| 296 | -test:nvcc: |
| 297 | +.test:nvcc: |
| 298 | stage: test |
| 299 | + tags: |
| 300 | + - nvcc |
| 301 | + - $GPU |
| 302 | needs: |
| 303 | - build:nvcc |
| 304 | extends: |
| 305 | - .nvcc |
| 306 | - - .gpus:nvcc |
| 307 | - - .rules:test |
| 308 | before_script: |
| 309 | # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script |
| 310 | - !reference [.nvcc, before_script] |
| 311 | @@ -394,6 +523,23 @@ test:nvcc: |
| 312 | - cd $CI_PROJECT_DIR/build |
| 313 | - ctest --output-on-failure --repeat-until-fail 2 |
| 314 | |
| 315 | +test:nvcc-any-gpu: |
| 316 | + variables: |
| 317 | + GPU: "" |
| 318 | + PARALLEL_JOBS: 1 |
| 319 | + extends: |
| 320 | + - .test:nvcc |
| 321 | + rules: |
| 322 | + - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ |
| 323 | + |
| 324 | +test:nvcc-all-gpus: |
| 325 | + variables: |
| 326 | + SHOULD_BE_UNDRAFTED: "true" |
| 327 | + extends: |
| 328 | + - .gpus:nvcc |
| 329 | + - .test:nvcc |
| 330 | + - .rules:test |
| 331 | + |
| 332 | benchmark:nvcc: |
| 333 | needs: |
| 334 | - build:nvcc-benchmark |
| 335 | @@ -436,6 +582,10 @@ test:doc: |
| 336 | extends: |
| 337 | - .rules:test |
| 338 | - .build:docs |
| 339 | + artifacts: |
| 340 | + paths: |
| 341 | + - $DOCS_DIR/_build/html/ |
| 342 | + expire_in: 2 weeks |
| 343 | |
| 344 | scheduled-check-changes: |
| 345 | extends: .rules:scheduled-check-changes |
| 346 | diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy |
| 347 | deleted file mode 100644 |
| 348 | index 79ebcb6..0000000 |
| 349 | --- a/.jenkins/common.groovy |
| 350 | +++ /dev/null |
| 351 | @@ -1,53 +0,0 @@ |
| 352 | -// This file is for internal AMD use. |
| 353 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 354 | - |
| 355 | -def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true) |
| 356 | -{ |
| 357 | - project.paths.construct_build_prefix() |
| 358 | - |
| 359 | - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' |
| 360 | - String buildTypeDir = debug ? 'debug' : 'release' |
| 361 | - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' |
| 362 | - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library |
| 363 | - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' |
| 364 | - |
| 365 | - def getRocPRIM = auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg) |
| 366 | - |
| 367 | - def command = """#!/usr/bin/env bash |
| 368 | - set -x |
| 369 | - ${getRocPRIM} |
| 370 | - cd ${project.paths.project_build_prefix} |
| 371 | - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} |
| 372 | - ${auxiliary.gfxTargetParser()} |
| 373 | - ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. |
| 374 | - make -j\$(nproc) |
| 375 | - """ |
| 376 | - |
| 377 | - platform.runCommand(this, command) |
| 378 | -} |
| 379 | - |
| 380 | - |
| 381 | -def runTestCommand (platform, project) |
| 382 | -{ |
| 383 | - String sudo = auxiliary.sudo(platform.jenkinsLabel) |
| 384 | - |
| 385 | - def testCommand = "ctest --output-on-failure --verbose --timeout 900" |
| 386 | - def command = """#!/usr/bin/env bash |
| 387 | - set -x |
| 388 | - cd ${project.paths.project_build_prefix} |
| 389 | - cd ${project.testDirectory} |
| 390 | - ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib ${testCommand} |
| 391 | - """ |
| 392 | - |
| 393 | - platform.runCommand(this, command) |
| 394 | -} |
| 395 | - |
| 396 | -def runPackageCommand(platform, project) |
| 397 | -{ |
| 398 | - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") |
| 399 | - |
| 400 | - platform.runCommand(this, packageHelper[0]) |
| 401 | - platform.archiveArtifacts(this, packageHelper[1]) |
| 402 | -} |
| 403 | - |
| 404 | -return this |
| 405 | diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy |
| 406 | deleted file mode 100644 |
| 407 | index 074bc53..0000000 |
| 408 | --- a/.jenkins/precheckin.groovy |
| 409 | +++ /dev/null |
| 410 | @@ -1,84 +0,0 @@ |
| 411 | -#!/usr/bin/env groovy |
| 412 | -// This shared library is available at https://github.com/ROCm/rocJENKINS/ |
| 413 | -@Library('rocJenkins@pong') _ |
| 414 | - |
| 415 | -// This file is for internal AMD use. |
| 416 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 417 | - |
| 418 | -import com.amd.project.* |
| 419 | -import com.amd.docker.* |
| 420 | -import java.nio.file.Path; |
| 421 | - |
| 422 | -def runCI = |
| 423 | -{ |
| 424 | - nodeDetails, jobName-> |
| 425 | - |
| 426 | - def prj = new rocProject('hipCUB', 'PreCheckin') |
| 427 | - prj.timeout.compile = 400 |
| 428 | - // Define test architectures, optional rocm version argument is available |
| 429 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 430 | - |
| 431 | - boolean formatCheck = false |
| 432 | - |
| 433 | - def commonGroovy |
| 434 | - |
| 435 | - def compileCommand = |
| 436 | - { |
| 437 | - platform, project-> |
| 438 | - |
| 439 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 440 | - commonGroovy.runCompileCommand(platform, project, jobName) |
| 441 | - } |
| 442 | - |
| 443 | - def testCommand = |
| 444 | - { |
| 445 | - platform, project-> |
| 446 | - |
| 447 | - commonGroovy.runTestCommand(platform, project) |
| 448 | - } |
| 449 | - |
| 450 | - def packageCommand = |
| 451 | - { |
| 452 | - platform, project-> |
| 453 | - |
| 454 | - commonGroovy.runPackageCommand(platform, project) |
| 455 | - } |
| 456 | - |
| 457 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 458 | -} |
| 459 | - |
| 460 | -ci: { |
| 461 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 462 | - |
| 463 | - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] |
| 464 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 465 | - |
| 466 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] |
| 467 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 468 | - |
| 469 | - auxiliary.registerDependencyBranchParameter(["rocPRIM"]) |
| 470 | - |
| 471 | - propertyList.each |
| 472 | - { |
| 473 | - jobName, property-> |
| 474 | - if (urlJobName == jobName) |
| 475 | - properties(auxiliary.addCommonProperties(property)) |
| 476 | - } |
| 477 | - |
| 478 | - Set seenJobNames = [] |
| 479 | - jobNameList.each |
| 480 | - { |
| 481 | - jobName, nodeDetails-> |
| 482 | - seenJobNames.add(jobName) |
| 483 | - if (urlJobName == jobName) |
| 484 | - runCI(nodeDetails, jobName) |
| 485 | - } |
| 486 | - |
| 487 | - // For url job names that are outside of the standardJobNameSet i.e. compute-rocm-dkms-no-npi-1901 |
| 488 | - if(!seenJobNames.contains(urlJobName)) |
| 489 | - { |
| 490 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 491 | - runCI([ubuntu16:['gfx906']], urlJobName) |
| 492 | - } |
| 493 | -} |
| 494 | - |
| 495 | diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy |
| 496 | deleted file mode 100644 |
| 497 | index 5c0b949..0000000 |
| 498 | --- a/.jenkins/staticanalysis.groovy |
| 499 | +++ /dev/null |
| 500 | @@ -1,46 +0,0 @@ |
| 501 | -#!/usr/bin/env groovy |
| 502 | -// This shared library is available at https://github.com/ROCm/rocJENKINS/ |
| 503 | -@Library('rocJenkins@pong') _ |
| 504 | - |
| 505 | -// This is file for internal AMD use. |
| 506 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 507 | - |
| 508 | -import com.amd.project.* |
| 509 | -import com.amd.docker.* |
| 510 | -import java.nio.file.Path |
| 511 | - |
| 512 | -def runCompileCommand(platform, project, jobName, boolean debug=false) |
| 513 | -{ |
| 514 | - project.paths.construct_build_prefix() |
| 515 | -} |
| 516 | - |
| 517 | -def runCI = |
| 518 | -{ |
| 519 | - nodeDetails, jobName-> |
| 520 | - |
| 521 | - def prj = new rocProject('hipCUB', 'StaticAnalysis') |
| 522 | - |
| 523 | - // Define test architectures, optional rocm version argument is available |
| 524 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 525 | - |
| 526 | - boolean formatCheck = false |
| 527 | - boolean staticAnalysis = true |
| 528 | - |
| 529 | - def compileCommand = |
| 530 | - { |
| 531 | - platform, project-> |
| 532 | - |
| 533 | - runCompileCommand(platform, project, jobName, false) |
| 534 | - } |
| 535 | - |
| 536 | - buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) |
| 537 | -} |
| 538 | - |
| 539 | -ci: { |
| 540 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 541 | - |
| 542 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])])) |
| 543 | - stage(urlJobName) { |
| 544 | - runCI([ubuntu20:['any']], urlJobName) |
| 545 | - } |
| 546 | -} |
| 547 | diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy |
| 548 | deleted file mode 100644 |
| 549 | index 549913d..0000000 |
| 550 | --- a/.jenkins/staticlibrary.groovy |
| 551 | +++ /dev/null |
| 552 | @@ -1,82 +0,0 @@ |
| 553 | -#!/usr/bin/env groovy |
| 554 | -@Library('rocJenkins@pong') _ |
| 555 | -import com.amd.project.* |
| 556 | -import com.amd.docker.* |
| 557 | -import java.nio.file.Path; |
| 558 | - |
| 559 | -def runCI = |
| 560 | -{ |
| 561 | - nodeDetails, jobName-> |
| 562 | - |
| 563 | - def prj = new rocProject('hipCUB', 'Static Library PreCheckin') |
| 564 | - |
| 565 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 566 | - |
| 567 | - def commonGroovy |
| 568 | - |
| 569 | - boolean formatCheck = false |
| 570 | - |
| 571 | - def compileCommand = |
| 572 | - { |
| 573 | - platform, project-> |
| 574 | - |
| 575 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 576 | - commonGroovy.runCompileCommand(platform, project, jobName, false, true) |
| 577 | - } |
| 578 | - |
| 579 | - |
| 580 | - def testCommand = |
| 581 | - { |
| 582 | - platform, project-> |
| 583 | - |
| 584 | - commonGroovy.runTestCommand(platform, project) |
| 585 | - } |
| 586 | - |
| 587 | - def packageCommand = |
| 588 | - { |
| 589 | - platform, project-> |
| 590 | - |
| 591 | - commonGroovy.runPackageCommand(platform, project) |
| 592 | - } |
| 593 | - |
| 594 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 595 | -} |
| 596 | - |
| 597 | -ci: { |
| 598 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 599 | - |
| 600 | - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], |
| 601 | - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], |
| 602 | - "rocm-docker":[]] |
| 603 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 604 | - |
| 605 | - def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), |
| 606 | - "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), |
| 607 | - "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])] |
| 608 | - jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 609 | - |
| 610 | - propertyList.each |
| 611 | - { |
| 612 | - jobName, property-> |
| 613 | - if (urlJobName == jobName) |
| 614 | - properties(auxiliary.addCommonProperties(property)) |
| 615 | - } |
| 616 | - |
| 617 | - jobNameList.each |
| 618 | - { |
| 619 | - jobName, nodeDetails-> |
| 620 | - if (urlJobName == jobName) |
| 621 | - stage(jobName) { |
| 622 | - runCI(nodeDetails, jobName) |
| 623 | - } |
| 624 | - } |
| 625 | - |
| 626 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 627 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 628 | - { |
| 629 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 630 | - stage(urlJobName) { |
| 631 | - runCI([ubuntu16:['gfx906']], urlJobName) |
| 632 | - } |
| 633 | - } |
| 634 | -} |
| 635 | diff --git a/CHANGELOG.md b/CHANGELOG.md |
| 636 | index 5a13e14..a9814bf 100644 |
| 637 | --- a/CHANGELOG.md |
| 638 | +++ b/CHANGELOG.md |
| 639 | @@ -2,6 +2,107 @@ |
| 640 | |
| 641 | Full documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/). |
| 642 | |
| 643 | +## hipCUB-4.1.0 for ROCm 7.1 |
| 644 | + |
| 645 | +### Added |
| 646 | + |
| 647 | +* Exposed Thread-level reduction API `hipcub::ThreadReduce`. |
| 648 | +* Added `::hipcub::extents`, with limited parity to C++23's `std::extents`. Only `static extents` is supported; `dynamic extents` is not. Helper structs have been created to perform computations on `::hipcub::extents` only when the backend is rocPRIM. For the CUDA backend, similar functionality exists. |
| 649 | +* Added `projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_mdspan.hpp` to support `::hipcub::extents`. |
| 650 | +* Added `::hipcub::ForEachInExtents` API. |
| 651 | +* Added `hipcub::DeviceTransform::Transform` and `hipcub::DeviceTransform::TransformStableArgumentAddresses`. |
| 652 | + |
| 653 | +* hipCUB and its dependency rocPRIM have been moved into the new rocm-libraries "monorepo" repository (https://github.com/ROCm/rocm-libraries). This repository contains a number of ROCm libraries that are frequently used together. |
| 654 | + * The repository migration requires a few changes to the way that hipCUB fetches library dependencies. |
| 655 | + * CMake build option `ROCPRIM_FETCH_METHOD` may be set to one of the following: |
| 656 | + * `PACKAGE` - (default) searches for a preinstalled packaged version of the dependency. If it is not found, the build will fall back using option `DOWNLOAD`, below. |
| 657 | + * `DOWNLOAD` - downloads the dependency from the rocm-libraries repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time). |
| 658 | + * `MONOREPO` - this option is intended to be used if you are building hipCUB from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains rocPRIM). When selected, the build will try to find the dependency in the local repository tree. If it cannot be found, the build will attempt to use git to perform a sparse-checkout of rocPRIM. If that also fails, it will fall back to using the `DOWNLOAD` option described above. |
| 659 | + |
| 660 | +* Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from installed `hipCUB` provided by the system. |
| 661 | + |
| 662 | +### Removed |
| 663 | + |
| 664 | +* Removed `TexRefInputIterator`, which was removed from CUB after CCCL's 2.6.0 release. This API should have already been removed, but somehow it remained and was not tested. |
| 665 | +* Deprecated `hipcub::ConstantInputIterator`, use `rocprim::constant_iterator` or `rocthrust::constant_iterator` instead. |
| 666 | +* Deprecated `hipcub::CountingInputIterator`, use `rocprim::counting_iterator` or `rocthrust::counting_iterator` instead. |
| 667 | +* Deprecated `hipcub::DiscardOutputIterator`, use `rocprim::discard_iterator` or `rocthrust::discard_iterator` instead. |
| 668 | +* Deprecated `hipcub::TransformInputIterator`, use `rocprim::transform_iterator` or `rocthrust::transform_iterator` instead. |
| 669 | +* Deprecated `hipcub::AliasTemporaries`, which is considered to be internal API. Moved it to detail namespace. |
| 670 | +* Deprecated almost all functions in `projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp`. |
| 671 | +* Deprecated hipCUB macros: `HIPCUB_MAX`, `HIPCUB_MIN`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST`. |
| 672 | + |
| 673 | +### Changed |
| 674 | + |
| 675 | +* Changed include headers to avoid relative includes that have slipped in. |
| 676 | +* Changed `CUDA_STANDARD` for tests in `test/hipcub`, because C++17 APIs such as `std::exclusive_scan` are used in some tests. `CUDA_STANDARD 14` is still used for `test/extra`. |
| 677 | +* Changed `CCCL_MINIMUM_VERSION` to `2.8.2` to align with CUB. |
| 678 | +* Changed `cmake_minimum_required` from `3.16` to `3.18`, in order to support `CUDA_STANDARD 17` as a valid value. |
| 679 | +* Added support for large num_items in `DeviceScan`, `DevicePartition` and `Reduce::{ArgMin, ArgMax}`. |
| 680 | +* Added tests for large num_items. |
| 681 | +* The previous dependency-related build option `DEPENDENCIES_FORCE_DOWNLOAD` has been renamed `EXTERNAL_DEPS_FORCE_DOWNLOAD` to differentiate it from the new rocPRIM dependency option described above. Its behaviour remains the same - it forces non-ROCm dependencies (Google Benchmark and Google Test) to be downloaded instead of searching for existing installed packages. This option defaults to `OFF`. |
| 682 | + |
| 683 | +### Known issues |
| 684 | + |
| 685 | +* The '__half' template specializations of Simd operators are currently disabled due to possible build issues with PyTorch. |
| 686 | + |
| 687 | +## hipCUB-4.0.0 for ROCm 7.0 |
| 688 | + |
| 689 | +### Added |
| 690 | + |
| 691 | +* Added a new cmake option, `BUILD_OFFLOAD_COMPRESS`. When hipCUB is built with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful in cases where you are compiling for a large number of targets, since this often results in a large binary. Without compression, in some cases, the generated binary may become so large that symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default. |
| 692 | +* Added single pass operators in `agent/single_pass_scan_operators.hpp` which contains the following API: |
| 693 | + * `BlockScanRunningPrefixOp` |
| 694 | + * `ScanTileStatus` |
| 695 | + * `ScanTileState` |
| 696 | + * `ReduceByKeyScanTileState` |
| 697 | + * `TilePrefixCallbackOp` |
| 698 | +* Added gfx950 support. |
| 699 | +* Added an overload of `BlockScan::InclusiveScan` that accepts an initial value to seed the scan. |
| 700 | +* Added an overload of `WarpScan::InclusiveScan` that accepts an initial value to seed the scan. |
| 701 | +* `UnrolledThreadLoad`, `UnrolledCopy`, and `ThreadLoadVolatilePointer` were added to align hipCUB with CUB. |
| 702 | +* `ThreadStoreVolatilePtr` and the `IterateThreadStore` struct were added to align hipCUB with CUB. |
| 703 | +* Added `hipcub::InclusiveScanInit` for CUB parity. |
| 704 | +* Additional Unit Tests for: |
| 705 | + * block_exchange |
| 706 | + * block_merge_sort |
| 707 | + * block_radix_rank |
| 708 | + * block_radix_sort |
| 709 | + * block_reduce |
| 710 | + * block_shuffle |
| 711 | + |
| 712 | +### Removed |
| 713 | + |
| 714 | +* The AMD GPU targets `gfx803` and `gfx900` are no longer built by default. If you would like to build for these architectures, please specify them explicitly in the `AMDGPU_TARGETS` cmake option. |
| 715 | +* Deprecated `hipcub::AsmThreadLoad` is removed, use `hipcub::ThreadLoad` instead. |
| 716 | +* Deprecated `hipcub::AsmThreadStore` is removed, use `hipcub::ThreadStore` instead. |
| 717 | +* Deprecated `BlockAdjacentDifference::FlagHeads`, `BlockAdjacentDifference::FlagTails` and `BlockAdjacentDifference::FlagHeadsAndTails` have been removed. |
| 718 | +* This release removes support for custom builds on gfx940 and gfx941. |
| 719 | +* Removed C++14 support, only C++17 is supported. |
| 720 | + |
| 721 | +### Changed |
| 722 | + |
| 723 | +* The NVIDIA backend now requires CUB, Thrust, and libcu++ 2.7.0. If they aren't found, they will be downloaded from the NVIDIA CCCL repository. |
| 724 | +* Updated `thread_load` and `thread_store` to align hipCUB with CUB. |
| 725 | +* All kernels now have hidden symbol visibility. All symbols now have inline namespaces that include the library version, (for example, hipcub::HIPCUB_300400_NS::symbol instead of hipcub::symbol), letting the user link multiple libraries built with different versions of hipCUB. |
| 726 | +* Modified the broadcast kernel in warp scan benchmarks. The reported performance may be different to previous versions. |
| 727 | +* The `hipcub::detail::accumulator_t` in rocPRIM backend has been changed to utilise `rocprim::accumulator_t`. |
| 728 | +* The usage of `rocprim::invoke_result_binary_op_t` has been replaced with `rocprim::accumulator_t`. |
| 729 | + |
| 730 | +### Resolved issues |
| 731 | +* Fixed an issue where `Sort(keys, compare_op, valid_items, oob_default)` in `block_merge_sort.hpp` would not fill in elements that are out of range (items after `valid_items`) with `oob_default`. |
| 732 | +* Fixed an issue where `ScatterToStripedFlagged` in `block_exchange.hpp` was calling the wrong function. |
| 733 | + |
| 734 | +### Known issues |
| 735 | + |
| 736 | +* `BlockAdjacentDifference::FlagHeads`, `BlockAdjacentDifference::FlagTails` and `BlockAdjacentDifference::FlagHeadsAndTails` have been removed from hipCUB's CUB backend. They were already deprecated as of version 2.12.0 of hipCUB and they were removed from CCCL (CUB) as of CCCL's 2.6.0 release. |
| 737 | +* `BlockScan::InclusiveScan` for the NVIDIA backend does not compute the block aggregate correctly when passing an initial value parameter. This behavior is not matched by the AMD backend. |
| 738 | + |
| 739 | + |
| 740 | +### Upcoming Changes |
| 741 | + |
| 742 | +* `BlockAdjacentDifference::FlagHeads`, `BlockAdjacentDifference::FlagTails` and `BlockAdjacentDifference::FlagHeadsAndTails` were deprecated as of version 2.12.0 of hipCUB, and will be removed from the rocPRIM backend in a future release for the next ROCm major version (ROCm 7.0.0). |
| 743 | + |
| 744 | ## hipCUB-3.4.0 for ROCm 6.4.0 |
| 745 | |
| 746 | ### Added |
| 747 | diff --git a/CMakeLists.txt b/CMakeLists.txt |
| 748 | index dd9a3e2..fbcecf0 100644 |
| 749 | --- a/CMakeLists.txt |
| 750 | +++ b/CMakeLists.txt |
| 751 | @@ -1,6 +1,6 @@ |
| 752 | # MIT License |
| 753 | # |
| 754 | -# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 755 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 756 | # |
| 757 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 758 | # of this software and associated documentation files (the "Software"), to deal |
| 759 | @@ -20,8 +20,19 @@ |
| 760 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 761 | # SOFTWARE. |
| 762 | |
| 763 | -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) |
| 764 | -cmake_policy(VERSION 3.16...3.25) |
| 765 | +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) |
| 766 | +cmake_policy(VERSION 3.18...3.25) |
| 767 | + |
| 768 | +# -------------------------------------- |
| 769 | +# Update these variables at release time |
| 770 | +# |
| 771 | +# Set the library version |
| 772 | +set(VERSION_STRING "4.1.0") |
| 773 | +# Set the minimum required rocPRIM version |
| 774 | +set(MIN_ROCPRIM_PACKAGE_VERSION "4.1.0" CACHE STRING "Minimum version of rocPRIM to search for when ROCPRIM_FETCH_METHOD is set to PACKAGE.") |
| 775 | +# Set download branch for dependency rocPRIM |
| 776 | +set(ROCM_DEP_RELEASE_BRANCH "release/rocm-rel-7.1" CACHE STRING "Download branch for ROCm dependencies") |
| 777 | +# -------------------------------------- |
| 778 | |
| 779 | # Install prefix |
| 780 | set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") |
| 781 | @@ -43,7 +54,7 @@ elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) |
| 782 | endif() |
| 783 | |
| 784 | # Set HIP flags |
| 785 | -set(CMAKE_HIP_STANDARD 14) |
| 786 | +set(CMAKE_HIP_STANDARD 17) |
| 787 | set(CMAKE_HIP_STANDARD_REQUIRED ON) |
| 788 | set(CMAKE_HIP_EXTENSIONS OFF) |
| 789 | |
| 790 | @@ -52,15 +63,23 @@ include(CMakeDependentOption) |
| 791 | |
| 792 | # Build options |
| 793 | option(BUILD_TEST "Build tests (requires googletest)" OFF) |
| 794 | -option(DEPENDENCIES_FORCE_DOWNLOAD "Download dependencies and do not search for packages" OFF) |
| 795 | +option(CODE_COVERAGE "Enable code coverage" OFF) |
| 796 | +option(EXTERNAL_DEPS_FORCE_DOWNLOAD "Download non-ROCm dependencies and do not search for packages" OFF) |
| 797 | option(DOWNLOAD_CUB "Download CUB and thrust. Do not search for CUB package" OFF) |
| 798 | option(BUILD_BENCHMARK "Build benchmarks" OFF) |
| 799 | option(BUILD_EXAMPLE "Build Examples" OFF) |
| 800 | option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) |
| 801 | +option(BUILD_OFFLOAD_COMPRESS "Build hipCUB with offload compression" ON) |
| 802 | +option(BUILD_COMPUTE_SANITIZER "Build tests with cuda's compute sanitizer enabled" OFF) |
| 803 | +cmake_dependent_option(USE_SYSTEM_LIB "Use installed hipCUB when building tests" OFF BUILD_TEST OFF) |
| 804 | |
| 805 | +# Check and test HIP compiler, defines 'CMAKE_HIP_COMPILER' |
| 806 | check_language(HIP) |
| 807 | cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF) |
| 808 | |
| 809 | +# Check and test cuda compiler, defines 'CMAKE_CUDA_COMPILER' |
| 810 | +check_language(CUDA) |
| 811 | + |
| 812 | # Set the ROCM install directory. |
| 813 | if(WIN32) |
| 814 | set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") |
| 815 | @@ -68,15 +87,29 @@ else() |
| 816 | set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") |
| 817 | endif() |
| 818 | |
| 819 | +# Set up options for obtaining dependency rocPRIM. |
| 820 | +# PACKAGE: Search for an install package that contains the dependency. |
| 821 | +# MONOREPO: Assume this is a monorepo checkout and search for the dependency in the directory at ../../projects/. |
| 822 | +# DOWNLOAD: Download the dependency from the monorepo. |
| 823 | +set(FETCH_METHOD_OPTIONS "PACKAGE" "MONOREPO" "DOWNLOAD") |
| 824 | |
| 825 | -# Set the header wrapper ON by default. |
| 826 | -option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF) |
| 827 | +set(ROCPRIM_FETCH_METHOD "PACKAGE" CACHE STRING "How to obtain the rocPRIM dependency") |
| 828 | + |
| 829 | +# This function checks to see if the fetch method variable it's passed is defined, and contains a valid value. |
| 830 | +# If it does not contain a valid value, it issues a fatal failure with an error message. |
| 831 | +function(check_fetch_method method) |
| 832 | + if (DEFINED ${method} AND NOT ${${method}} IN_LIST FETCH_METHOD_OPTIONS) |
| 833 | + message(FATAL_ERROR "Unrecognized ${method}: \"${${method}}\". Valid options are: ${FETCH_METHOD_OPTIONS}.") |
| 834 | + endif() |
| 835 | +endfunction() |
| 836 | + |
| 837 | +check_fetch_method(ROCPRIM_FETCH_METHOD) |
| 838 | |
| 839 | # Add hipCUB's CMake modules |
| 840 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") |
| 841 | |
| 842 | # Set a default build type if none was specified |
| 843 | -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) |
| 844 | +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES AND NOT CODE_COVERAGE) |
| 845 | message(STATUS "Setting build type to 'Release' as none was specified.") |
| 846 | set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) |
| 847 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") |
| 848 | @@ -88,35 +121,55 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker searc |
| 849 | # If hip is included prior to setting that then it defaults to building only for the current architecture |
| 850 | include(ROCmCMakeBuildToolsDependency) |
| 851 | |
| 852 | -# Setup GPU targets for rocm platform |
| 853 | +# Detect compiler through use of result from 'check_language(...)' |
| 854 | if(USE_HIPCXX) |
| 855 | enable_language(HIP) |
| 856 | -else() |
| 857 | +elseif(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")) |
| 858 | + # Detected HIP through archaic match by checking passed CXX compiler. This can |
| 859 | + # be removed once we bump minimum CMake version to 3.21 or higher. |
| 860 | + |
| 861 | # Setup GPU targets for rocm platform |
| 862 | - if(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")) |
| 863 | - if(NOT DEFINED AMDGPU_TARGETS) |
| 864 | - set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") |
| 865 | + message(STATUS "CMake could not derive language via 'check_language'. Falling back to legacy compiler checks.") |
| 866 | + if(NOT DEFINED AMDGPU_TARGETS) |
| 867 | + set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") |
| 868 | + else() |
| 869 | + set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") |
| 870 | + endif() |
| 871 | + set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") |
| 872 | + |
| 873 | + if(GPU_TARGETS STREQUAL "all") |
| 874 | + if(BUILD_ADDRESS_SANITIZER) |
| 875 | + # ASAN builds require xnack |
| 876 | + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS |
| 877 | + TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+" |
| 878 | + ) |
| 879 | else() |
| 880 | - set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") |
| 881 | - endif() |
| 882 | - set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") |
| 883 | - |
| 884 | - if(GPU_TARGETS STREQUAL "all") |
| 885 | - if(BUILD_ADDRESS_SANITIZER) |
| 886 | - # ASAN builds require xnack |
| 887 | - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS |
| 888 | - TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+" |
| 889 | - ) |
| 890 | - else() |
| 891 | - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS |
| 892 | - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" |
| 893 | - ) |
| 894 | - endif() |
| 895 | - set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) |
| 896 | + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS |
| 897 | + TARGETS "gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" |
| 898 | + ) |
| 899 | endif() |
| 900 | + set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) |
| 901 | endif() |
| 902 | +elseif(CMAKE_CUDA_COMPILER) |
| 903 | + # We haven't detected HIP, so surely we must be on a CUDA-compatible compiler. |
| 904 | + # Contrary to HIP, 'enable_language(CUDA)' is supported from CMake 3.8 and higher. |
| 905 | + enable_language(CUDA) |
| 906 | + |
| 907 | + # Hack: let CMake think that our 'hip' files are actually CUDA files. |
| 908 | + set(CMAKE_CUDA_SOURCE_FILE_EXTENSIONS hip;cu) |
| 909 | endif() |
| 910 | |
| 911 | +# Compressed offload binaries are currently not working with the SPIR-V target |
| 912 | +if("amdgcnspirv" IN_LIST GPU_TARGETS) |
| 913 | + if(BUILD_OFFLOAD_COMPRESS) |
| 914 | + message(FATAL_ERROR "Cannot combine SPIR-V and BUILD_OFFLOAD_COMPRESS") |
| 915 | + endif() |
| 916 | +endif() |
| 917 | + |
| 918 | +# Setup the library version |
| 919 | +rocm_setup_version(VERSION ${VERSION_STRING}) |
| 920 | +math(EXPR hipcub_VERSION_NUMBER "${hipcub_VERSION_MAJOR} * 100000 + ${hipcub_VERSION_MINOR} * 100 + ${hipcub_VERSION_PATCH}") |
| 921 | + |
| 922 | # Find and verify HIP. |
| 923 | include(VerifyCompiler) |
| 924 | |
| 925 | @@ -128,23 +181,45 @@ if(BUILD_ADDRESS_SANITIZER) |
| 926 | add_link_options(-fuse-ld=lld) |
| 927 | endif() |
| 928 | |
| 929 | -# Setup VERSION |
| 930 | -set(VERSION_STRING "3.4.0") |
| 931 | -rocm_setup_version(VERSION ${VERSION_STRING}) |
| 932 | +include(CheckCXXCompilerFlag) |
| 933 | |
| 934 | -# Print configuration summary |
| 935 | -include(cmake/Summary.cmake) |
| 936 | -print_configuration_summary() |
| 937 | +if(BUILD_OFFLOAD_COMPRESS) |
| 938 | + # We need to pass '-x hip' since check_cxx_compiler_flag assumes c++ and not HIP. |
| 939 | + check_cxx_compiler_flag("--offload-compress -x hip" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) |
| 940 | + if(CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) |
| 941 | + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --offload-compress") |
| 942 | + else() |
| 943 | + message(STATUS "Warning: BUILD_OFFLOAD_COMPRESS=ON but flag not supported by compiler. Ignoring option.") |
| 944 | + endif() |
| 945 | +endif() |
| 946 | |
| 947 | # hipCUB library |
| 948 | add_subdirectory(hipcub) |
| 949 | |
| 950 | +if(CODE_COVERAGE) |
| 951 | + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -fprofile-instr-generate -fcoverage-mapping") |
| 952 | +endif() |
| 953 | + |
| 954 | if(BUILD_TEST OR (BUILD_BENCHMARK AND NOT ONLY_INSTALL)) |
| 955 | rocm_package_setup_component(clients) |
| 956 | endif() |
| 957 | |
| 958 | # Tests |
| 959 | if(BUILD_TEST) |
| 960 | + if(USE_SYSTEM_LIB) |
| 961 | + # On ROCm hipCUB requires rocPRIM |
| 962 | + if(HIP_COMPILER STREQUAL "clang") |
| 963 | + find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/rocprim") |
| 964 | + if (${rocprim_VERSION} VERSION_LESS ${MIN_ROCPRIM_PACKAGE_VERSION}) |
| 965 | + message(WARNING "The installed rocprim version, ${rocprim_VERSION}, is less than the minimum required version ${MIN_ROCPRIM_PACKAGE_VERSION}. Building tests with USE_SYSTEM_LIB=ON may not work properly.") |
| 966 | + endif() |
| 967 | + endif() |
| 968 | + find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/hipcub") |
| 969 | + if (NOT ${hipcub_VERSION} VERSION_EQUAL ${VERSION_STRING}) |
| 970 | + message(WARNING "The installed hipcub version, ${hipcub_VERSION}, does not match project version ${VERSION_STRING}. Building tests with USE_SYSTEM_LIB=ON may not work properly.") |
| 971 | + endif() |
| 972 | + endif() |
| 973 | + |
| 974 | enable_testing() |
| 975 | rocm_package_setup_client_component(tests) |
| 976 | add_subdirectory(test) |
| 977 | @@ -161,18 +236,6 @@ if(BUILD_BENCHMARK AND NOT ONLY_INSTALL) |
| 978 | add_subdirectory(benchmark) |
| 979 | endif() |
| 980 | |
| 981 | -# Create header wrapper for backward compatibility |
| 982 | -if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) |
| 983 | - rocm_wrap_header_dir( |
| 984 | - ${PROJECT_SOURCE_DIR}/hipcub/include/hipcub/ |
| 985 | - PATTERNS "*.h" |
| 986 | - PATTERN "*.hpp" |
| 987 | - GUARDS SYMLINK WRAPPER |
| 988 | - WRAPPER_LOCATIONS cub/${CMAKE_INSTALL_INCLUDEDIR}/hipcub/ |
| 989 | - OUTPUT_LOCATIONS cub/wrapper/include/hipcub/ |
| 990 | - ) |
| 991 | -endif() |
| 992 | - |
| 993 | # Package |
| 994 | if(HIP_COMPILER STREQUAL "clang") |
| 995 | rocm_package_add_deb_dependencies(DEPENDS "rocprim-dev >= 2.10.1") |
| 996 | @@ -205,3 +268,7 @@ else() |
| 997 | HEADER_ONLY |
| 998 | ) |
| 999 | endif() |
| 1000 | + |
| 1001 | +# Print configuration summary |
| 1002 | +include(cmake/Summary.cmake) |
| 1003 | +print_configuration_summary() |
| 1004 | diff --git a/LICENSE.txt b/LICENSE.txt |
| 1005 | index c284d2b..4bbc4ad 100644 |
| 1006 | --- a/LICENSE.txt |
| 1007 | +++ b/LICENSE.txt |
| 1008 | @@ -1,9 +1,10 @@ |
| 1009 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. |
| 1010 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| 1011 | -Modifications Copyright (c) 2019-2021, Advanced Micro Devices, Inc. All rights reserved. |
| 1012 | +Modifications Copyright (c) 2019-2025, Advanced Micro Devices, Inc. All rights reserved. |
| 1013 | |
| 1014 | Redistribution and use in source and binary forms, with or without |
| 1015 | modification, are permitted provided that the following conditions are met: |
| 1016 | + |
| 1017 | * Redistributions of source code must retain the above copyright |
| 1018 | notice, this list of conditions and the following disclaimer. |
| 1019 | * Redistributions in binary form must reproduce the above copyright |
| 1020 | diff --git a/README.md b/README.md |
| 1021 | index 6a0ad19..c144d1f 100644 |
| 1022 | --- a/README.md |
| 1023 | +++ b/README.md |
| 1024 | @@ -1,56 +1,28 @@ |
| 1025 | # hipCUB |
| 1026 | |
| 1027 | > [!NOTE] |
| 1028 | -> The published documentation is available at [hipCUB](https://rocm.docs.amd.com/projects/hipCUB/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1029 | +> The published hipCUB documentation is available [here](https://rocm.docs.amd.com/projects/hipCUB/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1030 | |
| 1031 | hipCUB is a thin wrapper library on top of |
| 1032 | -[rocPRIM](https://github.com/ROCm/rocPRIM) or |
| 1033 | -[CUB](https://github.com/thrust/cub). You can use it to port a CUB project into |
| 1034 | +[rocPRIM](https://github.com/ROCm/rocm-libraries) or |
| 1035 | +[CUB](https://github.com/nvidia/cccl). You can use it to port a CUB project into |
| 1036 | [HIP](https://github.com/ROCm/HIP) so you can use AMD hardware (and |
| 1037 | [ROCm](https://rocm.docs.amd.com/en/latest/) software). |
| 1038 | |
| 1039 | In the [ROCm](https://rocm.docs.amd.com/en/latest/) |
| 1040 | -environment, hipCUB uses the rocPRIM library as the backend. On CUDA platforms, it uses CUB as the |
| 1041 | -backend. |
| 1042 | - |
| 1043 | -## Documentation |
| 1044 | - |
| 1045 | -Documentation for hipCUB is available at |
| 1046 | -[https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/). |
| 1047 | - |
| 1048 | -To build our documentation locally, run the following code: |
| 1049 | - |
| 1050 | -```shell |
| 1051 | -# Go to the hipCUB docs directory |
| 1052 | -cd hipCUB; cd docs |
| 1053 | - |
| 1054 | -# Install required pip packages |
| 1055 | -python3 -m pip install -r .sphinx/requirements.txt |
| 1056 | - |
| 1057 | -# Build the documentation |
| 1058 | -python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html |
| 1059 | - |
| 1060 | -# For e.g. serve the HTML docs locally |
| 1061 | -cd _build/html |
| 1062 | -python3 -m http.server |
| 1063 | -``` |
| 1064 | +environment, hipCUB uses the rocPRIM library as the backend. |
| 1065 | |
| 1066 | ## Requirements |
| 1067 | |
| 1068 | * Git |
| 1069 | -* CMake (3.16 or later) |
| 1070 | +* CMake (3.18 or later) |
| 1071 | * For AMD GPUs: |
| 1072 | * AMD [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) software (1.8.0 or later) |
| 1073 | * The [HIP-clang](https://github.com/ROCm/HIP/blob/master/INSTALL.md#hip-clang) compiler (you |
| 1074 | must, set this as the C++ compiler for ROCm) |
| 1075 | - * The [rocPRIM](https://github.com/ROCm/rocPRIM) library |
| 1076 | + * The [rocPRIM](https://github.com/ROCm/rocm-libraries) library |
| 1077 | * Automatically downloaded and built by the CMake script |
| 1078 | * Requires CMake 3.16.9 or later |
| 1079 | -* For NVIDIA GPUs: |
| 1080 | - * CUDA Toolkit |
| 1081 | - * CCCL library (>= 2.5.0) |
| 1082 | - * Automatically downloaded and built by the CMake script |
| 1083 | - * Requires CMake 3.15.0 or later |
| 1084 | * Python 3.6 or higher (for HIP on Windows only; this is only required for install scripts) |
| 1085 | * Visual Studio 2019 with Clang support (HIP on Windows only) |
| 1086 | * Strawberry Perl (HIP on Windows only) |
| 1087 | @@ -64,31 +36,53 @@ GoogleTest and Google Benchmark are automatically downloaded and built by the CM |
| 1088 | |
| 1089 | ## Build and install |
| 1090 | |
| 1091 | -To build and install hipCub, run the following code: |
| 1092 | +### Obtaining the source code |
| 1093 | + |
| 1094 | +hipCUB can be cloned in two ways: |
| 1095 | + |
| 1096 | +1. Clone hipCUB along with other ROCm libraries that are frequently used together (note that this may take some time to complete): |
| 1097 | +```sh |
| 1098 | +git clone https://github.com/ROCm/rocm-libraries.git |
| 1099 | +cd rocm-libraries |
| 1100 | +``` |
| 1101 | + |
| 1102 | +2. To clone hipCUB individually (faster, but requires git version 2.25+): |
| 1103 | +```sh |
| 1104 | +git clone --no-checkout --depth=1 --filter=tree:0 https://github.com/ROCm/rocm-libraries.git |
| 1105 | +cd rocm-libraries |
| 1106 | +git sparse-checkout init --cone |
| 1107 | +git sparse-checkout set projects/hipcub |
| 1108 | +git checkout develop |
| 1109 | +``` |
| 1110 | + |
| 1111 | +### Building the library |
| 1112 | |
| 1113 | ```shell |
| 1114 | -git clone https://github.com/ROCm/hipCUB.git |
| 1115 | +# Go to the hipCUB directory. |
| 1116 | +cd projects/hipcub |
| 1117 | |
| 1118 | -# Go to hipCUB directory, create and go to the build directory. |
| 1119 | -cd hipCUB; mkdir build; cd build |
| 1120 | +# Create a directory for the build and navigate to it. |
| 1121 | +mkdir build; cd build |
| 1122 | |
| 1123 | # Configure hipCUB, setup options for your system. |
| 1124 | # Build options: |
| 1125 | -# BUILD_TEST - OFF by default, |
| 1126 | -# BUILD_BENCHMARK - OFF by default. |
| 1127 | -# DEPENDENCIES_FORCE_DOWNLOAD - OFF by default and at ON the dependencies will be downloaded to build folder, |
| 1128 | +# BUILD_TEST - OFF by default, |
| 1129 | +# BUILD_BENCHMARK - OFF by default. |
| 1130 | +# ROCPRIM_FETCH_METHOD - One of PACKAGE (default), DOWNLOAD, and MONOREPO. See below for a description of each. |
| 1131 | +# EXTERNAL_DEPS_FORCE_DOWNLOAD - OFF by default, forces download for non-ROCm dependencies (e.g. Google Test / Benchmark). |
| 1132 | +# DOWNLOAD_CUB - OFF by default, (Nvidia CUB backend only) forces download of CUB instead of searching for an installed package. |
| 1133 | +# BUILD_OFFLOAD_COMPRESS - ON by default, compresses device code to reduce the size of the generated binary. |
| 1134 | +# BUILD_EXAMPLE - OFF by default, builds examples. |
| 1135 | +# BUILD_ADDRESS_SANITIZER - OFF by default, builds with clang address sanitizer enabled. |
| 1136 | +# BUILD_COMPUTE_SANITIZER - OFF by default, (Nvidia CUB backend only) builds tests with CUDA's compute sanitizer enabled. |
| 1137 | +# USE_SYSTEM_LIB - OFF by default, builds tests using the installed hipCUB provided by the system. This only takes effect when BUILD_TEST is ON. |
| 1138 | +# USE_HIPCXX - OFF by default, builds with CMake HIP language support. This eliminates the need to set CXX. |
| 1139 | # |
| 1140 | # ! IMPORTANT ! |
| 1141 | # Set C++ compiler to HIP-aware clang. You can do it by adding 'CXX=<path-to-compiler>' |
| 1142 | # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. |
| 1143 | # |
| 1144 | [CXX=hipcc] cmake ../. # or cmake-gui ../. |
| 1145 | - |
| 1146 | -# To configure hipCUB for Nvidia platforms, 'CXX=<path-to-nvcc>', `CXX=nvcc` or omitting the flag |
| 1147 | -# entirely before 'cmake' is sufficient |
| 1148 | -[CXX=nvcc] cmake -DBUILD_TEST=ON ../. # or cmake-gui ../. |
| 1149 | -# or |
| 1150 | -cmake -DBUILD_TEST=ON ../. # or cmake-gui ../. |
| 1151 | # or to build benchmarks |
| 1152 | cmake -DBUILD_BENCHMARK=ON ../. |
| 1153 | |
| 1154 | @@ -105,20 +99,31 @@ make package |
| 1155 | [sudo] make install |
| 1156 | ``` |
| 1157 | |
| 1158 | +`ROCPRIM_FETCH_METHOD` can be used to control how hipCUB obtains the rocPRIM dependency. It must be set to one of the following values: |
| 1159 | +* `PACKAGE` (default) - Searches for an installed package on the system that meets the minimum version requirement. If it is not found, the build will fall back to using the `DOWNLOAD` option. |
| 1160 | +* `DOWNLOAD` - Clones rocPRIM from the upstream repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time). |
| 1161 | +* `MONOREPO` - This value is intended to be used if you are building hipCUB from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains rocPRIM). When selected, the build will try to find the dependency in the local repository tree. If it cannot be found, the build will attempt to use git to perform a sparse-checkout of rocPRIM. If that also fails, it will fall back to using the `DOWNLOAD` option described above. |
| 1162 | + |
| 1163 | ### HIP on Windows |
| 1164 | |
| 1165 | Initial support for HIP on Windows is available. You can install it using the provided `rmake.py` Python |
| 1166 | -script: |
| 1167 | +script. To do this, first, clone hipCUB using the steps described in [obtaining the source code](#obtaining-the-source-code). |
| 1168 | +Next: |
| 1169 | |
| 1170 | ```shell |
| 1171 | -git clone https://github.com/ROCm/hipCUB.git |
| 1172 | -cd hipCUB |
| 1173 | +cd projects/hipcub |
| 1174 | |
| 1175 | # the -i option will install rocPRIM to C:\hipSDK by default |
| 1176 | python rmake.py -i |
| 1177 | |
| 1178 | # the -c option will build all clients including unit tests |
| 1179 | python rmake.py -c |
| 1180 | + |
| 1181 | +# to build for a specific architecture only, use the -a option |
| 1182 | +python rmake.py -ci -a gfx1100 |
| 1183 | + |
| 1184 | +# for a full list of available options, please refer to the help documentation |
| 1185 | +python rmake.py -h |
| 1186 | ``` |
| 1187 | |
| 1188 | ### Using hipCUB |
| 1189 | @@ -127,15 +132,13 @@ To use hipCUB in a CMake project, we recommended using the package configuration |
| 1190 | |
| 1191 | ```cmake |
| 1192 | # On ROCm hipCUB requires rocPRIM |
| 1193 | -find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") |
| 1194 | +find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/rocprim") |
| 1195 | |
| 1196 | # "/opt/rocm" - default install prefix |
| 1197 | -find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/hipcub") |
| 1198 | +find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/hipcub") |
| 1199 | |
| 1200 | ... |
| 1201 | # On ROCm: includes hipCUB headers and roc::rocprim_hip target |
| 1202 | -# On CUDA: includes only hipCUB headers, user has to include CUB directory |
| 1203 | -target_link_libraries(<your_target> hip::hipcub) |
| 1204 | ``` |
| 1205 | |
| 1206 | Include only the main header file: |
| 1207 | @@ -150,7 +153,7 @@ Depending on your current HIP platform, hipCUB includes CUB or rocPRIM headers. |
| 1208 | |
| 1209 | ```shell |
| 1210 | # Go to hipCUB build directory |
| 1211 | -cd hipCUB; cd build |
| 1212 | +cd projects/hipcub; cd build |
| 1213 | |
| 1214 | # To run all tests |
| 1215 | ctest |
| 1216 | @@ -161,7 +164,7 @@ ctest |
| 1217 | |
| 1218 | ### Using custom seeds for the tests |
| 1219 | |
| 1220 | -Go to the `hipCUB/test/hipcub/test_seed.hpp` file. |
| 1221 | +Go to the `projects/hipcub/test/hipcub/test_seed.hpp` file. |
| 1222 | |
| 1223 | ```cpp |
| 1224 | //(1) |
| 1225 | @@ -190,7 +193,7 @@ static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); |
| 1226 | |
| 1227 | ```shell |
| 1228 | # Go to hipCUB build directory |
| 1229 | -cd hipCUB; cd build |
| 1230 | +cd projects/hipcub; cd build |
| 1231 | |
| 1232 | # To run benchmark for warp functions: |
| 1233 | # Further option can be found using --help |
| 1234 | @@ -208,10 +211,67 @@ cd hipCUB; cd build |
| 1235 | ./benchmark/benchmark_device_<function_name> [--size <size>] [--trials <trials>] |
| 1236 | ``` |
| 1237 | |
| 1238 | +## Building the documentation locally |
| 1239 | + |
| 1240 | +### Requirements |
| 1241 | + |
| 1242 | +#### Doxygen |
| 1243 | + |
| 1244 | +The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues. |
| 1245 | + |
| 1246 | +After you have downloaded Doxygen version 1.9.4: |
| 1247 | + |
| 1248 | +```shell |
| 1249 | +# Add doxygen to your PATH |
| 1250 | +echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc |
| 1251 | + |
| 1252 | +# Apply the updated .bashrc |
| 1253 | +source ~/.bashrc |
| 1254 | + |
| 1255 | +# Confirm that you are using version 1.9.4 |
| 1256 | +doxygen --version |
| 1257 | +``` |
| 1258 | + |
| 1259 | +#### Python |
| 1260 | + |
| 1261 | +The build system uses Python version 3.10. You can try using a newer version, but that might cause issues. |
| 1262 | + |
| 1263 | +You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation): |
| 1264 | + |
| 1265 | +```shell |
| 1266 | +# Install Python 3.10 |
| 1267 | +pyenv install 3.10 |
| 1268 | + |
| 1269 | +# Create a Python 3.10 virtual environment |
| 1270 | +pyenv virtualenv 3.10 venv_hipcub |
| 1271 | + |
| 1272 | +# Activate the virtual environment |
| 1273 | +pyenv activate venv_hipcub |
| 1274 | +``` |
| 1275 | + |
| 1276 | +### Building |
| 1277 | + |
| 1278 | +After cloning this repository (see [obtaining the source code](#obtaining-the-source-code)): |
| 1279 | + |
| 1280 | +```shell |
| 1281 | +cd rocm-libraries/projects/hipcub |
| 1282 | + |
| 1283 | +# Install Python dependencies |
| 1284 | +python3 -m pip install -r docs/sphinx/requirements.txt |
| 1285 | + |
| 1286 | +# Build the documentation |
| 1287 | +python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html |
| 1288 | +``` |
| 1289 | + |
| 1290 | +You can then open `docs/_build/html/index.html` in your browser to view the documentation. |
| 1291 | + |
| 1292 | ## Support |
| 1293 | |
| 1294 | -Bugs and feature requests can be reported through the |
| 1295 | -[GitHub issue tracker](https://github.com/ROCm/hipCUB/issues). |
| 1296 | +You can report bugs and feature requests through the GitHub |
| 1297 | +[issue tracker](https://github.com/ROCm/rocm-libraries/issues). |
| 1298 | +To help ensure that your issue is seen by the right team more quickly, when creating your issue, please apply the label `project: hipcub`. |
| 1299 | +Similarly, to filter the existing issue list down to only those affecting hipCUB, you can add the filter `label:"project: hipcub"`, |
| 1300 | +or follow [this link](https://github.com/ROCm/rocm-libraries/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22project%3A%20hipcub%22). |
| 1301 | |
| 1302 | ## Contributing |
| 1303 | |
| 1304 | diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt |
| 1305 | index 4a38bac..3e8f663 100644 |
| 1306 | --- a/benchmark/CMakeLists.txt |
| 1307 | +++ b/benchmark/CMakeLists.txt |
| 1308 | @@ -1,6 +1,6 @@ |
| 1309 | # MIT License |
| 1310 | # |
| 1311 | -# Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1312 | +# Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1313 | # |
| 1314 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1315 | # of this software and associated documentation files (the "Software"), to deal |
| 1316 | @@ -36,7 +36,7 @@ function(add_hipcub_benchmark BENCHMARK_SOURCE) |
| 1317 | hipcub |
| 1318 | ) |
| 1319 | if((HIP_COMPILER STREQUAL "nvcc")) |
| 1320 | - set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 14) |
| 1321 | + set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 17) |
| 1322 | set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE CUDA) |
| 1323 | target_compile_options(${BENCHMARK_TARGET} |
| 1324 | PRIVATE |
| 1325 | @@ -88,6 +88,7 @@ add_hipcub_benchmark(benchmark_device_batch_memcpy.cpp) |
| 1326 | add_hipcub_benchmark(benchmark_device_for.cpp) |
| 1327 | add_hipcub_benchmark(benchmark_device_histogram.cpp) |
| 1328 | add_hipcub_benchmark(benchmark_device_memory.cpp) |
| 1329 | +add_hipcub_benchmark(benchmark_device_merge.cpp) |
| 1330 | add_hipcub_benchmark(benchmark_device_merge_sort.cpp) |
| 1331 | add_hipcub_benchmark(benchmark_device_partition.cpp) |
| 1332 | add_hipcub_benchmark(benchmark_device_radix_sort.cpp) |
| 1333 | diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp |
| 1334 | index 1cc7079..7c7ac6b 100644 |
| 1335 | --- a/benchmark/benchmark_block_adjacent_difference.cpp |
| 1336 | +++ b/benchmark/benchmark_block_adjacent_difference.cpp |
| 1337 | @@ -23,10 +23,9 @@ |
| 1338 | #include "common_benchmark_header.hpp" |
| 1339 | |
| 1340 | // HIP API |
| 1341 | -#include "hipcub/block/block_adjacent_difference.hpp" |
| 1342 | - |
| 1343 | -#include "hipcub/block/block_load.hpp" |
| 1344 | -#include "hipcub/block/block_store.hpp" |
| 1345 | +#include <hipcub/block/block_adjacent_difference.hpp> |
| 1346 | +#include <hipcub/block/block_load.hpp> |
| 1347 | +#include <hipcub/block/block_store.hpp> |
| 1348 | |
| 1349 | #ifndef DEFAULT_N |
| 1350 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 1351 | diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp |
| 1352 | index 24446c9..5e36160 100644 |
| 1353 | --- a/benchmark/benchmark_block_discontinuity.cpp |
| 1354 | +++ b/benchmark/benchmark_block_discontinuity.cpp |
| 1355 | @@ -20,14 +20,13 @@ |
| 1356 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 1357 | // SOFTWARE. |
| 1358 | |
| 1359 | -#include "common_benchmark_header.hpp" |
| 1360 | - |
| 1361 | // HIP API |
| 1362 | -#include "hipcub/block/block_discontinuity.hpp" |
| 1363 | +#include <hipcub/block/block_discontinuity.hpp> |
| 1364 | +#include <hipcub/block/block_load.hpp> |
| 1365 | +#include <hipcub/block/block_store.hpp> |
| 1366 | +#include <hipcub/thread/thread_operators.hpp> //to use hipcub::Equality |
| 1367 | |
| 1368 | -#include "hipcub/block/block_load.hpp" |
| 1369 | -#include "hipcub/block/block_store.hpp" |
| 1370 | -#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality |
| 1371 | +#include "common_benchmark_header.hpp" |
| 1372 | |
| 1373 | #ifndef DEFAULT_N |
| 1374 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 1375 | diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp |
| 1376 | index a36d041..000cd41 100644 |
| 1377 | --- a/benchmark/benchmark_block_exchange.cpp |
| 1378 | +++ b/benchmark/benchmark_block_exchange.cpp |
| 1379 | @@ -23,9 +23,9 @@ |
| 1380 | #include "common_benchmark_header.hpp" |
| 1381 | |
| 1382 | // HIP API |
| 1383 | -#include "hipcub/block/block_exchange.hpp" |
| 1384 | -#include "hipcub/block/block_load.hpp" |
| 1385 | -#include "hipcub/block/block_store.hpp" |
| 1386 | +#include <hipcub/block/block_exchange.hpp> |
| 1387 | +#include <hipcub/block/block_load.hpp> |
| 1388 | +#include <hipcub/block/block_store.hpp> |
| 1389 | |
| 1390 | #ifndef DEFAULT_N |
| 1391 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1392 | diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp |
| 1393 | index 122ccc3..1206e04 100644 |
| 1394 | --- a/benchmark/benchmark_block_histogram.cpp |
| 1395 | +++ b/benchmark/benchmark_block_histogram.cpp |
| 1396 | @@ -23,7 +23,7 @@ |
| 1397 | #include "common_benchmark_header.hpp" |
| 1398 | |
| 1399 | // HIP API |
| 1400 | -#include "hipcub/block/block_histogram.hpp" |
| 1401 | +#include <hipcub/block/block_histogram.hpp> |
| 1402 | |
| 1403 | #ifndef DEFAULT_N |
| 1404 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 1405 | diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp |
| 1406 | index 8316764..c8c7402 100644 |
| 1407 | --- a/benchmark/benchmark_block_merge_sort.cpp |
| 1408 | +++ b/benchmark/benchmark_block_merge_sort.cpp |
| 1409 | @@ -24,9 +24,9 @@ |
| 1410 | |
| 1411 | #include "../test/hipcub/test_utils_sort_comparator.hpp" |
| 1412 | // HIP API |
| 1413 | -#include "hipcub/block/block_load.hpp" |
| 1414 | -#include "hipcub/block/block_merge_sort.hpp" |
| 1415 | -#include "hipcub/block/block_store.hpp" |
| 1416 | +#include <hipcub/block/block_load.hpp> |
| 1417 | +#include <hipcub/block/block_merge_sort.hpp> |
| 1418 | +#include <hipcub/block/block_store.hpp> |
| 1419 | |
| 1420 | #ifndef DEFAULT_N |
| 1421 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 1422 | diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp |
| 1423 | index 2dac0a5..ffcd1d7 100644 |
| 1424 | --- a/benchmark/benchmark_block_radix_rank.cpp |
| 1425 | +++ b/benchmark/benchmark_block_radix_rank.cpp |
| 1426 | @@ -23,11 +23,11 @@ |
| 1427 | #include "common_benchmark_header.hpp" |
| 1428 | |
| 1429 | // HIP API |
| 1430 | -#include "hipcub/block/block_load.hpp" |
| 1431 | -#include "hipcub/block/block_radix_rank.hpp" |
| 1432 | -#include "hipcub/block/block_store.hpp" |
| 1433 | +#include <hipcub/block/block_load.hpp> |
| 1434 | +#include <hipcub/block/block_radix_rank.hpp> |
| 1435 | +#include <hipcub/block/block_store.hpp> |
| 1436 | |
| 1437 | -#include "hipcub/block/radix_rank_sort_operations.hpp" |
| 1438 | +#include <hipcub/block/radix_rank_sort_operations.hpp> |
| 1439 | |
| 1440 | #ifndef DEFAULT_N |
| 1441 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 1442 | diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp |
| 1443 | index 0bae7b8..4b75c26 100644 |
| 1444 | --- a/benchmark/benchmark_block_radix_sort.cpp |
| 1445 | +++ b/benchmark/benchmark_block_radix_sort.cpp |
| 1446 | @@ -23,9 +23,9 @@ |
| 1447 | #include "common_benchmark_header.hpp" |
| 1448 | |
| 1449 | // HIP API |
| 1450 | -#include "hipcub/block/block_load.hpp" |
| 1451 | -#include "hipcub/block/block_radix_sort.hpp" |
| 1452 | -#include "hipcub/block/block_store.hpp" |
| 1453 | +#include <hipcub/block/block_load.hpp> |
| 1454 | +#include <hipcub/block/block_radix_sort.hpp> |
| 1455 | +#include <hipcub/block/block_store.hpp> |
| 1456 | |
| 1457 | #ifndef DEFAULT_N |
| 1458 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 1459 | diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp |
| 1460 | index bdb089e..fe4b815 100644 |
| 1461 | --- a/benchmark/benchmark_block_reduce.cpp |
| 1462 | +++ b/benchmark/benchmark_block_reduce.cpp |
| 1463 | @@ -23,8 +23,8 @@ |
| 1464 | #include "common_benchmark_header.hpp" |
| 1465 | |
| 1466 | // HIP API |
| 1467 | -#include "hipcub/block/block_reduce.hpp" |
| 1468 | -#include "hipcub/thread/thread_operators.hpp" |
| 1469 | +#include <hipcub/block/block_reduce.hpp> |
| 1470 | +#include <hipcub/thread/thread_operators.hpp> |
| 1471 | |
| 1472 | #ifndef DEFAULT_N |
| 1473 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1474 | diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp |
| 1475 | index 6769fd4..a42d3c4 100644 |
| 1476 | --- a/benchmark/benchmark_block_run_length_decode.cpp |
| 1477 | +++ b/benchmark/benchmark_block_run_length_decode.cpp |
| 1478 | @@ -22,9 +22,9 @@ |
| 1479 | |
| 1480 | #include "common_benchmark_header.hpp" |
| 1481 | |
| 1482 | -#include "hipcub/block/block_load.hpp" |
| 1483 | -#include "hipcub/block/block_run_length_decode.hpp" |
| 1484 | -#include "hipcub/block/block_store.hpp" |
| 1485 | +#include <hipcub/block/block_load.hpp> |
| 1486 | +#include <hipcub/block/block_run_length_decode.hpp> |
| 1487 | +#include <hipcub/block/block_store.hpp> |
| 1488 | |
| 1489 | #ifndef DEFAULT_N |
| 1490 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1491 | diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp |
| 1492 | index 340d3b4..51bf6c6 100644 |
| 1493 | --- a/benchmark/benchmark_block_scan.cpp |
| 1494 | +++ b/benchmark/benchmark_block_scan.cpp |
| 1495 | @@ -23,7 +23,7 @@ |
| 1496 | #include "common_benchmark_header.hpp" |
| 1497 | |
| 1498 | // hipCUB API |
| 1499 | -#include "hipcub/block/block_scan.hpp" |
| 1500 | +#include <hipcub/block/block_scan.hpp> |
| 1501 | |
| 1502 | #ifndef DEFAULT_N |
| 1503 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1504 | diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp |
| 1505 | index 4ba9fb0..697d381 100644 |
| 1506 | --- a/benchmark/benchmark_block_shuffle.cpp |
| 1507 | +++ b/benchmark/benchmark_block_shuffle.cpp |
| 1508 | @@ -22,7 +22,7 @@ |
| 1509 | |
| 1510 | #include "common_benchmark_header.hpp" |
| 1511 | |
| 1512 | -#include "hipcub/block/block_shuffle.hpp" |
| 1513 | +#include <hipcub/block/block_shuffle.hpp> |
| 1514 | |
| 1515 | #ifndef DEFAULT_N |
| 1516 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1517 | diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp |
| 1518 | index feca312..909c50b 100644 |
| 1519 | --- a/benchmark/benchmark_device_batch_copy.cpp |
| 1520 | +++ b/benchmark/benchmark_device_batch_copy.cpp |
| 1521 | @@ -24,10 +24,10 @@ |
| 1522 | #include "cmdparser.hpp" |
| 1523 | #include "common_benchmark_header.hpp" |
| 1524 | |
| 1525 | -#include "hipcub/block/block_load.hpp" |
| 1526 | -#include "hipcub/block/block_store.hpp" |
| 1527 | -#include "hipcub/device/device_copy.hpp" |
| 1528 | -#include "hipcub/hipcub.hpp" |
| 1529 | +#include <hipcub/block/block_load.hpp> |
| 1530 | +#include <hipcub/block/block_store.hpp> |
| 1531 | +#include <hipcub/device/device_copy.hpp> |
| 1532 | +#include <hipcub/hipcub.hpp> |
| 1533 | |
| 1534 | #include <hip/hip_runtime.h> |
| 1535 | |
| 1536 | diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp |
| 1537 | index f0f38be..8de4f8e 100644 |
| 1538 | --- a/benchmark/benchmark_device_batch_memcpy.cpp |
| 1539 | +++ b/benchmark/benchmark_device_batch_memcpy.cpp |
| 1540 | @@ -24,10 +24,10 @@ |
| 1541 | #include "cmdparser.hpp" |
| 1542 | #include "common_benchmark_header.hpp" |
| 1543 | |
| 1544 | -#include "hipcub/block/block_load.hpp" |
| 1545 | -#include "hipcub/block/block_store.hpp" |
| 1546 | -#include "hipcub/device/device_memcpy.hpp" |
| 1547 | -#include "hipcub/hipcub.hpp" |
| 1548 | +#include <hipcub/block/block_load.hpp> |
| 1549 | +#include <hipcub/block/block_store.hpp> |
| 1550 | +#include <hipcub/device/device_memcpy.hpp> |
| 1551 | +#include <hipcub/hipcub.hpp> |
| 1552 | |
| 1553 | #ifdef __HIP_PLATFORM_AMD__ |
| 1554 | // Only include this on AMD as it contains specialized config information |
| 1555 | @@ -37,6 +37,7 @@ |
| 1556 | #include <hip/hip_runtime.h> |
| 1557 | |
| 1558 | #include <iostream> |
| 1559 | +#include <memory> |
| 1560 | #include <numeric> |
| 1561 | #include <random> |
| 1562 | #include <utility> |
| 1563 | diff --git a/benchmark/benchmark_device_for.cpp b/benchmark/benchmark_device_for.cpp |
| 1564 | index a5669ed..4b7a7d7 100644 |
| 1565 | --- a/benchmark/benchmark_device_for.cpp |
| 1566 | +++ b/benchmark/benchmark_device_for.cpp |
| 1567 | @@ -1,6 +1,6 @@ |
| 1568 | // MIT License |
| 1569 | // |
| 1570 | -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1571 | +// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1572 | // |
| 1573 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1574 | // of this software and associated documentation files (the "Software"), to deal |
| 1575 | @@ -26,7 +26,7 @@ |
| 1576 | #include "common_benchmark_header.hpp" |
| 1577 | |
| 1578 | // HIP API |
| 1579 | -#include "hipcub/device/device_for.hpp" |
| 1580 | +#include <hipcub/device/device_for.hpp> |
| 1581 | |
| 1582 | #ifndef DEFAULT_N |
| 1583 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1584 | @@ -71,7 +71,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) |
| 1585 | // Warm-up |
| 1586 | for(size_t i = 0; i < warmup_size; i++) |
| 1587 | { |
| 1588 | - HIP_CHECK(hipcub::ForEach(d_input, d_input + size, device_op, stream)); |
| 1589 | + HIP_CHECK(hipcub::DeviceFor::ForEach(d_input, d_input + size, device_op, stream)); |
| 1590 | } |
| 1591 | HIP_CHECK(hipDeviceSynchronize()); |
| 1592 | |
| 1593 | @@ -81,7 +81,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) |
| 1594 | |
| 1595 | for(size_t i = 0; i < batch_size; i++) |
| 1596 | { |
| 1597 | - HIP_CHECK(hipcub::ForEach(d_input, d_input + size, device_op, stream)); |
| 1598 | + HIP_CHECK(hipcub::DeviceFor::ForEach(d_input, d_input + size, device_op, stream)); |
| 1599 | } |
| 1600 | HIP_CHECK(hipStreamSynchronize(stream)); |
| 1601 | |
| 1602 | diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp |
| 1603 | index a5019e4..ded31e2 100644 |
| 1604 | --- a/benchmark/benchmark_device_histogram.cpp |
| 1605 | +++ b/benchmark/benchmark_device_histogram.cpp |
| 1606 | @@ -29,8 +29,8 @@ |
| 1607 | #include "common_benchmark_header.hpp" |
| 1608 | |
| 1609 | // HIP API |
| 1610 | -#include "hipcub/device/device_histogram.hpp" |
| 1611 | -#include "hipcub/iterator/transform_input_iterator.hpp" |
| 1612 | +#include <hipcub/device/device_histogram.hpp> |
| 1613 | +#include <hipcub/iterator/transform_input_iterator.hpp> |
| 1614 | |
| 1615 | #ifndef DEFAULT_N |
| 1616 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1617 | diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp |
| 1618 | index bb256ac..1e62167 100644 |
| 1619 | --- a/benchmark/benchmark_device_memory.cpp |
| 1620 | +++ b/benchmark/benchmark_device_memory.cpp |
| 1621 | @@ -1,6 +1,6 @@ |
| 1622 | // MIT License |
| 1623 | // |
| 1624 | -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1625 | +// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1626 | // |
| 1627 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1628 | // of this software and associated documentation files (the "Software"), to deal |
| 1629 | @@ -22,9 +22,9 @@ |
| 1630 | |
| 1631 | #include "common_benchmark_header.hpp" |
| 1632 | |
| 1633 | -#include "hipcub/block/block_load.hpp" |
| 1634 | -#include "hipcub/block/block_scan.hpp" |
| 1635 | -#include "hipcub/block/block_store.hpp" |
| 1636 | +#include <hipcub/block/block_load.hpp> |
| 1637 | +#include <hipcub/block/block_scan.hpp> |
| 1638 | +#include <hipcub/block/block_store.hpp> |
| 1639 | |
| 1640 | enum memory_operation_method |
| 1641 | { |
| 1642 | @@ -58,9 +58,10 @@ struct operation; |
| 1643 | template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize> |
| 1644 | struct operation<no_operation, T, ItemsPerThread, BlockSize> |
| 1645 | { |
| 1646 | - typedef empty_storage_type storage_type; |
| 1647 | + using storage_type = empty_storage_type; |
| 1648 | |
| 1649 | - HIPCUB_DEVICE inline void |
| 1650 | + HIPCUB_DEVICE |
| 1651 | + inline void |
| 1652 | operator()(storage_type& /*storage*/, T (&)[ItemsPerThread], T* = nullptr) const |
| 1653 | {} |
| 1654 | }; |
| 1655 | @@ -69,11 +70,13 @@ struct operation<no_operation, T, ItemsPerThread, BlockSize> |
| 1656 | template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize> |
| 1657 | struct operation<custom_operation, T, ItemsPerThread, BlockSize> |
| 1658 | { |
| 1659 | - typedef empty_storage_type storage_type; |
| 1660 | + using storage_type = empty_storage_type; |
| 1661 | |
| 1662 | - HIPCUB_DEVICE inline void operator()(storage_type& storage, |
| 1663 | - T (&input)[ItemsPerThread], |
| 1664 | - T* global_mem_output = nullptr) const |
| 1665 | + HIPCUB_DEVICE |
| 1666 | + inline void |
| 1667 | + operator()(storage_type& storage, |
| 1668 | + T (&input)[ItemsPerThread], |
| 1669 | + T* global_mem_output = nullptr) const |
| 1670 | { |
| 1671 | (void)storage; |
| 1672 | (void)global_mem_output; |
| 1673 | @@ -96,14 +99,15 @@ struct operation<custom_operation, T, ItemsPerThread, BlockSize> |
| 1674 | template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize> |
| 1675 | struct operation<block_scan, T, ItemsPerThread, BlockSize> |
| 1676 | { |
| 1677 | - typedef |
| 1678 | - typename hipcub::BlockScan<T, BlockSize, hipcub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS> |
| 1679 | - block_scan_type; |
| 1680 | - typedef typename block_scan_type::TempStorage storage_type; |
| 1681 | - |
| 1682 | - HIPCUB_DEVICE inline void operator()(storage_type& storage, |
| 1683 | - T (&input)[ItemsPerThread], |
| 1684 | - T* global_mem_output = nullptr) |
| 1685 | + using block_scan_type = |
| 1686 | + typename hipcub::BlockScan<T, BlockSize, hipcub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS>; |
| 1687 | + using storage_type = typename block_scan_type::TempStorage; |
| 1688 | + |
| 1689 | + HIPCUB_DEVICE |
| 1690 | + inline void |
| 1691 | + operator()(storage_type& storage, |
| 1692 | + T (&input)[ItemsPerThread], |
| 1693 | + T* global_mem_output = nullptr) |
| 1694 | { |
| 1695 | (void)global_mem_output; |
| 1696 | |
| 1697 | @@ -117,11 +121,13 @@ struct operation<block_scan, T, ItemsPerThread, BlockSize> |
| 1698 | template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize> |
| 1699 | struct operation<atomics_no_collision, T, ItemsPerThread, BlockSize> |
| 1700 | { |
| 1701 | - typedef empty_storage_type storage_type; |
| 1702 | + using storage_type = empty_storage_type; |
| 1703 | |
| 1704 | - HIPCUB_DEVICE inline void operator()(storage_type& storage, |
| 1705 | - T (&input)[ItemsPerThread], |
| 1706 | - T* global_mem_output = nullptr) |
| 1707 | + HIPCUB_DEVICE |
| 1708 | + inline void |
| 1709 | + operator()(storage_type& storage, |
| 1710 | + T (&input)[ItemsPerThread], |
| 1711 | + T* global_mem_output = nullptr) |
| 1712 | { |
| 1713 | (void)storage; |
| 1714 | (void)input; |
| 1715 | @@ -140,11 +146,13 @@ struct operation<atomics_no_collision, T, ItemsPerThread, BlockSize> |
| 1716 | template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize> |
| 1717 | struct operation<atomics_inter_warp_collision, T, ItemsPerThread, BlockSize> |
| 1718 | { |
| 1719 | - typedef empty_storage_type storage_type; |
| 1720 | + using storage_type = empty_storage_type; |
| 1721 | |
| 1722 | - HIPCUB_DEVICE inline void operator()(storage_type& storage, |
| 1723 | - T (&input)[ItemsPerThread], |
| 1724 | - T* global_mem_output = nullptr) |
| 1725 | + HIPCUB_DEVICE |
| 1726 | + inline void |
| 1727 | + operator()(storage_type& storage, |
| 1728 | + T (&input)[ItemsPerThread], |
| 1729 | + T* global_mem_output = nullptr) |
| 1730 | { |
| 1731 | (void)storage; |
| 1732 | (void)input; |
| 1733 | @@ -163,11 +171,13 @@ struct operation<atomics_inter_warp_collision, T, ItemsPerThread, BlockSize> |
| 1734 | template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize> |
| 1735 | struct operation<atomics_inter_block_collision, T, ItemsPerThread, BlockSize> |
| 1736 | { |
| 1737 | - typedef empty_storage_type storage_type; |
| 1738 | + using storage_type = empty_storage_type; |
| 1739 | |
| 1740 | - HIPCUB_DEVICE inline void operator()(storage_type& storage, |
| 1741 | - T (&input)[ItemsPerThread], |
| 1742 | - T* global_mem_output = nullptr) |
| 1743 | + HIPCUB_DEVICE |
| 1744 | + inline void |
| 1745 | + operator()(storage_type& storage, |
| 1746 | + T (&input)[ItemsPerThread], |
| 1747 | + T* global_mem_output = nullptr) |
| 1748 | { |
| 1749 | (void)storage; |
| 1750 | (void)input; |
| 1751 | @@ -237,9 +247,9 @@ template<typename T, |
| 1752 | typename CustomOp> |
| 1753 | __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) |
| 1754 | { |
| 1755 | - typedef memory_operation<MemOp> mem_op; |
| 1756 | - typedef hipcub::BlockLoad<T, BlockSize, ItemsPerThread, mem_op::load_type> load_type; |
| 1757 | - typedef hipcub::BlockStore<T, BlockSize, ItemsPerThread, mem_op::store_type> store_type; |
| 1758 | + using mem_op = memory_operation<MemOp>; |
| 1759 | + using load_type = hipcub::BlockLoad<T, BlockSize, ItemsPerThread, mem_op::load_type>; |
| 1760 | + using store_type = hipcub::BlockStore<T, BlockSize, ItemsPerThread, mem_op::store_type>; |
| 1761 | |
| 1762 | __shared__ union |
| 1763 | { |
| 1764 | diff --git a/benchmark/benchmark_device_merge.cpp b/benchmark/benchmark_device_merge.cpp |
| 1765 | new file mode 100644 |
| 1766 | index 0000000..e22d6ea |
| 1767 | --- /dev/null |
| 1768 | +++ b/benchmark/benchmark_device_merge.cpp |
| 1769 | @@ -0,0 +1,364 @@ |
| 1770 | +// MIT License |
| 1771 | +// |
| 1772 | +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1773 | +// |
| 1774 | +// Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1775 | +// of this software and associated documentation files (the "Software"), to deal |
| 1776 | +// in the Software without restriction, including without limitation the rights |
| 1777 | +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 1778 | +// copies of the Software, and to permit persons to whom the Software is |
| 1779 | +// furnished to do so, subject to the following conditions: |
| 1780 | +// |
| 1781 | +// The above copyright notice and this permission notice shall be included in |
| 1782 | +// all copies or substantial portions of the Software. |
| 1783 | +// |
| 1784 | +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 1785 | +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 1786 | +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 1787 | +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 1788 | +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 1789 | +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 1790 | +// SOFTWARE. |
| 1791 | + |
| 1792 | +#include "common_benchmark_header.hpp" |
| 1793 | + |
| 1794 | +// HIP API |
| 1795 | +#include <hipcub/device/device_merge.hpp> |
| 1796 | + |
| 1797 | +#ifndef DEFAULT_N |
| 1798 | +const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 1799 | +#endif |
| 1800 | + |
| 1801 | +const unsigned int batch_size = 10; |
| 1802 | +const unsigned int warmup_size = 5; |
| 1803 | + |
| 1804 | +template<class key_type> |
| 1805 | +struct CompareFunction |
| 1806 | +{ |
| 1807 | + HIPCUB_HOST_DEVICE |
| 1808 | + inline constexpr bool |
| 1809 | + operator()(const key_type& a, const key_type& b) |
| 1810 | + { |
| 1811 | + return a < b; |
| 1812 | + } |
| 1813 | +}; |
| 1814 | + |
| 1815 | +template<class Key> |
| 1816 | +void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) |
| 1817 | +{ |
| 1818 | + using key_type = Key; |
| 1819 | + |
| 1820 | + CompareFunction<key_type> compare_function; |
| 1821 | + |
| 1822 | + const size_t size1 = size / 2; |
| 1823 | + const size_t size2 = size - size1; |
| 1824 | + |
| 1825 | + std::vector<key_type> keys_input1 = benchmark_utils::get_random_data<key_type>( |
| 1826 | + size1, |
| 1827 | + benchmark_utils::generate_limits<key_type>::min(), |
| 1828 | + benchmark_utils::generate_limits<key_type>::max()); |
| 1829 | + |
| 1830 | + std::vector<key_type> keys_input2 = benchmark_utils::get_random_data<key_type>( |
| 1831 | + size2, |
| 1832 | + benchmark_utils::generate_limits<key_type>::min(), |
| 1833 | + benchmark_utils::generate_limits<key_type>::max()); |
| 1834 | + |
| 1835 | + std::sort(keys_input1.begin(), keys_input1.end(), compare_function); |
| 1836 | + std::sort(keys_input2.begin(), keys_input2.end(), compare_function); |
| 1837 | + |
| 1838 | + key_type* d_keys_input1; |
| 1839 | + HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); |
| 1840 | + HIP_CHECK(hipMemcpy(d_keys_input1, |
| 1841 | + keys_input1.data(), |
| 1842 | + size1 * sizeof(key_type), |
| 1843 | + hipMemcpyHostToDevice)); |
| 1844 | + |
| 1845 | + key_type* d_keys_input2; |
| 1846 | + HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); |
| 1847 | + HIP_CHECK(hipMemcpy(d_keys_input2, |
| 1848 | + keys_input2.data(), |
| 1849 | + size2 * sizeof(key_type), |
| 1850 | + hipMemcpyHostToDevice)); |
| 1851 | + |
| 1852 | + key_type* d_keys_output; |
| 1853 | + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); |
| 1854 | + |
| 1855 | + void* d_temporary_storage = nullptr; |
| 1856 | + size_t temporary_storage_bytes = 0; |
| 1857 | + HIP_CHECK(hipcub::DeviceMerge::MergeKeys(d_temporary_storage, |
| 1858 | + temporary_storage_bytes, |
| 1859 | + d_keys_input1, |
| 1860 | + size1, |
| 1861 | + d_keys_input2, |
| 1862 | + size2, |
| 1863 | + d_keys_output, |
| 1864 | + compare_function, |
| 1865 | + stream)); |
| 1866 | + |
| 1867 | + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); |
| 1868 | + |
| 1869 | + // Warm-up |
| 1870 | + for(size_t i = 0; i < warmup_size; i++) |
| 1871 | + { |
| 1872 | + HIP_CHECK(hipcub::DeviceMerge::MergeKeys(d_temporary_storage, |
| 1873 | + temporary_storage_bytes, |
| 1874 | + d_keys_input1, |
| 1875 | + size1, |
| 1876 | + d_keys_input2, |
| 1877 | + size2, |
| 1878 | + d_keys_output, |
| 1879 | + compare_function, |
| 1880 | + stream)); |
| 1881 | + } |
| 1882 | + HIP_CHECK(hipDeviceSynchronize()); |
| 1883 | + |
| 1884 | + for(auto _ : state) |
| 1885 | + { |
| 1886 | + auto start = std::chrono::high_resolution_clock::now(); |
| 1887 | + |
| 1888 | + for(size_t i = 0; i < batch_size; i++) |
| 1889 | + { |
| 1890 | + HIP_CHECK(hipcub::DeviceMerge::MergeKeys(d_temporary_storage, |
| 1891 | + temporary_storage_bytes, |
| 1892 | + d_keys_input1, |
| 1893 | + size1, |
| 1894 | + d_keys_input2, |
| 1895 | + size2, |
| 1896 | + d_keys_output, |
| 1897 | + compare_function, |
| 1898 | + stream)); |
| 1899 | + } |
| 1900 | + HIP_CHECK(hipDeviceSynchronize()); |
| 1901 | + |
| 1902 | + auto end = std::chrono::high_resolution_clock::now(); |
| 1903 | + auto elapsed_seconds |
| 1904 | + = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); |
| 1905 | + state.SetIterationTime(elapsed_seconds.count()); |
| 1906 | + } |
| 1907 | + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); |
| 1908 | + state.SetItemsProcessed(state.iterations() * batch_size * size); |
| 1909 | + |
| 1910 | + HIP_CHECK(hipFree(d_temporary_storage)); |
| 1911 | + HIP_CHECK(hipFree(d_keys_input1)); |
| 1912 | + HIP_CHECK(hipFree(d_keys_input2)); |
| 1913 | + HIP_CHECK(hipFree(d_keys_output)); |
| 1914 | +} |
| 1915 | + |
| 1916 | +template<class Key, class Value> |
| 1917 | +void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) |
| 1918 | +{ |
| 1919 | + using key_type = Key; |
| 1920 | + using value_type = Value; |
| 1921 | + |
| 1922 | + CompareFunction<key_type> compare_function; |
| 1923 | + |
| 1924 | + const size_t size1 = size / 2; |
| 1925 | + const size_t size2 = size - size1; |
| 1926 | + |
| 1927 | + std::vector<key_type> keys_input1 = benchmark_utils::get_random_data<key_type>( |
| 1928 | + size1, |
| 1929 | + benchmark_utils::generate_limits<key_type>::min(), |
| 1930 | + benchmark_utils::generate_limits<key_type>::max()); |
| 1931 | + std::vector<key_type> keys_input2 = benchmark_utils::get_random_data<key_type>( |
| 1932 | + size2, |
| 1933 | + benchmark_utils::generate_limits<key_type>::min(), |
| 1934 | + benchmark_utils::generate_limits<key_type>::max()); |
| 1935 | + |
| 1936 | + std::sort(keys_input1.begin(), keys_input1.end(), compare_function); |
| 1937 | + std::sort(keys_input2.begin(), keys_input2.end(), compare_function); |
| 1938 | + |
| 1939 | + key_type* d_keys_input1; |
| 1940 | + HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); |
| 1941 | + HIP_CHECK(hipMemcpy(d_keys_input1, |
| 1942 | + keys_input1.data(), |
| 1943 | + size1 * sizeof(key_type), |
| 1944 | + hipMemcpyHostToDevice)); |
| 1945 | + |
| 1946 | + key_type* d_keys_input2; |
| 1947 | + HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); |
| 1948 | + HIP_CHECK(hipMemcpy(d_keys_input2, |
| 1949 | + keys_input2.data(), |
| 1950 | + size2 * sizeof(key_type), |
| 1951 | + hipMemcpyHostToDevice)); |
| 1952 | + |
| 1953 | + key_type* d_keys_output; |
| 1954 | + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); |
| 1955 | + |
| 1956 | + std::vector<value_type> values_input1(size1); |
| 1957 | + std::iota(values_input1.begin(), values_input1.end(), 0); |
| 1958 | + value_type* d_values_input1; |
| 1959 | + HIP_CHECK(hipMalloc(&d_values_input1, size1 * sizeof(value_type))); |
| 1960 | + HIP_CHECK(hipMemcpy(d_values_input1, |
| 1961 | + values_input1.data(), |
| 1962 | + size1 * sizeof(value_type), |
| 1963 | + hipMemcpyHostToDevice)); |
| 1964 | + |
| 1965 | + std::vector<value_type> values_input2(size2); |
| 1966 | + std::iota(values_input2.begin(), values_input2.end(), size1); |
| 1967 | + value_type* d_values_input2; |
| 1968 | + HIP_CHECK(hipMalloc(&d_values_input2, size2 * sizeof(value_type))); |
| 1969 | + HIP_CHECK(hipMemcpy(d_values_input2, |
| 1970 | + values_input2.data(), |
| 1971 | + size2 * sizeof(value_type), |
| 1972 | + hipMemcpyHostToDevice)); |
| 1973 | + |
| 1974 | + value_type* d_values_output; |
| 1975 | + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); |
| 1976 | + |
| 1977 | + void* d_temporary_storage = nullptr; |
| 1978 | + size_t temporary_storage_bytes = 0; |
| 1979 | + HIP_CHECK(hipcub::DeviceMerge::MergePairs(d_temporary_storage, |
| 1980 | + temporary_storage_bytes, |
| 1981 | + d_keys_input1, |
| 1982 | + d_values_input1, |
| 1983 | + size1, |
| 1984 | + d_keys_input2, |
| 1985 | + d_values_input2, |
| 1986 | + size2, |
| 1987 | + d_keys_output, |
| 1988 | + d_values_output, |
| 1989 | + compare_function, |
| 1990 | + stream)); |
| 1991 | + |
| 1992 | + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); |
| 1993 | + |
| 1994 | + // Warm-up |
| 1995 | + for(size_t i = 0; i < warmup_size; i++) |
| 1996 | + { |
| 1997 | + HIP_CHECK(hipcub::DeviceMerge::MergePairs(d_temporary_storage, |
| 1998 | + temporary_storage_bytes, |
| 1999 | + d_keys_input1, |
| 2000 | + d_values_input1, |
| 2001 | + size1, |
| 2002 | + d_keys_input2, |
| 2003 | + d_values_input2, |
| 2004 | + size2, |
| 2005 | + d_keys_output, |
| 2006 | + d_values_output, |
| 2007 | + compare_function, |
| 2008 | + stream)); |
| 2009 | + } |
| 2010 | + HIP_CHECK(hipDeviceSynchronize()); |
| 2011 | + |
| 2012 | + for(auto _ : state) |
| 2013 | + { |
| 2014 | + auto start = std::chrono::high_resolution_clock::now(); |
| 2015 | + |
| 2016 | + for(size_t i = 0; i < batch_size; i++) |
| 2017 | + { |
| 2018 | + HIP_CHECK(hipcub::DeviceMerge::MergePairs(d_temporary_storage, |
| 2019 | + temporary_storage_bytes, |
| 2020 | + d_keys_input1, |
| 2021 | + d_values_input1, |
| 2022 | + size1, |
| 2023 | + d_keys_input2, |
| 2024 | + d_values_input2, |
| 2025 | + size2, |
| 2026 | + d_keys_output, |
| 2027 | + d_values_output, |
| 2028 | + compare_function, |
| 2029 | + stream)); |
| 2030 | + } |
| 2031 | + HIP_CHECK(hipDeviceSynchronize()); |
| 2032 | + |
| 2033 | + auto end = std::chrono::high_resolution_clock::now(); |
| 2034 | + auto elapsed_seconds |
| 2035 | + = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); |
| 2036 | + state.SetIterationTime(elapsed_seconds.count()); |
| 2037 | + } |
| 2038 | + state.SetBytesProcessed(state.iterations() * batch_size * size |
| 2039 | + * (sizeof(key_type) + sizeof(value_type))); |
| 2040 | + state.SetItemsProcessed(state.iterations() * batch_size * size); |
| 2041 | + |
| 2042 | + HIP_CHECK(hipFree(d_temporary_storage)); |
| 2043 | + HIP_CHECK(hipFree(d_keys_input1)); |
| 2044 | + HIP_CHECK(hipFree(d_keys_input2)); |
| 2045 | + HIP_CHECK(hipFree(d_keys_output)); |
| 2046 | + HIP_CHECK(hipFree(d_values_input1)); |
| 2047 | + HIP_CHECK(hipFree(d_values_input2)); |
| 2048 | + HIP_CHECK(hipFree(d_values_output)); |
| 2049 | +} |
| 2050 | + |
| 2051 | +#define CREATE_MERGE_KEYS_BENCHMARK(T) \ |
| 2052 | + benchmarks.push_back(benchmark::RegisterBenchmark( \ |
| 2053 | + std::string("device_merge_keys" \ |
| 2054 | + "<key_data_type:" #T ">.") \ |
| 2055 | + .c_str(), \ |
| 2056 | + [=](benchmark::State& state) { run_merge_keys_benchmark<T>(state, stream, size); })); |
| 2057 | + |
| 2058 | +#define CREATE_MERGE_PAIRS_BENCHMARK(T, V) \ |
| 2059 | + benchmarks.push_back(benchmark::RegisterBenchmark( \ |
| 2060 | + std::string("device_merge_pairs<" \ |
| 2061 | + ",key_data_type:" #T ",value_data_type:" #V ">.") \ |
| 2062 | + .c_str(), \ |
| 2063 | + [=](benchmark::State& state) { run_merge_pairs_benchmark<T, V>(state, stream, size); })); |
| 2064 | + |
| 2065 | +int main(int argc, char* argv[]) |
| 2066 | +{ |
| 2067 | + cli::Parser parser(argc, argv); |
| 2068 | + parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values"); |
| 2069 | + parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 2070 | + parser.run_and_exit_if_error(); |
| 2071 | + |
| 2072 | + // Parse argv |
| 2073 | + benchmark::Initialize(&argc, argv); |
| 2074 | + const size_t size = parser.get<size_t>("size"); |
| 2075 | + const int trials = parser.get<int>("trials"); |
| 2076 | + |
| 2077 | + // HIP |
| 2078 | + hipStream_t stream = 0; // default |
| 2079 | + hipDeviceProp_t devProp; |
| 2080 | + int device_id = 0; |
| 2081 | + HIP_CHECK(hipGetDevice(&device_id)); |
| 2082 | + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); |
| 2083 | + |
| 2084 | + std::cout << "benchmark_device_merge" << std::endl; |
| 2085 | + std::cout << "[HIP] Device name: " << devProp.name << std::endl; |
| 2086 | + |
| 2087 | + // Add benchmarks |
| 2088 | + std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 2089 | + |
| 2090 | + using custom_float2 = benchmark_utils::custom_type<float, float>; |
| 2091 | + using custom_double2 = benchmark_utils::custom_type<double, double>; |
| 2092 | + using custom_char_double = benchmark_utils::custom_type<char, double>; |
| 2093 | + using custom_double_char = benchmark_utils::custom_type<double, char>; |
| 2094 | + |
| 2095 | + CREATE_MERGE_KEYS_BENCHMARK(int) |
| 2096 | + CREATE_MERGE_KEYS_BENCHMARK(long long) |
| 2097 | + CREATE_MERGE_KEYS_BENCHMARK(int8_t) |
| 2098 | + CREATE_MERGE_KEYS_BENCHMARK(uint8_t) |
| 2099 | + CREATE_MERGE_KEYS_BENCHMARK(short) |
| 2100 | + CREATE_MERGE_KEYS_BENCHMARK(double) |
| 2101 | + CREATE_MERGE_KEYS_BENCHMARK(float) |
| 2102 | + CREATE_MERGE_KEYS_BENCHMARK(custom_float2) |
| 2103 | + CREATE_MERGE_KEYS_BENCHMARK(custom_double2) |
| 2104 | + |
| 2105 | + CREATE_MERGE_PAIRS_BENCHMARK(int, int) |
| 2106 | + CREATE_MERGE_PAIRS_BENCHMARK(long long, long long) |
| 2107 | + CREATE_MERGE_PAIRS_BENCHMARK(int8_t, int8_t) |
| 2108 | + CREATE_MERGE_PAIRS_BENCHMARK(uint8_t, uint8_t) |
| 2109 | + CREATE_MERGE_PAIRS_BENCHMARK(short, short) |
| 2110 | + CREATE_MERGE_PAIRS_BENCHMARK(custom_char_double, custom_char_double) |
| 2111 | + CREATE_MERGE_PAIRS_BENCHMARK(int, custom_double_char) |
| 2112 | + CREATE_MERGE_PAIRS_BENCHMARK(custom_double2, custom_double2) |
| 2113 | + |
| 2114 | + // Use manual timing |
| 2115 | + for(auto& b : benchmarks) |
| 2116 | + { |
| 2117 | + b->UseManualTime(); |
| 2118 | + b->Unit(benchmark::kMillisecond); |
| 2119 | + } |
| 2120 | + |
| 2121 | + // Force number of iterations |
| 2122 | + if(trials > 0) |
| 2123 | + { |
| 2124 | + for(auto& b : benchmarks) |
| 2125 | + { |
| 2126 | + b->Iterations(trials); |
| 2127 | + } |
| 2128 | + } |
| 2129 | + |
| 2130 | + // Run benchmarks |
| 2131 | + benchmark::RunSpecifiedBenchmarks(); |
| 2132 | + return 0; |
| 2133 | +} |
| 2134 | diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp |
| 2135 | index fc64fc0..284892c 100644 |
| 2136 | --- a/benchmark/benchmark_device_merge_sort.cpp |
| 2137 | +++ b/benchmark/benchmark_device_merge_sort.cpp |
| 2138 | @@ -23,8 +23,8 @@ |
| 2139 | #include "common_benchmark_header.hpp" |
| 2140 | |
| 2141 | // HIP API |
| 2142 | -#include "hipcub/device/device_merge_sort.hpp" |
| 2143 | -#include "hipcub/hipcub.hpp" |
| 2144 | +#include <hipcub/device/device_merge_sort.hpp> |
| 2145 | +#include <hipcub/hipcub.hpp> |
| 2146 | |
| 2147 | #ifndef DEFAULT_N |
| 2148 | const size_t DEFAULT_N = 32 << 20; |
| 2149 | diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp |
| 2150 | index 786fe13..f925e42 100644 |
| 2151 | --- a/benchmark/benchmark_device_partition.cpp |
| 2152 | +++ b/benchmark/benchmark_device_partition.cpp |
| 2153 | @@ -23,7 +23,7 @@ |
| 2154 | #include "common_benchmark_header.hpp" |
| 2155 | |
| 2156 | // HIP API |
| 2157 | -#include "hipcub/device/device_partition.hpp" |
| 2158 | +#include <hipcub/device/device_partition.hpp> |
| 2159 | |
| 2160 | #include <chrono> |
| 2161 | #include <vector> |
| 2162 | diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp |
| 2163 | index c65abd8..2e40b3b 100644 |
| 2164 | --- a/benchmark/benchmark_device_radix_sort.cpp |
| 2165 | +++ b/benchmark/benchmark_device_radix_sort.cpp |
| 2166 | @@ -26,7 +26,7 @@ |
| 2167 | #include <type_traits> |
| 2168 | |
| 2169 | // HIP API |
| 2170 | -#include "hipcub/device/device_radix_sort.hpp" |
| 2171 | +#include <hipcub/device/device_radix_sort.hpp> |
| 2172 | |
| 2173 | #ifndef DEFAULT_N |
| 2174 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2175 | diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp |
| 2176 | index 2a4d9df..8dcd968 100644 |
| 2177 | --- a/benchmark/benchmark_device_reduce.cpp |
| 2178 | +++ b/benchmark/benchmark_device_reduce.cpp |
| 2179 | @@ -21,9 +21,10 @@ |
| 2180 | // SOFTWARE. |
| 2181 | |
| 2182 | #include "common_benchmark_header.hpp" |
| 2183 | +#include "hipcub/config.hpp" |
| 2184 | |
| 2185 | // HIP API |
| 2186 | -#include "hipcub/device/device_reduce.hpp" |
| 2187 | +#include <hipcub/device/device_reduce.hpp> |
| 2188 | |
| 2189 | #ifndef DEFAULT_N |
| 2190 | const size_t DEFAULT_N = 1024 * 1024 * 128; |
| 2191 | @@ -118,8 +119,10 @@ struct Benchmark<T, hipcub::ArgMin> |
| 2192 | |
| 2193 | static void run(benchmark::State& state, size_t size, const hipStream_t stream) |
| 2194 | { |
| 2195 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH |
| 2196 | hipError_t (*ptr_to_argmin)(void*, size_t&, T*, KeyValue*, int, hipStream_t) |
| 2197 | - = &hipcub::DeviceReduce::ArgMin; |
| 2198 | + = &hipcub::DeviceReduce::ArgMin; |
| 2199 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP |
| 2200 | run_benchmark<T, KeyValue>(state, size, stream, ptr_to_argmin); |
| 2201 | } |
| 2202 | }; |
| 2203 | diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp |
| 2204 | index 54209e6..0d9160f 100644 |
| 2205 | --- a/benchmark/benchmark_device_reduce_by_key.cpp |
| 2206 | +++ b/benchmark/benchmark_device_reduce_by_key.cpp |
| 2207 | @@ -29,7 +29,7 @@ |
| 2208 | #include "common_benchmark_header.hpp" |
| 2209 | |
| 2210 | // HIP API |
| 2211 | -#include "hipcub/device/device_reduce.hpp" |
| 2212 | +#include <hipcub/device/device_reduce.hpp> |
| 2213 | |
| 2214 | #ifndef DEFAULT_N |
| 2215 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2216 | diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp |
| 2217 | index b7ef64b..f0c8585 100644 |
| 2218 | --- a/benchmark/benchmark_device_run_length_encode.cpp |
| 2219 | +++ b/benchmark/benchmark_device_run_length_encode.cpp |
| 2220 | @@ -29,7 +29,7 @@ |
| 2221 | #include "common_benchmark_header.hpp" |
| 2222 | |
| 2223 | // HIP API |
| 2224 | -#include "hipcub/device/device_run_length_encode.hpp" |
| 2225 | +#include <hipcub/device/device_run_length_encode.hpp> |
| 2226 | |
| 2227 | #ifndef DEFAULT_N |
| 2228 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2229 | diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp |
| 2230 | index dbfdda6..5d38b96 100644 |
| 2231 | --- a/benchmark/benchmark_device_scan.cpp |
| 2232 | +++ b/benchmark/benchmark_device_scan.cpp |
| 2233 | @@ -29,7 +29,7 @@ |
| 2234 | #include "common_benchmark_header.hpp" |
| 2235 | |
| 2236 | // HIP API |
| 2237 | -#include "hipcub/device/device_scan.hpp" |
| 2238 | +#include <hipcub/device/device_scan.hpp> |
| 2239 | |
| 2240 | #ifndef DEFAULT_N |
| 2241 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2242 | diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp |
| 2243 | index 05566d6..252e8ff 100644 |
| 2244 | --- a/benchmark/benchmark_device_segmented_radix_sort.cpp |
| 2245 | +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp |
| 2246 | @@ -1,6 +1,6 @@ |
| 2247 | // MIT License |
| 2248 | // |
| 2249 | -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2250 | +// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2251 | // |
| 2252 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2253 | // of this software and associated documentation files (the "Software"), to deal |
| 2254 | @@ -23,7 +23,7 @@ |
| 2255 | #include "common_benchmark_header.hpp" |
| 2256 | |
| 2257 | // HIP API |
| 2258 | -#include "hipcub/hipcub.hpp" |
| 2259 | +#include <hipcub/hipcub.hpp> |
| 2260 | |
| 2261 | #ifndef DEFAULT_N |
| 2262 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2263 | @@ -44,17 +44,17 @@ void run_sort_keys_benchmark(benchmark::State& state, |
| 2264 | { |
| 2265 | using offset_type = int; |
| 2266 | using key_type = Key; |
| 2267 | - typedef hipError_t (*sort_func)(void*, |
| 2268 | - size_t&, |
| 2269 | - const key_type*, |
| 2270 | - key_type*, |
| 2271 | - int, |
| 2272 | - int, |
| 2273 | - offset_type*, |
| 2274 | - offset_type*, |
| 2275 | - int, |
| 2276 | - int, |
| 2277 | - hipStream_t); |
| 2278 | + using sort_func = hipError_t (*)(void*, |
| 2279 | + size_t&, |
| 2280 | + const key_type*, |
| 2281 | + key_type*, |
| 2282 | + int, |
| 2283 | + int, |
| 2284 | + offset_type*, |
| 2285 | + offset_type*, |
| 2286 | + int, |
| 2287 | + int, |
| 2288 | + hipStream_t); |
| 2289 | |
| 2290 | sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys<key_type, offset_type*>; |
| 2291 | sort_func func_descending |
| 2292 | @@ -180,19 +180,19 @@ void run_sort_pairs_benchmark(benchmark::State& state, |
| 2293 | using offset_type = int; |
| 2294 | using key_type = Key; |
| 2295 | using value_type = Value; |
| 2296 | - typedef hipError_t (*sort_func)(void*, |
| 2297 | - size_t&, |
| 2298 | - const key_type*, |
| 2299 | - key_type*, |
| 2300 | - const value_type*, |
| 2301 | - value_type*, |
| 2302 | - int, |
| 2303 | - int, |
| 2304 | - offset_type*, |
| 2305 | - offset_type*, |
| 2306 | - int, |
| 2307 | - int, |
| 2308 | - hipStream_t); |
| 2309 | + using sort_func = hipError_t (*)(void*, |
| 2310 | + size_t&, |
| 2311 | + const key_type*, |
| 2312 | + key_type*, |
| 2313 | + const value_type*, |
| 2314 | + value_type*, |
| 2315 | + int, |
| 2316 | + int, |
| 2317 | + offset_type*, |
| 2318 | + offset_type*, |
| 2319 | + int, |
| 2320 | + int, |
| 2321 | + hipStream_t); |
| 2322 | |
| 2323 | sort_func func_ascending |
| 2324 | = &hipcub::DeviceSegmentedRadixSort::SortPairs<key_type, value_type, offset_type*>; |
| 2325 | diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp |
| 2326 | index d1e40c6..1bf1316 100644 |
| 2327 | --- a/benchmark/benchmark_device_segmented_reduce.cpp |
| 2328 | +++ b/benchmark/benchmark_device_segmented_reduce.cpp |
| 2329 | @@ -23,7 +23,7 @@ |
| 2330 | #include "common_benchmark_header.hpp" |
| 2331 | |
| 2332 | // HIP API |
| 2333 | -#include "hipcub/device/device_segmented_reduce.hpp" |
| 2334 | +#include <hipcub/device/device_segmented_reduce.hpp> |
| 2335 | |
| 2336 | #ifndef DEFAULT_N |
| 2337 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2338 | diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp |
| 2339 | index e9bbaf3..db69075 100644 |
| 2340 | --- a/benchmark/benchmark_device_segmented_sort.cpp |
| 2341 | +++ b/benchmark/benchmark_device_segmented_sort.cpp |
| 2342 | @@ -1,6 +1,6 @@ |
| 2343 | // MIT License |
| 2344 | // |
| 2345 | -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2346 | +// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2347 | // |
| 2348 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2349 | // of this software and associated documentation files (the "Software"), to deal |
| 2350 | @@ -23,7 +23,7 @@ |
| 2351 | #include "common_benchmark_header.hpp" |
| 2352 | |
| 2353 | // HIP API |
| 2354 | -#include "hipcub/hipcub.hpp" |
| 2355 | +#include <hipcub/hipcub.hpp> |
| 2356 | |
| 2357 | #ifndef DEFAULT_N |
| 2358 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2359 | @@ -42,15 +42,15 @@ void run_sort_keys_benchmark(benchmark::State& state, |
| 2360 | { |
| 2361 | using offset_type = int; |
| 2362 | using key_type = Key; |
| 2363 | - typedef hipError_t (*sort_func)(void*, |
| 2364 | - size_t&, |
| 2365 | - const key_type*, |
| 2366 | - key_type*, |
| 2367 | - int, |
| 2368 | - int, |
| 2369 | - offset_type*, |
| 2370 | - offset_type*, |
| 2371 | - hipStream_t); |
| 2372 | + using sort_func = hipError_t (*)(void*, |
| 2373 | + size_t&, |
| 2374 | + const key_type*, |
| 2375 | + key_type*, |
| 2376 | + int, |
| 2377 | + int, |
| 2378 | + offset_type*, |
| 2379 | + offset_type*, |
| 2380 | + hipStream_t); |
| 2381 | |
| 2382 | sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys<key_type, offset_type*>; |
| 2383 | sort_func func_descending |
| 2384 | @@ -175,17 +175,17 @@ void run_sort_pairs_benchmark(benchmark::State& state, |
| 2385 | using offset_type = int; |
| 2386 | using key_type = Key; |
| 2387 | using value_type = Value; |
| 2388 | - typedef hipError_t (*sort_func)(void*, |
| 2389 | - size_t&, |
| 2390 | - const key_type*, |
| 2391 | - key_type*, |
| 2392 | - const value_type*, |
| 2393 | - value_type*, |
| 2394 | - int, |
| 2395 | - int, |
| 2396 | - offset_type*, |
| 2397 | - offset_type*, |
| 2398 | - hipStream_t); |
| 2399 | + using sort_func = hipError_t (*)(void*, |
| 2400 | + size_t&, |
| 2401 | + const key_type*, |
| 2402 | + key_type*, |
| 2403 | + const value_type*, |
| 2404 | + value_type*, |
| 2405 | + int, |
| 2406 | + int, |
| 2407 | + offset_type*, |
| 2408 | + offset_type*, |
| 2409 | + hipStream_t); |
| 2410 | |
| 2411 | sort_func func_ascending |
| 2412 | = &hipcub::DeviceSegmentedSort::SortPairs<key_type, value_type, offset_type*>; |
| 2413 | diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp |
| 2414 | index a14cbdd..04097ec 100644 |
| 2415 | --- a/benchmark/benchmark_device_select.cpp |
| 2416 | +++ b/benchmark/benchmark_device_select.cpp |
| 2417 | @@ -1,6 +1,6 @@ |
| 2418 | // MIT License |
| 2419 | // |
| 2420 | -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2421 | +// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2422 | // |
| 2423 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2424 | // of this software and associated documentation files (the "Software"), to deal |
| 2425 | @@ -23,7 +23,7 @@ |
| 2426 | #include "common_benchmark_header.hpp" |
| 2427 | |
| 2428 | // HIP API |
| 2429 | -#include "hipcub/device/device_select.hpp" |
| 2430 | +#include <hipcub/device/device_select.hpp> |
| 2431 | |
| 2432 | #ifndef DEFAULT_N |
| 2433 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2434 | @@ -307,12 +307,11 @@ void run_flagged_if_benchmark(benchmark::State& state, |
| 2435 | state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); |
| 2436 | state.SetItemsProcessed(state.iterations() * batch_size * size); |
| 2437 | |
| 2438 | - hipFree(d_input); |
| 2439 | - hipFree(d_flags); |
| 2440 | - hipFree(d_output); |
| 2441 | - hipFree(d_selected_count_output); |
| 2442 | - hipFree(d_temp_storage); |
| 2443 | - HIP_CHECK(hipDeviceSynchronize()); |
| 2444 | + HIP_CHECK(hipFree(d_input)); |
| 2445 | + HIP_CHECK(hipFree(d_flags)); |
| 2446 | + HIP_CHECK(hipFree(d_output)); |
| 2447 | + HIP_CHECK(hipFree(d_selected_count_output)); |
| 2448 | + HIP_CHECK(hipFree(d_temp_storage)); |
| 2449 | } |
| 2450 | |
| 2451 | template<class T> |
| 2452 | diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp |
| 2453 | index f98f1c1..fcdb1ab 100644 |
| 2454 | --- a/benchmark/benchmark_device_spmv.cpp |
| 2455 | +++ b/benchmark/benchmark_device_spmv.cpp |
| 2456 | @@ -23,7 +23,7 @@ |
| 2457 | #include "common_benchmark_header.hpp" |
| 2458 | |
| 2459 | // HIP API |
| 2460 | -#include "hipcub/device/device_spmv.hpp" |
| 2461 | +#include <hipcub/device/device_spmv.hpp> |
| 2462 | |
| 2463 | #ifndef DEFAULT_N |
| 2464 | const size_t DEFAULT_N = 1024 * 32; |
| 2465 | @@ -126,6 +126,7 @@ void run_benchmark(benchmark::State& state, |
| 2466 | size_t temp_storage_size_bytes; |
| 2467 | |
| 2468 | // Get size of d_temp_storage |
| 2469 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH |
| 2470 | HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, |
| 2471 | temp_storage_size_bytes, |
| 2472 | d_values, |
| 2473 | @@ -137,6 +138,7 @@ void run_benchmark(benchmark::State& state, |
| 2474 | size, |
| 2475 | num_nonzeroes, |
| 2476 | stream)); |
| 2477 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP |
| 2478 | HIP_CHECK(hipDeviceSynchronize()); |
| 2479 | |
| 2480 | // allocate temporary storage |
| 2481 | @@ -147,6 +149,7 @@ void run_benchmark(benchmark::State& state, |
| 2482 | // Warm-up |
| 2483 | for(size_t i = 0; i < warmup_size; i++) |
| 2484 | { |
| 2485 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH |
| 2486 | HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, |
| 2487 | temp_storage_size_bytes, |
| 2488 | d_values, |
| 2489 | @@ -158,6 +161,7 @@ void run_benchmark(benchmark::State& state, |
| 2490 | size, |
| 2491 | num_nonzeroes, |
| 2492 | stream)); |
| 2493 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP |
| 2494 | } |
| 2495 | HIP_CHECK(hipDeviceSynchronize()); |
| 2496 | |
| 2497 | @@ -166,6 +170,7 @@ void run_benchmark(benchmark::State& state, |
| 2498 | auto start = std::chrono::high_resolution_clock::now(); |
| 2499 | for(size_t i = 0; i < batch_size; i++) |
| 2500 | { |
| 2501 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH |
| 2502 | HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, |
| 2503 | temp_storage_size_bytes, |
| 2504 | d_values, |
| 2505 | @@ -177,6 +182,7 @@ void run_benchmark(benchmark::State& state, |
| 2506 | size, |
| 2507 | num_nonzeroes, |
| 2508 | stream)); |
| 2509 | + HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP |
| 2510 | } |
| 2511 | HIP_CHECK(hipDeviceSynchronize()); |
| 2512 | |
| 2513 | diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp |
| 2514 | index 48a46b8..489cf8d 100644 |
| 2515 | --- a/benchmark/benchmark_utils.hpp |
| 2516 | +++ b/benchmark/benchmark_utils.hpp |
| 2517 | @@ -1,6 +1,6 @@ |
| 2518 | // MIT License |
| 2519 | // |
| 2520 | -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2521 | +// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2522 | // |
| 2523 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2524 | // of this software and associated documentation files (the "Software"), to deal |
| 2525 | @@ -29,13 +29,13 @@ |
| 2526 | |
| 2527 | // hipCUB API |
| 2528 | #ifdef __HIP_PLATFORM_AMD__ |
| 2529 | - #include "hipcub/backend/rocprim/util_ptx.hpp" |
| 2530 | + #include <hipcub/backend/rocprim/util_ptx.hpp> |
| 2531 | #elif defined(__HIP_PLATFORM_NVIDIA__) |
| 2532 | - #include "hipcub/config.hpp" |
| 2533 | #include <cub/util_ptx.cuh> |
| 2534 | + #include <hipcub/config.hpp> |
| 2535 | #endif |
| 2536 | |
| 2537 | -#include "hipcub/tuple.hpp" |
| 2538 | +#include <hipcub/tuple.hpp> |
| 2539 | |
| 2540 | #ifndef HIPCUB_CUB_API |
| 2541 | #define HIPCUB_WARP_THREADS_MACRO warpSize |
| 2542 | @@ -409,6 +409,14 @@ template<unsigned int LogicalWarpSize> |
| 2543 | __device__ constexpr bool device_test_enabled_for_warp_size_v |
| 2544 | = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; |
| 2545 | |
| 2546 | +template<class T> |
| 2547 | +__device__ |
| 2548 | +inline constexpr bool is_power_of_two(const T x) |
| 2549 | +{ |
| 2550 | + static_assert(std::is_integral<T>::value, "T must be integer type"); |
| 2551 | + return (x > 0) && ((x & (x - 1)) == 0); |
| 2552 | +} |
| 2553 | + |
| 2554 | template<typename Iterator> |
| 2555 | using it_value_t = typename std::iterator_traits<Iterator>::value_type; |
| 2556 | |
| 2557 | diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp |
| 2558 | index 598df95..0c41be0 100644 |
| 2559 | --- a/benchmark/benchmark_warp_exchange.cpp |
| 2560 | +++ b/benchmark/benchmark_warp_exchange.cpp |
| 2561 | @@ -23,7 +23,7 @@ |
| 2562 | #include "common_benchmark_header.hpp" |
| 2563 | |
| 2564 | // HIP API |
| 2565 | -#include "hipcub/warp/warp_exchange.hpp" |
| 2566 | +#include <hipcub/warp/warp_exchange.hpp> |
| 2567 | |
| 2568 | #include <type_traits> |
| 2569 | |
| 2570 | diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp |
| 2571 | index 4298db6..2c74609 100644 |
| 2572 | --- a/benchmark/benchmark_warp_load.cpp |
| 2573 | +++ b/benchmark/benchmark_warp_load.cpp |
| 2574 | @@ -23,7 +23,7 @@ |
| 2575 | #include "common_benchmark_header.hpp" |
| 2576 | |
| 2577 | // HIP API |
| 2578 | -#include "hipcub/warp/warp_load.hpp" |
| 2579 | +#include <hipcub/warp/warp_load.hpp> |
| 2580 | |
| 2581 | #include <type_traits> |
| 2582 | |
| 2583 | diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp |
| 2584 | index 5b2d87c..f6d91fe 100644 |
| 2585 | --- a/benchmark/benchmark_warp_merge_sort.cpp |
| 2586 | +++ b/benchmark/benchmark_warp_merge_sort.cpp |
| 2587 | @@ -24,10 +24,10 @@ |
| 2588 | |
| 2589 | #include "../test/hipcub/test_utils_sort_comparator.hpp" |
| 2590 | // HIP API |
| 2591 | -#include "hipcub/block/block_load.hpp" |
| 2592 | -#include "hipcub/block/block_store.hpp" |
| 2593 | -#include "hipcub/util_ptx.hpp" |
| 2594 | -#include "hipcub/warp/warp_merge_sort.hpp" |
| 2595 | +#include <hipcub/block/block_load.hpp> |
| 2596 | +#include <hipcub/block/block_store.hpp> |
| 2597 | +#include <hipcub/util_ptx.hpp> |
| 2598 | +#include <hipcub/warp/warp_merge_sort.hpp> |
| 2599 | |
| 2600 | #include <type_traits> |
| 2601 | |
| 2602 | diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp |
| 2603 | index 1a20d71..f8d08f6 100644 |
| 2604 | --- a/benchmark/benchmark_warp_reduce.cpp |
| 2605 | +++ b/benchmark/benchmark_warp_reduce.cpp |
| 2606 | @@ -23,7 +23,7 @@ |
| 2607 | #include "common_benchmark_header.hpp" |
| 2608 | |
| 2609 | // HIP API |
| 2610 | -#include "hipcub/warp/warp_reduce.hpp" |
| 2611 | +#include <hipcub/warp/warp_reduce.hpp> |
| 2612 | |
| 2613 | #ifndef DEFAULT_N |
| 2614 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2615 | diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp |
| 2616 | index ddd499c..db3fe94 100644 |
| 2617 | --- a/benchmark/benchmark_warp_scan.cpp |
| 2618 | +++ b/benchmark/benchmark_warp_scan.cpp |
| 2619 | @@ -1,6 +1,6 @@ |
| 2620 | // MIT License |
| 2621 | // |
| 2622 | -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2623 | +// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2624 | // |
| 2625 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2626 | // of this software and associated documentation files (the "Software"), to deal |
| 2627 | @@ -23,7 +23,7 @@ |
| 2628 | #include "common_benchmark_header.hpp" |
| 2629 | |
| 2630 | // HIP API |
| 2631 | -#include "hipcub/warp/warp_scan.hpp" |
| 2632 | +#include <hipcub/warp/warp_scan.hpp> |
| 2633 | |
| 2634 | #ifndef DEFAULT_N |
| 2635 | const size_t DEFAULT_N = 1024 * 1024 * 32; |
| 2636 | @@ -106,19 +106,22 @@ struct broadcast |
| 2637 | template<class T, unsigned int WarpSize, unsigned int Trials> |
| 2638 | __device__ |
| 2639 | static auto run(const T* input, T* output, const T init) |
| 2640 | - -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize>> |
| 2641 | + -> std::enable_if_t<(benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize> |
| 2642 | + && benchmark_utils::is_power_of_two(WarpSize))> |
| 2643 | { |
| 2644 | (void)init; |
| 2645 | |
| 2646 | - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; |
| 2647 | - auto value = input[i]; |
| 2648 | + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; |
| 2649 | + const unsigned int warp_id = i / WarpSize; |
| 2650 | + const unsigned int src_lane = warp_id % WarpSize; |
| 2651 | + auto value = input[i]; |
| 2652 | |
| 2653 | using wscan_t = hipcub::WarpScan<T, WarpSize>; |
| 2654 | __shared__ typename wscan_t::TempStorage storage; |
| 2655 | #pragma nounroll |
| 2656 | for(unsigned int trial = 0; trial < Trials; trial++) |
| 2657 | { |
| 2658 | - value = wscan_t(storage).Broadcast(value, 0); |
| 2659 | + value = wscan_t(storage).Broadcast(value, src_lane); |
| 2660 | } |
| 2661 | |
| 2662 | output[i] = value; |
| 2663 | @@ -127,7 +130,8 @@ struct broadcast |
| 2664 | template<class T, unsigned int WarpSize, unsigned int Trials> |
| 2665 | __device__ |
| 2666 | static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) |
| 2667 | - -> std::enable_if_t<!benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize>> |
| 2668 | + -> std::enable_if_t<!(benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize> |
| 2669 | + && benchmark_utils::is_power_of_two(WarpSize))> |
| 2670 | {} |
| 2671 | }; |
| 2672 | |
| 2673 | @@ -188,61 +192,75 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) |
| 2674 | #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark) |
| 2675 | |
| 2676 | // clang-format off |
| 2677 | -// If warp size limit is 16 |
| 2678 | -#define BENCHMARK_TYPE_WS16(type) \ |
| 2679 | - CREATE_BENCHMARK(type, 60, 15), \ |
| 2680 | - CREATE_BENCHMARK(type, 256, 16) |
| 2681 | - |
| 2682 | - |
| 2683 | -// If warp size limit is 32 |
| 2684 | -#define BENCHMARK_TYPE_WS32(type) \ |
| 2685 | - BENCHMARK_TYPE_WS16(type), \ |
| 2686 | - CREATE_BENCHMARK(type, 62, 31), \ |
| 2687 | - CREATE_BENCHMARK(type, 256, 32) |
| 2688 | - |
| 2689 | - |
| 2690 | -// If warp size limit is 64 |
| 2691 | -#define BENCHMARK_TYPE_WS64(type) \ |
| 2692 | - BENCHMARK_TYPE_WS32(type), \ |
| 2693 | - CREATE_BENCHMARK(type, 63, 63), \ |
| 2694 | - CREATE_BENCHMARK(type, 64, 64), \ |
| 2695 | - CREATE_BENCHMARK(type, 128, 64), \ |
| 2696 | - CREATE_BENCHMARK(type, 256, 64) |
| 2697 | +#if HIPCUB_WARP_THREADS_MACRO == 32 |
| 2698 | + #define BENCHMARK_TYPE(type) \ |
| 2699 | + CREATE_BENCHMARK(type, 60, 15), \ |
| 2700 | + CREATE_BENCHMARK(type, 256, 16), \ |
| 2701 | + CREATE_BENCHMARK(type, 62, 31), \ |
| 2702 | + CREATE_BENCHMARK(type, 256, 32) |
| 2703 | +#else |
| 2704 | + #define BENCHMARK_TYPE(type) \ |
| 2705 | + CREATE_BENCHMARK(type, 60, 15), \ |
| 2706 | + CREATE_BENCHMARK(type, 256, 16), \ |
| 2707 | + CREATE_BENCHMARK(type, 62, 31), \ |
| 2708 | + CREATE_BENCHMARK(type, 256, 32), \ |
| 2709 | + CREATE_BENCHMARK(type, 63, 63), \ |
| 2710 | + CREATE_BENCHMARK(type, 64, 64), \ |
| 2711 | + CREATE_BENCHMARK(type, 128, 64), \ |
| 2712 | + CREATE_BENCHMARK(type, 256, 64) |
| 2713 | +#endif |
| 2714 | + |
| 2715 | +#if HIPCUB_WARP_THREADS_MACRO == 32 |
| 2716 | + #define BENCHMARK_TYPE_P2(type) \ |
| 2717 | + CREATE_BENCHMARK(type, 256, 16), \ |
| 2718 | + CREATE_BENCHMARK(type, 256, 32) |
| 2719 | +#else |
| 2720 | + #define BENCHMARK_TYPE_P2(type) \ |
| 2721 | + CREATE_BENCHMARK(type, 256, 16), \ |
| 2722 | + CREATE_BENCHMARK(type, 256, 32), \ |
| 2723 | + CREATE_BENCHMARK(type, 64, 64), \ |
| 2724 | + CREATE_BENCHMARK(type, 128, 64), \ |
| 2725 | + CREATE_BENCHMARK(type, 256, 64) |
| 2726 | +#endif |
| 2727 | // clang-format on |
| 2728 | |
| 2729 | template<typename Benchmark> |
| 2730 | -void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 2731 | +auto add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 2732 | const std::string& method_name, |
| 2733 | hipStream_t stream, |
| 2734 | size_t size) |
| 2735 | + -> std::enable_if_t<std::is_same<Benchmark, inclusive_scan>::value |
| 2736 | + || std::is_same<Benchmark, exclusive_scan>::value> |
| 2737 | { |
| 2738 | using custom_double2 = benchmark_utils::custom_type<double, double>; |
| 2739 | using custom_int_double = benchmark_utils::custom_type<int, double>; |
| 2740 | |
| 2741 | - std::vector<benchmark::internal::Benchmark*> new_benchmarks = { |
| 2742 | -#if HIPCUB_WARP_THREADS_MACRO == 16 |
| 2743 | - BENCHMARK_TYPE_WS16(int), |
| 2744 | - BENCHMARK_TYPE_WS16(float), |
| 2745 | - BENCHMARK_TYPE_WS16(double), |
| 2746 | - BENCHMARK_TYPE_WS16(int8_t), |
| 2747 | - BENCHMARK_TYPE_WS16(custom_double2), |
| 2748 | - BENCHMARK_TYPE_WS16(custom_int_double) |
| 2749 | -#elif HIPCUB_WARP_THREADS_MACRO == 32 |
| 2750 | - BENCHMARK_TYPE_WS32(int), |
| 2751 | - BENCHMARK_TYPE_WS32(float), |
| 2752 | - BENCHMARK_TYPE_WS32(double), |
| 2753 | - BENCHMARK_TYPE_WS32(int8_t), |
| 2754 | - BENCHMARK_TYPE_WS32(custom_double2), |
| 2755 | - BENCHMARK_TYPE_WS32(custom_int_double) |
| 2756 | -#else |
| 2757 | - BENCHMARK_TYPE_WS64(int), |
| 2758 | - BENCHMARK_TYPE_WS64(float), |
| 2759 | - BENCHMARK_TYPE_WS64(double), |
| 2760 | - BENCHMARK_TYPE_WS64(int8_t), |
| 2761 | - BENCHMARK_TYPE_WS64(custom_double2), |
| 2762 | - BENCHMARK_TYPE_WS64(custom_int_double) |
| 2763 | -#endif |
| 2764 | - }; |
| 2765 | + std::vector<benchmark::internal::Benchmark*> new_benchmarks |
| 2766 | + = {BENCHMARK_TYPE(int), |
| 2767 | + BENCHMARK_TYPE(float), |
| 2768 | + BENCHMARK_TYPE(double), |
| 2769 | + BENCHMARK_TYPE(int8_t), |
| 2770 | + BENCHMARK_TYPE(custom_double2), |
| 2771 | + BENCHMARK_TYPE(custom_int_double)}; |
| 2772 | + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); |
| 2773 | +} |
| 2774 | + |
| 2775 | +template<typename Benchmark> |
| 2776 | +auto add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 2777 | + const std::string& method_name, |
| 2778 | + hipStream_t stream, |
| 2779 | + size_t size) -> std::enable_if_t<std::is_same<Benchmark, broadcast>::value> |
| 2780 | +{ |
| 2781 | + using custom_double2 = benchmark_utils::custom_type<double, double>; |
| 2782 | + using custom_int_double = benchmark_utils::custom_type<int, double>; |
| 2783 | + |
| 2784 | + std::vector<benchmark::internal::Benchmark*> new_benchmarks |
| 2785 | + = {BENCHMARK_TYPE_P2(int), |
| 2786 | + BENCHMARK_TYPE_P2(float), |
| 2787 | + BENCHMARK_TYPE_P2(double), |
| 2788 | + BENCHMARK_TYPE_P2(int8_t), |
| 2789 | + BENCHMARK_TYPE_P2(custom_double2), |
| 2790 | + BENCHMARK_TYPE_P2(custom_int_double)}; |
| 2791 | benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); |
| 2792 | } |
| 2793 | |
| 2794 | diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp |
| 2795 | index 8e88661..6632faf 100644 |
| 2796 | --- a/benchmark/benchmark_warp_store.cpp |
| 2797 | +++ b/benchmark/benchmark_warp_store.cpp |
| 2798 | @@ -23,7 +23,7 @@ |
| 2799 | #include "common_benchmark_header.hpp" |
| 2800 | |
| 2801 | // HIP API |
| 2802 | -#include "hipcub/warp/warp_store.hpp" |
| 2803 | +#include <hipcub/warp/warp_store.hpp> |
| 2804 | |
| 2805 | #include <type_traits> |
| 2806 | |
| 2807 | diff --git a/benchmark/common_benchmark_header.hpp b/benchmark/common_benchmark_header.hpp |
| 2808 | index eab3048..a632840 100644 |
| 2809 | --- a/benchmark/common_benchmark_header.hpp |
| 2810 | +++ b/benchmark/common_benchmark_header.hpp |
| 2811 | @@ -1,6 +1,6 @@ |
| 2812 | // MIT License |
| 2813 | // |
| 2814 | -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. |
| 2815 | +// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2816 | // |
| 2817 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2818 | // of this software and associated documentation files (the "Software"), to deal |
| 2819 | @@ -21,18 +21,18 @@ |
| 2820 | // SOFTWARE. |
| 2821 | |
| 2822 | #include <algorithm> |
| 2823 | +#include <chrono> |
| 2824 | +#include <cmath> |
| 2825 | +#include <cstdlib> |
| 2826 | #include <functional> |
| 2827 | #include <iostream> |
| 2828 | +#include <limits> |
| 2829 | +#include <numeric> |
| 2830 | +#include <random> |
| 2831 | +#include <tuple> |
| 2832 | #include <type_traits> |
| 2833 | -#include <vector> |
| 2834 | #include <utility> |
| 2835 | -#include <tuple> |
| 2836 | -#include <random> |
| 2837 | -#include <limits> |
| 2838 | -#include <cmath> |
| 2839 | -#include <cstdlib> |
| 2840 | -#include <numeric> |
| 2841 | -#include <chrono> |
| 2842 | +#include <vector> |
| 2843 | |
| 2844 | // Google Benchmark |
| 2845 | #include "benchmark/benchmark.h" |
| 2846 | diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake |
| 2847 | index 22d2a4a..36a5310 100644 |
| 2848 | --- a/cmake/Dependencies.cmake |
| 2849 | +++ b/cmake/Dependencies.cmake |
| 2850 | @@ -1,6 +1,6 @@ |
| 2851 | # MIT License |
| 2852 | # |
| 2853 | -# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2854 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2855 | # |
| 2856 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2857 | # of this software and associated documentation files (the "Software"), to deal |
| 2858 | @@ -66,6 +66,226 @@ endforeach() |
| 2859 | |
| 2860 | include(FetchContent) |
| 2861 | |
| 2862 | +# This function checks to see if the download branch given by "branch" exists in the repository. |
| 2863 | +# It does so using the git ls-remote command. |
| 2864 | +# If the branch cannot be found, the variable named by "branch" is set to "develop" in the caller's (parent) scope. |
| 2865 | +function(find_download_branch git_path branch) |
| 2866 | + set(branch_value ${${branch}}) |
| 2867 | + execute_process(COMMAND ${git_path} "ls-remote" "https://github.com/ROCm/rocm-libraries.git" "refs/heads/${branch_value}" RESULT_VARIABLE ret_code OUTPUT_VARIABLE output) |
| 2868 | + |
| 2869 | + if(NOT ${ret_code} STREQUAL "0") |
| 2870 | + message(WARNING "Unable to check if release branch exists, defaulting to the develop branch.") |
| 2871 | + set(${branch} "develop" PARENT_SCOPE) |
| 2872 | + else() |
| 2873 | + if(${output}) |
| 2874 | + string(STRIP ${output} output) |
| 2875 | + endif() |
| 2876 | + |
| 2877 | + if(NOT ("${output}" MATCHES "[\t ]+refs/heads/${branch_value}(\n)?$")) |
| 2878 | + message(WARNING "Unable to locate requested release branch \"${branch_value}\" in repository. Defaulting to the develop branch.") |
| 2879 | + set(${branch} "develop" PARENT_SCOPE) |
| 2880 | + else() |
| 2881 | + message(STATUS "Found release branch \"${branch_value}\" in repository.") |
| 2882 | + endif() |
| 2883 | + endif() |
| 2884 | +endfunction() |
| 2885 | + |
| 2886 | +function(check_git_version git_path) |
| 2887 | + execute_process(COMMAND ${git_path} "--version" OUTPUT_VARIABLE git_version_output) |
| 2888 | + string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" GIT_VERSION_STRING "${git_version_output}") |
| 2889 | + if(DEFINED CMAKE_MATCH_0) |
| 2890 | + set(GIT_VERSION ${CMAKE_MATCH_0} PARENT_SCOPE) |
| 2891 | + else() |
| 2892 | + set(GIT_VERSION "" PARENT_SCOPE) |
| 2893 | + endif() |
| 2894 | +endfunction() |
| 2895 | + |
| 2896 | +# This function fetches repository "repo_name" using the method specified by "method". |
| 2897 | +# The result is stored in the parent scope version of "repo_path". |
| 2898 | +# It does not build the repo. |
| 2899 | +function(fetch_dep method repo_name repo_path download_branch) |
| 2900 | + set(method_value ${${method}}) |
| 2901 | + |
| 2902 | + # Since the monorepo is large, we want to avoid downloading the whole thing if possible. |
| 2903 | + # We can do this if we have access to git's sparse-checkout functionality, which was added in git 2.25. |
| 2904 | + # On some Linux systems (eg. Ubuntu), the git in /usr/bin tends to be newer than the git in /usr/local/bin, |
| 2905 | + # and the latter is what gets picked up by find_package(Git), since it's what's in PATH. |
| 2906 | + # Check for a git binary in /usr/bin first; if a git >= 2.25 is not found there, use find_package(Git) to search |
| 2907 | + # other locations. |
| 2908 | + if (NOT(GIT_PATH)) |
| 2909 | + message(STATUS "Checking git version") |
| 2910 | + set(GIT_MIN_VERSION_FOR_SPARSE_CHECKOUT 2.25) |
| 2911 | + |
| 2912 | + find_program(find_result git PATHS /usr/bin NO_DEFAULT_PATH) |
| 2913 | + if(NOT (${find_result} STREQUAL "find_result-NOTFOUND")) |
| 2914 | + set(GIT_PATH ${find_result} CACHE INTERNAL "Path to the git executable") |
| 2915 | + check_git_version(${GIT_PATH}) |
| 2916 | + endif() |
| 2917 | + |
| 2918 | + if(NOT GIT_VERSION OR "${GIT_VERSION}" VERSION_LESS ${GIT_MIN_VERSION_FOR_SPARSE_CHECKOUT}) |
| 2919 | + find_package(Git QUIET) |
| 2920 | + if(GIT_FOUND) |
| 2921 | + set(GIT_PATH ${GIT_EXECUTABLE} CACHE INTERNAL "Path to the git executable") |
| 2922 | + check_git_version(${GIT_PATH}) |
| 2923 | + endif() |
| 2924 | + endif() |
| 2925 | + |
| 2926 | + if(NOT GIT_VERSION OR "${GIT_VERSION}" VERSION_LESS ${GIT_MIN_VERSION_FOR_SPARSE_CHECKOUT}) |
| 2927 | + set(USE_SPARSE_CHECKOUT "OFF" CACHE INTERNAL "Records whether git supports sparse checkout functionality") |
| 2928 | + else() |
| 2929 | + set(USE_SPARSE_CHECKOUT "ON" CACHE INTERNAL "Records whether git supports sparse checkout functionality") |
| 2930 | + endif() |
| 2931 | + |
| 2932 | + if(NOT GIT_VERSION) |
| 2933 | + # Warn the user that we were unable to find git. This will only actually be a problem if we use one of the |
| 2934 | + # fetch methods (download, or monorepo with dependency not present) that requires it. If we end up running |
| 2935 | + # into one of those scenarios, a fatal error will be issued at that point. |
| 2936 | + message(WARNING "Unable to find git.") |
| 2937 | + else() |
| 2938 | + message(STATUS "Found git at: ${GIT_PATH}, version: ${GIT_VERSION}") |
| 2939 | + endif() |
| 2940 | + endif() |
| 2941 | + |
| 2942 | + if(${method_value} STREQUAL "PACKAGE") |
| 2943 | + message(STATUS "Searching for ${repo_name} package") |
| 2944 | + |
| 2945 | + # Add default install location for WIN32 and non-WIN32 as hint |
| 2946 | + find_package(${repo_name} ${MIN_ROCPRIM_PACKAGE_VERSION} CONFIG QUIET PATHS "${ROCM_ROOT}/lib/cmake/rocprim") |
| 2947 | + |
| 2948 | + if(NOT ${${repo_name}_FOUND}) |
| 2949 | + message(STATUS "No existing ${repo_name} package meeting the minimum version requirement (${MIN_ROCPRIM_PACKAGE_VERSION}) was found. Falling back to downloading it.") |
| 2950 | + # update local and parent variable values |
| 2951 | + set(${method} "DOWNLOAD" PARENT_SCOPE) |
| 2952 | + set(method_value "DOWNLOAD") |
| 2953 | + else() |
| 2954 | + message(STATUS "Package found (${${repo_name}_DIR})") |
| 2955 | + endif() |
| 2956 | + |
| 2957 | + elseif(${method_value} STREQUAL "MONOREPO") |
| 2958 | + message(STATUS "Searching for ${repo_name} in the parent monorepo directory") |
| 2959 | + |
| 2960 | + # Check if this looks like a monorepo checkout |
| 2961 | + find_path(found_path NAMES "." PATHS "${CMAKE_CURRENT_SOURCE_DIR}/../../projects/${repo_name}/" NO_CACHE NO_DEFAULT_PATH) |
| 2962 | + |
| 2963 | + # If not, see if the local monorepo is a sparse-checkout. |
| 2964 | + # If it is a sparse-checkout, try to add the dependency to the sparse-checkout list. |
| 2965 | + # If it's not a sparse-checkout (or adding to the sparse-checkout list fails), fallback to downloading the dependency. |
| 2966 | + if(${found_path} STREQUAL "found_path-NOTFOUND") |
| 2967 | + set(FALLBACK_TO_DOWNLOAD ON) |
| 2968 | + message(WARNING "Unable to locate ${repo_name} in parent monorepo (it's not at \"${CMAKE_CURRENT_SOURCE_DIR}/../../projects/${repo_name}/\").") |
| 2969 | + message(STATUS "Checking if local monorepo is a sparse-checkout that we can add ${repo_name} to.") |
| 2970 | + if(NOT(GIT_PATH)) |
| 2971 | + message(FATAL_ERROR "Git could not be found on the system. Since ${repo_name} could not be found in the local monorepo, git is required to download it.") |
| 2972 | + endif() |
| 2973 | + |
| 2974 | + if(USE_SPARSE_CHECKOUT) |
| 2975 | + execute_process(COMMAND ${GIT_PATH} "sparse-checkout" "list" OUTPUT_VARIABLE sparse_list ERROR_VARIABLE git_error RESULT_VARIABLE git_result |
| 2976 | + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../) |
| 2977 | + |
| 2978 | + if(NOT(git_result EQUAL 0) OR git_error) |
| 2979 | + message(STATUS "The local monorepo does not appear to be a sparse-checkout.") |
| 2980 | + else() |
| 2981 | + message(STATUS "The local monorepo appears to be a sparse checkout. Attempting to add \"projects/${repo_name}\" to the checkout list.") |
| 2982 | + # Check if the dependency is already present in the checkout list. |
| 2983 | + # Git lists sparse checkout directories each on a separate line. |
| 2984 | + # Take care not to match something in the middle of a path, eg. "other_dir/projects/${repo_name}/sub_dir". |
| 2985 | + string(REGEX MATCH "(^|\n)projects/${repo_name}($|\n)" find_result ${sparse_list}) |
| 2986 | + if(find_result) |
| 2987 | + message(STATUS "Found existing entry for \"projects/${repo_name}\" in sparse-checkout list - has the directory structure been modified?") |
| 2988 | + else() |
| 2989 | + # Add project/${repo_name} to the sparse checkout |
| 2990 | + execute_process(COMMAND ${GIT_PATH} "sparse-checkout" "add" "projects/${repo_name}" RESULT_VARIABLE sparse_checkout_result |
| 2991 | + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../) |
| 2992 | + # Note that in this case, we are forced to checkout the same branch that the sparse-checkout was created with. |
| 2993 | + execute_process(COMMAND ${GIT_PATH} "checkout" RESULT_VARIABLE checkout_result |
| 2994 | + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../) |
| 2995 | + |
| 2996 | + if(sparse_checkout_result EQUAL 0 AND checkout_result EQUAL 0) |
| 2997 | + message(STATUS "Added new checkout list entry.") |
| 2998 | + set(FALLBACK_TO_DOWNLOAD OFF) |
| 2999 | + else() |
| 3000 | + message(STATUS "Unable to add new checkout list entry.") |
| 3001 | + endif() |
| 3002 | + # Save the monorepo path in the parent scope |
| 3003 | + set(${repo_path} "${CMAKE_CURRENT_SOURCE_DIR}/../../projects/${repo_name}" PARENT_SCOPE) |
| 3004 | + endif() |
| 3005 | + endif() |
| 3006 | + else() |
| 3007 | + message(STATUS "The version of git installed on the system (${GIT_VERSION}) does not support sparse-checkout.") |
| 3008 | + endif() |
| 3009 | + |
| 3010 | + if (FALLBACK_TO_DOWNLOAD) |
| 3011 | + message(WARNING "Unable to locate/fetch dependency ${repo_name} from monorepo. Falling back to downloading it.") |
| 3012 | + # update local and parent variable values |
| 3013 | + set(${method} "DOWNLOAD" PARENT_SCOPE) |
| 3014 | + set(method_value "DOWNLOAD") |
| 3015 | + endif() |
| 3016 | + |
| 3017 | + else() |
| 3018 | + message(STATUS "Found ${repo_name} at ${found_path}") |
| 3019 | + |
| 3020 | + # Save the monorepo path in the parent scope |
| 3021 | + set(${repo_path} ${found_path} PARENT_SCOPE) |
| 3022 | + endif() |
| 3023 | + endif() |
| 3024 | + |
| 3025 | + if(${method_value} STREQUAL "DOWNLOAD") |
| 3026 | + if(NOT DEFINED GIT_PATH) |
| 3027 | + message(FATAL_ERROR "Git could not be found on the system. Git is required for downloading ${repo_name}.") |
| 3028 | + endif() |
| 3029 | + |
| 3030 | + message(STATUS "Checking if repository contains requested branch ${${download_branch}}") |
| 3031 | + find_download_branch(${GIT_PATH} ${download_branch}) |
| 3032 | + set(download_branch_value ${${download_branch}}) |
| 3033 | + |
| 3034 | + message(STATUS "Downloading ${repo_name} from https://github.com/ROCm/rocm-libraries.git") |
| 3035 | + if(${USE_SPARSE_CHECKOUT}) |
| 3036 | + # In this case, we have access to git sparse-checkout. |
| 3037 | + # Check if the dependency has already been downloaded in the past: |
| 3038 | + find_path(found_path NAMES "." PATHS "${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src/" NO_CACHE NO_DEFAULT_PATH) |
| 3039 | + if(${found_path} STREQUAL "found_path-NOTFOUND") |
| 3040 | + # First, git clone with options "--no-checkout" and "--filter=tree:0" to prevent files from being pulled immediately. |
| 3041 | + # Use option "--depth=1" to avoid downloading past commit history. |
| 3042 | + execute_process(COMMAND ${GIT_PATH} clone --branch ${download_branch_value} --no-checkout --depth=1 --filter=tree:0 https://github.com/ROCm/rocm-libraries.git ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src) |
| 3043 | + |
| 3044 | + # Next, use git sparse-checkout to ensure we only pull the directory containing the desired repo. |
| 3045 | + execute_process(COMMAND ${GIT_PATH} sparse-checkout init --cone |
| 3046 | + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src) |
| 3047 | + |
| 3048 | + execute_process(COMMAND ${GIT_PATH} sparse-checkout set projects/${repo_name} |
| 3049 | + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src) |
| 3050 | + |
| 3051 | + # Finally, download the files using git checkout. |
| 3052 | + execute_process(COMMAND ${GIT_PATH} checkout ${download_branch_value} |
| 3053 | + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src) |
| 3054 | + |
| 3055 | + message(STATUS "${repo_name} download complete") |
| 3056 | + else() |
| 3057 | + message("Found previously downloaded directory, skipping download step.") |
| 3058 | + endif() |
| 3059 | + |
| 3060 | + # Save the downloaded path in the parent scope |
| 3061 | + set(${repo_path} "${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src/projects/${repo_name}" PARENT_SCOPE) |
| 3062 | + else() |
| 3063 | + # In this case, we do not have access to sparse-checkout, so we need to download the whole monorepo. |
| 3064 | + # Check if the monorepo has already been downloaded to satisfy a previous dependency |
| 3065 | + find_path(found_path NAMES "." PATHS "${CMAKE_CURRENT_BINARY_DIR}/monorepo-src/" NO_CACHE NO_DEFAULT_PATH) |
| 3066 | + if(${found_path} STREQUAL "found_path-NOTFOUND") |
| 3067 | + # Warn the user that this will take some time. |
| 3068 | + message(WARNING "The detected version of git (${GIT_VERSION}) is older than 2.25 and does not provide sparse-checkout functionality. Falling back to checking out the whole rocm-libraries repository (this may take a long time).") |
| 3069 | + # Avoid downloading anything related to branches other than the target branch (--single-branch), and avoid any past commit history information (--depth=1) |
| 3070 | + execute_process(COMMAND ${GIT_PATH} clone --single-branch --branch=${download_branch_value} --depth=1 https://github.com/ROCm/rocm-libraries.git ${CMAKE_CURRENT_BINARY_DIR}/monorepo-src) |
| 3071 | + message(STATUS "rocm-libraries download complete") |
| 3072 | + else() |
| 3073 | + message("Found previously downloaded directory, skipping download step.") |
| 3074 | + endif() |
| 3075 | + |
| 3076 | + # Save the downloaded path in the parent scope |
| 3077 | + set(${repo_path} "${CMAKE_CURRENT_BINARY_DIR}/monorepo-src/projects/${repo_name}" PARENT_SCOPE) |
| 3078 | + endif() |
| 3079 | + endif() |
| 3080 | +endfunction() |
| 3081 | + |
| 3082 | # Test dependencies |
| 3083 | if(USER_BUILD_TEST) |
| 3084 | # NOTE1: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake |
| 3085 | @@ -83,14 +303,14 @@ if(USER_BUILD_TEST) |
| 3086 | # mode targets. Otherwise if MODULE or CONFIG succeeded, then it prints the result to the |
| 3087 | # console via a non-QUIET find_package call and if CONFIG succeeded, creates ALIAS targets |
| 3088 | # with the MODULE IMPORTED names. |
| 3089 | - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) |
| 3090 | + if(NOT EXTERNAL_DEPS_FORCE_DOWNLOAD) |
| 3091 | find_package(GTest QUIET) |
| 3092 | endif() |
| 3093 | if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) |
| 3094 | option(BUILD_GTEST "Builds the googletest subproject" ON) |
| 3095 | option(BUILD_GMOCK "Builds the googlemock subproject" OFF) |
| 3096 | option(INSTALL_GTEST "Enable installation of googletest." OFF) |
| 3097 | - if(EXISTS /usr/src/googletest AND NOT DEPENDENCIES_FORCE_DOWNLOAD) |
| 3098 | + if(EXISTS /usr/src/googletest AND NOT EXTERNAL_DEPS_FORCE_DOWNLOAD) |
| 3099 | FetchContent_Declare( |
| 3100 | googletest |
| 3101 | SOURCE_DIR /usr/src/googletest |
| 3102 | @@ -116,7 +336,7 @@ if(USER_BUILD_TEST) |
| 3103 | endif(USER_BUILD_TEST) |
| 3104 | |
| 3105 | if(USER_BUILD_BENCHMARK) |
| 3106 | - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) |
| 3107 | + if(NOT EXTERNAL_DEPS_FORCE_DOWNLOAD) |
| 3108 | find_package(benchmark CONFIG QUIET) |
| 3109 | endif() |
| 3110 | if(NOT TARGET benchmark::benchmark) |
| 3111 | @@ -139,20 +359,13 @@ endif(USER_BUILD_BENCHMARK) |
| 3112 | |
| 3113 | # CUB (only for CUDA platform) |
| 3114 | if(HIP_COMPILER STREQUAL "nvcc") |
| 3115 | - set(CCCL_MINIMUM_VERSION 2.5.0) |
| 3116 | + set(CCCL_MINIMUM_VERSION 2.8.2) |
| 3117 | if(NOT DOWNLOAD_CUB) |
| 3118 | - find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG) |
| 3119 | - find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG) |
| 3120 | - find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG) |
| 3121 | + find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG) |
| 3122 | endif() |
| 3123 | |
| 3124 | - if (NOT CUB_FOUND OR NOT Thrust_FOUND OR NOT libcudacxx_FOUND) |
| 3125 | - if(CUB_FOUND OR Thrust_FOUND OR libcudacxx_FOUND) |
| 3126 | - message(WARNING "Found one of CUB, Thrust or libcu++, but not all of them. |
| 3127 | - This can lead to mixing different potentially incompatible versions.") |
| 3128 | - endif() |
| 3129 | - |
| 3130 | - message(STATUS "CUB, Thrust or libcu++ not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}") |
| 3131 | + if (NOT CCCL_FOUND) |
| 3132 | + message(STATUS "CCCL not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}") |
| 3133 | file(DOWNLOAD https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip |
| 3134 | ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip |
| 3135 | STATUS cccl_download_status LOG cccl_download_log) |
| 3136 | @@ -176,25 +389,24 @@ if(HIP_COMPILER STREQUAL "nvcc") |
| 3137 | endif() |
| 3138 | endif() |
| 3139 | |
| 3140 | - find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH |
| 3141 | - PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/cub) |
| 3142 | - find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH |
| 3143 | - PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/thrust) |
| 3144 | - find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH |
| 3145 | - PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/libcudacxx) |
| 3146 | + find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH |
| 3147 | + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}) |
| 3148 | endif() |
| 3149 | else() |
| 3150 | # rocPRIM (only for ROCm platform) |
| 3151 | - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) |
| 3152 | - # Add default install location for WIN32 and non-WIN32 as hint |
| 3153 | - find_package(rocprim CONFIG QUIET PATHS "${ROCM_ROOT}/lib/cmake/rocprim") |
| 3154 | - endif() |
| 3155 | - if(NOT TARGET roc::rocprim) |
| 3156 | - message(STATUS "rocPRIM not found. Fetching...") |
| 3157 | + fetch_dep(ROCPRIM_FETCH_METHOD rocprim ROCPRIM_PATH ROCM_DEP_RELEASE_BRANCH) |
| 3158 | + |
| 3159 | + if(${ROCPRIM_FETCH_METHOD} STREQUAL "DOWNLOAD" OR ${ROCPRIM_FETCH_METHOD} STREQUAL "MONOREPO") |
| 3160 | + # The fetch_dep call above should have downloaded/located the source. We just need to make it available. |
| 3161 | + message(STATUS "Configuring rocPRIM") |
| 3162 | FetchContent_Declare( |
| 3163 | - prim |
| 3164 | - GIT_REPOSITORY https://github.com/ROCm/rocPRIM.git |
| 3165 | - GIT_TAG develop |
| 3166 | + prim |
| 3167 | + SOURCE_DIR ${ROCPRIM_PATH} |
| 3168 | + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/deps/rocprim |
| 3169 | + CMAKE_ARGS -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_PREFIX_PATH=/opt/rocm |
| 3170 | + LOG_CONFIGURE TRUE |
| 3171 | + LOG_BUILD TRUE |
| 3172 | + LOG_INSTALL TRUE |
| 3173 | ) |
| 3174 | FetchContent_MakeAvailable(prim) |
| 3175 | if(NOT TARGET roc::rocprim) |
| 3176 | @@ -203,8 +415,6 @@ else() |
| 3177 | if(NOT TARGET roc::rocprim_hip) |
| 3178 | add_library(roc::rocprim_hip ALIAS rocprim_hip) |
| 3179 | endif() |
| 3180 | - else() |
| 3181 | - find_package(rocprim CONFIG REQUIRED) |
| 3182 | endif() |
| 3183 | endif() |
| 3184 | |
| 3185 | diff --git a/cmake/GenerateResourceSpec.cmake b/cmake/GenerateResourceSpec.cmake |
| 3186 | index 7810a20..d485ca4 100644 |
| 3187 | --- a/cmake/GenerateResourceSpec.cmake |
| 3188 | +++ b/cmake/GenerateResourceSpec.cmake |
| 3189 | @@ -17,10 +17,11 @@ execute_process( |
| 3190 | |
| 3191 | if(ROCMINFO_EXIT_CODE) |
| 3192 | message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}") |
| 3193 | + message(SEND_ERROR ${ROCMINFO_STDOUT}) |
| 3194 | message(FATAL_ERROR ${ROCMINFO_STDERR}) |
| 3195 | endif() |
| 3196 | |
| 3197 | -string(REGEX MATCHALL [[--(gfx[0-9]+)]] |
| 3198 | +string(REGEX MATCHALL [[--(gfx[0-9a-f]+)]] |
| 3199 | ROCMINFO_MATCHES |
| 3200 | ${ROCMINFO_STDOUT} |
| 3201 | ) |
| 3202 | @@ -37,7 +38,7 @@ string(REGEX MATCHALL [[--(gfx[0-9]+)]] |
| 3203 | # std::string ip; |
| 3204 | # int id; |
| 3205 | # }; |
| 3206 | -# |
| 3207 | +# |
| 3208 | # std::vector<device> GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} }; |
| 3209 | # std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(), |
| 3210 | # [](const device& lhs, const device& rhs) |
| 3211 | @@ -85,14 +86,14 @@ list(SORT GFXIP_AND_ID) |
| 3212 | set(JSON_PAYLOAD) |
| 3213 | set(IT1 0) |
| 3214 | list(GET GFXIP_AND_ID ${IT1} I1) |
| 3215 | -string(REGEX REPLACE ":[0-9]+" "" IP1 ${I1}) |
| 3216 | +string(REGEX REPLACE ":[0-9a-f]+" "" IP1 ${I1}) |
| 3217 | list(LENGTH GFXIP_AND_ID COUNT) |
| 3218 | while(IT1 LESS COUNT) |
| 3219 | string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [") |
| 3220 | set(IT2 ${IT1}) |
| 3221 | list(GET GFXIP_AND_ID ${IT2} I2) |
| 3222 | - string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2}) |
| 3223 | - string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2}) |
| 3224 | + string(REGEX REPLACE [[:[0-9a-f]+$]] "" IP2 ${I2}) |
| 3225 | + string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2}) |
| 3226 | while(${IP2} STREQUAL ${IP1} AND IT2 LESS COUNT) |
| 3227 | string(APPEND JSON_PAYLOAD |
| 3228 | "\n {\n" |
| 3229 | @@ -102,8 +103,8 @@ while(IT1 LESS COUNT) |
| 3230 | math(EXPR IT2 "${IT2} + 1") |
| 3231 | if(IT2 LESS COUNT) |
| 3232 | list(GET GFXIP_AND_ID ${IT2} I2) |
| 3233 | - string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2}) |
| 3234 | - string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2}) |
| 3235 | + string(REGEX REPLACE [[:[0-9a-f]+$]] "" IP2 ${I2}) |
| 3236 | + string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2}) |
| 3237 | endif() |
| 3238 | endwhile() |
| 3239 | string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD}) |
| 3240 | diff --git a/cmake/ROCmCMakeBuildToolsDependency.cmake b/cmake/ROCmCMakeBuildToolsDependency.cmake |
| 3241 | index 3f8e1fc..8c913b5 100644 |
| 3242 | --- a/cmake/ROCmCMakeBuildToolsDependency.cmake |
| 3243 | +++ b/cmake/ROCmCMakeBuildToolsDependency.cmake |
| 3244 | @@ -49,6 +49,5 @@ include(ROCMCreatePackage) |
| 3245 | include(ROCMInstallTargets) |
| 3246 | include(ROCMPackageConfigHelpers) |
| 3247 | include(ROCMInstallSymlinks) |
| 3248 | -include(ROCMHeaderWrapper) |
| 3249 | include(ROCMCheckTargetIds) |
| 3250 | include(ROCMClients) |
| 3251 | diff --git a/cmake/SetupNVCC.cmake b/cmake/SetupNVCC.cmake |
| 3252 | index 5ec377d..46b88d2 100644 |
| 3253 | --- a/cmake/SetupNVCC.cmake |
| 3254 | +++ b/cmake/SetupNVCC.cmake |
| 3255 | @@ -81,7 +81,7 @@ endif() |
| 3256 | |
| 3257 | # Get CUDA |
| 3258 | enable_language("CUDA") |
| 3259 | -set(CMAKE_CUDA_STANDARD 14) |
| 3260 | +set(CMAKE_CUDA_STANDARD 17) |
| 3261 | |
| 3262 | # Suppressing warnings |
| 3263 | set(HIP_NVCC_FLAGS " ${HIP_NVCC_FLAGS} -Wno-deprecated-gpu-targets -Xcompiler -Wno-return-type -Wno-deprecated-declarations ") |
| 3264 | diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake |
| 3265 | index 7a9af98..9dd5035 100644 |
| 3266 | --- a/cmake/Summary.cmake |
| 3267 | +++ b/cmake/Summary.cmake |
| 3268 | @@ -1,6 +1,6 @@ |
| 3269 | # MIT License |
| 3270 | # |
| 3271 | -# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. |
| 3272 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 3273 | # |
| 3274 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3275 | # of this software and associated documentation files (the "Software"), to deal |
| 3276 | @@ -21,6 +21,39 @@ |
| 3277 | # SOFTWARE. |
| 3278 | |
| 3279 | function(print_configuration_summary) |
| 3280 | + find_package(Git) |
| 3281 | + if(GIT_FOUND) |
| 3282 | + execute_process( |
| 3283 | + COMMAND ${GIT_EXECUTABLE} show --format=%H --no-patch |
| 3284 | + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} |
| 3285 | + OUTPUT_VARIABLE COMMIT_HASH |
| 3286 | + OUTPUT_STRIP_TRAILING_WHITESPACE |
| 3287 | + ) |
| 3288 | + execute_process( |
| 3289 | + COMMAND ${GIT_EXECUTABLE} show --format=%s --no-patch |
| 3290 | + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} |
| 3291 | + OUTPUT_VARIABLE COMMIT_SUBJECT |
| 3292 | + OUTPUT_STRIP_TRAILING_WHITESPACE |
| 3293 | + ) |
| 3294 | + endif() |
| 3295 | + |
| 3296 | + execute_process( |
| 3297 | + COMMAND ${CMAKE_CXX_COMPILER} --version |
| 3298 | + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} |
| 3299 | + OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERBOSE_DETAILS |
| 3300 | + OUTPUT_STRIP_TRAILING_WHITESPACE |
| 3301 | + ) |
| 3302 | + |
| 3303 | + find_program(UNAME_EXECUTABLE uname) |
| 3304 | + if(UNAME_EXECUTABLE) |
| 3305 | + execute_process( |
| 3306 | + COMMAND ${UNAME_EXECUTABLE} -a |
| 3307 | + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} |
| 3308 | + OUTPUT_VARIABLE LINUX_KERNEL_DETAILS |
| 3309 | + OUTPUT_STRIP_TRAILING_WHITESPACE |
| 3310 | + ) |
| 3311 | + endif() |
| 3312 | + |
| 3313 | message(STATUS "") |
| 3314 | message(STATUS "******** Summary ********") |
| 3315 | message(STATUS "General:") |
| 3316 | @@ -55,8 +88,24 @@ else() |
| 3317 | message(STATUS " Device targets : ${NVGPU_TARGETS}") |
| 3318 | endif() |
| 3319 | message(STATUS "") |
| 3320 | - message(STATUS " DEPENDENCIES_FORCE_DOWNLOAD : ${DEPENDENCIES_FORCE_DOWNLOAD}") |
| 3321 | + message(STATUS " EXTERNAL_DEPS_FORCE_DOWNLOAD: ${EXTERNAL_DEPS_FORCE_DOWNLOAD}") |
| 3322 | + message(STATUS " ROCPRIM_FETCH_METHOD : ${ROCPRIM_FETCH_METHOD}") |
| 3323 | message(STATUS " BUILD_TEST : ${BUILD_TEST}") |
| 3324 | +if(BUILD_TEST) |
| 3325 | + message(STATUS " CODE_COVERAGE : ${CODE_COVERAGE}") |
| 3326 | +endif() |
| 3327 | message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") |
| 3328 | message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}") |
| 3329 | + message(STATUS " BUILD_OFFLOAD_COMPRESS : ${BUILD_OFFLOAD_COMPRESS}") |
| 3330 | + message(STATUS " USE_SYSTEM_LIB : ${USE_SYSTEM_LIB}") |
| 3331 | + message(STATUS "") |
| 3332 | + message(STATUS "Detailed:") |
| 3333 | + message(STATUS " C++ compiler details : \n${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}") |
| 3334 | +if(GIT_FOUND) |
| 3335 | + message(STATUS " Commit : ${COMMIT_HASH}") |
| 3336 | + message(STATUS " ${COMMIT_SUBJECT}") |
| 3337 | +endif() |
| 3338 | +if(UNAME_EXECUTABLE) |
| 3339 | + message(STATUS " Unix name : ${LINUX_KERNEL_DETAILS}") |
| 3340 | +endif() |
| 3341 | endfunction() |
| 3342 | diff --git a/debian/changelog b/debian/changelog |
| 3343 | index a096666..2dc9fc0 100644 |
| 3344 | --- a/debian/changelog |
| 3345 | +++ b/debian/changelog |
| 3346 | @@ -1,3 +1,28 @@ |
| 3347 | +hipcub (7.1.0-0ubuntu2) resolute; urgency=medium |
| 3348 | + |
| 3349 | + * Revert to upstream toolchain (from ROCm LLVM fork) |
| 3350 | + - d/control: update dependencies and rollback from ROCm fork |
| 3351 | + - d/{control, tests/control}: drop ppc64el architecture from tests |
| 3352 | + |
| 3353 | + -- Bruno Bernardo de Moura <bruno.moura@canonical.com> Fri, 23 Jan 2026 10:36:40 -0300 |
| 3354 | + |
| 3355 | +hipcub (7.1.0-0ubuntu1) resolute; urgency=medium |
| 3356 | + |
| 3357 | + * New upstream version 7.1.0 |
| 3358 | + * d/p/0001-install-cmake-config-to-share.patch: refresh patch |
| 3359 | + * d/p/0002-default-thread-load-store-cache-modifiers-to-off.patch: |
| 3360 | + remove patch as suggested on patch description |
| 3361 | + |
| 3362 | + -- Bruno Bernardo de Moura <bruno.moura@canonical.com> Wed, 07 Jan 2026 13:45:35 -0300 |
| 3363 | + |
| 3364 | +hipcub (6.4.3-2ubuntu1) questing; urgency=medium |
| 3365 | + |
| 3366 | + * d/control: update maintainer field |
| 3367 | + * d/rules: fix FTBFS by adding -Wl,--gc-sections to flags |
| 3368 | + * d/control: update build-depends |
| 3369 | + |
| 3370 | + -- Igor Luppi <igor.luppi@canonical.com> Tue, 28 Oct 2025 12:39:22 -0300 |
| 3371 | + |
| 3372 | hipcub (6.4.3-2) unstable; urgency=medium |
| 3373 | |
| 3374 | * Add d/p/0002-default-thread-load-store-cache-modifiers-to-off.patch |
| 3375 | diff --git a/debian/control b/debian/control |
| 3376 | index ced5ff6..33f2971 100644 |
| 3377 | --- a/debian/control |
| 3378 | +++ b/debian/control |
| 3379 | @@ -5,17 +5,18 @@ Priority: optional |
| 3380 | Standards-Version: 4.7.2 |
| 3381 | Vcs-Git: https://salsa.debian.org/rocm-team/hipcub.git |
| 3382 | Vcs-Browser: https://salsa.debian.org/rocm-team/hipcub |
| 3383 | -Maintainer: Debian ROCm Team <debian-ai@lists.debian.org> |
| 3384 | +Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com> |
| 3385 | +XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org> |
| 3386 | Uploaders: Cordell Bloor <cgmb@debian.org>, |
| 3387 | Christian Kastner <ckk@debian.org>, |
| 3388 | Kari Pahula <kaol@debian.org> |
| 3389 | Build-Depends: debhelper-compat (= 13), |
| 3390 | cmake, |
| 3391 | - hipcc (>= 7.0.1~), |
| 3392 | - libamd-comgr-dev (>= 6.4~), |
| 3393 | + hipcc (>= 7.1~), |
| 3394 | + libamd-comgr-dev (>= 7.1~), |
| 3395 | libgtest-dev <!nocheck>, |
| 3396 | - libhsa-runtime-dev (>= 6.4~), |
| 3397 | - librocprim-dev (>= 6.4.1~), |
| 3398 | + libhsa-runtime-dev (>= 7.1~), |
| 3399 | + librocprim-dev (>= 7.1~), |
| 3400 | pkg-rocm-tools (>= 0.9.3~), |
| 3401 | rocm-cmake |
| 3402 | Rules-Requires-Root: no |
| 3403 | @@ -33,7 +34,7 @@ Description: portable interface for GPU parallel primitives - headers |
| 3404 | |
| 3405 | Package: libhipcub-tests |
| 3406 | Section: libdevel |
| 3407 | -Architecture: amd64 arm64 ppc64el |
| 3408 | +Architecture: amd64 arm64 |
| 3409 | XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture} |
| 3410 | Build-Profiles: <!nocheck> |
| 3411 | Depends: ${misc:Depends}, ${shlibs:Depends} |
| 3412 | diff --git a/debian/patches/0001-install-cmake-config-to-share.patch b/debian/patches/0001-install-cmake-config-to-share.patch |
| 3413 | index 2dafdc1..78551be 100644 |
| 3414 | --- a/debian/patches/0001-install-cmake-config-to-share.patch |
| 3415 | +++ b/debian/patches/0001-install-cmake-config-to-share.patch |
| 3416 | @@ -10,10 +10,10 @@ libraries, such as libcub-dev. |
| 3417 | cmake/ROCMExportTargetsHeaderOnly.cmake | 2 +- |
| 3418 | 1 file changed, 1 insertion(+), 1 deletion(-) |
| 3419 | |
| 3420 | -diff --git a/cmake/ROCMExportTargetsHeaderOnly.cmake b/cmake/ROCMExportTargetsHeaderOnly.cmake |
| 3421 | -index 3f7d814..2c8088d 100644 |
| 3422 | ---- a/cmake/ROCMExportTargetsHeaderOnly.cmake |
| 3423 | -+++ b/cmake/ROCMExportTargetsHeaderOnly.cmake |
| 3424 | +Index: hipcub/cmake/ROCMExportTargetsHeaderOnly.cmake |
| 3425 | +=================================================================== |
| 3426 | +--- hipcub.orig/cmake/ROCMExportTargetsHeaderOnly.cmake |
| 3427 | ++++ hipcub/cmake/ROCMExportTargetsHeaderOnly.cmake |
| 3428 | @@ -29,7 +29,7 @@ include(GNUInstallDirs) |
| 3429 | include(ROCMPackageConfigHelpers) |
| 3430 | include(ROCMInstallTargets) |
| 3431 | diff --git a/debian/patches/0002-default-thread-load-store-cache-modifiers-to-off.patch b/debian/patches/0002-default-thread-load-store-cache-modifiers-to-off.patch |
| 3432 | deleted file mode 100644 |
| 3433 | index 7cb5d48..0000000 |
| 3434 | --- a/debian/patches/0002-default-thread-load-store-cache-modifiers-to-off.patch |
| 3435 | +++ /dev/null |
| 3436 | @@ -1,41 +0,0 @@ |
| 3437 | -From: Cordell Bloor <cgmb@debian.org> |
| 3438 | -Date: Fri, 7 Nov 2025 11:40:56 -0700 |
| 3439 | -Subject: default thread load-store cache modifiers to off |
| 3440 | - |
| 3441 | -The assembly code used for this is invalid on RDNA 3, but it wasn't |
| 3442 | -noticed with older compilers. The broken code was entirely removed |
| 3443 | -by ROCm 7.1, so this patch can be dropped after updating hipcub |
| 3444 | -to that release. |
| 3445 | - |
| 3446 | -Forwarded: not-needed |
| 3447 | ---- |
| 3448 | - hipcub/include/hipcub/thread/thread_load.hpp | 2 +- |
| 3449 | - hipcub/include/hipcub/thread/thread_store.hpp | 2 +- |
| 3450 | - 2 files changed, 2 insertions(+), 2 deletions(-) |
| 3451 | - |
| 3452 | -diff --git a/hipcub/include/hipcub/thread/thread_load.hpp b/hipcub/include/hipcub/thread/thread_load.hpp |
| 3453 | -index 89b4b8b..f50239e 100644 |
| 3454 | ---- a/hipcub/include/hipcub/thread/thread_load.hpp |
| 3455 | -+++ b/hipcub/include/hipcub/thread/thread_load.hpp |
| 3456 | -@@ -33,7 +33,7 @@ |
| 3457 | - #ifdef __HIP_PLATFORM_AMD__ |
| 3458 | - |
| 3459 | - #ifndef HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS |
| 3460 | -- #define HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS 1 |
| 3461 | -+ #define HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS 0 |
| 3462 | - #endif |
| 3463 | - |
| 3464 | - #include "../backend/rocprim/thread/thread_load.hpp" |
| 3465 | -diff --git a/hipcub/include/hipcub/thread/thread_store.hpp b/hipcub/include/hipcub/thread/thread_store.hpp |
| 3466 | -index df61aaa..9af0535 100644 |
| 3467 | ---- a/hipcub/include/hipcub/thread/thread_store.hpp |
| 3468 | -+++ b/hipcub/include/hipcub/thread/thread_store.hpp |
| 3469 | -@@ -33,7 +33,7 @@ |
| 3470 | - #ifdef __HIP_PLATFORM_AMD__ |
| 3471 | - |
| 3472 | - #ifndef HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS |
| 3473 | -- #define HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS 1 |
| 3474 | -+ #define HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS 0 |
| 3475 | - #endif |
| 3476 | - |
| 3477 | - #include "../backend/rocprim/thread/thread_store.hpp" |
| 3478 | diff --git a/debian/patches/series b/debian/patches/series |
| 3479 | index 4315635..6d27231 100644 |
| 3480 | --- a/debian/patches/series |
| 3481 | +++ b/debian/patches/series |
| 3482 | @@ -1,2 +1 @@ |
| 3483 | 0001-install-cmake-config-to-share.patch |
| 3484 | -0002-default-thread-load-store-cache-modifiers-to-off.patch |
| 3485 | diff --git a/debian/rules b/debian/rules |
| 3486 | index 643b44e..6ff7d6b 100755 |
| 3487 | --- a/debian/rules |
| 3488 | +++ b/debian/rules |
| 3489 | @@ -2,6 +2,12 @@ |
| 3490 | export CXX=hipcc |
| 3491 | export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto |
| 3492 | export DEB_CXXFLAGS_MAINT_PREPEND = -gz -DROCPRIM_NO_DEPRECATION_WARNINGS |
| 3493 | +# -Wl,--gc-sections: |
| 3494 | +# Fixes linker errors ("relocation refers to a discarded section") when using gtest. |
| 3495 | +# The linker's section garbage collection incorrectly discards needed code due to |
| 3496 | +# a toolchain mismatch (ROCm's clang vs. the system's GCC). This flag forces the |
| 3497 | +# linker to correctly re-evaluate dependencies, keeping the required sections. |
| 3498 | +export DEB_LDFLAGS_MAINT_PREPEND = -Wl,--gc-sections |
| 3499 | export VERBOSE=1 |
| 3500 | #export AMD_LOG_LEVEL=4 |
| 3501 | |
| 3502 | diff --git a/debian/tests/control b/debian/tests/control |
| 3503 | index 820f22f..ede74de 100644 |
| 3504 | --- a/debian/tests/control |
| 3505 | +++ b/debian/tests/control |
| 3506 | @@ -1,4 +1,4 @@ |
| 3507 | Test-Command: /bin/sh debian/tests/upstream-binaries libhipcub-tests |
| 3508 | Depends: libhipcub-tests |
| 3509 | Restrictions: allow-stderr, skippable |
| 3510 | -Architecture: amd64 arm64 ppc64el |
| 3511 | +Architecture: amd64 arm64 |
| 3512 | diff --git a/docs/conf.py b/docs/conf.py |
| 3513 | index 133736c..0cf04f8 100644 |
| 3514 | --- a/docs/conf.py |
| 3515 | +++ b/docs/conf.py |
| 3516 | @@ -18,7 +18,7 @@ left_nav_title = f"hipCUB {version_number} Documentation" |
| 3517 | # for PDF output on Read the Docs |
| 3518 | project = "hipCUB Documentation" |
| 3519 | author = "Advanced Micro Devices, Inc." |
| 3520 | -copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." |
| 3521 | +copyright = "Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved." |
| 3522 | version = version_number |
| 3523 | release = version_number |
| 3524 | |
| 3525 | @@ -33,3 +33,7 @@ external_projects_current_project = "hipcub" |
| 3526 | |
| 3527 | for sphinx_var in ROCmDocs.SPHINX_VARS: |
| 3528 | globals()[sphinx_var] = getattr(docs_core, sphinx_var) |
| 3529 | + |
| 3530 | +# Suppresses "WARNING: toctree directive not expected with external-toc" |
| 3531 | +# Ideally suppression wouldn't be needed; see sphinx-external-toc#36 |
| 3532 | +suppress_warnings = ["etoc.toctree"] |
| 3533 | diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile |
| 3534 | index 1f4b7b7..16f2d40 100644 |
| 3535 | --- a/docs/doxygen/Doxyfile |
| 3536 | +++ b/docs/doxygen/Doxyfile |
| 3537 | @@ -1,4 +1,4 @@ |
| 3538 | -# Doxyfile 1.8.11 |
| 3539 | +# Doxyfile 1.9.4 |
| 3540 | |
| 3541 | # This file describes the settings to be used by the documentation system |
| 3542 | # doxygen (www.doxygen.org) for a project. |
| 3543 | @@ -12,16 +12,25 @@ |
| 3544 | # For lists, items can also be appended using: |
| 3545 | # TAG += value [value, ...] |
| 3546 | # Values that contain spaces should be placed between quotes (\" \"). |
| 3547 | +# |
| 3548 | +# Note: |
| 3549 | +# |
| 3550 | +# Use doxygen to compare the used configuration file with the template |
| 3551 | +# configuration file: |
| 3552 | +# doxygen -x [configFile] |
| 3553 | +# Use doxygen to compare the used configuration file with the template |
| 3554 | +# configuration file without replacing the environment variables: |
| 3555 | +# doxygen -x_noenv [configFile] |
| 3556 | |
| 3557 | #--------------------------------------------------------------------------- |
| 3558 | # Project related configuration options |
| 3559 | #--------------------------------------------------------------------------- |
| 3560 | |
| 3561 | -# This tag specifies the encoding used for all characters in the config file |
| 3562 | -# that follow. The default is UTF-8 which is also the encoding used for all text |
| 3563 | -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv |
| 3564 | -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv |
| 3565 | -# for the list of possible encodings. |
| 3566 | +# This tag specifies the encoding used for all characters in the configuration |
| 3567 | +# file that follow. The default is UTF-8 which is also the encoding used for all |
| 3568 | +# text before the first occurrence of this tag. Doxygen uses libiconv (or the |
| 3569 | +# iconv built into libc) for the transcoding. See |
| 3570 | +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. |
| 3571 | # The default value is: UTF-8. |
| 3572 | |
| 3573 | DOXYFILE_ENCODING = UTF-8 |
| 3574 | @@ -60,16 +69,28 @@ PROJECT_LOGO = |
| 3575 | |
| 3576 | OUTPUT_DIRECTORY = . |
| 3577 | |
| 3578 | -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- |
| 3579 | -# directories (in 2 levels) under the output directory of each output format and |
| 3580 | -# will distribute the generated files over these directories. Enabling this |
| 3581 | +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 |
| 3582 | +# sub-directories (in 2 levels) under the output directory of each output format |
| 3583 | +# and will distribute the generated files over these directories. Enabling this |
| 3584 | # option can be useful when feeding doxygen a huge amount of source files, where |
| 3585 | # putting all generated files in the same directory would otherwise cause |
| 3586 | -# performance problems for the file system. |
| 3587 | +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to |
| 3588 | +# control the number of sub-directories. |
| 3589 | # The default value is: NO. |
| 3590 | |
| 3591 | CREATE_SUBDIRS = NO |
| 3592 | |
| 3593 | +# Controls the number of sub-directories that will be created when |
| 3594 | +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every |
| 3595 | +# level increment doubles the number of directories, resulting in 4096 |
| 3596 | +# directories at level 8 which is the default and also the maximum value. The |
| 3597 | +# sub-directories are organized in 2 levels, the first level always has a fixed |
| 3598 | +# number of 16 directories. |
| 3599 | +# Minimum value: 0, maximum value: 8, default value: 8. |
| 3600 | +# This tag requires that the tag CREATE_SUBDIRS is set to YES. |
| 3601 | + |
| 3602 | +CREATE_SUBDIRS_LEVEL = 8 |
| 3603 | + |
| 3604 | # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII |
| 3605 | # characters to appear in the names of generated files. If set to NO, non-ASCII |
| 3606 | # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode |
| 3607 | @@ -81,14 +102,14 @@ ALLOW_UNICODE_NAMES = NO |
| 3608 | # The OUTPUT_LANGUAGE tag is used to specify the language in which all |
| 3609 | # documentation generated by doxygen is written. Doxygen will use this |
| 3610 | # information to generate all constant output in the proper language. |
| 3611 | -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, |
| 3612 | -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), |
| 3613 | -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, |
| 3614 | -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), |
| 3615 | -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, |
| 3616 | -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, |
| 3617 | -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, |
| 3618 | -# Ukrainian and Vietnamese. |
| 3619 | +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, |
| 3620 | +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English |
| 3621 | +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, |
| 3622 | +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with |
| 3623 | +# English messages), Korean, Korean-en (Korean with English messages), Latvian, |
| 3624 | +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, |
| 3625 | +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, |
| 3626 | +# Swedish, Turkish, Ukrainian and Vietnamese. |
| 3627 | # The default value is: English. |
| 3628 | |
| 3629 | OUTPUT_LANGUAGE = English |
| 3630 | @@ -118,7 +139,6 @@ REPEAT_BRIEF = YES |
| 3631 | # the entity):The $name class, The $name widget, The $name file, is, provides, |
| 3632 | # specifies, contains, represents, a, an and the. |
| 3633 | |
| 3634 | - |
| 3635 | ABBREVIATE_BRIEF = "The $name class" \ |
| 3636 | "The $name widget" \ |
| 3637 | "The $name file" \ |
| 3638 | @@ -190,6 +210,16 @@ SHORT_NAMES = NO |
| 3639 | |
| 3640 | JAVADOC_AUTOBRIEF = NO |
| 3641 | |
| 3642 | +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line |
| 3643 | +# such as |
| 3644 | +# /*************** |
| 3645 | +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the |
| 3646 | +# Javadoc-style will behave just like regular comments and it will not be |
| 3647 | +# interpreted by doxygen. |
| 3648 | +# The default value is: NO. |
| 3649 | + |
| 3650 | +JAVADOC_BANNER = NO |
| 3651 | + |
| 3652 | # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first |
| 3653 | # line (until the first dot) of a Qt-style comment as the brief description. If |
| 3654 | # set to NO, the Qt-style will behave just like regular Qt-style comments (thus |
| 3655 | @@ -210,6 +240,14 @@ QT_AUTOBRIEF = NO |
| 3656 | |
| 3657 | MULTILINE_CPP_IS_BRIEF = NO |
| 3658 | |
| 3659 | +# By default Python docstrings are displayed as preformatted text and doxygen's |
| 3660 | +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO, |
| 3661 | +# doxygen's special commands can be used and the contents of the docstring |
| 3662 | +# documentation blocks are shown as doxygen documentation. |
| 3663 | +# The default value is: YES. |
| 3664 | + |
| 3665 | +PYTHON_DOCSTRING = YES |
| 3666 | + |
| 3667 | # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the |
| 3668 | # documentation from any documented member that it re-implements. |
| 3669 | # The default value is: YES. |
| 3670 | @@ -233,20 +271,19 @@ TAB_SIZE = 4 |
| 3671 | # the documentation. An alias has the form: |
| 3672 | # name=value |
| 3673 | # For example adding |
| 3674 | -# "sideeffect=@par Side Effects:\n" |
| 3675 | +# "sideeffect=@par Side Effects:^^" |
| 3676 | # will allow you to put the command \sideeffect (or @sideeffect) in the |
| 3677 | # documentation, which will result in a user-defined paragraph with heading |
| 3678 | -# "Side Effects:". You can put \n's in the value part of an alias to insert |
| 3679 | -# newlines. |
| 3680 | +# "Side Effects:". Note that you cannot put \n's in the value part of an alias |
| 3681 | +# to insert newlines (in the resulting output). You can put ^^ in the value part |
| 3682 | +# of an alias to insert a newline as if a physical newline was in the original |
| 3683 | +# file. When you need a literal { or } or , in the value part of an alias you |
| 3684 | +# have to escape them by means of a backslash (\). This can lead to conflicts |
| 3685 | +# with the commands \{ and \}; for these it is advised to use the versions @{ and |
| 3686 | +# @} or use a double escape (\\{ and \\}). |
| 3687 | |
| 3688 | ALIASES = |
| 3689 | |
| 3690 | -# This tag can be used to specify a number of word-keyword mappings (TCL only). |
| 3691 | -# A mapping has the form "name=value". For example adding "class=itcl::class" |
| 3692 | -# will allow you to use the command class in the itcl::class meaning. |
| 3693 | - |
| 3694 | -TCL_SUBST = |
| 3695 | - |
| 3696 | # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources |
| 3697 | # only. Doxygen will then generate output that is more tailored for C. For |
| 3698 | # instance, some of the names that are used will be different. The list of all |
| 3699 | @@ -275,28 +312,40 @@ OPTIMIZE_FOR_FORTRAN = NO |
| 3700 | |
| 3701 | OPTIMIZE_OUTPUT_VHDL = NO |
| 3702 | |
| 3703 | +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice |
| 3704 | +# sources only. Doxygen will then generate output that is more tailored for that |
| 3705 | +# language. For instance, namespaces will be presented as modules, types will be |
| 3706 | +# separated into more groups, etc. |
| 3707 | +# The default value is: NO. |
| 3708 | + |
| 3709 | +OPTIMIZE_OUTPUT_SLICE = NO |
| 3710 | + |
| 3711 | # Doxygen selects the parser to use depending on the extension of the files it |
| 3712 | # parses. With this tag you can assign which parser to use for a given |
| 3713 | # extension. Doxygen has a built-in mapping, but you can override or extend it |
| 3714 | # using this tag. The format is ext=language, where ext is a file extension, and |
| 3715 | -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, |
| 3716 | -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: |
| 3717 | -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: |
| 3718 | -# Fortran. In the later case the parser tries to guess whether the code is fixed |
| 3719 | -# or free formatted code, this is the default for Fortran type files), VHDL. For |
| 3720 | -# instance to make doxygen treat .inc files as Fortran files (default is PHP), |
| 3721 | -# and .f files as C (default is Fortran), use: inc=Fortran f=C. |
| 3722 | +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, |
| 3723 | +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, |
| 3724 | +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: |
| 3725 | +# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser |
| 3726 | +# tries to guess whether the code is fixed or free formatted code, this is the |
| 3727 | +# default for Fortran type files). For instance to make doxygen treat .inc files |
| 3728 | +# as Fortran files (default is PHP), and .f files as C (default is Fortran), |
| 3729 | +# use: inc=Fortran f=C. |
| 3730 | # |
| 3731 | # Note: For files without extension you can use no_extension as a placeholder. |
| 3732 | # |
| 3733 | # Note that for custom extensions you also need to set FILE_PATTERNS otherwise |
| 3734 | -# the files are not read by doxygen. |
| 3735 | +# the files are not read by doxygen. When specifying no_extension you should add |
| 3736 | +# * to the FILE_PATTERNS. |
| 3737 | +# |
| 3738 | +# Note: see also the list of default file extension mappings. |
| 3739 | |
| 3740 | EXTENSION_MAPPING = |
| 3741 | |
| 3742 | # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments |
| 3743 | # according to the Markdown format, which allows for more readable |
| 3744 | -# documentation. See http://daringfireball.net/projects/markdown/ for details. |
| 3745 | +# documentation. See https://daringfireball.net/projects/markdown/ for details. |
| 3746 | # The output of markdown processing is further processed by doxygen, so you can |
| 3747 | # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in |
| 3748 | # case of backward compatibilities issues. |
| 3749 | @@ -304,6 +353,15 @@ EXTENSION_MAPPING = |
| 3750 | |
| 3751 | MARKDOWN_SUPPORT = YES |
| 3752 | |
| 3753 | +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up |
| 3754 | +# to that level are automatically included in the table of contents, even if |
| 3755 | +# they do not have an id attribute. |
| 3756 | +# Note: This feature currently applies only to Markdown headings. |
| 3757 | +# Minimum value: 0, maximum value: 99, default value: 5. |
| 3758 | +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. |
| 3759 | + |
| 3760 | +TOC_INCLUDE_HEADINGS = 5 |
| 3761 | + |
| 3762 | # When enabled doxygen tries to link words that correspond to documented |
| 3763 | # classes, or namespaces to their corresponding documentation. Such a link can |
| 3764 | # be prevented in individual cases by putting a % sign in front of the word or |
| 3765 | @@ -329,7 +387,7 @@ BUILTIN_STL_SUPPORT = NO |
| 3766 | CPP_CLI_SUPPORT = NO |
| 3767 | |
| 3768 | # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: |
| 3769 | -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen |
| 3770 | +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen |
| 3771 | # will parse them like normal C++ but will assume all classes use public instead |
| 3772 | # of private inheritance when no explicit protection keyword is present. |
| 3773 | # The default value is: NO. |
| 3774 | @@ -415,6 +473,19 @@ TYPEDEF_HIDES_STRUCT = NO |
| 3775 | |
| 3776 | LOOKUP_CACHE_SIZE = 0 |
| 3777 | |
| 3778 | +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use |
| 3779 | +# during processing. When set to 0 doxygen will base this on the number of |
| 3780 | +# cores available in the system. You can set it explicitly to a value larger |
| 3781 | +# than 0 to get more control over the balance between CPU load and processing |
| 3782 | +# speed. At this moment only the input processing can be done using multiple |
| 3783 | +# threads. Since this is still an experimental feature the default is set to 1, |
| 3784 | +# which effectively disables parallel processing. Please report any issues you |
| 3785 | +# encounter. Generating dot graphs in parallel is controlled by the |
| 3786 | +# DOT_NUM_THREADS setting. |
| 3787 | +# Minimum value: 0, maximum value: 32, default value: 1. |
| 3788 | + |
| 3789 | +NUM_PROC_THREADS = 1 |
| 3790 | + |
| 3791 | #--------------------------------------------------------------------------- |
| 3792 | # Build related configuration options |
| 3793 | #--------------------------------------------------------------------------- |
| 3794 | @@ -435,6 +506,12 @@ EXTRACT_ALL = NO |
| 3795 | |
| 3796 | EXTRACT_PRIVATE = NO |
| 3797 | |
| 3798 | +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual |
| 3799 | +# methods of a class will be included in the documentation. |
| 3800 | +# The default value is: NO. |
| 3801 | + |
| 3802 | +EXTRACT_PRIV_VIRTUAL = NO |
| 3803 | + |
| 3804 | # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal |
| 3805 | # scope will be included in the documentation. |
| 3806 | # The default value is: NO. |
| 3807 | @@ -472,6 +549,13 @@ EXTRACT_LOCAL_METHODS = NO |
| 3808 | |
| 3809 | EXTRACT_ANON_NSPACES = NO |
| 3810 | |
| 3811 | +# If this flag is set to YES, the name of an unnamed parameter in a declaration |
| 3812 | +# will be determined by the corresponding definition. By default unnamed |
| 3813 | +# parameters remain unnamed in the output. |
| 3814 | +# The default value is: YES. |
| 3815 | + |
| 3816 | +RESOLVE_UNNAMED_PARAMS = YES |
| 3817 | + |
| 3818 | # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all |
| 3819 | # undocumented members inside documented classes or files. If set to NO these |
| 3820 | # members will be included in the various overviews, but no documentation |
| 3821 | @@ -489,8 +573,8 @@ HIDE_UNDOC_MEMBERS = NO |
| 3822 | HIDE_UNDOC_CLASSES = NO |
| 3823 | |
| 3824 | # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend |
| 3825 | -# (class|struct|union) declarations. If set to NO, these declarations will be |
| 3826 | -# included in the documentation. |
| 3827 | +# declarations. If set to NO, these declarations will be included in the |
| 3828 | +# documentation. |
| 3829 | # The default value is: NO. |
| 3830 | |
| 3831 | HIDE_FRIEND_COMPOUNDS = NO |
| 3832 | @@ -509,11 +593,18 @@ HIDE_IN_BODY_DOCS = NO |
| 3833 | |
| 3834 | INTERNAL_DOCS = NO |
| 3835 | |
| 3836 | -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file |
| 3837 | -# names in lower-case letters. If set to YES, upper-case letters are also |
| 3838 | -# allowed. This is useful if you have classes or files whose names only differ |
| 3839 | -# in case and if your file system supports case sensitive file names. Windows |
| 3840 | -# and Mac users are advised to set this option to NO. |
| 3841 | +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be |
| 3842 | +# able to match the capabilities of the underlying filesystem. In case the |
| 3843 | +# filesystem is case sensitive (i.e. it supports files in the same directory |
| 3844 | +# whose names only differ in casing), the option must be set to YES to properly |
| 3845 | +# deal with such files in case they appear in the input. For filesystems that |
| 3846 | +# are not case sensitive the option should be set to NO to properly deal with |
| 3847 | +# output files written for symbols that only differ in casing, such as for two |
| 3848 | +# classes, one named CLASS and the other named Class, and to also support |
| 3849 | +# references to files without having to specify the exact matching casing. On |
| 3850 | +# Windows (including Cygwin) and MacOS, users should typically set this option |
| 3851 | +# to NO, whereas on Linux or other Unix flavors it should typically be set to |
| 3852 | +# YES. |
| 3853 | # The default value is: system dependent. |
| 3854 | |
| 3855 | CASE_SENSE_NAMES = YES |
| 3856 | @@ -532,6 +623,12 @@ HIDE_SCOPE_NAMES = NO |
| 3857 | |
| 3858 | HIDE_COMPOUND_REFERENCE= NO |
| 3859 | |
| 3860 | +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class |
| 3861 | +# will show which file needs to be included to use the class. |
| 3862 | +# The default value is: YES. |
| 3863 | + |
| 3864 | +SHOW_HEADERFILE = YES |
| 3865 | + |
| 3866 | # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of |
| 3867 | # the files that are included by a file in the documentation of that file. |
| 3868 | # The default value is: YES. |
| 3869 | @@ -689,7 +786,8 @@ FILE_VERSION_FILTER = |
| 3870 | # output files in an output format independent way. To create the layout file |
| 3871 | # that represents doxygen's defaults, run doxygen with the -l option. You can |
| 3872 | # optionally specify a file name after the option, if omitted DoxygenLayout.xml |
| 3873 | -# will be used as the name of the layout file. |
| 3874 | +# will be used as the name of the layout file. See also section "Changing the |
| 3875 | +# layout of pages" for information. |
| 3876 | # |
| 3877 | # Note that if you run doxygen from a directory containing a file called |
| 3878 | # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE |
| 3879 | @@ -700,7 +798,7 @@ LAYOUT_FILE = |
| 3880 | # The CITE_BIB_FILES tag can be used to specify one or more bib files containing |
| 3881 | # the reference definitions. This must be a list of .bib files. The .bib |
| 3882 | # extension is automatically appended if omitted. This requires the bibtex tool |
| 3883 | -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. |
| 3884 | +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. |
| 3885 | # For LaTeX the style of the bibliography can be controlled using |
| 3886 | # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the |
| 3887 | # search path. See also \cite for info how to create references. |
| 3888 | @@ -735,23 +833,35 @@ WARNINGS = YES |
| 3889 | WARN_IF_UNDOCUMENTED = YES |
| 3890 | |
| 3891 | # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for |
| 3892 | -# potential errors in the documentation, such as not documenting some parameters |
| 3893 | -# in a documented function, or documenting parameters that don't exist or using |
| 3894 | -# markup commands wrongly. |
| 3895 | +# potential errors in the documentation, such as documenting some parameters in |
| 3896 | +# a documented function twice, or documenting parameters that don't exist or |
| 3897 | +# using markup commands wrongly. |
| 3898 | # The default value is: YES. |
| 3899 | |
| 3900 | WARN_IF_DOC_ERROR = YES |
| 3901 | |
| 3902 | +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete |
| 3903 | +# function parameter documentation. If set to NO, doxygen will accept that some |
| 3904 | +# parameters have no documentation without warning. |
| 3905 | +# The default value is: YES. |
| 3906 | + |
| 3907 | +WARN_IF_INCOMPLETE_DOC = YES |
| 3908 | + |
| 3909 | # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that |
| 3910 | # are documented, but have no documentation for their parameters or return |
| 3911 | -# value. If set to NO, doxygen will only warn about wrong or incomplete |
| 3912 | -# parameter documentation, but not about the absence of documentation. |
| 3913 | +# value. If set to NO, doxygen will only warn about wrong parameter |
| 3914 | +# documentation, but not about the absence of documentation. If EXTRACT_ALL is |
| 3915 | +# set to YES then this flag will automatically be disabled. See also |
| 3916 | +# WARN_IF_INCOMPLETE_DOC |
| 3917 | # The default value is: NO. |
| 3918 | |
| 3919 | WARN_NO_PARAMDOC = NO |
| 3920 | |
| 3921 | # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when |
| 3922 | -# a warning is encountered. |
| 3923 | +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS |
| 3924 | +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but |
| 3925 | +# at the end of the doxygen process doxygen will return with a non-zero status. |
| 3926 | +# Possible values are: NO, YES and FAIL_ON_WARNINGS. |
| 3927 | # The default value is: NO. |
| 3928 | |
| 3929 | WARN_AS_ERROR = YES |
| 3930 | @@ -762,13 +872,27 @@ WARN_AS_ERROR = YES |
| 3931 | # and the warning text. Optionally the format may contain $version, which will |
| 3932 | # be replaced by the version of the file (if it could be obtained via |
| 3933 | # FILE_VERSION_FILTER) |
| 3934 | +# See also: WARN_LINE_FORMAT |
| 3935 | # The default value is: $file:$line: $text. |
| 3936 | |
| 3937 | WARN_FORMAT = "$file:$line: $text" |
| 3938 | |
| 3939 | +# In the $text part of the WARN_FORMAT command it is possible that a reference |
| 3940 | +# to a more specific place is given. To make it easier to jump to this place |
| 3941 | +# (outside of doxygen) the user can define a custom "cut" / "paste" string. |
| 3942 | +# Example: |
| 3943 | +# WARN_LINE_FORMAT = "'vi $file +$line'" |
| 3944 | +# See also: WARN_FORMAT |
| 3945 | +# The default value is: at line $line of file $file. |
| 3946 | + |
| 3947 | +WARN_LINE_FORMAT = "at line $line of file $file" |
| 3948 | + |
| 3949 | # The WARN_LOGFILE tag can be used to specify a file to which warning and error |
| 3950 | # messages should be written. If left blank the output is written to standard |
| 3951 | -# error (stderr). |
| 3952 | +# error (stderr). In case the file specified cannot be opened for writing the |
| 3953 | +# warning and error messages are written to standard error. When - is specified |
| 3954 | +# as the file, the warning and error messages are written to standard output |
| 3955 | +# (stdout). |
| 3956 | |
| 3957 | WARN_LOGFILE = |
| 3958 | |
| 3959 | @@ -787,8 +911,8 @@ INPUT = ../../hipcub/include/hipcub |
| 3960 | # This tag can be used to specify the character encoding of the source files |
| 3961 | # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses |
| 3962 | # libiconv (or the iconv built into libc) for the transcoding. See the libiconv |
| 3963 | -# documentation (see: http://www.gnu.org/software/libiconv) for the list of |
| 3964 | -# possible encodings. |
| 3965 | +# documentation (see: |
| 3966 | +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. |
| 3967 | # The default value is: UTF-8. |
| 3968 | |
| 3969 | INPUT_ENCODING = UTF-8 |
| 3970 | @@ -801,11 +925,15 @@ INPUT_ENCODING = UTF-8 |
| 3971 | # need to set EXTENSION_MAPPING for the extension otherwise the files are not |
| 3972 | # read by doxygen. |
| 3973 | # |
| 3974 | +# Note the list of default checked file patterns might differ from the list of |
| 3975 | +# default file extension mappings. |
| 3976 | +# |
| 3977 | # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, |
| 3978 | # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, |
| 3979 | -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, |
| 3980 | -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, |
| 3981 | -# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. |
| 3982 | +# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, |
| 3983 | +# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C |
| 3984 | +# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, |
| 3985 | +# *.vhdl, *.ucf, *.qsf and *.ice. |
| 3986 | |
| 3987 | FILE_PATTERNS = |
| 3988 | |
| 3989 | @@ -838,13 +966,14 @@ EXCLUDE_SYMLINKS = NO |
| 3990 | # Note that the wildcards are matched against the file with absolute path, so to |
| 3991 | # exclude all test directories for example use the pattern */test/* |
| 3992 | |
| 3993 | -EXCLUDE_PATTERNS = */detail/*,*/backend/* |
| 3994 | +EXCLUDE_PATTERNS = */detail/* \ |
| 3995 | + */backend/* |
| 3996 | |
| 3997 | # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names |
| 3998 | # (namespaces, classes, functions, etc.) that should be excluded from the |
| 3999 | # output. The symbol name can be a fully qualified name, a word, or if the |
| 4000 | # wildcard * is used, a substring. Examples: ANamespace, AClass, |
| 4001 | -# AClass::ANamespace, ANamespace::*Test |
| 4002 | +# ANamespace::AClass, ANamespace::*Test |
| 4003 | # |
| 4004 | # Note that the wildcards are matched against the file with absolute path, so to |
| 4005 | # exclude all test directories use the pattern */test/* |
| 4006 | @@ -960,7 +1089,7 @@ INLINE_SOURCES = NO |
| 4007 | STRIP_CODE_COMMENTS = YES |
| 4008 | |
| 4009 | # If the REFERENCED_BY_RELATION tag is set to YES then for each documented |
| 4010 | -# function all documented functions referencing it will be listed. |
| 4011 | +# entity all documented functions referencing it will be listed. |
| 4012 | # The default value is: NO. |
| 4013 | |
| 4014 | REFERENCED_BY_RELATION = NO |
| 4015 | @@ -992,12 +1121,12 @@ SOURCE_TOOLTIPS = YES |
| 4016 | # If the USE_HTAGS tag is set to YES then the references to source code will |
| 4017 | # point to the HTML generated by the htags(1) tool instead of doxygen built-in |
| 4018 | # source browser. The htags tool is part of GNU's global source tagging system |
| 4019 | -# (see http://www.gnu.org/software/global/global.html). You will need version |
| 4020 | +# (see https://www.gnu.org/software/global/global.html). You will need version |
| 4021 | # 4.8.6 or higher. |
| 4022 | # |
| 4023 | # To use it do the following: |
| 4024 | # - Install the latest version of global |
| 4025 | -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file |
| 4026 | +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file |
| 4027 | # - Make sure the INPUT points to the root of the source tree |
| 4028 | # - Run doxygen as normal |
| 4029 | # |
| 4030 | @@ -1020,16 +1149,24 @@ USE_HTAGS = NO |
| 4031 | VERBATIM_HEADERS = YES |
| 4032 | |
| 4033 | # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the |
| 4034 | -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the |
| 4035 | -# cost of reduced performance. This can be particularly helpful with template |
| 4036 | -# rich C++ code for which doxygen's built-in parser lacks the necessary type |
| 4037 | -# information. |
| 4038 | +# clang parser (see: |
| 4039 | +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced |
| 4040 | +# performance. This can be particularly helpful with template rich C++ code for |
| 4041 | +# which doxygen's built-in parser lacks the necessary type information. |
| 4042 | # Note: The availability of this option depends on whether or not doxygen was |
| 4043 | -# generated with the -Duse-libclang=ON option for CMake. |
| 4044 | +# generated with the -Duse_libclang=ON option for CMake. |
| 4045 | # The default value is: NO. |
| 4046 | |
| 4047 | CLANG_ASSISTED_PARSING = NO |
| 4048 | |
| 4049 | +# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS |
| 4050 | +# tag is set to YES then doxygen will add the directory of each input to the |
| 4051 | +# include path. |
| 4052 | +# The default value is: YES. |
| 4053 | +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. |
| 4054 | + |
| 4055 | +CLANG_ADD_INC_PATHS = YES |
| 4056 | + |
| 4057 | # If clang assisted parsing is enabled you can provide the compiler with command |
| 4058 | # line options that you would normally use when invoking the compiler. Note that |
| 4059 | # the include paths will already be set by doxygen for the files and directories |
| 4060 | @@ -1038,6 +1175,19 @@ CLANG_ASSISTED_PARSING = NO |
| 4061 | |
| 4062 | CLANG_OPTIONS = |
| 4063 | |
| 4064 | +# If clang assisted parsing is enabled you can provide the clang parser with the |
| 4065 | +# path to the directory containing a file called compile_commands.json. This |
| 4066 | +# file is the compilation database (see: |
| 4067 | +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the |
| 4068 | +# options used when the source files were built. This is equivalent to |
| 4069 | +# specifying the -p option to a clang tool, such as clang-check. These options |
| 4070 | +# will then be passed to the parser. Any options specified with CLANG_OPTIONS |
| 4071 | +# will be added as well. |
| 4072 | +# Note: The availability of this option depends on whether or not doxygen was |
| 4073 | +# generated with the -Duse_libclang=ON option for CMake. |
| 4074 | + |
| 4075 | +CLANG_DATABASE_PATH = |
| 4076 | + |
| 4077 | #--------------------------------------------------------------------------- |
| 4078 | # Configuration options related to the alphabetical class index |
| 4079 | #--------------------------------------------------------------------------- |
| 4080 | @@ -1049,13 +1199,6 @@ CLANG_OPTIONS = |
| 4081 | |
| 4082 | ALPHABETICAL_INDEX = NO |
| 4083 | |
| 4084 | -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in |
| 4085 | -# which the alphabetical index list will be split. |
| 4086 | -# Minimum value: 1, maximum value: 20, default value: 5. |
| 4087 | -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. |
| 4088 | - |
| 4089 | -COLS_IN_ALPHA_INDEX = 5 |
| 4090 | - |
| 4091 | # In case all classes in a project start with a common prefix, all classes will |
| 4092 | # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag |
| 4093 | # can be used to specify a prefix (or a list of prefixes) that should be ignored |
| 4094 | @@ -1155,8 +1298,8 @@ HTML_EXTRA_FILES = |
| 4095 | |
| 4096 | # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen |
| 4097 | # will adjust the colors in the style sheet and background images according to |
| 4098 | -# this color. Hue is specified as an angle on a colorwheel, see |
| 4099 | -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value |
| 4100 | +# this color. Hue is specified as an angle on a color-wheel, see |
| 4101 | +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value |
| 4102 | # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 |
| 4103 | # purple, and 360 is red again. |
| 4104 | # Minimum value: 0, maximum value: 359, default value: 220. |
| 4105 | @@ -1165,7 +1308,7 @@ HTML_EXTRA_FILES = |
| 4106 | HTML_COLORSTYLE_HUE = 220 |
| 4107 | |
| 4108 | # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors |
| 4109 | -# in the HTML output. For a value of 0 the output will use grayscales only. A |
| 4110 | +# in the HTML output. For a value of 0 the output will use gray-scales only. A |
| 4111 | # value of 255 will produce the most vivid colors. |
| 4112 | # Minimum value: 0, maximum value: 255, default value: 100. |
| 4113 | # This tag requires that the tag GENERATE_HTML is set to YES. |
| 4114 | @@ -1192,6 +1335,17 @@ HTML_COLORSTYLE_GAMMA = 80 |
| 4115 | |
| 4116 | HTML_TIMESTAMP = NO |
| 4117 | |
| 4118 | +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML |
| 4119 | +# documentation will contain a main index with vertical navigation menus that |
| 4120 | +# are dynamically created via JavaScript. If disabled, the navigation index will |
| 4121 | +# consist of multiple levels of tabs that are statically embedded in every HTML |
| 4122 | +# page. Disable this option to support browsers that do not have JavaScript, |
| 4123 | +# like the Qt help browser. |
| 4124 | +# The default value is: YES. |
| 4125 | +# This tag requires that the tag GENERATE_HTML is set to YES. |
| 4126 | + |
| 4127 | +HTML_DYNAMIC_MENUS = YES |
| 4128 | + |
| 4129 | # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML |
| 4130 | # documentation will contain sections that can be hidden and shown after the |
| 4131 | # page has loaded. |
| 4132 | @@ -1215,13 +1369,14 @@ HTML_INDEX_NUM_ENTRIES = 100 |
| 4133 | |
| 4134 | # If the GENERATE_DOCSET tag is set to YES, additional index files will be |
| 4135 | # generated that can be used as input for Apple's Xcode 3 integrated development |
| 4136 | -# environment (see: http://developer.apple.com/tools/xcode/), introduced with |
| 4137 | -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a |
| 4138 | -# Makefile in the HTML output directory. Running make will produce the docset in |
| 4139 | -# that directory and running make install will install the docset in |
| 4140 | +# environment (see: |
| 4141 | +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To |
| 4142 | +# create a documentation set, doxygen will generate a Makefile in the HTML |
| 4143 | +# output directory. Running make will produce the docset in that directory and |
| 4144 | +# running make install will install the docset in |
| 4145 | # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at |
| 4146 | -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html |
| 4147 | -# for more information. |
| 4148 | +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy |
| 4149 | +# genXcode/_index.html for more information. |
| 4150 | # The default value is: NO. |
| 4151 | # This tag requires that the tag GENERATE_HTML is set to YES. |
| 4152 | |
| 4153 | @@ -1235,6 +1390,13 @@ GENERATE_DOCSET = NO |
| 4154 | |
| 4155 | DOCSET_FEEDNAME = "Doxygen generated docs" |
| 4156 | |
| 4157 | +# This tag determines the URL of the docset feed. A documentation feed provides |
| 4158 | +# an umbrella under which multiple documentation sets from a single provider |
| 4159 | +# (such as a company or product suite) can be grouped. |
| 4160 | +# This tag requires that the tag GENERATE_DOCSET is set to YES. |
| 4161 | + |
| 4162 | +DOCSET_FEEDURL = |
| 4163 | + |
| 4164 | # This tag specifies a string that should uniquely identify the documentation |
| 4165 | # set bundle. This should be a reverse domain-name style string, e.g. |
| 4166 | # com.mycompany.MyDocSet. Doxygen will append .docset to the name. |
| 4167 | @@ -1260,8 +1422,12 @@ DOCSET_PUBLISHER_NAME = Publisher |
| 4168 | # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three |
| 4169 | # additional HTML index files: index.hhp, index.hhc, and index.hhk. The |
| 4170 | # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop |
| 4171 | -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on |
| 4172 | -# Windows. |
| 4173 | +# on Windows. In the beginning of 2021 Microsoft took the original page, with |
| 4174 | +# a.o. the download links, offline (the HTML help workshop was already many years |
| 4175 | +# in maintenance mode). You can download the HTML help workshop from the web |
| 4176 | +# archives at Installation executable (see: |
| 4177 | +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo |
| 4178 | +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). |
| 4179 | # |
| 4180 | # The HTML Help Workshop contains a compiler that can convert all HTML output |
| 4181 | # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML |
| 4182 | @@ -1291,7 +1457,7 @@ CHM_FILE = |
| 4183 | HHC_LOCATION = |
| 4184 | |
| 4185 | # The GENERATE_CHI flag controls if a separate .chi index file is generated |
| 4186 | -# (YES) or that it should be included in the master .chm file (NO). |
| 4187 | +# (YES) or that it should be included in the main .chm file (NO). |
| 4188 | # The default value is: NO. |
| 4189 | # This tag requires that the tag GENERATE_HTMLHELP is set to YES. |
| 4190 | |
| 4191 | @@ -1336,7 +1502,8 @@ QCH_FILE = |
| 4192 | |
| 4193 | # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help |
| 4194 | # Project output. For more information please see Qt Help Project / Namespace |
| 4195 | -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). |
| 4196 | +# (see: |
| 4197 | +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). |
| 4198 | # The default value is: org.doxygen.Project. |
| 4199 | # This tag requires that the tag GENERATE_QHP is set to YES. |
| 4200 | |
| 4201 | @@ -1344,8 +1511,8 @@ QHP_NAMESPACE = |
| 4202 | |
| 4203 | # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt |
| 4204 | # Help Project output. For more information please see Qt Help Project / Virtual |
| 4205 | -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- |
| 4206 | -# folders). |
| 4207 | +# Folders (see: |
| 4208 | +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). |
| 4209 | # The default value is: doc. |
| 4210 | # This tag requires that the tag GENERATE_QHP is set to YES. |
| 4211 | |
| 4212 | @@ -1353,30 +1520,30 @@ QHP_VIRTUAL_FOLDER = doc |
| 4213 | |
| 4214 | # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom |
| 4215 | # filter to add. For more information please see Qt Help Project / Custom |
| 4216 | -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- |
| 4217 | -# filters). |
| 4218 | +# Filters (see: |
| 4219 | +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). |
| 4220 | # This tag requires that the tag GENERATE_QHP is set to YES. |
| 4221 | |
| 4222 | QHP_CUST_FILTER_NAME = |
| 4223 | |
| 4224 | # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the |
| 4225 | # custom filter to add. For more information please see Qt Help Project / Custom |
| 4226 | -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- |
| 4227 | -# filters). |
| 4228 | +# Filters (see: |
| 4229 | +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). |
| 4230 | # This tag requires that the tag GENERATE_QHP is set to YES. |
| 4231 | |
| 4232 | QHP_CUST_FILTER_ATTRS = |
| 4233 | |
| 4234 | # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this |
| 4235 | # project's filter section matches. Qt Help Project / Filter Attributes (see: |
| 4236 | -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). |
| 4237 | +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). |
| 4238 | # This tag requires that the tag GENERATE_QHP is set to YES. |
| 4239 | |
| 4240 | QHP_SECT_FILTER_ATTRS = |
| 4241 | |
| 4242 | -# The QHG_LOCATION tag can be used to specify the location of Qt's |
| 4243 | -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the |
| 4244 | -# generated .qhp file. |
| 4245 | +# The QHG_LOCATION tag can be used to specify the location (absolute path |
| 4246 | +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to |
| 4247 | +# run qhelpgenerator on the generated .qhp file. |
| 4248 | # This tag requires that the tag GENERATE_QHP is set to YES. |
| 4249 | |
| 4250 | QHG_LOCATION = |
| 4251 | @@ -1419,16 +1586,28 @@ DISABLE_INDEX = NO |
| 4252 | # to work a browser that supports JavaScript, DHTML, CSS and frames is required |
| 4253 | # (i.e. any modern browser). Windows users are probably better off using the |
| 4254 | # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can |
| 4255 | -# further fine-tune the look of the index. As an example, the default style |
| 4256 | -# sheet generated by doxygen has an example that shows how to put an image at |
| 4257 | -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has |
| 4258 | -# the same information as the tab index, you could consider setting |
| 4259 | -# DISABLE_INDEX to YES when enabling this option. |
| 4260 | +# further fine-tune the look of the index (see "Fine-tuning the output"). As an |
| 4261 | +# example, the default style sheet generated by doxygen has an example that |
| 4262 | +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. |
| 4263 | +# Since the tree basically has the same information as the tab index, you could |
| 4264 | +# consider setting DISABLE_INDEX to YES when enabling this option. |
| 4265 | # The default value is: NO. |
| 4266 | # This tag requires that the tag GENERATE_HTML is set to YES. |
| 4267 | |
| 4268 | GENERATE_TREEVIEW = NO |
| 4269 | |
| 4270 | +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the |
| 4271 | +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview |
| 4272 | +# area (value NO) or if it should extend to the full height of the window (value |
| 4273 | +# YES). Setting this to YES gives a layout similar to |
| 4274 | +# https://docs.readthedocs.io with more room for contents, but less room for the |
| 4275 | +# project logo, title, and description. If either GENERATE_TREEVIEW or |
| 4276 | +# DISABLE_INDEX is set to NO, this option has no effect. |
| 4277 | +# The default value is: NO. |
| 4278 | +# This tag requires that the tag GENERATE_HTML is set to YES. |
| 4279 | + |
| 4280 | +FULL_SIDEBAR = NO |
| 4281 | + |
| 4282 | # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that |
| 4283 | # doxygen will group on one line in the generated HTML documentation. |
| 4284 | # |
| 4285 | @@ -1453,6 +1632,24 @@ TREEVIEW_WIDTH = 250 |
| 4286 | |
| 4287 | EXT_LINKS_IN_WINDOW = NO |
| 4288 | |
| 4289 | +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email |
| 4290 | +# addresses. |
| 4291 | +# The default value is: YES. |
| 4292 | +# This tag requires that the tag GENERATE_HTML is set to YES. |
| 4293 | + |
| 4294 | +OBFUSCATE_EMAILS = YES |
| 4295 | + |
| 4296 | +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg |
| 4297 | +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see |
| 4298 | +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for |
| 4299 | +# the HTML output. These images will generally look nicer at scaled resolutions. |
| 4300 | +# Possible values are: png (the default) and svg (looks nicer but requires the |
| 4301 | +# pdf2svg or inkscape tool). |
| 4302 | +# The default value is: png. |
| 4303 | +# This tag requires that the tag GENERATE_HTML is set to YES. |
| 4304 | + |
| 4305 | +HTML_FORMULA_FORMAT = png |
| 4306 | + |
| 4307 | # Use this tag to change the font size of LaTeX formulas included as images in |
| 4308 | # the HTML documentation. When you change the font size after a successful |
| 4309 | # doxygen run you need to manually remove any form_*.png images from the HTML |
| 4310 | @@ -1462,7 +1659,7 @@ EXT_LINKS_IN_WINDOW = NO |
| 4311 | |
| 4312 | FORMULA_FONTSIZE = 10 |
| 4313 | |
| 4314 | -# Use the FORMULA_TRANPARENT tag to determine whether or not the images |
| 4315 | +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images |
| 4316 | # generated for formulas are transparent PNGs. Transparent PNGs are not |
| 4317 | # supported properly for IE 6.0, but are supported on all modern browsers. |
| 4318 | # |
| 4319 | @@ -1473,8 +1670,14 @@ FORMULA_FONTSIZE = 10 |
| 4320 | |
| 4321 | FORMULA_TRANSPARENT = YES |
| 4322 | |
| 4323 | +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands |
| 4324 | +# to create new LaTeX commands to be used in formulas as building blocks. See |
| 4325 | +# the section "Including formulas" for details. |
| 4326 | + |
| 4327 | +FORMULA_MACROFILE = |
| 4328 | + |
| 4329 | # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see |
| 4330 | -# http://www.mathjax.org) which uses client side Javascript for the rendering |
| 4331 | +# https://www.mathjax.org) which uses client side JavaScript for the rendering |
| 4332 | # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX |
| 4333 | # installed or if you want formulas to look prettier in the HTML output. When |
| 4334 | # enabled you may also need to install MathJax separately and configure the path |
| 4335 | @@ -1484,11 +1687,29 @@ FORMULA_TRANSPARENT = YES |
| 4336 | |
| 4337 | USE_MATHJAX = YES |
| 4338 | |
| 4339 | +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. |
| 4340 | +# Note that the different versions of MathJax have different requirements with |
| 4341 | +# regards to the different settings, so it is possible that also other MathJax |
| 4342 | +# settings have to be changed when switching between the different MathJax |
| 4343 | +# versions. |
| 4344 | +# Possible values are: MathJax_2 and MathJax_3. |
| 4345 | +# The default value is: MathJax_2. |
| 4346 | +# This tag requires that the tag USE_MATHJAX is set to YES. |
| 4347 | + |
| 4348 | +MATHJAX_VERSION = MathJax_2 |
| 4349 | + |
| 4350 | # When MathJax is enabled you can set the default output format to be used for |
| 4351 | -# the MathJax output. See the MathJax site (see: |
| 4352 | -# http://docs.mathjax.org/en/latest/output.html) for more details. |
| 4353 | +# the MathJax output. For more details about the output format see MathJax |
| 4354 | +# version 2 (see: |
| 4355 | +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 |
| 4356 | +# (see: |
| 4357 | +# http://docs.mathjax.org/en/latest/web/components/output.html). |
| 4358 | # Possible values are: HTML-CSS (which is slower, but has the best |
| 4359 | -# compatibility), NativeMML (i.e. MathML) and SVG. |
| 4360 | +# compatibility. This is the name for MathJax version 2, for MathJax version 3 |
| 4361 | +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported |
| 4362 | +# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This |
| 4363 | +# is the name for MathJax version 3, for MathJax version 2 this will be |
| 4364 | +# translated into HTML-CSS) and SVG. |
| 4365 | # The default value is: HTML-CSS. |
| 4366 | # This tag requires that the tag USE_MATHJAX is set to YES. |
| 4367 | |
| 4368 | @@ -1501,22 +1722,29 @@ MATHJAX_FORMAT = HTML-CSS |
| 4369 | # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax |
| 4370 | # Content Delivery Network so you can quickly see the result without installing |
| 4371 | # MathJax. However, it is strongly recommended to install a local copy of |
| 4372 | -# MathJax from http://www.mathjax.org before deployment. |
| 4373 | -# The default value is: http://cdn.mathjax.org/mathjax/latest. |
| 4374 | +# MathJax from https://www.mathjax.org before deployment. The default value is: |
| 4375 | +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 |
| 4376 | +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 |
| 4377 | # This tag requires that the tag USE_MATHJAX is set to YES. |
| 4378 | |
| 4379 | MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest |
| 4380 | |
| 4381 | # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax |
| 4382 | # extension names that should be enabled during MathJax rendering. For example |
| 4383 | +# for MathJax version 2 (see https://docs.mathjax.org/en/v2.7-latest/tex.html |
| 4384 | +# #tex-and-latex-extensions): |
| 4385 | # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols |
| 4386 | +# For example for MathJax version 3 (see |
| 4387 | +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): |
| 4388 | +# MATHJAX_EXTENSIONS = ams |
| 4389 | # This tag requires that the tag USE_MATHJAX is set to YES. |
| 4390 | |
| 4391 | MATHJAX_EXTENSIONS = |
| 4392 | |
| 4393 | # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces |
| 4394 | # of code that will be used on startup of the MathJax code. See the MathJax site |
| 4395 | -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an |
| 4396 | +# (see: |
| 4397 | +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an |
| 4398 | # example see the documentation. |
| 4399 | # This tag requires that the tag USE_MATHJAX is set to YES. |
| 4400 | |
| 4401 | @@ -1544,7 +1772,7 @@ MATHJAX_CODEFILE = |
| 4402 | SEARCHENGINE = NO |
| 4403 | |
| 4404 | # When the SERVER_BASED_SEARCH tag is enabled the search engine will be |
| 4405 | -# implemented using a web server instead of a web client using Javascript. There |
| 4406 | +# implemented using a web server instead of a web client using JavaScript. There |
| 4407 | # are two flavors of web server based searching depending on the EXTERNAL_SEARCH |
| 4408 | # setting. When disabled, doxygen will generate a PHP script for searching and |
| 4409 | # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing |
| 4410 | @@ -1563,7 +1791,8 @@ SERVER_BASED_SEARCH = NO |
| 4411 | # |
| 4412 | # Doxygen ships with an example indexer (doxyindexer) and search engine |
| 4413 | # (doxysearch.cgi) which are based on the open source search engine library |
| 4414 | -# Xapian (see: http://xapian.org/). |
| 4415 | +# Xapian (see: |
| 4416 | +# https://xapian.org/). |
| 4417 | # |
| 4418 | # See the section "External Indexing and Searching" for details. |
| 4419 | # The default value is: NO. |
| 4420 | @@ -1576,8 +1805,9 @@ EXTERNAL_SEARCH = NO |
| 4421 | # |
| 4422 | # Doxygen ships with an example indexer (doxyindexer) and search engine |
| 4423 | # (doxysearch.cgi) which are based on the open source search engine library |
| 4424 | -# Xapian (see: http://xapian.org/). See the section "External Indexing and |
| 4425 | -# Searching" for details. |
| 4426 | +# Xapian (see: |
| 4427 | +# https://xapian.org/). See the section "External Indexing and Searching" for |
| 4428 | +# details. |
| 4429 | # This tag requires that the tag SEARCHENGINE is set to YES. |
| 4430 | |
| 4431 | SEARCHENGINE_URL = |
| 4432 | @@ -1628,21 +1858,35 @@ LATEX_OUTPUT = latex |
| 4433 | # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be |
| 4434 | # invoked. |
| 4435 | # |
| 4436 | -# Note that when enabling USE_PDFLATEX this option is only used for generating |
| 4437 | -# bitmaps for formulas in the HTML output, but not in the Makefile that is |
| 4438 | -# written to the output directory. |
| 4439 | -# The default file is: latex. |
| 4440 | +# Note that when not enabling USE_PDFLATEX the default is latex; when enabling |
| 4441 | +# USE_PDFLATEX the default is pdflatex, and when in the latter case latex is |
| 4442 | +# chosen this is overwritten by pdflatex. For specific output languages the |
| 4443 | +# default can have been set differently, this depends on the implementation of |
| 4444 | +# the output language. |
| 4445 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4446 | |
| 4447 | LATEX_CMD_NAME = latex |
| 4448 | |
| 4449 | # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate |
| 4450 | # index for LaTeX. |
| 4451 | +# Note: This tag is used in the Makefile / make.bat. |
| 4452 | +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file |
| 4453 | +# (.tex). |
| 4454 | # The default file is: makeindex. |
| 4455 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4456 | |
| 4457 | MAKEINDEX_CMD_NAME = makeindex |
| 4458 | |
| 4459 | +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to |
| 4460 | +# generate index for LaTeX. In case there is no backslash (\) as first character |
| 4461 | +# it will be automatically added in the LaTeX code. |
| 4462 | +# Note: This tag is used in the generated output file (.tex). |
| 4463 | +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. |
| 4464 | +# The default value is: makeindex. |
| 4465 | +# This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4466 | + |
| 4467 | +LATEX_MAKEINDEX_CMD = makeindex |
| 4468 | + |
| 4469 | # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX |
| 4470 | # documents. This may be useful for small projects and may help to save some |
| 4471 | # trees in general. |
| 4472 | @@ -1672,29 +1916,31 @@ PAPER_TYPE = a4 |
| 4473 | |
| 4474 | EXTRA_PACKAGES = |
| 4475 | |
| 4476 | -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the |
| 4477 | -# generated LaTeX document. The header should contain everything until the first |
| 4478 | -# chapter. If it is left blank doxygen will generate a standard header. See |
| 4479 | -# section "Doxygen usage" for information on how to let doxygen write the |
| 4480 | -# default header to a separate file. |
| 4481 | +# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for |
| 4482 | +# the generated LaTeX document. The header should contain everything until the |
| 4483 | +# first chapter. If it is left blank doxygen will generate a standard header. It |
| 4484 | +# is highly recommended to start with a default header using |
| 4485 | +# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty |
| 4486 | +# and then modify the file new_header.tex. See also section "Doxygen usage" for |
| 4487 | +# information on how to generate the default header that doxygen normally uses. |
| 4488 | # |
| 4489 | -# Note: Only use a user-defined header if you know what you are doing! The |
| 4490 | -# following commands have a special meaning inside the header: $title, |
| 4491 | -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, |
| 4492 | -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty |
| 4493 | -# string, for the replacement values of the other commands the user is referred |
| 4494 | -# to HTML_HEADER. |
| 4495 | +# Note: Only use a user-defined header if you know what you are doing! |
| 4496 | +# Note: The header is subject to change so you typically have to regenerate the |
| 4497 | +# default header when upgrading to a newer version of doxygen. The following |
| 4498 | +# commands have a special meaning inside the header (and footer): For a |
| 4499 | +# description of the possible markers and block names see the documentation. |
| 4500 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4501 | |
| 4502 | LATEX_HEADER = |
| 4503 | |
| 4504 | -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the |
| 4505 | -# generated LaTeX document. The footer should contain everything after the last |
| 4506 | -# chapter. If it is left blank doxygen will generate a standard footer. See |
| 4507 | +# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for |
| 4508 | +# the generated LaTeX document. The footer should contain everything after the |
| 4509 | +# last chapter. If it is left blank doxygen will generate a standard footer. See |
| 4510 | # LATEX_HEADER for more information on how to generate a default footer and what |
| 4511 | -# special commands can be used inside the footer. |
| 4512 | -# |
| 4513 | -# Note: Only use a user-defined footer if you know what you are doing! |
| 4514 | +# special commands can be used inside the footer. See also section "Doxygen |
| 4515 | +# usage" for information on how to generate the default footer that doxygen |
| 4516 | +# normally uses. Note: Only use a user-defined footer if you know what you are |
| 4517 | +# doing! |
| 4518 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4519 | |
| 4520 | LATEX_FOOTER = |
| 4521 | @@ -1727,9 +1973,11 @@ LATEX_EXTRA_FILES = |
| 4522 | |
| 4523 | PDF_HYPERLINKS = YES |
| 4524 | |
| 4525 | -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate |
| 4526 | -# the PDF file directly from the LaTeX files. Set this option to YES, to get a |
| 4527 | -# higher quality PDF documentation. |
| 4528 | +# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as |
| 4529 | +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX |
| 4530 | +# files. Set this option to YES, to get a higher quality PDF documentation. |
| 4531 | +# |
| 4532 | +# See also section LATEX_CMD_NAME for selecting the engine. |
| 4533 | # The default value is: YES. |
| 4534 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4535 | |
| 4536 | @@ -1737,8 +1985,7 @@ USE_PDFLATEX = YES |
| 4537 | |
| 4538 | # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode |
| 4539 | # command to the generated LaTeX files. This will instruct LaTeX to keep running |
| 4540 | -# if errors occur, instead of asking the user for help. This option is also used |
| 4541 | -# when generating formulas in HTML. |
| 4542 | +# if errors occur, instead of asking the user for help. |
| 4543 | # The default value is: NO. |
| 4544 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4545 | |
| 4546 | @@ -1751,19 +1998,9 @@ LATEX_BATCHMODE = NO |
| 4547 | |
| 4548 | LATEX_HIDE_INDICES = NO |
| 4549 | |
| 4550 | -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source |
| 4551 | -# code with syntax highlighting in the LaTeX output. |
| 4552 | -# |
| 4553 | -# Note that which sources are shown also depends on other settings such as |
| 4554 | -# SOURCE_BROWSER. |
| 4555 | -# The default value is: NO. |
| 4556 | -# This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4557 | - |
| 4558 | -LATEX_SOURCE_CODE = NO |
| 4559 | - |
| 4560 | # The LATEX_BIB_STYLE tag can be used to specify the style to use for the |
| 4561 | # bibliography, e.g. plainnat, or ieeetr. See |
| 4562 | -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. |
| 4563 | +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. |
| 4564 | # The default value is: plain. |
| 4565 | # This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4566 | |
| 4567 | @@ -1777,6 +2014,14 @@ LATEX_BIB_STYLE = plain |
| 4568 | |
| 4569 | LATEX_TIMESTAMP = NO |
| 4570 | |
| 4571 | +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) |
| 4572 | +# path from which the emoji images will be read. If a relative path is entered, |
| 4573 | +# it will be relative to the LATEX_OUTPUT directory. If left blank the |
| 4574 | +# LATEX_OUTPUT directory will be used. |
| 4575 | +# This tag requires that the tag GENERATE_LATEX is set to YES. |
| 4576 | + |
| 4577 | +LATEX_EMOJI_DIRECTORY = |
| 4578 | + |
| 4579 | #--------------------------------------------------------------------------- |
| 4580 | # Configuration options related to the RTF output |
| 4581 | #--------------------------------------------------------------------------- |
| 4582 | @@ -1816,9 +2061,9 @@ COMPACT_RTF = NO |
| 4583 | |
| 4584 | RTF_HYPERLINKS = NO |
| 4585 | |
| 4586 | -# Load stylesheet definitions from file. Syntax is similar to doxygen's config |
| 4587 | -# file, i.e. a series of assignments. You only have to provide replacements, |
| 4588 | -# missing definitions are set to their default value. |
| 4589 | +# Load stylesheet definitions from file. Syntax is similar to doxygen's |
| 4590 | +# configuration file, i.e. a series of assignments. You only have to provide |
| 4591 | +# replacements, missing definitions are set to their default value. |
| 4592 | # |
| 4593 | # See also section "Doxygen usage" for information on how to generate the |
| 4594 | # default style sheet that doxygen normally uses. |
| 4595 | @@ -1827,22 +2072,12 @@ RTF_HYPERLINKS = NO |
| 4596 | RTF_STYLESHEET_FILE = |
| 4597 | |
| 4598 | # Set optional variables used in the generation of an RTF document. Syntax is |
| 4599 | -# similar to doxygen's config file. A template extensions file can be generated |
| 4600 | -# using doxygen -e rtf extensionFile. |
| 4601 | +# similar to doxygen's configuration file. A template extensions file can be |
| 4602 | +# generated using doxygen -e rtf extensionFile. |
| 4603 | # This tag requires that the tag GENERATE_RTF is set to YES. |
| 4604 | |
| 4605 | RTF_EXTENSIONS_FILE = |
| 4606 | |
| 4607 | -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code |
| 4608 | -# with syntax highlighting in the RTF output. |
| 4609 | -# |
| 4610 | -# Note that which sources are shown also depends on other settings such as |
| 4611 | -# SOURCE_BROWSER. |
| 4612 | -# The default value is: NO. |
| 4613 | -# This tag requires that the tag GENERATE_RTF is set to YES. |
| 4614 | - |
| 4615 | -RTF_SOURCE_CODE = NO |
| 4616 | - |
| 4617 | #--------------------------------------------------------------------------- |
| 4618 | # Configuration options related to the man page output |
| 4619 | #--------------------------------------------------------------------------- |
| 4620 | @@ -1914,6 +2149,13 @@ XML_OUTPUT = xml |
| 4621 | |
| 4622 | XML_PROGRAMLISTING = YES |
| 4623 | |
| 4624 | +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include |
| 4625 | +# namespace members in file scope as well, matching the HTML output. |
| 4626 | +# The default value is: NO. |
| 4627 | +# This tag requires that the tag GENERATE_XML is set to YES. |
| 4628 | + |
| 4629 | +XML_NS_MEMB_FILE_SCOPE = NO |
| 4630 | + |
| 4631 | #--------------------------------------------------------------------------- |
| 4632 | # Configuration options related to the DOCBOOK output |
| 4633 | #--------------------------------------------------------------------------- |
| 4634 | @@ -1932,23 +2174,14 @@ GENERATE_DOCBOOK = NO |
| 4635 | |
| 4636 | DOCBOOK_OUTPUT = docbook |
| 4637 | |
| 4638 | -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the |
| 4639 | -# program listings (including syntax highlighting and cross-referencing |
| 4640 | -# information) to the DOCBOOK output. Note that enabling this will significantly |
| 4641 | -# increase the size of the DOCBOOK output. |
| 4642 | -# The default value is: NO. |
| 4643 | -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. |
| 4644 | - |
| 4645 | -DOCBOOK_PROGRAMLISTING = NO |
| 4646 | - |
| 4647 | #--------------------------------------------------------------------------- |
| 4648 | # Configuration options for the AutoGen Definitions output |
| 4649 | #--------------------------------------------------------------------------- |
| 4650 | |
| 4651 | # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an |
| 4652 | -# AutoGen Definitions (see http://autogen.sf.net) file that captures the |
| 4653 | -# structure of the code including all documentation. Note that this feature is |
| 4654 | -# still experimental and incomplete at the moment. |
| 4655 | +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures |
| 4656 | +# the structure of the code including all documentation. Note that this feature |
| 4657 | +# is still experimental and incomplete at the moment. |
| 4658 | # The default value is: NO. |
| 4659 | |
| 4660 | GENERATE_AUTOGEN_DEF = NO |
| 4661 | @@ -2027,7 +2260,8 @@ SEARCH_INCLUDES = YES |
| 4662 | |
| 4663 | # The INCLUDE_PATH tag can be used to specify one or more directories that |
| 4664 | # contain include files that are not input files but should be processed by the |
| 4665 | -# preprocessor. |
| 4666 | +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of |
| 4667 | +# RECURSIVE has no effect here. |
| 4668 | # This tag requires that the tag SEARCH_INCLUDES is set to YES. |
| 4669 | |
| 4670 | INCLUDE_PATH = ../../hipcub/include/hipcub/ |
| 4671 | @@ -2121,15 +2355,6 @@ EXTERNAL_PAGES = YES |
| 4672 | # Configuration options related to the dot tool |
| 4673 | #--------------------------------------------------------------------------- |
| 4674 | |
| 4675 | -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram |
| 4676 | -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to |
| 4677 | -# NO turns the diagrams off. Note that this option also works with HAVE_DOT |
| 4678 | -# disabled, but it is recommended to install and use dot, since it yields more |
| 4679 | -# powerful graphs. |
| 4680 | -# The default value is: YES. |
| 4681 | - |
| 4682 | -CLASS_DIAGRAMS = YES |
| 4683 | - |
| 4684 | # You can include diagrams made with dia in doxygen documentation. Doxygen will |
| 4685 | # then run dia to produce the diagram and insert it in the documentation. The |
| 4686 | # DIA_PATH tag allows you to specify the directory where the dia binary resides. |
| 4687 | @@ -2148,7 +2373,7 @@ HIDE_UNDOC_RELATIONS = YES |
| 4688 | # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent |
| 4689 | # Bell Labs. The other options in this section have no effect if this option is |
| 4690 | # set to NO |
| 4691 | -# The default value is: YES. |
| 4692 | +# The default value is: NO. |
| 4693 | |
| 4694 | HAVE_DOT = NO |
| 4695 | |
| 4696 | @@ -2186,11 +2411,14 @@ DOT_FONTSIZE = 10 |
| 4697 | |
| 4698 | DOT_FONTPATH = |
| 4699 | |
| 4700 | -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for |
| 4701 | -# each documented class showing the direct and indirect inheritance relations. |
| 4702 | -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. |
| 4703 | +# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a |
| 4704 | +# graph for each documented class showing the direct and indirect inheritance |
| 4705 | +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, |
| 4706 | +# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set |
| 4707 | +# to TEXT the direct and indirect inheritance relations will be shown as texts / |
| 4708 | +# links. |
| 4709 | +# Possible values are: NO, YES, TEXT and GRAPH. |
| 4710 | # The default value is: YES. |
| 4711 | -# This tag requires that the tag HAVE_DOT is set to YES. |
| 4712 | |
| 4713 | CLASS_GRAPH = YES |
| 4714 | |
| 4715 | @@ -2204,7 +2432,8 @@ CLASS_GRAPH = YES |
| 4716 | COLLABORATION_GRAPH = YES |
| 4717 | |
| 4718 | # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for |
| 4719 | -# groups, showing the direct groups dependencies. |
| 4720 | +# groups, showing the direct groups dependencies. See also the chapter Grouping |
| 4721 | +# in the manual. |
| 4722 | # The default value is: YES. |
| 4723 | # This tag requires that the tag HAVE_DOT is set to YES. |
| 4724 | |
| 4725 | @@ -2227,10 +2456,32 @@ UML_LOOK = NO |
| 4726 | # but if the number exceeds 15, the total amount of fields shown is limited to |
| 4727 | # 10. |
| 4728 | # Minimum value: 0, maximum value: 100, default value: 10. |
| 4729 | -# This tag requires that the tag HAVE_DOT is set to YES. |
| 4730 | +# This tag requires that the tag UML_LOOK is set to YES. |
| 4731 | |
| 4732 | UML_LIMIT_NUM_FIELDS = 10 |
| 4733 | |
| 4734 | +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and |
| 4735 | +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS |
| 4736 | +# tag is set to YES, doxygen will add type and arguments for attributes and |
| 4737 | +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen |
| 4738 | +# will not generate fields with class member information in the UML graphs. The |
| 4739 | +# class diagrams will look similar to the default class diagrams but using UML |
| 4740 | +# notation for the relationships. |
| 4741 | +# Possible values are: NO, YES and NONE. |
| 4742 | +# The default value is: NO. |
| 4743 | +# This tag requires that the tag UML_LOOK is set to YES. |
| 4744 | + |
| 4745 | +DOT_UML_DETAILS = NO |
| 4746 | + |
| 4747 | +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters |
| 4748 | +# to display on a single line. If the actual line length exceeds this threshold |
| 4749 | +# significantly it will be wrapped across multiple lines. Some heuristics are applied |
| 4750 | +# to avoid ugly line breaks. |
| 4751 | +# Minimum value: 0, maximum value: 1000, default value: 17. |
| 4752 | +# This tag requires that the tag HAVE_DOT is set to YES. |
| 4753 | + |
| 4754 | +DOT_WRAP_THRESHOLD = 17 |
| 4755 | + |
| 4756 | # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and |
| 4757 | # collaboration graphs will show the relations between templates and their |
| 4758 | # instances. |
| 4759 | @@ -2297,6 +2548,13 @@ GRAPHICAL_HIERARCHY = YES |
| 4760 | |
| 4761 | DIRECTORY_GRAPH = YES |
| 4762 | |
| 4763 | +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels |
| 4764 | +# of child directories generated in directory dependency graphs by dot. |
| 4765 | +# Minimum value: 1, maximum value: 25, default value: 1. |
| 4766 | +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. |
| 4767 | + |
| 4768 | +DIR_GRAPH_MAX_DEPTH = 1 |
| 4769 | + |
| 4770 | # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images |
| 4771 | # generated by dot. For an explanation of the image formats see the section |
| 4772 | # output formats in the documentation of the dot tool (Graphviz (see: |
| 4773 | @@ -2304,9 +2562,7 @@ DIRECTORY_GRAPH = YES |
| 4774 | # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order |
| 4775 | # to make the SVG files visible in IE 9+ (other browsers do not have this |
| 4776 | # requirement). |
| 4777 | -# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, |
| 4778 | -# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, |
| 4779 | -# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, |
| 4780 | +# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, |
| 4781 | # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and |
| 4782 | # png:gdiplus:gdiplus. |
| 4783 | # The default value is: png. |
| 4784 | @@ -2352,13 +2608,18 @@ MSCFILE_DIRS = |
| 4785 | DIAFILE_DIRS = |
| 4786 | |
| 4787 | # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the |
| 4788 | -# path where java can find the plantuml.jar file. If left blank, it is assumed |
| 4789 | -# PlantUML is not used or called during a preprocessing step. Doxygen will |
| 4790 | -# generate a warning when it encounters a \startuml command in this case and |
| 4791 | -# will not generate output for the diagram. |
| 4792 | +# path where java can find the plantuml.jar file or to the filename of jar file |
| 4793 | +# to be used. If left blank, it is assumed PlantUML is not used or called during |
| 4794 | +# a preprocessing step. Doxygen will generate a warning when it encounters a |
| 4795 | +# \startuml command in this case and will not generate output for the diagram. |
| 4796 | |
| 4797 | PLANTUML_JAR_PATH = |
| 4798 | |
| 4799 | +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a |
| 4800 | +# configuration file for plantuml. |
| 4801 | + |
| 4802 | +PLANTUML_CFG_FILE = |
| 4803 | + |
| 4804 | # When using plantuml, the specified paths are searched for files specified by |
| 4805 | # the !include statement in a plantuml block. |
| 4806 | |
| 4807 | @@ -2412,14 +2673,18 @@ DOT_MULTI_TARGETS = NO |
| 4808 | # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page |
| 4809 | # explaining the meaning of the various boxes and arrows in the dot generated |
| 4810 | # graphs. |
| 4811 | +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal |
| 4812 | +# graphical representation for inheritance and collaboration diagrams is used. |
| 4813 | # The default value is: YES. |
| 4814 | # This tag requires that the tag HAVE_DOT is set to YES. |
| 4815 | |
| 4816 | GENERATE_LEGEND = YES |
| 4817 | |
| 4818 | -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot |
| 4819 | +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate |
| 4820 | # files that are used to generate the various graphs. |
| 4821 | +# |
| 4822 | +# Note: This setting is not only used for dot files but also for msc temporary |
| 4823 | +# files. |
| 4824 | # The default value is: YES. |
| 4825 | -# This tag requires that the tag HAVE_DOT is set to YES. |
| 4826 | |
| 4827 | DOT_CLEANUP = YES |
| 4828 | diff --git a/docs/index.rst b/docs/index.rst |
| 4829 | index 8e4f96e..14af493 100644 |
| 4830 | --- a/docs/index.rst |
| 4831 | +++ b/docs/index.rst |
| 4832 | @@ -12,7 +12,7 @@ hipCUB documentation |
| 4833 | hipCUB is a thin, header-only wrapper library for `rocPRIM <https://rocm.docs.amd.com/projects/rocPRIM/en/latest/index.html>`_ and `CUB <https://docs.nvidia.com/cuda/cub/index.html>`_. It enables developers to port projects |
| 4834 | using the CUB library to the `HIP <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_ layer and run on AMD hardware. To learn more, see :ref:`what-is-hipcub` |
| 4835 | |
| 4836 | -The hipCUB repository is located at `https://github.com/ROCm/hipCUB <https://github.com/ROCm/hipCUB>`_. |
| 4837 | +The hipCUB project is located in https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub. |
| 4838 | |
| 4839 | .. grid:: 2 |
| 4840 | |
| 4841 | diff --git a/docs/install/hipCUB-install-on-Windows.rst b/docs/install/hipCUB-install-on-Windows.rst |
| 4842 | index 76375ab..fc73fd9 100644 |
| 4843 | --- a/docs/install/hipCUB-install-on-Windows.rst |
| 4844 | +++ b/docs/install/hipCUB-install-on-Windows.rst |
| 4845 | @@ -8,8 +8,9 @@ Building and installing hipCUB on Windows |
| 4846 | |
| 4847 | You can use ``rmake.py`` to build and install hipCUB on Microsoft Windows. You can also use `CMake <./hipCUB-install-with-cmake.html>`_ if you want more build and installation options. |
| 4848 | |
| 4849 | +:doc:`Clone the hipCUB project <./hipCUB-install-overview>`. ``rmake.py`` will be located in the ``hipcub`` root directory. |
| 4850 | |
| 4851 | -``rmake.py`` is located in the ``hipCUB`` root directory. To build and install hipCUB with ``rmake.py``, run: |
| 4852 | +To build and install hipCUB with ``rmake.py``, run: |
| 4853 | |
| 4854 | .. code:: shell |
| 4855 | |
| 4856 | diff --git a/docs/install/hipCUB-install-overview.rst b/docs/install/hipCUB-install-overview.rst |
| 4857 | index 772e53d..ff123ef 100644 |
| 4858 | --- a/docs/install/hipCUB-install-overview.rst |
| 4859 | +++ b/docs/install/hipCUB-install-overview.rst |
| 4860 | @@ -6,7 +6,16 @@ |
| 4861 | hipCUB installation overview |
| 4862 | ********************************* |
| 4863 | |
| 4864 | -The hipCUB source code is available from the `hipCUB GitHub Repository <https://github.com/ROCmSoftwarePlatform/hipCUB>`_. |
| 4865 | +The hipCUB source code is available from the `ROCm libraries GitHub Repository <https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub>`_. Use sparse checkout when cloning the hipCUB project: |
| 4866 | + |
| 4867 | +.. code:: |
| 4868 | + |
| 4869 | + git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-libraries.git |
| 4870 | + cd rocm-libraries |
| 4871 | + git sparse-checkout init --cone |
| 4872 | + git sparse-checkout set projects/hipcub |
| 4873 | + |
| 4874 | +Then use ``git checkout`` to check out the branch you need. |
| 4875 | |
| 4876 | The develop branch is the default branch. The develop branch is intended for users who want to preview new features or contribute to the hipCUB code base. |
| 4877 | |
| 4878 | diff --git a/docs/install/hipCUB-install-with-cmake.rst b/docs/install/hipCUB-install-with-cmake.rst |
| 4879 | index f21bf0d..0fcb91c 100644 |
| 4880 | --- a/docs/install/hipCUB-install-with-cmake.rst |
| 4881 | +++ b/docs/install/hipCUB-install-with-cmake.rst |
| 4882 | @@ -17,7 +17,7 @@ Before you begin, set ``CXX`` to ``amdclang++`` or ``hipcc`` if you're building |
| 4883 | CXX=amdclang++ |
| 4884 | CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ |
| 4885 | |
| 4886 | -Create the ``build`` directory inside the ``hipCUB`` directory, then change directory to the ``build`` directory: |
| 4887 | +After :doc:`cloning the project <./hipCUB-install-overview>`, create the ``build`` directory under the ``hipcub`` root directory, then change directory to the ``build`` directory: |
| 4888 | |
| 4889 | .. code:: shell |
| 4890 | |
| 4891 | @@ -35,6 +35,7 @@ The available build options are: |
| 4892 | |
| 4893 | * ``BUILD_BENCHMARK``. Set this to ``ON`` to build benchmark tests. Off by default. |
| 4894 | * ``BUILD_TEST``. Set this to ``ON`` to build tests. Off by default. |
| 4895 | +* ``USE_SYSTEM_LIB``. Set this to ``ON`` to use the installed ``hipCUB`` from the system when building the tests. Off by default. For this option to take effect, ``BUILD_TEST`` must be ``ON`` and the ``hipCUB`` install (with its dependencies) must be compatible with the version of the tests. |
| 4896 | * ``DEPENDENCIES_FORCE_DOWNLOAD``. Set this to ``ON`` to download the dependencies regardless of whether or not they are already installed. Off by default. |
| 4897 | |
| 4898 | Build hipCUB using the generated make file: |
| 4899 | diff --git a/docs/install/hipCUB-prerequisites.rst b/docs/install/hipCUB-prerequisites.rst |
| 4900 | index 881cef3..d4352f5 100644 |
| 4901 | --- a/docs/install/hipCUB-prerequisites.rst |
| 4902 | +++ b/docs/install/hipCUB-prerequisites.rst |
| 4903 | @@ -8,7 +8,7 @@ hipCUB prerequisites |
| 4904 | |
| 4905 | hipCUB has the following prerequisites on all platforms: |
| 4906 | |
| 4907 | -* `CMake <https://cmake.org/>`_ version 3.16 or higher |
| 4908 | +* `CMake <https://cmake.org/>`_ version 3.18 or higher |
| 4909 | |
| 4910 | On AMD GPUs: |
| 4911 | |
| 4912 | @@ -21,9 +21,9 @@ amdclang++ is installed with ROCm. rocPRIM is automatically downloaded and insta |
| 4913 | On NVIDIA GPUs: |
| 4914 | |
| 4915 | * The CUDA Toolkit |
| 4916 | -* CCCL library version 2.3.2 or later |
| 4917 | +* CCCL library version 2.8.2 or later |
| 4918 | * CUB and Thrust |
| 4919 | -* libcu++ version 2.2.0 |
| 4920 | +* libcu++ version 2.8.2 |
| 4921 | |
| 4922 | The CCCL library is automatically downloaded and built by the CMake script. If libcu++ isn't found on the system, it will be downloaded from the CCCL repository. |
| 4923 | |
| 4924 | diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in |
| 4925 | index ad94caa..73a3602 100644 |
| 4926 | --- a/docs/sphinx/requirements.in |
| 4927 | +++ b/docs/sphinx/requirements.in |
| 4928 | @@ -1 +1 @@ |
| 4929 | -rocm-docs-core[api_reference]==1.8.3 |
| 4930 | +rocm-docs-core[api_reference]==1.20.1 |
| 4931 | diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt |
| 4932 | index 6dd77b1..e7818b8 100644 |
| 4933 | --- a/docs/sphinx/requirements.txt |
| 4934 | +++ b/docs/sphinx/requirements.txt |
| 4935 | @@ -8,6 +8,13 @@ accessible-pygments==0.0.4 |
| 4936 | # via pydata-sphinx-theme |
| 4937 | alabaster==0.7.16 |
| 4938 | # via sphinx |
| 4939 | +asttokens==3.0.0 |
| 4940 | + # via stack-data |
| 4941 | +attrs==25.1.0 |
| 4942 | + # via |
| 4943 | + # jsonschema |
| 4944 | + # jupyter-cache |
| 4945 | + # referencing |
| 4946 | babel==2.14.0 |
| 4947 | # via |
| 4948 | # pydata-sphinx-theme |
| 4949 | @@ -25,9 +32,21 @@ cffi==1.16.0 |
| 4950 | charset-normalizer==3.3.2 |
| 4951 | # via requests |
| 4952 | click==8.1.7 |
| 4953 | - # via sphinx-external-toc |
| 4954 | -cryptography==43.0.1 |
| 4955 | + # via |
| 4956 | + # click-log |
| 4957 | + # doxysphinx |
| 4958 | + # jupyter-cache |
| 4959 | + # sphinx-external-toc |
| 4960 | +click-log==0.4.0 |
| 4961 | + # via doxysphinx |
| 4962 | +comm==0.2.2 |
| 4963 | + # via ipykernel |
| 4964 | +cryptography==44.0.1 |
| 4965 | # via pyjwt |
| 4966 | +debugpy==1.8.12 |
| 4967 | + # via ipykernel |
| 4968 | +decorator==5.1.1 |
| 4969 | + # via ipython |
| 4970 | deprecated==1.2.14 |
| 4971 | # via pygithub |
| 4972 | docutils==0.21.2 |
| 4973 | @@ -38,39 +57,115 @@ docutils==0.21.2 |
| 4974 | # sphinx |
| 4975 | doxysphinx==3.3.8 |
| 4976 | # via rocm-docs-core |
| 4977 | +exceptiongroup==1.2.2 |
| 4978 | + # via ipython |
| 4979 | +executing==2.2.0 |
| 4980 | + # via stack-data |
| 4981 | fastjsonschema==2.19.1 |
| 4982 | - # via rocm-docs-core |
| 4983 | + # via |
| 4984 | + # nbformat |
| 4985 | + # rocm-docs-core |
| 4986 | gitdb==4.0.11 |
| 4987 | # via gitpython |
| 4988 | gitpython==3.1.43 |
| 4989 | # via rocm-docs-core |
| 4990 | +greenlet==3.1.1 |
| 4991 | + # via sqlalchemy |
| 4992 | idna==3.7 |
| 4993 | # via requests |
| 4994 | imagesize==1.4.1 |
| 4995 | # via sphinx |
| 4996 | -jinja2==3.1.4 |
| 4997 | +importlib-metadata==8.6.1 |
| 4998 | + # via |
| 4999 | + # jupyter-cache |
| 5000 | + # myst-nb |

Build in ppa: https://launchpad.net/~bruno-bdmoura/+archive/ubuntu/lp-2139349-hipcub
(~proposed and target archs)