Merge ~bullwinkle-team/ubuntu/+source/hipcub:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/hipcub:ubuntu/devel

Proposed by Bruno Bernardo de Moura
Status: Merged
Approved by: Andreas Hasenack
Approved revision: a79958c79245f1e5eed0d43cec34ec1c57e4b67d
Merged at revision: a79958c79245f1e5eed0d43cec34ec1c57e4b67d
Proposed branch: ~bullwinkle-team/ubuntu/+source/hipcub:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/hipcub:ubuntu/devel
Diff against target: 38063 lines (+18301/-6495)
314 files modified
.github/CODEOWNERS (+1/-1)
.gitignore (+1/-1)
.gitlab-ci.yml (+167/-17)
CHANGELOG.md (+101/-0)
CMakeLists.txt (+114/-47)
LICENSE.txt (+2/-1)
README.md (+119/-59)
benchmark/CMakeLists.txt (+3/-2)
benchmark/benchmark_block_adjacent_difference.cpp (+3/-4)
benchmark/benchmark_block_discontinuity.cpp (+5/-6)
benchmark/benchmark_block_exchange.cpp (+3/-3)
benchmark/benchmark_block_histogram.cpp (+1/-1)
benchmark/benchmark_block_merge_sort.cpp (+3/-3)
benchmark/benchmark_block_radix_rank.cpp (+4/-4)
benchmark/benchmark_block_radix_sort.cpp (+3/-3)
benchmark/benchmark_block_reduce.cpp (+2/-2)
benchmark/benchmark_block_run_length_decode.cpp (+3/-3)
benchmark/benchmark_block_scan.cpp (+1/-1)
benchmark/benchmark_block_shuffle.cpp (+1/-1)
benchmark/benchmark_device_batch_copy.cpp (+4/-4)
benchmark/benchmark_device_batch_memcpy.cpp (+5/-4)
benchmark/benchmark_device_for.cpp (+4/-4)
benchmark/benchmark_device_histogram.cpp (+2/-2)
benchmark/benchmark_device_memory.cpp (+43/-33)
benchmark/benchmark_device_merge.cpp (+364/-0)
benchmark/benchmark_device_merge_sort.cpp (+2/-2)
benchmark/benchmark_device_partition.cpp (+1/-1)
benchmark/benchmark_device_radix_sort.cpp (+1/-1)
benchmark/benchmark_device_reduce.cpp (+5/-2)
benchmark/benchmark_device_reduce_by_key.cpp (+1/-1)
benchmark/benchmark_device_run_length_encode.cpp (+1/-1)
benchmark/benchmark_device_scan.cpp (+1/-1)
benchmark/benchmark_device_segmented_radix_sort.cpp (+26/-26)
benchmark/benchmark_device_segmented_reduce.cpp (+1/-1)
benchmark/benchmark_device_segmented_sort.cpp (+22/-22)
benchmark/benchmark_device_select.cpp (+7/-8)
benchmark/benchmark_device_spmv.cpp (+7/-1)
benchmark/benchmark_utils.hpp (+12/-4)
benchmark/benchmark_warp_exchange.cpp (+1/-1)
benchmark/benchmark_warp_load.cpp (+1/-1)
benchmark/benchmark_warp_merge_sort.cpp (+4/-4)
benchmark/benchmark_warp_reduce.cpp (+1/-1)
benchmark/benchmark_warp_scan.cpp (+70/-52)
benchmark/benchmark_warp_store.cpp (+1/-1)
benchmark/common_benchmark_header.hpp (+9/-9)
cmake/Dependencies.cmake (+242/-32)
cmake/GenerateResourceSpec.cmake (+8/-7)
cmake/ROCmCMakeBuildToolsDependency.cmake (+0/-1)
cmake/SetupNVCC.cmake (+1/-1)
cmake/Summary.cmake (+51/-2)
debian/changelog (+25/-0)
debian/control (+7/-6)
debian/patches/0001-install-cmake-config-to-share.patch (+4/-4)
debian/patches/series (+0/-1)
debian/rules (+6/-0)
debian/tests/control (+1/-1)
dev/null (+0/-88)
docs/conf.py (+5/-1)
docs/doxygen/Doxyfile (+471/-206)
docs/index.rst (+1/-1)
docs/install/hipCUB-install-on-Windows.rst (+2/-1)
docs/install/hipCUB-install-overview.rst (+10/-1)
docs/install/hipCUB-install-with-cmake.rst (+2/-1)
docs/install/hipCUB-prerequisites.rst (+3/-3)
docs/sphinx/requirements.in (+1/-1)
docs/sphinx/requirements.txt (+157/-7)
examples/CMakeLists.txt (+1/-1)
examples/block/example_block_radix_sort.cu (+5/-5)
examples/block/example_block_reduce.cu (+4/-4)
examples/block/example_block_scan.cu (+7/-6)
examples/device/example_device_partition_flagged.cpp (+5/-5)
examples/device/example_device_partition_if.cpp (+4/-4)
examples/device/example_device_radix_sort.cpp (+1/-1)
examples/device/example_device_reduce.cpp (+3/-3)
examples/device/example_device_scan.cpp (+3/-3)
examples/device/example_device_select_flagged.cpp (+5/-5)
examples/device/example_device_select_if.cpp (+4/-4)
examples/device/example_device_select_unique.cpp (+4/-4)
examples/device/example_device_sort_find_non_trivial_runs.cpp (+7/-7)
examples/example_utils.hpp (+1/-1)
examples/mersenne.h (+3/-3)
hipcub/CMakeLists.txt (+0/-20)
hipcub/include/hipcub/agent/single_pass_scan_operators.hpp (+30/-0)
hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp (+101/-0)
hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_copy.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_for.hpp (+130/-79)
hipcub/include/hipcub/backend/cub/device/device_histogram.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_merge.hpp (+107/-0)
hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_partition.hpp (+77/-68)
hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_reduce.hpp (+70/-14)
hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_scan.hpp (+103/-58)
hipcub/include/hipcub/backend/cub/device/device_segmented_radix_sort.hpp (+2/-3)
hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_select.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/device/device_spmv.hpp (+91/-79)
hipcub/include/hipcub/backend/cub/device/device_transform.hpp (+206/-0)
hipcub/include/hipcub/backend/cub/grid/grid_barrier.hpp (+7/-8)
hipcub/include/hipcub/backend/cub/hipcub.hpp (+49/-46)
hipcub/include/hipcub/backend/cub/iterator/tex_obj_input_iterator.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp (+42/-0)
hipcub/include/hipcub/backend/cub/util_allocator.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/util_device.hpp (+2/-2)
hipcub/include/hipcub/backend/cub/util_macro.hpp (+14/-8)
hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp (+17/-16)
hipcub/include/hipcub/backend/rocprim/agent/single_pass_scan_operators.hpp (+484/-0)
hipcub/include/hipcub/backend/rocprim/block/block_adjacent_difference.hpp (+2/-120)
hipcub/include/hipcub/backend/rocprim/block/block_discontinuity.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/block_exchange.hpp (+3/-3)
hipcub/include/hipcub/backend/rocprim/block/block_histogram.hpp (+3/-3)
hipcub/include/hipcub/backend/rocprim/block/block_load.hpp (+3/-3)
hipcub/include/hipcub/backend/rocprim/block/block_load_func.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp (+14/-12)
hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp (+4/-4)
hipcub/include/hipcub/backend/rocprim/block/block_raking_layout.hpp (+3/-4)
hipcub/include/hipcub/backend/rocprim/block/block_reduce.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp (+4/-3)
hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp (+28/-2)
hipcub/include/hipcub/backend/rocprim/block/block_shuffle.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/block_store.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/block_store_func.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp (+11/-11)
hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_for.hpp (+272/-107)
hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp (+9/-3)
hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp (+109/-0)
hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp (+77/-68)
hipcub/include/hipcub/backend/rocprim/device/device_radix_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp (+83/-9)
hipcub/include/hipcub/backend/rocprim/device/device_run_length_encode.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp (+146/-114)
hipcub/include/hipcub/backend/rocprim/device/device_segmented_radix_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp (+194/-186)
hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_select.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp (+44/-36)
hipcub/include/hipcub/backend/rocprim/device/device_transform.hpp (+249/-0)
hipcub/include/hipcub/backend/rocprim/grid/grid_barrier.hpp (+19/-12)
hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp (+19/-17)
hipcub/include/hipcub/backend/rocprim/grid/grid_queue.hpp (+4/-11)
hipcub/include/hipcub/backend/rocprim/hipcub.hpp (+4/-2)
hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_output_iterator.hpp (+1/-3)
hipcub/include/hipcub/backend/rocprim/iterator/constant_input_iterator.hpp (+5/-3)
hipcub/include/hipcub/backend/rocprim/iterator/counting_input_iterator.hpp (+5/-3)
hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp (+13/-4)
hipcub/include/hipcub/backend/rocprim/iterator/iterator_wrapper.hpp (+3/-3)
hipcub/include/hipcub/backend/rocprim/iterator/tex_obj_input_iterator.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/iterator/transform_input_iterator.hpp (+6/-4)
hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp (+55/-83)
hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp (+335/-30)
hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp (+142/-38)
hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp (+153/-136)
hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp (+6/-5)
hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp (+83/-78)
hipcub/include/hipcub/backend/rocprim/tuple.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/util_allocator.hpp (+13/-14)
hipcub/include/hipcub/backend/rocprim/util_macro.hpp (+6/-0)
hipcub/include/hipcub/backend/rocprim/util_mdspan.hpp (+230/-0)
hipcub/include/hipcub/backend/rocprim/util_ptx.hpp (+112/-104)
hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp (+26/-10)
hipcub/include/hipcub/backend/rocprim/util_type.hpp (+293/-101)
hipcub/include/hipcub/backend/rocprim/warp/specializations/warp_exchange_shfl.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/warp/specializations/warp_exchange_smem.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/warp/warp_exchange.hpp (+3/-3)
hipcub/include/hipcub/backend/rocprim/warp/warp_load.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/warp/warp_merge_sort.hpp (+13/-12)
hipcub/include/hipcub/backend/rocprim/warp/warp_reduce.hpp (+2/-2)
hipcub/include/hipcub/backend/rocprim/warp/warp_scan.hpp (+22/-2)
hipcub/include/hipcub/backend/rocprim/warp/warp_store.hpp (+2/-2)
hipcub/include/hipcub/block/block_adjacent_difference.hpp (+3/-3)
hipcub/include/hipcub/block/block_discontinuity.hpp (+3/-3)
hipcub/include/hipcub/block/block_exchange.hpp (+3/-3)
hipcub/include/hipcub/block/block_histogram.hpp (+3/-3)
hipcub/include/hipcub/block/block_load.hpp (+3/-3)
hipcub/include/hipcub/block/block_merge_sort.hpp (+3/-3)
hipcub/include/hipcub/block/block_radix_rank.hpp (+6/-6)
hipcub/include/hipcub/block/block_radix_sort.hpp (+3/-3)
hipcub/include/hipcub/block/block_raking_layout.hpp (+3/-3)
hipcub/include/hipcub/block/block_reduce.hpp (+3/-3)
hipcub/include/hipcub/block/block_run_length_decode.hpp (+3/-3)
hipcub/include/hipcub/block/block_scan.hpp (+3/-3)
hipcub/include/hipcub/block/block_shuffle.hpp (+3/-3)
hipcub/include/hipcub/block/block_store.hpp (+3/-3)
hipcub/include/hipcub/block/radix_rank_sort_operations.hpp (+3/-3)
hipcub/include/hipcub/config.hpp (+40/-5)
hipcub/include/hipcub/device/device_adjacent_difference.hpp (+3/-3)
hipcub/include/hipcub/device/device_copy.hpp (+3/-3)
hipcub/include/hipcub/device/device_for.hpp (+3/-3)
hipcub/include/hipcub/device/device_histogram.hpp (+3/-3)
hipcub/include/hipcub/device/device_memcpy.hpp (+3/-3)
hipcub/include/hipcub/device/device_merge.hpp (+7/-9)
hipcub/include/hipcub/device/device_merge_sort.hpp (+3/-3)
hipcub/include/hipcub/device/device_partition.hpp (+3/-3)
hipcub/include/hipcub/device/device_radix_sort.hpp (+3/-3)
hipcub/include/hipcub/device/device_reduce.hpp (+3/-3)
hipcub/include/hipcub/device/device_run_length_encode.hpp (+3/-3)
hipcub/include/hipcub/device/device_scan.hpp (+3/-3)
hipcub/include/hipcub/device/device_segmented_radix_sort.hpp (+3/-3)
hipcub/include/hipcub/device/device_segmented_reduce.hpp (+3/-3)
hipcub/include/hipcub/device/device_segmented_sort.hpp (+3/-3)
hipcub/include/hipcub/device/device_select.hpp (+3/-3)
hipcub/include/hipcub/device/device_spmv.hpp (+3/-3)
hipcub/include/hipcub/device/device_transform.hpp (+38/-0)
hipcub/include/hipcub/grid/grid_barrier.hpp (+10/-10)
hipcub/include/hipcub/grid/grid_even_share.hpp (+10/-10)
hipcub/include/hipcub/grid/grid_mapping.hpp (+10/-10)
hipcub/include/hipcub/grid/grid_queue.hpp (+10/-10)
hipcub/include/hipcub/hipcub.hpp (+4/-6)
hipcub/include/hipcub/hipcub_version.hpp.in (+3/-3)
hipcub/include/hipcub/iterator/arg_index_input_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/cache_modified_input_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/cache_modified_output_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/constant_input_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/counting_input_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/discard_output_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/tex_obj_input_iterator.hpp (+3/-3)
hipcub/include/hipcub/iterator/transform_input_iterator.hpp (+3/-3)
hipcub/include/hipcub/thread/thread_load.hpp (+3/-3)
hipcub/include/hipcub/thread/thread_operators.hpp (+3/-4)
hipcub/include/hipcub/thread/thread_reduce.hpp (+3/-3)
hipcub/include/hipcub/thread/thread_scan.hpp (+3/-3)
hipcub/include/hipcub/thread/thread_search.hpp (+3/-3)
hipcub/include/hipcub/thread/thread_sort.hpp (+3/-3)
hipcub/include/hipcub/thread/thread_store.hpp (+3/-3)
hipcub/include/hipcub/tuple.hpp (+3/-3)
hipcub/include/hipcub/util_allocator.hpp (+3/-3)
hipcub/include/hipcub/util_device.hpp (+3/-3)
hipcub/include/hipcub/util_macro.hpp (+3/-3)
hipcub/include/hipcub/util_ptx.hpp (+3/-3)
hipcub/include/hipcub/util_temporary_storage.hpp (+3/-3)
hipcub/include/hipcub/util_type.hpp (+3/-3)
hipcub/include/hipcub/warp/warp_exchange.hpp (+3/-3)
hipcub/include/hipcub/warp/warp_load.hpp (+3/-3)
hipcub/include/hipcub/warp/warp_merge_sort.hpp (+3/-3)
hipcub/include/hipcub/warp/warp_reduce.hpp (+3/-3)
hipcub/include/hipcub/warp/warp_scan.hpp (+3/-3)
hipcub/include/hipcub/warp/warp_store.hpp (+3/-3)
rmake.py (+5/-1)
test/CMakeLists.txt (+6/-2)
test/extra/CMakeLists.txt (+14/-29)
test/extra/test_hipcub_package.cpp (+11/-8)
test/hipcub/CMakeLists.txt (+23/-3)
test/hipcub/bfloat16.hpp (+12/-1)
test/hipcub/common_test_header.hpp (+42/-24)
test/hipcub/experimental/sparse_matrix.hpp (+19/-14)
test/hipcub/half.hpp (+16/-6)
test/hipcub/single_index_iterator.hpp (+1/-1)
test/hipcub/test_hipcub_block_adjacent_difference.cpp (+23/-588)
test/hipcub/test_hipcub_block_discontinuity.cpp (+186/-190)
test/hipcub/test_hipcub_block_exchange.cpp (+1374/-258)
test/hipcub/test_hipcub_block_histogram.cpp (+28/-29)
test/hipcub/test_hipcub_block_load_store.cpp (+1/-1)
test/hipcub/test_hipcub_block_load_store.hpp (+5/-3)
test/hipcub/test_hipcub_block_load_store.kernels.hpp (+24/-24)
test/hipcub/test_hipcub_block_merge_sort.cpp (+783/-126)
test/hipcub/test_hipcub_block_radix_rank.cpp (+519/-48)
test/hipcub/test_hipcub_block_radix_sort.cpp (+290/-198)
test/hipcub/test_hipcub_block_reduce.cpp (+458/-143)
test/hipcub/test_hipcub_block_run_length_decode.cpp (+89/-113)
test/hipcub/test_hipcub_block_scan.cpp (+2047/-237)
test/hipcub/test_hipcub_block_shuffle.cpp (+342/-177)
test/hipcub/test_hipcub_caching_device_allocator.cpp (+4/-2)
test/hipcub/test_hipcub_device_adjacent_difference.cpp (+199/-124)
test/hipcub/test_hipcub_device_copy.cpp (+11/-8)
test/hipcub/test_hipcub_device_for.cpp (+887/-77)
test/hipcub/test_hipcub_device_histogram.cpp (+82/-124)
test/hipcub/test_hipcub_device_memcpy.cpp (+11/-8)
test/hipcub/test_hipcub_device_merge.cpp (+567/-0)
test/hipcub/test_hipcub_device_merge_sort.cpp (+61/-124)
test/hipcub/test_hipcub_device_partition.cpp (+461/-52)
test/hipcub/test_hipcub_device_radix_sort.hpp (+33/-69)
test/hipcub/test_hipcub_device_reduce.cpp (+484/-106)
test/hipcub/test_hipcub_device_reduce_by_key.cpp (+10/-20)
test/hipcub/test_hipcub_device_run_length_encode.cpp (+79/-18)
test/hipcub/test_hipcub_device_scan.cpp (+240/-102)
test/hipcub/test_hipcub_device_segmented_radix_sort.hpp (+1/-1)
test/hipcub/test_hipcub_device_segmented_reduce.cpp (+210/-84)
test/hipcub/test_hipcub_device_segmented_sort.hpp (+1/-1)
test/hipcub/test_hipcub_device_select.cpp (+50/-100)
test/hipcub/test_hipcub_device_spmv.cpp (+19/-19)
test/hipcub/test_hipcub_device_transform.cpp (+284/-0)
test/hipcub/test_hipcub_grid.cpp (+29/-11)
test/hipcub/test_hipcub_iterators.cpp (+38/-48)
test/hipcub/test_hipcub_no_half_operators.cpp (+33/-0)
test/hipcub/test_hipcub_single_pass_scan_operators.cpp (+370/-0)
test/hipcub/test_hipcub_thread.cpp (+272/-52)
test/hipcub/test_hipcub_thread_operators.cpp (+17/-21)
test/hipcub/test_hipcub_thread_sort.cpp (+7/-8)
test/hipcub/test_hipcub_util_device.cpp (+9/-5)
test/hipcub/test_hipcub_util_ptx.cpp (+21/-22)
test/hipcub/test_hipcub_vector.cpp (+17/-18)
test/hipcub/test_hipcub_warp_exchange.cpp (+1/-1)
test/hipcub/test_hipcub_warp_load.cpp (+1/-1)
test/hipcub/test_hipcub_warp_merge_sort.cpp (+6/-7)
test/hipcub/test_hipcub_warp_reduce.cpp (+4/-5)
test/hipcub/test_hipcub_warp_scan.cpp (+361/-5)
test/hipcub/test_hipcub_warp_store.cpp (+1/-1)
test/hipcub/test_utils.hpp (+52/-15)
test/hipcub/test_utils_assertions.hpp (+16/-0)
test/hipcub/test_utils_data_generation.hpp (+4/-8)
test/hipcub/test_utils_hipgraphs.hpp (+58/-47)
test/hipcub/test_utils_sort_comparator.hpp (+17/-11)
test/hipcub/test_utils_thread_operators.hpp (+22/-95)
toolchain-windows.cmake (+1/-1)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
Ubuntu Sponsors Pending
Review via email: mp+499521@code.launchpad.net

Description of the change

Update to new upstream version 7.1.0

To post a comment you must log in.
Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Questions inline (I hope LP doesn't crash)

review: Needs Information
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

LP crashed:
```
Launchpad encountered an internal error during the following operation: emailing a code review comment. It was logged with id OOPS-54cc5a00e1a774048fc490b8026e60b1. Sorry for the inconvenience.

```

You will have to check the GUI here to see the diff comments.

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

Related to the first question:

You're right, I initially misunderstood the `all` label on the Architecture field. It is now rolled back.

Now, related to the second question:

As can be seen below, the reverse dependencies of libhipcub-tests are all packages within the ROCm stack itself. Since the ROCm stack is planned to drop the ppc64el architecture for packages belonging to dependency layers 1 and 2, this won't interfere with any other packages in the archive.

```
$ apt rdepends libhipcub-tests
libhipcub-tests
Reverse Depends:
  Depends: rocm-tests
```

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+1

review: Approve
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Sponsored:

Uploading hipcub_7.1.0-0ubuntu2.dsc
Uploading hipcub_7.1.0.orig.tar.gz
Uploading hipcub_7.1.0-0ubuntu2.debian.tar.xz
Uploading hipcub_7.1.0-0ubuntu2_source.buildinfo
Uploading hipcub_7.1.0-0ubuntu2_source.changes

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
2index 89abd08..63d89af 100755
3--- a/.github/CODEOWNERS
4+++ b/.github/CODEOWNERS
5@@ -1,4 +1,4 @@
6-* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13
7+* @stanleytsang-amd @umfranzw @RobsonRLemos
8 # Documentation files
9 docs/* @ROCm/rocm-documentation
10 *.md @ROCm/rocm-documentation
11diff --git a/.gitignore b/.gitignore
12index a09b2d6..16d3be3 100644
13--- a/.gitignore
14+++ b/.gitignore
15@@ -1,6 +1,6 @@
16
17 ### Build dirs ###
18-build/
19+build*/
20
21 # Created by https://www.gitignore.io/api/c++,cmake
22
23diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
24index e8481a9..386ba44 100644
25--- a/.gitlab-ci.yml
26+++ b/.gitlab-ci.yml
27@@ -1,6 +1,6 @@
28 # MIT License
29 #
30-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
31+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
32 #
33 # Permission is hereby granted, free of charge, to any person obtaining a copy
34 # of this software and associated documentation files (the "Software"), to deal
35@@ -30,6 +30,8 @@ include:
36 - /deps-format.yaml
37 - /deps-rocm.yaml
38 - /deps-nvcc.yaml
39+ - /deps-vcpkg.yaml
40+ - /deps-windows.yaml
41 - /deps-compiler-acceleration.yaml
42 - /gpus-rocm.yaml
43 - /gpus-nvcc.yaml
44@@ -41,6 +43,10 @@ stages:
45 - test
46 - benchmark
47
48+workflow:
49+ rules:
50+ - if: $CI_MERGE_REQUEST_LABELS !~ /CI Skip/
51+
52 clang-format:
53 extends:
54 - .lint:clang-format
55@@ -88,7 +94,7 @@ copyright-date:
56 -D GPU_TARGETS="$GPU_TARGETS"
57 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
58 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
59- -D CMAKE_CXX_STANDARD=14
60+ -D CMAKE_CXX_STANDARD=17
61 -B $CI_PROJECT_DIR/rocPRIM/build
62 -S $CI_PROJECT_DIR/rocPRIM
63 - cd $CI_PROJECT_DIR/rocPRIM/build
64@@ -113,7 +119,6 @@ build:rocm:
65 -D BUILD_TEST=ON
66 -D BUILD_EXAMPLE=ON
67 -D GPU_TARGETS="$GPU_TARGETS"
68- -D GPU_TEST_TARGETS="$GPU_TARGETS"
69 -D ROCM_SYMLINK_LIBS=OFF
70 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
71 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
72@@ -137,7 +142,7 @@ build:rocm:
73 expire_in: 2 weeks
74 parallel:
75 matrix:
76- - BUILD_VERSION: [14, 17]
77+ - BUILD_VERSION: 17
78
79 build:rocm-benchmark:
80 extends:
81@@ -158,7 +163,7 @@ build:rocm-benchmark:
82 -D GPU_TARGETS="$GPU_TARGETS"
83 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
84 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
85- -D CMAKE_CXX_STANDARD=14
86+ -D CMAKE_CXX_STANDARD=17
87 -B $CI_PROJECT_DIR/build
88 -S $CI_PROJECT_DIR
89 - cmake --build $CI_PROJECT_DIR/build
90@@ -170,19 +175,106 @@ build:rocm-benchmark:
91 - $CI_PROJECT_DIR/build/CMakeCache.txt
92 expire_in: 2 weeks
93
94-test:rocm:
95+.rocm-windows-rocprim:
96+ variables:
97+ ROCPRIM_GIT_BRANCH: "develop_stream"
98+ script:
99+ # Install rocPRIM from git
100+ - $BRANCH_NAME = $env:ROCPRIM_GIT_BRANCH
101+ - if ($env:CI_COMMIT_BRANCH -eq "develop" -or $env:CI_COMMIT_BRANCH -eq "master") { $BRANCH_NAME = $env:CI_COMMIT_BRANCH }
102+ - git clone -b "$BRANCH_NAME" --depth 1 https://gitlab-ci-token:${CI_JOB_TOKEN}@${ROCPRIM_GIT_URL} $CI_PROJECT_DIR/rocPRIM
103+ - cmake
104+ -G Ninja
105+ -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
106+ -D CMAKE_INSTALL_PREFIX=$CI_PROJECT_DIR/rocPRIM_install
107+ -D ONLY_INSTALL=ON
108+ -B $CI_PROJECT_DIR/rocPRIM/build
109+ -S $CI_PROJECT_DIR/rocPRIM
110+ - cmake --build $CI_PROJECT_DIR/rocPRIM/build --target install
111+
112+build:rocm-windows:
113+ extends:
114+ - .rocm-windows-rocprim
115+ - .rules:build
116+ - .gpus:rocm-windows
117+ - .deps:rocm-windows
118+ - .deps:visual-studio-devshell
119+ stage: build
120+ needs: []
121+ script:
122+ - !reference [".rocm-windows-rocprim", script]
123+ - cmake
124+ -G Ninja
125+ -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
126+ -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-error=pass-failed"
127+ -D CMAKE_PREFIX_PATH:PATH="$CI_PROJECT_DIR/rocPRIM_install;${env:HIP_PATH}"
128+ -D CMAKE_BUILD_TYPE=Release
129+ -D BUILD_TEST=ON
130+ -D BUILD_EXAMPLE=ON
131+ -D GPU_TARGETS=$GPU_TARGET
132+ -D CMAKE_CXX_STANDARD="$BUILD_VERSION"
133+ -B $CI_PROJECT_DIR/build
134+ -S $CI_PROJECT_DIR
135+ - cmake --build $CI_PROJECT_DIR/build
136+ artifacts:
137+ paths:
138+ - $CI_PROJECT_DIR/build/test/hipcub/test_*
139+ - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake
140+ - $CI_PROJECT_DIR/build/test/hipcub/CTestTestfile.cmake
141+ - $CI_PROJECT_DIR/build/gtest/
142+ - $CI_PROJECT_DIR/build/CMakeCache.txt
143+ - $CI_PROJECT_DIR/build/CTestTestfile.cmake
144+ - $CI_PROJECT_DIR/build/.ninja_log
145+ expire_in: 1 day
146+ parallel:
147+ matrix:
148+ - BUILD_VERSION: 17
149+
150+build:rocm-windows-benchmark:
151+ extends:
152+ - .rocm-windows-rocprim
153+ - .rules:build
154+ - .gpus:rocm-windows
155+ - .deps:rocm-windows
156+ - .deps:visual-studio-devshell
157+ stage: build
158+ needs: []
159+ script:
160+ - !reference [".rocm-windows-rocprim", script]
161+ - cmake
162+ -G Ninja
163+ -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
164+ -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
165+ -D CMAKE_PREFIX_PATH:PATH="$CI_PROJECT_DIR/rocPRIM_install;${env:HIP_PATH}"
166+ -D CMAKE_BUILD_TYPE=Release
167+ -D BUILD_BENCHMARK=ON
168+ -D GPU_TARGETS=$GPU_TARGET
169+ -D CMAKE_CXX_STANDARD=17
170+ -B $CI_PROJECT_DIR/build
171+ -S $CI_PROJECT_DIR
172+ - cmake --build $CI_PROJECT_DIR/build
173+ artifacts:
174+ paths:
175+ - $CI_PROJECT_DIR/build/benchmark/*
176+ - $CI_PROJECT_DIR/build/deps/googlebenchmark/
177+ - $CI_PROJECT_DIR/build/.ninja_log
178+ - $CI_PROJECT_DIR/build/CMakeCache.txt
179+ expire_in: 1 day
180+
181+.test:rocm:
182 stage: test
183 needs:
184 - build:rocm
185+ tags:
186+ - rocm
187+ - $GPU
188 extends:
189 - .rocm
190- - .gpus:rocm
191- - .rules:test
192 script:
193 - cd $CI_PROJECT_DIR/build
194 - cmake
195 -D CMAKE_PREFIX_PATH=/opt/rocm
196- -D CMAKE_CXX_STANDARD=14
197+ -D CMAKE_CXX_STANDARD=17
198 -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake
199 - cat ./resources.json
200 # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue.
201@@ -191,10 +283,45 @@ test:rocm:
202 - HSA_ENABLE_SDMA=0 ctest
203 --output-on-failure
204 --repeat-until-fail 2
205- --tests-regex "$GPU_TARGET"
206 --resource-spec-file ./resources.json
207 --parallel $PARALLEL_JOBS
208
209+test:rocm-any-gpu:
210+ variables:
211+ GPU: ""
212+ PARALLEL_JOBS: 1
213+ extends:
214+ - .test:rocm
215+ rules:
216+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/
217+
218+test:rocm-label-arch:
219+ extends:
220+ - .gpus:rocm
221+ - .test:rocm
222+ - .rules:arch-labels
223+
224+test:rocm-all-gpus:
225+ variables:
226+ SHOULD_BE_UNDRAFTED: "true"
227+ extends:
228+ - .gpus:rocm
229+ - .test:rocm
230+ - .rules:test
231+
232+test:rocm-windows:
233+ stage: test
234+ extends:
235+ - .deps:rocm-windows
236+ - .gpus:rocm-gpus-windows
237+ - .deps:visual-studio-devshell
238+ - .rules:test
239+ needs:
240+ - job: build:rocm-windows
241+ script:
242+ - cd $CI_PROJECT_DIR/build
243+ - ctest --output-on-failure
244+
245 .benchmark:
246 stage: benchmark
247 variables:
248@@ -238,7 +365,7 @@ benchmark:rocm:
249 -G Ninja
250 -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
251 "$GPU_TARGETS_ARG"
252- -D CMAKE_CXX_STANDARD=14
253+ -D CMAKE_CXX_STANDARD=17
254 -S $CI_PROJECT_DIR/test/extra
255 -B $CI_PROJECT_DIR/build/package_test
256 - cmake --build $CI_PROJECT_DIR/build/package_test
257@@ -259,7 +386,7 @@ benchmark:rocm:
258 - cmake
259 -G Ninja
260 -D BUILD_TEST=OFF
261- -D CMAKE_CXX_STANDARD=14
262+ -D CMAKE_CXX_STANDARD=17
263 -S $CI_PROJECT_DIR
264 -B $CI_PROJECT_DIR/build_only_install
265 # Preserve $PATH when sudoing
266@@ -322,6 +449,7 @@ build:nvcc:
267 -D CMAKE_BUILD_TYPE=Release
268 -D BUILD_TEST=ON
269 -D BUILD_EXAMPLE=ON
270+ "$(if [ "$SCHEDULE" == "nightly" ]; then echo "-D BUILD_COMPUTE_SANITIZER=ON"; fi)"
271 -D NVGPU_TARGETS="$GPU_TARGETS"
272 -D ROCM_SYMLINK_LIBS=OFF
273 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
274@@ -347,7 +475,7 @@ build:nvcc:
275 expire_in: 2 weeks
276 parallel:
277 matrix:
278- - BUILD_VERSION: [14, 17]
279+ - BUILD_VERSION: 17
280
281 build:nvcc-benchmark:
282 stage: build
283@@ -367,7 +495,7 @@ build:nvcc-benchmark:
284 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
285 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
286 -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda
287- -D CMAKE_CXX_STANDARD=14
288+ -D CMAKE_CXX_STANDARD=17
289 -B $CI_PROJECT_DIR/build
290 -S $CI_PROJECT_DIR
291 - cmake --build $CI_PROJECT_DIR/build
292@@ -379,14 +507,15 @@ build:nvcc-benchmark:
293 - $CI_PROJECT_DIR/build/CMakeCache.txt
294 expire_in: 2 weeks
295
296-test:nvcc:
297+.test:nvcc:
298 stage: test
299+ tags:
300+ - nvcc
301+ - $GPU
302 needs:
303 - build:nvcc
304 extends:
305 - .nvcc
306- - .gpus:nvcc
307- - .rules:test
308 before_script:
309 # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script
310 - !reference [.nvcc, before_script]
311@@ -394,6 +523,23 @@ test:nvcc:
312 - cd $CI_PROJECT_DIR/build
313 - ctest --output-on-failure --repeat-until-fail 2
314
315+test:nvcc-any-gpu:
316+ variables:
317+ GPU: ""
318+ PARALLEL_JOBS: 1
319+ extends:
320+ - .test:nvcc
321+ rules:
322+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/
323+
324+test:nvcc-all-gpus:
325+ variables:
326+ SHOULD_BE_UNDRAFTED: "true"
327+ extends:
328+ - .gpus:nvcc
329+ - .test:nvcc
330+ - .rules:test
331+
332 benchmark:nvcc:
333 needs:
334 - build:nvcc-benchmark
335@@ -436,6 +582,10 @@ test:doc:
336 extends:
337 - .rules:test
338 - .build:docs
339+ artifacts:
340+ paths:
341+ - $DOCS_DIR/_build/html/
342+ expire_in: 2 weeks
343
344 scheduled-check-changes:
345 extends: .rules:scheduled-check-changes
346diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
347deleted file mode 100644
348index 79ebcb6..0000000
349--- a/.jenkins/common.groovy
350+++ /dev/null
351@@ -1,53 +0,0 @@
352-// This file is for internal AMD use.
353-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
354-
355-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true)
356-{
357- project.paths.construct_build_prefix()
358-
359- String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
360- String buildTypeDir = debug ? 'debug' : 'release'
361- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
362- //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
363- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
364-
365- def getRocPRIM = auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg)
366-
367- def command = """#!/usr/bin/env bash
368- set -x
369- ${getRocPRIM}
370- cd ${project.paths.project_build_prefix}
371- mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
372- ${auxiliary.gfxTargetParser()}
373- ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
374- make -j\$(nproc)
375- """
376-
377- platform.runCommand(this, command)
378-}
379-
380-
381-def runTestCommand (platform, project)
382-{
383- String sudo = auxiliary.sudo(platform.jenkinsLabel)
384-
385- def testCommand = "ctest --output-on-failure --verbose --timeout 900"
386- def command = """#!/usr/bin/env bash
387- set -x
388- cd ${project.paths.project_build_prefix}
389- cd ${project.testDirectory}
390- ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib ${testCommand}
391- """
392-
393- platform.runCommand(this, command)
394-}
395-
396-def runPackageCommand(platform, project)
397-{
398- def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
399-
400- platform.runCommand(this, packageHelper[0])
401- platform.archiveArtifacts(this, packageHelper[1])
402-}
403-
404-return this
405diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
406deleted file mode 100644
407index 074bc53..0000000
408--- a/.jenkins/precheckin.groovy
409+++ /dev/null
410@@ -1,84 +0,0 @@
411-#!/usr/bin/env groovy
412-// This shared library is available at https://github.com/ROCm/rocJENKINS/
413-@Library('rocJenkins@pong') _
414-
415-// This file is for internal AMD use.
416-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
417-
418-import com.amd.project.*
419-import com.amd.docker.*
420-import java.nio.file.Path;
421-
422-def runCI =
423-{
424- nodeDetails, jobName->
425-
426- def prj = new rocProject('hipCUB', 'PreCheckin')
427- prj.timeout.compile = 400
428- // Define test architectures, optional rocm version argument is available
429- def nodes = new dockerNodes(nodeDetails, jobName, prj)
430-
431- boolean formatCheck = false
432-
433- def commonGroovy
434-
435- def compileCommand =
436- {
437- platform, project->
438-
439- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
440- commonGroovy.runCompileCommand(platform, project, jobName)
441- }
442-
443- def testCommand =
444- {
445- platform, project->
446-
447- commonGroovy.runTestCommand(platform, project)
448- }
449-
450- def packageCommand =
451- {
452- platform, project->
453-
454- commonGroovy.runPackageCommand(platform, project)
455- }
456-
457- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
458-}
459-
460-ci: {
461- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
462-
463- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
464- propertyList = auxiliary.appendPropertyList(propertyList)
465-
466- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
467- jobNameList = auxiliary.appendJobNameList(jobNameList)
468-
469- auxiliary.registerDependencyBranchParameter(["rocPRIM"])
470-
471- propertyList.each
472- {
473- jobName, property->
474- if (urlJobName == jobName)
475- properties(auxiliary.addCommonProperties(property))
476- }
477-
478- Set seenJobNames = []
479- jobNameList.each
480- {
481- jobName, nodeDetails->
482- seenJobNames.add(jobName)
483- if (urlJobName == jobName)
484- runCI(nodeDetails, jobName)
485- }
486-
487- // For url job names that are outside of the standardJobNameSet i.e. compute-rocm-dkms-no-npi-1901
488- if(!seenJobNames.contains(urlJobName))
489- {
490- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
491- runCI([ubuntu16:['gfx906']], urlJobName)
492- }
493-}
494-
495diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
496deleted file mode 100644
497index 5c0b949..0000000
498--- a/.jenkins/staticanalysis.groovy
499+++ /dev/null
500@@ -1,46 +0,0 @@
501-#!/usr/bin/env groovy
502-// This shared library is available at https://github.com/ROCm/rocJENKINS/
503-@Library('rocJenkins@pong') _
504-
505-// This is file for internal AMD use.
506-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
507-
508-import com.amd.project.*
509-import com.amd.docker.*
510-import java.nio.file.Path
511-
512-def runCompileCommand(platform, project, jobName, boolean debug=false)
513-{
514- project.paths.construct_build_prefix()
515-}
516-
517-def runCI =
518-{
519- nodeDetails, jobName->
520-
521- def prj = new rocProject('hipCUB', 'StaticAnalysis')
522-
523- // Define test architectures, optional rocm version argument is available
524- def nodes = new dockerNodes(nodeDetails, jobName, prj)
525-
526- boolean formatCheck = false
527- boolean staticAnalysis = true
528-
529- def compileCommand =
530- {
531- platform, project->
532-
533- runCompileCommand(platform, project, jobName, false)
534- }
535-
536- buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
537-}
538-
539-ci: {
540- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
541-
542- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
543- stage(urlJobName) {
544- runCI([ubuntu20:['any']], urlJobName)
545- }
546-}
547diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy
548deleted file mode 100644
549index 549913d..0000000
550--- a/.jenkins/staticlibrary.groovy
551+++ /dev/null
552@@ -1,82 +0,0 @@
553-#!/usr/bin/env groovy
554-@Library('rocJenkins@pong') _
555-import com.amd.project.*
556-import com.amd.docker.*
557-import java.nio.file.Path;
558-
559-def runCI =
560-{
561- nodeDetails, jobName->
562-
563- def prj = new rocProject('hipCUB', 'Static Library PreCheckin')
564-
565- def nodes = new dockerNodes(nodeDetails, jobName, prj)
566-
567- def commonGroovy
568-
569- boolean formatCheck = false
570-
571- def compileCommand =
572- {
573- platform, project->
574-
575- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
576- commonGroovy.runCompileCommand(platform, project, jobName, false, true)
577- }
578-
579-
580- def testCommand =
581- {
582- platform, project->
583-
584- commonGroovy.runTestCommand(platform, project)
585- }
586-
587- def packageCommand =
588- {
589- platform, project->
590-
591- commonGroovy.runPackageCommand(platform, project)
592- }
593-
594- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
595-}
596-
597-ci: {
598- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
599-
600- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
601- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
602- "rocm-docker":[]]
603- propertyList = auxiliary.appendPropertyList(propertyList)
604-
605- def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
606- "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
607- "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])]
608- jobNameList = auxiliary.appendJobNameList(jobNameList)
609-
610- propertyList.each
611- {
612- jobName, property->
613- if (urlJobName == jobName)
614- properties(auxiliary.addCommonProperties(property))
615- }
616-
617- jobNameList.each
618- {
619- jobName, nodeDetails->
620- if (urlJobName == jobName)
621- stage(jobName) {
622- runCI(nodeDetails, jobName)
623- }
624- }
625-
626- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
627- if(!jobNameList.keySet().contains(urlJobName))
628- {
629- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
630- stage(urlJobName) {
631- runCI([ubuntu16:['gfx906']], urlJobName)
632- }
633- }
634-}
635diff --git a/CHANGELOG.md b/CHANGELOG.md
636index 5a13e14..a9814bf 100644
637--- a/CHANGELOG.md
638+++ b/CHANGELOG.md
639@@ -2,6 +2,107 @@
640
641 Full documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/).
642
643+## hipCUB-4.1.0 for ROCm 7.1
644+
645+### Added
646+
647+* Exposed Thread-level reduction API `hipcub::ThreadReduce`.
648+* Added `::hipcub::extents`, with limited parity to C++23's `std::extents`. Only `static extents` is supported; `dynamic extents` is not. Helper structs have been created to perform computations on `::hipcub::extents` only when the backend is rocPRIM. For the CUDA backend, similar functionality exists.
649+* Added `projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_mdspan.hpp` to support `::hipcub::extents`.
650+* Added `::hipcub::ForEachInExtents` API.
651+* Added `hipcub::DeviceTransform::Transform` and `hipcub::DeviceTransform::TransformStableArgumentAddresses`.
652+
653+* hipCUB and its dependency rocPRIM have been moved into the new rocm-libraries "monorepo" repository (https://github.com/ROCm/rocm-libraries). This repository contains a number of ROCm libraries that are frequently used together.
654+ * The repository migration requires a few changes to the way that hipCUB fetches library dependencies.
655+ * CMake build option `ROCPRIM_FETCH_METHOD` may be set to one of the following:
656+ * `PACKAGE` - (default) searches for a preinstalled packaged version of the dependency. If it is not found, the build will fall back to the `DOWNLOAD` option, below.
657+ * `DOWNLOAD` - downloads the dependency from the rocm-libraries repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time).
658+ * `MONOREPO` - this option is intended to be used if you are building hipCUB from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains rocPRIM). When selected, the build will try to find the dependency in the local repository tree. If it cannot be found, the build will attempt to use git to perform a sparse-checkout of rocPRIM. If that also fails, it will fall back to using the `DOWNLOAD` option described above.
659+
660+* Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from installed `hipCUB` provided by the system.
661+
662+### Removed
663+
664+* Removed `TexRefInputIterator`, which was removed from CUB after CCCL's 2.6.0 release. This API should have already been removed, but somehow it remained and was not tested.
665+* Deprecated `hipcub::ConstantInputIterator`, use `rocprim::constant_iterator` or `rocthrust::constant_iterator` instead.
666+* Deprecated `hipcub::CountingInputIterator`, use `rocprim::counting_iterator` or `rocthrust::counting_iterator` instead.
667+* Deprecated `hipcub::DiscardOutputIterator`, use `rocprim::discard_iterator` or `rocthrust::discard_iterator` instead.
668+* Deprecated `hipcub::TransformInputIterator`, use `rocprim::transform_iterator` or `rocthrust::transform_iterator` instead.
669+* Deprecated `hipcub::AliasTemporaries`, which is considered to be internal API. Moved it to detail namespace.
670+* Deprecated almost all functions in `projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp`.
671+* Deprecated hipCUB macros: `HIPCUB_MAX`, `HIPCUB_MIN`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST`.
672+
673+### Changed
674+
675+* Changed include headers to avoid relative includes that have slipped in.
676+* Changed `CUDA_STANDARD` for tests in `test/hipcub`, because C++17 APIs such as `std::exclusive_scan` are used in some tests. Still use `CUDA_STANDARD 14` for `test/extra`.
677+* Changed `CCCL_MINIMUM_VERSION` to `2.8.2` to align with CUB.
678+* Changed `cmake_minimum_required` from `3.16` to `3.18`, in order to support `CUDA_STANDARD 17` as a valid value.
679+* Added support for large num_items in `DeviceScan`, `DevicePartition` and `Reduce::{ArgMin, ArgMax}`.
680+* Added tests for large num_items.
681+* The previous dependency-related build option `DEPENDENCIES_FORCE_DOWNLOAD` has been renamed `EXTERNAL_DEPS_FORCE_DOWNLOAD` to differentiate it from the new rocPRIM dependency option described above. Its behaviour remains the same - it forces non-ROCm dependencies (Google Benchmark and Google Test) to be downloaded instead of searching for existing installed packages. This option defaults to `OFF`.
682+
683+### Known issues
684+
685+* The '__half' template specializations of Simd operators are currently disabled due to possible build issues with PyTorch.
686+
687+## hipCUB-4.0.0 for ROCm 7.0
688+
689+### Added
690+
691+* Added a new cmake option, `BUILD_OFFLOAD_COMPRESS`. When hipCUB is built with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful in cases where you are compiling for a large number of targets, since this often results in a large binary. Without compression, in some cases, the generated binary may become so large symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default.
692+* Added single pass operators in `agent/single_pass_scan_operators.hpp` which contains the following API:
693+ * `BlockScanRunningPrefixOp`
694+ * `ScanTileStatus`
695+ * `ScanTileState`
696+ * `ReduceByKeyScanTileState`
697+ * `TilePrefixCallbackOp`
698+* Added gfx950 support.
699+* Added an overload of `BlockScan::InclusiveScan` that accepts an initial value to seed the scan.
700+* Added an overload of `WarpScan::InclusiveScan` that accepts an initial value to seed the scan.
701+* `UnrolledThreadLoad`, `UnrolledCopy`, and `ThreadLoadVolatilePointer` were added to align hipCUB with CUB.
702+* `ThreadStoreVolatilePtr` and the `IterateThreadStore` struct were added to align hipCUB with CUB.
703+* Added `hipcub::InclusiveScanInit` for CUB parity.
704+* Additional Unit Tests for:
705+ * block_exchange
706+ * block_merge_sort
707+ * block_radix_rank
708+ * block_radix_sort
709+ * block_reduce
710+ * block_shuffle
711+
712+### Removed
713+
714+* The AMD GPU targets `gfx803` and `gfx900` are no longer built by default. If you would like to build for these architectures, please specify them explicitly in the `AMDGPU_TARGETS` cmake option.
715+* Deprecated `hipcub::AsmThreadLoad` is removed, use `hipcub::ThreadLoad` instead.
716+* Deprecated `hipcub::AsmThreadStore` is removed, use `hipcub::ThreadStore` instead.
717+* Deprecated `BlockAdjacentDifference::FlagHeads`, `BlockAdjacentDifference::FlagTails` and `BlockAdjacentDifference::FlagHeadsAndTails` have been removed.
718+* This release removes support for custom builds on gfx940 and gfx941.
719+* Removed C++14 support, only C++17 is supported.
720+
721+### Changed
722+
723+* The NVIDIA backend now requires CUB, Thrust, and libcu++ 2.7.0. If they aren't found, they will be downloaded from the NVIDIA CCCL repository.
724+* Updated `thread_load` and `thread_store` to align hipCUB with CUB.
725+* All kernels now have hidden symbol visibility. All symbols now have inline namespaces that include the library version, (for example, hipcub::HIPCUB_300400_NS::symbol instead of hipcub::symbol), letting the user link multiple libraries built with different versions of hipCUB.
726+* Modified the broadcast kernel in warp scan benchmarks. The reported performance may be different to previous versions.
727+* The `hipcub::detail::accumulator_t` in rocPRIM backend has been changed to utilise `rocprim::accumulator_t`.
728+* The usage of `rocprim::invoke_result_binary_op_t` has been replaced with `rocprim::accumulator_t`.
729+
730+### Resolved issues
731+* Fixed an issue where `Sort(keys, compare_op, valid_items, oob_default)` in `block_merge_sort.hpp` would not fill in elements that are out of range (items after `valid_items`) with `oob_default`.
732+* Fixed an issue where `ScatterToStripedFlagged` in `block_exchange.hpp` was calling the wrong function.
733+
734+### Known issues
735+
736+* `BlockAdjacentDifference::FlagHeads`, `BlockAdjacentDifference::FlagTails` and `BlockAdjacentDifference::FlagHeadsAndTails` have been removed from hipCUB's CUB backend. They were already deprecated as of version 2.12.0 of hipCUB and they were removed from CCCL (CUB) as of CCCL's 2.6.0 release.
737+* `BlockScan::InclusiveScan` for the NVIDIA backend does not compute the block aggregate correctly when passing an initial value parameter. This behavior is not matched by the AMD backend.
738+
739+
740+### Upcoming Changes
741+
742+* `BlockAdjacentDifference::FlagHeads`, `BlockAdjacentDifference::FlagTails` and `BlockAdjacentDifference::FlagHeadsAndTails` were deprecated as of version 2.12.0 of hipCUB, and will be removed from the rocPRIM backend in a future release for the next ROCm major version (ROCm 7.0.0).
743+
744 ## hipCUB-3.4.0 for ROCm 6.4.0
745
746 ### Added
747diff --git a/CMakeLists.txt b/CMakeLists.txt
748index dd9a3e2..fbcecf0 100644
749--- a/CMakeLists.txt
750+++ b/CMakeLists.txt
751@@ -1,6 +1,6 @@
752 # MIT License
753 #
754-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
755+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
756 #
757 # Permission is hereby granted, free of charge, to any person obtaining a copy
758 # of this software and associated documentation files (the "Software"), to deal
759@@ -20,8 +20,19 @@
760 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
761 # SOFTWARE.
762
763-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
764-cmake_policy(VERSION 3.16...3.25)
765+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
766+cmake_policy(VERSION 3.18...3.25)
767+
768+# --------------------------------------
769+# Update these variables at release time
770+#
771+# Set the library version
772+set(VERSION_STRING "4.1.0")
773+# Set the minimum required rocPRIM version
774+set(MIN_ROCPRIM_PACKAGE_VERSION "4.1.0" CACHE STRING "Minimum version of rocPRIM to search for when ROCPRIM_FETCH_METHOD is set to PACKAGE.")
775+# Set download branch for dependency rocPRIM
776+set(ROCM_DEP_RELEASE_BRANCH "release/rocm-rel-7.1" CACHE STRING "Download branch for ROCm dependencies")
777+# --------------------------------------
778
779 # Install prefix
780 set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")
781@@ -43,7 +54,7 @@ elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
782 endif()
783
784 # Set HIP flags
785-set(CMAKE_HIP_STANDARD 14)
786+set(CMAKE_HIP_STANDARD 17)
787 set(CMAKE_HIP_STANDARD_REQUIRED ON)
788 set(CMAKE_HIP_EXTENSIONS OFF)
789
790@@ -52,15 +63,23 @@ include(CMakeDependentOption)
791
792 # Build options
793 option(BUILD_TEST "Build tests (requires googletest)" OFF)
794-option(DEPENDENCIES_FORCE_DOWNLOAD "Download dependencies and do not search for packages" OFF)
795+option(CODE_COVERAGE "Enable code coverage" OFF)
796+option(EXTERNAL_DEPS_FORCE_DOWNLOAD "Download non-ROCm dependencies and do not search for packages" OFF)
797 option(DOWNLOAD_CUB "Download CUB and thrust. Do not search for CUB package" OFF)
798 option(BUILD_BENCHMARK "Build benchmarks" OFF)
799 option(BUILD_EXAMPLE "Build Examples" OFF)
800 option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
801+option(BUILD_OFFLOAD_COMPRESS "Build hipCUB with offload compression" ON)
802+option(BUILD_COMPUTE_SANITIZER "Build tests with cuda's compute sanitizer enabled" OFF)
803+cmake_dependent_option(USE_SYSTEM_LIB "Use installed hipCUB when building tests" OFF BUILD_TEST OFF)
804
805+# Check and test cuda compiler, defines 'CMAKE_HIP_COMPILER'
806 check_language(HIP)
807 cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)
808
809+# Check and test cuda compiler, defines 'CMAKE_CUDA_COMPILER'
810+check_language(CUDA)
811+
812 # Set the ROCM install directory.
813 if(WIN32)
814 set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation")
815@@ -68,15 +87,29 @@ else()
816 set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
817 endif()
818
819+# Set up options for obtaining dependency rocPRIM.
820+# PACKAGE: Search for an install package that contains the dependency.
821+# MONOREPO: Assume this is a monorepo checkout and search for the dependency in the directory at ../../projects/.
822+# DOWNLOAD: Download the dependency from the monorepo.
823+set(FETCH_METHOD_OPTIONS "PACKAGE" "MONOREPO" "DOWNLOAD")
824
825-# Set the header wrapper ON by default.
826-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
827+set(ROCPRIM_FETCH_METHOD "PACKAGE" CACHE STRING "How to obtain the rocPRIM dependency")
828+
829+# This function checks to see if the fetch method variable it's passed is defined, and contains a valid value.
830+# If it does not contain a valid value, it issues a fatal failure with an error message.
831+function(check_fetch_method method)
832+ if (DEFINED ${method} AND NOT ${${method}} IN_LIST FETCH_METHOD_OPTIONS)
833+ message(FATAL_ERROR "Unrecognized ${method}: \"${${method}}\". Valid options are: ${FETCH_METHOD_OPTIONS}.")
834+ endif()
835+endfunction()
836+
837+check_fetch_method(ROCPRIM_FETCH_METHOD)
838
839 # Add hipCUB's CMake modules
840 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
841
842 # Set a default build type if none was specified
843-if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
844+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES AND NOT CODE_COVERAGE)
845 message(STATUS "Setting build type to 'Release' as none was specified.")
846 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
847 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
848@@ -88,35 +121,55 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker searc
849 # If hip is included prior to setting that then it defaults to building only for the current architecture
850 include(ROCmCMakeBuildToolsDependency)
851
852-# Setup GPU targets for rocm platform
853+# Detect compiler through use of result from 'check_language(...)'
854 if(USE_HIPCXX)
855 enable_language(HIP)
856-else()
857+elseif(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU"))
858+ # Detected HIP through archaic match by checking passed CXX compiler. This can
859+ # be removed once we bump minimum CMake version to 3.21 or higher.
860+
861 # Setup GPU targets for rocm platform
862- if(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU"))
863- if(NOT DEFINED AMDGPU_TARGETS)
864- set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for")
865+ message(STATUS "CMake could not derive language via 'check_language'. Falling back to legacy compiler checks.")
866+ if(NOT DEFINED AMDGPU_TARGETS)
867+ set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for")
868+ else()
869+ set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for")
870+ endif()
871+ set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all")
872+
873+ if(GPU_TARGETS STREQUAL "all")
874+ if(BUILD_ADDRESS_SANITIZER)
875+ # ASAN builds require xnack
876+ rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
877+ TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+"
878+ )
879 else()
880- set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for")
881- endif()
882- set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all")
883-
884- if(GPU_TARGETS STREQUAL "all")
885- if(BUILD_ADDRESS_SANITIZER)
886- # ASAN builds require xnack
887- rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
888- TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+"
889- )
890- else()
891- rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
892- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
893- )
894- endif()
895- set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
896+ rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
897+ TARGETS "gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
898+ )
899 endif()
900+ set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
901 endif()
902+elseif(CMAKE_CUDA_COMPILER)
903+ # We haven't detected HIP, so surely we must be on a CUDA-compatible compiler.
904+ # Contrary to HIP, 'enable_language(CUDA)' is supported from CMake 3.8 and higher.
905+ enable_language(CUDA)
906+
907+ # Hack: let CMake think that our 'hip' files are actually CUDA files.
908+ set(CMAKE_CUDA_SOURCE_FILE_EXTENSIONS hip;cu)
909 endif()
910
911+# Compressed offload binaries are currently not working with the SPIR-V target
912+if("amdgcnspirv" IN_LIST GPU_TARGETS)
913+ if(BUILD_OFFLOAD_COMPRESS)
914+ message(FATAL_ERROR "Cannot combine SPIR-V and BUILD_OFFLOAD_COMPRESS")
915+ endif()
916+endif()
917+
918+# Setup the library version
919+rocm_setup_version(VERSION ${VERSION_STRING})
920+math(EXPR hipcub_VERSION_NUMBER "${hipcub_VERSION_MAJOR} * 100000 + ${hipcub_VERSION_MINOR} * 100 + ${hipcub_VERSION_PATCH}")
921+
922 # Find and verify HIP.
923 include(VerifyCompiler)
924
925@@ -128,23 +181,45 @@ if(BUILD_ADDRESS_SANITIZER)
926 add_link_options(-fuse-ld=lld)
927 endif()
928
929-# Setup VERSION
930-set(VERSION_STRING "3.4.0")
931-rocm_setup_version(VERSION ${VERSION_STRING})
932+include(CheckCXXCompilerFlag)
933
934-# Print configuration summary
935-include(cmake/Summary.cmake)
936-print_configuration_summary()
937+if(BUILD_OFFLOAD_COMPRESS)
938+ # We need to pass '-x hip' since check_cxx_compiler_flag assumes c++ and not HIP.
939+ check_cxx_compiler_flag("--offload-compress -x hip" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS)
940+ if(CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS)
941+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --offload-compress")
942+ else()
943+ message(STATUS "Warning: BUILD_OFFLOAD_COMPRESS=ON but flag not supported by compiler. Ignoring option.")
944+ endif()
945+endif()
946
947 # hipCUB library
948 add_subdirectory(hipcub)
949
950+if(CODE_COVERAGE)
951+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -fprofile-instr-generate -fcoverage-mapping")
952+endif()
953+
954 if(BUILD_TEST OR (BUILD_BENCHMARK AND NOT ONLY_INSTALL))
955 rocm_package_setup_component(clients)
956 endif()
957
958 # Tests
959 if(BUILD_TEST)
960+ if(USE_SYSTEM_LIB)
961+ # On ROCm hipCUB requires rocPRIM
962+ if(HIP_COMPILER STREQUAL "clang")
963+ find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/rocprim")
964+ if (${rocprim_VERSION} VERSION_LESS ${MIN_ROCPRIM_PACKAGE_VERSION})
965+ message(WARNING "The installed rocprim version, ${rocprim_VERSION}, is less than the minimum required version ${MIN_ROCPRIM_PACKAGE_VERSION}. Building tests with USE_SYSTEM_LIB=ON may not work properly.")
966+ endif()
967+ endif()
968+ find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/hipcub")
969+ if (NOT ${hipcub_VERSION} VERSION_EQUAL ${VERSION_STRING})
970+ message(WARNING "The installed hipcub version, ${hipcub_VERSION}, does not match project version ${VERSION_STRING}. Building tests with USE_SYSTEM_LIB=ON may not work properly.")
971+ endif()
972+ endif()
973+
974 enable_testing()
975 rocm_package_setup_client_component(tests)
976 add_subdirectory(test)
977@@ -161,18 +236,6 @@ if(BUILD_BENCHMARK AND NOT ONLY_INSTALL)
978 add_subdirectory(benchmark)
979 endif()
980
981-# Create header wrapper for backward compatibility
982-if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
983- rocm_wrap_header_dir(
984- ${PROJECT_SOURCE_DIR}/hipcub/include/hipcub/
985- PATTERNS "*.h"
986- PATTERN "*.hpp"
987- GUARDS SYMLINK WRAPPER
988- WRAPPER_LOCATIONS cub/${CMAKE_INSTALL_INCLUDEDIR}/hipcub/
989- OUTPUT_LOCATIONS cub/wrapper/include/hipcub/
990- )
991-endif()
992-
993 # Package
994 if(HIP_COMPILER STREQUAL "clang")
995 rocm_package_add_deb_dependencies(DEPENDS "rocprim-dev >= 2.10.1")
996@@ -205,3 +268,7 @@ else()
997 HEADER_ONLY
998 )
999 endif()
1000+
1001+# Print configuration summary
1002+include(cmake/Summary.cmake)
1003+print_configuration_summary()
1004diff --git a/LICENSE.txt b/LICENSE.txt
1005index c284d2b..4bbc4ad 100644
1006--- a/LICENSE.txt
1007+++ b/LICENSE.txt
1008@@ -1,9 +1,10 @@
1009 Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
1010 Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
1011-Modifications Copyright (c) 2019-2021, Advanced Micro Devices, Inc. All rights reserved.
1012+Modifications Copyright (c) 2019-2025, Advanced Micro Devices, Inc. All rights reserved.
1013
1014 Redistribution and use in source and binary forms, with or without
1015 modification, are permitted provided that the following conditions are met:
1016+
1017 * Redistributions of source code must retain the above copyright
1018 notice, this list of conditions and the following disclaimer.
1019 * Redistributions in binary form must reproduce the above copyright
1020diff --git a/README.md b/README.md
1021index 6a0ad19..c144d1f 100644
1022--- a/README.md
1023+++ b/README.md
1024@@ -1,56 +1,28 @@
1025 # hipCUB
1026
1027 > [!NOTE]
1028-> The published documentation is available at [hipCUB](https://rocm.docs.amd.com/projects/hipCUB/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1029+> The published hipCUB documentation is available [here](https://rocm.docs.amd.com/projects/hipCUB/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1030
1031 hipCUB is a thin wrapper library on top of
1032-[rocPRIM](https://github.com/ROCm/rocPRIM) or
1033-[CUB](https://github.com/thrust/cub). You can use it to port a CUB project into
1034+[rocPRIM](https://github.com/ROCm/rocm-libraries) or
1035+[CUB](https://github.com/nvidia/cccl). You can use it to port a CUB project into
1036 [HIP](https://github.com/ROCm/HIP) so you can use AMD hardware (and
1037 [ROCm](https://rocm.docs.amd.com/en/latest/) software).
1038
1039 In the [ROCm](https://rocm.docs.amd.com/en/latest/)
1040-environment, hipCUB uses the rocPRIM library as the backend. On CUDA platforms, it uses CUB as the
1041-backend.
1042-
1043-## Documentation
1044-
1045-Documentation for hipCUB is available at
1046-[https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/).
1047-
1048-To build our documentation locally, run the following code:
1049-
1050-```shell
1051-# Go to the hipCUB docs directory
1052-cd hipCUB; cd docs
1053-
1054-# Install required pip packages
1055-python3 -m pip install -r .sphinx/requirements.txt
1056-
1057-# Build the documentation
1058-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
1059-
1060-# For e.g. serve the HTML docs locally
1061-cd _build/html
1062-python3 -m http.server
1063-```
1064+environment, hipCUB uses the rocPRIM library as the backend.
1065
1066 ## Requirements
1067
1068 * Git
1069-* CMake (3.16 or later)
1070+* CMake (3.18 or later)
1071 * For AMD GPUs:
1072 * AMD [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) software (1.8.0 or later)
1073 * The [HIP-clang](https://github.com/ROCm/HIP/blob/master/INSTALL.md#hip-clang) compiler (you
1074 must, set this as the C++ compiler for ROCm)
1075- * The [rocPRIM](https://github.com/ROCm/rocPRIM) library
1076+ * The [rocPRIM](https://github.com/ROCm/rocm-libraries) library
1077 * Automatically downloaded and built by the CMake script
1078 * Requires CMake 3.16.9 or later
1079-* For NVIDIA GPUs:
1080- * CUDA Toolkit
1081- * CCCL library (>= 2.5.0)
1082- * Automatically downloaded and built by the CMake script
1083- * Requires CMake 3.15.0 or later
1084 * Python 3.6 or higher (for HIP on Windows only; this is only required for install scripts)
1085 * Visual Studio 2019 with Clang support (HIP on Windows only)
1086 * Strawberry Perl (HIP on Windows only)
1087@@ -64,31 +36,53 @@ GoogleTest and Google Benchmark are automatically downloaded and built by the CM
1088
1089 ## Build and install
1090
1091-To build and install hipCub, run the following code:
1092+### Obtaining the source code
1093+
1094+hipCUB can be cloned in two ways:
1095+
1096+1. Clone hipCUB along with other ROCm libraries that are frequently used together (note that this may take some time to complete):
1097+```sh
1098+git clone https://github.com/ROCm/rocm-libraries.git
1099+cd rocm-libraries
1100+```
1101+
1102+2. To clone hipCUB individually (faster, but requires git version 2.25+):
1103+```sh
1104+git clone --no-checkout --depth=1 --filter=tree:0 https://github.com/ROCm/rocm-libraries.git
1105+cd rocm-libraries
1106+git sparse-checkout init --cone
1107+git sparse-checkout set projects/hipcub
1108+git checkout develop
1109+```
1110+
1111+### Building the library
1112
1113 ```shell
1114-git clone https://github.com/ROCm/hipCUB.git
1115+# Go to the hipCUB directory.
1116+cd projects/hipcub
1117
1118-# Go to hipCUB directory, create and go to the build directory.
1119-cd hipCUB; mkdir build; cd build
1120+# Create a directory for the build and navigate to it.
1121+mkdir build; cd build
1122
1123 # Configure hipCUB, setup options for your system.
1124 # Build options:
1125-# BUILD_TEST - OFF by default,
1126-# BUILD_BENCHMARK - OFF by default.
1127-# DEPENDENCIES_FORCE_DOWNLOAD - OFF by default and at ON the dependencies will be downloaded to build folder,
1128+# BUILD_TEST - OFF by default,
1129+# BUILD_BENCHMARK - OFF by default.
1130+# ROCPRIM_FETCH_METHOD - One of PACKAGE (default), DOWNLOAD, and MONOREPO. See below for a description of each.
1131+# EXTERNAL_DEPS_FORCE_DOWNLOAD - OFF by default, forces download for non-ROCm dependencies (eg. Google Test / Benchmark).
1132+# DOWNLOAD_CUB - OFF by default, (Nvidia CUB backend only) forces download of CUB instead of searching for an installed package.
1133+# BUILD_OFFLOAD_COMPRESS - ON by default, compresses device code to reduce the size of the generated binary.
1134+# BUILD_EXAMPLE - OFF by default, builds examples.
1135+# BUILD_ADDRESS_SANITIZER - OFF by default, builds with clang address sanitizer enabled.
1136+# BUILD_COMPUTE_SANITIZER - OFF by default, (Nvidia CUB backend only) builds tests with CUDA's compute sanitizer enabled.
1137+# USE_SYSTEM_LIB - OFF by default, builds tests using the installed hipCUB provided by the system. This only takes effect when BUILD_TEST is ON.
1138+# USE_HIPCXX - OFF by default, builds with CMake HIP language support. This eliminates the need to set CXX.
1139 #
1140 # ! IMPORTANT !
1141 # Set C++ compiler to HIP-aware clang. You can do it by adding 'CXX=<path-to-compiler>'
1142 # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler.
1143 #
1144 [CXX=hipcc] cmake ../. # or cmake-gui ../.
1145-
1146-# To configure hipCUB for Nvidia platforms, 'CXX=<path-to-nvcc>', `CXX=nvcc` or omitting the flag
1147-# entirely before 'cmake' is sufficient
1148-[CXX=nvcc] cmake -DBUILD_TEST=ON ../. # or cmake-gui ../.
1149-# or
1150-cmake -DBUILD_TEST=ON ../. # or cmake-gui ../.
1151 # or to build benchmarks
1152 cmake -DBUILD_BENCHMARK=ON ../.
1153
1154@@ -105,20 +99,31 @@ make package
1155 [sudo] make install
1156 ```
1157
1158+`ROCPRIM_FETCH_METHOD` can be used to control how hipCUB obtains the rocPRIM dependency. It must be set to one of the following values:
1159+* `PACKAGE` (default) - Searches for an installed package on the system that meets the minimum version requirement. If it is not found, the build will fall back to using the `DOWNLOAD` option.
1160+* `DOWNLOAD` - Clones rocPRIM from the upstream repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time).
1161+* `MONOREPO` - This value is intended to be used if you are building hipCUB from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains rocPRIM). When selected, the build will try to find the dependency in the local repository tree. If it cannot be found, the build will attempt to use git to perform a sparse-checkout of rocPRIM. If that also fails, it will fall back to using the `DOWNLOAD` option described above.
1162+
1163 ### HIP on Windows
1164
1165 Initial support for HIP on Windows is available. You can install it using the provided `rmake.py` Python
1166-script:
1167+script. To do this, first, clone hipCUB using the steps described in [obtaining the source code](#obtaining-the-source-code).
1168+Next:
1169
1170 ```shell
1171-git clone https://github.com/ROCm/hipCUB.git
1172-cd hipCUB
1173+cd projects/hipcub
1174
1175 # the -i option will install rocPRIM to C:\hipSDK by default
1176 python rmake.py -i
1177
1178 # the -c option will build all clients including unit tests
1179 python rmake.py -c
1180+
1181+# to build for a specific architecture only, use the -a option
1182+python rmake.py -ci -a gfx1100
1183+
1184+# for a full list of available options, please refer to the help documentation
1185+python rmake.py -h
1186 ```
1187
1188 ### Using hipCUB
1189@@ -127,15 +132,13 @@ To use hipCUB in a CMake project, we recommended using the package configuration
1190
1191 ```cmake
1192 # On ROCm hipCUB requires rocPRIM
1193-find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim")
1194+find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/rocprim")
1195
1196 # "/opt/rocm" - default install prefix
1197-find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/hipcub")
1198+find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/lib/cmake/hipcub")
1199
1200 ...
1201 # On ROCm: includes hipCUB headers and roc::rocprim_hip target
1202-# On CUDA: includes only hipCUB headers, user has to include CUB directory
1203-target_link_libraries(<your_target> hip::hipcub)
1204 ```
1205
1206 Include only the main header file:
1207@@ -150,7 +153,7 @@ Depending on your current HIP platform, hipCUB includes CUB or rocPRIM headers.
1208
1209 ```shell
1210 # Go to hipCUB build directory
1211-cd hipCUB; cd build
1212+cd projects/hipcub; cd build
1213
1214 # To run all tests
1215 ctest
1216@@ -161,7 +164,7 @@ ctest
1217
1218 ### Using custom seeds for the tests
1219
1220-Go to the `hipCUB/test/hipcub/test_seed.hpp` file.
1221+Go to the `projects/hipcub/test/hipcub/test_seed.hpp` file.
1222
1223 ```cpp
1224 //(1)
1225@@ -190,7 +193,7 @@ static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]);
1226
1227 ```shell
1228 # Go to hipCUB build directory
1229-cd hipCUB; cd build
1230+cd projects/hipcub; cd build
1231
1232 # To run benchmark for warp functions:
1233 # Further option can be found using --help
1234@@ -208,10 +211,67 @@ cd hipCUB; cd build
1235 ./benchmark/benchmark_device_<function_name> [--size <size>] [--trials <trials>]
1236 ```
1237
1238+## Building the documentation locally
1239+
1240+### Requirements
1241+
1242+#### Doxygen
1243+
1244+The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues.
1245+
1246+After you have downloaded Doxygen version 1.9.4:
1247+
1248+```shell
1249+# Add doxygen to your PATH
1250+echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc
1251+
1252+# Apply the updated .bashrc
1253+source ~/.bashrc
1254+
1255+# Confirm that you are using version 1.9.4
1256+doxygen --version
1257+```
1258+
1259+#### Python
1260+
1261+The build system uses Python version 3.10. You can try using a newer version, but that might cause issues.
1262+
1263+You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation):
1264+
1265+```shell
1266+# Install Python 3.10
1267+pyenv install 3.10
1268+
1269+# Create a Python 3.10 virtual environment
1270+pyenv virtualenv 3.10 venv_hipcub
1271+
1272+# Activate the virtual environment
1273+pyenv activate venv_hipcub
1274+```
1275+
1276+### Building
1277+
1278+After cloning this repository (see [obtaining the source code](#obtaining-the-source-code)):
1279+
1280+```shell
1281+cd rocm-libraries/projects/hipcub
1282+
1283+# Install Python dependencies
1284+python3 -m pip install -r docs/sphinx/requirements.txt
1285+
1286+# Build the documentation
1287+python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html
1288+```
1289+
1290+You can then open `docs/_build/html/index.html` in your browser to view the documentation.
1291+
1292 ## Support
1293
1294-Bugs and feature requests can be reported through the
1295-[GitHub issue tracker](https://github.com/ROCm/hipCUB/issues).
1296+You can report bugs and feature requests through the GitHub
1297+[issue tracker](https://github.com/ROCm/rocm-libraries/issues).
1298+To help ensure that your issue is seen by the right team more quickly, when creating your issue, please apply the label `project: hipcub`.
1299+Similarly, to filter the existing issue list down to only those affecting hipCUB, you can add the filter `label:"project: hipcub"`,
1300+or follow [this link](https://github.com/ROCm/rocm-libraries/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22project%3A%20hipcub%22).
1301
1302 ## Contributing
1303
1304diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
1305index 4a38bac..3e8f663 100644
1306--- a/benchmark/CMakeLists.txt
1307+++ b/benchmark/CMakeLists.txt
1308@@ -1,6 +1,6 @@
1309 # MIT License
1310 #
1311-# Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
1312+# Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
1313 #
1314 # Permission is hereby granted, free of charge, to any person obtaining a copy
1315 # of this software and associated documentation files (the "Software"), to deal
1316@@ -36,7 +36,7 @@ function(add_hipcub_benchmark BENCHMARK_SOURCE)
1317 hipcub
1318 )
1319 if((HIP_COMPILER STREQUAL "nvcc"))
1320- set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 14)
1321+ set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 17)
1322 set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE CUDA)
1323 target_compile_options(${BENCHMARK_TARGET}
1324 PRIVATE
1325@@ -88,6 +88,7 @@ add_hipcub_benchmark(benchmark_device_batch_memcpy.cpp)
1326 add_hipcub_benchmark(benchmark_device_for.cpp)
1327 add_hipcub_benchmark(benchmark_device_histogram.cpp)
1328 add_hipcub_benchmark(benchmark_device_memory.cpp)
1329+add_hipcub_benchmark(benchmark_device_merge.cpp)
1330 add_hipcub_benchmark(benchmark_device_merge_sort.cpp)
1331 add_hipcub_benchmark(benchmark_device_partition.cpp)
1332 add_hipcub_benchmark(benchmark_device_radix_sort.cpp)
1333diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp
1334index 1cc7079..7c7ac6b 100644
1335--- a/benchmark/benchmark_block_adjacent_difference.cpp
1336+++ b/benchmark/benchmark_block_adjacent_difference.cpp
1337@@ -23,10 +23,9 @@
1338 #include "common_benchmark_header.hpp"
1339
1340 // HIP API
1341-#include "hipcub/block/block_adjacent_difference.hpp"
1342-
1343-#include "hipcub/block/block_load.hpp"
1344-#include "hipcub/block/block_store.hpp"
1345+#include <hipcub/block/block_adjacent_difference.hpp>
1346+#include <hipcub/block/block_load.hpp>
1347+#include <hipcub/block/block_store.hpp>
1348
1349 #ifndef DEFAULT_N
1350 const size_t DEFAULT_N = 1024 * 1024 * 128;
1351diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp
1352index 24446c9..5e36160 100644
1353--- a/benchmark/benchmark_block_discontinuity.cpp
1354+++ b/benchmark/benchmark_block_discontinuity.cpp
1355@@ -20,14 +20,13 @@
1356 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1357 // SOFTWARE.
1358
1359-#include "common_benchmark_header.hpp"
1360-
1361 // HIP API
1362-#include "hipcub/block/block_discontinuity.hpp"
1363+#include <hipcub/block/block_discontinuity.hpp>
1364+#include <hipcub/block/block_load.hpp>
1365+#include <hipcub/block/block_store.hpp>
1366+#include <hipcub/thread/thread_operators.hpp> //to use hipcub::Equality
1367
1368-#include "hipcub/block/block_load.hpp"
1369-#include "hipcub/block/block_store.hpp"
1370-#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality
1371+#include "common_benchmark_header.hpp"
1372
1373 #ifndef DEFAULT_N
1374 const size_t DEFAULT_N = 1024 * 1024 * 128;
1375diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp
1376index a36d041..000cd41 100644
1377--- a/benchmark/benchmark_block_exchange.cpp
1378+++ b/benchmark/benchmark_block_exchange.cpp
1379@@ -23,9 +23,9 @@
1380 #include "common_benchmark_header.hpp"
1381
1382 // HIP API
1383-#include "hipcub/block/block_exchange.hpp"
1384-#include "hipcub/block/block_load.hpp"
1385-#include "hipcub/block/block_store.hpp"
1386+#include <hipcub/block/block_exchange.hpp>
1387+#include <hipcub/block/block_load.hpp>
1388+#include <hipcub/block/block_store.hpp>
1389
1390 #ifndef DEFAULT_N
1391 const size_t DEFAULT_N = 1024 * 1024 * 32;
1392diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp
1393index 122ccc3..1206e04 100644
1394--- a/benchmark/benchmark_block_histogram.cpp
1395+++ b/benchmark/benchmark_block_histogram.cpp
1396@@ -23,7 +23,7 @@
1397 #include "common_benchmark_header.hpp"
1398
1399 // HIP API
1400-#include "hipcub/block/block_histogram.hpp"
1401+#include <hipcub/block/block_histogram.hpp>
1402
1403 #ifndef DEFAULT_N
1404 const size_t DEFAULT_N = 1024 * 1024 * 128;
1405diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp
1406index 8316764..c8c7402 100644
1407--- a/benchmark/benchmark_block_merge_sort.cpp
1408+++ b/benchmark/benchmark_block_merge_sort.cpp
1409@@ -24,9 +24,9 @@
1410
1411 #include "../test/hipcub/test_utils_sort_comparator.hpp"
1412 // HIP API
1413-#include "hipcub/block/block_load.hpp"
1414-#include "hipcub/block/block_merge_sort.hpp"
1415-#include "hipcub/block/block_store.hpp"
1416+#include <hipcub/block/block_load.hpp>
1417+#include <hipcub/block/block_merge_sort.hpp>
1418+#include <hipcub/block/block_store.hpp>
1419
1420 #ifndef DEFAULT_N
1421 const size_t DEFAULT_N = 1024 * 1024 * 128;
1422diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp
1423index 2dac0a5..ffcd1d7 100644
1424--- a/benchmark/benchmark_block_radix_rank.cpp
1425+++ b/benchmark/benchmark_block_radix_rank.cpp
1426@@ -23,11 +23,11 @@
1427 #include "common_benchmark_header.hpp"
1428
1429 // HIP API
1430-#include "hipcub/block/block_load.hpp"
1431-#include "hipcub/block/block_radix_rank.hpp"
1432-#include "hipcub/block/block_store.hpp"
1433+#include <hipcub/block/block_load.hpp>
1434+#include <hipcub/block/block_radix_rank.hpp>
1435+#include <hipcub/block/block_store.hpp>
1436
1437-#include "hipcub/block/radix_rank_sort_operations.hpp"
1438+#include <hipcub/block/radix_rank_sort_operations.hpp>
1439
1440 #ifndef DEFAULT_N
1441 const size_t DEFAULT_N = 1024 * 1024 * 128;
1442diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp
1443index 0bae7b8..4b75c26 100644
1444--- a/benchmark/benchmark_block_radix_sort.cpp
1445+++ b/benchmark/benchmark_block_radix_sort.cpp
1446@@ -23,9 +23,9 @@
1447 #include "common_benchmark_header.hpp"
1448
1449 // HIP API
1450-#include "hipcub/block/block_load.hpp"
1451-#include "hipcub/block/block_radix_sort.hpp"
1452-#include "hipcub/block/block_store.hpp"
1453+#include <hipcub/block/block_load.hpp>
1454+#include <hipcub/block/block_radix_sort.hpp>
1455+#include <hipcub/block/block_store.hpp>
1456
1457 #ifndef DEFAULT_N
1458 const size_t DEFAULT_N = 1024 * 1024 * 128;
1459diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp
1460index bdb089e..fe4b815 100644
1461--- a/benchmark/benchmark_block_reduce.cpp
1462+++ b/benchmark/benchmark_block_reduce.cpp
1463@@ -23,8 +23,8 @@
1464 #include "common_benchmark_header.hpp"
1465
1466 // HIP API
1467-#include "hipcub/block/block_reduce.hpp"
1468-#include "hipcub/thread/thread_operators.hpp"
1469+#include <hipcub/block/block_reduce.hpp>
1470+#include <hipcub/thread/thread_operators.hpp>
1471
1472 #ifndef DEFAULT_N
1473 const size_t DEFAULT_N = 1024 * 1024 * 32;
1474diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp
1475index 6769fd4..a42d3c4 100644
1476--- a/benchmark/benchmark_block_run_length_decode.cpp
1477+++ b/benchmark/benchmark_block_run_length_decode.cpp
1478@@ -22,9 +22,9 @@
1479
1480 #include "common_benchmark_header.hpp"
1481
1482-#include "hipcub/block/block_load.hpp"
1483-#include "hipcub/block/block_run_length_decode.hpp"
1484-#include "hipcub/block/block_store.hpp"
1485+#include <hipcub/block/block_load.hpp>
1486+#include <hipcub/block/block_run_length_decode.hpp>
1487+#include <hipcub/block/block_store.hpp>
1488
1489 #ifndef DEFAULT_N
1490 const size_t DEFAULT_N = 1024 * 1024 * 32;
1491diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp
1492index 340d3b4..51bf6c6 100644
1493--- a/benchmark/benchmark_block_scan.cpp
1494+++ b/benchmark/benchmark_block_scan.cpp
1495@@ -23,7 +23,7 @@
1496 #include "common_benchmark_header.hpp"
1497
1498 // hipCUB API
1499-#include "hipcub/block/block_scan.hpp"
1500+#include <hipcub/block/block_scan.hpp>
1501
1502 #ifndef DEFAULT_N
1503 const size_t DEFAULT_N = 1024 * 1024 * 32;
1504diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp
1505index 4ba9fb0..697d381 100644
1506--- a/benchmark/benchmark_block_shuffle.cpp
1507+++ b/benchmark/benchmark_block_shuffle.cpp
1508@@ -22,7 +22,7 @@
1509
1510 #include "common_benchmark_header.hpp"
1511
1512-#include "hipcub/block/block_shuffle.hpp"
1513+#include <hipcub/block/block_shuffle.hpp>
1514
1515 #ifndef DEFAULT_N
1516 const size_t DEFAULT_N = 1024 * 1024 * 32;
1517diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp
1518index feca312..909c50b 100644
1519--- a/benchmark/benchmark_device_batch_copy.cpp
1520+++ b/benchmark/benchmark_device_batch_copy.cpp
1521@@ -24,10 +24,10 @@
1522 #include "cmdparser.hpp"
1523 #include "common_benchmark_header.hpp"
1524
1525-#include "hipcub/block/block_load.hpp"
1526-#include "hipcub/block/block_store.hpp"
1527-#include "hipcub/device/device_copy.hpp"
1528-#include "hipcub/hipcub.hpp"
1529+#include <hipcub/block/block_load.hpp>
1530+#include <hipcub/block/block_store.hpp>
1531+#include <hipcub/device/device_copy.hpp>
1532+#include <hipcub/hipcub.hpp>
1533
1534 #include <hip/hip_runtime.h>
1535
1536diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp
1537index f0f38be..8de4f8e 100644
1538--- a/benchmark/benchmark_device_batch_memcpy.cpp
1539+++ b/benchmark/benchmark_device_batch_memcpy.cpp
1540@@ -24,10 +24,10 @@
1541 #include "cmdparser.hpp"
1542 #include "common_benchmark_header.hpp"
1543
1544-#include "hipcub/block/block_load.hpp"
1545-#include "hipcub/block/block_store.hpp"
1546-#include "hipcub/device/device_memcpy.hpp"
1547-#include "hipcub/hipcub.hpp"
1548+#include <hipcub/block/block_load.hpp>
1549+#include <hipcub/block/block_store.hpp>
1550+#include <hipcub/device/device_memcpy.hpp>
1551+#include <hipcub/hipcub.hpp>
1552
1553 #ifdef __HIP_PLATFORM_AMD__
1554 // Only include this on AMD as it contains specialized config information
1555@@ -37,6 +37,7 @@
1556 #include <hip/hip_runtime.h>
1557
1558 #include <iostream>
1559+#include <memory>
1560 #include <numeric>
1561 #include <random>
1562 #include <utility>
1563diff --git a/benchmark/benchmark_device_for.cpp b/benchmark/benchmark_device_for.cpp
1564index a5669ed..4b7a7d7 100644
1565--- a/benchmark/benchmark_device_for.cpp
1566+++ b/benchmark/benchmark_device_for.cpp
1567@@ -1,6 +1,6 @@
1568 // MIT License
1569 //
1570-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
1571+// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
1572 //
1573 // Permission is hereby granted, free of charge, to any person obtaining a copy
1574 // of this software and associated documentation files (the "Software"), to deal
1575@@ -26,7 +26,7 @@
1576 #include "common_benchmark_header.hpp"
1577
1578 // HIP API
1579-#include "hipcub/device/device_for.hpp"
1580+#include <hipcub/device/device_for.hpp>
1581
1582 #ifndef DEFAULT_N
1583 const size_t DEFAULT_N = 1024 * 1024 * 32;
1584@@ -71,7 +71,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
1585 // Warm-up
1586 for(size_t i = 0; i < warmup_size; i++)
1587 {
1588- HIP_CHECK(hipcub::ForEach(d_input, d_input + size, device_op, stream));
1589+ HIP_CHECK(hipcub::DeviceFor::ForEach(d_input, d_input + size, device_op, stream));
1590 }
1591 HIP_CHECK(hipDeviceSynchronize());
1592
1593@@ -81,7 +81,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
1594
1595 for(size_t i = 0; i < batch_size; i++)
1596 {
1597- HIP_CHECK(hipcub::ForEach(d_input, d_input + size, device_op, stream));
1598+ HIP_CHECK(hipcub::DeviceFor::ForEach(d_input, d_input + size, device_op, stream));
1599 }
1600 HIP_CHECK(hipStreamSynchronize(stream));
1601
1602diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp
1603index a5019e4..ded31e2 100644
1604--- a/benchmark/benchmark_device_histogram.cpp
1605+++ b/benchmark/benchmark_device_histogram.cpp
1606@@ -29,8 +29,8 @@
1607 #include "common_benchmark_header.hpp"
1608
1609 // HIP API
1610-#include "hipcub/device/device_histogram.hpp"
1611-#include "hipcub/iterator/transform_input_iterator.hpp"
1612+#include <hipcub/device/device_histogram.hpp>
1613+#include <hipcub/iterator/transform_input_iterator.hpp>
1614
1615 #ifndef DEFAULT_N
1616 const size_t DEFAULT_N = 1024 * 1024 * 32;
1617diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp
1618index bb256ac..1e62167 100644
1619--- a/benchmark/benchmark_device_memory.cpp
1620+++ b/benchmark/benchmark_device_memory.cpp
1621@@ -1,6 +1,6 @@
1622 // MIT License
1623 //
1624-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
1625+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
1626 //
1627 // Permission is hereby granted, free of charge, to any person obtaining a copy
1628 // of this software and associated documentation files (the "Software"), to deal
1629@@ -22,9 +22,9 @@
1630
1631 #include "common_benchmark_header.hpp"
1632
1633-#include "hipcub/block/block_load.hpp"
1634-#include "hipcub/block/block_scan.hpp"
1635-#include "hipcub/block/block_store.hpp"
1636+#include <hipcub/block/block_load.hpp>
1637+#include <hipcub/block/block_scan.hpp>
1638+#include <hipcub/block/block_store.hpp>
1639
1640 enum memory_operation_method
1641 {
1642@@ -58,9 +58,10 @@ struct operation;
1643 template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize>
1644 struct operation<no_operation, T, ItemsPerThread, BlockSize>
1645 {
1646- typedef empty_storage_type storage_type;
1647+ using storage_type = empty_storage_type;
1648
1649- HIPCUB_DEVICE inline void
1650+ HIPCUB_DEVICE
1651+ inline void
1652 operator()(storage_type& /*storage*/, T (&)[ItemsPerThread], T* = nullptr) const
1653 {}
1654 };
1655@@ -69,11 +70,13 @@ struct operation<no_operation, T, ItemsPerThread, BlockSize>
1656 template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize>
1657 struct operation<custom_operation, T, ItemsPerThread, BlockSize>
1658 {
1659- typedef empty_storage_type storage_type;
1660+ using storage_type = empty_storage_type;
1661
1662- HIPCUB_DEVICE inline void operator()(storage_type& storage,
1663- T (&input)[ItemsPerThread],
1664- T* global_mem_output = nullptr) const
1665+ HIPCUB_DEVICE
1666+ inline void
1667+ operator()(storage_type& storage,
1668+ T (&input)[ItemsPerThread],
1669+ T* global_mem_output = nullptr) const
1670 {
1671 (void)storage;
1672 (void)global_mem_output;
1673@@ -96,14 +99,15 @@ struct operation<custom_operation, T, ItemsPerThread, BlockSize>
1674 template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize>
1675 struct operation<block_scan, T, ItemsPerThread, BlockSize>
1676 {
1677- typedef
1678- typename hipcub::BlockScan<T, BlockSize, hipcub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS>
1679- block_scan_type;
1680- typedef typename block_scan_type::TempStorage storage_type;
1681-
1682- HIPCUB_DEVICE inline void operator()(storage_type& storage,
1683- T (&input)[ItemsPerThread],
1684- T* global_mem_output = nullptr)
1685+ using block_scan_type =
1686+ typename hipcub::BlockScan<T, BlockSize, hipcub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS>;
1687+ using storage_type = typename block_scan_type::TempStorage;
1688+
1689+ HIPCUB_DEVICE
1690+ inline void
1691+ operator()(storage_type& storage,
1692+ T (&input)[ItemsPerThread],
1693+ T* global_mem_output = nullptr)
1694 {
1695 (void)global_mem_output;
1696
1697@@ -117,11 +121,13 @@ struct operation<block_scan, T, ItemsPerThread, BlockSize>
1698 template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize>
1699 struct operation<atomics_no_collision, T, ItemsPerThread, BlockSize>
1700 {
1701- typedef empty_storage_type storage_type;
1702+ using storage_type = empty_storage_type;
1703
1704- HIPCUB_DEVICE inline void operator()(storage_type& storage,
1705- T (&input)[ItemsPerThread],
1706- T* global_mem_output = nullptr)
1707+ HIPCUB_DEVICE
1708+ inline void
1709+ operator()(storage_type& storage,
1710+ T (&input)[ItemsPerThread],
1711+ T* global_mem_output = nullptr)
1712 {
1713 (void)storage;
1714 (void)input;
1715@@ -140,11 +146,13 @@ struct operation<atomics_no_collision, T, ItemsPerThread, BlockSize>
1716 template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize>
1717 struct operation<atomics_inter_warp_collision, T, ItemsPerThread, BlockSize>
1718 {
1719- typedef empty_storage_type storage_type;
1720+ using storage_type = empty_storage_type;
1721
1722- HIPCUB_DEVICE inline void operator()(storage_type& storage,
1723- T (&input)[ItemsPerThread],
1724- T* global_mem_output = nullptr)
1725+ HIPCUB_DEVICE
1726+ inline void
1727+ operator()(storage_type& storage,
1728+ T (&input)[ItemsPerThread],
1729+ T* global_mem_output = nullptr)
1730 {
1731 (void)storage;
1732 (void)input;
1733@@ -163,11 +171,13 @@ struct operation<atomics_inter_warp_collision, T, ItemsPerThread, BlockSize>
1734 template<typename T, unsigned int ItemsPerThread, unsigned int BlockSize>
1735 struct operation<atomics_inter_block_collision, T, ItemsPerThread, BlockSize>
1736 {
1737- typedef empty_storage_type storage_type;
1738+ using storage_type = empty_storage_type;
1739
1740- HIPCUB_DEVICE inline void operator()(storage_type& storage,
1741- T (&input)[ItemsPerThread],
1742- T* global_mem_output = nullptr)
1743+ HIPCUB_DEVICE
1744+ inline void
1745+ operator()(storage_type& storage,
1746+ T (&input)[ItemsPerThread],
1747+ T* global_mem_output = nullptr)
1748 {
1749 (void)storage;
1750 (void)input;
1751@@ -237,9 +247,9 @@ template<typename T,
1752 typename CustomOp>
1753 __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op)
1754 {
1755- typedef memory_operation<MemOp> mem_op;
1756- typedef hipcub::BlockLoad<T, BlockSize, ItemsPerThread, mem_op::load_type> load_type;
1757- typedef hipcub::BlockStore<T, BlockSize, ItemsPerThread, mem_op::store_type> store_type;
1758+ using mem_op = memory_operation<MemOp>;
1759+ using load_type = hipcub::BlockLoad<T, BlockSize, ItemsPerThread, mem_op::load_type>;
1760+ using store_type = hipcub::BlockStore<T, BlockSize, ItemsPerThread, mem_op::store_type>;
1761
1762 __shared__ union
1763 {
1764diff --git a/benchmark/benchmark_device_merge.cpp b/benchmark/benchmark_device_merge.cpp
1765new file mode 100644
1766index 0000000..e22d6ea
1767--- /dev/null
1768+++ b/benchmark/benchmark_device_merge.cpp
1769@@ -0,0 +1,364 @@
1770+// MIT License
1771+//
1772+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
1773+//
1774+// Permission is hereby granted, free of charge, to any person obtaining a copy
1775+// of this software and associated documentation files (the "Software"), to deal
1776+// in the Software without restriction, including without limitation the rights
1777+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1778+// copies of the Software, and to permit persons to whom the Software is
1779+// furnished to do so, subject to the following conditions:
1780+//
1781+// The above copyright notice and this permission notice shall be included in
1782+// all copies or substantial portions of the Software.
1783+//
1784+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1785+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1786+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1787+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1788+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1789+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1790+// SOFTWARE.
1791+
1792+#include "common_benchmark_header.hpp"
1793+
1794+// HIP API
1795+#include <hipcub/device/device_merge.hpp>
1796+
1797+#ifndef DEFAULT_N
1798+const size_t DEFAULT_N = 1024 * 1024 * 32;
1799+#endif
1800+
1801+const unsigned int batch_size = 10;
1802+const unsigned int warmup_size = 5;
1803+
1804+template<class key_type>
1805+struct CompareFunction
1806+{
1807+ HIPCUB_HOST_DEVICE
1808+ inline constexpr bool
1809+ operator()(const key_type& a, const key_type& b)
1810+ {
1811+ return a < b;
1812+ }
1813+};
1814+
1815+template<class Key>
1816+void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
1817+{
1818+ using key_type = Key;
1819+
1820+ CompareFunction<key_type> compare_function;
1821+
1822+ const size_t size1 = size / 2;
1823+ const size_t size2 = size - size1;
1824+
1825+ std::vector<key_type> keys_input1 = benchmark_utils::get_random_data<key_type>(
1826+ size1,
1827+ benchmark_utils::generate_limits<key_type>::min(),
1828+ benchmark_utils::generate_limits<key_type>::max());
1829+
1830+ std::vector<key_type> keys_input2 = benchmark_utils::get_random_data<key_type>(
1831+ size2,
1832+ benchmark_utils::generate_limits<key_type>::min(),
1833+ benchmark_utils::generate_limits<key_type>::max());
1834+
1835+ std::sort(keys_input1.begin(), keys_input1.end(), compare_function);
1836+ std::sort(keys_input2.begin(), keys_input2.end(), compare_function);
1837+
1838+ key_type* d_keys_input1;
1839+ HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type)));
1840+ HIP_CHECK(hipMemcpy(d_keys_input1,
1841+ keys_input1.data(),
1842+ size1 * sizeof(key_type),
1843+ hipMemcpyHostToDevice));
1844+
1845+ key_type* d_keys_input2;
1846+ HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type)));
1847+ HIP_CHECK(hipMemcpy(d_keys_input2,
1848+ keys_input2.data(),
1849+ size2 * sizeof(key_type),
1850+ hipMemcpyHostToDevice));
1851+
1852+ key_type* d_keys_output;
1853+ HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
1854+
1855+ void* d_temporary_storage = nullptr;
1856+ size_t temporary_storage_bytes = 0;
1857+ HIP_CHECK(hipcub::DeviceMerge::MergeKeys(d_temporary_storage,
1858+ temporary_storage_bytes,
1859+ d_keys_input1,
1860+ size1,
1861+ d_keys_input2,
1862+ size2,
1863+ d_keys_output,
1864+ compare_function,
1865+ stream));
1866+
1867+ HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
1868+
1869+ // Warm-up
1870+ for(size_t i = 0; i < warmup_size; i++)
1871+ {
1872+ HIP_CHECK(hipcub::DeviceMerge::MergeKeys(d_temporary_storage,
1873+ temporary_storage_bytes,
1874+ d_keys_input1,
1875+ size1,
1876+ d_keys_input2,
1877+ size2,
1878+ d_keys_output,
1879+ compare_function,
1880+ stream));
1881+ }
1882+ HIP_CHECK(hipDeviceSynchronize());
1883+
1884+ for(auto _ : state)
1885+ {
1886+ auto start = std::chrono::high_resolution_clock::now();
1887+
1888+ for(size_t i = 0; i < batch_size; i++)
1889+ {
1890+ HIP_CHECK(hipcub::DeviceMerge::MergeKeys(d_temporary_storage,
1891+ temporary_storage_bytes,
1892+ d_keys_input1,
1893+ size1,
1894+ d_keys_input2,
1895+ size2,
1896+ d_keys_output,
1897+ compare_function,
1898+ stream));
1899+ }
1900+ HIP_CHECK(hipDeviceSynchronize());
1901+
1902+ auto end = std::chrono::high_resolution_clock::now();
1903+ auto elapsed_seconds
1904+ = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
1905+ state.SetIterationTime(elapsed_seconds.count());
1906+ }
1907+ state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
1908+ state.SetItemsProcessed(state.iterations() * batch_size * size);
1909+
1910+ HIP_CHECK(hipFree(d_temporary_storage));
1911+ HIP_CHECK(hipFree(d_keys_input1));
1912+ HIP_CHECK(hipFree(d_keys_input2));
1913+ HIP_CHECK(hipFree(d_keys_output));
1914+}
1915+
1916+template<class Key, class Value>
1917+void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
1918+{
1919+ using key_type = Key;
1920+ using value_type = Value;
1921+
1922+ CompareFunction<key_type> compare_function;
1923+
1924+ const size_t size1 = size / 2;
1925+ const size_t size2 = size - size1;
1926+
1927+ std::vector<key_type> keys_input1 = benchmark_utils::get_random_data<key_type>(
1928+ size1,
1929+ benchmark_utils::generate_limits<key_type>::min(),
1930+ benchmark_utils::generate_limits<key_type>::max());
1931+ std::vector<key_type> keys_input2 = benchmark_utils::get_random_data<key_type>(
1932+ size2,
1933+ benchmark_utils::generate_limits<key_type>::min(),
1934+ benchmark_utils::generate_limits<key_type>::max());
1935+
1936+ std::sort(keys_input1.begin(), keys_input1.end(), compare_function);
1937+ std::sort(keys_input2.begin(), keys_input2.end(), compare_function);
1938+
1939+ key_type* d_keys_input1;
1940+ HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type)));
1941+ HIP_CHECK(hipMemcpy(d_keys_input1,
1942+ keys_input1.data(),
1943+ size1 * sizeof(key_type),
1944+ hipMemcpyHostToDevice));
1945+
1946+ key_type* d_keys_input2;
1947+ HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type)));
1948+ HIP_CHECK(hipMemcpy(d_keys_input2,
1949+ keys_input2.data(),
1950+ size2 * sizeof(key_type),
1951+ hipMemcpyHostToDevice));
1952+
1953+ key_type* d_keys_output;
1954+ HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
1955+
1956+ std::vector<value_type> values_input1(size1);
1957+ std::iota(values_input1.begin(), values_input1.end(), 0);
1958+ value_type* d_values_input1;
1959+ HIP_CHECK(hipMalloc(&d_values_input1, size1 * sizeof(value_type)));
1960+ HIP_CHECK(hipMemcpy(d_values_input1,
1961+ values_input1.data(),
1962+ size1 * sizeof(value_type),
1963+ hipMemcpyHostToDevice));
1964+
1965+ std::vector<value_type> values_input2(size2);
1966+ std::iota(values_input2.begin(), values_input2.end(), size1);
1967+ value_type* d_values_input2;
1968+ HIP_CHECK(hipMalloc(&d_values_input2, size2 * sizeof(value_type)));
1969+ HIP_CHECK(hipMemcpy(d_values_input2,
1970+ values_input2.data(),
1971+ size2 * sizeof(value_type),
1972+ hipMemcpyHostToDevice));
1973+
1974+ value_type* d_values_output;
1975+ HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type)));
1976+
1977+ void* d_temporary_storage = nullptr;
1978+ size_t temporary_storage_bytes = 0;
1979+ HIP_CHECK(hipcub::DeviceMerge::MergePairs(d_temporary_storage,
1980+ temporary_storage_bytes,
1981+ d_keys_input1,
1982+ d_values_input1,
1983+ size1,
1984+ d_keys_input2,
1985+ d_values_input2,
1986+ size2,
1987+ d_keys_output,
1988+ d_values_output,
1989+ compare_function,
1990+ stream));
1991+
1992+ HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
1993+
1994+ // Warm-up
1995+ for(size_t i = 0; i < warmup_size; i++)
1996+ {
1997+ HIP_CHECK(hipcub::DeviceMerge::MergePairs(d_temporary_storage,
1998+ temporary_storage_bytes,
1999+ d_keys_input1,
2000+ d_values_input1,
2001+ size1,
2002+ d_keys_input2,
2003+ d_values_input2,
2004+ size2,
2005+ d_keys_output,
2006+ d_values_output,
2007+ compare_function,
2008+ stream));
2009+ }
2010+ HIP_CHECK(hipDeviceSynchronize());
2011+
2012+ for(auto _ : state)
2013+ {
2014+ auto start = std::chrono::high_resolution_clock::now();
2015+
2016+ for(size_t i = 0; i < batch_size; i++)
2017+ {
2018+ HIP_CHECK(hipcub::DeviceMerge::MergePairs(d_temporary_storage,
2019+ temporary_storage_bytes,
2020+ d_keys_input1,
2021+ d_values_input1,
2022+ size1,
2023+ d_keys_input2,
2024+ d_values_input2,
2025+ size2,
2026+ d_keys_output,
2027+ d_values_output,
2028+ compare_function,
2029+ stream));
2030+ }
2031+ HIP_CHECK(hipDeviceSynchronize());
2032+
2033+ auto end = std::chrono::high_resolution_clock::now();
2034+ auto elapsed_seconds
2035+ = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
2036+ state.SetIterationTime(elapsed_seconds.count());
2037+ }
2038+ state.SetBytesProcessed(state.iterations() * batch_size * size
2039+ * (sizeof(key_type) + sizeof(value_type)));
2040+ state.SetItemsProcessed(state.iterations() * batch_size * size);
2041+
2042+ HIP_CHECK(hipFree(d_temporary_storage));
2043+ HIP_CHECK(hipFree(d_keys_input1));
2044+ HIP_CHECK(hipFree(d_keys_input2));
2045+ HIP_CHECK(hipFree(d_keys_output));
2046+ HIP_CHECK(hipFree(d_values_input1));
2047+ HIP_CHECK(hipFree(d_values_input2));
2048+ HIP_CHECK(hipFree(d_values_output));
2049+}
2050+
2051+#define CREATE_MERGE_KEYS_BENCHMARK(T) \
2052+ benchmarks.push_back(benchmark::RegisterBenchmark( \
2053+ std::string("device_merge_keys" \
2054+ "<key_data_type:" #T ">.") \
2055+ .c_str(), \
2056+ [=](benchmark::State& state) { run_merge_keys_benchmark<T>(state, stream, size); }));
2057+
2058+#define CREATE_MERGE_PAIRS_BENCHMARK(T, V) /* registers run_merge_pairs_benchmark<T, V> under a name listing key and value types */ \
2059+ benchmarks.push_back(benchmark::RegisterBenchmark( \
2060+ std::string("device_merge_pairs<" \
2061+ "key_data_type:" #T ",value_data_type:" #V ">.") \
2062+ .c_str(), \
2063+ [=](benchmark::State& state) { run_merge_pairs_benchmark<T, V>(state, stream, size); }));
2064+
2065+int main(int argc, char* argv[])
2066+{
2067+ cli::Parser parser(argc, argv);
2068+ parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
2069+ parser.set_optional<int>("trials", "trials", -1, "number of iterations");
2070+ parser.run_and_exit_if_error();
2071+
2072+ // Parse argv
2073+ benchmark::Initialize(&argc, argv);
2074+ const size_t size = parser.get<size_t>("size");
2075+ const int trials = parser.get<int>("trials");
2076+
2077+ // HIP
2078+ hipStream_t stream = 0; // default
2079+ hipDeviceProp_t devProp;
2080+ int device_id = 0;
2081+ HIP_CHECK(hipGetDevice(&device_id));
2082+ HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
2083+
2084+ std::cout << "benchmark_device_merge" << std::endl;
2085+ std::cout << "[HIP] Device name: " << devProp.name << std::endl;
2086+
2087+ // Add benchmarks
2088+ std::vector<benchmark::internal::Benchmark*> benchmarks;
2089+
2090+ using custom_float2 = benchmark_utils::custom_type<float, float>;
2091+ using custom_double2 = benchmark_utils::custom_type<double, double>;
2092+ using custom_char_double = benchmark_utils::custom_type<char, double>;
2093+ using custom_double_char = benchmark_utils::custom_type<double, char>;
2094+
2095+ CREATE_MERGE_KEYS_BENCHMARK(int)
2096+ CREATE_MERGE_KEYS_BENCHMARK(long long)
2097+ CREATE_MERGE_KEYS_BENCHMARK(int8_t)
2098+ CREATE_MERGE_KEYS_BENCHMARK(uint8_t)
2099+ CREATE_MERGE_KEYS_BENCHMARK(short)
2100+ CREATE_MERGE_KEYS_BENCHMARK(double)
2101+ CREATE_MERGE_KEYS_BENCHMARK(float)
2102+ CREATE_MERGE_KEYS_BENCHMARK(custom_float2)
2103+ CREATE_MERGE_KEYS_BENCHMARK(custom_double2)
2104+
2105+ CREATE_MERGE_PAIRS_BENCHMARK(int, int)
2106+ CREATE_MERGE_PAIRS_BENCHMARK(long long, long long)
2107+ CREATE_MERGE_PAIRS_BENCHMARK(int8_t, int8_t)
2108+ CREATE_MERGE_PAIRS_BENCHMARK(uint8_t, uint8_t)
2109+ CREATE_MERGE_PAIRS_BENCHMARK(short, short)
2110+ CREATE_MERGE_PAIRS_BENCHMARK(custom_char_double, custom_char_double)
2111+ CREATE_MERGE_PAIRS_BENCHMARK(int, custom_double_char)
2112+ CREATE_MERGE_PAIRS_BENCHMARK(custom_double2, custom_double2)
2113+
2114+ // Use manual timing
2115+ for(auto& b : benchmarks)
2116+ {
2117+ b->UseManualTime();
2118+ b->Unit(benchmark::kMillisecond);
2119+ }
2120+
2121+ // Force number of iterations
2122+ if(trials > 0)
2123+ {
2124+ for(auto& b : benchmarks)
2125+ {
2126+ b->Iterations(trials);
2127+ }
2128+ }
2129+
2130+ // Run benchmarks
2131+ benchmark::RunSpecifiedBenchmarks();
2132+ return 0;
2133+}
2134diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp
2135index fc64fc0..284892c 100644
2136--- a/benchmark/benchmark_device_merge_sort.cpp
2137+++ b/benchmark/benchmark_device_merge_sort.cpp
2138@@ -23,8 +23,8 @@
2139 #include "common_benchmark_header.hpp"
2140
2141 // HIP API
2142-#include "hipcub/device/device_merge_sort.hpp"
2143-#include "hipcub/hipcub.hpp"
2144+#include <hipcub/device/device_merge_sort.hpp>
2145+#include <hipcub/hipcub.hpp>
2146
2147 #ifndef DEFAULT_N
2148 const size_t DEFAULT_N = 32 << 20;
2149diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp
2150index 786fe13..f925e42 100644
2151--- a/benchmark/benchmark_device_partition.cpp
2152+++ b/benchmark/benchmark_device_partition.cpp
2153@@ -23,7 +23,7 @@
2154 #include "common_benchmark_header.hpp"
2155
2156 // HIP API
2157-#include "hipcub/device/device_partition.hpp"
2158+#include <hipcub/device/device_partition.hpp>
2159
2160 #include <chrono>
2161 #include <vector>
2162diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp
2163index c65abd8..2e40b3b 100644
2164--- a/benchmark/benchmark_device_radix_sort.cpp
2165+++ b/benchmark/benchmark_device_radix_sort.cpp
2166@@ -26,7 +26,7 @@
2167 #include <type_traits>
2168
2169 // HIP API
2170-#include "hipcub/device/device_radix_sort.hpp"
2171+#include <hipcub/device/device_radix_sort.hpp>
2172
2173 #ifndef DEFAULT_N
2174 const size_t DEFAULT_N = 1024 * 1024 * 32;
2175diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp
2176index 2a4d9df..8dcd968 100644
2177--- a/benchmark/benchmark_device_reduce.cpp
2178+++ b/benchmark/benchmark_device_reduce.cpp
2179@@ -21,9 +21,10 @@
2180 // SOFTWARE.
2181
2182 #include "common_benchmark_header.hpp"
2183+#include "hipcub/config.hpp"
2184
2185 // HIP API
2186-#include "hipcub/device/device_reduce.hpp"
2187+#include <hipcub/device/device_reduce.hpp>
2188
2189 #ifndef DEFAULT_N
2190 const size_t DEFAULT_N = 1024 * 1024 * 128;
2191@@ -118,8 +119,10 @@ struct Benchmark<T, hipcub::ArgMin>
2192
2193 static void run(benchmark::State& state, size_t size, const hipStream_t stream)
2194 {
2195+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH
2196 hipError_t (*ptr_to_argmin)(void*, size_t&, T*, KeyValue*, int, hipStream_t)
2197- = &hipcub::DeviceReduce::ArgMin;
2198+ = &hipcub::DeviceReduce::ArgMin;
2199+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP
2200 run_benchmark<T, KeyValue>(state, size, stream, ptr_to_argmin);
2201 }
2202 };
2203diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp
2204index 54209e6..0d9160f 100644
2205--- a/benchmark/benchmark_device_reduce_by_key.cpp
2206+++ b/benchmark/benchmark_device_reduce_by_key.cpp
2207@@ -29,7 +29,7 @@
2208 #include "common_benchmark_header.hpp"
2209
2210 // HIP API
2211-#include "hipcub/device/device_reduce.hpp"
2212+#include <hipcub/device/device_reduce.hpp>
2213
2214 #ifndef DEFAULT_N
2215 const size_t DEFAULT_N = 1024 * 1024 * 32;
2216diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp
2217index b7ef64b..f0c8585 100644
2218--- a/benchmark/benchmark_device_run_length_encode.cpp
2219+++ b/benchmark/benchmark_device_run_length_encode.cpp
2220@@ -29,7 +29,7 @@
2221 #include "common_benchmark_header.hpp"
2222
2223 // HIP API
2224-#include "hipcub/device/device_run_length_encode.hpp"
2225+#include <hipcub/device/device_run_length_encode.hpp>
2226
2227 #ifndef DEFAULT_N
2228 const size_t DEFAULT_N = 1024 * 1024 * 32;
2229diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp
2230index dbfdda6..5d38b96 100644
2231--- a/benchmark/benchmark_device_scan.cpp
2232+++ b/benchmark/benchmark_device_scan.cpp
2233@@ -29,7 +29,7 @@
2234 #include "common_benchmark_header.hpp"
2235
2236 // HIP API
2237-#include "hipcub/device/device_scan.hpp"
2238+#include <hipcub/device/device_scan.hpp>
2239
2240 #ifndef DEFAULT_N
2241 const size_t DEFAULT_N = 1024 * 1024 * 32;
2242diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp
2243index 05566d6..252e8ff 100644
2244--- a/benchmark/benchmark_device_segmented_radix_sort.cpp
2245+++ b/benchmark/benchmark_device_segmented_radix_sort.cpp
2246@@ -1,6 +1,6 @@
2247 // MIT License
2248 //
2249-// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2250+// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
2251 //
2252 // Permission is hereby granted, free of charge, to any person obtaining a copy
2253 // of this software and associated documentation files (the "Software"), to deal
2254@@ -23,7 +23,7 @@
2255 #include "common_benchmark_header.hpp"
2256
2257 // HIP API
2258-#include "hipcub/hipcub.hpp"
2259+#include <hipcub/hipcub.hpp>
2260
2261 #ifndef DEFAULT_N
2262 const size_t DEFAULT_N = 1024 * 1024 * 32;
2263@@ -44,17 +44,17 @@ void run_sort_keys_benchmark(benchmark::State& state,
2264 {
2265 using offset_type = int;
2266 using key_type = Key;
2267- typedef hipError_t (*sort_func)(void*,
2268- size_t&,
2269- const key_type*,
2270- key_type*,
2271- int,
2272- int,
2273- offset_type*,
2274- offset_type*,
2275- int,
2276- int,
2277- hipStream_t);
2278+ using sort_func = hipError_t (*)(void*,
2279+ size_t&,
2280+ const key_type*,
2281+ key_type*,
2282+ int,
2283+ int,
2284+ offset_type*,
2285+ offset_type*,
2286+ int,
2287+ int,
2288+ hipStream_t);
2289
2290 sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys<key_type, offset_type*>;
2291 sort_func func_descending
2292@@ -180,19 +180,19 @@ void run_sort_pairs_benchmark(benchmark::State& state,
2293 using offset_type = int;
2294 using key_type = Key;
2295 using value_type = Value;
2296- typedef hipError_t (*sort_func)(void*,
2297- size_t&,
2298- const key_type*,
2299- key_type*,
2300- const value_type*,
2301- value_type*,
2302- int,
2303- int,
2304- offset_type*,
2305- offset_type*,
2306- int,
2307- int,
2308- hipStream_t);
2309+ using sort_func = hipError_t (*)(void*,
2310+ size_t&,
2311+ const key_type*,
2312+ key_type*,
2313+ const value_type*,
2314+ value_type*,
2315+ int,
2316+ int,
2317+ offset_type*,
2318+ offset_type*,
2319+ int,
2320+ int,
2321+ hipStream_t);
2322
2323 sort_func func_ascending
2324 = &hipcub::DeviceSegmentedRadixSort::SortPairs<key_type, value_type, offset_type*>;
2325diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp
2326index d1e40c6..1bf1316 100644
2327--- a/benchmark/benchmark_device_segmented_reduce.cpp
2328+++ b/benchmark/benchmark_device_segmented_reduce.cpp
2329@@ -23,7 +23,7 @@
2330 #include "common_benchmark_header.hpp"
2331
2332 // HIP API
2333-#include "hipcub/device/device_segmented_reduce.hpp"
2334+#include <hipcub/device/device_segmented_reduce.hpp>
2335
2336 #ifndef DEFAULT_N
2337 const size_t DEFAULT_N = 1024 * 1024 * 32;
2338diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp
2339index e9bbaf3..db69075 100644
2340--- a/benchmark/benchmark_device_segmented_sort.cpp
2341+++ b/benchmark/benchmark_device_segmented_sort.cpp
2342@@ -1,6 +1,6 @@
2343 // MIT License
2344 //
2345-// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2346+// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
2347 //
2348 // Permission is hereby granted, free of charge, to any person obtaining a copy
2349 // of this software and associated documentation files (the "Software"), to deal
2350@@ -23,7 +23,7 @@
2351 #include "common_benchmark_header.hpp"
2352
2353 // HIP API
2354-#include "hipcub/hipcub.hpp"
2355+#include <hipcub/hipcub.hpp>
2356
2357 #ifndef DEFAULT_N
2358 const size_t DEFAULT_N = 1024 * 1024 * 32;
2359@@ -42,15 +42,15 @@ void run_sort_keys_benchmark(benchmark::State& state,
2360 {
2361 using offset_type = int;
2362 using key_type = Key;
2363- typedef hipError_t (*sort_func)(void*,
2364- size_t&,
2365- const key_type*,
2366- key_type*,
2367- int,
2368- int,
2369- offset_type*,
2370- offset_type*,
2371- hipStream_t);
2372+ using sort_func = hipError_t (*)(void*,
2373+ size_t&,
2374+ const key_type*,
2375+ key_type*,
2376+ int,
2377+ int,
2378+ offset_type*,
2379+ offset_type*,
2380+ hipStream_t);
2381
2382 sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys<key_type, offset_type*>;
2383 sort_func func_descending
2384@@ -175,17 +175,17 @@ void run_sort_pairs_benchmark(benchmark::State& state,
2385 using offset_type = int;
2386 using key_type = Key;
2387 using value_type = Value;
2388- typedef hipError_t (*sort_func)(void*,
2389- size_t&,
2390- const key_type*,
2391- key_type*,
2392- const value_type*,
2393- value_type*,
2394- int,
2395- int,
2396- offset_type*,
2397- offset_type*,
2398- hipStream_t);
2399+ using sort_func = hipError_t (*)(void*,
2400+ size_t&,
2401+ const key_type*,
2402+ key_type*,
2403+ const value_type*,
2404+ value_type*,
2405+ int,
2406+ int,
2407+ offset_type*,
2408+ offset_type*,
2409+ hipStream_t);
2410
2411 sort_func func_ascending
2412 = &hipcub::DeviceSegmentedSort::SortPairs<key_type, value_type, offset_type*>;
2413diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp
2414index a14cbdd..04097ec 100644
2415--- a/benchmark/benchmark_device_select.cpp
2416+++ b/benchmark/benchmark_device_select.cpp
2417@@ -1,6 +1,6 @@
2418 // MIT License
2419 //
2420-// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2421+// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
2422 //
2423 // Permission is hereby granted, free of charge, to any person obtaining a copy
2424 // of this software and associated documentation files (the "Software"), to deal
2425@@ -23,7 +23,7 @@
2426 #include "common_benchmark_header.hpp"
2427
2428 // HIP API
2429-#include "hipcub/device/device_select.hpp"
2430+#include <hipcub/device/device_select.hpp>
2431
2432 #ifndef DEFAULT_N
2433 const size_t DEFAULT_N = 1024 * 1024 * 32;
2434@@ -307,12 +307,11 @@ void run_flagged_if_benchmark(benchmark::State& state,
2435 state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
2436 state.SetItemsProcessed(state.iterations() * batch_size * size);
2437
2438- hipFree(d_input);
2439- hipFree(d_flags);
2440- hipFree(d_output);
2441- hipFree(d_selected_count_output);
2442- hipFree(d_temp_storage);
2443- HIP_CHECK(hipDeviceSynchronize());
2444+ HIP_CHECK(hipFree(d_input));
2445+ HIP_CHECK(hipFree(d_flags));
2446+ HIP_CHECK(hipFree(d_output));
2447+ HIP_CHECK(hipFree(d_selected_count_output));
2448+ HIP_CHECK(hipFree(d_temp_storage));
2449 }
2450
2451 template<class T>
2452diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp
2453index f98f1c1..fcdb1ab 100644
2454--- a/benchmark/benchmark_device_spmv.cpp
2455+++ b/benchmark/benchmark_device_spmv.cpp
2456@@ -23,7 +23,7 @@
2457 #include "common_benchmark_header.hpp"
2458
2459 // HIP API
2460-#include "hipcub/device/device_spmv.hpp"
2461+#include <hipcub/device/device_spmv.hpp>
2462
2463 #ifndef DEFAULT_N
2464 const size_t DEFAULT_N = 1024 * 32;
2465@@ -126,6 +126,7 @@ void run_benchmark(benchmark::State& state,
2466 size_t temp_storage_size_bytes;
2467
2468 // Get size of d_temp_storage
2469+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH
2470 HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr,
2471 temp_storage_size_bytes,
2472 d_values,
2473@@ -137,6 +138,7 @@ void run_benchmark(benchmark::State& state,
2474 size,
2475 num_nonzeroes,
2476 stream));
2477+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP
2478 HIP_CHECK(hipDeviceSynchronize());
2479
2480 // allocate temporary storage
2481@@ -147,6 +149,7 @@ void run_benchmark(benchmark::State& state,
2482 // Warm-up
2483 for(size_t i = 0; i < warmup_size; i++)
2484 {
2485+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH
2486 HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage,
2487 temp_storage_size_bytes,
2488 d_values,
2489@@ -158,6 +161,7 @@ void run_benchmark(benchmark::State& state,
2490 size,
2491 num_nonzeroes,
2492 stream));
2493+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP // closes the PUSH opened before this warm-up CsrMV call
2494 }
2495 HIP_CHECK(hipDeviceSynchronize());
2496
2497@@ -166,6 +170,7 @@ void run_benchmark(benchmark::State& state,
2498 auto start = std::chrono::high_resolution_clock::now();
2499 for(size_t i = 0; i < batch_size; i++)
2500 {
2501+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH
2502 HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage,
2503 temp_storage_size_bytes,
2504 d_values,
2505@@ -177,6 +182,7 @@ void run_benchmark(benchmark::State& state,
2506 size,
2507 num_nonzeroes,
2508 stream));
2509+ HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP
2510 }
2511 HIP_CHECK(hipDeviceSynchronize());
2512
2513diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp
2514index 48a46b8..489cf8d 100644
2515--- a/benchmark/benchmark_utils.hpp
2516+++ b/benchmark/benchmark_utils.hpp
2517@@ -1,6 +1,6 @@
2518 // MIT License
2519 //
2520-// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2521+// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
2522 //
2523 // Permission is hereby granted, free of charge, to any person obtaining a copy
2524 // of this software and associated documentation files (the "Software"), to deal
2525@@ -29,13 +29,13 @@
2526
2527 // hipCUB API
2528 #ifdef __HIP_PLATFORM_AMD__
2529- #include "hipcub/backend/rocprim/util_ptx.hpp"
2530+ #include <hipcub/backend/rocprim/util_ptx.hpp>
2531 #elif defined(__HIP_PLATFORM_NVIDIA__)
2532- #include "hipcub/config.hpp"
2533 #include <cub/util_ptx.cuh>
2534+ #include <hipcub/config.hpp>
2535 #endif
2536
2537-#include "hipcub/tuple.hpp"
2538+#include <hipcub/tuple.hpp>
2539
2540 #ifndef HIPCUB_CUB_API
2541 #define HIPCUB_WARP_THREADS_MACRO warpSize
2542@@ -409,6 +409,14 @@ template<unsigned int LogicalWarpSize>
2543 __device__ constexpr bool device_test_enabled_for_warp_size_v
2544 = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize;
2545
2546+template<class T>
2547+__device__
2548+inline constexpr bool is_power_of_two(const T x) // true when x is positive and has exactly one bit set
2549+{
2550+ static_assert(std::is_integral<T>::value, "T must be integer type");
2551+ return (x > 0) && ((x & (x - 1)) == 0); // x & (x - 1) clears the lowest set bit; a zero result means no other bit was set
2552+}
2553+
2554 template<typename Iterator>
2555 using it_value_t = typename std::iterator_traits<Iterator>::value_type;
2556
2557diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp
2558index 598df95..0c41be0 100644
2559--- a/benchmark/benchmark_warp_exchange.cpp
2560+++ b/benchmark/benchmark_warp_exchange.cpp
2561@@ -23,7 +23,7 @@
2562 #include "common_benchmark_header.hpp"
2563
2564 // HIP API
2565-#include "hipcub/warp/warp_exchange.hpp"
2566+#include <hipcub/warp/warp_exchange.hpp>
2567
2568 #include <type_traits>
2569
2570diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp
2571index 4298db6..2c74609 100644
2572--- a/benchmark/benchmark_warp_load.cpp
2573+++ b/benchmark/benchmark_warp_load.cpp
2574@@ -23,7 +23,7 @@
2575 #include "common_benchmark_header.hpp"
2576
2577 // HIP API
2578-#include "hipcub/warp/warp_load.hpp"
2579+#include <hipcub/warp/warp_load.hpp>
2580
2581 #include <type_traits>
2582
2583diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp
2584index 5b2d87c..f6d91fe 100644
2585--- a/benchmark/benchmark_warp_merge_sort.cpp
2586+++ b/benchmark/benchmark_warp_merge_sort.cpp
2587@@ -24,10 +24,10 @@
2588
2589 #include "../test/hipcub/test_utils_sort_comparator.hpp"
2590 // HIP API
2591-#include "hipcub/block/block_load.hpp"
2592-#include "hipcub/block/block_store.hpp"
2593-#include "hipcub/util_ptx.hpp"
2594-#include "hipcub/warp/warp_merge_sort.hpp"
2595+#include <hipcub/block/block_load.hpp>
2596+#include <hipcub/block/block_store.hpp>
2597+#include <hipcub/util_ptx.hpp>
2598+#include <hipcub/warp/warp_merge_sort.hpp>
2599
2600 #include <type_traits>
2601
2602diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp
2603index 1a20d71..f8d08f6 100644
2604--- a/benchmark/benchmark_warp_reduce.cpp
2605+++ b/benchmark/benchmark_warp_reduce.cpp
2606@@ -23,7 +23,7 @@
2607 #include "common_benchmark_header.hpp"
2608
2609 // HIP API
2610-#include "hipcub/warp/warp_reduce.hpp"
2611+#include <hipcub/warp/warp_reduce.hpp>
2612
2613 #ifndef DEFAULT_N
2614 const size_t DEFAULT_N = 1024 * 1024 * 32;
2615diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp
2616index ddd499c..db3fe94 100644
2617--- a/benchmark/benchmark_warp_scan.cpp
2618+++ b/benchmark/benchmark_warp_scan.cpp
2619@@ -1,6 +1,6 @@
2620 // MIT License
2621 //
2622-// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2623+// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
2624 //
2625 // Permission is hereby granted, free of charge, to any person obtaining a copy
2626 // of this software and associated documentation files (the "Software"), to deal
2627@@ -23,7 +23,7 @@
2628 #include "common_benchmark_header.hpp"
2629
2630 // HIP API
2631-#include "hipcub/warp/warp_scan.hpp"
2632+#include <hipcub/warp/warp_scan.hpp>
2633
2634 #ifndef DEFAULT_N
2635 const size_t DEFAULT_N = 1024 * 1024 * 32;
2636@@ -106,19 +106,22 @@ struct broadcast
2637 template<class T, unsigned int WarpSize, unsigned int Trials>
2638 __device__
2639 static auto run(const T* input, T* output, const T init)
2640- -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize>>
2641+ -> std::enable_if_t<(benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize>
2642+ && benchmark_utils::is_power_of_two(WarpSize))>
2643 {
2644 (void)init;
2645
2646- const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
2647- auto value = input[i];
2648+ const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
2649+ const unsigned int warp_id = i / WarpSize;
2650+ const unsigned int src_lane = warp_id % WarpSize;
2651+ auto value = input[i];
2652
2653 using wscan_t = hipcub::WarpScan<T, WarpSize>;
2654 __shared__ typename wscan_t::TempStorage storage;
2655 #pragma nounroll
2656 for(unsigned int trial = 0; trial < Trials; trial++)
2657 {
2658- value = wscan_t(storage).Broadcast(value, 0);
2659+ value = wscan_t(storage).Broadcast(value, src_lane);
2660 }
2661
2662 output[i] = value;
2663@@ -127,7 +130,8 @@ struct broadcast
2664 template<class T, unsigned int WarpSize, unsigned int Trials>
2665 __device__
2666 static auto run(const T* /*input*/, T* /*output*/, const T /*init*/)
2667- -> std::enable_if_t<!benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize>>
2668+ -> std::enable_if_t<!(benchmark_utils::device_test_enabled_for_warp_size_v<WarpSize>
2669+ && benchmark_utils::is_power_of_two(WarpSize))>
2670 {}
2671 };
2672
2673@@ -188,61 +192,75 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
2674 #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark)
2675
2676 // clang-format off
2677-// If warp size limit is 16
2678-#define BENCHMARK_TYPE_WS16(type) \
2679- CREATE_BENCHMARK(type, 60, 15), \
2680- CREATE_BENCHMARK(type, 256, 16)
2681-
2682-
2683-// If warp size limit is 32
2684-#define BENCHMARK_TYPE_WS32(type) \
2685- BENCHMARK_TYPE_WS16(type), \
2686- CREATE_BENCHMARK(type, 62, 31), \
2687- CREATE_BENCHMARK(type, 256, 32)
2688-
2689-
2690-// If warp size limit is 64
2691-#define BENCHMARK_TYPE_WS64(type) \
2692- BENCHMARK_TYPE_WS32(type), \
2693- CREATE_BENCHMARK(type, 63, 63), \
2694- CREATE_BENCHMARK(type, 64, 64), \
2695- CREATE_BENCHMARK(type, 128, 64), \
2696- CREATE_BENCHMARK(type, 256, 64)
2697+#if HIPCUB_WARP_THREADS_MACRO == 32
2698+ #define BENCHMARK_TYPE(type) \
2699+ CREATE_BENCHMARK(type, 60, 15), \
2700+ CREATE_BENCHMARK(type, 256, 16), \
2701+ CREATE_BENCHMARK(type, 62, 31), \
2702+ CREATE_BENCHMARK(type, 256, 32)
2703+#else
2704+ #define BENCHMARK_TYPE(type) \
2705+ CREATE_BENCHMARK(type, 60, 15), \
2706+ CREATE_BENCHMARK(type, 256, 16), \
2707+ CREATE_BENCHMARK(type, 62, 31), \
2708+ CREATE_BENCHMARK(type, 256, 32), \
2709+ CREATE_BENCHMARK(type, 63, 63), \
2710+ CREATE_BENCHMARK(type, 64, 64), \
2711+ CREATE_BENCHMARK(type, 128, 64), \
2712+ CREATE_BENCHMARK(type, 256, 64)
2713+#endif
2714+
2715+#if HIPCUB_WARP_THREADS_MACRO == 32
2716+ #define BENCHMARK_TYPE_P2(type) \
2717+ CREATE_BENCHMARK(type, 256, 16), \
2718+ CREATE_BENCHMARK(type, 256, 32)
2719+#else
2720+ #define BENCHMARK_TYPE_P2(type) \
2721+ CREATE_BENCHMARK(type, 256, 16), \
2722+ CREATE_BENCHMARK(type, 256, 32), \
2723+ CREATE_BENCHMARK(type, 64, 64), \
2724+ CREATE_BENCHMARK(type, 128, 64), \
2725+ CREATE_BENCHMARK(type, 256, 64)
2726+#endif
2727 // clang-format on
2728
2729 template<typename Benchmark>
2730-void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
2731+auto add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
2732 const std::string& method_name,
2733 hipStream_t stream,
2734 size_t size)
2735+ -> std::enable_if_t<std::is_same<Benchmark, inclusive_scan>::value
2736+ || std::is_same<Benchmark, exclusive_scan>::value>
2737 {
2738 using custom_double2 = benchmark_utils::custom_type<double, double>;
2739 using custom_int_double = benchmark_utils::custom_type<int, double>;
2740
2741- std::vector<benchmark::internal::Benchmark*> new_benchmarks = {
2742-#if HIPCUB_WARP_THREADS_MACRO == 16
2743- BENCHMARK_TYPE_WS16(int),
2744- BENCHMARK_TYPE_WS16(float),
2745- BENCHMARK_TYPE_WS16(double),
2746- BENCHMARK_TYPE_WS16(int8_t),
2747- BENCHMARK_TYPE_WS16(custom_double2),
2748- BENCHMARK_TYPE_WS16(custom_int_double)
2749-#elif HIPCUB_WARP_THREADS_MACRO == 32
2750- BENCHMARK_TYPE_WS32(int),
2751- BENCHMARK_TYPE_WS32(float),
2752- BENCHMARK_TYPE_WS32(double),
2753- BENCHMARK_TYPE_WS32(int8_t),
2754- BENCHMARK_TYPE_WS32(custom_double2),
2755- BENCHMARK_TYPE_WS32(custom_int_double)
2756-#else
2757- BENCHMARK_TYPE_WS64(int),
2758- BENCHMARK_TYPE_WS64(float),
2759- BENCHMARK_TYPE_WS64(double),
2760- BENCHMARK_TYPE_WS64(int8_t),
2761- BENCHMARK_TYPE_WS64(custom_double2),
2762- BENCHMARK_TYPE_WS64(custom_int_double)
2763-#endif
2764- };
2765+ std::vector<benchmark::internal::Benchmark*> new_benchmarks
2766+ = {BENCHMARK_TYPE(int),
2767+ BENCHMARK_TYPE(float),
2768+ BENCHMARK_TYPE(double),
2769+ BENCHMARK_TYPE(int8_t),
2770+ BENCHMARK_TYPE(custom_double2),
2771+ BENCHMARK_TYPE(custom_int_double)};
2772+ benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
2773+}
2774+
2775+template<typename Benchmark>
2776+auto add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
2777+ const std::string& method_name,
2778+ hipStream_t stream,
2779+ size_t size) -> std::enable_if_t<std::is_same<Benchmark, broadcast>::value>
2780+{
2781+ using custom_double2 = benchmark_utils::custom_type<double, double>;
2782+ using custom_int_double = benchmark_utils::custom_type<int, double>;
2783+
2784+ std::vector<benchmark::internal::Benchmark*> new_benchmarks
2785+ = {BENCHMARK_TYPE_P2(int),
2786+ BENCHMARK_TYPE_P2(float),
2787+ BENCHMARK_TYPE_P2(double),
2788+ BENCHMARK_TYPE_P2(int8_t),
2789+ BENCHMARK_TYPE_P2(custom_double2),
2790+ BENCHMARK_TYPE_P2(custom_int_double)};
2791 benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
2792 }
2793
2794diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp
2795index 8e88661..6632faf 100644
2796--- a/benchmark/benchmark_warp_store.cpp
2797+++ b/benchmark/benchmark_warp_store.cpp
2798@@ -23,7 +23,7 @@
2799 #include "common_benchmark_header.hpp"
2800
2801 // HIP API
2802-#include "hipcub/warp/warp_store.hpp"
2803+#include <hipcub/warp/warp_store.hpp>
2804
2805 #include <type_traits>
2806
2807diff --git a/benchmark/common_benchmark_header.hpp b/benchmark/common_benchmark_header.hpp
2808index eab3048..a632840 100644
2809--- a/benchmark/common_benchmark_header.hpp
2810+++ b/benchmark/common_benchmark_header.hpp
2811@@ -1,6 +1,6 @@
2812 // MIT License
2813 //
2814-// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
2815+// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
2816 //
2817 // Permission is hereby granted, free of charge, to any person obtaining a copy
2818 // of this software and associated documentation files (the "Software"), to deal
2819@@ -21,18 +21,18 @@
2820 // SOFTWARE.
2821
2822 #include <algorithm>
2823+#include <chrono>
2824+#include <cmath>
2825+#include <cstdlib>
2826 #include <functional>
2827 #include <iostream>
2828+#include <limits>
2829+#include <numeric>
2830+#include <random>
2831+#include <tuple>
2832 #include <type_traits>
2833-#include <vector>
2834 #include <utility>
2835-#include <tuple>
2836-#include <random>
2837-#include <limits>
2838-#include <cmath>
2839-#include <cstdlib>
2840-#include <numeric>
2841-#include <chrono>
2842+#include <vector>
2843
2844 // Google Benchmark
2845 #include "benchmark/benchmark.h"
2846diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
2847index 22d2a4a..36a5310 100644
2848--- a/cmake/Dependencies.cmake
2849+++ b/cmake/Dependencies.cmake
2850@@ -1,6 +1,6 @@
2851 # MIT License
2852 #
2853-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2854+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2855 #
2856 # Permission is hereby granted, free of charge, to any person obtaining a copy
2857 # of this software and associated documentation files (the "Software"), to deal
2858@@ -66,6 +66,226 @@ endforeach()
2859
2860 include(FetchContent)
2861
2862+# This function checks to see if the download branch given by "branch" exists in the repository.
2863+# It does so using the git ls-remote command.
2864+ # If the branch cannot be found (or the lookup itself fails), the variable named by "branch" is set to "develop" in the parent scope.
2865+function(find_download_branch git_path branch)
2866+ set(branch_value ${${branch}})
2867+ execute_process(COMMAND ${git_path} "ls-remote" "https://github.com/ROCm/rocm-libraries.git" "refs/heads/${branch_value}" RESULT_VARIABLE ret_code OUTPUT_VARIABLE output)
2868+
2869+ if(NOT ${ret_code} STREQUAL "0")
2870+ message(WARNING "Unable to check if release branch exists, defaulting to the develop branch.")
2871+ set(${branch} "develop" PARENT_SCOPE)
2872+ else()
2873+ if(${output})
2874+ string(STRIP ${output} output)
2875+ endif()
2876+
2877+ if(NOT (${output} MATCHES "[\t ]+refs/heads/${branch_value}(\n)?$"))
2878+ message(WARNING "Unable to locate requested release branch \"${branch_value}\" in repository. Defaulting to the develop branch.")
2879+ set(${branch} "develop" PARENT_SCOPE)
2880+ else()
2881+ message(STATUS "Found release branch \"${branch_value}\" in repository.")
2882+ endif()
2883+ endif()
2884+endfunction()
2885+
2886+function(check_git_version git_path)
2887+ execute_process(COMMAND ${git_path} "--version" OUTPUT_VARIABLE git_version_output)
2888+ string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" GIT_VERSION_STRING ${git_version_output})
2889+ if(DEFINED CMAKE_MATCH_0)
2890+ set(GIT_VERSION ${CMAKE_MATCH_0} PARENT_SCOPE)
2891+ else()
2892+ set(GIT_VERSION "" PARENT_SCOPE)
2893+ endif()
2894+endfunction()
2895+
2896+# This function fetches repository "repo_name" using the method specified by "method".
2897+# The result is stored in the parent scope version of "repo_path".
2898+# It does not build the repo.
2899+function(fetch_dep method repo_name repo_path download_branch)
2900+ set(method_value ${${method}})
2901+
2902+ # Since the monorepo is large, we want to avoid downloading the whole thing if possible.
2903+ # We can do this if we have access to git's sparse-checkout functionality, which was added in git 2.25.
2904+ # On some Linux systems (eg. Ubuntu), the git in /usr/bin tends to be newer than the git in /usr/local/bin,
2905+ # and the latter is what gets picked up by find_package(Git), since it's what's in PATH.
2906+ # Check for a git binary in /usr/bin first; if none is found there, or it is older than 2.25, use find_package(Git) to search
2907+ # other locations.
2908+ if (NOT(GIT_PATH))
2909+ message(STATUS "Checking git version")
2910+ set(GIT_MIN_VERSION_FOR_SPARSE_CHECKOUT 2.25)
2911+
2912+ find_program(find_result git PATHS /usr/bin NO_DEFAULT_PATH)
2913+ if(NOT (${find_result} STREQUAL "find_result-NOTFOUND"))
2914+ set(GIT_PATH ${find_result} CACHE INTERNAL "Path to the git executable")
2915+ check_git_version(${GIT_PATH})
2916+ endif()
2917+
2918+ if(NOT GIT_VERSION OR "${GIT_VERSION}" LESS ${GIT_MIN_VERSION_FOR_SPARSE_CHECKOUT})
2919+ find_package(Git QUIET)
2920+ if(GIT_FOUND)
2921+ set(GIT_PATH ${GIT_EXECUTABLE} CACHE INTERNAL "Path to the git executable")
2922+ check_git_version(${GIT_PATH})
2923+ endif()
2924+ endif()
2925+
2926+ if(NOT GIT_VERSION OR "${GIT_VERSION}" LESS ${GIT_MIN_VERSION_FOR_SPARSE_CHECKOUT})
2927+ set(USE_SPARSE_CHECKOUT "OFF" CACHE INTERNAL "Records whether git supports sparse checkout functionality")
2928+ else()
2929+ set(USE_SPARSE_CHECKOUT "ON" CACHE INTERNAL "Records whether git supports sparse checkout functionality")
2930+ endif()
2931+
2932+ if(NOT GIT_VERSION)
2933+ # Warn the user that we were unable to find git. This will only actually be a problem if we use one of the
2934+ # fetch methods (download, or monorepo with dependency not present) that requires it. If we end up running
2935+ # into one of those scenarios, a fatal error will be issued at that point.
2936+ message(WARNING "Unable to find git.")
2937+ else()
2938+ message(STATUS "Found git at: ${GIT_PATH}, version: ${GIT_VERSION}")
2939+ endif()
2940+ endif()
2941+
2942+ if(${method_value} STREQUAL "PACKAGE")
2943+ message(STATUS "Searching for ${repo_name} package")
2944+
2945+ # Add default install location for WIN32 and non-WIN32 as hint
2946+ find_package(${repo_name} ${MIN_ROCPRIM_PACKAGE_VERSION} CONFIG QUIET PATHS "${ROCM_ROOT}/lib/cmake/rocprim")
2947+
2948+ if(NOT ${${repo_name}_FOUND})
2949+ message(STATUS "No existing ${repo_name} package meeting the minimum version requirement (${MIN_ROCPRIM_PACKAGE_VERSION}) was found. Falling back to downloading it.")
2950+ # update local and parent variable values
2951+ set(${method} "DOWNLOAD" PARENT_SCOPE)
2952+ set(method_value "DOWNLOAD")
2953+ else()
2954+ message(STATUS "Package found (${${repo_name}_DIR})")
2955+ endif()
2956+
2957+ elseif(${method_value} STREQUAL "MONOREPO")
2958+ message(STATUS "Searching for ${repo_name} in the parent monorepo directory")
2959+
2960+ # Check if this looks like a monorepo checkout
2961+ find_path(found_path NAMES "." PATHS "${CMAKE_CURRENT_SOURCE_DIR}/../../projects/${repo_name}/" NO_CACHE NO_DEFAULT_PATH)
2962+
2963+ # If not, see if the local monorepo is a sparse-checkout.
2964+ # If it is a sparse-checkout, try to add the dependency to the sparse-checkout list.
2965+ # If it's not a sparse-checkout (or adding to the sparse-checkout list fails), fallback to downloading the dependency.
2966+ if(${found_path} STREQUAL "found_path-NOTFOUND")
2967+ set(FALLBACK_TO_DOWNLOAD ON)
2968+ message(WARNING "Unable to locate ${repo_name} in parent monorepo (it's not at \"${CMAKE_CURRENT_SOURCE_DIR}/../../projects/${repo_name}/\").")
2969+ message(STATUS "Checking if local monorepo is a sparse-checkout that we can add ${repo_name} to.")
2970+ if(NOT(GIT_PATH))
2971+ message(FATAL_ERROR "Git could not be found on the system. Since ${repo_name} could not be found in the local monorepo, git is required to download it.")
2972+ endif()
2973+
2974+ if(USE_SPARSE_CHECKOUT)
2975+ execute_process(COMMAND ${GIT_PATH} "sparse-checkout" "list" OUTPUT_VARIABLE sparse_list ERROR_VARIABLE git_error RESULT_VARIABLE git_result
2976+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../)
2977+
2978+ if(NOT(git_result EQUAL 0) OR git_error)
2979+ message(STATUS "The local monorepo does not appear to be a sparse-checkout.")
2980+ else()
2981+ message(STATUS "The local monorepo appears to be a sparse checkout. Attempting to add \"projects/${repo_name}\" to the checkout list.")
2982+ # Check if the dependency is already present in the checkout list.
2983+ # Git lists sparse checkout directories each on a separate line.
2984+ # Take care not to match something in the middle of a path, eg. "other_dir/projects/${repo_name}/sub_dir".
2985+ string(REGEX MATCH "(^|\n)projects/${repo_name}($|\n)" find_result ${sparse_list})
2986+ if(find_result)
2987+ message(STATUS "Found existing entry for \"projects/${repo_name}\" in sparse-checkout list - has the directory structure been modified?")
2988+ else()
2989+ # Add projects/${repo_name} to the sparse checkout
2990+ execute_process(COMMAND ${GIT_PATH} "sparse-checkout" "add" "projects/${repo_name}" RESULT_VARIABLE sparse_checkout_result
2991+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../)
2992+ # Note that in this case, we are forced to checkout the same branch that the sparse-checkout was created with.
2993+ execute_process(COMMAND ${GIT_PATH} "checkout" RESULT_VARIABLE checkout_result
2994+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../)
2995+
2996+ if(sparse_checkout_result EQUAL 0 AND checkout_result EQUAL 0)
2997+ message(STATUS "Added new checkout list entry.")
2998+ set(FALLBACK_TO_DOWNLOAD OFF)
2999+ else()
3000+ message(STATUS "Unable to add new checkout list entry.")
3001+ endif()
3002+ # Save the monorepo path in the parent scope
3003+ set(${repo_path} "${CMAKE_CURRENT_SOURCE_DIR}/../../projects/${repo_name}" PARENT_SCOPE)
3004+ endif()
3005+ endif()
3006+ else()
3007+ message(STATUS "The version of git installed on the system (${GIT_VERSION}) does not support sparse-checkout.")
3008+ endif()
3009+
3010+ if (FALLBACK_TO_DOWNLOAD)
3011+ message(WARNING "Unable to locate/fetch dependency ${repo_name} from monorepo. Falling back to downloading it.")
3012+ # update local and parent variable values
3013+ set(${method} "DOWNLOAD" PARENT_SCOPE)
3014+ set(method_value "DOWNLOAD")
3015+ endif()
3016+
3017+ else()
3018+ message(STATUS "Found ${repo_name} at ${found_path}")
3019+
3020+ # Save the monorepo path in the parent scope
3021+ set(${repo_path} ${found_path} PARENT_SCOPE)
3022+ endif()
3023+ endif()
3024+
3025+ if(${method_value} STREQUAL "DOWNLOAD")
3026+ if(NOT DEFINED GIT_PATH)
3027+ message(FATAL_ERROR "Git could not be found on the system. Git is required for downloading ${repo_name}.")
3028+ endif()
3029+
3030+ message(STATUS "Checking if repository contains requested branch ${${download_branch}}")
3031+ find_download_branch(${GIT_PATH} ${download_branch})
3032+ set(download_branch_value ${${download_branch}})
3033+
3034+ message(STATUS "Downloading ${repo_name} from https://github.com/ROCm/rocm-libraries.git")
3035+ if(${USE_SPARSE_CHECKOUT})
3036+ # In this case, we have access to git sparse-checkout.
3037+ # Check if the dependency has already been downloaded in the past:
3038+ find_path(found_path NAMES "." PATHS "${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src/" NO_CACHE NO_DEFAULT_PATH)
3039+ if(${found_path} STREQUAL "found_path-NOTFOUND")
3040+ # First, git clone with options "--no-checkout" and "--filter=tree:0" to prevent files from being pulled immediately.
3041+ # Use option "--depth=1" to avoid downloading past commit history.
3042+ execute_process(COMMAND ${GIT_PATH} clone --branch ${download_branch_value} --no-checkout --depth=1 --filter=tree:0 https://github.com/ROCm/rocm-libraries.git ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src)
3043+
3044+ # Next, use git sparse-checkout to ensure we only pull the directory containing the desired repo.
3045+ execute_process(COMMAND ${GIT_PATH} sparse-checkout init --cone
3046+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src)
3047+
3048+ execute_process(COMMAND ${GIT_PATH} sparse-checkout set projects/${repo_name}
3049+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src)
3050+
3051+ # Finally, download the files using git checkout.
3052+ execute_process(COMMAND ${GIT_PATH} checkout ${download_branch_value}
3053+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src)
3054+
3055+ message(STATUS "${repo_name} download complete")
3056+ else()
3057+ message("Found previously downloaded directory, skipping download step.")
3058+ endif()
3059+
3060+ # Save the downloaded path in the parent scope
3061+ set(${repo_path} "${CMAKE_CURRENT_BINARY_DIR}/${repo_name}-src/projects/${repo_name}" PARENT_SCOPE)
3062+ else()
3063+ # In this case, we do not have access to sparse-checkout, so we need to download the whole monorepo.
3064+ # Check if the monorepo has already been downloaded to satisfy a previous dependency
3065+ find_path(found_path NAMES "." PATHS "${CMAKE_CURRENT_BINARY_DIR}/monorepo-src/" NO_CACHE NO_DEFAULT_PATH)
3066+ if(${found_path} STREQUAL "found_path-NOTFOUND")
3067+ # Warn the user that this will take some time.
3068+ message(WARNING "The detected version of git (${GIT_VERSION}) is older than 2.25 and does not provide sparse-checkout functionality. Falling back to checking out the whole rocm-libraries repository (this may take a long time).")
3069+ # Avoid downloading anything related to branches other than the target branch (--single-branch), and avoid any past commit history information (--depth=1)
3070+ execute_process(COMMAND ${GIT_PATH} clone --single-branch --branch=${download_branch_value} --depth=1 https://github.com/ROCm/rocm-libraries.git ${CMAKE_CURRENT_BINARY_DIR}/monorepo-src)
3071+ message(STATUS "rocm-libraries download complete")
3072+ else()
3073+ message("Found previously downloaded directory, skipping download step.")
3074+ endif()
3075+
3076+ # Save the downloaded path in the parent scope
3077+ set(${repo_path} "${CMAKE_CURRENT_BINARY_DIR}/monorepo-src/projects/${repo_name}" PARENT_SCOPE)
3078+ endif()
3079+ endif()
3080+endfunction()
3081+
3082 # Test dependencies
3083 if(USER_BUILD_TEST)
3084 # NOTE1: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake
3085@@ -83,14 +303,14 @@ if(USER_BUILD_TEST)
3086 # mode targets. Otherwise if MODULE or CONFIG succeeded, then it prints the result to the
3087 # console via a non-QUIET find_package call and if CONFIG succeeded, creates ALIAS targets
3088 # with the MODULE IMPORTED names.
3089- if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3090+ if(NOT EXTERNAL_DEPS_FORCE_DOWNLOAD)
3091 find_package(GTest QUIET)
3092 endif()
3093 if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest)
3094 option(BUILD_GTEST "Builds the googletest subproject" ON)
3095 option(BUILD_GMOCK "Builds the googlemock subproject" OFF)
3096 option(INSTALL_GTEST "Enable installation of googletest." OFF)
3097- if(EXISTS /usr/src/googletest AND NOT DEPENDENCIES_FORCE_DOWNLOAD)
3098+ if(EXISTS /usr/src/googletest AND NOT EXTERNAL_DEPS_FORCE_DOWNLOAD)
3099 FetchContent_Declare(
3100 googletest
3101 SOURCE_DIR /usr/src/googletest
3102@@ -116,7 +336,7 @@ if(USER_BUILD_TEST)
3103 endif(USER_BUILD_TEST)
3104
3105 if(USER_BUILD_BENCHMARK)
3106- if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3107+ if(NOT EXTERNAL_DEPS_FORCE_DOWNLOAD)
3108 find_package(benchmark CONFIG QUIET)
3109 endif()
3110 if(NOT TARGET benchmark::benchmark)
3111@@ -139,20 +359,13 @@ endif(USER_BUILD_BENCHMARK)
3112
3113 # CUB (only for CUDA platform)
3114 if(HIP_COMPILER STREQUAL "nvcc")
3115- set(CCCL_MINIMUM_VERSION 2.5.0)
3116+ set(CCCL_MINIMUM_VERSION 2.8.2)
3117 if(NOT DOWNLOAD_CUB)
3118- find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG)
3119- find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG)
3120- find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG)
3121+ find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG)
3122 endif()
3123
3124- if (NOT CUB_FOUND OR NOT Thrust_FOUND OR NOT libcudacxx_FOUND)
3125- if(CUB_FOUND OR Thrust_FOUND OR libcudacxx_FOUND)
3126- message(WARNING "Found one of CUB, Thrust or libcu++, but not all of them.
3127- This can lead to mixing different potentially incompatible versions.")
3128- endif()
3129-
3130- message(STATUS "CUB, Thrust or libcu++ not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}")
3131+ if (NOT CCCL_FOUND)
3132+ message(STATUS "CCCL not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}")
3133 file(DOWNLOAD https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip
3134 ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip
3135 STATUS cccl_download_status LOG cccl_download_log)
3136@@ -176,25 +389,24 @@ if(HIP_COMPILER STREQUAL "nvcc")
3137 endif()
3138 endif()
3139
3140- find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH
3141- PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/cub)
3142- find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH
3143- PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/thrust)
3144- find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH
3145- PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/libcudacxx)
3146+ find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH
3147+ PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION})
3148 endif()
3149 else()
3150 # rocPRIM (only for ROCm platform)
3151- if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
3152- # Add default install location for WIN32 and non-WIN32 as hint
3153- find_package(rocprim CONFIG QUIET PATHS "${ROCM_ROOT}/lib/cmake/rocprim")
3154- endif()
3155- if(NOT TARGET roc::rocprim)
3156- message(STATUS "rocPRIM not found. Fetching...")
3157+ fetch_dep(ROCPRIM_FETCH_METHOD rocprim ROCPRIM_PATH ROCM_DEP_RELEASE_BRANCH)
3158+
3159+ if(${ROCPRIM_FETCH_METHOD} STREQUAL "DOWNLOAD" OR ${ROCPRIM_FETCH_METHOD} STREQUAL "MONOREPO")
3160+ # The fetch_dep call above should have downloaded/located the source. We just need to make it available.
3161+ message(STATUS "Configuring rocPRIM")
3162 FetchContent_Declare(
3163- prim
3164- GIT_REPOSITORY https://github.com/ROCm/rocPRIM.git
3165- GIT_TAG develop
3166+ prim
3167+ SOURCE_DIR ${ROCPRIM_PATH}
3168+ INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/deps/rocprim
3169+ CMAKE_ARGS -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_PREFIX_PATH=/opt/rocm
3170+ LOG_CONFIGURE TRUE
3171+ LOG_BUILD TRUE
3172+ LOG_INSTALL TRUE
3173 )
3174 FetchContent_MakeAvailable(prim)
3175 if(NOT TARGET roc::rocprim)
3176@@ -203,8 +415,6 @@ else()
3177 if(NOT TARGET roc::rocprim_hip)
3178 add_library(roc::rocprim_hip ALIAS rocprim_hip)
3179 endif()
3180- else()
3181- find_package(rocprim CONFIG REQUIRED)
3182 endif()
3183 endif()
3184
3185diff --git a/cmake/GenerateResourceSpec.cmake b/cmake/GenerateResourceSpec.cmake
3186index 7810a20..d485ca4 100644
3187--- a/cmake/GenerateResourceSpec.cmake
3188+++ b/cmake/GenerateResourceSpec.cmake
3189@@ -17,10 +17,11 @@ execute_process(
3190
3191 if(ROCMINFO_EXIT_CODE)
3192 message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}")
3193+ message(SEND_ERROR ${ROCMINFO_STDOUT})
3194 message(FATAL_ERROR ${ROCMINFO_STDERR})
3195 endif()
3196
3197-string(REGEX MATCHALL [[--(gfx[0-9]+)]]
3198+string(REGEX MATCHALL [[--(gfx[0-9a-f]+)]]
3199 ROCMINFO_MATCHES
3200 ${ROCMINFO_STDOUT}
3201 )
3202@@ -37,7 +38,7 @@ string(REGEX MATCHALL [[--(gfx[0-9]+)]]
3203 # std::string ip;
3204 # int id;
3205 # };
3206-#
3207+#
3208 # std::vector<device> GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} };
3209 # std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(),
3210 # [](const device& lhs, const device& rhs)
3211@@ -85,14 +86,14 @@ list(SORT GFXIP_AND_ID)
3212 set(JSON_PAYLOAD)
3213 set(IT1 0)
3214 list(GET GFXIP_AND_ID ${IT1} I1)
3215-string(REGEX REPLACE ":[0-9]+" "" IP1 ${I1})
3216+string(REGEX REPLACE ":[0-9a-f]+" "" IP1 ${I1})
3217 list(LENGTH GFXIP_AND_ID COUNT)
3218 while(IT1 LESS COUNT)
3219 string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [")
3220 set(IT2 ${IT1})
3221 list(GET GFXIP_AND_ID ${IT2} I2)
3222- string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2})
3223- string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2})
3224+ string(REGEX REPLACE [[:[0-9a-f]+$]] "" IP2 ${I2})
3225+ string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2})
3226 while(${IP2} STREQUAL ${IP1} AND IT2 LESS COUNT)
3227 string(APPEND JSON_PAYLOAD
3228 "\n {\n"
3229@@ -102,8 +103,8 @@ while(IT1 LESS COUNT)
3230 math(EXPR IT2 "${IT2} + 1")
3231 if(IT2 LESS COUNT)
3232 list(GET GFXIP_AND_ID ${IT2} I2)
3233- string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2})
3234- string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2})
3235+ string(REGEX REPLACE [[:[0-9a-f]+$]] "" IP2 ${I2})
3236+ string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2})
3237 endif()
3238 endwhile()
3239 string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD})
3240diff --git a/cmake/ROCmCMakeBuildToolsDependency.cmake b/cmake/ROCmCMakeBuildToolsDependency.cmake
3241index 3f8e1fc..8c913b5 100644
3242--- a/cmake/ROCmCMakeBuildToolsDependency.cmake
3243+++ b/cmake/ROCmCMakeBuildToolsDependency.cmake
3244@@ -49,6 +49,5 @@ include(ROCMCreatePackage)
3245 include(ROCMInstallTargets)
3246 include(ROCMPackageConfigHelpers)
3247 include(ROCMInstallSymlinks)
3248-include(ROCMHeaderWrapper)
3249 include(ROCMCheckTargetIds)
3250 include(ROCMClients)
3251diff --git a/cmake/SetupNVCC.cmake b/cmake/SetupNVCC.cmake
3252index 5ec377d..46b88d2 100644
3253--- a/cmake/SetupNVCC.cmake
3254+++ b/cmake/SetupNVCC.cmake
3255@@ -81,7 +81,7 @@ endif()
3256
3257 # Get CUDA
3258 enable_language("CUDA")
3259-set(CMAKE_CUDA_STANDARD 14)
3260+set(CMAKE_CUDA_STANDARD 17)
3261
3262 # Suppressing warnings
3263 set(HIP_NVCC_FLAGS " ${HIP_NVCC_FLAGS} -Wno-deprecated-gpu-targets -Xcompiler -Wno-return-type -Wno-deprecated-declarations ")
3264diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
3265index 7a9af98..9dd5035 100644
3266--- a/cmake/Summary.cmake
3267+++ b/cmake/Summary.cmake
3268@@ -1,6 +1,6 @@
3269 # MIT License
3270 #
3271-# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
3272+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
3273 #
3274 # Permission is hereby granted, free of charge, to any person obtaining a copy
3275 # of this software and associated documentation files (the "Software"), to deal
3276@@ -21,6 +21,39 @@
3277 # SOFTWARE.
3278
3279 function(print_configuration_summary)
3280+ find_package(Git)
3281+ if(GIT_FOUND)
3282+ execute_process(
3283+ COMMAND ${GIT_EXECUTABLE} show --format=%H --no-patch
3284+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3285+ OUTPUT_VARIABLE COMMIT_HASH
3286+ OUTPUT_STRIP_TRAILING_WHITESPACE
3287+ )
3288+ execute_process(
3289+ COMMAND ${GIT_EXECUTABLE} show --format=%s --no-patch
3290+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3291+ OUTPUT_VARIABLE COMMIT_SUBJECT
3292+ OUTPUT_STRIP_TRAILING_WHITESPACE
3293+ )
3294+ endif()
3295+
3296+ execute_process(
3297+ COMMAND ${CMAKE_CXX_COMPILER} --version
3298+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3299+ OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERBOSE_DETAILS
3300+ OUTPUT_STRIP_TRAILING_WHITESPACE
3301+ )
3302+
3303+ find_program(UNAME_EXECUTABLE uname)
3304+ if(UNAME_EXECUTABLE)
3305+ execute_process(
3306+ COMMAND ${UNAME_EXECUTABLE} -a
3307+ WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
3308+ OUTPUT_VARIABLE LINUX_KERNEL_DETAILS
3309+ OUTPUT_STRIP_TRAILING_WHITESPACE
3310+ )
3311+ endif()
3312+
3313 message(STATUS "")
3314 message(STATUS "******** Summary ********")
3315 message(STATUS "General:")
3316@@ -55,8 +88,24 @@ else()
3317 message(STATUS " Device targets : ${NVGPU_TARGETS}")
3318 endif()
3319 message(STATUS "")
3320- message(STATUS " DEPENDENCIES_FORCE_DOWNLOAD : ${DEPENDENCIES_FORCE_DOWNLOAD}")
3321+ message(STATUS " EXTERNAL_DEPS_FORCE_DOWNLOAD: ${EXTERNAL_DEPS_FORCE_DOWNLOAD}")
3322+ message(STATUS " ROCPRIM_FETCH_METHOD : ${ROCPRIM_FETCH_METHOD}")
3323 message(STATUS " BUILD_TEST : ${BUILD_TEST}")
3324+if(BUILD_TEST)
3325+ message(STATUS " CODE_COVERAGE : ${CODE_COVERAGE}")
3326+endif()
3327 message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}")
3328 message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}")
3329+ message(STATUS " BUILD_OFFLOAD_COMPRESS : ${BUILD_OFFLOAD_COMPRESS}")
3330+ message(STATUS " USE_SYSTEM_LIB : ${USE_SYSTEM_LIB}")
3331+ message(STATUS "")
3332+ message(STATUS "Detailed:")
3333+ message(STATUS " C++ compiler details : \n${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
3334+if(GIT_FOUND)
3335+ message(STATUS " Commit : ${COMMIT_HASH}")
3336+ message(STATUS " ${COMMIT_SUBJECT}")
3337+endif()
3338+if(UNAME_EXECUTABLE)
3339+ message(STATUS " Unix name : ${LINUX_KERNEL_DETAILS}")
3340+endif()
3341 endfunction()
3342diff --git a/debian/changelog b/debian/changelog
3343index a096666..2dc9fc0 100644
3344--- a/debian/changelog
3345+++ b/debian/changelog
3346@@ -1,3 +1,28 @@
3347+hipcub (7.1.0-0ubuntu2) resolute; urgency=medium
3348+
3349+ * Revert to upstream toolchain (from ROCm LLVM fork)
3350+ - d/control: update dependencies and rollback from ROCm fork
3351+ - d/{control, tests/control}: drop ppc64el architecture from tests
3352+
3353+ -- Bruno Bernardo de Moura <bruno.moura@canonical.com> Fri, 23 Jan 2026 10:36:40 -0300
3354+
3355+hipcub (7.1.0-0ubuntu1) resolute; urgency=medium
3356+
3357+ * New upstream version 7.1.0
3358+ * d/p/0001-install-cmake-config-to-share.patch: refresh patch
3359+ * d/p/0002-default-thread-load-store-cache-modifiers-to-off.patch:
3360+ remove patch as suggested on patch description
3361+
3362+ -- Bruno Bernardo de Moura <bruno.moura@canonical.com> Wed, 07 Jan 2026 13:45:35 -0300
3363+
3364+hipcub (6.4.3-2ubuntu1) questing; urgency=medium
3365+
3366+ * d/control: update maintainer field
3367+ * d/rules: fix FTBFS by adding -Wl,--gc-sections to flags
3368+ * d/control: update build-depends
3369+
3370+ -- Igor Luppi <igor.luppi@canonical.com> Tue, 28 Oct 2025 12:39:22 -0300
3371+
3372 hipcub (6.4.3-2) unstable; urgency=medium
3373
3374 * Add d/p/0002-default-thread-load-store-cache-modifiers-to-off.patch
3375diff --git a/debian/control b/debian/control
3376index ced5ff6..33f2971 100644
3377--- a/debian/control
3378+++ b/debian/control
3379@@ -5,17 +5,18 @@ Priority: optional
3380 Standards-Version: 4.7.2
3381 Vcs-Git: https://salsa.debian.org/rocm-team/hipcub.git
3382 Vcs-Browser: https://salsa.debian.org/rocm-team/hipcub
3383-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
3384+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
3385+XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
3386 Uploaders: Cordell Bloor <cgmb@debian.org>,
3387 Christian Kastner <ckk@debian.org>,
3388 Kari Pahula <kaol@debian.org>
3389 Build-Depends: debhelper-compat (= 13),
3390 cmake,
3391- hipcc (>= 7.0.1~),
3392- libamd-comgr-dev (>= 6.4~),
3393+ hipcc (>= 7.1~),
3394+ libamd-comgr-dev (>= 7.1~),
3395 libgtest-dev <!nocheck>,
3396- libhsa-runtime-dev (>= 6.4~),
3397- librocprim-dev (>= 6.4.1~),
3398+ libhsa-runtime-dev (>= 7.1~),
3399+ librocprim-dev (>= 7.1~),
3400 pkg-rocm-tools (>= 0.9.3~),
3401 rocm-cmake
3402 Rules-Requires-Root: no
3403@@ -33,7 +34,7 @@ Description: portable interface for GPU parallel primitives - headers
3404
3405 Package: libhipcub-tests
3406 Section: libdevel
3407-Architecture: amd64 arm64 ppc64el
3408+Architecture: amd64 arm64
3409 XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture}
3410 Build-Profiles: <!nocheck>
3411 Depends: ${misc:Depends}, ${shlibs:Depends}
3412diff --git a/debian/patches/0001-install-cmake-config-to-share.patch b/debian/patches/0001-install-cmake-config-to-share.patch
3413index 2dafdc1..78551be 100644
3414--- a/debian/patches/0001-install-cmake-config-to-share.patch
3415+++ b/debian/patches/0001-install-cmake-config-to-share.patch
3416@@ -10,10 +10,10 @@ libraries, such as libcub-dev.
3417 cmake/ROCMExportTargetsHeaderOnly.cmake | 2 +-
3418 1 file changed, 1 insertion(+), 1 deletion(-)
3419
3420-diff --git a/cmake/ROCMExportTargetsHeaderOnly.cmake b/cmake/ROCMExportTargetsHeaderOnly.cmake
3421-index 3f7d814..2c8088d 100644
3422---- a/cmake/ROCMExportTargetsHeaderOnly.cmake
3423-+++ b/cmake/ROCMExportTargetsHeaderOnly.cmake
3424+Index: hipcub/cmake/ROCMExportTargetsHeaderOnly.cmake
3425+===================================================================
3426+--- hipcub.orig/cmake/ROCMExportTargetsHeaderOnly.cmake
3427++++ hipcub/cmake/ROCMExportTargetsHeaderOnly.cmake
3428 @@ -29,7 +29,7 @@ include(GNUInstallDirs)
3429 include(ROCMPackageConfigHelpers)
3430 include(ROCMInstallTargets)
3431diff --git a/debian/patches/0002-default-thread-load-store-cache-modifiers-to-off.patch b/debian/patches/0002-default-thread-load-store-cache-modifiers-to-off.patch
3432deleted file mode 100644
3433index 7cb5d48..0000000
3434--- a/debian/patches/0002-default-thread-load-store-cache-modifiers-to-off.patch
3435+++ /dev/null
3436@@ -1,41 +0,0 @@
3437-From: Cordell Bloor <cgmb@debian.org>
3438-Date: Fri, 7 Nov 2025 11:40:56 -0700
3439-Subject: default thread load-store cache modifiers to off
3440-
3441-The assembly code used for this is invalid on RDNA 3, but it wasn't
3442-noticed with older compilers. The broken code was entirely removed
3443-by ROCm 7.1, so this patch can be dropped after updating hipcub
3444-to that release.
3445-
3446-Forwarded: not-needed
3447----
3448- hipcub/include/hipcub/thread/thread_load.hpp | 2 +-
3449- hipcub/include/hipcub/thread/thread_store.hpp | 2 +-
3450- 2 files changed, 2 insertions(+), 2 deletions(-)
3451-
3452-diff --git a/hipcub/include/hipcub/thread/thread_load.hpp b/hipcub/include/hipcub/thread/thread_load.hpp
3453-index 89b4b8b..f50239e 100644
3454---- a/hipcub/include/hipcub/thread/thread_load.hpp
3455-+++ b/hipcub/include/hipcub/thread/thread_load.hpp
3456-@@ -33,7 +33,7 @@
3457- #ifdef __HIP_PLATFORM_AMD__
3458-
3459- #ifndef HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS
3460-- #define HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS 1
3461-+ #define HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS 0
3462- #endif
3463-
3464- #include "../backend/rocprim/thread/thread_load.hpp"
3465-diff --git a/hipcub/include/hipcub/thread/thread_store.hpp b/hipcub/include/hipcub/thread/thread_store.hpp
3466-index df61aaa..9af0535 100644
3467---- a/hipcub/include/hipcub/thread/thread_store.hpp
3468-+++ b/hipcub/include/hipcub/thread/thread_store.hpp
3469-@@ -33,7 +33,7 @@
3470- #ifdef __HIP_PLATFORM_AMD__
3471-
3472- #ifndef HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS
3473-- #define HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS 1
3474-+ #define HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS 0
3475- #endif
3476-
3477- #include "../backend/rocprim/thread/thread_store.hpp"
3478diff --git a/debian/patches/series b/debian/patches/series
3479index 4315635..6d27231 100644
3480--- a/debian/patches/series
3481+++ b/debian/patches/series
3482@@ -1,2 +1 @@
3483 0001-install-cmake-config-to-share.patch
3484-0002-default-thread-load-store-cache-modifiers-to-off.patch
3485diff --git a/debian/rules b/debian/rules
3486index 643b44e..6ff7d6b 100755
3487--- a/debian/rules
3488+++ b/debian/rules
3489@@ -2,6 +2,12 @@
3490 export CXX=hipcc
3491 export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto
3492 export DEB_CXXFLAGS_MAINT_PREPEND = -gz -DROCPRIM_NO_DEPRECATION_WARNINGS
3493+# -Wl,--gc-sections:
3494+# Fixes linker errors ("relocation refers to a discarded section") when using gtest.
3495+# The linker's section garbage collection incorrectly discards needed code due to
3496+# a toolchain mismatch (ROCm's clang vs. the system's GCC). This flag forces the
3497+# linker to correctly re-evaluate dependencies, keeping the required sections.
3498+export DEB_LDFLAGS_MAINT_PREPEND = -Wl,--gc-sections
3499 export VERBOSE=1
3500 #export AMD_LOG_LEVEL=4
3501
3502diff --git a/debian/tests/control b/debian/tests/control
3503index 820f22f..ede74de 100644
3504--- a/debian/tests/control
3505+++ b/debian/tests/control
3506@@ -1,4 +1,4 @@
3507 Test-Command: /bin/sh debian/tests/upstream-binaries libhipcub-tests
3508 Depends: libhipcub-tests
3509 Restrictions: allow-stderr, skippable
3510-Architecture: amd64 arm64 ppc64el
3511+Architecture: amd64 arm64
3512diff --git a/docs/conf.py b/docs/conf.py
3513index 133736c..0cf04f8 100644
3514--- a/docs/conf.py
3515+++ b/docs/conf.py
3516@@ -18,7 +18,7 @@ left_nav_title = f"hipCUB {version_number} Documentation"
3517 # for PDF output on Read the Docs
3518 project = "hipCUB Documentation"
3519 author = "Advanced Micro Devices, Inc."
3520-copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
3521+copyright = "Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved."
3522 version = version_number
3523 release = version_number
3524
3525@@ -33,3 +33,7 @@ external_projects_current_project = "hipcub"
3526
3527 for sphinx_var in ROCmDocs.SPHINX_VARS:
3528 globals()[sphinx_var] = getattr(docs_core, sphinx_var)
3529+
3530+# Suppresses "WARNING: toctree directive not expected with external-toc"
3531+# Ideally suppression wouldn't be needed; see sphinx-external-toc#36
3532+suppress_warnings = ["etoc.toctree"]
3533diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
3534index 1f4b7b7..16f2d40 100644
3535--- a/docs/doxygen/Doxyfile
3536+++ b/docs/doxygen/Doxyfile
3537@@ -1,4 +1,4 @@
3538-# Doxyfile 1.8.11
3539+# Doxyfile 1.9.4
3540
3541 # This file describes the settings to be used by the documentation system
3542 # doxygen (www.doxygen.org) for a project.
3543@@ -12,16 +12,25 @@
3544 # For lists, items can also be appended using:
3545 # TAG += value [value, ...]
3546 # Values that contain spaces should be placed between quotes (\" \").
3547+#
3548+# Note:
3549+#
3550+# Use doxygen to compare the used configuration file with the template
3551+# configuration file:
3552+# doxygen -x [configFile]
3553+# Use doxygen to compare the used configuration file with the template
3554+# configuration file without replacing the environment variables:
3555+# doxygen -x_noenv [configFile]
3556
3557 #---------------------------------------------------------------------------
3558 # Project related configuration options
3559 #---------------------------------------------------------------------------
3560
3561-# This tag specifies the encoding used for all characters in the config file
3562-# that follow. The default is UTF-8 which is also the encoding used for all text
3563-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
3564-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
3565-# for the list of possible encodings.
3566+# This tag specifies the encoding used for all characters in the configuration
3567+# file that follow. The default is UTF-8 which is also the encoding used for all
3568+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
3569+# iconv built into libc) for the transcoding. See
3570+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
3571 # The default value is: UTF-8.
3572
3573 DOXYFILE_ENCODING = UTF-8
3574@@ -60,16 +69,28 @@ PROJECT_LOGO =
3575
3576 OUTPUT_DIRECTORY = .
3577
3578-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
3579-# directories (in 2 levels) under the output directory of each output format and
3580-# will distribute the generated files over these directories. Enabling this
3581+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096
3582+# sub-directories (in 2 levels) under the output directory of each output format
3583+# and will distribute the generated files over these directories. Enabling this
3584 # option can be useful when feeding doxygen a huge amount of source files, where
3585 # putting all generated files in the same directory would otherwise causes
3586-# performance problems for the file system.
3587+# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to
3588+# control the number of sub-directories.
3589 # The default value is: NO.
3590
3591 CREATE_SUBDIRS = NO
3592
3593+# Controls the number of sub-directories that will be created when
3594+# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every
3595+# level increment doubles the number of directories, resulting in 4096
3596+# directories at level 8 which is the default and also the maximum value. The
3597+# sub-directories are organized in 2 levels, the first level always has a fixed
3598+# number of 16 directories.
3599+# Minimum value: 0, maximum value: 8, default value: 8.
3600+# This tag requires that the tag CREATE_SUBDIRS is set to YES.
3601+
3602+CREATE_SUBDIRS_LEVEL = 8
3603+
3604 # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
3605 # characters to appear in the names of generated files. If set to NO, non-ASCII
3606 # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
3607@@ -81,14 +102,14 @@ ALLOW_UNICODE_NAMES = NO
3608 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
3609 # documentation generated by doxygen is written. Doxygen will use this
3610 # information to generate all constant output in the proper language.
3611-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
3612-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
3613-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
3614-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
3615-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
3616-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
3617-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
3618-# Ukrainian and Vietnamese.
3619+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian,
3620+# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English
3621+# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek,
3622+# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with
3623+# English messages), Korean, Korean-en (Korean with English messages), Latvian,
3624+# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese,
3625+# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish,
3626+# Swedish, Turkish, Ukrainian and Vietnamese.
3627 # The default value is: English.
3628
3629 OUTPUT_LANGUAGE = English
3630@@ -118,7 +139,6 @@ REPEAT_BRIEF = YES
3631 # the entity):The $name class, The $name widget, The $name file, is, provides,
3632 # specifies, contains, represents, a, an and the.
3633
3634-
3635 ABBREVIATE_BRIEF = "The $name class" \
3636 "The $name widget" \
3637 "The $name file" \
3638@@ -190,6 +210,16 @@ SHORT_NAMES = NO
3639
3640 JAVADOC_AUTOBRIEF = NO
3641
3642+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
3643+# such as
3644+# /***************
3645+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
3646+# Javadoc-style will behave just like regular comments and it will not be
3647+# interpreted by doxygen.
3648+# The default value is: NO.
3649+
3650+JAVADOC_BANNER = NO
3651+
3652 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
3653 # line (until the first dot) of a Qt-style comment as the brief description. If
3654 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
3655@@ -210,6 +240,14 @@ QT_AUTOBRIEF = NO
3656
3657 MULTILINE_CPP_IS_BRIEF = NO
3658
3659+# By default Python docstrings are displayed as preformatted text and doxygen's
3660+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
3661+# doxygen's special commands can be used and the contents of the docstring
3662+# documentation blocks is shown as doxygen documentation.
3663+# The default value is: YES.
3664+
3665+PYTHON_DOCSTRING = YES
3666+
3667 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
3668 # documentation from any documented member that it re-implements.
3669 # The default value is: YES.
3670@@ -233,20 +271,19 @@ TAB_SIZE = 4
3671 # the documentation. An alias has the form:
3672 # name=value
3673 # For example adding
3674-# "sideeffect=@par Side Effects:\n"
3675+# "sideeffect=@par Side Effects:^^"
3676 # will allow you to put the command \sideeffect (or @sideeffect) in the
3677 # documentation, which will result in a user-defined paragraph with heading
3678-# "Side Effects:". You can put \n's in the value part of an alias to insert
3679-# newlines.
3680+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
3681+# to insert newlines (in the resulting output). You can put ^^ in the value part
3682+# of an alias to insert a newline as if a physical newline was in the original
3683+# file. When you need a literal { or } or , in the value part of an alias you
3684+# have to escape them by means of a backslash (\), this can lead to conflicts
3685+# with the commands \{ and \} for these it is advised to use the version @{ and
3686+# @} or use a double escape (\\{ and \\})
3687
3688 ALIASES =
3689
3690-# This tag can be used to specify a number of word-keyword mappings (TCL only).
3691-# A mapping has the form "name=value". For example adding "class=itcl::class"
3692-# will allow you to use the command class in the itcl::class meaning.
3693-
3694-TCL_SUBST =
3695-
3696 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
3697 # only. Doxygen will then generate output that is more tailored for C. For
3698 # instance, some of the names that are used will be different. The list of all
3699@@ -275,28 +312,40 @@ OPTIMIZE_FOR_FORTRAN = NO
3700
3701 OPTIMIZE_OUTPUT_VHDL = NO
3702
3703+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
3704+# sources only. Doxygen will then generate output that is more tailored for that
3705+# language. For instance, namespaces will be presented as modules, types will be
3706+# separated into more groups, etc.
3707+# The default value is: NO.
3708+
3709+OPTIMIZE_OUTPUT_SLICE = NO
3710+
3711 # Doxygen selects the parser to use depending on the extension of the files it
3712 # parses. With this tag you can assign which parser to use for a given
3713 # extension. Doxygen has a built-in mapping, but you can override or extend it
3714 # using this tag. The format is ext=language, where ext is a file extension, and
3715-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
3716-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
3717-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
3718-# Fortran. In the later case the parser tries to guess whether the code is fixed
3719-# or free formatted code, this is the default for Fortran type files), VHDL. For
3720-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
3721-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
3722+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
3723+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
3724+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
3725+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
3726+# tries to guess whether the code is fixed or free formatted code, this is the
3727+# default for Fortran type files). For instance to make doxygen treat .inc files
3728+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
3729+# use: inc=Fortran f=C.
3730 #
3731 # Note: For files without extension you can use no_extension as a placeholder.
3732 #
3733 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
3734-# the files are not read by doxygen.
3735+# the files are not read by doxygen. When specifying no_extension you should add
3736+# * to the FILE_PATTERNS.
3737+#
3738+# Note see also the list of default file extension mappings.
3739
3740 EXTENSION_MAPPING =
3741
3742 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
3743 # according to the Markdown format, which allows for more readable
3744-# documentation. See http://daringfireball.net/projects/markdown/ for details.
3745+# documentation. See https://daringfireball.net/projects/markdown/ for details.
3746 # The output of markdown processing is further processed by doxygen, so you can
3747 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
3748 # case of backward compatibilities issues.
3749@@ -304,6 +353,15 @@ EXTENSION_MAPPING =
3750
3751 MARKDOWN_SUPPORT = YES
3752
3753+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
3754+# to that level are automatically included in the table of contents, even if
3755+# they do not have an id attribute.
3756+# Note: This feature currently applies only to Markdown headings.
3757+# Minimum value: 0, maximum value: 99, default value: 5.
3758+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
3759+
3760+TOC_INCLUDE_HEADINGS = 5
3761+
3762 # When enabled doxygen tries to link words that correspond to documented
3763 # classes, or namespaces to their corresponding documentation. Such a link can
3764 # be prevented in individual cases by putting a % sign in front of the word or
3765@@ -329,7 +387,7 @@ BUILTIN_STL_SUPPORT = NO
3766 CPP_CLI_SUPPORT = NO
3767
3768 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
3769-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
3770+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
3771 # will parse them like normal C++ but will assume all classes use public instead
3772 # of private inheritance when no explicit protection keyword is present.
3773 # The default value is: NO.
3774@@ -415,6 +473,19 @@ TYPEDEF_HIDES_STRUCT = NO
3775
3776 LOOKUP_CACHE_SIZE = 0
3777
3778+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
3779+# during processing. When set to 0 doxygen will base this on the number of
3780+# cores available in the system. You can set it explicitly to a value larger
3781+# than 0 to get more control over the balance between CPU load and processing
3782+# speed. At this moment only the input processing can be done using multiple
3783+# threads. Since this is still an experimental feature the default is set to 1,
3784+# which effectively disables parallel processing. Please report any issues you
3785+# encounter. Generating dot graphs in parallel is controlled by the
3786+# DOT_NUM_THREADS setting.
3787+# Minimum value: 0, maximum value: 32, default value: 1.
3788+
3789+NUM_PROC_THREADS = 1
3790+
3791 #---------------------------------------------------------------------------
3792 # Build related configuration options
3793 #---------------------------------------------------------------------------
3794@@ -435,6 +506,12 @@ EXTRACT_ALL = NO
3795
3796 EXTRACT_PRIVATE = NO
3797
3798+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
3799+# methods of a class will be included in the documentation.
3800+# The default value is: NO.
3801+
3802+EXTRACT_PRIV_VIRTUAL = NO
3803+
3804 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
3805 # scope will be included in the documentation.
3806 # The default value is: NO.
3807@@ -472,6 +549,13 @@ EXTRACT_LOCAL_METHODS = NO
3808
3809 EXTRACT_ANON_NSPACES = NO
3810
3811+# If this flag is set to YES, the name of an unnamed parameter in a declaration
3812+# will be determined by the corresponding definition. By default unnamed
3813+# parameters remain unnamed in the output.
3814+# The default value is: YES.
3815+
3816+RESOLVE_UNNAMED_PARAMS = YES
3817+
3818 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
3819 # undocumented members inside documented classes or files. If set to NO these
3820 # members will be included in the various overviews, but no documentation
3821@@ -489,8 +573,8 @@ HIDE_UNDOC_MEMBERS = NO
3822 HIDE_UNDOC_CLASSES = NO
3823
3824 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
3825-# (class|struct|union) declarations. If set to NO, these declarations will be
3826-# included in the documentation.
3827+# declarations. If set to NO, these declarations will be included in the
3828+# documentation.
3829 # The default value is: NO.
3830
3831 HIDE_FRIEND_COMPOUNDS = NO
3832@@ -509,11 +593,18 @@ HIDE_IN_BODY_DOCS = NO
3833
3834 INTERNAL_DOCS = NO
3835
3836-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
3837-# names in lower-case letters. If set to YES, upper-case letters are also
3838-# allowed. This is useful if you have classes or files whose names only differ
3839-# in case and if your file system supports case sensitive file names. Windows
3840-# and Mac users are advised to set this option to NO.
3841+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
3842+# able to match the capabilities of the underlying filesystem. In case the
3843+# filesystem is case sensitive (i.e. it supports files in the same directory
3844+# whose names only differ in casing), the option must be set to YES to properly
3845+# deal with such files in case they appear in the input. For filesystems that
3846+# are not case sensitive the option should be set to NO to properly deal with
3847+# output files written for symbols that only differ in casing, such as for two
3848+# classes, one named CLASS and the other named Class, and to also support
3849+# references to files without having to specify the exact matching casing. On
3850+# Windows (including Cygwin) and MacOS, users should typically set this option
3851+# to NO, whereas on Linux or other Unix flavors it should typically be set to
3852+# YES.
3853 # The default value is: system dependent.
3854
3855 CASE_SENSE_NAMES = YES
3856@@ -532,6 +623,12 @@ HIDE_SCOPE_NAMES = NO
3857
3858 HIDE_COMPOUND_REFERENCE= NO
3859
3860+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
3861+# will show which file needs to be included to use the class.
3862+# The default value is: YES.
3863+
3864+SHOW_HEADERFILE = YES
3865+
3866 # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
3867 # the files that are included by a file in the documentation of that file.
3868 # The default value is: YES.
3869@@ -689,7 +786,8 @@ FILE_VERSION_FILTER =
3870 # output files in an output format independent way. To create the layout file
3871 # that represents doxygen's defaults, run doxygen with the -l option. You can
3872 # optionally specify a file name after the option, if omitted DoxygenLayout.xml
3873-# will be used as the name of the layout file.
3874+# will be used as the name of the layout file. See also section "Changing the
3875+# layout of pages" for information.
3876 #
3877 # Note that if you run doxygen from a directory containing a file called
3878 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
3879@@ -700,7 +798,7 @@ LAYOUT_FILE =
3880 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
3881 # the reference definitions. This must be a list of .bib files. The .bib
3882 # extension is automatically appended if omitted. This requires the bibtex tool
3883-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
3884+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
3885 # For LaTeX the style of the bibliography can be controlled using
3886 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
3887 # search path. See also \cite for info how to create references.
3888@@ -735,23 +833,35 @@ WARNINGS = YES
3889 WARN_IF_UNDOCUMENTED = YES
3890
3891 # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
3892-# potential errors in the documentation, such as not documenting some parameters
3893-# in a documented function, or documenting parameters that don't exist or using
3894-# markup commands wrongly.
3895+# potential errors in the documentation, such as documenting some parameters in
3896+# a documented function twice, or documenting parameters that don't exist or
3897+# using markup commands wrongly.
3898 # The default value is: YES.
3899
3900 WARN_IF_DOC_ERROR = YES
3901
3902+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
3903+# function parameter documentation. If set to NO, doxygen will accept that some
3904+# parameters have no documentation without warning.
3905+# The default value is: YES.
3906+
3907+WARN_IF_INCOMPLETE_DOC = YES
3908+
3909 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
3910 # are documented, but have no documentation for their parameters or return
3911-# value. If set to NO, doxygen will only warn about wrong or incomplete
3912-# parameter documentation, but not about the absence of documentation.
3913+# value. If set to NO, doxygen will only warn about wrong parameter
3914+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
3915+# set to YES then this flag will automatically be disabled. See also
3916+# WARN_IF_INCOMPLETE_DOC
3917 # The default value is: NO.
3918
3919 WARN_NO_PARAMDOC = NO
3920
3921 # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
3922-# a warning is encountered.
3923+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
3924+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
3925+# at the end of the doxygen process doxygen will return with a non-zero status.
3926+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
3927 # The default value is: NO.
3928
3929 WARN_AS_ERROR = YES
3930@@ -762,13 +872,27 @@ WARN_AS_ERROR = YES
3931 # and the warning text. Optionally the format may contain $version, which will
3932 # be replaced by the version of the file (if it could be obtained via
3933 # FILE_VERSION_FILTER)
3934+# See also: WARN_LINE_FORMAT
3935 # The default value is: $file:$line: $text.
3936
3937 WARN_FORMAT = "$file:$line: $text"
3938
3939+# In the $text part of the WARN_FORMAT command it is possible that a reference
3940+# to a more specific place is given. To make it easier to jump to this place
3941+# (outside of doxygen) the user can define a custom "cut" / "paste" string.
3942+# Example:
3943+# WARN_LINE_FORMAT = "'vi $file +$line'"
3944+# See also: WARN_FORMAT
3945+# The default value is: at line $line of file $file.
3946+
3947+WARN_LINE_FORMAT = "at line $line of file $file"
3948+
3949 # The WARN_LOGFILE tag can be used to specify a file to which warning and error
3950 # messages should be written. If left blank the output is written to standard
3951-# error (stderr).
3952+# error (stderr). In case the file specified cannot be opened for writing the
3953+# warning and error messages are written to standard error. When - is
3954+# specified as the file, the warning and error messages are written to
3955+# standard output (stdout).
3956
3957 WARN_LOGFILE =
3958
3959@@ -787,8 +911,8 @@ INPUT = ../../hipcub/include/hipcub
3960 # This tag can be used to specify the character encoding of the source files
3961 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
3962 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
3963-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
3964-# possible encodings.
3965+# documentation (see:
3966+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
3967 # The default value is: UTF-8.
3968
3969 INPUT_ENCODING = UTF-8
3970@@ -801,11 +925,15 @@ INPUT_ENCODING = UTF-8
3971 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
3972 # read by doxygen.
3973 #
3974+# Note the list of default checked file patterns might differ from the list of
3975+# default file extension mappings.
3976+#
3977 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
3978 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
3979-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
3980-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl,
3981-# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js.
3982+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
3983+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
3984+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
3985+# *.vhdl, *.ucf, *.qsf and *.ice.
3986
3987 FILE_PATTERNS =
3988
3989@@ -838,13 +966,14 @@ EXCLUDE_SYMLINKS = NO
3990 # Note that the wildcards are matched against the file with absolute path, so to
3991 # exclude all test directories for example use the pattern */test/*
3992
3993-EXCLUDE_PATTERNS = */detail/*,*/backend/*
3994+EXCLUDE_PATTERNS = */detail/* \
3995+ */backend/*
3996
3997 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
3998 # (namespaces, classes, functions, etc.) that should be excluded from the
3999 # output. The symbol name can be a fully qualified name, a word, or if the
4000 # wildcard * is used, a substring. Examples: ANamespace, AClass,
4001-# AClass::ANamespace, ANamespace::*Test
4002+# ANamespace::AClass, ANamespace::*Test
4003 #
4004 # Note that the wildcards are matched against the file with absolute path, so to
4005 # exclude all test directories use the pattern */test/*
4006@@ -960,7 +1089,7 @@ INLINE_SOURCES = NO
4007 STRIP_CODE_COMMENTS = YES
4008
4009 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
4010-# function all documented functions referencing it will be listed.
4011+# entity all documented functions referencing it will be listed.
4012 # The default value is: NO.
4013
4014 REFERENCED_BY_RELATION = NO
4015@@ -992,12 +1121,12 @@ SOURCE_TOOLTIPS = YES
4016 # If the USE_HTAGS tag is set to YES then the references to source code will
4017 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
4018 # source browser. The htags tool is part of GNU's global source tagging system
4019-# (see http://www.gnu.org/software/global/global.html). You will need version
4020+# (see https://www.gnu.org/software/global/global.html). You will need version
4021 # 4.8.6 or higher.
4022 #
4023 # To use it do the following:
4024 # - Install the latest version of global
4025-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
4026+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
4027 # - Make sure the INPUT points to the root of the source tree
4028 # - Run doxygen as normal
4029 #
4030@@ -1020,16 +1149,24 @@ USE_HTAGS = NO
4031 VERBATIM_HEADERS = YES
4032
4033 # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
4034-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
4035-# cost of reduced performance. This can be particularly helpful with template
4036-# rich C++ code for which doxygen's built-in parser lacks the necessary type
4037-# information.
4038+# clang parser (see:
4039+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
4040+# performance. This can be particularly helpful with template rich C++ code for
4041+# which doxygen's built-in parser lacks the necessary type information.
4042 # Note: The availability of this option depends on whether or not doxygen was
4043-# generated with the -Duse-libclang=ON option for CMake.
4044+# generated with the -Duse_libclang=ON option for CMake.
4045 # The default value is: NO.
4046
4047 CLANG_ASSISTED_PARSING = NO
4048
4049+# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS
4050+# tag is set to YES then doxygen will add the directory of each input to the
4051+# include path.
4052+# The default value is: YES.
4053+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
4054+
4055+CLANG_ADD_INC_PATHS = YES
4056+
4057 # If clang assisted parsing is enabled you can provide the compiler with command
4058 # line options that you would normally use when invoking the compiler. Note that
4059 # the include paths will already be set by doxygen for the files and directories
4060@@ -1038,6 +1175,19 @@ CLANG_ASSISTED_PARSING = NO
4061
4062 CLANG_OPTIONS =
4063
4064+# If clang assisted parsing is enabled you can provide the clang parser with the
4065+# path to the directory containing a file called compile_commands.json. This
4066+# file is the compilation database (see:
4067+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
4068+# options used when the source files were built. This is equivalent to
4069+# specifying the -p option to a clang tool, such as clang-check. These options
4070+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
4071+# will be added as well.
4072+# Note: The availability of this option depends on whether or not doxygen was
4073+# generated with the -Duse_libclang=ON option for CMake.
4074+
4075+CLANG_DATABASE_PATH =
4076+
4077 #---------------------------------------------------------------------------
4078 # Configuration options related to the alphabetical class index
4079 #---------------------------------------------------------------------------
4080@@ -1049,13 +1199,6 @@ CLANG_OPTIONS =
4081
4082 ALPHABETICAL_INDEX = NO
4083
4084-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
4085-# which the alphabetical index list will be split.
4086-# Minimum value: 1, maximum value: 20, default value: 5.
4087-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
4088-
4089-COLS_IN_ALPHA_INDEX = 5
4090-
4091 # In case all classes in a project start with a common prefix, all classes will
4092 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
4093 # can be used to specify a prefix (or a list of prefixes) that should be ignored
4094@@ -1155,8 +1298,8 @@ HTML_EXTRA_FILES =
4095
4096 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
4097 # will adjust the colors in the style sheet and background images according to
4098-# this color. Hue is specified as an angle on a colorwheel, see
4099-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
4100+# this color. Hue is specified as an angle on a color-wheel, see
4101+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
4102 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
4103 # purple, and 360 is red again.
4104 # Minimum value: 0, maximum value: 359, default value: 220.
4105@@ -1165,7 +1308,7 @@ HTML_EXTRA_FILES =
4106 HTML_COLORSTYLE_HUE = 220
4107
4108 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
4109-# in the HTML output. For a value of 0 the output will use grayscales only. A
4110+# in the HTML output. For a value of 0 the output will use gray-scales only. A
4111 # value of 255 will produce the most vivid colors.
4112 # Minimum value: 0, maximum value: 255, default value: 100.
4113 # This tag requires that the tag GENERATE_HTML is set to YES.
4114@@ -1192,6 +1335,17 @@ HTML_COLORSTYLE_GAMMA = 80
4115
4116 HTML_TIMESTAMP = NO
4117
4118+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
4119+# documentation will contain a main index with vertical navigation menus that
4120+# are dynamically created via JavaScript. If disabled, the navigation index will
4121+# consist of multiple levels of tabs that are statically embedded in every HTML
4122+# page. Disable this option to support browsers that do not have JavaScript,
4123+# like the Qt help browser.
4124+# The default value is: YES.
4125+# This tag requires that the tag GENERATE_HTML is set to YES.
4126+
4127+HTML_DYNAMIC_MENUS = YES
4128+
4129 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
4130 # documentation will contain sections that can be hidden and shown after the
4131 # page has loaded.
4132@@ -1215,13 +1369,14 @@ HTML_INDEX_NUM_ENTRIES = 100
4133
4134 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
4135 # generated that can be used as input for Apple's Xcode 3 integrated development
4136-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
4137-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
4138-# Makefile in the HTML output directory. Running make will produce the docset in
4139-# that directory and running make install will install the docset in
4140+# environment (see:
4141+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
4142+# create a documentation set, doxygen will generate a Makefile in the HTML
4143+# output directory. Running make will produce the docset in that directory and
4144+# running make install will install the docset in
4145 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
4146-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
4147-# for more information.
4148+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
4149+# genXcode/_index.html for more information.
4150 # The default value is: NO.
4151 # This tag requires that the tag GENERATE_HTML is set to YES.
4152
4153@@ -1235,6 +1390,13 @@ GENERATE_DOCSET = NO
4154
4155 DOCSET_FEEDNAME = "Doxygen generated docs"
4156
4157+# This tag determines the URL of the docset feed. A documentation feed provides
4158+# an umbrella under which multiple documentation sets from a single provider
4159+# (such as a company or product suite) can be grouped.
4160+# This tag requires that the tag GENERATE_DOCSET is set to YES.
4161+
4162+DOCSET_FEEDURL =
4163+
4164 # This tag specifies a string that should uniquely identify the documentation
4165 # set bundle. This should be a reverse domain-name style string, e.g.
4166 # com.mycompany.MyDocSet. Doxygen will append .docset to the name.
4167@@ -1260,8 +1422,12 @@ DOCSET_PUBLISHER_NAME = Publisher
4168 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
4169 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
4170 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
4171-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
4172-# Windows.
4173+# on Windows. In the beginning of 2021 Microsoft took the original page, with
4174+# a.o. the download links, offline (the HTML help workshop was already many years
4175+# in maintenance mode). You can download the HTML help workshop from the web
4176+# archives at Installation executable (see:
4177+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
4178+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
4179 #
4180 # The HTML Help Workshop contains a compiler that can convert all HTML output
4181 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
4182@@ -1291,7 +1457,7 @@ CHM_FILE =
4183 HHC_LOCATION =
4184
4185 # The GENERATE_CHI flag controls if a separate .chi index file is generated
4186-# (YES) or that it should be included in the master .chm file (NO).
4187+# (YES) or that it should be included in the main .chm file (NO).
4188 # The default value is: NO.
4189 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
4190
4191@@ -1336,7 +1502,8 @@ QCH_FILE =
4192
4193 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
4194 # Project output. For more information please see Qt Help Project / Namespace
4195-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
4196+# (see:
4197+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
4198 # The default value is: org.doxygen.Project.
4199 # This tag requires that the tag GENERATE_QHP is set to YES.
4200
4201@@ -1344,8 +1511,8 @@ QHP_NAMESPACE =
4202
4203 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
4204 # Help Project output. For more information please see Qt Help Project / Virtual
4205-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
4206-# folders).
4207+# Folders (see:
4208+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
4209 # The default value is: doc.
4210 # This tag requires that the tag GENERATE_QHP is set to YES.
4211
4212@@ -1353,30 +1520,30 @@ QHP_VIRTUAL_FOLDER = doc
4213
4214 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
4215 # filter to add. For more information please see Qt Help Project / Custom
4216-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
4217-# filters).
4218+# Filters (see:
4219+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
4220 # This tag requires that the tag GENERATE_QHP is set to YES.
4221
4222 QHP_CUST_FILTER_NAME =
4223
4224 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
4225 # custom filter to add. For more information please see Qt Help Project / Custom
4226-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
4227-# filters).
4228+# Filters (see:
4229+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
4230 # This tag requires that the tag GENERATE_QHP is set to YES.
4231
4232 QHP_CUST_FILTER_ATTRS =
4233
4234 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
4235 # project's filter section matches. Qt Help Project / Filter Attributes (see:
4236-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
4237+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
4238 # This tag requires that the tag GENERATE_QHP is set to YES.
4239
4240 QHP_SECT_FILTER_ATTRS =
4241
4242-# The QHG_LOCATION tag can be used to specify the location of Qt's
4243-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
4244-# generated .qhp file.
4245+# The QHG_LOCATION tag can be used to specify the location (absolute path
4246+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
4247+# run qhelpgenerator on the generated .qhp file.
4248 # This tag requires that the tag GENERATE_QHP is set to YES.
4249
4250 QHG_LOCATION =
4251@@ -1419,16 +1586,28 @@ DISABLE_INDEX = NO
4252 # to work a browser that supports JavaScript, DHTML, CSS and frames is required
4253 # (i.e. any modern browser). Windows users are probably better off using the
4254 # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
4255-# further fine-tune the look of the index. As an example, the default style
4256-# sheet generated by doxygen has an example that shows how to put an image at
4257-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
4258-# the same information as the tab index, you could consider setting
4259-# DISABLE_INDEX to YES when enabling this option.
4260+# further fine tune the look of the index (see "Fine-tuning the output"). As an
4261+# example, the default style sheet generated by doxygen has an example that
4262+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
4263+# Since the tree basically has the same information as the tab index, you could
4264+# consider setting DISABLE_INDEX to YES when enabling this option.
4265 # The default value is: NO.
4266 # This tag requires that the tag GENERATE_HTML is set to YES.
4267
4268 GENERATE_TREEVIEW = NO
4269
4270+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
4271+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
4272+# area (value NO) or if it should extend to the full height of the window (value
4273+# YES). Setting this to YES gives a layout similar to
4274+# https://docs.readthedocs.io with more room for contents, but less room for the
4275+# project logo, title, and description. If either GENERATE_TREEVIEW or
4276+# DISABLE_INDEX is set to NO, this option has no effect.
4277+# The default value is: NO.
4278+# This tag requires that the tag GENERATE_HTML is set to YES.
4279+
4280+FULL_SIDEBAR = NO
4281+
4282 # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
4283 # doxygen will group on one line in the generated HTML documentation.
4284 #
4285@@ -1453,6 +1632,24 @@ TREEVIEW_WIDTH = 250
4286
4287 EXT_LINKS_IN_WINDOW = NO
4288
4289+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
4290+# addresses.
4291+# The default value is: YES.
4292+# This tag requires that the tag GENERATE_HTML is set to YES.
4293+
4294+OBFUSCATE_EMAILS = YES
4295+
4296+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
4297+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
4298+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
4299+# the HTML output. These images will generally look nicer at scaled resolutions.
4300+# Possible values are: png (the default) and svg (looks nicer but requires the
4301+# pdf2svg or inkscape tool).
4302+# The default value is: png.
4303+# This tag requires that the tag GENERATE_HTML is set to YES.
4304+
4305+HTML_FORMULA_FORMAT = png
4306+
4307 # Use this tag to change the font size of LaTeX formulas included as images in
4308 # the HTML documentation. When you change the font size after a successful
4309 # doxygen run you need to manually remove any form_*.png images from the HTML
4310@@ -1462,7 +1659,7 @@ EXT_LINKS_IN_WINDOW = NO
4311
4312 FORMULA_FONTSIZE = 10
4313
4314-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
4315+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
4316 # generated for formulas are transparent PNGs. Transparent PNGs are not
4317 # supported properly for IE 6.0, but are supported on all modern browsers.
4318 #
4319@@ -1473,8 +1670,14 @@ FORMULA_FONTSIZE = 10
4320
4321 FORMULA_TRANSPARENT = YES
4322
4323+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
4324+# to create new LaTeX commands to be used in formulas as building blocks. See
4325+# the section "Including formulas" for details.
4326+
4327+FORMULA_MACROFILE =
4328+
4329 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
4330-# http://www.mathjax.org) which uses client side Javascript for the rendering
4331+# https://www.mathjax.org) which uses client side JavaScript for the rendering
4332 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
4333 # installed or if you want formulas to look prettier in the HTML output. When
4334 # enabled you may also need to install MathJax separately and configure the path
4335@@ -1484,11 +1687,29 @@ FORMULA_TRANSPARENT = YES
4336
4337 USE_MATHJAX = YES
4338
4339+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
4340+# Note that the different versions of MathJax have different requirements with
4341+# regards to the different settings, so it is possible that also other MathJax
4342+# settings have to be changed when switching between the different MathJax
4343+# versions.
4344+# Possible values are: MathJax_2 and MathJax_3.
4345+# The default value is: MathJax_2.
4346+# This tag requires that the tag USE_MATHJAX is set to YES.
4347+
4348+MATHJAX_VERSION = MathJax_2
4349+
4350 # When MathJax is enabled you can set the default output format to be used for
4351-# the MathJax output. See the MathJax site (see:
4352-# http://docs.mathjax.org/en/latest/output.html) for more details.
4353+# the MathJax output. For more details about the output format see MathJax
4354+# version 2 (see:
4355+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
4356+# (see:
4357+# http://docs.mathjax.org/en/latest/web/components/output.html).
4358 # Possible values are: HTML-CSS (which is slower, but has the best
4359-# compatibility), NativeMML (i.e. MathML) and SVG.
4360+# compatibility. This is the name for MathJax version 2, for MathJax version 3
4361+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
4362+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
4363+# is the name for MathJax version 3, for MathJax version 2 this will be
4364+# translated into HTML-CSS) and SVG.
4365 # The default value is: HTML-CSS.
4366 # This tag requires that the tag USE_MATHJAX is set to YES.
4367
4368@@ -1501,22 +1722,29 @@ MATHJAX_FORMAT = HTML-CSS
4369 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
4370 # Content Delivery Network so you can quickly see the result without installing
4371 # MathJax. However, it is strongly recommended to install a local copy of
4372-# MathJax from http://www.mathjax.org before deployment.
4373-# The default value is: http://cdn.mathjax.org/mathjax/latest.
4374+# MathJax from https://www.mathjax.org before deployment. The default value is:
4375+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
4376+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
4377 # This tag requires that the tag USE_MATHJAX is set to YES.
4378
4379 MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
4380
4381 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
4382 # extension names that should be enabled during MathJax rendering. For example
4383+# for MathJax version 2 (see https://docs.mathjax.org/en/v2.7-latest/tex.html
4384+# #tex-and-latex-extensions):
4385 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
4386+# For example for MathJax version 3 (see
4387+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
4388+# MATHJAX_EXTENSIONS = ams
4389 # This tag requires that the tag USE_MATHJAX is set to YES.
4390
4391 MATHJAX_EXTENSIONS =
4392
4393 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
4394 # of code that will be used on startup of the MathJax code. See the MathJax site
4395-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
4396+# (see:
4397+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
4398 # example see the documentation.
4399 # This tag requires that the tag USE_MATHJAX is set to YES.
4400
4401@@ -1544,7 +1772,7 @@ MATHJAX_CODEFILE =
4402 SEARCHENGINE = NO
4403
4404 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
4405-# implemented using a web server instead of a web client using Javascript. There
4406+# implemented using a web server instead of a web client using JavaScript. There
4407 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
4408 # setting. When disabled, doxygen will generate a PHP script for searching and
4409 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
4410@@ -1563,7 +1791,8 @@ SERVER_BASED_SEARCH = NO
4411 #
4412 # Doxygen ships with an example indexer (doxyindexer) and search engine
4413 # (doxysearch.cgi) which are based on the open source search engine library
4414-# Xapian (see: http://xapian.org/).
4415+# Xapian (see:
4416+# https://xapian.org/).
4417 #
4418 # See the section "External Indexing and Searching" for details.
4419 # The default value is: NO.
4420@@ -1576,8 +1805,9 @@ EXTERNAL_SEARCH = NO
4421 #
4422 # Doxygen ships with an example indexer (doxyindexer) and search engine
4423 # (doxysearch.cgi) which are based on the open source search engine library
4424-# Xapian (see: http://xapian.org/). See the section "External Indexing and
4425-# Searching" for details.
4426+# Xapian (see:
4427+# https://xapian.org/). See the section "External Indexing and Searching" for
4428+# details.
4429 # This tag requires that the tag SEARCHENGINE is set to YES.
4430
4431 SEARCHENGINE_URL =
4432@@ -1628,21 +1858,35 @@ LATEX_OUTPUT = latex
4433 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
4434 # invoked.
4435 #
4436-# Note that when enabling USE_PDFLATEX this option is only used for generating
4437-# bitmaps for formulas in the HTML output, but not in the Makefile that is
4438-# written to the output directory.
4439-# The default file is: latex.
4440+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
4441+# USE_PDFLATEX the default is pdflatex and when in the latter case latex is
4442+# chosen this is overwritten by pdflatex. For specific output languages the
4443+# default can have been set differently, this depends on the implementation of
4444+# the output language.
4445 # This tag requires that the tag GENERATE_LATEX is set to YES.
4446
4447 LATEX_CMD_NAME = latex
4448
4449 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
4450 # index for LaTeX.
4451+# Note: This tag is used in the Makefile / make.bat.
4452+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
4453+# (.tex).
4454 # The default file is: makeindex.
4455 # This tag requires that the tag GENERATE_LATEX is set to YES.
4456
4457 MAKEINDEX_CMD_NAME = makeindex
4458
4459+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
4460+# generate index for LaTeX. In case there is no backslash (\) as first character
4461+# it will be automatically added in the LaTeX code.
4462+# Note: This tag is used in the generated output file (.tex).
4463+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
4464+# The default value is: makeindex.
4465+# This tag requires that the tag GENERATE_LATEX is set to YES.
4466+
4467+LATEX_MAKEINDEX_CMD = makeindex
4468+
4469 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
4470 # documents. This may be useful for small projects and may help to save some
4471 # trees in general.
4472@@ -1672,29 +1916,31 @@ PAPER_TYPE = a4
4473
4474 EXTRA_PACKAGES =
4475
4476-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
4477-# generated LaTeX document. The header should contain everything until the first
4478-# chapter. If it is left blank doxygen will generate a standard header. See
4479-# section "Doxygen usage" for information on how to let doxygen write the
4480-# default header to a separate file.
4481+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
4482+# the generated LaTeX document. The header should contain everything until the
4483+# first chapter. If it is left blank doxygen will generate a standard header. It
4484+# is highly recommended to start with a default header using
4485+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
4486+# and then modify the file new_header.tex. See also section "Doxygen usage" for
4487+# information on how to generate the default header that doxygen normally uses.
4488 #
4489-# Note: Only use a user-defined header if you know what you are doing! The
4490-# following commands have a special meaning inside the header: $title,
4491-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
4492-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
4493-# string, for the replacement values of the other commands the user is referred
4494-# to HTML_HEADER.
4495+# Note: Only use a user-defined header if you know what you are doing!
4496+# Note: The header is subject to change so you typically have to regenerate the
4497+# default header when upgrading to a newer version of doxygen. The following
4498+# commands have a special meaning inside the header (and footer): For a
4499+# description of the possible markers and block names see the documentation.
4500 # This tag requires that the tag GENERATE_LATEX is set to YES.
4501
4502 LATEX_HEADER =
4503
4504-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
4505-# generated LaTeX document. The footer should contain everything after the last
4506-# chapter. If it is left blank doxygen will generate a standard footer. See
4507+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
4508+# the generated LaTeX document. The footer should contain everything after the
4509+# last chapter. If it is left blank doxygen will generate a standard footer. See
4510 # LATEX_HEADER for more information on how to generate a default footer and what
4511-# special commands can be used inside the footer.
4512-#
4513-# Note: Only use a user-defined footer if you know what you are doing!
4514+# special commands can be used inside the footer. See also section "Doxygen
4515+# usage" for information on how to generate the default footer that doxygen
4516+# normally uses. Note: Only use a user-defined footer if you know what you are
4517+# doing!
4518 # This tag requires that the tag GENERATE_LATEX is set to YES.
4519
4520 LATEX_FOOTER =
4521@@ -1727,9 +1973,11 @@ LATEX_EXTRA_FILES =
4522
4523 PDF_HYPERLINKS = YES
4524
4525-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
4526-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
4527-# higher quality PDF documentation.
4528+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
4529+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
4530+# files. Set this option to YES, to get a higher quality PDF documentation.
4531+#
4532+# See also section LATEX_CMD_NAME for selecting the engine.
4533 # The default value is: YES.
4534 # This tag requires that the tag GENERATE_LATEX is set to YES.
4535
4536@@ -1737,8 +1985,7 @@ USE_PDFLATEX = YES
4537
4538 # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
4539 # command to the generated LaTeX files. This will instruct LaTeX to keep running
4540-# if errors occur, instead of asking the user for help. This option is also used
4541-# when generating formulas in HTML.
4542+# if errors occur, instead of asking the user for help.
4543 # The default value is: NO.
4544 # This tag requires that the tag GENERATE_LATEX is set to YES.
4545
4546@@ -1751,19 +1998,9 @@ LATEX_BATCHMODE = NO
4547
4548 LATEX_HIDE_INDICES = NO
4549
4550-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
4551-# code with syntax highlighting in the LaTeX output.
4552-#
4553-# Note that which sources are shown also depends on other settings such as
4554-# SOURCE_BROWSER.
4555-# The default value is: NO.
4556-# This tag requires that the tag GENERATE_LATEX is set to YES.
4557-
4558-LATEX_SOURCE_CODE = NO
4559-
4560 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
4561 # bibliography, e.g. plainnat, or ieeetr. See
4562-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
4563+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
4564 # The default value is: plain.
4565 # This tag requires that the tag GENERATE_LATEX is set to YES.
4566
4567@@ -1777,6 +2014,14 @@ LATEX_BIB_STYLE = plain
4568
4569 LATEX_TIMESTAMP = NO
4570
4571+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
4572+# path from which the emoji images will be read. If a relative path is entered,
4573+# it will be relative to the LATEX_OUTPUT directory. If left blank the
4574+# LATEX_OUTPUT directory will be used.
4575+# This tag requires that the tag GENERATE_LATEX is set to YES.
4576+
4577+LATEX_EMOJI_DIRECTORY =
4578+
4579 #---------------------------------------------------------------------------
4580 # Configuration options related to the RTF output
4581 #---------------------------------------------------------------------------
4582@@ -1816,9 +2061,9 @@ COMPACT_RTF = NO
4583
4584 RTF_HYPERLINKS = NO
4585
4586-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
4587-# file, i.e. a series of assignments. You only have to provide replacements,
4588-# missing definitions are set to their default value.
4589+# Load stylesheet definitions from file. Syntax is similar to doxygen's
4590+# configuration file, i.e. a series of assignments. You only have to provide
4591+# replacements, missing definitions are set to their default value.
4592 #
4593 # See also section "Doxygen usage" for information on how to generate the
4594 # default style sheet that doxygen normally uses.
4595@@ -1827,22 +2072,12 @@ RTF_HYPERLINKS = NO
4596 RTF_STYLESHEET_FILE =
4597
4598 # Set optional variables used in the generation of an RTF document. Syntax is
4599-# similar to doxygen's config file. A template extensions file can be generated
4600-# using doxygen -e rtf extensionFile.
4601+# similar to doxygen's configuration file. A template extensions file can be
4602+# generated using doxygen -e rtf extensionFile.
4603 # This tag requires that the tag GENERATE_RTF is set to YES.
4604
4605 RTF_EXTENSIONS_FILE =
4606
4607-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
4608-# with syntax highlighting in the RTF output.
4609-#
4610-# Note that which sources are shown also depends on other settings such as
4611-# SOURCE_BROWSER.
4612-# The default value is: NO.
4613-# This tag requires that the tag GENERATE_RTF is set to YES.
4614-
4615-RTF_SOURCE_CODE = NO
4616-
4617 #---------------------------------------------------------------------------
4618 # Configuration options related to the man page output
4619 #---------------------------------------------------------------------------
4620@@ -1914,6 +2149,13 @@ XML_OUTPUT = xml
4621
4622 XML_PROGRAMLISTING = YES
4623
4624+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
4625+# namespace members in file scope as well, matching the HTML output.
4626+# The default value is: NO.
4627+# This tag requires that the tag GENERATE_XML is set to YES.
4628+
4629+XML_NS_MEMB_FILE_SCOPE = NO
4630+
4631 #---------------------------------------------------------------------------
4632 # Configuration options related to the DOCBOOK output
4633 #---------------------------------------------------------------------------
4634@@ -1932,23 +2174,14 @@ GENERATE_DOCBOOK = NO
4635
4636 DOCBOOK_OUTPUT = docbook
4637
4638-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
4639-# program listings (including syntax highlighting and cross-referencing
4640-# information) to the DOCBOOK output. Note that enabling this will significantly
4641-# increase the size of the DOCBOOK output.
4642-# The default value is: NO.
4643-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
4644-
4645-DOCBOOK_PROGRAMLISTING = NO
4646-
4647 #---------------------------------------------------------------------------
4648 # Configuration options for the AutoGen Definitions output
4649 #---------------------------------------------------------------------------
4650
4651 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
4652-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
4653-# structure of the code including all documentation. Note that this feature is
4654-# still experimental and incomplete at the moment.
4655+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
4656+# the structure of the code including all documentation. Note that this feature
4657+# is still experimental and incomplete at the moment.
4658 # The default value is: NO.
4659
4660 GENERATE_AUTOGEN_DEF = NO
4661@@ -2027,7 +2260,8 @@ SEARCH_INCLUDES = YES
4662
4663 # The INCLUDE_PATH tag can be used to specify one or more directories that
4664 # contain include files that are not input files but should be processed by the
4665-# preprocessor.
4666+# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of
4667+# RECURSIVE has no effect here.
4668 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
4669
4670 INCLUDE_PATH = ../../hipcub/include/hipcub/
4671@@ -2121,15 +2355,6 @@ EXTERNAL_PAGES = YES
4672 # Configuration options related to the dot tool
4673 #---------------------------------------------------------------------------
4674
4675-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
4676-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
4677-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
4678-# disabled, but it is recommended to install and use dot, since it yields more
4679-# powerful graphs.
4680-# The default value is: YES.
4681-
4682-CLASS_DIAGRAMS = YES
4683-
4684 # You can include diagrams made with dia in doxygen documentation. Doxygen will
4685 # then run dia to produce the diagram and insert it in the documentation. The
4686 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
4687@@ -2148,7 +2373,7 @@ HIDE_UNDOC_RELATIONS = YES
4688 # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
4689 # Bell Labs. The other options in this section have no effect if this option is
4690 # set to NO
4691-# The default value is: YES.
4692+# The default value is: NO.
4693
4694 HAVE_DOT = NO
4695
4696@@ -2186,11 +2411,14 @@ DOT_FONTSIZE = 10
4697
4698 DOT_FONTPATH =
4699
4700-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
4701-# each documented class showing the direct and indirect inheritance relations.
4702-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
4703+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
4704+# graph for each documented class showing the direct and indirect inheritance
4705+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
4706+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
4707+# to TEXT the direct and indirect inheritance relations will be shown as texts /
4708+# links.
4709+# Possible values are: NO, YES, TEXT and GRAPH.
4710 # The default value is: YES.
4711-# This tag requires that the tag HAVE_DOT is set to YES.
4712
4713 CLASS_GRAPH = YES
4714
4715@@ -2204,7 +2432,8 @@ CLASS_GRAPH = YES
4716 COLLABORATION_GRAPH = YES
4717
4718 # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
4719-# groups, showing the direct groups dependencies.
4720+# groups, showing the direct groups dependencies. See also the chapter Grouping
4721+# in the manual.
4722 # The default value is: YES.
4723 # This tag requires that the tag HAVE_DOT is set to YES.
4724
4725@@ -2227,10 +2456,32 @@ UML_LOOK = NO
4726 # but if the number exceeds 15, the total amount of fields shown is limited to
4727 # 10.
4728 # Minimum value: 0, maximum value: 100, default value: 10.
4729-# This tag requires that the tag HAVE_DOT is set to YES.
4730+# This tag requires that the tag UML_LOOK is set to YES.
4731
4732 UML_LIMIT_NUM_FIELDS = 10
4733
4734+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
4735+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
4736+# tag is set to YES, doxygen will add type and arguments for attributes and
4737+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
4738+# will not generate fields with class member information in the UML graphs. The
4739+# class diagrams will look similar to the default class diagrams but using UML
4740+# notation for the relationships.
4741+# Possible values are: NO, YES and NONE.
4742+# The default value is: NO.
4743+# This tag requires that the tag UML_LOOK is set to YES.
4744+
4745+DOT_UML_DETAILS = NO
4746+
4747+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
4748+# to display on a single line. If the actual line length exceeds this threshold
4749+# significantly it will be wrapped across multiple lines. Some heuristics are
4750+# applied to avoid ugly line breaks.
4751+# Minimum value: 0, maximum value: 1000, default value: 17.
4752+# This tag requires that the tag HAVE_DOT is set to YES.
4753+
4754+DOT_WRAP_THRESHOLD = 17
4755+
4756 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
4757 # collaboration graphs will show the relations between templates and their
4758 # instances.
4759@@ -2297,6 +2548,13 @@ GRAPHICAL_HIERARCHY = YES
4760
4761 DIRECTORY_GRAPH = YES
4762
4763+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
4764+# of child directories generated in directory dependency graphs by dot.
4765+# Minimum value: 1, maximum value: 25, default value: 1.
4766+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
4767+
4768+DIR_GRAPH_MAX_DEPTH = 1
4769+
4770 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
4771 # generated by dot. For an explanation of the image formats see the section
4772 # output formats in the documentation of the dot tool (Graphviz (see:
4773@@ -2304,9 +2562,7 @@ DIRECTORY_GRAPH = YES
4774 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
4775 # to make the SVG files visible in IE 9+ (other browsers do not have this
4776 # requirement).
4777-# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
4778-# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
4779-# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
4780+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
4781 # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
4782 # png:gdiplus:gdiplus.
4783 # The default value is: png.
4784@@ -2352,13 +2608,18 @@ MSCFILE_DIRS =
4785 DIAFILE_DIRS =
4786
4787 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
4788-# path where java can find the plantuml.jar file. If left blank, it is assumed
4789-# PlantUML is not used or called during a preprocessing step. Doxygen will
4790-# generate a warning when it encounters a \startuml command in this case and
4791-# will not generate output for the diagram.
4792+# path where java can find the plantuml.jar file or the filename of the jar file
4793+# to be used. If left blank, it is assumed PlantUML is not used or called during
4794+# a preprocessing step. Doxygen will generate a warning when it encounters a
4795+# \startuml command in this case and will not generate output for the diagram.
4796
4797 PLANTUML_JAR_PATH =
4798
4799+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
4800+# configuration file for plantuml.
4801+
4802+PLANTUML_CFG_FILE =
4803+
4804 # When using plantuml, the specified paths are searched for files specified by
4805 # the !include statement in a plantuml block.
4806
4807@@ -2412,14 +2673,18 @@ DOT_MULTI_TARGETS = NO
4808 # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
4809 # explaining the meaning of the various boxes and arrows in the dot generated
4810 # graphs.
4811+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
4812+# graphical representation for inheritance and collaboration diagrams is used.
4813 # The default value is: YES.
4814 # This tag requires that the tag HAVE_DOT is set to YES.
4815
4816 GENERATE_LEGEND = YES
4817
4818-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
4819+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
4820 # files that are used to generate the various graphs.
4821+#
4822+# Note: This setting is not only used for dot files but also for msc temporary
4823+# files.
4824 # The default value is: YES.
4825-# This tag requires that the tag HAVE_DOT is set to YES.
4826
4827 DOT_CLEANUP = YES
4828diff --git a/docs/index.rst b/docs/index.rst
4829index 8e4f96e..14af493 100644
4830--- a/docs/index.rst
4831+++ b/docs/index.rst
4832@@ -12,7 +12,7 @@ hipCUB documentation
4833 hipCUB is a thin, header-only wrapper library for `rocPRIM <https://rocm.docs.amd.com/projects/rocPRIM/en/latest/index.html>`_ and `CUB <https://docs.nvidia.com/cuda/cub/index.html>`_. It enables developers to port projects
4834 using the CUB library to the `HIP <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_ layer and run on AMD hardware. To learn more, see :ref:`what-is-hipcub`
4835
4836-The hipCUB repository is located at `https://github.com/ROCm/hipCUB <https://github.com/ROCm/hipCUB>`_.
4837+The hipCUB project is located at https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub.
4838
4839 .. grid:: 2
4840
4841diff --git a/docs/install/hipCUB-install-on-Windows.rst b/docs/install/hipCUB-install-on-Windows.rst
4842index 76375ab..fc73fd9 100644
4843--- a/docs/install/hipCUB-install-on-Windows.rst
4844+++ b/docs/install/hipCUB-install-on-Windows.rst
4845@@ -8,8 +8,9 @@ Building and installing hipCUB on Windows
4846
4847 You can use ``rmake.py`` to build and install hipCUB on Microsoft Windows. You can also use `CMake <./hipCUB-install-with-cmake.html>`_ if you want more build and installation options.
4848
4849+:doc:`Clone the hipCUB project <./hipCUB-install-overview>`. ``rmake.py`` will be located in the ``hipcub`` root directory.
4850
4851-``rmake.py`` is located in the ``hipCUB`` root directory. To build and install hipCUB with ``rmake.py``, run:
4852+To build and install hipCUB with ``rmake.py``, run:
4853
4854 .. code:: shell
4855
4856diff --git a/docs/install/hipCUB-install-overview.rst b/docs/install/hipCUB-install-overview.rst
4857index 772e53d..ff123ef 100644
4858--- a/docs/install/hipCUB-install-overview.rst
4859+++ b/docs/install/hipCUB-install-overview.rst
4860@@ -6,7 +6,16 @@
4861 hipCUB installation overview
4862 *********************************
4863
4864-The hipCUB source code is available from the `hipCUB GitHub Repository <https://github.com/ROCmSoftwarePlatform/hipCUB>`_.
4865+The hipCUB source code is available from the `ROCm libraries GitHub Repository <https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub>`_. Use sparse checkout when cloning the hipCUB project:
4866+
4867+.. code::
4868+
4869+ git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-libraries.git
4870+ cd rocm-libraries
4871+ git sparse-checkout init --cone
4872+ git sparse-checkout set projects/hipcub
4873+
4874+Then use ``git checkout`` to check out the branch you need.
4875
4876 The develop branch is the default branch. The develop branch is intended for users who want to preview new features or contribute to the hipCUB code base.
4877
4878diff --git a/docs/install/hipCUB-install-with-cmake.rst b/docs/install/hipCUB-install-with-cmake.rst
4879index f21bf0d..0fcb91c 100644
4880--- a/docs/install/hipCUB-install-with-cmake.rst
4881+++ b/docs/install/hipCUB-install-with-cmake.rst
4882@@ -17,7 +17,7 @@ Before you begin, set ``CXX`` to ``amdclang++`` or ``hipcc`` if you're building
4883 CXX=amdclang++
4884 CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++
4885
4886-Create the ``build`` directory inside the ``hipCUB`` directory, then change directory to the ``build`` directory:
4887+After :doc:`cloning the project <./hipCUB-install-overview>`, create the ``build`` directory under the ``hipcub`` root directory, then change directory to the ``build`` directory:
4888
4889 .. code:: shell
4890
4891@@ -35,6 +35,7 @@ The available build options are:
4892
4893 * ``BUILD_BENCHMARK``. Set this to ``ON`` to build benchmark tests. Off by default.
4894 * ``BUILD_TEST``. Set this to ``ON`` to build tests. Off by default.
4895+* ``USE_SYSTEM_LIB``. Set this to ``ON`` to use the installed ``hipCUB`` from the system when building the tests. Off by default. For this option to take effect, ``BUILD_TEST`` must be ``ON`` and the installed ``hipCUB`` (with its dependencies) must be compatible with the version of the tests.
4896 * ``DEPENDENCIES_FORCE_DOWNLOAD``. Set this to ``ON`` to download the dependencies regardless of whether or not they are already installed. Off by default.
4897
4898 Build hipCUB using the generated make file:
4899diff --git a/docs/install/hipCUB-prerequisites.rst b/docs/install/hipCUB-prerequisites.rst
4900index 881cef3..d4352f5 100644
4901--- a/docs/install/hipCUB-prerequisites.rst
4902+++ b/docs/install/hipCUB-prerequisites.rst
4903@@ -8,7 +8,7 @@ hipCUB prerequisites
4904
4905 hipCUB has the following prerequisites on all platforms:
4906
4907-* `CMake <https://cmake.org/>`_ version 3.16 or higher
4908+* `CMake <https://cmake.org/>`_ version 3.18 or higher
4909
4910 On AMD GPUs:
4911
4912@@ -21,9 +21,9 @@ amdclang++ is installed with ROCm. rocPRIM is automatically downloaded and insta
4913 On NVIDIA GPUs:
4914
4915 * The CUDA Toolkit
4916-* CCCL library version 2.3.2 or later
4917+* CCCL library version 2.8.2 or later
4918 * CUB and Thrust
4919-* libcu++ version 2.2.0
4920+* libcu++ version 2.8.2
4921
4922 The CCCL library is automatically downloaded and built by the CMake script. If libcu++ isn't found on the system, it will be downloaded from the CCCL repository.
4923
4924diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
4925index ad94caa..73a3602 100644
4926--- a/docs/sphinx/requirements.in
4927+++ b/docs/sphinx/requirements.in
4928@@ -1 +1 @@
4929-rocm-docs-core[api_reference]==1.8.3
4930+rocm-docs-core[api_reference]==1.20.1
4931diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
4932index 6dd77b1..e7818b8 100644
4933--- a/docs/sphinx/requirements.txt
4934+++ b/docs/sphinx/requirements.txt
4935@@ -8,6 +8,13 @@ accessible-pygments==0.0.4
4936 # via pydata-sphinx-theme
4937 alabaster==0.7.16
4938 # via sphinx
4939+asttokens==3.0.0
4940+ # via stack-data
4941+attrs==25.1.0
4942+ # via
4943+ # jsonschema
4944+ # jupyter-cache
4945+ # referencing
4946 babel==2.14.0
4947 # via
4948 # pydata-sphinx-theme
4949@@ -25,9 +32,21 @@ cffi==1.16.0
4950 charset-normalizer==3.3.2
4951 # via requests
4952 click==8.1.7
4953- # via sphinx-external-toc
4954-cryptography==43.0.1
4955+ # via
4956+ # click-log
4957+ # doxysphinx
4958+ # jupyter-cache
4959+ # sphinx-external-toc
4960+click-log==0.4.0
4961+ # via doxysphinx
4962+comm==0.2.2
4963+ # via ipykernel
4964+cryptography==44.0.1
4965 # via pyjwt
4966+debugpy==1.8.12
4967+ # via ipykernel
4968+decorator==5.1.1
4969+ # via ipython
4970 deprecated==1.2.14
4971 # via pygithub
4972 docutils==0.21.2
4973@@ -38,39 +57,115 @@ docutils==0.21.2
4974 # sphinx
4975 doxysphinx==3.3.8
4976 # via rocm-docs-core
4977+exceptiongroup==1.2.2
4978+ # via ipython
4979+executing==2.2.0
4980+ # via stack-data
4981 fastjsonschema==2.19.1
4982- # via rocm-docs-core
4983+ # via
4984+ # nbformat
4985+ # rocm-docs-core
4986 gitdb==4.0.11
4987 # via gitpython
4988 gitpython==3.1.43
4989 # via rocm-docs-core
4990+greenlet==3.1.1
4991+ # via sqlalchemy
4992 idna==3.7
4993 # via requests
4994 imagesize==1.4.1
4995 # via sphinx
4996-jinja2==3.1.4
4997+importlib-metadata==8.6.1
4998+ # via
4999+ # jupyter-cache
5000+ # myst-nb
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches