Merge ~bullwinkle-team/ubuntu/+source/rocprim:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocprim:ubuntu/devel

Proposed by Igor Luppi
Status: Merged
Approved by: Andreas Hasenack
Approved revision: 634c24c39d56d3d7c8f9b821d0685cd2f1711606
Merged at revision: 634c24c39d56d3d7c8f9b821d0685cd2f1711606
Proposed branch: ~bullwinkle-team/ubuntu/+source/rocprim:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/rocprim:ubuntu/devel
Diff against target: 143068 lines (+54614/-40047)
497 files modified
.github/CODEOWNERS (+1/-1)
.gitlab-ci.yml (+163/-32)
.gitlab/report_noise.py (+251/-0)
.gitlab/run_benchmarks.py (+25/-6)
CHANGELOG.md (+164/-4)
CMakeLists.txt (+54/-23)
CONTRIBUTING.md (+2/-0)
LICENSE.md (+2/-2)
README.md (+84/-54)
benchmark/CMakeLists.txt (+3/-1)
benchmark/ConfigAutotuneSettings.cmake (+30/-14)
benchmark/benchmark_block_adjacent_difference.cpp (+155/-267)
benchmark/benchmark_block_discontinuity.cpp (+125/-214)
benchmark/benchmark_block_exchange.cpp (+127/-266)
benchmark/benchmark_block_histogram.cpp (+99/-186)
benchmark/benchmark_block_radix_rank.cpp (+81/-157)
benchmark/benchmark_block_radix_sort.cpp (+169/-251)
benchmark/benchmark_block_reduce.cpp (+103/-200)
benchmark/benchmark_block_run_length_decode.cpp (+86/-136)
benchmark/benchmark_block_scan.cpp (+121/-227)
benchmark/benchmark_block_sort.cpp (+26/-94)
benchmark/benchmark_block_sort.parallel.hpp (+56/-98)
benchmark/benchmark_config_dispatch.cpp (+43/-81)
benchmark/benchmark_device_adjacent_difference.cpp (+35/-101)
benchmark/benchmark_device_adjacent_difference.parallel.cpp.in (+13/-10)
benchmark/benchmark_device_adjacent_difference.parallel.hpp (+68/-176)
benchmark/benchmark_device_adjacent_find.cpp (+24/-86)
benchmark/benchmark_device_adjacent_find.parallel.cpp.in (+6/-3)
benchmark/benchmark_device_adjacent_find.parallel.hpp (+35/-73)
benchmark/benchmark_device_batch_memcpy.cpp (+228/-405)
benchmark/benchmark_device_binary_search.cpp (+50/-244)
benchmark/benchmark_device_binary_search.parallel.cpp.in (+9/-2)
benchmark/benchmark_device_binary_search.parallel.hpp (+86/-100)
benchmark/benchmark_device_find_end.cpp (+25/-78)
benchmark/benchmark_device_find_end.hpp (+35/-87)
benchmark/benchmark_device_find_first_of.cpp (+32/-99)
benchmark/benchmark_device_find_first_of.parallel.cpp.in (+6/-2)
benchmark/benchmark_device_find_first_of.parallel.hpp (+27/-57)
benchmark/benchmark_device_histogram.cpp (+275/-549)
benchmark/benchmark_device_histogram.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_histogram.parallel.hpp (+103/-117)
benchmark/benchmark_device_memory.cpp (+886/-776)
benchmark/benchmark_device_merge.cpp (+28/-106)
benchmark/benchmark_device_merge.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_merge.parallel.hpp (+103/-225)
benchmark/benchmark_device_merge_inplace.cpp (+201/-0)
benchmark/benchmark_device_merge_sort.cpp (+33/-69)
benchmark/benchmark_device_merge_sort.hpp (+53/-120)
benchmark/benchmark_device_merge_sort_block_merge.cpp (+26/-88)
benchmark/benchmark_device_merge_sort_block_merge.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_merge_sort_block_merge.parallel.hpp (+160/-231)
benchmark/benchmark_device_merge_sort_block_sort.cpp (+27/-89)
benchmark/benchmark_device_merge_sort_block_sort.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp (+73/-172)
benchmark/benchmark_device_nth_element.cpp (+20/-70)
benchmark/benchmark_device_nth_element.hpp (+30/-79)
benchmark/benchmark_device_partial_sort.cpp (+20/-69)
benchmark/benchmark_device_partial_sort.hpp (+26/-60)
benchmark/benchmark_device_partial_sort_copy.cpp (+17/-66)
benchmark/benchmark_device_partial_sort_copy.hpp (+31/-78)
benchmark/benchmark_device_partition.cpp (+109/-177)
benchmark/benchmark_device_partition.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_partition.parallel.hpp (+145/-359)
benchmark/benchmark_device_radix_sort.cpp (+40/-57)
benchmark/benchmark_device_radix_sort.hpp (+78/-200)
benchmark/benchmark_device_radix_sort_block_sort.cpp (+25/-87)
benchmark/benchmark_device_radix_sort_block_sort.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_radix_sort_block_sort.parallel.hpp (+73/-178)
benchmark/benchmark_device_radix_sort_onesweep.cpp (+37/-79)
benchmark/benchmark_device_radix_sort_onesweep.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_radix_sort_onesweep.parallel.hpp (+131/-282)
benchmark/benchmark_device_reduce.cpp (+23/-85)
benchmark/benchmark_device_reduce.parallel.cpp.in (+9/-4)
benchmark/benchmark_device_reduce.parallel.hpp (+40/-87)
benchmark/benchmark_device_reduce_by_key.cpp (+5/-130)
benchmark/benchmark_device_reduce_by_key.parallel.cpp.in (+5/-3)
benchmark/benchmark_device_reduce_by_key.parallel.hpp (+114/-89)
benchmark/benchmark_device_reduce_by_key_deterministic.cpp (+4/-109)
benchmark/benchmark_device_run_length_encode.cpp (+26/-101)
benchmark/benchmark_device_run_length_encode.parallel.cpp.in (+5/-3)
benchmark/benchmark_device_run_length_encode.parallel.hpp (+38/-81)
benchmark/benchmark_device_run_length_encode_non_trivial_runs.cpp (+31/-102)
benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.cpp.in (+7/-4)
benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp (+37/-80)
benchmark/benchmark_device_scan.cpp (+5/-109)
benchmark/benchmark_device_scan.parallel.cpp.in (+7/-4)
benchmark/benchmark_device_scan.parallel.hpp (+90/-81)
benchmark/benchmark_device_scan_by_key.cpp (+5/-122)
benchmark/benchmark_device_scan_by_key.parallel.cpp.in (+7/-4)
benchmark/benchmark_device_scan_by_key.parallel.hpp (+112/-98)
benchmark/benchmark_device_scan_by_key_deterministic.cpp (+6/-102)
benchmark/benchmark_device_scan_deterministic.cpp (+4/-88)
benchmark/benchmark_device_search.cpp (+18/-74)
benchmark/benchmark_device_search.hpp (+35/-87)
benchmark/benchmark_device_search_n.cpp (+35/-45)
benchmark/benchmark_device_search_n.parallel.cpp.in (+10/-6)
benchmark/benchmark_device_search_n.parallel.hpp (+129/-337)
benchmark/benchmark_device_segmented_radix_sort_keys.cpp (+36/-273)
benchmark/benchmark_device_segmented_radix_sort_keys.parallel.cpp.in (+8/-7)
benchmark/benchmark_device_segmented_radix_sort_keys.parallel.hpp (+112/-146)
benchmark/benchmark_device_segmented_radix_sort_pairs.cpp (+47/-318)
benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.cpp.in (+8/-7)
benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.hpp (+126/-169)
benchmark/benchmark_device_segmented_reduce.cpp (+37/-246)
benchmark/benchmark_device_segmented_reduce.parallel.cpp.in (+32/-0)
benchmark/benchmark_device_segmented_reduce.parallel.hpp (+196/-0)
benchmark/benchmark_device_select.cpp (+95/-181)
benchmark/benchmark_device_select.parallel.cpp.in (+6/-4)
benchmark/benchmark_device_select.parallel.hpp (+157/-399)
benchmark/benchmark_device_transform.cpp (+28/-90)
benchmark/benchmark_device_transform.parallel.cpp.in (+10/-6)
benchmark/benchmark_device_transform.parallel.hpp (+69/-81)
benchmark/benchmark_device_transform_pointer.cpp (+72/-0)
benchmark/benchmark_device_transform_pointer.parallel.cpp.in (+37/-0)
benchmark/benchmark_predicate_iterator.cpp (+59/-165)
benchmark/benchmark_utils.hpp (+801/-416)
benchmark/benchmark_warp_exchange.cpp (+248/-275)
benchmark/benchmark_warp_reduce.cpp (+123/-221)
benchmark/benchmark_warp_scan.cpp (+175/-190)
benchmark/benchmark_warp_sort.cpp (+160/-224)
cmake/Dependencies.cmake (+49/-3)
cmake/DownloadProject.CMakeLists.cmake.in (+27/-0)
cmake/DownloadProject.cmake (+170/-0)
cmake/GenerateResourceSpec.cmake (+8/-7)
cmake/Summary.cmake (+59/-7)
common/README.md (+13/-0)
common/device_adjacent_difference.hpp (+163/-0)
common/device_batch_memcpy.hpp (+134/-0)
common/predicate_iterator.hpp (+43/-0)
common/utils.hpp (+3/-0)
common/utils_custom_type.hpp (+270/-0)
common/utils_data_generation.hpp (+195/-0)
common/utils_device_ptr.hpp (+1/-0)
common/utils_half.hpp (+22/-15)
common/warp_exchange.hpp (+121/-0)
debian/changelog (+45/-0)
debian/control (+9/-8)
debian/librocprim-tests.install (+3/-2)
debian/librocprim-tests.lintian-overrides (+7/-0)
debian/not-installed (+1/-2)
debian/patches/Extend-docs-conf.py-for-offline-build.patch (+3/-6)
debian/patches/arch-conversion-macro.patch (+38/-38)
debian/patches/series (+0/-4)
debian/patches/skip-building-test_device_scan.patch (+2/-2)
debian/patches/test-warp-sort-ppc64el.patch (+5/-5)
debian/rules (+15/-4)
debian/tests/control (+1/-1)
dev/null (+0/-242)
docs/block_ops/ops_classes/run_length_decode.rst (+12/-0)
docs/conceptual/rocPRIM-operations.rst (+141/-0)
docs/conceptual/rocPRIM-performance-tuning.rst (+2/-2)
docs/conceptual/rocPRIM-scope.rst (+19/-0)
docs/conceptual/rocPRIM-stripe-block.rst (+15/-0)
docs/conceptual/rocPRIM-type-traits.rst (+44/-11)
docs/conf.py (+4/-3)
docs/device_ops/adjacent_find.rst (+1/-2)
docs/device_ops/config.rst (+2/-14)
docs/device_ops/find_end.rst (+1/-1)
docs/device_ops/merge.rst (+12/-4)
docs/device_ops/partial_sort.rst (+3/-2)
docs/device_ops/search_n.rst (+4/-4)
docs/device_ops/select.rst (+1/-0)
docs/doxygen/Doxyfile (+474/-223)
docs/doxygen/blockmodule.dox (+11/-1)
docs/how-to/rocPRIM-spir-v.rst (+146/-0)
docs/index.rst (+26/-14)
docs/install/rocPRIM-build-install-linux.rst (+50/-0)
docs/install/rocPRIM-build-install-windows.rst (+42/-0)
docs/install/rocPRIM-install-overview.rst (+24/-0)
docs/install/rocPRIM-prerequisites.rst (+20/-0)
docs/reference/developer.rst (+20/-1)
docs/reference/iterators.rst (+2/-0)
docs/reference/reference.rst (+0/-2)
docs/reference/rocPRIM-data-type-support.rst (+49/-0)
docs/reference/rocPRIM-glossary.rst (+65/-0)
docs/reference/types.rst (+1/-1)
docs/sphinx/_toc.yml.in (+97/-82)
docs/sphinx/requirements.in (+1/-1)
docs/sphinx/requirements.txt (+137/-6)
example/example_type_traits_interface.cpp (+2/-0)
install (+0/-10)
rmake.py (+6/-2)
rocprim/CMakeLists.txt (+3/-20)
rocprim/include/rocprim/block/block_adjacent_difference.hpp (+57/-809)
rocprim/include/rocprim/block/block_discontinuity.hpp (+70/-70)
rocprim/include/rocprim/block/block_exchange.hpp (+235/-117)
rocprim/include/rocprim/block/block_histogram.hpp (+22/-22)
rocprim/include/rocprim/block/block_load.hpp (+35/-35)
rocprim/include/rocprim/block/block_load_func.hpp (+429/-153)
rocprim/include/rocprim/block/block_radix_rank.hpp (+156/-85)
rocprim/include/rocprim/block/block_radix_sort.hpp (+237/-110)
rocprim/include/rocprim/block/block_reduce.hpp (+107/-52)
rocprim/include/rocprim/block/block_run_length_decode.hpp (+11/-7)
rocprim/include/rocprim/block/block_scan.hpp (+424/-180)
rocprim/include/rocprim/block/block_shuffle.hpp (+99/-95)
rocprim/include/rocprim/block/block_sort.hpp (+44/-44)
rocprim/include/rocprim/block/block_store.hpp (+22/-22)
rocprim/include/rocprim/block/block_store_func.hpp (+301/-99)
rocprim/include/rocprim/block/detail/block_adjacent_difference_impl.hpp (+138/-111)
rocprim/include/rocprim/block/detail/block_histogram_sort.hpp (+8/-2)
rocprim/include/rocprim/block/detail/block_radix_rank_basic.hpp (+67/-55)
rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp (+13/-11)
rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp (+6/-5)
rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp (+63/-81)
rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp (+155/-32)
rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp (+144/-69)
rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp (+5/-4)
rocprim/include/rocprim/common.hpp (+5/-0)
rocprim/include/rocprim/config.hpp (+82/-62)
rocprim/include/rocprim/detail/merge_path.hpp (+19/-9)
rocprim/include/rocprim/detail/temp_storage.hpp (+23/-23)
rocprim/include/rocprim/detail/various.hpp (+108/-49)
rocprim/include/rocprim/detail/virtual_shared_memory.hpp (+132/-0)
rocprim/include/rocprim/device/config_types.hpp (+347/-121)
rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp (+79/-1)
rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp (+81/-3)
rocprim/include/rocprim/device/detail/config/device_adjacent_find.hpp (+36/-26)
rocprim/include/rocprim/device/detail/config/device_batch_copy.hpp (+263/-0)
rocprim/include/rocprim/device/detail/config/device_batch_memcpy.hpp (+263/-0)
rocprim/include/rocprim/device/detail/config/device_binary_search.hpp (+199/-58)
rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp (+14/-5)
rocprim/include/rocprim/device/detail/config/device_histogram.hpp (+41/-41)
rocprim/include/rocprim/device/detail/config/device_lower_bound.hpp (+196/-55)
rocprim/include/rocprim/device/detail/config/device_merge.hpp (+161/-16)
rocprim/include/rocprim/device/detail/config/device_merge_sort_block_merge.hpp (+609/-43)
rocprim/include/rocprim/device/detail/config/device_merge_sort_block_sort.hpp (+230/-162)
rocprim/include/rocprim/device/detail/config/device_partition_flag.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_partition_predicate.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_partition_three_way.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_partition_two_way_flag.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_partition_two_way_predicate.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_radix_sort_block_sort.hpp (+278/-133)
rocprim/include/rocprim/device/detail/config/device_radix_sort_onesweep.hpp (+813/-103)
rocprim/include/rocprim/device/detail/config/device_reduce.hpp (+19/-9)
rocprim/include/rocprim/device/detail/config/device_reduce_by_key.hpp (+1430/-90)
rocprim/include/rocprim/device/detail/config/device_run_length_encode.hpp (+41/-26)
rocprim/include/rocprim/device/detail/config/device_run_length_encode_non_trivial.hpp (+42/-29)
rocprim/include/rocprim/device/detail/config/device_scan.hpp (+33/-19)
rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp (+699/-75)
rocprim/include/rocprim/device/detail/config/device_search_n.hpp (+663/-0)
rocprim/include/rocprim/device/detail/config/device_segmented_radix_sort.hpp (+1077/-698)
rocprim/include/rocprim/device/detail/config/device_segmented_reduce.hpp (+161/-1)
rocprim/include/rocprim/device/detail/config/device_select_flag.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_select_predicate.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_select_predicated_flag.hpp (+2/-2)
rocprim/include/rocprim/device/detail/config/device_select_unique.hpp (+1/-1)
rocprim/include/rocprim/device/detail/config/device_select_unique_by_key.hpp (+2/-2)
rocprim/include/rocprim/device/detail/config/device_transform.hpp (+21/-11)
rocprim/include/rocprim/device/detail/config/device_transform_pointer.hpp (+695/-0)
rocprim/include/rocprim/device/detail/config/device_upper_bound.hpp (+199/-58)
rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp (+6/-7)
rocprim/include/rocprim/device/detail/device_adjacent_find.hpp (+19/-22)
rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp (+310/-256)
rocprim/include/rocprim/device/detail/device_config_helper.hpp (+297/-110)
rocprim/include/rocprim/device/detail/device_histogram.hpp (+194/-61)
rocprim/include/rocprim/device/detail/device_merge.hpp (+175/-168)
rocprim/include/rocprim/device/detail/device_merge_sort.hpp (+116/-172)
rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp (+349/-351)
rocprim/include/rocprim/device/detail/device_nth_element.hpp (+247/-167)
rocprim/include/rocprim/device/detail/device_partition.hpp (+431/-430)
rocprim/include/rocprim/device/detail/device_radix_sort.hpp (+131/-120)
rocprim/include/rocprim/device/detail/device_reduce.hpp (+5/-5)
rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp (+92/-36)
rocprim/include/rocprim/device/detail/device_run_length_encode.hpp (+89/-27)
rocprim/include/rocprim/device/detail/device_scan.hpp (+83/-30)
rocprim/include/rocprim/device/detail/device_scan_by_key.hpp (+231/-216)
rocprim/include/rocprim/device/detail/device_scan_common.hpp (+52/-31)
rocprim/include/rocprim/device/detail/device_search.hpp (+138/-68)
rocprim/include/rocprim/device/detail/device_search_n.hpp (+356/-290)
rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp (+307/-254)
rocprim/include/rocprim/device/detail/device_segmented_reduce.hpp (+11/-11)
rocprim/include/rocprim/device/detail/device_segmented_scan.hpp (+2/-2)
rocprim/include/rocprim/device/detail/device_transform.hpp (+140/-62)
rocprim/include/rocprim/device/detail/lookback_scan_state.hpp (+439/-252)
rocprim/include/rocprim/device/detail/ordered_block_id.hpp (+100/-4)
rocprim/include/rocprim/device/device_adjacent_difference.hpp (+110/-76)
rocprim/include/rocprim/device/device_adjacent_find.hpp (+29/-20)
rocprim/include/rocprim/device/device_binary_search.hpp (+53/-56)
rocprim/include/rocprim/device/device_binary_search_config.hpp (+16/-16)
rocprim/include/rocprim/device/device_copy.hpp (+13/-12)
rocprim/include/rocprim/device/device_copy_config.hpp (+3/-32)
rocprim/include/rocprim/device/device_find_first_of.hpp (+56/-49)
rocprim/include/rocprim/device/device_histogram.hpp (+528/-262)
rocprim/include/rocprim/device/device_memcpy.hpp (+13/-12)
rocprim/include/rocprim/device/device_memcpy_config.hpp (+51/-40)
rocprim/include/rocprim/device/device_merge.hpp (+279/-197)
rocprim/include/rocprim/device/device_merge_inplace.hpp (+924/-0)
rocprim/include/rocprim/device/device_merge_inplace_config.hpp (+51/-0)
rocprim/include/rocprim/device/device_merge_sort.hpp (+666/-317)
rocprim/include/rocprim/device/device_merge_sort_config.hpp (+8/-8)
rocprim/include/rocprim/device/device_nth_element.hpp (+23/-20)
rocprim/include/rocprim/device/device_partial_sort.hpp (+7/-3)
rocprim/include/rocprim/device/device_partial_sort_config.hpp (+4/-4)
rocprim/include/rocprim/device/device_partition.hpp (+400/-247)
rocprim/include/rocprim/device/device_radix_sort.hpp (+252/-176)
rocprim/include/rocprim/device/device_radix_sort_config.hpp (+5/-5)
rocprim/include/rocprim/device/device_reduce.hpp (+129/-105)
rocprim/include/rocprim/device/device_reduce_by_key.hpp (+80/-67)
rocprim/include/rocprim/device/device_run_length_encode.hpp (+68/-58)
rocprim/include/rocprim/device/device_run_length_encode_config.hpp (+3/-3)
rocprim/include/rocprim/device/device_scan.hpp (+427/-169)
rocprim/include/rocprim/device/device_scan_by_key.hpp (+77/-68)
rocprim/include/rocprim/device/device_search_n.hpp (+3/-0)
rocprim/include/rocprim/device/device_search_n_config.hpp (+6/-5)
rocprim/include/rocprim/device/device_segmented_radix_sort.hpp (+670/-594)
rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp (+3/-3)
rocprim/include/rocprim/device/device_segmented_reduce.hpp (+109/-99)
rocprim/include/rocprim/device/device_segmented_reduce_config.hpp (+79/-0)
rocprim/include/rocprim/device/device_segmented_scan.hpp (+236/-225)
rocprim/include/rocprim/device/device_select.hpp (+213/-198)
rocprim/include/rocprim/device/device_transform.hpp (+238/-117)
rocprim/include/rocprim/device/device_transform_config.hpp (+24/-7)
rocprim/include/rocprim/device/specialization/device_radix_block_sort.hpp (+48/-29)
rocprim/include/rocprim/device/specialization/device_radix_merge_sort.hpp (+20/-10)
rocprim/include/rocprim/intrinsics/arch.hpp (+201/-9)
rocprim/include/rocprim/intrinsics/atomic.hpp (+16/-6)
rocprim/include/rocprim/intrinsics/bit.hpp (+40/-3)
rocprim/include/rocprim/intrinsics/thread.hpp (+33/-46)
rocprim/include/rocprim/intrinsics/warp.hpp (+24/-14)
rocprim/include/rocprim/intrinsics/warp_shuffle.hpp (+41/-33)
rocprim/include/rocprim/iterator.hpp (+1/-0)
rocprim/include/rocprim/iterator/arg_index_iterator.hpp (+14/-19)
rocprim/include/rocprim/iterator/constant_iterator.hpp (+7/-13)
rocprim/include/rocprim/iterator/counting_iterator.hpp (+6/-12)
rocprim/include/rocprim/iterator/detail/common.hpp (+58/-0)
rocprim/include/rocprim/iterator/discard_iterator.hpp (+3/-9)
rocprim/include/rocprim/iterator/predicate_iterator.hpp (+3/-5)
rocprim/include/rocprim/iterator/reverse_iterator.hpp (+5/-11)
rocprim/include/rocprim/iterator/texture_cache_iterator.hpp (+25/-32)
rocprim/include/rocprim/iterator/transform_iterator.hpp (+19/-21)
rocprim/include/rocprim/iterator/transform_output_iterator.hpp (+272/-0)
rocprim/include/rocprim/iterator/zip_iterator.hpp (+11/-12)
rocprim/include/rocprim/rocprim.hpp (+2/-4)
rocprim/include/rocprim/rocprim_version.hpp.in (+3/-3)
rocprim/include/rocprim/thread/radix_key_codec.hpp (+4/-634)
rocprim/include/rocprim/thread/thread_copy.hpp (+221/-0)
rocprim/include/rocprim/thread/thread_load.hpp (+156/-55)
rocprim/include/rocprim/thread/thread_operators.hpp (+16/-4)
rocprim/include/rocprim/thread/thread_reduce.hpp (+15/-66)
rocprim/include/rocprim/thread/thread_scan.hpp (+2/-1)
rocprim/include/rocprim/thread/thread_search.hpp (+3/-1)
rocprim/include/rocprim/thread/thread_store.hpp (+100/-46)
rocprim/include/rocprim/type_traits.hpp (+1277/-371)
rocprim/include/rocprim/type_traits_functions.hpp (+561/-0)
rocprim/include/rocprim/types.hpp (+18/-10)
rocprim/include/rocprim/types/integer_sequence.hpp (+5/-5)
rocprim/include/rocprim/types/key_value_pair.hpp (+28/-22)
rocprim/include/rocprim/types/tuple.hpp (+52/-15)
rocprim/include/rocprim/warp/detail/warp_reduce_crosslane.hpp (+102/-13)
rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp (+89/-50)
rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp (+34/-37)
rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp (+30/-31)
rocprim/include/rocprim/warp/detail/warp_scan_crosslane.hpp (+213/-12)
rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp (+163/-90)
rocprim/include/rocprim/warp/detail/warp_scan_shared_mem.hpp (+88/-47)
rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp (+114/-66)
rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp (+29/-9)
rocprim/include/rocprim/warp/detail/warp_sort_shuffle.hpp (+42/-27)
rocprim/include/rocprim/warp/detail/warp_sort_stable.hpp (+17/-17)
rocprim/include/rocprim/warp/warp_exchange.hpp (+665/-107)
rocprim/include/rocprim/warp/warp_load.hpp (+144/-81)
rocprim/include/rocprim/warp/warp_reduce.hpp (+160/-103)
rocprim/include/rocprim/warp/warp_scan.hpp (+480/-205)
rocprim/include/rocprim/warp/warp_sort.hpp (+271/-158)
rocprim/include/rocprim/warp/warp_store.hpp (+121/-66)
rtest.xml (+2/-2)
scripts/autotune-search/main.py (+24/-5)
scripts/autotune/create_optimization.py (+31/-1)
scripts/autotune/fallback_config.json (+1/-2)
scripts/autotune/templates/config_template (+2/-0)
scripts/autotune/templates/histogram_config_template (+1/-1)
scripts/autotune/templates/merge_config_template (+1/-1)
scripts/autotune/templates/search_n_config_template (+19/-0)
scripts/autotune/templates/segmented_radix_sort_config_template (+2/-2)
scripts/autotune/templates/segmented_reduce_config_template (+19/-0)
scripts/autotune/templates/transform_pointer_config_template (+20/-0)
test/CMakeLists.txt (+11/-5)
test/common_test_header.hpp (+12/-82)
test/extra/CMakeLists.txt (+3/-5)
test/hip/test_hip_api.cpp (+6/-5)
test/hip/test_hip_async_copy.cpp (+83/-58)
test/hip/test_ordered_block_id.cpp (+4/-3)
test/hipgraph/test_hipgraph_algs.cpp (+29/-46)
test/rocprim/CMakeLists.txt (+132/-10)
test/rocprim/detail/get_rocprim_version.cpp (+6/-22)
test/rocprim/identity_iterator.hpp (+7/-1)
test/rocprim/internal/test_internal_merge_path.cpp (+6/-8)
test/rocprim/test_accumulator_t.cpp (+149/-0)
test/rocprim/test_arg_index_iterator.cpp (+134/-105)
test/rocprim/test_bit_cast.cpp (+213/-0)
test/rocprim/test_block_adjacent_difference.cpp.in (+27/-22)
test/rocprim/test_block_adjacent_difference.hpp (+20/-60)
test/rocprim/test_block_adjacent_difference.kernels.hpp (+158/-731)
test/rocprim/test_block_discontinuity.cpp.in (+30/-22)
test/rocprim/test_block_discontinuity.hpp (+8/-1)
test/rocprim/test_block_discontinuity.kernels.hpp (+120/-214)
test/rocprim/test_block_exchange.cpp (+4/-1)
test/rocprim/test_block_exchange.kernels.hpp (+107/-207)
test/rocprim/test_block_histogram.kernels.hpp (+35/-49)
test/rocprim/test_block_load_store.cpp (+6/-5)
test/rocprim/test_block_load_store.hpp (+51/-111)
test/rocprim/test_block_load_store.kernels.hpp (+80/-85)
test/rocprim/test_block_radix_rank.cpp.in (+4/-1)
test/rocprim/test_block_radix_rank.hpp (+42/-31)
test/rocprim/test_block_radix_sort.cpp.in (+19/-7)
test/rocprim/test_block_radix_sort.kernels.hpp (+71/-77)
test/rocprim/test_block_reduce.cpp (+4/-1)
test/rocprim/test_block_reduce.hpp (+120/-84)
test/rocprim/test_block_reduce.kernels.hpp (+31/-39)
test/rocprim/test_block_run_length_decode.cpp (+49/-69)
test/rocprim/test_block_scan.cpp.in (+26/-18)
test/rocprim/test_block_scan.hpp (+278/-103)
test/rocprim/test_block_scan.kernels.hpp (+160/-241)
test/rocprim/test_block_shuffle.cpp (+4/-1)
test/rocprim/test_block_shuffle.hpp (+77/-141)
test/rocprim/test_block_sort.hpp (+47/-74)
test/rocprim/test_block_sort_bitonic.cpp (+4/-1)
test/rocprim/test_block_sort_merge.cpp (+4/-1)
test/rocprim/test_block_sort_merge_stable.cpp (+4/-1)
test/rocprim/test_config_dispatch.cpp (+43/-16)
test/rocprim/test_constant_iterator.cpp (+112/-30)
test/rocprim/test_counting_iterator.cpp (+114/-35)
test/rocprim/test_device_adjacent_difference.cpp (+186/-289)
test/rocprim/test_device_adjacent_find.cpp (+27/-35)
test/rocprim/test_device_batch_memcpy.cpp (+97/-187)
test/rocprim/test_device_binary_search.cpp (+104/-165)
test/rocprim/test_device_find_end.cpp (+58/-89)
test/rocprim/test_device_find_first_of.cpp (+45/-70)
test/rocprim/test_device_histogram.cpp (+277/-503)
test/rocprim/test_device_merge.cpp (+96/-30)
test/rocprim/test_device_merge_inplace.cpp (+452/-0)
test/rocprim/test_device_merge_sort.cpp (+198/-110)
test/rocprim/test_device_nth_element.cpp (+55/-98)
test/rocprim/test_device_partial_sort.cpp (+55/-97)
test/rocprim/test_device_partition.cpp (+351/-359)
test/rocprim/test_device_radix_sort.cpp.in (+24/-11)
test/rocprim/test_device_radix_sort.hpp (+201/-276)
test/rocprim/test_device_reduce.cpp (+208/-277)
test/rocprim/test_device_reduce_by_key.cpp (+93/-176)
test/rocprim/test_device_run_length_encode.cpp (+88/-147)
test/rocprim/test_device_scan.cpp (+911/-1054)
test/rocprim/test_device_scan_by_key.cpp (+841/-0)
test/rocprim/test_device_search.cpp (+56/-89)
test/rocprim/test_device_search_n.cpp (+266/-172)
test/rocprim/test_device_segmented_radix_sort.cpp.in (+9/-3)
test/rocprim/test_device_segmented_radix_sort.hpp (+268/-497)
test/rocprim/test_device_segmented_reduce.cpp (+100/-108)
test/rocprim/test_device_segmented_scan.cpp (+256/-302)
test/rocprim/test_device_select.cpp (+289/-406)
test/rocprim/test_device_transform.cpp (+316/-168)
test/rocprim/test_discard_iterator.cpp (+95/-124)
test/rocprim/test_intrinsics.cpp (+237/-327)
test/rocprim/test_intrinsics_atomic.cpp (+8/-6)
test/rocprim/test_invoke_result.cpp (+6/-8)
test/rocprim/test_linking.cpp (+42/-0)
test/rocprim/test_linking_lib.cpp (+103/-0)
test/rocprim/test_linking_new_scan.hpp (+216/-0)
test/rocprim/test_lookback_reproducibility.cpp (+140/-170)
test/rocprim/test_predicate_iterator.cpp (+164/-48)
test/rocprim/test_radix_key_codec.cpp (+39/-29)
test/rocprim/test_reverse_iterator.cpp (+16/-18)
test/rocprim/test_rocprim_tuple.cpp (+384/-0)
test/rocprim/test_rocprim_types.cpp (+235/-0)
test/rocprim/test_seed.hpp (+20/-5)
test/rocprim/test_temporary_storage_partitioning.cpp (+34/-46)
test/rocprim/test_texture_cache_iterator.cpp (+161/-44)
test/rocprim/test_thread.cpp (+41/-63)
test/rocprim/test_thread_algos.cpp (+493/-227)
test/rocprim/test_transform_iterator.cpp (+324/-99)
test/rocprim/test_tuple.cpp (+85/-0)
test/rocprim/test_type_traits_interface.cpp (+12/-95)
test/rocprim/test_utils.hpp (+343/-165)
test/rocprim/test_utils_assertions.hpp (+132/-33)
test/rocprim/test_utils_bfloat16.hpp (+6/-4)
test/rocprim/test_utils_custom_float_type.hpp (+18/-33)
test/rocprim/test_utils_custom_test_types.hpp (+34/-132)
test/rocprim/test_utils_data_generation.hpp (+209/-177)
test/rocprim/test_utils_data_generation_with_rocrand.hpp (+130/-0)
test/rocprim/test_utils_get_random_data.hpp (+44/-0)
test/rocprim/test_utils_sort_checker.cpp (+141/-0)
test/rocprim/test_utils_sort_checker.hpp (+165/-0)
test/rocprim/test_utils_sort_comparator.hpp (+27/-27)
test/rocprim/test_utils_types.hpp (+108/-118)
test/rocprim/test_warp_exchange.cpp (+134/-227)
test/rocprim/test_warp_load.cpp (+48/-44)
test/rocprim/test_warp_reduce.cpp (+3/-1)
test/rocprim/test_warp_reduce.hpp (+361/-400)
test/rocprim/test_warp_reduce.kernels.hpp (+85/-99)
test/rocprim/test_warp_scan.cpp (+9/-3)
test/rocprim/test_warp_scan.hpp (+746/-503)
test/rocprim/test_warp_scan.kernels.hpp (+214/-152)
test/rocprim/test_warp_sort.cpp (+3/-1)
test/rocprim/test_warp_sort.hpp (+53/-87)
test/rocprim/test_warp_sort.kernels.hpp (+35/-28)
test/rocprim/test_warp_store.cpp (+41/-41)
test/rocprim/test_zip_iterator.cpp (+65/-178)
toolchain-windows.cmake (+1/-1)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
Ubuntu Sponsors Pending
Review via email: mp+499798@code.launchpad.net

Description of the change

Update to upstream version 7.1.0

To post a comment you must log in.
Revision history for this message
Igor Luppi (igorluppi) wrote :
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

--- a/debian/control
+++ b/debian/control
@@ -32,7 +33,7 @@ Description: parallel primitives for GPU-accelerated code - headers

 Package: librocprim-tests
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 XB-X-ROCm-GPU-Architecture: ${rocm:GPU-Architecture}
 Depends: ${misc:Depends}, ${shlibs:Depends},
 Build-Profiles: <!nocheck>

Please investigate the reverse dependencies of librocprim-tests to see what else would have to be dropped in ppc64el.

review: Needs Information
Revision history for this message
Igor Luppi (igorluppi) wrote :

Thanks Andreas

$ reverse-depends --arch ppc64el src:rocprim -x
Reverse-Depends
===============
* libhipcub-dev (for librocprim-dev)
* librocthrust-dev (for librocprim-dev)
  * libstdgpu-hip-dev

Its safe to drop since its only required by rocm stack. Also librocthrust-dev was detailed here: https://code.launchpad.net/~bullwinkle-team/ubuntu/+source/rocthrust/+git/rocthrust/+merge/499735
It should be fine.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+1, sponsored:

Uploading rocprim_7.1.0-0ubuntu3.dsc
Uploading rocprim_7.1.0.orig.tar.gz
Uploading rocprim_7.1.0-0ubuntu3.debian.tar.xz
Uploading rocprim_7.1.0-0ubuntu3_source.buildinfo
Uploading rocprim_7.1.0-0ubuntu3_source.changes

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
2index 809934d..2ac616b 100755
3--- a/.github/CODEOWNERS
4+++ b/.github/CODEOWNERS
5@@ -1,4 +1,4 @@
6-* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13
7+* @stanleytsang-amd @umfranzw @RobsonRLemos
8 # Documentation files
9 docs/ @ROCm/rocm-documentation
10 *.md @ROCm/rocm-documentation
11diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
12index 0f76d8c..ec65005 100644
13--- a/.gitlab-ci.yml
14+++ b/.gitlab-ci.yml
15@@ -1,6 +1,6 @@
16 # MIT License
17 #
18-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
19+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
20 #
21 # Permission is hereby granted, free of charge, to any person obtaining a copy
22 # of this software and associated documentation files (the "Software"), to deal
23@@ -42,6 +42,17 @@ stages:
24 - test
25 - benchmark
26
27+workflow:
28+ rules:
29+ - if: $CI_MERGE_REQUEST_LABELS =~ /CI Skip/
30+ when: never
31+ - if: $CI_MERGE_REQUEST_TITLE !~ /Draft:/
32+ variables:
33+ ROCPRIM_TEST_RUNS: 1
34+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/
35+ variables:
36+ ROCPRIM_TEST_RUNS: 1
37+
38 variables:
39 PACKAGE_DIR: $BUILD_DIR/package
40 AUTOTUNE_CONFIG_DIR: ${CI_PROJECT_DIR}/autotune_config
41@@ -110,11 +121,10 @@ copyright-date:
42 -D BUILD_TEST=ON
43 -D BUILD_EXAMPLE=ON
44 -D BUILD_BENCHMARK=ON
45- -D GPU_TARGETS=$GPU_TARGETS
46- -D AMDGPU_TEST_TARGETS=$GPU_TARGETS
47+ -D AMDGPU_TARGETS=$GPU_TARGETS
48 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
49 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
50- -D CMAKE_CXX_STANDARD=14
51+ -D CMAKE_CXX_STANDARD=17
52 -S $CI_PROJECT_DIR
53 -B $BUILD_DIR
54 - cmake
55@@ -164,13 +174,14 @@ build:cmake-minimum-apt:
56 - .rules:build
57 variables:
58 EXTRA_CMAKE_CXX_FLAGS: ""
59+ BUILD_TOOL_ARGS: ""
60 script:
61 - mkdir -p $BUILD_DIR
62 - cd $BUILD_DIR
63 - | # Add hardened libc++ assertions for tests only
64 if [[ $BUILD_TARGET == "TEST" ]]; then
65 echo "Configuring with hardened libc++!"
66- EXTRA_CMAKE_CXX_FLAGS+=" -D_GLIBCXX_ASSERTIONS=ON"
67+ EXTRA_CMAKE_CXX_FLAGS+=" -D_GLIBCXX_ASSERTIONS=ON -D ROCPRIM_ENABLE_ASSERTS=ON"
68 fi
69 - cmake
70 -G Ninja
71@@ -178,15 +189,15 @@ build:cmake-minimum-apt:
72 -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror $EXTRA_CMAKE_CXX_FLAGS"
73 -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
74 -D BUILD_$BUILD_TARGET=ON
75+ -D WITH_ROCRAND=ON
76 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
77 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
78 -D BUILD_EXAMPLE=ON
79- -D GPU_TARGETS=$GPU_TARGETS
80- -D AMDGPU_TEST_TARGETS=$GPU_TARGETS
81+ -D AMDGPU_TARGETS=$GPU_TARGETS
82 -D CMAKE_CXX_STANDARD="$BUILD_VERSION"
83 -S $CI_PROJECT_DIR
84 -B $BUILD_DIR
85- - cmake --build $BUILD_DIR
86+ - cmake --build $BUILD_DIR -- ${BUILD_TOOL_ARGS}
87 artifacts:
88 paths:
89 - $BUILD_DIR/.ninja_log
90@@ -197,9 +208,36 @@ build:cmake-minimum-apt:
91 - $BUILD_DIR/gtest/
92 - $BUILD_DIR/test/CTestTestfile.cmake
93 - $BUILD_DIR/test/rocprim/CTestTestfile.cmake
94+ - $BUILD_DIR/deps/rocrand/
95 - $BUILD_DIR/test/rocprim/test_*
96+ - $BUILD_DIR/test/rocprim/libtest_*
97 - $BUILD_DIR/test/test_*
98- expire_in: 2 weeks
99+ expire_in: 1 day
100+
101+build:spirv:
102+ stage: build
103+ needs: []
104+ extends:
105+ - .cmake-minimum
106+ - .build:common
107+ variables:
108+ # For unknown reasons spir-v builds ignore 'clang diagnostic' pragmas that
109+ # we use to ignore internal deprecations.
110+ EXTRA_CMAKE_CXX_FLAGS: "-Wno-deprecated-declarations -mf16c -DROCPRIM_EXPERIMENTAL_SPIRV"
111+ # Since not all targets are expected to build, do not stop building other
112+ # targets when any target fails.
113+ BUILD_TOOL_ARGS: "-k 0"
114+ GPU_TARGETS: "amdgcnspirv"
115+ image: "registry.streamhpc.internal/unstable-rocm:main"
116+ allow_failure: true
117+ parallel:
118+ # Debug builds disabled due to excessive build times for debug test builds
119+ matrix:
120+ - BUILD_TYPE: Release
121+ BUILD_TARGET: [BENCHMARK, TEST]
122+ BUILD_VERSION: 17
123+ artifacts:
124+ when: always
125
126 build:cmake-latest:
127 stage: build
128@@ -212,7 +250,7 @@ build:cmake-latest:
129 matrix:
130 - BUILD_TYPE: Release
131 BUILD_TARGET: [BENCHMARK, TEST]
132- BUILD_VERSION: [14, 17]
133+ BUILD_VERSION: 17
134
135 build:cmake-minimum:
136 needs: []
137@@ -223,7 +261,7 @@ build:cmake-minimum:
138 matrix:
139 - BUILD_TYPE: [Debug, Release]
140 BUILD_TARGET: [BENCHMARK, TEST]
141- BUILD_VERSION: 14
142+ BUILD_VERSION: 17
143
144 build:package:
145 stage: build
146@@ -240,7 +278,7 @@ build:package:
147 -G Ninja
148 -D CMAKE_CXX_COMPILER="$AMDCLANG"
149 -D CMAKE_BUILD_TYPE=Release
150- -D CMAKE_CXX_STANDARD=14
151+ -D CMAKE_CXX_STANDARD=17
152 -B $PACKAGE_DIR
153 -S $CI_PROJECT_DIR
154 - cd $PACKAGE_DIR
155@@ -249,7 +287,7 @@ build:package:
156 paths:
157 - $PACKAGE_DIR/rocprim*.deb
158 - $PACKAGE_DIR/rocprim*.zip
159- expire_in: 2 weeks
160+ expire_in: 1 day
161
162 build:windows:
163 stage: build
164@@ -261,19 +299,19 @@ build:windows:
165 - .deps:visual-studio-devshell
166 parallel:
167 matrix:
168- - BUILD_TYPE: [Debug, Release]
169- BUILD_TARGET: [BENCHMARK, TEST]
170+ - BUILD_TYPE: Release
171 script:
172 - mkdir -p $CI_PROJECT_DIR/build
173 - cmake -G Ninja
174 -S $CI_PROJECT_DIR
175 -B $CI_PROJECT_DIR/build
176- -D BUILD_$BUILD_TARGET=ON
177- -D GPU_TARGETS=$GPU_TARGET
178+ -D BUILD_TEST=ON
179+ -D BUILD_BENCHMARK=ON
180+ -D AMDGPU_TARGETS=$GPU_TARGET
181 -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
182 -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}"
183 -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
184- -D CMAKE_CXX_STANDARD=14
185+ -D CMAKE_CXX_STANDARD=17
186 - cmake --build "$CI_PROJECT_DIR/build"
187 artifacts:
188 paths:
189@@ -285,7 +323,7 @@ build:windows:
190 - $CI_PROJECT_DIR/build/CMakeCache.txt
191 - $CI_PROJECT_DIR/build/.ninja_log
192 - $CI_PROJECT_DIR/build/CTestTestfile.cmake
193- expire_in: 2 weeks
194+ expire_in: 1 day
195
196 autotune:build:
197 stage: autotune
198@@ -317,10 +355,10 @@ autotune:build:
199 -D BUILD_EXAMPLE=OFF
200 -D BUILD_BENCHMARK=ON
201 -D BENCHMARK_CONFIG_TUNING=ON
202- -D GPU_TARGETS=$GPU_TARGETS
203+ -D AMDGPU_TARGETS=$GPU_TARGETS
204 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
205 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
206- -D CMAKE_CXX_STANDARD=14
207+ -D CMAKE_CXX_STANDARD=17
208 - cmake --build . --target $BENCHMARK_TARGETS
209 - 'rm -rf $BUILD_DIR/benchmark/benchmark*.parallel'
210 # The autotune benchmarks get very large, above GitLabs upload limit. Fortunately they compress well.
211@@ -334,19 +372,20 @@ autotune:build:
212 - $BUILD_DIR/deps/googlebenchmark/
213 expire_in: 1 week
214
215-test:
216+.test:common:
217 stage: test
218+ tags:
219+ - rocm
220+ - $GPU
221 extends:
222 - .cmake-minimum
223- - .rules:test
224- - .gpus:rocm
225 needs:
226 - job: build:cmake-minimum
227 parallel:
228 matrix:
229 - BUILD_TYPE: Release
230 BUILD_TARGET: TEST
231- BUILD_VERSION: 14
232+ BUILD_VERSION: 17
233 script:
234 - cd $BUILD_DIR
235 - cmake
236@@ -359,10 +398,88 @@ test:
237 - HSA_ENABLE_SDMA=0 ctest
238 --output-on-failure
239 --repeat-until-fail 2
240- --tests-regex "hip|$GPU_TARGET"
241 --resource-spec-file ./resources.json
242 --parallel $PARALLEL_JOBS
243
244+test:any-gpu:
245+ variables:
246+ GPU: ""
247+ PARALLEL_JOBS: 1
248+ extends:
249+ - .test:common
250+ rules:
251+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/
252+
253+test:label-arch:
254+ extends:
255+ - .gpus:rocm
256+ - .test:common
257+ - .rules:arch-labels
258+
259+test:all-gpus:
260+ variables:
261+ SHOULD_BE_UNDRAFTED: "true"
262+ extends:
263+ - .gpus:rocm
264+ - .test:common
265+ - .rules:test
266+
267+.test:common-spirv:
268+ stage: test
269+ tags:
270+ - rocm
271+ - $GPU
272+ extends:
273+ - .cmake-minimum
274+ allow_failure: true
275+ timeout: 3h
276+ needs:
277+ - job: build:spirv
278+ parallel:
279+ matrix:
280+ - BUILD_TYPE: Release
281+ BUILD_TARGET: TEST
282+ BUILD_VERSION: 17
283+ image: "registry.streamhpc.internal/unstable-rocm:main"
284+ script:
285+ - cd $BUILD_DIR
286+ - cmake
287+ -D CMAKE_PREFIX_PATH=/opt/rocm
288+ -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake
289+ - cat ./resources.json
290+ # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue.
291+ # This causes the hipMemcpy to fail, which is not reported as an error by HIP.
292+ # As a temporary workaround, disable the SDMA for test stability.
293+ - HSA_ENABLE_SDMA=0 ctest
294+ --output-on-failure
295+ --repeat-until-fail 2
296+ --resource-spec-file ./resources.json
297+ --parallel $PARALLEL_JOBS
298+ --exclude-regex rocprim.device_partition
299+
300+test:any-gpu-spirv:
301+ variables:
302+ GPU: ""
303+ PARALLEL_JOBS: 1
304+ extends:
305+ - .test:common-spirv
306+ rules:
307+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/
308+
309+test:label-arch-spirv:
310+ extends:
311+ - .gpus:rocm
312+ - .test:common-spirv
313+ - .rules:arch-labels
314+
315+test:all-gpus-spirv:
316+ variables:
317+ SHOULD_BE_UNDRAFTED: "true"
318+ extends:
319+ - .gpus:rocm
320+ - .test:common-spirv
321+ - .rules:test
322+
323 .test-windows-base:
324 stage: test
325 extends:
326@@ -394,7 +511,6 @@ test-windows-release:
327 parallel:
328 matrix:
329 - BUILD_TYPE: Release
330- BUILD_TARGET: TEST
331
332 .test-package:
333 script:
334@@ -402,8 +518,8 @@ test-windows-release:
335 -G Ninja
336 -D CMAKE_CXX_COMPILER="$AMDCLANG"
337 -D CMAKE_BUILD_TYPE=Release
338- -D GPU_TARGETS=$GPU_TARGETS
339- -D CMAKE_CXX_STANDARD=14
340+ -D AMDGPU_TARGETS=$GPU_TARGETS
341+ -D CMAKE_CXX_STANDARD=17
342 -S "$CI_PROJECT_DIR/test/extra"
343 -B "$CI_PROJECT_DIR/package_test"
344 - cmake --build "$CI_PROJECT_DIR/package_test"
345@@ -425,10 +541,11 @@ test:install:
346 -G Ninja
347 -D CMAKE_CXX_COMPILER="$AMDCLANG"
348 -D CMAKE_BUILD_TYPE=Release
349- -D CMAKE_CXX_STANDARD=14
350+ -D CMAKE_CXX_STANDARD=17
351 -B build
352 -S $CI_PROJECT_DIR
353- - $SUDO_CMD cmake --build build --target install
354+ # Preserve $PATH when sudoing
355+ - $SUDO_CMD env PATH="$PATH" cmake --build build --target install
356 - !reference [.test-package, script]
357
358 test:deb:
359@@ -452,6 +569,10 @@ test:docs:
360 extends:
361 - .rules:test
362 - .build:docs
363+ artifacts:
364+ paths:
365+ - $DOCS_DIR/_build/html/
366+ expire_in: 2 weeks
367
368 .benchmark-base:
369 stage: benchmark
370@@ -468,7 +589,7 @@ benchmark:
371 matrix:
372 - BUILD_TYPE: Release
373 BUILD_TARGET: BENCHMARK
374- BUILD_VERSION: 14
375+ BUILD_VERSION: 17
376 extends:
377 - .cmake-minimum
378 - .gpus:rocm
379@@ -489,6 +610,11 @@ benchmark:
380 --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}"
381 --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}"
382 --seed "${BENCHMARK_SEED}"
383+ - python3
384+ .gitlab/report_noise.py
385+ --benchmark_json_dir "${BENCHMARK_RESULT_DIR}"
386+ --noise_threshold_percentage 1.0
387+ --accept_high_noise
388 artifacts:
389 paths:
390 - ${BENCHMARK_RESULT_DIR}
391@@ -577,6 +703,11 @@ autotune:execute-tuning:
392 --size="${AUTOTUNE_SIZE}"
393 --trials="${AUTOTUNE_TRIALS}"
394 --seed=82589933
395+ - python3
396+ .gitlab/report_noise.py
397+ --benchmark_json_dir "${AUTOTUNE_RESULT_DIR}"
398+ --noise_threshold_percentage 1.0
399+ --accept_high_noise
400
401 autotune:generate-config:
402 image: python:3.10.5-buster
403diff --git a/.gitlab/report_noise.py b/.gitlab/report_noise.py
404new file mode 100644
405index 0000000..974c2b1
406--- /dev/null
407+++ b/.gitlab/report_noise.py
408@@ -0,0 +1,251 @@
409+#!/usr/bin/env python3
410+
411+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
412+#
413+# Permission is hereby granted, free of charge, to any person obtaining a copy
414+# of this software and associated documentation files (the "Software"), to deal
415+# in the Software without restriction, including without limitation the rights
416+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
417+# copies of the Software, and to permit persons to whom the Software is
418+# furnished to do so, subject to the following conditions:
419+#
420+# The above copyright notice and this permission notice shall be included in
421+# all copies or substantial portions of the Software.
422+#
423+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
424+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
425+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
426+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
427+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
428+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
429+# THE SOFTWARE.
430+
431+
432+import argparse
433+import json
434+import os
435+import re
436+import stat
437+import statistics
438+import sys
439+
440+
441+class colors:
442+ OK = "\033[92m"
443+ FAIL = "\033[91m"
444+ END_COLOR = "\033[0m"
445+
446+
447+def print_results(results):
448+ # Store the length of the longest value in a column
449+ longest = {
450+ "name": max(len(result["name"]) for result in results),
451+ "noisy_permutations": max(
452+ len(result["noisy_permutations"]) for result in results
453+ ),
454+ "mean": max(len(result["mean"]) for result in results),
455+ "median": max(len(result["median"]) for result in results),
456+ "max": max(len(result["max"]) for result in results),
457+ "batch": max(len(result["batch"]) for result in results),
458+ "warmup": max(len(result["warmup"]) for result in results),
459+ "bytes": max(len(result["bytes"]) for result in results),
460+ }
461+
462+ # The name of a column can be longer than its values
463+ longest = {key: max(value, len(key)) for key, value in longest.items()}
464+
465+ printed = "name".ljust(longest["name"] + 1)
466+ printed += "noisy permutations".ljust(longest["noisy_permutations"] + 1)
467+ printed += "mean".ljust(longest["mean"] + 1)
468+ printed += "median".ljust(longest["median"] + 1)
469+ printed += "max".ljust(longest["max"] + 1)
470+ printed += "batch".ljust(longest["batch"] + 1)
471+ printed += "warmup".ljust(longest["warmup"] + 1)
472+ printed += "bytes".ljust(longest["bytes"] + 1)
473+ printed += "seed"
474+ print(printed)
475+
476+ for result in results:
477+ printed = result["name"].ljust(longest["name"])
478+
479+ printed += " "
480+ printed += colors.FAIL if result["noisy"] else colors.OK
481+ printed += (
482+ f'{result["noisy_permutations"].ljust(longest["noisy_permutations"])}'
483+ )
484+ printed += colors.END_COLOR
485+
486+ printed += " "
487+ printed += colors.FAIL if result["bad_mean"] else colors.OK
488+ printed += result["mean"].ljust(longest["mean"])
489+ printed += colors.END_COLOR
490+
491+ printed += " "
492+ printed += colors.FAIL if result["bad_median"] else colors.OK
493+ printed += result["median"].ljust(longest["median"])
494+ printed += colors.END_COLOR
495+
496+ printed += " "
497+ printed += colors.FAIL if result["bad_max"] else colors.OK
498+ printed += result["max"].ljust(longest["max"])
499+ printed += colors.END_COLOR
500+
501+ printed += " "
502+ printed += colors.FAIL if result["bad_batch"] else colors.OK
503+ printed += result["batch"].ljust(longest["batch"])
504+ printed += colors.END_COLOR
505+
506+ printed += " "
507+ printed += colors.FAIL if result["bad_warmup"] else colors.OK
508+ printed += result["warmup"].ljust(longest["warmup"])
509+ printed += colors.END_COLOR
510+
511+ printed += " "
512+ printed += colors.FAIL if result["bad_bytes"] else colors.OK
513+ printed += result["bytes"].ljust(longest["bytes"])
514+ printed += colors.END_COLOR
515+
516+ printed += " "
517+ printed += colors.FAIL if result["seed"] == "random" else colors.OK
518+ printed += result["seed"]
519+ printed += colors.END_COLOR
520+
521+ print(printed)
522+
523+
524+def get_results(benchmarks, threshold):
525+ def get_humanized_bytes(size):
526+ for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
527+ if size < 1024.0 or unit == "PiB":
528+ break
529+ size /= 1024.0
530+ return f"{size:.1f} {unit}"
531+
532+ success = True
533+
534+ results = []
535+
536+ for benchmark in benchmarks:
537+ data = benchmark["data"]
538+
539+ name = benchmark["name"]
540+
541+ permutations = data["benchmarks"]
542+
543+ cvs = [permutation["cv"] for permutation in permutations]
544+
545+ # The cv (coefficient of variation) is a standard way of quantifying noise
546+ noises = sum(cv * 100 > threshold for cv in cvs)
547+ noisy = noises > 0
548+
549+ if noisy:
550+ success = False
551+
552+ context = data["context"]
553+
554+ noisy_permutations = f"{noises}/{len(permutations)}"
555+
556+ mean = statistics.mean(cvs)
557+ median = statistics.median(cvs)
558+ max_ = max(cvs)
559+
560+ batch = context["batch_iterations"]
561+ warmup = context["warmup_iterations"]
562+
563+ bytes_ = int(context["size"])
564+ seed = context["seed"]
565+
566+ results.append(
567+ {
568+ "name": name,
569+ "noisy": noisy,
570+ "noisy_permutations": noisy_permutations,
571+ "bad_mean": mean * 100 > threshold,
572+ "mean": f"{mean:.1%}",
573+ "bad_median": median * 100 > threshold,
574+ "median": f"{median:.1%}",
575+ "bad_max": max_ * 100 > threshold,
576+ "max": f"{max_:.1%}",
577+ "bad_batch": int(batch) < 10,
578+ "batch": batch,
579+ "bad_warmup": int(warmup) < 5,
580+ "warmup": warmup,
581+ "bad_bytes": 0 < bytes_ < 128 * 1024 * 1024, # 128 MiB
582+ "bytes": get_humanized_bytes(int(context["size"])),
583+ "seed": seed,
584+ }
585+ )
586+
587+ return results, success
588+
589+
590+def load_benchmarks(benchmark_json_dir):
591+ def is_benchmark_json(filename):
592+ if not re.match(r".*\.json$", filename):
593+ return False
594+ path = os.path.join(benchmark_json_dir, filename)
595+ st_mode = os.stat(path).st_mode
596+
597+ # we are not interested in permissions, just whether it is a regular file (S_IFREG)
598+ return st_mode & stat.S_IFREG
599+
600+ benchmark_names = [
601+ name for name in os.listdir(benchmark_json_dir) if is_benchmark_json(name)
602+ ]
603+
604+ success = True
605+ benchmarks = []
606+ for benchmark_name in benchmark_names:
607+ with open(os.path.join(benchmark_json_dir, benchmark_name)) as f:
608+ try:
609+ benchmarks.append({"name": benchmark_name, "data": json.load(f)})
610+ except json.JSONDecodeError as e:
611+ print(
612+ f"{colors.FAIL}Failed to load {benchmark_name}{colors.END_COLOR}: {e}\n",
613+ file=sys.stderr,
614+ )
615+ success = False
616+
617+ return benchmarks, success
618+
619+
620+def main():
621+ parser = argparse.ArgumentParser()
622+ parser.add_argument(
623+ "--noise_threshold_percentage",
624+ help="The noise threshold percentage, past which benchmark permutations are considered to be too noisy",
625+ required=True,
626+ type=float,
627+ )
628+ parser.add_argument(
629+ "--benchmark_json_dir",
630+ help="The directory of benchmark JSON files, which to report the noise of",
631+ required=True,
632+ )
633+ parser.add_argument(
634+ "--accept_high_noise",
635+ help="Don't call exit(1) when there is a noisy benchmark permutation",
636+ action=argparse.BooleanOptionalAction,
637+ )
638+ args = parser.parse_args()
639+
640+ print(f"The noise threshold is {args.noise_threshold_percentage:.1f}%\n")
641+
642+ benchmarks, load_success = load_benchmarks(args.benchmark_json_dir)
643+ results, results_success = get_results(benchmarks, args.noise_threshold_percentage)
644+
645+ print_results(results)
646+
647+ if not load_success:
648+ return False
649+ if args.accept_high_noise:
650+ return True
651+ return results_success
652+
653+
654+if __name__ == "__main__":
655+ success = main()
656+ if success:
657+ exit(0)
658+ else:
659+ exit(1)
660diff --git a/.gitlab/run_benchmarks.py b/.gitlab/run_benchmarks.py
661index f406ea4..d61b4d9 100755
662--- a/.gitlab/run_benchmarks.py
663+++ b/.gitlab/run_benchmarks.py
664@@ -1,6 +1,6 @@
665 #!/usr/bin/env python3
666
667-# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
668+# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
669 #
670 # Permission is hereby granted, free of charge, to any person obtaining a copy
671 # of this software and associated documentation files (the "Software"), to deal
672@@ -22,13 +22,14 @@
673
674 import argparse
675 from collections import namedtuple
676+import json
677 import os
678 import re
679 import stat
680 import subprocess
681 import sys
682
683-BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed'])
684+BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed', 'skip_gathered'])
685
686 def run_benchmarks(benchmark_context):
687 def is_benchmark_executable(filename):
688@@ -41,6 +42,18 @@ def run_benchmarks(benchmark_context):
689 # and it is a regular file (S_IFREG)
690 return (st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) and (st_mode & stat.S_IFREG)
691
692+ def should_skip(results_json_path):
693+ if not benchmark_context.skip_gathered:
694+ return False
695+
696+ try:
697+ with open(results_json_path) as f:
698+ json.load(f)
699+ except (FileNotFoundError, json.JSONDecodeError):
700+ return False
701+
702+ return True
703+
704 success = True
705 benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)]
706 print('The following benchmarks will be ran:\n{}'.format('\n'.join(benchmark_names)), file=sys.stderr, flush=True)
707@@ -49,11 +62,11 @@ def run_benchmarks(benchmark_context):
708
709 benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name)
710 results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name)
711+ if should_skip(results_json_path):
712+ print(f'Skipping {benchmark_name}, because its results have already been gathered at {results_json_path}', file=sys.stderr, flush=True)
713+ continue
714 args = [
715 benchmark_path,
716- '--name_format',
717- 'json',
718- '--benchmark_out_format=json',
719 f'--benchmark_out={results_json_path}',
720 f'--benchmark_filter={benchmark_context.benchmark_filter_regex}'
721 ]
722@@ -103,6 +116,11 @@ def main():
723 help='Controls the seed for random number generation for each benchmark case',
724 default='',
725 required=False)
726+ parser.add_argument('--skip_gathered',
727+ help='Skip running benchmarks whose JSON data has already been gathered',
728+ default=False,
729+ action='store_true',
730+ required=False)
731
732 args = parser.parse_args()
733
734@@ -114,7 +132,8 @@ def main():
735 args.benchmark_filter_regex,
736 args.size,
737 args.trials,
738- args.seed)
739+ args.seed,
740+ args.skip_gathered)
741
742 benchmark_run_successful = run_benchmarks(benchmark_context)
743
744diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
745deleted file mode 100644
746index d49a782..0000000
747--- a/.jenkins/common.groovy
748+++ /dev/null
749@@ -1,76 +0,0 @@
750-// This file is for internal AMD use.
751-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
752-
753-def runCompileCommand(platform, project, jobName, boolean debug=false)
754-{
755- project.paths.construct_build_prefix()
756-
757- String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
758- String buildTypeDir = debug ? 'debug' : 'release'
759- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
760- //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
761- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
762-
763- def command = """#!/usr/bin/env bash
764- set -x
765- cd ${project.paths.project_build_prefix}
766- mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
767- ${auxiliary.gfxTargetParser()}
768- ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
769- make -j\$(nproc)
770- """
771-
772- platform.runCommand(this, command)
773-}
774-
775-
776-def runTestCommand (platform, project)
777-{
778- String sudo = auxiliary.sudo(platform.jenkinsLabel)
779-
780- def testCommand = "ctest --output-on-failure "
781- def testCommandExcludeRegex = ''
782- def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\""
783- def hmmExcludeRegex = ''
784- def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\""
785- def hmmTestCommand = ''
786- if (platform.jenkinsLabel.contains('gfx90a'))
787- {
788- echo("HMM TESTS DISABLED")
789- /*hmmTestCommand = """
790- export HSA_XNACK=1
791- export ROCPRIM_USE_HMM=1
792- ${testCommand} ${hmmTestCommandExclude}
793- """*/
794- }
795- echo(env.JOB_NAME)
796- if (env.JOB_NAME.contains('bleeding-edge'))
797- {
798- testCommand = ''
799- testCommandExclude = ''
800- hmmTestCommand = ''
801- echo("TESTS DISABLED")
802- }
803- def command = """#!/usr/bin/env bash
804- set -x
805- cd ${project.paths.project_build_prefix}
806- cd ${project.testDirectory}
807- ${testCommand} ${testCommandExclude}
808- if (( \$? != 0 )); then
809- exit 1
810- fi
811- ${hmmTestCommand}
812- """
813-
814- platform.runCommand(this, command)
815-}
816-
817-def runPackageCommand(platform, project)
818-{
819- def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
820-
821- platform.runCommand(this, packageHelper[0])
822- platform.archiveArtifacts(this, packageHelper[1])
823-}
824-
825-return this
826diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
827deleted file mode 100644
828index dd32cad..0000000
829--- a/.jenkins/precheckin.groovy
830+++ /dev/null
831@@ -1,81 +0,0 @@
832-#!/usr/bin/env groovy
833-@Library('rocJenkins@pong') _
834-import com.amd.project.*
835-import com.amd.docker.*
836-import java.nio.file.Path;
837-
838-def runCI =
839-{
840- nodeDetails, jobName->
841-
842- def prj = new rocProject('rocPRIM', 'PreCheckin')
843- prj.paths.build_command = './install -c'
844- prj.timeout.compile = 600
845-
846- def nodes = new dockerNodes(nodeDetails, jobName, prj)
847-
848- def commonGroovy
849-
850- boolean formatCheck = false
851-
852- def compileCommand =
853- {
854- platform, project->
855-
856- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
857- commonGroovy.runCompileCommand(platform, project, jobName)
858- }
859-
860- def testCommand =
861- {
862- platform, project->
863-
864- commonGroovy.runTestCommand(platform, project)
865- }
866-
867- def packageCommand =
868- {
869- platform, project->
870-
871- commonGroovy.runPackageCommand(platform, project)
872- }
873-
874- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
875-}
876-
877-ci: {
878- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
879-
880- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
881- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
882- "rocm-docker":[]]
883- propertyList = auxiliary.appendPropertyList(propertyList)
884-
885- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
886- jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
887-
888- propertyList.each
889- {
890- jobName, property->
891- if (urlJobName == jobName)
892- properties(auxiliary.addCommonProperties(property))
893- }
894-
895- jobNameList.each
896- {
897- jobName, nodeDetails->
898- if (urlJobName == jobName)
899- stage(jobName) {
900- runCI(nodeDetails, jobName)
901- }
902- }
903-
904- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
905- if(!jobNameList.keySet().contains(urlJobName))
906- {
907- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
908- stage(urlJobName) {
909- runCI([ubuntu16:['gfx906']], urlJobName)
910- }
911- }
912-}
913diff --git a/.jenkins/static.groovy b/.jenkins/static.groovy
914deleted file mode 100644
915index 9466b27..0000000
916--- a/.jenkins/static.groovy
917+++ /dev/null
918@@ -1,82 +0,0 @@
919-#!/usr/bin/env groovy
920-@Library('rocJenkins@pong') _
921-import com.amd.project.*
922-import com.amd.docker.*
923-import java.nio.file.Path;
924-
925-def runCI =
926-{
927- nodeDetails, jobName->
928-
929- def prj = new rocProject('rocPRIM', 'static')
930- prj.paths.build_command = './install -c -s'
931- prj.timeout.compile = 600
932- prj.timeout.packaging = 120
933-
934- def nodes = new dockerNodes(nodeDetails, jobName, prj)
935-
936- def commonGroovy
937-
938- boolean formatCheck = false
939-
940- def compileCommand =
941- {
942- platform, project->
943-
944- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
945- commonGroovy.runCompileCommand(platform, project, jobName)
946- }
947-
948- def testCommand =
949- {
950- platform, project->
951-
952- commonGroovy.runTestCommand(platform, project)
953- }
954-
955- def packageCommand =
956- {
957- platform, project->
958-
959- commonGroovy.runPackageCommand(platform, project)
960- }
961-
962- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
963-}
964-
965-ci: {
966- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
967-
968- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
969- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
970- "rocm-docker":[]]
971- propertyList = auxiliary.appendPropertyList(propertyList)
972-
973- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
974- jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
975-
976- propertyList.each
977- {
978- jobName, property->
979- if (urlJobName == jobName)
980- properties(auxiliary.addCommonProperties(property))
981- }
982-
983- jobNameList.each
984- {
985- jobName, nodeDetails->
986- if (urlJobName == jobName)
987- stage(jobName) {
988- runCI(nodeDetails, jobName)
989- }
990- }
991-
992- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
993- if(!jobNameList.keySet().contains(urlJobName))
994- {
995- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
996- stage(urlJobName) {
997- runCI([ubuntu16:['gfx906']], urlJobName)
998- }
999- }
1000-}
1001diff --git a/CHANGELOG.md b/CHANGELOG.md
1002index 624c2f5..8fcaa79 100644
1003--- a/CHANGELOG.md
1004+++ b/CHANGELOG.md
1005@@ -2,19 +2,177 @@
1006
1007 Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/).
1008
1009+## rocPRIM 4.1.0 for ROCm 7.1
1010+
1011+### Added
1012+
1013+* Added `get_sreg_lanemask_lt`, `get_sreg_lanemask_le`, `get_sreg_lanemask_gt` and `get_sreg_lanemask_ge`.
1014+* Added `rocprim::transform_output_iterator` and `rocprim::make_transform_output_iterator`.
1015+* Added experimental support for SPIR-V, to use the correct tuned config for part of the appliable algorithms.
1016+* Added a new cmake option, `BUILD_OFFLOAD_COMPRESS`. When rocPRIM is build with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful in cases where you are compiling for a large number of targets, since this often results in a large binary. Without compression, in some cases, the generated binary may become so large symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default.
1017+* Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from `ROCm` libraries provided by the system.
1018+* Added `rocprim::apply` which applies a function to a `rocprim::tuple`.
1019+
1020+### Changed
1021+
1022+* Changed tests to support `ptr-to-const` output in `/test/rocprim/test_device_batch_memcpy.cpp`.
1023+
1024+### Optimizations
1025+
1026+* Improved performance of many algorithms, by updating their tuned configs.
1027+ * 891 specializations have been improved.
1028+ * 399 specializations have been added.
1029+
1030+### Upcoming changes
1031+
1032+* Deprecated the `->` operator for the `zip_iterator`.
1033+
1034+### Resolved issues
1035+
1036+* Fixed `device_select`, `device_merge`, and `device_merge_sort` not allocating the correct amount of virtual shared memory on the host.
1037+* Fixed the `->` operator for the `transform_iterator`, the `texture_cache_iterator` and the `arg_index_iterator`, by now returning a proxy pointer.
1038+ * The `arg_index_iterator` also now only returns the internal iterator for the `->`.
1039+
1040+## rocPRIM 4.0.1 for ROCm 7.0.2
1041+
1042+### Resolved issues
1043+
1044+* Fixed compilation issue when using `rocprim::texture_cache_iterator`.
1045+* Fixed a HIP version check used to determine whether hipStreamLegacy is supported. This resolves runtime errors that occur when hipStreamLegacy is used in versions of ROCm later than 6.4.
1046+
1047+## rocPRIM 4.0.0 for ROCm 7.0
1048+
1049+### Added
1050+
1051+* Added `rocprim::accumulator_t` to ensure parity with CCCL.
1052+* Added test for `rocprim::accumulator_t`
1053+* Added `rocprim::invoke_result_r` to ensure parity with CCCL.
1054+* Added function `is_build_in` into `rocprim::traits::get`.
1055+* Added virtual shared memory as a fallback option in `rocprim::device_merge` when it exceeds shared memory capacity, similar to `rocprim::device_select`, `rocprim::device_partition`, and `rocprim::device_merge_sort`, which already include this feature.
1056+* Added initial value support to device level inclusive scans.
1057+* Added new optimization to the backend for `device_transform` when the input and output are pointers.
1058+* Added `LoadType` to `transform_config`, which is used for the `device_transform` when the input and output are pointers.
1059+* Added `rocprim:device_transform` for n-ary transform operations API with as input `n` number of iterators inside a `rocprim::tuple`.
1060+* Added gfx950 support.
1061+* Added `rocprim::key_value_pair::operator==`.
1062+* Added the `rocprim::unrolled_copy` thread function to copy multiple items inside a thread.
1063+* Added the `rocprim::unrolled_thread_load` function to load multiple items inside a thread using `rocprim::thread_load`.
1064+* Added `rocprim::int128_t` and `rocprim::uint128_t` to benchmarks for improved performance evaluation on 128-bit integers.
1065+* Added `rocprim::int128_t` to the supported autotuning types to improve performance for 128-bit integers.
1066+* Added the `rocprim::merge_inplace` function for merging in-place.
1067+* Added initial value support for warp- and block-level inclusive scan.
1068+* Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag.
1069+* Added tests and documentation to `lookback_scan_state`. It is still in the `detail` namespace.
1070+
1071+### Optimizations
1072+
1073+* Improved performance of `rocprim::device_select` and `rocprim::device_partition` when using multiple streams on the MI3XX architecture.
1074+
1075+### Changed
1076+
1077+* Changed the parameters `long_radix_bits` and `LongRadixBits` from `segmented_radix_sort` to `radix_bits` and `RadixBits` respectively.
1078+* Marked the initialisation constructor of `rocprim::reverse_iterator<Iter>` `explicit`, use `rocprim::make_reverse_iterator`.
1079+* Merged `radix_key_codec` into type_traits system.
1080+* Renamed `type_traits_interface.hpp` to `type_traits.hpp`, rename the original `type_traits.hpp` to `type_traits_functions.hpp`.
1081+* The default scan accumulator types for device-level scan algorithms have changed. This is a breaking change.
1082+The previous default accumulator types could lead to situations in which unexpected overflow occured, such as
1083+when the input or inital type was smaller than the output type.
1084+ * This is a complete list of affected functions and how their default accumulator types are changing:
1085+ * `rocprim::inclusive_scan`
1086+ * Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
1087+ * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>`
1088+ * `rocprim::deterministic_inclusive_scan`
1089+ * Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
1090+ * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>`
1091+ * `rocprim::exclusive_scan`
1092+ * Previous default: `class AccType = detail::input_type_t<InitValueType>>`
1093+ * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, rocprim::detail::input_type_t<InitValueType>>`
1094+ * `rocprim::deterministic_exclusive_scan`
1095+ * Previous default: `class AccType = detail::input_type_t<InitValueType>>`
1096+ * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, rocprim::detail::input_type_t<InitValueType>>`
1097+* Undeprecated internal `detail::raw_storage`.
1098+* A new version of `rocprim::thread_load` and `rocprim::thread_store` replace the deprecated `rocprim::thread_load` and `rocprim::thread_store` functions. The versions avoid inline assembly where possible, and don't hinder the optimizer as much as a result.
1099+* Renamed `rocprim::load_cs` to `rocprim::load_nontemporal` and `rocprim::store_cs` to `rocprim::store_nontemporal` to express the intent of these load and store methods better.
1100+* All kernels now have hidden symbol visibility. All symbols now have inline namespaces that include the library version, for example, `rocprim::ROCPRIM_300400_NS::symbol` instead of `rocPRIM::symbol`, letting the user link multiple libraries built with different versions of rocPRIM.
1101+
1102+### Upcoming changes
1103+
1104+* `rocprim::invoke_result_binary_op` and `rocprim::invoke_result_binary_op_t` are deprecated. Use `rocprim::accumulator_t` now.
1105+
1106+### Removed
1107+
1108+* Removed `rocprim::detail::float_bit_mask` and relative tests, use `rocprim::traits::float_bit_mask` instead.
1109+* Removed `rocprim::traits::is_fundamental`, please use `rocprim::traits::get<T>::is_fundamental()` directly.
1110+* Removed the deprecated parameters `short_radix_bits` and `ShortRadixBits` from the `segmented_radix_sort` config. They were unused, it is only an API change.
1111+* Removed the deprecated `operator<<` from the iterators.
1112+* Removed the deprecated `TwiddleIn` and `TwiddleOut`. Use `radix_key_codec` instead.
1113+* Removed the deprecated flags API of `block_adjacent_difference`. Use `subtract_left()` or `block_discontinuity::flag_heads()` instead.
1114+* Removed the deprecated `to_exclusive` functions in the warp scans.
1115+* Removed the `rocprim::load_cs` from the `cache_load_modifier` enum. Use `rocprim::load_nontemporal` instead.
1116+* Removed the `rocprim::store_cs` from the `cache_store_modifier` enum. Use `rocprim::store_nontemporal` instead.
1117+* Removed the deprecated header file `rocprim/detail/match_result_type.hpp`. Include `rocprim/type_traits.hpp` instead.
1118+ * This header included `rocprim::detail::invoke_result`. Use `rocprim::invoke_result` instead.
1119+ * This header included `rocprim::detail::invoke_result_binary_op`. Use `rocprim::invoke_result_binary_op` instead.
1120+ * This header included `rocprim::detail::match_result_type`. Use `rocprim::invoke_result_binary_op_t` instead.
1121+* Removed the deprecated `rocprim::detail::radix_key_codec` function. Use `rocprim::radix_key_codec` instead.
1122+* Removed `rocprim/detail/radix_sort.hpp`, functionality can now be found in `rocprim/thread/radix_key_codec.hpp`.
1123+* Removed C++14 support, only C++17 is supported.
1124+* Due to the removal of `__AMDGCN_WAVEFRONT_SIZE` in the compiler, the following deprecated warp size-related symbols have been removed:
1125+ * `rocprim::device_warp_size()`
1126+ * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory.
1127+ * For run-time constants, this is replaced with `rocprim::arch::wavefront::size().`
1128+ * `rocprim::warp_size()`
1129+ * Use `rocprim::host_warp_size()`, `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead.
1130+ * `ROCPRIM_WAVEFRONT_SIZE`
1131+ * Use `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead.
1132+ * `__AMDGCN_WAVEFRONT_SIZE`
1133+ * This was a fallback define for the compiler's removed symbol, having the same name.
1134+* This release removes support for custom builds on gfx940 and gfx941.
1135+
1136+### Resolved issues
1137+
1138+* Fixed an issue where `device_batch_memcpy` reported benchmarking throughput being 2x lower than it was in reality.
1139+* Fixed an issue where `device_segmented_reduce` reported autotuning throughput being 5x lower than it was in reality.
1140+* Fixed device radix sort not returning the correct required temporary storage when a double buffer contains `nullptr`.
1141+* Fixed constness of equality operators (`==` and `!=`) in `rocprim::key_value_pair`.
1142+* Fixed an issue for the comparison operators in `arg_index_iterator` and `texture_cache_iterator`, where `<` and `>` comparators were swapped.
1143+* Fixed an issue for the `rocprim::thread_reduce` not working correctly with a prefix value.
1144+
1145+### Known issues
1146+* When using `rocprim::deterministic_inclusive_scan_by_key` and `rocprim::deterministic_exclusive_scan_by_key` the intermediate values can change order on Navi3x
1147+ * However if a commutative scan operator is used then the final scan value (output array) will still always be consistent between runs
1148+
1149 ## rocPRIM 3.4.1 for ROCm 6.4.2
1150
1151 ### Upcoming changes
1152 * Changes to the template parameters of warp and block algorithms will be made in an upcoming release.
1153
1154-### Deprecations
1155 * Due to an upcoming compiler change the following warp size-related symbols will be removed in the next major release and are thus marked as deprecated:
1156 * `rocprim::device_warp_size()`
1157 * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory.
1158 * For run-time constants, this is replaced with `rocprim::arch::wavefront::size().`
1159 * `rocprim::warp_size()`
1160- * `ROCPRIM_WAVEFRONT_SIZE
1161+ * `ROCPRIM_WAVEFRONT_SIZE`
1162
1163+* The default scan accumulator types for device-level scan algorithms will be changed in an upcoming release, resulting in a breaking change. Previously, the default accumulator type was set to the input type for the inclusive scans and to the initial value type for the exclusive scans. This could lead to unexpected overflow if the input or initial type was smaller than the output type when the accumulator type was't explicitly set using the `AccType` template parameter. The new default accumulator types will be set to the type that results when the input or initial value type is applied to the scan operator.
1164+
1165+ The following is the complete list of affected functions and how their default accumulator types are changing:
1166+
1167+ * `rocprim::inclusive_scan`
1168+ * current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
1169+ * future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>`
1170+ * `rocprim::deterministic_inclusive_scan`
1171+ * current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
1172+ * future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>`
1173+ * `rocprim::exclusive_scan`
1174+ * current default: `class AccType = detail::input_type_t<InitValueType>>`
1175+ * future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`
1176+ * `rocprim::deterministic_exclusive_scan`
1177+ * current default: `class AccType = detail::input_type_t<InitValueType>>`
1178+ * future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`
1179+
1180+* `rocprim::load_cs` and `rocprim::store_cs` are deprecated and will be removed in an upcoming release. Alternatively, you can use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` to load and store values in specific conditions (like bypassing the cache) for `rocprim::thread_load` and `rocprim::thread_store`.
1181+
1182 ## rocPRIM 3.4.0 for ROCm 6.4.0
1183
1184 ### Added
1185@@ -34,8 +192,8 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec
1186 * Added the parallel `search` and `find_end` device functions similar to `std::search` and `std::find_end`, these functions search for the first and last occurrence of the sequence respectively.
1187 * Added a parallel device-level function, `rocprim::search_n`, similar to the C++ Standard Library `std::search_n` algorithm.
1188 * Added new constructors and a `base` function, and added `constexpr` specifier to all functions in `rocprim::reverse_iterator` to improve parity with the C++17 `std::reverse_iterator`.
1189-* Added hipGraph support to device run-length-encode for non trivial runs (`rocprim::run_length_encode_non_trivial_runs`).
1190-* Added configuration autotuning to device run-length-encode for non trivial runs (`rocprim::run_length_encode_non_trivial_runs`) for improved performance on selected architectures.
1191+* Added hipGraph support to device run-length-encode for nontrivial runs (`rocprim::run_length_encode_non_trivial_runs`).
1192+* Added configuration autotuning to device run-length-encode for nontrivial runs (`rocprim::run_length_encode_non_trivial_runs`) for improved performance on selected architectures.
1193 * Added configuration autotuning to device run-length-encode for trivial runs (`rocprim::run_length_encode`) for improved performance on selected architectures.
1194 * Added a new type traits interface to enable users to provide additional type trait information to rocPRIM, facilitating better compatibility with custom types.
1195
1196@@ -523,3 +681,5 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec
1197
1198 * Switched to HIP-Clang as the default compiler
1199 * CMake searches for rocPRIM locally first; if t's not found, CMake downloads it from GitHub
1200+
1201+
1202diff --git a/CMakeLists.txt b/CMakeLists.txt
1203index f6579e5..20999f1 100644
1204--- a/CMakeLists.txt
1205+++ b/CMakeLists.txt
1206@@ -1,6 +1,6 @@
1207 # MIT License
1208 #
1209-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
1210+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
1211 #
1212 # Permission is hereby granted, free of charge, to any person obtaining a copy
1213 # of this software and associated documentation files (the "Software"), to deal
1214@@ -41,11 +41,8 @@ set(CMAKE_HIP_STANDARD 14)
1215 set(CMAKE_HIP_STANDARD_REQUIRED ON)
1216 set(CMAKE_HIP_EXTENSIONS OFF)
1217
1218-# Set CXX standard
1219-if (CMAKE_CXX_STANDARD EQUAL 14)
1220- message(WARNING "C++14 will be deprecated in the next major release")
1221-elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
1222- message(FATAL_ERROR "Only C++14 and C++17 are supported")
1223+if(NOT CMAKE_CXX_STANDARD EQUAL 17)
1224+ message(FATAL_ERROR "Only C++17 is supported")
1225 endif()
1226
1227 if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
1228@@ -68,16 +65,31 @@ include(CMakeDependentOption)
1229 # Disables building tests, benchmarks, examples
1230 option(ONLY_INSTALL "Only install" OFF)
1231 cmake_dependent_option(BUILD_TEST "Build tests (requires googletest)" OFF "NOT ONLY_INSTALL" OFF)
1232+option(WITH_ROCRAND "Build tests with device-side data generation(requires rocRAND)" OFF)
1233 cmake_dependent_option(BUILD_BENCHMARK "Build benchmarks" OFF "NOT ONLY_INSTALL" OFF)
1234 cmake_dependent_option(BUILD_EXAMPLE "Build examples" OFF "NOT ONLY_INSTALL" OFF)
1235 option(BUILD_NAIVE_BENCHMARK "Build naive benchmarks" OFF)
1236 cmake_dependent_option(BUILD_DOCS "Build documentation (requires sphinx)" OFF "NOT ONLY_INSTALL" OFF)
1237 option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF)
1238 option(ROCPRIM_INSTALL "Enable installation of rocPRIM (projects embedding rocPRIM may want to turn this OFF)" ON)
1239+option(ROCPRIM_ENABLE_ASSERTS "Enable asserts in release build)" OFF)
1240+option(BUILD_OFFLOAD_COMPRESS "Build rocPRIM with offload compression" ON)
1241+cmake_dependent_option(USE_SYSTEM_LIB "Use installed ROCm libs when building tests" OFF BUILD_TEST OFF)
1242
1243 check_language(HIP)
1244 cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)
1245
1246+include(CheckCXXCompilerFlag)
1247+
1248+if(BUILD_OFFLOAD_COMPRESS)
1249+ check_cxx_compiler_flag("--offload-compress" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS)
1250+ if(CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS)
1251+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --offload-compress")
1252+ else()
1253+ message(STATUS "Warning: BUILD_OFFLOAD_COMPRESS=ON but flag not supported by compiler. Ignoring option.")
1254+ endif()
1255+endif()
1256+
1257 if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
1258 set(ROCPRIM_PROJECT_IS_TOP_LEVEL TRUE)
1259 else()
1260@@ -104,6 +116,21 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
1261 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
1262 endif()
1263
1264+if(ROCPRIM_ENABLE_ASSERTS)
1265+ if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "")
1266+ string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE)
1267+ set(BUILD_TYPE_CXX_FLAGS "CMAKE_CXX_FLAGS_${BUILD_TYPE}")
1268+ set(BUILD_TYPE_C_FLAGS "CMAKE_C_FLAGS_${BUILD_TYPE}")
1269+ endif()
1270+
1271+ string(REGEX REPLACE "-DNDEBUG( |$)" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
1272+ string(REGEX REPLACE "-DNDEBUG( |$)" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
1273+ if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "")
1274+ string(REGEX REPLACE "-DNDEBUG( |$)" "" ${BUILD_TYPE_CXX_FLAGS} "${${BUILD_TYPE_CXX_FLAGS}}")
1275+ string(REGEX REPLACE "-DNDEBUG( |$)" "" ${BUILD_TYPE_C_FLAGS} "${${BUILD_TYPE_C_FLAGS}}")
1276+ endif()
1277+endif()
1278+
1279 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath")
1280
1281 if(DEFINED BUILD_SHARED_LIBS)
1282@@ -115,6 +142,7 @@ set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared
1283
1284 # Get dependencies (required here to get rocm-cmake)
1285 include(cmake/Dependencies.cmake)
1286+
1287 # Use target ID syntax if supported for GPU_TARGETS
1288 if(USE_HIPCXX)
1289 enable_language(HIP)
1290@@ -130,18 +158,24 @@ else()
1291 if(BUILD_ADDRESS_SANITIZER)
1292 # ASAN builds require xnack
1293 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
1294- TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+"
1295+ TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+"
1296 )
1297 else()
1298 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
1299- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
1300+ TARGETS "gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
1301 )
1302 endif()
1303-
1304 set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
1305 endif()
1306 endif()
1307
1308+# Compressed offload binaries are currently not working with the SPIR-V target
1309+if("amdgcnspirv" IN_LIST GPU_TARGETS)
1310+ if(BUILD_OFFLOAD_COMPRESS)
1311+ message(FATAL_ERROR "Cannot combine SPIR-V and BUILD_OFFLOAD_COMPRESS")
1312+ endif()
1313+endif()
1314+
1315 # TODO: Fix VerifyCompiler for HIP on Windows
1316 if (NOT WIN32)
1317 include(cmake/VerifyCompiler.cmake)
1318@@ -149,25 +183,15 @@ endif()
1319 list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm ${ROCM_ROOT}/llvm ${ROCM_ROOT} ${ROCM_ROOT}/hip)
1320 find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm)
1321
1322-# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG
1323-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
1324-if(ROCPRIM_INSTALL AND BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
1325- rocm_wrap_header_dir(
1326- "${PROJECT_SOURCE_DIR}/rocprim/include/rocprim"
1327- WRAPPER_LOCATIONS rocprim/include/rocprim
1328- OUTPUT_LOCATIONS rocprim/wrapper/include/rocprim
1329- PATTERNS *.hpp
1330- )
1331-endif()
1332-
1333 if(BUILD_CODE_COVERAGE)
1334 add_compile_options(-fprofile-arcs -ftest-coverage)
1335 add_link_options(--coverage)
1336 endif()
1337
1338 # Setup VERSION
1339-set(VERSION_STRING "3.4.1")
1340+set(VERSION_STRING "4.1.0")
1341 rocm_setup_version(VERSION ${VERSION_STRING})
1342+math(EXPR rocprim_VERSION_NUMBER "${rocprim_VERSION_MAJOR} * 100000 + ${rocprim_VERSION_MINOR} * 100 + ${rocprim_VERSION_PATCH}")
1343
1344 # Print configuration summary
1345 include(cmake/Summary.cmake)
1346@@ -182,6 +206,13 @@ endif()
1347
1348 # Tests
1349 if(BUILD_TEST)
1350+ if(USE_SYSTEM_LIB)
1351+ find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim")
1352+ if (NOT ${rocprim_VERSION} VERSION_EQUAL ${VERSION_STRING})
1353+ message(WARNING "The installed rocprim version, ${rocprim_VERSION}, does not match project version ${VERSION_STRING}. Building tests with USE_SYSTEM_LIB=ON may not work properly.")
1354+ endif()
1355+ endif()
1356+
1357 if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
1358 rocm_package_setup_client_component(tests)
1359 endif()
1360@@ -223,14 +254,14 @@ if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
1361 rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}")
1362 rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}")
1363
1364- set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt")
1365+ set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
1366 set(CPACK_RPM_PACKAGE_LICENSE "MIT")
1367
1368 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )
1369
1370 rocm_create_package(
1371 NAME rocprim
1372- DESCRIPTION "Radeon Open Compute Parallel Primitives Library"
1373+ DESCRIPTION "rocPRIM is a header-only library that provides HIP parallel primitives."
1374 MAINTAINER "rocPRIM Maintainer <rocprim-maintainer@amd.com>"
1375 HEADER_ONLY
1376 )
1377diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
1378index 77c2847..8769ab6 100644
1379--- a/CONTRIBUTING.md
1380+++ b/CONTRIBUTING.md
1381@@ -41,6 +41,8 @@ and performs well across a variety of input types and sizes. More specifically:
1382 - Tests and benchmarks must be instantiated with all supported data types.
1383 - If the algorithm uses multiple data types (for instance, if it uses different types for input and output), a selected and representative few combinations should be tested instead of the full combination matrix.
1384
1385+Any utility needed by the tests **and** benchmarks must be added to the appropriate header within the `common` folder. Non-common utilities may be hosted in the corresponding headers from the `test` or `benchmark` folders. For a more detailed description of the cases to be considered for adding new utilities, please check [common](/common/README.md).
1386+
1387 We also employ automated testing and benchmarking via checks that are run when a pull request is created.
1388 These checks:
1389 - test all algorithms for correctness across a variety of input configurations (eg. types, sizes, etc.)
1390diff --git a/LICENSE.txt b/LICENSE.md
1391similarity index 94%
1392rename from LICENSE.txt
1393rename to LICENSE.md
1394index ba22abe..4d43ac8 100644
1395--- a/LICENSE.txt
1396+++ b/LICENSE.md
1397@@ -1,6 +1,6 @@
1398 MIT License
1399
1400-Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
1401+Copyright (C) Advanced Micro Devices, Inc.
1402
1403 Permission is hereby granted, free of charge, to any person obtaining a copy
1404 of this software and associated documentation files (the "Software"), to deal
1405@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1406 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1407 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1408 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1409-SOFTWARE.
1410\ No newline at end of file
1411+SOFTWARE.
1412diff --git a/README.md b/README.md
1413index 329b170..8caa96c 100644
1414--- a/README.md
1415+++ b/README.md
1416@@ -1,7 +1,7 @@
1417 # rocPRIM
1418
1419 > [!NOTE]
1420-> The published documentation is available at [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1421+> The published rocPRIM documentation is available [here](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1422
1423 rocPRIM is a header-only library that provides HIP parallel primitives. You can use this library to
1424 develop performant GPU-accelerated code on AMD ROCm platforms.
1425@@ -28,55 +28,6 @@ Optional:
1426 * Required only for benchmarks. Building benchmarks is off by default.
1427 * This is automatically downloaded and built by the CMake script.
1428
1429-## Documentation
1430-
1431-Documentation for rocPRIM is available at
1432-[https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/).
1433-
1434-### Build documentation locally
1435-
1436-```shell
1437-# Change directory to rocPRIM docs
1438-cd rocPRIM; cd docs
1439-
1440-# Install documentation dependencies
1441-python3 -m pip install -r sphinx/requirements.txt
1442-
1443-# Build the documentation
1444-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
1445-
1446-# To serve the HTML docs locally
1447-cd _build/html
1448-python3 -m http.server
1449-```
1450-
1451-### Build documentation via CMake
1452-
1453-Install [rocm-cmake](https://github.com/ROCm/rocm-cmake/)
1454-
1455-```shell
1456-# Change directory to rocPRIM
1457-cd rocPRIM
1458-
1459-# Install documentation dependencies
1460-python3 -m pip install -r docs/sphinx/requirements.txt
1461-
1462-# Set C++ compiler
1463-# This example uses hipcc and assumes it is at the path /usr/bin
1464-export CXX=hipcc
1465-export PATH=/usr/bin:$PATH
1466-
1467-# Configure the project
1468-cmake -S . -B ./build -D BUILD_DOCS=ON
1469-
1470-# Build the documentation
1471-cmake --build ./build --target doc
1472-
1473-# To serve the HTML docs locally
1474-cd ./build/docs/html
1475-python3 -m http.server
1476-```
1477-
1478 ## Build and install
1479
1480 You can build and install rocPRIM on Linux or Windows.
1481@@ -104,6 +55,7 @@ You can build and install rocPRIM on Linux or Windows.
1482 # If you want to detect failures on a per GFX IP basis, setting it to some set of ips will create
1483 # separate tests with the ip name embedded into the test name. Building for all, but selecting
1484 # tests only of a specific architecture is possible for eg: ctest -R gfx803|gfx900
1485+ # USE_SYSTEM_LIB - OFF by default. Setting this flag to ON will build tests from the installed ROCm libs provided by the system. This only takes effect when BUILD_TEST is ON.
1486 #
1487 # ! IMPORTANT !
1488 # Set C++ compiler to HIP-clang. You can do it by adding 'CXX=<path-to-compiler>'
1489@@ -314,9 +266,8 @@ different types and operations, by passing compile-time configuration structures
1490 parameter. The main "knobs" are usually the size of the block and the number of items processed by a
1491 single thread.
1492
1493-rocPRIM has built-in default configurations for each of its primitives. In order to use the included
1494-configurations, you need to define the macro `ROCPRIM_TARGET_ARCH` as `803` if you want the
1495-algorithms optimized for gfx803 GCN version, or to `900` for gfx900.
1496+rocPRIM has built-in default configurations for each of its primitives, these will be used automatically
1497+based on the input types and the target architecture from the stream used.
1498
1499 ## hipCUB
1500
1501@@ -325,7 +276,86 @@ algorithms optimized for gfx803 GCN version, or to `900` for gfx900.
1502 [CUB](https://github.com/NVlabs/cub). You can use it to port projects that use the CUB library to the
1503 [HIP](https://github.com/ROCm/HIP) layer and run them on AMD hardware. In the
1504 [ROCm](https://rocm.docs.amd.com/en/latest/) environment, hipCUB uses the rocPRIM library as a
1505-backend; on CUDA platforms, it uses CUB as a backend.
1506+backend.
1507+
1508+## Building the documentation locally
1509+
1510+### Requirements
1511+
1512+#### Doxygen
1513+
1514+The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues.
1515+
1516+After you have downloaded Doxygen version 1.9.4:
1517+
1518+```shell
1519+# Add doxygen to your PATH
1520+echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc
1521+
1522+# Apply the updated .bashrc
1523+source ~/.bashrc
1524+
1525+# Confirm that you are using version 1.9.4
1526+doxygen --version
1527+```
1528+
1529+#### Python
1530+
1531+The build system uses Python version 3.10. You can try using a newer version, but that might cause issues.
1532+
1533+You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation):
1534+
1535+```shell
1536+# Install Python 3.10
1537+pyenv install 3.10
1538+
1539+# Create a Python 3.10 virtual environment
1540+pyenv virtualenv 3.10 venv_rocprim
1541+
1542+# Activate the virtual environment
1543+pyenv activate venv_rocprim
1544+```
1545+
1546+### Building
1547+
1548+After cloning this repository, and `cd`ing into it:
1549+
1550+```shell
1551+# Install Python dependencies
1552+python3 -m pip install -r docs/sphinx/requirements.txt
1553+
1554+# Build the documentation
1555+python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html
1556+```
1557+
1558+You can then open `docs/_build/html/index.html` in your browser to view the documentation.
1559+
1560+### Build documentation via CMake
1561+
1562+Install [rocm-cmake](https://github.com/ROCm/rocm-cmake/)
1563+
1564+```shell
1565+# Change directory to rocPRIM
1566+cd rocPRIM
1567+
1568+# Install documentation dependencies
1569+python3 -m pip install -r docs/sphinx/requirements.txt
1570+
1571+# Set C++ compiler
1572+# This example uses hipcc and assumes it is at the path /usr/bin
1573+export CXX=hipcc
1574+export PATH=/usr/bin:$PATH
1575+
1576+# Configure the project
1577+cmake -S . -B ./build -D BUILD_DOCS=ON
1578+
1579+# Build the documentation
1580+cmake --build ./build --target doc
1581+
1582+# To serve the HTML docs locally
1583+cd ./build/docs/html
1584+python3 -m http.server
1585+```
1586
1587 ## Support
1588
1589diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
1590index fd8ee78..a45a6dd 100644
1591--- a/benchmark/CMakeLists.txt
1592+++ b/benchmark/CMakeLists.txt
1593@@ -1,6 +1,6 @@
1594 # MIT License
1595 #
1596-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
1597+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
1598 #
1599 # Permission is hereby granted, free of charge, to any person obtaining a copy
1600 # of this software and associated documentation files (the "Software"), to deal
1601@@ -147,6 +147,7 @@ add_rocprim_benchmark(benchmark_device_find_first_of.cpp)
1602 add_rocprim_benchmark(benchmark_device_find_end.cpp)
1603 add_rocprim_benchmark(benchmark_device_histogram.cpp)
1604 add_rocprim_benchmark(benchmark_device_merge.cpp)
1605+add_rocprim_benchmark(benchmark_device_merge_inplace.cpp)
1606 add_rocprim_benchmark(benchmark_device_merge_sort.cpp)
1607 add_rocprim_benchmark(benchmark_device_merge_sort_block_sort.cpp)
1608 add_rocprim_benchmark(benchmark_device_merge_sort_block_merge.cpp)
1609@@ -173,6 +174,7 @@ add_rocprim_benchmark(benchmark_device_segmented_radix_sort_keys.cpp)
1610 add_rocprim_benchmark(benchmark_device_segmented_radix_sort_pairs.cpp)
1611 add_rocprim_benchmark(benchmark_device_segmented_reduce.cpp)
1612 add_rocprim_benchmark(benchmark_device_transform.cpp)
1613+add_rocprim_benchmark(benchmark_device_transform_pointer.cpp)
1614 add_rocprim_benchmark(benchmark_predicate_iterator.cpp)
1615 add_rocprim_benchmark(benchmark_warp_exchange.cpp)
1616 add_rocprim_benchmark(benchmark_warp_reduce.cpp)
1617diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake
1618index 37f3a97..acd4f12 100644
1619--- a/benchmark/ConfigAutotuneSettings.cmake
1620+++ b/benchmark/ConfigAutotuneSettings.cmake
1621@@ -1,6 +1,6 @@
1622 # MIT License
1623 #
1624-# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
1625+# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
1626 #
1627 # Permission is hereby granted, free of charge, to any person obtaining a copy
1628 # of this software and associated documentation files (the "Software"), to deal
1629@@ -21,18 +21,18 @@
1630 # SOFTWARE.
1631
1632 # All default fallback types as listed in scripts/autotune/fallback_config.json
1633-set(TUNING_TYPES "int64_t int short int8_t double float rocprim::half")
1634+set(TUNING_TYPES "rocprim::int128_t int64_t int short int8_t double float rocprim::half")
1635 # If config selection happens based on two types, the second type has limited fallbacks. The selection is based
1636 # on the size and it is ignored whether the type is floating-point or integral. The autotuning script uses the
1637 # benchmarks for the integral types as fallback, hence tuning for the floating-point types is not needed.
1638-set(LIMITED_TUNING_TYPES "int64_t int short int8_t")
1639+set(LIMITED_TUNING_TYPES "rocprim::int128_t int64_t int short int8_t")
1640
1641 function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix)
1642 if(file STREQUAL "benchmark_device_adjacent_difference")
1643- set(list_across_names "DataType;Left;InPlace;BlockSize" PARENT_SCOPE)
1644+ set(list_across_names "DataType;Left;Aliasing;BlockSize" PARENT_SCOPE)
1645 set(list_across "${TUNING_TYPES};\
1646-true;false true;32 64 128 256 512 1024" PARENT_SCOPE)
1647- set(output_pattern_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@" PARENT_SCOPE)
1648+true;no_alias in_place;32 64 128 256 512 1024" PARENT_SCOPE)
1649+ set(output_pattern_suffix "@DataType@_@Left@_@Aliasing@_@BlockSize@" PARENT_SCOPE)
1650 elseif(file STREQUAL "benchmark_device_adjacent_find")
1651 set(list_across_names "InputType;BlockSize" PARENT_SCOPE)
1652 set(list_across "${TUNING_TYPES};64 128 256 512 1024" PARENT_SCOPE)
1653@@ -44,13 +44,13 @@ true;false true;32 64 128 256 512 1024" PARENT_SCOPE)
1654 elseif(file STREQUAL "benchmark_device_merge_sort_block_merge")
1655 set(list_across_names "KeyType;ValueType;BlockSize;UseMergePath" PARENT_SCOPE)
1656 set(list_across "\
1657-${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type<char,double>;\
1658+${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};\
1659 128 256 512 1024;true" PARENT_SCOPE)
1660 set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@UseMergePath@" PARENT_SCOPE)
1661 elseif(file STREQUAL "benchmark_device_merge_sort_block_sort")
1662 set(list_across_names "KeyType;ValueType;BlockSize;BlockSortMethod" PARENT_SCOPE)
1663 set(list_across "\
1664-${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type<char,double>;\
1665+${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};\
1666 256 512 1024;rocprim::block_sort_algorithm::stable_merge_sort" PARENT_SCOPE)
1667 set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@BlockSortMethod@" PARENT_SCOPE)
1668 elseif(file STREQUAL "benchmark_device_radix_sort_block_sort")
1669@@ -85,24 +85,40 @@ ${TUNING_TYPES};${LIMITED_TUNING_TYPES};using_warp_scan reduce_then_scan" PARENT
1670 set(list_across "\
1671 binary_search upper_bound lower_bound;${TUNING_TYPES};${LIMITED_TUNING_TYPES};64 128 256;1 2 4 8 16" PARENT_SCOPE)
1672 set(output_pattern_suffix "@SubAlgorithm@_@ValueType@_@OutputType@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE)
1673+ elseif(file STREQUAL "benchmark_device_search_n")
1674+ set(list_across_names "InputType;BlockSize;ItemsPerThread;Threshold" PARENT_SCOPE)
1675+ set(list_across "\
1676+${TUNING_TYPES};64 128 256 512 1024;1 2 4 8 16;4 8 12 16" PARENT_SCOPE)
1677+ set(output_pattern_suffix "@InputType@_@BlockSize@_@ItemsPerThread@_@Threshold@" PARENT_SCOPE)
1678 elseif(file STREQUAL "benchmark_device_segmented_radix_sort_keys")
1679 set(list_across_names "\
1680-KeyType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
1681+KeyType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
1682 set(list_across "${TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE)
1683 set(output_pattern_suffix "\
1684-@KeyType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
1685+@KeyType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
1686 elseif(file STREQUAL "benchmark_device_segmented_radix_sort_pairs")
1687 set(list_across_names "\
1688-KeyType;ValueType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
1689- set(list_across "${TUNING_TYPES};int8_t;8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE)
1690+KeyType;ValueType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
1691+ set(list_across "${TUNING_TYPES};${LIMITED_TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE)
1692 set(output_pattern_suffix "\
1693-@KeyType@_@ValueType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
1694- elseif(file STREQUAL "benchmark_device_transform")
1695+@KeyType@_@ValueType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
1696+elseif(file STREQUAL "benchmark_device_segmented_reduce")
1697+ set(list_across_names "DataType;BlockSize;ItemsPerThread" PARENT_SCOPE)
1698+ set(list_across "\
1699+${TUNING_TYPES};64 128 256;1 2 4 8 16" PARENT_SCOPE)
1700+ set(output_pattern_suffix "@DataType@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE)
1701+elseif(file STREQUAL "benchmark_device_transform")
1702 set(list_across_names "\
1703 DataType;BlockSize;" PARENT_SCOPE)
1704 set(list_across "${TUNING_TYPES};64 128 256 512 1024" PARENT_SCOPE)
1705 set(output_pattern_suffix "\
1706 @DataType@_@BlockSize@" PARENT_SCOPE)
1707+elseif(file STREQUAL "benchmark_device_transform_pointer")
1708+ set(list_across_names "\
1709+DataType;BlockSize;LoadType" PARENT_SCOPE)
1710+ set(list_across "${TUNING_TYPES};64 128 256 512 1024;rocprim::load_default rocprim::load_nontemporal" PARENT_SCOPE)
1711+ set(output_pattern_suffix "\
1712+@DataType@_@BlockSize@_@LoadType@" PARENT_SCOPE)
1713 elseif(file STREQUAL "benchmark_device_partition")
1714 set(list_across_names "DataType;BlockSize" PARENT_SCOPE)
1715 set(list_across "${TUNING_TYPES};128 192 256 384 512" PARENT_SCOPE)
1716diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp
1717index 8d237f7..71af4c1 100644
1718--- a/benchmark/benchmark_block_adjacent_difference.cpp
1719+++ b/benchmark/benchmark_block_adjacent_difference.cpp
1720@@ -1,6 +1,6 @@
1721 // MIT License
1722 //
1723-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
1724+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
1725 //
1726 // Permission is hereby granted, free of charge, to any person obtaining a copy
1727 // of this software and associated documentation files (the "Software"), to deal
1728@@ -21,11 +21,8 @@
1729 // SOFTWARE.
1730
1731 #include "benchmark_utils.hpp"
1732-// CmdParser
1733-#include "cmdparser.hpp"
1734
1735-// Google Benchmark
1736-#include <benchmark/benchmark.h>
1737+#include "../common/utils_device_ptr.hpp"
1738
1739 // HIP API
1740 #include <hip/hip_runtime.h>
1741@@ -34,83 +31,81 @@
1742 #include <rocprim/block/block_adjacent_difference.hpp>
1743 #include <rocprim/block/block_load_func.hpp>
1744 #include <rocprim/block/block_store_func.hpp>
1745+#include <rocprim/config.hpp>
1746+#include <rocprim/functional.hpp>
1747+#include <rocprim/intrinsics/thread.hpp>
1748+#include <rocprim/types.hpp>
1749
1750-#include <algorithm>
1751-#include <iostream>
1752-#include <limits>
1753+#include <cstddef>
1754+#include <stdint.h>
1755 #include <string>
1756+#include <type_traits>
1757 #include <vector>
1758
1759-#include <cstdio>
1760-#include <cstdlib>
1761-
1762-#ifndef DEFAULT_N
1763-const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4;
1764-#endif
1765-
1766-namespace rp = rocprim;
1767-
1768-template <class Benchmark,
1769- unsigned int BlockSize,
1770- unsigned int ItemsPerThread,
1771- bool WithTile,
1772- typename... Args>
1773-__global__ __launch_bounds__(BlockSize) void kernel(Args ...args)
1774+template<typename Benchmark,
1775+ unsigned int BlockSize,
1776+ unsigned int ItemsPerThread,
1777+ bool WithTile,
1778+ typename... Args>
1779+__global__ __launch_bounds__(BlockSize)
1780+void kernel(Args... args)
1781 {
1782 Benchmark::template run<BlockSize, ItemsPerThread, WithTile>(args...);
1783 }
1784
1785 struct subtract_left
1786 {
1787- template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1788- __device__ static void run(const T* d_input, T* d_output, unsigned int trials)
1789+ template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1790+ __device__
1791+ static void run(const T* d_input, T* d_output, unsigned int trials)
1792 {
1793- const unsigned int lid = threadIdx.x;
1794+ const unsigned int lid = threadIdx.x;
1795 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
1796
1797 T input[ItemsPerThread];
1798- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1799+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1800
1801- using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>;
1802+ using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>;
1803 __shared__ typename adjacent_diff_t::storage_type storage;
1804
1805 ROCPRIM_NO_UNROLL
1806- for(unsigned int trial = 0; trial < trials; trial++)
1807+ for(unsigned int trial = 0; trial < trials; ++trial)
1808 {
1809 T output[ItemsPerThread];
1810 if(WithTile)
1811 {
1812- adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, T(123), storage);
1813+ adjacent_diff_t().subtract_left(input, output, rocprim::minus<>{}, T(123), storage);
1814 }
1815 else
1816 {
1817- adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, storage);
1818+ adjacent_diff_t().subtract_left(input, output, rocprim::minus<>{}, storage);
1819 }
1820
1821 for(unsigned int i = 0; i < ItemsPerThread; ++i)
1822 {
1823 input[i] += output[i];
1824 }
1825- rp::syncthreads();
1826+ rocprim::syncthreads();
1827 }
1828
1829- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
1830+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
1831 }
1832 };
1833
1834 struct subtract_left_partial
1835 {
1836- template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1837- __device__ static void
1838+ template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1839+ __device__
1840+ static void
1841 run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials)
1842 {
1843- const unsigned int lid = threadIdx.x;
1844+ const unsigned int lid = threadIdx.x;
1845 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
1846
1847 T input[ItemsPerThread];
1848- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1849+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1850
1851- using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>;
1852+ using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>;
1853 __shared__ typename adjacent_diff_t::storage_type storage;
1854
1855 unsigned int tile_size = tile_sizes[blockIdx.x];
1856@@ -119,16 +114,25 @@ struct subtract_left_partial
1857 const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1;
1858
1859 ROCPRIM_NO_UNROLL
1860- for(unsigned int trial = 0; trial < trials; trial++)
1861+ for(unsigned int trial = 0; trial < trials; ++trial)
1862 {
1863 T output[ItemsPerThread];
1864 if(WithTile)
1865 {
1866- adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, T(123), tile_size, storage);
1867+ adjacent_diff_t().subtract_left_partial(input,
1868+ output,
1869+ rocprim::minus<>{},
1870+ T(123),
1871+ tile_size,
1872+ storage);
1873 }
1874 else
1875 {
1876- adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, tile_size, storage);
1877+ adjacent_diff_t().subtract_left_partial(input,
1878+ output,
1879+ rocprim::minus<>{},
1880+ tile_size,
1881+ storage);
1882 }
1883
1884 for(unsigned int i = 0; i < ItemsPerThread; ++i)
1885@@ -138,66 +142,69 @@ struct subtract_left_partial
1886
1887 // Change the tile_size to even out the distribution
1888 tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread);
1889- rp::syncthreads();
1890+ rocprim::syncthreads();
1891 }
1892- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
1893+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
1894 }
1895 };
1896
1897 struct subtract_right
1898 {
1899- template <unsigned int BlockSize,
1900- unsigned int ItemsPerThread,
1901- bool WithTile,
1902- typename T>
1903- __device__ static void run(const T* d_input, T* d_output, unsigned int trials)
1904+ template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1905+ __device__
1906+ static void run(const T* d_input, T* d_output, unsigned int trials)
1907 {
1908- const unsigned int lid = threadIdx.x;
1909+ const unsigned int lid = threadIdx.x;
1910 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
1911
1912 T input[ItemsPerThread];
1913- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1914+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1915
1916- using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>;
1917+ using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>;
1918 __shared__ typename adjacent_diff_t::storage_type storage;
1919
1920 ROCPRIM_NO_UNROLL
1921- for(unsigned int trial = 0; trial < trials; trial++)
1922+ for(unsigned int trial = 0; trial < trials; ++trial)
1923 {
1924 T output[ItemsPerThread];
1925 if(WithTile)
1926 {
1927- adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, T(123), storage);
1928+ adjacent_diff_t().subtract_right(input,
1929+ output,
1930+ rocprim::minus<>{},
1931+ T(123),
1932+ storage);
1933 }
1934 else
1935 {
1936- adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, storage);
1937+ adjacent_diff_t().subtract_right(input, output, rocprim::minus<>{}, storage);
1938 }
1939
1940 for(unsigned int i = 0; i < ItemsPerThread; ++i)
1941 {
1942 input[i] += output[i];
1943 }
1944- rp::syncthreads();
1945+ rocprim::syncthreads();
1946 }
1947
1948- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
1949+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
1950 }
1951 };
1952
1953 struct subtract_right_partial
1954 {
1955- template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1956- __device__ static void
1957+ template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
1958+ __device__
1959+ static void
1960 run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials)
1961 {
1962- const unsigned int lid = threadIdx.x;
1963+ const unsigned int lid = threadIdx.x;
1964 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
1965
1966 T input[ItemsPerThread];
1967- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1968+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
1969
1970- using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>;
1971+ using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>;
1972 __shared__ typename adjacent_diff_t::storage_type storage;
1973
1974 unsigned int tile_size = tile_sizes[blockIdx.x];
1975@@ -205,10 +212,14 @@ struct subtract_right_partial
1976 const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1;
1977
1978 ROCPRIM_NO_UNROLL
1979- for(unsigned int trial = 0; trial < trials; trial++)
1980+ for(unsigned int trial = 0; trial < trials; ++trial)
1981 {
1982 T output[ItemsPerThread];
1983- adjacent_diff_t().subtract_right_partial(input, output, rp::minus<>{}, tile_size, storage);
1984+ adjacent_diff_t().subtract_right_partial(input,
1985+ output,
1986+ rocprim::minus<>{},
1987+ tile_size,
1988+ storage);
1989
1990 for(unsigned int i = 0; i < ItemsPerThread; ++i)
1991 {
1992@@ -216,27 +227,31 @@ struct subtract_right_partial
1993 }
1994 // Change the tile_size to even out the distribution
1995 tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread);
1996- rp::syncthreads();
1997+ rocprim::syncthreads();
1998 }
1999- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2000+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2001 }
2002 };
2003
2004-template<class Benchmark,
2005- class T,
2006+template<typename Benchmark,
2007+ typename T,
2008 unsigned int BlockSize,
2009 unsigned int ItemsPerThread,
2010 bool WithTile,
2011 unsigned int Trials = 100>
2012-auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream)
2013+auto run_benchmark(benchmark_utils::state&& state)
2014 -> std::enable_if_t<!std::is_same<Benchmark, subtract_left_partial>::value
2015 && !std::is_same<Benchmark, subtract_right_partial>::value>
2016 {
2017+ const auto& bytes = state.bytes;
2018+ const auto& seed = state.seed;
2019+ const auto& stream = state.stream;
2020+
2021 // Calculate the number of elements N
2022 size_t N = bytes / sizeof(T);
2023-
2024+
2025 constexpr auto items_per_block = BlockSize * ItemsPerThread;
2026- const auto num_blocks = (N + items_per_block - 1) / items_per_block;
2027+ const auto num_blocks = (N + items_per_block - 1) / items_per_block;
2028 // Round up size to the next multiple of items_per_block
2029 const auto size = num_blocks * items_per_block;
2030
2031@@ -244,70 +259,41 @@ auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& se
2032 const std::vector<T> input
2033 = get_random_data<T>(size, random_range.first, random_range.second, seed.get_0());
2034
2035- T* d_input;
2036- T* d_output;
2037- HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
2038- HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
2039- HIP_CHECK(
2040- hipMemcpy(
2041- d_input, input.data(),
2042- input.size() * sizeof(input[0]),
2043- hipMemcpyHostToDevice
2044- )
2045- );
2046-
2047- // HIP events creation
2048- hipEvent_t start, stop;
2049- HIP_CHECK(hipEventCreate(&start));
2050- HIP_CHECK(hipEventCreate(&stop));
2051-
2052- for(auto _ : state)
2053- {
2054- // Record start event
2055- HIP_CHECK(hipEventRecord(start, stream));
2056-
2057- hipLaunchKernelGGL(
2058- HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>),
2059- dim3(num_blocks), dim3(BlockSize), 0, stream,
2060- d_input, d_output, Trials
2061- );
2062- HIP_CHECK(hipGetLastError());
2063-
2064- // Record stop event and wait until it completes
2065- HIP_CHECK(hipEventRecord(stop, stream));
2066- HIP_CHECK(hipEventSynchronize(stop));
2067-
2068- float elapsed_mseconds;
2069- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
2070- state.SetIterationTime(elapsed_mseconds / 1000);
2071- }
2072-
2073- // Destroy HIP events
2074- HIP_CHECK(hipEventDestroy(start));
2075- HIP_CHECK(hipEventDestroy(stop));
2076+ common::device_ptr<T> d_input(input);
2077+ common::device_ptr<T> d_output(input.size());
2078
2079- state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
2080- state.SetItemsProcessed(state.iterations() * Trials * size);
2081+ state.run(
2082+ [&]
2083+ {
2084+ kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>
2085+ <<<dim3(num_blocks), dim3(BlockSize), 0, stream>>>(d_input.get(),
2086+ d_output.get(),
2087+ Trials);
2088+ HIP_CHECK(hipGetLastError());
2089+ });
2090
2091- HIP_CHECK(hipFree(d_input));
2092- HIP_CHECK(hipFree(d_output));
2093+ state.set_throughput(size * Trials, sizeof(T));
2094 }
2095
2096-template<class Benchmark,
2097- class T,
2098+template<typename Benchmark,
2099+ typename T,
2100 unsigned int BlockSize,
2101 unsigned int ItemsPerThread,
2102 bool WithTile,
2103 unsigned int Trials = 100>
2104-auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream)
2105+auto run_benchmark(benchmark_utils::state&& state)
2106 -> std::enable_if_t<std::is_same<Benchmark, subtract_left_partial>::value
2107 || std::is_same<Benchmark, subtract_right_partial>::value>
2108 {
2109+ const auto& bytes = state.bytes;
2110+ const auto& seed = state.seed;
2111+ const auto& stream = state.stream;
2112+
2113 // Calculate the number of elements N
2114 size_t N = bytes / sizeof(T);
2115
2116 static constexpr auto items_per_block = BlockSize * ItemsPerThread;
2117- const auto num_blocks = (N + items_per_block - 1) / items_per_block;
2118+ const auto num_blocks = (N + items_per_block - 1) / items_per_block;
2119 // Round up size to the next multiple of items_per_block
2120 const auto size = num_blocks * items_per_block;
2121
2122@@ -323,171 +309,73 @@ auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& se
2123 random_range_tile_sizes.second,
2124 seed.get_1());
2125
2126- T* d_input;
2127- unsigned int* d_tile_sizes;
2128- T* d_output;
2129- HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
2130- HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0])));
2131- HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(input[0])));
2132- HIP_CHECK(
2133- hipMemcpy(
2134- d_input, input.data(),
2135- input.size() * sizeof(input[0]),
2136- hipMemcpyHostToDevice
2137- )
2138- );
2139- HIP_CHECK(
2140- hipMemcpy(
2141- d_tile_sizes, tile_sizes.data(),
2142- tile_sizes.size() * sizeof(tile_sizes[0]),
2143- hipMemcpyHostToDevice
2144- )
2145- );
2146-
2147- // HIP events creation
2148- hipEvent_t start, stop;
2149- HIP_CHECK(hipEventCreate(&start));
2150- HIP_CHECK(hipEventCreate(&stop));
2151-
2152- for(auto _ : state)
2153- {
2154- // Record start event
2155- HIP_CHECK(hipEventRecord(start, stream));
2156-
2157- hipLaunchKernelGGL(
2158- HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>),
2159- dim3(num_blocks), dim3(BlockSize), 0, stream,
2160- d_input, d_tile_sizes, d_output, Trials
2161- );
2162- HIP_CHECK(hipGetLastError());
2163-
2164- // Record stop event and wait until it completes
2165- HIP_CHECK(hipEventRecord(stop, stream));
2166- HIP_CHECK(hipEventSynchronize(stop));
2167-
2168- float elapsed_mseconds;
2169- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
2170- state.SetIterationTime(elapsed_mseconds / 1000);
2171- }
2172+ common::device_ptr<T> d_input(input);
2173+ common::device_ptr<unsigned int> d_tile_sizes(tile_sizes);
2174+ common::device_ptr<T> d_output(input.size());
2175
2176- // Destroy HIP events
2177- HIP_CHECK(hipEventDestroy(start));
2178- HIP_CHECK(hipEventDestroy(stop));
2179-
2180- state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
2181- state.SetItemsProcessed(state.iterations() * Trials * size);
2182+ state.run(
2183+ [&]
2184+ {
2185+ kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>
2186+ <<<dim3(num_blocks), dim3(BlockSize), 0, stream>>>(d_input.get(),
2187+ d_tile_sizes.get(),
2188+ d_output.get(),
2189+ Trials);
2190+ HIP_CHECK(hipGetLastError());
2191+ });
2192
2193- HIP_CHECK(hipFree(d_input));
2194- HIP_CHECK(hipFree(d_tile_sizes));
2195- HIP_CHECK(hipFree(d_output));
2196+ state.set_throughput(size * Trials, sizeof(T));
2197 }
2198
2199 #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \
2200- benchmark::RegisterBenchmark( \
2201+ executor.queue_fn( \
2202 bench_naming::format_name("{lvl:block,algo:adjacent_difference,subalgo:" + name \
2203 + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \
2204 ",with_tile:" #WITH_TILE "}}") \
2205 .c_str(), \
2206- run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \
2207- bytes, \
2208- seed, \
2209- stream)
2210-
2211-#define BENCHMARK_TYPE(type, block, with_tile) \
2212- CREATE_BENCHMARK(type, block, 1, with_tile), \
2213- CREATE_BENCHMARK(type, block, 3, with_tile), \
2214- CREATE_BENCHMARK(type, block, 4, with_tile), \
2215- CREATE_BENCHMARK(type, block, 8, with_tile), \
2216- CREATE_BENCHMARK(type, block, 16, with_tile), \
2217+ run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>);
2218+
2219+#define BENCHMARK_TYPE(type, block, with_tile) \
2220+ CREATE_BENCHMARK(type, block, 1, with_tile) \
2221+ CREATE_BENCHMARK(type, block, 3, with_tile) \
2222+ CREATE_BENCHMARK(type, block, 4, with_tile) \
2223+ CREATE_BENCHMARK(type, block, 8, with_tile) \
2224+ CREATE_BENCHMARK(type, block, 16, with_tile) \
2225 CREATE_BENCHMARK(type, block, 32, with_tile)
2226
2227-template<class Benchmark>
2228-void add_benchmarks(const std::string& name,
2229- std::vector<benchmark::internal::Benchmark*>& benchmarks,
2230- size_t bytes,
2231- const managed_seed& seed,
2232- hipStream_t stream)
2233+template<typename Benchmark>
2234+void add_benchmarks(const std::string& name, benchmark_utils::executor& executor)
2235 {
2236- std::vector<benchmark::internal::Benchmark*> bs =
2237+ BENCHMARK_TYPE(int, 256, false)
2238+ BENCHMARK_TYPE(float, 256, false)
2239+ BENCHMARK_TYPE(int8_t, 256, false)
2240+ BENCHMARK_TYPE(rocprim::half, 256, false)
2241+ BENCHMARK_TYPE(long long, 256, false)
2242+ BENCHMARK_TYPE(double, 256, false)
2243+ BENCHMARK_TYPE(rocprim::int128_t, 256, false)
2244+ BENCHMARK_TYPE(rocprim::uint128_t, 256, false)
2245+
2246+ if(!std::is_same<Benchmark, subtract_right_partial>::value)
2247 {
2248- BENCHMARK_TYPE(int, 256, false),
2249- BENCHMARK_TYPE(float, 256, false),
2250- BENCHMARK_TYPE(int8_t, 256, false),
2251- BENCHMARK_TYPE(rocprim::half, 256, false),
2252- BENCHMARK_TYPE(long long, 256, false),
2253- BENCHMARK_TYPE(double, 256, false)
2254- };
2255-
2256- if(!std::is_same<Benchmark, subtract_right_partial>::value) {
2257- bs.insert(bs.end(), {
2258- BENCHMARK_TYPE(int, 256, true),
2259- BENCHMARK_TYPE(float, 256, true),
2260- BENCHMARK_TYPE(int8_t, 256, true),
2261- BENCHMARK_TYPE(rocprim::half, 256, true),
2262- BENCHMARK_TYPE(long long, 256, true),
2263- BENCHMARK_TYPE(double, 256, true)
2264- });
2265+ BENCHMARK_TYPE(int, 256, true)
2266+ BENCHMARK_TYPE(float, 256, true)
2267+ BENCHMARK_TYPE(int8_t, 256, true)
2268+ BENCHMARK_TYPE(rocprim::half, 256, true)
2269+ BENCHMARK_TYPE(long long, 256, true)
2270+ BENCHMARK_TYPE(double, 256, true)
2271+ BENCHMARK_TYPE(rocprim::int128_t, 256, true)
2272+ BENCHMARK_TYPE(rocprim::uint128_t, 256, true)
2273 }
2274-
2275- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2276 }
2277
2278-int main(int argc, char *argv[])
2279+int main(int argc, char* argv[])
2280 {
2281- cli::Parser parser(argc, argv);
2282- parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
2283- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
2284- parser.set_optional<std::string>("name_format",
2285- "name_format",
2286- "human",
2287- "either: json,human,txt");
2288- parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
2289- parser.run_and_exit_if_error();
2290-
2291- // Parse argv
2292- benchmark::Initialize(&argc, argv);
2293- const size_t bytes = parser.get<size_t>("size");
2294- const int trials = parser.get<int>("trials");
2295- bench_naming::set_format(parser.get<std::string>("name_format"));
2296- const std::string seed_type = parser.get<std::string>("seed");
2297- const managed_seed seed(seed_type);
2298-
2299- // HIP
2300- hipStream_t stream = 0; // default
2301-
2302- // Benchmark info
2303- add_common_benchmark_info();
2304- benchmark::AddCustomContext("bytes", std::to_string(bytes));
2305- benchmark::AddCustomContext("seed", seed_type);
2306-
2307- // Add benchmarks
2308- std::vector<benchmark::internal::Benchmark*> benchmarks;
2309- add_benchmarks<subtract_left>("subtract_left", benchmarks, bytes, seed, stream);
2310- add_benchmarks<subtract_right>("subtract_right", benchmarks, bytes, seed, stream);
2311- add_benchmarks<subtract_left_partial>("subtract_left_partial", benchmarks, bytes, seed, stream);
2312- add_benchmarks<subtract_right_partial>("subtract_right_partial",
2313- benchmarks,
2314- bytes,
2315- seed,
2316- stream);
2317-
2318- // Use manual timing
2319- for(auto& b : benchmarks)
2320- {
2321- b->UseManualTime();
2322- b->Unit(benchmark::kMillisecond);
2323- }
2324+ benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0);
2325
2326- // Force number of iterations
2327- if(trials > 0)
2328- {
2329- for(auto& b : benchmarks)
2330- {
2331- b->Iterations(trials);
2332- }
2333- }
2334+ add_benchmarks<subtract_left>("subtract_left", executor);
2335+ add_benchmarks<subtract_right>("subtract_right", executor);
2336+ add_benchmarks<subtract_left_partial>("subtract_left_partial", executor);
2337+ add_benchmarks<subtract_right_partial>("subtract_right_partial", executor);
2338
2339- // Run benchmarks
2340- benchmark::RunSpecifiedBenchmarks();
2341- return 0;
2342+ executor.run();
2343 }
2344diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp
2345index 1d07cdb..62fd01a 100644
2346--- a/benchmark/benchmark_block_discontinuity.cpp
2347+++ b/benchmark/benchmark_block_discontinuity.cpp
2348@@ -1,6 +1,6 @@
2349 // MIT License
2350 //
2351-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2352+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2353 //
2354 // Permission is hereby granted, free of charge, to any person obtaining a copy
2355 // of this software and associated documentation files (the "Software"), to deal
2356@@ -20,12 +20,7 @@
2357 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2358 // SOFTWARE.
2359
2360-// CmdParser
2361 #include "benchmark_utils.hpp"
2362-#include "cmdparser.hpp"
2363-
2364-// Google Benchmark
2365-#include <benchmark/benchmark.h>
2366
2367 // HIP API
2368 #include <hip/hip_runtime.h>
2369@@ -34,327 +29,243 @@
2370 #include <rocprim/block/block_discontinuity.hpp>
2371 #include <rocprim/block/block_load_func.hpp>
2372 #include <rocprim/block/block_store_func.hpp>
2373+#include <rocprim/config.hpp>
2374+#include <rocprim/functional.hpp>
2375+#include <rocprim/intrinsics/thread.hpp>
2376+#include <rocprim/types.hpp>
2377
2378-#include <iostream>
2379-#include <limits>
2380+#include <cstddef>
2381+#include <stdint.h>
2382 #include <string>
2383 #include <vector>
2384
2385-#include <cstdio>
2386-#include <cstdlib>
2387-
2388-#ifndef DEFAULT_N
2389-const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4;
2390-#endif
2391-
2392-namespace rp = rocprim;
2393-
2394-template<
2395- class Runner,
2396- class T,
2397- unsigned int BlockSize,
2398- unsigned int ItemsPerThread,
2399- bool WithTile,
2400- unsigned int Trials
2401->
2402-__global__
2403-__launch_bounds__(BlockSize)
2404-void kernel(const T * d_input, T * d_output)
2405+template<typename Runner,
2406+ typename T,
2407+ unsigned int BlockSize,
2408+ unsigned int ItemsPerThread,
2409+ bool WithTile,
2410+ unsigned int Trials>
2411+__global__ __launch_bounds__(BlockSize)
2412+void kernel(const T* d_input, T* d_output)
2413 {
2414 Runner::template run<T, BlockSize, ItemsPerThread, WithTile, Trials>(d_input, d_output);
2415 }
2416
2417 struct flag_heads
2418 {
2419- template<
2420- class T,
2421- unsigned int BlockSize,
2422- unsigned int ItemsPerThread,
2423- bool WithTile,
2424- unsigned int Trials
2425- >
2426+ template<typename T,
2427+ unsigned int BlockSize,
2428+ unsigned int ItemsPerThread,
2429+ bool WithTile,
2430+ unsigned int Trials>
2431 __device__
2432- static void run(const T * d_input, T * d_output)
2433+ static void run(const T* d_input, T* d_output)
2434 {
2435- const unsigned int lid = threadIdx.x;
2436+ const unsigned int lid = threadIdx.x;
2437 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
2438
2439 T input[ItemsPerThread];
2440- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2441+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2442
2443 ROCPRIM_NO_UNROLL
2444- for(unsigned int trial = 0; trial < Trials; trial++)
2445+ for(unsigned int trial = 0; trial < Trials; ++trial)
2446 {
2447- rp::block_discontinuity<T, BlockSize> bdiscontinuity;
2448- bool head_flags[ItemsPerThread];
2449+ rocprim::block_discontinuity<T, BlockSize> bdiscontinuity;
2450+ bool head_flags[ItemsPerThread];
2451 if(WithTile)
2452 {
2453- bdiscontinuity.flag_heads(head_flags, T(123), input, rp::equal_to<T>());
2454+ bdiscontinuity.flag_heads(head_flags, T(123), input, rocprim::equal_to<T>());
2455 }
2456 else
2457 {
2458- bdiscontinuity.flag_heads(head_flags, input, rp::equal_to<T>());
2459+ bdiscontinuity.flag_heads(head_flags, input, rocprim::equal_to<T>());
2460 }
2461
2462- for(unsigned int i = 0; i < ItemsPerThread; i++)
2463+ for(unsigned int i = 0; i < ItemsPerThread; ++i)
2464 {
2465 input[i] += head_flags[i];
2466 }
2467- rp::syncthreads();
2468+ rocprim::syncthreads();
2469 }
2470
2471- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2472+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2473 }
2474 };
2475
2476 struct flag_tails
2477 {
2478- template<
2479- class T,
2480- unsigned int BlockSize,
2481- unsigned int ItemsPerThread,
2482- bool WithTile,
2483- unsigned int Trials
2484- >
2485+ template<typename T,
2486+ unsigned int BlockSize,
2487+ unsigned int ItemsPerThread,
2488+ bool WithTile,
2489+ unsigned int Trials>
2490 __device__
2491- static void run(const T * d_input, T * d_output)
2492+ static void run(const T* d_input, T* d_output)
2493 {
2494- const unsigned int lid = threadIdx.x;
2495+ const unsigned int lid = threadIdx.x;
2496 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
2497
2498 T input[ItemsPerThread];
2499- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2500+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2501
2502 ROCPRIM_NO_UNROLL
2503- for(unsigned int trial = 0; trial < Trials; trial++)
2504+ for(unsigned int trial = 0; trial < Trials; ++trial)
2505 {
2506- rp::block_discontinuity<T, BlockSize> bdiscontinuity;
2507- bool tail_flags[ItemsPerThread];
2508+ rocprim::block_discontinuity<T, BlockSize> bdiscontinuity;
2509+ bool tail_flags[ItemsPerThread];
2510 if(WithTile)
2511 {
2512- bdiscontinuity.flag_tails(tail_flags, T(123), input, rp::equal_to<T>());
2513+ bdiscontinuity.flag_tails(tail_flags, T(123), input, rocprim::equal_to<T>());
2514 }
2515 else
2516 {
2517- bdiscontinuity.flag_tails(tail_flags, input, rp::equal_to<T>());
2518+ bdiscontinuity.flag_tails(tail_flags, input, rocprim::equal_to<T>());
2519 }
2520
2521- for(unsigned int i = 0; i < ItemsPerThread; i++)
2522+ for(unsigned int i = 0; i < ItemsPerThread; ++i)
2523 {
2524 input[i] += tail_flags[i];
2525 }
2526- rp::syncthreads();
2527+ rocprim::syncthreads();
2528 }
2529
2530- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2531+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2532 }
2533 };
2534
2535 struct flag_heads_and_tails
2536 {
2537- template<
2538- class T,
2539- unsigned int BlockSize,
2540- unsigned int ItemsPerThread,
2541- bool WithTile,
2542- unsigned int Trials
2543- >
2544+ template<typename T,
2545+ unsigned int BlockSize,
2546+ unsigned int ItemsPerThread,
2547+ bool WithTile,
2548+ unsigned int Trials>
2549 __device__
2550- static void run(const T * d_input, T * d_output)
2551+ static void run(const T* d_input, T* d_output)
2552 {
2553- const unsigned int lid = threadIdx.x;
2554+ const unsigned int lid = threadIdx.x;
2555 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
2556
2557 T input[ItemsPerThread];
2558- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2559+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2560
2561 ROCPRIM_NO_UNROLL
2562- for(unsigned int trial = 0; trial < Trials; trial++)
2563+ for(unsigned int trial = 0; trial < Trials; ++trial)
2564 {
2565- rp::block_discontinuity<T, BlockSize> bdiscontinuity;
2566- bool head_flags[ItemsPerThread];
2567- bool tail_flags[ItemsPerThread];
2568+ rocprim::block_discontinuity<T, BlockSize> bdiscontinuity;
2569+ bool head_flags[ItemsPerThread];
2570+ bool tail_flags[ItemsPerThread];
2571 if(WithTile)
2572 {
2573- bdiscontinuity.flag_heads_and_tails(head_flags, T(123), tail_flags, T(234), input, rp::equal_to<T>());
2574+ bdiscontinuity.flag_heads_and_tails(head_flags,
2575+ T(123),
2576+ tail_flags,
2577+ T(234),
2578+ input,
2579+ rocprim::equal_to<T>());
2580 }
2581 else
2582 {
2583- bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, rp::equal_to<T>());
2584+ bdiscontinuity.flag_heads_and_tails(head_flags,
2585+ tail_flags,
2586+ input,
2587+ rocprim::equal_to<T>());
2588 }
2589
2590- for(unsigned int i = 0; i < ItemsPerThread; i++)
2591+ for(unsigned int i = 0; i < ItemsPerThread; ++i)
2592 {
2593 input[i] += head_flags[i];
2594 input[i] += tail_flags[i];
2595 }
2596- rp::syncthreads();
2597+ rocprim::syncthreads();
2598 }
2599
2600- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2601+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2602 }
2603 };
2604
2605-template<class Benchmark,
2606- class T,
2607+template<typename Benchmark,
2608+ typename T,
2609 unsigned int BlockSize,
2610 unsigned int ItemsPerThread,
2611 bool WithTile,
2612 unsigned int Trials = 100>
2613-void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream)
2614+void run_benchmark(benchmark_utils::state&& state)
2615 {
2616+ const auto& bytes = state.bytes;
2617+ const auto& seed = state.seed;
2618+ const auto& stream = state.stream;
2619+
2620 // Calculate the number of elements N
2621 size_t N = bytes / sizeof(T);
2622
2623 constexpr auto items_per_block = BlockSize * ItemsPerThread;
2624- const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
2625+ const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block);
2626
2627 const auto random_range = limit_random_range<T>(0, 10);
2628 std::vector<T> input
2629 = get_random_data<T>(size, random_range.first, random_range.second, seed.get_0());
2630- T * d_input;
2631- T * d_output;
2632+ T* d_input;
2633+ T* d_output;
2634 HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
2635 HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T)));
2636- HIP_CHECK(
2637- hipMemcpy(
2638- d_input, input.data(),
2639- size * sizeof(T),
2640- hipMemcpyHostToDevice
2641- )
2642- );
2643+ HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
2644 HIP_CHECK(hipDeviceSynchronize());
2645
2646- // HIP events creation
2647- hipEvent_t start, stop;
2648- HIP_CHECK(hipEventCreate(&start));
2649- HIP_CHECK(hipEventCreate(&stop));
2650-
2651- for(auto _ : state)
2652- {
2653- // Record start event
2654- HIP_CHECK(hipEventRecord(start, stream));
2655-
2656- hipLaunchKernelGGL(
2657- HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials>),
2658- dim3(size/items_per_block), dim3(BlockSize), 0, stream,
2659- d_input, d_output
2660- );
2661- HIP_CHECK(hipGetLastError());
2662-
2663- // Record stop event and wait until it completes
2664- HIP_CHECK(hipEventRecord(stop, stream));
2665- HIP_CHECK(hipEventSynchronize(stop));
2666-
2667- float elapsed_mseconds;
2668- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
2669- state.SetIterationTime(elapsed_mseconds / 1000);
2670- }
2671-
2672- // Destroy HIP events
2673- HIP_CHECK(hipEventDestroy(start));
2674- HIP_CHECK(hipEventDestroy(stop));
2675+ state.run(
2676+ [&]
2677+ {
2678+ kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials>
2679+ <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input, d_output);
2680+ HIP_CHECK(hipGetLastError());
2681+ });
2682
2683- state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
2684- state.SetItemsProcessed(state.iterations() * Trials * size);
2685+ state.set_throughput(size * Trials, sizeof(T));
2686
2687 HIP_CHECK(hipFree(d_input));
2688 HIP_CHECK(hipFree(d_output));
2689 }
2690
2691-#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \
2692- benchmark::RegisterBenchmark( \
2693- bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \
2694- + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \
2695- ",with_tile:" #WITH_TILE "}}") \
2696- .c_str(), \
2697- run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \
2698- bytes, \
2699- seed, \
2700- stream)
2701-
2702-#define BENCHMARK_TYPE(type, block, bool) \
2703- CREATE_BENCHMARK(type, block, 1, bool), \
2704- CREATE_BENCHMARK(type, block, 2, bool), \
2705- CREATE_BENCHMARK(type, block, 3, bool), \
2706- CREATE_BENCHMARK(type, block, 4, bool), \
2707+#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \
2708+ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \
2709+ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \
2710+ ",with_tile:" #WITH_TILE "}}") \
2711+ .c_str(), \
2712+ run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>);
2713+
2714+#define BENCHMARK_TYPE(type, block, bool) \
2715+ CREATE_BENCHMARK(type, block, 1, bool) \
2716+ CREATE_BENCHMARK(type, block, 2, bool) \
2717+ CREATE_BENCHMARK(type, block, 3, bool) \
2718+ CREATE_BENCHMARK(type, block, 4, bool) \
2719 CREATE_BENCHMARK(type, block, 8, bool)
2720
2721-template<class Benchmark>
2722-void add_benchmarks(const std::string& name,
2723- std::vector<benchmark::internal::Benchmark*>& benchmarks,
2724- size_t bytes,
2725- const managed_seed& seed,
2726- hipStream_t stream)
2727+template<typename Benchmark>
2728+void add_benchmarks(const std::string& name, benchmark_utils::executor& executor)
2729 {
2730- std::vector<benchmark::internal::Benchmark*> bs =
2731- {
2732- BENCHMARK_TYPE(int, 256, false),
2733- BENCHMARK_TYPE(int, 256, true),
2734- BENCHMARK_TYPE(int8_t, 256, false),
2735- BENCHMARK_TYPE(int8_t, 256, true),
2736- BENCHMARK_TYPE(uint8_t, 256, false),
2737- BENCHMARK_TYPE(uint8_t, 256, true),
2738- BENCHMARK_TYPE(rocprim::half, 256, false),
2739- BENCHMARK_TYPE(rocprim::half, 256, true),
2740- BENCHMARK_TYPE(long long, 256, false),
2741- BENCHMARK_TYPE(long long, 256, true),
2742- };
2743-
2744- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2745+ BENCHMARK_TYPE(int, 256, false)
2746+ BENCHMARK_TYPE(int, 256, true)
2747+ BENCHMARK_TYPE(int8_t, 256, false)
2748+ BENCHMARK_TYPE(int8_t, 256, true)
2749+ BENCHMARK_TYPE(uint8_t, 256, false)
2750+ BENCHMARK_TYPE(uint8_t, 256, true)
2751+ BENCHMARK_TYPE(rocprim::half, 256, false)
2752+ BENCHMARK_TYPE(rocprim::half, 256, true)
2753+ BENCHMARK_TYPE(long long, 256, false)
2754+ BENCHMARK_TYPE(long long, 256, true)
2755+ BENCHMARK_TYPE(rocprim::int128_t, 256, false)
2756+ BENCHMARK_TYPE(rocprim::int128_t, 256, true)
2757+ BENCHMARK_TYPE(rocprim::uint128_t, 256, false)
2758+ BENCHMARK_TYPE(rocprim::uint128_t, 256, true)
2759 }
2760
2761-int main(int argc, char *argv[])
2762+int main(int argc, char* argv[])
2763 {
2764- cli::Parser parser(argc, argv);
2765- parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
2766- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
2767- parser.set_optional<std::string>("name_format",
2768- "name_format",
2769- "human",
2770- "either: json,human,txt");
2771- parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
2772- parser.run_and_exit_if_error();
2773-
2774- // Parse argv
2775- benchmark::Initialize(&argc, argv);
2776- const size_t bytes = parser.get<size_t>("size");
2777- const int trials = parser.get<int>("trials");
2778- bench_naming::set_format(parser.get<std::string>("name_format"));
2779- const std::string seed_type = parser.get<std::string>("seed");
2780- const managed_seed seed(seed_type);
2781-
2782- // HIP
2783- hipStream_t stream = 0; // default
2784-
2785- // Benchmark info
2786- add_common_benchmark_info();
2787- benchmark::AddCustomContext("bytes", std::to_string(bytes));
2788- benchmark::AddCustomContext("seed", seed_type);
2789-
2790- // Add benchmarks
2791- std::vector<benchmark::internal::Benchmark*> benchmarks;
2792- add_benchmarks<flag_heads>("flag_heads", benchmarks, bytes, seed, stream);
2793- add_benchmarks<flag_tails>("flag_tails", benchmarks, bytes, seed, stream);
2794- add_benchmarks<flag_heads_and_tails>("flag_heads_and_tails", benchmarks, bytes, seed, stream);
2795-
2796- // Use manual timing
2797- for(auto& b : benchmarks)
2798- {
2799- b->UseManualTime();
2800- b->Unit(benchmark::kMillisecond);
2801- }
2802+ benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0);
2803
2804- // Force number of iterations
2805- if(trials > 0)
2806- {
2807- for(auto& b : benchmarks)
2808- {
2809- b->Iterations(trials);
2810- }
2811- }
2812+ add_benchmarks<flag_heads>("flag_heads", executor);
2813+ add_benchmarks<flag_tails>("flag_tails", executor);
2814+ add_benchmarks<flag_heads_and_tails>("flag_heads_and_tails", executor);
2815
2816- // Run benchmarks
2817- benchmark::RunSpecifiedBenchmarks();
2818- return 0;
2819+ executor.run();
2820 }
2821diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp
2822index ec3c95f..889fc78 100644
2823--- a/benchmark/benchmark_block_exchange.cpp
2824+++ b/benchmark/benchmark_block_exchange.cpp
2825@@ -1,6 +1,6 @@
2826 // MIT License
2827 //
2828-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2829+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2830 //
2831 // Permission is hereby granted, free of charge, to any person obtaining a copy
2832 // of this software and associated documentation files (the "Software"), to deal
2833@@ -20,12 +20,10 @@
2834 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2835 // SOFTWARE.
2836
2837-// CmdParser
2838-#include "cmdparser.hpp"
2839 #include "benchmark_utils.hpp"
2840
2841-// Google Benchmark
2842-#include <benchmark/benchmark.h>
2843+#include "../common/utils_custom_type.hpp"
2844+#include "../common/utils_device_ptr.hpp"
2845
2846 // HIP API
2847 #include <hip/hip_runtime.h>
2848@@ -34,407 +32,270 @@
2849 #include <rocprim/block/block_exchange.hpp>
2850 #include <rocprim/block/block_load_func.hpp>
2851 #include <rocprim/block/block_store_func.hpp>
2852-
2853-#include <iostream>
2854-#include <limits>
2855+#include <rocprim/config.hpp>
2856+#include <rocprim/intrinsics/thread.hpp>
2857+#include <rocprim/types.hpp>
2858+
2859+#include <algorithm>
2860+#include <cstddef>
2861+#include <numeric>
2862+#include <stdint.h>
2863 #include <string>
2864 #include <vector>
2865
2866-#include <cstdio>
2867-#include <cstdlib>
2868-
2869-#ifndef DEFAULT_N
2870-const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
2871-#endif
2872-
2873-namespace rp = rocprim;
2874-
2875-template<
2876- class Runner,
2877- class T,
2878- unsigned int BlockSize,
2879- unsigned int ItemsPerThread,
2880- unsigned int Trials
2881->
2882-__global__
2883-__launch_bounds__(BlockSize)
2884-void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output)
2885+template<typename Runner,
2886+ typename T,
2887+ unsigned int BlockSize,
2888+ unsigned int ItemsPerThread,
2889+ unsigned int Trials>
2890+__global__ __launch_bounds__(BlockSize)
2891+void kernel(const T* d_input, const unsigned int* d_ranks, T* d_output)
2892 {
2893 Runner::template run<T, BlockSize, ItemsPerThread, Trials>(d_input, d_ranks, d_output);
2894 }
2895
2896 struct blocked_to_striped
2897 {
2898- template<
2899- class T,
2900- unsigned int BlockSize,
2901- unsigned int ItemsPerThread,
2902- unsigned int Trials
2903- >
2904+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
2905 __device__
2906- static void run(const T * d_input, const unsigned int *, T * d_output)
2907+ static void run(const T* d_input, const unsigned int*, T* d_output)
2908 {
2909- const unsigned int lid = threadIdx.x;
2910+ const unsigned int lid = threadIdx.x;
2911 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
2912
2913 T input[ItemsPerThread];
2914- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2915+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2916
2917 ROCPRIM_NO_UNROLL
2918- for(unsigned int trial = 0; trial < Trials; trial++)
2919+ for(unsigned int trial = 0; trial < Trials; ++trial)
2920 {
2921- rp::block_exchange<T, BlockSize, ItemsPerThread> exchange;
2922+ rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange;
2923 exchange.blocked_to_striped(input, input);
2924 ::rocprim::syncthreads();
2925 }
2926
2927- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2928+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2929 }
2930 };
2931
2932 struct striped_to_blocked
2933 {
2934- template<
2935- class T,
2936- unsigned int BlockSize,
2937- unsigned int ItemsPerThread,
2938- unsigned int Trials
2939- >
2940+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
2941 __device__
2942- static void run(const T * d_input, const unsigned int *, T * d_output)
2943+ static void run(const T* d_input, const unsigned int*, T* d_output)
2944 {
2945- const unsigned int lid = threadIdx.x;
2946+ const unsigned int lid = threadIdx.x;
2947 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
2948
2949 T input[ItemsPerThread];
2950- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2951+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2952
2953 ROCPRIM_NO_UNROLL
2954- for(unsigned int trial = 0; trial < Trials; trial++)
2955+ for(unsigned int trial = 0; trial < Trials; ++trial)
2956 {
2957- rp::block_exchange<T, BlockSize, ItemsPerThread> exchange;
2958+ rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange;
2959 exchange.striped_to_blocked(input, input);
2960 ::rocprim::syncthreads();
2961 }
2962
2963- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2964+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
2965 }
2966 };
2967
2968 struct blocked_to_warp_striped
2969 {
2970- template<
2971- class T,
2972- unsigned int BlockSize,
2973- unsigned int ItemsPerThread,
2974- unsigned int Trials
2975- >
2976+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
2977 __device__
2978- static void run(const T * d_input, const unsigned int *, T * d_output)
2979+ static void run(const T* d_input, const unsigned int*, T* d_output)
2980 {
2981- const unsigned int lid = threadIdx.x;
2982+ const unsigned int lid = threadIdx.x;
2983 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
2984
2985 T input[ItemsPerThread];
2986- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2987+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
2988
2989 ROCPRIM_NO_UNROLL
2990- for(unsigned int trial = 0; trial < Trials; trial++)
2991+ for(unsigned int trial = 0; trial < Trials; ++trial)
2992 {
2993- rp::block_exchange<T, BlockSize, ItemsPerThread> exchange;
2994+ rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange;
2995 exchange.blocked_to_warp_striped(input, input);
2996 ::rocprim::syncthreads();
2997 }
2998
2999- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3000+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3001 }
3002 };
3003
3004 struct warp_striped_to_blocked
3005 {
3006- template<
3007- class T,
3008- unsigned int BlockSize,
3009- unsigned int ItemsPerThread,
3010- unsigned int Trials
3011- >
3012+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
3013 __device__
3014- static void run(const T * d_input, const unsigned int *, T * d_output)
3015+ static void run(const T* d_input, const unsigned int*, T* d_output)
3016 {
3017- const unsigned int lid = threadIdx.x;
3018+ const unsigned int lid = threadIdx.x;
3019 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
3020
3021 T input[ItemsPerThread];
3022- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
3023+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
3024
3025 ROCPRIM_NO_UNROLL
3026- for(unsigned int trial = 0; trial < Trials; trial++)
3027+ for(unsigned int trial = 0; trial < Trials; ++trial)
3028 {
3029- rp::block_exchange<T, BlockSize, ItemsPerThread> exchange;
3030+ rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange;
3031 exchange.warp_striped_to_blocked(input, input);
3032 ::rocprim::syncthreads();
3033 }
3034
3035- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3036+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3037 }
3038 };
3039
3040 struct scatter_to_blocked
3041 {
3042- template<
3043- class T,
3044- unsigned int BlockSize,
3045- unsigned int ItemsPerThread,
3046- unsigned int Trials
3047- >
3048+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
3049 __device__
3050- static void run(const T * d_input, const unsigned int * d_ranks, T * d_output)
3051+ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output)
3052 {
3053- const unsigned int lid = threadIdx.x;
3054+ const unsigned int lid = threadIdx.x;
3055 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
3056
3057- T input[ItemsPerThread];
3058+ T input[ItemsPerThread];
3059 unsigned int ranks[ItemsPerThread];
3060- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
3061- rp::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks);
3062+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
3063+ rocprim::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks);
3064
3065 ROCPRIM_NO_UNROLL
3066- for(unsigned int trial = 0; trial < Trials; trial++)
3067+ for(unsigned int trial = 0; trial < Trials; ++trial)
3068 {
3069- rp::block_exchange<T, BlockSize, ItemsPerThread> exchange;
3070+ rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange;
3071 exchange.scatter_to_blocked(input, input, ranks);
3072 ::rocprim::syncthreads();
3073 }
3074
3075- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3076+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3077 }
3078 };
3079
3080 struct scatter_to_striped
3081 {
3082- template<
3083- class T,
3084- unsigned int BlockSize,
3085- unsigned int ItemsPerThread,
3086- unsigned int Trials
3087- >
3088+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
3089 __device__
3090- static void run(const T * d_input, const unsigned int * d_ranks, T * d_output)
3091+ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output)
3092 {
3093- const unsigned int lid = threadIdx.x;
3094+ const unsigned int lid = threadIdx.x;
3095 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
3096
3097- T input[ItemsPerThread];
3098+ T input[ItemsPerThread];
3099 unsigned int ranks[ItemsPerThread];
3100- rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
3101- rp::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks);
3102+ rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);
3103+ rocprim::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks);
3104
3105 ROCPRIM_NO_UNROLL
3106- for(unsigned int trial = 0; trial < Trials; trial++)
3107+ for(unsigned int trial = 0; trial < Trials; ++trial)
3108 {
3109- rp::block_exchange<T, BlockSize, ItemsPerThread> exchange;
3110+ rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange;
3111 exchange.scatter_to_striped(input, input, ranks);
3112 ::rocprim::syncthreads();
3113 }
3114
3115- rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3116+ rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input);
3117 }
3118 };
3119
3120-template<class Benchmark,
3121- class T,
3122+template<typename Benchmark,
3123+ typename T,
3124 unsigned int BlockSize,
3125 unsigned int ItemsPerThread,
3126 unsigned int Trials = 100>
3127-void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream)
3128+void run_benchmark(benchmark_utils::state&& state)
3129 {
3130+ const auto& bytes = state.bytes;
3131+ const auto& seed = state.seed;
3132+ const auto& stream = state.stream;
3133+
3134 // Calculate the number of elements N
3135 size_t N = bytes / sizeof(T);
3136-
3137+
3138 constexpr auto items_per_block = BlockSize * ItemsPerThread;
3139- const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
3140+ const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block);
3141
3142 std::vector<T> input(size);
3143 // Fill input
3144- for(size_t i = 0; i < size; i++)
3145+ for(size_t i = 0; i < size; ++i)
3146 {
3147 input[i] = T(i);
3148 }
3149 std::vector<unsigned int> ranks(size);
3150 // Fill ranks (for scatter operations)
3151 engine_type gen(seed.get_0());
3152- for(size_t bi = 0; bi < size / items_per_block; bi++)
3153+ for(size_t bi = 0; bi < size / items_per_block; ++bi)
3154 {
3155 auto block_ranks = ranks.begin() + bi * items_per_block;
3156 std::iota(block_ranks, block_ranks + items_per_block, 0);
3157 std::shuffle(block_ranks, block_ranks + items_per_block, gen);
3158 }
3159- T * d_input;
3160- unsigned int * d_ranks;
3161- T * d_output;
3162- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
3163- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_ranks), size * sizeof(unsigned int)));
3164- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T)));
3165- HIP_CHECK(
3166- hipMemcpy(
3167- d_input, input.data(),
3168- size * sizeof(T),
3169- hipMemcpyHostToDevice
3170- )
3171- );
3172- HIP_CHECK(
3173- hipMemcpy(
3174- d_ranks, ranks.data(),
3175- size * sizeof(unsigned int),
3176- hipMemcpyHostToDevice
3177- )
3178- );
3179+ common::device_ptr<T> d_input(input);
3180+ common::device_ptr<unsigned int> d_ranks(ranks);
3181+ common::device_ptr<T> d_output(size);
3182 HIP_CHECK(hipDeviceSynchronize());
3183
3184- // HIP events creation
3185- hipEvent_t start, stop;
3186- HIP_CHECK(hipEventCreate(&start));
3187- HIP_CHECK(hipEventCreate(&stop));
3188-
3189- for(auto _ : state)
3190- {
3191- // Record start event
3192- HIP_CHECK(hipEventRecord(start, stream));
3193-
3194- hipLaunchKernelGGL(
3195- HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
3196- dim3(size/items_per_block), dim3(BlockSize), 0, stream,
3197- d_input, d_ranks, d_output
3198- );
3199- HIP_CHECK(hipGetLastError());
3200-
3201- // Record stop event and wait until it completes
3202- HIP_CHECK(hipEventRecord(stop, stream));
3203- HIP_CHECK(hipEventSynchronize(stop));
3204-
3205- float elapsed_mseconds;
3206- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
3207- state.SetIterationTime(elapsed_mseconds / 1000);
3208- }
3209-
3210- // Destroy HIP events
3211- HIP_CHECK(hipEventDestroy(start));
3212- HIP_CHECK(hipEventDestroy(stop));
3213-
3214- state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
3215- state.SetItemsProcessed(state.iterations() * Trials * size);
3216-
3217- HIP_CHECK(hipFree(d_input));
3218- HIP_CHECK(hipFree(d_ranks));
3219- HIP_CHECK(hipFree(d_output));
3220+ state.run(
3221+ [&]
3222+ {
3223+ kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>
3224+ <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(),
3225+ d_ranks.get(),
3226+ d_output.get());
3227+ HIP_CHECK(hipGetLastError());
3228+ });
3229+
3230+ state.set_throughput(size * Trials, sizeof(T));
3231 }
3232
3233-#define CREATE_BENCHMARK(T, BS, IPT) \
3234- benchmark::RegisterBenchmark( \
3235- bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \
3236- + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \
3237- .c_str(), \
3238- run_benchmark<Benchmark, T, BS, IPT>, \
3239- bytes, \
3240- seed, \
3241- stream)
3242-
3243-#define BENCHMARK_TYPE(type, block) \
3244- CREATE_BENCHMARK(type, block, 1), \
3245- CREATE_BENCHMARK(type, block, 2), \
3246- CREATE_BENCHMARK(type, block, 3), \
3247- CREATE_BENCHMARK(type, block, 4), \
3248- CREATE_BENCHMARK(type, block, 7), \
3249+#define CREATE_BENCHMARK(T, BS, IPT) \
3250+ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \
3251+ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \
3252+ "}}") \
3253+ .c_str(), \
3254+ run_benchmark<Benchmark, T, BS, IPT>);
3255+
3256+#define BENCHMARK_TYPE(type, block) \
3257+ CREATE_BENCHMARK(type, block, 1) \
3258+ CREATE_BENCHMARK(type, block, 2) \
3259+ CREATE_BENCHMARK(type, block, 3) \
3260+ CREATE_BENCHMARK(type, block, 4) \
3261+ CREATE_BENCHMARK(type, block, 7) \
3262 CREATE_BENCHMARK(type, block, 8)
3263
3264-template<class Benchmark>
3265-void add_benchmarks(const std::string& name,
3266- std::vector<benchmark::internal::Benchmark*>& benchmarks,
3267- size_t bytes,
3268- const managed_seed& seed,
3269- hipStream_t stream)
3270+template<typename Benchmark>
3271+void add_benchmarks(const std::string& name, benchmark_utils::executor& executor)
3272 {
3273- using custom_float2 = custom_type<float, float>;
3274- using custom_double2 = custom_type<double, double>;
3275-
3276- std::vector<benchmark::internal::Benchmark*> bs =
3277- {
3278- BENCHMARK_TYPE(int, 256),
3279- BENCHMARK_TYPE(int8_t, 256),
3280- BENCHMARK_TYPE(rocprim::half, 256),
3281- BENCHMARK_TYPE(long long, 256),
3282- BENCHMARK_TYPE(custom_float2, 256),
3283- BENCHMARK_TYPE(float2, 256),
3284- BENCHMARK_TYPE(custom_double2, 256),
3285- BENCHMARK_TYPE(double2, 256),
3286- BENCHMARK_TYPE(float4, 256),
3287- };
3288-
3289- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3290+ using custom_float2 = common::custom_type<float, float>;
3291+ using custom_double2 = common::custom_type<double, double>;
3292+
3293+ BENCHMARK_TYPE(int, 256)
3294+ BENCHMARK_TYPE(int8_t, 256)
3295+ BENCHMARK_TYPE(rocprim::half, 256)
3296+ BENCHMARK_TYPE(long long, 256)
3297+ BENCHMARK_TYPE(custom_float2, 256)
3298+ BENCHMARK_TYPE(float2, 256)
3299+ BENCHMARK_TYPE(custom_double2, 256)
3300+ BENCHMARK_TYPE(double2, 256)
3301+ BENCHMARK_TYPE(float4, 256)
3302+ BENCHMARK_TYPE(rocprim::int128_t, 256)
3303+ BENCHMARK_TYPE(rocprim::uint128_t, 256)
3304 }
3305
3306-int main(int argc, char *argv[])
3307+int main(int argc, char* argv[])
3308 {
3309- cli::Parser parser(argc, argv);
3310- parser.set_optional<size_t>("bytes", "bytes", DEFAULT_BYTES, "number of values");
3311- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
3312- parser.set_optional<std::string>("name_format",
3313- "name_format",
3314- "human",
3315- "either: json,human,txt");
3316- parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
3317- parser.run_and_exit_if_error();
3318-
3319- // Parse argv
3320- benchmark::Initialize(&argc, argv);
3321- const size_t bytes = parser.get<size_t>("bytes");
3322- const int trials = parser.get<int>("trials");
3323- bench_naming::set_format(parser.get<std::string>("name_format"));
3324- const std::string seed_type = parser.get<std::string>("seed");
3325- const managed_seed seed(seed_type);
3326-
3327- // HIP
3328- hipStream_t stream = 0; // default
3329-
3330- // Benchmark info
3331- add_common_benchmark_info();
3332- benchmark::AddCustomContext("bytes", std::to_string(bytes));
3333- benchmark::AddCustomContext("seed", seed_type);
3334-
3335- // Add benchmarks
3336- std::vector<benchmark::internal::Benchmark*> benchmarks;
3337- add_benchmarks<blocked_to_striped>("blocked_to_striped", benchmarks, bytes, seed, stream);
3338- add_benchmarks<striped_to_blocked>("striped_to_blocked", benchmarks, bytes, seed, stream);
3339- add_benchmarks<blocked_to_warp_striped>("blocked_to_warp_striped",
3340- benchmarks,
3341- bytes,
3342- seed,
3343- stream);
3344- add_benchmarks<warp_striped_to_blocked>("warp_striped_to_blocked",
3345- benchmarks,
3346- bytes,
3347- seed,
3348- stream);
3349- add_benchmarks<scatter_to_blocked>("scatter_to_blocked", benchmarks, bytes, seed, stream);
3350- add_benchmarks<scatter_to_striped>("scatter_to_striped", benchmarks, bytes, seed, stream);
3351-
3352- // Use manual timing
3353- for(auto& b : benchmarks)
3354- {
3355- b->UseManualTime();
3356- b->Unit(benchmark::kMillisecond);
3357- }
3358+ benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0);
3359
3360- // Force number of iterations
3361- if(trials > 0)
3362- {
3363- for(auto& b : benchmarks)
3364- {
3365- b->Iterations(trials);
3366- }
3367- }
3368+ add_benchmarks<blocked_to_striped>("blocked_to_striped", executor);
3369+ add_benchmarks<striped_to_blocked>("striped_to_blocked", executor);
3370+ add_benchmarks<blocked_to_warp_striped>("blocked_to_warp_striped", executor);
3371+ add_benchmarks<warp_striped_to_blocked>("warp_striped_to_blocked", executor);
3372+ add_benchmarks<scatter_to_blocked>("scatter_to_blocked", executor);
3373+ add_benchmarks<scatter_to_striped>("scatter_to_striped", executor);
3374
3375- // Run benchmarks
3376- benchmark::RunSpecifiedBenchmarks();
3377- return 0;
3378+ executor.run();
3379 }
3380diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp
3381index 7dc2827..b108314 100644
3382--- a/benchmark/benchmark_block_histogram.cpp
3383+++ b/benchmark/benchmark_block_histogram.cpp
3384@@ -1,6 +1,6 @@
3385 // MIT License
3386 //
3387-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
3388+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
3389 //
3390 // Permission is hereby granted, free of charge, to any person obtaining a copy
3391 // of this software and associated documentation files (the "Software"), to deal
3392@@ -21,44 +21,24 @@
3393 // SOFTWARE.
3394
3395 #include "benchmark_utils.hpp"
3396-// CmdParser
3397-#include "cmdparser.hpp"
3398
3399-// Google Benchmark
3400-#include <benchmark/benchmark.h>
3401+#include "../common/utils_device_ptr.hpp"
3402
3403 // HIP API
3404 #include <hip/hip_runtime.h>
3405
3406 // rocPRIM
3407 #include <rocprim/block/block_histogram.hpp>
3408-#include <rocprim/block/block_load_func.hpp>
3409-#include <rocprim/block/block_store_func.hpp>
3410-
3411-#include <iostream>
3412-#include <limits>
3413-#include <string>
3414-#include <vector>
3415-
3416-#include <cstdio>
3417-#include <cstdlib>
3418-
3419-#ifndef DEFAULT_N
3420-const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4;
3421-#endif
3422-
3423-namespace rp = rocprim;
3424-
3425-template<
3426- class Runner,
3427- class T,
3428- unsigned int BlockSize,
3429- unsigned int ItemsPerThread,
3430- unsigned int BinSize,
3431- unsigned int Trials
3432->
3433-__global__
3434-__launch_bounds__(BlockSize)
3435+#include <rocprim/config.hpp>
3436+#include <rocprim/types.hpp>
3437+
3438+template<typename Runner,
3439+ typename T,
3440+ unsigned int BlockSize,
3441+ unsigned int ItemsPerThread,
3442+ unsigned int BinSize,
3443+ unsigned int Trials>
3444+__global__ __launch_bounds__(BlockSize)
3445 void kernel(const T* input, T* output)
3446 {
3447 Runner::template run<T, BlockSize, ItemsPerThread, BinSize, Trials>(input, output);
3448@@ -67,42 +47,42 @@ void kernel(const T* input, T* output)
3449 template<rocprim::block_histogram_algorithm algorithm>
3450 struct histogram
3451 {
3452- template<
3453- class T,
3454- unsigned int BlockSize,
3455- unsigned int ItemsPerThread,
3456- unsigned int BinSize,
3457- unsigned int Trials
3458- >
3459+ static constexpr auto algorithm_type = algorithm;
3460+ template<typename T,
3461+ unsigned int BlockSize,
3462+ unsigned int ItemsPerThread,
3463+ unsigned int BinSize,
3464+ unsigned int Trials>
3465 __device__
3466 static void run(const T* input, T* output)
3467 {
3468 // TODO: Move global_offset into final loop
3469 const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread;
3470- unsigned int global_offset = blockIdx.x * BinSize;
3471+ unsigned int global_offset = blockIdx.x * BinSize;
3472
3473 T values[ItemsPerThread];
3474- for(unsigned int k = 0; k < ItemsPerThread; k++)
3475+ for(unsigned int k = 0; k < ItemsPerThread; ++k)
3476 {
3477 values[k] = input[index + k];
3478 }
3479
3480- using bhistogram_t = rp::block_histogram<T, BlockSize, ItemsPerThread, BinSize, algorithm>;
3481+ using bhistogram_t
3482+ = rocprim::block_histogram<T, BlockSize, ItemsPerThread, BinSize, algorithm>;
3483 __shared__ T histogram[BinSize];
3484 __shared__ typename bhistogram_t::storage_type storage;
3485
3486 ROCPRIM_NO_UNROLL
3487- for(unsigned int trial = 0; trial < Trials; trial++)
3488+ for(unsigned int trial = 0; trial < Trials; ++trial)
3489 {
3490 bhistogram_t().histogram(values, histogram, storage);
3491- for(unsigned int k = 0; k < ItemsPerThread; k++)
3492+ for(unsigned int k = 0; k < ItemsPerThread; ++k)
3493 {
3494 values[k] = BinSize - 1 - values[k];
3495 }
3496 }
3497
3498 ROCPRIM_UNROLL
3499- for (unsigned int offset = 0; offset < BinSize; offset += BlockSize)
3500+ for(unsigned int offset = 0; offset < BinSize; offset += BlockSize)
3501 {
3502 if(offset + threadIdx.x < BinSize)
3503 {
3504@@ -113,161 +93,94 @@ struct histogram
3505 }
3506 };
3507
3508-template<
3509- class Benchmark,
3510- class T,
3511- unsigned int BlockSize,
3512- unsigned int ItemsPerThread,
3513- unsigned int BinSize = BlockSize,
3514- unsigned int Trials = 100
3515->
3516-void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes)
3517+template<typename Benchmark,
3518+ typename T,
3519+ unsigned int BlockSize,
3520+ unsigned int ItemsPerThread,
3521+ unsigned int BinSize = BlockSize,
3522+ unsigned int Trials = 100>
3523+void run_benchmark(benchmark_utils::state&& state)
3524 {
3525+ const auto& stream = state.stream;
3526+ const auto& bytes = state.bytes;
3527+
3528 // Calculate the number of elements N
3529 size_t N = bytes / sizeof(T);
3530 // Make sure size is a multiple of BlockSize
3531 constexpr auto items_per_block = BlockSize * ItemsPerThread;
3532- const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
3533- const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block);
3534+ const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block);
3535+ const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block);
3536 // Allocate and fill memory
3537 std::vector<T> input(size, 0.0f);
3538- T * d_input;
3539- T * d_output;
3540- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
3541- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), bin_size * sizeof(T)));
3542- HIP_CHECK(
3543- hipMemcpy(
3544- d_input, input.data(),
3545- size * sizeof(T),
3546- hipMemcpyHostToDevice
3547- )
3548- );
3549+ common::device_ptr<T> d_input(input);
3550+ common::device_ptr<T> d_output(bin_size);
3551 HIP_CHECK(hipDeviceSynchronize());
3552
3553- // HIP events creation
3554- hipEvent_t start, stop;
3555- HIP_CHECK(hipEventCreate(&start));
3556- HIP_CHECK(hipEventCreate(&stop));
3557-
3558- for (auto _ : state)
3559- {
3560- // Record start event
3561- HIP_CHECK(hipEventRecord(start, stream));
3562-
3563- hipLaunchKernelGGL(
3564- HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials>),
3565- dim3(size/items_per_block), dim3(BlockSize), 0, stream,
3566- d_input, d_output
3567- );
3568- HIP_CHECK(hipGetLastError());
3569-
3570- // Record stop event and wait until it completes
3571- HIP_CHECK(hipEventRecord(stop, stream));
3572- HIP_CHECK(hipEventSynchronize(stop));
3573-
3574- float elapsed_mseconds;
3575- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
3576- state.SetIterationTime(elapsed_mseconds / 1000);
3577- }
3578-
3579- // Destroy HIP events
3580- HIP_CHECK(hipEventDestroy(start));
3581- HIP_CHECK(hipEventDestroy(stop));
3582-
3583- state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials);
3584- state.SetItemsProcessed(state.iterations() * size * Trials);
3585-
3586- HIP_CHECK(hipFree(d_input));
3587- HIP_CHECK(hipFree(d_output));
3588-}
3589-
3590-// IPT - items per thread
3591-#define CREATE_BENCHMARK(T, BS, IPT) \
3592- benchmark::RegisterBenchmark( \
3593- bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T ",cfg:{bs:" #BS \
3594- ",ipt:" #IPT ",method:" \
3595- + method_name + "}}") \
3596- .c_str(), \
3597- run_benchmark<Benchmark, T, BS, IPT>, \
3598- stream, \
3599- bytes)
3600-
3601-#define BENCHMARK_TYPE(type, block) \
3602- CREATE_BENCHMARK(type, block, 1), \
3603- CREATE_BENCHMARK(type, block, 2), \
3604- CREATE_BENCHMARK(type, block, 3), \
3605- CREATE_BENCHMARK(type, block, 4), \
3606- CREATE_BENCHMARK(type, block, 8), \
3607- CREATE_BENCHMARK(type, block, 16)
3608-
3609-template<class Benchmark>
3610-void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
3611- const std::string& method_name,
3612- hipStream_t stream,
3613- size_t bytes)
3614-{
3615- std::vector<benchmark::internal::Benchmark*> new_benchmarks =
3616- {
3617- BENCHMARK_TYPE(int, 256),
3618- BENCHMARK_TYPE(int, 320),
3619- BENCHMARK_TYPE(int, 512),
3620+ state.run(
3621+ [&]
3622+ {
3623+ kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials>
3624+ <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(),
3625+ d_output.get());
3626+ HIP_CHECK(hipGetLastError());
3627+ });
3628
3629- BENCHMARK_TYPE(unsigned long long, 256),
3630- BENCHMARK_TYPE(unsigned long long, 320)
3631- };
3632- benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
3633+ state.set_throughput(size * Trials, sizeof(T));
3634 }
3635
3636-int main(int argc, char *argv[])
3637+#define CREATE_BENCHMARK(Benchmark, method, T, BS, IPT) \
3638+ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T \
3639+ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \
3640+ + std::string(method) + "}}") \
3641+ .c_str(), \
3642+ run_benchmark<Benchmark, T, BS, IPT>);
3643+
3644+#define BENCHMARK_TYPE(Benchmark, method, T, BS) \
3645+ CREATE_BENCHMARK(Benchmark, method, T, BS, 1) \
3646+ CREATE_BENCHMARK(Benchmark, method, T, BS, 2) \
3647+ CREATE_BENCHMARK(Benchmark, method, T, BS, 3) \
3648+ CREATE_BENCHMARK(Benchmark, method, T, BS, 4) \
3649+ CREATE_BENCHMARK(Benchmark, method, T, BS, 8) \
3650+ CREATE_BENCHMARK(Benchmark, method, T, BS, 16)
3651+
3652+#define BENCHMARK_TYPE_128(Benchmark, method, T, BS) \
3653+ CREATE_BENCHMARK(Benchmark, method, T, BS, 1) \
3654+ CREATE_BENCHMARK(Benchmark, method, T, BS, 2) \
3655+ CREATE_BENCHMARK(Benchmark, method, T, BS, 3) \
3656+ CREATE_BENCHMARK(Benchmark, method, T, BS, 4) \
3657+ CREATE_BENCHMARK(Benchmark, method, T, BS, 8) \
3658+ CREATE_BENCHMARK(Benchmark, method, T, BS, 12)
3659+
3660+#define BENCHMARK_ATOMIC() \
3661+ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 256) \
3662+ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 320) \
3663+ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 512) \
3664+ \
3665+ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", unsigned long long, 256) \
3666+ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", unsigned long long, 320)
3667+
3668+#define BENCHMARK_SORT() \
3669+ BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 256) \
3670+ BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 320) \
3671+ BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 512) \
3672+ \
3673+ BENCHMARK_TYPE(histogram_sort_t, "using_sort", unsigned long long, 256) \
3674+ BENCHMARK_TYPE(histogram_sort_t, "using_sort", unsigned long long, 320) \
3675+ \
3676+ BENCHMARK_TYPE_128(histogram_sort_t, "using_sort", rocprim::int128_t, 256) \
3677+ BENCHMARK_TYPE_128(histogram_sort_t, "using_sort", rocprim::uint128_t, 256)
3678+
3679+int main(int argc, char* argv[])
3680 {
3681- cli::Parser parser(argc, argv);
3682- parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
3683- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
3684- parser.set_optional<std::string>("name_format",
3685- "name_format",
3686- "human",
3687- "either: json,human,txt");
3688- parser.run_and_exit_if_error();
3689-
3690- // Parse argv
3691- benchmark::Initialize(&argc, argv);
3692- const size_t bytes = parser.get<size_t>("size");
3693- const int trials = parser.get<int>("trials");
3694- bench_naming::set_format(parser.get<std::string>("name_format"));
3695-
3696- // HIP
3697- hipStream_t stream = 0; // default
3698-
3699- // Benchmark info
3700- add_common_benchmark_info();
3701- benchmark::AddCustomContext("bytes", std::to_string(bytes));
3702+ benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0);
3703
3704- // Add benchmarks
3705- std::vector<benchmark::internal::Benchmark*> benchmarks;
3706- // using_atomic
3707- using histogram_a_t = histogram<rocprim::block_histogram_algorithm::using_atomic>;
3708- add_benchmarks<histogram_a_t>(benchmarks, "using_atomic", stream, bytes);
3709- // using_sort
3710- using histogram_s_t = histogram<rocprim::block_histogram_algorithm::using_sort>;
3711- add_benchmarks<histogram_s_t>(benchmarks, "using_sort", stream, bytes);
3712+#ifndef BENCHMARK_CONFIG_TUNING
3713+ using histogram_atomic_t = histogram<rocprim::block_histogram_algorithm::using_atomic>;
3714+ using histogram_sort_t = histogram<rocprim::block_histogram_algorithm::using_sort>;
3715
3716- // Use manual timing
3717- for(auto& b : benchmarks)
3718- {
3719- b->UseManualTime();
3720- b->Unit(benchmark::kMillisecond);
3721- }
3722-
3723- // Force number of iterations
3724- if(trials > 0)
3725- {
3726- for(auto& b : benchmarks)
3727- {
3728- b->Iterations(trials);
3729- }
3730- }
3731+ BENCHMARK_ATOMIC()
3732+ BENCHMARK_SORT()
3733+#endif
3734
3735- // Run benchmarks
3736- benchmark::RunSpecifiedBenchmarks();
3737- return 0;
3738+ executor.run();
3739 }
3740diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp
3741index faee6d3..a708037 100644
3742--- a/benchmark/benchmark_block_radix_rank.cpp
3743+++ b/benchmark/benchmark_block_radix_rank.cpp
3744@@ -1,6 +1,6 @@
3745 // MIT License
3746 //
3747-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
3748+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
3749 //
3750 // Permission is hereby granted, free of charge, to any person obtaining a copy
3751 // of this software and associated documentation files (the "Software"), to deal
3752@@ -21,11 +21,9 @@
3753 // SOFTWARE.
3754
3755 #include "benchmark_utils.hpp"
3756-// CmdParser
3757-#include "cmdparser.hpp"
3758
3759-// Google Benchmark
3760-#include <benchmark/benchmark.h>
3761+#include "../common/utils_data_generation.hpp"
3762+#include "../common/utils_device_ptr.hpp"
3763
3764 // HIP API
3765 #include <hip/hip_runtime.h>
3766@@ -34,34 +32,31 @@
3767 #include <rocprim/block/block_load_func.hpp>
3768 #include <rocprim/block/block_radix_rank.hpp>
3769 #include <rocprim/block/block_store_func.hpp>
3770+#include <rocprim/config.hpp>
3771+#include <rocprim/types.hpp>
3772
3773 #include <chrono>
3774-#include <limits>
3775+#include <stdint.h>
3776 #include <string>
3777-
3778-#ifndef DEFAULT_N
3779-const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4;
3780-#endif
3781-
3782-namespace rp = rocprim;
3783+#include <vector>
3784
3785 template<typename T,
3786- unsigned int BlockSize,
3787- unsigned int ItemsPerThread,
3788- unsigned int RadixBits,
3789- bool Descending,
3790- rp::block_radix_rank_algorithm Algorithm,
3791- unsigned int Trials>
3792-__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input,
3793- unsigned int* ranks_output)
3794+ unsigned int BlockSize,
3795+ unsigned int ItemsPerThread,
3796+ unsigned int RadixBits,
3797+ bool Descending,
3798+ rocprim::block_radix_rank_algorithm Algorithm,
3799+ unsigned int Trials>
3800+__global__ __launch_bounds__(BlockSize)
3801+void rank_kernel(const T* keys_input, unsigned int* ranks_output)
3802 {
3803- using rank_type = rp::block_radix_rank<BlockSize, RadixBits, Algorithm>;
3804+ using rank_type = rocprim::block_radix_rank<BlockSize, RadixBits, Algorithm>;
3805
3806 const unsigned int lid = threadIdx.x;
3807 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
3808
3809 T keys[ItemsPerThread];
3810- rp::block_load_direct_striped<BlockSize>(lid, keys_input + block_offset, keys);
3811+ rocprim::block_load_direct_striped<BlockSize>(lid, keys_input + block_offset, keys);
3812
3813 unsigned int ranks[ItemsPerThread];
3814
3815@@ -75,7 +70,7 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_inpu
3816 while(begin_bit < end_bit)
3817 {
3818 const unsigned pass_bits = min(RadixBits, end_bit - begin_bit);
3819- if ROCPRIM_IF_CONSTEXPR(Descending)
3820+ if constexpr(Descending)
3821 {
3822 rank_type().rank_keys_desc(keys, ranks, storage, begin_bit, pass_bits);
3823 }
3824@@ -87,164 +82,93 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_inpu
3825 }
3826 }
3827
3828- rp::block_store_direct_striped<BlockSize>(lid, ranks_output + block_offset, ranks);
3829+ rocprim::block_store_direct_striped<BlockSize>(lid, ranks_output + block_offset, ranks);
3830 }
3831
3832 template<typename T,
3833- unsigned int BlockSize,
3834- unsigned int ItemsPerThread,
3835- rp::block_radix_rank_algorithm Algorithm,
3836- unsigned int RadixBits = 4,
3837- bool Descending = false,
3838- unsigned int Trials = 10>
3839-void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream)
3840+ size_t BlockSize,
3841+ size_t ItemsPerThread,
3842+ rocprim::block_radix_rank_algorithm Algorithm,
3843+ size_t RadixBits = 4,
3844+ bool Descending = false,
3845+ size_t Trials = 10>
3846+void run_benchmark(benchmark_utils::state&& state)
3847 {
3848+ const auto& bytes = state.bytes;
3849+ const auto& seed = state.seed;
3850+ const auto& stream = state.stream;
3851+
3852 // Calculate the number of elements N
3853- size_t N = bytes / sizeof(T);
3854- constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
3855- const unsigned int grid_size = ((N + items_per_block - 1) / items_per_block);
3856- const unsigned int size = items_per_block * grid_size;
3857+ size_t N = bytes / sizeof(T);
3858+ constexpr size_t items_per_block = BlockSize * ItemsPerThread;
3859+ const size_t grid_size = ((N + items_per_block - 1) / items_per_block);
3860+ const size_t size = items_per_block * grid_size;
3861
3862 std::vector<T> input = get_random_data<T>(size,
3863- generate_limits<T>::min(),
3864- generate_limits<T>::max(),
3865+ common::generate_limits<T>::min(),
3866+ common::generate_limits<T>::max(),
3867 seed.get_0());
3868
3869- T* d_input;
3870- unsigned int* d_output;
3871- HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
3872- HIP_CHECK(hipMalloc(&d_output, size * sizeof(unsigned int)));
3873- HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
3874+ common::device_ptr<T> d_input(input);
3875+ common::device_ptr<unsigned int> d_output(size);
3876 HIP_CHECK(hipDeviceSynchronize());
3877
3878- for(auto _ : state)
3879- {
3880- auto start = std::chrono::steady_clock::now();
3881-
3882- hipLaunchKernelGGL(HIP_KERNEL_NAME(rank_kernel<T,
3883- BlockSize,
3884- ItemsPerThread,
3885- RadixBits,
3886- Descending,
3887- Algorithm,
3888- Trials>),
3889- dim3(grid_size),
3890- dim3(BlockSize),
3891- 0,
3892- stream,
3893- d_input,
3894- d_output);
3895- HIP_CHECK(hipPeekAtLastError());
3896- HIP_CHECK(hipDeviceSynchronize());
3897-
3898- auto end = std::chrono::steady_clock::now();
3899- auto elapsed_seconds
3900- = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
3901- state.SetIterationTime(elapsed_seconds.count());
3902- }
3903- state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
3904- state.SetItemsProcessed(state.iterations() * Trials * size);
3905+ state.run(
3906+ [&]
3907+ {
3908+ rank_kernel<T, BlockSize, ItemsPerThread, RadixBits, Descending, Algorithm, Trials>
3909+ <<<dim3(grid_size), dim3(BlockSize), 0, stream>>>(d_input.get(), d_output.get());
3910+ HIP_CHECK(hipGetLastError());
3911+ });
3912
3913- HIP_CHECK(hipFree(d_input));
3914- HIP_CHECK(hipFree(d_output));
3915+ state.set_throughput(size * Trials, sizeof(T));
3916 }
3917
3918-#define CREATE_BENCHMARK(T, BS, IPT, KIND) \
3919- benchmark::RegisterBenchmark( \
3920- bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T ",cfg:{bs:" #BS \
3921- ",ipt:" #IPT ",method:" #KIND "}}") \
3922- .c_str(), \
3923- run_benchmark<T, BS, IPT, KIND>, \
3924- bytes, \
3925- seed, \
3926- stream)
3927+#define CREATE_BENCHMARK(T, BS, IPT, KIND) \
3928+ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T \
3929+ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" #KIND \
3930+ "}}") \
3931+ .c_str(), \
3932+ run_benchmark<T, BS, IPT, KIND>);
3933
3934 // clang-format off
3935-#define CREATE_BENCHMARK_KINDS(type, block, ipt) \
3936- CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic), \
3937- CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic_memoize), \
3938- CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::match)
3939-
3940-#define BENCHMARK_TYPE(type, block) \
3941- CREATE_BENCHMARK_KINDS(type, block, 1), \
3942- CREATE_BENCHMARK_KINDS(type, block, 4), \
3943- CREATE_BENCHMARK_KINDS(type, block, 8), \
3944- CREATE_BENCHMARK_KINDS(type, block, 12), \
3945- CREATE_BENCHMARK_KINDS(type, block, 16), \
3946+#define CREATE_BENCHMARK_KINDS(type, block, ipt) \
3947+ CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::basic) \
3948+ CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::basic_memoize) \
3949+ CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::match)
3950+
3951+#define BENCHMARK_TYPE(type, block) \
3952+ CREATE_BENCHMARK_KINDS(type, block, 1) \
3953+ CREATE_BENCHMARK_KINDS(type, block, 4) \
3954+ CREATE_BENCHMARK_KINDS(type, block, 8) \
3955+ CREATE_BENCHMARK_KINDS(type, block, 12) \
3956+ CREATE_BENCHMARK_KINDS(type, block, 16) \
3957 CREATE_BENCHMARK_KINDS(type, block, 20)
3958 // clang-format on
3959
3960-void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
3961- size_t bytes,
3962- const managed_seed& seed,
3963- hipStream_t stream)
3964+int main(int argc, char* argv[])
3965 {
3966- std::vector<benchmark::internal::Benchmark*> bs = {
3967- BENCHMARK_TYPE(int, 128),
3968- BENCHMARK_TYPE(int, 256),
3969- BENCHMARK_TYPE(int, 512),
3970+ benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0);
3971
3972- BENCHMARK_TYPE(uint8_t, 128),
3973- BENCHMARK_TYPE(uint8_t, 256),
3974- BENCHMARK_TYPE(uint8_t, 512),
3975+ BENCHMARK_TYPE(int, 128)
3976+ BENCHMARK_TYPE(int, 256)
3977+ BENCHMARK_TYPE(int, 512)
3978
3979- BENCHMARK_TYPE(long long, 128),
3980- BENCHMARK_TYPE(long long, 256),
3981- BENCHMARK_TYPE(long long, 512),
3982- };
3983+ BENCHMARK_TYPE(uint8_t, 128)
3984+ BENCHMARK_TYPE(uint8_t, 256)
3985+ BENCHMARK_TYPE(uint8_t, 512)
3986
3987- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3988-}
3989+ BENCHMARK_TYPE(long long, 128)
3990+ BENCHMARK_TYPE(long long, 256)
3991+ BENCHMARK_TYPE(long long, 512)
3992
3993-int main(int argc, char* argv[])
3994-{
3995- cli::Parser parser(argc, argv);
3996- parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
3997- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
3998- parser.set_optional<std::string>("name_format",
3999- "name_format",
4000- "human",
4001- "either: json,human,txt");
4002- parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
4003- parser.run_and_exit_if_error();
4004-
4005- // Parse argv
4006- benchmark::Initialize(&argc, argv);
4007- const size_t bytes = parser.get<size_t>("size");
4008- const int trials = parser.get<int>("trials");
4009- bench_naming::set_format(parser.get<std::string>("name_format"));
4010- const std::string seed_type = parser.get<std::string>("seed");
4011- const managed_seed seed(seed_type);
4012-
4013- // HIP
4014- hipStream_t stream = 0; // default
4015-
4016- // Benchmark info
4017- add_common_benchmark_info();
4018- benchmark::AddCustomContext("bytes", std::to_string(bytes));
4019- benchmark::AddCustomContext("seed", seed_type);
4020-
4021- // Add benchmarks
4022- std::vector<benchmark::internal::Benchmark*> benchmarks;
4023- add_benchmarks(benchmarks, bytes, seed, stream);
4024-
4025- // Use manual timing
4026- for(auto& b : benchmarks)
4027- {
4028- b->UseManualTime();
4029- b->Unit(benchmark::kMillisecond);
4030- }
4031+ BENCHMARK_TYPE(rocprim::int128_t, 128)
4032+ BENCHMARK_TYPE(rocprim::int128_t, 256)
4033+ BENCHMARK_TYPE(rocprim::int128_t, 512)
4034
4035- // Force number of iterations
4036- if(trials > 0)
4037- {
4038- for(auto& b : benchmarks)
4039- {
4040- b->Iterations(trials);
4041- }
4042- }
4043+ BENCHMARK_TYPE(rocprim::uint128_t, 128)
4044+ BENCHMARK_TYPE(rocprim::uint128_t, 256)
4045+ BENCHMARK_TYPE(rocprim::uint128_t, 512)
4046
4047- // Run benchmarks
4048- benchmark::RunSpecifiedBenchmarks();
4049- return 0;
4050+ executor.run();
4051 }
4052diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp
4053index ff14473..331f809 100644
4054--- a/benchmark/benchmark_block_radix_sort.cpp
4055+++ b/benchmark/benchmark_block_radix_sort.cpp
4056@@ -1,6 +1,6 @@
4057 // MIT License
4058 //
4059-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
4060+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
4061 //
4062 // Permission is hereby granted, free of charge, to any person obtaining a copy
4063 // of this software and associated documentation files (the "Software"), to deal
4064@@ -21,11 +21,10 @@
4065 // SOFTWARE.
4066
4067 #include "benchmark_utils.hpp"
4068-// CmdParser
4069-#include "cmdparser.hpp"
4070
4071-// Google Benchmark
4072-#include <benchmark/benchmark.h>
4073+#include "../common/utils_custom_type.hpp"
4074+#include "../common/utils_data_generation.hpp"
4075+#include "../common/utils_device_ptr.hpp"
4076
4077 // HIP API
4078 #include <hip/hip_runtime.h>
4079@@ -34,325 +33,244 @@
4080 #include <rocprim/block/block_load_func.hpp>
4081 #include <rocprim/block/block_radix_sort.hpp>
4082 #include <rocprim/block/block_store_func.hpp>
4083+#include <rocprim/config.hpp>
4084+#include <rocprim/types.hpp>
4085
4086-#include <iostream>
4087-#include <limits>
4088+#include <cstddef>
4089+#include <stdint.h>
4090 #include <string>
4091 #include <type_traits>
4092 #include <vector>
4093
4094-#include <cstdio>
4095-#include <cstdlib>
4096-
4097-#ifndef DEFAULT_N
4098-const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4;
4099-#endif
4100-
4101 enum class benchmark_kinds
4102 {
4103 sort_keys,
4104 sort_pairs
4105 };
4106
4107-namespace rp = rocprim;
4108-
4109-template<class T>
4110-using select_decomposer_t = std::
4111- conditional_t<is_custom_type<T>::value, custom_type_decomposer<T>, rp::identity_decomposer>;
4112+template<typename T>
4113+using select_decomposer_t = std::conditional_t<common::is_custom_type<T>::value,
4114+ custom_type_decomposer<T>,
4115+ rocprim::identity_decomposer>;
4116
4117-template<class T,
4118+template<typename T,
4119 unsigned int BlockSize,
4120 unsigned int RadixBitsPerPass,
4121 unsigned int ItemsPerThread,
4122 unsigned int Trials>
4123-__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output)
4124+__global__ __launch_bounds__(BlockSize)
4125+void sort_keys_kernel(const T* input, T* output)
4126 {
4127- const unsigned int lid = threadIdx.x;
4128+ const unsigned int lid = threadIdx.x;
4129 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
4130
4131 T keys[ItemsPerThread];
4132- rp::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys);
4133+ rocprim::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys);
4134
4135 ROCPRIM_NO_UNROLL
4136- for(unsigned int trial = 0; trial < Trials; trial++)
4137+ for(unsigned int trial = 0; trial < Trials; ++trial)
4138 {
4139- rp::block_radix_sort<T,
4140- BlockSize,
4141- ItemsPerThread,
4142- rocprim::empty_type,
4143- 1,
4144- 1,
4145- RadixBitsPerPass>
4146+ rocprim::block_radix_sort<T,
4147+ BlockSize,
4148+ ItemsPerThread,
4149+ rocprim::empty_type,
4150+ 1,
4151+ 1,
4152+ RadixBitsPerPass>
4153 sort;
4154 sort.sort(keys, 0, sizeof(T) * 8, select_decomposer_t<T>{});
4155 }
4156
4157- rp::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys);
4158+ rocprim::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys);
4159 }
4160
4161-template<class T,
4162+template<typename T,
4163 unsigned int BlockSize,
4164 unsigned int RadixBitsPerPass,
4165 unsigned int ItemsPerThread,
4166 unsigned int Trials>
4167-__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output)
4168+__global__ __launch_bounds__(BlockSize)
4169+void sort_pairs_kernel(const T* input, T* output)
4170 {
4171- const unsigned int lid = threadIdx.x;
4172+ const unsigned int lid = threadIdx.x;
4173 const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
4174
4175 T keys[ItemsPerThread];
4176 T values[ItemsPerThread];
4177- rp::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys);
4178- for(unsigned int i = 0; i < ItemsPerThread; i++)
4179+ rocprim::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys);
4180+ for(unsigned int i = 0; i < ItemsPerThread; ++i)
4181 {
4182 values[i] = keys[i] + T(1);
4183 }
4184
4185 ROCPRIM_NO_UNROLL
4186- for(unsigned int trial = 0; trial < Trials; trial++)
4187+ for(unsigned int trial = 0; trial < Trials; ++trial)
4188 {
4189- rp::block_radix_sort<T, BlockSize, ItemsPerThread, T, 1, 1, RadixBitsPerPass> sort;
4190+ rocprim::block_radix_sort<T, BlockSize, ItemsPerThread, T, 1, 1, RadixBitsPerPass> sort;
4191 sort.sort(keys, values, 0, sizeof(T) * 8, select_decomposer_t<T>{});
4192 }
4193
4194- for(unsigned int i = 0; i < ItemsPerThread; i++)
4195+ for(unsigned int i = 0; i < ItemsPerThread; ++i)
4196 {
4197 keys[i] += values[i];
4198 }
4199- rp::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys);
4200+ rocprim::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys);
4201 }
4202
4203-template<class T,
4204- unsigned int BlockSize,
4205- unsigned int RadixBitsPerPass,
4206- unsigned int ItemsPerThread,
4207- unsigned int Trials = 10>
4208-void run_benchmark(benchmark::State& state,
4209- benchmark_kinds benchmark_kind,
4210- size_t bytes,
4211- const managed_seed& seed,
4212- hipStream_t stream)
4213+template<typename T,
4214+ benchmark_kinds BenchmarkKind,
4215+ unsigned int BlockSize,
4216+ unsigned int RadixBitsPerPass,
4217+ unsigned int ItemsPerThread,
4218+ unsigned int Trials = 10>
4219+void run_benchmark(benchmark_utils::state&& state)
4220 {
4221+ const auto& bytes = state.bytes;
4222+ const auto& seed = state.seed;
4223+ const auto& stream = state.stream;
4224+
4225 // Calculate the number of elements N
4226 size_t N = bytes / sizeof(T);
4227-
4228+
4229 constexpr auto items_per_block = BlockSize * ItemsPerThread;
4230- const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
4231+ const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block);
4232
4233 std::vector<T> input = get_random_data<T>(size,
4234- generate_limits<T>::min(),
4235- generate_limits<T>::max(),
4236+ common::generate_limits<T>::min(),
4237+ common::generate_limits<T>::max(),
4238 seed.get_0());
4239
4240- T* d_input;
4241- T * d_output;
4242- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
4243- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T)));
4244- HIP_CHECK(
4245- hipMemcpy(
4246- d_input, input.data(),
4247- size * sizeof(T),
4248- hipMemcpyHostToDevice
4249- )
4250- );
4251+ common::device_ptr<T> d_input(input);
4252+ common::device_ptr<T> d_output(size);
4253 HIP_CHECK(hipDeviceSynchronize());
4254
4255- // HIP events creation
4256- hipEvent_t start, stop;
4257- HIP_CHECK(hipEventCreate(&start));
4258- HIP_CHECK(hipEventCreate(&stop));
4259-
4260- for(auto _ : state)
4261- {
4262- // Record start event
4263- HIP_CHECK(hipEventRecord(start, stream));
4264-
4265- if(benchmark_kind == benchmark_kinds::sort_keys)
4266- {
4267- hipLaunchKernelGGL(
4268- HIP_KERNEL_NAME(
4269- sort_keys_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials>),
4270- dim3(size / items_per_block),
4271- dim3(BlockSize),
4272- 0,
4273- stream,
4274- d_input,
4275- d_output);
4276- }
4277- else if(benchmark_kind == benchmark_kinds::sort_pairs)
4278+ state.run(
4279+ [&]
4280 {
4281- hipLaunchKernelGGL(
4282- HIP_KERNEL_NAME(
4283- sort_pairs_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials>),
4284- dim3(size / items_per_block),
4285- dim3(BlockSize),
4286- 0,
4287- stream,
4288- d_input,
4289- d_output);
4290- }
4291- HIP_CHECK(hipGetLastError());
4292-
4293- // Record stop event and wait until it completes
4294- HIP_CHECK(hipEventRecord(stop, stream));
4295- HIP_CHECK(hipEventSynchronize(stop));
4296-
4297- float elapsed_mseconds;
4298- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
4299- state.SetIterationTime(elapsed_mseconds / 1000);
4300- }
4301-
4302- // Destroy HIP events
4303- HIP_CHECK(hipEventDestroy(start));
4304- HIP_CHECK(hipEventDestroy(stop));
4305-
4306- state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
4307- state.SetItemsProcessed(state.iterations() * Trials * size);
4308-
4309- HIP_CHECK(hipFree(d_input));
4310- HIP_CHECK(hipFree(d_output));
4311+ if constexpr(BenchmarkKind == benchmark_kinds::sort_keys)
4312+ {
4313+ sort_keys_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials>
4314+ <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(),
4315+ d_output.get());
4316+ }
4317+ else if constexpr(BenchmarkKind == benchmark_kinds::sort_pairs)
4318+ {
4319+ sort_pairs_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials>
4320+ <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(),
4321+ d_output.get());
4322+ }
4323+ HIP_CHECK(hipGetLastError());
4324+ });
4325+
4326+ state.set_throughput(size * Trials, sizeof(T));
4327 }
4328
4329 #define CREATE_BENCHMARK(T, BS, RB, IPT) \
4330- benchmark::RegisterBenchmark( \
4331+ executor.queue_fn( \
4332 bench_naming::format_name("{lvl:block,algo:radix_sort,key_type:" #T ",subalgo:" + name \
4333 + ",cfg:{bs:" #BS ",rb:" #RB ",ipt:" #IPT "}}") \
4334 .c_str(), \
4335- run_benchmark<T, BS, RB, IPT>, \
4336- benchmark_kind, \
4337- bytes, \
4338- seed, \
4339- stream)
4340-
4341-#define BENCHMARK_TYPE(type, block, radix_bits) \
4342- CREATE_BENCHMARK(type, block, radix_bits, 1), CREATE_BENCHMARK(type, block, radix_bits, 2), \
4343- CREATE_BENCHMARK(type, block, radix_bits, 3), \
4344- CREATE_BENCHMARK(type, block, radix_bits, 4), CREATE_BENCHMARK(type, block, radix_bits, 8)
4345-
4346-void add_benchmarks(benchmark_kinds benchmark_kind,
4347- const std::string& name,
4348- std::vector<benchmark::internal::Benchmark*>& benchmarks,
4349- size_t bytes,
4350- const managed_seed& seed,
4351- hipStream_t stream)
4352+ run_benchmark<T, BenchmarkKind, BS, RB, IPT>);
4353+
4354+#define BENCHMARK_TYPE(type, block, radix_bits) \
4355+ CREATE_BENCHMARK(type, block, radix_bits, 1) \
4356+ CREATE_BENCHMARK(type, block, radix_bits, 2) \
4357+ CREATE_BENCHMARK(type, block, radix_bits, 3) \
4358+ CREATE_BENCHMARK(type, block, radix_bits, 4) \
4359+ CREATE_BENCHMARK(type, block, radix_bits, 8)
4360+
4361+template<benchmark_kinds BenchmarkKind>
4362+void add_benchmarks(const std::string& name, benchmark_utils::executor& executor)
4363 {
4364- using custom_int_type = custom_type<int, int>;
4365-
4366- std::vector<benchmark::internal::Benchmark*> bs = {
4367- BENCHMARK_TYPE(int, 64, 3),
4368- BENCHMARK_TYPE(int, 512, 3),
4369-
4370- BENCHMARK_TYPE(int, 64, 4),
4371- BENCHMARK_TYPE(int, 128, 4),
4372- BENCHMARK_TYPE(int, 192, 4),
4373- BENCHMARK_TYPE(int, 256, 4),
4374- BENCHMARK_TYPE(int, 320, 4),
4375- BENCHMARK_TYPE(int, 512, 4),
4376-
4377- BENCHMARK_TYPE(int8_t, 64, 3),
4378- BENCHMARK_TYPE(int8_t, 512, 3),
4379-
4380- BENCHMARK_TYPE(int8_t, 64, 4),
4381- BENCHMARK_TYPE(int8_t, 128, 4),
4382- BENCHMARK_TYPE(int8_t, 192, 4),
4383- BENCHMARK_TYPE(int8_t, 256, 4),
4384- BENCHMARK_TYPE(int8_t, 320, 4),
4385- BENCHMARK_TYPE(int8_t, 512, 4),
4386-
4387- BENCHMARK_TYPE(uint8_t, 64, 3),
4388- BENCHMARK_TYPE(uint8_t, 512, 3),
4389-
4390- BENCHMARK_TYPE(uint8_t, 64, 4),
4391- BENCHMARK_TYPE(uint8_t, 128, 4),
4392- BENCHMARK_TYPE(uint8_t, 192, 4),
4393- BENCHMARK_TYPE(uint8_t, 256, 4),
4394- BENCHMARK_TYPE(uint8_t, 320, 4),
4395- BENCHMARK_TYPE(uint8_t, 512, 4),
4396-
4397- BENCHMARK_TYPE(rocprim::half, 64, 3),
4398- BENCHMARK_TYPE(rocprim::half, 512, 3),
4399-
4400- BENCHMARK_TYPE(rocprim::half, 64, 4),
4401- BENCHMARK_TYPE(rocprim::half, 128, 4),
4402- BENCHMARK_TYPE(rocprim::half, 192, 4),
4403- BENCHMARK_TYPE(rocprim::half, 256, 4),
4404- BENCHMARK_TYPE(rocprim::half, 320, 4),
4405- BENCHMARK_TYPE(rocprim::half, 512, 4),
4406-
4407- BENCHMARK_TYPE(long long, 64, 3),
4408- BENCHMARK_TYPE(long long, 512, 3),
4409-
4410- BENCHMARK_TYPE(long long, 64, 4),
4411- BENCHMARK_TYPE(long long, 128, 4),
4412- BENCHMARK_TYPE(long long, 192, 4),
4413- BENCHMARK_TYPE(long long, 256, 4),
4414- BENCHMARK_TYPE(long long, 320, 4),
4415- BENCHMARK_TYPE(long long, 512, 4),
4416-
4417- BENCHMARK_TYPE(custom_int_type, 64, 3),
4418- BENCHMARK_TYPE(custom_int_type, 512, 3),
4419-
4420- BENCHMARK_TYPE(custom_int_type, 64, 4),
4421- BENCHMARK_TYPE(custom_int_type, 128, 4),
4422- BENCHMARK_TYPE(custom_int_type, 192, 4),
4423- BENCHMARK_TYPE(custom_int_type, 256, 4),
4424- BENCHMARK_TYPE(custom_int_type, 320, 4),
4425- BENCHMARK_TYPE(custom_int_type, 512, 4),
4426- };
4427-
4428- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
4429+ using custom_int_type = common::custom_type<int, int>;
4430+
4431+ BENCHMARK_TYPE(int, 64, 3)
4432+ BENCHMARK_TYPE(int, 512, 3)
4433+
4434+ BENCHMARK_TYPE(int, 64, 4)
4435+ BENCHMARK_TYPE(int, 128, 4)
4436+ BENCHMARK_TYPE(int, 192, 4)
4437+ BENCHMARK_TYPE(int, 256, 4)
4438+ BENCHMARK_TYPE(int, 320, 4)
4439+ BENCHMARK_TYPE(int, 512, 4)
4440+
4441+ BENCHMARK_TYPE(int8_t, 64, 3)
4442+ BENCHMARK_TYPE(int8_t, 512, 3)
4443+
4444+ BENCHMARK_TYPE(int8_t, 64, 4)
4445+ BENCHMARK_TYPE(int8_t, 128, 4)
4446+ BENCHMARK_TYPE(int8_t, 192, 4)
4447+ BENCHMARK_TYPE(int8_t, 256, 4)
4448+ BENCHMARK_TYPE(int8_t, 320, 4)
4449+ BENCHMARK_TYPE(int8_t, 512, 4)
4450+
4451+ BENCHMARK_TYPE(uint8_t, 64, 3)
4452+ BENCHMARK_TYPE(uint8_t, 512, 3)
4453+
4454+ BENCHMARK_TYPE(uint8_t, 64, 4)
4455+ BENCHMARK_TYPE(uint8_t, 128, 4)
4456+ BENCHMARK_TYPE(uint8_t, 192, 4)
4457+ BENCHMARK_TYPE(uint8_t, 256, 4)
4458+ BENCHMARK_TYPE(uint8_t, 320, 4)
4459+ BENCHMARK_TYPE(uint8_t, 512, 4)
4460+
4461+ BENCHMARK_TYPE(rocprim::half, 64, 3)
4462+ BENCHMARK_TYPE(rocprim::half, 512, 3)
4463+
4464+ BENCHMARK_TYPE(rocprim::half, 64, 4)
4465+ BENCHMARK_TYPE(rocprim::half, 128, 4)
4466+ BENCHMARK_TYPE(rocprim::half, 192, 4)
4467+ BENCHMARK_TYPE(rocprim::half, 256, 4)
4468+ BENCHMARK_TYPE(rocprim::half, 320, 4)
4469+ BENCHMARK_TYPE(rocprim::half, 512, 4)
4470+
4471+ BENCHMARK_TYPE(long long, 64, 3)
4472+ BENCHMARK_TYPE(long long, 512, 3)
4473+
4474+ BENCHMARK_TYPE(long long, 64, 4)
4475+ BENCHMARK_TYPE(long long, 128, 4)
4476+ BENCHMARK_TYPE(long long, 192, 4)
4477+ BENCHMARK_TYPE(long long, 256, 4)
4478+ BENCHMARK_TYPE(long long, 320, 4)
4479+ BENCHMARK_TYPE(long long, 512, 4)
4480+
4481+ BENCHMARK_TYPE(custom_int_type, 64, 3)
4482+ BENCHMARK_TYPE(custom_int_type, 512, 3)
4483+
4484+ BENCHMARK_TYPE(custom_int_type, 64, 4)
4485+ BENCHMARK_TYPE(custom_int_type, 128, 4)
4486+ BENCHMARK_TYPE(custom_int_type, 192, 4)
4487+ BENCHMARK_TYPE(custom_int_type, 256, 4)
4488+ BENCHMARK_TYPE(custom_int_type, 320, 4)
4489+ BENCHMARK_TYPE(custom_int_type, 512, 4)
4490+
4491+ BENCHMARK_TYPE(rocprim::int128_t, 64, 3)
4492+ BENCHMARK_TYPE(rocprim::int128_t, 512, 3)
4493+
4494+ BENCHMARK_TYPE(rocprim::int128_t, 64, 4)
4495+ BENCHMARK_TYPE(rocprim::int128_t, 128, 4)
4496+ BENCHMARK_TYPE(rocprim::int128_t, 192, 4)
4497+ BENCHMARK_TYPE(rocprim::int128_t, 256, 4)
4498+ BENCHMARK_TYPE(rocprim::int128_t, 320, 4)
4499+ BENCHMARK_TYPE(rocprim::int128_t, 512, 4)
4500+
4501+ BENCHMARK_TYPE(rocprim::uint128_t, 64, 3)
4502+ BENCHMARK_TYPE(rocprim::uint128_t, 512, 3)
4503+
4504+ BENCHMARK_TYPE(rocprim::uint128_t, 64, 4)
4505+ BENCHMARK_TYPE(rocprim::uint128_t, 128, 4)
4506+ BENCHMARK_TYPE(rocprim::uint128_t, 192, 4)
4507+ BENCHMARK_TYPE(rocprim::uint128_t, 256, 4)
4508+ BENCHMARK_TYPE(rocprim::uint128_t, 320, 4)
4509+ BENCHMARK_TYPE(rocprim::uint128_t, 512, 4)
4510 }
4511
4512-int main(int argc, char *argv[])
4513+int main(int argc, char* argv[])
4514 {
4515- cli::Parser parser(argc, argv);
4516- parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
4517- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
4518- parser.set_optional<std::string>("name_format",
4519- "name_format",
4520- "human",
4521- "either: json,human,txt");
4522- parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
4523- parser.run_and_exit_if_error();
4524-
4525- // Parse argv
4526- benchmark::Initialize(&argc, argv);
4527- const size_t bytes = parser.get<size_t>("size");
4528- const int trials = parser.get<int>("trials");
4529- bench_naming::set_format(parser.get<std::string>("name_format"));
4530- const std::string seed_type = parser.get<std::string>("seed");
4531- const managed_seed seed(seed_type);
4532-
4533- // HIP
4534- hipStream_t stream = 0; // default
4535-
4536- // Benchmark info
4537- add_common_benchmark_info();
4538- benchmark::AddCustomContext("bytes", std::to_string(bytes));
4539- benchmark::AddCustomContext("seed", seed_type);
4540-
4541- // Add benchmarks
4542- std::vector<benchmark::internal::Benchmark*> benchmarks;
4543- add_benchmarks(benchmark_kinds::sort_keys, "keys", benchmarks, bytes, seed, stream);
4544- add_benchmarks(benchmark_kinds::sort_pairs, "pairs", benchmarks, bytes, seed, stream);
4545-
4546- // Use manual timing
4547- for(auto& b : benchmarks)
4548- {
4549- b->UseManualTime();
4550- b->Unit(benchmark::kMillisecond);
4551- }
4552+ benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0);
4553
4554- // Force number of iterations
4555- if(trials > 0)
4556- {
4557- for(auto& b : benchmarks)
4558- {
4559- b->Iterations(trials);
4560- }
4561- }
4562+ add_benchmarks<benchmark_kinds::sort_keys>("keys", executor);
4563+ add_benchmarks<benchmark_kinds::sort_pairs>("pairs", executor);
4564
4565- // Run benchmarks
4566- benchmark::RunSpecifiedBenchmarks();
4567- return 0;
4568+ executor.run();
4569 }
4570diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp
4571index 2571654..27678d0 100644
4572--- a/benchmark/benchmark_block_reduce.cpp
4573+++ b/benchmark/benchmark_block_reduce.cpp
4574@@ -1,6 +1,6 @@
4575 // MIT License
4576 //
4577-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
4578+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
4579 //
4580 // Permission is hereby granted, free of charge, to any person obtaining a copy
4581 // of this software and associated documentation files (the "Software"), to deal
4582@@ -21,41 +21,29 @@
4583 // SOFTWARE.
4584
4585 #include "benchmark_utils.hpp"
4586-// CmdParser
4587-#include "cmdparser.hpp"
4588
4589-// Google Benchmark
4590-#include <benchmark/benchmark.h>
4591+#include "../common/utils_custom_type.hpp"
4592+#include "../common/utils_device_ptr.hpp"
4593
4594 // HIP API
4595 #include <hip/hip_runtime.h>
4596
4597 // rocPRIM
4598 #include <rocprim/block/block_reduce.hpp>
4599+#include <rocprim/config.hpp>
4600+#include <rocprim/types.hpp>
4601
4602-#include <iostream>
4603-#include <limits>
4604+#include <cstddef>
4605+#include <stdint.h>
4606 #include <string>
4607 #include <vector>
4608
4609-#include <cstdio>
4610-#include <cstdlib>
4611-
4612-#ifndef DEFAULT_N
4613-const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
4614-#endif
4615-
4616-namespace rp = rocprim;
4617-
4618-template<
4619- class Runner,
4620- class T,
4621- unsigned int BlockSize,
4622- unsigned int ItemsPerThread,
4623- unsigned int Trials
4624->
4625-__global__
4626-__launch_bounds__(BlockSize)
4627+template<typename Runner,
4628+ typename T,
4629+ unsigned int BlockSize,
4630+ unsigned int ItemsPerThread,
4631+ unsigned int Trials>
4632+__global__ __launch_bounds__(BlockSize)
4633 void kernel(const T* input, T* output)
4634 {
4635 Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
4636@@ -64,12 +52,7 @@ void kernel(const T* input, T* output)
4637 template<rocprim::block_reduce_algorithm algorithm>
4638 struct reduce
4639 {
4640- template<
4641- class T,
4642- unsigned int BlockSize,
4643- unsigned int ItemsPerThread,
4644- unsigned int Trials
4645- >
4646+ template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
4647 __device__
4648 static void run(const T* input, T* output)
4649 {
4650@@ -77,16 +60,16 @@ struct reduce
4651
4652 T values[ItemsPerThread];
4653 T reduced_value;
4654- for(unsigned int k = 0; k < ItemsPerThread; k++)
4655+ for(unsigned int k = 0; k < ItemsPerThread; ++k)
4656 {
4657 values[k] = input[i * ItemsPerThread + k];
4658 }
4659
4660- using breduce_t = rp::block_reduce<T, BlockSize, algorithm>;
4661+ using breduce_t = rocprim::block_reduce<T, BlockSize, algorithm>;
4662 __shared__ typename breduce_t::storage_type storage;
4663
4664 ROCPRIM_NO_UNROLL
4665- for(unsigned int trial = 0; trial < Trials; trial++)
4666+ for(unsigned int trial = 0; trial < Trials; ++trial)
4667 {
4668 breduce_t().reduce(values, reduced_value, storage);
4669 values[0] = reduced_value;
4670@@ -99,193 +82,113 @@ struct reduce
4671 }
4672 };
4673
4674-template<
4675- class Benchmark,
4676- class T,
4677- unsigned int BlockSize,
4678- unsigned int ItemsPerThread,
4679- unsigned int Trials = 100
4680->
4681-void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes)
4682+template<typename Benchmark,
4683+ typename T,
4684+ unsigned int BlockSize,
4685+ unsigned int ItemsPerThread,
4686+ unsigned int Trials = 100>
4687+void run_benchmark(benchmark_utils::state&& state)
4688 {
4689+ const auto& bytes = state.bytes;
4690+ const auto& stream = state.stream;
4691+
4692 // Calculate the number of elements N
4693 size_t N = bytes / sizeof(T);
4694 // Make sure size is a multiple of BlockSize
4695 constexpr auto items_per_block = BlockSize * ItemsPerThread;
4696- const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
4697+ const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block);
4698 // Allocate and fill memory
4699 std::vector<T> input(size, T(1));
4700- T * d_input;
4701- T * d_output;
4702- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
4703- HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T)));
4704- HIP_CHECK(
4705- hipMemcpy(
4706- d_input, input.data(),
4707- size * sizeof(T),
4708- hipMemcpyHostToDevice
4709- )
4710- );
4711+ common::device_ptr<T> d_input(input);
4712+ common::device_ptr<T> d_output(size);
4713 HIP_CHECK(hipDeviceSynchronize());
4714
4715- // HIP events creation
4716- hipEvent_t start, stop;
4717- HIP_CHECK(hipEventCreate(&start));
4718- HIP_CHECK(hipEventCreate(&stop));
4719-
4720- for (auto _ : state)
4721- {
4722- // Record start event
4723- HIP_CHECK(hipEventRecord(start, stream));
4724-
4725- hipLaunchKernelGGL(
4726- HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
4727- dim3(size/items_per_block), dim3(BlockSize), 0, stream,
4728- d_input, d_output
4729- );
4730- HIP_CHECK(hipGetLastError());
4731-
4732- // Record stop event and wait until it completes
4733- HIP_CHECK(hipEventRecord(stop, stream));
4734- HIP_CHECK(hipEventSynchronize(stop));
4735-
4736- float elapsed_mseconds;
4737- HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
4738- state.SetIterationTime(elapsed_mseconds / 1000);
4739- }
4740-
4741- // Destroy HIP events
4742- HIP_CHECK(hipEventDestroy(start));
4743- HIP_CHECK(hipEventDestroy(stop));
4744-
4745- state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials);
4746- state.SetItemsProcessed(state.iterations() * size * Trials);
4747+ state.run(
4748+ [&]
4749+ {
4750+ kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>
4751+ <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(),
4752+ d_output.get());
4753+ HIP_CHECK(hipGetLastError());
4754+ });
4755
4756- HIP_CHECK(hipFree(d_input));
4757- HIP_CHECK(hipFree(d_output));
4758+ state.set_throughput(size * Trials, sizeof(T));
4759 }
4760
4761-// IPT - items per thread
4762-#define CREATE_BENCHMARK(T, BS, IPT) \
4763- benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \
4764- ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \
4765- + method_name + "}}") \
4766- .c_str(), \
4767- run_benchmark<Benchmark, T, BS, IPT>, \
4768- stream, \
4769- bytes)
4770-
4771-#define BENCHMARK_TYPE(type, block) \
4772- CREATE_BENCHMARK(type, block, 1), \
4773- CREATE_BENCHMARK(type, block, 2), \
4774- CREATE_BENCHMARK(type, block, 3), \
4775- CREATE_BENCHMARK(type, block, 4), \
4776- CREATE_BENCHMARK(type, block, 8), \
4777- CREATE_BENCHMARK(type, block, 11), \
4778+#define CREATE_BENCHMARK(T, BS, IPT) \
4779+ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \
4780+ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \
4781+ + name + "}}") \
4782+ .c_str(), \
4783+ run_benchmark<Benchmark, T, BS, IPT>);
4784+
4785+#define BENCHMARK_TYPE(type, block) \
4786+ CREATE_BENCHMARK(type, block, 1) \
4787+ CREATE_BENCHMARK(type, block, 2) \
4788+ CREATE_BENCHMARK(type, block, 3) \
4789+ CREATE_BENCHMARK(type, block, 4) \
4790+ CREATE_BENCHMARK(type, block, 8) \
4791+ CREATE_BENCHMARK(type, block, 11) \
4792 CREATE_BENCHMARK(type, block, 16)
4793
4794-template<class Benchmark>
4795-void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
4796- const std::string& method_name,
4797- hipStream_t stream,
4798- size_t bytes)
4799+template<typename Benchmark>
4800+void add_benchmarks(const std::string& name, benchmark_utils::executor& executor)
4801 {
4802- using custom_float2 = custom_type<float, float>;
4803- using custom_double2 = custom_type<double, double>;
4804-
4805- std::vector<benchmark::internal::Benchmark*> new_benchmarks =
4806- {
4807- // When block size is less than or equal to warp size
4808- BENCHMARK_TYPE(int, 64),
4809- BENCHMARK_TYPE(float, 64),
4810- BENCHMARK_TYPE(double, 64),
4811- BENCHMARK_TYPE(int8_t, 64),
4812- BENCHMARK_TYPE(uint8_t, 64),
4813- BENCHMARK_TYPE(rocprim::half, 64),
4814-
4815- BENCHMARK_TYPE(int, 256),
4816- BENCHMARK_TYPE(float, 256),
4817- BENCHMARK_TYPE(double, 256),
4818- BENCHMARK_TYPE(int8_t, 256),
4819- BENCHMARK_TYPE(uint8_t, 256),
4820- BENCHMARK_TYPE(rocprim::half, 256),
4821-
4822- CREATE_BENCHMARK(custom_float2, 256, 1),
4823- CREATE_BENCHMARK(custom_float2, 256, 4),
4824- CREATE_BENCHMARK(custom_float2, 256, 8),
4825-
4826- CREATE_BENCHMARK(float2, 256, 1),
4827- CREATE_BENCHMARK(float2, 256, 4),
4828- CREATE_BENCHMARK(float2, 256, 8),
4829-
4830- CREATE_BENCHMARK(custom_double2, 256, 1),
4831- CREATE_BENCHMARK(custom_double2, 256, 4),
4832- CREATE_BENCHMARK(custom_double2, 256, 8),
4833-
4834- CREATE_BENCHMARK(double2, 256, 1),
4835- CREATE_BENCHMARK(double2, 256, 4),
4836- CREATE_BENCHMARK(double2, 256, 8),
4837-
4838- CREATE_BENCHMARK(float4, 256, 1),
4839- CREATE_BENCHMARK(float4, 256, 4),
4840- CREATE_BENCHMARK(float4, 256, 8),
4841- };
4842- benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
4843+ using custom_float2 = common::custom_type<float, float>;
4844+ using custom_double2 = common::custom_type<double, double>;
4845+
4846+ // When block size is less than or equal to warp size
4847+ BENCHMARK_TYPE(int, 64)
4848+ BENCHMARK_TYPE(float, 64)
4849+ BENCHMARK_TYPE(double, 64)
4850+ BENCHMARK_TYPE(int8_t, 64)
4851+ BENCHMARK_TYPE(uint8_t, 64)
4852+ BENCHMARK_TYPE(rocprim::half, 64)
4853+ BENCHMARK_TYPE(rocprim::int128_t, 64)
4854+ BENCHMARK_TYPE(rocprim::uint128_t, 64)
4855+
4856+ BENCHMARK_TYPE(int, 256)
4857+ BENCHMARK_TYPE(float, 256)
4858+ BENCHMARK_TYPE(double, 256)
4859+ BENCHMARK_TYPE(int8_t, 256)
4860+ BENCHMARK_TYPE(uint8_t, 256)
4861+ BENCHMARK_TYPE(rocprim::half, 256)
4862+ BENCHMARK_TYPE(rocprim::int128_t, 256)
4863+ BENCHMARK_TYPE(rocprim::uint128_t, 256)
4864+
4865+ CREATE_BENCHMARK(custom_float2, 256, 1)
4866+ CREATE_BENCHMARK(custom_float2, 256, 4)
4867+ CREATE_BENCHMARK(custom_float2, 256, 8)
4868+
4869+ CREATE_BENCHMARK(float2, 256, 1)
4870+ CREATE_BENCHMARK(float2, 256, 4)
4871+ CREATE_BENCHMARK(float2, 256, 8)
4872+
4873+ CREATE_BENCHMARK(custom_double2, 256, 1)
4874+ CREATE_BENCHMARK(custom_double2, 256, 4)
4875+ CREATE_BENCHMARK(custom_double2, 256, 8)
4876+
4877+ CREATE_BENCHMARK(double2, 256, 1)
4878+ CREATE_BENCHMARK(double2, 256, 4)
4879+ CREATE_BENCHMARK(double2, 256, 8)
4880+
4881+ CREATE_BENCHMARK(float4, 256, 1)
4882+ CREATE_BENCHMARK(float4, 256, 4)
4883+ CREATE_BENCHMARK(float4, 256, 8)
4884 }
4885
4886-int main(int argc, char *argv[])
4887+int main(int argc, char* argv[])
4888 {
4889- cli::Parser parser(argc, argv);
4890- parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
4891- parser.set_optional<int>("trials", "trials", -1, "number of iterations");
4892- parser.set_optional<std::string>("name_format",
4893- "name_format",
4894- "human",
4895- "either: json,human,txt");
4896- parser.run_and_exit_if_error();
4897-
4898- // Parse argv
4899- benchmark::Initialize(&argc, argv);
4900- const size_t bytes = parser.get<size_t>("size");
4901- const int trials = parser.get<int>("trials");
4902- bench_naming::set_format(parser.get<std::string>("name_format"));
4903+ benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0);
4904
4905- // HIP
4906- hipStream_t stream = 0; // default
4907-
4908- // Benchmark info
4909- add_common_benchmark_info();
4910- benchmark::AddCustomContext("bytes", std::to_string(bytes));
4911-
4912- // Add benchmarks
4913- std::vector<benchmark::internal::Benchmark*> benchmarks;
4914- // using_warp_scan
4915 using reduce_uwr_t = reduce<rocprim::block_reduce_algorithm::using_warp_reduce>;
4916- add_benchmarks<reduce_uwr_t>(benchmarks, "using_warp_reduce", stream, bytes);
4917- // reduce then scan
4918- using reduce_rr_t = reduce<rocprim::block_reduce_algorithm::raking_reduce>;
4919- add_benchmarks<reduce_rr_t>(benchmarks, "raking_reduce", stream, bytes);
4920- // reduce commutative only
4921- using reduce_rrco_t = reduce<rocprim::block_reduce_algorithm::raking_reduce_commutative_only>;
4922- add_benchmarks<reduce_rrco_t>(benchmarks, "raking_reduce_commutative_only", stream, bytes);
4923+ add_benchmarks<reduce_uwr_t>("using_warp_reduce", executor);
4924
4925- // Use manual timing
4926- for(auto& b : benchmarks)
4927- {
4928- b->UseManualTime();
4929- b->Unit(benchmark::kMillisecond);
4930- }
4931+ using reduce_rr_t = reduce<rocprim::block_reduce_algorithm::raking_reduce>;
4932+ add_benchmarks<reduce_rr_t>("raking_reduce", executor);
4933
4934- // Force number of iterations
4935- if(trials > 0)
4936- {
4937- for(auto& b : benchmarks)
4938- {
4939- b->Iterations(trials);
4940- }
4941- }
4942+ using reduce_rrco_t = reduce<rocprim::block_reduce_algorithm::raking_reduce_commutative_only>;
4943+ add_benchmarks<reduce_rrco_t>("raking_reduce_commutative_only", executor);
4944
4945- // Run benchmarks
4946- benchmark::RunSpecifiedBenchmarks();
4947- return 0;
4948+ executor.run();
4949 }
4950diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp
4951index e56bf09..3b20d0f 100644
4952--- a/benchmark/benchmark_block_run_length_decode.cpp
4953+++ b/benchmark/benchmark_block_run_length_decode.cpp
4954@@ -1,6 +1,6 @@
4955 // MIT License
4956 //
4957-// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
4958+// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
4959 //
4960 // Permission is hereby granted, free of charge, to any person obtaining a copy
4961 // of this software and associated documentation files (the "Software"), to deal
4962@@ -21,32 +21,33 @@
4963 // SOFTWARE.
4964
4965 #include "benchmark_utils.hpp"
4966-#include "cmdparser.hpp"
4967
4968-#include <benchmark/benchmark.h>
4969+#include "../common/utils_data_generation.hpp"
4970+#include "../common/utils_device_ptr.hpp"
4971
4972 #include <rocprim/block/block_load_func.hpp>
4973 #include <rocprim/block/block_run_length_decode.hpp>
4974 #include <rocprim/block/block_store_func.hpp>
4975+#include <rocprim/type_traits.hpp>
4976+#include <rocprim/types.hpp>
4977
4978-#include <random>
4979+#include <chrono>
4980+#include <cstddef>
4981+#include <string>
4982+#include <type_traits>
4983 #include <vector>
4984
4985-#ifndef DEFAULT_N
4986-const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
4987-#endif
4988-
4989-template<class ItemT,
4990- class OffsetT,
4991+template<typename ItemT,
4992+ typename OffsetT,
4993 unsigned BlockSize,
4994 unsigned RunsPerThread,
4995 unsigned DecodedItemsPerThread,
4996 unsigned Trials>
4997-__global__
4998- __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items,
4999- const OffsetT* d_run_offsets,
5000- ItemT* d_decoded_items,
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches