Merge ~bullwinkle-team/ubuntu/+source/rocprim:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocprim:ubuntu/devel
- Git
- lp:~bullwinkle-team/ubuntu/+source/rocprim
- bullwinkle/llvm-21/ubuntu/devel
- Merge into ubuntu/devel
| Status: | Merged | ||||
|---|---|---|---|---|---|
| Approved by: | Andreas Hasenack | ||||
| Approved revision: | 634c24c39d56d3d7c8f9b821d0685cd2f1711606 | ||||
| Merged at revision: | 634c24c39d56d3d7c8f9b821d0685cd2f1711606 | ||||
| Proposed branch: | ~bullwinkle-team/ubuntu/+source/rocprim:bullwinkle/llvm-21/ubuntu/devel | ||||
| Merge into: | ubuntu/+source/rocprim:ubuntu/devel | ||||
| Diff against target: |
143068 lines (+54614/-40047) 497 files modified
.github/CODEOWNERS (+1/-1) .gitlab-ci.yml (+163/-32) .gitlab/report_noise.py (+251/-0) .gitlab/run_benchmarks.py (+25/-6) CHANGELOG.md (+164/-4) CMakeLists.txt (+54/-23) CONTRIBUTING.md (+2/-0) LICENSE.md (+2/-2) README.md (+84/-54) benchmark/CMakeLists.txt (+3/-1) benchmark/ConfigAutotuneSettings.cmake (+30/-14) benchmark/benchmark_block_adjacent_difference.cpp (+155/-267) benchmark/benchmark_block_discontinuity.cpp (+125/-214) benchmark/benchmark_block_exchange.cpp (+127/-266) benchmark/benchmark_block_histogram.cpp (+99/-186) benchmark/benchmark_block_radix_rank.cpp (+81/-157) benchmark/benchmark_block_radix_sort.cpp (+169/-251) benchmark/benchmark_block_reduce.cpp (+103/-200) benchmark/benchmark_block_run_length_decode.cpp (+86/-136) benchmark/benchmark_block_scan.cpp (+121/-227) benchmark/benchmark_block_sort.cpp (+26/-94) benchmark/benchmark_block_sort.parallel.hpp (+56/-98) benchmark/benchmark_config_dispatch.cpp (+43/-81) benchmark/benchmark_device_adjacent_difference.cpp (+35/-101) benchmark/benchmark_device_adjacent_difference.parallel.cpp.in (+13/-10) benchmark/benchmark_device_adjacent_difference.parallel.hpp (+68/-176) benchmark/benchmark_device_adjacent_find.cpp (+24/-86) benchmark/benchmark_device_adjacent_find.parallel.cpp.in (+6/-3) benchmark/benchmark_device_adjacent_find.parallel.hpp (+35/-73) benchmark/benchmark_device_batch_memcpy.cpp (+228/-405) benchmark/benchmark_device_binary_search.cpp (+50/-244) benchmark/benchmark_device_binary_search.parallel.cpp.in (+9/-2) benchmark/benchmark_device_binary_search.parallel.hpp (+86/-100) benchmark/benchmark_device_find_end.cpp (+25/-78) benchmark/benchmark_device_find_end.hpp (+35/-87) benchmark/benchmark_device_find_first_of.cpp (+32/-99) benchmark/benchmark_device_find_first_of.parallel.cpp.in (+6/-2) benchmark/benchmark_device_find_first_of.parallel.hpp (+27/-57) benchmark/benchmark_device_histogram.cpp (+275/-549) benchmark/benchmark_device_histogram.parallel.cpp.in (+6/-4) benchmark/benchmark_device_histogram.parallel.hpp (+103/-117) benchmark/benchmark_device_memory.cpp (+886/-776) benchmark/benchmark_device_merge.cpp (+28/-106) benchmark/benchmark_device_merge.parallel.cpp.in (+6/-4) benchmark/benchmark_device_merge.parallel.hpp (+103/-225) benchmark/benchmark_device_merge_inplace.cpp (+201/-0) benchmark/benchmark_device_merge_sort.cpp (+33/-69) benchmark/benchmark_device_merge_sort.hpp (+53/-120) benchmark/benchmark_device_merge_sort_block_merge.cpp (+26/-88) benchmark/benchmark_device_merge_sort_block_merge.parallel.cpp.in (+6/-4) benchmark/benchmark_device_merge_sort_block_merge.parallel.hpp (+160/-231) benchmark/benchmark_device_merge_sort_block_sort.cpp (+27/-89) benchmark/benchmark_device_merge_sort_block_sort.parallel.cpp.in (+6/-4) benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp (+73/-172) benchmark/benchmark_device_nth_element.cpp (+20/-70) benchmark/benchmark_device_nth_element.hpp (+30/-79) benchmark/benchmark_device_partial_sort.cpp (+20/-69) benchmark/benchmark_device_partial_sort.hpp (+26/-60) benchmark/benchmark_device_partial_sort_copy.cpp (+17/-66) benchmark/benchmark_device_partial_sort_copy.hpp (+31/-78) benchmark/benchmark_device_partition.cpp (+109/-177) benchmark/benchmark_device_partition.parallel.cpp.in (+6/-4) benchmark/benchmark_device_partition.parallel.hpp (+145/-359) benchmark/benchmark_device_radix_sort.cpp (+40/-57) benchmark/benchmark_device_radix_sort.hpp (+78/-200) benchmark/benchmark_device_radix_sort_block_sort.cpp (+25/-87) benchmark/benchmark_device_radix_sort_block_sort.parallel.cpp.in (+6/-4) benchmark/benchmark_device_radix_sort_block_sort.parallel.hpp (+73/-178) benchmark/benchmark_device_radix_sort_onesweep.cpp (+37/-79) benchmark/benchmark_device_radix_sort_onesweep.parallel.cpp.in (+6/-4) benchmark/benchmark_device_radix_sort_onesweep.parallel.hpp (+131/-282) benchmark/benchmark_device_reduce.cpp (+23/-85) benchmark/benchmark_device_reduce.parallel.cpp.in (+9/-4) benchmark/benchmark_device_reduce.parallel.hpp (+40/-87) benchmark/benchmark_device_reduce_by_key.cpp (+5/-130) benchmark/benchmark_device_reduce_by_key.parallel.cpp.in (+5/-3) benchmark/benchmark_device_reduce_by_key.parallel.hpp (+114/-89) benchmark/benchmark_device_reduce_by_key_deterministic.cpp (+4/-109) benchmark/benchmark_device_run_length_encode.cpp (+26/-101) benchmark/benchmark_device_run_length_encode.parallel.cpp.in (+5/-3) benchmark/benchmark_device_run_length_encode.parallel.hpp (+38/-81) benchmark/benchmark_device_run_length_encode_non_trivial_runs.cpp (+31/-102) benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.cpp.in (+7/-4) benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp (+37/-80) benchmark/benchmark_device_scan.cpp (+5/-109) benchmark/benchmark_device_scan.parallel.cpp.in (+7/-4) benchmark/benchmark_device_scan.parallel.hpp (+90/-81) benchmark/benchmark_device_scan_by_key.cpp (+5/-122) benchmark/benchmark_device_scan_by_key.parallel.cpp.in (+7/-4) benchmark/benchmark_device_scan_by_key.parallel.hpp (+112/-98) benchmark/benchmark_device_scan_by_key_deterministic.cpp (+6/-102) benchmark/benchmark_device_scan_deterministic.cpp (+4/-88) benchmark/benchmark_device_search.cpp (+18/-74) benchmark/benchmark_device_search.hpp (+35/-87) benchmark/benchmark_device_search_n.cpp (+35/-45) benchmark/benchmark_device_search_n.parallel.cpp.in (+10/-6) benchmark/benchmark_device_search_n.parallel.hpp (+129/-337) benchmark/benchmark_device_segmented_radix_sort_keys.cpp (+36/-273) benchmark/benchmark_device_segmented_radix_sort_keys.parallel.cpp.in (+8/-7) benchmark/benchmark_device_segmented_radix_sort_keys.parallel.hpp (+112/-146) benchmark/benchmark_device_segmented_radix_sort_pairs.cpp (+47/-318) benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.cpp.in (+8/-7) benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.hpp (+126/-169) benchmark/benchmark_device_segmented_reduce.cpp (+37/-246) benchmark/benchmark_device_segmented_reduce.parallel.cpp.in (+32/-0) benchmark/benchmark_device_segmented_reduce.parallel.hpp (+196/-0) benchmark/benchmark_device_select.cpp (+95/-181) benchmark/benchmark_device_select.parallel.cpp.in (+6/-4) benchmark/benchmark_device_select.parallel.hpp (+157/-399) benchmark/benchmark_device_transform.cpp (+28/-90) benchmark/benchmark_device_transform.parallel.cpp.in (+10/-6) benchmark/benchmark_device_transform.parallel.hpp (+69/-81) benchmark/benchmark_device_transform_pointer.cpp (+72/-0) benchmark/benchmark_device_transform_pointer.parallel.cpp.in (+37/-0) benchmark/benchmark_predicate_iterator.cpp (+59/-165) benchmark/benchmark_utils.hpp (+801/-416) benchmark/benchmark_warp_exchange.cpp (+248/-275) benchmark/benchmark_warp_reduce.cpp (+123/-221) benchmark/benchmark_warp_scan.cpp (+175/-190) benchmark/benchmark_warp_sort.cpp (+160/-224) cmake/Dependencies.cmake (+49/-3) cmake/DownloadProject.CMakeLists.cmake.in (+27/-0) cmake/DownloadProject.cmake (+170/-0) cmake/GenerateResourceSpec.cmake (+8/-7) cmake/Summary.cmake (+59/-7) common/README.md (+13/-0) common/device_adjacent_difference.hpp (+163/-0) common/device_batch_memcpy.hpp (+134/-0) common/predicate_iterator.hpp (+43/-0) common/utils.hpp (+3/-0) common/utils_custom_type.hpp (+270/-0) common/utils_data_generation.hpp (+195/-0) common/utils_device_ptr.hpp (+1/-0) common/utils_half.hpp (+22/-15) common/warp_exchange.hpp (+121/-0) debian/changelog (+45/-0) debian/control (+9/-8) debian/librocprim-tests.install (+3/-2) debian/librocprim-tests.lintian-overrides (+7/-0) debian/not-installed (+1/-2) debian/patches/Extend-docs-conf.py-for-offline-build.patch (+3/-6) debian/patches/arch-conversion-macro.patch (+38/-38) debian/patches/series (+0/-4) debian/patches/skip-building-test_device_scan.patch (+2/-2) debian/patches/test-warp-sort-ppc64el.patch (+5/-5) debian/rules (+15/-4) debian/tests/control (+1/-1) dev/null (+0/-242) docs/block_ops/ops_classes/run_length_decode.rst (+12/-0) docs/conceptual/rocPRIM-operations.rst (+141/-0) docs/conceptual/rocPRIM-performance-tuning.rst (+2/-2) docs/conceptual/rocPRIM-scope.rst (+19/-0) docs/conceptual/rocPRIM-stripe-block.rst (+15/-0) docs/conceptual/rocPRIM-type-traits.rst (+44/-11) docs/conf.py (+4/-3) docs/device_ops/adjacent_find.rst (+1/-2) docs/device_ops/config.rst (+2/-14) docs/device_ops/find_end.rst (+1/-1) docs/device_ops/merge.rst (+12/-4) docs/device_ops/partial_sort.rst (+3/-2) docs/device_ops/search_n.rst (+4/-4) docs/device_ops/select.rst (+1/-0) docs/doxygen/Doxyfile (+474/-223) docs/doxygen/blockmodule.dox (+11/-1) docs/how-to/rocPRIM-spir-v.rst (+146/-0) docs/index.rst (+26/-14) docs/install/rocPRIM-build-install-linux.rst (+50/-0) docs/install/rocPRIM-build-install-windows.rst (+42/-0) docs/install/rocPRIM-install-overview.rst (+24/-0) docs/install/rocPRIM-prerequisites.rst (+20/-0) docs/reference/developer.rst (+20/-1) docs/reference/iterators.rst (+2/-0) docs/reference/reference.rst (+0/-2) docs/reference/rocPRIM-data-type-support.rst (+49/-0) docs/reference/rocPRIM-glossary.rst (+65/-0) docs/reference/types.rst (+1/-1) docs/sphinx/_toc.yml.in (+97/-82) docs/sphinx/requirements.in (+1/-1) docs/sphinx/requirements.txt (+137/-6) example/example_type_traits_interface.cpp (+2/-0) install (+0/-10) rmake.py (+6/-2) rocprim/CMakeLists.txt (+3/-20) rocprim/include/rocprim/block/block_adjacent_difference.hpp (+57/-809) rocprim/include/rocprim/block/block_discontinuity.hpp (+70/-70) rocprim/include/rocprim/block/block_exchange.hpp (+235/-117) rocprim/include/rocprim/block/block_histogram.hpp (+22/-22) rocprim/include/rocprim/block/block_load.hpp (+35/-35) rocprim/include/rocprim/block/block_load_func.hpp (+429/-153) rocprim/include/rocprim/block/block_radix_rank.hpp (+156/-85) rocprim/include/rocprim/block/block_radix_sort.hpp (+237/-110) rocprim/include/rocprim/block/block_reduce.hpp (+107/-52) rocprim/include/rocprim/block/block_run_length_decode.hpp (+11/-7) rocprim/include/rocprim/block/block_scan.hpp (+424/-180) rocprim/include/rocprim/block/block_shuffle.hpp (+99/-95) rocprim/include/rocprim/block/block_sort.hpp (+44/-44) rocprim/include/rocprim/block/block_store.hpp (+22/-22) rocprim/include/rocprim/block/block_store_func.hpp (+301/-99) rocprim/include/rocprim/block/detail/block_adjacent_difference_impl.hpp (+138/-111) rocprim/include/rocprim/block/detail/block_histogram_sort.hpp (+8/-2) rocprim/include/rocprim/block/detail/block_radix_rank_basic.hpp (+67/-55) rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp (+13/-11) rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp (+6/-5) rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp (+63/-81) rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp (+155/-32) rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp (+144/-69) rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp (+5/-4) rocprim/include/rocprim/common.hpp (+5/-0) rocprim/include/rocprim/config.hpp (+82/-62) rocprim/include/rocprim/detail/merge_path.hpp (+19/-9) rocprim/include/rocprim/detail/temp_storage.hpp (+23/-23) rocprim/include/rocprim/detail/various.hpp (+108/-49) rocprim/include/rocprim/detail/virtual_shared_memory.hpp (+132/-0) rocprim/include/rocprim/device/config_types.hpp (+347/-121) rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp (+79/-1) rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp (+81/-3) rocprim/include/rocprim/device/detail/config/device_adjacent_find.hpp (+36/-26) rocprim/include/rocprim/device/detail/config/device_batch_copy.hpp (+263/-0) rocprim/include/rocprim/device/detail/config/device_batch_memcpy.hpp (+263/-0) rocprim/include/rocprim/device/detail/config/device_binary_search.hpp (+199/-58) rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp (+14/-5) rocprim/include/rocprim/device/detail/config/device_histogram.hpp (+41/-41) rocprim/include/rocprim/device/detail/config/device_lower_bound.hpp (+196/-55) rocprim/include/rocprim/device/detail/config/device_merge.hpp (+161/-16) rocprim/include/rocprim/device/detail/config/device_merge_sort_block_merge.hpp (+609/-43) rocprim/include/rocprim/device/detail/config/device_merge_sort_block_sort.hpp (+230/-162) rocprim/include/rocprim/device/detail/config/device_partition_flag.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_partition_predicate.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_partition_three_way.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_partition_two_way_flag.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_partition_two_way_predicate.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_radix_sort_block_sort.hpp (+278/-133) rocprim/include/rocprim/device/detail/config/device_radix_sort_onesweep.hpp (+813/-103) rocprim/include/rocprim/device/detail/config/device_reduce.hpp (+19/-9) rocprim/include/rocprim/device/detail/config/device_reduce_by_key.hpp (+1430/-90) rocprim/include/rocprim/device/detail/config/device_run_length_encode.hpp (+41/-26) rocprim/include/rocprim/device/detail/config/device_run_length_encode_non_trivial.hpp (+42/-29) rocprim/include/rocprim/device/detail/config/device_scan.hpp (+33/-19) rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp (+699/-75) rocprim/include/rocprim/device/detail/config/device_search_n.hpp (+663/-0) rocprim/include/rocprim/device/detail/config/device_segmented_radix_sort.hpp (+1077/-698) rocprim/include/rocprim/device/detail/config/device_segmented_reduce.hpp (+161/-1) rocprim/include/rocprim/device/detail/config/device_select_flag.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_select_predicate.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_select_predicated_flag.hpp (+2/-2) rocprim/include/rocprim/device/detail/config/device_select_unique.hpp (+1/-1) rocprim/include/rocprim/device/detail/config/device_select_unique_by_key.hpp (+2/-2) rocprim/include/rocprim/device/detail/config/device_transform.hpp (+21/-11) rocprim/include/rocprim/device/detail/config/device_transform_pointer.hpp (+695/-0) rocprim/include/rocprim/device/detail/config/device_upper_bound.hpp (+199/-58) rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp (+6/-7) rocprim/include/rocprim/device/detail/device_adjacent_find.hpp (+19/-22) rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp (+310/-256) rocprim/include/rocprim/device/detail/device_config_helper.hpp (+297/-110) rocprim/include/rocprim/device/detail/device_histogram.hpp (+194/-61) rocprim/include/rocprim/device/detail/device_merge.hpp (+175/-168) rocprim/include/rocprim/device/detail/device_merge_sort.hpp (+116/-172) rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp (+349/-351) rocprim/include/rocprim/device/detail/device_nth_element.hpp (+247/-167) rocprim/include/rocprim/device/detail/device_partition.hpp (+431/-430) rocprim/include/rocprim/device/detail/device_radix_sort.hpp (+131/-120) rocprim/include/rocprim/device/detail/device_reduce.hpp (+5/-5) rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp (+92/-36) rocprim/include/rocprim/device/detail/device_run_length_encode.hpp (+89/-27) rocprim/include/rocprim/device/detail/device_scan.hpp (+83/-30) rocprim/include/rocprim/device/detail/device_scan_by_key.hpp (+231/-216) rocprim/include/rocprim/device/detail/device_scan_common.hpp (+52/-31) rocprim/include/rocprim/device/detail/device_search.hpp (+138/-68) rocprim/include/rocprim/device/detail/device_search_n.hpp (+356/-290) rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp (+307/-254) rocprim/include/rocprim/device/detail/device_segmented_reduce.hpp (+11/-11) rocprim/include/rocprim/device/detail/device_segmented_scan.hpp (+2/-2) rocprim/include/rocprim/device/detail/device_transform.hpp (+140/-62) rocprim/include/rocprim/device/detail/lookback_scan_state.hpp (+439/-252) rocprim/include/rocprim/device/detail/ordered_block_id.hpp (+100/-4) rocprim/include/rocprim/device/device_adjacent_difference.hpp (+110/-76) rocprim/include/rocprim/device/device_adjacent_find.hpp (+29/-20) rocprim/include/rocprim/device/device_binary_search.hpp (+53/-56) rocprim/include/rocprim/device/device_binary_search_config.hpp (+16/-16) rocprim/include/rocprim/device/device_copy.hpp (+13/-12) rocprim/include/rocprim/device/device_copy_config.hpp (+3/-32) rocprim/include/rocprim/device/device_find_first_of.hpp (+56/-49) rocprim/include/rocprim/device/device_histogram.hpp (+528/-262) rocprim/include/rocprim/device/device_memcpy.hpp (+13/-12) rocprim/include/rocprim/device/device_memcpy_config.hpp (+51/-40) rocprim/include/rocprim/device/device_merge.hpp (+279/-197) rocprim/include/rocprim/device/device_merge_inplace.hpp (+924/-0) rocprim/include/rocprim/device/device_merge_inplace_config.hpp (+51/-0) rocprim/include/rocprim/device/device_merge_sort.hpp (+666/-317) rocprim/include/rocprim/device/device_merge_sort_config.hpp (+8/-8) rocprim/include/rocprim/device/device_nth_element.hpp (+23/-20) rocprim/include/rocprim/device/device_partial_sort.hpp (+7/-3) rocprim/include/rocprim/device/device_partial_sort_config.hpp (+4/-4) rocprim/include/rocprim/device/device_partition.hpp (+400/-247) rocprim/include/rocprim/device/device_radix_sort.hpp (+252/-176) rocprim/include/rocprim/device/device_radix_sort_config.hpp (+5/-5) rocprim/include/rocprim/device/device_reduce.hpp (+129/-105) rocprim/include/rocprim/device/device_reduce_by_key.hpp (+80/-67) rocprim/include/rocprim/device/device_run_length_encode.hpp (+68/-58) rocprim/include/rocprim/device/device_run_length_encode_config.hpp (+3/-3) rocprim/include/rocprim/device/device_scan.hpp (+427/-169) rocprim/include/rocprim/device/device_scan_by_key.hpp (+77/-68) rocprim/include/rocprim/device/device_search_n.hpp (+3/-0) rocprim/include/rocprim/device/device_search_n_config.hpp (+6/-5) rocprim/include/rocprim/device/device_segmented_radix_sort.hpp (+670/-594) rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp (+3/-3) rocprim/include/rocprim/device/device_segmented_reduce.hpp (+109/-99) rocprim/include/rocprim/device/device_segmented_reduce_config.hpp (+79/-0) rocprim/include/rocprim/device/device_segmented_scan.hpp (+236/-225) rocprim/include/rocprim/device/device_select.hpp (+213/-198) rocprim/include/rocprim/device/device_transform.hpp (+238/-117) rocprim/include/rocprim/device/device_transform_config.hpp (+24/-7) rocprim/include/rocprim/device/specialization/device_radix_block_sort.hpp (+48/-29) rocprim/include/rocprim/device/specialization/device_radix_merge_sort.hpp (+20/-10) rocprim/include/rocprim/intrinsics/arch.hpp (+201/-9) rocprim/include/rocprim/intrinsics/atomic.hpp (+16/-6) rocprim/include/rocprim/intrinsics/bit.hpp (+40/-3) rocprim/include/rocprim/intrinsics/thread.hpp (+33/-46) rocprim/include/rocprim/intrinsics/warp.hpp (+24/-14) rocprim/include/rocprim/intrinsics/warp_shuffle.hpp (+41/-33) rocprim/include/rocprim/iterator.hpp (+1/-0) rocprim/include/rocprim/iterator/arg_index_iterator.hpp (+14/-19) rocprim/include/rocprim/iterator/constant_iterator.hpp (+7/-13) rocprim/include/rocprim/iterator/counting_iterator.hpp (+6/-12) rocprim/include/rocprim/iterator/detail/common.hpp (+58/-0) rocprim/include/rocprim/iterator/discard_iterator.hpp (+3/-9) rocprim/include/rocprim/iterator/predicate_iterator.hpp (+3/-5) rocprim/include/rocprim/iterator/reverse_iterator.hpp (+5/-11) rocprim/include/rocprim/iterator/texture_cache_iterator.hpp (+25/-32) rocprim/include/rocprim/iterator/transform_iterator.hpp (+19/-21) rocprim/include/rocprim/iterator/transform_output_iterator.hpp (+272/-0) rocprim/include/rocprim/iterator/zip_iterator.hpp (+11/-12) rocprim/include/rocprim/rocprim.hpp (+2/-4) rocprim/include/rocprim/rocprim_version.hpp.in (+3/-3) rocprim/include/rocprim/thread/radix_key_codec.hpp (+4/-634) rocprim/include/rocprim/thread/thread_copy.hpp (+221/-0) rocprim/include/rocprim/thread/thread_load.hpp (+156/-55) rocprim/include/rocprim/thread/thread_operators.hpp (+16/-4) rocprim/include/rocprim/thread/thread_reduce.hpp (+15/-66) rocprim/include/rocprim/thread/thread_scan.hpp (+2/-1) rocprim/include/rocprim/thread/thread_search.hpp (+3/-1) rocprim/include/rocprim/thread/thread_store.hpp (+100/-46) rocprim/include/rocprim/type_traits.hpp (+1277/-371) rocprim/include/rocprim/type_traits_functions.hpp (+561/-0) rocprim/include/rocprim/types.hpp (+18/-10) rocprim/include/rocprim/types/integer_sequence.hpp (+5/-5) rocprim/include/rocprim/types/key_value_pair.hpp (+28/-22) rocprim/include/rocprim/types/tuple.hpp (+52/-15) rocprim/include/rocprim/warp/detail/warp_reduce_crosslane.hpp (+102/-13) rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp (+89/-50) rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp (+34/-37) rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp (+30/-31) rocprim/include/rocprim/warp/detail/warp_scan_crosslane.hpp (+213/-12) rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp (+163/-90) rocprim/include/rocprim/warp/detail/warp_scan_shared_mem.hpp (+88/-47) rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp (+114/-66) rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp (+29/-9) rocprim/include/rocprim/warp/detail/warp_sort_shuffle.hpp (+42/-27) rocprim/include/rocprim/warp/detail/warp_sort_stable.hpp (+17/-17) rocprim/include/rocprim/warp/warp_exchange.hpp (+665/-107) rocprim/include/rocprim/warp/warp_load.hpp (+144/-81) rocprim/include/rocprim/warp/warp_reduce.hpp (+160/-103) rocprim/include/rocprim/warp/warp_scan.hpp (+480/-205) rocprim/include/rocprim/warp/warp_sort.hpp (+271/-158) rocprim/include/rocprim/warp/warp_store.hpp (+121/-66) rtest.xml (+2/-2) scripts/autotune-search/main.py (+24/-5) scripts/autotune/create_optimization.py (+31/-1) scripts/autotune/fallback_config.json (+1/-2) scripts/autotune/templates/config_template (+2/-0) scripts/autotune/templates/histogram_config_template (+1/-1) scripts/autotune/templates/merge_config_template (+1/-1) scripts/autotune/templates/search_n_config_template (+19/-0) scripts/autotune/templates/segmented_radix_sort_config_template (+2/-2) scripts/autotune/templates/segmented_reduce_config_template (+19/-0) scripts/autotune/templates/transform_pointer_config_template (+20/-0) test/CMakeLists.txt (+11/-5) test/common_test_header.hpp (+12/-82) test/extra/CMakeLists.txt (+3/-5) test/hip/test_hip_api.cpp (+6/-5) test/hip/test_hip_async_copy.cpp (+83/-58) test/hip/test_ordered_block_id.cpp (+4/-3) test/hipgraph/test_hipgraph_algs.cpp (+29/-46) test/rocprim/CMakeLists.txt (+132/-10) test/rocprim/detail/get_rocprim_version.cpp (+6/-22) test/rocprim/identity_iterator.hpp (+7/-1) test/rocprim/internal/test_internal_merge_path.cpp (+6/-8) test/rocprim/test_accumulator_t.cpp (+149/-0) test/rocprim/test_arg_index_iterator.cpp (+134/-105) test/rocprim/test_bit_cast.cpp (+213/-0) test/rocprim/test_block_adjacent_difference.cpp.in (+27/-22) test/rocprim/test_block_adjacent_difference.hpp (+20/-60) test/rocprim/test_block_adjacent_difference.kernels.hpp (+158/-731) test/rocprim/test_block_discontinuity.cpp.in (+30/-22) test/rocprim/test_block_discontinuity.hpp (+8/-1) test/rocprim/test_block_discontinuity.kernels.hpp (+120/-214) test/rocprim/test_block_exchange.cpp (+4/-1) test/rocprim/test_block_exchange.kernels.hpp (+107/-207) test/rocprim/test_block_histogram.kernels.hpp (+35/-49) test/rocprim/test_block_load_store.cpp (+6/-5) test/rocprim/test_block_load_store.hpp (+51/-111) test/rocprim/test_block_load_store.kernels.hpp (+80/-85) test/rocprim/test_block_radix_rank.cpp.in (+4/-1) test/rocprim/test_block_radix_rank.hpp (+42/-31) test/rocprim/test_block_radix_sort.cpp.in (+19/-7) test/rocprim/test_block_radix_sort.kernels.hpp (+71/-77) test/rocprim/test_block_reduce.cpp (+4/-1) test/rocprim/test_block_reduce.hpp (+120/-84) test/rocprim/test_block_reduce.kernels.hpp (+31/-39) test/rocprim/test_block_run_length_decode.cpp (+49/-69) test/rocprim/test_block_scan.cpp.in (+26/-18) test/rocprim/test_block_scan.hpp (+278/-103) test/rocprim/test_block_scan.kernels.hpp (+160/-241) test/rocprim/test_block_shuffle.cpp (+4/-1) test/rocprim/test_block_shuffle.hpp (+77/-141) test/rocprim/test_block_sort.hpp (+47/-74) test/rocprim/test_block_sort_bitonic.cpp (+4/-1) test/rocprim/test_block_sort_merge.cpp (+4/-1) test/rocprim/test_block_sort_merge_stable.cpp (+4/-1) test/rocprim/test_config_dispatch.cpp (+43/-16) test/rocprim/test_constant_iterator.cpp (+112/-30) test/rocprim/test_counting_iterator.cpp (+114/-35) test/rocprim/test_device_adjacent_difference.cpp (+186/-289) test/rocprim/test_device_adjacent_find.cpp (+27/-35) test/rocprim/test_device_batch_memcpy.cpp (+97/-187) test/rocprim/test_device_binary_search.cpp (+104/-165) test/rocprim/test_device_find_end.cpp (+58/-89) test/rocprim/test_device_find_first_of.cpp (+45/-70) test/rocprim/test_device_histogram.cpp (+277/-503) test/rocprim/test_device_merge.cpp (+96/-30) test/rocprim/test_device_merge_inplace.cpp (+452/-0) test/rocprim/test_device_merge_sort.cpp (+198/-110) test/rocprim/test_device_nth_element.cpp (+55/-98) test/rocprim/test_device_partial_sort.cpp (+55/-97) test/rocprim/test_device_partition.cpp (+351/-359) test/rocprim/test_device_radix_sort.cpp.in (+24/-11) test/rocprim/test_device_radix_sort.hpp (+201/-276) test/rocprim/test_device_reduce.cpp (+208/-277) test/rocprim/test_device_reduce_by_key.cpp (+93/-176) test/rocprim/test_device_run_length_encode.cpp (+88/-147) test/rocprim/test_device_scan.cpp (+911/-1054) test/rocprim/test_device_scan_by_key.cpp (+841/-0) test/rocprim/test_device_search.cpp (+56/-89) test/rocprim/test_device_search_n.cpp (+266/-172) test/rocprim/test_device_segmented_radix_sort.cpp.in (+9/-3) test/rocprim/test_device_segmented_radix_sort.hpp (+268/-497) test/rocprim/test_device_segmented_reduce.cpp (+100/-108) test/rocprim/test_device_segmented_scan.cpp (+256/-302) test/rocprim/test_device_select.cpp (+289/-406) test/rocprim/test_device_transform.cpp (+316/-168) test/rocprim/test_discard_iterator.cpp (+95/-124) test/rocprim/test_intrinsics.cpp (+237/-327) test/rocprim/test_intrinsics_atomic.cpp (+8/-6) test/rocprim/test_invoke_result.cpp (+6/-8) test/rocprim/test_linking.cpp (+42/-0) test/rocprim/test_linking_lib.cpp (+103/-0) test/rocprim/test_linking_new_scan.hpp (+216/-0) test/rocprim/test_lookback_reproducibility.cpp (+140/-170) test/rocprim/test_predicate_iterator.cpp (+164/-48) test/rocprim/test_radix_key_codec.cpp (+39/-29) test/rocprim/test_reverse_iterator.cpp (+16/-18) test/rocprim/test_rocprim_tuple.cpp (+384/-0) test/rocprim/test_rocprim_types.cpp (+235/-0) test/rocprim/test_seed.hpp (+20/-5) test/rocprim/test_temporary_storage_partitioning.cpp (+34/-46) test/rocprim/test_texture_cache_iterator.cpp (+161/-44) test/rocprim/test_thread.cpp (+41/-63) test/rocprim/test_thread_algos.cpp (+493/-227) test/rocprim/test_transform_iterator.cpp (+324/-99) test/rocprim/test_tuple.cpp (+85/-0) test/rocprim/test_type_traits_interface.cpp (+12/-95) test/rocprim/test_utils.hpp (+343/-165) test/rocprim/test_utils_assertions.hpp (+132/-33) test/rocprim/test_utils_bfloat16.hpp (+6/-4) test/rocprim/test_utils_custom_float_type.hpp (+18/-33) test/rocprim/test_utils_custom_test_types.hpp (+34/-132) test/rocprim/test_utils_data_generation.hpp (+209/-177) test/rocprim/test_utils_data_generation_with_rocrand.hpp (+130/-0) test/rocprim/test_utils_get_random_data.hpp (+44/-0) test/rocprim/test_utils_sort_checker.cpp (+141/-0) test/rocprim/test_utils_sort_checker.hpp (+165/-0) test/rocprim/test_utils_sort_comparator.hpp (+27/-27) test/rocprim/test_utils_types.hpp (+108/-118) test/rocprim/test_warp_exchange.cpp (+134/-227) test/rocprim/test_warp_load.cpp (+48/-44) test/rocprim/test_warp_reduce.cpp (+3/-1) test/rocprim/test_warp_reduce.hpp (+361/-400) test/rocprim/test_warp_reduce.kernels.hpp (+85/-99) test/rocprim/test_warp_scan.cpp (+9/-3) test/rocprim/test_warp_scan.hpp (+746/-503) test/rocprim/test_warp_scan.kernels.hpp (+214/-152) test/rocprim/test_warp_sort.cpp (+3/-1) test/rocprim/test_warp_sort.hpp (+53/-87) test/rocprim/test_warp_sort.kernels.hpp (+35/-28) test/rocprim/test_warp_store.cpp (+41/-41) test/rocprim/test_zip_iterator.cpp (+65/-178) toolchain-windows.cmake (+1/-1) |
||||
| Related bugs: |
|
| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Andreas Hasenack | Approve | ||
| Ubuntu Sponsors | Pending | ||
|
Review via email:
|
|||
Commit message
Description of the change
Update to upstream version 7.1.0
| Igor Luppi (igorluppi) wrote : | # |
| Andreas Hasenack (ahasenack) wrote : | # |
--- a/debian/control
+++ b/debian/control
@@ -32,7 +33,7 @@ Description: parallel primitives for GPU-accelerated code - headers
Package: librocprim-tests
Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
XB-X-ROCm-
Depends: ${misc:Depends}, ${shlibs:Depends},
Build-Profiles: <!nocheck>
Please investigate the reverse dependencies of librocprim-tests to see what else would have to be dropped in ppc64el.
| Igor Luppi (igorluppi) wrote : | # |
Thanks Andreas
$ reverse-depends --arch ppc64el src:rocprim -x
Reverse-Depends
===============
* libhipcub-dev (for librocprim-dev)
* librocthrust-dev (for librocprim-dev)
* libstdgpu-hip-dev
Its safe to drop since its only required by rocm stack. Also librocthrust-dev was detailed here: https:/
It should be fine.
| Andreas Hasenack (ahasenack) wrote : | # |
+1, sponsored:
Uploading rocprim_
Uploading rocprim_
Uploading rocprim_
Uploading rocprim_
Uploading rocprim_
Preview Diff
| 1 | diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS |
| 2 | index 809934d..2ac616b 100755 |
| 3 | --- a/.github/CODEOWNERS |
| 4 | +++ b/.github/CODEOWNERS |
| 5 | @@ -1,4 +1,4 @@ |
| 6 | -* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13 |
| 7 | +* @stanleytsang-amd @umfranzw @RobsonRLemos |
| 8 | # Documentation files |
| 9 | docs/ @ROCm/rocm-documentation |
| 10 | *.md @ROCm/rocm-documentation |
| 11 | diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml |
| 12 | index 0f76d8c..ec65005 100644 |
| 13 | --- a/.gitlab-ci.yml |
| 14 | +++ b/.gitlab-ci.yml |
| 15 | @@ -1,6 +1,6 @@ |
| 16 | # MIT License |
| 17 | # |
| 18 | -# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 19 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 20 | # |
| 21 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 22 | # of this software and associated documentation files (the "Software"), to deal |
| 23 | @@ -42,6 +42,17 @@ stages: |
| 24 | - test |
| 25 | - benchmark |
| 26 | |
| 27 | +workflow: |
| 28 | + rules: |
| 29 | + - if: $CI_MERGE_REQUEST_LABELS =~ /CI Skip/ |
| 30 | + when: never |
| 31 | + - if: $CI_MERGE_REQUEST_TITLE !~ /Draft:/ |
| 32 | + variables: |
| 33 | + ROCPRIM_TEST_RUNS: 1 |
| 34 | + - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ |
| 35 | + variables: |
| 36 | + ROCPRIM_TEST_RUNS: 1 |
| 37 | + |
| 38 | variables: |
| 39 | PACKAGE_DIR: $BUILD_DIR/package |
| 40 | AUTOTUNE_CONFIG_DIR: ${CI_PROJECT_DIR}/autotune_config |
| 41 | @@ -110,11 +121,10 @@ copyright-date: |
| 42 | -D BUILD_TEST=ON |
| 43 | -D BUILD_EXAMPLE=ON |
| 44 | -D BUILD_BENCHMARK=ON |
| 45 | - -D GPU_TARGETS=$GPU_TARGETS |
| 46 | - -D AMDGPU_TEST_TARGETS=$GPU_TARGETS |
| 47 | + -D AMDGPU_TARGETS=$GPU_TARGETS |
| 48 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 49 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 50 | - -D CMAKE_CXX_STANDARD=14 |
| 51 | + -D CMAKE_CXX_STANDARD=17 |
| 52 | -S $CI_PROJECT_DIR |
| 53 | -B $BUILD_DIR |
| 54 | - cmake |
| 55 | @@ -164,13 +174,14 @@ build:cmake-minimum-apt: |
| 56 | - .rules:build |
| 57 | variables: |
| 58 | EXTRA_CMAKE_CXX_FLAGS: "" |
| 59 | + BUILD_TOOL_ARGS: "" |
| 60 | script: |
| 61 | - mkdir -p $BUILD_DIR |
| 62 | - cd $BUILD_DIR |
| 63 | - | # Add hardened libc++ assertions for tests only |
| 64 | if [[ $BUILD_TARGET == "TEST" ]]; then |
| 65 | echo "Configuring with hardened libc++!" |
| 66 | - EXTRA_CMAKE_CXX_FLAGS+=" -D_GLIBCXX_ASSERTIONS=ON" |
| 67 | + EXTRA_CMAKE_CXX_FLAGS+=" -D_GLIBCXX_ASSERTIONS=ON -D ROCPRIM_ENABLE_ASSERTS=ON" |
| 68 | fi |
| 69 | - cmake |
| 70 | -G Ninja |
| 71 | @@ -178,15 +189,15 @@ build:cmake-minimum-apt: |
| 72 | -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror $EXTRA_CMAKE_CXX_FLAGS" |
| 73 | -D CMAKE_BUILD_TYPE="$BUILD_TYPE" |
| 74 | -D BUILD_$BUILD_TARGET=ON |
| 75 | + -D WITH_ROCRAND=ON |
| 76 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 77 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 78 | -D BUILD_EXAMPLE=ON |
| 79 | - -D GPU_TARGETS=$GPU_TARGETS |
| 80 | - -D AMDGPU_TEST_TARGETS=$GPU_TARGETS |
| 81 | + -D AMDGPU_TARGETS=$GPU_TARGETS |
| 82 | -D CMAKE_CXX_STANDARD="$BUILD_VERSION" |
| 83 | -S $CI_PROJECT_DIR |
| 84 | -B $BUILD_DIR |
| 85 | - - cmake --build $BUILD_DIR |
| 86 | + - cmake --build $BUILD_DIR -- ${BUILD_TOOL_ARGS} |
| 87 | artifacts: |
| 88 | paths: |
| 89 | - $BUILD_DIR/.ninja_log |
| 90 | @@ -197,9 +208,36 @@ build:cmake-minimum-apt: |
| 91 | - $BUILD_DIR/gtest/ |
| 92 | - $BUILD_DIR/test/CTestTestfile.cmake |
| 93 | - $BUILD_DIR/test/rocprim/CTestTestfile.cmake |
| 94 | + - $BUILD_DIR/deps/rocrand/ |
| 95 | - $BUILD_DIR/test/rocprim/test_* |
| 96 | + - $BUILD_DIR/test/rocprim/libtest_* |
| 97 | - $BUILD_DIR/test/test_* |
| 98 | - expire_in: 2 weeks |
| 99 | + expire_in: 1 day |
| 100 | + |
| 101 | +build:spirv: |
| 102 | + stage: build |
| 103 | + needs: [] |
| 104 | + extends: |
| 105 | + - .cmake-minimum |
| 106 | + - .build:common |
| 107 | + variables: |
| 108 | + # For unknown reasons spir-v builds ignore 'clang diagnostic' pragmas that |
| 109 | + # we use to ignore internal deprecations. |
| 110 | + EXTRA_CMAKE_CXX_FLAGS: "-Wno-deprecated-declarations -mf16c -DROCPRIM_EXPERIMENTAL_SPIRV" |
| 111 | + # Since not all targets are expected to build, do not stop building other |
| 112 | + # targets when any target fails. |
| 113 | + BUILD_TOOL_ARGS: "-k 0" |
| 114 | + GPU_TARGETS: "amdgcnspirv" |
| 115 | + image: "registry.streamhpc.internal/unstable-rocm:main" |
| 116 | + allow_failure: true |
| 117 | + parallel: |
| 118 | + # Debug builds disabled due to excessive build times for debug test builds |
| 119 | + matrix: |
| 120 | + - BUILD_TYPE: Release |
| 121 | + BUILD_TARGET: [BENCHMARK, TEST] |
| 122 | + BUILD_VERSION: 17 |
| 123 | + artifacts: |
| 124 | + when: always |
| 125 | |
| 126 | build:cmake-latest: |
| 127 | stage: build |
| 128 | @@ -212,7 +250,7 @@ build:cmake-latest: |
| 129 | matrix: |
| 130 | - BUILD_TYPE: Release |
| 131 | BUILD_TARGET: [BENCHMARK, TEST] |
| 132 | - BUILD_VERSION: [14, 17] |
| 133 | + BUILD_VERSION: 17 |
| 134 | |
| 135 | build:cmake-minimum: |
| 136 | needs: [] |
| 137 | @@ -223,7 +261,7 @@ build:cmake-minimum: |
| 138 | matrix: |
| 139 | - BUILD_TYPE: [Debug, Release] |
| 140 | BUILD_TARGET: [BENCHMARK, TEST] |
| 141 | - BUILD_VERSION: 14 |
| 142 | + BUILD_VERSION: 17 |
| 143 | |
| 144 | build:package: |
| 145 | stage: build |
| 146 | @@ -240,7 +278,7 @@ build:package: |
| 147 | -G Ninja |
| 148 | -D CMAKE_CXX_COMPILER="$AMDCLANG" |
| 149 | -D CMAKE_BUILD_TYPE=Release |
| 150 | - -D CMAKE_CXX_STANDARD=14 |
| 151 | + -D CMAKE_CXX_STANDARD=17 |
| 152 | -B $PACKAGE_DIR |
| 153 | -S $CI_PROJECT_DIR |
| 154 | - cd $PACKAGE_DIR |
| 155 | @@ -249,7 +287,7 @@ build:package: |
| 156 | paths: |
| 157 | - $PACKAGE_DIR/rocprim*.deb |
| 158 | - $PACKAGE_DIR/rocprim*.zip |
| 159 | - expire_in: 2 weeks |
| 160 | + expire_in: 1 day |
| 161 | |
| 162 | build:windows: |
| 163 | stage: build |
| 164 | @@ -261,19 +299,19 @@ build:windows: |
| 165 | - .deps:visual-studio-devshell |
| 166 | parallel: |
| 167 | matrix: |
| 168 | - - BUILD_TYPE: [Debug, Release] |
| 169 | - BUILD_TARGET: [BENCHMARK, TEST] |
| 170 | + - BUILD_TYPE: Release |
| 171 | script: |
| 172 | - mkdir -p $CI_PROJECT_DIR/build |
| 173 | - cmake -G Ninja |
| 174 | -S $CI_PROJECT_DIR |
| 175 | -B $CI_PROJECT_DIR/build |
| 176 | - -D BUILD_$BUILD_TARGET=ON |
| 177 | - -D GPU_TARGETS=$GPU_TARGET |
| 178 | + -D BUILD_TEST=ON |
| 179 | + -D BUILD_BENCHMARK=ON |
| 180 | + -D AMDGPU_TARGETS=$GPU_TARGET |
| 181 | -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" |
| 182 | -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}" |
| 183 | -D CMAKE_BUILD_TYPE="$BUILD_TYPE" |
| 184 | - -D CMAKE_CXX_STANDARD=14 |
| 185 | + -D CMAKE_CXX_STANDARD=17 |
| 186 | - cmake --build "$CI_PROJECT_DIR/build" |
| 187 | artifacts: |
| 188 | paths: |
| 189 | @@ -285,7 +323,7 @@ build:windows: |
| 190 | - $CI_PROJECT_DIR/build/CMakeCache.txt |
| 191 | - $CI_PROJECT_DIR/build/.ninja_log |
| 192 | - $CI_PROJECT_DIR/build/CTestTestfile.cmake |
| 193 | - expire_in: 2 weeks |
| 194 | + expire_in: 1 day |
| 195 | |
| 196 | autotune:build: |
| 197 | stage: autotune |
| 198 | @@ -317,10 +355,10 @@ autotune:build: |
| 199 | -D BUILD_EXAMPLE=OFF |
| 200 | -D BUILD_BENCHMARK=ON |
| 201 | -D BENCHMARK_CONFIG_TUNING=ON |
| 202 | - -D GPU_TARGETS=$GPU_TARGETS |
| 203 | + -D AMDGPU_TARGETS=$GPU_TARGETS |
| 204 | -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c |
| 205 | -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx |
| 206 | - -D CMAKE_CXX_STANDARD=14 |
| 207 | + -D CMAKE_CXX_STANDARD=17 |
| 208 | - cmake --build . --target $BENCHMARK_TARGETS |
| 209 | - 'rm -rf $BUILD_DIR/benchmark/benchmark*.parallel' |
| 210 | # The autotune benchmarks get very large, above GitLabs upload limit. Fortunately they compress well. |
| 211 | @@ -334,19 +372,20 @@ autotune:build: |
| 212 | - $BUILD_DIR/deps/googlebenchmark/ |
| 213 | expire_in: 1 week |
| 214 | |
| 215 | -test: |
| 216 | +.test:common: |
| 217 | stage: test |
| 218 | + tags: |
| 219 | + - rocm |
| 220 | + - $GPU |
| 221 | extends: |
| 222 | - .cmake-minimum |
| 223 | - - .rules:test |
| 224 | - - .gpus:rocm |
| 225 | needs: |
| 226 | - job: build:cmake-minimum |
| 227 | parallel: |
| 228 | matrix: |
| 229 | - BUILD_TYPE: Release |
| 230 | BUILD_TARGET: TEST |
| 231 | - BUILD_VERSION: 14 |
| 232 | + BUILD_VERSION: 17 |
| 233 | script: |
| 234 | - cd $BUILD_DIR |
| 235 | - cmake |
| 236 | @@ -359,10 +398,88 @@ test: |
| 237 | - HSA_ENABLE_SDMA=0 ctest |
| 238 | --output-on-failure |
| 239 | --repeat-until-fail 2 |
| 240 | - --tests-regex "hip|$GPU_TARGET" |
| 241 | --resource-spec-file ./resources.json |
| 242 | --parallel $PARALLEL_JOBS |
| 243 | |
| 244 | +test:any-gpu: |
| 245 | + variables: |
| 246 | + GPU: "" |
| 247 | + PARALLEL_JOBS: 1 |
| 248 | + extends: |
| 249 | + - .test:common |
| 250 | + rules: |
| 251 | + - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/ |
| 252 | + |
| 253 | +test:label-arch: |
| 254 | + extends: |
| 255 | + - .gpus:rocm |
| 256 | + - .test:common |
| 257 | + - .rules:arch-labels |
| 258 | + |
| 259 | +test:all-gpus: |
| 260 | + variables: |
| 261 | + SHOULD_BE_UNDRAFTED: "true" |
| 262 | + extends: |
| 263 | + - .gpus:rocm |
| 264 | + - .test:common |
| 265 | + - .rules:test |
| 266 | + |
| 267 | +.test:common-spirv: |
| 268 | + stage: test |
| 269 | + tags: |
| 270 | + - rocm |
| 271 | + - $GPU |
| 272 | + extends: |
| 273 | + - .cmake-minimum |
| 274 | + allow_failure: true |
| 275 | + timeout: 3h |
| 276 | + needs: |
| 277 | + - job: build:spirv |
| 278 | + parallel: |
| 279 | + matrix: |
| 280 | + - BUILD_TYPE: Release |
| 281 | + BUILD_TARGET: TEST |
| 282 | + BUILD_VERSION: 17 |
| 283 | + image: "registry.streamhpc.internal/unstable-rocm:main" |
| 284 | + script: |
| 285 | + - cd $BUILD_DIR |
| 286 | + - cmake |
| 287 | + -D CMAKE_PREFIX_PATH=/opt/rocm |
| 288 | + -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake |
| 289 | + - cat ./resources.json |
| 290 | + # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. |
| 291 | + # This causes the hipMemcpy to fail, which is not reported as an error by HIP. |
| 292 | + # As a temporary workaround, disable the SDMA for test stability. |
| 293 | + - HSA_ENABLE_SDMA=0 ctest |
| 294 | + --output-on-failure |
| 295 | + --repeat-until-fail 2 |
| 296 | + --resource-spec-file ./resources.json |
| 297 | + --parallel $PARALLEL_JOBS |
| 298 | + --exclude-regex rocprim.device_partition |
| 299 | + |
| 300 | +test:any-gpu-spirv: |
| 301 | + variables: |
| 302 | + GPU: "" |
| 303 | + PARALLEL_JOBS: 1 |
| 304 | + extends: |
| 305 | + - .test:common-spirv |
| 306 | + rules: |
| 307 | + - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/ |
| 308 | + |
| 309 | +test:label-arch-spirv: |
| 310 | + extends: |
| 311 | + - .gpus:rocm |
| 312 | + - .test:common-spirv |
| 313 | + - .rules:arch-labels |
| 314 | + |
| 315 | +test:all-gpus-spirv: |
| 316 | + variables: |
| 317 | + SHOULD_BE_UNDRAFTED: "true" |
| 318 | + extends: |
| 319 | + - .gpus:rocm |
| 320 | + - .test:common-spirv |
| 321 | + - .rules:test |
| 322 | + |
| 323 | .test-windows-base: |
| 324 | stage: test |
| 325 | extends: |
| 326 | @@ -394,7 +511,6 @@ test-windows-release: |
| 327 | parallel: |
| 328 | matrix: |
| 329 | - BUILD_TYPE: Release |
| 330 | - BUILD_TARGET: TEST |
| 331 | |
| 332 | .test-package: |
| 333 | script: |
| 334 | @@ -402,8 +518,8 @@ test-windows-release: |
| 335 | -G Ninja |
| 336 | -D CMAKE_CXX_COMPILER="$AMDCLANG" |
| 337 | -D CMAKE_BUILD_TYPE=Release |
| 338 | - -D GPU_TARGETS=$GPU_TARGETS |
| 339 | - -D CMAKE_CXX_STANDARD=14 |
| 340 | + -D AMDGPU_TARGETS=$GPU_TARGETS |
| 341 | + -D CMAKE_CXX_STANDARD=17 |
| 342 | -S "$CI_PROJECT_DIR/test/extra" |
| 343 | -B "$CI_PROJECT_DIR/package_test" |
| 344 | - cmake --build "$CI_PROJECT_DIR/package_test" |
| 345 | @@ -425,10 +541,11 @@ test:install: |
| 346 | -G Ninja |
| 347 | -D CMAKE_CXX_COMPILER="$AMDCLANG" |
| 348 | -D CMAKE_BUILD_TYPE=Release |
| 349 | - -D CMAKE_CXX_STANDARD=14 |
| 350 | + -D CMAKE_CXX_STANDARD=17 |
| 351 | -B build |
| 352 | -S $CI_PROJECT_DIR |
| 353 | - - $SUDO_CMD cmake --build build --target install |
| 354 | + # Preserve $PATH when sudoing |
| 355 | + - $SUDO_CMD env PATH="$PATH" cmake --build build --target install |
| 356 | - !reference [.test-package, script] |
| 357 | |
| 358 | test:deb: |
| 359 | @@ -452,6 +569,10 @@ test:docs: |
| 360 | extends: |
| 361 | - .rules:test |
| 362 | - .build:docs |
| 363 | + artifacts: |
| 364 | + paths: |
| 365 | + - $DOCS_DIR/_build/html/ |
| 366 | + expire_in: 2 weeks |
| 367 | |
| 368 | .benchmark-base: |
| 369 | stage: benchmark |
| 370 | @@ -468,7 +589,7 @@ benchmark: |
| 371 | matrix: |
| 372 | - BUILD_TYPE: Release |
| 373 | BUILD_TARGET: BENCHMARK |
| 374 | - BUILD_VERSION: 14 |
| 375 | + BUILD_VERSION: 17 |
| 376 | extends: |
| 377 | - .cmake-minimum |
| 378 | - .gpus:rocm |
| 379 | @@ -489,6 +610,11 @@ benchmark: |
| 380 | --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" |
| 381 | --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}" |
| 382 | --seed "${BENCHMARK_SEED}" |
| 383 | + - python3 |
| 384 | + .gitlab/report_noise.py |
| 385 | + --benchmark_json_dir "${BENCHMARK_RESULT_DIR}" |
| 386 | + --noise_threshold_percentage 1.0 |
| 387 | + --accept_high_noise |
| 388 | artifacts: |
| 389 | paths: |
| 390 | - ${BENCHMARK_RESULT_DIR} |
| 391 | @@ -577,6 +703,11 @@ autotune:execute-tuning: |
| 392 | --size="${AUTOTUNE_SIZE}" |
| 393 | --trials="${AUTOTUNE_TRIALS}" |
| 394 | --seed=82589933 |
| 395 | + - python3 |
| 396 | + .gitlab/report_noise.py |
| 397 | + --benchmark_json_dir "${AUTOTUNE_RESULT_DIR}" |
| 398 | + --noise_threshold_percentage 1.0 |
| 399 | + --accept_high_noise |
| 400 | |
| 401 | autotune:generate-config: |
| 402 | image: python:3.10.5-buster |
| 403 | diff --git a/.gitlab/report_noise.py b/.gitlab/report_noise.py |
| 404 | new file mode 100644 |
| 405 | index 0000000..974c2b1 |
| 406 | --- /dev/null |
| 407 | +++ b/.gitlab/report_noise.py |
| 408 | @@ -0,0 +1,251 @@ |
| 409 | +#!/usr/bin/env python3 |
| 410 | + |
| 411 | +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. |
| 412 | +# |
| 413 | +# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 414 | +# of this software and associated documentation files (the "Software"), to deal |
| 415 | +# in the Software without restriction, including without limitation the rights |
| 416 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 417 | +# copies of the Software, and to permit persons to whom the Software is |
| 418 | +# furnished to do so, subject to the following conditions: |
| 419 | +# |
| 420 | +# The above copyright notice and this permission notice shall be included in |
| 421 | +# all copies or substantial portions of the Software. |
| 422 | +# |
| 423 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 424 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 425 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 426 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 427 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 428 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 429 | +# THE SOFTWARE. |
| 430 | + |
| 431 | + |
| 432 | +import argparse |
| 433 | +import json |
| 434 | +import os |
| 435 | +import re |
| 436 | +import stat |
| 437 | +import statistics |
| 438 | +import sys |
| 439 | + |
| 440 | + |
| 441 | +class colors: |
| 442 | + OK = "\033[92m" |
| 443 | + FAIL = "\033[91m" |
| 444 | + END_COLOR = "\033[0m" |
| 445 | + |
| 446 | + |
| 447 | +def print_results(results): |
| 448 | + # Store the length of the longest value in a column |
| 449 | + longest = { |
| 450 | + "name": max(len(result["name"]) for result in results), |
| 451 | + "noisy_permutations": max( |
| 452 | + len(result["noisy_permutations"]) for result in results |
| 453 | + ), |
| 454 | + "mean": max(len(result["mean"]) for result in results), |
| 455 | + "median": max(len(result["median"]) for result in results), |
| 456 | + "max": max(len(result["max"]) for result in results), |
| 457 | + "batch": max(len(result["batch"]) for result in results), |
| 458 | + "warmup": max(len(result["warmup"]) for result in results), |
| 459 | + "bytes": max(len(result["bytes"]) for result in results), |
| 460 | + } |
| 461 | + |
| 462 | + # The name of a column can be longer than its values |
| 463 | + longest = {key: max(value, len(key)) for key, value in longest.items()} |
| 464 | + |
| 465 | + printed = "name".ljust(longest["name"] + 1) |
| 466 | + printed += "noisy permutations".ljust(longest["noisy_permutations"] + 1) |
| 467 | + printed += "mean".ljust(longest["mean"] + 1) |
| 468 | + printed += "median".ljust(longest["median"] + 1) |
| 469 | + printed += "max".ljust(longest["max"] + 1) |
| 470 | + printed += "batch".ljust(longest["batch"] + 1) |
| 471 | + printed += "warmup".ljust(longest["warmup"] + 1) |
| 472 | + printed += "bytes".ljust(longest["bytes"] + 1) |
| 473 | + printed += "seed" |
| 474 | + print(printed) |
| 475 | + |
| 476 | + for result in results: |
| 477 | + printed = result["name"].ljust(longest["name"]) |
| 478 | + |
| 479 | + printed += " " |
| 480 | + printed += colors.FAIL if result["noisy"] else colors.OK |
| 481 | + printed += ( |
| 482 | + f'{result["noisy_permutations"].ljust(longest["noisy_permutations"])}' |
| 483 | + ) |
| 484 | + printed += colors.END_COLOR |
| 485 | + |
| 486 | + printed += " " |
| 487 | + printed += colors.FAIL if result["bad_mean"] else colors.OK |
| 488 | + printed += result["mean"].ljust(longest["mean"]) |
| 489 | + printed += colors.END_COLOR |
| 490 | + |
| 491 | + printed += " " |
| 492 | + printed += colors.FAIL if result["bad_median"] else colors.OK |
| 493 | + printed += result["median"].ljust(longest["median"]) |
| 494 | + printed += colors.END_COLOR |
| 495 | + |
| 496 | + printed += " " |
| 497 | + printed += colors.FAIL if result["bad_max"] else colors.OK |
| 498 | + printed += result["max"].ljust(longest["max"]) |
| 499 | + printed += colors.END_COLOR |
| 500 | + |
| 501 | + printed += " " |
| 502 | + printed += colors.FAIL if result["bad_batch"] else colors.OK |
| 503 | + printed += result["batch"].ljust(longest["batch"]) |
| 504 | + printed += colors.END_COLOR |
| 505 | + |
| 506 | + printed += " " |
| 507 | + printed += colors.FAIL if result["bad_warmup"] else colors.OK |
| 508 | + printed += result["warmup"].ljust(longest["warmup"]) |
| 509 | + printed += colors.END_COLOR |
| 510 | + |
| 511 | + printed += " " |
| 512 | + printed += colors.FAIL if result["bad_bytes"] else colors.OK |
| 513 | + printed += result["bytes"].ljust(longest["bytes"]) |
| 514 | + printed += colors.END_COLOR |
| 515 | + |
| 516 | + printed += " " |
| 517 | + printed += colors.FAIL if result["seed"] == "random" else colors.OK |
| 518 | + printed += result["seed"] |
| 519 | + printed += colors.END_COLOR |
| 520 | + |
| 521 | + print(printed) |
| 522 | + |
| 523 | + |
| 524 | +def get_results(benchmarks, threshold): |
| 525 | + def get_humanized_bytes(size): |
| 526 | + for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]: |
| 527 | + if size < 1024.0 or unit == "PiB": |
| 528 | + break |
| 529 | + size /= 1024.0 |
| 530 | + return f"{size:.1f} {unit}" |
| 531 | + |
| 532 | + success = True |
| 533 | + |
| 534 | + results = [] |
| 535 | + |
| 536 | + for benchmark in benchmarks: |
| 537 | + data = benchmark["data"] |
| 538 | + |
| 539 | + name = benchmark["name"] |
| 540 | + |
| 541 | + permutations = data["benchmarks"] |
| 542 | + |
| 543 | + cvs = [permutation["cv"] for permutation in permutations] |
| 544 | + |
| 545 | + # The cv (coefficient of variation) is a standard way of quantifying noise |
| 546 | + noises = sum(cv * 100 > threshold for cv in cvs) |
| 547 | + noisy = noises > 0 |
| 548 | + |
| 549 | + if noisy: |
| 550 | + success = False |
| 551 | + |
| 552 | + context = data["context"] |
| 553 | + |
| 554 | + noisy_permutations = f"{noises}/{len(permutations)}" |
| 555 | + |
| 556 | + mean = statistics.mean(cvs) |
| 557 | + median = statistics.median(cvs) |
| 558 | + max_ = max(cvs) |
| 559 | + |
| 560 | + batch = context["batch_iterations"] |
| 561 | + warmup = context["warmup_iterations"] |
| 562 | + |
| 563 | + bytes_ = int(context["size"]) |
| 564 | + seed = context["seed"] |
| 565 | + |
| 566 | + results.append( |
| 567 | + { |
| 568 | + "name": name, |
| 569 | + "noisy": noisy, |
| 570 | + "noisy_permutations": noisy_permutations, |
| 571 | + "bad_mean": mean * 100 > threshold, |
| 572 | + "mean": f"{mean:.1%}", |
| 573 | + "bad_median": median * 100 > threshold, |
| 574 | + "median": f"{median:.1%}", |
| 575 | + "bad_max": max_ * 100 > threshold, |
| 576 | + "max": f"{max_:.1%}", |
| 577 | + "bad_batch": int(batch) < 10, |
| 578 | + "batch": batch, |
| 579 | + "bad_warmup": int(warmup) < 5, |
| 580 | + "warmup": warmup, |
| 581 | + "bad_bytes": 0 < bytes_ < 128 * 1024 * 1024, # 128 MiB |
| 582 | + "bytes": get_humanized_bytes(int(context["size"])), |
| 583 | + "seed": seed, |
| 584 | + } |
| 585 | + ) |
| 586 | + |
| 587 | + return results, success |
| 588 | + |
| 589 | + |
| 590 | +def load_benchmarks(benchmark_json_dir): |
| 591 | + def is_benchmark_json(filename): |
| 592 | + if not re.match(r".*\.json$", filename): |
| 593 | + return False |
| 594 | + path = os.path.join(benchmark_json_dir, filename) |
| 595 | + st_mode = os.stat(path).st_mode |
| 596 | + |
| 597 | + # we are not interested in permissions, just whether it is a regular file (S_IFREG) |
| 598 | + return st_mode & stat.S_IFREG |
| 599 | + |
| 600 | + benchmark_names = [ |
| 601 | + name for name in os.listdir(benchmark_json_dir) if is_benchmark_json(name) |
| 602 | + ] |
| 603 | + |
| 604 | + success = True |
| 605 | + benchmarks = [] |
| 606 | + for benchmark_name in benchmark_names: |
| 607 | + with open(os.path.join(benchmark_json_dir, benchmark_name)) as f: |
| 608 | + try: |
| 609 | + benchmarks.append({"name": benchmark_name, "data": json.load(f)}) |
| 610 | + except json.JSONDecodeError as e: |
| 611 | + print( |
| 612 | + f"{colors.FAIL}Failed to load {benchmark_name}{colors.END_COLOR}: {e}\n", |
| 613 | + file=sys.stderr, |
| 614 | + ) |
| 615 | + success = False |
| 616 | + |
| 617 | + return benchmarks, success |
| 618 | + |
| 619 | + |
| 620 | +def main(): |
| 621 | + parser = argparse.ArgumentParser() |
| 622 | + parser.add_argument( |
| 623 | + "--noise_threshold_percentage", |
| 624 | + help="The noise threshold percentage, past which benchmark permutations are considered to be too noisy", |
| 625 | + required=True, |
| 626 | + type=float, |
| 627 | + ) |
| 628 | + parser.add_argument( |
| 629 | + "--benchmark_json_dir", |
| 630 | + help="The directory of benchmark JSON files, which to report the noise of", |
| 631 | + required=True, |
| 632 | + ) |
| 633 | + parser.add_argument( |
| 634 | + "--accept_high_noise", |
| 635 | + help="Don't call exit(1) when there is a noisy benchmark permutation", |
| 636 | + action=argparse.BooleanOptionalAction, |
| 637 | + ) |
| 638 | + args = parser.parse_args() |
| 639 | + |
| 640 | + print(f"The noise threshold is {args.noise_threshold_percentage:.1f}%\n") |
| 641 | + |
| 642 | + benchmarks, load_success = load_benchmarks(args.benchmark_json_dir) |
| 643 | + results, results_success = get_results(benchmarks, args.noise_threshold_percentage) |
| 644 | + |
| 645 | + print_results(results) |
| 646 | + |
| 647 | + if not load_success: |
| 648 | + return False |
| 649 | + if args.accept_high_noise: |
| 650 | + return True |
| 651 | + return results_success |
| 652 | + |
| 653 | + |
| 654 | +if __name__ == "__main__": |
| 655 | + success = main() |
| 656 | + if success: |
| 657 | + exit(0) |
| 658 | + else: |
| 659 | + exit(1) |
| 660 | diff --git a/.gitlab/run_benchmarks.py b/.gitlab/run_benchmarks.py |
| 661 | index f406ea4..d61b4d9 100755 |
| 662 | --- a/.gitlab/run_benchmarks.py |
| 663 | +++ b/.gitlab/run_benchmarks.py |
| 664 | @@ -1,6 +1,6 @@ |
| 665 | #!/usr/bin/env python3 |
| 666 | |
| 667 | -# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 668 | +# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 669 | # |
| 670 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 671 | # of this software and associated documentation files (the "Software"), to deal |
| 672 | @@ -22,13 +22,14 @@ |
| 673 | |
| 674 | import argparse |
| 675 | from collections import namedtuple |
| 676 | +import json |
| 677 | import os |
| 678 | import re |
| 679 | import stat |
| 680 | import subprocess |
| 681 | import sys |
| 682 | |
| 683 | -BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed']) |
| 684 | +BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed', 'skip_gathered']) |
| 685 | |
| 686 | def run_benchmarks(benchmark_context): |
| 687 | def is_benchmark_executable(filename): |
| 688 | @@ -41,6 +42,18 @@ def run_benchmarks(benchmark_context): |
| 689 | # and it is a regular file (S_IFREG) |
| 690 | return (st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) and (st_mode & stat.S_IFREG) |
| 691 | |
| 692 | + def should_skip(results_json_path): |
| 693 | + if not benchmark_context.skip_gathered: |
| 694 | + return False |
| 695 | + |
| 696 | + try: |
| 697 | + with open(results_json_path) as f: |
| 698 | + json.load(f) |
| 699 | + except (FileNotFoundError, json.JSONDecodeError): |
| 700 | + return False |
| 701 | + |
| 702 | + return True |
| 703 | + |
| 704 | success = True |
| 705 | benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)] |
| 706 | print('The following benchmarks will be ran:\n{}'.format('\n'.join(benchmark_names)), file=sys.stderr, flush=True) |
| 707 | @@ -49,11 +62,11 @@ def run_benchmarks(benchmark_context): |
| 708 | |
| 709 | benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name) |
| 710 | results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name) |
| 711 | + if should_skip(results_json_path): |
| 712 | + print(f'Skipping {benchmark_name}, because its results have already been gathered at {results_json_path}', file=sys.stderr, flush=True) |
| 713 | + continue |
| 714 | args = [ |
| 715 | benchmark_path, |
| 716 | - '--name_format', |
| 717 | - 'json', |
| 718 | - '--benchmark_out_format=json', |
| 719 | f'--benchmark_out={results_json_path}', |
| 720 | f'--benchmark_filter={benchmark_context.benchmark_filter_regex}' |
| 721 | ] |
| 722 | @@ -103,6 +116,11 @@ def main(): |
| 723 | help='Controls the seed for random number generation for each benchmark case', |
| 724 | default='', |
| 725 | required=False) |
| 726 | + parser.add_argument('--skip_gathered', |
| 727 | + help='Skip running benchmarks whose JSON data has already been gathered', |
| 728 | + default=False, |
| 729 | + action='store_true', |
| 730 | + required=False) |
| 731 | |
| 732 | args = parser.parse_args() |
| 733 | |
| 734 | @@ -114,7 +132,8 @@ def main(): |
| 735 | args.benchmark_filter_regex, |
| 736 | args.size, |
| 737 | args.trials, |
| 738 | - args.seed) |
| 739 | + args.seed, |
| 740 | + args.skip_gathered) |
| 741 | |
| 742 | benchmark_run_successful = run_benchmarks(benchmark_context) |
| 743 | |
| 744 | diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy |
| 745 | deleted file mode 100644 |
| 746 | index d49a782..0000000 |
| 747 | --- a/.jenkins/common.groovy |
| 748 | +++ /dev/null |
| 749 | @@ -1,76 +0,0 @@ |
| 750 | -// This file is for internal AMD use. |
| 751 | -// If you are interested in running your own Jenkins, please raise a github issue for assistance. |
| 752 | - |
| 753 | -def runCompileCommand(platform, project, jobName, boolean debug=false) |
| 754 | -{ |
| 755 | - project.paths.construct_build_prefix() |
| 756 | - |
| 757 | - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' |
| 758 | - String buildTypeDir = debug ? 'debug' : 'release' |
| 759 | - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' |
| 760 | - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library |
| 761 | - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' |
| 762 | - |
| 763 | - def command = """#!/usr/bin/env bash |
| 764 | - set -x |
| 765 | - cd ${project.paths.project_build_prefix} |
| 766 | - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} |
| 767 | - ${auxiliary.gfxTargetParser()} |
| 768 | - ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. |
| 769 | - make -j\$(nproc) |
| 770 | - """ |
| 771 | - |
| 772 | - platform.runCommand(this, command) |
| 773 | -} |
| 774 | - |
| 775 | - |
| 776 | -def runTestCommand (platform, project) |
| 777 | -{ |
| 778 | - String sudo = auxiliary.sudo(platform.jenkinsLabel) |
| 779 | - |
| 780 | - def testCommand = "ctest --output-on-failure " |
| 781 | - def testCommandExcludeRegex = '' |
| 782 | - def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\"" |
| 783 | - def hmmExcludeRegex = '' |
| 784 | - def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\"" |
| 785 | - def hmmTestCommand = '' |
| 786 | - if (platform.jenkinsLabel.contains('gfx90a')) |
| 787 | - { |
| 788 | - echo("HMM TESTS DISABLED") |
| 789 | - /*hmmTestCommand = """ |
| 790 | - export HSA_XNACK=1 |
| 791 | - export ROCPRIM_USE_HMM=1 |
| 792 | - ${testCommand} ${hmmTestCommandExclude} |
| 793 | - """*/ |
| 794 | - } |
| 795 | - echo(env.JOB_NAME) |
| 796 | - if (env.JOB_NAME.contains('bleeding-edge')) |
| 797 | - { |
| 798 | - testCommand = '' |
| 799 | - testCommandExclude = '' |
| 800 | - hmmTestCommand = '' |
| 801 | - echo("TESTS DISABLED") |
| 802 | - } |
| 803 | - def command = """#!/usr/bin/env bash |
| 804 | - set -x |
| 805 | - cd ${project.paths.project_build_prefix} |
| 806 | - cd ${project.testDirectory} |
| 807 | - ${testCommand} ${testCommandExclude} |
| 808 | - if (( \$? != 0 )); then |
| 809 | - exit 1 |
| 810 | - fi |
| 811 | - ${hmmTestCommand} |
| 812 | - """ |
| 813 | - |
| 814 | - platform.runCommand(this, command) |
| 815 | -} |
| 816 | - |
| 817 | -def runPackageCommand(platform, project) |
| 818 | -{ |
| 819 | - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") |
| 820 | - |
| 821 | - platform.runCommand(this, packageHelper[0]) |
| 822 | - platform.archiveArtifacts(this, packageHelper[1]) |
| 823 | -} |
| 824 | - |
| 825 | -return this |
| 826 | diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy |
| 827 | deleted file mode 100644 |
| 828 | index dd32cad..0000000 |
| 829 | --- a/.jenkins/precheckin.groovy |
| 830 | +++ /dev/null |
| 831 | @@ -1,81 +0,0 @@ |
| 832 | -#!/usr/bin/env groovy |
| 833 | -@Library('rocJenkins@pong') _ |
| 834 | -import com.amd.project.* |
| 835 | -import com.amd.docker.* |
| 836 | -import java.nio.file.Path; |
| 837 | - |
| 838 | -def runCI = |
| 839 | -{ |
| 840 | - nodeDetails, jobName-> |
| 841 | - |
| 842 | - def prj = new rocProject('rocPRIM', 'PreCheckin') |
| 843 | - prj.paths.build_command = './install -c' |
| 844 | - prj.timeout.compile = 600 |
| 845 | - |
| 846 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 847 | - |
| 848 | - def commonGroovy |
| 849 | - |
| 850 | - boolean formatCheck = false |
| 851 | - |
| 852 | - def compileCommand = |
| 853 | - { |
| 854 | - platform, project-> |
| 855 | - |
| 856 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 857 | - commonGroovy.runCompileCommand(platform, project, jobName) |
| 858 | - } |
| 859 | - |
| 860 | - def testCommand = |
| 861 | - { |
| 862 | - platform, project-> |
| 863 | - |
| 864 | - commonGroovy.runTestCommand(platform, project) |
| 865 | - } |
| 866 | - |
| 867 | - def packageCommand = |
| 868 | - { |
| 869 | - platform, project-> |
| 870 | - |
| 871 | - commonGroovy.runPackageCommand(platform, project) |
| 872 | - } |
| 873 | - |
| 874 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 875 | -} |
| 876 | - |
| 877 | -ci: { |
| 878 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 879 | - |
| 880 | - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], |
| 881 | - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], |
| 882 | - "rocm-docker":[]] |
| 883 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 884 | - |
| 885 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] |
| 886 | - jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') |
| 887 | - |
| 888 | - propertyList.each |
| 889 | - { |
| 890 | - jobName, property-> |
| 891 | - if (urlJobName == jobName) |
| 892 | - properties(auxiliary.addCommonProperties(property)) |
| 893 | - } |
| 894 | - |
| 895 | - jobNameList.each |
| 896 | - { |
| 897 | - jobName, nodeDetails-> |
| 898 | - if (urlJobName == jobName) |
| 899 | - stage(jobName) { |
| 900 | - runCI(nodeDetails, jobName) |
| 901 | - } |
| 902 | - } |
| 903 | - |
| 904 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 905 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 906 | - { |
| 907 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 908 | - stage(urlJobName) { |
| 909 | - runCI([ubuntu16:['gfx906']], urlJobName) |
| 910 | - } |
| 911 | - } |
| 912 | -} |
| 913 | diff --git a/.jenkins/static.groovy b/.jenkins/static.groovy |
| 914 | deleted file mode 100644 |
| 915 | index 9466b27..0000000 |
| 916 | --- a/.jenkins/static.groovy |
| 917 | +++ /dev/null |
| 918 | @@ -1,82 +0,0 @@ |
| 919 | -#!/usr/bin/env groovy |
| 920 | -@Library('rocJenkins@pong') _ |
| 921 | -import com.amd.project.* |
| 922 | -import com.amd.docker.* |
| 923 | -import java.nio.file.Path; |
| 924 | - |
| 925 | -def runCI = |
| 926 | -{ |
| 927 | - nodeDetails, jobName-> |
| 928 | - |
| 929 | - def prj = new rocProject('rocPRIM', 'static') |
| 930 | - prj.paths.build_command = './install -c -s' |
| 931 | - prj.timeout.compile = 600 |
| 932 | - prj.timeout.packaging = 120 |
| 933 | - |
| 934 | - def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 935 | - |
| 936 | - def commonGroovy |
| 937 | - |
| 938 | - boolean formatCheck = false |
| 939 | - |
| 940 | - def compileCommand = |
| 941 | - { |
| 942 | - platform, project-> |
| 943 | - |
| 944 | - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" |
| 945 | - commonGroovy.runCompileCommand(platform, project, jobName) |
| 946 | - } |
| 947 | - |
| 948 | - def testCommand = |
| 949 | - { |
| 950 | - platform, project-> |
| 951 | - |
| 952 | - commonGroovy.runTestCommand(platform, project) |
| 953 | - } |
| 954 | - |
| 955 | - def packageCommand = |
| 956 | - { |
| 957 | - platform, project-> |
| 958 | - |
| 959 | - commonGroovy.runPackageCommand(platform, project) |
| 960 | - } |
| 961 | - |
| 962 | - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) |
| 963 | -} |
| 964 | - |
| 965 | -ci: { |
| 966 | - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 967 | - |
| 968 | - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], |
| 969 | - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], |
| 970 | - "rocm-docker":[]] |
| 971 | - propertyList = auxiliary.appendPropertyList(propertyList) |
| 972 | - |
| 973 | - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] |
| 974 | - jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') |
| 975 | - |
| 976 | - propertyList.each |
| 977 | - { |
| 978 | - jobName, property-> |
| 979 | - if (urlJobName == jobName) |
| 980 | - properties(auxiliary.addCommonProperties(property)) |
| 981 | - } |
| 982 | - |
| 983 | - jobNameList.each |
| 984 | - { |
| 985 | - jobName, nodeDetails-> |
| 986 | - if (urlJobName == jobName) |
| 987 | - stage(jobName) { |
| 988 | - runCI(nodeDetails, jobName) |
| 989 | - } |
| 990 | - } |
| 991 | - |
| 992 | - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 |
| 993 | - if(!jobNameList.keySet().contains(urlJobName)) |
| 994 | - { |
| 995 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 996 | - stage(urlJobName) { |
| 997 | - runCI([ubuntu16:['gfx906']], urlJobName) |
| 998 | - } |
| 999 | - } |
| 1000 | -} |
| 1001 | diff --git a/CHANGELOG.md b/CHANGELOG.md |
| 1002 | index 624c2f5..8fcaa79 100644 |
| 1003 | --- a/CHANGELOG.md |
| 1004 | +++ b/CHANGELOG.md |
| 1005 | @@ -2,19 +2,177 @@ |
| 1006 | |
| 1007 | Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). |
| 1008 | |
| 1009 | +## rocPRIM 4.1.0 for ROCm 7.1 |
| 1010 | + |
| 1011 | +### Added |
| 1012 | + |
| 1013 | +* Added `get_sreg_lanemask_lt`, `get_sreg_lanemask_le`, `get_sreg_lanemask_gt` and `get_sreg_lanemask_ge`. |
| 1014 | +* Added `rocprim::transform_output_iterator` and `rocprim::make_transform_output_iterator`. |
| 1015 | +* Added experimental support for SPIR-V, to use the correct tuned config for part of the appliable algorithms. |
| 1016 | +* Added a new cmake option, `BUILD_OFFLOAD_COMPRESS`. When rocPRIM is build with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful in cases where you are compiling for a large number of targets, since this often results in a large binary. Without compression, in some cases, the generated binary may become so large symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default. |
| 1017 | +* Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from `ROCm` libraries provided by the system. |
| 1018 | +* Added `rocprim::apply` which applies a function to a `rocprim::tuple`. |
| 1019 | + |
| 1020 | +### Changed |
| 1021 | + |
| 1022 | +* Changed tests to support `ptr-to-const` output in `/test/rocprim/test_device_batch_memcpy.cpp`. |
| 1023 | + |
| 1024 | +### Optimizations |
| 1025 | + |
| 1026 | +* Improved performance of many algorithms, by updating their tuned configs. |
| 1027 | + * 891 specializations have been improved. |
| 1028 | + * 399 specializations have been added. |
| 1029 | + |
| 1030 | +### Upcoming changes |
| 1031 | + |
| 1032 | +* Deprecated the `->` operator for the `zip_iterator`. |
| 1033 | + |
| 1034 | +### Resolved issues |
| 1035 | + |
| 1036 | +* Fixed `device_select`, `device_merge`, and `device_merge_sort` not allocating the correct amount of virtual shared memory on the host. |
| 1037 | +* Fixed the `->` operator for the `transform_iterator`, the `texture_cache_iterator` and the `arg_index_iterator`, by now returning a proxy pointer. |
| 1038 | + * The `arg_index_iterator` also now only returns the internal iterator for the `->`. |
| 1039 | + |
| 1040 | +## rocPRIM 4.0.1 for ROCm 7.0.2 |
| 1041 | + |
| 1042 | +### Resolved issues |
| 1043 | + |
| 1044 | +* Fixed compilation issue when using `rocprim::texture_cache_iterator`. |
| 1045 | +* Fixed a HIP version check used to determine whether hipStreamLegacy is supported. This resolves runtime errors that occur when hipStreamLegacy is used in versions of ROCm later than 6.4. |
| 1046 | + |
| 1047 | +## rocPRIM 4.0.0 for ROCm 7.0 |
| 1048 | + |
| 1049 | +### Added |
| 1050 | + |
| 1051 | +* Added `rocprim::accumulator_t` to ensure parity with CCCL. |
| 1052 | +* Added test for `rocprim::accumulator_t` |
| 1053 | +* Added `rocprim::invoke_result_r` to ensure parity with CCCL. |
| 1054 | +* Added function `is_build_in` into `rocprim::traits::get`. |
| 1055 | +* Added virtual shared memory as a fallback option in `rocprim::device_merge` when it exceeds shared memory capacity, similar to `rocprim::device_select`, `rocprim::device_partition`, and `rocprim::device_merge_sort`, which already include this feature. |
| 1056 | +* Added initial value support to device level inclusive scans. |
| 1057 | +* Added new optimization to the backend for `device_transform` when the input and output are pointers. |
| 1058 | +* Added `LoadType` to `transform_config`, which is used for the `device_transform` when the input and output are pointers. |
| 1059 | +* Added `rocprim:device_transform` for n-ary transform operations API with as input `n` number of iterators inside a `rocprim::tuple`. |
| 1060 | +* Added gfx950 support. |
| 1061 | +* Added `rocprim::key_value_pair::operator==`. |
| 1062 | +* Added the `rocprim::unrolled_copy` thread function to copy multiple items inside a thread. |
| 1063 | +* Added the `rocprim::unrolled_thread_load` function to load multiple items inside a thread using `rocprim::thread_load`. |
| 1064 | +* Added `rocprim::int128_t` and `rocprim::uint128_t` to benchmarks for improved performance evaluation on 128-bit integers. |
| 1065 | +* Added `rocprim::int128_t` to the supported autotuning types to improve performance for 128-bit integers. |
| 1066 | +* Added the `rocprim::merge_inplace` function for merging in-place. |
| 1067 | +* Added initial value support for warp- and block-level inclusive scan. |
| 1068 | +* Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. |
| 1069 | +* Added tests and documentation to `lookback_scan_state`. It is still in the `detail` namespace. |
| 1070 | + |
| 1071 | +### Optimizations |
| 1072 | + |
| 1073 | +* Improved performance of `rocprim::device_select` and `rocprim::device_partition` when using multiple streams on the MI3XX architecture. |
| 1074 | + |
| 1075 | +### Changed |
| 1076 | + |
| 1077 | +* Changed the parameters `long_radix_bits` and `LongRadixBits` from `segmented_radix_sort` to `radix_bits` and `RadixBits` respectively. |
| 1078 | +* Marked the initialisation constructor of `rocprim::reverse_iterator<Iter>` `explicit`, use `rocprim::make_reverse_iterator`. |
| 1079 | +* Merged `radix_key_codec` into type_traits system. |
| 1080 | +* Renamed `type_traits_interface.hpp` to `type_traits.hpp`, rename the original `type_traits.hpp` to `type_traits_functions.hpp`. |
| 1081 | +* The default scan accumulator types for device-level scan algorithms have changed. This is a breaking change. |
| 1082 | +The previous default accumulator types could lead to situations in which unexpected overflow occured, such as |
| 1083 | +when the input or inital type was smaller than the output type. |
| 1084 | + * This is a complete list of affected functions and how their default accumulator types are changing: |
| 1085 | + * `rocprim::inclusive_scan` |
| 1086 | + * Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` |
| 1087 | + * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>` |
| 1088 | + * `rocprim::deterministic_inclusive_scan` |
| 1089 | + * Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` |
| 1090 | + * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>` |
| 1091 | + * `rocprim::exclusive_scan` |
| 1092 | + * Previous default: `class AccType = detail::input_type_t<InitValueType>>` |
| 1093 | + * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, rocprim::detail::input_type_t<InitValueType>>` |
| 1094 | + * `rocprim::deterministic_exclusive_scan` |
| 1095 | + * Previous default: `class AccType = detail::input_type_t<InitValueType>>` |
| 1096 | + * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, rocprim::detail::input_type_t<InitValueType>>` |
| 1097 | +* Undeprecated internal `detail::raw_storage`. |
| 1098 | +* A new version of `rocprim::thread_load` and `rocprim::thread_store` replace the deprecated `rocprim::thread_load` and `rocprim::thread_store` functions. The versions avoid inline assembly where possible, and don't hinder the optimizer as much as a result. |
| 1099 | +* Renamed `rocprim::load_cs` to `rocprim::load_nontemporal` and `rocprim::store_cs` to `rocprim::store_nontemporal` to express the intent of these load and store methods better. |
| 1100 | +* All kernels now have hidden symbol visibility. All symbols now have inline namespaces that include the library version, for example, `rocprim::ROCPRIM_300400_NS::symbol` instead of `rocPRIM::symbol`, letting the user link multiple libraries built with different versions of rocPRIM. |
| 1101 | + |
| 1102 | +### Upcoming changes |
| 1103 | + |
| 1104 | +* `rocprim::invoke_result_binary_op` and `rocprim::invoke_result_binary_op_t` are deprecated. Use `rocprim::accumulator_t` now. |
| 1105 | + |
| 1106 | +### Removed |
| 1107 | + |
| 1108 | +* Removed `rocprim::detail::float_bit_mask` and relative tests, use `rocprim::traits::float_bit_mask` instead. |
| 1109 | +* Removed `rocprim::traits::is_fundamental`, please use `rocprim::traits::get<T>::is_fundamental()` directly. |
| 1110 | +* Removed the deprecated parameters `short_radix_bits` and `ShortRadixBits` from the `segmented_radix_sort` config. They were unused, it is only an API change. |
| 1111 | +* Removed the deprecated `operator<<` from the iterators. |
| 1112 | +* Removed the deprecated `TwiddleIn` and `TwiddleOut`. Use `radix_key_codec` instead. |
| 1113 | +* Removed the deprecated flags API of `block_adjacent_difference`. Use `subtract_left()` or `block_discontinuity::flag_heads()` instead. |
| 1114 | +* Removed the deprecated `to_exclusive` functions in the warp scans. |
| 1115 | +* Removed the `rocprim::load_cs` from the `cache_load_modifier` enum. Use `rocprim::load_nontemporal` instead. |
| 1116 | +* Removed the `rocprim::store_cs` from the `cache_store_modifier` enum. Use `rocprim::store_nontemporal` instead. |
| 1117 | +* Removed the deprecated header file `rocprim/detail/match_result_type.hpp`. Include `rocprim/type_traits.hpp` instead. |
| 1118 | + * This header included `rocprim::detail::invoke_result`. Use `rocprim::invoke_result` instead. |
| 1119 | + * This header included `rocprim::detail::invoke_result_binary_op`. Use `rocprim::invoke_result_binary_op` instead. |
| 1120 | + * This header included `rocprim::detail::match_result_type`. Use `rocprim::invoke_result_binary_op_t` instead. |
| 1121 | +* Removed the deprecated `rocprim::detail::radix_key_codec` function. Use `rocprim::radix_key_codec` instead. |
| 1122 | +* Removed `rocprim/detail/radix_sort.hpp`, functionality can now be found in `rocprim/thread/radix_key_codec.hpp`. |
| 1123 | +* Removed C++14 support, only C++17 is supported. |
| 1124 | +* Due to the removal of `__AMDGCN_WAVEFRONT_SIZE` in the compiler, the following deprecated warp size-related symbols have been removed: |
| 1125 | + * `rocprim::device_warp_size()` |
| 1126 | + * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory. |
| 1127 | + * For run-time constants, this is replaced with `rocprim::arch::wavefront::size().` |
| 1128 | + * `rocprim::warp_size()` |
| 1129 | + * Use `rocprim::host_warp_size()`, `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead. |
| 1130 | + * `ROCPRIM_WAVEFRONT_SIZE` |
| 1131 | + * Use `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead. |
| 1132 | + * `__AMDGCN_WAVEFRONT_SIZE` |
| 1133 | + * This was a fallback define for the compiler's removed symbol, having the same name. |
| 1134 | +* This release removes support for custom builds on gfx940 and gfx941. |
| 1135 | + |
| 1136 | +### Resolved issues |
| 1137 | + |
| 1138 | +* Fixed an issue where `device_batch_memcpy` reported benchmarking throughput being 2x lower than it was in reality. |
| 1139 | +* Fixed an issue where `device_segmented_reduce` reported autotuning throughput being 5x lower than it was in reality. |
| 1140 | +* Fixed device radix sort not returning the correct required temporary storage when a double buffer contains `nullptr`. |
| 1141 | +* Fixed constness of equality operators (`==` and `!=`) in `rocprim::key_value_pair`. |
| 1142 | +* Fixed an issue for the comparison operators in `arg_index_iterator` and `texture_cache_iterator`, where `<` and `>` comparators were swapped. |
| 1143 | +* Fixed an issue for the `rocprim::thread_reduce` not working correctly with a prefix value. |
| 1144 | + |
| 1145 | +### Known issues |
| 1146 | +* When using `rocprim::deterministic_inclusive_scan_by_key` and `rocprim::deterministic_exclusive_scan_by_key` the intermediate values can change order on Navi3x |
| 1147 | + * However if a commutative scan operator is used then the final scan value (output array) will still always be consistent between runs |
| 1148 | + |
| 1149 | ## rocPRIM 3.4.1 for ROCm 6.4.2 |
| 1150 | |
| 1151 | ### Upcoming changes |
| 1152 | * Changes to the template parameters of warp and block algorithms will be made in an upcoming release. |
| 1153 | |
| 1154 | -### Deprecations |
| 1155 | * Due to an upcoming compiler change the following warp size-related symbols will be removed in the next major release and are thus marked as deprecated: |
| 1156 | * `rocprim::device_warp_size()` |
| 1157 | * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory. |
| 1158 | * For run-time constants, this is replaced with `rocprim::arch::wavefront::size().` |
| 1159 | * `rocprim::warp_size()` |
| 1160 | - * `ROCPRIM_WAVEFRONT_SIZE |
| 1161 | + * `ROCPRIM_WAVEFRONT_SIZE` |
| 1162 | |
| 1163 | +* The default scan accumulator types for device-level scan algorithms will be changed in an upcoming release, resulting in a breaking change. Previously, the default accumulator type was set to the input type for the inclusive scans and to the initial value type for the exclusive scans. This could lead to unexpected overflow if the input or initial type was smaller than the output type when the accumulator type was't explicitly set using the `AccType` template parameter. The new default accumulator types will be set to the type that results when the input or initial value type is applied to the scan operator. |
| 1164 | + |
| 1165 | + The following is the complete list of affected functions and how their default accumulator types are changing: |
| 1166 | + |
| 1167 | + * `rocprim::inclusive_scan` |
| 1168 | + * current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` |
| 1169 | + * future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>` |
| 1170 | + * `rocprim::deterministic_inclusive_scan` |
| 1171 | + * current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` |
| 1172 | + * future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>` |
| 1173 | + * `rocprim::exclusive_scan` |
| 1174 | + * current default: `class AccType = detail::input_type_t<InitValueType>>` |
| 1175 | + * future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>` |
| 1176 | + * `rocprim::deterministic_exclusive_scan` |
| 1177 | + * current default: `class AccType = detail::input_type_t<InitValueType>>` |
| 1178 | + * future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>` |
| 1179 | + |
| 1180 | +* `rocprim::load_cs` and `rocprim::store_cs` are deprecated and will be removed in an upcoming release. Alternatively, you can use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` to load and store values in specific conditions (like bypassing the cache) for `rocprim::thread_load` and `rocprim::thread_store`. |
| 1181 | + |
| 1182 | ## rocPRIM 3.4.0 for ROCm 6.4.0 |
| 1183 | |
| 1184 | ### Added |
| 1185 | @@ -34,8 +192,8 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec |
| 1186 | * Added the parallel `search` and `find_end` device functions similar to `std::search` and `std::find_end`, these functions search for the first and last occurrence of the sequence respectively. |
| 1187 | * Added a parallel device-level function, `rocprim::search_n`, similar to the C++ Standard Library `std::search_n` algorithm. |
| 1188 | * Added new constructors and a `base` function, and added `constexpr` specifier to all functions in `rocprim::reverse_iterator` to improve parity with the C++17 `std::reverse_iterator`. |
| 1189 | -* Added hipGraph support to device run-length-encode for non trivial runs (`rocprim::run_length_encode_non_trivial_runs`). |
| 1190 | -* Added configuration autotuning to device run-length-encode for non trivial runs (`rocprim::run_length_encode_non_trivial_runs`) for improved performance on selected architectures. |
| 1191 | +* Added hipGraph support to device run-length-encode for nontrivial runs (`rocprim::run_length_encode_non_trivial_runs`). |
| 1192 | +* Added configuration autotuning to device run-length-encode for nontrivial runs (`rocprim::run_length_encode_non_trivial_runs`) for improved performance on selected architectures. |
| 1193 | * Added configuration autotuning to device run-length-encode for trivial runs (`rocprim::run_length_encode`) for improved performance on selected architectures. |
| 1194 | * Added a new type traits interface to enable users to provide additional type trait information to rocPRIM, facilitating better compatibility with custom types. |
| 1195 | |
| 1196 | @@ -523,3 +681,5 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec |
| 1197 | |
| 1198 | * Switched to HIP-Clang as the default compiler |
| 1199 | * CMake searches for rocPRIM locally first; if t's not found, CMake downloads it from GitHub |
| 1200 | + |
| 1201 | + |
| 1202 | diff --git a/CMakeLists.txt b/CMakeLists.txt |
| 1203 | index f6579e5..20999f1 100644 |
| 1204 | --- a/CMakeLists.txt |
| 1205 | +++ b/CMakeLists.txt |
| 1206 | @@ -1,6 +1,6 @@ |
| 1207 | # MIT License |
| 1208 | # |
| 1209 | -# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1210 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1211 | # |
| 1212 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1213 | # of this software and associated documentation files (the "Software"), to deal |
| 1214 | @@ -41,11 +41,8 @@ set(CMAKE_HIP_STANDARD 14) |
| 1215 | set(CMAKE_HIP_STANDARD_REQUIRED ON) |
| 1216 | set(CMAKE_HIP_EXTENSIONS OFF) |
| 1217 | |
| 1218 | -# Set CXX standard |
| 1219 | -if (CMAKE_CXX_STANDARD EQUAL 14) |
| 1220 | - message(WARNING "C++14 will be deprecated in the next major release") |
| 1221 | -elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) |
| 1222 | - message(FATAL_ERROR "Only C++14 and C++17 are supported") |
| 1223 | +if(NOT CMAKE_CXX_STANDARD EQUAL 17) |
| 1224 | + message(FATAL_ERROR "Only C++17 is supported") |
| 1225 | endif() |
| 1226 | |
| 1227 | if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) |
| 1228 | @@ -68,16 +65,31 @@ include(CMakeDependentOption) |
| 1229 | # Disables building tests, benchmarks, examples |
| 1230 | option(ONLY_INSTALL "Only install" OFF) |
| 1231 | cmake_dependent_option(BUILD_TEST "Build tests (requires googletest)" OFF "NOT ONLY_INSTALL" OFF) |
| 1232 | +option(WITH_ROCRAND "Build tests with device-side data generation(requires rocRAND)" OFF) |
| 1233 | cmake_dependent_option(BUILD_BENCHMARK "Build benchmarks" OFF "NOT ONLY_INSTALL" OFF) |
| 1234 | cmake_dependent_option(BUILD_EXAMPLE "Build examples" OFF "NOT ONLY_INSTALL" OFF) |
| 1235 | option(BUILD_NAIVE_BENCHMARK "Build naive benchmarks" OFF) |
| 1236 | cmake_dependent_option(BUILD_DOCS "Build documentation (requires sphinx)" OFF "NOT ONLY_INSTALL" OFF) |
| 1237 | option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF) |
| 1238 | option(ROCPRIM_INSTALL "Enable installation of rocPRIM (projects embedding rocPRIM may want to turn this OFF)" ON) |
| 1239 | +option(ROCPRIM_ENABLE_ASSERTS "Enable asserts in release build)" OFF) |
| 1240 | +option(BUILD_OFFLOAD_COMPRESS "Build rocPRIM with offload compression" ON) |
| 1241 | +cmake_dependent_option(USE_SYSTEM_LIB "Use installed ROCm libs when building tests" OFF BUILD_TEST OFF) |
| 1242 | |
| 1243 | check_language(HIP) |
| 1244 | cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF) |
| 1245 | |
| 1246 | +include(CheckCXXCompilerFlag) |
| 1247 | + |
| 1248 | +if(BUILD_OFFLOAD_COMPRESS) |
| 1249 | + check_cxx_compiler_flag("--offload-compress" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) |
| 1250 | + if(CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) |
| 1251 | + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --offload-compress") |
| 1252 | + else() |
| 1253 | + message(STATUS "Warning: BUILD_OFFLOAD_COMPRESS=ON but flag not supported by compiler. Ignoring option.") |
| 1254 | + endif() |
| 1255 | +endif() |
| 1256 | + |
| 1257 | if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) |
| 1258 | set(ROCPRIM_PROJECT_IS_TOP_LEVEL TRUE) |
| 1259 | else() |
| 1260 | @@ -104,6 +116,21 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) |
| 1261 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") |
| 1262 | endif() |
| 1263 | |
| 1264 | +if(ROCPRIM_ENABLE_ASSERTS) |
| 1265 | + if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "") |
| 1266 | + string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE) |
| 1267 | + set(BUILD_TYPE_CXX_FLAGS "CMAKE_CXX_FLAGS_${BUILD_TYPE}") |
| 1268 | + set(BUILD_TYPE_C_FLAGS "CMAKE_C_FLAGS_${BUILD_TYPE}") |
| 1269 | + endif() |
| 1270 | + |
| 1271 | + string(REGEX REPLACE "-DNDEBUG( |$)" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") |
| 1272 | + string(REGEX REPLACE "-DNDEBUG( |$)" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") |
| 1273 | + if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "") |
| 1274 | + string(REGEX REPLACE "-DNDEBUG( |$)" "" ${BUILD_TYPE_CXX_FLAGS} "${${BUILD_TYPE_CXX_FLAGS}}") |
| 1275 | + string(REGEX REPLACE "-DNDEBUG( |$)" "" ${BUILD_TYPE_C_FLAGS} "${${BUILD_TYPE_C_FLAGS}}") |
| 1276 | + endif() |
| 1277 | +endif() |
| 1278 | + |
| 1279 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") |
| 1280 | |
| 1281 | if(DEFINED BUILD_SHARED_LIBS) |
| 1282 | @@ -115,6 +142,7 @@ set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared |
| 1283 | |
| 1284 | # Get dependencies (required here to get rocm-cmake) |
| 1285 | include(cmake/Dependencies.cmake) |
| 1286 | + |
| 1287 | # Use target ID syntax if supported for GPU_TARGETS |
| 1288 | if(USE_HIPCXX) |
| 1289 | enable_language(HIP) |
| 1290 | @@ -130,18 +158,24 @@ else() |
| 1291 | if(BUILD_ADDRESS_SANITIZER) |
| 1292 | # ASAN builds require xnack |
| 1293 | rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS |
| 1294 | - TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+" |
| 1295 | + TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+" |
| 1296 | ) |
| 1297 | else() |
| 1298 | rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS |
| 1299 | - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" |
| 1300 | + TARGETS "gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" |
| 1301 | ) |
| 1302 | endif() |
| 1303 | - |
| 1304 | set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) |
| 1305 | endif() |
| 1306 | endif() |
| 1307 | |
| 1308 | +# Compressed offload binaries are currently not working with the SPIR-V target |
| 1309 | +if("amdgcnspirv" IN_LIST GPU_TARGETS) |
| 1310 | + if(BUILD_OFFLOAD_COMPRESS) |
| 1311 | + message(FATAL_ERROR "Cannot combine SPIR-V and BUILD_OFFLOAD_COMPRESS") |
| 1312 | + endif() |
| 1313 | +endif() |
| 1314 | + |
| 1315 | # TODO: Fix VerifyCompiler for HIP on Windows |
| 1316 | if (NOT WIN32) |
| 1317 | include(cmake/VerifyCompiler.cmake) |
| 1318 | @@ -149,25 +183,15 @@ endif() |
| 1319 | list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm ${ROCM_ROOT}/llvm ${ROCM_ROOT} ${ROCM_ROOT}/hip) |
| 1320 | find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm) |
| 1321 | |
| 1322 | -# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG |
| 1323 | -option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF) |
| 1324 | -if(ROCPRIM_INSTALL AND BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) |
| 1325 | - rocm_wrap_header_dir( |
| 1326 | - "${PROJECT_SOURCE_DIR}/rocprim/include/rocprim" |
| 1327 | - WRAPPER_LOCATIONS rocprim/include/rocprim |
| 1328 | - OUTPUT_LOCATIONS rocprim/wrapper/include/rocprim |
| 1329 | - PATTERNS *.hpp |
| 1330 | - ) |
| 1331 | -endif() |
| 1332 | - |
| 1333 | if(BUILD_CODE_COVERAGE) |
| 1334 | add_compile_options(-fprofile-arcs -ftest-coverage) |
| 1335 | add_link_options(--coverage) |
| 1336 | endif() |
| 1337 | |
| 1338 | # Setup VERSION |
| 1339 | -set(VERSION_STRING "3.4.1") |
| 1340 | +set(VERSION_STRING "4.1.0") |
| 1341 | rocm_setup_version(VERSION ${VERSION_STRING}) |
| 1342 | +math(EXPR rocprim_VERSION_NUMBER "${rocprim_VERSION_MAJOR} * 100000 + ${rocprim_VERSION_MINOR} * 100 + ${rocprim_VERSION_PATCH}") |
| 1343 | |
| 1344 | # Print configuration summary |
| 1345 | include(cmake/Summary.cmake) |
| 1346 | @@ -182,6 +206,13 @@ endif() |
| 1347 | |
| 1348 | # Tests |
| 1349 | if(BUILD_TEST) |
| 1350 | + if(USE_SYSTEM_LIB) |
| 1351 | + find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") |
| 1352 | + if (NOT ${rocprim_VERSION} VERSION_EQUAL ${VERSION_STRING}) |
| 1353 | + message(WARNING "The installed rocprim version, ${rocprim_VERSION}, does not match project version ${VERSION_STRING}. Building tests with USE_SYSTEM_LIB=ON may not work properly.") |
| 1354 | + endif() |
| 1355 | + endif() |
| 1356 | + |
| 1357 | if (ROCPRIM_PROJECT_IS_TOP_LEVEL) |
| 1358 | rocm_package_setup_client_component(tests) |
| 1359 | endif() |
| 1360 | @@ -223,14 +254,14 @@ if (ROCPRIM_PROJECT_IS_TOP_LEVEL) |
| 1361 | rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}") |
| 1362 | rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}") |
| 1363 | |
| 1364 | - set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") |
| 1365 | + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md") |
| 1366 | set(CPACK_RPM_PACKAGE_LICENSE "MIT") |
| 1367 | |
| 1368 | set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) |
| 1369 | |
| 1370 | rocm_create_package( |
| 1371 | NAME rocprim |
| 1372 | - DESCRIPTION "Radeon Open Compute Parallel Primitives Library" |
| 1373 | + DESCRIPTION "rocPRIM is a header-only library that provides HIP parallel primitives." |
| 1374 | MAINTAINER "rocPRIM Maintainer <rocprim-maintainer@amd.com>" |
| 1375 | HEADER_ONLY |
| 1376 | ) |
| 1377 | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md |
| 1378 | index 77c2847..8769ab6 100644 |
| 1379 | --- a/CONTRIBUTING.md |
| 1380 | +++ b/CONTRIBUTING.md |
| 1381 | @@ -41,6 +41,8 @@ and performs well across a variety of input types and sizes. More specifically: |
| 1382 | - Tests and benchmarks must be instantiated with all supported data types. |
| 1383 | - If the algorithm uses multiple data types (for instance, if it uses different types for input and output), a selected and representative few combinations should be tested instead of the full combination matrix. |
| 1384 | |
| 1385 | +Any utility needed by the tests **and** benchmarks must be added to the appropriate header within the `common` folder. Non-common utilities may be hosted in the corresponding headers from the `test` or `benchmark` folders. For a more detailed description of the cases to be considered for adding new utilities, please check [common](/common/README.md). |
| 1386 | + |
| 1387 | We also employ automated testing and benchmarking via checks that are run when a pull request is created. |
| 1388 | These checks: |
| 1389 | - test all algorithms for correctness across a variety of input configurations (eg. types, sizes, etc.) |
| 1390 | diff --git a/LICENSE.txt b/LICENSE.md |
| 1391 | similarity index 94% |
| 1392 | rename from LICENSE.txt |
| 1393 | rename to LICENSE.md |
| 1394 | index ba22abe..4d43ac8 100644 |
| 1395 | --- a/LICENSE.txt |
| 1396 | +++ b/LICENSE.md |
| 1397 | @@ -1,6 +1,6 @@ |
| 1398 | MIT License |
| 1399 | |
| 1400 | -Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1401 | +Copyright (C) Advanced Micro Devices, Inc. |
| 1402 | |
| 1403 | Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1404 | of this software and associated documentation files (the "Software"), to deal |
| 1405 | @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 1406 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 1407 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 1408 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 1409 | -SOFTWARE. |
| 1410 | \ No newline at end of file |
| 1411 | +SOFTWARE. |
| 1412 | diff --git a/README.md b/README.md |
| 1413 | index 329b170..8caa96c 100644 |
| 1414 | --- a/README.md |
| 1415 | +++ b/README.md |
| 1416 | @@ -1,7 +1,7 @@ |
| 1417 | # rocPRIM |
| 1418 | |
| 1419 | > [!NOTE] |
| 1420 | -> The published documentation is available at [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1421 | +> The published rocPRIM documentation is available [here](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1422 | |
| 1423 | rocPRIM is a header-only library that provides HIP parallel primitives. You can use this library to |
| 1424 | develop performant GPU-accelerated code on AMD ROCm platforms. |
| 1425 | @@ -28,55 +28,6 @@ Optional: |
| 1426 | * Required only for benchmarks. Building benchmarks is off by default. |
| 1427 | * This is automatically downloaded and built by the CMake script. |
| 1428 | |
| 1429 | -## Documentation |
| 1430 | - |
| 1431 | -Documentation for rocPRIM is available at |
| 1432 | -[https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). |
| 1433 | - |
| 1434 | -### Build documentation locally |
| 1435 | - |
| 1436 | -```shell |
| 1437 | -# Change directory to rocPRIM docs |
| 1438 | -cd rocPRIM; cd docs |
| 1439 | - |
| 1440 | -# Install documentation dependencies |
| 1441 | -python3 -m pip install -r sphinx/requirements.txt |
| 1442 | - |
| 1443 | -# Build the documentation |
| 1444 | -python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html |
| 1445 | - |
| 1446 | -# To serve the HTML docs locally |
| 1447 | -cd _build/html |
| 1448 | -python3 -m http.server |
| 1449 | -``` |
| 1450 | - |
| 1451 | -### Build documentation via CMake |
| 1452 | - |
| 1453 | -Install [rocm-cmake](https://github.com/ROCm/rocm-cmake/) |
| 1454 | - |
| 1455 | -```shell |
| 1456 | -# Change directory to rocPRIM |
| 1457 | -cd rocPRIM |
| 1458 | - |
| 1459 | -# Install documentation dependencies |
| 1460 | -python3 -m pip install -r docs/sphinx/requirements.txt |
| 1461 | - |
| 1462 | -# Set C++ compiler |
| 1463 | -# This example uses hipcc and assumes it is at the path /usr/bin |
| 1464 | -export CXX=hipcc |
| 1465 | -export PATH=/usr/bin:$PATH |
| 1466 | - |
| 1467 | -# Configure the project |
| 1468 | -cmake -S . -B ./build -D BUILD_DOCS=ON |
| 1469 | - |
| 1470 | -# Build the documentation |
| 1471 | -cmake --build ./build --target doc |
| 1472 | - |
| 1473 | -# To serve the HTML docs locally |
| 1474 | -cd ./build/docs/html |
| 1475 | -python3 -m http.server |
| 1476 | -``` |
| 1477 | - |
| 1478 | ## Build and install |
| 1479 | |
| 1480 | You can build and install rocPRIM on Linux or Windows. |
| 1481 | @@ -104,6 +55,7 @@ You can build and install rocPRIM on Linux or Windows. |
| 1482 | # If you want to detect failures on a per GFX IP basis, setting it to some set of ips will create |
| 1483 | # separate tests with the ip name embedded into the test name. Building for all, but selecting |
| 1484 | # tests only of a specific architecture is possible for eg: ctest -R gfx803|gfx900 |
| 1485 | + # USE_SYSTEM_LIB - OFF by default. Setting this flag to ON will build tests from the installed ROCm libs provided by the system. This only takes effect when BUILD_TEST is ON. |
| 1486 | # |
| 1487 | # ! IMPORTANT ! |
| 1488 | # Set C++ compiler to HIP-clang. You can do it by adding 'CXX=<path-to-compiler>' |
| 1489 | @@ -314,9 +266,8 @@ different types and operations, by passing compile-time configuration structures |
| 1490 | parameter. The main "knobs" are usually the size of the block and the number of items processed by a |
| 1491 | single thread. |
| 1492 | |
| 1493 | -rocPRIM has built-in default configurations for each of its primitives. In order to use the included |
| 1494 | -configurations, you need to define the macro `ROCPRIM_TARGET_ARCH` as `803` if you want the |
| 1495 | -algorithms optimized for gfx803 GCN version, or to `900` for gfx900. |
| 1496 | +rocPRIM has built-in default configurations for each of its primitives, these will be used automatically |
| 1497 | +based on the input types and the target architecture from the stream used. |
| 1498 | |
| 1499 | ## hipCUB |
| 1500 | |
| 1501 | @@ -325,7 +276,86 @@ algorithms optimized for gfx803 GCN version, or to `900` for gfx900. |
| 1502 | [CUB](https://github.com/NVlabs/cub). You can use it to port projects that use the CUB library to the |
| 1503 | [HIP](https://github.com/ROCm/HIP) layer and run them on AMD hardware. In the |
| 1504 | [ROCm](https://rocm.docs.amd.com/en/latest/) environment, hipCUB uses the rocPRIM library as a |
| 1505 | -backend; on CUDA platforms, it uses CUB as a backend. |
| 1506 | +backend. |
| 1507 | + |
| 1508 | +## Building the documentation locally |
| 1509 | + |
| 1510 | +### Requirements |
| 1511 | + |
| 1512 | +#### Doxygen |
| 1513 | + |
| 1514 | +The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues. |
| 1515 | + |
| 1516 | +After you have downloaded Doxygen version 1.9.4: |
| 1517 | + |
| 1518 | +```shell |
| 1519 | +# Add doxygen to your PATH |
| 1520 | +echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc |
| 1521 | + |
| 1522 | +# Apply the updated .bashrc |
| 1523 | +source ~/.bashrc |
| 1524 | + |
| 1525 | +# Confirm that you are using version 1.9.4 |
| 1526 | +doxygen --version |
| 1527 | +``` |
| 1528 | + |
| 1529 | +#### Python |
| 1530 | + |
| 1531 | +The build system uses Python version 3.10. You can try using a newer version, but that might cause issues. |
| 1532 | + |
| 1533 | +You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation): |
| 1534 | + |
| 1535 | +```shell |
| 1536 | +# Install Python 3.10 |
| 1537 | +pyenv install 3.10 |
| 1538 | + |
| 1539 | +# Create a Python 3.10 virtual environment |
| 1540 | +pyenv virtualenv 3.10 venv_rocprim |
| 1541 | + |
| 1542 | +# Activate the virtual environment |
| 1543 | +pyenv activate venv_rocprim |
| 1544 | +``` |
| 1545 | + |
| 1546 | +### Building |
| 1547 | + |
| 1548 | +After cloning this repository, and `cd`ing into it: |
| 1549 | + |
| 1550 | +```shell |
| 1551 | +# Install Python dependencies |
| 1552 | +python3 -m pip install -r docs/sphinx/requirements.txt |
| 1553 | + |
| 1554 | +# Build the documentation |
| 1555 | +python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html |
| 1556 | +``` |
| 1557 | + |
| 1558 | +You can then open `docs/_build/html/index.html` in your browser to view the documentation. |
| 1559 | + |
| 1560 | +### Build documentation via CMake |
| 1561 | + |
| 1562 | +Install [rocm-cmake](https://github.com/ROCm/rocm-cmake/) |
| 1563 | + |
| 1564 | +```shell |
| 1565 | +# Change directory to rocPRIM |
| 1566 | +cd rocPRIM |
| 1567 | + |
| 1568 | +# Install documentation dependencies |
| 1569 | +python3 -m pip install -r docs/sphinx/requirements.txt |
| 1570 | + |
| 1571 | +# Set C++ compiler |
| 1572 | +# This example uses hipcc and assumes it is at the path /usr/bin |
| 1573 | +export CXX=hipcc |
| 1574 | +export PATH=/usr/bin:$PATH |
| 1575 | + |
| 1576 | +# Configure the project |
| 1577 | +cmake -S . -B ./build -D BUILD_DOCS=ON |
| 1578 | + |
| 1579 | +# Build the documentation |
| 1580 | +cmake --build ./build --target doc |
| 1581 | + |
| 1582 | +# To serve the HTML docs locally |
| 1583 | +cd ./build/docs/html |
| 1584 | +python3 -m http.server |
| 1585 | +``` |
| 1586 | |
| 1587 | ## Support |
| 1588 | |
| 1589 | diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt |
| 1590 | index fd8ee78..a45a6dd 100644 |
| 1591 | --- a/benchmark/CMakeLists.txt |
| 1592 | +++ b/benchmark/CMakeLists.txt |
| 1593 | @@ -1,6 +1,6 @@ |
| 1594 | # MIT License |
| 1595 | # |
| 1596 | -# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1597 | +# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1598 | # |
| 1599 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1600 | # of this software and associated documentation files (the "Software"), to deal |
| 1601 | @@ -147,6 +147,7 @@ add_rocprim_benchmark(benchmark_device_find_first_of.cpp) |
| 1602 | add_rocprim_benchmark(benchmark_device_find_end.cpp) |
| 1603 | add_rocprim_benchmark(benchmark_device_histogram.cpp) |
| 1604 | add_rocprim_benchmark(benchmark_device_merge.cpp) |
| 1605 | +add_rocprim_benchmark(benchmark_device_merge_inplace.cpp) |
| 1606 | add_rocprim_benchmark(benchmark_device_merge_sort.cpp) |
| 1607 | add_rocprim_benchmark(benchmark_device_merge_sort_block_sort.cpp) |
| 1608 | add_rocprim_benchmark(benchmark_device_merge_sort_block_merge.cpp) |
| 1609 | @@ -173,6 +174,7 @@ add_rocprim_benchmark(benchmark_device_segmented_radix_sort_keys.cpp) |
| 1610 | add_rocprim_benchmark(benchmark_device_segmented_radix_sort_pairs.cpp) |
| 1611 | add_rocprim_benchmark(benchmark_device_segmented_reduce.cpp) |
| 1612 | add_rocprim_benchmark(benchmark_device_transform.cpp) |
| 1613 | +add_rocprim_benchmark(benchmark_device_transform_pointer.cpp) |
| 1614 | add_rocprim_benchmark(benchmark_predicate_iterator.cpp) |
| 1615 | add_rocprim_benchmark(benchmark_warp_exchange.cpp) |
| 1616 | add_rocprim_benchmark(benchmark_warp_reduce.cpp) |
| 1617 | diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake |
| 1618 | index 37f3a97..acd4f12 100644 |
| 1619 | --- a/benchmark/ConfigAutotuneSettings.cmake |
| 1620 | +++ b/benchmark/ConfigAutotuneSettings.cmake |
| 1621 | @@ -1,6 +1,6 @@ |
| 1622 | # MIT License |
| 1623 | # |
| 1624 | -# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1625 | +# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1626 | # |
| 1627 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1628 | # of this software and associated documentation files (the "Software"), to deal |
| 1629 | @@ -21,18 +21,18 @@ |
| 1630 | # SOFTWARE. |
| 1631 | |
| 1632 | # All default fallback types as listed in scripts/autotune/fallback_config.json |
| 1633 | -set(TUNING_TYPES "int64_t int short int8_t double float rocprim::half") |
| 1634 | +set(TUNING_TYPES "rocprim::int128_t int64_t int short int8_t double float rocprim::half") |
| 1635 | # If config selection happens based on two types, the second type has limited fallbacks. The selection is based |
| 1636 | # on the size and it is ignored whether the type is floating-point or integral. The autotuning script uses the |
| 1637 | # benchmarks for the integral types as fallback, hence tuning for the floating-point types is not needed. |
| 1638 | -set(LIMITED_TUNING_TYPES "int64_t int short int8_t") |
| 1639 | +set(LIMITED_TUNING_TYPES "rocprim::int128_t int64_t int short int8_t") |
| 1640 | |
| 1641 | function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix) |
| 1642 | if(file STREQUAL "benchmark_device_adjacent_difference") |
| 1643 | - set(list_across_names "DataType;Left;InPlace;BlockSize" PARENT_SCOPE) |
| 1644 | + set(list_across_names "DataType;Left;Aliasing;BlockSize" PARENT_SCOPE) |
| 1645 | set(list_across "${TUNING_TYPES};\ |
| 1646 | -true;false true;32 64 128 256 512 1024" PARENT_SCOPE) |
| 1647 | - set(output_pattern_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@" PARENT_SCOPE) |
| 1648 | +true;no_alias in_place;32 64 128 256 512 1024" PARENT_SCOPE) |
| 1649 | + set(output_pattern_suffix "@DataType@_@Left@_@Aliasing@_@BlockSize@" PARENT_SCOPE) |
| 1650 | elseif(file STREQUAL "benchmark_device_adjacent_find") |
| 1651 | set(list_across_names "InputType;BlockSize" PARENT_SCOPE) |
| 1652 | set(list_across "${TUNING_TYPES};64 128 256 512 1024" PARENT_SCOPE) |
| 1653 | @@ -44,13 +44,13 @@ true;false true;32 64 128 256 512 1024" PARENT_SCOPE) |
| 1654 | elseif(file STREQUAL "benchmark_device_merge_sort_block_merge") |
| 1655 | set(list_across_names "KeyType;ValueType;BlockSize;UseMergePath" PARENT_SCOPE) |
| 1656 | set(list_across "\ |
| 1657 | -${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type<char,double>;\ |
| 1658 | +${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};\ |
| 1659 | 128 256 512 1024;true" PARENT_SCOPE) |
| 1660 | set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@UseMergePath@" PARENT_SCOPE) |
| 1661 | elseif(file STREQUAL "benchmark_device_merge_sort_block_sort") |
| 1662 | set(list_across_names "KeyType;ValueType;BlockSize;BlockSortMethod" PARENT_SCOPE) |
| 1663 | set(list_across "\ |
| 1664 | -${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type<char,double>;\ |
| 1665 | +${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};\ |
| 1666 | 256 512 1024;rocprim::block_sort_algorithm::stable_merge_sort" PARENT_SCOPE) |
| 1667 | set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@BlockSortMethod@" PARENT_SCOPE) |
| 1668 | elseif(file STREQUAL "benchmark_device_radix_sort_block_sort") |
| 1669 | @@ -85,24 +85,40 @@ ${TUNING_TYPES};${LIMITED_TUNING_TYPES};using_warp_scan reduce_then_scan" PARENT |
| 1670 | set(list_across "\ |
| 1671 | binary_search upper_bound lower_bound;${TUNING_TYPES};${LIMITED_TUNING_TYPES};64 128 256;1 2 4 8 16" PARENT_SCOPE) |
| 1672 | set(output_pattern_suffix "@SubAlgorithm@_@ValueType@_@OutputType@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE) |
| 1673 | + elseif(file STREQUAL "benchmark_device_search_n") |
| 1674 | + set(list_across_names "InputType;BlockSize;ItemsPerThread;Threshold" PARENT_SCOPE) |
| 1675 | + set(list_across "\ |
| 1676 | +${TUNING_TYPES};64 128 256 512 1024;1 2 4 8 16;4 8 12 16" PARENT_SCOPE) |
| 1677 | + set(output_pattern_suffix "@InputType@_@BlockSize@_@ItemsPerThread@_@Threshold@" PARENT_SCOPE) |
| 1678 | elseif(file STREQUAL "benchmark_device_segmented_radix_sort_keys") |
| 1679 | set(list_across_names "\ |
| 1680 | -KeyType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE) |
| 1681 | +KeyType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE) |
| 1682 | set(list_across "${TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE) |
| 1683 | set(output_pattern_suffix "\ |
| 1684 | -@KeyType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE) |
| 1685 | +@KeyType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE) |
| 1686 | elseif(file STREQUAL "benchmark_device_segmented_radix_sort_pairs") |
| 1687 | set(list_across_names "\ |
| 1688 | -KeyType;ValueType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE) |
| 1689 | - set(list_across "${TUNING_TYPES};int8_t;8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE) |
| 1690 | +KeyType;ValueType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE) |
| 1691 | + set(list_across "${TUNING_TYPES};${LIMITED_TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE) |
| 1692 | set(output_pattern_suffix "\ |
| 1693 | -@KeyType@_@ValueType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE) |
| 1694 | - elseif(file STREQUAL "benchmark_device_transform") |
| 1695 | +@KeyType@_@ValueType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE) |
| 1696 | +elseif(file STREQUAL "benchmark_device_segmented_reduce") |
| 1697 | + set(list_across_names "DataType;BlockSize;ItemsPerThread" PARENT_SCOPE) |
| 1698 | + set(list_across "\ |
| 1699 | +${TUNING_TYPES};64 128 256;1 2 4 8 16" PARENT_SCOPE) |
| 1700 | + set(output_pattern_suffix "@DataType@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE) |
| 1701 | +elseif(file STREQUAL "benchmark_device_transform") |
| 1702 | set(list_across_names "\ |
| 1703 | DataType;BlockSize;" PARENT_SCOPE) |
| 1704 | set(list_across "${TUNING_TYPES};64 128 256 512 1024" PARENT_SCOPE) |
| 1705 | set(output_pattern_suffix "\ |
| 1706 | @DataType@_@BlockSize@" PARENT_SCOPE) |
| 1707 | +elseif(file STREQUAL "benchmark_device_transform_pointer") |
| 1708 | + set(list_across_names "\ |
| 1709 | +DataType;BlockSize;LoadType" PARENT_SCOPE) |
| 1710 | + set(list_across "${TUNING_TYPES};64 128 256 512 1024;rocprim::load_default rocprim::load_nontemporal" PARENT_SCOPE) |
| 1711 | + set(output_pattern_suffix "\ |
| 1712 | +@DataType@_@BlockSize@_@LoadType@" PARENT_SCOPE) |
| 1713 | elseif(file STREQUAL "benchmark_device_partition") |
| 1714 | set(list_across_names "DataType;BlockSize" PARENT_SCOPE) |
| 1715 | set(list_across "${TUNING_TYPES};128 192 256 384 512" PARENT_SCOPE) |
| 1716 | diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp |
| 1717 | index 8d237f7..71af4c1 100644 |
| 1718 | --- a/benchmark/benchmark_block_adjacent_difference.cpp |
| 1719 | +++ b/benchmark/benchmark_block_adjacent_difference.cpp |
| 1720 | @@ -1,6 +1,6 @@ |
| 1721 | // MIT License |
| 1722 | // |
| 1723 | -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1724 | +// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1725 | // |
| 1726 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1727 | // of this software and associated documentation files (the "Software"), to deal |
| 1728 | @@ -21,11 +21,8 @@ |
| 1729 | // SOFTWARE. |
| 1730 | |
| 1731 | #include "benchmark_utils.hpp" |
| 1732 | -// CmdParser |
| 1733 | -#include "cmdparser.hpp" |
| 1734 | |
| 1735 | -// Google Benchmark |
| 1736 | -#include <benchmark/benchmark.h> |
| 1737 | +#include "../common/utils_device_ptr.hpp" |
| 1738 | |
| 1739 | // HIP API |
| 1740 | #include <hip/hip_runtime.h> |
| 1741 | @@ -34,83 +31,81 @@ |
| 1742 | #include <rocprim/block/block_adjacent_difference.hpp> |
| 1743 | #include <rocprim/block/block_load_func.hpp> |
| 1744 | #include <rocprim/block/block_store_func.hpp> |
| 1745 | +#include <rocprim/config.hpp> |
| 1746 | +#include <rocprim/functional.hpp> |
| 1747 | +#include <rocprim/intrinsics/thread.hpp> |
| 1748 | +#include <rocprim/types.hpp> |
| 1749 | |
| 1750 | -#include <algorithm> |
| 1751 | -#include <iostream> |
| 1752 | -#include <limits> |
| 1753 | +#include <cstddef> |
| 1754 | +#include <stdint.h> |
| 1755 | #include <string> |
| 1756 | +#include <type_traits> |
| 1757 | #include <vector> |
| 1758 | |
| 1759 | -#include <cstdio> |
| 1760 | -#include <cstdlib> |
| 1761 | - |
| 1762 | -#ifndef DEFAULT_N |
| 1763 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; |
| 1764 | -#endif |
| 1765 | - |
| 1766 | -namespace rp = rocprim; |
| 1767 | - |
| 1768 | -template <class Benchmark, |
| 1769 | - unsigned int BlockSize, |
| 1770 | - unsigned int ItemsPerThread, |
| 1771 | - bool WithTile, |
| 1772 | - typename... Args> |
| 1773 | -__global__ __launch_bounds__(BlockSize) void kernel(Args ...args) |
| 1774 | +template<typename Benchmark, |
| 1775 | + unsigned int BlockSize, |
| 1776 | + unsigned int ItemsPerThread, |
| 1777 | + bool WithTile, |
| 1778 | + typename... Args> |
| 1779 | +__global__ __launch_bounds__(BlockSize) |
| 1780 | +void kernel(Args... args) |
| 1781 | { |
| 1782 | Benchmark::template run<BlockSize, ItemsPerThread, WithTile>(args...); |
| 1783 | } |
| 1784 | |
| 1785 | struct subtract_left |
| 1786 | { |
| 1787 | - template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1788 | - __device__ static void run(const T* d_input, T* d_output, unsigned int trials) |
| 1789 | + template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1790 | + __device__ |
| 1791 | + static void run(const T* d_input, T* d_output, unsigned int trials) |
| 1792 | { |
| 1793 | - const unsigned int lid = threadIdx.x; |
| 1794 | + const unsigned int lid = threadIdx.x; |
| 1795 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 1796 | |
| 1797 | T input[ItemsPerThread]; |
| 1798 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1799 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1800 | |
| 1801 | - using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>; |
| 1802 | + using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>; |
| 1803 | __shared__ typename adjacent_diff_t::storage_type storage; |
| 1804 | |
| 1805 | ROCPRIM_NO_UNROLL |
| 1806 | - for(unsigned int trial = 0; trial < trials; trial++) |
| 1807 | + for(unsigned int trial = 0; trial < trials; ++trial) |
| 1808 | { |
| 1809 | T output[ItemsPerThread]; |
| 1810 | if(WithTile) |
| 1811 | { |
| 1812 | - adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, T(123), storage); |
| 1813 | + adjacent_diff_t().subtract_left(input, output, rocprim::minus<>{}, T(123), storage); |
| 1814 | } |
| 1815 | else |
| 1816 | { |
| 1817 | - adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, storage); |
| 1818 | + adjacent_diff_t().subtract_left(input, output, rocprim::minus<>{}, storage); |
| 1819 | } |
| 1820 | |
| 1821 | for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 1822 | { |
| 1823 | input[i] += output[i]; |
| 1824 | } |
| 1825 | - rp::syncthreads(); |
| 1826 | + rocprim::syncthreads(); |
| 1827 | } |
| 1828 | |
| 1829 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 1830 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 1831 | } |
| 1832 | }; |
| 1833 | |
| 1834 | struct subtract_left_partial |
| 1835 | { |
| 1836 | - template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1837 | - __device__ static void |
| 1838 | + template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1839 | + __device__ |
| 1840 | + static void |
| 1841 | run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) |
| 1842 | { |
| 1843 | - const unsigned int lid = threadIdx.x; |
| 1844 | + const unsigned int lid = threadIdx.x; |
| 1845 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 1846 | |
| 1847 | T input[ItemsPerThread]; |
| 1848 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1849 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1850 | |
| 1851 | - using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>; |
| 1852 | + using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>; |
| 1853 | __shared__ typename adjacent_diff_t::storage_type storage; |
| 1854 | |
| 1855 | unsigned int tile_size = tile_sizes[blockIdx.x]; |
| 1856 | @@ -119,16 +114,25 @@ struct subtract_left_partial |
| 1857 | const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; |
| 1858 | |
| 1859 | ROCPRIM_NO_UNROLL |
| 1860 | - for(unsigned int trial = 0; trial < trials; trial++) |
| 1861 | + for(unsigned int trial = 0; trial < trials; ++trial) |
| 1862 | { |
| 1863 | T output[ItemsPerThread]; |
| 1864 | if(WithTile) |
| 1865 | { |
| 1866 | - adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, T(123), tile_size, storage); |
| 1867 | + adjacent_diff_t().subtract_left_partial(input, |
| 1868 | + output, |
| 1869 | + rocprim::minus<>{}, |
| 1870 | + T(123), |
| 1871 | + tile_size, |
| 1872 | + storage); |
| 1873 | } |
| 1874 | else |
| 1875 | { |
| 1876 | - adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, tile_size, storage); |
| 1877 | + adjacent_diff_t().subtract_left_partial(input, |
| 1878 | + output, |
| 1879 | + rocprim::minus<>{}, |
| 1880 | + tile_size, |
| 1881 | + storage); |
| 1882 | } |
| 1883 | |
| 1884 | for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 1885 | @@ -138,66 +142,69 @@ struct subtract_left_partial |
| 1886 | |
| 1887 | // Change the tile_size to even out the distribution |
| 1888 | tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); |
| 1889 | - rp::syncthreads(); |
| 1890 | + rocprim::syncthreads(); |
| 1891 | } |
| 1892 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 1893 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 1894 | } |
| 1895 | }; |
| 1896 | |
| 1897 | struct subtract_right |
| 1898 | { |
| 1899 | - template <unsigned int BlockSize, |
| 1900 | - unsigned int ItemsPerThread, |
| 1901 | - bool WithTile, |
| 1902 | - typename T> |
| 1903 | - __device__ static void run(const T* d_input, T* d_output, unsigned int trials) |
| 1904 | + template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1905 | + __device__ |
| 1906 | + static void run(const T* d_input, T* d_output, unsigned int trials) |
| 1907 | { |
| 1908 | - const unsigned int lid = threadIdx.x; |
| 1909 | + const unsigned int lid = threadIdx.x; |
| 1910 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 1911 | |
| 1912 | T input[ItemsPerThread]; |
| 1913 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1914 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1915 | |
| 1916 | - using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>; |
| 1917 | + using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>; |
| 1918 | __shared__ typename adjacent_diff_t::storage_type storage; |
| 1919 | |
| 1920 | ROCPRIM_NO_UNROLL |
| 1921 | - for(unsigned int trial = 0; trial < trials; trial++) |
| 1922 | + for(unsigned int trial = 0; trial < trials; ++trial) |
| 1923 | { |
| 1924 | T output[ItemsPerThread]; |
| 1925 | if(WithTile) |
| 1926 | { |
| 1927 | - adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, T(123), storage); |
| 1928 | + adjacent_diff_t().subtract_right(input, |
| 1929 | + output, |
| 1930 | + rocprim::minus<>{}, |
| 1931 | + T(123), |
| 1932 | + storage); |
| 1933 | } |
| 1934 | else |
| 1935 | { |
| 1936 | - adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, storage); |
| 1937 | + adjacent_diff_t().subtract_right(input, output, rocprim::minus<>{}, storage); |
| 1938 | } |
| 1939 | |
| 1940 | for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 1941 | { |
| 1942 | input[i] += output[i]; |
| 1943 | } |
| 1944 | - rp::syncthreads(); |
| 1945 | + rocprim::syncthreads(); |
| 1946 | } |
| 1947 | |
| 1948 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 1949 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 1950 | } |
| 1951 | }; |
| 1952 | |
| 1953 | struct subtract_right_partial |
| 1954 | { |
| 1955 | - template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1956 | - __device__ static void |
| 1957 | + template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T> |
| 1958 | + __device__ |
| 1959 | + static void |
| 1960 | run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) |
| 1961 | { |
| 1962 | - const unsigned int lid = threadIdx.x; |
| 1963 | + const unsigned int lid = threadIdx.x; |
| 1964 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 1965 | |
| 1966 | T input[ItemsPerThread]; |
| 1967 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1968 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 1969 | |
| 1970 | - using adjacent_diff_t = rp::block_adjacent_difference<T, BlockSize>; |
| 1971 | + using adjacent_diff_t = rocprim::block_adjacent_difference<T, BlockSize>; |
| 1972 | __shared__ typename adjacent_diff_t::storage_type storage; |
| 1973 | |
| 1974 | unsigned int tile_size = tile_sizes[blockIdx.x]; |
| 1975 | @@ -205,10 +212,14 @@ struct subtract_right_partial |
| 1976 | const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; |
| 1977 | |
| 1978 | ROCPRIM_NO_UNROLL |
| 1979 | - for(unsigned int trial = 0; trial < trials; trial++) |
| 1980 | + for(unsigned int trial = 0; trial < trials; ++trial) |
| 1981 | { |
| 1982 | T output[ItemsPerThread]; |
| 1983 | - adjacent_diff_t().subtract_right_partial(input, output, rp::minus<>{}, tile_size, storage); |
| 1984 | + adjacent_diff_t().subtract_right_partial(input, |
| 1985 | + output, |
| 1986 | + rocprim::minus<>{}, |
| 1987 | + tile_size, |
| 1988 | + storage); |
| 1989 | |
| 1990 | for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 1991 | { |
| 1992 | @@ -216,27 +227,31 @@ struct subtract_right_partial |
| 1993 | } |
| 1994 | // Change the tile_size to even out the distribution |
| 1995 | tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); |
| 1996 | - rp::syncthreads(); |
| 1997 | + rocprim::syncthreads(); |
| 1998 | } |
| 1999 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2000 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2001 | } |
| 2002 | }; |
| 2003 | |
| 2004 | -template<class Benchmark, |
| 2005 | - class T, |
| 2006 | +template<typename Benchmark, |
| 2007 | + typename T, |
| 2008 | unsigned int BlockSize, |
| 2009 | unsigned int ItemsPerThread, |
| 2010 | bool WithTile, |
| 2011 | unsigned int Trials = 100> |
| 2012 | -auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) |
| 2013 | +auto run_benchmark(benchmark_utils::state&& state) |
| 2014 | -> std::enable_if_t<!std::is_same<Benchmark, subtract_left_partial>::value |
| 2015 | && !std::is_same<Benchmark, subtract_right_partial>::value> |
| 2016 | { |
| 2017 | + const auto& bytes = state.bytes; |
| 2018 | + const auto& seed = state.seed; |
| 2019 | + const auto& stream = state.stream; |
| 2020 | + |
| 2021 | // Calculate the number of elements N |
| 2022 | size_t N = bytes / sizeof(T); |
| 2023 | - |
| 2024 | + |
| 2025 | constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 2026 | - const auto num_blocks = (N + items_per_block - 1) / items_per_block; |
| 2027 | + const auto num_blocks = (N + items_per_block - 1) / items_per_block; |
| 2028 | // Round up size to the next multiple of items_per_block |
| 2029 | const auto size = num_blocks * items_per_block; |
| 2030 | |
| 2031 | @@ -244,70 +259,41 @@ auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& se |
| 2032 | const std::vector<T> input |
| 2033 | = get_random_data<T>(size, random_range.first, random_range.second, seed.get_0()); |
| 2034 | |
| 2035 | - T* d_input; |
| 2036 | - T* d_output; |
| 2037 | - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); |
| 2038 | - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); |
| 2039 | - HIP_CHECK( |
| 2040 | - hipMemcpy( |
| 2041 | - d_input, input.data(), |
| 2042 | - input.size() * sizeof(input[0]), |
| 2043 | - hipMemcpyHostToDevice |
| 2044 | - ) |
| 2045 | - ); |
| 2046 | - |
| 2047 | - // HIP events creation |
| 2048 | - hipEvent_t start, stop; |
| 2049 | - HIP_CHECK(hipEventCreate(&start)); |
| 2050 | - HIP_CHECK(hipEventCreate(&stop)); |
| 2051 | - |
| 2052 | - for(auto _ : state) |
| 2053 | - { |
| 2054 | - // Record start event |
| 2055 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 2056 | - |
| 2057 | - hipLaunchKernelGGL( |
| 2058 | - HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>), |
| 2059 | - dim3(num_blocks), dim3(BlockSize), 0, stream, |
| 2060 | - d_input, d_output, Trials |
| 2061 | - ); |
| 2062 | - HIP_CHECK(hipGetLastError()); |
| 2063 | - |
| 2064 | - // Record stop event and wait until it completes |
| 2065 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 2066 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 2067 | - |
| 2068 | - float elapsed_mseconds; |
| 2069 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 2070 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 2071 | - } |
| 2072 | - |
| 2073 | - // Destroy HIP events |
| 2074 | - HIP_CHECK(hipEventDestroy(start)); |
| 2075 | - HIP_CHECK(hipEventDestroy(stop)); |
| 2076 | + common::device_ptr<T> d_input(input); |
| 2077 | + common::device_ptr<T> d_output(input.size()); |
| 2078 | |
| 2079 | - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); |
| 2080 | - state.SetItemsProcessed(state.iterations() * Trials * size); |
| 2081 | + state.run( |
| 2082 | + [&] |
| 2083 | + { |
| 2084 | + kernel<Benchmark, BlockSize, ItemsPerThread, WithTile> |
| 2085 | + <<<dim3(num_blocks), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 2086 | + d_output.get(), |
| 2087 | + Trials); |
| 2088 | + HIP_CHECK(hipGetLastError()); |
| 2089 | + }); |
| 2090 | |
| 2091 | - HIP_CHECK(hipFree(d_input)); |
| 2092 | - HIP_CHECK(hipFree(d_output)); |
| 2093 | + state.set_throughput(size * Trials, sizeof(T)); |
| 2094 | } |
| 2095 | |
| 2096 | -template<class Benchmark, |
| 2097 | - class T, |
| 2098 | +template<typename Benchmark, |
| 2099 | + typename T, |
| 2100 | unsigned int BlockSize, |
| 2101 | unsigned int ItemsPerThread, |
| 2102 | bool WithTile, |
| 2103 | unsigned int Trials = 100> |
| 2104 | -auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) |
| 2105 | +auto run_benchmark(benchmark_utils::state&& state) |
| 2106 | -> std::enable_if_t<std::is_same<Benchmark, subtract_left_partial>::value |
| 2107 | || std::is_same<Benchmark, subtract_right_partial>::value> |
| 2108 | { |
| 2109 | + const auto& bytes = state.bytes; |
| 2110 | + const auto& seed = state.seed; |
| 2111 | + const auto& stream = state.stream; |
| 2112 | + |
| 2113 | // Calculate the number of elements N |
| 2114 | size_t N = bytes / sizeof(T); |
| 2115 | |
| 2116 | static constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 2117 | - const auto num_blocks = (N + items_per_block - 1) / items_per_block; |
| 2118 | + const auto num_blocks = (N + items_per_block - 1) / items_per_block; |
| 2119 | // Round up size to the next multiple of items_per_block |
| 2120 | const auto size = num_blocks * items_per_block; |
| 2121 | |
| 2122 | @@ -323,171 +309,73 @@ auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& se |
| 2123 | random_range_tile_sizes.second, |
| 2124 | seed.get_1()); |
| 2125 | |
| 2126 | - T* d_input; |
| 2127 | - unsigned int* d_tile_sizes; |
| 2128 | - T* d_output; |
| 2129 | - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); |
| 2130 | - HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); |
| 2131 | - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(input[0]))); |
| 2132 | - HIP_CHECK( |
| 2133 | - hipMemcpy( |
| 2134 | - d_input, input.data(), |
| 2135 | - input.size() * sizeof(input[0]), |
| 2136 | - hipMemcpyHostToDevice |
| 2137 | - ) |
| 2138 | - ); |
| 2139 | - HIP_CHECK( |
| 2140 | - hipMemcpy( |
| 2141 | - d_tile_sizes, tile_sizes.data(), |
| 2142 | - tile_sizes.size() * sizeof(tile_sizes[0]), |
| 2143 | - hipMemcpyHostToDevice |
| 2144 | - ) |
| 2145 | - ); |
| 2146 | - |
| 2147 | - // HIP events creation |
| 2148 | - hipEvent_t start, stop; |
| 2149 | - HIP_CHECK(hipEventCreate(&start)); |
| 2150 | - HIP_CHECK(hipEventCreate(&stop)); |
| 2151 | - |
| 2152 | - for(auto _ : state) |
| 2153 | - { |
| 2154 | - // Record start event |
| 2155 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 2156 | - |
| 2157 | - hipLaunchKernelGGL( |
| 2158 | - HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>), |
| 2159 | - dim3(num_blocks), dim3(BlockSize), 0, stream, |
| 2160 | - d_input, d_tile_sizes, d_output, Trials |
| 2161 | - ); |
| 2162 | - HIP_CHECK(hipGetLastError()); |
| 2163 | - |
| 2164 | - // Record stop event and wait until it completes |
| 2165 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 2166 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 2167 | - |
| 2168 | - float elapsed_mseconds; |
| 2169 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 2170 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 2171 | - } |
| 2172 | + common::device_ptr<T> d_input(input); |
| 2173 | + common::device_ptr<unsigned int> d_tile_sizes(tile_sizes); |
| 2174 | + common::device_ptr<T> d_output(input.size()); |
| 2175 | |
| 2176 | - // Destroy HIP events |
| 2177 | - HIP_CHECK(hipEventDestroy(start)); |
| 2178 | - HIP_CHECK(hipEventDestroy(stop)); |
| 2179 | - |
| 2180 | - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); |
| 2181 | - state.SetItemsProcessed(state.iterations() * Trials * size); |
| 2182 | + state.run( |
| 2183 | + [&] |
| 2184 | + { |
| 2185 | + kernel<Benchmark, BlockSize, ItemsPerThread, WithTile> |
| 2186 | + <<<dim3(num_blocks), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 2187 | + d_tile_sizes.get(), |
| 2188 | + d_output.get(), |
| 2189 | + Trials); |
| 2190 | + HIP_CHECK(hipGetLastError()); |
| 2191 | + }); |
| 2192 | |
| 2193 | - HIP_CHECK(hipFree(d_input)); |
| 2194 | - HIP_CHECK(hipFree(d_tile_sizes)); |
| 2195 | - HIP_CHECK(hipFree(d_output)); |
| 2196 | + state.set_throughput(size * Trials, sizeof(T)); |
| 2197 | } |
| 2198 | |
| 2199 | #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ |
| 2200 | - benchmark::RegisterBenchmark( \ |
| 2201 | + executor.queue_fn( \ |
| 2202 | bench_naming::format_name("{lvl:block,algo:adjacent_difference,subalgo:" + name \ |
| 2203 | + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ |
| 2204 | ",with_tile:" #WITH_TILE "}}") \ |
| 2205 | .c_str(), \ |
| 2206 | - run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \ |
| 2207 | - bytes, \ |
| 2208 | - seed, \ |
| 2209 | - stream) |
| 2210 | - |
| 2211 | -#define BENCHMARK_TYPE(type, block, with_tile) \ |
| 2212 | - CREATE_BENCHMARK(type, block, 1, with_tile), \ |
| 2213 | - CREATE_BENCHMARK(type, block, 3, with_tile), \ |
| 2214 | - CREATE_BENCHMARK(type, block, 4, with_tile), \ |
| 2215 | - CREATE_BENCHMARK(type, block, 8, with_tile), \ |
| 2216 | - CREATE_BENCHMARK(type, block, 16, with_tile), \ |
| 2217 | + run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>); |
| 2218 | + |
| 2219 | +#define BENCHMARK_TYPE(type, block, with_tile) \ |
| 2220 | + CREATE_BENCHMARK(type, block, 1, with_tile) \ |
| 2221 | + CREATE_BENCHMARK(type, block, 3, with_tile) \ |
| 2222 | + CREATE_BENCHMARK(type, block, 4, with_tile) \ |
| 2223 | + CREATE_BENCHMARK(type, block, 8, with_tile) \ |
| 2224 | + CREATE_BENCHMARK(type, block, 16, with_tile) \ |
| 2225 | CREATE_BENCHMARK(type, block, 32, with_tile) |
| 2226 | |
| 2227 | -template<class Benchmark> |
| 2228 | -void add_benchmarks(const std::string& name, |
| 2229 | - std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 2230 | - size_t bytes, |
| 2231 | - const managed_seed& seed, |
| 2232 | - hipStream_t stream) |
| 2233 | +template<typename Benchmark> |
| 2234 | +void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) |
| 2235 | { |
| 2236 | - std::vector<benchmark::internal::Benchmark*> bs = |
| 2237 | + BENCHMARK_TYPE(int, 256, false) |
| 2238 | + BENCHMARK_TYPE(float, 256, false) |
| 2239 | + BENCHMARK_TYPE(int8_t, 256, false) |
| 2240 | + BENCHMARK_TYPE(rocprim::half, 256, false) |
| 2241 | + BENCHMARK_TYPE(long long, 256, false) |
| 2242 | + BENCHMARK_TYPE(double, 256, false) |
| 2243 | + BENCHMARK_TYPE(rocprim::int128_t, 256, false) |
| 2244 | + BENCHMARK_TYPE(rocprim::uint128_t, 256, false) |
| 2245 | + |
| 2246 | + if(!std::is_same<Benchmark, subtract_right_partial>::value) |
| 2247 | { |
| 2248 | - BENCHMARK_TYPE(int, 256, false), |
| 2249 | - BENCHMARK_TYPE(float, 256, false), |
| 2250 | - BENCHMARK_TYPE(int8_t, 256, false), |
| 2251 | - BENCHMARK_TYPE(rocprim::half, 256, false), |
| 2252 | - BENCHMARK_TYPE(long long, 256, false), |
| 2253 | - BENCHMARK_TYPE(double, 256, false) |
| 2254 | - }; |
| 2255 | - |
| 2256 | - if(!std::is_same<Benchmark, subtract_right_partial>::value) { |
| 2257 | - bs.insert(bs.end(), { |
| 2258 | - BENCHMARK_TYPE(int, 256, true), |
| 2259 | - BENCHMARK_TYPE(float, 256, true), |
| 2260 | - BENCHMARK_TYPE(int8_t, 256, true), |
| 2261 | - BENCHMARK_TYPE(rocprim::half, 256, true), |
| 2262 | - BENCHMARK_TYPE(long long, 256, true), |
| 2263 | - BENCHMARK_TYPE(double, 256, true) |
| 2264 | - }); |
| 2265 | + BENCHMARK_TYPE(int, 256, true) |
| 2266 | + BENCHMARK_TYPE(float, 256, true) |
| 2267 | + BENCHMARK_TYPE(int8_t, 256, true) |
| 2268 | + BENCHMARK_TYPE(rocprim::half, 256, true) |
| 2269 | + BENCHMARK_TYPE(long long, 256, true) |
| 2270 | + BENCHMARK_TYPE(double, 256, true) |
| 2271 | + BENCHMARK_TYPE(rocprim::int128_t, 256, true) |
| 2272 | + BENCHMARK_TYPE(rocprim::uint128_t, 256, true) |
| 2273 | } |
| 2274 | - |
| 2275 | - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); |
| 2276 | } |
| 2277 | |
| 2278 | -int main(int argc, char *argv[]) |
| 2279 | +int main(int argc, char* argv[]) |
| 2280 | { |
| 2281 | - cli::Parser parser(argc, argv); |
| 2282 | - parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes"); |
| 2283 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 2284 | - parser.set_optional<std::string>("name_format", |
| 2285 | - "name_format", |
| 2286 | - "human", |
| 2287 | - "either: json,human,txt"); |
| 2288 | - parser.set_optional<std::string>("seed", "seed", "random", get_seed_message()); |
| 2289 | - parser.run_and_exit_if_error(); |
| 2290 | - |
| 2291 | - // Parse argv |
| 2292 | - benchmark::Initialize(&argc, argv); |
| 2293 | - const size_t bytes = parser.get<size_t>("size"); |
| 2294 | - const int trials = parser.get<int>("trials"); |
| 2295 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 2296 | - const std::string seed_type = parser.get<std::string>("seed"); |
| 2297 | - const managed_seed seed(seed_type); |
| 2298 | - |
| 2299 | - // HIP |
| 2300 | - hipStream_t stream = 0; // default |
| 2301 | - |
| 2302 | - // Benchmark info |
| 2303 | - add_common_benchmark_info(); |
| 2304 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 2305 | - benchmark::AddCustomContext("seed", seed_type); |
| 2306 | - |
| 2307 | - // Add benchmarks |
| 2308 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 2309 | - add_benchmarks<subtract_left>("subtract_left", benchmarks, bytes, seed, stream); |
| 2310 | - add_benchmarks<subtract_right>("subtract_right", benchmarks, bytes, seed, stream); |
| 2311 | - add_benchmarks<subtract_left_partial>("subtract_left_partial", benchmarks, bytes, seed, stream); |
| 2312 | - add_benchmarks<subtract_right_partial>("subtract_right_partial", |
| 2313 | - benchmarks, |
| 2314 | - bytes, |
| 2315 | - seed, |
| 2316 | - stream); |
| 2317 | - |
| 2318 | - // Use manual timing |
| 2319 | - for(auto& b : benchmarks) |
| 2320 | - { |
| 2321 | - b->UseManualTime(); |
| 2322 | - b->Unit(benchmark::kMillisecond); |
| 2323 | - } |
| 2324 | + benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); |
| 2325 | |
| 2326 | - // Force number of iterations |
| 2327 | - if(trials > 0) |
| 2328 | - { |
| 2329 | - for(auto& b : benchmarks) |
| 2330 | - { |
| 2331 | - b->Iterations(trials); |
| 2332 | - } |
| 2333 | - } |
| 2334 | + add_benchmarks<subtract_left>("subtract_left", executor); |
| 2335 | + add_benchmarks<subtract_right>("subtract_right", executor); |
| 2336 | + add_benchmarks<subtract_left_partial>("subtract_left_partial", executor); |
| 2337 | + add_benchmarks<subtract_right_partial>("subtract_right_partial", executor); |
| 2338 | |
| 2339 | - // Run benchmarks |
| 2340 | - benchmark::RunSpecifiedBenchmarks(); |
| 2341 | - return 0; |
| 2342 | + executor.run(); |
| 2343 | } |
| 2344 | diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp |
| 2345 | index 1d07cdb..62fd01a 100644 |
| 2346 | --- a/benchmark/benchmark_block_discontinuity.cpp |
| 2347 | +++ b/benchmark/benchmark_block_discontinuity.cpp |
| 2348 | @@ -1,6 +1,6 @@ |
| 2349 | // MIT License |
| 2350 | // |
| 2351 | -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2352 | +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2353 | // |
| 2354 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2355 | // of this software and associated documentation files (the "Software"), to deal |
| 2356 | @@ -20,12 +20,7 @@ |
| 2357 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 2358 | // SOFTWARE. |
| 2359 | |
| 2360 | -// CmdParser |
| 2361 | #include "benchmark_utils.hpp" |
| 2362 | -#include "cmdparser.hpp" |
| 2363 | - |
| 2364 | -// Google Benchmark |
| 2365 | -#include <benchmark/benchmark.h> |
| 2366 | |
| 2367 | // HIP API |
| 2368 | #include <hip/hip_runtime.h> |
| 2369 | @@ -34,327 +29,243 @@ |
| 2370 | #include <rocprim/block/block_discontinuity.hpp> |
| 2371 | #include <rocprim/block/block_load_func.hpp> |
| 2372 | #include <rocprim/block/block_store_func.hpp> |
| 2373 | +#include <rocprim/config.hpp> |
| 2374 | +#include <rocprim/functional.hpp> |
| 2375 | +#include <rocprim/intrinsics/thread.hpp> |
| 2376 | +#include <rocprim/types.hpp> |
| 2377 | |
| 2378 | -#include <iostream> |
| 2379 | -#include <limits> |
| 2380 | +#include <cstddef> |
| 2381 | +#include <stdint.h> |
| 2382 | #include <string> |
| 2383 | #include <vector> |
| 2384 | |
| 2385 | -#include <cstdio> |
| 2386 | -#include <cstdlib> |
| 2387 | - |
| 2388 | -#ifndef DEFAULT_N |
| 2389 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; |
| 2390 | -#endif |
| 2391 | - |
| 2392 | -namespace rp = rocprim; |
| 2393 | - |
| 2394 | -template< |
| 2395 | - class Runner, |
| 2396 | - class T, |
| 2397 | - unsigned int BlockSize, |
| 2398 | - unsigned int ItemsPerThread, |
| 2399 | - bool WithTile, |
| 2400 | - unsigned int Trials |
| 2401 | -> |
| 2402 | -__global__ |
| 2403 | -__launch_bounds__(BlockSize) |
| 2404 | -void kernel(const T * d_input, T * d_output) |
| 2405 | +template<typename Runner, |
| 2406 | + typename T, |
| 2407 | + unsigned int BlockSize, |
| 2408 | + unsigned int ItemsPerThread, |
| 2409 | + bool WithTile, |
| 2410 | + unsigned int Trials> |
| 2411 | +__global__ __launch_bounds__(BlockSize) |
| 2412 | +void kernel(const T* d_input, T* d_output) |
| 2413 | { |
| 2414 | Runner::template run<T, BlockSize, ItemsPerThread, WithTile, Trials>(d_input, d_output); |
| 2415 | } |
| 2416 | |
| 2417 | struct flag_heads |
| 2418 | { |
| 2419 | - template< |
| 2420 | - class T, |
| 2421 | - unsigned int BlockSize, |
| 2422 | - unsigned int ItemsPerThread, |
| 2423 | - bool WithTile, |
| 2424 | - unsigned int Trials |
| 2425 | - > |
| 2426 | + template<typename T, |
| 2427 | + unsigned int BlockSize, |
| 2428 | + unsigned int ItemsPerThread, |
| 2429 | + bool WithTile, |
| 2430 | + unsigned int Trials> |
| 2431 | __device__ |
| 2432 | - static void run(const T * d_input, T * d_output) |
| 2433 | + static void run(const T* d_input, T* d_output) |
| 2434 | { |
| 2435 | - const unsigned int lid = threadIdx.x; |
| 2436 | + const unsigned int lid = threadIdx.x; |
| 2437 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 2438 | |
| 2439 | T input[ItemsPerThread]; |
| 2440 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2441 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2442 | |
| 2443 | ROCPRIM_NO_UNROLL |
| 2444 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 2445 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 2446 | { |
| 2447 | - rp::block_discontinuity<T, BlockSize> bdiscontinuity; |
| 2448 | - bool head_flags[ItemsPerThread]; |
| 2449 | + rocprim::block_discontinuity<T, BlockSize> bdiscontinuity; |
| 2450 | + bool head_flags[ItemsPerThread]; |
| 2451 | if(WithTile) |
| 2452 | { |
| 2453 | - bdiscontinuity.flag_heads(head_flags, T(123), input, rp::equal_to<T>()); |
| 2454 | + bdiscontinuity.flag_heads(head_flags, T(123), input, rocprim::equal_to<T>()); |
| 2455 | } |
| 2456 | else |
| 2457 | { |
| 2458 | - bdiscontinuity.flag_heads(head_flags, input, rp::equal_to<T>()); |
| 2459 | + bdiscontinuity.flag_heads(head_flags, input, rocprim::equal_to<T>()); |
| 2460 | } |
| 2461 | |
| 2462 | - for(unsigned int i = 0; i < ItemsPerThread; i++) |
| 2463 | + for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 2464 | { |
| 2465 | input[i] += head_flags[i]; |
| 2466 | } |
| 2467 | - rp::syncthreads(); |
| 2468 | + rocprim::syncthreads(); |
| 2469 | } |
| 2470 | |
| 2471 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2472 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2473 | } |
| 2474 | }; |
| 2475 | |
| 2476 | struct flag_tails |
| 2477 | { |
| 2478 | - template< |
| 2479 | - class T, |
| 2480 | - unsigned int BlockSize, |
| 2481 | - unsigned int ItemsPerThread, |
| 2482 | - bool WithTile, |
| 2483 | - unsigned int Trials |
| 2484 | - > |
| 2485 | + template<typename T, |
| 2486 | + unsigned int BlockSize, |
| 2487 | + unsigned int ItemsPerThread, |
| 2488 | + bool WithTile, |
| 2489 | + unsigned int Trials> |
| 2490 | __device__ |
| 2491 | - static void run(const T * d_input, T * d_output) |
| 2492 | + static void run(const T* d_input, T* d_output) |
| 2493 | { |
| 2494 | - const unsigned int lid = threadIdx.x; |
| 2495 | + const unsigned int lid = threadIdx.x; |
| 2496 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 2497 | |
| 2498 | T input[ItemsPerThread]; |
| 2499 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2500 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2501 | |
| 2502 | ROCPRIM_NO_UNROLL |
| 2503 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 2504 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 2505 | { |
| 2506 | - rp::block_discontinuity<T, BlockSize> bdiscontinuity; |
| 2507 | - bool tail_flags[ItemsPerThread]; |
| 2508 | + rocprim::block_discontinuity<T, BlockSize> bdiscontinuity; |
| 2509 | + bool tail_flags[ItemsPerThread]; |
| 2510 | if(WithTile) |
| 2511 | { |
| 2512 | - bdiscontinuity.flag_tails(tail_flags, T(123), input, rp::equal_to<T>()); |
| 2513 | + bdiscontinuity.flag_tails(tail_flags, T(123), input, rocprim::equal_to<T>()); |
| 2514 | } |
| 2515 | else |
| 2516 | { |
| 2517 | - bdiscontinuity.flag_tails(tail_flags, input, rp::equal_to<T>()); |
| 2518 | + bdiscontinuity.flag_tails(tail_flags, input, rocprim::equal_to<T>()); |
| 2519 | } |
| 2520 | |
| 2521 | - for(unsigned int i = 0; i < ItemsPerThread; i++) |
| 2522 | + for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 2523 | { |
| 2524 | input[i] += tail_flags[i]; |
| 2525 | } |
| 2526 | - rp::syncthreads(); |
| 2527 | + rocprim::syncthreads(); |
| 2528 | } |
| 2529 | |
| 2530 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2531 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2532 | } |
| 2533 | }; |
| 2534 | |
| 2535 | struct flag_heads_and_tails |
| 2536 | { |
| 2537 | - template< |
| 2538 | - class T, |
| 2539 | - unsigned int BlockSize, |
| 2540 | - unsigned int ItemsPerThread, |
| 2541 | - bool WithTile, |
| 2542 | - unsigned int Trials |
| 2543 | - > |
| 2544 | + template<typename T, |
| 2545 | + unsigned int BlockSize, |
| 2546 | + unsigned int ItemsPerThread, |
| 2547 | + bool WithTile, |
| 2548 | + unsigned int Trials> |
| 2549 | __device__ |
| 2550 | - static void run(const T * d_input, T * d_output) |
| 2551 | + static void run(const T* d_input, T* d_output) |
| 2552 | { |
| 2553 | - const unsigned int lid = threadIdx.x; |
| 2554 | + const unsigned int lid = threadIdx.x; |
| 2555 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 2556 | |
| 2557 | T input[ItemsPerThread]; |
| 2558 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2559 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2560 | |
| 2561 | ROCPRIM_NO_UNROLL |
| 2562 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 2563 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 2564 | { |
| 2565 | - rp::block_discontinuity<T, BlockSize> bdiscontinuity; |
| 2566 | - bool head_flags[ItemsPerThread]; |
| 2567 | - bool tail_flags[ItemsPerThread]; |
| 2568 | + rocprim::block_discontinuity<T, BlockSize> bdiscontinuity; |
| 2569 | + bool head_flags[ItemsPerThread]; |
| 2570 | + bool tail_flags[ItemsPerThread]; |
| 2571 | if(WithTile) |
| 2572 | { |
| 2573 | - bdiscontinuity.flag_heads_and_tails(head_flags, T(123), tail_flags, T(234), input, rp::equal_to<T>()); |
| 2574 | + bdiscontinuity.flag_heads_and_tails(head_flags, |
| 2575 | + T(123), |
| 2576 | + tail_flags, |
| 2577 | + T(234), |
| 2578 | + input, |
| 2579 | + rocprim::equal_to<T>()); |
| 2580 | } |
| 2581 | else |
| 2582 | { |
| 2583 | - bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, rp::equal_to<T>()); |
| 2584 | + bdiscontinuity.flag_heads_and_tails(head_flags, |
| 2585 | + tail_flags, |
| 2586 | + input, |
| 2587 | + rocprim::equal_to<T>()); |
| 2588 | } |
| 2589 | |
| 2590 | - for(unsigned int i = 0; i < ItemsPerThread; i++) |
| 2591 | + for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 2592 | { |
| 2593 | input[i] += head_flags[i]; |
| 2594 | input[i] += tail_flags[i]; |
| 2595 | } |
| 2596 | - rp::syncthreads(); |
| 2597 | + rocprim::syncthreads(); |
| 2598 | } |
| 2599 | |
| 2600 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2601 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2602 | } |
| 2603 | }; |
| 2604 | |
| 2605 | -template<class Benchmark, |
| 2606 | - class T, |
| 2607 | +template<typename Benchmark, |
| 2608 | + typename T, |
| 2609 | unsigned int BlockSize, |
| 2610 | unsigned int ItemsPerThread, |
| 2611 | bool WithTile, |
| 2612 | unsigned int Trials = 100> |
| 2613 | -void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) |
| 2614 | +void run_benchmark(benchmark_utils::state&& state) |
| 2615 | { |
| 2616 | + const auto& bytes = state.bytes; |
| 2617 | + const auto& seed = state.seed; |
| 2618 | + const auto& stream = state.stream; |
| 2619 | + |
| 2620 | // Calculate the number of elements N |
| 2621 | size_t N = bytes / sizeof(T); |
| 2622 | |
| 2623 | constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 2624 | - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); |
| 2625 | + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); |
| 2626 | |
| 2627 | const auto random_range = limit_random_range<T>(0, 10); |
| 2628 | std::vector<T> input |
| 2629 | = get_random_data<T>(size, random_range.first, random_range.second, seed.get_0()); |
| 2630 | - T * d_input; |
| 2631 | - T * d_output; |
| 2632 | + T* d_input; |
| 2633 | + T* d_output; |
| 2634 | HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T))); |
| 2635 | HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T))); |
| 2636 | - HIP_CHECK( |
| 2637 | - hipMemcpy( |
| 2638 | - d_input, input.data(), |
| 2639 | - size * sizeof(T), |
| 2640 | - hipMemcpyHostToDevice |
| 2641 | - ) |
| 2642 | - ); |
| 2643 | + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); |
| 2644 | HIP_CHECK(hipDeviceSynchronize()); |
| 2645 | |
| 2646 | - // HIP events creation |
| 2647 | - hipEvent_t start, stop; |
| 2648 | - HIP_CHECK(hipEventCreate(&start)); |
| 2649 | - HIP_CHECK(hipEventCreate(&stop)); |
| 2650 | - |
| 2651 | - for(auto _ : state) |
| 2652 | - { |
| 2653 | - // Record start event |
| 2654 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 2655 | - |
| 2656 | - hipLaunchKernelGGL( |
| 2657 | - HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials>), |
| 2658 | - dim3(size/items_per_block), dim3(BlockSize), 0, stream, |
| 2659 | - d_input, d_output |
| 2660 | - ); |
| 2661 | - HIP_CHECK(hipGetLastError()); |
| 2662 | - |
| 2663 | - // Record stop event and wait until it completes |
| 2664 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 2665 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 2666 | - |
| 2667 | - float elapsed_mseconds; |
| 2668 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 2669 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 2670 | - } |
| 2671 | - |
| 2672 | - // Destroy HIP events |
| 2673 | - HIP_CHECK(hipEventDestroy(start)); |
| 2674 | - HIP_CHECK(hipEventDestroy(stop)); |
| 2675 | + state.run( |
| 2676 | + [&] |
| 2677 | + { |
| 2678 | + kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials> |
| 2679 | + <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input, d_output); |
| 2680 | + HIP_CHECK(hipGetLastError()); |
| 2681 | + }); |
| 2682 | |
| 2683 | - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); |
| 2684 | - state.SetItemsProcessed(state.iterations() * Trials * size); |
| 2685 | + state.set_throughput(size * Trials, sizeof(T)); |
| 2686 | |
| 2687 | HIP_CHECK(hipFree(d_input)); |
| 2688 | HIP_CHECK(hipFree(d_output)); |
| 2689 | } |
| 2690 | |
| 2691 | -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ |
| 2692 | - benchmark::RegisterBenchmark( \ |
| 2693 | - bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \ |
| 2694 | - + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ |
| 2695 | - ",with_tile:" #WITH_TILE "}}") \ |
| 2696 | - .c_str(), \ |
| 2697 | - run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \ |
| 2698 | - bytes, \ |
| 2699 | - seed, \ |
| 2700 | - stream) |
| 2701 | - |
| 2702 | -#define BENCHMARK_TYPE(type, block, bool) \ |
| 2703 | - CREATE_BENCHMARK(type, block, 1, bool), \ |
| 2704 | - CREATE_BENCHMARK(type, block, 2, bool), \ |
| 2705 | - CREATE_BENCHMARK(type, block, 3, bool), \ |
| 2706 | - CREATE_BENCHMARK(type, block, 4, bool), \ |
| 2707 | +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ |
| 2708 | + executor.queue_fn(bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \ |
| 2709 | + + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ |
| 2710 | + ",with_tile:" #WITH_TILE "}}") \ |
| 2711 | + .c_str(), \ |
| 2712 | + run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>); |
| 2713 | + |
| 2714 | +#define BENCHMARK_TYPE(type, block, bool) \ |
| 2715 | + CREATE_BENCHMARK(type, block, 1, bool) \ |
| 2716 | + CREATE_BENCHMARK(type, block, 2, bool) \ |
| 2717 | + CREATE_BENCHMARK(type, block, 3, bool) \ |
| 2718 | + CREATE_BENCHMARK(type, block, 4, bool) \ |
| 2719 | CREATE_BENCHMARK(type, block, 8, bool) |
| 2720 | |
| 2721 | -template<class Benchmark> |
| 2722 | -void add_benchmarks(const std::string& name, |
| 2723 | - std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 2724 | - size_t bytes, |
| 2725 | - const managed_seed& seed, |
| 2726 | - hipStream_t stream) |
| 2727 | +template<typename Benchmark> |
| 2728 | +void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) |
| 2729 | { |
| 2730 | - std::vector<benchmark::internal::Benchmark*> bs = |
| 2731 | - { |
| 2732 | - BENCHMARK_TYPE(int, 256, false), |
| 2733 | - BENCHMARK_TYPE(int, 256, true), |
| 2734 | - BENCHMARK_TYPE(int8_t, 256, false), |
| 2735 | - BENCHMARK_TYPE(int8_t, 256, true), |
| 2736 | - BENCHMARK_TYPE(uint8_t, 256, false), |
| 2737 | - BENCHMARK_TYPE(uint8_t, 256, true), |
| 2738 | - BENCHMARK_TYPE(rocprim::half, 256, false), |
| 2739 | - BENCHMARK_TYPE(rocprim::half, 256, true), |
| 2740 | - BENCHMARK_TYPE(long long, 256, false), |
| 2741 | - BENCHMARK_TYPE(long long, 256, true), |
| 2742 | - }; |
| 2743 | - |
| 2744 | - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); |
| 2745 | + BENCHMARK_TYPE(int, 256, false) |
| 2746 | + BENCHMARK_TYPE(int, 256, true) |
| 2747 | + BENCHMARK_TYPE(int8_t, 256, false) |
| 2748 | + BENCHMARK_TYPE(int8_t, 256, true) |
| 2749 | + BENCHMARK_TYPE(uint8_t, 256, false) |
| 2750 | + BENCHMARK_TYPE(uint8_t, 256, true) |
| 2751 | + BENCHMARK_TYPE(rocprim::half, 256, false) |
| 2752 | + BENCHMARK_TYPE(rocprim::half, 256, true) |
| 2753 | + BENCHMARK_TYPE(long long, 256, false) |
| 2754 | + BENCHMARK_TYPE(long long, 256, true) |
| 2755 | + BENCHMARK_TYPE(rocprim::int128_t, 256, false) |
| 2756 | + BENCHMARK_TYPE(rocprim::int128_t, 256, true) |
| 2757 | + BENCHMARK_TYPE(rocprim::uint128_t, 256, false) |
| 2758 | + BENCHMARK_TYPE(rocprim::uint128_t, 256, true) |
| 2759 | } |
| 2760 | |
| 2761 | -int main(int argc, char *argv[]) |
| 2762 | +int main(int argc, char* argv[]) |
| 2763 | { |
| 2764 | - cli::Parser parser(argc, argv); |
| 2765 | - parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes"); |
| 2766 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 2767 | - parser.set_optional<std::string>("name_format", |
| 2768 | - "name_format", |
| 2769 | - "human", |
| 2770 | - "either: json,human,txt"); |
| 2771 | - parser.set_optional<std::string>("seed", "seed", "random", get_seed_message()); |
| 2772 | - parser.run_and_exit_if_error(); |
| 2773 | - |
| 2774 | - // Parse argv |
| 2775 | - benchmark::Initialize(&argc, argv); |
| 2776 | - const size_t bytes = parser.get<size_t>("size"); |
| 2777 | - const int trials = parser.get<int>("trials"); |
| 2778 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 2779 | - const std::string seed_type = parser.get<std::string>("seed"); |
| 2780 | - const managed_seed seed(seed_type); |
| 2781 | - |
| 2782 | - // HIP |
| 2783 | - hipStream_t stream = 0; // default |
| 2784 | - |
| 2785 | - // Benchmark info |
| 2786 | - add_common_benchmark_info(); |
| 2787 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 2788 | - benchmark::AddCustomContext("seed", seed_type); |
| 2789 | - |
| 2790 | - // Add benchmarks |
| 2791 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 2792 | - add_benchmarks<flag_heads>("flag_heads", benchmarks, bytes, seed, stream); |
| 2793 | - add_benchmarks<flag_tails>("flag_tails", benchmarks, bytes, seed, stream); |
| 2794 | - add_benchmarks<flag_heads_and_tails>("flag_heads_and_tails", benchmarks, bytes, seed, stream); |
| 2795 | - |
| 2796 | - // Use manual timing |
| 2797 | - for(auto& b : benchmarks) |
| 2798 | - { |
| 2799 | - b->UseManualTime(); |
| 2800 | - b->Unit(benchmark::kMillisecond); |
| 2801 | - } |
| 2802 | + benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); |
| 2803 | |
| 2804 | - // Force number of iterations |
| 2805 | - if(trials > 0) |
| 2806 | - { |
| 2807 | - for(auto& b : benchmarks) |
| 2808 | - { |
| 2809 | - b->Iterations(trials); |
| 2810 | - } |
| 2811 | - } |
| 2812 | + add_benchmarks<flag_heads>("flag_heads", executor); |
| 2813 | + add_benchmarks<flag_tails>("flag_tails", executor); |
| 2814 | + add_benchmarks<flag_heads_and_tails>("flag_heads_and_tails", executor); |
| 2815 | |
| 2816 | - // Run benchmarks |
| 2817 | - benchmark::RunSpecifiedBenchmarks(); |
| 2818 | - return 0; |
| 2819 | + executor.run(); |
| 2820 | } |
| 2821 | diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp |
| 2822 | index ec3c95f..889fc78 100644 |
| 2823 | --- a/benchmark/benchmark_block_exchange.cpp |
| 2824 | +++ b/benchmark/benchmark_block_exchange.cpp |
| 2825 | @@ -1,6 +1,6 @@ |
| 2826 | // MIT License |
| 2827 | // |
| 2828 | -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 2829 | +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2830 | // |
| 2831 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2832 | // of this software and associated documentation files (the "Software"), to deal |
| 2833 | @@ -20,12 +20,10 @@ |
| 2834 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 2835 | // SOFTWARE. |
| 2836 | |
| 2837 | -// CmdParser |
| 2838 | -#include "cmdparser.hpp" |
| 2839 | #include "benchmark_utils.hpp" |
| 2840 | |
| 2841 | -// Google Benchmark |
| 2842 | -#include <benchmark/benchmark.h> |
| 2843 | +#include "../common/utils_custom_type.hpp" |
| 2844 | +#include "../common/utils_device_ptr.hpp" |
| 2845 | |
| 2846 | // HIP API |
| 2847 | #include <hip/hip_runtime.h> |
| 2848 | @@ -34,407 +32,270 @@ |
| 2849 | #include <rocprim/block/block_exchange.hpp> |
| 2850 | #include <rocprim/block/block_load_func.hpp> |
| 2851 | #include <rocprim/block/block_store_func.hpp> |
| 2852 | - |
| 2853 | -#include <iostream> |
| 2854 | -#include <limits> |
| 2855 | +#include <rocprim/config.hpp> |
| 2856 | +#include <rocprim/intrinsics/thread.hpp> |
| 2857 | +#include <rocprim/types.hpp> |
| 2858 | + |
| 2859 | +#include <algorithm> |
| 2860 | +#include <cstddef> |
| 2861 | +#include <numeric> |
| 2862 | +#include <stdint.h> |
| 2863 | #include <string> |
| 2864 | #include <vector> |
| 2865 | |
| 2866 | -#include <cstdio> |
| 2867 | -#include <cstdlib> |
| 2868 | - |
| 2869 | -#ifndef DEFAULT_N |
| 2870 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; |
| 2871 | -#endif |
| 2872 | - |
| 2873 | -namespace rp = rocprim; |
| 2874 | - |
| 2875 | -template< |
| 2876 | - class Runner, |
| 2877 | - class T, |
| 2878 | - unsigned int BlockSize, |
| 2879 | - unsigned int ItemsPerThread, |
| 2880 | - unsigned int Trials |
| 2881 | -> |
| 2882 | -__global__ |
| 2883 | -__launch_bounds__(BlockSize) |
| 2884 | -void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output) |
| 2885 | +template<typename Runner, |
| 2886 | + typename T, |
| 2887 | + unsigned int BlockSize, |
| 2888 | + unsigned int ItemsPerThread, |
| 2889 | + unsigned int Trials> |
| 2890 | +__global__ __launch_bounds__(BlockSize) |
| 2891 | +void kernel(const T* d_input, const unsigned int* d_ranks, T* d_output) |
| 2892 | { |
| 2893 | Runner::template run<T, BlockSize, ItemsPerThread, Trials>(d_input, d_ranks, d_output); |
| 2894 | } |
| 2895 | |
| 2896 | struct blocked_to_striped |
| 2897 | { |
| 2898 | - template< |
| 2899 | - class T, |
| 2900 | - unsigned int BlockSize, |
| 2901 | - unsigned int ItemsPerThread, |
| 2902 | - unsigned int Trials |
| 2903 | - > |
| 2904 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 2905 | __device__ |
| 2906 | - static void run(const T * d_input, const unsigned int *, T * d_output) |
| 2907 | + static void run(const T* d_input, const unsigned int*, T* d_output) |
| 2908 | { |
| 2909 | - const unsigned int lid = threadIdx.x; |
| 2910 | + const unsigned int lid = threadIdx.x; |
| 2911 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 2912 | |
| 2913 | T input[ItemsPerThread]; |
| 2914 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2915 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2916 | |
| 2917 | ROCPRIM_NO_UNROLL |
| 2918 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 2919 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 2920 | { |
| 2921 | - rp::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 2922 | + rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 2923 | exchange.blocked_to_striped(input, input); |
| 2924 | ::rocprim::syncthreads(); |
| 2925 | } |
| 2926 | |
| 2927 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2928 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2929 | } |
| 2930 | }; |
| 2931 | |
| 2932 | struct striped_to_blocked |
| 2933 | { |
| 2934 | - template< |
| 2935 | - class T, |
| 2936 | - unsigned int BlockSize, |
| 2937 | - unsigned int ItemsPerThread, |
| 2938 | - unsigned int Trials |
| 2939 | - > |
| 2940 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 2941 | __device__ |
| 2942 | - static void run(const T * d_input, const unsigned int *, T * d_output) |
| 2943 | + static void run(const T* d_input, const unsigned int*, T* d_output) |
| 2944 | { |
| 2945 | - const unsigned int lid = threadIdx.x; |
| 2946 | + const unsigned int lid = threadIdx.x; |
| 2947 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 2948 | |
| 2949 | T input[ItemsPerThread]; |
| 2950 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2951 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2952 | |
| 2953 | ROCPRIM_NO_UNROLL |
| 2954 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 2955 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 2956 | { |
| 2957 | - rp::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 2958 | + rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 2959 | exchange.striped_to_blocked(input, input); |
| 2960 | ::rocprim::syncthreads(); |
| 2961 | } |
| 2962 | |
| 2963 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2964 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 2965 | } |
| 2966 | }; |
| 2967 | |
| 2968 | struct blocked_to_warp_striped |
| 2969 | { |
| 2970 | - template< |
| 2971 | - class T, |
| 2972 | - unsigned int BlockSize, |
| 2973 | - unsigned int ItemsPerThread, |
| 2974 | - unsigned int Trials |
| 2975 | - > |
| 2976 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 2977 | __device__ |
| 2978 | - static void run(const T * d_input, const unsigned int *, T * d_output) |
| 2979 | + static void run(const T* d_input, const unsigned int*, T* d_output) |
| 2980 | { |
| 2981 | - const unsigned int lid = threadIdx.x; |
| 2982 | + const unsigned int lid = threadIdx.x; |
| 2983 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 2984 | |
| 2985 | T input[ItemsPerThread]; |
| 2986 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2987 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 2988 | |
| 2989 | ROCPRIM_NO_UNROLL |
| 2990 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 2991 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 2992 | { |
| 2993 | - rp::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 2994 | + rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 2995 | exchange.blocked_to_warp_striped(input, input); |
| 2996 | ::rocprim::syncthreads(); |
| 2997 | } |
| 2998 | |
| 2999 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3000 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3001 | } |
| 3002 | }; |
| 3003 | |
| 3004 | struct warp_striped_to_blocked |
| 3005 | { |
| 3006 | - template< |
| 3007 | - class T, |
| 3008 | - unsigned int BlockSize, |
| 3009 | - unsigned int ItemsPerThread, |
| 3010 | - unsigned int Trials |
| 3011 | - > |
| 3012 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 3013 | __device__ |
| 3014 | - static void run(const T * d_input, const unsigned int *, T * d_output) |
| 3015 | + static void run(const T* d_input, const unsigned int*, T* d_output) |
| 3016 | { |
| 3017 | - const unsigned int lid = threadIdx.x; |
| 3018 | + const unsigned int lid = threadIdx.x; |
| 3019 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 3020 | |
| 3021 | T input[ItemsPerThread]; |
| 3022 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 3023 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 3024 | |
| 3025 | ROCPRIM_NO_UNROLL |
| 3026 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 3027 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 3028 | { |
| 3029 | - rp::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 3030 | + rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 3031 | exchange.warp_striped_to_blocked(input, input); |
| 3032 | ::rocprim::syncthreads(); |
| 3033 | } |
| 3034 | |
| 3035 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3036 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3037 | } |
| 3038 | }; |
| 3039 | |
| 3040 | struct scatter_to_blocked |
| 3041 | { |
| 3042 | - template< |
| 3043 | - class T, |
| 3044 | - unsigned int BlockSize, |
| 3045 | - unsigned int ItemsPerThread, |
| 3046 | - unsigned int Trials |
| 3047 | - > |
| 3048 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 3049 | __device__ |
| 3050 | - static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) |
| 3051 | + static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) |
| 3052 | { |
| 3053 | - const unsigned int lid = threadIdx.x; |
| 3054 | + const unsigned int lid = threadIdx.x; |
| 3055 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 3056 | |
| 3057 | - T input[ItemsPerThread]; |
| 3058 | + T input[ItemsPerThread]; |
| 3059 | unsigned int ranks[ItemsPerThread]; |
| 3060 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 3061 | - rp::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks); |
| 3062 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 3063 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks); |
| 3064 | |
| 3065 | ROCPRIM_NO_UNROLL |
| 3066 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 3067 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 3068 | { |
| 3069 | - rp::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 3070 | + rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 3071 | exchange.scatter_to_blocked(input, input, ranks); |
| 3072 | ::rocprim::syncthreads(); |
| 3073 | } |
| 3074 | |
| 3075 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3076 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3077 | } |
| 3078 | }; |
| 3079 | |
| 3080 | struct scatter_to_striped |
| 3081 | { |
| 3082 | - template< |
| 3083 | - class T, |
| 3084 | - unsigned int BlockSize, |
| 3085 | - unsigned int ItemsPerThread, |
| 3086 | - unsigned int Trials |
| 3087 | - > |
| 3088 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 3089 | __device__ |
| 3090 | - static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) |
| 3091 | + static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) |
| 3092 | { |
| 3093 | - const unsigned int lid = threadIdx.x; |
| 3094 | + const unsigned int lid = threadIdx.x; |
| 3095 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 3096 | |
| 3097 | - T input[ItemsPerThread]; |
| 3098 | + T input[ItemsPerThread]; |
| 3099 | unsigned int ranks[ItemsPerThread]; |
| 3100 | - rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 3101 | - rp::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks); |
| 3102 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input); |
| 3103 | + rocprim::block_load_direct_striped<BlockSize>(lid, d_ranks + block_offset, ranks); |
| 3104 | |
| 3105 | ROCPRIM_NO_UNROLL |
| 3106 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 3107 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 3108 | { |
| 3109 | - rp::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 3110 | + rocprim::block_exchange<T, BlockSize, ItemsPerThread> exchange; |
| 3111 | exchange.scatter_to_striped(input, input, ranks); |
| 3112 | ::rocprim::syncthreads(); |
| 3113 | } |
| 3114 | |
| 3115 | - rp::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3116 | + rocprim::block_store_direct_striped<BlockSize>(lid, d_output + block_offset, input); |
| 3117 | } |
| 3118 | }; |
| 3119 | |
| 3120 | -template<class Benchmark, |
| 3121 | - class T, |
| 3122 | +template<typename Benchmark, |
| 3123 | + typename T, |
| 3124 | unsigned int BlockSize, |
| 3125 | unsigned int ItemsPerThread, |
| 3126 | unsigned int Trials = 100> |
| 3127 | -void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) |
| 3128 | +void run_benchmark(benchmark_utils::state&& state) |
| 3129 | { |
| 3130 | + const auto& bytes = state.bytes; |
| 3131 | + const auto& seed = state.seed; |
| 3132 | + const auto& stream = state.stream; |
| 3133 | + |
| 3134 | // Calculate the number of elements N |
| 3135 | size_t N = bytes / sizeof(T); |
| 3136 | - |
| 3137 | + |
| 3138 | constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 3139 | - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); |
| 3140 | + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); |
| 3141 | |
| 3142 | std::vector<T> input(size); |
| 3143 | // Fill input |
| 3144 | - for(size_t i = 0; i < size; i++) |
| 3145 | + for(size_t i = 0; i < size; ++i) |
| 3146 | { |
| 3147 | input[i] = T(i); |
| 3148 | } |
| 3149 | std::vector<unsigned int> ranks(size); |
| 3150 | // Fill ranks (for scatter operations) |
| 3151 | engine_type gen(seed.get_0()); |
| 3152 | - for(size_t bi = 0; bi < size / items_per_block; bi++) |
| 3153 | + for(size_t bi = 0; bi < size / items_per_block; ++bi) |
| 3154 | { |
| 3155 | auto block_ranks = ranks.begin() + bi * items_per_block; |
| 3156 | std::iota(block_ranks, block_ranks + items_per_block, 0); |
| 3157 | std::shuffle(block_ranks, block_ranks + items_per_block, gen); |
| 3158 | } |
| 3159 | - T * d_input; |
| 3160 | - unsigned int * d_ranks; |
| 3161 | - T * d_output; |
| 3162 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T))); |
| 3163 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_ranks), size * sizeof(unsigned int))); |
| 3164 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T))); |
| 3165 | - HIP_CHECK( |
| 3166 | - hipMemcpy( |
| 3167 | - d_input, input.data(), |
| 3168 | - size * sizeof(T), |
| 3169 | - hipMemcpyHostToDevice |
| 3170 | - ) |
| 3171 | - ); |
| 3172 | - HIP_CHECK( |
| 3173 | - hipMemcpy( |
| 3174 | - d_ranks, ranks.data(), |
| 3175 | - size * sizeof(unsigned int), |
| 3176 | - hipMemcpyHostToDevice |
| 3177 | - ) |
| 3178 | - ); |
| 3179 | + common::device_ptr<T> d_input(input); |
| 3180 | + common::device_ptr<unsigned int> d_ranks(ranks); |
| 3181 | + common::device_ptr<T> d_output(size); |
| 3182 | HIP_CHECK(hipDeviceSynchronize()); |
| 3183 | |
| 3184 | - // HIP events creation |
| 3185 | - hipEvent_t start, stop; |
| 3186 | - HIP_CHECK(hipEventCreate(&start)); |
| 3187 | - HIP_CHECK(hipEventCreate(&stop)); |
| 3188 | - |
| 3189 | - for(auto _ : state) |
| 3190 | - { |
| 3191 | - // Record start event |
| 3192 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 3193 | - |
| 3194 | - hipLaunchKernelGGL( |
| 3195 | - HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>), |
| 3196 | - dim3(size/items_per_block), dim3(BlockSize), 0, stream, |
| 3197 | - d_input, d_ranks, d_output |
| 3198 | - ); |
| 3199 | - HIP_CHECK(hipGetLastError()); |
| 3200 | - |
| 3201 | - // Record stop event and wait until it completes |
| 3202 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 3203 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 3204 | - |
| 3205 | - float elapsed_mseconds; |
| 3206 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 3207 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 3208 | - } |
| 3209 | - |
| 3210 | - // Destroy HIP events |
| 3211 | - HIP_CHECK(hipEventDestroy(start)); |
| 3212 | - HIP_CHECK(hipEventDestroy(stop)); |
| 3213 | - |
| 3214 | - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); |
| 3215 | - state.SetItemsProcessed(state.iterations() * Trials * size); |
| 3216 | - |
| 3217 | - HIP_CHECK(hipFree(d_input)); |
| 3218 | - HIP_CHECK(hipFree(d_ranks)); |
| 3219 | - HIP_CHECK(hipFree(d_output)); |
| 3220 | + state.run( |
| 3221 | + [&] |
| 3222 | + { |
| 3223 | + kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials> |
| 3224 | + <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 3225 | + d_ranks.get(), |
| 3226 | + d_output.get()); |
| 3227 | + HIP_CHECK(hipGetLastError()); |
| 3228 | + }); |
| 3229 | + |
| 3230 | + state.set_throughput(size * Trials, sizeof(T)); |
| 3231 | } |
| 3232 | |
| 3233 | -#define CREATE_BENCHMARK(T, BS, IPT) \ |
| 3234 | - benchmark::RegisterBenchmark( \ |
| 3235 | - bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \ |
| 3236 | - + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ |
| 3237 | - .c_str(), \ |
| 3238 | - run_benchmark<Benchmark, T, BS, IPT>, \ |
| 3239 | - bytes, \ |
| 3240 | - seed, \ |
| 3241 | - stream) |
| 3242 | - |
| 3243 | -#define BENCHMARK_TYPE(type, block) \ |
| 3244 | - CREATE_BENCHMARK(type, block, 1), \ |
| 3245 | - CREATE_BENCHMARK(type, block, 2), \ |
| 3246 | - CREATE_BENCHMARK(type, block, 3), \ |
| 3247 | - CREATE_BENCHMARK(type, block, 4), \ |
| 3248 | - CREATE_BENCHMARK(type, block, 7), \ |
| 3249 | +#define CREATE_BENCHMARK(T, BS, IPT) \ |
| 3250 | + executor.queue_fn(bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \ |
| 3251 | + + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ |
| 3252 | + "}}") \ |
| 3253 | + .c_str(), \ |
| 3254 | + run_benchmark<Benchmark, T, BS, IPT>); |
| 3255 | + |
| 3256 | +#define BENCHMARK_TYPE(type, block) \ |
| 3257 | + CREATE_BENCHMARK(type, block, 1) \ |
| 3258 | + CREATE_BENCHMARK(type, block, 2) \ |
| 3259 | + CREATE_BENCHMARK(type, block, 3) \ |
| 3260 | + CREATE_BENCHMARK(type, block, 4) \ |
| 3261 | + CREATE_BENCHMARK(type, block, 7) \ |
| 3262 | CREATE_BENCHMARK(type, block, 8) |
| 3263 | |
| 3264 | -template<class Benchmark> |
| 3265 | -void add_benchmarks(const std::string& name, |
| 3266 | - std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 3267 | - size_t bytes, |
| 3268 | - const managed_seed& seed, |
| 3269 | - hipStream_t stream) |
| 3270 | +template<typename Benchmark> |
| 3271 | +void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) |
| 3272 | { |
| 3273 | - using custom_float2 = custom_type<float, float>; |
| 3274 | - using custom_double2 = custom_type<double, double>; |
| 3275 | - |
| 3276 | - std::vector<benchmark::internal::Benchmark*> bs = |
| 3277 | - { |
| 3278 | - BENCHMARK_TYPE(int, 256), |
| 3279 | - BENCHMARK_TYPE(int8_t, 256), |
| 3280 | - BENCHMARK_TYPE(rocprim::half, 256), |
| 3281 | - BENCHMARK_TYPE(long long, 256), |
| 3282 | - BENCHMARK_TYPE(custom_float2, 256), |
| 3283 | - BENCHMARK_TYPE(float2, 256), |
| 3284 | - BENCHMARK_TYPE(custom_double2, 256), |
| 3285 | - BENCHMARK_TYPE(double2, 256), |
| 3286 | - BENCHMARK_TYPE(float4, 256), |
| 3287 | - }; |
| 3288 | - |
| 3289 | - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); |
| 3290 | + using custom_float2 = common::custom_type<float, float>; |
| 3291 | + using custom_double2 = common::custom_type<double, double>; |
| 3292 | + |
| 3293 | + BENCHMARK_TYPE(int, 256) |
| 3294 | + BENCHMARK_TYPE(int8_t, 256) |
| 3295 | + BENCHMARK_TYPE(rocprim::half, 256) |
| 3296 | + BENCHMARK_TYPE(long long, 256) |
| 3297 | + BENCHMARK_TYPE(custom_float2, 256) |
| 3298 | + BENCHMARK_TYPE(float2, 256) |
| 3299 | + BENCHMARK_TYPE(custom_double2, 256) |
| 3300 | + BENCHMARK_TYPE(double2, 256) |
| 3301 | + BENCHMARK_TYPE(float4, 256) |
| 3302 | + BENCHMARK_TYPE(rocprim::int128_t, 256) |
| 3303 | + BENCHMARK_TYPE(rocprim::uint128_t, 256) |
| 3304 | } |
| 3305 | |
| 3306 | -int main(int argc, char *argv[]) |
| 3307 | +int main(int argc, char* argv[]) |
| 3308 | { |
| 3309 | - cli::Parser parser(argc, argv); |
| 3310 | - parser.set_optional<size_t>("bytes", "bytes", DEFAULT_BYTES, "number of values"); |
| 3311 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 3312 | - parser.set_optional<std::string>("name_format", |
| 3313 | - "name_format", |
| 3314 | - "human", |
| 3315 | - "either: json,human,txt"); |
| 3316 | - parser.set_optional<std::string>("seed", "seed", "random", get_seed_message()); |
| 3317 | - parser.run_and_exit_if_error(); |
| 3318 | - |
| 3319 | - // Parse argv |
| 3320 | - benchmark::Initialize(&argc, argv); |
| 3321 | - const size_t bytes = parser.get<size_t>("bytes"); |
| 3322 | - const int trials = parser.get<int>("trials"); |
| 3323 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 3324 | - const std::string seed_type = parser.get<std::string>("seed"); |
| 3325 | - const managed_seed seed(seed_type); |
| 3326 | - |
| 3327 | - // HIP |
| 3328 | - hipStream_t stream = 0; // default |
| 3329 | - |
| 3330 | - // Benchmark info |
| 3331 | - add_common_benchmark_info(); |
| 3332 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 3333 | - benchmark::AddCustomContext("seed", seed_type); |
| 3334 | - |
| 3335 | - // Add benchmarks |
| 3336 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 3337 | - add_benchmarks<blocked_to_striped>("blocked_to_striped", benchmarks, bytes, seed, stream); |
| 3338 | - add_benchmarks<striped_to_blocked>("striped_to_blocked", benchmarks, bytes, seed, stream); |
| 3339 | - add_benchmarks<blocked_to_warp_striped>("blocked_to_warp_striped", |
| 3340 | - benchmarks, |
| 3341 | - bytes, |
| 3342 | - seed, |
| 3343 | - stream); |
| 3344 | - add_benchmarks<warp_striped_to_blocked>("warp_striped_to_blocked", |
| 3345 | - benchmarks, |
| 3346 | - bytes, |
| 3347 | - seed, |
| 3348 | - stream); |
| 3349 | - add_benchmarks<scatter_to_blocked>("scatter_to_blocked", benchmarks, bytes, seed, stream); |
| 3350 | - add_benchmarks<scatter_to_striped>("scatter_to_striped", benchmarks, bytes, seed, stream); |
| 3351 | - |
| 3352 | - // Use manual timing |
| 3353 | - for(auto& b : benchmarks) |
| 3354 | - { |
| 3355 | - b->UseManualTime(); |
| 3356 | - b->Unit(benchmark::kMillisecond); |
| 3357 | - } |
| 3358 | + benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); |
| 3359 | |
| 3360 | - // Force number of iterations |
| 3361 | - if(trials > 0) |
| 3362 | - { |
| 3363 | - for(auto& b : benchmarks) |
| 3364 | - { |
| 3365 | - b->Iterations(trials); |
| 3366 | - } |
| 3367 | - } |
| 3368 | + add_benchmarks<blocked_to_striped>("blocked_to_striped", executor); |
| 3369 | + add_benchmarks<striped_to_blocked>("striped_to_blocked", executor); |
| 3370 | + add_benchmarks<blocked_to_warp_striped>("blocked_to_warp_striped", executor); |
| 3371 | + add_benchmarks<warp_striped_to_blocked>("warp_striped_to_blocked", executor); |
| 3372 | + add_benchmarks<scatter_to_blocked>("scatter_to_blocked", executor); |
| 3373 | + add_benchmarks<scatter_to_striped>("scatter_to_striped", executor); |
| 3374 | |
| 3375 | - // Run benchmarks |
| 3376 | - benchmark::RunSpecifiedBenchmarks(); |
| 3377 | - return 0; |
| 3378 | + executor.run(); |
| 3379 | } |
| 3380 | diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp |
| 3381 | index 7dc2827..b108314 100644 |
| 3382 | --- a/benchmark/benchmark_block_histogram.cpp |
| 3383 | +++ b/benchmark/benchmark_block_histogram.cpp |
| 3384 | @@ -1,6 +1,6 @@ |
| 3385 | // MIT License |
| 3386 | // |
| 3387 | -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 3388 | +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 3389 | // |
| 3390 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3391 | // of this software and associated documentation files (the "Software"), to deal |
| 3392 | @@ -21,44 +21,24 @@ |
| 3393 | // SOFTWARE. |
| 3394 | |
| 3395 | #include "benchmark_utils.hpp" |
| 3396 | -// CmdParser |
| 3397 | -#include "cmdparser.hpp" |
| 3398 | |
| 3399 | -// Google Benchmark |
| 3400 | -#include <benchmark/benchmark.h> |
| 3401 | +#include "../common/utils_device_ptr.hpp" |
| 3402 | |
| 3403 | // HIP API |
| 3404 | #include <hip/hip_runtime.h> |
| 3405 | |
| 3406 | // rocPRIM |
| 3407 | #include <rocprim/block/block_histogram.hpp> |
| 3408 | -#include <rocprim/block/block_load_func.hpp> |
| 3409 | -#include <rocprim/block/block_store_func.hpp> |
| 3410 | - |
| 3411 | -#include <iostream> |
| 3412 | -#include <limits> |
| 3413 | -#include <string> |
| 3414 | -#include <vector> |
| 3415 | - |
| 3416 | -#include <cstdio> |
| 3417 | -#include <cstdlib> |
| 3418 | - |
| 3419 | -#ifndef DEFAULT_N |
| 3420 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; |
| 3421 | -#endif |
| 3422 | - |
| 3423 | -namespace rp = rocprim; |
| 3424 | - |
| 3425 | -template< |
| 3426 | - class Runner, |
| 3427 | - class T, |
| 3428 | - unsigned int BlockSize, |
| 3429 | - unsigned int ItemsPerThread, |
| 3430 | - unsigned int BinSize, |
| 3431 | - unsigned int Trials |
| 3432 | -> |
| 3433 | -__global__ |
| 3434 | -__launch_bounds__(BlockSize) |
| 3435 | +#include <rocprim/config.hpp> |
| 3436 | +#include <rocprim/types.hpp> |
| 3437 | + |
| 3438 | +template<typename Runner, |
| 3439 | + typename T, |
| 3440 | + unsigned int BlockSize, |
| 3441 | + unsigned int ItemsPerThread, |
| 3442 | + unsigned int BinSize, |
| 3443 | + unsigned int Trials> |
| 3444 | +__global__ __launch_bounds__(BlockSize) |
| 3445 | void kernel(const T* input, T* output) |
| 3446 | { |
| 3447 | Runner::template run<T, BlockSize, ItemsPerThread, BinSize, Trials>(input, output); |
| 3448 | @@ -67,42 +47,42 @@ void kernel(const T* input, T* output) |
| 3449 | template<rocprim::block_histogram_algorithm algorithm> |
| 3450 | struct histogram |
| 3451 | { |
| 3452 | - template< |
| 3453 | - class T, |
| 3454 | - unsigned int BlockSize, |
| 3455 | - unsigned int ItemsPerThread, |
| 3456 | - unsigned int BinSize, |
| 3457 | - unsigned int Trials |
| 3458 | - > |
| 3459 | + static constexpr auto algorithm_type = algorithm; |
| 3460 | + template<typename T, |
| 3461 | + unsigned int BlockSize, |
| 3462 | + unsigned int ItemsPerThread, |
| 3463 | + unsigned int BinSize, |
| 3464 | + unsigned int Trials> |
| 3465 | __device__ |
| 3466 | static void run(const T* input, T* output) |
| 3467 | { |
| 3468 | // TODO: Move global_offset into final loop |
| 3469 | const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; |
| 3470 | - unsigned int global_offset = blockIdx.x * BinSize; |
| 3471 | + unsigned int global_offset = blockIdx.x * BinSize; |
| 3472 | |
| 3473 | T values[ItemsPerThread]; |
| 3474 | - for(unsigned int k = 0; k < ItemsPerThread; k++) |
| 3475 | + for(unsigned int k = 0; k < ItemsPerThread; ++k) |
| 3476 | { |
| 3477 | values[k] = input[index + k]; |
| 3478 | } |
| 3479 | |
| 3480 | - using bhistogram_t = rp::block_histogram<T, BlockSize, ItemsPerThread, BinSize, algorithm>; |
| 3481 | + using bhistogram_t |
| 3482 | + = rocprim::block_histogram<T, BlockSize, ItemsPerThread, BinSize, algorithm>; |
| 3483 | __shared__ T histogram[BinSize]; |
| 3484 | __shared__ typename bhistogram_t::storage_type storage; |
| 3485 | |
| 3486 | ROCPRIM_NO_UNROLL |
| 3487 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 3488 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 3489 | { |
| 3490 | bhistogram_t().histogram(values, histogram, storage); |
| 3491 | - for(unsigned int k = 0; k < ItemsPerThread; k++) |
| 3492 | + for(unsigned int k = 0; k < ItemsPerThread; ++k) |
| 3493 | { |
| 3494 | values[k] = BinSize - 1 - values[k]; |
| 3495 | } |
| 3496 | } |
| 3497 | |
| 3498 | ROCPRIM_UNROLL |
| 3499 | - for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) |
| 3500 | + for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) |
| 3501 | { |
| 3502 | if(offset + threadIdx.x < BinSize) |
| 3503 | { |
| 3504 | @@ -113,161 +93,94 @@ struct histogram |
| 3505 | } |
| 3506 | }; |
| 3507 | |
| 3508 | -template< |
| 3509 | - class Benchmark, |
| 3510 | - class T, |
| 3511 | - unsigned int BlockSize, |
| 3512 | - unsigned int ItemsPerThread, |
| 3513 | - unsigned int BinSize = BlockSize, |
| 3514 | - unsigned int Trials = 100 |
| 3515 | -> |
| 3516 | -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) |
| 3517 | +template<typename Benchmark, |
| 3518 | + typename T, |
| 3519 | + unsigned int BlockSize, |
| 3520 | + unsigned int ItemsPerThread, |
| 3521 | + unsigned int BinSize = BlockSize, |
| 3522 | + unsigned int Trials = 100> |
| 3523 | +void run_benchmark(benchmark_utils::state&& state) |
| 3524 | { |
| 3525 | + const auto& stream = state.stream; |
| 3526 | + const auto& bytes = state.bytes; |
| 3527 | + |
| 3528 | // Calculate the number of elements N |
| 3529 | size_t N = bytes / sizeof(T); |
| 3530 | // Make sure size is a multiple of BlockSize |
| 3531 | constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 3532 | - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); |
| 3533 | - const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block); |
| 3534 | + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); |
| 3535 | + const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); |
| 3536 | // Allocate and fill memory |
| 3537 | std::vector<T> input(size, 0.0f); |
| 3538 | - T * d_input; |
| 3539 | - T * d_output; |
| 3540 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T))); |
| 3541 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), bin_size * sizeof(T))); |
| 3542 | - HIP_CHECK( |
| 3543 | - hipMemcpy( |
| 3544 | - d_input, input.data(), |
| 3545 | - size * sizeof(T), |
| 3546 | - hipMemcpyHostToDevice |
| 3547 | - ) |
| 3548 | - ); |
| 3549 | + common::device_ptr<T> d_input(input); |
| 3550 | + common::device_ptr<T> d_output(bin_size); |
| 3551 | HIP_CHECK(hipDeviceSynchronize()); |
| 3552 | |
| 3553 | - // HIP events creation |
| 3554 | - hipEvent_t start, stop; |
| 3555 | - HIP_CHECK(hipEventCreate(&start)); |
| 3556 | - HIP_CHECK(hipEventCreate(&stop)); |
| 3557 | - |
| 3558 | - for (auto _ : state) |
| 3559 | - { |
| 3560 | - // Record start event |
| 3561 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 3562 | - |
| 3563 | - hipLaunchKernelGGL( |
| 3564 | - HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials>), |
| 3565 | - dim3(size/items_per_block), dim3(BlockSize), 0, stream, |
| 3566 | - d_input, d_output |
| 3567 | - ); |
| 3568 | - HIP_CHECK(hipGetLastError()); |
| 3569 | - |
| 3570 | - // Record stop event and wait until it completes |
| 3571 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 3572 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 3573 | - |
| 3574 | - float elapsed_mseconds; |
| 3575 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 3576 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 3577 | - } |
| 3578 | - |
| 3579 | - // Destroy HIP events |
| 3580 | - HIP_CHECK(hipEventDestroy(start)); |
| 3581 | - HIP_CHECK(hipEventDestroy(stop)); |
| 3582 | - |
| 3583 | - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); |
| 3584 | - state.SetItemsProcessed(state.iterations() * size * Trials); |
| 3585 | - |
| 3586 | - HIP_CHECK(hipFree(d_input)); |
| 3587 | - HIP_CHECK(hipFree(d_output)); |
| 3588 | -} |
| 3589 | - |
| 3590 | -// IPT - items per thread |
| 3591 | -#define CREATE_BENCHMARK(T, BS, IPT) \ |
| 3592 | - benchmark::RegisterBenchmark( \ |
| 3593 | - bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T ",cfg:{bs:" #BS \ |
| 3594 | - ",ipt:" #IPT ",method:" \ |
| 3595 | - + method_name + "}}") \ |
| 3596 | - .c_str(), \ |
| 3597 | - run_benchmark<Benchmark, T, BS, IPT>, \ |
| 3598 | - stream, \ |
| 3599 | - bytes) |
| 3600 | - |
| 3601 | -#define BENCHMARK_TYPE(type, block) \ |
| 3602 | - CREATE_BENCHMARK(type, block, 1), \ |
| 3603 | - CREATE_BENCHMARK(type, block, 2), \ |
| 3604 | - CREATE_BENCHMARK(type, block, 3), \ |
| 3605 | - CREATE_BENCHMARK(type, block, 4), \ |
| 3606 | - CREATE_BENCHMARK(type, block, 8), \ |
| 3607 | - CREATE_BENCHMARK(type, block, 16) |
| 3608 | - |
| 3609 | -template<class Benchmark> |
| 3610 | -void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 3611 | - const std::string& method_name, |
| 3612 | - hipStream_t stream, |
| 3613 | - size_t bytes) |
| 3614 | -{ |
| 3615 | - std::vector<benchmark::internal::Benchmark*> new_benchmarks = |
| 3616 | - { |
| 3617 | - BENCHMARK_TYPE(int, 256), |
| 3618 | - BENCHMARK_TYPE(int, 320), |
| 3619 | - BENCHMARK_TYPE(int, 512), |
| 3620 | + state.run( |
| 3621 | + [&] |
| 3622 | + { |
| 3623 | + kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials> |
| 3624 | + <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 3625 | + d_output.get()); |
| 3626 | + HIP_CHECK(hipGetLastError()); |
| 3627 | + }); |
| 3628 | |
| 3629 | - BENCHMARK_TYPE(unsigned long long, 256), |
| 3630 | - BENCHMARK_TYPE(unsigned long long, 320) |
| 3631 | - }; |
| 3632 | - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); |
| 3633 | + state.set_throughput(size * Trials, sizeof(T)); |
| 3634 | } |
| 3635 | |
| 3636 | -int main(int argc, char *argv[]) |
| 3637 | +#define CREATE_BENCHMARK(Benchmark, method, T, BS, IPT) \ |
| 3638 | + executor.queue_fn(bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T \ |
| 3639 | + ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ |
| 3640 | + + std::string(method) + "}}") \ |
| 3641 | + .c_str(), \ |
| 3642 | + run_benchmark<Benchmark, T, BS, IPT>); |
| 3643 | + |
| 3644 | +#define BENCHMARK_TYPE(Benchmark, method, T, BS) \ |
| 3645 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 1) \ |
| 3646 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 2) \ |
| 3647 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 3) \ |
| 3648 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 4) \ |
| 3649 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 8) \ |
| 3650 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 16) |
| 3651 | + |
| 3652 | +#define BENCHMARK_TYPE_128(Benchmark, method, T, BS) \ |
| 3653 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 1) \ |
| 3654 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 2) \ |
| 3655 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 3) \ |
| 3656 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 4) \ |
| 3657 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 8) \ |
| 3658 | + CREATE_BENCHMARK(Benchmark, method, T, BS, 12) |
| 3659 | + |
| 3660 | +#define BENCHMARK_ATOMIC() \ |
| 3661 | + BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 256) \ |
| 3662 | + BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 320) \ |
| 3663 | + BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 512) \ |
| 3664 | + \ |
| 3665 | + BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", unsigned long long, 256) \ |
| 3666 | + BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", unsigned long long, 320) |
| 3667 | + |
| 3668 | +#define BENCHMARK_SORT() \ |
| 3669 | + BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 256) \ |
| 3670 | + BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 320) \ |
| 3671 | + BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 512) \ |
| 3672 | + \ |
| 3673 | + BENCHMARK_TYPE(histogram_sort_t, "using_sort", unsigned long long, 256) \ |
| 3674 | + BENCHMARK_TYPE(histogram_sort_t, "using_sort", unsigned long long, 320) \ |
| 3675 | + \ |
| 3676 | + BENCHMARK_TYPE_128(histogram_sort_t, "using_sort", rocprim::int128_t, 256) \ |
| 3677 | + BENCHMARK_TYPE_128(histogram_sort_t, "using_sort", rocprim::uint128_t, 256) |
| 3678 | + |
| 3679 | +int main(int argc, char* argv[]) |
| 3680 | { |
| 3681 | - cli::Parser parser(argc, argv); |
| 3682 | - parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes"); |
| 3683 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 3684 | - parser.set_optional<std::string>("name_format", |
| 3685 | - "name_format", |
| 3686 | - "human", |
| 3687 | - "either: json,human,txt"); |
| 3688 | - parser.run_and_exit_if_error(); |
| 3689 | - |
| 3690 | - // Parse argv |
| 3691 | - benchmark::Initialize(&argc, argv); |
| 3692 | - const size_t bytes = parser.get<size_t>("size"); |
| 3693 | - const int trials = parser.get<int>("trials"); |
| 3694 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 3695 | - |
| 3696 | - // HIP |
| 3697 | - hipStream_t stream = 0; // default |
| 3698 | - |
| 3699 | - // Benchmark info |
| 3700 | - add_common_benchmark_info(); |
| 3701 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 3702 | + benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); |
| 3703 | |
| 3704 | - // Add benchmarks |
| 3705 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 3706 | - // using_atomic |
| 3707 | - using histogram_a_t = histogram<rocprim::block_histogram_algorithm::using_atomic>; |
| 3708 | - add_benchmarks<histogram_a_t>(benchmarks, "using_atomic", stream, bytes); |
| 3709 | - // using_sort |
| 3710 | - using histogram_s_t = histogram<rocprim::block_histogram_algorithm::using_sort>; |
| 3711 | - add_benchmarks<histogram_s_t>(benchmarks, "using_sort", stream, bytes); |
| 3712 | +#ifndef BENCHMARK_CONFIG_TUNING |
| 3713 | + using histogram_atomic_t = histogram<rocprim::block_histogram_algorithm::using_atomic>; |
| 3714 | + using histogram_sort_t = histogram<rocprim::block_histogram_algorithm::using_sort>; |
| 3715 | |
| 3716 | - // Use manual timing |
| 3717 | - for(auto& b : benchmarks) |
| 3718 | - { |
| 3719 | - b->UseManualTime(); |
| 3720 | - b->Unit(benchmark::kMillisecond); |
| 3721 | - } |
| 3722 | - |
| 3723 | - // Force number of iterations |
| 3724 | - if(trials > 0) |
| 3725 | - { |
| 3726 | - for(auto& b : benchmarks) |
| 3727 | - { |
| 3728 | - b->Iterations(trials); |
| 3729 | - } |
| 3730 | - } |
| 3731 | + BENCHMARK_ATOMIC() |
| 3732 | + BENCHMARK_SORT() |
| 3733 | +#endif |
| 3734 | |
| 3735 | - // Run benchmarks |
| 3736 | - benchmark::RunSpecifiedBenchmarks(); |
| 3737 | - return 0; |
| 3738 | + executor.run(); |
| 3739 | } |
| 3740 | diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp |
| 3741 | index faee6d3..a708037 100644 |
| 3742 | --- a/benchmark/benchmark_block_radix_rank.cpp |
| 3743 | +++ b/benchmark/benchmark_block_radix_rank.cpp |
| 3744 | @@ -1,6 +1,6 @@ |
| 3745 | // MIT License |
| 3746 | // |
| 3747 | -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 3748 | +// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 3749 | // |
| 3750 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3751 | // of this software and associated documentation files (the "Software"), to deal |
| 3752 | @@ -21,11 +21,9 @@ |
| 3753 | // SOFTWARE. |
| 3754 | |
| 3755 | #include "benchmark_utils.hpp" |
| 3756 | -// CmdParser |
| 3757 | -#include "cmdparser.hpp" |
| 3758 | |
| 3759 | -// Google Benchmark |
| 3760 | -#include <benchmark/benchmark.h> |
| 3761 | +#include "../common/utils_data_generation.hpp" |
| 3762 | +#include "../common/utils_device_ptr.hpp" |
| 3763 | |
| 3764 | // HIP API |
| 3765 | #include <hip/hip_runtime.h> |
| 3766 | @@ -34,34 +32,31 @@ |
| 3767 | #include <rocprim/block/block_load_func.hpp> |
| 3768 | #include <rocprim/block/block_radix_rank.hpp> |
| 3769 | #include <rocprim/block/block_store_func.hpp> |
| 3770 | +#include <rocprim/config.hpp> |
| 3771 | +#include <rocprim/types.hpp> |
| 3772 | |
| 3773 | #include <chrono> |
| 3774 | -#include <limits> |
| 3775 | +#include <stdint.h> |
| 3776 | #include <string> |
| 3777 | - |
| 3778 | -#ifndef DEFAULT_N |
| 3779 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; |
| 3780 | -#endif |
| 3781 | - |
| 3782 | -namespace rp = rocprim; |
| 3783 | +#include <vector> |
| 3784 | |
| 3785 | template<typename T, |
| 3786 | - unsigned int BlockSize, |
| 3787 | - unsigned int ItemsPerThread, |
| 3788 | - unsigned int RadixBits, |
| 3789 | - bool Descending, |
| 3790 | - rp::block_radix_rank_algorithm Algorithm, |
| 3791 | - unsigned int Trials> |
| 3792 | -__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, |
| 3793 | - unsigned int* ranks_output) |
| 3794 | + unsigned int BlockSize, |
| 3795 | + unsigned int ItemsPerThread, |
| 3796 | + unsigned int RadixBits, |
| 3797 | + bool Descending, |
| 3798 | + rocprim::block_radix_rank_algorithm Algorithm, |
| 3799 | + unsigned int Trials> |
| 3800 | +__global__ __launch_bounds__(BlockSize) |
| 3801 | +void rank_kernel(const T* keys_input, unsigned int* ranks_output) |
| 3802 | { |
| 3803 | - using rank_type = rp::block_radix_rank<BlockSize, RadixBits, Algorithm>; |
| 3804 | + using rank_type = rocprim::block_radix_rank<BlockSize, RadixBits, Algorithm>; |
| 3805 | |
| 3806 | const unsigned int lid = threadIdx.x; |
| 3807 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 3808 | |
| 3809 | T keys[ItemsPerThread]; |
| 3810 | - rp::block_load_direct_striped<BlockSize>(lid, keys_input + block_offset, keys); |
| 3811 | + rocprim::block_load_direct_striped<BlockSize>(lid, keys_input + block_offset, keys); |
| 3812 | |
| 3813 | unsigned int ranks[ItemsPerThread]; |
| 3814 | |
| 3815 | @@ -75,7 +70,7 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_inpu |
| 3816 | while(begin_bit < end_bit) |
| 3817 | { |
| 3818 | const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); |
| 3819 | - if ROCPRIM_IF_CONSTEXPR(Descending) |
| 3820 | + if constexpr(Descending) |
| 3821 | { |
| 3822 | rank_type().rank_keys_desc(keys, ranks, storage, begin_bit, pass_bits); |
| 3823 | } |
| 3824 | @@ -87,164 +82,93 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_inpu |
| 3825 | } |
| 3826 | } |
| 3827 | |
| 3828 | - rp::block_store_direct_striped<BlockSize>(lid, ranks_output + block_offset, ranks); |
| 3829 | + rocprim::block_store_direct_striped<BlockSize>(lid, ranks_output + block_offset, ranks); |
| 3830 | } |
| 3831 | |
| 3832 | template<typename T, |
| 3833 | - unsigned int BlockSize, |
| 3834 | - unsigned int ItemsPerThread, |
| 3835 | - rp::block_radix_rank_algorithm Algorithm, |
| 3836 | - unsigned int RadixBits = 4, |
| 3837 | - bool Descending = false, |
| 3838 | - unsigned int Trials = 10> |
| 3839 | -void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) |
| 3840 | + size_t BlockSize, |
| 3841 | + size_t ItemsPerThread, |
| 3842 | + rocprim::block_radix_rank_algorithm Algorithm, |
| 3843 | + size_t RadixBits = 4, |
| 3844 | + bool Descending = false, |
| 3845 | + size_t Trials = 10> |
| 3846 | +void run_benchmark(benchmark_utils::state&& state) |
| 3847 | { |
| 3848 | + const auto& bytes = state.bytes; |
| 3849 | + const auto& seed = state.seed; |
| 3850 | + const auto& stream = state.stream; |
| 3851 | + |
| 3852 | // Calculate the number of elements N |
| 3853 | - size_t N = bytes / sizeof(T); |
| 3854 | - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; |
| 3855 | - const unsigned int grid_size = ((N + items_per_block - 1) / items_per_block); |
| 3856 | - const unsigned int size = items_per_block * grid_size; |
| 3857 | + size_t N = bytes / sizeof(T); |
| 3858 | + constexpr size_t items_per_block = BlockSize * ItemsPerThread; |
| 3859 | + const size_t grid_size = ((N + items_per_block - 1) / items_per_block); |
| 3860 | + const size_t size = items_per_block * grid_size; |
| 3861 | |
| 3862 | std::vector<T> input = get_random_data<T>(size, |
| 3863 | - generate_limits<T>::min(), |
| 3864 | - generate_limits<T>::max(), |
| 3865 | + common::generate_limits<T>::min(), |
| 3866 | + common::generate_limits<T>::max(), |
| 3867 | seed.get_0()); |
| 3868 | |
| 3869 | - T* d_input; |
| 3870 | - unsigned int* d_output; |
| 3871 | - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); |
| 3872 | - HIP_CHECK(hipMalloc(&d_output, size * sizeof(unsigned int))); |
| 3873 | - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); |
| 3874 | + common::device_ptr<T> d_input(input); |
| 3875 | + common::device_ptr<unsigned int> d_output(size); |
| 3876 | HIP_CHECK(hipDeviceSynchronize()); |
| 3877 | |
| 3878 | - for(auto _ : state) |
| 3879 | - { |
| 3880 | - auto start = std::chrono::steady_clock::now(); |
| 3881 | - |
| 3882 | - hipLaunchKernelGGL(HIP_KERNEL_NAME(rank_kernel<T, |
| 3883 | - BlockSize, |
| 3884 | - ItemsPerThread, |
| 3885 | - RadixBits, |
| 3886 | - Descending, |
| 3887 | - Algorithm, |
| 3888 | - Trials>), |
| 3889 | - dim3(grid_size), |
| 3890 | - dim3(BlockSize), |
| 3891 | - 0, |
| 3892 | - stream, |
| 3893 | - d_input, |
| 3894 | - d_output); |
| 3895 | - HIP_CHECK(hipPeekAtLastError()); |
| 3896 | - HIP_CHECK(hipDeviceSynchronize()); |
| 3897 | - |
| 3898 | - auto end = std::chrono::steady_clock::now(); |
| 3899 | - auto elapsed_seconds |
| 3900 | - = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); |
| 3901 | - state.SetIterationTime(elapsed_seconds.count()); |
| 3902 | - } |
| 3903 | - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); |
| 3904 | - state.SetItemsProcessed(state.iterations() * Trials * size); |
| 3905 | + state.run( |
| 3906 | + [&] |
| 3907 | + { |
| 3908 | + rank_kernel<T, BlockSize, ItemsPerThread, RadixBits, Descending, Algorithm, Trials> |
| 3909 | + <<<dim3(grid_size), dim3(BlockSize), 0, stream>>>(d_input.get(), d_output.get()); |
| 3910 | + HIP_CHECK(hipGetLastError()); |
| 3911 | + }); |
| 3912 | |
| 3913 | - HIP_CHECK(hipFree(d_input)); |
| 3914 | - HIP_CHECK(hipFree(d_output)); |
| 3915 | + state.set_throughput(size * Trials, sizeof(T)); |
| 3916 | } |
| 3917 | |
| 3918 | -#define CREATE_BENCHMARK(T, BS, IPT, KIND) \ |
| 3919 | - benchmark::RegisterBenchmark( \ |
| 3920 | - bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T ",cfg:{bs:" #BS \ |
| 3921 | - ",ipt:" #IPT ",method:" #KIND "}}") \ |
| 3922 | - .c_str(), \ |
| 3923 | - run_benchmark<T, BS, IPT, KIND>, \ |
| 3924 | - bytes, \ |
| 3925 | - seed, \ |
| 3926 | - stream) |
| 3927 | +#define CREATE_BENCHMARK(T, BS, IPT, KIND) \ |
| 3928 | + executor.queue_fn(bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T \ |
| 3929 | + ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" #KIND \ |
| 3930 | + "}}") \ |
| 3931 | + .c_str(), \ |
| 3932 | + run_benchmark<T, BS, IPT, KIND>); |
| 3933 | |
| 3934 | // clang-format off |
| 3935 | -#define CREATE_BENCHMARK_KINDS(type, block, ipt) \ |
| 3936 | - CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic), \ |
| 3937 | - CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic_memoize), \ |
| 3938 | - CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::match) |
| 3939 | - |
| 3940 | -#define BENCHMARK_TYPE(type, block) \ |
| 3941 | - CREATE_BENCHMARK_KINDS(type, block, 1), \ |
| 3942 | - CREATE_BENCHMARK_KINDS(type, block, 4), \ |
| 3943 | - CREATE_BENCHMARK_KINDS(type, block, 8), \ |
| 3944 | - CREATE_BENCHMARK_KINDS(type, block, 12), \ |
| 3945 | - CREATE_BENCHMARK_KINDS(type, block, 16), \ |
| 3946 | +#define CREATE_BENCHMARK_KINDS(type, block, ipt) \ |
| 3947 | + CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::basic) \ |
| 3948 | + CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::basic_memoize) \ |
| 3949 | + CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::match) |
| 3950 | + |
| 3951 | +#define BENCHMARK_TYPE(type, block) \ |
| 3952 | + CREATE_BENCHMARK_KINDS(type, block, 1) \ |
| 3953 | + CREATE_BENCHMARK_KINDS(type, block, 4) \ |
| 3954 | + CREATE_BENCHMARK_KINDS(type, block, 8) \ |
| 3955 | + CREATE_BENCHMARK_KINDS(type, block, 12) \ |
| 3956 | + CREATE_BENCHMARK_KINDS(type, block, 16) \ |
| 3957 | CREATE_BENCHMARK_KINDS(type, block, 20) |
| 3958 | // clang-format on |
| 3959 | |
| 3960 | -void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 3961 | - size_t bytes, |
| 3962 | - const managed_seed& seed, |
| 3963 | - hipStream_t stream) |
| 3964 | +int main(int argc, char* argv[]) |
| 3965 | { |
| 3966 | - std::vector<benchmark::internal::Benchmark*> bs = { |
| 3967 | - BENCHMARK_TYPE(int, 128), |
| 3968 | - BENCHMARK_TYPE(int, 256), |
| 3969 | - BENCHMARK_TYPE(int, 512), |
| 3970 | + benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); |
| 3971 | |
| 3972 | - BENCHMARK_TYPE(uint8_t, 128), |
| 3973 | - BENCHMARK_TYPE(uint8_t, 256), |
| 3974 | - BENCHMARK_TYPE(uint8_t, 512), |
| 3975 | + BENCHMARK_TYPE(int, 128) |
| 3976 | + BENCHMARK_TYPE(int, 256) |
| 3977 | + BENCHMARK_TYPE(int, 512) |
| 3978 | |
| 3979 | - BENCHMARK_TYPE(long long, 128), |
| 3980 | - BENCHMARK_TYPE(long long, 256), |
| 3981 | - BENCHMARK_TYPE(long long, 512), |
| 3982 | - }; |
| 3983 | + BENCHMARK_TYPE(uint8_t, 128) |
| 3984 | + BENCHMARK_TYPE(uint8_t, 256) |
| 3985 | + BENCHMARK_TYPE(uint8_t, 512) |
| 3986 | |
| 3987 | - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); |
| 3988 | -} |
| 3989 | + BENCHMARK_TYPE(long long, 128) |
| 3990 | + BENCHMARK_TYPE(long long, 256) |
| 3991 | + BENCHMARK_TYPE(long long, 512) |
| 3992 | |
| 3993 | -int main(int argc, char* argv[]) |
| 3994 | -{ |
| 3995 | - cli::Parser parser(argc, argv); |
| 3996 | - parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes"); |
| 3997 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 3998 | - parser.set_optional<std::string>("name_format", |
| 3999 | - "name_format", |
| 4000 | - "human", |
| 4001 | - "either: json,human,txt"); |
| 4002 | - parser.set_optional<std::string>("seed", "seed", "random", get_seed_message()); |
| 4003 | - parser.run_and_exit_if_error(); |
| 4004 | - |
| 4005 | - // Parse argv |
| 4006 | - benchmark::Initialize(&argc, argv); |
| 4007 | - const size_t bytes = parser.get<size_t>("size"); |
| 4008 | - const int trials = parser.get<int>("trials"); |
| 4009 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 4010 | - const std::string seed_type = parser.get<std::string>("seed"); |
| 4011 | - const managed_seed seed(seed_type); |
| 4012 | - |
| 4013 | - // HIP |
| 4014 | - hipStream_t stream = 0; // default |
| 4015 | - |
| 4016 | - // Benchmark info |
| 4017 | - add_common_benchmark_info(); |
| 4018 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 4019 | - benchmark::AddCustomContext("seed", seed_type); |
| 4020 | - |
| 4021 | - // Add benchmarks |
| 4022 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 4023 | - add_benchmarks(benchmarks, bytes, seed, stream); |
| 4024 | - |
| 4025 | - // Use manual timing |
| 4026 | - for(auto& b : benchmarks) |
| 4027 | - { |
| 4028 | - b->UseManualTime(); |
| 4029 | - b->Unit(benchmark::kMillisecond); |
| 4030 | - } |
| 4031 | + BENCHMARK_TYPE(rocprim::int128_t, 128) |
| 4032 | + BENCHMARK_TYPE(rocprim::int128_t, 256) |
| 4033 | + BENCHMARK_TYPE(rocprim::int128_t, 512) |
| 4034 | |
| 4035 | - // Force number of iterations |
| 4036 | - if(trials > 0) |
| 4037 | - { |
| 4038 | - for(auto& b : benchmarks) |
| 4039 | - { |
| 4040 | - b->Iterations(trials); |
| 4041 | - } |
| 4042 | - } |
| 4043 | + BENCHMARK_TYPE(rocprim::uint128_t, 128) |
| 4044 | + BENCHMARK_TYPE(rocprim::uint128_t, 256) |
| 4045 | + BENCHMARK_TYPE(rocprim::uint128_t, 512) |
| 4046 | |
| 4047 | - // Run benchmarks |
| 4048 | - benchmark::RunSpecifiedBenchmarks(); |
| 4049 | - return 0; |
| 4050 | + executor.run(); |
| 4051 | } |
| 4052 | diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp |
| 4053 | index ff14473..331f809 100644 |
| 4054 | --- a/benchmark/benchmark_block_radix_sort.cpp |
| 4055 | +++ b/benchmark/benchmark_block_radix_sort.cpp |
| 4056 | @@ -1,6 +1,6 @@ |
| 4057 | // MIT License |
| 4058 | // |
| 4059 | -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 4060 | +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 4061 | // |
| 4062 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4063 | // of this software and associated documentation files (the "Software"), to deal |
| 4064 | @@ -21,11 +21,10 @@ |
| 4065 | // SOFTWARE. |
| 4066 | |
| 4067 | #include "benchmark_utils.hpp" |
| 4068 | -// CmdParser |
| 4069 | -#include "cmdparser.hpp" |
| 4070 | |
| 4071 | -// Google Benchmark |
| 4072 | -#include <benchmark/benchmark.h> |
| 4073 | +#include "../common/utils_custom_type.hpp" |
| 4074 | +#include "../common/utils_data_generation.hpp" |
| 4075 | +#include "../common/utils_device_ptr.hpp" |
| 4076 | |
| 4077 | // HIP API |
| 4078 | #include <hip/hip_runtime.h> |
| 4079 | @@ -34,325 +33,244 @@ |
| 4080 | #include <rocprim/block/block_load_func.hpp> |
| 4081 | #include <rocprim/block/block_radix_sort.hpp> |
| 4082 | #include <rocprim/block/block_store_func.hpp> |
| 4083 | +#include <rocprim/config.hpp> |
| 4084 | +#include <rocprim/types.hpp> |
| 4085 | |
| 4086 | -#include <iostream> |
| 4087 | -#include <limits> |
| 4088 | +#include <cstddef> |
| 4089 | +#include <stdint.h> |
| 4090 | #include <string> |
| 4091 | #include <type_traits> |
| 4092 | #include <vector> |
| 4093 | |
| 4094 | -#include <cstdio> |
| 4095 | -#include <cstdlib> |
| 4096 | - |
| 4097 | -#ifndef DEFAULT_N |
| 4098 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; |
| 4099 | -#endif |
| 4100 | - |
| 4101 | enum class benchmark_kinds |
| 4102 | { |
| 4103 | sort_keys, |
| 4104 | sort_pairs |
| 4105 | }; |
| 4106 | |
| 4107 | -namespace rp = rocprim; |
| 4108 | - |
| 4109 | -template<class T> |
| 4110 | -using select_decomposer_t = std:: |
| 4111 | - conditional_t<is_custom_type<T>::value, custom_type_decomposer<T>, rp::identity_decomposer>; |
| 4112 | +template<typename T> |
| 4113 | +using select_decomposer_t = std::conditional_t<common::is_custom_type<T>::value, |
| 4114 | + custom_type_decomposer<T>, |
| 4115 | + rocprim::identity_decomposer>; |
| 4116 | |
| 4117 | -template<class T, |
| 4118 | +template<typename T, |
| 4119 | unsigned int BlockSize, |
| 4120 | unsigned int RadixBitsPerPass, |
| 4121 | unsigned int ItemsPerThread, |
| 4122 | unsigned int Trials> |
| 4123 | -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) |
| 4124 | +__global__ __launch_bounds__(BlockSize) |
| 4125 | +void sort_keys_kernel(const T* input, T* output) |
| 4126 | { |
| 4127 | - const unsigned int lid = threadIdx.x; |
| 4128 | + const unsigned int lid = threadIdx.x; |
| 4129 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 4130 | |
| 4131 | T keys[ItemsPerThread]; |
| 4132 | - rp::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys); |
| 4133 | + rocprim::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys); |
| 4134 | |
| 4135 | ROCPRIM_NO_UNROLL |
| 4136 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 4137 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 4138 | { |
| 4139 | - rp::block_radix_sort<T, |
| 4140 | - BlockSize, |
| 4141 | - ItemsPerThread, |
| 4142 | - rocprim::empty_type, |
| 4143 | - 1, |
| 4144 | - 1, |
| 4145 | - RadixBitsPerPass> |
| 4146 | + rocprim::block_radix_sort<T, |
| 4147 | + BlockSize, |
| 4148 | + ItemsPerThread, |
| 4149 | + rocprim::empty_type, |
| 4150 | + 1, |
| 4151 | + 1, |
| 4152 | + RadixBitsPerPass> |
| 4153 | sort; |
| 4154 | sort.sort(keys, 0, sizeof(T) * 8, select_decomposer_t<T>{}); |
| 4155 | } |
| 4156 | |
| 4157 | - rp::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys); |
| 4158 | + rocprim::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys); |
| 4159 | } |
| 4160 | |
| 4161 | -template<class T, |
| 4162 | +template<typename T, |
| 4163 | unsigned int BlockSize, |
| 4164 | unsigned int RadixBitsPerPass, |
| 4165 | unsigned int ItemsPerThread, |
| 4166 | unsigned int Trials> |
| 4167 | -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) |
| 4168 | +__global__ __launch_bounds__(BlockSize) |
| 4169 | +void sort_pairs_kernel(const T* input, T* output) |
| 4170 | { |
| 4171 | - const unsigned int lid = threadIdx.x; |
| 4172 | + const unsigned int lid = threadIdx.x; |
| 4173 | const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; |
| 4174 | |
| 4175 | T keys[ItemsPerThread]; |
| 4176 | T values[ItemsPerThread]; |
| 4177 | - rp::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys); |
| 4178 | - for(unsigned int i = 0; i < ItemsPerThread; i++) |
| 4179 | + rocprim::block_load_direct_striped<BlockSize>(lid, input + block_offset, keys); |
| 4180 | + for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 4181 | { |
| 4182 | values[i] = keys[i] + T(1); |
| 4183 | } |
| 4184 | |
| 4185 | ROCPRIM_NO_UNROLL |
| 4186 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 4187 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 4188 | { |
| 4189 | - rp::block_radix_sort<T, BlockSize, ItemsPerThread, T, 1, 1, RadixBitsPerPass> sort; |
| 4190 | + rocprim::block_radix_sort<T, BlockSize, ItemsPerThread, T, 1, 1, RadixBitsPerPass> sort; |
| 4191 | sort.sort(keys, values, 0, sizeof(T) * 8, select_decomposer_t<T>{}); |
| 4192 | } |
| 4193 | |
| 4194 | - for(unsigned int i = 0; i < ItemsPerThread; i++) |
| 4195 | + for(unsigned int i = 0; i < ItemsPerThread; ++i) |
| 4196 | { |
| 4197 | keys[i] += values[i]; |
| 4198 | } |
| 4199 | - rp::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys); |
| 4200 | + rocprim::block_store_direct_striped<BlockSize>(lid, output + block_offset, keys); |
| 4201 | } |
| 4202 | |
| 4203 | -template<class T, |
| 4204 | - unsigned int BlockSize, |
| 4205 | - unsigned int RadixBitsPerPass, |
| 4206 | - unsigned int ItemsPerThread, |
| 4207 | - unsigned int Trials = 10> |
| 4208 | -void run_benchmark(benchmark::State& state, |
| 4209 | - benchmark_kinds benchmark_kind, |
| 4210 | - size_t bytes, |
| 4211 | - const managed_seed& seed, |
| 4212 | - hipStream_t stream) |
| 4213 | +template<typename T, |
| 4214 | + benchmark_kinds BenchmarkKind, |
| 4215 | + unsigned int BlockSize, |
| 4216 | + unsigned int RadixBitsPerPass, |
| 4217 | + unsigned int ItemsPerThread, |
| 4218 | + unsigned int Trials = 10> |
| 4219 | +void run_benchmark(benchmark_utils::state&& state) |
| 4220 | { |
| 4221 | + const auto& bytes = state.bytes; |
| 4222 | + const auto& seed = state.seed; |
| 4223 | + const auto& stream = state.stream; |
| 4224 | + |
| 4225 | // Calculate the number of elements N |
| 4226 | size_t N = bytes / sizeof(T); |
| 4227 | - |
| 4228 | + |
| 4229 | constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 4230 | - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); |
| 4231 | + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); |
| 4232 | |
| 4233 | std::vector<T> input = get_random_data<T>(size, |
| 4234 | - generate_limits<T>::min(), |
| 4235 | - generate_limits<T>::max(), |
| 4236 | + common::generate_limits<T>::min(), |
| 4237 | + common::generate_limits<T>::max(), |
| 4238 | seed.get_0()); |
| 4239 | |
| 4240 | - T* d_input; |
| 4241 | - T * d_output; |
| 4242 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T))); |
| 4243 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T))); |
| 4244 | - HIP_CHECK( |
| 4245 | - hipMemcpy( |
| 4246 | - d_input, input.data(), |
| 4247 | - size * sizeof(T), |
| 4248 | - hipMemcpyHostToDevice |
| 4249 | - ) |
| 4250 | - ); |
| 4251 | + common::device_ptr<T> d_input(input); |
| 4252 | + common::device_ptr<T> d_output(size); |
| 4253 | HIP_CHECK(hipDeviceSynchronize()); |
| 4254 | |
| 4255 | - // HIP events creation |
| 4256 | - hipEvent_t start, stop; |
| 4257 | - HIP_CHECK(hipEventCreate(&start)); |
| 4258 | - HIP_CHECK(hipEventCreate(&stop)); |
| 4259 | - |
| 4260 | - for(auto _ : state) |
| 4261 | - { |
| 4262 | - // Record start event |
| 4263 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 4264 | - |
| 4265 | - if(benchmark_kind == benchmark_kinds::sort_keys) |
| 4266 | - { |
| 4267 | - hipLaunchKernelGGL( |
| 4268 | - HIP_KERNEL_NAME( |
| 4269 | - sort_keys_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials>), |
| 4270 | - dim3(size / items_per_block), |
| 4271 | - dim3(BlockSize), |
| 4272 | - 0, |
| 4273 | - stream, |
| 4274 | - d_input, |
| 4275 | - d_output); |
| 4276 | - } |
| 4277 | - else if(benchmark_kind == benchmark_kinds::sort_pairs) |
| 4278 | + state.run( |
| 4279 | + [&] |
| 4280 | { |
| 4281 | - hipLaunchKernelGGL( |
| 4282 | - HIP_KERNEL_NAME( |
| 4283 | - sort_pairs_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials>), |
| 4284 | - dim3(size / items_per_block), |
| 4285 | - dim3(BlockSize), |
| 4286 | - 0, |
| 4287 | - stream, |
| 4288 | - d_input, |
| 4289 | - d_output); |
| 4290 | - } |
| 4291 | - HIP_CHECK(hipGetLastError()); |
| 4292 | - |
| 4293 | - // Record stop event and wait until it completes |
| 4294 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 4295 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 4296 | - |
| 4297 | - float elapsed_mseconds; |
| 4298 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 4299 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 4300 | - } |
| 4301 | - |
| 4302 | - // Destroy HIP events |
| 4303 | - HIP_CHECK(hipEventDestroy(start)); |
| 4304 | - HIP_CHECK(hipEventDestroy(stop)); |
| 4305 | - |
| 4306 | - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); |
| 4307 | - state.SetItemsProcessed(state.iterations() * Trials * size); |
| 4308 | - |
| 4309 | - HIP_CHECK(hipFree(d_input)); |
| 4310 | - HIP_CHECK(hipFree(d_output)); |
| 4311 | + if constexpr(BenchmarkKind == benchmark_kinds::sort_keys) |
| 4312 | + { |
| 4313 | + sort_keys_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials> |
| 4314 | + <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 4315 | + d_output.get()); |
| 4316 | + } |
| 4317 | + else if constexpr(BenchmarkKind == benchmark_kinds::sort_pairs) |
| 4318 | + { |
| 4319 | + sort_pairs_kernel<T, BlockSize, RadixBitsPerPass, ItemsPerThread, Trials> |
| 4320 | + <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 4321 | + d_output.get()); |
| 4322 | + } |
| 4323 | + HIP_CHECK(hipGetLastError()); |
| 4324 | + }); |
| 4325 | + |
| 4326 | + state.set_throughput(size * Trials, sizeof(T)); |
| 4327 | } |
| 4328 | |
| 4329 | #define CREATE_BENCHMARK(T, BS, RB, IPT) \ |
| 4330 | - benchmark::RegisterBenchmark( \ |
| 4331 | + executor.queue_fn( \ |
| 4332 | bench_naming::format_name("{lvl:block,algo:radix_sort,key_type:" #T ",subalgo:" + name \ |
| 4333 | + ",cfg:{bs:" #BS ",rb:" #RB ",ipt:" #IPT "}}") \ |
| 4334 | .c_str(), \ |
| 4335 | - run_benchmark<T, BS, RB, IPT>, \ |
| 4336 | - benchmark_kind, \ |
| 4337 | - bytes, \ |
| 4338 | - seed, \ |
| 4339 | - stream) |
| 4340 | - |
| 4341 | -#define BENCHMARK_TYPE(type, block, radix_bits) \ |
| 4342 | - CREATE_BENCHMARK(type, block, radix_bits, 1), CREATE_BENCHMARK(type, block, radix_bits, 2), \ |
| 4343 | - CREATE_BENCHMARK(type, block, radix_bits, 3), \ |
| 4344 | - CREATE_BENCHMARK(type, block, radix_bits, 4), CREATE_BENCHMARK(type, block, radix_bits, 8) |
| 4345 | - |
| 4346 | -void add_benchmarks(benchmark_kinds benchmark_kind, |
| 4347 | - const std::string& name, |
| 4348 | - std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 4349 | - size_t bytes, |
| 4350 | - const managed_seed& seed, |
| 4351 | - hipStream_t stream) |
| 4352 | + run_benchmark<T, BenchmarkKind, BS, RB, IPT>); |
| 4353 | + |
| 4354 | +#define BENCHMARK_TYPE(type, block, radix_bits) \ |
| 4355 | + CREATE_BENCHMARK(type, block, radix_bits, 1) \ |
| 4356 | + CREATE_BENCHMARK(type, block, radix_bits, 2) \ |
| 4357 | + CREATE_BENCHMARK(type, block, radix_bits, 3) \ |
| 4358 | + CREATE_BENCHMARK(type, block, radix_bits, 4) \ |
| 4359 | + CREATE_BENCHMARK(type, block, radix_bits, 8) |
| 4360 | + |
| 4361 | +template<benchmark_kinds BenchmarkKind> |
| 4362 | +void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) |
| 4363 | { |
| 4364 | - using custom_int_type = custom_type<int, int>; |
| 4365 | - |
| 4366 | - std::vector<benchmark::internal::Benchmark*> bs = { |
| 4367 | - BENCHMARK_TYPE(int, 64, 3), |
| 4368 | - BENCHMARK_TYPE(int, 512, 3), |
| 4369 | - |
| 4370 | - BENCHMARK_TYPE(int, 64, 4), |
| 4371 | - BENCHMARK_TYPE(int, 128, 4), |
| 4372 | - BENCHMARK_TYPE(int, 192, 4), |
| 4373 | - BENCHMARK_TYPE(int, 256, 4), |
| 4374 | - BENCHMARK_TYPE(int, 320, 4), |
| 4375 | - BENCHMARK_TYPE(int, 512, 4), |
| 4376 | - |
| 4377 | - BENCHMARK_TYPE(int8_t, 64, 3), |
| 4378 | - BENCHMARK_TYPE(int8_t, 512, 3), |
| 4379 | - |
| 4380 | - BENCHMARK_TYPE(int8_t, 64, 4), |
| 4381 | - BENCHMARK_TYPE(int8_t, 128, 4), |
| 4382 | - BENCHMARK_TYPE(int8_t, 192, 4), |
| 4383 | - BENCHMARK_TYPE(int8_t, 256, 4), |
| 4384 | - BENCHMARK_TYPE(int8_t, 320, 4), |
| 4385 | - BENCHMARK_TYPE(int8_t, 512, 4), |
| 4386 | - |
| 4387 | - BENCHMARK_TYPE(uint8_t, 64, 3), |
| 4388 | - BENCHMARK_TYPE(uint8_t, 512, 3), |
| 4389 | - |
| 4390 | - BENCHMARK_TYPE(uint8_t, 64, 4), |
| 4391 | - BENCHMARK_TYPE(uint8_t, 128, 4), |
| 4392 | - BENCHMARK_TYPE(uint8_t, 192, 4), |
| 4393 | - BENCHMARK_TYPE(uint8_t, 256, 4), |
| 4394 | - BENCHMARK_TYPE(uint8_t, 320, 4), |
| 4395 | - BENCHMARK_TYPE(uint8_t, 512, 4), |
| 4396 | - |
| 4397 | - BENCHMARK_TYPE(rocprim::half, 64, 3), |
| 4398 | - BENCHMARK_TYPE(rocprim::half, 512, 3), |
| 4399 | - |
| 4400 | - BENCHMARK_TYPE(rocprim::half, 64, 4), |
| 4401 | - BENCHMARK_TYPE(rocprim::half, 128, 4), |
| 4402 | - BENCHMARK_TYPE(rocprim::half, 192, 4), |
| 4403 | - BENCHMARK_TYPE(rocprim::half, 256, 4), |
| 4404 | - BENCHMARK_TYPE(rocprim::half, 320, 4), |
| 4405 | - BENCHMARK_TYPE(rocprim::half, 512, 4), |
| 4406 | - |
| 4407 | - BENCHMARK_TYPE(long long, 64, 3), |
| 4408 | - BENCHMARK_TYPE(long long, 512, 3), |
| 4409 | - |
| 4410 | - BENCHMARK_TYPE(long long, 64, 4), |
| 4411 | - BENCHMARK_TYPE(long long, 128, 4), |
| 4412 | - BENCHMARK_TYPE(long long, 192, 4), |
| 4413 | - BENCHMARK_TYPE(long long, 256, 4), |
| 4414 | - BENCHMARK_TYPE(long long, 320, 4), |
| 4415 | - BENCHMARK_TYPE(long long, 512, 4), |
| 4416 | - |
| 4417 | - BENCHMARK_TYPE(custom_int_type, 64, 3), |
| 4418 | - BENCHMARK_TYPE(custom_int_type, 512, 3), |
| 4419 | - |
| 4420 | - BENCHMARK_TYPE(custom_int_type, 64, 4), |
| 4421 | - BENCHMARK_TYPE(custom_int_type, 128, 4), |
| 4422 | - BENCHMARK_TYPE(custom_int_type, 192, 4), |
| 4423 | - BENCHMARK_TYPE(custom_int_type, 256, 4), |
| 4424 | - BENCHMARK_TYPE(custom_int_type, 320, 4), |
| 4425 | - BENCHMARK_TYPE(custom_int_type, 512, 4), |
| 4426 | - }; |
| 4427 | - |
| 4428 | - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); |
| 4429 | + using custom_int_type = common::custom_type<int, int>; |
| 4430 | + |
| 4431 | + BENCHMARK_TYPE(int, 64, 3) |
| 4432 | + BENCHMARK_TYPE(int, 512, 3) |
| 4433 | + |
| 4434 | + BENCHMARK_TYPE(int, 64, 4) |
| 4435 | + BENCHMARK_TYPE(int, 128, 4) |
| 4436 | + BENCHMARK_TYPE(int, 192, 4) |
| 4437 | + BENCHMARK_TYPE(int, 256, 4) |
| 4438 | + BENCHMARK_TYPE(int, 320, 4) |
| 4439 | + BENCHMARK_TYPE(int, 512, 4) |
| 4440 | + |
| 4441 | + BENCHMARK_TYPE(int8_t, 64, 3) |
| 4442 | + BENCHMARK_TYPE(int8_t, 512, 3) |
| 4443 | + |
| 4444 | + BENCHMARK_TYPE(int8_t, 64, 4) |
| 4445 | + BENCHMARK_TYPE(int8_t, 128, 4) |
| 4446 | + BENCHMARK_TYPE(int8_t, 192, 4) |
| 4447 | + BENCHMARK_TYPE(int8_t, 256, 4) |
| 4448 | + BENCHMARK_TYPE(int8_t, 320, 4) |
| 4449 | + BENCHMARK_TYPE(int8_t, 512, 4) |
| 4450 | + |
| 4451 | + BENCHMARK_TYPE(uint8_t, 64, 3) |
| 4452 | + BENCHMARK_TYPE(uint8_t, 512, 3) |
| 4453 | + |
| 4454 | + BENCHMARK_TYPE(uint8_t, 64, 4) |
| 4455 | + BENCHMARK_TYPE(uint8_t, 128, 4) |
| 4456 | + BENCHMARK_TYPE(uint8_t, 192, 4) |
| 4457 | + BENCHMARK_TYPE(uint8_t, 256, 4) |
| 4458 | + BENCHMARK_TYPE(uint8_t, 320, 4) |
| 4459 | + BENCHMARK_TYPE(uint8_t, 512, 4) |
| 4460 | + |
| 4461 | + BENCHMARK_TYPE(rocprim::half, 64, 3) |
| 4462 | + BENCHMARK_TYPE(rocprim::half, 512, 3) |
| 4463 | + |
| 4464 | + BENCHMARK_TYPE(rocprim::half, 64, 4) |
| 4465 | + BENCHMARK_TYPE(rocprim::half, 128, 4) |
| 4466 | + BENCHMARK_TYPE(rocprim::half, 192, 4) |
| 4467 | + BENCHMARK_TYPE(rocprim::half, 256, 4) |
| 4468 | + BENCHMARK_TYPE(rocprim::half, 320, 4) |
| 4469 | + BENCHMARK_TYPE(rocprim::half, 512, 4) |
| 4470 | + |
| 4471 | + BENCHMARK_TYPE(long long, 64, 3) |
| 4472 | + BENCHMARK_TYPE(long long, 512, 3) |
| 4473 | + |
| 4474 | + BENCHMARK_TYPE(long long, 64, 4) |
| 4475 | + BENCHMARK_TYPE(long long, 128, 4) |
| 4476 | + BENCHMARK_TYPE(long long, 192, 4) |
| 4477 | + BENCHMARK_TYPE(long long, 256, 4) |
| 4478 | + BENCHMARK_TYPE(long long, 320, 4) |
| 4479 | + BENCHMARK_TYPE(long long, 512, 4) |
| 4480 | + |
| 4481 | + BENCHMARK_TYPE(custom_int_type, 64, 3) |
| 4482 | + BENCHMARK_TYPE(custom_int_type, 512, 3) |
| 4483 | + |
| 4484 | + BENCHMARK_TYPE(custom_int_type, 64, 4) |
| 4485 | + BENCHMARK_TYPE(custom_int_type, 128, 4) |
| 4486 | + BENCHMARK_TYPE(custom_int_type, 192, 4) |
| 4487 | + BENCHMARK_TYPE(custom_int_type, 256, 4) |
| 4488 | + BENCHMARK_TYPE(custom_int_type, 320, 4) |
| 4489 | + BENCHMARK_TYPE(custom_int_type, 512, 4) |
| 4490 | + |
| 4491 | + BENCHMARK_TYPE(rocprim::int128_t, 64, 3) |
| 4492 | + BENCHMARK_TYPE(rocprim::int128_t, 512, 3) |
| 4493 | + |
| 4494 | + BENCHMARK_TYPE(rocprim::int128_t, 64, 4) |
| 4495 | + BENCHMARK_TYPE(rocprim::int128_t, 128, 4) |
| 4496 | + BENCHMARK_TYPE(rocprim::int128_t, 192, 4) |
| 4497 | + BENCHMARK_TYPE(rocprim::int128_t, 256, 4) |
| 4498 | + BENCHMARK_TYPE(rocprim::int128_t, 320, 4) |
| 4499 | + BENCHMARK_TYPE(rocprim::int128_t, 512, 4) |
| 4500 | + |
| 4501 | + BENCHMARK_TYPE(rocprim::uint128_t, 64, 3) |
| 4502 | + BENCHMARK_TYPE(rocprim::uint128_t, 512, 3) |
| 4503 | + |
| 4504 | + BENCHMARK_TYPE(rocprim::uint128_t, 64, 4) |
| 4505 | + BENCHMARK_TYPE(rocprim::uint128_t, 128, 4) |
| 4506 | + BENCHMARK_TYPE(rocprim::uint128_t, 192, 4) |
| 4507 | + BENCHMARK_TYPE(rocprim::uint128_t, 256, 4) |
| 4508 | + BENCHMARK_TYPE(rocprim::uint128_t, 320, 4) |
| 4509 | + BENCHMARK_TYPE(rocprim::uint128_t, 512, 4) |
| 4510 | } |
| 4511 | |
| 4512 | -int main(int argc, char *argv[]) |
| 4513 | +int main(int argc, char* argv[]) |
| 4514 | { |
| 4515 | - cli::Parser parser(argc, argv); |
| 4516 | - parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes"); |
| 4517 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 4518 | - parser.set_optional<std::string>("name_format", |
| 4519 | - "name_format", |
| 4520 | - "human", |
| 4521 | - "either: json,human,txt"); |
| 4522 | - parser.set_optional<std::string>("seed", "seed", "random", get_seed_message()); |
| 4523 | - parser.run_and_exit_if_error(); |
| 4524 | - |
| 4525 | - // Parse argv |
| 4526 | - benchmark::Initialize(&argc, argv); |
| 4527 | - const size_t bytes = parser.get<size_t>("size"); |
| 4528 | - const int trials = parser.get<int>("trials"); |
| 4529 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 4530 | - const std::string seed_type = parser.get<std::string>("seed"); |
| 4531 | - const managed_seed seed(seed_type); |
| 4532 | - |
| 4533 | - // HIP |
| 4534 | - hipStream_t stream = 0; // default |
| 4535 | - |
| 4536 | - // Benchmark info |
| 4537 | - add_common_benchmark_info(); |
| 4538 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 4539 | - benchmark::AddCustomContext("seed", seed_type); |
| 4540 | - |
| 4541 | - // Add benchmarks |
| 4542 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 4543 | - add_benchmarks(benchmark_kinds::sort_keys, "keys", benchmarks, bytes, seed, stream); |
| 4544 | - add_benchmarks(benchmark_kinds::sort_pairs, "pairs", benchmarks, bytes, seed, stream); |
| 4545 | - |
| 4546 | - // Use manual timing |
| 4547 | - for(auto& b : benchmarks) |
| 4548 | - { |
| 4549 | - b->UseManualTime(); |
| 4550 | - b->Unit(benchmark::kMillisecond); |
| 4551 | - } |
| 4552 | + benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); |
| 4553 | |
| 4554 | - // Force number of iterations |
| 4555 | - if(trials > 0) |
| 4556 | - { |
| 4557 | - for(auto& b : benchmarks) |
| 4558 | - { |
| 4559 | - b->Iterations(trials); |
| 4560 | - } |
| 4561 | - } |
| 4562 | + add_benchmarks<benchmark_kinds::sort_keys>("keys", executor); |
| 4563 | + add_benchmarks<benchmark_kinds::sort_pairs>("pairs", executor); |
| 4564 | |
| 4565 | - // Run benchmarks |
| 4566 | - benchmark::RunSpecifiedBenchmarks(); |
| 4567 | - return 0; |
| 4568 | + executor.run(); |
| 4569 | } |
| 4570 | diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp |
| 4571 | index 2571654..27678d0 100644 |
| 4572 | --- a/benchmark/benchmark_block_reduce.cpp |
| 4573 | +++ b/benchmark/benchmark_block_reduce.cpp |
| 4574 | @@ -1,6 +1,6 @@ |
| 4575 | // MIT License |
| 4576 | // |
| 4577 | -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 4578 | +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 4579 | // |
| 4580 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4581 | // of this software and associated documentation files (the "Software"), to deal |
| 4582 | @@ -21,41 +21,29 @@ |
| 4583 | // SOFTWARE. |
| 4584 | |
| 4585 | #include "benchmark_utils.hpp" |
| 4586 | -// CmdParser |
| 4587 | -#include "cmdparser.hpp" |
| 4588 | |
| 4589 | -// Google Benchmark |
| 4590 | -#include <benchmark/benchmark.h> |
| 4591 | +#include "../common/utils_custom_type.hpp" |
| 4592 | +#include "../common/utils_device_ptr.hpp" |
| 4593 | |
| 4594 | // HIP API |
| 4595 | #include <hip/hip_runtime.h> |
| 4596 | |
| 4597 | // rocPRIM |
| 4598 | #include <rocprim/block/block_reduce.hpp> |
| 4599 | +#include <rocprim/config.hpp> |
| 4600 | +#include <rocprim/types.hpp> |
| 4601 | |
| 4602 | -#include <iostream> |
| 4603 | -#include <limits> |
| 4604 | +#include <cstddef> |
| 4605 | +#include <stdint.h> |
| 4606 | #include <string> |
| 4607 | #include <vector> |
| 4608 | |
| 4609 | -#include <cstdio> |
| 4610 | -#include <cstdlib> |
| 4611 | - |
| 4612 | -#ifndef DEFAULT_N |
| 4613 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; |
| 4614 | -#endif |
| 4615 | - |
| 4616 | -namespace rp = rocprim; |
| 4617 | - |
| 4618 | -template< |
| 4619 | - class Runner, |
| 4620 | - class T, |
| 4621 | - unsigned int BlockSize, |
| 4622 | - unsigned int ItemsPerThread, |
| 4623 | - unsigned int Trials |
| 4624 | -> |
| 4625 | -__global__ |
| 4626 | -__launch_bounds__(BlockSize) |
| 4627 | +template<typename Runner, |
| 4628 | + typename T, |
| 4629 | + unsigned int BlockSize, |
| 4630 | + unsigned int ItemsPerThread, |
| 4631 | + unsigned int Trials> |
| 4632 | +__global__ __launch_bounds__(BlockSize) |
| 4633 | void kernel(const T* input, T* output) |
| 4634 | { |
| 4635 | Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output); |
| 4636 | @@ -64,12 +52,7 @@ void kernel(const T* input, T* output) |
| 4637 | template<rocprim::block_reduce_algorithm algorithm> |
| 4638 | struct reduce |
| 4639 | { |
| 4640 | - template< |
| 4641 | - class T, |
| 4642 | - unsigned int BlockSize, |
| 4643 | - unsigned int ItemsPerThread, |
| 4644 | - unsigned int Trials |
| 4645 | - > |
| 4646 | + template<typename T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials> |
| 4647 | __device__ |
| 4648 | static void run(const T* input, T* output) |
| 4649 | { |
| 4650 | @@ -77,16 +60,16 @@ struct reduce |
| 4651 | |
| 4652 | T values[ItemsPerThread]; |
| 4653 | T reduced_value; |
| 4654 | - for(unsigned int k = 0; k < ItemsPerThread; k++) |
| 4655 | + for(unsigned int k = 0; k < ItemsPerThread; ++k) |
| 4656 | { |
| 4657 | values[k] = input[i * ItemsPerThread + k]; |
| 4658 | } |
| 4659 | |
| 4660 | - using breduce_t = rp::block_reduce<T, BlockSize, algorithm>; |
| 4661 | + using breduce_t = rocprim::block_reduce<T, BlockSize, algorithm>; |
| 4662 | __shared__ typename breduce_t::storage_type storage; |
| 4663 | |
| 4664 | ROCPRIM_NO_UNROLL |
| 4665 | - for(unsigned int trial = 0; trial < Trials; trial++) |
| 4666 | + for(unsigned int trial = 0; trial < Trials; ++trial) |
| 4667 | { |
| 4668 | breduce_t().reduce(values, reduced_value, storage); |
| 4669 | values[0] = reduced_value; |
| 4670 | @@ -99,193 +82,113 @@ struct reduce |
| 4671 | } |
| 4672 | }; |
| 4673 | |
| 4674 | -template< |
| 4675 | - class Benchmark, |
| 4676 | - class T, |
| 4677 | - unsigned int BlockSize, |
| 4678 | - unsigned int ItemsPerThread, |
| 4679 | - unsigned int Trials = 100 |
| 4680 | -> |
| 4681 | -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) |
| 4682 | +template<typename Benchmark, |
| 4683 | + typename T, |
| 4684 | + unsigned int BlockSize, |
| 4685 | + unsigned int ItemsPerThread, |
| 4686 | + unsigned int Trials = 100> |
| 4687 | +void run_benchmark(benchmark_utils::state&& state) |
| 4688 | { |
| 4689 | + const auto& bytes = state.bytes; |
| 4690 | + const auto& stream = state.stream; |
| 4691 | + |
| 4692 | // Calculate the number of elements N |
| 4693 | size_t N = bytes / sizeof(T); |
| 4694 | // Make sure size is a multiple of BlockSize |
| 4695 | constexpr auto items_per_block = BlockSize * ItemsPerThread; |
| 4696 | - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); |
| 4697 | + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); |
| 4698 | // Allocate and fill memory |
| 4699 | std::vector<T> input(size, T(1)); |
| 4700 | - T * d_input; |
| 4701 | - T * d_output; |
| 4702 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T))); |
| 4703 | - HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T))); |
| 4704 | - HIP_CHECK( |
| 4705 | - hipMemcpy( |
| 4706 | - d_input, input.data(), |
| 4707 | - size * sizeof(T), |
| 4708 | - hipMemcpyHostToDevice |
| 4709 | - ) |
| 4710 | - ); |
| 4711 | + common::device_ptr<T> d_input(input); |
| 4712 | + common::device_ptr<T> d_output(size); |
| 4713 | HIP_CHECK(hipDeviceSynchronize()); |
| 4714 | |
| 4715 | - // HIP events creation |
| 4716 | - hipEvent_t start, stop; |
| 4717 | - HIP_CHECK(hipEventCreate(&start)); |
| 4718 | - HIP_CHECK(hipEventCreate(&stop)); |
| 4719 | - |
| 4720 | - for (auto _ : state) |
| 4721 | - { |
| 4722 | - // Record start event |
| 4723 | - HIP_CHECK(hipEventRecord(start, stream)); |
| 4724 | - |
| 4725 | - hipLaunchKernelGGL( |
| 4726 | - HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>), |
| 4727 | - dim3(size/items_per_block), dim3(BlockSize), 0, stream, |
| 4728 | - d_input, d_output |
| 4729 | - ); |
| 4730 | - HIP_CHECK(hipGetLastError()); |
| 4731 | - |
| 4732 | - // Record stop event and wait until it completes |
| 4733 | - HIP_CHECK(hipEventRecord(stop, stream)); |
| 4734 | - HIP_CHECK(hipEventSynchronize(stop)); |
| 4735 | - |
| 4736 | - float elapsed_mseconds; |
| 4737 | - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); |
| 4738 | - state.SetIterationTime(elapsed_mseconds / 1000); |
| 4739 | - } |
| 4740 | - |
| 4741 | - // Destroy HIP events |
| 4742 | - HIP_CHECK(hipEventDestroy(start)); |
| 4743 | - HIP_CHECK(hipEventDestroy(stop)); |
| 4744 | - |
| 4745 | - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); |
| 4746 | - state.SetItemsProcessed(state.iterations() * size * Trials); |
| 4747 | + state.run( |
| 4748 | + [&] |
| 4749 | + { |
| 4750 | + kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials> |
| 4751 | + <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input.get(), |
| 4752 | + d_output.get()); |
| 4753 | + HIP_CHECK(hipGetLastError()); |
| 4754 | + }); |
| 4755 | |
| 4756 | - HIP_CHECK(hipFree(d_input)); |
| 4757 | - HIP_CHECK(hipFree(d_output)); |
| 4758 | + state.set_throughput(size * Trials, sizeof(T)); |
| 4759 | } |
| 4760 | |
| 4761 | -// IPT - items per thread |
| 4762 | -#define CREATE_BENCHMARK(T, BS, IPT) \ |
| 4763 | - benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \ |
| 4764 | - ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ |
| 4765 | - + method_name + "}}") \ |
| 4766 | - .c_str(), \ |
| 4767 | - run_benchmark<Benchmark, T, BS, IPT>, \ |
| 4768 | - stream, \ |
| 4769 | - bytes) |
| 4770 | - |
| 4771 | -#define BENCHMARK_TYPE(type, block) \ |
| 4772 | - CREATE_BENCHMARK(type, block, 1), \ |
| 4773 | - CREATE_BENCHMARK(type, block, 2), \ |
| 4774 | - CREATE_BENCHMARK(type, block, 3), \ |
| 4775 | - CREATE_BENCHMARK(type, block, 4), \ |
| 4776 | - CREATE_BENCHMARK(type, block, 8), \ |
| 4777 | - CREATE_BENCHMARK(type, block, 11), \ |
| 4778 | +#define CREATE_BENCHMARK(T, BS, IPT) \ |
| 4779 | + executor.queue_fn(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \ |
| 4780 | + ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ |
| 4781 | + + name + "}}") \ |
| 4782 | + .c_str(), \ |
| 4783 | + run_benchmark<Benchmark, T, BS, IPT>); |
| 4784 | + |
| 4785 | +#define BENCHMARK_TYPE(type, block) \ |
| 4786 | + CREATE_BENCHMARK(type, block, 1) \ |
| 4787 | + CREATE_BENCHMARK(type, block, 2) \ |
| 4788 | + CREATE_BENCHMARK(type, block, 3) \ |
| 4789 | + CREATE_BENCHMARK(type, block, 4) \ |
| 4790 | + CREATE_BENCHMARK(type, block, 8) \ |
| 4791 | + CREATE_BENCHMARK(type, block, 11) \ |
| 4792 | CREATE_BENCHMARK(type, block, 16) |
| 4793 | |
| 4794 | -template<class Benchmark> |
| 4795 | -void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks, |
| 4796 | - const std::string& method_name, |
| 4797 | - hipStream_t stream, |
| 4798 | - size_t bytes) |
| 4799 | +template<typename Benchmark> |
| 4800 | +void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) |
| 4801 | { |
| 4802 | - using custom_float2 = custom_type<float, float>; |
| 4803 | - using custom_double2 = custom_type<double, double>; |
| 4804 | - |
| 4805 | - std::vector<benchmark::internal::Benchmark*> new_benchmarks = |
| 4806 | - { |
| 4807 | - // When block size is less than or equal to warp size |
| 4808 | - BENCHMARK_TYPE(int, 64), |
| 4809 | - BENCHMARK_TYPE(float, 64), |
| 4810 | - BENCHMARK_TYPE(double, 64), |
| 4811 | - BENCHMARK_TYPE(int8_t, 64), |
| 4812 | - BENCHMARK_TYPE(uint8_t, 64), |
| 4813 | - BENCHMARK_TYPE(rocprim::half, 64), |
| 4814 | - |
| 4815 | - BENCHMARK_TYPE(int, 256), |
| 4816 | - BENCHMARK_TYPE(float, 256), |
| 4817 | - BENCHMARK_TYPE(double, 256), |
| 4818 | - BENCHMARK_TYPE(int8_t, 256), |
| 4819 | - BENCHMARK_TYPE(uint8_t, 256), |
| 4820 | - BENCHMARK_TYPE(rocprim::half, 256), |
| 4821 | - |
| 4822 | - CREATE_BENCHMARK(custom_float2, 256, 1), |
| 4823 | - CREATE_BENCHMARK(custom_float2, 256, 4), |
| 4824 | - CREATE_BENCHMARK(custom_float2, 256, 8), |
| 4825 | - |
| 4826 | - CREATE_BENCHMARK(float2, 256, 1), |
| 4827 | - CREATE_BENCHMARK(float2, 256, 4), |
| 4828 | - CREATE_BENCHMARK(float2, 256, 8), |
| 4829 | - |
| 4830 | - CREATE_BENCHMARK(custom_double2, 256, 1), |
| 4831 | - CREATE_BENCHMARK(custom_double2, 256, 4), |
| 4832 | - CREATE_BENCHMARK(custom_double2, 256, 8), |
| 4833 | - |
| 4834 | - CREATE_BENCHMARK(double2, 256, 1), |
| 4835 | - CREATE_BENCHMARK(double2, 256, 4), |
| 4836 | - CREATE_BENCHMARK(double2, 256, 8), |
| 4837 | - |
| 4838 | - CREATE_BENCHMARK(float4, 256, 1), |
| 4839 | - CREATE_BENCHMARK(float4, 256, 4), |
| 4840 | - CREATE_BENCHMARK(float4, 256, 8), |
| 4841 | - }; |
| 4842 | - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); |
| 4843 | + using custom_float2 = common::custom_type<float, float>; |
| 4844 | + using custom_double2 = common::custom_type<double, double>; |
| 4845 | + |
| 4846 | + // When block size is less than or equal to warp size |
| 4847 | + BENCHMARK_TYPE(int, 64) |
| 4848 | + BENCHMARK_TYPE(float, 64) |
| 4849 | + BENCHMARK_TYPE(double, 64) |
| 4850 | + BENCHMARK_TYPE(int8_t, 64) |
| 4851 | + BENCHMARK_TYPE(uint8_t, 64) |
| 4852 | + BENCHMARK_TYPE(rocprim::half, 64) |
| 4853 | + BENCHMARK_TYPE(rocprim::int128_t, 64) |
| 4854 | + BENCHMARK_TYPE(rocprim::uint128_t, 64) |
| 4855 | + |
| 4856 | + BENCHMARK_TYPE(int, 256) |
| 4857 | + BENCHMARK_TYPE(float, 256) |
| 4858 | + BENCHMARK_TYPE(double, 256) |
| 4859 | + BENCHMARK_TYPE(int8_t, 256) |
| 4860 | + BENCHMARK_TYPE(uint8_t, 256) |
| 4861 | + BENCHMARK_TYPE(rocprim::half, 256) |
| 4862 | + BENCHMARK_TYPE(rocprim::int128_t, 256) |
| 4863 | + BENCHMARK_TYPE(rocprim::uint128_t, 256) |
| 4864 | + |
| 4865 | + CREATE_BENCHMARK(custom_float2, 256, 1) |
| 4866 | + CREATE_BENCHMARK(custom_float2, 256, 4) |
| 4867 | + CREATE_BENCHMARK(custom_float2, 256, 8) |
| 4868 | + |
| 4869 | + CREATE_BENCHMARK(float2, 256, 1) |
| 4870 | + CREATE_BENCHMARK(float2, 256, 4) |
| 4871 | + CREATE_BENCHMARK(float2, 256, 8) |
| 4872 | + |
| 4873 | + CREATE_BENCHMARK(custom_double2, 256, 1) |
| 4874 | + CREATE_BENCHMARK(custom_double2, 256, 4) |
| 4875 | + CREATE_BENCHMARK(custom_double2, 256, 8) |
| 4876 | + |
| 4877 | + CREATE_BENCHMARK(double2, 256, 1) |
| 4878 | + CREATE_BENCHMARK(double2, 256, 4) |
| 4879 | + CREATE_BENCHMARK(double2, 256, 8) |
| 4880 | + |
| 4881 | + CREATE_BENCHMARK(float4, 256, 1) |
| 4882 | + CREATE_BENCHMARK(float4, 256, 4) |
| 4883 | + CREATE_BENCHMARK(float4, 256, 8) |
| 4884 | } |
| 4885 | |
| 4886 | -int main(int argc, char *argv[]) |
| 4887 | +int main(int argc, char* argv[]) |
| 4888 | { |
| 4889 | - cli::Parser parser(argc, argv); |
| 4890 | - parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes"); |
| 4891 | - parser.set_optional<int>("trials", "trials", -1, "number of iterations"); |
| 4892 | - parser.set_optional<std::string>("name_format", |
| 4893 | - "name_format", |
| 4894 | - "human", |
| 4895 | - "either: json,human,txt"); |
| 4896 | - parser.run_and_exit_if_error(); |
| 4897 | - |
| 4898 | - // Parse argv |
| 4899 | - benchmark::Initialize(&argc, argv); |
| 4900 | - const size_t bytes = parser.get<size_t>("size"); |
| 4901 | - const int trials = parser.get<int>("trials"); |
| 4902 | - bench_naming::set_format(parser.get<std::string>("name_format")); |
| 4903 | + benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); |
| 4904 | |
| 4905 | - // HIP |
| 4906 | - hipStream_t stream = 0; // default |
| 4907 | - |
| 4908 | - // Benchmark info |
| 4909 | - add_common_benchmark_info(); |
| 4910 | - benchmark::AddCustomContext("bytes", std::to_string(bytes)); |
| 4911 | - |
| 4912 | - // Add benchmarks |
| 4913 | - std::vector<benchmark::internal::Benchmark*> benchmarks; |
| 4914 | - // using_warp_scan |
| 4915 | using reduce_uwr_t = reduce<rocprim::block_reduce_algorithm::using_warp_reduce>; |
| 4916 | - add_benchmarks<reduce_uwr_t>(benchmarks, "using_warp_reduce", stream, bytes); |
| 4917 | - // reduce then scan |
| 4918 | - using reduce_rr_t = reduce<rocprim::block_reduce_algorithm::raking_reduce>; |
| 4919 | - add_benchmarks<reduce_rr_t>(benchmarks, "raking_reduce", stream, bytes); |
| 4920 | - // reduce commutative only |
| 4921 | - using reduce_rrco_t = reduce<rocprim::block_reduce_algorithm::raking_reduce_commutative_only>; |
| 4922 | - add_benchmarks<reduce_rrco_t>(benchmarks, "raking_reduce_commutative_only", stream, bytes); |
| 4923 | + add_benchmarks<reduce_uwr_t>("using_warp_reduce", executor); |
| 4924 | |
| 4925 | - // Use manual timing |
| 4926 | - for(auto& b : benchmarks) |
| 4927 | - { |
| 4928 | - b->UseManualTime(); |
| 4929 | - b->Unit(benchmark::kMillisecond); |
| 4930 | - } |
| 4931 | + using reduce_rr_t = reduce<rocprim::block_reduce_algorithm::raking_reduce>; |
| 4932 | + add_benchmarks<reduce_rr_t>("raking_reduce", executor); |
| 4933 | |
| 4934 | - // Force number of iterations |
| 4935 | - if(trials > 0) |
| 4936 | - { |
| 4937 | - for(auto& b : benchmarks) |
| 4938 | - { |
| 4939 | - b->Iterations(trials); |
| 4940 | - } |
| 4941 | - } |
| 4942 | + using reduce_rrco_t = reduce<rocprim::block_reduce_algorithm::raking_reduce_commutative_only>; |
| 4943 | + add_benchmarks<reduce_rrco_t>("raking_reduce_commutative_only", executor); |
| 4944 | |
| 4945 | - // Run benchmarks |
| 4946 | - benchmark::RunSpecifiedBenchmarks(); |
| 4947 | - return 0; |
| 4948 | + executor.run(); |
| 4949 | } |
| 4950 | diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp |
| 4951 | index e56bf09..3b20d0f 100644 |
| 4952 | --- a/benchmark/benchmark_block_run_length_decode.cpp |
| 4953 | +++ b/benchmark/benchmark_block_run_length_decode.cpp |
| 4954 | @@ -1,6 +1,6 @@ |
| 4955 | // MIT License |
| 4956 | // |
| 4957 | -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 4958 | +// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 4959 | // |
| 4960 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4961 | // of this software and associated documentation files (the "Software"), to deal |
| 4962 | @@ -21,32 +21,33 @@ |
| 4963 | // SOFTWARE. |
| 4964 | |
| 4965 | #include "benchmark_utils.hpp" |
| 4966 | -#include "cmdparser.hpp" |
| 4967 | |
| 4968 | -#include <benchmark/benchmark.h> |
| 4969 | +#include "../common/utils_data_generation.hpp" |
| 4970 | +#include "../common/utils_device_ptr.hpp" |
| 4971 | |
| 4972 | #include <rocprim/block/block_load_func.hpp> |
| 4973 | #include <rocprim/block/block_run_length_decode.hpp> |
| 4974 | #include <rocprim/block/block_store_func.hpp> |
| 4975 | +#include <rocprim/type_traits.hpp> |
| 4976 | +#include <rocprim/types.hpp> |
| 4977 | |
| 4978 | -#include <random> |
| 4979 | +#include <chrono> |
| 4980 | +#include <cstddef> |
| 4981 | +#include <string> |
| 4982 | +#include <type_traits> |
| 4983 | #include <vector> |
| 4984 | |
| 4985 | -#ifndef DEFAULT_N |
| 4986 | -const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; |
| 4987 | -#endif |
| 4988 | - |
| 4989 | -template<class ItemT, |
| 4990 | - class OffsetT, |
| 4991 | +template<typename ItemT, |
| 4992 | + typename OffsetT, |
| 4993 | unsigned BlockSize, |
| 4994 | unsigned RunsPerThread, |
| 4995 | unsigned DecodedItemsPerThread, |
| 4996 | unsigned Trials> |
| 4997 | -__global__ |
| 4998 | - __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, |
| 4999 | - const OffsetT* d_run_offsets, |
| 5000 | - ItemT* d_decoded_items, |

Uploaded to ppa: https:/ /launchpad. net/~igorluppi/ +archive/ ubuntu/ rocprim- lp2140327/ +packages
(-proposed and archs enabled)