Merge ~bullwinkle-team/ubuntu/+source/rocthrust:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocthrust:ubuntu/devel

Proposed by Bojan Aleksovski
Status: Approved
Approved by: Andreas Hasenack
Approved revision: 86ad8de1ddef993aa9597ccd224574c0484d384f
Proposed branch: ~bullwinkle-team/ubuntu/+source/rocthrust:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/rocthrust:ubuntu/devel
Diff against target: 361892 lines (+132806/-124112)
1519 files modified
.clang-format (+164/-99)
.githooks/install (+1/-1)
.github/CODEOWNERS (+1/-1)
.gitlab-ci.yml (+135/-28)
.gitlab/run_benchmarks.py (+1/-1)
CHANGELOG.md (+75/-4)
CMakeLists.txt (+86/-44)
CONTRIBUTING.md (+2/-2)
NOTICES.txt (+39/-28)
README.md (+132/-49)
benchmark/bench/adjacent_difference/basic.cu (+115/-109)
benchmark/bench/adjacent_difference/custom.cu (+126/-122)
benchmark/bench/adjacent_difference/in_place.cu (+116/-115)
benchmark/bench/copy/basic.cu (+191/-0)
benchmark/bench/copy/if.cu (+203/-0)
benchmark/bench/equal/basic.cu (+145/-0)
benchmark/bench/fill/basic.cu (+116/-109)
benchmark/bench/for_each/basic.cu (+169/-0)
benchmark/bench/inner_product/basic.cu (+122/-118)
benchmark/bench/merge/basic.cu (+187/-0)
benchmark/bench/partition/basic.cu (+156/-144)
benchmark/bench/reduce/basic.cu (+116/-114)
benchmark/bench/reduce/by_key.cu (+211/-0)
benchmark/bench/scan/exclusive/by_key.cu (+177/-0)
benchmark/bench/scan/exclusive/max.cu (+118/-116)
benchmark/bench/scan/exclusive/sum.cu (+116/-112)
benchmark/bench/scan/inclusive/by_key.cu (+177/-0)
benchmark/bench/scan/inclusive/max.cu (+123/-110)
benchmark/bench/scan/inclusive/sum.cu (+116/-112)
benchmark/bench/set_operations/base.hpp (+161/-0)
benchmark/bench/set_operations/by_key.hpp (+183/-0)
benchmark/bench/set_operations/difference.cu (+38/-40)
benchmark/bench/set_operations/difference_by_key.cu (+49/-58)
benchmark/bench/set_operations/intersection.cu (+38/-40)
benchmark/bench/set_operations/intersection_by_key.cu (+49/-57)
benchmark/bench/set_operations/symmetric_difference.cu (+38/-40)
benchmark/bench/set_operations/symmetric_difference_by_key.cu (+49/-58)
benchmark/bench/set_operations/union.cu (+38/-40)
benchmark/bench/set_operations/union_by_key.cu (+49/-58)
benchmark/bench/shuffle/basic.cu (+194/-0)
benchmark/bench/sort/keys.cu (+127/-124)
benchmark/bench/sort/keys_custom.cu (+127/-124)
benchmark/bench/sort/pairs.cu (+170/-0)
benchmark/bench/sort/pairs_custom.cu (+171/-0)
benchmark/bench/tabulate/basic.cu (+117/-113)
benchmark/bench/transform/basic.cu (+387/-0)
benchmark/bench/transform_reduce/sum.cu (+170/-0)
benchmark/bench/unique/basic.cu (+169/-0)
benchmark/bench/unique/by_key.cu (+205/-0)
benchmark/bench/vectorized_search/basic.cu (+160/-0)
benchmark/bench/vectorized_search/lower_bound.cu (+160/-0)
benchmark/bench/vectorized_search/upper_bound.cu (+160/-0)
benchmark/bench_utils/bench_utils.hpp (+527/-522)
benchmark/bench_utils/cmdparser.hpp (+492/-489)
benchmark/bench_utils/common/types.hpp (+52/-5)
benchmark/bench_utils/custom_reporter.hpp (+630/-662)
benchmark/bench_utils/generation_utils.hpp (+522/-551)
cmake/Benchmarks.cmake (+4/-30)
cmake/Dependencies.cmake (+313/-50)
cmake/DownloadProject.CMakeLists.cmake.in (+1/-1)
cmake/DownloadProject.cmake (+7/-1)
cmake/FindROCMCmake.cmake (+0/-1)
cmake/GenerateResourceSpec.cmake (+10/-8)
cmake/Summary.cmake (+67/-7)
cmake/VerifyCompiler.cmake (+1/-1)
custom.properties (+1/-1)
debian/changelog (+20/-0)
debian/control (+7/-6)
debian/patches/0004-Skip-transform_input_output_iterator-test.patch (+3/-3)
debian/patches/series (+0/-1)
debian/rules (+7/-1)
debian/tests/control (+1/-1)
debian/watch (+5/-0)
dev/null (+0/-250)
doc/CHANGELOG.md (+0/-1)
docs/bitwise-repro.rst (+2/-2)
docs/conf.py (+3/-3)
docs/cpp_api.rst (+0/-12)
docs/data-type-support.rst (+0/-1)
docs/doxygen/Doxyfile (+584/-261)
docs/hip-execution-policies.rst (+4/-2)
docs/hipgraph-support.rst (+3/-1)
docs/how-to/rocThrust-build-backends.rst (+73/-0)
docs/how-to/run-rocThrust-tests-on-multiple-gpus.rst (+5/-11)
docs/how-to/use-rocThrust-in-a-project.rst (+1/-2)
docs/index.rst (+4/-2)
docs/install/rocThrust-install-overview.rst (+14/-5)
docs/install/rocThrust-install-script.rst (+4/-3)
docs/install/rocThrust-install-with-cmake.rst (+10/-8)
docs/install/rocThrust-prerequisites.rst (+1/-1)
docs/install/rocThrust-rmake-install.rst (+4/-4)
docs/license.rst (+1/-0)
docs/reference/rocThrust-hipstdpar.rst (+2/-2)
docs/sphinx/_toc.yml.in (+3/-2)
docs/sphinx/requirements.in (+1/-1)
docs/sphinx/requirements.txt (+138/-7)
examples/CMakeLists.txt (+3/-2)
examples/README.md (+6/-4)
examples/arbitrary_transformation.cu (+81/-51)
examples/basic_vector.cu (+59/-35)
examples/bounding_box.cu (+57/-36)
examples/bucket_sort2d.cu (+42/-37)
examples/constant_iterator.cu (+36/-18)
examples/counting_iterator.cu (+56/-38)
examples/cpp_integration/CMakeLists.txt (+2/-1)
examples/cpp_integration/README (+1/-2)
examples/cpp_integration/device.cu (+29/-10)
examples/cpp_integration/device.h (+20/-0)
examples/cpp_integration/host.cpp (+31/-11)
examples/cuda/async_reduce.cu (+27/-8)
examples/cuda/custom_temporary_allocation.cu (+43/-30)
examples/cuda/explicit_cuda_stream.cu (+23/-6)
examples/cuda/global_device_vector.cu (+37/-19)
examples/cuda/range_view.cu (+81/-89)
examples/device_ptr.cu (+25/-6)
examples/discrete_voronoi.cu (+176/-144)
examples/dot_products_with_zip.cu (+117/-109)
examples/expand.cu (+43/-36)
examples/fill_copy_sequence.cu (+38/-16)
examples/histogram.cu (+67/-52)
examples/include/host_device.h (+2/-2)
examples/include/timer.h (+11/-10)
examples/lambda.cu (+48/-24)
examples/lexicographical_sort.cu (+80/-61)
examples/max_abs_diff.cu (+45/-26)
examples/minimal_custom_backend.cu (+33/-11)
examples/minmax.cu (+37/-20)
examples/mode.cu (+82/-63)
examples/monte_carlo.cu (+43/-27)
examples/monte_carlo_disjoint_sequences.cu (+34/-17)
examples/mr_basic.cu (+81/-66)
examples/norm.cu (+42/-23)
examples/padded_grid_reduction.cu (+77/-54)
examples/permutation_iterator.cu (+47/-26)
examples/print_version.cu (+35/-0)
examples/raw_reference_cast.cu (+43/-27)
examples/remove_points2d.cu (+71/-44)
examples/repeated_range.cu (+88/-63)
examples/run_length_decoding.cu (+72/-46)
examples/run_length_encoding.cu (+59/-35)
examples/saxpy.cu (+65/-44)
examples/scan_by_key.cu (+89/-72)
examples/scan_matrix_by_rows.cu (+43/-17)
examples/set_operations.cu (+41/-21)
examples/simple_moving_average.cu (+56/-28)
examples/sort.cu (+59/-26)
examples/sorting_aos_vs_soa.cu (+31/-10)
examples/sparse_vector.cu (+124/-85)
examples/stream_compaction.cu (+60/-43)
examples/strided_range.cu (+93/-69)
examples/sum.cu (+25/-5)
examples/sum_rows.cu (+49/-25)
examples/summary_statistics.cu (+135/-110)
examples/summed_area_table.cu (+55/-35)
examples/tiled_range.cu (+89/-65)
examples/transform_input_output_iterator.cu (+56/-30)
examples/transform_iterator.cu (+98/-81)
examples/transform_output_iterator.cu (+33/-12)
examples/uninitialized_vector.cu (+34/-18)
examples/weld_vertices.cu (+60/-43)
examples/word_count.cu (+71/-51)
extra/CMakeLists.txt (+3/-5)
extra/test_rocthrust_package.cpp (+13/-17)
install (+0/-12)
internal/test/thrust.example.discrete_voronoi.filecheck (+11/-0)
rmake.py (+9/-9)
rtest.py (+13/-13)
scripts/code-format/check-format.sh (+57/-0)
scripts/copyright-date/check-copyright.sh (+2/-0)
scripts/gdb-pretty-printers.py (+70/-41)
test/CMakeLists.txt (+100/-31)
test/address_stability.cmake (+12/-0)
test/async_reduce.cmake (+9/-0)
test/bitwise_repro/bwr_db.hpp (+256/-255)
test/bitwise_repro/bwr_utils.hpp (+309/-284)
test/hipstdpar/CMakeLists.txt (+11/-4)
test/hipstdpar/test_algorithms.cpp (+134/-140)
test/hipstdpar/test_interpose.cpp (+113/-78)
test/test_address_stability.cpp (+150/-0)
test/test_adjacent_difference.cpp (+155/-183)
test/test_advance.cpp (+73/-51)
test/test_alignment.cpp (+203/-0)
test/test_allocator.cpp (+205/-105)
test/test_allocator_aware_policies.cpp (+103/-123)
test/test_async_copy.cpp (+352/-153)
test/test_async_for_each.cpp (+69/-68)
test/test_async_reduce.cpp (+749/-949)
test/test_async_reduce_into.cpp (+553/-0)
test/test_async_scan.cpp (+137/-132)
test/test_async_sort.cpp (+229/-232)
test/test_async_transform.cpp (+387/-295)
test/test_binary_search.cpp (+805/-527)
test/test_binary_search_descending.cpp (+88/-151)
test/test_binary_search_vector.cpp (+428/-571)
test/test_binary_search_vector_descending.cpp (+179/-268)
test/test_caching_allocator.cpp (+25/-4)
test/test_complex.cpp (+1176/-197)
test/test_complex_transform.cpp (+250/-256)
test/test_complex_various.cpp (+1139/-0)
test/test_constant_iterator.cpp (+119/-95)
test/test_copy.cpp (+819/-593)
test/test_copy_n.cpp (+187/-240)
test/test_count.cpp (+120/-106)
test/test_counting_iterator.cpp (+165/-135)
test/test_decompose.cpp (+103/-0)
test/test_dependencies_aware_policies.cpp (+143/-0)
test/test_dereference.cpp (+95/-73)
test/test_device_delete.cpp (+31/-33)
test/test_device_ptr.cpp (+154/-142)
test/test_device_reference.cpp (+207/-201)
test/test_discard_iterator.cpp (+60/-59)
test/test_distance.cpp (+27/-26)
test/test_equal.cpp (+128/-112)
test/test_event.cpp (+182/-0)
test/test_fill.cpp (+324/-400)
test/test_find.cpp (+286/-278)
test/test_for_each.cpp (+345/-328)
test/test_functional.cpp (+380/-0)
test/test_functional_arithmetic.cpp (+109/-0)
test/test_functional_bitwise.cpp (+131/-0)
test/test_functional_logical.cpp (+101/-0)
test/test_functional_placeholders_arithmetic.cpp (+142/-0)
test/test_functional_placeholders_bitwise.cpp (+153/-0)
test/test_functional_placeholders_compound_assignment.cpp (+292/-0)
test/test_functional_placeholders_logical.cpp (+125/-0)
test/test_functional_placeholders_miscellaneous.cpp (+132/-0)
test/test_functional_placeholders_relational.cpp (+93/-0)
test/test_future.cpp (+221/-0)
test/test_gather.cpp (+319/-375)
test/test_generate.cpp (+124/-132)
test/test_imag_assertions.hpp (+146/-0)
test/test_inner_product.cpp (+202/-99)
test/test_is_contiguous_iterator.cpp (+167/-0)
test/test_is_operator_function_object.cpp (+144/-0)
test/test_is_partitioned.cpp (+55/-62)
test/test_is_sorted.cpp (+55/-64)
test/test_is_sorted_until.cpp (+73/-88)
test/test_logical.cpp (+200/-0)
test/test_max_element.cpp (+105/-90)
test/test_memory.cpp (+171/-185)
test/test_merge.cpp (+183/-246)
test/test_merge_by_key.cpp (+421/-453)
test/test_merge_key_value.cpp (+146/-0)
test/test_metaprogramming.cpp (+67/-0)
test/test_min_and_max.cpp (+111/-0)
test/test_min_element.cpp (+103/-90)
test/test_minmax_element.cpp (+131/-114)
test/test_mismatch.cpp (+31/-45)
test/test_mr_disjoint_pool.cpp (+214/-206)
test/test_mr_new.cpp (+38/-20)
test/test_mr_pool.cpp (+279/-272)
test/test_mr_pool_options.cpp (+65/-47)
test/test_out_of_memory_recovery.cpp (+55/-0)
test/test_pair.cpp (+312/-259)
test/test_pair_reduce.cpp (+40/-44)
test/test_pair_scan.cpp (+60/-95)
test/test_pair_scan_by_key.cpp (+90/-0)
test/test_pair_sort.cpp (+38/-42)
test/test_pair_sort_by_key.cpp (+68/-0)
test/test_pair_transform.cpp (+50/-51)
test/test_parallel_for.cpp (+32/-42)
test/test_param_fixtures.hpp (+329/-0)
test/test_partition.cpp (+1218/-1528)
test/test_partition_point.cpp (+79/-56)
test/test_permutation_iterator.cpp (+204/-260)
test/test_preprocessor.cpp (+743/-0)
test/test_random.cpp (+478/-496)
test/test_real_assertions.hpp (+146/-0)
test/test_reduce.cpp (+202/-190)
test/test_reduce_by_key.cpp (+330/-412)
test/test_reduce_large.cpp (+58/-0)
test/test_remove.cpp (+446/-577)
test/test_replace.cpp (+416/-601)
test/test_reproducibility.cpp (+426/-455)
test/test_reverse.cpp (+222/-0)
test/test_reverse_iterator.cpp (+85/-104)
test/test_scan.cpp (+729/-568)
test/test_scan_by_key.exclusive.cpp (+510/-0)
test/test_scan_by_key.inclusive.cpp (+508/-0)
test/test_scatter.cpp (+246/-338)
test/test_seed.in.hpp (+6/-6)
test/test_sequence.cpp (+115/-108)
test/test_set_difference.cpp (+155/-158)
test/test_set_difference_by_key.cpp (+285/-342)
test/test_set_difference_by_key_descending.cpp (+95/-131)
test/test_set_difference_descending.cpp (+42/-64)
test/test_set_difference_key_value.cpp (+82/-0)
test/test_set_intersection.cpp (+186/-186)
test/test_set_intersection_by_key.cpp (+255/-346)
test/test_set_intersection_by_key_descending.cpp (+89/-113)
test/test_set_intersection_descending.cpp (+42/-62)
test/test_set_intersection_key_value.cpp (+55/-67)
test/test_set_symmetric_difference.cpp (+186/-246)
test/test_set_symmetric_difference_by_key.cpp (+279/-354)
test/test_set_symmetric_difference_by_key_descending.cpp (+98/-142)
test/test_set_symmetric_difference_descending.cpp (+48/-72)
test/test_set_union.cpp (+99/-150)
test/test_set_union_by_key.cpp (+279/-354)
test/test_set_union_by_key_descending.cpp (+88/-416)
test/test_set_union_descending.cpp (+40/-68)
test/test_set_union_key_value.cpp (+178/-255)
test/test_shuffle.cpp (+394/-307)
test/test_sort.cpp (+272/-322)
test/test_sort_by_key.cpp (+143/-199)
test/test_sort_by_key_variable_bits.cpp (+38/-32)
test/test_sort_permutation_iterator.cpp (+103/-218)
test/test_sort_variable_bits.cpp (+33/-27)
test/test_stable_sort.cpp (+109/-130)
test/test_stable_sort_by_key.cpp (+129/-154)
test/test_stable_sort_by_key_large_keys.cpp (+6/-4)
test/test_stable_sort_by_key_large_keys_and_values.cpp (+5/-3)
test/test_stable_sort_by_key_large_values.cpp (+9/-13)
test/test_stable_sort_large.cpp (+24/-33)
test/test_swap_ranges.cpp (+174/-181)
test/test_tabulate.cpp (+88/-86)
test/test_tabulate_output_iterator.cpp (+181/-0)
test/test_transform.cpp (+817/-1026)
test/test_transform_input_output_iterator.cpp (+171/-0)
test/test_transform_iterator.cpp (+278/-91)
test/test_transform_output_iterator.cpp (+124/-0)
test/test_transform_output_iterator_reduce_by_key.cpp (+76/-0)
test/test_transform_reduce.cpp (+91/-93)
test/test_transform_scan.cpp (+331/-350)
test/test_trivial_sequence.cpp (+86/-0)
test/test_tuple.cpp (+599/-463)
test/test_tuple_algorithms.cpp (+77/-0)
test/test_tuple_reduce.cpp (+40/-46)
test/test_tuple_scan.cpp (+97/-0)
test/test_tuple_sort.cpp (+48/-55)
test/test_tuple_transform.cpp (+40/-46)
test/test_type_traits.cpp (+255/-0)
test/test_uninitialized_copy.cpp (+163/-191)
test/test_uninitialized_fill.cpp (+161/-211)
test/test_unique.cpp (+281/-253)
test/test_unique_by_key.cpp (+497/-443)
test/test_universal_memory.cpp (+142/-112)
test/test_utils.hpp (+615/-348)
test/test_utils_tester.cpp (+70/-0)
test/test_vector.cpp (+599/-674)
test/test_vector_allocators.cpp (+222/-175)
test/test_vector_insert.cpp (+183/-249)
test/test_vector_manipulation.cpp (+97/-85)
test/test_zip_function.cpp (+200/-0)
test/test_zip_iterator.cpp (+342/-339)
test/test_zip_iterator_reduce.cpp (+40/-41)
test/test_zip_iterator_reduce_by_key.cpp (+154/-173)
test/test_zip_iterator_scan.cpp (+105/-96)
test/test_zip_iterator_sort.cpp (+25/-30)
test/test_zip_iterator_sort_by_key.cpp (+48/-72)
testing/CMakeLists.txt (+43/-21)
testing/address_stability.cmake (+12/-0)
testing/address_stability.cu (+143/-0)
testing/adjacent_difference.cu (+88/-99)
testing/advance.cu (+70/-53)
testing/alignment.cu (+129/-227)
testing/allocator.cu (+83/-116)
testing/allocator_aware_policies.cu (+110/-144)
testing/async/exclusive_scan/CMakeLists.txt (+1/-0)
testing/async/exclusive_scan/counting_iterator.cu (+32/-12)
testing/async/exclusive_scan/discard_output.cu (+32/-7)
testing/async/exclusive_scan/large_indices.cu (+61/-42)
testing/async/exclusive_scan/large_types.cu (+30/-10)
testing/async/exclusive_scan/mixed_types.cu (+41/-20)
testing/async/exclusive_scan/mixin.h (+48/-35)
testing/async/exclusive_scan/simple.cu (+31/-12)
testing/async/exclusive_scan/stateful_operator.cu (+38/-12)
testing/async/exclusive_scan/using_vs_adl.cu (+76/-51)
testing/async/inclusive_scan/CMakeLists.txt (+1/-0)
testing/async/inclusive_scan/counting_iterator.cu (+32/-12)
testing/async/inclusive_scan/discard_output.cu (+33/-9)
testing/async/inclusive_scan/large_indices.cu (+59/-40)
testing/async/inclusive_scan/large_types.cu (+30/-10)
testing/async/inclusive_scan/mixed_types.cu (+46/-23)
testing/async/inclusive_scan/mixin.h (+63/-30)
testing/async/inclusive_scan/simple.cu (+60/-14)
testing/async/inclusive_scan/stateful_operator.cu (+38/-11)
testing/async/inclusive_scan/using_vs_adl.cu (+76/-51)
testing/async/mixin.h (+107/-71)
testing/async/test_policy_overloads.h (+102/-155)
testing/async_copy.cu (+127/-206)
testing/async_for_each.cu (+53/-62)
testing/async_reduce.cmake (+9/-0)
testing/async_reduce.cu (+433/-655)
testing/async_reduce_into.cu (+331/-446)
testing/async_sort.cu (+133/-265)
testing/async_transform.cu (+203/-278)
testing/binary_search.cu (+159/-230)
testing/binary_search_descending.cu (+90/-101)
testing/binary_search_vector.cu (+209/-269)
testing/binary_search_vector_descending.cu (+93/-133)
testing/caching_allocator.cu (+27/-9)
testing/complex.cu (+56/-50)
testing/complex_transform.cu (+84/-79)
testing/constant_iterator.cu (+85/-76)
testing/copy.cu (+415/-438)
testing/copy_n.cu (+164/-198)
testing/count.cu (+57/-66)
testing/counting_iterator.cu (+167/-164)
testing/cpp/adjacent_difference.cu (+59/-30)
testing/cuda/adjacent_difference.cu (+87/-62)
testing/cuda/binary_search.cu (+22/-7)
testing/cuda/complex.cu (+31/-43)
testing/cuda/copy.cu (+38/-31)
testing/cuda/copy_if.cu (+110/-112)
testing/cuda/count.cu (+40/-36)
testing/cuda/cudart.cu (+22/-4)
testing/cuda/device_side_universal_vector.cu (+47/-52)
testing/cuda/equal.cu (+62/-40)
testing/cuda/fill.cu (+90/-88)
testing/cuda/find.cu (+83/-80)
testing/cuda/for_each.cu (+100/-91)
testing/cuda/gather.cu (+110/-82)
testing/cuda/generate.cu (+46/-42)
testing/cuda/inner_product.cu (+37/-21)
testing/cuda/is_partitioned.cu (+51/-26)
testing/cuda/is_sorted.cu (+34/-19)
testing/cuda/is_sorted_until.cu (+49/-35)
testing/cuda/logical.cu (+98/-99)
testing/cuda/max_element.cu (+53/-43)
testing/cuda/memory.cu (+44/-34)
testing/cuda/merge.cu (+53/-61)
testing/cuda/merge_by_key.cu (+74/-50)
testing/cuda/merge_sort.cu (+101/-92)
testing/cuda/min_element.cu (+49/-36)
testing/cuda/minmax_element.cu (+50/-42)
testing/cuda/mismatch.cu (+83/-46)
testing/cuda/pair_sort.cu (+28/-17)
testing/cuda/pair_sort_by_key.cu (+32/-20)
testing/cuda/partition.cu (+271/-228)
testing/cuda/partition_point.cu (+41/-27)
testing/cuda/reduce.cu (+41/-30)
testing/cuda/reduce_by_key.cu (+188/-129)
testing/cuda/remove.cu (+173/-236)
testing/cuda/replace.cu (+101/-101)
testing/cuda/reverse.cu (+39/-54)
testing/cuda/scan.cu (+114/-94)
testing/cuda/scan_by_key.cu (+103/-126)
testing/cuda/scatter.cu (+91/-83)
testing/cuda/sequence.cu (+50/-58)
testing/cuda/set_difference.cu (+40/-31)
testing/cuda/set_difference_by_key.cu (+73/-70)
testing/cuda/set_intersection.cu (+42/-42)
testing/cuda/set_intersection_by_key.cu (+71/-63)
testing/cuda/set_symmetric_difference.cu (+41/-42)
testing/cuda/set_symmetric_difference_by_key.cu (+73/-64)
testing/cuda/set_union.cu (+39/-42)
testing/cuda/set_union_by_key.cu (+75/-66)
testing/cuda/sort.cu (+229/-72)
testing/cuda/sort_by_key.cu (+49/-73)
testing/cuda/stream_legacy.cu (+20/-2)
testing/cuda/stream_per_thread.cu (+20/-2)
testing/cuda/swap_ranges.cu (+37/-43)
testing/cuda/tabulate.cu (+47/-51)
testing/cuda/transform.cu (+252/-142)
testing/cuda/transform_reduce.cu (+38/-28)
testing/cuda/transform_scan.cu (+175/-76)
testing/cuda/uninitialized_copy.cu (+48/-64)
testing/cuda/uninitialized_fill.cu (+108/-140)
testing/cuda/unique.cu (+109/-194)
testing/cuda/unique_by_key.cu (+206/-183)
testing/decompose.cu (+60/-43)
testing/dependencies_aware_policies.cu (+99/-166)
testing/dereference.cu (+91/-61)
testing/device_delete.cu (+42/-31)
testing/device_ptr.cu (+82/-68)
testing/device_reference.cu (+201/-179)
testing/discard_iterator.cu (+26/-9)
testing/distance.cu (+29/-12)
testing/docs/doxybook_test.h (+28/-19)
testing/equal.cu (+90/-96)
testing/event.cu (+39/-41)
testing/fill.cu (+237/-289)
testing/find.cu (+189/-210)
testing/fix_clang_nvcc_11.5.h (+21/-6)
testing/for_each.cu (+201/-236)
testing/functional.cu (+233/-224)
testing/functional_arithmetic.cu (+74/-63)
testing/functional_bitwise.cu (+70/-54)
testing/functional_logical.cu (+62/-46)
testing/functional_placeholders_arithmetic.cu (+90/-60)
testing/functional_placeholders_bitwise.cu (+86/-58)
testing/functional_placeholders_compound_assignment.cu (+199/-133)
testing/functional_placeholders_logical.cu (+58/-43)
testing/functional_placeholders_miscellaneous.cu (+78/-19)
testing/functional_placeholders_relational.cu (+57/-42)
testing/future.cu (+65/-93)
testing/gather.cu (+241/-246)
testing/generate.cu (+103/-119)
testing/generate_const_iterators.cu (+28/-11)
testing/inner_product.cu (+89/-92)
testing/is_contiguous_iterator.cu (+99/-165)
testing/is_operator_function_object.cu (+61/-126)
testing/is_partitioned.cu (+21/-30)
testing/is_sorted.cu (+35/-53)
testing/is_sorted_until.cu (+67/-84)
testing/logical.cu (+73/-94)
testing/max_element.cu (+78/-72)
testing/memory.cu (+90/-93)
testing/merge.cu (+64/-105)
testing/merge_by_key.cu (+241/-218)
testing/merge_key_value.cu (+68/-65)
testing/metaprogramming.cu (+63/-0)
testing/min_and_max.cu (+59/-34)
testing/min_element.cu (+77/-73)
testing/minmax_element.cu (+97/-93)
testing/mismatch.cu (+48/-46)
testing/mr_disjoint_pool.cu (+211/-202)
testing/mr_new.cu (+36/-18)
testing/mr_pool.cu (+296/-290)
testing/mr_pool_options.cu (+70/-52)
testing/namespace_wrapped.cu (+21/-9)
testing/omp/nvcc_independence.cpp (+32/-15)
testing/omp/reduce_intervals.cu (+36/-31)
testing/out_of_memory_recovery.cu (+24/-4)
testing/pair.cu (+132/-89)
testing/pair_reduce.cu (+29/-15)
testing/pair_scan.cu (+33/-20)
testing/pair_scan_by_key.cu (+36/-19)
testing/pair_sort.cu (+30/-13)
testing/pair_sort_by_key.cu (+27/-9)
testing/pair_transform.cu (+39/-15)
testing/partition.cu (+941/-1340)
testing/partition_point.cu (+43/-63)
testing/permutation_iterator.cu (+181/-233)
testing/preprocessor.cu (+23/-3)
testing/random.cu (+212/-263)
testing/reduce.cu (+118/-124)
testing/reduce_by_key.cu (+209/-214)
testing/reduce_large.cu (+35/-19)
testing/remove.cu (+304/-489)
testing/replace.cu (+256/-427)
testing/reverse.cu (+34/-75)
testing/reverse_iterator.cu (+51/-45)
testing/scan.cu (+474/-426)
testing/scan_by_key.exclusive.cu (+95/-291)
testing/scan_by_key.inclusive.cu (+90/-237)
testing/scatter.cu (+210/-263)
testing/sequence.cu (+77/-91)
testing/set_difference.cu (+82/-103)
testing/set_difference_by_key.cu (+159/-160)
testing/set_difference_by_key_descending.cu (+65/-55)
testing/set_difference_descending.cu (+33/-29)
testing/set_difference_key_value.cu (+28/-16)
testing/set_intersection.cu (+87/-115)
testing/set_intersection_by_key.cu (+138/-148)
testing/set_intersection_by_key_descending.cu (+62/-52)
testing/set_intersection_descending.cu (+33/-29)
testing/set_intersection_key_value.cu (+27/-14)
testing/set_symmetric_difference.cu (+80/-109)
testing/set_symmetric_difference_by_key.cu (+156/-162)
testing/set_symmetric_difference_by_key_descending.cu (+65/-56)
testing/set_symmetric_difference_descending.cu (+34/-29)
testing/set_union.cu (+61/-89)
testing/set_union_by_key.cu (+156/-162)
testing/set_union_by_key_descending.cu (+65/-56)
testing/set_union_descending.cu (+31/-29)
testing/set_union_key_value.cu (+38/-31)
testing/shuffle.cu (+255/-164)
testing/sort.cu (+60/-62)
testing/sort_by_key.cu (+88/-93)
testing/sort_by_key_variable_bits.cu (+41/-29)
testing/sort_permutation_iterator.cu (+95/-162)
testing/sort_variable_bits.cu (+41/-28)
testing/stable_sort.cu (+68/-106)
testing/stable_sort_by_key.cu (+80/-79)
testing/stable_sort_by_key_large_keys.cu (+19/-2)
testing/stable_sort_by_key_large_keys_and_values.cu (+17/-0)
testing/stable_sort_by_key_large_values.cu (+20/-9)
testing/stable_sort_large.cu (+20/-19)
testing/swap_ranges.cu (+78/-64)
testing/tabulate.cu (+21/-38)
testing/tabulate_output_iterator.cu (+161/-0)
testing/transform.cu (+457/-609)
testing/transform_input_output_iterator.cu (+93/-84)
testing/transform_iterator.cu (+251/-68)
testing/transform_output_iterator.cu (+64/-61)
testing/transform_output_iterator_reduce_by_key.cu (+31/-15)
testing/transform_reduce.cu (+43/-73)
testing/transform_scan.cu (+287/-310)
testing/trivial_sequence.cu (+52/-37)
testing/tuple.cu (+192/-83)
testing/tuple_algorithms.cu (+19/-4)
testing/tuple_reduce.cu (+40/-26)
testing/tuple_scan.cu (+46/-36)
testing/tuple_sort.cu (+50/-36)
testing/tuple_transform.cu (+41/-29)
testing/type_traits.cu (+230/-108)
testing/uninitialized_copy.cu (+66/-113)
testing/uninitialized_fill.cu (+117/-179)
testing/unique.cu (+215/-228)
testing/unique_by_key.cu (+384/-367)
testing/unittest/assertions.h (+525/-431)
testing/unittest/ctest.h (+32/-13)
testing/unittest/cuda/testframework.cu (+75/-45)
testing/unittest/cuda/testframework.h (+29/-12)
testing/unittest/exceptions.h (+38/-32)
testing/unittest/hip/testframework.cu (+87/-56)
testing/unittest/hip/testframework.h (+29/-12)
testing/unittest/meta.h (+80/-73)
testing/unittest/random.h (+81/-73)
testing/unittest/runtime_static_assert.h (+98/-104)
testing/unittest/special_types.h (+102/-115)
testing/unittest/system.h (+21/-5)
testing/unittest/testframework.cu (+216/-202)
testing/unittest/testframework.h (+457/-442)
testing/unittest/unittest.h (+27/-9)
testing/unittest/util.h (+41/-22)
testing/unittest/util_async.h (+26/-16)
testing/unittest_static_assert.cmake (+15/-0)
testing/unittest_static_assert.cu (+34/-14)
testing/unittest_tester.cu (+38/-22)
testing/universal_memory.cu (+74/-48)
testing/vector.cu (+537/-543)
testing/vector_allocators.cu (+208/-181)
testing/vector_insert.cu (+176/-251)
testing/vector_manipulation.cu (+92/-70)
testing/zip_function.cu (+33/-14)
testing/zip_iterator.cu (+165/-181)
testing/zip_iterator_reduce.cu (+33/-19)
testing/zip_iterator_reduce_by_key.cu (+74/-60)
testing/zip_iterator_scan.cu (+73/-55)
testing/zip_iterator_sort.cu (+29/-13)
testing/zip_iterator_sort_by_key.cu (+43/-32)
thrust/CMakeLists.txt (+3/-24)
thrust/addressof.h (+2/-5)
thrust/adjacent_difference.h (+41/-41)
thrust/advance.h (+7/-8)
thrust/allocate_unique.h (+127/-216)
thrust/async/copy.h (+71/-88)
thrust/async/for_each.h (+48/-58)
thrust/async/reduce.h (+244/-358)
thrust/async/scan.h (+193/-220)
thrust/async/sort.h (+65/-49)
thrust/async/transform.h (+37/-27)
thrust/binary_search.h (+474/-486)
thrust/complex.h (+130/-256)
thrust/copy.h (+95/-107)
thrust/count.h (+42/-34)
thrust/detail/adjacent_difference.inl (+4/-4)
thrust/detail/algorithm_wrapper.h (+1/-1)
thrust/detail/alignment.h (+47/-102)
thrust/detail/allocator/allocator_traits.h (+184/-205)
thrust/detail/allocator/allocator_traits.inl (+144/-224)
thrust/detail/allocator/copy_construct_range.h (+10/-18)
thrust/detail/allocator/copy_construct_range.inl (+116/-179)
thrust/detail/allocator/destroy_range.h (+3/-5)
thrust/detail/allocator/destroy_range.inl (+52/-77)
thrust/detail/allocator/fill_construct_range.h (+3/-7)
thrust/detail/allocator/fill_construct_range.inl (+40/-50)
thrust/detail/allocator/malloc_allocator.h (+12/-17)
thrust/detail/allocator/malloc_allocator.inl (+1/-2)
thrust/detail/allocator/no_throw_allocator.h (+33/-35)
thrust/detail/allocator/tagged_allocator.h (+57/-62)
thrust/detail/allocator/tagged_allocator.inl (+0/-1)
thrust/detail/allocator/temporary_allocator.h (+29/-41)
thrust/detail/allocator/temporary_allocator.inl (+2/-2)
thrust/detail/allocator/value_initialize_range.h (+4/-9)
thrust/detail/allocator/value_initialize_range.inl (+40/-50)
thrust/detail/allocator_aware_execution_policy.h (+33/-32)
thrust/detail/binary_search.inl (+26/-26)
thrust/detail/caching_allocator.h (+9/-16)
thrust/detail/complex/arithmetic.h (+61/-69)
thrust/detail/complex/c99math.h (+70/-55)
thrust/detail/complex/catrig.h (+189/-84)
thrust/detail/complex/catrigf.h (+181/-79)
thrust/detail/complex/ccosh.h (+34/-14)
thrust/detail/complex/ccoshf.h (+39/-23)
thrust/detail/complex/cexp.h (+29/-17)
thrust/detail/complex/cexpf.h (+31/-19)
thrust/detail/complex/clog.h (+55/-44)
thrust/detail/complex/clogf.h (+54/-42)
thrust/detail/complex/cpow.h (+15/-16)
thrust/detail/complex/cproj.h (+11/-7)
thrust/detail/complex/csinh.h (+33/-15)
thrust/detail/complex/csinhf.h (+34/-13)
thrust/detail/complex/csqrt.h (+39/-16)
thrust/detail/complex/csqrtf.h (+40/-16)
thrust/detail/complex/ctanh.h (+23/-17)
thrust/detail/complex/ctanhf.h (+19/-12)
thrust/detail/complex/math_private.h (+26/-23)
thrust/detail/complex/stream.h (+27/-28)
thrust/detail/config.h (+1/-2)
thrust/detail/config/compiler.h (+103/-150)
thrust/detail/config/compiler_fence.h (+31/-22)
thrust/detail/config/config.h (+17/-18)
thrust/detail/config/cpp_compatibility.h (+86/-84)
thrust/detail/config/cpp_dialect.h (+72/-55)
thrust/detail/config/deprecated.h (+84/-29)
thrust/detail/config/device_system.h (+20/-16)
thrust/detail/config/diagnostic.h (+42/-23)
thrust/detail/config/execution_space.h (+31/-25)
thrust/detail/config/global_workarounds.h (+1/-2)
thrust/detail/config/host_system.h (+9/-8)
thrust/detail/config/libcxx.h (+69/-0)
thrust/detail/config/memory_resource.h (+18/-10)
thrust/detail/config/namespace.h (+158/-79)
thrust/detail/config/rtti.h (+66/-0)
thrust/detail/config/simple_defines.h (+5/-2)
thrust/detail/config/visibility.h (+22/-0)
thrust/detail/contiguous_storage.h (+107/-144)
thrust/detail/contiguous_storage.inl (+194/-302)
thrust/detail/copy.h (+31/-56)
thrust/detail/copy.inl (+4/-4)
thrust/detail/copy_if.h (+28/-44)
thrust/detail/copy_if.inl (+5/-5)
thrust/detail/count.h (+22/-33)
thrust/detail/count.inl (+2/-2)
thrust/detail/cpp_version_check.h (+8/-4)
thrust/detail/dependencies_aware_execution_policy.h (+50/-68)
thrust/detail/device_free.inl (+1/-1)
thrust/detail/device_malloc.inl (+2/-2)
thrust/detail/device_ptr.inl (+15/-31)
thrust/detail/equal.inl (+4/-4)
thrust/detail/event_error.h (+27/-32)
thrust/detail/execute_with_allocator.h (+56/-76)
thrust/detail/execute_with_allocator_fwd.h (+50/-39)
thrust/detail/execute_with_dependencies.h (+160/-198)
thrust/detail/execution_policy.h (+24/-20)
thrust/detail/extrema.inl (+6/-6)
thrust/detail/fill.inl (+2/-2)
thrust/detail/find.inl (+3/-3)
thrust/detail/for_each.inl (+2/-2)
thrust/detail/function.h (+17/-125)
thrust/detail/functional.inl (+60/-48)
thrust/detail/functional/actor.h (+173/-91)
thrust/detail/functional/address_stability.h (+137/-0)
thrust/detail/functional/operators.h (+367/-6)
thrust/detail/gather.inl (+11/-11)
thrust/detail/generate.inl (+2/-2)
thrust/detail/get_iterator_value.h (+13/-13)
thrust/detail/inner_product.inl (+4/-4)
thrust/detail/integer_math.h (+54/-57)
thrust/detail/integer_traits.h (+61/-71)
thrust/detail/internal_functional.h (+170/-288)
thrust/detail/logical.inl (+3/-3)
thrust/detail/malloc_and_free.h (+32/-21)
thrust/detail/malloc_and_free_fwd.h (+22/-12)
thrust/detail/memory_algorithms.h (+102/-123)
thrust/detail/memory_wrapper.h (+1/-1)
thrust/detail/merge.inl (+18/-18)
thrust/detail/minmax.h (+9/-13)
thrust/detail/mismatch.inl (+4/-4)
thrust/detail/modern_gcc_required.h (+0/-1)
thrust/detail/mpl/math.h (+28/-47)
thrust/detail/numeric_traits.h (+70/-85)
thrust/detail/numeric_wrapper.h (+1/-1)
thrust/detail/nv_target.h (+4/-6)
thrust/detail/overlapped_copy.h (+32/-46)
thrust/detail/pair.inl (+2/-3)
thrust/detail/partition.inl (+22/-22)
thrust/detail/pointer.h (+128/-153)
thrust/detail/pointer.inl (+71/-107)
thrust/detail/preprocessor.h (+364/-367)
thrust/detail/range/head_flags.h (+72/-185)
thrust/detail/range/tail_flags.h (+78/-89)
thrust/detail/raw_pointer_cast.h (+7/-12)
thrust/detail/raw_reference_cast.h (+82/-190)
thrust/detail/reduce.inl (+15/-16)
thrust/detail/reference.h (+99/-116)
thrust/detail/reference_forward_declaration.h (+1/-1)
thrust/detail/remove.inl (+11/-12)
thrust/detail/replace.inl (+11/-12)
thrust/detail/reverse.inl (+3/-4)
thrust/detail/scan.inl (+63/-32)
thrust/detail/scatter.inl (+11/-12)
thrust/detail/select_system.h (+19/-26)
thrust/detail/seq.h (+12/-15)
thrust/detail/sequence.inl (+3/-4)
thrust/detail/set_operations.inl (+70/-71)
thrust/detail/shuffle.inl (+3/-5)
thrust/detail/sort.inl (+16/-17)
thrust/detail/static_assert.h (+16/-11)
thrust/detail/static_map.h (+79/-89)
thrust/detail/swap_ranges.inl (+2/-3)
thrust/detail/tabulate.inl (+1/-2)
thrust/detail/temporary_array.h (+96/-120)
thrust/detail/temporary_array.inl (+63/-89)
thrust/detail/temporary_buffer.h (+21/-22)
thrust/detail/transform.inl (+14/-15)
thrust/detail/transform_reduce.inl (+1/-2)
thrust/detail/transform_scan.inl (+98/-60)
thrust/detail/trivial_sequence.h (+64/-57)
thrust/detail/tuple.inl (+79/-80)
thrust/detail/tuple_algorithms.h (+11/-49)
thrust/detail/tuple_meta_transform.h (+13/-18)
thrust/detail/tuple_transform.h (+25/-44)
thrust/detail/type_deduction.h (+46/-31)
thrust/detail/type_traits.h (+189/-572)
thrust/detail/type_traits/has_member_function.h (+27/-15)
thrust/detail/type_traits/has_nested_type.h (+23/-12)
thrust/detail/type_traits/is_call_possible.h (+130/-121)
thrust/detail/type_traits/is_commutative.h (+69/-0)
thrust/detail/type_traits/is_metafunction_defined.h (+4/-9)
thrust/detail/type_traits/iterator/is_discard_iterator.h (+3/-5)
thrust/detail/type_traits/iterator/is_output_iterator.h (+25/-24)
thrust/detail/type_traits/minimum_type.h (+75/-93)
thrust/detail/type_traits/pointer_traits.h (+185/-213)
thrust/detail/uninitialized_copy.inl (+4/-6)
thrust/detail/uninitialized_fill.inl (+2/-3)
thrust/detail/unique.inl (+20/-21)
thrust/detail/use_default.h (+2/-2)
thrust/detail/util/align.h (+10/-19)
thrust/detail/vector_base.h (+557/-571)
thrust/detail/vector_base.inl (+355/-521)
thrust/device_allocator.h (+75/-87)
thrust/device_delete.h (+3/-4)
thrust/device_free.h (+2/-2)
thrust/device_make_unique.h (+16/-14)
thrust/device_malloc.h (+6/-3)
thrust/device_malloc_allocator.h (+133/-119)
thrust/device_new.h (+5/-8)
thrust/device_new_allocator.h (+133/-113)
thrust/device_ptr.h (+101/-109)
thrust/device_reference.h (+137/-147)
thrust/device_vector.h (+260/-220)
thrust/distance.h (+8/-8)
thrust/equal.h (+47/-42)
thrust/event.h (+0/-1)
thrust/execution_policy.h (+23/-31)
thrust/extrema.h (+101/-109)
thrust/fill.h (+30/-42)
thrust/find.h (+52/-58)
thrust/for_each.h (+33/-52)
thrust/functional.h (+533/-524)
thrust/future.h (+46/-39)
thrust/gather.h (+135/-123)
thrust/generate.h (+30/-51)
thrust/host_vector.h (+243/-224)
thrust/inner_product.h (+72/-73)
thrust/iterator/constant_iterator.h (+114/-108)
thrust/iterator/counting_iterator.h (+102/-91)
thrust/iterator/detail/any_assign.h (+8/-13)
thrust/iterator/detail/any_system_tag.h (+14/-3)
thrust/iterator/detail/constant_iterator_base.h (+14/-23)
thrust/iterator/detail/counting_iterator.inl (+61/-74)
thrust/iterator/detail/device_system_tag.h (+1/-1)
thrust/iterator/detail/discard_iterator_base.h (+22/-21)
thrust/iterator/detail/distance_from_result.h (+18/-9)
thrust/iterator/detail/host_system_tag.h (+1/-1)
thrust/iterator/detail/is_iterator_category.h (+24/-25)
thrust/iterator/detail/iterator_adaptor_base.h (+47/-68)
thrust/iterator/detail/iterator_category_to_system.h (+40/-38)
thrust/iterator/detail/iterator_category_to_traversal.h (+63/-86)
thrust/iterator/detail/iterator_category_with_system_and_traversal.h (+14/-20)
thrust/iterator/detail/iterator_facade_category.h (+97/-134)
thrust/iterator/detail/iterator_traits.h (+95/-0)
thrust/iterator/detail/iterator_traits.inl (+49/-56)
thrust/iterator/detail/iterator_traversal_tags.h (+12/-12)
thrust/iterator/detail/join_iterator.h (+82/-89)
thrust/iterator/detail/minimum_category.h (+20/-24)
thrust/iterator/detail/minimum_system.h (+41/-46)
thrust/iterator/detail/normal_iterator.h (+28/-33)
thrust/iterator/detail/permutation_iterator_base.h (+17/-20)
thrust/iterator/detail/retag.h (+47/-75)
thrust/iterator/detail/reverse_iterator_base.h (+6/-9)
thrust/iterator/detail/tabulate_output_iterator.inl (+61/-0)
thrust/iterator/detail/tagged_iterator.h (+26/-29)
thrust/iterator/detail/transform_input_output_iterator.inl (+58/-54)
thrust/iterator/detail/transform_iterator.inl (+64/-33)
thrust/iterator/detail/transform_output_iterator.inl (+38/-34)
thrust/iterator/detail/tuple_of_iterator_references.h (+73/-158)
thrust/iterator/detail/universal_categories.h (+74/-35)
thrust/iterator/detail/zip_iterator.inl (+0/-1)
thrust/iterator/detail/zip_iterator_base.h (+128/-192)
thrust/iterator/discard_iterator.h (+55/-48)
thrust/iterator/iterator_adaptor.h (+135/-119)
thrust/iterator/iterator_categories.h (+38/-44)
thrust/iterator/iterator_facade.h (+521/-413)
thrust/iterator/iterator_traits.h (+36/-15)
thrust/iterator/permutation_iterator.h (+83/-69)
thrust/iterator/retag.h (+1/-3)
thrust/iterator/reverse_iterator.h (+51/-54)
thrust/iterator/tabulate_output_iterator.h (+117/-0)
thrust/iterator/transform_input_output_iterator.h (+37/-35)
thrust/iterator/transform_iterator.h (+160/-140)
thrust/iterator/transform_output_iterator.h (+35/-36)
thrust/iterator/zip_iterator.h (+93/-95)
thrust/limits.h (+14/-2)
thrust/logical.h (+83/-68)
thrust/memory.h (+66/-68)
thrust/merge.h (+260/-212)
thrust/mismatch.h (+46/-49)
thrust/mr/allocator.h (+144/-158)
thrust/mr/device_memory_resource.h (+4/-9)
thrust/mr/disjoint_pool.h (+323/-357)
thrust/mr/disjoint_sync_pool.h (+62/-64)
thrust/mr/disjoint_tls_pool.h (+9/-14)
thrust/mr/fancy_pointer_resource.h (+41/-42)
thrust/mr/host_memory_resource.h (+1/-3)
thrust/mr/memory_resource.h (+133/-132)
thrust/mr/new.h (+54/-60)
thrust/mr/polymorphic_adaptor.h (+35/-37)
thrust/mr/pool.h (+402/-423)
thrust/mr/pool_options.h (+121/-75)
thrust/mr/sync_pool.h (+56/-59)
thrust/mr/tls_pool.h (+10/-12)
thrust/mr/universal_memory_resource.h (+0/-1)
thrust/mr/validator.h (+11/-13)
thrust/optional.h (+1074/-1070)
thrust/pair.h (+76/-148)
thrust/partition.h (+298/-351)
thrust/per_device_resource.h (+43/-52)
thrust/random.h (+16/-20)
thrust/random/detail/discard_block_engine.inl (+4/-5)
thrust/random/detail/erfcinv.h (+56/-45)
thrust/random/detail/linear_congruential_engine.inl (+4/-5)
thrust/random/detail/linear_congruential_engine_discard.h (+32/-31)
thrust/random/detail/linear_feedback_shift_engine.inl (+4/-5)
thrust/random/detail/linear_feedback_shift_engine_wordmask.h (+7/-10)
thrust/random/detail/mod.h (+15/-20)
thrust/random/detail/normal_distribution.inl (+77/-141)
thrust/random/detail/normal_distribution_base.h (+99/-84)
thrust/random/detail/random_core_access.h (+19/-22)
thrust/random/detail/subtract_with_carry_engine.inl (+4/-5)
thrust/random/detail/uniform_int_distribution.inl (+5/-6)
thrust/random/detail/uniform_real_distribution.inl (+4/-5)
thrust/random/detail/xor_combine_engine.inl (+4/-5)
thrust/random/detail/xor_combine_engine_max.h (+94/-211)
thrust/random/discard_block_engine.h (+133/-144)
thrust/random/linear_congruential_engine.h (+122/-128)
thrust/random/linear_feedback_shift_engine.h (+122/-135)
thrust/random/normal_distribution.h (+139/-163)
thrust/random/subtract_with_carry_engine.h (+121/-130)
thrust/random/uniform_int_distribution.h (+135/-158)
thrust/random/uniform_real_distribution.h (+134/-156)
thrust/random/xor_combine_engine.h (+150/-167)
thrust/reduce.h (+214/-244)
thrust/remove.h (+156/-190)
thrust/replace.h (+180/-179)
thrust/reverse.h (+35/-40)
thrust/rocthrust_version.hpp.in (+3/-3)
thrust/scan.h (+499/-460)
thrust/scatter.h (+166/-146)
thrust/sequence.h (+50/-73)
thrust/sequence_access.h (+2/-2)
thrust/set_operations.h (+1016/-950)
thrust/shuffle.h (+24/-27)
thrust/sort.h (+283/-322)
thrust/swap.h (+42/-47)
thrust/system/cpp/detail/adjacent_difference.h (+0/-1)
thrust/system/cpp/detail/assign_value.h (+0/-1)
thrust/system/cpp/detail/binary_search.h (+0/-1)
thrust/system/cpp/detail/copy.h (+0/-1)
thrust/system/cpp/detail/copy_if.h (+0/-1)
thrust/system/cpp/detail/count.h (+1/-2)
thrust/system/cpp/detail/equal.h (+1/-2)
thrust/system/cpp/detail/execution_policy.h (+25/-15)
thrust/system/cpp/detail/extrema.h (+0/-1)
thrust/system/cpp/detail/fill.h (+1/-2)
thrust/system/cpp/detail/find.h (+0/-1)
thrust/system/cpp/detail/for_each.h (+0/-1)
thrust/system/cpp/detail/gather.h (+1/-2)
thrust/system/cpp/detail/generate.h (+1/-2)
thrust/system/cpp/detail/get_value.h (+0/-1)
thrust/system/cpp/detail/inner_product.h (+1/-2)
thrust/system/cpp/detail/iter_swap.h (+0/-1)
thrust/system/cpp/detail/logical.h (+1/-2)
thrust/system/cpp/detail/malloc_and_free.h (+0/-1)
thrust/system/cpp/detail/memory.inl (+0/-1)
thrust/system/cpp/detail/merge.h (+0/-1)
thrust/system/cpp/detail/mismatch.h (+1/-2)
thrust/system/cpp/detail/par.h (+11/-16)
thrust/system/cpp/detail/partition.h (+0/-1)
thrust/system/cpp/detail/per_device_resource.h (+0/-1)
thrust/system/cpp/detail/reduce.h (+0/-1)
thrust/system/cpp/detail/reduce_by_key.h (+0/-1)
thrust/system/cpp/detail/remove.h (+0/-1)
thrust/system/cpp/detail/replace.h (+1/-2)
thrust/system/cpp/detail/reverse.h (+1/-2)
thrust/system/cpp/detail/scan.h (+0/-1)
thrust/system/cpp/detail/scan_by_key.h (+0/-1)
thrust/system/cpp/detail/scatter.h (+1/-2)
thrust/system/cpp/detail/sequence.h (+1/-2)
thrust/system/cpp/detail/set_operations.h (+0/-1)
thrust/system/cpp/detail/sort.h (+0/-1)
thrust/system/cpp/detail/swap_ranges.h (+0/-1)
thrust/system/cpp/detail/tabulate.h (+1/-2)
thrust/system/cpp/detail/temporary_buffer.h (+0/-1)
thrust/system/cpp/detail/transform.h (+0/-1)
thrust/system/cpp/detail/transform_reduce.h (+1/-2)
thrust/system/cpp/detail/transform_scan.h (+1/-2)
thrust/system/cpp/detail/uninitialized_copy.h (+1/-2)
thrust/system/cpp/detail/uninitialized_fill.h (+1/-2)
thrust/system/cpp/detail/unique.h (+0/-1)
thrust/system/cpp/detail/unique_by_key.h (+0/-1)
thrust/system/cpp/execution_policy.h (+0/-3)
thrust/system/cpp/memory.h (+29/-19)
thrust/system/cpp/memory_resource.h (+21/-17)
thrust/system/cpp/pointer.h (+23/-21)
thrust/system/cpp/vector.h (+19/-5)
thrust/system/cuda/config.h (+28/-29)
thrust/system/cuda/detail/adjacent_difference.h (+161/-230)
thrust/system/cuda/detail/assign_value.h (+35/-32)
thrust/system/cuda/detail/async/copy.h (+184/-359)
thrust/system/cuda/detail/async/customization.h (+60/-74)
thrust/system/cuda/detail/async/exclusive_scan.h (+60/-95)
thrust/system/cuda/detail/async/for_each.h (+48/-69)
thrust/system/cuda/detail/async/inclusive_scan.h (+159/-98)
thrust/system/cuda/detail/async/reduce.h (+93/-200)
thrust/system/cuda/detail/async/scan.h (+1/-2)
thrust/system/cuda/detail/async/sort.h (+143/-332)
thrust/system/cuda/detail/async/transform.h (+52/-72)
thrust/system/cuda/detail/binary_search.h (+14/-14)
thrust/system/cuda/detail/cdp_dispatch.h (+35/-36)
thrust/system/cuda/detail/copy.h (+46/-101)
thrust/system/cuda/detail/copy_if.h (+54/-30)
thrust/system/cuda/detail/core/agent_launcher.h (+232/-1079)
thrust/system/cuda/detail/core/triple_chevron_launch.h (+129/-104)
thrust/system/cuda/detail/core/util.h (+632/-608)
thrust/system/cuda/detail/count.h (+25/-34)
thrust/system/cuda/detail/cross_system.h (+143/-246)
thrust/system/cuda/detail/dispatch.h (+191/-55)
thrust/system/cuda/detail/equal.h (+19/-29)
thrust/system/cuda/detail/error.inl (+0/-1)
thrust/system/cuda/detail/execution_policy.h (+45/-18)
thrust/system/cuda/detail/extrema.h (+325/-397)
thrust/system/cuda/detail/fill.h (+40/-39)
thrust/system/cuda/detail/find.h (+181/-126)
thrust/system/cuda/detail/for_each.h (+41/-62)
thrust/system/cuda/detail/future.inl (+417/-554)
thrust/system/cuda/detail/gather.h (+44/-60)
thrust/system/cuda/detail/generate.h (+29/-32)
thrust/system/cuda/detail/get_value.h (+31/-28)
thrust/system/cuda/detail/inner_product.h (+33/-52)
thrust/system/cuda/detail/internal/copy_cross_system.h (+149/-192)
thrust/system/cuda/detail/internal/copy_device_to_device.h (+57/-66)
thrust/system/cuda/detail/iter_swap.h (+24/-21)
thrust/system/cuda/detail/logical.h (+1/-2)
thrust/system/cuda/detail/make_unsigned_special.h (+31/-21)
thrust/system/cuda/detail/malloc_and_free.h (+49/-55)
thrust/system/cuda/detail/memory.inl (+0/-1)
thrust/system/cuda/detail/merge.h (+177/-945)
thrust/system/cuda/detail/mismatch.h (+166/-65)
thrust/system/cuda/detail/par.h (+71/-84)
thrust/system/cuda/detail/par_to_seq.h (+16/-15)
thrust/system/cuda/detail/parallel_for.h (+19/-117)
thrust/system/cuda/detail/partition.h (+31/-24)
thrust/system/cuda/detail/per_device_resource.h (+23/-20)
thrust/system/cuda/detail/reduce.h (+661/-832)
thrust/system/cuda/detail/reduce_by_key.h (+865/-1026)
thrust/system/cuda/detail/remove.h (+43/-70)
thrust/system/cuda/detail/replace.h (+106/-155)
thrust/system/cuda/detail/reverse.h (+29/-40)
thrust/system/cuda/detail/scan.h (+235/-232)
thrust/system/cuda/detail/scan_by_key.h (+218/-297)
thrust/system/cuda/detail/scatter.h (+33/-60)
thrust/system/cuda/detail/sequence.h (+1/-2)
thrust/system/cuda/detail/set_operations.h (+1315/-1498)
thrust/system/cuda/detail/sort.h (+355/-506)
thrust/system/cuda/detail/swap_ranges.h (+54/-53)
thrust/system/cuda/detail/tabulate.h (+39/-36)
thrust/system/cuda/detail/temporary_buffer.h (+0/-1)
thrust/system/cuda/detail/terminate.h (+16/-19)
thrust/system/cuda/detail/transform.h (+312/-325)
thrust/system/cuda/detail/transform_reduce.h (+109/-31)
thrust/system/cuda/detail/transform_scan.h (+76/-64)
thrust/system/cuda/detail/uninitialized_copy.h (+56/-59)
thrust/system/cuda/detail/uninitialized_fill.h (+51/-55)
thrust/system/cuda/detail/unique.h (+514/-654)
thrust/system/cuda/detail/unique_by_key.h (+20/-9)
thrust/system/cuda/detail/util.h (+138/-179)
thrust/system/cuda/error.h (+65/-68)
thrust/system/cuda/execution_policy.h (+1/-0)
thrust/system/cuda/future.h (+28/-22)
thrust/system/cuda/memory.h (+33/-20)
thrust/system/cuda/memory_resource.h (+53/-60)
thrust/system/cuda/pointer.h (+22/-19)
thrust/system/cuda/vector.h (+28/-9)
thrust/system/detail/adl/adjacent_difference.h (+7/-7)
thrust/system/detail/adl/assign_value.h (+6/-6)
thrust/system/detail/adl/async/copy.h (+4/-5)
thrust/system/detail/adl/async/for_each.h (+4/-5)
thrust/system/detail/adl/async/reduce.h (+4/-5)
thrust/system/detail/adl/async/scan.h (+4/-5)
thrust/system/detail/adl/async/sort.h (+4/-5)
thrust/system/detail/adl/async/transform.h (+4/-5)
thrust/system/detail/adl/binary_search.h (+6/-6)
thrust/system/detail/adl/copy.h (+6/-6)
thrust/system/detail/adl/copy_if.h (+6/-6)
thrust/system/detail/adl/count.h (+6/-7)
thrust/system/detail/adl/equal.h (+6/-6)
thrust/system/detail/adl/extrema.h (+6/-7)
thrust/system/detail/adl/fill.h (+6/-6)
thrust/system/detail/adl/find.h (+6/-6)
thrust/system/detail/adl/for_each.h (+6/-6)
thrust/system/detail/adl/gather.h (+6/-6)
thrust/system/detail/adl/generate.h (+6/-6)
thrust/system/detail/adl/get_value.h (+6/-6)
thrust/system/detail/adl/inner_product.h (+6/-6)
thrust/system/detail/adl/iter_swap.h (+6/-6)
thrust/system/detail/adl/logical.h (+6/-6)
thrust/system/detail/adl/malloc_and_free.h (+6/-6)
thrust/system/detail/adl/merge.h (+6/-6)
thrust/system/detail/adl/mismatch.h (+6/-6)
thrust/system/detail/adl/partition.h (+6/-7)
thrust/system/detail/adl/per_device_resource.h (+4/-5)
thrust/system/detail/adl/reduce.h (+6/-6)
thrust/system/detail/adl/reduce_by_key.h (+6/-7)
thrust/system/detail/adl/remove.h (+6/-6)
thrust/system/detail/adl/replace.h (+6/-6)
thrust/system/detail/adl/reverse.h (+6/-6)
thrust/system/detail/adl/scan.h (+6/-6)
thrust/system/detail/adl/scan_by_key.h (+6/-6)
thrust/system/detail/adl/scatter.h (+6/-6)
thrust/system/detail/adl/sequence.h (+6/-6)
thrust/system/detail/adl/set_operations.h (+6/-6)
thrust/system/detail/adl/sort.h (+6/-6)
thrust/system/detail/adl/swap_ranges.h (+6/-6)
thrust/system/detail/adl/tabulate.h (+6/-6)
thrust/system/detail/adl/temporary_buffer.h (+6/-6)
thrust/system/detail/adl/transform.h (+6/-6)
thrust/system/detail/adl/transform_reduce.h (+6/-6)
thrust/system/detail/adl/transform_scan.h (+6/-6)
thrust/system/detail/adl/uninitialized_copy.h (+6/-6)
thrust/system/detail/adl/uninitialized_fill.h (+6/-6)
thrust/system/detail/adl/unique.h (+6/-6)
thrust/system/detail/adl/unique_by_key.h (+6/-6)
thrust/system/detail/bad_alloc.h (+25/-27)
thrust/system/detail/errno.h (+2/-4)
thrust/system/detail/error_category.inl (+219/-154)
thrust/system/detail/error_code.inl (+49/-70)
thrust/system/detail/error_condition.inl (+38/-47)
thrust/system/detail/generic/adjacent_difference.h (+11/-16)
thrust/system/detail/generic/adjacent_difference.inl (+2/-3)
thrust/system/detail/generic/advance.h (+2/-5)
thrust/system/detail/generic/advance.inl (+0/-1)
thrust/system/detail/generic/binary_search.h (+99/-112)
thrust/system/detail/generic/binary_search.inl (+5/-5)
thrust/system/detail/generic/copy.h (+10/-24)
thrust/system/detail/generic/copy.inl (+28/-37)
thrust/system/detail/generic/copy_if.h (+21/-27)
thrust/system/detail/generic/copy_if.inl (+75/-80)
thrust/system/detail/generic/count.h (+8/-11)
thrust/system/detail/generic/count.inl (+2/-2)
thrust/system/detail/generic/distance.h (+4/-6)
thrust/system/detail/generic/distance.inl (+0/-1)
thrust/system/detail/generic/equal.h (+11/-10)
thrust/system/detail/generic/equal.inl (+25/-18)
thrust/system/detail/generic/extrema.h (+13/-36)
thrust/system/detail/generic/extrema.inl (+110/-114)
thrust/system/detail/generic/fill.h (+6/-15)
thrust/system/detail/generic/find.h (+10/-24)
thrust/system/detail/generic/find.inl (+6/-6)
thrust/system/detail/generic/for_each.h (+13/-29)
thrust/system/detail/generic/gather.h (+34/-42)
thrust/system/detail/generic/gather.inl (+59/-65)
thrust/system/detail/generic/generate.h (+8/-20)
thrust/system/detail/generic/generate.inl (+30/-36)
thrust/system/detail/generic/inner_product.h (+23/-22)
thrust/system/detail/generic/inner_product.inl (+8/-12)
thrust/system/detail/generic/logical.h (+15/-19)
thrust/system/detail/generic/memory.h (+18/-25)
thrust/system/detail/generic/memory.inl (+0/-2)
thrust/system/detail/generic/merge.h (+61/-53)
thrust/system/detail/generic/merge.inl (+8/-9)
thrust/system/detail/generic/mismatch.h (+11/-20)
thrust/system/detail/generic/mismatch.inl (+2/-2)
thrust/system/detail/generic/partition.h (+88/-129)
thrust/system/detail/generic/partition.inl (+6/-7)
thrust/system/detail/generic/per_device_resource.h (+8/-12)
thrust/system/detail/generic/reduce.h (+15/-20)
thrust/system/detail/generic/reduce.inl (+1/-2)
thrust/system/detail/generic/reduce_by_key.h (+44/-50)
thrust/system/detail/generic/reduce_by_key.inl (+16/-18)
thrust/system/detail/generic/remove.h (+45/-72)
thrust/system/detail/generic/remove.inl (+3/-4)
thrust/system/detail/generic/replace.h (+58/-61)
thrust/system/detail/generic/replace.inl (+2/-2)
thrust/system/detail/generic/reverse.h (+10/-18)
thrust/system/detail/generic/reverse.inl (+1/-3)
thrust/system/detail/generic/scalar/binary_search.h (+23/-36)
thrust/system/detail/generic/scalar/binary_search.inl (+3/-3)
thrust/system/detail/generic/scan.h (+35/-62)
thrust/system/detail/generic/scan_by_key.h (+85/-103)
thrust/system/detail/generic/scan_by_key.inl (+12/-9)
thrust/system/detail/generic/scatter.h (+34/-42)
thrust/system/detail/generic/scatter.inl (+48/-54)
thrust/system/detail/generic/select_system.h (+60/-76)
thrust/system/detail/generic/select_system.inl (+72/-102)
thrust/system/detail/generic/select_system_exists.h (+106/-103)
thrust/system/detail/generic/sequence.h (+10/-25)
thrust/system/detail/generic/sequence.inl (+1/-1)
thrust/system/detail/generic/set_operations.h (+237/-274)
thrust/system/detail/generic/set_operations.inl (+34/-35)
thrust/system/detail/generic/shuffle.h (+18/-17)
thrust/system/detail/generic/sort.h (+76/-117)
thrust/system/detail/generic/sort.inl (+7/-8)
thrust/system/detail/generic/swap_ranges.h (+7/-10)
thrust/system/detail/generic/swap_ranges.inl (+37/-28)
thrust/system/detail/generic/tabulate.h (+4/-12)
thrust/system/detail/generic/tabulate.inl (+1/-3)
thrust/system/detail/generic/tag.h (+6/-8)
thrust/system/detail/generic/temporary_buffer.h (+15/-17)
thrust/system/detail/generic/transform.h (+62/-65)
thrust/system/detail/generic/transform.inl (+15/-16)
thrust/system/detail/generic/transform_reduce.h (+13/-16)
thrust/system/detail/generic/transform_reduce.inl (+0/-1)
thrust/system/detail/generic/transform_scan.h (+48/-29)
thrust/system/detail/generic/transform_scan.inl (+61/-35)
thrust/system/detail/generic/uninitialized_copy.h (+8/-20)
thrust/system/detail/generic/uninitialized_copy.inl (+76/-94)
thrust/system/detail/generic/uninitialized_fill.h (+8/-20)
thrust/system/detail/generic/uninitialized_fill.inl (+59/-69)
thrust/system/detail/generic/unique.h (+34/-61)
thrust/system/detail/generic/unique.inl (+6/-7)
thrust/system/detail/generic/unique_by_key.h (+41/-55)
thrust/system/detail/generic/unique_by_key.inl (+5/-6)
thrust/system/detail/internal/decompose.h (+80/-78)
thrust/system/detail/sequential/adjacent_difference.h (+15/-19)
thrust/system/detail/sequential/assign_value.h (+7/-8)
thrust/system/detail/sequential/binary_search.h (+34/-58)
thrust/system/detail/sequential/copy.h (+7/-21)
thrust/system/detail/sequential/copy.inl (+1/-2)
thrust/system/detail/sequential/copy_backward.h (+3/-9)
thrust/system/detail/sequential/copy_if.h (+16/-18)
thrust/system/detail/sequential/count.h (+0/-1)
thrust/system/detail/sequential/equal.h (+0/-1)
thrust/system/detail/sequential/execution_policy.h (+19/-15)
thrust/system/detail/sequential/extrema.h (+22/-51)
thrust/system/detail/sequential/fill.h (+0/-1)
thrust/system/detail/sequential/find.h (+9/-18)
thrust/system/detail/sequential/for_each.h (+11/-32)
thrust/system/detail/sequential/gather.h (+0/-1)
thrust/system/detail/sequential/general_copy.h (+32/-53)
thrust/system/detail/sequential/generate.h (+0/-1)
thrust/system/detail/sequential/get_value.h (+8/-11)
thrust/system/detail/sequential/inner_product.h (+0/-1)
thrust/system/detail/sequential/insertion_sort.h (+26/-39)
thrust/system/detail/sequential/iter_swap.h (+22/-11)
thrust/system/detail/sequential/logical.h (+0/-1)
thrust/system/detail/sequential/malloc_and_free.h (+10/-13)
thrust/system/detail/sequential/merge.h (+33/-38)
thrust/system/detail/sequential/merge.inl (+2/-3)
thrust/system/detail/sequential/mismatch.h (+0/-1)
thrust/system/detail/sequential/partition.h (+114/-145)
thrust/system/detail/sequential/per_device_resource.h (+0/-1)
thrust/system/detail/sequential/reduce.h (+10/-19)
thrust/system/detail/sequential/reduce_by_key.h (+25/-29)
thrust/system/detail/sequential/remove.h (+50/-73)
thrust/system/detail/sequential/replace.h (+0/-1)
thrust/system/detail/sequential/reverse.h (+0/-1)
thrust/system/detail/sequential/scan.h (+85/-42)
thrust/system/detail/sequential/scan_by_key.h (+52/-57)
thrust/system/detail/sequential/scatter.h (+0/-1)
thrust/system/detail/sequential/sequence.h (+0/-1)
thrust/system/detail/sequential/set_operations.h (+70/-88)
thrust/system/detail/sequential/sort.h (+18/-23)
thrust/system/detail/sequential/sort.inl (+97/-113)
thrust/system/detail/sequential/stable_merge_sort.h (+18/-23)
thrust/system/detail/sequential/stable_merge_sort.inl (+9/-9)
thrust/system/detail/sequential/stable_primitive_sort.h (+10/-18)
thrust/system/detail/sequential/stable_primitive_sort.inl (+59/-85)
thrust/system/detail/sequential/stable_radix_sort.h (+10/-18)
thrust/system/detail/sequential/stable_radix_sort.inl (+259/-262)
thrust/system/detail/sequential/swap_ranges.h (+0/-1)
thrust/system/detail/sequential/tabulate.h (+0/-1)
thrust/system/detail/sequential/temporary_buffer.h (+0/-1)
thrust/system/detail/sequential/transform.h (+0/-1)
thrust/system/detail/sequential/transform_reduce.h (+0/-1)
thrust/system/detail/sequential/transform_scan.h (+1/-2)
thrust/system/detail/sequential/trivial_copy.h (+12/-18)
thrust/system/detail/sequential/uninitialized_copy.h (+1/-2)
thrust/system/detail/sequential/uninitialized_fill.h (+0/-1)
thrust/system/detail/sequential/unique.h (+24/-40)
thrust/system/detail/sequential/unique_by_key.h (+32/-42)
thrust/system/detail/system_error.inl (+0/-1)
thrust/system/error_code.h (+221/-232)
thrust/system/hip/config.h (+29/-28)
thrust/system/hip/detail/adjacent_difference.h (+166/-192)
thrust/system/hip/detail/assign_value.h (+58/-33)
thrust/system/hip/detail/async/copy.h (+179/-362)
thrust/system/hip/detail/async/customization.h (+63/-72)
thrust/system/hip/detail/async/exclusive_scan.h (+51/-76)
thrust/system/hip/detail/async/for_each.h (+45/-71)
thrust/system/hip/detail/async/inclusive_scan.h (+165/-103)
thrust/system/hip/detail/async/reduce.h (+105/-198)
thrust/system/hip/detail/async/sort.h (+142/-342)
thrust/system/hip/detail/async/transform.h (+48/-74)
thrust/system/hip/detail/binary_search.h (+505/-595)
thrust/system/hip/detail/copy.h (+69/-86)
thrust/system/hip/detail/copy_if.h (+266/-334)
thrust/system/hip/detail/count.h (+23/-25)
thrust/system/hip/detail/cross_system.h (+160/-243)
thrust/system/hip/detail/dispatch.h (+37/-31)
thrust/system/hip/detail/equal.h (+17/-17)
thrust/system/hip/detail/execution_policy.h (+56/-34)
thrust/system/hip/detail/extrema.h (+265/-287)
thrust/system/hip/detail/fill.h (+46/-23)
thrust/system/hip/detail/find.h (+194/-114)
thrust/system/hip/detail/for_each.h (+39/-36)
thrust/system/hip/detail/future.inl (+421/-545)
thrust/system/hip/detail/gather.h (+40/-45)
thrust/system/hip/detail/general/temp_storage.h (+248/-258)
thrust/system/hip/detail/general/various.h (+18/-15)
thrust/system/hip/detail/generate.h (+31/-30)
thrust/system/hip/detail/get_value.h (+54/-30)
thrust/system/hip/detail/guarded_driver_types.h (+1/-2)
thrust/system/hip/detail/guarded_hip_runtime_api.h (+7/-7)
thrust/system/hip/detail/inner_product.h (+31/-39)
thrust/system/hip/detail/internal/copy_cross_system.h (+136/-216)
thrust/system/hip/detail/internal/copy_device_to_device.h (+54/-49)
thrust/system/hip/detail/iter_swap.h (+32/-14)
thrust/system/hip/detail/make_unsigned_special.h (+32/-21)
thrust/system/hip/detail/malloc_and_free.h (+37/-39)
thrust/system/hip/detail/merge.h (+292/-345)
thrust/system/hip/detail/mismatch.h (+171/-50)
thrust/system/hip/detail/nv/detail/__preprocessor.h (+14/-15)
thrust/system/hip/detail/nv/detail/__target_macros.h (+10/-10)
thrust/system/hip/detail/nv/target.h (+2/-2)
thrust/system/hip/detail/par.h (+322/-266)
thrust/system/hip/detail/par_to_seq.h (+27/-33)
thrust/system/hip/detail/parallel_for.h (+127/-115)
thrust/system/hip/detail/partition.h (+575/-695)
thrust/system/hip/detail/per_device_resource.h (+23/-15)
thrust/system/hip/detail/reduce.h (+124/-146)
thrust/system/hip/detail/reduce_by_key.h (+287/-286)
thrust/system/hip/detail/remove.h (+36/-45)
thrust/system/hip/detail/replace.h (+101/-120)
thrust/system/hip/detail/reverse.h (+39/-40)
thrust/system/hip/detail/scan.h (+352/-367)
thrust/system/hip/detail/scan_by_key.h (+431/-461)
thrust/system/hip/detail/scatter.h (+30/-51)
thrust/system/hip/detail/set_operations.h (+1514/-1521)
thrust/system/hip/detail/sort.h (+379/-446)
thrust/system/hip/detail/swap_ranges.h (+57/-51)
thrust/system/hip/detail/tabulate.h (+38/-35)
thrust/system/hip/detail/terminate.h (+25/-15)
thrust/system/hip/detail/transform.h (+249/-290)
thrust/system/hip/detail/transform_reduce.h (+34/-17)
thrust/system/hip/detail/transform_scan.h (+71/-61)
thrust/system/hip/detail/uninitialized_copy.h (+46/-44)
thrust/system/hip/detail/uninitialized_fill.h (+50/-45)
thrust/system/hip/detail/unique.h (+177/-182)
thrust/system/hip/detail/unique_by_key.h (+203/-217)
thrust/system/hip/detail/util.h (+500/-442)
thrust/system/hip/error.h (+72/-72)
thrust/system/hip/execution_policy.h (+2/-2)
thrust/system/hip/future.h (+32/-19)
thrust/system/hip/hipstdpar/hipstdpar_lib.hpp (+46/-22)
thrust/system/hip/hipstdpar/impl/batch.hpp (+77/-74)
thrust/system/hip/hipstdpar/impl/copy.hpp (+107/-129)
thrust/system/hip/hipstdpar/impl/generation.hpp (+115/-133)
thrust/system/hip/hipstdpar/impl/heap.hpp (+30/-10)
thrust/system/hip/hipstdpar/impl/hipstd.hpp (+54/-41)
thrust/system/hip/hipstdpar/impl/interpose_allocations_v0.hpp (+140/-113)
thrust/system/hip/hipstdpar/impl/interpose_allocations_v1.hpp (+279/-0)
thrust/system/hip/hipstdpar/impl/lexicographical_comparison.hpp (+113/-110)
thrust/system/hip/hipstdpar/impl/math_lib.hpp (+393/-0)
thrust/system/hip/hipstdpar/impl/merge.hpp (+76/-102)
thrust/system/hip/hipstdpar/impl/min_max.hpp (+153/-185)
thrust/system/hip/hipstdpar/impl/numeric.hpp (+463/-767)
thrust/system/hip/hipstdpar/impl/order_changing.hpp (+75/-73)
thrust/system/hip/hipstdpar/impl/partitioning.hpp (+144/-173)
thrust/system/hip/hipstdpar/impl/removing.hpp (+185/-249)
thrust/system/hip/hipstdpar/impl/search.hpp (+836/-1063)
thrust/system/hip/hipstdpar/impl/set.hpp (+280/-486)
thrust/system/hip/hipstdpar/impl/sorting.hpp (+394/-478)
thrust/system/hip/hipstdpar/impl/swap.hpp (+41/-33)
thrust/system/hip/hipstdpar/impl/transformation.hpp (+173/-257)
thrust/system/hip/hipstdpar/impl/uninitialized.hpp (+240/-361)
thrust/system/hip/memory.h (+33/-20)
thrust/system/hip/memory_resource.h (+61/-67)
thrust/system/hip/pointer.h (+27/-22)
thrust/system/hip/vector.h (+29/-9)
thrust/system/omp/detail/adjacent_difference.h (+12/-14)
thrust/system/omp/detail/assign_value.h (+0/-1)
thrust/system/omp/detail/binary_search.h (+25/-27)
thrust/system/omp/detail/copy.h (+6/-19)
thrust/system/omp/detail/copy.inl (+8/-9)
thrust/system/omp/detail/copy_if.h (+17/-18)
thrust/system/omp/detail/copy_if.inl (+0/-1)
thrust/system/omp/detail/count.h (+0/-1)
thrust/system/omp/detail/default_decomposition.h (+1/-2)
thrust/system/omp/detail/default_decomposition.inl (+0/-1)
thrust/system/omp/detail/equal.h (+0/-1)
thrust/system/omp/detail/execution_policy.h (+26/-26)
thrust/system/omp/detail/extrema.h (+11/-18)
thrust/system/omp/detail/fill.h (+0/-1)
thrust/system/omp/detail/find.h (+3/-7)
thrust/system/omp/detail/for_each.h (+9/-19)
thrust/system/omp/detail/for_each.inl (+2/-3)
thrust/system/omp/detail/gather.h (+0/-1)
thrust/system/omp/detail/generate.h (+0/-1)
thrust/system/omp/detail/get_value.h (+0/-1)
thrust/system/omp/detail/inner_product.h (+0/-1)
thrust/system/omp/detail/iter_swap.h (+0/-1)
thrust/system/omp/detail/logical.h (+0/-1)
thrust/system/omp/detail/malloc_and_free.h (+0/-1)
thrust/system/omp/detail/memory.inl (+0/-1)
thrust/system/omp/detail/merge.h (+0/-1)
thrust/system/omp/detail/mismatch.h (+0/-1)
thrust/system/omp/detail/par.h (+11/-16)
thrust/system/omp/detail/partition.h (+38/-47)
thrust/system/omp/detail/partition.inl (+0/-1)
thrust/system/omp/detail/per_device_resource.h (+0/-1)
thrust/system/omp/detail/pragma_omp.h (+35/-37)
thrust/system/omp/detail/reduce.h (+7/-13)
thrust/system/omp/detail/reduce.inl (+1/-2)
thrust/system/omp/detail/reduce_by_key.h (+10/-13)
thrust/system/omp/detail/reduce_by_key.inl (+0/-1)
thrust/system/omp/detail/reduce_intervals.h (+7/-7)
thrust/system/omp/detail/reduce_intervals.inl (+5/-6)
thrust/system/omp/detail/remove.h (+26/-41)
thrust/system/omp/detail/remove.inl (+0/-1)
thrust/system/omp/detail/replace.h (+0/-1)
thrust/system/omp/detail/reverse.h (+0/-1)
thrust/system/omp/detail/scan.h (+0/-1)
thrust/system/omp/detail/scan_by_key.h (+0/-1)
thrust/system/omp/detail/scatter.h (+0/-1)
thrust/system/omp/detail/sequence.h (+0/-1)
thrust/system/omp/detail/set_operations.h (+0/-1)
thrust/system/omp/detail/sort.h (+14/-15)
thrust/system/omp/detail/sort.inl (+5/-6)
thrust/system/omp/detail/swap_ranges.h (+0/-1)
thrust/system/omp/detail/tabulate.h (+0/-1)
thrust/system/omp/detail/temporary_buffer.h (+0/-1)
thrust/system/omp/detail/transform.h (+0/-1)
thrust/system/omp/detail/transform_reduce.h (+0/-1)
thrust/system/omp/detail/transform_scan.h (+0/-1)
thrust/system/omp/detail/uninitialized_copy.h (+0/-1)
thrust/system/omp/detail/uninitialized_fill.h (+0/-1)
thrust/system/omp/detail/unique.h (+16/-31)
thrust/system/omp/detail/unique.inl (+1/-2)
thrust/system/omp/detail/unique_by_key.h (+25/-31)
thrust/system/omp/detail/unique_by_key.inl (+2/-3)
thrust/system/omp/execution_policy.h (+0/-3)
thrust/system/omp/memory.h (+29/-18)
thrust/system/omp/memory_resource.h (+21/-15)
thrust/system/omp/pointer.h (+21/-18)
thrust/system/omp/vector.h (+19/-5)
thrust/system/system_error.h (+80/-80)
thrust/system/tbb/detail/adjacent_difference.h (+12/-14)
thrust/system/tbb/detail/assign_value.h (+0/-1)
thrust/system/tbb/detail/binary_search.h (+0/-1)
thrust/system/tbb/detail/copy.h (+6/-19)
thrust/system/tbb/detail/copy.inl (+8/-9)
thrust/system/tbb/detail/copy_if.h (+7/-16)
thrust/system/tbb/detail/copy_if.inl (+9/-10)
thrust/system/tbb/detail/count.h (+0/-1)
thrust/system/tbb/detail/equal.h (+0/-1)
thrust/system/tbb/detail/execution_policy.h (+20/-17)
thrust/system/tbb/detail/extrema.h (+11/-18)
thrust/system/tbb/detail/fill.h (+0/-1)
thrust/system/tbb/detail/find.h (+2/-5)
thrust/system/tbb/detail/for_each.h (+8/-17)
thrust/system/tbb/detail/for_each.inl (+0/-1)
thrust/system/tbb/detail/gather.h (+0/-1)
thrust/system/tbb/detail/generate.h (+0/-1)
thrust/system/tbb/detail/get_value.h (+0/-1)
thrust/system/tbb/detail/inner_product.h (+0/-1)
thrust/system/tbb/detail/iter_swap.h (+0/-1)
thrust/system/tbb/detail/logical.h (+0/-1)
thrust/system/tbb/detail/malloc_and_free.h (+0/-1)
thrust/system/tbb/detail/memory.inl (+0/-1)
thrust/system/tbb/detail/merge.h (+28/-27)
thrust/system/tbb/detail/merge.inl (+11/-5)
thrust/system/tbb/detail/mismatch.h (+0/-1)
thrust/system/tbb/detail/par.h (+11/-16)
thrust/system/tbb/detail/partition.h (+38/-47)
thrust/system/tbb/detail/partition.inl (+0/-1)
thrust/system/tbb/detail/per_device_resource.h (+0/-1)
thrust/system/tbb/detail/reduce.h (+7/-13)
thrust/system/tbb/detail/reduce.inl (+40/-33)
thrust/system/tbb/detail/reduce_by_key.h (+18/-20)
thrust/system/tbb/detail/reduce_by_key.inl (+204/-156)
thrust/system/tbb/detail/reduce_intervals.h (+55/-46)
thrust/system/tbb/detail/remove.h (+30/-41)
thrust/system/tbb/detail/remove.inl (+0/-1)
thrust/system/tbb/detail/replace.h (+0/-1)
thrust/system/tbb/detail/reverse.h (+0/-1)
thrust/system/tbb/detail/scan.h (+10/-21)
thrust/system/tbb/detail/scan.inl (+111/-54)
thrust/system/tbb/detail/scan_by_key.h (+0/-1)
thrust/system/tbb/detail/scatter.h (+0/-1)
thrust/system/tbb/detail/sequence.h (+0/-1)
thrust/system/tbb/detail/set_operations.h (+0/-1)
thrust/system/tbb/detail/sort.h (+17/-18)
thrust/system/tbb/detail/sort.inl (+8/-7)
thrust/system/tbb/detail/swap_ranges.h (+0/-1)
thrust/system/tbb/detail/tabulate.h (+0/-1)
thrust/system/tbb/detail/temporary_buffer.h (+0/-1)
thrust/system/tbb/detail/transform.h (+0/-1)
thrust/system/tbb/detail/transform_reduce.h (+0/-1)
thrust/system/tbb/detail/transform_scan.h (+0/-1)
thrust/system/tbb/detail/uninitialized_copy.h (+0/-1)
thrust/system/tbb/detail/uninitialized_fill.h (+0/-1)
thrust/system/tbb/detail/unique.h (+16/-31)
thrust/system/tbb/detail/unique.inl (+1/-2)
thrust/system/tbb/detail/unique_by_key.h (+25/-31)
thrust/system/tbb/detail/unique_by_key.inl (+2/-3)
thrust/system/tbb/execution_policy.h (+0/-3)
thrust/system/tbb/memory.h (+27/-18)
thrust/system/tbb/memory_resource.h (+21/-15)
thrust/system/tbb/pointer.h (+21/-18)
thrust/system/tbb/vector.h (+19/-5)
thrust/system_error.h (+1/-8)
thrust/tabulate.h (+21/-26)
thrust/transform.h (+422/-412)
thrust/transform_reduce.h (+40/-49)
thrust/transform_scan.h (+253/-131)
thrust/tuple.h (+505/-526)
thrust/type_traits/integer_sequence.h (+38/-64)
thrust/type_traits/is_contiguous_iterator.h (+68/-67)
thrust/type_traits/is_execution_policy.h (+15/-4)
thrust/type_traits/is_operator_less_or_greater_function_object.h (+25/-25)
thrust/type_traits/is_operator_plus_function_object.h (+11/-9)
thrust/type_traits/is_trivially_relocatable.h (+58/-70)
thrust/type_traits/logical_metafunctions.h (+35/-246)
thrust/type_traits/remove_cvref.h (+13/-66)
thrust/type_traits/void_t.h (+13/-11)
thrust/uninitialized_copy.h (+50/-56)
thrust/uninitialized_fill.h (+38/-49)
thrust/unique.h (+271/-326)
thrust/universal_allocator.h (+1/-3)
thrust/universal_ptr.h (+0/-2)
thrust/universal_vector.h (+14/-0)
thrust/version.h (+22/-8)
thrust/zip_function.h (+95/-61)
toolchain-windows.cmake (+1/-1)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
Ubuntu Sponsors Pending
Review via email: mp+499735@code.launchpad.net

Description of the change

Update to new upstream version 7.1.0

To post a comment you must log in.
Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

uploaded package to this ppa: https://launchpad.net/~b0b0a/+archive/ubuntu/rocthrust-2139676/

(-proposed and amd64, amd64v3, arm64 archs enabled)

Gonna trigger autopkgtest when it is built and published

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

`reverse-depends --arch ppc64el src:rocthrust -x` returns libstdgpu-hip-dev (src:stdgpu) as a reverse dependency for librocthrust-dev.

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

Sebastien Bacher at Debcrafters channel: stdgpu has been removed from Debian testing (https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1119033) and will be removed from Ubuntu as well until FTBFS gets fixed.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+ * d/p/0002-Allow-system-sqlite.patch: remove patch

Via grep I see that tests use sqlite3 (like ./test/bitwise_repro/bwr_db.hpp). Inspecting the librocthrust-tests package that is in the archive now, I don't see a dependency on sqlite3, even though it's a build-depends of src:rocthrust. Perhaps it's statically linked? My question here is just to be sure we are not dropping an external sqlite dependency in favor of something vendored. Or, asking in another way, why was this patch dropped?

review: Needs Information
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Ah, I think I see it. USE_SYSTEM_SQLITE was a Debian change in the previous package, and now upstream has a proper SQLITE_USE_SYSTEM_PACKAGE option, is that it?

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

Yes https://github.com/ROCm/rocThrust/releases/tag/rocm-7.1.0 - upstream 7.1.0 release notes:
- Added a new CMake option -DSQLITE_USE_SYSTEM_PACKAGE to allow SQLite to be provided by the system.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Could you please create a valid d/watch file? I know it's not something this PR broke, and it was like this already in Ubuntu, but since we are updating to a new upstream version, it's important to have a working d/watch file.

review: Needs Fixing
f4d6881... by Bojan Aleksovski

d/changelog: update changelog with watch file entry

292a41d... by Bojan Aleksovski

d/watch: create a valid watch file

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

awesome, +1

review: Approve
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Sponsored:

Uploading rocthrust_7.1.0-0ubuntu2.dsc
Uploading rocthrust_7.1.0.orig.tar.gz
Uploading rocthrust_7.1.0-0ubuntu2.debian.tar.xz
Uploading rocthrust_7.1.0-0ubuntu2_source.buildinfo
Uploading rocthrust_7.1.0-0ubuntu2_source.changes

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

Thank you Andreas!

Unmerged commits

f4d6881... by Bojan Aleksovski

d/changelog: update changelog with watch file entry

292a41d... by Bojan Aleksovski

d/watch: create a valid watch file

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.clang-format b/.clang-format
2index d3b0fef..ec5070f 100644
3--- a/.clang-format
4+++ b/.clang-format
5@@ -1,164 +1,229 @@
6-# Style file for MLSE Libraries based on the modified rocBLAS style
7-
8-# Common settings
9-BasedOnStyle: LLVM
10-TabWidth: 2
11-IndentWidth: 2
12-UseTab: Never
13-ColumnLimit: 120
14-
15-# Other languages JavaScript, Proto
16-
17----
18-Language: Cpp
19-
20-# http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code
21-# int formatted_code;
22-# // clang-format off
23-# void unformatted_code ;
24-# // clang-format on
25-# void formatted_code_again;
26-
27-DisableFormat: false
28-Standard: c++14
29+# Note that we don't specify the language in this file because some files are
30+# detected as Cpp, but others are detected as ObjC and we want this formatting
31+# to apply to all types of files.
32+BasedOnStyle: LLVM
33 AccessModifierOffset: -2
34-AlignAfterOpenBracket: true
35-AlignConsecutiveAssignments: true
36-AlignConsecutiveDeclarations: true
37-AlignEscapedNewlinesLeft: true
38-AlignOperands: true
39+AlignAfterOpenBracket: Align
40+AlignConsecutiveAssignments: Consecutive
41+AlignConsecutiveBitFields: Consecutive
42+AlignConsecutiveMacros: Consecutive
43+AlignEscapedNewlines: Left
44+AlignOperands: AlignAfterOperator
45 AllowAllArgumentsOnNextLine: true
46-AlignTrailingComments: false
47+AlignTrailingComments:
48+ Kind: Never
49 AllowAllParametersOfDeclarationOnNextLine: true
50+AllowAllConstructorInitializersOnNextLine: true
51 AllowShortBlocksOnASingleLine: false
52 AllowShortCaseLabelsOnASingleLine: false
53 AllowShortFunctionsOnASingleLine: Empty
54-AllowShortIfStatementsOnASingleLine: false
55+AllowShortIfStatementsOnASingleLine: Never
56+AllowShortLambdasOnASingleLine: Empty
57 AllowShortLoopsOnASingleLine: false
58-AlwaysBreakAfterDefinitionReturnType: false
59 AlwaysBreakAfterReturnType: None
60-AlwaysBreakBeforeMultilineStrings: false
61-AlwaysBreakTemplateDeclarations: true
62+AlwaysBreakTemplateDeclarations: Yes
63 AttributeMacros: [
64+ # rocThrust
65 'THRUST_DEVICE',
66 'THRUST_FORCEINLINE',
67 'THRUST_HOST_DEVICE',
68 'THRUST_HOST',
69+ 'THRUST_NODISCARD',
70+ 'THRUST_DEPRECATED',
71+ 'THRUST_INLINE_CONSTANT',
72+ # thrust
73+ '_CCCL_ALIGNAS_TYPE',
74+ '_CCCL_ALIGNAS',
75+ '_CCCL_CONSTEXPR_CXX14',
76+ '_CCCL_CONSTEXPR_CXX17',
77+ '_CCCL_CONSTEXPR_CXX20',
78+ '_CCCL_CONSTEXPR_CXX23',
79 '_CCCL_DEVICE',
80+ '_CCCL_FALLTHROUGH',
81 '_CCCL_FORCEINLINE',
82 '_CCCL_HOST_DEVICE',
83 '_CCCL_HOST',
84+ '_CCCL_NO_UNIQUE_ADDRESS',
85+ '_CCCL_NODISCARD_FRIEND',
86+ '_CCCL_NODISCARD',
87+ '_CCCL_NORETURN',
88+ '_CCCL_VISIBILITY_HIDDEN',
89+ 'CUB_RUNTIME_FUNCTION',
90+ 'CUB_DETAIL_KERNEL_ATTRIBUTES',
91 'THRUST_RUNTIME_FUNCTION',
92 'THRUST_DETAIL_KERNEL_ATTRIBUTES',
93+ '_LIBCUDACXX_ALIGNOF',
94+ '_LIBCUDACXX_ALWAYS_INLINE',
95+ '_LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS',
96+ '_LIBCUDACXX_CONSTINIT',
97+ '_LIBCUDACXX_DEPRECATED_IN_CXX11',
98+ '_LIBCUDACXX_DEPRECATED_IN_CXX14',
99+ '_LIBCUDACXX_DEPRECATED_IN_CXX17',
100+ '_LIBCUDACXX_DEPRECATED_IN_CXX20',
101+ '_LIBCUDACXX_DEPRECATED',
102+ '_LIBCUDACXX_DISABLE_EXTENTSION_WARNING',
103+ '_LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION',
104+ '_LIBCUDACXX_EXPORTED_FROM_ABI',
105+ '_LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS',
106+ '_LIBCUDACXX_HIDDEN',
107+ '_LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1',
108+ '_LIBCUDACXX_HIDE_FROM_ABI',
109+ '_LIBCUDACXX_INLINE_VISIBILITY',
110+ '_LIBCUDACXX_INTERNAL_LINKAGE',
111+ '_LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS',
112+ '_LIBCUDACXX_NO_DESTROY',
113+ '_LIBCUDACXX_NO_SANITIZE',
114+ '_LIBCUDACXX_NOALIAS',
115+ '_LIBCUDACXX_OVERRIDABLE_FUNC_VIS',
116+ '_LIBCUDACXX_STANDALONE_DEBUG',
117+ '_LIBCUDACXX_TEMPLATE_DATA_VIS',
118+ '_LIBCUDACXX_TEMPLATE_VIS',
119+ '_LIBCUDACXX_THREAD_SAFETY_ANNOTATION',
120+ '_LIBCUDACXX_USING_IF_EXISTS',
121+ '_LIBCUDACXX_WEAK',
122 ]
123 BinPackArguments: false
124 BinPackParameters: false
125-
126-# Configure each individual brace in BraceWrapping
127 BreakBeforeBraces: Custom
128-# Control of individual brace wrapping cases
129-BraceWrapping: {
130- AfterCaseLabel: 'false'
131- AfterClass: 'true'
132- AfterControlStatement: 'true'
133- AfterEnum : 'true'
134- AfterFunction : 'true'
135- AfterNamespace : 'true'
136- AfterStruct : 'true'
137- AfterUnion : 'true'
138- BeforeCatch : 'true'
139- BeforeElse : 'true'
140- IndentBraces : 'false'
141- SplitEmptyFunction: 'false'
142- SplitEmptyRecord: 'false'
143-}
144-
145+BraceWrapping:
146+ AfterCaseLabel: false
147+ AfterClass: true
148+ AfterControlStatement: true
149+ AfterEnum: true
150+ AfterFunction: true
151+ AfterNamespace: true
152+ AfterStruct: true
153+ AfterUnion: true
154+ BeforeCatch: true
155+ BeforeElse: true
156+ IndentBraces: false
157+ SplitEmptyFunction: false
158+ SplitEmptyRecord: false
159 BreakBeforeConceptDeclarations: true
160 BreakBeforeBinaryOperators: NonAssignment
161 BreakBeforeTernaryOperators: true
162 BreakConstructorInitializers: BeforeComma
163 BreakInheritanceList: BeforeComma
164+ColumnLimit: 120
165+CompactNamespaces: false
166+ContinuationIndentWidth: 2
167 EmptyLineAfterAccessModifier: Never
168 EmptyLineBeforeAccessModifier: Always
169-
170+FixNamespaceComments: true
171+IfMacros: [
172+ # rocThrust
173+ 'THRUST_IF_CONSTEXPR',
174+ # thrust
175+ '_CCCL_IF_CONSTEXPR',
176+ '_CCCL_ELSE_IF_CONSTEXPR',
177+]
178+IndentWrappedFunctionNames: false
179+IncludeBlocks: Regroup
180+IncludeCategories:
181+ # rocThrust
182+ - Regex: '^<(rocprim/rocprim.hpp|thrust/detail/config.h|thrust/system/hip/config.h)'
183+ Priority: 0
184+ SortPriority: 0
185+ - Regex: '^<rocprim/'
186+ Priority: 2
187+ SortPriority: 1
188+ # Use the priority set by thrust (below)
189+ # - Regex: '^<thrust/'
190+ # Priority: 3
191+ # SortPriority: 2
192+ - Regex: '^<hip/'
193+ Priority: 4
194+ SortPriority: 3
195+ - Regex: '^<[a-z_]*>$'
196+ Priority: 5
197+ SortPriority: 4
198+ - Regex: '^<hip'
199+ Priority: 0
200+ SortPriority: 0
201+ # thrust
202+ - Regex: '^<(cuda/std/detail/__config|cub/config.cuh|thrust/detail/config.h|thrust/system/cuda/config.h)'
203+ Priority: 0
204+ SortPriority: 0
205+ - Regex: '^<cub/'
206+ Priority: 2
207+ SortPriority: 1
208+ - Regex: '^<thrust/'
209+ Priority: 3
210+ SortPriority: 2
211+ - Regex: '^<cuda/'
212+ Priority: 4
213+ SortPriority: 3
214+ - Regex: '^<[a-z_]*>$'
215+ Priority: 5
216+ SortPriority: 4
217+ - Regex: '^<cuda'
218+ Priority: 0
219+ SortPriority: 0
220 InsertBraces: true
221+IndentCaseLabels: true
222 InsertNewlineAtEOF: true
223 InsertTrailingCommas: Wrapped
224 IndentRequires: true
225 IndentPPDirectives: AfterHash
226-PackConstructorInitializers: Never
227-PenaltyBreakAssignment: 30
228-PenaltyBreakTemplateDeclaration: 0
229-PenaltyIndentedWhitespace: 2
230-RemoveSemicolon: false
231-SpaceAfterLogicalNot: false
232-SpaceAfterTemplateKeyword: true
233-SpaceBeforeCtorInitializerColon: true
234-SpaceBeforeInheritanceColon: true
235-SpaceBeforeRangeBasedForLoopColon: true
236-
237-
238-CommentPragmas: '^ IWYU pragma:'
239-CompactNamespaces: false
240-ConstructorInitializerAllOnOneLineOrOnePerLine: false
241-ConstructorInitializerIndentWidth: 4
242-ContinuationIndentWidth: 2
243-Cpp11BracedListStyle: true
244-SpaceBeforeCpp11BracedList: false
245-ExperimentalAutoDetectBinPacking: false
246-ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
247-IndentCaseLabels: true
248-FixNamespaceComments: true
249-IndentWrappedFunctionNames: false
250+IndentWidth: 2
251 KeepEmptyLinesAtTheStartOfBlocks: false
252-MacroBlockBegin: ''
253-MacroBlockEnd: ''
254-#JavaScriptQuotes: Double
255 MaxEmptyLinesToKeep: 1
256+Macros:
257+- _LIBCUDACXX_TEMPLATE(...)=template<...>
258+- _LIBCUDACXX_REQUIRES(...)=requires (...)
259 NamespaceIndentation: None
260-ObjCBlockIndentWidth: 4
261-#ObjCSpaceAfterProperty: true
262-#ObjCSpaceBeforeProtocolList: true
263+PackConstructorInitializers: Never
264+PenaltyBreakAssignment: 30
265 PenaltyBreakBeforeFirstCallParameter: 50
266 PenaltyBreakComment: 0
267 PenaltyBreakFirstLessLess: 0
268 PenaltyBreakString: 70
269+PenaltyBreakTemplateDeclaration: 0
270 PenaltyExcessCharacter: 100
271 PenaltyReturnTypeOnItsOwnLine: 90
272+PenaltyIndentedWhitespace: 2
273 PointerAlignment: Left
274+ReflowComments: true
275+RemoveSemicolon: false
276+SortIncludes: CaseInsensitive
277 SpaceAfterCStyleCast: true
278+SpaceAfterLogicalNot: false
279+SpaceAfterTemplateKeyword: true
280 SpaceBeforeAssignmentOperators: true
281+SpaceBeforeCpp11BracedList: false
282+SpaceBeforeCtorInitializerColon: true
283+SpaceBeforeInheritanceColon: true
284 SpaceBeforeParens: ControlStatements
285+SpaceBeforeRangeBasedForLoopColon: true
286 SpaceInEmptyParentheses: false
287 SpacesBeforeTrailingComments: 1
288 SpacesInAngles: Never
289-SpacesInContainerLiterals: true
290 SpacesInCStyleCastParentheses: false
291 SpacesInParentheses: false
292 SpacesInSquareBrackets: false
293-#SpaceAfterTemplateKeyword: true
294-#SpaceBeforeInheritanceColon: true
295-
296-#SortUsingDeclarations: true
297-SortIncludes: CaseInsensitive
298-
299-ReflowComments: true
300-
301-#IncludeBlocks: Preserve
302-#IndentPPDirectives: AfterHash
303-
304+# Standard: c++20 not supported by rocThrust
305+Standard: c++17
306 StatementMacros: [
307+ # rocThrust
308 'THRUST_EXEC_CHECK_DISABLE',
309- 'THRUST_NAMESPACE_BEGIN',
310- 'THRUST_NAMESPACE_END',
311- 'THRUST_EXEC_CHECK_DISABLE',
312+ # thrust
313+ '_CCCL_EXEC_CHECK_DISABLE',
314 'CUB_NAMESPACE_BEGIN',
315 'CUB_NAMESPACE_END',
316 'THRUST_NAMESPACE_BEGIN',
317 'THRUST_NAMESPACE_END',
318 '_LIBCUDACXX_BEGIN_NAMESPACE_STD',
319 '_LIBCUDACXX_END_NAMESPACE_STD',
320+ '_LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION',
321+ '_LIBCUDACXX_END_NAMESPACE_STD_NOVERSION',
322+ '_LIBCUDACXX_BEGIN_NAMESPACE_RANGES',
323+ '_LIBCUDACXX_END_NAMESPACE_RANGES',
324+ '_LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI',
325+ '_LIBCUDACXX_END_NAMESPACE_RANGES_ABI',
326+ '_LIBCUDACXX_BEGIN_NAMESPACE_CPO',
327+ '_LIBCUDACXX_END_NAMESPACE_CPO',
328+ '_LIBCUDACXX_BEGIN_NAMESPACE_VIEWS',
329+ '_LIBCUDACXX_END_NAMESPACE_VIEWS',
330 ]
331 TabWidth: 2
332 UseTab: Never
333----
334diff --git a/.githooks/install b/.githooks/install
335index a98d897..31506e6 100755
336--- a/.githooks/install
337+++ b/.githooks/install
338@@ -3,6 +3,6 @@
339 cd "$(git rev-parse --git-dir)"
340 cd hooks
341
342-echo "Installing hooks..."
343+echo "Installing hooks..."
344 ln -s ../../.githooks/pre-commit pre-commit
345 echo "Done!"
346diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
347index 809934d..430fdb8 100755
348--- a/.github/CODEOWNERS
349+++ b/.github/CODEOWNERS
350@@ -1,4 +1,4 @@
351-* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13
352+* @stanleytsang-amd @umfranzw @RobsonRLemos
353 # Documentation files
354 docs/ @ROCm/rocm-documentation
355 *.md @ROCm/rocm-documentation
356diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
357index 27245ad..95422e6 100644
358--- a/.gitlab-ci.yml
359+++ b/.gitlab-ci.yml
360@@ -1,5 +1,5 @@
361 # ########################################################################
362-# Copyright 2019-2024 Advanced Micro Devices, Inc.
363+# Copyright 2019-2025 Advanced Micro Devices, Inc.
364 # ########################################################################
365
366 include:
367@@ -9,6 +9,7 @@ include:
368 - /defaults.yaml
369 - /deps-cmake.yaml
370 - /deps-docs.yaml
371+ - /deps-format.yaml
372 - /deps-rocm.yaml
373 - /deps-windows.yaml
374 - /deps-nvcc.yaml
375@@ -23,12 +24,20 @@ stages:
376 - test # Tests if unit tests are passing (CTest)
377 - benchmark # Runs the non-internal benchmarks (Google Benchmark)
378
379+workflow:
380+ rules:
381+ - if: $CI_MERGE_REQUEST_LABELS !~ /CI Skip/
382+
383 variables:
384 # Helper variables
385 PACKAGE_DIR: $BUILD_DIR/package
386 ROCPRIM_GIT_BRANCH: develop_stream
387 ROCPRIM_DIR: ${CI_PROJECT_DIR}/rocPRIM
388
389+clang-format:
390+ extends:
391+ - .lint:clang-format
392+
393 copyright-date:
394 extends:
395 - .deps:rocm
396@@ -74,7 +83,6 @@ copyright-date:
397 -D CMAKE_CXX_COMPILER=hipcc
398 -D CMAKE_BUILD_TYPE=Release
399 -D BUILD_TEST=OFF
400- -D BUILD_HIPSTDPAR_TEST=OFF
401 -D BUILD_EXAMPLE=OFF
402 -D ROCM_DEP_ROCMCORE=OFF
403 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
404@@ -116,7 +124,6 @@ copyright-date:
405 -D CMAKE_BUILD_TYPE=$BUILD_TYPE
406 -D BUILD_$BUILD_TARGET=ON
407 -D GPU_TARGETS=$GPU_TARGETS
408- -D AMDGPU_TEST_TARGETS=$GPU_TARGETS
409 -D RNG_SEED_COUNT=$rng_seed_count
410 -D PRNG_SEEDS=$prng_seeds
411 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
412@@ -128,7 +135,7 @@ copyright-date:
413 - cmake --build $CI_PROJECT_DIR/build
414 artifacts:
415 paths:
416- - $CI_PROJECT_DIR/build/benchmarks/*
417+ - $CI_PROJECT_DIR/build/benchmark/*
418 - $CI_PROJECT_DIR/build/test/*
419 - $CI_PROJECT_DIR/build/testing/*
420 - $CI_PROJECT_DIR/build/deps/*
421@@ -137,7 +144,7 @@ copyright-date:
422 - $CI_PROJECT_DIR/build/.ninja_log
423 exclude:
424 - $CI_PROJECT_DIR/build/**/*.o
425- expire_in: 2 weeks
426+ expire_in: 1 day
427
428 build:cmake-latest:
429 stage: build
430@@ -147,8 +154,8 @@ build:cmake-latest:
431 parallel:
432 matrix:
433 - BUILD_TYPE: Release
434- BUILD_TARGET: [BENCHMARKS, TEST, EXAMPLES]
435- BUILD_VERSION: [14, 17]
436+ BUILD_TARGET: [BENCHMARK, TEST, EXAMPLE]
437+ BUILD_VERSION: 17
438
439 build:cmake-minimum:
440 stage: build
441@@ -158,8 +165,8 @@ build:cmake-minimum:
442 parallel:
443 matrix:
444 - BUILD_TYPE: Release
445- BUILD_TARGET: [BENCHMARKS, TEST, EXAMPLES]
446- BUILD_VERSION: 14
447+ BUILD_TARGET: [BENCHMARK, TEST, EXAMPLE]
448+ BUILD_VERSION: 17
449
450 build:package:
451 stage: build
452@@ -183,7 +190,7 @@ build:package:
453 paths:
454 - $PACKAGE_DIR/rocthrust*.deb
455 - $PACKAGE_DIR/rocthrust*.zip
456- expire_in: 2 weeks
457+ expire_in: 1 day
458
459 build:windows:
460 stage: build
461@@ -219,19 +226,20 @@ build:windows:
462 -D CMAKE_BUILD_TYPE=Release
463 -D GPU_TARGETS=$GPU_TARGET
464 -D BUILD_TEST=ON
465- -D BUILD_EXAMPLES=OFF
466- -D BUILD_BENCHMARKS=OFF
467+ -D BUILD_EXAMPLE=OFF
468+ -D BUILD_BENCHMARK=OFF
469+ -D CMAKE_TOOLCHAIN_FILE=$CI_PROJECT_DIR/toolchain-windows.cmake
470 -D CMAKE_CXX_FLAGS=-Wno-deprecated-declarations
471 -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe"
472 -D CMAKE_INSTALL_PREFIX:PATH="$CI_PROJECT_DIR/build/install"
473- -D CMAKE_CXX_STANDARD=14
474+ -D CMAKE_CXX_STANDARD=17
475 -D CMAKE_PREFIX_PATH:PATH="$ROCPRIM_DIR/build/install;${env:HIP_PATH}" *>&1
476 - \& cmake --build "$CI_PROJECT_DIR/build" *>&1
477 artifacts:
478 paths:
479 - $CI_PROJECT_DIR/build/
480 - $ROCPRIM_DIR/build/install
481- expire_in: 2 weeks
482+ expire_in: 1 day
483
484 test:package:
485 stage: test
486@@ -268,20 +276,25 @@ test:doc:
487 extends:
488 - .build:docs
489 - .rules:test
490+ artifacts:
491+ paths:
492+ - $DOCS_DIR/_build/html/
493+ expire_in: 2 weeks
494
495-test:
496+.test:rocm:
497 stage: test
498+ tags:
499+ - rocm
500+ - $GPU
501 extends:
502 - .cmake-minimum
503- - .rules:test
504- - .gpus:rocm
505 needs:
506 - job: build:cmake-minimum
507 parallel:
508 matrix:
509 - BUILD_TYPE: Release
510 BUILD_TARGET: TEST
511- BUILD_VERSION: 14
512+ BUILD_VERSION: 17
513 script:
514 - cd $CI_PROJECT_DIR/build
515 - cmake
516@@ -294,12 +307,80 @@ test:
517 - HSA_ENABLE_SDMA=0 ctest
518 --output-on-failure
519 --repeat-until-fail 2
520- --tests-regex $GPU_TARGET
521 --resource-spec-file ./resources.json
522 --parallel $PARALLEL_JOBS
523
524+test:rocm-any-gpu:
525+ variables:
526+ GPU: ""
527+ PARALLEL_JOBS: 1
528+ extends:
529+ - .test:rocm
530+ rules:
531+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/
532+
533+test:rocm-label-arch:
534+ extends:
535+ - .gpus:rocm
536+ - .test:rocm
537+ - .rules:arch-labels
538+
539+test:rocm-all-gpus:
540+ variables:
541+ SHOULD_BE_UNDRAFTED: "true"
542+ extends:
543+ - .gpus:rocm
544+ - .test:rocm
545+ - .rules:test
546+
547+test-bitwise-repro-generate:
548+ stage: test
549+ extends:
550+ - .cmake-minimum
551+ - .rules:manual
552+ - .gpus:rocm
553+ needs:
554+ - job: build:cmake-minimum
555+ parallel:
556+ matrix:
557+ - BUILD_TYPE: Release
558+ BUILD_TARGET: TEST
559+ BUILD_VERSION: 17
560+ cache:
561+ key: database
562+ paths:
563+ - $CI_PROJECT_DIR/bitwise.db
564+ policy: push
565+ script:
566+ - cd $CI_PROJECT_DIR/build/test/
567+ - ROCTHRUST_BWR_PATH=$CI_PROJECT_DIR/bitwise.db ROCTHRUST_BWR_GENERATE=1 ./reproducibility.hip
568+
569+test-bitwise-repro:
570+ stage: test
571+ extends:
572+ - .cmake-minimum
573+ - .rules:test
574+ - .gpus:rocm
575+ needs:
576+ - job: build:cmake-minimum
577+ parallel:
578+ matrix:
579+ - BUILD_TYPE: Release
580+ BUILD_TARGET: TEST
581+ BUILD_VERSION: 17
582+ cache:
583+ key: database
584+ paths:
585+ - $CI_PROJECT_DIR/bitwise.db
586+ policy: pull
587+ script:
588+ - cd $CI_PROJECT_DIR/build/test/
589+ - ROCTHRUST_BWR_PATH=$CI_PROJECT_DIR/bitwise.db ./reproducibility.hip
590+
591 .rocm-windows:test:
592 extends:
593+ - .deps:rocm-windows
594+ - .deps:visual-studio-devshell
595 - .gpus:rocm-windows
596 - .rules:test
597 stage: test
598@@ -358,19 +439,21 @@ build:cuda-and-omp:
599 tags:
600 - build
601 variables:
602- CCCL_GIT_BRANCH: v2.5.0
603+ CCCL_GIT_BRANCH: v2.8.5
604 CCCL_DIR: ${CI_PROJECT_DIR}/cccl
605 needs: []
606 script:
607 - git clone -b $CCCL_GIT_BRANCH https://github.com/NVIDIA/cccl.git $CCCL_DIR
608 # Replace CCCL Thrust headers with rocThrust headers
609- - rm -R $CCCL_DIR/thrust/thrust
610+ - rm -r $CCCL_DIR/thrust/thrust
611 - cp -r $CI_PROJECT_DIR/thrust $CCCL_DIR/thrust
612+ - sed -i '/include(cmake\/ThrustAddSubdir.cmake)/i \ include(../cmake/CCCLAddSubdirHelper.cmake)' $CCCL_DIR/thrust/CMakeLists.txt
613 # Build tests and examples from CCCL Thrust
614 - cmake
615 -G Ninja
616 -D CMAKE_BUILD_TYPE=Release
617 -D CMAKE_CUDA_ARCHITECTURES="$GPU_TARGETS"
618+ -D CCCL_SOURCE_DIR="$CCCL_DIR"
619 -D THRUST_ENABLE_TESTING=ON
620 -D THRUST_ENABLE_EXAMPLES=ON
621 -D THRUST_ENABLE_BENCHMARKS=OFF
622@@ -379,6 +462,7 @@ build:cuda-and-omp:
623 -D THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON
624 -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
625 -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
626+ -D THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14=FALSE
627 -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda
628 -B $CI_PROJECT_DIR/build
629 -S $CCCL_DIR/thrust
630@@ -402,16 +486,17 @@ build:cuda-and-omp:
631 - $CCCL_DIR/thrust/cmake/ThrustRunTest.cmake
632 - $CCCL_DIR/thrust/cmake/ThrustRunExample.cmake
633 - $CI_PROJECT_DIR/build/.ninja_log
634- expire_in: 1 week
635+ expire_in: 1 day
636
637-test:cuda-and-omp:
638+.test:cuda-and-omp:
639 stage: test
640+ tags:
641+ - nvcc
642+ - $GPU
643 needs:
644 - build:cuda-and-omp
645 extends:
646 - .nvcc
647- - .gpus:nvcc
648- - .rules:test
649 before_script:
650 # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script
651 - !reference [.nvcc, before_script]
652@@ -420,6 +505,23 @@ test:cuda-and-omp:
653 # These tests are executed on the build stage because they require sources
654 - ctest --output-on-failure --exclude-regex "thrust.example.cmake.add_subdir|thrust.test.cmake.check_source_files"
655
656+test:cuda-and-omp-any-gpu:
657+ variables:
658+ GPU: ""
659+ PARALLEL_JOBS: 1
660+ extends:
661+ - .test:cuda-and-omp
662+ rules:
663+ - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/
664+
665+test:cuda-and-omp-all-gpus:
666+ variables:
667+ SHOULD_BE_UNDRAFTED: "true"
668+ extends:
669+ - .gpus:nvcc
670+ - .test:cuda-and-omp
671+ - .rules:test
672+
673 .benchmark-base:
674 stage: benchmark
675 extends:
676@@ -430,7 +532,12 @@ test:cuda-and-omp:
677
678 benchmark:
679 needs:
680- - build:cmake-minimum
681+ - job: build:cmake-minimum
682+ parallel:
683+ matrix:
684+ - BUILD_TYPE: Release
685+ BUILD_TARGET: BENCHMARK
686+ BUILD_VERSION: 17
687 extends:
688 - .cmake-minimum
689 - .gpus:rocm
690@@ -445,7 +552,7 @@ benchmark:
691 - mkdir -p "${BENCHMARK_RESULT_DIR}"
692 - python3
693 .gitlab/run_benchmarks.py
694- --benchmark_dir "${CI_PROJECT_DIR}/build/benchmarks"
695+ --benchmark_dir "${CI_PROJECT_DIR}/build/benchmark"
696 --benchmark_gpu_architecture "${GPU_TARGET}"
697 --benchmark_output_dir "${BENCHMARK_RESULT_DIR}"
698 --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}"
699@@ -453,4 +560,4 @@ benchmark:
700 artifacts:
701 paths:
702 - ${BENCHMARK_RESULT_DIR}
703- expire_in: 1 week
704+ expire_in: 1 day
705diff --git a/.gitlab/run_benchmarks.py b/.gitlab/run_benchmarks.py
706index a3fe982..6a94c25 100755
707--- a/.gitlab/run_benchmarks.py
708+++ b/.gitlab/run_benchmarks.py
709@@ -66,7 +66,7 @@ def run_benchmarks(benchmark_context):
710 subprocess.check_call(args)
711 end_time = time.time()
712 duration = end_time - start_time
713-
714+
715 print(f'Benchmark {benchmark_name} took {duration:.3f} seconds to run', file=sys.stderr, flush=True)
716 except subprocess.CalledProcessError as error:
717 print(f'Could not run benchmark at {benchmark_path}. Error: "{error}"', file=sys.stderr, flush=True)
718diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
719deleted file mode 100644
720index 27b0a24..0000000
721--- a/.jenkins/common.groovy
722+++ /dev/null
723@@ -1,78 +0,0 @@
724-// This file is for internal AMD use.
725-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
726-
727-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true)
728-{
729- project.paths.construct_build_prefix()
730-
731- String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
732- String buildTypeDir = debug ? 'debug' : 'release'
733- String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
734- //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
735- String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
736-
737- def getDependenciesCommand = ""
738- if (project.installLibraryDependenciesFromCI)
739- {
740- project.libraryDependencies.each
741- {
742- libraryName ->
743- getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, 'develop', sameOrg)
744- }
745- }
746-
747- def command = """#!/usr/bin/env bash
748- set -x
749- ${getDependenciesCommand}
750- cd ${project.paths.project_build_prefix}
751- mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
752- ${auxiliary.gfxTargetParser()}
753- ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
754- make -j\$(nproc)
755- """
756-
757- platform.runCommand(this, command)
758-}
759-
760-def runTestCommand (platform, project)
761-{
762- String sudo = auxiliary.sudo(platform.jenkinsLabel)
763-
764- def testCommand = "ctest --output-on-failure"
765- def hmmTestCommand = ''
766- // Note: temporarily disable scan tests below while waiting for a compiler fix
767- def excludeRegex = /(reduce_by_key.hip|scan)/
768- testCommandExclude = "--exclude-regex \"${excludeRegex}\""
769-
770- if (platform.jenkinsLabel.contains('gfx90a'))
771- {
772- hmmTestCommand = ""
773- // temporarily disable hmm testing
774- // """
775- // export HSA_XNACK=1
776- // export ROCTHRUST_USE_HMM=1
777- // ${testCommand} ${testCommandExclude}
778- // """
779- }
780-
781- def command = """
782- #!/usr/bin/env bash
783- set -x
784- cd ${project.paths.project_build_prefix}
785- cd ${project.testDirectory}
786- ${testCommand} ${testCommandExclude}
787- ${hmmTestCommand}
788- """
789-
790- platform.runCommand(this, command)
791-}
792-
793-def runPackageCommand(platform, project)
794-{
795- def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
796-
797- platform.runCommand(this, packageHelper[0])
798- platform.archiveArtifacts(this, packageHelper[1])
799-}
800-
801-return this
802diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
803deleted file mode 100644
804index df19050..0000000
805--- a/.jenkins/precheckin.groovy
806+++ /dev/null
807@@ -1,93 +0,0 @@
808-#!/usr/bin/env groovy
809-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
810-@Library('rocJenkins@pong') _
811-
812-// This file is for internal AMD use.
813-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
814-
815-import com.amd.project.*
816-import com.amd.docker.*
817-import java.nio.file.Path;
818-
819-def runCI =
820-{
821- nodeDetails, jobName->
822-
823- def prj = new rocProject('rocThrust', 'precheckin')
824-
825- prj.defaults.ccache = true
826- prj.timeout.compile = 420
827- prj.libraryDependencies = ["rocPRIM"]
828-
829- // Define test architectures, optional rocm version argument is available
830- def nodes = new dockerNodes(nodeDetails, jobName, prj)
831-
832- boolean formatCheck = false
833-
834- def commonGroovy
835-
836- def compileCommand =
837- {
838- platform, project->
839-
840- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
841- commonGroovy.runCompileCommand(platform, project, jobName)
842- }
843-
844- def testCommand =
845- {
846- platform, project->
847-
848- commonGroovy.runTestCommand(platform, project)
849- }
850-
851- def packageCommand =
852- {
853- platform, project->
854-
855- commonGroovy.runPackageCommand(platform, project)
856- }
857-
858- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
859-}
860-
861-ci: {
862- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
863-
864- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
865- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
866- "rocm-docker":[]]
867- propertyList = auxiliary.appendPropertyList(propertyList)
868-
869- Set standardJobNameSet = ["compute-rocm-dkms-no-npi", "compute-rocm-dkms-no-npi-hipclang", "rocm-docker"]
870-
871- def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
872- "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
873- "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])]
874- jobNameList = auxiliary.appendJobNameList(jobNameList)
875-
876- auxiliary.registerDependencyBranchParameter(["rocPRIM"])
877-
878- propertyList.each
879- {
880- jobName, property->
881- if (urlJobName == jobName)
882- properties(auxiliary.addCommonProperties(property))
883- }
884-
885- Set seenJobNames = []
886- jobNameList.each
887- {
888- jobName, nodeDetails->
889- seenJobNames.add(jobName)
890- if (urlJobName == jobName)
891- runCI(nodeDetails, jobName)
892- }
893-
894- // For url job names that are outside of the standardJobNameSet i.e. compute-rocm-dkms-no-npi-1901
895- if(!seenJobNames.contains(urlJobName))
896- {
897- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
898- runCI([ubuntu16:['gfx906']], urlJobName)
899- }
900-}
901diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
902deleted file mode 100644
903index 7d2cf06..0000000
904--- a/.jenkins/staticanalysis.groovy
905+++ /dev/null
906@@ -1,46 +0,0 @@
907-#!/usr/bin/env groovy
908-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
909-@Library('rocJenkins@pong') _
910-
911-// This is file for internal AMD use.
912-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
913-
914-import com.amd.project.*
915-import com.amd.docker.*
916-import java.nio.file.Path
917-
918-def runCompileCommand(platform, project, jobName, boolean debug=false)
919-{
920- project.paths.construct_build_prefix()
921-}
922-
923-def runCI =
924-{
925- nodeDetails, jobName->
926-
927- def prj = new rocProject('rocThrust', 'StaticAnalysis')
928-
929- // Define test architectures, optional rocm version argument is available
930- def nodes = new dockerNodes(nodeDetails, jobName, prj)
931-
932- boolean formatCheck = false
933- boolean staticAnalysis = true
934-
935- def compileCommand =
936- {
937- platform, project->
938-
939- runCompileCommand(platform, project, jobName, false)
940- }
941-
942- buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
943-}
944-
945-ci: {
946- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
947-
948- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
949- stage(urlJobName) {
950- runCI([ubuntu20:['any']], urlJobName)
951- }
952-}
953diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy
954deleted file mode 100644
955index 08f9e3c..0000000
956--- a/.jenkins/staticlibrary.groovy
957+++ /dev/null
958@@ -1,88 +0,0 @@
959-#!/usr/bin/env groovy
960-@Library('rocJenkins@pong') _
961-import com.amd.project.*
962-import com.amd.docker.*
963-import java.nio.file.Path;
964-
965-def runCI =
966-{
967- nodeDetails, jobName->
968-
969- def prj = new rocProject('rocThrust', 'Static Library PreCheckin')
970-
971- prj.defaults.ccache = true
972- prj.timeout.compile = 420
973- prj.libraryDependencies = ["rocPRIM"]
974-
975- def nodes = new dockerNodes(nodeDetails, jobName, prj)
976-
977- def commonGroovy
978-
979- boolean formatCheck = false
980-
981- def compileCommand =
982- {
983- platform, project->
984-
985- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
986- commonGroovy.runCompileCommand(platform, project, jobName, false, true)
987- }
988-
989-
990- def testCommand =
991- {
992- platform, project->
993-
994- commonGroovy.runTestCommand(platform, project)
995- }
996-
997- def packageCommand =
998- {
999- platform, project->
1000-
1001- commonGroovy.runPackageCommand(platform, project)
1002- }
1003-
1004- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
1005-}
1006-
1007-ci: {
1008- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
1009-
1010- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
1011- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
1012- "rocm-docker":[]]
1013- propertyList = auxiliary.appendPropertyList(propertyList)
1014-
1015- def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
1016- "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
1017- "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])]
1018- jobNameList = auxiliary.appendJobNameList(jobNameList)
1019-
1020- auxiliary.registerDependencyBranchParameter(["rocPRIM"])
1021-
1022- propertyList.each
1023- {
1024- jobName, property->
1025- if (urlJobName == jobName)
1026- properties(auxiliary.addCommonProperties(property))
1027- }
1028-
1029- jobNameList.each
1030- {
1031- jobName, nodeDetails->
1032- if (urlJobName == jobName)
1033- stage(jobName) {
1034- runCI(nodeDetails, jobName)
1035- }
1036- }
1037-
1038- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
1039- if(!jobNameList.keySet().contains(urlJobName))
1040- {
1041- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
1042- stage(urlJobName) {
1043- runCI([ubuntu16:['gfx906']], urlJobName)
1044- }
1045- }
1046-}
1047diff --git a/CHANGELOG.md b/CHANGELOG.md
1048index 35d3225..a5e4ced 100644
1049--- a/CHANGELOG.md
1050+++ b/CHANGELOG.md
1051@@ -3,6 +3,79 @@
1052 Documentation for rocThrust available at
1053 [https://rocm.docs.amd.com/projects/rocThrust/en/latest/](https://rocm.docs.amd.com/projects/rocThrust/en/latest/).
1054
1055+## rocThrust 4.1.0 for ROCm 7.1
1056+
1057+### Added
1058+
1059+* Added a new CMake option `-DSQLITE_USE_SYSTEM_PACKAGE` to allow SQLite to be provided by the system.
1060+* Introduced `libhipcxx` as a soft dependency. When `libhipcxx` can be included, rocThrust may use structs and methods defined in `libhipcxx`. This allows for more complete behaviour parity with CCCL and mirrors CCCL Thrust's own dependency on `libcudacxx`.
1061+* Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from `ROCm` libraries provided by the system.
1062+
1063+### Known Issues
1064+* `event` test is failing on CI and local runs on MI300, MI250 and MI210.
1065+
1066+* rocThrust, as well as its dependencies rocPRIM and rocRAND have been moved into the new rocm-libraries "monorepo" repository (https://github.com/ROCm/rocm-libraries). This repository contains a number of ROCm libraries that are frequently used together.
1067+ * The repository migration requires a few changes to the way that rocThrust's ROCm library dependencies are fetched.
1068+ * There are new cmake options for obtaining rocPRIM and (optionally, if BUILD_BENCHMARKS is enabled) rocRAND.
1069+ * cmake build options `ROCPRIM_FETCH_METHOD` and `ROCRAND_FETCH_METHOD` may be set to one of the following:
1070+ * `PACKAGE` - (default) searches for a preinstalled packaged version of the dependency. If it is not found, the build will fall back using option `DOWNLOAD`, described below.
1071+ * `DOWNLOAD` - downloads the dependency from the rocm-libraries repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time).
1072+ * `MONOREPO` - this option is intended to be used if you are building rocThrust from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains the dependencies rocPRIM and rocRAND). When selected, the build will try to find the dependency in the local repository tree. If it cannot be found, the build will attempt to add it to the local tree using a sparse-checkout. If that also fails, it will fall back to using the `DOWNLOAD` option.
1073+
1074+### Changed
1075+
1076+* The previously hidden cmake build option `FORCE_DEPENDENCIES_DOWNLOAD` has been unhidden and renamed `EXTERNAL_DEPS_FORCE_DOWNLOAD` to differentiate it from the new rocPRIM and rocRAND dependency options described above. Its behaviour remains the same - it forces non-ROCm dependencies (Google Benchmark, Google Test, and SQLite) to be downloaded instead of searching for existing installed packages. This option defaults to `OFF`.
1077+
1078+### Removed
1079+ * The previous dependency-related build options `DOWNLOAD_ROCPRIM` and `DOWNLOAD_ROCRAND` have been removed. Please use `ROCPRIM_FETCH_METHOD=DOWNLOAD` and `ROCRAND_FETCH_METHOD=DOWNLOAD` instead.
1080+
1081+## rocThrust 4.0.0 for ROCm 7.0
1082+
1083+### Changed
1084+
1085+* Updated the required version of Google Benchmark from 1.8.0 to 1.9.0.
1086+* Drop `c++14` support for rocthrust.
1087+* Renamed `cpp14_required.h` to `cpp_version_check.h`
1088+* Refactored `test_header.hpp` into separate modules `test_param_fixtures.hpp`, `test_real_assertions.hpp`, `test_imag_assertions.hpp`, and `test_utils.hpp`.
1089+ * This is done to prevent unit tests from having access to modules that they're not testing. This will improve the accuracy of code coverage reports.
1090+
1091+### Added
1092+* Additional unit tests for:
1093+ * binary_search
1094+ * complex
1095+ * c99math
1096+ * catrig
1097+ * ccosh
1098+ * cexp
1099+ * clog
1100+ * csin
1101+ * csqrt
1102+ * ctan
1103+* Added `test_param_fixtures.hpp` to store all the parameters for typed test suites.
1104+* Added `test_real_assertions.hpp` to handle unit test assertions for real numbers.
1105+* Added `test_imag_assertions.hpp` to handle unit test assertions for imaginary numbers.
1106+* `clang++` is now used to compile google benchmarks on Windows.
1107+* Added gfx950 support.
1108+* Merged changes from upstream CCCL/thrust 2.6.0
1109+
1110+### Removed
1111+
1112+* `device_malloc_allocator.h` has been removed. This header file was unused and should not impact users.
1113+* Removed C++14 support, only C++17 is supported.
1114+* `test_header.hpp` has been removed. The `HIP_CHECK` function, as well as the `test` and `inter_run_bwr` namespaces, have been moved to `test_utils.hpp`.
1115+* `test_assertions.hpp` has been split into `test_real_assertions.hpp` and `test_imag_assertions.hpp`.
1116+
1117+### Upcoming changes
1118+
1119+* `thrust::device_malloc_allocator` is deprecated as of this version. It will be removed in an upcoming version.
1120+
1121+### Resolved issues
1122+
1123+* Fixed an issue with internal calls to unqualified `distance()`, which would be ambiguous because another implementation was also visible through ADL.
1124+
1125+### Known Issues
1126+* The order of the values being compared by thrust::exclusive_scan_by_key and thrust::inclusive_scan_by_key can change between runs when integers are being compared. This can cause incorrect output when a non-commutative operator such as division is being used.
1127+
1128 ## rocThrust 3.3.0 for ROCm 6.4
1129
1130 ### Added
1131@@ -26,11 +99,10 @@ Documentation for rocThrust available at
1132 * `--test|-t` is no longer a required flag for `rtest.py`. Instead, the user can use either `--emulation|-e` or `--test|-t`, but not both.
1133 * Split the contents of HIPSTDPAR's forwarding header into several implementation headers.
1134 * Fixed `copy_if` to work with large data types (512 bytes)
1135-* Updated the required version of Google Benchmark from 1.8.0 to 1.9.0.
1136
1137 ### Known Issues
1138-* `thrust::inclusive_scan_by_key` might produce incorrect results when it's used with -O2 or -O3 optimization.
1139- - The error is caused by a recent compiler change. There is a fix available that will be released at a later date.
1140+* `thrust::inclusive_scan_by_key` might produce incorrect results when it's used with -O2 or -O3 optimization.
1141+ - The error is caused by a recent compiler change. There is a fix available that will be released at a later date.
1142
1143 ## rocThrust 3.2.0 for ROCm 6.3
1144
1145@@ -109,7 +181,6 @@ Documentation for rocThrust available at
1146 * Updated `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core).
1147 * Removed references to and workarounds for deprecated hcc
1148
1149-
1150 ## rocThrust 2.17.0 for ROCm 5.5
1151
1152 ### Additions
1153diff --git a/CMakeLists.txt b/CMakeLists.txt
1154index 887c1e5..5413549 100644
1155--- a/CMakeLists.txt
1156+++ b/CMakeLists.txt
1157@@ -1,9 +1,22 @@
1158 # ########################################################################
1159-# Copyright 2019-2024 Advanced Micro Devices, Inc.
1160+# Copyright 2019-2025 Advanced Micro Devices, Inc.
1161 # ########################################################################
1162
1163 cmake_minimum_required(VERSION 3.10.2 FATAL_ERROR)
1164
1165+# --------------------------------------
1166+# Update these variables at release time
1167+#
1168+# Set the library version
1169+set(VERSION_STRING "4.1.0")
1170+# Set the minimum required rocPRIM version
1171+set(MIN_ROCPRIM_PACKAGE_VERSION "4.0.0" CACHE STRING "Minimum version of rocPRIM to search for when ROCPRIM_FETCH_METHOD is set to PACKAGE.")
1172+# Set the minimum required rocRAND version
1173+set(MIN_ROCRAND_PACKAGE_VERSION "4.0.0" CACHE STRING "Minimum version of rocRAND to search for when ROCRAND_FETCH_METHOD is set to PACKAGE.")
1174+# Set download branch for dependencies rocPRIM and rocRAND
1175+set(ROCM_DEP_RELEASE_BRANCH "release/rocm-rel-7.1" CACHE STRING "Download branch for ROCm dependencies")
1176+# --------------------------------------
1177+
1178 # Install prefix
1179 if(WIN32)
1180 set(CMAKE_INSTALL_PREFIX ${PROJECT_BINARY_DIR}/package CACHE PATH "Install path prefix, prepended onto install directories")
1181@@ -22,6 +35,12 @@ endif()
1182 set(CMAKE_CXX_STANDARD_REQUIRED ON)
1183 set(CMAKE_CXX_EXTENSIONS OFF)
1184
1185+if (CMAKE_CXX_STANDARD EQUAL 14)
1186+ message(WARNING "C++14 will be deprecated in the next major release")
1187+elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
1188+ message(FATAL_ERROR "Only C++14 and C++17 are supported")
1189+endif()
1190+
1191 # Set HIP flags
1192 set(CMAKE_HIP_STANDARD 17)
1193 set(CMAKE_HIP_STANDARD_REQUIRED ON)
1194@@ -34,20 +53,23 @@ include(CMakeDependentOption)
1195 # Disable -Werror
1196 option(DISABLE_WERROR "Disable building with Werror" ON)
1197 option(BUILD_TEST "Build tests" OFF)
1198+option(CODE_COVERAGE "Enable code coverage" OFF)
1199 option(BUILD_HIPSTDPAR_TEST "Build hipstdpar tests" OFF)
1200 option(BUILD_HIPSTDPAR_TEST_WITH_TBB "Build hipstdpar tests with TBB" OFF)
1201-option(BUILD_EXAMPLES "Build examples" OFF)
1202-option(BUILD_BENCHMARKS "Build benchmarks" OFF)
1203-option(DOWNLOAD_ROCPRIM "Download rocPRIM and do not search for rocPRIM package" OFF)
1204-option(DOWNLOAD_ROCRAND "Download rocRAND and do not search for rocRAND package" OFF)
1205+option(BUILD_EXAMPLE "Build example" OFF)
1206+option(BUILD_BENCHMARK "Build benchmark" OFF)
1207 option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
1208 cmake_dependent_option(ENABLE_UPSTREAM_TESTS "Enable upstream (thrust) tests" ON BUILD_TEST OFF)
1209-#Set the header wrapper OFF by default.
1210-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
1211+cmake_dependent_option(USE_SYSTEM_LIB "Use existing system ROCm library installation when building tests" OFF BUILD_TEST OFF)
1212+option(EXTERNAL_DEPS_FORCE_DOWNLOAD "Force download of non-ROCm dependencies (eg. Google Test, Google Benchmark)" OFF)
1213
1214 check_language(HIP)
1215 cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)
1216
1217+# Allow the user to optionally select offset type dispatch to fixed 32 or 64 bit types
1218+set(THRUST_DISPATCH_TYPE "Dynamic" CACHE STRING "Select Thrust offset type dispatch." FORCE)
1219+set_property(CACHE THRUST_DISPATCH_TYPE PROPERTY STRINGS "Dynamic" "Force32bit" "Force64bit")
1220+
1221 #Adding CMAKE_PREFIX_PATH
1222 list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm ${ROCM_PATH} )
1223
1224@@ -59,7 +81,7 @@ list(APPEND CMAKE_MODULE_PATH
1225 )
1226
1227 # Set a default build type if none was specified
1228-if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
1229+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES AND NOT CODE_COVERAGE)
1230 message(STATUS "Setting build type to 'Release' as none was specified.")
1231 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
1232 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
1233@@ -94,17 +116,37 @@ else()
1234 if(BUILD_ADDRESS_SANITIZER)
1235 # ASAN builds require xnack
1236 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
1237- TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+"
1238+ TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+"
1239 )
1240 else()
1241 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
1242- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
1243+ TARGETS "gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
1244 )
1245 endif()
1246 set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
1247 endif()
1248 endif()
1249
1250+# Set up options for obtaining dependencies rocPRIM and rocRAND.
1251+# PACKAGE: Search for an install package that contains the dependency.
1252+# MONOREPO: Assume this is a monorepo checkout and search for the dependency in the directory at ../../projects/.
1253+# DOWNLOAD: Download the dependency from the monorepo.
1254+set(FETCH_METHOD_OPTIONS "PACKAGE" "MONOREPO" "DOWNLOAD")
1255+
1256+set(ROCPRIM_FETCH_METHOD "PACKAGE" CACHE STRING "How to obtain the rocPRIM dependency")
1257+set(ROCRAND_FETCH_METHOD "PACKAGE" CACHE STRING "How to obtain the rocRAND dependency")
1258+
1259+# This function checks to see if the fetch method variable it's passed is defined, and contains a valid value.
1260+# If it does not contain a valid value, it issues a fatal failure with an error message.
1261+function(check_fetch_method method)
1262+ if (DEFINED ${method} AND NOT ${${method}} IN_LIST FETCH_METHOD_OPTIONS)
1263+ message(FATAL_ERROR "Unrecognized ${method}: \"${${method}}\". Valid options are: ${FETCH_METHOD_OPTIONS}.")
1264+ endif()
1265+endfunction()
1266+
1267+check_fetch_method(ROCPRIM_FETCH_METHOD)
1268+check_fetch_method(ROCRAND_FETCH_METHOD)
1269+
1270 # Get dependencies
1271 include(cmake/Dependencies.cmake)
1272
1273@@ -130,16 +172,13 @@ if (NOT THRUST_HOST_SYSTEM IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
1274 )
1275 endif ()
1276
1277-if(DISABLE_WERROR)
1278- add_compile_options(-Wall -Wextra)
1279-else()
1280- add_compile_options(-Wall -Wextra -Werror)
1281+set(COMPILE_OPTIONS -Wall -Wextra)
1282+if(NOT DISABLE_WERROR)
1283+ list(APPEND COMPILE_OPTIONS -Werror)
1284 endif()
1285
1286-if (CMAKE_CXX_STANDARD EQUAL 14)
1287- message(WARNING "C++14 will be deprecated in the next major release")
1288-elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
1289- message(FATAL_ERROR "Only C++14 and C++17 are supported")
1290+if(NOT CMAKE_CXX_STANDARD EQUAL 17)
1291+ message(FATAL_ERROR "Only C++17 is supported")
1292 endif()
1293
1294 if (WIN32)
1295@@ -149,31 +188,43 @@ endif()
1296
1297 # Address Sanitizer
1298 if(BUILD_ADDRESS_SANITIZER)
1299- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan")
1300- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan")
1301+ list(APPEND COMPILE_OPTIONS -fsanitize=address -shared-libasan)
1302 add_link_options(-fuse-ld=lld)
1303 endif()
1304
1305 # Setup VERSION
1306-rocm_setup_version(VERSION "3.3.0")
1307-
1308-# Print configuration summary
1309-include(cmake/Summary.cmake)
1310-print_configuration_summary()
1311+rocm_setup_version(VERSION ${VERSION_STRING})
1312+math(EXPR rocthrust_VERSION_NUMBER "${rocthrust_VERSION_MAJOR} * 100000 + ${rocthrust_VERSION_MINOR} * 100 + ${rocthrust_VERSION_PATCH}")
1313
1314 # Thrust (with HIP backend)
1315 add_subdirectory(thrust)
1316
1317-if(BUILD_TEST OR BUILD_BENCHMARKS OR BUILD_HIPSTDPAR_TEST)
1318+if(BUILD_TEST OR BUILD_BENCHMARK OR BUILD_HIPSTDPAR_TEST)
1319 rocm_package_setup_component(clients)
1320 endif()
1321
1322+if(CODE_COVERAGE)
1323+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -fprofile-instr-generate -fcoverage-mapping")
1324+endif()
1325+
1326 # Tests
1327+if(BUILD_TEST AND USE_SYSTEM_LIB)
1328+ find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim")
1329+ if (${rocprim_VERSION} VERSION_LESS ${MIN_ROCPRIM_PACKAGE_VERSION})
1330+ message(WARNING "The installed rocprim version, ${rocprim_VERSION}, is less than the minimum required version ${MIN_ROCPRIM_PACKAGE_VERSION}. Building tests with USE_SYSTEM_LIB=ON may not work properly.")
1331+ endif()
1332+ find_package(rocthrust REQUIRED CONFIG PATHS "/opt/rocm/rocthrust")
1333+ if (NOT ${rocthrust_VERSION} VERSION_EQUAL ${VERSION_STRING})
1334+ message(WARNING "The installed rocthrust version, ${rocthrust_VERSION}, does not match project version ${VERSION_STRING}. Building tests with USE_SYSTEM_LIB=ON may not work properly.")
1335+ endif()
1336+endif()
1337+
1338 if(BUILD_TEST OR BUILD_HIPSTDPAR_TEST)
1339 rocm_package_setup_client_component(tests)
1340 if (ENABLE_UPSTREAM_TESTS)
1341 enable_testing()
1342 endif()
1343+
1344 # We still want the testing to be compiled to catch some errors
1345 #TODO: Get testing folder working with HIP on Windows
1346 if (NOT WIN32 AND BUILD_TEST)
1347@@ -184,28 +235,15 @@ if(BUILD_TEST OR BUILD_HIPSTDPAR_TEST)
1348 endif()
1349
1350 # Examples
1351-if(BUILD_EXAMPLES)
1352+if(BUILD_EXAMPLE)
1353 add_subdirectory(examples)
1354 endif()
1355
1356-# Benchmarks
1357-if(BUILD_BENCHMARKS)
1358- add_subdirectory(benchmarks)
1359- add_subdirectory(internal/benchmark)
1360+# Benchmark
1361+if(BUILD_BENCHMARK)
1362+ add_subdirectory(benchmark)
1363 endif()
1364
1365-#Create header wrapper for backward compatibility
1366-if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
1367- rocm_wrap_header_dir(
1368- ${CMAKE_SOURCE_DIR}/thrust
1369- PATTERNS "*.h" "*.inl" "*.cuh" "*.hpp"
1370- HEADER_LOCATION include/thrust
1371- GUARDS SYMLINK WRAPPER
1372- WRAPPER_LOCATIONS rocthrust/${CMAKE_INSTALL_INCLUDEDIR}/thrust
1373- OUTPUT_LOCATIONS rocthrust/wrapper/include/thrust
1374- )
1375-endif( )
1376-
1377 set(THRUST_OPTIONS_DEBUG ${THRUST_OPTIONS_WARNINGS})
1378 set(THRUST_OPTIONS_RELEASE ${THRUST_OPTIONS_WARNINGS})
1379
1380@@ -230,7 +268,11 @@ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PR
1381
1382 rocm_create_package(
1383 NAME rocthrust
1384- DESCRIPTION "Radeon Open Compute Thrust library"
1385+ DESCRIPTION "rocThrust is a ROCm port of the Thrust library, written in HIP"
1386 MAINTAINER "rocthrust-maintainer@amd.com"
1387 HEADER_ONLY
1388 )
1389+
1390+# Print configuration summary
1391+include(cmake/Summary.cmake)
1392+print_configuration_summary()
1393diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
1394index dfe3344..e985798 100644
1395--- a/CONTRIBUTING.md
1396+++ b/CONTRIBUTING.md
1397@@ -6,7 +6,7 @@
1398
1399 # Contributing to rocThrust #
1400
1401-We welcome contributions to rocThrust. Please follow these details to help ensure your contributions will be successfully accepted.
1402+We welcome contributions to rocThrust. Please follow these details to help ensure your contributions will be successfully accepted.
1403
1404 ## Issue Discussion ##
1405
1406@@ -97,4 +97,4 @@ During code reviews, another developer will take a look through your proposed ch
1407 needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas.
1408 When a modification request has been completed, the conversation thread about it will be marked as resolved.
1409
1410-To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
1411\ No newline at end of file
1412+To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
1413diff --git a/NOTICES.txt b/NOTICES.txt
1414index dc4f99e..b91ff6f 100644
1415--- a/NOTICES.txt
1416+++ b/NOTICES.txt
1417@@ -2,7 +2,7 @@ Notices and licenses file
1418 _________________________
1419
1420
1421-AMD copyrighted code (Apache 2.0)
1422+AMD copyrighted code (Apache 2.0)
1423 Copyright © 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
1424
1425 Licensed under the Apache License, Version 2.0 (the "License");
1426@@ -18,33 +18,44 @@ See the License for the specific language governing permissions and
1427 limitations under the License.
1428
1429
1430-crascit-downloadproject v-u (MIT)
1431+crascit-downloadproject v-u (MIT)
1432 # Distributed under the OSI-approved MIT License. See accompanying
1433 # file LICENSE or https://github.com/Crascit/DownloadProject for details.
1434
1435-Dependencies on scipy-scipy v-u (MIT)
1436-Copyright (C) 2003-2013 SciPy Developers.
1437-Modifications Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
1438-
1439-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
1440-following conditions are met:
1441-
1442-Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
1443-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer
1444-in the documentation and/or other materials provided with the distribution.
1445-Neither the name of Enthought nor the names of the SciPy Developers may be used to endorse or promote products derived from
1446-this software without specific prior written permission.
1447-
1448-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
1449-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
1450-IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
1451-OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
1452-OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
1453-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1454-POSSIBILITY OF SUCH DAMAGE.
1455-
1456-
1457-Dependencies on thrust-thrust v1.9.2 (Apache 2.0)
1458+Dependencies on scipy-scipy v-u (BSD-3-Clause)
1459+Copyright (C) 2003-2013 SciPy Developers.
1460+Modifications Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
1461+
1462+Redistribution and use in source and binary forms, with or without
1463+modification, are permitted provided that the following conditions
1464+are met:
1465+
1466+1. Redistributions of source code must retain the above copyright
1467+ notice, this list of conditions and the following disclaimer.
1468+
1469+2. Redistributions in binary form must reproduce the above
1470+ copyright notice, this list of conditions and the following
1471+ disclaimer in the documentation and/or other materials provided
1472+ with the distribution.
1473+
1474+3. Neither the name of the copyright holder nor the names of its
1475+ contributors may be used to endorse or promote products derived
1476+ from this software without specific prior written permission.
1477+
1478+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1479+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
1480+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
1481+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
1482+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
1483+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
1484+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
1485+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
1486+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1487+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1488+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1489+
1490+
1491+Dependencies on thrust-thrust v1.9.2 (Apache 2.0)
1492 Copyright 2008-2013 NVIDIA Corporation
1493 Modifications Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
1494
1495@@ -61,7 +72,7 @@ See the License for the specific language governing permissions and
1496 limitations under the License.
1497
1498
1499-rocmsoftwareplatform-rocfft v-u (MIT)
1500+rocmsoftwareplatform-rocfft v-u (MIT)
1501 Copyright © 2016 Advanced Micro Devices, Inc.
1502
1503 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1504@@ -71,7 +82,7 @@ The above copyright notice and this permission notice shall be included in all c
1505 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE
1506
1507
1508-thrust-thrust v1.9.2 (Apache 2.0)
1509+thrust-thrust v1.9.2 (Apache 2.0)
1510 Copyright 2008-2013 NVIDIA Corporation
1511
1512 Licensed under the Apache License, Version 2.0 (the "License");
1513@@ -87,7 +98,7 @@ See the License for the specific language governing permissions and
1514 limitations under the License.
1515
1516
1517-thrust-thrust v1.9.2 (BSD3)
1518+thrust-thrust v1.9.2 (BSD3)
1519 Copyright (c) 2011, Duane Merrill. All rights reserved.
1520 Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
1521
1522diff --git a/README.md b/README.md
1523index 8332574..8a74370 100644
1524--- a/README.md
1525+++ b/README.md
1526@@ -1,12 +1,12 @@
1527 # rocThrust
1528
1529 > [!NOTE]
1530-> The published documentation is available at [rocThrust](https://rocm.docs.amd.com/projects/rocThrust/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1531+> The published rocThrust documentation is available [here](https://rocm.docs.amd.com/projects/rocThrust/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1532
1533 Thrust is a parallel algorithm library. It has been ported to
1534 [HIP](https://github.com/ROCm/HIP) and [ROCm](https://www.github.com/ROCm/ROCm), which use
1535 the [rocPRIM](https://github.com/ROCm/rocPRIM) library. The HIP-ported library
1536-works on HIP and ROCm software. Currently there is no CUDA backend in place.
1537+works on HIP and ROCm software.
1538
1539 ## Requirements
1540
1541@@ -35,46 +35,49 @@ For ROCm hardware requirements, refer to:
1542 * [Linux support](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
1543 * [Windows support](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html)
1544
1545-## Documentation
1546-
1547-Documentation for rocThrust available at
1548-[https://rocm.docs.amd.com/projects/rocThrust/en/latest/](https://rocm.docs.amd.com/projects/rocThrust/en/latest/).
1549-
1550-You can build our documentation locally using the following commands:
1551-
1552-```shell
1553-# Go to rocThrust docs directory
1554-cd rocThrust; cd docs
1555+## Build and install
1556
1557-# Install Python dependencies
1558-python3 -m pip install -r sphinx/requirements.txt
1559+### Obtaining the source code
1560
1561-# Build the documentation
1562-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
1563+rocThrust can be cloned in two ways:
1564+1. Clone rocThrust along with other ROCm libraries that are frequently used together (note that this may take some time to complete):
1565+```sh
1566+git clone https://github.com/ROCm/rocm-libraries.git
1567+cd rocm-libraries
1568+```
1569
1570-# For e.g. serve the HTML docs locally
1571-cd _build/html
1572-python3 -m http.server
1573+2. To clone rocThrust individually (faster, but requires git version 2.25+):
1574+```sh
1575+git clone --no-checkout --depth=1 --filter=tree:0 https://github.com/ROCm/rocm-libraries.git
1576+cd rocm-libraries
1577+git sparse-checkout init --cone
1578+git sparse-checkout set projects/rocthrust
1579+git checkout develop
1580 ```
1581
1582-## Build and install
1583+### Building the library
1584
1585 ```sh
1586-git clone https://github.com/ROCm/rocThrust
1587+# Go to the rocthrust directory
1588+cd projects/rocthrust
1589
1590-# Go to rocThrust directory, create and go to the build directory.
1591-cd rocThrust; mkdir build; cd build
1592+# Create a directory for the build and go to it.
1593+mkdir build; cd build
1594
1595-# Configure rocThrust, setup options for your system.
1596 # Build options:
1597-# DISABLE_WERROR - ON by default, This flag disable the -Werror compiler flag
1598-# BUILD_TEST - OFF by default,
1599-# BUILD_HIPSTDPAR_TEST - OFF by default,
1600-# BUILD_EXAMPLES - OFF by default,
1601-# BUILD_BENCHMARKS - OFF by default,
1602-# DOWNLOAD_ROCPRIM - OFF by default, when ON rocPRIM will be downloaded to the build folder,
1603-# RNG_SEED_COUNT - 0 by default, controls non-repeatable random dataset count
1604-# PRNG_SEEDS - 1 by default, reproducible seeds to generate random data
1605+# DISABLE_WERROR - ON by default, this flag disables the -Werror compiler flag
1606+# BUILD_TEST - OFF by default,
1607+# BUILD_HIPSTDPAR_TEST - OFF by default,
1608+# BUILD_EXAMPLE - OFF by default,
1609+# BUILD_BENCHMARK - OFF by default,
1610+# ROCPRIM_FETCH_METHOD - PACKAGE is the default, see below for a description of available options
1611+# ROCRAND_FETCH_METHOD - PACKAGE is the default, see below for a description of available options
1612+# EXTERNAL_DEPS_FORCE_DOWNLOAD - OFF by default, forces download for non-ROCm dependencies (eg. Google Test / Benchmark)
1613+# BUILD_ADDRESS_SANITIZER - OFF by default, builds with clang address sanitizer enabled.
1614+# RNG_SEED_COUNT - 0 by default, controls non-repeatable random dataset count
1615+# PRNG_SEEDS - 1 by default, reproducible seeds to generate random data
1616+# USE_HIPCXX - OFF by default, builds with CMake HIP language support. This eliminates the need to set CXX.
1617+# USE_SYSTEM_LIB - OFF by default, builds tests using the installed ROCm libs provided by the system. This only takes effect when BUILD_TEST is ON.
1618 #
1619 # ! IMPORTANT !
1620 # On ROCm platform set C++ compiler to HipCC. You can do it by adding 'CXX=<path-to-hipcc>'
1621@@ -95,19 +98,42 @@ make package
1622 [sudo] make install
1623 ```
1624
1625+### Build Options for Fetching Dependencies
1626+
1627+rocThrust can (optionally) automatically fetch a number of dependencies for you at cmake configuration time.
1628+Alternatively, it can search for an existing installation on your system.
1629+The following cmake build options control how dependencies are located.
1630+
1631+- `FORCE_DEPENDENCIES_DOWNLOAD` (default: `OFF`) - when set to `ON`, non-ROCm dependencies (Google Test, Google Benchmark) will always be downloaded, even if they are already installed on your system. When set to `OFF`, rocThrust first searches for existing installations of the dependencies on your system, and only downloads them if they cannot be found.
1632+
1633+- `ROCPRIM_FETCH_METHOD` (default: `PACKAGE`) - controls the way that the rocPRIM dependency is fetched. This option must be set to one of:
1634+ - `PACKAGE` - searches for an existing installation of the dependency. If it is not found, rocThrust will fall back to using the `DOWNLOAD` setting (below).
1635+ - `DOWNLOAD` - downloads the dependency from the rocm-libraries repository. If git version 2.25+ is present, uses a [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to pull only the files for that dependency. If not, the whole [rocm-libraries](https://github.com/ROCm/rocm-libraries/) repository will be downloaded (this may take some time).
1636+ - `MONOREPO` - this option is useful if you are building rocThrust from within a checkout of the rocm-libraries repository (which already includes rocPRIM and rocRAND). When enabled, rocThrust will try to find the dependency in the local repository tree. If it cannot be found, rocThrust will fall back to using the `DOWNLOAD` option (above).
1637+- `ROCRAND_FETCH_METHOD` (default: `PACKAGE`) - this option is only considered when the `BUILD_BENCHMARK` option is set to `ON`. It controls the way that the rocRAND dependency is fetched. See `ROCPRIM_FETCH_METHOD` (above) for available options.
1638+
1639+To specify an option, add it to your cmake command, prefixed with the `-D` switch (e.g. `[CXX=hipcc] cmake -DFORCE_DEPENDENCIES_DOWNLOAD=ON -DROCPRIM_FETCH_METHOD="MONOREPO" ../.`).
1640+
1641 ### HIP on Windows
1642
1643-We've added initial support for HIP on Windows. To install, use the provided `rmake.py` Python script:
1644+We've added initial support for HIP on Windows.
1645+To install, first clone rocThrust using the steps described in [obtaining the source code](#obtaining-the-source-code).
1646+Then, use the provided `rmake.py` Python script:
1647
1648 ```shell
1649-git clone https://github.com/ROCm/rocThrust.git
1650-cd rocThrust
1651+cd projects/rocthrust
1652
1653 # the -i option will install rocPRIM to C:\hipSDK by default
1654 python rmake.py -i
1655
1656 # the -c option will build all clients including unit tests
1657 python rmake.py -c
1658+
1659+# to build for a specific architecture only, use the -a option
1660+python rmake.py -ci -a gfx1100
1661+
1662+# for a full list of available options, please refer to the help documentation
1663+python rmake.py -h
1664 ```
1665
1666 ### Macro options
1667@@ -139,9 +165,9 @@ target_link_libraries(<your_target> roc::rocthrust)
1668
1669 ```sh
1670 # Go to rocThrust build directory
1671-cd rocThrust; cd build
1672+cd projects/rocthrust; cd build
1673
1674-# Configure with examples flag on
1675+# Configure with test flag on
1676 CXX=hipcc cmake -DBUILD_TEST=ON ..
1677
1678 # Build tests
1679@@ -178,7 +204,7 @@ There is a utility script in the repo that may be called independently:
1680
1681 ```shell
1682 # Go to rocThrust build directory
1683-cd rocThrust; cd build
1684+cd projects/rocthrust; cd build
1685
1686 # Invoke directly or use CMake script mode via cmake -P
1687 ../cmake/GenerateResourceSpec.cmake
1688@@ -236,10 +262,10 @@ There are two CMake configuration-time options that control random data fed to u
1689
1690 ```sh
1691 # Go to rocThrust build directory
1692-cd rocThrust; cd build
1693+cd projects/rocthrust; cd build
1694
1695-# Configure with examples flag on
1696-CXX=hipcc cmake -DBUILD_EXAMPLES=ON ..
1697+# Configure with example flag on
1698+CXX=hipcc cmake -DBUILD_EXAMPLE=ON ..
1699
1700 # Build examples
1701 make -j4
1702@@ -257,16 +283,16 @@ make -j4
1703
1704 ```sh
1705 # Go to rocThrust build directory
1706-cd rocThrust; cd build
1707+cd projects/rocthrust; cd build
1708
1709-# Configure with benchmarks flag on
1710-CXX=hipcc cmake -DBUILD_BENCHMARKS=ON ..
1711+# Configure with benchmark flag on
1712+CXX=hipcc cmake -DBUILD_BENCHMARK=ON ..
1713
1714 # Build benchmarks
1715 make -j4
1716
1717 # Run the benchmarks
1718-./benchmarks/benchmark_thrust_bench
1719+./benchmark/benchmark_thrust_bench
1720 ```
1721
1722 ## HIPSTDPAR
1723@@ -284,11 +310,11 @@ HIPSTDPAR is currently packaged along rocThrust. The `hipstdpar` package is set
1724 ### Tests
1725 rocThrust also includes tests to check the correct building of HIPSTDPAR implementations. They are located in the [tests/hipstdpar](/test/hipstdpar/) folder. When configuring the project with the `BUILD_TEST` option, these tests will not be enabled by default. To enable them, set `BUILD_HIPSTDPAR_TEST=ON`. Additionally, you can configure only HIPSTDPAR's tests by disabling `BUILD_TEST` and enabling `BUILD_HIPSTDPAR_TEST`. In general, the following steps can be followed for building and running the tests:
1726
1727+First, clone rocThrust using the steps described in [obtaining the source code](#obtaining-the-source-code).
1728+Then, build the tests as follows:
1729 ```sh
1730-git clone https://github.com/ROCm/rocThrust
1731-
1732 # Go to rocThrust directory, create and go to the build directory.
1733-cd rocThrust; mkdir build; cd build
1734+cd projects/rocthrust; mkdir build; cd build
1735
1736 # Configure rocThrust.
1737 [CXX=hipcc] cmake ../. -D BUILD_TEST=ON # Configure rocThrust's tests.
1738@@ -309,10 +335,67 @@ ctest --output-on-failure
1739 * Notice that oneTBB (oneAPI TBB) may fail to compile when libstdc++-9 or -10 is used, due to them using legacy TBB interfaces that are incompatible with the oneTBB ones (see the [release notes](https://www.intel.com/content/www/us/en/developer/articles/release-notes/intel-oneapi-threading-building-blocks-release-notes.html)).
1740 * CMake (3.10.2 or later)
1741
1742+## Building the documentation locally
1743+
1744+### Requirements
1745+
1746+#### Doxygen
1747+
1748+The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues.
1749+
1750+After you have downloaded Doxygen version 1.9.4:
1751+
1752+```shell
1753+# Add doxygen to your PATH
1754+echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc
1755+
1756+# Apply the updated .bashrc
1757+source ~/.bashrc
1758+
1759+# Confirm that you are using version 1.9.4
1760+doxygen --version
1761+```
1762+
1763+#### Python
1764+
1765+The build system uses Python version 3.10. You can try using a newer version, but that might cause issues.
1766+
1767+You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation):
1768+
1769+```shell
1770+# Install Python 3.10
1771+pyenv install 3.10
1772+
1773+# Create a Python 3.10 virtual environment
1774+pyenv virtualenv 3.10 venv_rocthrust
1775+
1776+# Activate the virtual environment
1777+pyenv activate venv_rocthrust
1778+```
1779+
1780+### Building
1781+
1782+After cloning this repository (see [obtaining the source code](#obtaining-the-source-code)):
1783+
1784+```shell
1785+cd rocm-libraries/projects/rocthrust
1786+
1787+# Install Python dependencies
1788+python3 -m pip install -r docs/sphinx/requirements.txt
1789+
1790+# Build the documentation
1791+python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html
1792+```
1793+
1794+You can then open `docs/_build/html/index.html` in your browser to view the documentation.
1795+
1796 ## Support
1797
1798 You can report bugs and feature requests through the GitHub
1799-[issue tracker](https://github.com/ROCm/rocThrust/issues).
1800+[issue tracker](https://github.com/ROCm/rocm-libraries/issues).
1801+To help ensure that your issue is seen by the right team more quickly, when creating your issue, please apply the label `project: rocthrust`.
1802+Similarly, to filter the existing issue list down to only those affecting rocThrust, you can add the filter `label:"project: rocthrust"`,
1803+or follow [this link](https://github.com/ROCm/rocm-libraries/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22project%3A%20rocthrust%22).
1804
1805 ## License
1806
1807diff --git a/benchmarks/CMakeLists.txt b/benchmark/CMakeLists.txt
1808similarity index 100%
1809rename from benchmarks/CMakeLists.txt
1810rename to benchmark/CMakeLists.txt
1811diff --git a/benchmarks/bench/adjacent_difference/basic.cu b/benchmark/bench/adjacent_difference/basic.cu
1812similarity index 56%
1813rename from benchmarks/bench/adjacent_difference/basic.cu
1814rename to benchmark/bench/adjacent_difference/basic.cu
1815index 6733c56..8cf5120 100644
1816--- a/benchmarks/bench/adjacent_difference/basic.cu
1817+++ b/benchmark/bench/adjacent_difference/basic.cu
1818@@ -1,24 +1,30 @@
1819-// MIT License
1820-//
1821-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
1822-//
1823-// Permission is hereby granted, free of charge, to any person obtaining a copy
1824-// of this software and associated documentation files (the "Software"), to deal
1825-// in the Software without restriction, including without limitation the rights
1826-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1827-// copies of the Software, and to permit persons to whom the Software is
1828-// furnished to do so, subject to the following conditions:
1829-//
1830-// The above copyright notice and this permission notice shall be included in all
1831-// copies or substantial portions of the Software.
1832-//
1833-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1834-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1835-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1836-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1837-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1838-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1839-// SOFTWARE.
1840+/******************************************************************************
1841+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
1842+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
1843+ *
1844+ * Redistribution and use in source and binary forms, with or without
1845+ * modification, are permitted provided that the following conditions are met:
1846+ * * Redistributions of source code must retain the above copyright
1847+ * notice, this list of conditions and the following disclaimer.
1848+ * * Redistributions in binary form must reproduce the above copyright
1849+ * notice, this list of conditions and the following disclaimer in the
1850+ * documentation and/or other materials provided with the distribution.
1851+ * * Neither the name of the NVIDIA CORPORATION nor the
1852+ * names of its contributors may be used to endorse or promote products
1853+ * derived from this software without specific prior written permission.
1854+ *
1855+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1856+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1857+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1858+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
1859+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1860+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1861+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1862+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1863+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1864+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1865+ *
1866+ ******************************************************************************/
1867
1868 // Benchmark utils
1869 #include "../../bench_utils/bench_utils.hpp"
1870@@ -32,121 +38,121 @@
1871 #include <benchmark/benchmark.h>
1872
1873 // STL
1874-#include <cstdlib>
1875+#include <cstddef>
1876 #include <string>
1877 #include <vector>
1878
1879 struct basic
1880 {
1881- template <typename T, typename Policy>
1882- float64_t run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, Policy policy)
1883- {
1884- bench_utils::gpu_timer d_timer;
1885+ template <typename T, typename Policy>
1886+ float64_t run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, Policy policy)
1887+ {
1888+ thrust::adjacent_difference(policy, input.cbegin(), input.cend(), output.begin());
1889
1890- d_timer.start(0);
1891- thrust::adjacent_difference(policy, input.cbegin(), input.cend(), output.begin());
1892- d_timer.stop(0);
1893+ bench_utils::gpu_timer d_timer;
1894
1895- return d_timer.get_duration();
1896- }
1897+ d_timer.start(0);
1898+ thrust::adjacent_difference(policy, input.cbegin(), input.cend(), output.begin());
1899+ d_timer.stop(0);
1900+
1901+ return d_timer.get_duration();
1902+ }
1903 };
1904
1905 template <class Benchmark, class T>
1906 void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string seed_type)
1907 {
1908- // Benchmark object
1909- Benchmark benchmark {};
1910+ // Benchmark object
1911+ Benchmark benchmark{};
1912
1913- // GPU times
1914- std::vector<double> gpu_times;
1915+ // GPU times
1916+ std::vector<double> gpu_times;
1917
1918- // Generate input
1919- thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
1920+ // Generate input
1921+ thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
1922
1923- // Output
1924- thrust::device_vector<T> output(elements);
1925+ // Output
1926+ thrust::device_vector<T> output(elements);
1927
1928- bench_utils::caching_allocator_t alloc {};
1929- thrust::detail::device_t policy {};
1930+ bench_utils::caching_allocator_t alloc{};
1931+ thrust::detail::device_t policy{};
1932
1933- for(auto _ : state)
1934- {
1935- float64_t duration = benchmark.template run<T>(input, output, policy(alloc));
1936- state.SetIterationTime(duration);
1937- gpu_times.push_back(duration);
1938- }
1939+ for (auto _ : state)
1940+ {
1941+ float64_t duration = benchmark.template run<T>(input, output, policy(alloc));
1942+ state.SetIterationTime(duration);
1943+ gpu_times.push_back(duration);
1944+ }
1945
1946- // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
1947- // it will actually be the global memory bandwidth gotten.
1948- state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
1949- state.SetItemsProcessed(state.iterations() * elements);
1950+ // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
1951+ // it will actually be the global memory bandwidth gotten.
1952+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
1953+ state.SetItemsProcessed(state.iterations() * elements);
1954
1955- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
1956- state.counters["gpu_noise"] = gpu_cv;
1957+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
1958+ state.counters["gpu_noise"] = gpu_cv;
1959 }
1960
1961-#define CREATE_BENCHMARK(T, Elements) \
1962- benchmark::RegisterBenchmark( \
1963- bench_utils::bench_naming::format_name("{algo:adjacent_difference,subalgo:" + name \
1964- + ",input_type:" #T + ",elements:" #Elements) \
1965- .c_str(), \
1966- run_benchmark<Benchmark, T>, \
1967- Elements, \
1968- seed_type)
1969+#define CREATE_BENCHMARK(T, Elements) \
1970+ benchmark::RegisterBenchmark( \
1971+ bench_utils::bench_naming::format_name( \
1972+ "{algo:adjacent_difference,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
1973+ .c_str(), \
1974+ run_benchmark<Benchmark, T>, \
1975+ Elements, \
1976+ seed_type)
1977
1978-#define BENCHMARK_TYPE(type) \
1979- CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), \
1980- CREATE_BENCHMARK(type, 1 << 24), CREATE_BENCHMARK(type, 1 << 28)
1981+#define BENCHMARK_TYPE(type) \
1982+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
1983+ CREATE_BENCHMARK(type, 1 << 28)
1984
1985 template <class Benchmark>
1986-void add_benchmarks(const std::string& name,
1987- std::vector<benchmark::internal::Benchmark*>& benchmarks,
1988- const std::string seed_type)
1989+void add_benchmarks(
1990+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
1991 {
1992- std::vector<benchmark::internal::Benchmark*> bs = {BENCHMARK_TYPE(int8_t),
1993- BENCHMARK_TYPE(int16_t),
1994- BENCHMARK_TYPE(int32_t),
1995- BENCHMARK_TYPE(int64_t),
1996- BENCHMARK_TYPE(float32_t),
1997- BENCHMARK_TYPE(float64_t)};
1998-
1999- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2000+ std::vector<benchmark::internal::Benchmark*> bs = {
2001+ BENCHMARK_TYPE(int8_t),
2002+ BENCHMARK_TYPE(int16_t),
2003+ BENCHMARK_TYPE(int32_t),
2004+ BENCHMARK_TYPE(int64_t),
2005+ BENCHMARK_TYPE(float32_t),
2006+ BENCHMARK_TYPE(float64_t)};
2007+
2008+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2009 }
2010
2011 int main(int argc, char* argv[])
2012 {
2013- cli::Parser parser(argc, argv);
2014- parser.set_optional<std::string>(
2015- "name_format", "name_format", "human", "either: json,human,txt");
2016- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2017- parser.run_and_exit_if_error();
2018-
2019- // Parse argv
2020- benchmark::Initialize(&argc, argv);
2021- bench_utils::bench_naming::set_format(
2022- parser.get<std::string>("name_format")); /* either: json,human,txt */
2023- const std::string seed_type = parser.get<std::string>("seed");
2024-
2025- // Benchmark info
2026- bench_utils::add_common_benchmark_info();
2027- benchmark::AddCustomContext("seed", seed_type);
2028-
2029- // Add benchmark
2030- std::vector<benchmark::internal::Benchmark*> benchmarks;
2031- add_benchmarks<basic>("basic", benchmarks, seed_type);
2032-
2033- // Use manual timing
2034- for(auto& b : benchmarks)
2035- {
2036- b->UseManualTime();
2037- b->Unit(benchmark::kMicrosecond);
2038- b->MinTime(0.4); // in seconds
2039- }
2040-
2041- // Run benchmarks
2042- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2043-
2044- // Finish
2045- benchmark::Shutdown();
2046- return 0;
2047+ cli::Parser parser(argc, argv);
2048+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
2049+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2050+ parser.run_and_exit_if_error();
2051+
2052+ // Parse argv
2053+ benchmark::Initialize(&argc, argv);
2054+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
2055+ const std::string seed_type = parser.get<std::string>("seed");
2056+
2057+ // Benchmark info
2058+ bench_utils::add_common_benchmark_info();
2059+ benchmark::AddCustomContext("seed", seed_type);
2060+
2061+ // Add benchmark
2062+ std::vector<benchmark::internal::Benchmark*> benchmarks;
2063+ add_benchmarks<basic>("basic", benchmarks, seed_type);
2064+
2065+ // Use manual timing
2066+ for (auto& b : benchmarks)
2067+ {
2068+ b->UseManualTime();
2069+ b->Unit(benchmark::kMicrosecond);
2070+ b->MinTime(0.4); // in seconds
2071+ }
2072+
2073+ // Run benchmarks
2074+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2075+
2076+ // Finish
2077+ benchmark::Shutdown();
2078+ return 0;
2079 }
2080diff --git a/benchmarks/bench/adjacent_difference/custom.cu b/benchmark/bench/adjacent_difference/custom.cu
2081similarity index 53%
2082rename from benchmarks/bench/adjacent_difference/custom.cu
2083rename to benchmark/bench/adjacent_difference/custom.cu
2084index 24d6c03..dd47604 100644
2085--- a/benchmarks/bench/adjacent_difference/custom.cu
2086+++ b/benchmark/bench/adjacent_difference/custom.cu
2087@@ -1,24 +1,30 @@
2088-// MIT License
2089-//
2090-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
2091-//
2092-// Permission is hereby granted, free of charge, to any person obtaining a copy
2093-// of this software and associated documentation files (the "Software"), to deal
2094-// in the Software without restriction, including without limitation the rights
2095-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
2096-// copies of the Software, and to permit persons to whom the Software is
2097-// furnished to do so, subject to the following conditions:
2098-//
2099-// The above copyright notice and this permission notice shall be included in all
2100-// copies or substantial portions of the Software.
2101-//
2102-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2103-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2104-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2105-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2106-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2107-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2108-// SOFTWARE.
2109+/******************************************************************************
2110+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
2111+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
2112+ *
2113+ * Redistribution and use in source and binary forms, with or without
2114+ * modification, are permitted provided that the following conditions are met:
2115+ * * Redistributions of source code must retain the above copyright
2116+ * notice, this list of conditions and the following disclaimer.
2117+ * * Redistributions in binary form must reproduce the above copyright
2118+ * notice, this list of conditions and the following disclaimer in the
2119+ * documentation and/or other materials provided with the distribution.
2120+ * * Neither the name of the NVIDIA CORPORATION nor the
2121+ * names of its contributors may be used to endorse or promote products
2122+ * derived from this software without specific prior written permission.
2123+ *
2124+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
2125+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
2126+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
2127+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
2128+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
2129+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
2130+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
2131+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2132+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
2133+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2134+ *
2135+ ******************************************************************************/
2136
2137 // Benchmark utils
2138 #include "../../bench_utils/bench_utils.hpp"
2139@@ -32,141 +38,139 @@
2140 #include <benchmark/benchmark.h>
2141
2142 // STL
2143-#include <cstdlib>
2144+#include <cstddef>
2145 #include <string>
2146 #include <vector>
2147
2148 template <typename T>
2149 struct custom_op
2150 {
2151- T val;
2152+ T val;
2153
2154- custom_op() = delete;
2155+ custom_op() = delete;
2156
2157- explicit custom_op(T val)
2158- : val(val)
2159- {
2160- }
2161+ explicit custom_op(T val)
2162+ : val(val)
2163+ {}
2164
2165- __device__ T operator()(const T& lhs, const T& rhs)
2166- {
2167- return lhs * rhs + val;
2168- }
2169+ __device__ T operator()(const T& lhs, const T& rhs)
2170+ {
2171+ return lhs * rhs + val; // Hope to gen mad
2172+ }
2173 };
2174
2175 template <int Val>
2176 struct custom
2177 {
2178- template <typename T, typename Policy>
2179- float64_t run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, Policy policy)
2180- {
2181- bench_utils::gpu_timer d_timer;
2182-
2183- d_timer.start(0);
2184- thrust::adjacent_difference(
2185- policy, input.cbegin(), input.cend(), output.begin(), custom_op<T> {Val});
2186- d_timer.stop(0);
2187-
2188- return d_timer.get_duration();
2189- }
2190+ template <typename T, typename Policy>
2191+ float64_t run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, Policy policy)
2192+ {
2193+ thrust::adjacent_difference(policy, input.cbegin(), input.cend(), output.begin(), custom_op<T>{Val});
2194+
2195+ bench_utils::gpu_timer d_timer;
2196+
2197+ d_timer.start(0);
2198+ thrust::adjacent_difference(policy, input.cbegin(), input.cend(), output.begin(), custom_op<T>{Val});
2199+ d_timer.stop(0);
2200+
2201+ return d_timer.get_duration();
2202+ }
2203 };
2204
2205 template <class Benchmark, class T>
2206 void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string seed_type)
2207 {
2208- // Benchmark object
2209- Benchmark benchmark {};
2210+ // Benchmark object
2211+ Benchmark benchmark{};
2212
2213- // GPU times
2214- std::vector<double> gpu_times;
2215+ // GPU times
2216+ std::vector<double> gpu_times;
2217
2218- // Generate input
2219- thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
2220+ // Generate input
2221+ thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
2222
2223- // Output
2224- thrust::device_vector<T> output(elements);
2225+ // Output
2226+ thrust::device_vector<T> output(elements);
2227
2228- bench_utils::caching_allocator_t alloc {};
2229- thrust::detail::device_t policy {};
2230+ bench_utils::caching_allocator_t alloc{};
2231+ thrust::detail::device_t policy{};
2232
2233- for(auto _ : state)
2234- {
2235- float64_t duration = benchmark.template run<T>(input, output, policy(alloc));
2236- state.SetIterationTime(duration);
2237- gpu_times.push_back(duration);
2238- }
2239+ for (auto _ : state)
2240+ {
2241+ float64_t duration = benchmark.template run<T>(input, output, policy(alloc));
2242+ state.SetIterationTime(duration);
2243+ gpu_times.push_back(duration);
2244+ }
2245
2246- // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
2247- // it will actually be the global memory bandwidth gotten.
2248- state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
2249- state.SetItemsProcessed(state.iterations() * elements);
2250+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
2251+ // is the global memory bandwidth actually achieved.
2252+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
2253+ state.SetItemsProcessed(state.iterations() * elements);
2254
2255- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
2256- state.counters["gpu_noise"] = gpu_cv;
2257+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
2258+ state.counters["gpu_noise"] = gpu_cv;
2259 }
2260
2261-#define CREATE_BENCHMARK(T, Elements) \
2262- benchmark::RegisterBenchmark( \
2263- bench_utils::bench_naming::format_name("{algo:adjacent_difference,subalgo:" + name \
2264- + ",input_type:" #T + ",elements:" #Elements) \
2265- .c_str(), \
2266- run_benchmark<Benchmark<Val>, T>, \
2267- Elements, \
2268- seed_type)
2269+#define CREATE_BENCHMARK(T, Elements) \
2270+ benchmark::RegisterBenchmark( \
2271+ bench_utils::bench_naming::format_name( \
2272+ "{algo:adjacent_difference,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
2273+ .c_str(), \
2274+ run_benchmark<Benchmark<Val>, T>, \
2275+ Elements, \
2276+ seed_type)
2277
2278-#define BENCHMARK_TYPE(type) \
2279- CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), \
2280- CREATE_BENCHMARK(type, 1 << 24), CREATE_BENCHMARK(type, 1 << 28)
2281+#define BENCHMARK_TYPE(type) \
2282+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
2283+ CREATE_BENCHMARK(type, 1 << 28)
2284
2285 template <template <int> class Benchmark, int Val = 42 /*magic number in Thrust's benchmark*/>
2286-void add_benchmarks(const std::string& name,
2287- std::vector<benchmark::internal::Benchmark*>& benchmarks,
2288- const std::string seed_type)
2289+void add_benchmarks(
2290+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
2291 {
2292- std::vector<benchmark::internal::Benchmark*> bs = {BENCHMARK_TYPE(int8_t),
2293- BENCHMARK_TYPE(int16_t),
2294- BENCHMARK_TYPE(int32_t),
2295- BENCHMARK_TYPE(int64_t),
2296- BENCHMARK_TYPE(float32_t),
2297- BENCHMARK_TYPE(float64_t)};
2298-
2299- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2300+ std::vector<benchmark::internal::Benchmark*> bs = {
2301+ BENCHMARK_TYPE(int8_t),
2302+ BENCHMARK_TYPE(int16_t),
2303+ BENCHMARK_TYPE(int32_t),
2304+ BENCHMARK_TYPE(int64_t),
2305+ BENCHMARK_TYPE(float32_t),
2306+ BENCHMARK_TYPE(float64_t)};
2307+
2308+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2309 }
2310
2311 int main(int argc, char* argv[])
2312 {
2313- cli::Parser parser(argc, argv);
2314- parser.set_optional<std::string>(
2315- "name_format", "name_format", "human", "either: json,human,txt");
2316- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2317- parser.run_and_exit_if_error();
2318-
2319- // Parse argv
2320- benchmark::Initialize(&argc, argv);
2321- bench_utils::bench_naming::set_format(
2322- parser.get<std::string>("name_format")); /* either: json,human,txt */
2323- const std::string seed_type = parser.get<std::string>("seed");
2324-
2325- // Benchmark info
2326- bench_utils::add_common_benchmark_info();
2327- benchmark::AddCustomContext("seed", seed_type);
2328-
2329- // Add benchmark
2330- std::vector<benchmark::internal::Benchmark*> benchmarks;
2331- add_benchmarks<custom>("custom", benchmarks, seed_type);
2332-
2333- // Use manual timing
2334- for(auto& b : benchmarks)
2335- {
2336- b->UseManualTime();
2337- b->Unit(benchmark::kMicrosecond);
2338- b->MinTime(0.4); // in seconds
2339- }
2340-
2341- // Run benchmarks
2342- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2343-
2344- // Finish
2345- benchmark::Shutdown();
2346- return 0;
2347+ cli::Parser parser(argc, argv);
2348+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
2349+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2350+ parser.run_and_exit_if_error();
2351+
2352+ // Parse argv
2353+ benchmark::Initialize(&argc, argv);
2354+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
2355+ const std::string seed_type = parser.get<std::string>("seed");
2356+
2357+ // Benchmark info
2358+ bench_utils::add_common_benchmark_info();
2359+ benchmark::AddCustomContext("seed", seed_type);
2360+
2361+ // Add benchmark
2362+ std::vector<benchmark::internal::Benchmark*> benchmarks;
2363+ add_benchmarks<custom>("custom", benchmarks, seed_type);
2364+
2365+ // Use manual timing
2366+ for (auto& b : benchmarks)
2367+ {
2368+ b->UseManualTime();
2369+ b->Unit(benchmark::kMicrosecond);
2370+ b->MinTime(0.4); // in seconds
2371+ }
2372+
2373+ // Run benchmarks
2374+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2375+
2376+ // Finish
2377+ benchmark::Shutdown();
2378+ return 0;
2379 }
2380diff --git a/benchmarks/bench/copy/basic.cu b/benchmark/bench/adjacent_difference/in_place.cu
2381similarity index 53%
2382rename from benchmarks/bench/copy/basic.cu
2383rename to benchmark/bench/adjacent_difference/in_place.cu
2384index 52f22c5..4647152 100644
2385--- a/benchmarks/bench/copy/basic.cu
2386+++ b/benchmark/bench/adjacent_difference/in_place.cu
2387@@ -1,30 +1,36 @@
2388-// MIT License
2389-//
2390-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
2391-//
2392-// Permission is hereby granted, free of charge, to any person obtaining a copy
2393-// of this software and associated documentation files (the "Software"), to deal
2394-// in the Software without restriction, including without limitation the rights
2395-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
2396-// copies of the Software, and to permit persons to whom the Software is
2397-// furnished to do so, subject to the following conditions:
2398-//
2399-// The above copyright notice and this permission notice shall be included in all
2400-// copies or substantial portions of the Software.
2401-//
2402-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2403-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2404-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2405-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2406-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2407-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2408-// SOFTWARE.
2409+/******************************************************************************
2410+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
2411+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
2412+ *
2413+ * Redistribution and use in source and binary forms, with or without
2414+ * modification, are permitted provided that the following conditions are met:
2415+ * * Redistributions of source code must retain the above copyright
2416+ * notice, this list of conditions and the following disclaimer.
2417+ * * Redistributions in binary form must reproduce the above copyright
2418+ * notice, this list of conditions and the following disclaimer in the
2419+ * documentation and/or other materials provided with the distribution.
2420+ * * Neither the name of the NVIDIA CORPORATION nor the
2421+ * names of its contributors may be used to endorse or promote products
2422+ * derived from this software without specific prior written permission.
2423+ *
2424+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
2425+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
2426+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
2427+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
2428+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
2429+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
2430+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
2431+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2432+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
2433+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2434+ *
2435+ ******************************************************************************/
2436
2437 // Benchmark utils
2438 #include "../../bench_utils/bench_utils.hpp"
2439
2440 // rocThrust
2441-#include <thrust/copy.h>
2442+#include <thrust/adjacent_difference.h>
2443 #include <thrust/device_vector.h>
2444 #include <thrust/execution_policy.h>
2445
2446@@ -32,123 +38,118 @@
2447 #include <benchmark/benchmark.h>
2448
2449 // STL
2450-#include <cstdlib>
2451+#include <cstddef>
2452 #include <string>
2453 #include <vector>
2454
2455-struct basic
2456+struct in_place
2457 {
2458- template <typename T, typename Policy>
2459- float64_t run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, Policy policy)
2460- {
2461- bench_utils::gpu_timer d_timer;
2462+ template <typename T, typename Policy>
2463+ float64_t run(thrust::device_vector<T>& vec, Policy policy)
2464+ {
2465+ thrust::adjacent_difference(policy, vec.begin(), vec.end(), vec.begin());
2466
2467- d_timer.start(0);
2468- thrust::copy(policy, input.cbegin(), input.cend(), output.begin());
2469- d_timer.stop(0);
2470+ bench_utils::gpu_timer d_timer;
2471
2472- return d_timer.get_duration();
2473- }
2474+ d_timer.start(0);
2475+ thrust::adjacent_difference(policy, vec.begin(), vec.end(), vec.begin());
2476+ d_timer.stop(0);
2477+
2478+ return d_timer.get_duration();
2479+ }
2480 };
2481
2482 template <class Benchmark, class T>
2483-void run_benchmark(benchmark::State& state,
2484- const std::size_t elements,
2485- const std::string /*seed_type*/)
2486+void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string /*seed_type*/)
2487 {
2488- // Benchmark object
2489- Benchmark benchmark {};
2490-
2491- // GPU times
2492- std::vector<double> gpu_times;
2493+ // Benchmark object
2494+ Benchmark benchmark{};
2495
2496- // Generate input
2497- thrust::device_vector<T> input(elements, 1);
2498+ // GPU times
2499+ std::vector<double> gpu_times;
2500
2501- // Output
2502- thrust::device_vector<T> output(elements);
2503+ // Generate vec
2504+ thrust::device_vector<T> vec(elements, 0);
2505
2506- bench_utils::caching_allocator_t alloc;
2507- thrust::detail::device_t policy {};
2508+ bench_utils::caching_allocator_t alloc{};
2509+ thrust::detail::device_t policy{};
2510
2511- for(auto _ : state)
2512- {
2513- float64_t duration = benchmark.template run<T>(input, output, policy(alloc));
2514- state.SetIterationTime(duration);
2515- gpu_times.push_back(duration);
2516- }
2517+ for (auto _ : state)
2518+ {
2519+ float64_t duration = benchmark.template run<T>(vec, policy(alloc));
2520+ state.SetIterationTime(duration);
2521+ gpu_times.push_back(duration);
2522+ }
2523
2524- // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
2525- // it will actually be the global memory bandwidth gotten.
2526- state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
2527- state.SetItemsProcessed(state.iterations() * elements);
2528+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
2529+ // is the global memory bandwidth actually achieved.
2530+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
2531+ state.SetItemsProcessed(state.iterations() * elements);
2532
2533- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
2534- state.counters["gpu_noise"] = gpu_cv;
2535+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
2536+ state.counters["gpu_noise"] = gpu_cv;
2537 }
2538
2539 #define CREATE_BENCHMARK(T, Elements) \
2540- benchmark::RegisterBenchmark( \
2541- bench_utils::bench_naming::format_name("{algo:copy,subalgo:" + name + ",input_type:" #T \
2542- + ",elements:" #Elements) \
2543- .c_str(), \
2544- run_benchmark<Benchmark, T>, \
2545- Elements, \
2546- seed_type)
2547-
2548-#define BENCHMARK_TYPE(type) \
2549- CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), \
2550- CREATE_BENCHMARK(type, 1 << 24), CREATE_BENCHMARK(type, 1 << 28)
2551+ benchmark::RegisterBenchmark( \
2552+ bench_utils::bench_naming::format_name( \
2553+ "{algo:adjacent_difference,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
2554+ .c_str(), \
2555+ run_benchmark<Benchmark, T>, \
2556+ Elements, \
2557+ seed_type)
2558+
2559+#define BENCHMARK_TYPE(type) \
2560+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
2561+ CREATE_BENCHMARK(type, 1 << 28)
2562
2563 template <class Benchmark>
2564-void add_benchmarks(const std::string& name,
2565- std::vector<benchmark::internal::Benchmark*>& benchmarks,
2566- const std::string seed_type)
2567+void add_benchmarks(
2568+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
2569 {
2570- std::vector<benchmark::internal::Benchmark*> bs = {BENCHMARK_TYPE(int8_t),
2571- BENCHMARK_TYPE(int16_t),
2572- BENCHMARK_TYPE(int32_t),
2573- BENCHMARK_TYPE(int64_t),
2574- BENCHMARK_TYPE(float32_t),
2575- BENCHMARK_TYPE(float64_t)};
2576-
2577- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2578+ std::vector<benchmark::internal::Benchmark*> bs = {
2579+ BENCHMARK_TYPE(int8_t),
2580+ BENCHMARK_TYPE(int16_t),
2581+ BENCHMARK_TYPE(int32_t),
2582+ BENCHMARK_TYPE(int64_t),
2583+ BENCHMARK_TYPE(float32_t),
2584+ BENCHMARK_TYPE(float64_t)};
2585+
2586+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2587 }
2588
2589 int main(int argc, char* argv[])
2590 {
2591- cli::Parser parser(argc, argv);
2592- parser.set_optional<std::string>(
2593- "name_format", "name_format", "human", "either: json,human,txt");
2594- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2595- parser.run_and_exit_if_error();
2596-
2597- // Parse argv
2598- benchmark::Initialize(&argc, argv);
2599- bench_utils::bench_naming::set_format(
2600- parser.get<std::string>("name_format")); /* either: json,human,txt */
2601- const std::string seed_type = parser.get<std::string>("seed");
2602-
2603- // Benchmark info
2604- bench_utils::add_common_benchmark_info();
2605- benchmark::AddCustomContext("seed", seed_type);
2606-
2607- // Add benchmark
2608- std::vector<benchmark::internal::Benchmark*> benchmarks;
2609- add_benchmarks<basic>("basic", benchmarks, seed_type);
2610-
2611- // Use manual timing
2612- for(auto& b : benchmarks)
2613- {
2614- b->UseManualTime();
2615- b->Unit(benchmark::kMicrosecond);
2616- b->MinTime(0.4); // in seconds
2617- }
2618-
2619- // Run benchmarks
2620- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2621-
2622- // Finish
2623- benchmark::Shutdown();
2624- return 0;
2625+ cli::Parser parser(argc, argv);
2626+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
2627+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2628+ parser.run_and_exit_if_error();
2629+
2630+ // Parse argv
2631+ benchmark::Initialize(&argc, argv);
2632+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
2633+ const std::string seed_type = parser.get<std::string>("seed");
2634+
2635+ // Benchmark info
2636+ bench_utils::add_common_benchmark_info();
2637+ benchmark::AddCustomContext("seed", seed_type);
2638+
2639+ // Add benchmark
2640+ std::vector<benchmark::internal::Benchmark*> benchmarks;
2641+ add_benchmarks<in_place>("in_place", benchmarks, seed_type);
2642+
2643+ // Use manual timing
2644+ for (auto& b : benchmarks)
2645+ {
2646+ b->UseManualTime();
2647+ b->Unit(benchmark::kMicrosecond);
2648+ b->MinTime(0.4); // in seconds
2649+ }
2650+
2651+ // Run benchmarks
2652+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2653+
2654+ // Finish
2655+ benchmark::Shutdown();
2656+ return 0;
2657 }
2658diff --git a/benchmark/bench/copy/basic.cu b/benchmark/bench/copy/basic.cu
2659new file mode 100644
2660index 0000000..42f1a1d
2661--- /dev/null
2662+++ b/benchmark/bench/copy/basic.cu
2663@@ -0,0 +1,191 @@
2664+/******************************************************************************
2665+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
2666+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
2667+ *
2668+ * Redistribution and use in source and binary forms, with or without
2669+ * modification, are permitted provided that the following conditions are met:
2670+ * * Redistributions of source code must retain the above copyright
2671+ * notice, this list of conditions and the following disclaimer.
2672+ * * Redistributions in binary form must reproduce the above copyright
2673+ * notice, this list of conditions and the following disclaimer in the
2674+ * documentation and/or other materials provided with the distribution.
2675+ * * Neither the name of the NVIDIA CORPORATION nor the
2676+ * names of its contributors may be used to endorse or promote products
2677+ * derived from this software without specific prior written permission.
2678+ *
2679+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
2680+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
2681+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
2682+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
2683+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
2684+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
2685+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
2686+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2687+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
2688+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2689+ *
2690+ ******************************************************************************/
2691+
2692+// Benchmark utils
2693+#include "../../bench_utils/bench_utils.hpp"
2694+
2695+// rocThrust
2696+#include <thrust/copy.h>
2697+#include <thrust/count.h>
2698+#include <thrust/device_vector.h>
2699+#include <thrust/execution_policy.h>
2700+
2701+// Google Benchmark
2702+#include <benchmark/benchmark.h>
2703+
2704+// STL
2705+#include <cstddef>
2706+#include <string>
2707+#include <vector>
2708+#if !_THRUST_HAS_DEVICE_SYSTEM_STD
2709+# include <type_traits>
2710+#endif
2711+
2712+struct basic
2713+{
2714+ template <typename T, typename Policy>
2715+ float64_t run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, Policy policy)
2716+ {
2717+ thrust::copy(policy, input.cbegin(), input.cend(), output.begin());
2718+
2719+ bench_utils::gpu_timer d_timer;
2720+
2721+ d_timer.start(0);
2722+ thrust::copy(policy, input.cbegin(), input.cend(), output.begin());
2723+ d_timer.stop(0);
2724+
2725+ return d_timer.get_duration();
2726+ }
2727+};
2728+
2729+template <class Benchmark, class T>
2730+void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string /*seed_type*/)
2731+{
2732+ // Benchmark object
2733+ Benchmark benchmark{};
2734+
2735+ // GPU times
2736+ std::vector<double> gpu_times;
2737+
2738+ // Generate input
2739+ thrust::device_vector<T> input(elements, T{1});
2740+
2741+ // Output
2742+ thrust::device_vector<T> output(elements);
2743+
2744+ bench_utils::caching_allocator_t alloc;
2745+ thrust::detail::device_t policy{};
2746+
2747+ for (auto _ : state)
2748+ {
2749+ float64_t duration = benchmark.template run<T>(input, output, policy(alloc));
2750+ state.SetIterationTime(duration);
2751+ gpu_times.push_back(duration);
2752+ }
2753+
2754+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
2755+ // is the global memory bandwidth actually achieved.
2756+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
2757+ state.SetItemsProcessed(state.iterations() * elements);
2758+
2759+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
2760+ state.counters["gpu_noise"] = gpu_cv;
2761+}
2762+
2763+#define CREATE_BENCHMARK(T, Elements) \
2764+ benchmark::RegisterBenchmark( \
2765+ bench_utils::bench_naming::format_name("{algo:copy,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
2766+ .c_str(), \
2767+ run_benchmark<Benchmark, T>, \
2768+ Elements, \
2769+ seed_type)
2770+
2771+#define BENCHMARK_TYPE(type) \
2772+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
2773+ CREATE_BENCHMARK(type, 1 << 28)
2774+
2775+// Non-trivially-copyable/relocatable type which is not allowed to be copied using std::memcpy or cudaMemcpy
2776+struct non_trivial
2777+{
2778+ int a;
2779+ int b;
2780+
2781+ non_trivial() = default;
2782+
2783+ THRUST_HOST_DEVICE explicit non_trivial(int i)
2784+ : a(i)
2785+ , b(i)
2786+ {}
2787+
2788+ // the user-defined copy constructor prevents the type from being trivially copyable
2789+ THRUST_HOST_DEVICE non_trivial(const non_trivial& nt)
2790+ : a(nt.a)
2791+ , b(nt.b)
2792+ {}
2793+
2794+ non_trivial& operator=(const non_trivial&) = default;
2795+};
2796+
2797+static_assert(!_THRUST_STD::is_trivially_copyable<non_trivial>::value, ""); // as required by the C++ standard
2798+static_assert(!thrust::is_trivially_relocatable<non_trivial>::value, ""); // thrust uses this check internally
2799+
2800+template <class Benchmark>
2801+void add_benchmarks(
2802+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
2803+{
2804+ std::vector<benchmark::internal::Benchmark*> bs = {
2805+ BENCHMARK_TYPE(int8_t),
2806+ BENCHMARK_TYPE(uint8_t),
2807+ BENCHMARK_TYPE(int16_t),
2808+ BENCHMARK_TYPE(uint16_t),
2809+ BENCHMARK_TYPE(int32_t),
2810+ BENCHMARK_TYPE(uint32_t),
2811+ BENCHMARK_TYPE(int64_t),
2812+ BENCHMARK_TYPE(uint64_t),
2813+ BENCHMARK_TYPE(float32_t),
2814+ BENCHMARK_TYPE(float64_t),
2815+ BENCHMARK_TYPE(non_trivial)};
2816+
2817+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
2818+}
2819+
2820+int main(int argc, char* argv[])
2821+{
2822+ cli::Parser parser(argc, argv);
2823+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
2824+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
2825+ parser.run_and_exit_if_error();
2826+
2827+ // Parse argv
2828+ benchmark::Initialize(&argc, argv);
2829+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
2830+ const std::string seed_type = parser.get<std::string>("seed");
2831+
2832+ // Benchmark info
2833+ bench_utils::add_common_benchmark_info();
2834+ benchmark::AddCustomContext("seed", seed_type);
2835+
2836+ // Add benchmark
2837+ std::vector<benchmark::internal::Benchmark*> benchmarks;
2838+ add_benchmarks<basic>("basic", benchmarks, seed_type);
2839+
2840+ // Use manual timing
2841+ for (auto& b : benchmarks)
2842+ {
2843+ b->UseManualTime();
2844+ b->Unit(benchmark::kMicrosecond);
2845+ b->MinTime(0.4); // in seconds
2846+ }
2847+
2848+ // Run benchmarks
2849+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
2850+
2851+ // Finish
2852+ benchmark::Shutdown();
2853+ return 0;
2854+}
2855diff --git a/benchmark/bench/copy/if.cu b/benchmark/bench/copy/if.cu
2856new file mode 100644
2857index 0000000..2c93d05
2858--- /dev/null
2859+++ b/benchmark/bench/copy/if.cu
2860@@ -0,0 +1,203 @@
2861+/******************************************************************************
2862+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
2863+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
2864+ *
2865+ * Redistribution and use in source and binary forms, with or without
2866+ * modification, are permitted provided that the following conditions are met:
2867+ * * Redistributions of source code must retain the above copyright
2868+ * notice, this list of conditions and the following disclaimer.
2869+ * * Redistributions in binary form must reproduce the above copyright
2870+ * notice, this list of conditions and the following disclaimer in the
2871+ * documentation and/or other materials provided with the distribution.
2872+ * * Neither the name of the NVIDIA CORPORATION nor the
2873+ * names of its contributors may be used to endorse or promote products
2874+ * derived from this software without specific prior written permission.
2875+ *
2876+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
2877+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
2878+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
2879+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
2880+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
2881+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
2882+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
2883+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2884+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
2885+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2886+ *
2887+ ******************************************************************************/
2888+
2889+// Benchmark utils
2890+#include "../../bench_utils/bench_utils.hpp"
2891+
2892+// rocThrust
2893+#include <thrust/copy.h>
2894+#include <thrust/count.h>
2895+#include <thrust/device_vector.h>
2896+#include <thrust/execution_policy.h>
2897+
2898+// Google Benchmark
2899+#include <benchmark/benchmark.h>
2900+
2901+// STL
2902+#include <cstddef>
2903+#include <string>
2904+#include <vector>
2905+
2906+template <class T>
2907+struct less_then_t
2908+{
2909+ T m_val;
2910+
2911+ __host__ __device__ bool operator()(const T& val) const
2912+ {
2913+ return val < m_val;
2914+ }
2915+};
2916+
2917+struct _if
2918+{
2919+ template <typename T, typename Policy>
2920+ float64_t
2921+ run(thrust::device_vector<T>& input, thrust::device_vector<T>& output, less_then_t<T> select_op, Policy policy)
2922+ {
2923+ thrust::copy_if(policy, input.cbegin(), input.cend(), output.begin(), select_op);
2924+
2925+ bench_utils::gpu_timer d_timer;
2926+
2927+ d_timer.start(0);
2928+ thrust::copy_if(policy, input.cbegin(), input.cend(), output.begin(), select_op);
2929+ d_timer.stop(0);
2930+
2931+ return d_timer.get_duration();
2932+ }
2933+};
2934+
2935+template <class Benchmark, class T>
2936+void run_benchmark(
2937+ benchmark::State& state, const std::size_t elements, const std::string seed_type, const int entropy_reduction)
2938+{
2939+ using select_op_t = less_then_t<T>;
2940+
2941+ // Benchmark object
2942+ Benchmark benchmark{};
2943+
2944+ // GPU times
2945+ std::vector<double> gpu_times;
2946+
2947+ T val = bench_utils::value_from_entropy<T>(bench_utils::get_entropy_percentage(entropy_reduction));
2948+ select_op_t select_op{val};
2949+
2950+ // Generate input and output
2951+ thrust::device_vector<T> input;
2952+ try
2953+ {
2954+ input = bench_utils::generate(elements, seed_type);
2955+ }
2956+ catch (const ::thrust::system::detail::bad_alloc& e)
2957+ {
2958+ (void) hipGetLastError();
2959+ state.SkipWithError(("thrust::system::detail::bad_alloc: " + std::string(e.what())).c_str());
2960+ return;
2961+ }
2962+ const auto selected_elements = thrust::count_if(input.cbegin(), input.cend(), select_op);
2963+ thrust::device_vector<T> output;
2964+ try
2965+ {
2966+ output = thrust::device_vector<T>(selected_elements);
2967+ }
2968+ catch (const ::thrust::system::detail::bad_alloc& e)
2969+ {
2970+ (void) hipGetLastError();
2971+ state.SkipWithError(("thrust::system::detail::bad_alloc: " + std::string(e.what())).c_str());
2972+ return;
2973+ }
2974+
2975+ bench_utils::caching_allocator_t alloc{};
2976+ thrust::detail::device_t policy{};
2977+
2978+ for (auto _ : state)
2979+ {
2980+ float64_t duration = benchmark.template run<T>(input, output, select_op, policy(alloc));
2981+ state.SetIterationTime(duration);
2982+ gpu_times.push_back(duration);
2983+ }
2984+
2985+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
2986+ // is the global memory bandwidth actually achieved.
2987+ state.SetBytesProcessed(state.iterations() * (elements + selected_elements) * sizeof(T));
2988+ state.SetItemsProcessed(state.iterations() * elements);
2989+
2990+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
2991+ state.counters["gpu_noise"] = gpu_cv;
2992+}
2993+
2994+#define CREATE_BENCHMARK(T, Elements, EntropyReduction) \
2995+ benchmark::RegisterBenchmark( \
2996+ bench_utils::bench_naming::format_name( \
2997+ "{algo:copy,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements \
2998+ + ",entropy:" + std::to_string(bench_utils::get_entropy_percentage(EntropyReduction))) \
2999+ .c_str(), \
3000+ run_benchmark<Benchmark, T>, \
3001+ Elements, \
3002+ seed_type, \
3003+ EntropyReduction)
3004+
3005+#define BENCHMARK_TYPE_ENTROPY(type, entropy) \
3006+ CREATE_BENCHMARK(type, 1 << 16, entropy), CREATE_BENCHMARK(type, 1 << 20, entropy), \
3007+ CREATE_BENCHMARK(type, 1 << 24, entropy), CREATE_BENCHMARK(type, 1 << 28, entropy)
3008+
3009+template <class Benchmark>
3010+void add_benchmarks(
3011+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
3012+{
3013+ constexpr int entropy_reductions[] = {0, 2, 4200}; // 1.000, 0.544, 0.000;
3014+
3015+ for (int entropy_reduction : entropy_reductions)
3016+ {
3017+ std::vector<benchmark::internal::Benchmark*> bs = {
3018+ BENCHMARK_TYPE_ENTROPY(int8_t, entropy_reduction),
3019+ BENCHMARK_TYPE_ENTROPY(int16_t, entropy_reduction),
3020+ BENCHMARK_TYPE_ENTROPY(int32_t, entropy_reduction),
3021+ BENCHMARK_TYPE_ENTROPY(int64_t, entropy_reduction),
3022+ BENCHMARK_TYPE_ENTROPY(float, entropy_reduction),
3023+ BENCHMARK_TYPE_ENTROPY(double, entropy_reduction),
3024+ BENCHMARK_TYPE_ENTROPY(bench_utils::large_data, entropy_reduction)}; // rocThrust issue #565
3025+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3026+ }
3027+}
3028+
3029+int main(int argc, char* argv[])
3030+{
3031+ cli::Parser parser(argc, argv);
3032+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
3033+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3034+ parser.run_and_exit_if_error();
3035+
3036+ // Parse argv
3037+ benchmark::Initialize(&argc, argv);
3038+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
3039+ const std::string seed_type = parser.get<std::string>("seed");
3040+
3041+ // Benchmark info
3042+ bench_utils::add_common_benchmark_info();
3043+ benchmark::AddCustomContext("seed", seed_type);
3044+
3045+ // Add benchmark
3046+ std::vector<benchmark::internal::Benchmark*> benchmarks;
3047+ add_benchmarks<_if>("if", benchmarks, seed_type);
3048+
3049+ // Use manual timing
3050+ for (auto& b : benchmarks)
3051+ {
3052+ b->UseManualTime();
3053+ b->Unit(benchmark::kMicrosecond);
3054+ b->MinTime(0.4); // in seconds
3055+ }
3056+
3057+ // Run benchmarks
3058+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3059+
3060+ // Finish
3061+ benchmark::Shutdown();
3062+ return 0;
3063+}
3064diff --git a/benchmark/bench/equal/basic.cu b/benchmark/bench/equal/basic.cu
3065new file mode 100644
3066index 0000000..bda269f
3067--- /dev/null
3068+++ b/benchmark/bench/equal/basic.cu
3069@@ -0,0 +1,145 @@
3070+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3071+// SPDX-FileCopyrightText: Modifications Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3072+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3073+
3074+// Benchmark utils
3075+#include "../../bench_utils/bench_utils.hpp"
3076+
3077+// rocThrust
3078+#include <thrust/device_vector.h>
3079+#include <thrust/equal.h>
3080+#include <thrust/execution_policy.h>
3081+
3082+// Google Benchmark
3083+#include <benchmark/benchmark.h>
3084+
3085+// STL
3086+#include <algorithm>
3087+#include <cstddef>
3088+#include <string>
3089+#include <vector>
3090+
3091+struct basic
3092+{
3093+ template <typename T, typename Policy>
3094+ float64_t run(thrust::device_vector<T>& a, thrust::device_vector<T>& b, Policy policy)
3095+ {
3096+ bench_utils::do_not_optimize(thrust::equal(policy, a.begin(), a.end(), b.begin()));
3097+
3098+ bench_utils::gpu_timer d_timer;
3099+
3100+ d_timer.start(0);
3101+ bench_utils::do_not_optimize(thrust::equal(policy, a.begin(), a.end(), b.begin()));
3102+ d_timer.stop(0);
3103+
3104+ return d_timer.get_duration();
3105+ }
3106+};
3107+
3108+template <class Benchmark, class T>
3109+void run_benchmark(benchmark::State& state,
3110+ const std::size_t elements,
3111+ const std::string, /*seed_type*/
3112+ const float64_t common_prefix_ratio)
3113+{
3114+ // Benchmark object
3115+ Benchmark benchmark{};
3116+
3117+ // GPU times
3118+ std::vector<double> gpu_times;
3119+
3120+ thrust::device_vector<T> a(elements, T{1});
3121+ thrust::device_vector<T> b(elements, T{1});
3122+
3123+ const auto same_elements = std::min(static_cast<std::size_t>(elements * common_prefix_ratio), elements);
3124+
3125+ bench_utils::caching_allocator_t alloc;
3126+ thrust::detail::device_t policy{};
3127+
3128+ thrust::fill(policy(alloc), b.begin() + same_elements, b.end(), T{2});
3129+
3130+ for (auto _ : state)
3131+ {
3132+ float64_t duration = benchmark.template run<T>(a, b, policy(alloc));
3133+ state.SetIterationTime(duration);
3134+ gpu_times.push_back(duration);
3135+ }
3136+
3137+ // BytesProcessed counts the bytes read from both input ranges (hence the factor of 2), so the
3138+ // reported BytesProcessed/s is the achieved global memory bandwidth.
3139+ // `same_elements` is used instead of `elements` because it corresponds to the
3140+ // elements actually read before the early exit.
3141+ state.SetBytesProcessed(state.iterations() * 2 * std::max(same_elements, std::size_t(1)) * sizeof(T));
3142+ state.SetItemsProcessed(state.iterations() * std::max(same_elements, std::size_t(1)));
3143+
3144+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
3145+ state.counters["gpu_noise"] = gpu_cv;
3146+}
3147+
3148+#define CREATE_BENCHMARK(T, Elements, CommonPrefixRatio) \
3149+ benchmark::RegisterBenchmark( \
3150+ bench_utils::bench_naming::format_name("{algo:equal,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements \
3151+ + ", common_prefix_ratio:" #CommonPrefixRatio) \
3152+ .c_str(), \
3153+ run_benchmark<Benchmark, T>, \
3154+ Elements, \
3155+ seed_type, \
3156+ CommonPrefixRatio)
3157+
3158+#define BENCHMARK_ELEMENTS(type, elements) \
3159+ CREATE_BENCHMARK(type, elements, 1.0), CREATE_BENCHMARK(type, elements, 0.5), CREATE_BENCHMARK(type, elements, 0.0)
3160+
3161+#define BENCHMARK_TYPE(type) \
3162+ BENCHMARK_ELEMENTS(type, 1 << 16), BENCHMARK_ELEMENTS(type, 1 << 20), BENCHMARK_ELEMENTS(type, 1 << 24), \
3163+ BENCHMARK_ELEMENTS(type, 1 << 28)
3164+
3165+template <class Benchmark>
3166+void add_benchmarks(
3167+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
3168+{
3169+ std::vector<benchmark::internal::Benchmark*> bs = {
3170+ BENCHMARK_TYPE(int8_t),
3171+ BENCHMARK_TYPE(int16_t),
3172+ BENCHMARK_TYPE(int32_t),
3173+ BENCHMARK_TYPE(uint32_t),
3174+ BENCHMARK_TYPE(int64_t),
3175+ BENCHMARK_TYPE(uint64_t)};
3176+
3177+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3178+}
3179+
3180+int main(int argc, char* argv[])
3181+{
3182+ cli::Parser parser(argc, argv);
3183+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
3184+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3185+ parser.run_and_exit_if_error();
3186+
3187+ // Parse argv
3188+ benchmark::Initialize(&argc, argv);
3189+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
3190+ const std::string seed_type = parser.get<std::string>("seed");
3191+
3192+ // Benchmark info
3193+ bench_utils::add_common_benchmark_info();
3194+ benchmark::AddCustomContext("seed", seed_type);
3195+
3196+ // Add benchmark
3197+ std::vector<benchmark::internal::Benchmark*> benchmarks;
3198+ add_benchmarks<basic>("basic", benchmarks, seed_type);
3199+
3200+ // Use manual timing
3201+ for (auto& b : benchmarks)
3202+ {
3203+ b->UseManualTime();
3204+ b->Unit(benchmark::kMicrosecond);
3205+ b->MinTime(0.4); // in seconds
3206+ }
3207+
3208+ // Run benchmarks
3209+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3210+
3211+ // Finish
3212+ benchmark::Shutdown();
3213+ return 0;
3214+}
3215diff --git a/benchmarks/bench/fill/basic.cu b/benchmark/bench/fill/basic.cu
3216similarity index 54%
3217rename from benchmarks/bench/fill/basic.cu
3218rename to benchmark/bench/fill/basic.cu
3219index f88b0fc..62403bd 100644
3220--- a/benchmarks/bench/fill/basic.cu
3221+++ b/benchmark/bench/fill/basic.cu
3222@@ -1,24 +1,30 @@
3223-// MIT License
3224-//
3225-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
3226-//
3227-// Permission is hereby granted, free of charge, to any person obtaining a copy
3228-// of this software and associated documentation files (the "Software"), to deal
3229-// in the Software without restriction, including without limitation the rights
3230-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3231-// copies of the Software, and to permit persons to whom the Software is
3232-// furnished to do so, subject to the following conditions:
3233-//
3234-// The above copyright notice and this permission notice shall be included in all
3235-// copies or substantial portions of the Software.
3236-//
3237-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3238-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3239-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3240-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3241-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3242-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3243-// SOFTWARE.
3244+/******************************************************************************
3245+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
3246+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
3247+ *
3248+ * Redistribution and use in source and binary forms, with or without
3249+ * modification, are permitted provided that the following conditions are met:
3250+ * * Redistributions of source code must retain the above copyright
3251+ * notice, this list of conditions and the following disclaimer.
3252+ * * Redistributions in binary form must reproduce the above copyright
3253+ * notice, this list of conditions and the following disclaimer in the
3254+ * documentation and/or other materials provided with the distribution.
3255+ * * Neither the name of the NVIDIA CORPORATION nor the
3256+ * names of its contributors may be used to endorse or promote products
3257+ * derived from this software without specific prior written permission.
3258+ *
3259+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
3260+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3261+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
3262+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
3263+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
3264+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
3265+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
3266+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3267+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3268+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3269+ *
3270+ ******************************************************************************/
3271
3272 // Benchmark utils
3273 #include "../../bench_utils/bench_utils.hpp"
3274@@ -32,120 +38,121 @@
3275 #include <benchmark/benchmark.h>
3276
3277 // STL
3278-#include <cstdlib>
3279+#include <cstddef>
3280 #include <string>
3281 #include <vector>
3282
3283 template <int Val>
3284 struct basic
3285 {
3286- template <typename T, typename Policy>
3287- float64_t run(thrust::device_vector<T>& output, Policy policy)
3288- {
3289- bench_utils::gpu_timer d_timer;
3290+ template <typename T, typename Policy>
3291+ float64_t run(thrust::device_vector<T>& output, Policy policy)
3292+ {
3293+ thrust::fill(policy, output.begin(), output.end(), T{Val});
3294
3295- d_timer.start(0);
3296- thrust::fill(policy, output.begin(), output.end(), T {Val});
3297- d_timer.stop(0);
3298+ bench_utils::gpu_timer d_timer;
3299
3300- return d_timer.get_duration();
3301- }
3302+ d_timer.start(0);
3303+ thrust::fill(policy, output.begin(), output.end(), T{Val});
3304+ d_timer.stop(0);
3305+
3306+ return d_timer.get_duration();
3307+ }
3308 };
3309
3310 template <class Benchmark, class T>
3311-void run_benchmark(benchmark::State& state,
3312- const std::size_t elements,
3313- const std::string /*seed_type*/)
3314+void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string /*seed_type*/)
3315 {
3316- // Benchmark object
3317- Benchmark benchmark {};
3318+ // Benchmark object
3319+ Benchmark benchmark{};
3320
3321- // GPU times
3322- std::vector<double> gpu_times;
3323+ // GPU times
3324+ std::vector<double> gpu_times;
3325
3326- // Output
3327- thrust::device_vector<T> output(elements);
3328+ // Output
3329+ thrust::device_vector<T> output(elements);
3330
3331- bench_utils::caching_allocator_t alloc {};
3332- thrust::detail::device_t policy {};
3333+ bench_utils::caching_allocator_t alloc{};
3334+ thrust::detail::device_t policy{};
3335
3336- for(auto _ : state)
3337- {
3338- float64_t duration = benchmark.template run<T>(output, policy(alloc));
3339- state.SetIterationTime(duration);
3340- gpu_times.push_back(duration);
3341- }
3342+ for (auto _ : state)
3343+ {
3344+ float64_t duration = benchmark.template run<T>(output, policy(alloc));
3345+ state.SetIterationTime(duration);
3346+ gpu_times.push_back(duration);
3347+ }
3348
3349- // BytesProcessed is only the number the writes, because thrust::fill does not receive an input
3350- state.SetBytesProcessed(state.iterations() * elements * sizeof(T));
3351- state.SetItemsProcessed(state.iterations() * elements);
3352+ // BytesProcessed counts only the bytes written, because thrust::fill does not receive an input
3353+ state.SetBytesProcessed(state.iterations() * elements * sizeof(T));
3354+ state.SetItemsProcessed(state.iterations() * elements);
3355
3356- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
3357- state.counters["gpu_noise"] = gpu_cv;
3358+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
3359+ state.counters["gpu_noise"] = gpu_cv;
3360 }
3361
3362-#define CREATE_BENCHMARK(T, Elements) \
3363- benchmark::RegisterBenchmark( \
3364- bench_utils::bench_naming::format_name("{algo:fill,subalgo:" + name + ",input_type:" #T \
3365- + ",elements:" #Elements) \
3366- .c_str(), \
3367- run_benchmark<Benchmark<Val>, T>, \
3368- Elements, \
3369- seed_type)
3370+#define CREATE_BENCHMARK(T, Elements) \
3371+ benchmark::RegisterBenchmark( \
3372+ bench_utils::bench_naming::format_name("{algo:fill,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
3373+ .c_str(), \
3374+ run_benchmark<Benchmark<Val>, T>, \
3375+ Elements, \
3376+ seed_type)
3377
3378-#define BENCHMARK_TYPE(type) \
3379- CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), \
3380- CREATE_BENCHMARK(type, 1 << 24), CREATE_BENCHMARK(type, 1 << 28)
3381+#define BENCHMARK_TYPE(type) \
3382+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
3383+ CREATE_BENCHMARK(type, 1 << 28)
3384
3385 template <template <int> class Benchmark, int Val = 42 /*magic number in Thrust's benchmark*/>
3386-void add_benchmarks(const std::string& name,
3387- std::vector<benchmark::internal::Benchmark*>& benchmarks,
3388- const std::string seed_type)
3389+void add_benchmarks(
3390+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
3391 {
3392- std::vector<benchmark::internal::Benchmark*> bs = {BENCHMARK_TYPE(int8_t),
3393- BENCHMARK_TYPE(int16_t),
3394- BENCHMARK_TYPE(int32_t),
3395- BENCHMARK_TYPE(int64_t),
3396- BENCHMARK_TYPE(float32_t),
3397- BENCHMARK_TYPE(float64_t)};
3398-
3399- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3400+ std::vector<benchmark::internal::Benchmark*> bs = {
3401+ BENCHMARK_TYPE(int8_t),
3402+ BENCHMARK_TYPE(int16_t),
3403+ BENCHMARK_TYPE(int32_t),
3404+ BENCHMARK_TYPE(int64_t),
3405+#if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
3406+ BENCHMARK_TYPE(int128_t),
3407+#endif
3408+ BENCHMARK_TYPE(float32_t),
3409+ BENCHMARK_TYPE(float64_t)
3410+ };
3411+
3412+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3413 }
3414
3415 int main(int argc, char* argv[])
3416 {
3417- cli::Parser parser(argc, argv);
3418- parser.set_optional<std::string>(
3419- "name_format", "name_format", "human", "either: json,human,txt");
3420- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3421- parser.run_and_exit_if_error();
3422-
3423- // Parse argv
3424- benchmark::Initialize(&argc, argv);
3425- bench_utils::bench_naming::set_format(
3426- parser.get<std::string>("name_format")); /* either: json,human,txt */
3427- const std::string seed_type = parser.get<std::string>("seed");
3428-
3429- // Benchmark info
3430- bench_utils::add_common_benchmark_info();
3431- benchmark::AddCustomContext("seed", seed_type);
3432-
3433- // Add benchmark
3434- std::vector<benchmark::internal::Benchmark*> benchmarks;
3435- add_benchmarks<basic>("basic", benchmarks, seed_type);
3436-
3437- // Use manual timing
3438- for(auto& b : benchmarks)
3439- {
3440- b->UseManualTime();
3441- b->Unit(benchmark::kMicrosecond);
3442- b->MinTime(0.4); // in seconds
3443- }
3444-
3445- // Run benchmarks
3446- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3447-
3448- // Finish
3449- benchmark::Shutdown();
3450- return 0;
3451+ cli::Parser parser(argc, argv);
3452+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
3453+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3454+ parser.run_and_exit_if_error();
3455+
3456+ // Parse argv
3457+ benchmark::Initialize(&argc, argv);
3458+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
3459+ const std::string seed_type = parser.get<std::string>("seed");
3460+
3461+ // Benchmark info
3462+ bench_utils::add_common_benchmark_info();
3463+ benchmark::AddCustomContext("seed", seed_type);
3464+
3465+ // Add benchmark
3466+ std::vector<benchmark::internal::Benchmark*> benchmarks;
3467+ add_benchmarks<basic>("basic", benchmarks, seed_type);
3468+
3469+ // Use manual timing
3470+ for (auto& b : benchmarks)
3471+ {
3472+ b->UseManualTime();
3473+ b->Unit(benchmark::kMicrosecond);
3474+ b->MinTime(0.4); // in seconds
3475+ }
3476+
3477+ // Run benchmarks
3478+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3479+
3480+ // Finish
3481+ benchmark::Shutdown();
3482+ return 0;
3483 }
3484diff --git a/benchmark/bench/for_each/basic.cu b/benchmark/bench/for_each/basic.cu
3485new file mode 100644
3486index 0000000..6a5bd58
3487--- /dev/null
3488+++ b/benchmark/bench/for_each/basic.cu
3489@@ -0,0 +1,169 @@
3490+/******************************************************************************
3491+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3492+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
3493+ *
3494+ * Redistribution and use in source and binary forms, with or without
3495+ * modification, are permitted provided that the following conditions are met:
3496+ * * Redistributions of source code must retain the above copyright
3497+ * notice, this list of conditions and the following disclaimer.
3498+ * * Redistributions in binary form must reproduce the above copyright
3499+ * notice, this list of conditions and the following disclaimer in the
3500+ * documentation and/or other materials provided with the distribution.
3501+ * * Neither the name of the NVIDIA CORPORATION nor the
3502+ * names of its contributors may be used to endorse or promote products
3503+ * derived from this software without specific prior written permission.
3504+ *
3505+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
3506+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3507+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
3508+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
3509+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
3510+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
3511+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
3512+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3513+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3514+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3515+ *
3516+ ******************************************************************************/
3517+
3518+// Benchmark utils
3519+#include "../../bench_utils/bench_utils.hpp"
3520+
3521+// rocThrust
3522+#include <thrust/device_vector.h>
3523+#include <thrust/execution_policy.h>
3524+#include <thrust/for_each.h>
3525+
3526+// Google Benchmark
3527+#include <benchmark/benchmark.h>
3528+
3529+// STL
3530+#include <cstddef>
3531+#include <string>
3532+#include <vector>
3533+
3534+template <class T>
3535+struct square_t
3536+{
3537+ __device__ void operator()(T& x) const
3538+ {
3539+ x = x * x;
3540+ }
3541+};
3542+
3543+struct basic
3544+{
3545+ template <typename T, typename OpT, typename Policy>
3546+ float64_t run(thrust::device_vector<T>& in, OpT op, Policy policy)
3547+ {
3548+ thrust::for_each(policy, in.begin(), in.end(), op);
3549+
3550+ bench_utils::gpu_timer d_timer;
3551+
3552+ d_timer.start(0);
3553+ thrust::for_each(policy, in.begin(), in.end(), op);
3554+ d_timer.stop(0);
3555+
3556+ return d_timer.get_duration();
3557+ }
3558+};
3559+
3560+template <class Benchmark, class T>
3561+void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string /*seed_type*/)
3562+{
3563+ // Benchmark object
3564+ Benchmark benchmark{};
3565+
3566+ // GPU times
3567+ std::vector<double> gpu_times;
3568+
3569+ // Generate input
3570+ thrust::device_vector<T> in(elements, T{1});
3571+
3572+ square_t<T> op{};
3573+ bench_utils::caching_allocator_t alloc{};
3574+ thrust::detail::device_t policy{};
3575+
3576+ for (auto _ : state)
3577+ {
3578+ float64_t duration = benchmark.template run<T>(in, op, policy(alloc));
3579+ state.SetIterationTime(duration);
3580+ gpu_times.push_back(duration);
3581+ }
3582+
3583+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
3584+ // is the achieved global memory bandwidth.
3585+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
3586+ state.SetItemsProcessed(state.iterations() * elements);
3587+
3588+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
3589+ state.counters["gpu_noise"] = gpu_cv;
3590+}
3591+
3592+#define CREATE_BENCHMARK(T, Elements) \
3593+ benchmark::RegisterBenchmark( \
3594+ bench_utils::bench_naming::format_name( \
3595+ "{algo:for_each,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
3596+ .c_str(), \
3597+ run_benchmark<Benchmark, T>, \
3598+ Elements, \
3599+ seed_type)
3600+
3601+#define BENCHMARK_TYPE(type) \
3602+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
3603+ CREATE_BENCHMARK(type, 1 << 28)
3604+
3605+template <class Benchmark>
3606+void add_benchmarks(
3607+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
3608+{
3609+ std::vector<benchmark::internal::Benchmark*> bs = {
3610+ BENCHMARK_TYPE(int8_t),
3611+ BENCHMARK_TYPE(int16_t),
3612+ BENCHMARK_TYPE(int32_t),
3613+ BENCHMARK_TYPE(int64_t),
3614+#if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
3615+ BENCHMARK_TYPE(int128_t),
3616+#endif
3617+ BENCHMARK_TYPE(float32_t),
3618+ BENCHMARK_TYPE(float64_t)
3619+ };
3620+
3621+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3622+}
3623+
3624+int main(int argc, char* argv[])
3625+{
3626+ cli::Parser parser(argc, argv);
3627+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
3628+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3629+ parser.run_and_exit_if_error();
3630+
3631+ // Parse argv
3632+ benchmark::Initialize(&argc, argv);
3633+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
3634+ const std::string seed_type = parser.get<std::string>("seed");
3635+
3636+ // Benchmark info
3637+ bench_utils::add_common_benchmark_info();
3638+ benchmark::AddCustomContext("seed", seed_type);
3639+
3640+ // Add benchmark
3641+ std::vector<benchmark::internal::Benchmark*> benchmarks;
3642+ add_benchmarks<basic>("basic", benchmarks, seed_type);
3643+
3644+ // Use manual timing
3645+ for (auto& b : benchmarks)
3646+ {
3647+ b->UseManualTime();
3648+ b->Unit(benchmark::kMicrosecond);
3649+ b->MinTime(0.4); // in seconds
3650+ }
3651+
3652+ // Run benchmarks
3653+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3654+
3655+ // Finish
3656+ benchmark::Shutdown();
3657+ return 0;
3658+}
3659diff --git a/benchmarks/bench/inner_product/basic.cu b/benchmark/bench/inner_product/basic.cu
3660similarity index 54%
3661rename from benchmarks/bench/inner_product/basic.cu
3662rename to benchmark/bench/inner_product/basic.cu
3663index 67a9b62..842de89 100644
3664--- a/benchmarks/bench/inner_product/basic.cu
3665+++ b/benchmark/bench/inner_product/basic.cu
3666@@ -1,24 +1,30 @@
3667-// MIT License
3668-//
3669-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
3670-//
3671-// Permission is hereby granted, free of charge, to any person obtaining a copy
3672-// of this software and associated documentation files (the "Software"), to deal
3673-// in the Software without restriction, including without limitation the rights
3674-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3675-// copies of the Software, and to permit persons to whom the Software is
3676-// furnished to do so, subject to the following conditions:
3677-//
3678-// The above copyright notice and this permission notice shall be included in all
3679-// copies or substantial portions of the Software.
3680-//
3681-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3682-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3683-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3684-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3685-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3686-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3687-// SOFTWARE.
3688+/******************************************************************************
3689+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
3690+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
3691+ *
3692+ * Redistribution and use in source and binary forms, with or without
3693+ * modification, are permitted provided that the following conditions are met:
3694+ * * Redistributions of source code must retain the above copyright
3695+ * notice, this list of conditions and the following disclaimer.
3696+ * * Redistributions in binary form must reproduce the above copyright
3697+ * notice, this list of conditions and the following disclaimer in the
3698+ * documentation and/or other materials provided with the distribution.
3699+ * * Neither the name of the NVIDIA CORPORATION nor the
3700+ * names of its contributors may be used to endorse or promote products
3701+ * derived from this software without specific prior written permission.
3702+ *
3703+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
3704+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3705+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
3706+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
3707+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
3708+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
3709+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
3710+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3711+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3712+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3713+ *
3714+ ******************************************************************************/
3715
3716 // Benchmark utils
3717 #include "../../bench_utils/bench_utils.hpp"
3718@@ -32,126 +38,124 @@
3719 #include <benchmark/benchmark.h>
3720
3721 // STL
3722-#include <cstdlib>
3723+#include <cstddef>
3724 #include <string>
3725 #include <vector>
3726
3727 struct basic
3728 {
3729- template <typename T, typename Policy>
3730- float64_t run(thrust::device_vector<T>& lhs, thrust::device_vector<T>& rhs, Policy policy)
3731- {
3732- bench_utils::gpu_timer d_timer;
3733+ template <typename T, typename Policy>
3734+ float64_t run(thrust::device_vector<T>& lhs, thrust::device_vector<T>& rhs, Policy policy)
3735+ {
3736+ thrust::inner_product(policy, lhs.begin(), lhs.end(), rhs.begin(), T{0});
3737
3738- d_timer.start(0);
3739- thrust::inner_product(policy, lhs.cbegin(), lhs.cend(), rhs.begin(), T {0});
3740- d_timer.stop(0);
3741+ bench_utils::gpu_timer d_timer;
3742
3743- return d_timer.get_duration();
3744- }
3745+ d_timer.start(0);
3746+ thrust::inner_product(policy, lhs.begin(), lhs.end(), rhs.begin(), T{0});
3747+ d_timer.stop(0);
3748+
3749+ return d_timer.get_duration();
3750+ }
3751 };
3752
3753 template <class Benchmark, class T>
3754 void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string seed_type)
3755 {
3756- // Benchmark object
3757- Benchmark benchmark {};
3758-
3759- // GPU times
3760- std::vector<double> gpu_times;
3761-
3762- // Generate input
3763- thrust::device_vector<T> generator = bench_utils::generate(elements, seed_type);
3764- thrust::device_vector<T> lhs = generator;
3765- thrust::device_vector<T> rhs = generator;
3766-
3767- bench_utils::caching_allocator_t alloc {};
3768- thrust::detail::device_t policy {};
3769-
3770- for(auto _ : state)
3771- {
3772- float64_t duration = benchmark.template run<T>(lhs, rhs, policy(alloc));
3773- state.SetIterationTime(duration);
3774- gpu_times.push_back(duration);
3775- }
3776-
3777- // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
3778- // it will actually be the global memory bandwidth gotten.
3779- state.SetBytesProcessed(state.iterations() * (2 * elements + 1) * sizeof(T));
3780- state.SetItemsProcessed(state.iterations() * elements);
3781-
3782- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
3783- state.counters["gpu_noise"] = gpu_cv;
3784+ // Benchmark object
3785+ Benchmark benchmark{};
3786+
3787+ // GPU times
3788+ std::vector<double> gpu_times;
3789+
3790+ // Generate input
3791+ thrust::device_vector<T> generator = bench_utils::generate(elements, seed_type);
3792+ thrust::device_vector<T> lhs = generator;
3793+ thrust::device_vector<T> rhs = generator;
3794+
3795+ bench_utils::caching_allocator_t alloc{};
3796+ thrust::detail::device_t policy{};
3797+
3798+ for (auto _ : state)
3799+ {
3800+ float64_t duration = benchmark.template run<T>(lhs, rhs, policy(alloc));
3801+ state.SetIterationTime(duration);
3802+ gpu_times.push_back(duration);
3803+ }
3804+
3805+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
3806+ // is the achieved global memory bandwidth.
3807+ state.SetBytesProcessed(state.iterations() * (2 * elements + 1) * sizeof(T));
3808+ state.SetItemsProcessed(state.iterations() * elements);
3809+
3810+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
3811+ state.counters["gpu_noise"] = gpu_cv;
3812 }
3813
3814-#define CREATE_BENCHMARK(T, Elements) \
3815- benchmark::RegisterBenchmark( \
3816- bench_utils::bench_naming::format_name("{algo:inner_product,subalgo:" + name \
3817- + ",input_type:" #T + ",elements:" #Elements) \
3818- .c_str(), \
3819- run_benchmark<Benchmark, T>, \
3820- Elements, \
3821- seed_type)
3822+#define CREATE_BENCHMARK(T, Elements) \
3823+ benchmark::RegisterBenchmark( \
3824+ bench_utils::bench_naming::format_name( \
3825+ "{algo:inner_product,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
3826+ .c_str(), \
3827+ run_benchmark<Benchmark, T>, \
3828+ Elements, \
3829+ seed_type)
3830
3831-#define BENCHMARK_TYPE(type) \
3832- CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), \
3833- CREATE_BENCHMARK(type, 1 << 24), CREATE_BENCHMARK(type, 1 << 28)
3834+#define BENCHMARK_TYPE(type) \
3835+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
3836+ CREATE_BENCHMARK(type, 1 << 28)
3837
3838 template <class Benchmark>
3839-void add_benchmarks(const std::string& name,
3840- std::vector<benchmark::internal::Benchmark*>& benchmarks,
3841- const std::string seed_type)
3842+void add_benchmarks(
3843+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
3844 {
3845- std::vector<benchmark::internal::Benchmark*> bs
3846- = { BENCHMARK_TYPE(int8_t),
3847- BENCHMARK_TYPE(int16_t),
3848- BENCHMARK_TYPE(int32_t),
3849- BENCHMARK_TYPE(int64_t)
3850+ std::vector<benchmark::internal::Benchmark*> bs = {
3851+ BENCHMARK_TYPE(int8_t),
3852+ BENCHMARK_TYPE(int16_t),
3853+ BENCHMARK_TYPE(int32_t),
3854+ BENCHMARK_TYPE(int64_t),
3855 #if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
3856- ,
3857- BENCHMARK_TYPE(int128_t)
3858+ BENCHMARK_TYPE(int128_t),
3859 #endif
3860- ,
3861- BENCHMARK_TYPE(float32_t),
3862- BENCHMARK_TYPE(float64_t) };
3863+ BENCHMARK_TYPE(float32_t),
3864+ BENCHMARK_TYPE(float64_t)
3865+ };
3866
3867- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3868+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
3869 }
3870
3871 int main(int argc, char* argv[])
3872 {
3873- cli::Parser parser(argc, argv);
3874- parser.set_optional<std::string>(
3875- "name_format", "name_format", "human", "either: json,human,txt");
3876- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3877- parser.run_and_exit_if_error();
3878-
3879- // Parse argv
3880- benchmark::Initialize(&argc, argv);
3881- bench_utils::bench_naming::set_format(
3882- parser.get<std::string>("name_format")); /* either: json,human,txt */
3883- const std::string seed_type = parser.get<std::string>("seed");
3884-
3885- // Benchmark info
3886- bench_utils::add_common_benchmark_info();
3887- benchmark::AddCustomContext("seed", seed_type);
3888-
3889- // Add benchmark
3890- std::vector<benchmark::internal::Benchmark*> benchmarks;
3891- add_benchmarks<basic>("basic", benchmarks, seed_type);
3892-
3893- // Use manual timing
3894- for(auto& b : benchmarks)
3895- {
3896- b->UseManualTime();
3897- b->Unit(benchmark::kMicrosecond);
3898- b->MinTime(0.4); // in seconds
3899- }
3900-
3901- // Run benchmarks
3902- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3903-
3904- // Finish
3905- benchmark::Shutdown();
3906- return 0;
3907+ cli::Parser parser(argc, argv);
3908+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
3909+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
3910+ parser.run_and_exit_if_error();
3911+
3912+ // Parse argv
3913+ benchmark::Initialize(&argc, argv);
3914+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
3915+ const std::string seed_type = parser.get<std::string>("seed");
3916+
3917+ // Benchmark info
3918+ bench_utils::add_common_benchmark_info();
3919+ benchmark::AddCustomContext("seed", seed_type);
3920+
3921+ // Add benchmark
3922+ std::vector<benchmark::internal::Benchmark*> benchmarks;
3923+ add_benchmarks<basic>("basic", benchmarks, seed_type);
3924+
3925+ // Use manual timing
3926+ for (auto& b : benchmarks)
3927+ {
3928+ b->UseManualTime();
3929+ b->Unit(benchmark::kMicrosecond);
3930+ b->MinTime(0.4); // in seconds
3931+ }
3932+
3933+ // Run benchmarks
3934+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
3935+
3936+ // Finish
3937+ benchmark::Shutdown();
3938+ return 0;
3939 }
3940diff --git a/benchmark/bench/merge/basic.cu b/benchmark/bench/merge/basic.cu
3941new file mode 100644
3942index 0000000..e8bfb4a
3943--- /dev/null
3944+++ b/benchmark/bench/merge/basic.cu
3945@@ -0,0 +1,187 @@
3946+/******************************************************************************
3947+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
3948+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
3949+ *
3950+ * Redistribution and use in source and binary forms, with or without
3951+ * modification, are permitted provided that the following conditions are met:
3952+ * * Redistributions of source code must retain the above copyright
3953+ * notice, this list of conditions and the following disclaimer.
3954+ * * Redistributions in binary form must reproduce the above copyright
3955+ * notice, this list of conditions and the following disclaimer in the
3956+ * documentation and/or other materials provided with the distribution.
3957+ * * Neither the name of the NVIDIA CORPORATION nor the
3958+ * names of its contributors may be used to endorse or promote products
3959+ * derived from this software without specific prior written permission.
3960+ *
3961+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
3962+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3963+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
3964+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
3965+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
3966+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
3967+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
3968+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3969+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3970+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3971+ *
3972+ ******************************************************************************/
3973+
3974+// Benchmark utils
3975+#include "../../bench_utils/bench_utils.hpp"
3976+
3977+// rocThrust
3978+#include <thrust/device_vector.h>
3979+#include <thrust/execution_policy.h>
3980+#include <thrust/merge.h>
3981+#include <thrust/sort.h>
3982+
3983+// Google Benchmark
3984+#include <benchmark/benchmark.h>
3985+
3986+// STL
3987+#include <cstddef>
3988+#include <string>
3989+#include <vector>
3990+
3991+struct basic // Benchmark case: one thrust::merge of the two adjacent parts of a single input vector.
3992+{
3993+ template <typename T, typename Policy>
3994+ float64_t
3995+ run(thrust::device_vector<T>& in, thrust::device_vector<T>& out, const std::size_t elements_in_lhs, Policy policy)
3996+ { // Merges [begin, begin + elements_in_lhs) with [begin + elements_in_lhs, end) of `in` into `out`.
3997+ thrust::merge( // Untimed call with the same arguments as the timed one below (presumably a warm-up run).
3998+ policy, in.cbegin(), in.cbegin() + elements_in_lhs, in.cbegin() + elements_in_lhs, in.cend(), out.begin());
3999+
4000+ bench_utils::gpu_timer d_timer;
4001+
4002+ d_timer.start(0); // Timed region begins; the 0 is forwarded to gpu_timer (presumably a stream id - TODO confirm).
4003+ thrust::merge(
4004+ policy, in.cbegin(), in.cbegin() + elements_in_lhs, in.cbegin() + elements_in_lhs, in.cend(), out.begin());
4005+ d_timer.stop(0); // Timed region ends.
4006+
4007+ return d_timer.get_duration(); // The caller passes this to SetIterationTime; units are defined by gpu_timer.
4008+ }
4009+};
4010+// Runs one merge configuration: builds two sorted parts in one vector and records Benchmark::run's time per iteration.
4011+template <class Benchmark, class T>
4012+void run_benchmark(benchmark::State& state,
4013+ const std::size_t elements,
4014+ const std::string seed_type,
4015+ const int entropy_reduction,
4016+ const std::size_t input_size_ratio)
4017+{
4018+ // Benchmark object: default-constructed case whose run<T>() is called in the loop below
4019+ Benchmark benchmark{};
4020+
4021+ // GPU times: one duration per iteration, used for the gpu_noise counter at the end
4022+ std::vector<double> gpu_times;
4023+
4024+ // Generate input: entropy is the percentage as a fraction; the left part gets input_size_ratio percent of elements
4025+ const auto entropy = bench_utils::get_entropy_percentage(entropy_reduction) / 100.0f;
4026+ const auto elements_in_lhs = static_cast<std::size_t>(static_cast<double>(input_size_ratio * elements) / 100.0);
4027+
4028+ thrust::device_vector<T> in = bench_utils::generate(elements, seed_type, entropy);
4029+
4030+ thrust::sort(in.begin(), in.begin() + elements_in_lhs); // Each part is sorted on its own because
4031+ thrust::sort(in.begin() + elements_in_lhs, in.end()); // thrust::merge expects two sorted input ranges.
4032+
4033+ // Output: the merged result holds as many elements as the input
4034+ thrust::device_vector<T> out(elements);
4035+
4036+ bench_utils::caching_allocator_t alloc{};
4037+ thrust::detail::device_t policy{};
4038+
4039+ for (auto _ : state)
4040+ {
4041+ float64_t duration = benchmark.template run<T>(in, out, elements_in_lhs, policy(alloc));
4042+ state.SetIterationTime(duration); // Report the measured duration as this iteration's manual time.
4043+ gpu_times.push_back(duration);
4044+ }
4045+
4046+ // BytesProcessed includes both read and written bytes (hence the factor 2), so the reported
4047+ // BytesProcessed/s is effectively the achieved global memory bandwidth.
4048+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
4049+ state.SetItemsProcessed(state.iterations() * elements);
4050+
4051+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times); // presumably the coefficient of variation - TODO confirm
4052+ state.counters["gpu_noise"] = gpu_cv;
4053+}
4054+// CREATE_BENCHMARK registers run_benchmark<Benchmark, T>; it relies on `name`, `seed_type` and `Benchmark` from add_benchmarks.
4055+#define CREATE_BENCHMARK(T, Elements, EntropyReduction, InputSizeRatio) \
4056+ benchmark::RegisterBenchmark( \
4057+ bench_utils::bench_naming::format_name( \
4058+ "{algo:merge,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements + ",entropy:" \
4059+ + std::to_string(bench_utils::get_entropy_percentage(EntropyReduction)) + ",input_size_ratio:" #InputSizeRatio) \
4060+ .c_str(), \
4061+ run_benchmark<Benchmark, T>, \
4062+ Elements, \
4063+ seed_type, \
4064+ EntropyReduction, \
4065+ InputSizeRatio)
4066+// BENCHMARK_ELEMENTS expands to three registrations, one each for input_size_ratio 25, 50 and 75.
4067+#define BENCHMARK_ELEMENTS(type, elements, entropy) \
4068+ CREATE_BENCHMARK(type, elements, entropy, 25), CREATE_BENCHMARK(type, elements, entropy, 50), \
4069+ CREATE_BENCHMARK(type, elements, entropy, 75)
4070+// BENCHMARK_TYPE_ENTROPY expands BENCHMARK_ELEMENTS for 1 << 16, 1 << 20, 1 << 24 and 1 << 28 elements.
4071+#define BENCHMARK_TYPE_ENTROPY(type, entropy) \
4072+ BENCHMARK_ELEMENTS(type, 1 << 16, entropy), BENCHMARK_ELEMENTS(type, 1 << 20, entropy), \
4073+ BENCHMARK_ELEMENTS(type, 1 << 24, entropy), BENCHMARK_ELEMENTS(type, 1 << 28, entropy)
4074+// Appends to `benchmarks` one registration per entropy reduction, element type, element count and split ratio.
4075+template <class Benchmark>
4076+void add_benchmarks(
4077+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
4078+{
4079+ constexpr int entropy_reductions[] = {0, 4}; // reduction levels; per the original note: entropy 1.000, 0.201
4080+
4081+ for (int entropy_reduction : entropy_reductions)
4082+ {
4083+ std::vector<benchmark::internal::Benchmark*> bs = {
4084+ BENCHMARK_TYPE_ENTROPY(int8_t, entropy_reduction),
4085+ BENCHMARK_TYPE_ENTROPY(int16_t, entropy_reduction),
4086+ BENCHMARK_TYPE_ENTROPY(int32_t, entropy_reduction),
4087+ BENCHMARK_TYPE_ENTROPY(int64_t, entropy_reduction),
4088+#if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT // int128_t cases are registered only when this macro is nonzero
4089+ BENCHMARK_TYPE_ENTROPY(int128_t, entropy_reduction),
4090+#endif
4091+ BENCHMARK_TYPE_ENTROPY(float, entropy_reduction),
4092+ BENCHMARK_TYPE_ENTROPY(double, entropy_reduction)
4093+ };
4094+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); // add this entropy level's registrations to the caller's list
4095+ }
4096+}
4097+// Entry point: parses the custom options, registers the merge benchmarks with manual timing, then runs them.
4098+int main(int argc, char* argv[])
4099+{
4100+ cli::Parser parser(argc, argv);
4101+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
4102+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
4103+ parser.run_and_exit_if_error();
4104+
4105+ // Parse argv: initialize Google Benchmark from the command line, then read the name format and seed options
4106+ benchmark::Initialize(&argc, argv);
4107+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
4108+ const std::string seed_type = parser.get<std::string>("seed");
4109+
4110+ // Benchmark info: add the common info and the seed to the benchmark context
4111+ bench_utils::add_common_benchmark_info();
4112+ benchmark::AddCustomContext("seed", seed_type);
4113+
4114+ // Add benchmark: register every configuration of the `basic` case
4115+ std::vector<benchmark::internal::Benchmark*> benchmarks;
4116+ add_benchmarks<basic>("basic", benchmarks, seed_type);
4117+
4118+ // Use manual timing: the reported time is whatever run_benchmark passes to SetIterationTime
4119+ for (auto& b : benchmarks)
4120+ {
4121+ b->UseManualTime();
4122+ b->Unit(benchmark::kMicrosecond);
4123+ b->MinTime(0.4); // in seconds
4124+ }
4125+
4126+ // Run benchmarks with the reporter returned by bench_utils::ChooseCustomReporter()
4127+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
4128+
4129+ // Finish: shut down Google Benchmark and report success
4130+ benchmark::Shutdown();
4131+ return 0;
4132+}
4133diff --git a/benchmarks/bench/partition/basic.cu b/benchmark/bench/partition/basic.cu
4134similarity index 52%
4135rename from benchmarks/bench/partition/basic.cu
4136rename to benchmark/bench/partition/basic.cu
4137index dfa5de1..9a9aa62 100644
4138--- a/benchmarks/bench/partition/basic.cu
4139+++ b/benchmark/bench/partition/basic.cu
4140@@ -1,189 +1,201 @@
4141-// MIT License
4142-//
4143-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4144-//
4145-// Permission is hereby granted, free of charge, to any person obtaining a copy
4146-// of this software and associated documentation files (the "Software"), to deal
4147-// in the Software without restriction, including without limitation the rights
4148-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4149-// copies of the Software, and to permit persons to whom the Software is
4150-// furnished to do so, subject to the following conditions:
4151-//
4152-// The above copyright notice and this permission notice shall be included in all
4153-// copies or substantial portions of the Software.
4154-//
4155-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4156-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4157-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4158-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4159-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4160-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4161-// SOFTWARE.
4162+/******************************************************************************
4163+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
4164+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
4165+ *
4166+ * Redistribution and use in source and binary forms, with or without
4167+ * modification, are permitted provided that the following conditions are met:
4168+ * * Redistributions of source code must retain the above copyright
4169+ * notice, this list of conditions and the following disclaimer.
4170+ * * Redistributions in binary form must reproduce the above copyright
4171+ * notice, this list of conditions and the following disclaimer in the
4172+ * documentation and/or other materials provided with the distribution.
4173+ * * Neither the name of the NVIDIA CORPORATION nor the
4174+ * names of its contributors may be used to endorse or promote products
4175+ * derived from this software without specific prior written permission.
4176+ *
4177+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
4178+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
4179+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
4180+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
4181+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
4182+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
4183+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
4184+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
4185+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4186+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4187+ *
4188+ ******************************************************************************/
4189
4190 // Benchmark utils
4191 #include "../../bench_utils/bench_utils.hpp"
4192
4193 // rocThrust
4194-#include <thrust/copy.h>
4195-#include <thrust/count.h>
4196 #include <thrust/device_vector.h>
4197 #include <thrust/execution_policy.h>
4198-#include <thrust/iterator/reverse_iterator.h>
4199+#include <thrust/partition.h>
4200
4201 // Google Benchmark
4202 #include <benchmark/benchmark.h>
4203
4204 // STL
4205-#include <cstdlib>
4206+#include <cstddef>
4207 #include <string>
4208 #include <vector>
4209
4210 template <class T>
4211 struct less_then_t
4212 {
4213- T m_val;
4214+ T m_val;
4215
4216- __host__ __device__ bool operator()(const T& val) const
4217- {
4218- return val < m_val;
4219- }
4220+ __host__ __device__ bool operator()(const T& val) const
4221+ {
4222+ return val < m_val;
4223+ }
4224 };
4225
4226 struct basic
4227 {
4228- template <typename T, typename Policy>
4229- float64_t run(thrust::device_vector<T>& input,
4230- thrust::device_vector<T>& output,
4231- const std::size_t elements,
4232- less_then_t<T> select_op,
4233- Policy policy)
4234- {
4235- bench_utils::gpu_timer d_timer;
4236-
4237- d_timer.start(0);
4238- thrust::copy_if(policy,
4239- input.cbegin(),
4240- input.cend(),
4241- output.begin(),
4242- thrust::make_reverse_iterator(output.begin() + elements),
4243- select_op);
4244- d_timer.stop(0);
4245-
4246- return d_timer.get_duration();
4247- }
4248+ template <typename T, typename Policy>
4249+ float64_t run(thrust::device_vector<T>& input,
4250+ thrust::device_vector<T>& output,
4251+ const std::size_t elements,
4252+ less_then_t<T> select_op,
4253+ Policy policy)
4254+ {
4255+ thrust::partition_copy(
4256+ policy,
4257+ input.cbegin(),
4258+ input.cend(),
4259+ output.begin(),
4260+ thrust::make_reverse_iterator(output.begin() + elements),
4261+ select_op);
4262+
4263+ bench_utils::gpu_timer d_timer;
4264+
4265+ d_timer.start(0);
4266+ thrust::partition_copy(
4267+ policy,
4268+ input.cbegin(),
4269+ input.cend(),
4270+ output.begin(),
4271+ thrust::make_reverse_iterator(output.begin() + elements),
4272+ select_op);
4273+ d_timer.stop(0);
4274+
4275+ return d_timer.get_duration();
4276+ }
4277 };
4278
4279 template <class Benchmark, class T>
4280-void run_benchmark(benchmark::State& state,
4281- const std::size_t elements,
4282- const std::string seed_type,
4283- const int entropy_reduction)
4284+void run_benchmark(
4285+ benchmark::State& state, const std::size_t elements, const std::string seed_type, const int entropy_reduction)
4286 {
4287- // Benchmark object
4288- Benchmark benchmark {};
4289+ using select_op_t = less_then_t<T>;
4290
4291- // GPU times
4292- std::vector<double> gpu_times;
4293+ // Benchmark object
4294+ Benchmark benchmark{};
4295
4296- // Generate input
4297- T val = bench_utils::value_from_entropy<T>(
4298- bench_utils::get_entropy_percentage(entropy_reduction));
4299- less_then_t<T> select_op {val};
4300+ // GPU times
4301+ std::vector<double> gpu_times;
4302
4303- thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
4304+ // Generate input
4305+ T val = bench_utils::value_from_entropy<T>(bench_utils::get_entropy_percentage(entropy_reduction));
4306+ select_op_t select_op{val};
4307
4308- // Output
4309- thrust::device_vector<T> output(elements);
4310+ thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
4311
4312- bench_utils::caching_allocator_t alloc {};
4313- thrust::detail::device_t policy {};
4314+ // Output
4315+ thrust::device_vector<T> output(elements);
4316
4317- for(auto _ : state)
4318- {
4319- float64_t duration
4320- = benchmark.template run<T>(input, output, elements, select_op, policy(alloc));
4321- state.SetIterationTime(duration);
4322- gpu_times.push_back(duration);
4323- }
4324+ bench_utils::caching_allocator_t alloc{};
4325+ thrust::detail::device_t policy{};
4326
4327- // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
4328- // it will actually be the global memory bandwidth gotten.
4329- state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
4330- state.SetItemsProcessed(state.iterations() * elements);
4331+ for (auto _ : state)
4332+ {
4333+ float64_t duration = benchmark.template run<T>(input, output, elements, select_op, policy(alloc));
4334+ state.SetIterationTime(duration);
4335+ gpu_times.push_back(duration);
4336+ }
4337
4338- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
4339- state.counters["gpu_noise"] = gpu_cv;
4340+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
4341+ // is effectively the achieved global memory bandwidth.
4342+ state.SetBytesProcessed(state.iterations() * 2 * elements * sizeof(T));
4343+ state.SetItemsProcessed(state.iterations() * elements);
4344+
4345+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
4346+ state.counters["gpu_noise"] = gpu_cv;
4347 }
4348
4349-#define CREATE_BENCHMARK(T, Elements, EntropyReduction) \
4350- benchmark::RegisterBenchmark( \
4351- bench_utils::bench_naming::format_name( \
4352- "{algo:partition,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements \
4353- + ",entropy:" + std::to_string(bench_utils::get_entropy_percentage(EntropyReduction))) \
4354- .c_str(), \
4355- run_benchmark<Benchmark, T>, \
4356- Elements, \
4357- seed_type, \
4358- EntropyReduction)
4359-
4360-#define BENCHMARK_TYPE_ENTROPY(type, entropy) \
4361- CREATE_BENCHMARK(type, 1 << 16, entropy), CREATE_BENCHMARK(type, 1 << 20, entropy), \
4362- CREATE_BENCHMARK(type, 1 << 24, entropy), CREATE_BENCHMARK(type, 1 << 28, entropy)
4363+#define CREATE_BENCHMARK(T, Elements, EntropyReduction) \
4364+ benchmark::RegisterBenchmark( \
4365+ bench_utils::bench_naming::format_name( \
4366+ "{algo:partition,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements \
4367+ + ",entropy:" + std::to_string(bench_utils::get_entropy_percentage(EntropyReduction))) \
4368+ .c_str(), \
4369+ run_benchmark<Benchmark, T>, \
4370+ Elements, \
4371+ seed_type, \
4372+ EntropyReduction)
4373+
4374+#define BENCHMARK_TYPE_ENTROPY(type, entropy) \
4375+ CREATE_BENCHMARK(type, 1 << 16, entropy), CREATE_BENCHMARK(type, 1 << 20, entropy), \
4376+ CREATE_BENCHMARK(type, 1 << 24, entropy), CREATE_BENCHMARK(type, 1 << 28, entropy)
4377
4378 template <class Benchmark>
4379-void add_benchmarks(const std::string& name,
4380- std::vector<benchmark::internal::Benchmark*>& benchmarks,
4381- const std::string seed_type)
4382+void add_benchmarks(
4383+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
4384 {
4385- constexpr int entropy_reductions[] = {0, 2, 6}; // 1.000, 0.544, 0.000;
4386-
4387- for(int entropy_reduction : entropy_reductions)
4388- {
4389- std::vector<benchmark::internal::Benchmark*> bs
4390- = {BENCHMARK_TYPE_ENTROPY(int8_t, entropy_reduction),
4391- BENCHMARK_TYPE_ENTROPY(int16_t, entropy_reduction),
4392- BENCHMARK_TYPE_ENTROPY(int32_t, entropy_reduction),
4393- BENCHMARK_TYPE_ENTROPY(int64_t, entropy_reduction),
4394- BENCHMARK_TYPE_ENTROPY(float, entropy_reduction),
4395- BENCHMARK_TYPE_ENTROPY(double, entropy_reduction)};
4396- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
4397- }
4398+ constexpr int entropy_reductions[] = {0, 2, 4200}; // 1.000, 0.544, 0.000;
4399+
4400+ for (int entropy_reduction : entropy_reductions)
4401+ {
4402+ std::vector<benchmark::internal::Benchmark*> bs = {
4403+ BENCHMARK_TYPE_ENTROPY(int8_t, entropy_reduction),
4404+ BENCHMARK_TYPE_ENTROPY(int16_t, entropy_reduction),
4405+ BENCHMARK_TYPE_ENTROPY(int32_t, entropy_reduction),
4406+ BENCHMARK_TYPE_ENTROPY(int64_t, entropy_reduction),
4407+#if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
4408+ BENCHMARK_TYPE_ENTROPY(int128_t, entropy_reduction),
4409+#endif
4410+ BENCHMARK_TYPE_ENTROPY(float, entropy_reduction),
4411+ BENCHMARK_TYPE_ENTROPY(double, entropy_reduction)
4412+ };
4413+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
4414+ }
4415 }
4416
4417 int main(int argc, char* argv[])
4418 {
4419- cli::Parser parser(argc, argv);
4420- parser.set_optional<std::string>(
4421- "name_format", "name_format", "human", "either: json,human,txt");
4422- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
4423- parser.run_and_exit_if_error();
4424-
4425- // Parse argv
4426- benchmark::Initialize(&argc, argv);
4427- bench_utils::bench_naming::set_format(
4428- parser.get<std::string>("name_format")); /* either: json,human,txt */
4429- const std::string seed_type = parser.get<std::string>("seed");
4430-
4431- // Benchmark info
4432- bench_utils::add_common_benchmark_info();
4433- benchmark::AddCustomContext("seed", seed_type);
4434-
4435- // Add benchmark
4436- std::vector<benchmark::internal::Benchmark*> benchmarks;
4437- add_benchmarks<basic>("basic", benchmarks, seed_type);
4438-
4439- // Use manual timing
4440- for(auto& b : benchmarks)
4441- {
4442- b->UseManualTime();
4443- b->Unit(benchmark::kMicrosecond);
4444- b->MinTime(0.4); // in seconds
4445- }
4446-
4447- // Run benchmarks
4448- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
4449-
4450- // Finish
4451- benchmark::Shutdown();
4452- return 0;
4453+ cli::Parser parser(argc, argv);
4454+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
4455+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
4456+ parser.run_and_exit_if_error();
4457+
4458+ // Parse argv
4459+ benchmark::Initialize(&argc, argv);
4460+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
4461+ const std::string seed_type = parser.get<std::string>("seed");
4462+
4463+ // Benchmark info
4464+ bench_utils::add_common_benchmark_info();
4465+ benchmark::AddCustomContext("seed", seed_type);
4466+
4467+ // Add benchmark
4468+ std::vector<benchmark::internal::Benchmark*> benchmarks;
4469+ add_benchmarks<basic>("basic", benchmarks, seed_type);
4470+
4471+ // Use manual timing
4472+ for (auto& b : benchmarks)
4473+ {
4474+ b->UseManualTime();
4475+ b->Unit(benchmark::kMicrosecond);
4476+ b->MinTime(0.4); // in seconds
4477+ }
4478+
4479+ // Run benchmarks
4480+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
4481+
4482+ // Finish
4483+ benchmark::Shutdown();
4484+ return 0;
4485 }
4486diff --git a/benchmarks/bench/transform_reduce/sum.cu b/benchmark/bench/reduce/basic.cu
4487similarity index 54%
4488rename from benchmarks/bench/transform_reduce/sum.cu
4489rename to benchmark/bench/reduce/basic.cu
4490index 6dc60ec..f08025b 100644
4491--- a/benchmarks/bench/transform_reduce/sum.cu
4492+++ b/benchmark/bench/reduce/basic.cu
4493@@ -1,24 +1,30 @@
4494-// MIT License
4495-//
4496-// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4497-//
4498-// Permission is hereby granted, free of charge, to any person obtaining a copy
4499-// of this software and associated documentation files (the "Software"), to deal
4500-// in the Software without restriction, including without limitation the rights
4501-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4502-// copies of the Software, and to permit persons to whom the Software is
4503-// furnished to do so, subject to the following conditions:
4504-//
4505-// The above copyright notice and this permission notice shall be included in all
4506-// copies or substantial portions of the Software.
4507-//
4508-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4509-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4510-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4511-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4512-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4513-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4514-// SOFTWARE.
4515+/******************************************************************************
4516+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
4517+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
4518+ *
4519+ * Redistribution and use in source and binary forms, with or without
4520+ * modification, are permitted provided that the following conditions are met:
4521+ * * Redistributions of source code must retain the above copyright
4522+ * notice, this list of conditions and the following disclaimer.
4523+ * * Redistributions in binary form must reproduce the above copyright
4524+ * notice, this list of conditions and the following disclaimer in the
4525+ * documentation and/or other materials provided with the distribution.
4526+ * * Neither the name of the NVIDIA CORPORATION nor the
4527+ * names of its contributors may be used to endorse or promote products
4528+ * derived from this software without specific prior written permission.
4529+ *
4530+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
4531+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
4532+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
4533+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
4534+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
4535+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
4536+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
4537+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
4538+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4539+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4540+ *
4541+ ******************************************************************************/
4542
4543 // Benchmark utils
4544 #include "../../bench_utils/bench_utils.hpp"
4545@@ -32,125 +38,121 @@
4546 #include <benchmark/benchmark.h>
4547
4548 // STL
4549-#include <cstdlib>
4550+#include <cstddef>
4551 #include <string>
4552 #include <vector>
4553
4554-struct sum
4555+struct basic
4556 {
4557- template <typename T, typename Policy>
4558- float64_t run(thrust::device_vector<T>& input, Policy policy)
4559- {
4560- bench_utils::gpu_timer d_timer;
4561-
4562- d_timer.start(0);
4563- bench_utils::do_not_optimize(
4564- thrust::reduce(policy, input.begin(), input.end(), T {}, thrust::plus<T> {}));
4565- d_timer.stop(0);
4566-
4567- return d_timer.get_duration();
4568- }
4569+ template <typename T, typename Policy>
4570+ float64_t run(thrust::device_vector<T>& in, Policy policy)
4571+ {
4572+ bench_utils::do_not_optimize(thrust::reduce(policy, in.begin(), in.end()));
4573+
4574+ bench_utils::gpu_timer d_timer;
4575+
4576+ d_timer.start(0);
4577+ bench_utils::do_not_optimize(thrust::reduce(policy, in.begin(), in.end()));
4578+ d_timer.stop(0);
4579+
4580+ return d_timer.get_duration();
4581+ }
4582 };
4583
4584 template <class Benchmark, class T>
4585 void run_benchmark(benchmark::State& state, const std::size_t elements, const std::string seed_type)
4586 {
4587- // Benchmark object
4588- Benchmark benchmark {};
4589+ // Benchmark object
4590+ Benchmark benchmark{};
4591
4592- // GPU times
4593- std::vector<double> gpu_times;
4594+ // GPU times
4595+ std::vector<double> gpu_times;
4596
4597- // Generate input
4598- thrust::device_vector<T> input = bench_utils::generate(elements, seed_type);
4599+ // Generate input
4600+ thrust::device_vector<T> in = bench_utils::generate(elements, seed_type);
4601
4602- bench_utils::caching_allocator_t alloc {};
4603- thrust::detail::device_t policy {};
4604+ bench_utils::caching_allocator_t alloc{};
4605+ thrust::detail::device_t policy{};
4606
4607- for(auto _ : state)
4608- {
4609- float64_t duration = benchmark.template run<T>(input, policy(alloc));
4610- state.SetIterationTime(duration);
4611- gpu_times.push_back(duration);
4612- }
4613+ for (auto _ : state)
4614+ {
4615+ float64_t duration = benchmark.template run<T>(in, policy(alloc));
4616+ state.SetIterationTime(duration);
4617+ gpu_times.push_back(duration);
4618+ }
4619
4620- // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
4621- // it will actually be the global memory bandwidth gotten.
4622- state.SetBytesProcessed(state.iterations() * (elements + 1) * sizeof(T));
4623- state.SetItemsProcessed(state.iterations() * elements);
4624+ // BytesProcessed includes both read and written bytes, so the reported BytesProcessed/s
4625+ // is effectively the achieved global memory bandwidth.
4626+ state.SetBytesProcessed(state.iterations() * (elements + 1) * sizeof(T));
4627+ state.SetItemsProcessed(state.iterations() * elements);
4628
4629- const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
4630- state.counters["gpu_noise"] = gpu_cv;
4631+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
4632+ state.counters["gpu_noise"] = gpu_cv;
4633 }
4634
4635-#define CREATE_BENCHMARK(T, Elements) \
4636- benchmark::RegisterBenchmark( \
4637- bench_utils::bench_naming::format_name("{algo:transform_reduce,subalgo:" + name \
4638- + ",input_type:" #T + ",elements:" #Elements) \
4639- .c_str(), \
4640- run_benchmark<Benchmark, T>, \
4641- Elements, \
4642- seed_type)
4643+#define CREATE_BENCHMARK(T, Elements) \
4644+ benchmark::RegisterBenchmark( \
4645+ bench_utils::bench_naming::format_name("{algo:reduce,subalgo:" + name + ",input_type:" #T + ",elements:" #Elements) \
4646+ .c_str(), \
4647+ run_benchmark<Benchmark, T>, \
4648+ Elements, \
4649+ seed_type)
4650
4651-#define BENCHMARK_TYPE(type) \
4652- CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), \
4653- CREATE_BENCHMARK(type, 1 << 24), CREATE_BENCHMARK(type, 1 << 28)
4654+#define BENCHMARK_TYPE(type) \
4655+ CREATE_BENCHMARK(type, 1 << 16), CREATE_BENCHMARK(type, 1 << 20), CREATE_BENCHMARK(type, 1 << 24), \
4656+ CREATE_BENCHMARK(type, 1 << 28)
4657
4658 template <class Benchmark>
4659-void add_benchmarks(const std::string& name,
4660- std::vector<benchmark::internal::Benchmark*>& benchmarks,
4661- const std::string seed_type)
4662+void add_benchmarks(
4663+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
4664 {
4665- std::vector<benchmark::internal::Benchmark*> bs
4666- = { BENCHMARK_TYPE(int8_t),
4667- BENCHMARK_TYPE(int16_t),
4668- BENCHMARK_TYPE(int32_t),
4669- BENCHMARK_TYPE(int64_t)
4670+ std::vector<benchmark::internal::Benchmark*> bs = {
4671+ BENCHMARK_TYPE(int8_t),
4672+ BENCHMARK_TYPE(int16_t),
4673+ BENCHMARK_TYPE(int32_t),
4674+ BENCHMARK_TYPE(int64_t),
4675 #if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
4676- ,
4677- BENCHMARK_TYPE(int128_t)
4678+ BENCHMARK_TYPE(int128_t),
4679 #endif
4680- ,
4681- BENCHMARK_TYPE(float32_t),
4682- BENCHMARK_TYPE(float64_t) };
4683+ BENCHMARK_TYPE(float32_t),
4684+ BENCHMARK_TYPE(float64_t)
4685+ };
4686
4687- benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
4688+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
4689 }
4690
4691 int main(int argc, char* argv[])
4692 {
4693- cli::Parser parser(argc, argv);
4694- parser.set_optional<std::string>(
4695- "name_format", "name_format", "human", "either: json,human,txt");
4696- parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
4697- parser.run_and_exit_if_error();
4698-
4699- // Parse argv
4700- benchmark::Initialize(&argc, argv);
4701- bench_utils::bench_naming::set_format(
4702- parser.get<std::string>("name_format")); /* either: json,human,txt */
4703- const std::string seed_type = parser.get<std::string>("seed");
4704-
4705- // Benchmark info
4706- bench_utils::add_common_benchmark_info();
4707- benchmark::AddCustomContext("seed", seed_type);
4708-
4709- // Add benchmark
4710- std::vector<benchmark::internal::Benchmark*> benchmarks;
4711- add_benchmarks<sum>("sum", benchmarks, seed_type);
4712-
4713- // Use manual timing
4714- for(auto& b : benchmarks)
4715- {
4716- b->UseManualTime();
4717- b->Unit(benchmark::kMicrosecond);
4718- b->MinTime(0.4); // in seconds
4719- }
4720-
4721- // Run benchmarks
4722- benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
4723-
4724- // Finish
4725- benchmark::Shutdown();
4726- return 0;
4727+ cli::Parser parser(argc, argv);
4728+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
4729+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
4730+ parser.run_and_exit_if_error();
4731+
4732+ // Parse argv
4733+ benchmark::Initialize(&argc, argv);
4734+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
4735+ const std::string seed_type = parser.get<std::string>("seed");
4736+
4737+ // Benchmark info
4738+ bench_utils::add_common_benchmark_info();
4739+ benchmark::AddCustomContext("seed", seed_type);
4740+
4741+ // Add benchmark
4742+ std::vector<benchmark::internal::Benchmark*> benchmarks;
4743+ add_benchmarks<basic>("basic", benchmarks, seed_type);
4744+
4745+ // Use manual timing
4746+ for (auto& b : benchmarks)
4747+ {
4748+ b->UseManualTime();
4749+ b->Unit(benchmark::kMicrosecond);
4750+ b->MinTime(0.4); // in seconds
4751+ }
4752+
4753+ // Run benchmarks
4754+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
4755+
4756+ // Finish
4757+ benchmark::Shutdown();
4758+ return 0;
4759 }
4760diff --git a/benchmark/bench/reduce/by_key.cu b/benchmark/bench/reduce/by_key.cu
4761new file mode 100644
4762index 0000000..ea4dee5
4763--- /dev/null
4764+++ b/benchmark/bench/reduce/by_key.cu
4765@@ -0,0 +1,211 @@
4766+/******************************************************************************
4767+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
4768+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
4769+ *
4770+ * Redistribution and use in source and binary forms, with or without
4771+ * modification, are permitted provided that the following conditions are met:
4772+ * * Redistributions of source code must retain the above copyright
4773+ * notice, this list of conditions and the following disclaimer.
4774+ * * Redistributions in binary form must reproduce the above copyright
4775+ * notice, this list of conditions and the following disclaimer in the
4776+ * documentation and/or other materials provided with the distribution.
4777+ * * Neither the name of the NVIDIA CORPORATION nor the
4778+ * names of its contributors may be used to endorse or promote products
4779+ * derived from this software without specific prior written permission.
4780+ *
4781+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
4782+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
4783+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
4784+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
4785+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
4786+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
4787+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
4788+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
4789+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4790+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4791+ *
4792+ ******************************************************************************/
4793+
4794+// Benchmark utils
4795+#include "../../bench_utils/bench_utils.hpp"
4796+
4797+// rocThrust
4798+#include <thrust/device_vector.h>
4799+#include <thrust/execution_policy.h>
4800+#include <thrust/reduce.h>
4801+#include <thrust/unique.h>
4802+
4803+// Google Benchmark
4804+#include <benchmark/benchmark.h>
4805+
4806+// STL
4807+#include <cstddef>
4808+#include <string>
4809+#include <vector>
4810+
4811+struct by_key
4812+{
4813+ template <typename KeyT, typename ValueT, typename Policy>
4814+ float64_t run(thrust::device_vector<KeyT>& in_keys,
4815+ thrust::device_vector<ValueT>& in_vals,
4816+ thrust::device_vector<KeyT>& out_keys,
4817+ thrust::device_vector<ValueT>& out_vals,
4818+ Policy policy)
4819+ {
4820+ thrust::reduce_by_key(policy, in_keys.begin(), in_keys.end(), in_vals.begin(), out_keys.begin(), out_vals.begin());
4821+
4822+ bench_utils::gpu_timer d_timer;
4823+
4824+ d_timer.start(0);
4825+ thrust::reduce_by_key(policy, in_keys.begin(), in_keys.end(), in_vals.begin(), out_keys.begin(), out_vals.begin());
4826+ d_timer.stop(0);
4827+
4828+ return d_timer.get_duration();
4829+ }
4830+};
4831+
4832+template <class Benchmark, class KeyT, class ValueT>
4833+void run_benchmark(
4834+ benchmark::State& state, const std::size_t elements, const std::string seed_type, const std::size_t max_segment_size)
4835+{
4836+ // Benchmark object
4837+ Benchmark benchmark{};
4838+
4839+ // GPU times
4840+ std::vector<double> gpu_times;
4841+
4842+ constexpr std::size_t min_segment_size = 1;
4843+
4844+ // Generate input and output
4845+ thrust::device_vector<KeyT> in_keys;
4846+ thrust::device_vector<KeyT> out_keys;
4847+ thrust::device_vector<ValueT> in_vals;
4848+ try
4849+ {
4850+ in_keys = bench_utils::generate.uniform.key_segments(elements, seed_type, min_segment_size, max_segment_size);
4851+ out_keys = in_keys;
4852+ in_vals = thrust::device_vector<ValueT>(elements);
4853+ }
4854+ catch (const ::thrust::system::detail::bad_alloc& e)
4855+ {
4856+ (void) hipGetLastError();
4857+ state.SkipWithError(("thrust::system::detail::bad_alloc: " + std::string(e.what())).c_str());
4858+ return;
4859+ }
4860+ const std::size_t unique_keys = thrust::distance(out_keys.begin(), thrust::unique(out_keys.begin(), out_keys.end()));
4861+ thrust::device_vector<ValueT> out_vals;
4862+ try
4863+ {
4864+ out_vals = thrust::device_vector<ValueT>(unique_keys);
4865+ }
4866+ catch (const ::thrust::system::detail::bad_alloc& e)
4867+ {
4868+ (void) hipGetLastError();
4869+ state.SkipWithError(("thrust::system::detail::bad_alloc: " + std::string(e.what())).c_str());
4870+ return;
4871+ }
4872+
4873+ bench_utils::caching_allocator_t alloc{};
4874+ thrust::detail::device_t policy{};
4875+
4876+ for (auto _ : state)
4877+ {
4878+ float64_t duration = benchmark.template run<KeyT, ValueT>(in_keys, in_vals, out_keys, out_vals, policy(alloc));
4879+ state.SetIterationTime(duration);
4880+ gpu_times.push_back(duration);
4881+ }
4882+
4883+ // BytesProcessed include read and written bytes, so when the BytesProcessed/s are reported
4884+ // it will actually be the global memory bandwidth gotten.
4885+ state.SetBytesProcessed(state.iterations() * ((elements + unique_keys) * (sizeof(KeyT) + sizeof(ValueT))));
4886+ state.SetItemsProcessed(state.iterations() * elements);
4887+
4888+ const double gpu_cv = bench_utils::StatisticsCV(gpu_times);
4889+ state.counters["gpu_noise"] = gpu_cv;
4890+}
4891+
4892+#define CREATE_BENCHMARK(KeyT, ValueT, Elements, MaxSegmentSize) \
4893+ benchmark::RegisterBenchmark( \
4894+ bench_utils::bench_naming::format_name( \
4895+ "{algo:reduce,subalgo:" + name + ",key_type:" #KeyT + ",value_type:" #ValueT + ",elements:" #Elements \
4896+ + ",max_segment_size:" #MaxSegmentSize) \
4897+ .c_str(), \
4898+ run_benchmark<Benchmark, KeyT, ValueT>, \
4899+ Elements, \
4900+ seed_type, \
4901+ MaxSegmentSize)
4902+
4903+#define BENCHMARK_ELEMENTS(key_type, value_type, elements) \
4904+ CREATE_BENCHMARK(key_type, value_type, elements, 1), CREATE_BENCHMARK(key_type, value_type, elements, 4), \
4905+ CREATE_BENCHMARK(key_type, value_type, elements, 8)
4906+
4907+#define BENCHMARK_VALUE_TYPE(key_type, value_type) \
4908+ BENCHMARK_ELEMENTS(key_type, value_type, 1 << 16), BENCHMARK_ELEMENTS(key_type, value_type, 1 << 20), \
4909+ BENCHMARK_ELEMENTS(key_type, value_type, 1 << 24), BENCHMARK_ELEMENTS(key_type, value_type, 1 << 28)
4910+
4911+#if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
4912+# define BENCHMARK_KEY_TYPE(key_type) \
4913+ BENCHMARK_VALUE_TYPE(key_type, int8_t), BENCHMARK_VALUE_TYPE(key_type, int16_t), \
4914+ BENCHMARK_VALUE_TYPE(key_type, int32_t), BENCHMARK_VALUE_TYPE(key_type, int64_t), \
4915+ BENCHMARK_VALUE_TYPE(key_type, int128_t), BENCHMARK_VALUE_TYPE(key_type, float), \
4916+ BENCHMARK_VALUE_TYPE(key_type, double)
4917+#else
4918+# define BENCHMARK_KEY_TYPE(key_type) \
4919+ BENCHMARK_VALUE_TYPE(key_type, int8_t), BENCHMARK_VALUE_TYPE(key_type, int16_t), \
4920+ BENCHMARK_VALUE_TYPE(key_type, int32_t), BENCHMARK_VALUE_TYPE(key_type, int64_t), \
4921+ BENCHMARK_VALUE_TYPE(key_type, float), BENCHMARK_VALUE_TYPE(key_type, double)
4922+#endif
4923+
4924+template <class Benchmark>
4925+void add_benchmarks(
4926+ const std::string& name, std::vector<benchmark::internal::Benchmark*>& benchmarks, const std::string seed_type)
4927+{
4928+ std::vector<benchmark::internal::Benchmark*> bs = {
4929+ BENCHMARK_KEY_TYPE(int8_t),
4930+ BENCHMARK_KEY_TYPE(int16_t),
4931+ BENCHMARK_KEY_TYPE(int32_t),
4932+ BENCHMARK_KEY_TYPE(int64_t),
4933+#if THRUST_BENCHMARKS_HAVE_INT128_SUPPORT
4934+ BENCHMARK_KEY_TYPE(int128_t),
4935+#endif
4936+ BENCHMARK_KEY_TYPE(float32_t),
4937+ BENCHMARK_KEY_TYPE(float64_t)
4938+ };
4939+ benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
4940+}
4941+
4942+int main(int argc, char* argv[])
4943+{
4944+ cli::Parser parser(argc, argv);
4945+ parser.set_optional<std::string>("name_format", "name_format", "human", "either: json,human,txt");
4946+ parser.set_optional<std::string>("seed", "seed", "random", bench_utils::get_seed_message());
4947+ parser.run_and_exit_if_error();
4948+
4949+ // Parse argv
4950+ benchmark::Initialize(&argc, argv);
4951+ bench_utils::bench_naming::set_format(parser.get<std::string>("name_format")); /* either: json,human,txt */
4952+ const std::string seed_type = parser.get<std::string>("seed");
4953+
4954+ // Benchmark info
4955+ bench_utils::add_common_benchmark_info();
4956+ benchmark::AddCustomContext("seed", seed_type);
4957+
4958+ // Add benchmark
4959+ std::vector<benchmark::internal::Benchmark*> benchmarks;
4960+ add_benchmarks<by_key>("by_key", benchmarks, seed_type);
4961+
4962+ // Use manual timing
4963+ for (auto& b : benchmarks)
4964+ {
4965+ b->UseManualTime();
4966+ b->Unit(benchmark::kMicrosecond);
4967+ b->MinTime(0.4); // in seconds
4968+ }
4969+
4970+ // Run benchmarks
4971+ benchmark::RunSpecifiedBenchmarks(bench_utils::ChooseCustomReporter());
4972+
4973+ // Finish
4974+ benchmark::Shutdown();
4975+ return 0;
4976+}
4977diff --git a/benchmark/bench/scan/exclusive/by_key.cu b/benchmark/bench/scan/exclusive/by_key.cu
4978new file mode 100644
4979index 0000000..02a4f52
4980--- /dev/null
4981+++ b/benchmark/bench/scan/exclusive/by_key.cu
4982@@ -0,0 +1,177 @@
4983+/******************************************************************************
4984+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
4985+ * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
4986+ *
4987+ * Redistribution and use in source and binary forms, with or without
4988+ * modification, are permitted provided that the following conditions are met:
4989+ * * Redistributions of source code must retain the above copyright
4990+ * notice, this list of conditions and the following disclaimer.
4991+ * * Redistributions in binary form must reproduce the above copyright
4992+ * notice, this list of conditions and the following disclaimer in the
4993+ * documentation and/or other materials provided with the distribution.
4994+ * * Neither the name of the NVIDIA CORPORATION nor the
4995+ * names of its contributors may be used to endorse or promote products
4996+ * derived from this software without specific prior written permission.
4997+ *
4998+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
4999+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
5000+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches