Merge ~bullwinkle-team/ubuntu/+source/rccl:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rccl:ubuntu/devel

Proposed by Bojan Aleksovski
Status: Approved
Approved by: Andreas Hasenack
Approved revision: 148928e0a0e4650f4ea7f857ea5dcebed64d02bb
Proposed branch: ~bullwinkle-team/ubuntu/+source/rccl:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/rccl:ubuntu/devel
Diff against target: 283928 lines (+233669/-13501)
605 files modified
.azuredevops/multinode-ci-nightly.yml (+71/-0)
.azuredevops/multinode-ci-pr.yml (+77/-0)
.azuredevops/multinode-ci-slurm-nightly.yml (+44/-0)
.azuredevops/multinode-ci-slurm-pr.yml (+49/-0)
.azuredevops/rocm-ci.yml (+46/-0)
.azuredevops/slurm/build.sh (+51/-0)
.azuredevops/slurm/test_rccl-UnitTests.sh (+16/-0)
.azuredevops/slurm/test_rccl-tests.sh (+62/-0)
.azuredevops/templates/build.yml (+86/-0)
.azuredevops/templates/test_rccl-UnitTests.yml (+69/-0)
.azuredevops/templates/test_rccl-tests.yml (+77/-0)
.azuredevops/tests/pytest/HelloWorld.py (+5/-0)
.github/CODEOWNERS (+5/-0)
.github/PULL_REQUEST_TEMPLATE.md (+23/-0)
.github/dependabot.yml (+17/-0)
.github/scripts/therock_configure_ci.py (+131/-0)
.github/workflows/therock-ci-linux.yml (+126/-0)
.github/workflows/therock-ci.yml (+81/-0)
.github/workflows/therock-test-packages-multi-node.yml (+57/-0)
.github/workflows/therock-test-packages-single-node.yml (+69/-0)
.gitignore (+5/-1)
.gitmodules (+10/-0)
.readthedocs.yaml (+18/-0)
CHANGELOG.md (+203/-1)
CMakeLists.txt (+1298/-291)
LICENSE.txt (+32/-30)
NOTICES.txt (+125/-63)
README.md (+85/-44)
cmake/CheckSymbolExistsNoWarn.cmake (+40/-0)
cmake/Dependencies.cmake (+81/-5)
cmake/DownloadProject.cmake (+2/-2)
cmake/FindIBVerbs.cmake (+39/-0)
cmake/Findmscclpp_nccl.cmake (+36/-0)
cmake/MSCCLPP.cmake (+229/-0)
cmake/rcclRAS.cmake (+24/-0)
cmake/scripts/add_faults.sh (+27/-0)
cmake/scripts/add_unroll.sh (+42/-0)
cmake/scripts/extract_metadata.cmake (+81/-0)
cmake/scripts/git_version.cmake (+1/-0)
debian/changelog (+29/-0)
debian/control (+15/-7)
debian/librccl-doc.install (+1/-0)
debian/librccl1-tests.install (+2/-1)
debian/librccl1.install (+1/-0)
debian/patches/0003-use-local-mathjax.patch (+8/-6)
debian/patches/0004-do-not-use-rocm-core-headers.patch (+25/-0)
debian/patches/fix-install-libdir.patch (+24/-0)
debian/patches/series (+2/-1)
debian/rules (+16/-4)
debian/shlibs (+1/-0)
dev/null (+0/-47)
docker/Dockerfile.ubuntu (+124/-0)
docker/README.md (+42/-0)
docs/.gitignore (+5/-0)
docs/api-reference/api-library.rst (+11/-0)
docs/api-reference/env-variables.rst (+165/-0)
docs/api-reference/library-specification.rst (+13/-15)
docs/attributions.rst (+9/-9)
docs/conf.py (+36/-0)
docs/doxygen/Doxyfile (+208/-91)
docs/doxygen/mainpage.txt (+27/-0)
docs/how-to/rccl-usage-tips.rst (+265/-0)
docs/how-to/troubleshooting-rccl.rst (+249/-0)
docs/how-to/using-nccl.rst (+333/-0)
docs/how-to/using-rccl-tuner-plugin-api.rst (+135/-0)
docs/index.rst (+50/-0)
docs/install/building-installing.rst (+103/-0)
docs/install/docker-install.rst (+52/-0)
docs/install/installation.rst (+85/-0)
docs/license.rst (+8/-0)
docs/sphinx/_toc.yml.in (+45/-0)
docs/sphinx/requirements.in (+1/-0)
docs/sphinx/requirements.txt (+277/-0)
docs/what-is-rccl.rst (+31/-0)
ext-net/README.md (+419/-0)
ext-net/example/Makefile (+22/-0)
ext-net/example/nccl/common.h (+21/-0)
ext-net/example/nccl/err.h (+17/-0)
ext-net/example/nccl/net.h (+41/-0)
ext-net/example/nccl/net_device.h (+32/-0)
ext-net/example/nccl/net_v10.h (+101/-0)
ext-net/example/nccl/net_v2.h (+50/-0)
ext-net/example/nccl/net_v3.h (+50/-0)
ext-net/example/nccl/net_v4.h (+61/-0)
ext-net/example/nccl/net_v5.h (+54/-0)
ext-net/example/nccl/net_v6.h (+68/-0)
ext-net/example/nccl/net_v7.h (+75/-0)
ext-net/example/nccl/net_v8.h (+79/-0)
ext-net/example/nccl/net_v9.h (+93/-0)
ext-net/example/nccl/types.h (+23/-0)
ext-net/example/plugin.c (+418/-0)
ext-net/google-fastsocket/Makefile (+2/-2)
ext-profiler/README.md (+461/-0)
ext-profiler/example/Makefile (+22/-0)
ext-profiler/example/README.md (+239/-0)
ext-profiler/example/event.c (+30/-0)
ext-profiler/example/event.h (+194/-0)
ext-profiler/example/nccl/common.h (+15/-0)
ext-profiler/example/nccl/err.h (+19/-0)
ext-profiler/example/nccl/net_ib_v1.h (+34/-0)
ext-profiler/example/nccl/net_socket_v1.h (+32/-0)
ext-profiler/example/nccl/profiler.h (+76/-0)
ext-profiler/example/nccl/profiler_net.h (+22/-0)
ext-profiler/example/nccl/profiler_v1.h (+109/-0)
ext-profiler/example/nccl/profiler_v2.h (+106/-0)
ext-profiler/example/nccl/profiler_v3.h (+114/-0)
ext-profiler/example/nccl/profiler_v4.h (+123/-0)
ext-profiler/example/nccl/types.h (+21/-0)
ext-profiler/example/plugin.c (+633/-0)
ext-profiler/example/plugin.h (+13/-0)
ext-profiler/example/print_event.c (+294/-0)
ext-profiler/example/print_event.h (+16/-0)
ext-src/bf16-tuning.patch (+26/-0)
ext-src/check_ibv_access_relaxed_ordering.cc (+8/-0)
ext-src/cpx.patch (+12/-0)
ext-src/device-flag.patch (+199/-0)
ext-src/disable-executor.patch (+368/-0)
ext-src/disable-format-checks.patch (+15/-0)
ext-src/mem-reg.patch (+147/-0)
ext-src/mscclpp_ibv_access_relaxed_ordering.patch (+51/-0)
ext-src/no-cache.patch (+445/-0)
ext-src/non-multiple-128-fix.patch (+16/-0)
ext-src/read-allred.patch (+530/-0)
ext-src/reg-fix.patch (+43/-0)
ext-src/remove-clip.patch (+54/-0)
ext-tuner/README.md (+182/-0)
ext-tuner/basic/Makefile (+23/-0)
ext-tuner/basic/README.md (+197/-0)
ext-tuner/basic/nccl/common.h (+15/-0)
ext-tuner/basic/nccl/err.h (+17/-0)
ext-tuner/basic/nccl/tuner.h (+97/-0)
ext-tuner/basic/plugin.c (+34/-0)
ext-tuner/example/Makefile (+55/-0)
ext-tuner/example/README.md (+163/-0)
ext-tuner/example/nccl/common.h (+15/-0)
ext-tuner/example/nccl/err.h (+17/-0)
ext-tuner/example/nccl/tuner.h (+97/-0)
ext-tuner/example/nccl_tuner.conf (+45/-0)
ext-tuner/example/plugin.c (+456/-0)
ext-tuner/example/scripts/README.md (+106/-0)
ext-tuner/example/scripts/optimize_config.py (+430/-0)
ext-tuner/example/scripts/sample_performance_data.csv (+24/-0)
ext-tuner/example/test/Makefile (+30/-0)
ext-tuner/example/test/README.md (+205/-0)
ext-tuner/example/test/test_plugin.c (+856/-0)
ext-tuner/model_demo/Makefile (+4/-4)
ext-tuner/model_demo/README.md (+95/-0)
ext-tuner/model_demo/nccl/common.h (+15/-0)
ext-tuner/model_demo/nccl/err.h (+17/-0)
ext-tuner/model_demo/nccl/tuner.h (+97/-0)
ext-tuner/model_demo/plugin.c (+234/-0)
install.sh (+282/-132)
makefiles/common.mk (+74/-9)
makefiles/version.mk (+2/-2)
pkg/debian/Makefile (+1/-1)
pkg/debian/libnccl-dev.install.in (+1/-1)
pkg/debian/rules (+3/-0)
pkg/redhat/nccl.spec.in (+4/-2)
pkg/txz/create_txz.sh.in (+1/-1)
rtest.xml (+1/-1)
src/Makefile (+48/-26)
src/allocator.cc (+198/-0)
src/bootstrap.cc (+946/-227)
src/channel.cc (+140/-24)
src/collectives.cc (+503/-0)
src/debug.cc (+250/-38)
src/device/Makefile (+126/-0)
src/device/all_gather.h (+674/-0)
src/device/all_reduce.h (+1133/-0)
src/device/alltoall_pivot.h (+27/-22)
src/device/broadcast.h (+129/-0)
src/device/common.cu (+45/-0)
src/device/common.h (+675/-0)
src/device/common_kernel.h (+812/-0)
src/device/generate.py (+629/-0)
src/device/msccl_kernel_impl.h (+409/-0)
src/device/network/unpack/unpack.h (+286/-0)
src/device/network/unpack/unpack_defs.h (+61/-0)
src/device/onerank.cu (+92/-0)
src/device/op128.h (+539/-0)
src/device/primitives.h (+58/-25)
src/device/prims_ll.h (+319/-95)
src/device/prims_ll128.h (+758/-0)
src/device/prims_simple.h (+1343/-0)
src/device/rccl_metadata.h (+4/-6)
src/device/reduce.h (+81/-0)
src/device/reduce_kernel.h (+1123/-0)
src/device/reduce_scatter.h (+604/-0)
src/device/sendrecv.h (+282/-0)
src/device/symmetric/all_gather.cuh (+367/-0)
src/device/symmetric/all_reduce.cuh (+432/-0)
src/device/symmetric/generate.py (+247/-0)
src/device/symmetric/kernel.cuh (+27/-0)
src/device/symmetric/primitives.cuh (+477/-0)
src/device/symmetric/reduce_scatter.cuh (+387/-0)
src/enqueue.cc (+2279/-1066)
src/graph/connect.cc (+676/-462)
src/graph/paths.cc (+545/-233)
src/graph/rings.cc (+17/-11)
src/graph/rome_models.cc (+1249/-206)
src/graph/rome_models.h (+7/-3)
src/graph/search.cc (+685/-396)
src/graph/topo.cc (+1049/-242)
src/graph/topo.h (+151/-56)
src/graph/tuning.cc (+589/-173)
src/graph/xml.cc (+289/-67)
src/graph/xml.h (+131/-21)
src/group.cc (+617/-115)
src/include/alloc.h (+386/-104)
src/include/allocator.h (+13/-0)
src/include/alt_rsmi.h (+62/-0)
src/include/api_trace.h (+217/-0)
src/include/archinfo.h (+16/-8)
src/include/argcheck.h (+2/-0)
src/include/bitops.h (+469/-0)
src/include/bootstrap.h (+14/-4)
src/include/channel.h (+16/-30)
src/include/checks.h (+80/-66)
src/include/coll_net.h (+2/-2)
src/include/collectives.h (+882/-112)
src/include/comm.h (+528/-80)
src/include/core.h (+4/-23)
src/include/cpuset.h (+25/-0)
src/include/cudawrap.h (+68/-23)
src/include/debug.h (+7/-11)
src/include/device.h (+755/-0)
src/include/enqueue.h (+19/-5)
src/include/gdrwrap.h (+63/-10)
src/include/graph.h (+56/-33)
src/include/group.h (+63/-19)
src/include/hip_rocm_version_info.h (+58/-0)
src/include/ibvcore.h (+1090/-0)
src/include/ibvsymbols.h (+46/-0)
src/include/ibvwrap.h (+21/-1031)
src/include/info.h (+3/-89)
src/include/ipcsocket.h (+41/-0)
src/include/latency_profiler/CollTrace.h (+61/-0)
src/include/latency_profiler/CollTraceEvent.h (+68/-0)
src/include/latency_profiler/CollTraceFunc.h (+39/-0)
src/include/latency_profiler/CollTraceUtils.h (+49/-0)
src/include/latency_profiler/EventQueue.h (+46/-0)
src/include/latency_profiler/MIT-LICENSE.txt (+21/-0)
src/include/mlx5/mlx5dvcore.h (+18/-0)
src/include/mlx5/mlx5dvsymbols.h (+23/-0)
src/include/mlx5/mlx5dvwrap.h (+41/-0)
src/include/mnnvl.h (+15/-0)
src/include/msccl/msccl_kernel.h (+48/-0)
src/include/msccl/msccl_lifecycle.h (+48/-0)
src/include/msccl/msccl_parser.h (+105/-0)
src/include/msccl/msccl_scheduler.h (+53/-0)
src/include/msccl/msccl_setup.h (+34/-0)
src/include/msccl/msccl_status.h (+23/-0)
src/include/msccl/msccl_struct.h (+261/-0)
src/include/mscclpp/mscclpp_nccl.h (+66/-0)
src/include/nccl_common.h (+94/-0)
src/include/net.h (+1/-21)
src/include/net_device.h (+32/-0)
src/include/npkit/npkit.h (+12/-6)
src/include/npkit/npkit_event.h (+38/-0)
src/include/npkit/npkit_struct.h (+1/-1)
src/include/nvmlwrap.h (+187/-0)
src/include/nvtx.h (+165/-1)
src/include/nvtx3/nvToolsExt.h (+51/-47)
src/include/nvtx3/nvToolsExtCounters.h (+335/-0)
src/include/nvtx3/nvToolsExtCuda.h (+18/-18)
src/include/nvtx3/nvToolsExtCudaRt.h (+11/-11)
src/include/nvtx3/nvToolsExtMem.h (+694/-0)
src/include/nvtx3/nvToolsExtMemCudaRt.h (+150/-0)
src/include/nvtx3/nvToolsExtOpenCL.h (+4/-4)
src/include/nvtx3/nvToolsExtPayload.h (+1106/-0)
src/include/nvtx3/nvToolsExtPayloadHelper.h (+170/-0)
src/include/nvtx3/nvToolsExtSemanticsCounters.h (+88/-0)
src/include/nvtx3/nvToolsExtSemanticsScope.h (+30/-0)
src/include/nvtx3/nvToolsExtSync.h (+14/-14)
src/include/nvtx3/nvtx3.hpp (+1240/-584)
src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h (+31/-0)
src/include/nvtx3/nvtxDetail/nvtxExtImpl.h (+99/-0)
src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h (+148/-0)
src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h (+74/-0)
src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h (+133/-0)
src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h (+155/-0)
src/include/nvtx3/nvtxDetail/nvtxExtInit.h (+378/-0)
src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h (+272/-0)
src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h (+151/-0)
src/include/nvtx3/nvtxDetail/nvtxExtTypes.h (+44/-0)
src/include/nvtx3/nvtxDetail/nvtxImpl.h (+10/-13)
src/include/nvtx3/nvtxDetail/nvtxImplCore.h (+1/-1)
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h (+9/-9)
src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h (+17/-17)
src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h (+1/-1)
src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h (+1/-1)
src/include/nvtx3/nvtxDetail/nvtxInit.h (+4/-4)
src/include/nvtx3/nvtxDetail/nvtxInitDecls.h (+1/-1)
src/include/nvtx3/nvtxDetail/nvtxInitDefs.h (+1/-1)
src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h (+2/-2)
src/include/nvtx3/nvtxDetail/nvtxTypes.h (+2/-2)
src/include/nvtx_payload_schemas.h (+188/-0)
src/include/nvtx_stub.h (+8/-0)
src/include/p2p.h (+57/-0)
src/include/param.h (+10/-25)
src/include/plugin/nccl_net.h (+60/-0)
src/include/plugin/nccl_profiler.h (+79/-0)
src/include/plugin/nccl_tuner.h (+22/-0)
src/include/plugin/net/net_v10.h (+158/-0)
src/include/plugin/net/net_v6.h (+6/-206)
src/include/plugin/net/net_v7.h (+120/-0)
src/include/plugin/net/net_v8.h (+134/-0)
src/include/plugin/net/net_v9.h (+152/-0)
src/include/plugin/plugin.h (+24/-0)
src/include/plugin/profiler/net_ib.h (+13/-0)
src/include/plugin/profiler/net_ib_v1.h (+34/-0)
src/include/plugin/profiler/net_socket.h (+13/-0)
src/include/plugin/profiler/net_socket_v1.h (+32/-0)
src/include/plugin/profiler/profiler_v1.h (+107/-0)
src/include/plugin/profiler/profiler_v2.h (+104/-0)
src/include/plugin/profiler/profiler_v3.h (+112/-0)
src/include/plugin/profiler/profiler_v4.h (+123/-0)
src/include/plugin/tuner/tuner_v2.h (+53/-0)
src/include/plugin/tuner/tuner_v3.h (+55/-0)
src/include/plugin/tuner/tuner_v4.h (+56/-0)
src/include/profiler.h (+58/-20)
src/include/proxy.h (+273/-47)
src/include/proxy_trace/proxy_trace.h (+184/-0)
src/include/ras.h (+26/-0)
src/include/rccl_common.h (+99/-0)
src/include/rccl_float8.h (+1219/-0)
src/include/rccl_vars.h (+8/-0)
src/include/recorder.h (+173/-0)
src/include/register.h (+83/-0)
src/include/register_inline.h (+33/-0)
src/include/rocmwrap.h (+27/-10)
src/include/roctx.h (+153/-0)
src/include/shm.h (+31/-10)
src/include/shmutils.h (+26/-0)
src/include/socket.h (+52/-20)
src/include/strongstream.h (+62/-67)
src/include/symmetric.h (+90/-0)
src/include/timer.h (+7/-7)
src/include/transport.h (+109/-26)
src/include/tuner.h (+23/-0)
src/include/utils.h (+110/-21)
src/init.cc (+2406/-847)
src/init_nvtx.cc (+29/-0)
src/misc/alt_rsmi.cc (+712/-0)
src/misc/api_trace.c (+9/-0)
src/misc/api_trace.cc (+722/-0)
src/misc/archinfo.cc (+75/-0)
src/misc/argcheck.cc (+20/-10)
src/misc/cudawrap.cc (+247/-110)
src/misc/gdrwrap.cc (+17/-22)
src/misc/ibvsymbols.cc (+166/-0)
src/misc/ibvwrap.cc (+187/-193)
src/misc/ipcsocket.cc (+232/-0)
src/misc/latency_profiler/CollTrace.cc (+228/-0)
src/misc/latency_profiler/CollTraceEvent.cc (+42/-0)
src/misc/latency_profiler/CollTraceFunc.cc (+140/-0)
src/misc/latency_profiler/CollTraceUtils.cc (+87/-0)
src/misc/latency_profiler/MIT-LICENSE.txt (+21/-0)
src/misc/mlx5dvsymbols.cc (+77/-0)
src/misc/mlx5dvwrap.cc (+75/-0)
src/misc/msccl/msccl_lifecycle.cc (+784/-0)
src/misc/msccl/msccl_parser.cc (+790/-0)
src/misc/msccl/msccl_setup.cc (+566/-0)
src/misc/msccl/msccl_status.cc (+79/-0)
src/misc/mscclpp/mscclpp_nccl.cc (+12/-0)
src/misc/mscclpp/mscclpp_nccl_syms.txt (+36/-0)
src/misc/npkit.cc (+13/-8)
src/misc/nvmlwrap.cc (+68/-1)
src/misc/param.cc (+25/-8)
src/misc/proxy_trace/proxy_trace.cc (+263/-0)
src/misc/recorder.cc (+701/-0)
src/misc/rocm_smi_wrap.cc (+142/-57)
src/misc/rocmwrap.cc (+126/-22)
src/misc/roctx.cc (+142/-0)
src/misc/shmutils.cc (+183/-53)
src/misc/socket.cc (+638/-230)
src/misc/strongstream.cc (+300/-188)
src/misc/utils.cc (+37/-21)
src/mnnvl.cc (+89/-0)
src/msccl.cc (+49/-0)
src/nccl.h.in (+807/-364)
src/plugin/net.cc (+372/-0)
src/plugin/net/net_v10.cc (+32/-0)
src/plugin/net/net_v6.cc (+178/-0)
src/plugin/net/net_v7.cc (+174/-0)
src/plugin/net/net_v8.cc (+196/-0)
src/plugin/net/net_v9.cc (+121/-0)
src/plugin/plugin_open.cc (+139/-0)
src/plugin/profiler.cc (+564/-0)
src/plugin/profiler/profiler_v1.cc (+147/-0)
src/plugin/profiler/profiler_v2.cc (+105/-0)
src/plugin/profiler/profiler_v3.cc (+111/-0)
src/plugin/profiler/profiler_v4.cc (+21/-0)
src/plugin/tuner.cc (+100/-0)
src/plugin/tuner/tuner_v2.cc (+66/-0)
src/plugin/tuner/tuner_v3.cc (+38/-0)
src/plugin/tuner/tuner_v4.cc (+22/-0)
src/proxy.cc (+1176/-371)
src/ras/client.cc (+318/-0)
src/ras/client_support.cc (+1920/-0)
src/ras/collectives.cc (+988/-0)
src/ras/peers.cc (+948/-0)
src/ras/ras.cc (+704/-0)
src/ras/ras_internal.h (+558/-0)
src/ras/rasnet.cc (+1341/-0)
src/rccl_wrap.cc (+381/-0)
src/register/coll_reg.cc (+461/-0)
src/register/register.cc (+317/-0)
src/register/sendrecv_reg.cc (+42/-0)
src/symmetric.cc (+302/-0)
src/transport.cc (+277/-99)
src/transport/coll_net.cc (+1057/-245)
src/transport/generic.cc (+85/-0)
src/transport/net.cc (+1142/-351)
src/transport/net_ib.cc (+1948/-563)
src/transport/net_socket.cc (+309/-207)
src/transport/nvls.cc (+1025/-0)
src/transport/p2p.cc (+858/-185)
src/transport/profiler.cc (+56/-0)
src/transport/shm.cc (+388/-132)
test/AllGatherTests.cpp (+162/-0)
test/AllReduceTests.cpp (+252/-0)
test/AllToAllTests.cpp (+115/-0)
test/AllToAllVTests.cpp (+191/-0)
test/AllocTests.cpp (+149/-0)
test/AltRsmiTests.cpp (+906/-0)
test/ArgCheckTests.cpp (+327/-0)
test/BitOpsTests.cpp (+283/-0)
test/BroadcastTests.cpp (+123/-0)
test/CMakeLists.txt (+147/-83)
test/CollRegTests.cpp (+102/-0)
test/CommTests.cpp (+26/-0)
test/EnqueueTests.cpp (+322/-0)
test/GatherTests.cpp (+124/-0)
test/GroupCallTests.cpp (+311/-0)
test/IpcsocketTests.cpp (+157/-0)
test/NetSocketTests.cpp (+1493/-0)
test/NonBlockingTests.cpp (+71/-0)
test/P2pTests.cpp (+1410/-0)
test/ParamTests.cpp (+41/-0)
test/ParamTestsConfFile.txt (+3/-0)
test/ProxyTests.cpp (+433/-0)
test/RcclWrapTests.cpp (+2319/-0)
test/ReduceScatterTests.cpp (+123/-0)
test/ReduceTests.cpp (+123/-0)
test/ScatterTests.cpp (+123/-0)
test/SendRecvTests.cpp (+135/-24)
test/ShmTests.cpp (+1037/-0)
test/StandaloneTests.cpp (+373/-0)
test/TransportTests.cpp (+243/-0)
test/_RecorderTests.cpp (+76/-0)
test/common/CallCollectiveForked.cpp (+170/-0)
test/common/CallCollectiveForked.hpp (+17/-0)
test/common/CollRegUtils.hpp (+52/-0)
test/common/CollectiveArgs.cpp (+25/-19)
test/common/CollectiveArgs.hpp (+13/-5)
test/common/EnvVars.cpp (+256/-40)
test/common/EnvVars.hpp (+27/-10)
test/common/ErrCode.hpp (+30/-11)
test/common/PtrUnion.cpp (+51/-14)
test/common/PtrUnion.hpp (+9/-4)
test/common/RcclMockFuncs.hpp (+13/-0)
test/common/StandaloneUtils.cpp (+76/-0)
test/common/StandaloneUtils.hpp (+54/-0)
test/common/TestBed.cpp (+395/-124)
test/common/TestBed.hpp (+69/-18)
test/common/TestBedChild.cpp (+497/-130)
test/common/TestBedChild.hpp (+31/-16)
test/common/TransportUtils.hpp (+77/-0)
test/common/main.cpp (+29/-1)
test/common/main_fixtures.cpp (+45/-0)
test/graph/XmlTests.cpp (+1752/-0)
test/latency_profiler/LatencyProfilerUnitTest.cpp (+103/-0)
test/proxy_trace/ProxyTraceUnitTests.cpp (+134/-0)
toolchain-linux.cmake (+29/-0)
tools/EmptyKernelTest/EmptyKernelTest.cpp (+157/-0)
tools/EmptyKernelTest/Makefile (+26/-0)
tools/GraphBench/GraphBench.cpp (+20/-0)
tools/HelloRccl/HelloRccl.cpp (+10/-8)
tools/HelloRccl/runTest.sh (+2/-11)
tools/JitterBench/Common.hpp (+52/-0)
tools/JitterBench/Compatibility.hpp (+83/-0)
tools/JitterBench/GetClosestNumaNode.hpp (+8/-0)
tools/JitterBench/JitterBench.cpp (+522/-0)
tools/JitterBench/Makefile (+25/-0)
tools/JitterBench/Timeline.hpp (+51/-0)
tools/JitterBench/runSweep.sh (+12/-0)
tools/RcclReplayer/Makefile (+12/-0)
tools/RcclReplayer/README.md (+87/-0)
tools/RcclReplayer/rcclReplayer.cpp (+680/-0)
tools/RcclReplayer/rcclReplayer.hpp (+104/-0)
tools/TopoVisual/README.md (+1/-1)
tools/TransferBench/README.md (+2/-12)
tools/ib-test/ib_test.cpp (+1/-1)
tools/ib-test/include/nccl.h (+4/-1)
tools/msccl-algorithms/allgather_16n_direct_0_3m_ll128.xml (+1954/-0)
tools/msccl-algorithms/allgather_16n_direct_0_3m_ll128_op.xml (+2050/-0)
tools/msccl-algorithms/allgather_32n_direct_0_6m_ll128.xml (+8002/-0)
tools/msccl-algorithms/allgather_32n_direct_0_6m_ll128_op.xml (+8194/-0)
tools/msccl-algorithms/allreduce-allpairs-8n-ll-32tb-op.xml (+6130/-0)
tools/msccl-algorithms/allreduce-allpairs-8n-ll-32tb.xml (+5874/-0)
tools/msccl-algorithms/allreduce-allpairs-8n-ll-64tb-op.xml (+12242/-0)
tools/msccl-algorithms/allreduce-allpairs-8n-ll-64tb.xml (+11730/-0)
tools/msccl-algorithms/allreduce-allpairs-8n-simple-op.xml (+12242/-0)
tools/msccl-algorithms/allreduce-allpairs-8n-simple.xml (+11730/-0)
tools/msccl-algorithms/alltoall-8n-0-9kb.xml (+380/-0)
tools/msccl-algorithms/alltoall-8n-190kb-512kb.xml (+1460/-0)
tools/msccl-algorithms/alltoall-8n-512kb-7mb.xml (+1460/-0)
tools/msccl-algorithms/alltoall-8n-7mb-43mb.xml (+2004/-0)
tools/msccl-algorithms/alltoall-8n-9kb-190kb.xml (+740/-0)
tools/msccl-unit-test-algorithms/all-reduce-ring-ll.xml (+12708/-0)
tools/msccl-unit-test-algorithms/all-reduce-ring-ll128.xml (+12708/-0)
tools/msccl-unit-test-algorithms/all-reduce-ring-simple.xml (+12708/-0)
tools/p2p-latency-test/Makefile (+23/-0)
tools/p2p-latency-test/README.md (+14/-0)
tools/p2p-latency-test/build_and_run.sh (+31/-0)
tools/p2p-latency-test/ll_latency_test.cpp (+191/-0)
tools/p2p-latency-test/ll_latency_test.cu (+181/-0)
tools/p2p-latency-test/p2p_latency_test.cpp (+125/-0)
tools/rccl-prim-test/copy_kernel.h (+9/-9)
tools/rccl-prim-test/rccl_prim_test.cpp (+71/-57)
tools/scripts/exclude_static_list.txt (+1/-0)
tools/scripts/npkit_trace_analysis.py (+144/-0)
tools/scripts/npkit_trace_generator.py (+354/-0)
tools/scripts/pytorch-all-reduce/README.md (+32/-0)
tools/scripts/pytorch-all-reduce/all_reduce.py (+106/-0)
tools/scripts/pytorch-all-reduce/trace_runs.sh (+13/-0)
tools/scripts/pytorch-log-parser.py (+118/-0)
tools/scripts/rcclDiagnostics.py (+782/-0)
tools/scripts/rccl_bw_test.py (+21/-4)
tools/scripts/replace_static.sh (+128/-0)
tools/scripts/rocprof-log-parser.py (+68/-0)
tools/scripts/topo_val.sh (+1/-1)
tools/scripts/ucx_ompi_rccl_rccltests_TB_script.sh (+306/-0)
tools/time-trace/rccl-TimeTrace.sh (+50/-0)
tools/time-trace/time_trace_generator.py (+112/-0)
tools/topo_expl/Makefile (+57/-5)
tools/topo_expl/README.md (+101/-0)
tools/topo_expl/include/device_table.h (+18/-0)
tools/topo_expl/include/model.h (+4/-4)
tools/topo_expl/include/nccl.h (+816/-339)
tools/topo_expl/include/utils.h (+21/-16)
tools/topo_expl/model.cpp (+72/-31)
tools/topo_expl/models/topo_16p1h.xml (+18/-18)
tools/topo_expl/models/topo_16p1h_vm.xml (+18/-18)
tools/topo_expl/models/topo_16p_gio-1s-1rp-cascade.xml (+273/-0)
tools/topo_expl/models/topo_16p_gio-3s-1rp-split-flat.xml (+313/-0)
tools/topo_expl/models/topo_3p_pcie.xml (+5/-5)
tools/topo_expl/models/topo_3p_pcie_1.xml (+5/-5)
tools/topo_expl/models/topo_4p1h.xml (+6/-6)
tools/topo_expl/models/topo_4p1h_1.xml (+6/-6)
tools/topo_expl/models/topo_4p2h.xml (+10/-10)
tools/topo_expl/models/topo_4p2h_1.xml (+10/-10)
tools/topo_expl/models/topo_4p2h_2nic.xml (+10/-10)
tools/topo_expl/models/topo_4p3l.xml (+6/-6)
tools/topo_expl/models/topo_4p3l_2h.xml (+10/-10)
tools/topo_expl/models/topo_4p3l_ia.xml (+10/-10)
tools/topo_expl/models/topo_4p3l_n2.xml (+10/-10)
tools/topo_expl/models/topo_4p3l_n2_1.xml (+10/-10)
tools/topo_expl/models/topo_4p3l_n4.xml (+10/-10)
tools/topo_expl/models/topo_4p4h.xml (+18/-18)
tools/topo_expl/models/topo_4p_942.xml (+58/-0)
tools/topo_expl/models/topo_8p1h.xml (+10/-10)
tools/topo_expl/models/topo_8p1h_1.xml (+10/-10)
tools/topo_expl/models/topo_8p1h_2.xml (+10/-10)
tools/topo_expl/models/topo_8p1h_3.xml (+10/-10)
tools/topo_expl/models/topo_8p1h_4.xml (+10/-10)
tools/topo_expl/models/topo_8p1h_5.xml (+10/-10)
tools/topo_expl/models/topo_8p1h_n1.xml (+10/-10)
tools/topo_expl/models/topo_8p6l.xml (+10/-10)
tools/topo_expl/models/topo_8p6l_1nic.xml (+10/-10)
tools/topo_expl/models/topo_8p6l_2nic.xml (+10/-10)
tools/topo_expl/models/topo_8p6l_3nic.xml (+10/-10)
tools/topo_expl/models/topo_8p6l_4nic.xml (+10/-10)
tools/topo_expl/models/topo_8p6l_5nic.xml (+10/-10)
tools/topo_expl/models/topo_8p6l_6nic.xml (+10/-10)
tools/topo_expl/models/topo_8p_4nics.xml (+10/-10)
tools/topo_expl/models/topo_8p_90a.xml (+10/-10)
tools/topo_expl/models/topo_8p_90a_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_942.xml (+182/-0)
tools/topo_expl/models/topo_8p_942vm.xml (+134/-0)
tools/topo_expl/models/topo_8p_950.xml (+167/-0)
tools/topo_expl/models/topo_8p_pcie.xml (+10/-10)
tools/topo_expl/models/topo_8p_pcie_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_pcie_2nic.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_4n_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_4n_2.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_4nics.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_n2.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_n2_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_n2_2.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_n4.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_n4_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_pcie.xml (+10/-10)
tools/topo_expl/models/topo_8p_rome_vm1.xml (+10/-10)
tools/topo_expl/models/topo_8p_ts1.xml (+10/-10)
tools/topo_expl/models/topo_8p_ts1_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_ts1_n4.xml (+10/-10)
tools/topo_expl/models/topo_8p_ts1_n4_1.xml (+10/-10)
tools/topo_expl/models/topo_8p_ts1_n4_2.xml (+10/-10)
tools/topo_expl/models/topo_collnet_n1.xml (+10/-10)
tools/topo_expl/models/topo_collnet_n4.xml (+10/-10)
tools/topo_expl/topo_expl.cpp (+146/-97)
tools/topo_expl/utils.cpp (+699/-421)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
git-ubuntu import Pending
Ubuntu Sponsors Pending
Review via email: mp+499803@code.launchpad.net

Description of the change

Update to new upstream version 7.1.0

To post a comment you must log in.
Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

Uploaded package to this PPA: https://launchpad.net/~b0b0a/+archive/ubuntu/rccl-2140330

(-proposed and archs enabled)

Will trigger autopkgtest when it is built and published

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Please show the reverse dependency analysis on what else will have to be dropped on ppc64el:
--- a/debian/control
+++ b/debian/control
@@ -31,7 +38,7 @@ Rules-Requires-Root: no

 Package: librccl1
 Section: libs
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Depends: ${misc:Depends}, ${shlibs:Depends},
 Description: ROCm Communication Collectives Library - library
  RCCL (pronounced "Rickle") is a library of collective communication routines
@@ -45,7 +52,7 @@ Description: ROCm Communication Collectives Library - library

 Package: librccl-dev
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Depends: librccl1 (= ${binary:Version}),${misc:Depends}, ${shlibs:Depends},
          libamdhip64-dev,
 Suggests: librccl-doc
@@ -61,7 +68,7 @@ Description: ROCm Communication Collectives Library - headers

 Package: librccl1-tests
 Section: libdevel
-Architecture: amd64 arm64 ppc64el
+Architecture: amd64 arm64
 Depends: librccl1 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends},
 Build-Profiles: <!nocheck>
 Description: ROCm Communication Collectives Library - tests

review: Needs Fixing
Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

`reverse-depends --arch ppc64el {librccl1,librccl-dev,librccl1-tests}`:
- librccl1: reverse-depends librccl-dev and librccl1-tests
- librccl-dev and librccl1-tests: No reverse dependencies found
thus nothing additionally needs to be dropped on ppc64el

On a further note, this MP is blocked by rocm-hipamd.

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

We are evaluating a change regarding the rocm-hipamd dependency that might bump rccl to 7.1.0-0ubuntu2.
Please postpone further review until further notice.
Thank you!

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

As long as there is no direct dependency on rocm-hipamd, please refrain from adding it to build-depends. As of today there is a transitive dependency on rocm-hipamd through hipcc. But if we add it to rccl even though rccl doesn't depend on it directly, then in the future hipcc may drop its hipamd dependency, and that would make us update rccl again (ideally).

Please investigate the source code of rccl and check whether it has a dependency on hipamd before trying to add it to build-depends.

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

Andreas please continue with the review.
libamdhip64-dev (>= 7.1~) has been added as a build-dep (reasoning: https://code.launchpad.net/~bullwinkle-team/ubuntu/+source/rccl/+git/rccl/+merge/499891).
Thank you!

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

--- a/debian/librccl1.install
+++ b/debian/librccl1.install
@@ -1 +1,2 @@
-usr/lib/*/librccl.so.*
+usr/lib/librccl.so.*
+usr/bin/rcclras

Why are the libraries now installed directly in /usr/lib, instead of /usr/lib/<arch>? Very few libraries are installed directly in /usr/lib:

$ l /usr/lib/*.so.*|wc -l
8

$ l /usr/lib/x86_64-linux-gnu/*.so.*|wc -l
2666

review: Needs Information
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

I was checking if we needed to change the soname, but looks like it's tied to NCCL_MAJOR, defined in makefiles/version.mk, and that has remained at 2. Which just begs the question, why wasn't our soname already 2?

./makefiles/version.mk:NCCL_MAJOR := 2
./src/Makefile:LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))

Perhaps check that in parallel, for another future PR if needed.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

So, symbols. I see the symbols in the previous package were not demangled, and the new symbols for this version are also not demangled. This makes it hard to see if symbols were dropped, in which case we would have to bump the soname.

Perhaps just use d/shlibs like rocalution was doing (
https://code.launchpad.net/~bullwinkle-team/ubuntu/+source/rocalution/+git/rocalution/+merge/499090).

Or else we need proper demangling of symbols in the old (current) package, and then the new one here, and let dpkg-gensymbols tell us which symbols were dropped/added.

Right now I understand the only rdeps are within the package, but this may change.

79a6749... by Bojan Aleksovski

d/patches: add new fix-install-libdir.patch

Revision history for this message
Bojan Aleksovski (b0b0a) wrote :

Hello Andreas,

Please find below the changes made to address your comments:
- Regarding the installation into /usr/lib instead of /usr/lib/<arch>, I've added fix-install-libdir.patch, which has been forwarded upstream (it seems upstream did this unintentionally as part of an unrelated commit)
- Regarding the soname, in CMakeLists.txt L1316 it is set to 1.0:
`## Setup librccl.so version
rocm_set_soversion(rccl "1.0")`
The one that showed up for you is for the unrelated nccl.
- Regarding the symbols, I've added d/shlibs instead as per your recommendation until we get them demangled in the future.

The easiest place to view the mentioned fixes is https://launchpadlibrarian.net/846853345/rccl_7.1.0-0ubuntu2~ppa1_7.1.0-0ubuntu2~ppa2.diff.gz

Thank you!

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+1

review: Approve
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Uploading rccl_7.1.0-0ubuntu2.dsc
Uploading rccl_7.1.0.orig.tar.gz
Uploading rccl_7.1.0-0ubuntu2.debian.tar.xz
Uploading rccl_7.1.0-0ubuntu2_source.buildinfo
Uploading rccl_7.1.0-0ubuntu2_source.changes

Unmerged commits

148928e... by Bojan Aleksovski

d/changelog: add 7.1.0-0ubuntu2 entry

a7f7989... by Bojan Aleksovski

d/control: add libamdhip64-dev (=> 7.1~) as a build dep

It is a direct hard dependency of rccl

e8b5890... by Bojan Aleksovski

d/changelog: add resolute entry for 7.1.0

2d112ca... by Bojan Aleksovski

d/rules: [Temporary] remove gfx908 and gfx90a ROCm ISAs

Remove gfx908 and gfx90a ISAs due to LP amd64 and amd64v3 builders
resetting themselves and failing to build without any buildlog.
*This shall be reverted when LP infra problem is solved*

851a82e... by Bojan Aleksovski

d/control: remove ppc64el build

79a6749... by Bojan Aleksovski

d/patches: add new fix-install-libdir.patch

5a5a124... by Bojan Aleksovski

d/{librccl1.symbols,shlibs}: remove symbols file and instead add shlibs

Otherwise proper demangling of symbols will be needed

b9e72fe... by Bojan Aleksovski

d/librccl-doc.install: add doc install for msccl-algorithms

309a4a8... by Bojan Aleksovski

d/librccl1-tests.install: fix path and add msccl-unit-test-algorithms

Tests reference the algorithms

23c6bee... by Bojan Aleksovski

d/librccl1.install: add rcclras install

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.azuredevops/multinode-ci-nightly.yml b/.azuredevops/multinode-ci-nightly.yml
2new file mode 100644
3index 0000000..4b9b143
4--- /dev/null
5+++ b/.azuredevops/multinode-ci-nightly.yml
6@@ -0,0 +1,71 @@
7+resources:
8+ repositories:
9+ - repository: pipelines_repo
10+ type: github
11+ endpoint: ROCm
12+ name: ROCm/ROCm
13+
14+variables:
15+- group: common
16+- template: /.azuredevops/variables-global.yml@pipelines_repo
17+- name: pytestFolder
18+ value: '.azuredevops/tests/pytest'
19+
20+parameters:
21+- name: pytestList
22+ type: object
23+ default:
24+ - HelloWorld
25+
26+trigger: none
27+pr: none
28+schedules:
29+ - cron: "0 5 * 11-3 *" # 11 PM CST (November - March)
30+ displayName: "Nightly Build (CST)"
31+ branches:
32+ include:
33+ - develop
34+ always: false
35+
36+ - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
37+ displayName: "Nightly Build (CDT)"
38+ branches:
39+ include:
40+ - develop
41+ always: false
42+
43+jobs:
44+- job: rccl
45+ timeoutInMinutes: 180
46+ pool: rocm-ci_rccl_pool
47+ workspace:
48+ clean: all
49+ steps:
50+ - task: DeleteFiles@1
51+ inputs:
52+ Contents: '**/*'
53+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
54+ parameters:
55+ submoduleBehaviour: recursive
56+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
57+ parameters:
58+ installEnabled: false
59+ printDiskSpace: false
60+ extraBuildFlags: >-
61+ -DCMAKE_BUILD_TYPE=Release
62+ -DBUILD_TESTS=ON
63+ -GNinja
64+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
65+ parameters:
66+ componentName: rccl
67+ testDir: $(Build.SourcesDirectory)/build/test
68+ testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
69+ testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
70+ - ${{ each pytestScript in parameters.pytestList }}:
71+ - task: Bash@3
72+ displayName: Test ${{ pytestScript }}
73+ continueOnError: true
74+ inputs:
75+ targetType: inline
76+ workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
77+ script: pytest ${{ pytestScript }}.py
78diff --git a/.azuredevops/multinode-ci-pr.yml b/.azuredevops/multinode-ci-pr.yml
79new file mode 100644
80index 0000000..040377e
81--- /dev/null
82+++ b/.azuredevops/multinode-ci-pr.yml
83@@ -0,0 +1,77 @@
84+resources:
85+ repositories:
86+ - repository: pipelines_repo
87+ type: github
88+ endpoint: ROCm
89+ name: ROCm/ROCm
90+
91+variables:
92+- group: common
93+- template: /.azuredevops/variables-global.yml@pipelines_repo
94+- name: pytestFolder
95+ value: '.azuredevops/tests/pytest'
96+
97+parameters:
98+- name: pytestList
99+ type: object
100+ default:
101+ - HelloWorld
102+
103+trigger: none
104+pr:
105+ autoCancel: true
106+ branches:
107+ include:
108+ - develop
109+ paths:
110+ exclude:
111+ - .github
112+ - .jenkins
113+ - docs
114+ - '*.md'
115+ - LICENSE.txt
116+ - NOTICES.txt
117+ drafts: false
118+
119+stages:
120+- stage: rcclStage
121+ displayName: 'RCCL develop PR'
122+ jobs:
123+ - deployment: rccl_pr_approval
124+ displayName: "CI Run Requires Approval"
125+ environment: rccl
126+ - job: rccl
127+ timeoutInMinutes: 180
128+ pool: rocm-ci_rccl_pool
129+ workspace:
130+ clean: all
131+ steps:
132+ - task: DeleteFiles@1
133+ inputs:
134+ Contents: '**/*'
135+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
136+ parameters:
137+ submoduleBehaviour: recursive
138+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
139+ parameters:
140+ installEnabled: false
141+ printDiskSpace: false
142+ extraBuildFlags: >-
143+ -DCMAKE_BUILD_TYPE=Release
144+ -DBUILD_TESTS=ON
145+ -DGPU_TARGETS=gfx942
146+ -GNinja
147+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
148+ parameters:
149+ componentName: rccl
150+ testDir: $(Build.SourcesDirectory)/build/test
151+ testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
152+ testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
153+ - ${{ each pytestScript in parameters.pytestList }}:
154+ - task: Bash@3
155+ displayName: Test ${{ pytestScript }}
156+ continueOnError: true
157+ inputs:
158+ targetType: inline
159+ workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
160+ script: pytest ${{ pytestScript }}.py
161diff --git a/.azuredevops/multinode-ci-slurm-nightly.yml b/.azuredevops/multinode-ci-slurm-nightly.yml
162new file mode 100644
163index 0000000..b7438f3
164--- /dev/null
165+++ b/.azuredevops/multinode-ci-slurm-nightly.yml
166@@ -0,0 +1,44 @@
167+resources:
168+ repositories:
169+ - repository: pipelines_repo
170+ type: github
171+ endpoint: ROCm
172+ name: ROCm/ROCm
173+
174+variables:
175+- group: common
176+- template: /.azuredevops/variables-global.yml@pipelines_repo
177+
178+trigger: none
179+pr: none
180+schedules:
181+ - cron: "0 5 * 11-3 *" # 11 PM CST (November - March)
182+ displayName: "Nightly Build (CST)"
183+ branches:
184+ include:
185+ - develop
186+ always: false
187+
188+ - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
189+ displayName: "Nightly Build (CDT)"
190+ branches:
191+ include:
192+ - develop
193+ always: false
194+
195+jobs:
196+- job: rccl
197+ timeoutInMinutes: 180
198+ pool: rocm-ci_rccl_slurm_pool
199+ workspace:
200+ clean: all
201+ steps:
202+ - task: DeleteFiles@1
203+ inputs:
204+ Contents: '**/*'
205+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
206+ parameters:
207+ submoduleBehaviour: recursive
208+ - template: templates/build.yml
209+ - template: templates/test_rccl-UnitTests.yml
210+ - template: templates/test_rccl-tests.yml
211diff --git a/.azuredevops/multinode-ci-slurm-pr.yml b/.azuredevops/multinode-ci-slurm-pr.yml
212new file mode 100644
213index 0000000..c3d89ff
214--- /dev/null
215+++ b/.azuredevops/multinode-ci-slurm-pr.yml
216@@ -0,0 +1,49 @@
217+resources:
218+ repositories:
219+ - repository: pipelines_repo
220+ type: github
221+ endpoint: ROCm
222+ name: ROCm/ROCm
223+
224+variables:
225+- group: common
226+- template: /.azuredevops/variables-global.yml@pipelines_repo
227+
228+trigger: none
229+pr:
230+ autoCancel: true
231+ branches:
232+ include:
233+ - develop
234+ paths:
235+ exclude:
236+ - .github
237+ - .jenkins
238+ - docs
239+ - '*.md'
240+ - LICENSE.txt
241+ - NOTICES.txt
242+ drafts: false
243+
244+stages:
245+- stage: rcclStage
246+ displayName: 'RCCL develop PR'
247+ jobs:
248+ - deployment: rccl_pr_approval
249+ displayName: "CI Run Requires Approval"
250+ environment: rccl
251+ - job: rccl
252+ timeoutInMinutes: 180
253+ pool: rocm-ci_rccl_slurm_pool
254+ workspace:
255+ clean: all
256+ steps:
257+ - task: DeleteFiles@1
258+ inputs:
259+ Contents: '**/*'
260+ - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
261+ parameters:
262+ submoduleBehaviour: recursive
263+ - template: templates/build.yml
264+ - template: templates/test_rccl-UnitTests.yml
265+ - template: templates/test_rccl-tests.yml
266diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
267new file mode 100644
268index 0000000..e87ae14
269--- /dev/null
270+++ b/.azuredevops/rocm-ci.yml
271@@ -0,0 +1,46 @@
272+resources:
273+ repositories:
274+ - repository: pipelines_repo
275+ type: github
276+ endpoint: ROCm
277+ name: ROCm/ROCm
278+
279+variables:
280+- group: common
281+- template: /.azuredevops/variables-global.yml@pipelines_repo
282+
283+trigger:
284+ batch: true
285+ branches:
286+ include:
287+ - develop
288+ - mainline
289+ paths:
290+ exclude:
291+ - .github
292+ - .jenkins
293+ - docs
294+ - '.*.y*ml'
295+ - '*.md'
296+ - LICENSE.txt
297+ - NOTICES.txt
298+
299+pr:
300+ autoCancel: true
301+ branches:
302+ include:
303+ - develop
304+ - mainline
305+ paths:
306+ exclude:
307+ - .github
308+ - .jenkins
309+ - docs
310+ - '.*.y*ml'
311+ - '*.md'
312+ - LICENSE.txt
313+ - NOTICES.txt
314+ drafts: false
315+
316+jobs:
317+ - template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo
318diff --git a/.azuredevops/slurm/build.sh b/.azuredevops/slurm/build.sh
319new file mode 100644
320index 0000000..9ebcefd
321--- /dev/null
322+++ b/.azuredevops/slurm/build.sh
323@@ -0,0 +1,51 @@
324+#!/bin/bash
325+#SBATCH --job-name=rccl-build
326+#SBATCH --output=rccl-build-%j.out
327+#SBATCH --error=rccl-build-%j.out
328+#SBATCH --time=60
329+#SBATCH --nodes=1
330+#SBATCH --exclusive
331+#SBATCH --partition=gt
332+
333+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
334+echo "Node identifier: $short_id"
335+
336+source /etc/profile.d/lmod.sh
337+module load rocm/6.4.1
338+
339+# Setup local binary path
340+export PATH="$HOME/.local/bin:$PATH"
341+mkdir -p "$HOME/.local/bin"
342+
343+# Install Ninja if not already available
344+if ! command -v ninja &>/dev/null; then
345+ echo "Ninja not found. Installing locally..."
346+ wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
347+ unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
348+ chmod +x "$HOME/.local/bin/ninja"
349+fi
350+
351+echo "Using Ninja at: $(which ninja)"
352+ninja --version
353+
354+# Define GPU target
355+export GPU_TARGETS="gfx942"
356+
357+cd "${SLURM_SUBMIT_DIR:-$PWD}"
358+## Building RCCL
359+mkdir -p build
360+cd build
361+cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
362+cmake --build .
363+cmake --build . --target install
364+
365+
366+cd "${SLURM_SUBMIT_DIR:-$PWD}"
367+## Building RCCL-Tests
368+git clone https://github.com/ROCm/rccl-tests
369+cd rccl-tests
370+mkdir -p build
371+cd build
372+cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DROCM_PATH="$ROCM_PATH" ..
373+cmake --build .
374+cmake --build . --target install
375diff --git a/.azuredevops/slurm/test_rccl-UnitTests.sh b/.azuredevops/slurm/test_rccl-UnitTests.sh
376new file mode 100644
377index 0000000..f397fab
378--- /dev/null
379+++ b/.azuredevops/slurm/test_rccl-UnitTests.sh
380@@ -0,0 +1,16 @@
381+#!/bin/bash
382+#SBATCH --job-name=rccl-UnitTests
383+#SBATCH --output=%x-%j.out
384+#SBATCH --error=%x-%j.out
385+#SBATCH --time=120
386+#SBATCH --nodes=1
387+#SBATCH --exclusive
388+#SBATCH --partition=gt
389+
390+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
391+echo "Node identifier: $short_id"
392+
393+source /etc/profile.d/lmod.sh
394+module load rocm/6.4.1
395+cd "$BINARIES_DIR/bin"
396+LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml --gtest_color=yes
397diff --git a/.azuredevops/slurm/test_rccl-tests.sh b/.azuredevops/slurm/test_rccl-tests.sh
398new file mode 100644
399index 0000000..a4509ce
400--- /dev/null
401+++ b/.azuredevops/slurm/test_rccl-tests.sh
402@@ -0,0 +1,62 @@
403+#!/bin/bash
404+#SBATCH --job-name=rccl-tests
405+#SBATCH --output=%x-%j.out
406+#SBATCH --error=%x-%j.out
407+#SBATCH --time=60
408+#SBATCH --nodes=1
409+#SBATCH --exclusive
410+#SBATCH --partition=gt
411+
412+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
413+echo "Node identifier: $short_id"
414+
415+source /etc/profile.d/lmod.sh
416+module load rocm/6.4.1
417+
418+cd ${PIPELINE_WORKSPACE}/TestResults
419+mkdir -p ${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs
420+export WORKDIR=${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs
421+
422+export PATH="$BINARIES_DIR/bin:$PATH"
423+export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"
424+
425+### create hostlist
426+#nodelist=($(scontrol show hostnames))
427+#echo "SLURM nodes:"
428+#echo ${nodelist[@]}
429+#echo ""
430+#
431+#hosts_8ppn=()
432+#for node in "${nodelist[@]}"
433+#do
434+# hosts_8ppn+=("${node}:8")
435+#done
436+#echo ${hosts_8ppn[@]}
437+
438+### Run multi- and single-node RCCL-Tests
439+## Run single-node RCCL-Tests
440+for n in 1
441+do
442+ total=$((n*8))
443+ #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
444+
445+ for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
446+ do
447+ for dtype in float bfloat16 half fp8_e5m2
448+ do
449+ out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
450+ #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
451+ cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
452+
453+ echo "Running ${coll}" 2>&1 | tee ${out_filename}
454+ echo "Run cmd: ${cmd}" 2>&1 | tee -a ${out_filename}
455+ eval ${cmd} 2>&1 | tee -a ${out_filename}
456+
457+ sleep 2
458+ done
459+ done
460+done
461+
462+## To add
463+### Summarize results
464+### Convert to junit
465diff --git a/.azuredevops/templates/build.yml b/.azuredevops/templates/build.yml
466new file mode 100644
467index 0000000..fc671a7
468--- /dev/null
469+++ b/.azuredevops/templates/build.yml
470@@ -0,0 +1,86 @@
471+# small subset of files to check for install to determine pass/fail
472+parameters:
473+- name: expectedInstallFiles
474+ type: object
475+ default:
476+ - bin/rccl-UnitTests
477+ - include/rccl/rccl.h
478+ - lib/cmake/rccl/rccl-config.cmake
479+ - lib/librccl.so
480+ - share/doc/rccl/LICENSE.txt
481+ - share/rccl/msccl-algorithms
482+ - share/rccl/msccl-unit-test-algorithms
483+
484+steps:
485+ - task: Bash@3
486+ displayName: Build Job
487+ env:
488+ BINARIES_DIR: $(Build.BinariesDirectory)
489+ inputs:
490+ targetType: inline
491+ script: |
492+ echo "##[section]Starting build job..."
493+
494+ rm -rf $(Build.BinariesDirectory)/*
495+
496+ echo "Submitting build job..."
497+ mkdir -p $(Build.BinariesDirectory)
498+ BUILD_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/build.sh)
499+ echo "Submitted build job: $BUILD_JOB_ID"
500+ echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"
501+
502+ echo "Waiting for build job to start..."
503+ while squeue -j $BUILD_JOB_ID 2>/dev/null | grep -q $BUILD_JOB_ID; do
504+ echo "##[section]Build job $BUILD_JOB_ID is still running..."
505+ sleep 60
506+ done
507+
508+ echo "Waiting for final status via sacct..."
509+ LOOP_COUNT=0
510+ MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
511+ while true; do
512+ STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
513+ echo "##[section]Build job state: $STATE"
514+ if [[ "$STATE" == "COMPLETED" ]]; then
515+ break
516+ elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
517+ echo "Build failed with state $STATE"
518+ break
519+ fi
520+ sleep 60
521+ LOOP_COUNT=$((LOOP_COUNT + 1))
522+ if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
523+ echo "Time limit reached while waiting for final status."
524+ exit 1 # Exit with an error code if time limit is reached
525+ fi
526+ done
527+
528+ echo "Checking for expected installed files..."
529+ MISSING_FILES=0
530+
531+ expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
532+ i=1
533+ total=$(echo "$expectedFiles" | wc -w)
534+ while [ $i -le $total ]; do
535+ relpath=$(echo "$expectedFiles" | cut -d ' ' -f"$i")
536+ fullpath="$BINARIES_DIR/$relpath"
537+ if [ ! -e "$fullpath" ]; then
538+ echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
539+ MISSING_FILES=1
540+ fi
541+ i=$((i + 1))
542+ done
543+
544+ if [ "$MISSING_FILES" -eq 1 ]; then
545+ echo "One or more expected files are missing from the install directory."
546+ exit 1
547+ else
548+ echo "All expected files are present in the install directory."
549+ fi
550+ - task: Bash@3
551+ displayName: Build Logs
552+ condition: always()
553+ inputs:
554+ targetType: inline
555+ script: |
556+ cat rccl-build-${BUILD_JOB_ID}.out || echo "No log found"
557diff --git a/.azuredevops/templates/test_rccl-UnitTests.yml b/.azuredevops/templates/test_rccl-UnitTests.yml
558new file mode 100644
559index 0000000..8d195b1
560--- /dev/null
561+++ b/.azuredevops/templates/test_rccl-UnitTests.yml
562@@ -0,0 +1,69 @@
563+steps:
564+ - task: Bash@3
565+ displayName: RCCL UnitTests
566+ env:
567+ BINARIES_DIR: $(Build.BinariesDirectory)
568+ PIPELINE_WORKSPACE: $(Pipeline.Workspace)
569+ inputs:
570+ targetType: inline
571+ script: |
572+ echo "Submitting test job..."
573+ TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh)
574+ echo "Submitted test job: $TEST_JOB_ID"
575+ echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
576+
577+ echo "Waiting for test job to start..."
578+ while squeue -j $TEST_JOB_ID 2>/dev/null | grep -q $TEST_JOB_ID; do
579+ echo "##[section]Test job $TEST_JOB_ID is still running..."
580+ sleep 60
581+ done
582+
583+ echo "Waiting for final status via sacct..."
584+ LOOP_COUNT=0
585+ MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
586+ while true; do
587+ STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
588+ echo "##[section]Test job state: $STATE"
589+ if [[ "$STATE" == "COMPLETED" ]]; then
590+ break
591+ elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
592+ echo "Test failed with state $STATE"
593+ break
594+ fi
595+ sleep 60
596+ LOOP_COUNT=$((LOOP_COUNT + 1))
597+ if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
598+ echo "Time limit reached while waiting for final status."
599+ exit 1 # Exit with an error code if time limit is reached
600+ fi
601+ done
602+
603+ echo "Checking test result XML for failures..."
604+ TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
605+ if [ -z "$TEST_XML" ]; then
606+ echo "##vso[task.logissue type=error]No $TEST_XML file found"
607+ echo "##vso[task.complete result=Failed;]DONE"
608+ exit 1
609+ fi
610+
611+ if grep -q 'failures="[^0]' "$TEST_XML"; then
612+ echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
613+ echo "##vso[task.complete result=Failed;]DONE"
614+ exit 1
615+ else
616+ echo "No test failures detected."
617+ fi
618+ - task: Bash@3
619+ displayName: Test Logs
620+ condition: always()
621+ inputs:
622+ targetType: inline
623+ script: |
624+ cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found"
625+ - task: PublishTestResults@2
626+ displayName: 'Publish Results'
627+ condition: succeededOrFailed()
628+ inputs:
629+ searchFolder: $(Pipeline.Workspace)
630+ testResultsFormat: JUnit
631+ testResultsFiles: '**/rccl-UnitTests_output.xml'
632diff --git a/.azuredevops/templates/test_rccl-tests.yml b/.azuredevops/templates/test_rccl-tests.yml
633new file mode 100644
634index 0000000..9b047c4
635--- /dev/null
636+++ b/.azuredevops/templates/test_rccl-tests.yml
637@@ -0,0 +1,77 @@
638+steps:
639+ - task: Bash@3
640+ displayName: RCCL-Tests
641+ env:
642+ BINARIES_DIR: $(Build.BinariesDirectory)
643+ PIPELINE_WORKSPACE: $(Pipeline.Workspace)
644+ inputs:
645+ targetType: inline
646+ script: |
647+ echo "Submitting test job..."
648+ TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh)
649+ echo "Submitted test job: $TEST_JOB_ID"
650+ echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
651+
652+ echo "Waiting for test job to start..."
653+ while squeue -j $TEST_JOB_ID 2>/dev/null | grep -q $TEST_JOB_ID; do
654+ echo "##[section]Test job $TEST_JOB_ID is still running..."
655+ sleep 60
656+ done
657+
658+ echo "Waiting for final status via sacct..."
659+ LOOP_COUNT=0
660+ MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
661+ while true; do
662+ STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
663+ echo "##[section]Test job state: $STATE"
664+ if [[ "$STATE" == "COMPLETED" ]]; then
665+ break
666+ elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
667+ echo "Test failed with state $STATE"
668+ break
669+ fi
670+ sleep 60
671+ LOOP_COUNT=$((LOOP_COUNT + 1))
672+ if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
673+ echo "Time limit reached while waiting for final status."
674+ exit 1 # Exit with an error code if time limit is reached
675+ fi
676+ done
677+
678+ echo "Checking test result json for failures..."
679+ TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
680+ if [ -z "$TEST_JSON" ]; then
681+ echo "##vso[task.logissue type=error]No $TEST_JSON file(s) found"
682+ echo "##vso[task.complete result=Failed;]DONE"
683+ exit 1
684+ fi
685+
686+ #echo "Checking test result XML for failures..."
687+ #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
688+ #if [ -z "$TEST_XML" ]; then
689+ # echo "##vso[task.logissue type=error]No $TES_XML file found"
690+ # echo "##vso[task.complete result=Failed;]DONE"
691+ # exit 1
692+ #fi
693+
694+ #if grep -q 'failures="[^0]' "$TEST_XML"; then
695+ # echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
696+ # echo "##vso[task.complete result=Failed;]DONE"
697+ # exit 1
698+ #else
699+ # echo "No test failures detected."
700+ #fi
701+ - task: Bash@3
702+ displayName: Test Logs
703+ condition: always()
704+ inputs:
705+ targetType: inline
706+ script: |
707+ cat rccl-tests-${TEST_JOB_ID}.out || echo "No log found"
708+# - task: PublishTestResults@2
709+# displayName: 'Publish Results'
710+# condition: succeededOrFailed()
711+# inputs:
712+# searchFolder: $(Pipeline.Workspace)
713+# testResultsFormat: JUnit
714+# testResultsFiles: '**/rccl-tests_output.xml'
715diff --git a/.azuredevops/tests/pytest/HelloWorld.py b/.azuredevops/tests/pytest/HelloWorld.py
716new file mode 100644
717index 0000000..52f05a1
718--- /dev/null
719+++ b/.azuredevops/tests/pytest/HelloWorld.py
720@@ -0,0 +1,5 @@
721+import pytest
722+
723+def test_HelloWorld():
724+ greeting = "Hello, World!"
725+ assert greeting == "Hello, World!"
726diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
727new file mode 100755
728index 0000000..35c9980
729--- /dev/null
730+++ b/.github/CODEOWNERS
731@@ -0,0 +1,5 @@
732+* @wenkaidu @gilbertlee-amd @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @BertanDogancay @rahulvaidya20 @isaki001 @PJAvinash @AbandiGa @Nikhil-Nunna @haripriya-amd @atulkulk @ddebonis-amd @amd-mengshwu @Kapil-Shyam-Pawar @weilewei @nawrinsu @speriaswamy-amd# Documentation files
733+docs/ @ROCm/rocm-documentation
734+*.md @ROCm/rocm-documentation
735+*.rst @ROCm/rocm-documentation
736+.readthedocs.yaml @ROCm/rocm-documentation
737diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
738new file mode 100644
739index 0000000..636de79
740--- /dev/null
741+++ b/.github/PULL_REQUEST_TEMPLATE.md
742@@ -0,0 +1,23 @@
743+## Details
744+___Do not mention proprietary info or link to internal work items in this PR.___
745+
746+**Work item:** _"Internal", or link to GitHub issue (if applicable)._
747+
748+**What were the changes?**
749+_One sentence describing the work done._
750+
751+**Why were the changes made?**
752+_Explain the motivation behind the work. Provide any publicly-available historical context._
753+
754+**How was the outcome achieved?**
755+_Technical details behind the work. Explain any publicly-available hardware peculiarities._
756+
757+**Additional Documentation:**
758+_What else should the reviewer know?_
759+
760+## Approval Checklist
761+___Do not approve until these items are satisfied.___
762+- [ ] Verify the CHANGELOG has been updated, if
763+ - there are any NCCL API version changes,
764+ - any changes impact library users, and/or
765+ - any changes impact any other ROCm library.
766diff --git a/.github/dependabot.yml b/.github/dependabot.yml
767new file mode 100644
768index 0000000..848c7f5
769--- /dev/null
770+++ b/.github/dependabot.yml
771@@ -0,0 +1,17 @@
772+# To get started with Dependabot version updates, you'll need to specify which
773+# package ecosystems to update and where the package manifests are located.
774+# Please see the documentation for all configuration options:
775+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
776+
777+version: 2
778+updates:
779+ - package-ecosystem: "pip" # See documentation for possible values
780+ directory: "/docs/sphinx" # Location of package manifests
781+ open-pull-requests-limit: 10
782+ schedule:
783+ interval: "daily"
784+ labels:
785+ - "dependencies"
786+ - "ci:docs-only"
787+ reviewers:
788+ - "samjwu"
789diff --git a/.github/scripts/therock_configure_ci.py b/.github/scripts/therock_configure_ci.py
790new file mode 100644
791index 0000000..2afff17
792--- /dev/null
793+++ b/.github/scripts/therock_configure_ci.py
794@@ -0,0 +1,131 @@
795+import fnmatch
796+import json
797+import os
798+from pathlib import Path
799+import subprocess
800+import sys
801+from typing import Iterable, Optional, Mapping
802+
803+def gha_set_output(vars: Mapping[str, str | Path]):
804+ """Sets values in a step's output parameters.
805+
806+ This appends to the file located at the $GITHUB_OUTPUT environment variable.
807+
808+ See
809+ * https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
810+ * https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
811+ """
812+ print(f"Setting github output:\n{vars}")
813+
814+ step_output_file = os.getenv("GITHUB_OUTPUT")
815+ if not step_output_file:
816+ print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
817+ return
818+
819+ with open(step_output_file, "a") as f:
820+ f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
821+
822+def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
823+ """Returns the paths of modified files relative to the base reference."""
824+ try:
825+ return subprocess.run(
826+ ["git", "diff", "--name-only", base_ref],
827+ stdout=subprocess.PIPE,
828+ check=True,
829+ text=True,
830+ timeout=60,
831+ ).stdout.splitlines()
832+ except TimeoutError:
833+ print(
834+ "Computing modified files timed out. Not using PR diff to determine"
835+ " jobs to run.",
836+ file=sys.stderr,
837+ )
838+ return None
839+
840+GITHUB_WORKFLOWS_CI_PATTERNS = [
841+ "therock*.yml",
842+]
843+
844+
845+def is_path_workflow_file_related_to_ci(path: str) -> bool:
846+ return any(
847+ fnmatch.fnmatch(path, ".github/workflows/" + pattern)
848+ for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
849+ )
850+
851+def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
852+ if paths is None:
853+ return False
854+ return any(is_path_workflow_file_related_to_ci(p) for p in paths)
855+
# A change that only touches paths matching these fnmatch patterns cannot
# affect build or test workflows, so the related jobs may be skipped when
# every path modified by a commit/PR matches one of them.
SKIPPABLE_PATH_PATTERNS = [
    "docs/*",
    "*.gitignore",
    "*.md",
    "*LICENSE*",
    "*NOTICES*",
    ".github/CODEOWNERS",
    ".github/*.md",
    ".github/dependabot.yml",
    ".azuredevops*",
]


def is_path_skippable(path: str) -> bool:
    """Returns True if the relative file path matches any skippable pattern."""
    for pattern in SKIPPABLE_PATH_PATTERNS:
        if fnmatch.fnmatch(path, pattern):
            return True
    return False
874+
def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
    """Returns True if at least one path falls outside the skippable set.

    A `paths` value of None yields False.
    """
    if paths is not None:
        for candidate in paths:
            if not is_path_skippable(candidate):
                return True
    return False
880+
def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if CI workflows should run given a list of modified paths.

    Args:
        paths: Modified file paths relative to the repository root, or None
            when no list of modified paths is available.

    Returns:
        True when a TheRock CI workflow file was modified, or when any path
        outside `.github/workflows` is not skippable. False otherwise,
        including when `paths` is None.
    """

    if paths is None:
        print("No files were modified, skipping TheRock CI jobs")
        return False

    # Materialize once. `paths` may be a one-shot iterable, so the workflow
    # subset is derived from the set rather than by iterating `paths` again.
    paths_set = set(paths)
    github_workflows_paths = {
        p for p in paths_set if p.startswith(".github/workflows")
    }
    other_paths = paths_set - github_workflows_paths

    # Workflow files are judged only by the CI patterns; everything else is
    # judged by the skippable patterns.
    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)

    print("should_ci_run_given_modified_paths findings:")
    print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")

    if related_to_ci:
        print("Enabling build jobs since a related workflow file was modified")
        return True
    elif contains_other_non_skippable_files:
        print("Enabling TheRock CI jobs since a non-skippable path was modified")
        return True
    else:
        print(
            "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
        )
        return False
911+
def main(args):
    """Computes whether TheRock CI should run and publishes it as a step output.

    Args:
        args: Mapping with a "base_ref" entry naming the git revision to diff
            the working tree against.

    The result is written as the `enable_therock_ci` output, JSON-encoded
    (`true` / `false`).
    """
    base_ref = args.get("base_ref")
    modified_paths = get_modified_paths(base_ref)
    if modified_paths is None:
        # get_modified_paths returns None when the diff timed out; slicing
        # None would raise TypeError before any output is written.
        print("modified_paths: unavailable")
    else:
        # Materialize so the preview slice does not consume a one-shot iterable.
        modified_paths = list(modified_paths)
        print("modified_paths (max 200):", modified_paths[:200])
    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
    output = {
        'enable_therock_ci': json.dumps(enable_jobs)
    }
    gha_set_output(output)
921+
if __name__ == "__main__":
    # The base revision comes from the BASE_REF environment variable and falls
    # back to the first parent of HEAD when it is not set.
    args = dict(base_ref=os.environ.get("BASE_REF", "HEAD^1"))
    main(args)
926diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml
927new file mode 100644
928index 0000000..d390446
929--- /dev/null
930+++ b/.github/workflows/therock-ci-linux.yml
931@@ -0,0 +1,126 @@
932+name: TheRock CI Linux
933+
934+on:
935+ workflow_call:
936+ inputs:
937+ amdgpu_families:
938+ type: string
939+ extra_cmake_options:
940+ type: string
941+
942+permissions:
943+ contents: read
944+
945+jobs:
946+ therock-build-linux:
947+ name: Build Linux Packages
948+ runs-on: azure-linux-scale-rocm
949+ permissions:
950+ id-token: write
951+ container:
952+ image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:283673fe3e1bf498d079e3f386b794af1b4f71845a9a0107c6cf7aa304dce050
953+ options: -v /runner/config:/home/awsconfig/
954+ env:
955+ AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
956+ TEATIME_FORCE_INTERACTIVE: 0
957+ AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
958+ steps:
959+ - name: Checkout TheRock repository
960+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
961+ with:
962+ repository: "ROCm/TheRock"
963+ ref: 16ee54fb580a4dde62dc4133f978e73370a545af
964+
965+ - name: Checkout rccl repository
966+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
967+ with:
968+ repository: "ROCm/rccl"
969+ path: rccl
970+
971+ - name: Checkout rccl-tests repository
972+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
973+ with:
974+ repository: "ROCm/rccl-tests"
975+ path: rccl-tests
976+
977+ - name: Runner Health Settings
978+ run: |
979+ df -h
980+ echo cmake --version
981+ echo "Installed Python versions:"
982+ ls -d /opt/python
983+ echo "python: $(which python), python3: $(which python3)"
984+ echo "Git version: $(git --version)"
985+ git config --global --add safe.directory $PWD
986+ git config fetch.parallel 10
987+
988+ - name: Fetch sources
989+ run: |
990+ ./build_tools/fetch_sources.py --jobs 12
991+
992+ - name: Install python deps
993+ run: |
994+ pip install -r requirements.txt
995+ pip freeze
996+
997+ - name: Configure Projects
998+ env:
999+ amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
1000+ package_version: ADHOCBUILD
1001+ extra_cmake_options: ${{ inputs.extra_cmake_options }}
1002+ BUILD_DIR: build
1003+ run: |
1004+ python3 build_tools/github_actions/build_configure.py
1005+
1006+ - name: Build therock-dist
1007+ run: cmake --build build
1008+
1009+ - name: Build therock-archives
1010+ run: cmake --build build --target therock-archives
1011+
1012+ - name: Report
1013+ #if: ${{ !cancelled() }}
1014+ run: |
1015+ echo "Full SDK du:"
1016+ echo "------------"
1017+ du -h -d 1 build/dist/rocm
1018+ echo "Artifact Archives:"
1019+ echo "------------------"
1020+ ls -lh build/artifacts/*.tar.xz
1021+ echo "Artifacts:"
1022+ echo "----------"
1023+ du -h -d 1 build/artifacts
1024+
1025+ - name: Configure AWS Credentials for non-forked repos
1026+ if: ${{ always() && !github.event.pull_request.head.repo.fork }}
1027+ uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
1028+ with:
1029+ aws-region: us-east-2
1030+ role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
1031+
1032+ - name: Post Build Upload
1033+ if: always()
1034+ run: |
1035+ python3 build_tools/github_actions/post_build_upload.py \
1036+ --run-id ${{ github.run_id }} \
1037+ --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
1038+ --build-dir build \
1039+ --upload
1040+
1041+ therock-test-linux-multi-node:
1042+ name: "Test multi-node"
1043+ needs: [therock-build-linux]
1044+ uses: ./.github/workflows/therock-test-packages-multi-node.yml
1045+ with:
1046+ amdgpu_families: ${{ inputs.amdgpu_families }}
1047+ test_runs_on: vultr-linux-rocm
1048+ artifact_run_id: ${{ github.run_id }}
1049+
1050+ therock-test-linux-single-node:
1051+ name: "Test single-node"
1052+ needs: [therock-build-linux]
1053+ uses: ./.github/workflows/therock-test-packages-single-node.yml
1054+ with:
1055+ amdgpu_families: ${{ inputs.amdgpu_families }}
1056+ test_runs_on: linux-mi325-1gpu-ossci-rocm
1057+ artifact_run_id: ${{ github.run_id }}
1058diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml
1059new file mode 100644
1060index 0000000..1d866b1
1061--- /dev/null
1062+++ b/.github/workflows/therock-ci.yml
1063@@ -0,0 +1,81 @@
1064+name: TheRock CI for rccl
1065+
1066+on:
1067+ push:
1068+ branches:
1069+ - develop
1070+ workflow_dispatch:
1071+
1072+permissions:
1073+ contents: read
1074+
1075+concurrency:
1076+ # A PR number if a pull request and otherwise the commit hash. This cancels
1077+ # queued and in-progress runs for the same PR (presubmit) or commit
1078+ # (postsubmit). The workflow name is prepended to avoid conflicts between
1079+ # different workflows.
1080+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
1081+ cancel-in-progress: true
1082+
1083+jobs:
1084+ setup:
1085+ runs-on: ubuntu-24.04
1086+ env:
1087+ # Diff against the first parent of the checked-out commit. For a merge
1088+ # commit, that parent is the tip of the base branch before the merge.
1089+ BASE_REF: HEAD^
1090+ outputs:
1091+ enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
1092+ steps:
1093+ - name: "Checking out repository"
1094+ uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
1095+ with:
1096+ # We need the parent commit to do a diff
1097+ fetch-depth: 2
1098+
1099+ - name: "Configuring CI options"
1100+ id: configure
1101+ run: python .github/scripts/therock_configure_ci.py
1102+
1103+ therock-ci-linux:
1104+ name: TheRock CI Linux
1105+ needs: setup
1106+ if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
1107+ permissions:
1108+ contents: read
1109+ id-token: write
1110+ uses: ./.github/workflows/therock-ci-linux.yml
1111+ secrets: inherit
1112+ with:
1113+ amdgpu_families: "gfx94X-dcgpu"
1114+ extra_cmake_options: >
1115+ -DTHEROCK_ENABLE_ALL=OFF
1116+ -DTHEROCK_BUILD_TESTING=ON
1117+ -DTHEROCK_BUNDLE_SYSDEPS=ON
1118+ -DTHEROCK_ENABLE_COMM_LIBS=ON
1119+ -DTHEROCK_ENABLE_ROCPROFV3=ON
1120+ -DTHEROCK_USE_EXTERNAL_RCCL=ON
1121+ -DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
1122+ -DTHEROCK_RCCL_SOURCE_DIR=./rccl
1123+ -DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
1124+ -DTHEROCK_ENABLE_MPI=ON
1125+
1126+ therock_ci_summary:
1127+ name: TheRock CI Summary
1128+ if: always()
1129+ needs:
1130+ - setup
1131+ - therock-ci-linux
1132+ runs-on: ubuntu-24.04
1133+ steps:
1134+ - name: Output failed jobs
1135+ run: |
1136+ echo '${{ toJson(needs) }}'
1137+ FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
1138+ | jq --raw-output \
1139+ 'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
1140+ )"
1141+ if [[ "${FAILED_JOBS}" != "" ]]; then
1142+ echo "The following jobs failed: ${FAILED_JOBS}"
1143+ exit 1
1144+ fi
1145diff --git a/.github/workflows/therock-test-packages-multi-node.yml b/.github/workflows/therock-test-packages-multi-node.yml
1146new file mode 100644
1147index 0000000..0aa2def
1148--- /dev/null
1149+++ b/.github/workflows/therock-test-packages-multi-node.yml
1150@@ -0,0 +1,57 @@
1151+name: TheRock Test Packages multi-node
1152+
1153+on:
1154+ workflow_call:
1155+ inputs:
1156+ amdgpu_families:
1157+ type: string
1158+ test_runs_on:
1159+ type: string
1160+ artifact_run_id:
1161+ type: string
1162+ workflow_dispatch:
1163+ inputs:
1164+ amdgpu_families:
1165+ type: string
1166+ test_runs_on:
1167+ type: string
1168+ artifact_run_id:
1169+ type: string
1170+
1171+permissions:
1172+ contents: read
1173+
1174+jobs:
1175+ test_rccl_multi_node:
1176+ name: 'Test multi-node'
1177+ runs-on: ${{ inputs.test_runs_on }}
1178+ defaults:
1179+ run:
1180+ shell: bash
1181+ env:
1182+ VENV_DIR: ${{ github.workspace }}/.venv
1183+ ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
1184+ OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
1185+ THEROCK_BIN_DIR: "./build/bin"
1186+ steps:
1187+ - name: Checkout Repository
1188+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1189+ with:
1190+ repository: "ROCm/TheRock"
1191+ ref: 890c856134d955441790c8ed2d60ad4fb027f4e5
1192+
1193+ - name: Run setup test environment workflow
1194+ uses: './.github/actions/setup_test_environment'
1195+ with:
1196+ ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
1197+ AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
1198+ OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
1199+ VENV_DIR: ${{ env.VENV_DIR }}
1200+ FETCH_ARTIFACT_ARGS: "--rccl"
1201+ IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
1202+
1203+ - name: Test
1204+ run: |
1205+ source /home/arravikum/TheRock/.venv/bin/activate
1206+ cd /home/arravikum/cvs
1207+ pytest -vvv --log-file=/tmp/rccl_log.log -s ./tests/rccl/rccl_multinode_cvs.py --cluster_file ./input/cluster.json --config_file ./input/mi300_config.json --html=/var/www/html/cvs/ci_test_report.html --capture=tee-sys --self-contained-html
1208diff --git a/.github/workflows/therock-test-packages-single-node.yml b/.github/workflows/therock-test-packages-single-node.yml
1209new file mode 100644
1210index 0000000..809ae81
1211--- /dev/null
1212+++ b/.github/workflows/therock-test-packages-single-node.yml
1213@@ -0,0 +1,69 @@
1214+name: TheRock Test Packages single-node
1215+
1216+on:
1217+ workflow_call:
1218+ inputs:
1219+ amdgpu_families:
1220+ type: string
1221+ test_runs_on:
1222+ type: string
1223+ artifact_run_id:
1224+ type: string
1225+ workflow_dispatch:
1226+ inputs:
1227+ amdgpu_families:
1228+ type: string
1229+ test_runs_on:
1230+ type: string
1231+ artifact_run_id:
1232+ type: string
1233+
1234+permissions:
1235+ contents: read
1236+
1237+jobs:
1238+ test_rccl_single_node:
1239+ name: 'Test single-node'
1240+ runs-on: ${{ inputs.test_runs_on }}
1241+ container:
1242+ image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
1243+ options: --ipc host
1244+ --group-add video
1245+ --device /dev/kfd
1246+ --device /dev/dri
1247+ --group-add 992
1248+ --env-file /etc/podinfo/gha-gpu-isolation-settings
1249+ defaults:
1250+ run:
1251+ shell: bash
1252+ env:
1253+ VENV_DIR: ${{ github.workspace }}/.venv
1254+ ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
1255+ OUTPUT_ARTIFACTS_DIR: "./build"
1256+ THEROCK_BIN_DIR: "./build/bin"
1257+ steps:
1258+ - name: Checkout Repository
1259+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1260+ with:
1261+ repository: "ROCm/TheRock"
1262+ ref: f89dcd5c5625baecb467b9287e952c5c819073fd
1263+
1264+ - name: Run setup test environment workflow
1265+ uses: './.github/actions/setup_test_environment'
1266+ with:
1267+ ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
1268+ AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
1269+ OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
1270+ VENV_DIR: ${{ env.VENV_DIR }}
1271+ FETCH_ARTIFACT_ARGS: "--rccl --tests"
1272+ IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
1273+
1274+ - name: Test
1275+ timeout-minutes: 15
1276+ # Currently, TheRock CI in RCCL always builds with MPI support enabled, which causes the
1277+ # RCCL correctness tests to fail on the mi325 runners which don't have MPI pre-installed.
1278+ # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
1279+ run: |
1280+ pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
1281+ --log-cli-level=info \
1282+ -k "not test_rccl_correctness_tests"
1283diff --git a/.gitignore b/.gitignore
1284index 87ad949..d3dedfb 100644
1285--- a/.gitignore
1286+++ b/.gitignore
1287@@ -1,4 +1,8 @@
1288 # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
1289-/build
1290 *.gcov
1291 /coverage/
1292+build/
1293+ext/
1294+
1295+# Visual Studio Code
1296+.vscode
1297\ No newline at end of file
1298diff --git a/.gitmodules b/.gitmodules
1299new file mode 100644
1300index 0000000..eae2c51
1301--- /dev/null
1302+++ b/.gitmodules
1303@@ -0,0 +1,10 @@
1304+[submodule "ext-src/mscclpp"]
1305+ path = ext-src/mscclpp
1306+ url = https://github.com/microsoft/mscclpp.git
1307+ ignore = dirty
1308+ shallow = true
1309+[submodule "ext-src/json"]
1310+ path = ext-src/json
1311+ url = https://github.com/nlohmann/json.git
1312+ ignore = dirty
1313+ shallow = true
1314diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
1315deleted file mode 100644
1316index d3ad3a2..0000000
1317--- a/.jenkins/common.groovy
1318+++ /dev/null
1319@@ -1,41 +0,0 @@
1320-// This file is for internal AMD use.
1321-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
1322-
1323-def runCompileCommand(platform, project, jobName)
1324-{
1325- project.paths.construct_build_prefix()
1326-
1327- def command = """#!/usr/bin/env bash
1328- set -x
1329- cd ${project.paths.project_build_prefix}
1330- LD_LIBRARY_PATH=/opt/rocm/hcc/lib ${project.paths.build_command}
1331- """
1332-
1333- platform.runCommand(this,command)
1334-}
1335-
1336-def runTestCommand (platform, project, gfilter)
1337-{
1338- String sudo = auxiliary.sudo(platform.jenkinsLabel)
1339-
1340- def command = """#!/usr/bin/env bash
1341- set -x
1342- cd ${project.paths.project_build_prefix}/build/release/test
1343- ${sudo} ulimit -l unlimited
1344- ulimit -a
1345- ${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
1346- """
1347-
1348- platform.runCommand(this, command)
1349- junit "${project.paths.project_build_prefix}/build/release/test/*.xml"
1350-}
1351-
1352-def runPackageCommand(platform, project, jobName)
1353-{
1354- def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
1355-
1356- platform.runCommand(this, packageHelper[0])
1357- platform.archiveArtifacts(this, packageHelper[1])
1358-}
1359-
1360-return this
1361diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy
1362deleted file mode 100644
1363index 647cd2d..0000000
1364--- a/.jenkins/extended.groovy
1365+++ /dev/null
1366@@ -1,88 +0,0 @@
1367-#!/usr/bin/env groovy
1368-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
1369-@Library('rocJenkins@pong') _
1370-
1371-// This is file for internal AMD use.
1372-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
1373-
1374-import com.amd.project.*
1375-import com.amd.docker.*
1376-import java.nio.file.Path
1377-
1378-def runCI =
1379-{
1380- nodeDetails, jobName->
1381-
1382- def prj = new rocProject('rccl', 'Extended')
1383-
1384- prj.timeout.test = 1440
1385- prj.paths.build_command = './install.sh -t '
1386-
1387- // Define test architectures, optional rocm version argument is available
1388- def nodes = new dockerNodes(nodeDetails, jobName, prj)
1389-
1390- boolean formatCheck = false
1391-
1392- def commonGroovy
1393-
1394- def compileCommand =
1395- {
1396- platform, project->
1397-
1398- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
1399- commonGroovy.runCompileCommand(platform, project, jobName)
1400- }
1401-
1402- def testCommand =
1403- {
1404- platform, project->
1405-
1406- commonGroovy.runTestCommand(platform, project, "*")
1407- }
1408-
1409- def packageCommand =
1410- {
1411- platform, project->
1412-
1413- commonGroovy.runPackageCommand(platform, project, jobName)
1414- }
1415-
1416- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
1417-}
1418-
1419-ci: {
1420- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
1421-
1422- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
1423-
1424- propertyList = auxiliary.appendPropertyList(propertyList)
1425-
1426- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([centos8:['8gfx906']])]
1427-
1428- jobNameList = auxiliary.appendJobNameList(jobNameList)
1429-
1430- propertyList.each
1431- {
1432- jobName, property->
1433- if (urlJobName == jobName)
1434- properties(auxiliary.addCommonProperties(property))
1435- }
1436-
1437- jobNameList.each
1438- {
1439- jobName, nodeDetails->
1440- if (urlJobName == jobName)
1441- stage(jobName) {
1442- runCI(nodeDetails, jobName)
1443- }
1444- }
1445-
1446- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
1447- if(!jobNameList.keySet().contains(urlJobName))
1448- {
1449- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
1450- stage(urlJobName) {
1451- runCI([ubuntu18:['4gfx906']], urlJobName)
1452- }
1453- }
1454-}
1455diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
1456deleted file mode 100644
1457index 61dc43f..0000000
1458--- a/.jenkins/precheckin.groovy
1459+++ /dev/null
1460@@ -1,88 +0,0 @@
1461-#!/usr/bin/env groovy
1462-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
1463-@Library('rocJenkins@pong') _
1464-
1465-// This is file for internal AMD use.
1466-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
1467-
1468-import com.amd.project.*
1469-import com.amd.docker.*
1470-import java.nio.file.Path
1471-
1472-def runCI =
1473-{
1474- nodeDetails, jobName->
1475-
1476- def prj = new rocProject('rccl', 'PreCheckin')
1477-
1478- prj.timeout.test = 1440
1479- prj.paths.build_command = './install.sh -t '
1480-
1481- // Define test architectures, optional rocm version argument is available
1482- def nodes = new dockerNodes(nodeDetails, jobName, prj)
1483-
1484- boolean formatCheck = false
1485-
1486- def commonGroovy
1487-
1488- def compileCommand =
1489- {
1490- platform, project->
1491-
1492- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
1493- commonGroovy.runCompileCommand(platform, project, jobName)
1494- }
1495-
1496- def testCommand =
1497- {
1498- platform, project->
1499-
1500- commonGroovy.runTestCommand(platform, project, "-*ManagedMem")
1501- }
1502-
1503- def packageCommand =
1504- {
1505- platform, project->
1506-
1507- commonGroovy.runPackageCommand(platform, project, jobName)
1508- }
1509-
1510- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
1511-}
1512-
1513-ci: {
1514- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
1515-
1516- def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
1517-
1518- propertyList = auxiliary.appendPropertyList(propertyList)
1519-
1520- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['8gfx908'],centos7:['8gfx906'],ubuntu18:['4gfx906', '4gfx908']])]
1521-
1522- jobNameList = auxiliary.appendJobNameList(jobNameList)
1523-
1524- propertyList.each
1525- {
1526- jobName, property->
1527- if (urlJobName == jobName)
1528- properties(auxiliary.addCommonProperties(property))
1529- }
1530-
1531- jobNameList.each
1532- {
1533- jobName, nodeDetails->
1534- if (urlJobName == jobName)
1535- stage(jobName) {
1536- runCI(nodeDetails, jobName)
1537- }
1538- }
1539-
1540- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
1541- if(!jobNameList.keySet().contains(urlJobName))
1542- {
1543- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
1544- stage(urlJobName) {
1545- runCI([ubuntu18:['4gfx906']], urlJobName)
1546- }
1547- }
1548-}
1549diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
1550deleted file mode 100644
1551index aa5fec3..0000000
1552--- a/.jenkins/staticanalysis.groovy
1553+++ /dev/null
1554@@ -1,68 +0,0 @@
1555-#!/usr/bin/env groovy
1556-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
1557-@Library('rocJenkins@pong') _
1558-
1559-// This is file for internal AMD use.
1560-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
1561-
1562-import com.amd.project.*
1563-import com.amd.docker.*
1564-import java.nio.file.Path
1565-
1566-def runCompileCommand(platform, project, jobName, boolean debug=false)
1567-{
1568- project.paths.construct_build_prefix()
1569-
1570- def command = """#!/usr/bin/env bash
1571- set -x
1572- ${project.paths.project_build_prefix}/docs/run_doc.sh
1573- """
1574-
1575- try
1576- {
1577- platform.runCommand(this, command)
1578- }
1579- catch(e)
1580- {
1581- throw e
1582- }
1583-
1584- publishHTML([allowMissing: false,
1585- alwaysLinkToLastBuild: false,
1586- keepAll: false,
1587- reportDir: "${project.paths.project_build_prefix}/docs/source/_build/html",
1588- reportFiles: "index.html",
1589- reportName: "Documentation",
1590- reportTitles: "Documentation"])
1591-}
1592-
1593-def runCI =
1594-{
1595- nodeDetails, jobName->
1596-
1597- def prj = new rocProject('rccl-internal', 'StaticAnalysis')
1598-
1599- // Define test architectures, optional rocm version argument is available
1600- def nodes = new dockerNodes(nodeDetails, jobName, prj)
1601-
1602- boolean formatCheck = false
1603- boolean staticAnalysis = true
1604-
1605- def compileCommand =
1606- {
1607- platform, project->
1608-
1609- runCompileCommand(platform, project, jobName, false)
1610- }
1611-
1612- buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
1613-}
1614-
1615-ci: {
1616- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
1617-
1618- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
1619- stage(urlJobName) {
1620- runCI([ubuntu20:['cpu']], urlJobName)
1621- }
1622-}
1623diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy
1624deleted file mode 100644
1625index 2ab6696..0000000
1626--- a/.jenkins/staticlibrary.groovy
1627+++ /dev/null
1628@@ -1,85 +0,0 @@
1629-#!/usr/bin/env groovy
1630-@Library('rocJenkins@pong') _
1631-import com.amd.project.*
1632-import com.amd.docker.*
1633-import java.nio.file.Path;
1634-
1635-def runCI =
1636-{
1637- nodeDetails, jobName->
1638-
1639- def prj = new rocProject('rccl', 'Static Library PreCheckin')
1640-
1641- prj.timeout.test = 1440
1642- prj.paths.build_command = './install.sh -t --static'
1643-
1644- def nodes = new dockerNodes(nodeDetails, jobName, prj)
1645-
1646- def commonGroovy
1647-
1648- boolean formatCheck = false
1649-
1650- def compileCommand =
1651- {
1652- platform, project->
1653-
1654- commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
1655- commonGroovy.runCompileCommand(platform, project, jobName)
1656- }
1657-
1658-
1659- def testCommand =
1660- {
1661- platform, project->
1662-
1663- commonGroovy.runTestCommand(platform, project, "*sum_float32*")
1664- }
1665-
1666- def packageCommand =
1667- {
1668- platform, project->
1669-
1670- commonGroovy.runPackageCommand(platform, project, jobName)
1671- }
1672-
1673- buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
1674-}
1675-
1676-ci: {
1677- String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
1678-
1679- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
1680- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
1681- "rocm-docker":[]]
1682- propertyList = auxiliary.appendPropertyList(propertyList)
1683-
1684- def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
1685- "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
1686- "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])]
1687- jobNameList = auxiliary.appendJobNameList(jobNameList)
1688-
1689- propertyList.each
1690- {
1691- jobName, property->
1692- if (urlJobName == jobName)
1693- properties(auxiliary.addCommonProperties(property))
1694- }
1695-
1696- jobNameList.each
1697- {
1698- jobName, nodeDetails->
1699- if (urlJobName == jobName)
1700- stage(jobName) {
1701- runCI(nodeDetails, jobName)
1702- }
1703- }
1704-
1705- // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
1706- if(!jobNameList.keySet().contains(urlJobName))
1707- {
1708- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
1709- stage(urlJobName) {
1710- runCI([ubuntu16:['4gfx906']], urlJobName)
1711- }
1712- }
1713-}
1714diff --git a/.readthedocs.yaml b/.readthedocs.yaml
1715new file mode 100644
1716index 0000000..ced8a98
1717--- /dev/null
1718+++ b/.readthedocs.yaml
1719@@ -0,0 +1,18 @@
1720+# Read the Docs configuration file
1721+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
1722+
1723+version: 2
1724+
1725+build:
1726+ os: ubuntu-22.04
1727+ tools:
1728+ python: "3.10"
1729+
1730+sphinx:
1731+ configuration: docs/conf.py
1732+
1733+formats: [htmlzip, pdf, epub]
1734+
1735+python:
1736+ install:
1737+ - requirements: docs/sphinx/requirements.txt
1738diff --git a/CHANGELOG.md b/CHANGELOG.md
1739index 448c973..6da80ac 100644
1740--- a/CHANGELOG.md
1741+++ b/CHANGELOG.md
1742@@ -1,7 +1,209 @@
1743-# Change Log for RCCL
1744+# Changelog for RCCL
1745
1746 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
1747
1748+## Unreleased - RCCL 2.27.7 for ROCm 7.1.0
1749+
1750+### Added
1751+* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
1752+* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
1753+
1754+### Changed
1755+
1756+* The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script.
1757+* Compatibility with NCCL 2.27.7
1758+
1759+### Resolved issues
1760+* Improved small message performance for alltoall by enabling and optimizing batched P2P operations.
1761+
1762+### Known issues
1763+* Symmetric memory kernels are currently disabled due to ongoing CUMEM enablement work.
1764+
1765+## RCCL 2.26.6 for ROCm 7.0.0
1766+
1767+### Resolved issues
1768+
1769+* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
1770+* Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.
1771+* Suboptimal algorithmic switching point for AllReduce on MI300x.
1772+* Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes.
1773+
1774+### Added
1775+
1776+* Added new GPU target `gfx950`.
1777+* Added support for `unroll=1` in device-code generation to improve performance.
1778+* Set a default of 112 channels for a single node with `8 * gfx950`.
1779+* Enabled LL128 protocol on `gfx950`.
1780+* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
1781+* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS`, or `NCCL_MAX_NCHANNELS` are set.
1782+* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
1783+* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
1784+* Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled.
1785+* Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap `bf16` arithmetic and bridge the gap between `fp32` performance and `bf16` for both `gfx942` and `gfx950`. Pipelining has been made tunable via `rcclSetPipelining`, similar to algorithms/protocols so that regression is avoided in certain message sizes.
1786+* Added a direct allgather algorithm. This is enabled by default for multi-node if there are 16 nodes or fewer. The message size threshold is 4MB.
1787+* Added `RCCL_OVERRIDE_PROTO` and `RCCL_OVERRIDE_ALGO` to allow direct replacement of protocol and algorithm choices. Unlike `NCCL_PROTO` and `NCCL_ALGO`, which re-run the model across enabled combinations and may not guarantee the intended override, these new options enforce the specified selections explicitly.
1788+
1789+### Changed
1790+
1791+* Compatibility with NCCL 2.23.4
1792+* Compatibility with NCCL 2.24.3
1793+* Compatibility with NCCL 2.25.1
1794+* Compatibility with NCCL 2.26.6
1795+
1796+## RCCL 2.22.3 for ROCm 6.4.2
1797+
1798+### Added
1799+
1800+* Added support for the LL128 protocol on gfx942.
1801+
1802+## RCCL 2.22.3 for ROCm 6.4.1
1803+
1804+### Resolved issues
1805+
1806+* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode.
1807+* Fixed IntraNet performance.
1808+* Fixed an issue where, in rare circumstances, the application could stop responding due to a proxy thread synchronization issue.
1809+
1810+### Known issues
1811+
1812+* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault.
1813+ The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
1814+* Within the RCCL-UnitTests test suite, failures occur in tests ending with the `ManagedMem` and `ManagedMemGraph` suffixes. These failures only affect the test results and do not affect the RCCL component itself. This issue will be resolved in the next major release.
1815+
1816+## RCCL 2.22.3 for ROCm 6.4.0
1817+
1818+### Added
1819+
1820+* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters.
1821+* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`.
1822+* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`).
1823+
1824+### Changed
1825+
1826+* Compatibility with NCCL 2.22.3
1827+* Added support for the rail-optimized tree algorithm for the MI300 series. This feature requires the use of all eight GPUs within
1828+ each node. It limits NIC traffic to use only GPUs of the same index across nodes and should not impact performance
1829+ on non-rail-optimized network topologies. The original method of building trees can be enabled by setting the
1830+ environment variable `RCCL_DISABLE_RAIL_TREES=1`.
1831+* Additional debug information about how the trees are built can be logged to the GRAPH logging subsys by setting
1832+ `RCCL_OUTPUT_TREES=1`.
1833+* Added documentation about the performance benefits of the NPS4 and CPX partition modes on the MI300X.
1834+
1835+## RCCL 2.21.5 for ROCm 6.3.1
1836+
1837+### Added
1838+
1839+### Changed
1840+
1841+* Enhanced user documentation
1842+
1843+### Resolved issues
1844+
1845+* Corrected user help strings in `install.sh`
1846+
1847+## RCCL 2.21.5 for ROCm 6.3.0
1848+
1849+### Added
1850+
1851+* MSCCL++ integration for AllReduce and AllGather on gfx942
1852+* Performance collection to rccl_replayer
1853+* Tuner Plugin example for MI300
1854+* Tuning table for large number of nodes
1855+* Support for amdclang++
1856+* Allow NIC ID remapping using `NCCL_RINGS_REMAP` environment variable
1857+
1858+### Changed
1859+
1860+* Compatibility with NCCL 2.21.5
1861+* Increased channel count for MI300X multi-node
1862+* Enabled MSCCL for single-process multi-threaded contexts
1863+* Enabled gfx12
1864+* Enabled CPX mode for MI300X
1865+* Enabled tracing with rocprof
1866+* Improved version reporting
1867+* Enabled GDRDMA for Linux kernel 6.4.0+
1868+
1869+### Resolved issues
1870+
1871+* Fixed model matching with PXN enable
1872+
1873+## RCCL 2.20.5 for ROCm 6.2.1
1874+### Fixed
1875+- GDR support flag now set with DMABUF
1876+### Known issues
1877+- On systems running Linux kernel 6.8.0, such as Ubuntu 24.04, Direct Memory Access (DMA) transfers between the GPU and NIC are disabled, which impacts multi-node RCCL performance.
1878+ - This issue was reproduced with RCCL 2.20.5 (ROCm 6.2.0 and 6.2.1) on systems with Broadcom Thor-2 NICs and affects other systems with RoCE networks using Linux 6.8.0 or newer.
1879+ - Older RCCL versions are also impacted.
1880+ - This issue will be addressed in a future ROCm release.
1881+
1882+## RCCL 2.20.5 for ROCm 6.2.0
1883+### Changed
1884+- Compatibility with NCCL 2.20.5
1885+- Compatibility with NCCL 2.19.4
1886+- Performance tuning for some collective operations on MI300
1887+- Enabled NVTX code in RCCL
1888+- Replaced rccl_bfloat16 with hip_bfloat16
1889+- NPKit updates:
1890+ - Warm-up iteration removal is no longer enabled by default; it must now be opted into
1891+ - Doubled the size of buffers to accommodate for more channels
1892+- Modified rings to be rail-optimized topology friendly
1893+- Replaced ROCmSoftwarePlatform links with ROCm links
1894+### Added
1895+- Support for fp8 and rccl_bfloat8
1896+- Support for using HIP contiguous memory
1897+- Implemented ROC-TX for host-side profiling
1898+- Enabled static build
1899+- Added new rome model
1900+- Added fp16 and fp8 cases to unit tests
1901+- New unit test for main kernel stack size
1902+- New -n option for topo_expl to override # of nodes
1903+- Improved debug messages of memory allocations
1904+### Fixed
1905+- Bug when configuring RCCL for only LL128 protocol
1906+- Scratch memory allocation after API change for MSCCL
1907+
1908+## RCCL 2.18.6 for ROCm 6.1.0
1909+### Changed
1910+- Compatibility with NCCL 2.18.6
1911+
1912+## RCCL 2.18.3 for ROCm 6.0.0
1913+### Changed
1914+- Compatibility with NCCL 2.18.3
1915+
1916+## RCCL 2.17.1-1 for ROCm 5.7.0
1917+### Changed
1918+- Compatibility with NCCL 2.17.1-1
1919+- Performance tuning for some collective operations
1920+### Added
1921+- Minor improvements to MSCCL codepath
1922+- NCCL_NCHANNELS_PER_PEER support
1923+- Improved compilation performance
1924+- Support for gfx94x
1925+### Fixed
1926+- Potential race-condition during ncclSocketClose()
1927+
1928+## RCCL 2.16.2 for ROCm 5.6.0
1929+### Changed
1930+- Compatibility with NCCL 2.16.2
1931+### Fixed
1932+- Remove workaround and use indirect function call
1933+
1934+## RCCL 2.15.5 for ROCm 5.5.0
1935+### Changed
1936+- Compatibility with NCCL 2.15.5
1937+- Unit test executable renamed to rccl-UnitTests
1938+### Added
1939+- HW-topology aware binary tree implementation
1940+- Experimental support for MSCCL
1941+- New unit tests for hipGraph support
1942+- NPKit integration
1943+### Fixed
1944+- rocm-smi ID conversion
1945+- Support for HIP_VISIBLE_DEVICES for unit tests
1946+- Support for p2p transfers to non (HIP) visible devices
1947+### Removed
1948+- Removed TransferBench from tools. Exists in standalone repo: https://github.com/ROCm/TransferBench
1949+
1950 ## RCCL-2.13.4 for ROCm 5.4.0
1951 ### Changed
1952 - Compatibility with NCCL 2.13.4
1953diff --git a/CMakeLists.txt b/CMakeLists.txt
1954index 67a2069..c8281f2 100644
1955--- a/CMakeLists.txt
1956+++ b/CMakeLists.txt
1957@@ -1,61 +1,388 @@
1958-# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
1959+# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
1960+# Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
1961
1962-cmake_minimum_required(VERSION 3.5)
1963-INCLUDE(CheckIncludeFiles)
1964-INCLUDE(CheckSymbolExists)
1965+# CMake version minimum requirements
1966+#==================================================================================================
1967+cmake_minimum_required(VERSION 3.16)
1968
1969-# We use C++14 features, this will add compile option: -std=c++14
1970-set( CMAKE_CXX_STANDARD 14 )
1971-# Without this line, it will add -std=gnu++14 instead, which has some issues.
1972-set( CMAKE_CXX_EXTENSIONS OFF )
1973+# CMake Toolchain file to define compilers and path to ROCm
1974+#==================================================================================================
1975+if (NOT CMAKE_TOOLCHAIN_FILE)
1976+ set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
1977+ message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
1978+endif()
1979
1980+# RCCL project
1981+#==================================================================================================
1982 project(rccl CXX)
1983
1984-include(cmake/Dependencies.cmake)
1985+# Build options
1986+#==================================================================================================
1987+option(BUILD_ADDRESS_SANITIZER "Enable address sanitizer" OFF)
1988+option(BUILD_BFD "Enable custom backtrace (if bfd.h exists)" OFF)
1989+option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
1990+option(BUILD_SHARED_LIBS "Build as shared library" ON)
1991+option(BUILD_TESTS "Build unit test programs" OFF)
1992+option(COLLTRACE "Collective Trace Option" ON)
1993+option(DUMP_ASM "Disassemble and dump" OFF)
1994+option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
1995+option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
1996+option(ENABLE_MSCCLPP "Enable MSCCL++" OFF)
1997+option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP" OFF)
1998+option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF)
1999+option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++" OFF)
2000+option(ENABLE_NPKIT "Enable NPKit" OFF)
2001+option(ENABLE_IFC "Enable indirect function call" OFF)
2002+option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF)
2003+option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
2004+option(ROCTX "Enable ROCTX" ON)
2005+option(PROFILE "Enable profiling" OFF)
2006+option(TIMETRACE "Enable time-trace during compilation" OFF)
2007+option(TRACE "Enable additional tracing" OFF)
2008+option(FAULT_INJECTION "Enable fault injection" ON)
2009+
2010+# Default GPU architectures to build
2011+#==================================================================================================
2012+set(DEFAULT_GPUS
2013+ gfx906
2014+ gfx908
2015+ gfx90a
2016+ gfx942
2017+ gfx950
2018+ gfx1030
2019+ gfx1100
2020+ gfx1101
2021+ gfx1102
2022+ gfx1200
2023+ gfx1201)
2024+
2025+# Load CMake modules
2026+#==================================================================================================
2027+include(CheckIncludeFiles)
2028+include(CheckSymbolExists)
2029+include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
2030+include(cmake/CheckSymbolExistsNoWarn.cmake)
2031+
2032+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
2033
2034-# Detect compiler support for target ID
2035-# This section is deprecated. Please use rocm_check_target_ids for future use.
2036-if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
2037- execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help"
2038- OUTPUT_VARIABLE CXX_OUTPUT
2039- OUTPUT_STRIP_TRAILING_WHITESPACE
2040- ERROR_STRIP_TRAILING_WHITESPACE)
2041- string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT})
2042+# Build only for local GPU architecture
2043+if (BUILD_LOCAL_GPU_TARGET_ONLY)
2044+ message(STATUS "Building only for local GPU target")
2045+ if (COMMAND rocm_local_targets)
2046+ rocm_local_targets(DEFAULT_GPUS)
2047+ else()
2048+ message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
2049+ endif()
2050 endif()
2051
2052+# Determine which GPU architectures to build for
2053+set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
2054+
2055+# Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
2056+if (BUILD_ADDRESS_SANITIZER)
2057+ SET(amdgpu_targets "")
2058+ foreach(amdgpu_target IN LISTS GPU_TARGETS)
2059+ if(NOT amdgpu_target STREQUAL "")
2060+ string(FIND "${amdgpu_target}" ":xnack+" HAS_XNACK_SUFFIX)
2061+ if(HAS_XNACK_SUFFIX EQUAL -1)
2062+ list(APPEND amdgpu_targets "${amdgpu_target}:xnack+")
2063+ else()
2064+ list(APPEND amdgpu_targets "${amdgpu_target}")
2065+ endif()
2066+ endif()
2067+ endforeach()
2068+ SET(GPU_TARGETS "${amdgpu_targets}")
2069+endif()
2070+
2071+# Check if clang compiler can offload to GPU_TARGETS
2072+if (COMMAND rocm_check_target_ids)
2073+ message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
2074+ rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
2075+else()
2076+ message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
2077+ set(SUPPORTED_GPUS ${DEFAULT_GPUS})
2078+endif()
2079+
2080+set(GPU_TARGETS "${SUPPORTED_GPUS}")
2081+message(STATUS "Compiling for ${GPU_TARGETS}")
2082+
2083+## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
2084+include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
2085+
2086+# Try to establish ROCM_PATH (for find_package)
2087+#==================================================================================================
2088 if(NOT DEFINED ROCM_PATH)
2089- get_filename_component(_real_path ${CMAKE_CXX_COMPILER} REALPATH)
2090- get_filename_component(_new_path "${_real_path}" DIRECTORY)
2091- get_filename_component(ROCM_PATH "${_new_path}/../.." REALPATH)
2092+ # Guess default location
2093+ set(ROCM_PATH "/opt/rocm")
2094+ message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
2095+else()
2096+ message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
2097 endif()
2098+set(ENV{ROCM_PATH} ${ROCM_PATH})
2099
2100+if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+")
2101+ message(STATUS "Compiling with amdclang++")
2102+ set(COMPILER_EXE_NAME amdclang++)
2103+ set(COMPILER_GREP_STRING "AMD clang version")
2104+ set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
2105+elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")
2106+ message(STATUS "Compiling with clang++")
2107+ set(COMPILER_EXE_NAME clang++)
2108+ set(COMPILER_GREP_STRING "AMD clang version")
2109+ set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
2110+elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$")
2111+ message(STATUS "Compiling with hipcc")
2112+ set(COMPILER_EXE_NAME hipcc)
2113+ set(COMPILER_GREP_STRING "HIP version")
2114+ set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'")
2115+else()
2116+ message(FATAL_ERROR "RCCL can be built only with hipcc or amdclang++")
2117+endif()
2118+
2119+# Set CMAKE flags
2120+#==================================================================================================
2121 set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")
2122+set(CMAKE_CXX_STANDARD 17) # We use C++17 features, this will add compile option: -std=c++17
2123+set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++17 instead, which has some issues.
2124+if(ROCM_PATH)
2125+ list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths (for finding HIP / HSA)
2126+ ${ROCM_PATH}
2127+ ${ROCM_PATH}/hip
2128+ ${ROCM_PATH}/llvm)
2129+endif()
2130+
2131+# Check for required dependencies
2132+#==================================================================================================
2133+## Check for Threads
2134+set(THREADS_PREFER_PTHREAD_FLAG ON)
2135+find_package(Threads REQUIRED)
2136
2137-#Set the AMDGPU_TARGETS with backward compatiblity
2138-if(COMMAND rocm_check_target_ids)
2139- rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
2140- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030;gfx1100;gfx1101;gfx1102"
2141- )
2142+## Check for HIP
2143+find_package(hip REQUIRED)
2144+message(STATUS "HIP compiler: ${HIP_COMPILER}")
2145+message(STATUS "HIP runtime: ${HIP_RUNTIME}")
2146+if(NOT "${HIP_COMPILER}" MATCHES "clang")
2147+ message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)")
2148+endif()
2149+
2150+## Check for compiler version
2151+find_program(compiler_executable ${COMPILER_EXE_NAME})
2152+message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}")
2153+execute_process(
2154+ COMMAND bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}"
2155+ OUTPUT_VARIABLE compiler_version_string)
2156+message(STATUS "${COMPILER_EXE_NAME} version: ${compiler_version_string}")
2157+
2158+## Check for HIP version
2159+find_program(hipconfig_executable hipconfig)
2160+message(STATUS "hipconfig executable: ${hipconfig_executable}")
2161+execute_process(
2162+ COMMAND bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'"
2163+ OUTPUT_VARIABLE hip_version_string)
2164+message(STATUS "${COMPILER_EXE_NAME} HIP version: ${hip_version_string}")
2165+
2166+## Check for ROCm version
2167+set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)")
2168+if(EXPLICIT_ROCM_VERSION)
2169+ set(rocm_version_string "${EXPLICIT_ROCM_VERSION}")
2170+elseif(ROCM_PATH)
2171+ message(STATUS "Reading ROCM version from ${ROCM_PATH}/.info/version")
2172+ file(READ "${ROCM_PATH}/.info/version" rocm_version_string)
2173 else()
2174- # Use target ID syntax if supported for AMDGPU_TARGETS
2175- if(TARGET_ID_SUPPORT)
2176- set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx1100;gfx1101;gfx1102")
2177+ message(FATAL_ERROR "Could not determine ROCM version (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)")
2178+endif()
2179+string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string})
2180+if (rocm_version_matches)
2181+ set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1})
2182+ set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2})
2183+ set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3})
2184+
2185+ message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}")
2186+
2187+ # Convert the version components to int for comparison
2188+ math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}")
2189+ add_definitions("-DROCM_VERSION=${ROCM_VERSION}")
2190+else()
2191+ message(WARNING "Failed to extract ROCm version.")
2192+endif()
2193+
2194+### Required for checking HIP device symbols when building with amdclang++
2195+set(CMAKE_REQUIRED_LIBRARIES hip::device)
2196+
2197+### Check for hipDeviceMallocUncached support
2198+check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
2199+
2200+### Check for hipHostMallocUncached support
2201+check_symbol_exists("hipHostMallocUncached" "hip/hip_runtime_api.h" HIP_HOST_UNCACHED_MEMORY)
2202+
2203+### Check for hipDeviceMallocContiguous support
2204+check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
2205+
2206+unset(CMAKE_REQUIRED_LIBRARIES)
2207+
2208+### Check for indirect function call support
2209+if(ENABLE_IFC)
2210+ if("${hip_version_string}" VERSION_GREATER_EQUAL "5.5.30201")
2211+ set(IFC_ENABLED ON)
2212+ message(STATUS "Indirect function call enabled")
2213+ else()
2214+ set(IFC_ENABLED OFF)
2215+ message(WARNING "Indirect function call disabled - requires HIP version >= 5.5.30201")
2216+ endif()
2217+else()
2218+ set(IFC_ENABLED OFF)
2219+endif()
2220+
2221+## Check for LL128 support
2222+if("${hip_version_string}" VERSION_GREATER_EQUAL "6.1.33591")
2223+ set(LL128_ENABLED ON)
2224+ message(STATUS "RCCL LL128 protocol enabled")
2225+else()
2226+ message(STATUS "RCCL LL128 protocol disabled - requires HIP version >= 6.1.33591")
2227+endif()
2228+
2229+## Check for hsa-runtime64
2230+find_package(hsa-runtime64 REQUIRED)
2231+get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
2232+message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
2233+
2234+## Check for ROCM-smi
2235+find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
2236+if (rocm_smi_FOUND)
2237+ message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
2238+else()
2239+ message(STATUS "Checking old include directory structure for rocm_smi")
2240+ set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
2241+ set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
2242+ set(ROCM_SMI_LIBRARIES rocm_smi64)
2243+endif()
2244+check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
2245+### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
2246+file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
2247+string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
2248+if(${matchres} EQUAL -1)
2249+ message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
2250+else()
2251+ message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
2252+ set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
2253+endif ()
2254+
2255+## Check for BFD library if custom backtrace is requested
2256+if(BUILD_BFD)
2257+ enable_language(C)
2258+ check_include_files(bfd.h HAVE_BFD)
2259+ if (HAVE_BFD)
2260+ message(STATUS "-- Found BFD support")
2261+
2262+ ### Required for checking HIP device symbols when building with amdclang++
2263+ set(CMAKE_REQUIRED_LIBRARIES hip::device)
2264+
2265+ # Check for specific BFD feature support
2266+ CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
2267+ CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
2268+ CHECK_CXX_SOURCE_COMPILES(
2269+ "#include <bfd.h>
2270+
2271+ int main (int argc, char **argv){
2272+ bfd_size_type size;
2273+ bfd abfd;
2274+ asection sec;
2275+ size = bfd_section_size(&abfd, &sec);
2276+ return (int)(size);
2277+ }"
2278+ HAVE_TWO_ARG_BFD_SECTION_SIZE)
2279+
2280+ unset(CMAKE_REQUIRED_LIBRARIES)
2281+
2282+ # Check for iberty support
2283+ find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/ PATH_SUFFIXES x86_64-linux-gnu)
2284+ if(HAVE_IBERTY)
2285+ message(STATUS "iberty found @ ${HAVE_IBERTY}")
2286+ endif()
2287+
2288+ # Check for demangle support
2289+ find_path(DEMANGLE_DIR demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
2290+ if(NOT DEMANGLE_DIR)
2291+ message(WARNING "Could not find demangle.h ${DEMANGLE_DIR}")
2292 else()
2293- set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908")
2294+ message(STATUS "Found demangle.h in ${DEMANGLE_DIR}")
2295 endif()
2296+ else()
2297+ message(WARNING "bfd.h header not found - Disabling custom backtrace")
2298+ endif()
2299+endif()
2300+
2301+# Check for --amdgpu-kernarg-preload-count
2302+check_cxx_compiler_flag("-mllvm --amdgpu-kernarg-preload-count=16" HAVE_KERNARG_PRELOAD)
2303+if (HAVE_KERNARG_PRELOAD)
2304+ message(STATUS "Kernarg preloading to SGPR enabled")
2305+endif()
2306+
2307+check_cxx_compiler_flag("-parallel-jobs=12" HAVE_PARALLEL_JOBS)
2308+if (HAVE_PARALLEL_JOBS)
2309+ message(STATUS "Parallel jobs enabled")
2310+endif()
2311+
2312+## Disable building MSCCL++ if the build environment is invalid
2313+## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
2314+set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
2315+
2316+# Check if any of the supported architectures are in GPU_TARGETS
2317+set(ARCH_MATCH_FOUND OFF)
2318+set(MSCCLPP_GPU_TARGETS "")
2319+foreach(ARCH IN LISTS GPU_TARGETS)
2320+ if(ARCH IN_LIST MSCCLPP_SUPPORTED_ARCHS)
2321+ set(ARCH_MATCH_FOUND ON)
2322+ list(APPEND MSCCLPP_GPU_TARGETS "${ARCH}")
2323+ endif()
2324+endforeach()
2325+set(MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}" CACHE STRING "GPU Targets supported by MSCCL++" FORCE)
2326+
2327+if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
2328+ set(ENABLE_MSCCLPP OFF)
2329+ message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS: ${MSCCLPP_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling MSCCL++ build")
2330+endif()
2331+
2332+# MSCCL++ is only supported on ROCm 6.2.0 or newer
2333+if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
2334+ set(ENABLE_MSCCLPP OFF)
2335+ message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
2336+endif()
2337+
2338+# cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
2339+execute_process(
2340+ COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
2341+ OUTPUT_VARIABLE HOST_OS_ID
2342+ OUTPUT_STRIP_TRAILING_WHITESPACE
2343+)
2344+
2345+execute_process(
2346+ COMMAND bash -c "grep '^ID_LIKE=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
2347+ OUTPUT_VARIABLE HOST_OS_FAMILY
2348+ OUTPUT_STRIP_TRAILING_WHITESPACE
2349+)
2350+
2351+if (ENABLE_MSCCLPP AND NOT(${HOST_OS_ID} STREQUAL "ubuntu" OR ${HOST_OS_ID} STREQUAL "centos"))
2352+ set(ENABLE_MSCCLPP OFF)
2353+ message(WARNING "MSCCL++ integration not supported on this OS (${HOST_OS_ID}); disabling MSCCL++ build")
2354 endif()
2355-set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
2356
2357-option(BUILD_TESTS "Build test programs" OFF)
2358-option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
2359-option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
2360-option(BUILD_ALLREDUCE_ONLY "Build AllReduce + sum + float kernel only" OFF)
2361-#Set the header wrapper ON by default.
2362-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON)
2363+# Check for ROCTX
2364+if(ROCTX)
2365+ find_library(ROCTX_LIB NAMES roctx64)
2366+ if(ROCTX_LIB)
2367+ set(ROCTX_ENABLE ON)
2368+ message(STATUS "ROCTX library found: ${ROCTX_LIB}")
2369+ else()
2370+ message(WARNING "ROCTX library not found. Skipping ROCTX linking.")
2371+ endif()
2372+endif()
2373
2374-# parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
2375-# NCCL_SUFFIX is optional NCCL_VERSION formatting is ((X) * 1000 + (Y) * 100 +
2376-# (Z)) so we must first detect one or two digits first
2377+# Determine version from makefiles/version.mk and fill in templates
2378+#==================================================================================================
2379+## parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
2380+## NCCL_SUFFIX is optional
2381+## NCCL_VERSION formatting is ((X) * 1000 + (Y) * 100 + (Z)) so we must detect one or two digits first
2382 file(READ makefiles/version.mk version_mk_text)
2383 if("${version_mk_text}" MATCHES "NCCL_MAJOR *:= *([0-9]*)")
2384 set(NCCL_MAJOR ${CMAKE_MATCH_1})
2385@@ -88,295 +415,966 @@ else()
2386 set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}0${NCCL_PATCH}")
2387 endif()
2388
2389-# Setup VERSION
2390+## Setup VERSION
2391 set(VERSION_STRING "${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}")
2392 rocm_setup_version(VERSION ${VERSION_STRING})
2393
2394-list(APPEND CMAKE_PREFIX_PATH
2395- ${ROCM_PATH}
2396- ${ROCM_PATH}/hip
2397- ${ROCM_PATH}/llvm
2398- ${ROCM_PATH}/hcc)
2399+## Fill in version information for main header file
2400+configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/rccl/rccl.h) # For external linking
2401+configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h) # Used by some internal files
2402
2403-find_package(hip REQUIRED)
2404-message(STATUS "HIP compiler: ${HIP_COMPILER}")
2405-message(STATUS "HIP runtime: ${HIP_RUNTIME}")
2406+# Collect list of all source files
2407+#==================================================================================================
2408+# E.g: find src -type f \( -name "*.cc" -o -name "*.h" -o -name "*.hpp" \) | sort
2409+set(SRC_FILES
2410+ src/allocator.cc
2411+ src/bootstrap.cc
2412+ src/channel.cc
2413+ src/collectives.cc
2414+ src/debug.cc
2415+ src/enqueue.cc
2416+ src/group.cc
2417+ src/init.cc
2418+ src/init_nvtx.cc
2419+ src/mnnvl.cc
2420+ src/msccl.cc
2421+ src/proxy.cc
2422+ src/rccl_wrap.cc
2423+ src/symmetric.cc
2424+ src/transport.cc
2425+ src/device/all_gather.h
2426+ src/device/all_reduce.h
2427+ src/device/alltoall_pivot.h
2428+ src/device/broadcast.h
2429+ src/device/common.h
2430+ src/device/common_kernel.h
2431+ src/device/op128.h
2432+ src/device/primitives.h
2433+ src/device/prims_ll128.h
2434+ src/device/prims_ll.h
2435+ src/device/prims_simple.h
2436+ src/device/reduce.h
2437+ src/device/reduce_kernel.h
2438+ src/device/reduce_scatter.h
2439+ src/device/rccl_metadata.h
2440+ src/device/sendrecv.h
2441+ src/device/common.cu
2442+ src/device/onerank.cu
2443+ src/device/network/unpack/unpack_defs.h
2444+ src/device/network/unpack/unpack.h
2445+ src/device/symmetric/all_gather.cuh
2446+ src/device/symmetric/all_reduce.cuh
2447+ src/device/symmetric/kernel.cuh
2448+ src/device/symmetric/primitives.cuh
2449+ src/device/symmetric/reduce_scatter.cuh
2450+ src/graph/connect.cc
2451+ src/graph/paths.cc
2452+ src/graph/rings.cc
2453+ src/graph/rings.h
2454+ src/graph/rome_models.cc
2455+ src/graph/rome_models.h
2456+ src/graph/search.cc
2457+ src/graph/topo.cc
2458+ src/graph/topo.h
2459+ src/graph/trees.cc
2460+ src/graph/tuning.cc
2461+ src/graph/xml.cc
2462+ src/graph/xml.h
2463+ src/include/alloc.h
2464+ src/include/allocator.h
2465+ src/include/alt_rsmi.h
2466+ src/include/archinfo.h
2467+ src/include/api_trace.h
2468+ src/include/argcheck.h
2469+ src/include/BfdBacktrace.hpp
2470+ src/include/bitops.h
2471+ src/include/bootstrap.h
2472+ src/include/channel.h
2473+ src/include/checks.h
2474+ src/include/collectives.h
2475+ src/include/coll_net.h
2476+ src/include/comm.h
2477+ src/include/core.h
2478+ src/include/cpuset.h
2479+# src/include/cudawrap.h
2480+ src/include/debug.h
2481+ src/include/device.h
2482+ src/include/enqueue.h
2483+ src/include/gdrwrap.h
2484+ src/include/git_version.h
2485+ src/include/graph.h
2486+ src/include/group.h
2487+ src/include/hip_rocm_version_info.h
2488+ src/include/ibvcore.h
2489+ src/include/ibvsymbols.h
2490+ src/include/ibvwrap.h
2491+ src/include/info.h
2492+ src/include/ipcsocket.h
2493+ src/include/mnnvl.h
2494+ src/include/nccl_common.h
2495+ src/include/net_device.h
2496+ src/include/net.h
2497+ src/include/nvmlwrap.h
2498+ src/include/nvtx.h
2499+ src/include/nvtx_payload_schemas.h
2500+ src/include/nvtx_stub.h
2501+ src/include/p2p.h
2502+ src/include/param.h
2503+ src/include/profiler.h
2504+ src/include/proxy.h
2505+ src/include/ras.h
2506+ src/include/rccl_common.h
2507+ src/include/rccl_vars.h
2508+ src/include/register.h
2509+ src/include/register_inline.h
2510+ src/include/rccl_float8.h
2511+ src/include/rocm_smi_wrap.h
2512+ src/include/rocmwrap.h
2513+ src/include/roctx.h
2514+ src/include/recorder.h
2515+ src/include/shm.h
2516+ src/include/shmutils.h
2517+ src/include/signals.h
2518+ src/include/socket.h
2519+ src/include/strongstream.h
2520+ src/include/symmetric.h
2521+ src/include/timer.h
2522+ src/include/transport.h
2523+ src/include/trees.h
2524+ src/include/tuner.h
2525+ src/include/utils.h
2526+ src/include/mlx5/mlx5dvcore.h
2527+ src/include/mlx5/mlx5dvsymbols.h
2528+ src/include/mlx5/mlx5dvwrap.h
2529+ src/include/msccl/msccl_lifecycle.h
2530+ src/include/msccl/msccl_parser.h
2531+ src/include/msccl/msccl_scheduler.h
2532+ src/include/msccl/msccl_setup.h
2533+ src/include/msccl/msccl_status.h
2534+ src/include/msccl/msccl_struct.h
2535+ src/include/npkit/npkit.h
2536+ src/include/npkit/npkit_event.h
2537+ src/include/npkit/npkit_struct.h
2538+ src/include/nvtx3/nvToolsExt.h
2539+ src/include/nvtx3/nvToolsExtCounters.h
2540+ src/include/nvtx3/nvToolsExtCuda.h
2541+ src/include/nvtx3/nvToolsExtCudaRt.h
2542+ src/include/nvtx3/nvToolsExtMem.h
2543+ src/include/nvtx3/nvToolsExtMemCudaRt.h
2544+ src/include/nvtx3/nvToolsExtOpenCL.h
2545+ src/include/nvtx3/nvToolsExtPayload.h
2546+ src/include/nvtx3/nvToolsExtPayloadHelper.h
2547+ src/include/nvtx3/nvToolsExtSemanticsCounters.h
2548+ src/include/nvtx3/nvToolsExtSemanticsScope.h
2549+ src/include/nvtx3/nvToolsExtSync.h
2550+ src/include/nvtx3/nvtx3.hpp
2551+ src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h
2552+ src/include/nvtx3/nvtxDetail/nvtxExtImpl.h
2553+ src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
2554+ src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
2555+ src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h
2556+ src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
2557+ src/include/nvtx3/nvtxDetail/nvtxExtInit.h
2558+ src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h
2559+ src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h
2560+ src/include/nvtx3/nvtxDetail/nvtxExtTypes.h
2561+ src/include/nvtx3/nvtxDetail/nvtxImpl.h
2562+ src/include/nvtx3/nvtxDetail/nvtxImplCore.h
2563+ src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
2564+ src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
2565+ src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
2566+ src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
2567+ src/include/nvtx3/nvtxDetail/nvtxInit.h
2568+ src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
2569+ src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
2570+ src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
2571+ src/include/nvtx3/nvtxDetail/nvtxTypes.h
2572+ src/include/proxy_trace/proxy_trace.h
2573+ src/include/plugin/nccl_net.h
2574+ src/include/plugin/nccl_profiler.h
2575+ src/include/plugin/nccl_tuner.h
2576+ src/include/plugin/plugin.h
2577+ src/include/plugin/net/net_v6.h
2578+ src/include/plugin/net/net_v7.h
2579+ src/include/plugin/net/net_v8.h
2580+ src/include/plugin/net/net_v9.h
2581+ src/include/plugin/net/net_v10.h
2582+ src/include/plugin/profiler/net_ib_v1.h
2583+ src/include/plugin/profiler/net_ib.h
2584+ src/include/plugin/profiler/net_socket_v1.h
2585+ src/include/plugin/profiler/net_socket.h
2586+ src/include/plugin/profiler/profiler_v1.h
2587+ src/include/plugin/profiler/profiler_v2.h
2588+ src/include/plugin/profiler/profiler_v3.h
2589+ src/include/plugin/profiler/profiler_v4.h
2590+ src/include/plugin/tuner/tuner_v2.h
2591+ src/include/plugin/tuner/tuner_v3.h
2592+ src/include/plugin/tuner/tuner_v4.h
2593+ src/misc/alt_rsmi.cc
2594+ src/misc/archinfo.cc
2595+ src/misc/argcheck.cc
2596+ src/misc/api_trace.c
2597+ src/misc/api_trace.cc
2598+# src/misc/cudawrap.cc
2599+# src/misc/gdrwrap.cc
2600+ src/misc/ibvsymbols.cc
2601+ src/misc/ibvwrap.cc
2602+ src/misc/ipcsocket.cc
2603+ src/misc/mlx5dvsymbols.cc
2604+ src/misc/mlx5dvwrap.cc
2605+ src/misc/npkit.cc
2606+# src/misc/nvmlwrap.cc
2607+ src/misc/nvmlwrap_stub.cc
2608+ src/misc/param.cc
2609+ src/misc/rocm_smi_wrap.cc
2610+ src/misc/rocmwrap.cc
2611+ src/misc/roctx.cc
2612+ src/misc/recorder.cc
2613+ src/misc/shmutils.cc
2614+ src/misc/signals.cc
2615+ src/misc/socket.cc
2616+ src/misc/strongstream.cc
2617+ src/misc/utils.cc
2618+ src/misc/msccl/msccl_lifecycle.cc
2619+ src/misc/msccl/msccl_parser.cc
2620+ src/misc/msccl/msccl_setup.cc
2621+ src/misc/msccl/msccl_status.cc
2622+ src/misc/proxy_trace/proxy_trace.cc
2623+ src/plugin/net.cc
2624+ src/plugin/plugin_open.cc
2625+ src/plugin/profiler.cc
2626+ src/plugin/tuner.cc
2627+ src/plugin/net/net_v6.cc
2628+ src/plugin/net/net_v7.cc
2629+ src/plugin/net/net_v8.cc
2630+ src/plugin/net/net_v9.cc
2631+ src/plugin/net/net_v10.cc
2632+ src/plugin/profiler/profiler_v1.cc
2633+ src/plugin/profiler/profiler_v2.cc
2634+ src/plugin/profiler/profiler_v3.cc
2635+ src/plugin/profiler/profiler_v4.cc
2636+ src/plugin/tuner/tuner_v2.cc
2637+ src/plugin/tuner/tuner_v3.cc
2638+ src/plugin/tuner/tuner_v4.cc
2639+ src/ras/client.cc
2640+ src/ras/client_support.cc
2641+ src/ras/collectives.cc
2642+ src/ras/peers.cc
2643+ src/ras/ras.cc
2644+ src/ras/ras_internal.h
2645+ src/ras/rasnet.cc
2646+ src/register/coll_reg.cc
2647+ src/register/register.cc
2648+ src/register/sendrecv_reg.cc
2649+ src/transport/coll_net.cc
2650+ src/transport/generic.cc
2651+ src/transport/net.cc
2652+ src/transport/net_ib.cc
2653+ src/transport/net_socket.cc
2654+ src/transport/nvls.cc
2655+ src/transport/p2p.cc
2656+ src/transport/profiler.cc
2657+ src/transport/shm.cc
2658+ src/include/latency_profiler/CollTrace.h
2659+ src/include/latency_profiler/CollTraceEvent.h
2660+ src/include/latency_profiler/CollTraceFunc.h
2661+ src/include/latency_profiler/CollTraceUtils.h
2662+ src/include/latency_profiler/EventQueue.h
2663+ src/misc/latency_profiler/CollTrace.cc
2664+ src/misc/latency_profiler/CollTraceEvent.cc
2665+ src/misc/latency_profiler/CollTraceFunc.cc
2666+ src/misc/latency_profiler/CollTraceUtils.cc
2667+)
2668
2669-if(BUILD_STATIC)
2670- option(BUILD_SHARED_LIBS "Build as a shared library" OFF)
2671-else()
2672- option(BUILD_SHARED_LIBS "Build as a shared library" ON)
2673+if (ENABLE_MSCCL_KERNEL)
2674+ set(MSCCL_KERNEL_SOURCES
2675+ src/device/msccl_kernel_impl.h
2676+ src/include/msccl/msccl_kernel.h
2677+ )
2678+ list(APPEND SRC_FILES ${MSCCL_KERNEL_SOURCES})
2679 endif()
2680
2681-if(BUILD_ADDRESS_SANITIZER)
2682- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan")
2683- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan")
2684- add_link_options(-fuse-ld=lld)
2685-endif()
2686-
2687-configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/rccl/rccl.h)
2688-configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/rccl/nccl.h)
2689-
2690-include_directories(${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
2691-include_directories(${PROJECT_BINARY_DIR}/include/rccl) # for generated rccl.h header
2692-include_directories(src)
2693-include_directories(src/include)
2694-include_directories(src/collectives)
2695-include_directories(src/collectives/device)
2696-
2697-if (BUILD_ALLREDUCE_ONLY)
2698- add_definitions(-DBUILD_ALLREDUCE_ONLY)
2699- set(CU_SOURCES
2700- src/collectives/device/all_reduce.cu
2701- src/collectives/device/sendrecv.cu
2702- src/collectives/device/functions.cu)
2703-else()
2704- set(CU_SOURCES
2705- src/collectives/device/all_reduce.cu
2706- src/collectives/device/all_gather.cu
2707- src/collectives/device/alltoall_pivot.cu
2708- src/collectives/device/reduce.cu
2709- src/collectives/device/broadcast.cu
2710- src/collectives/device/reduce_scatter.cu
2711- src/collectives/device/sendrecv.cu
2712- src/collectives/device/onerank_reduce.cu
2713- src/collectives/device/functions.cu)
2714-endif()
2715-
2716-set(CPP_SOURCES)
2717-foreach(filename ${CU_SOURCES})
2718- string(REPLACE ".cu"
2719- ".cpp"
2720- cpp_filename
2721- ${filename})
2722- configure_file(${filename} ${cpp_filename} COPYONLY)
2723- list(APPEND CPP_SOURCES ${cpp_filename})
2724-endforeach(filename)
2725-
2726-set(CC_SOURCES
2727- src/init.cc
2728- src/graph/trees.cc
2729- src/graph/rings.cc
2730- src/graph/paths.cc
2731- src/graph/search.cc
2732- src/graph/connect.cc
2733- src/graph/tuning.cc
2734- src/graph/topo.cc
2735- src/graph/xml.cc
2736- src/graph/rome_models.cc
2737- src/collectives/all_reduce_api.cc
2738- src/collectives/all_gather_api.cc
2739- src/collectives/reduce_api.cc
2740- src/collectives/broadcast_api.cc
2741- src/collectives/reduce_scatter_api.cc
2742- src/collectives/sendrecv_api.cc
2743- src/collectives/gather_api.cc
2744- src/collectives/scatter_api.cc
2745- src/collectives/all_to_all_api.cc
2746- src/collectives/all_to_allv_api.cc
2747- src/channel.cc
2748- src/misc/argcheck.cc
2749- src/misc/nvmlwrap_stub.cc
2750- src/misc/utils.cc
2751- src/misc/ibvwrap.cc
2752- src/misc/nvmlwrap_stub.cc
2753- src/misc/rocm_smi_wrap.cc
2754- src/misc/profiler.cc
2755- src/misc/npkit.cc
2756- src/misc/shmutils.cc
2757- src/misc/signals.cc # RCCL
2758- src/misc/socket.cc
2759- src/misc/param.cc
2760- src/misc/rocmwrap.cc
2761- src/misc/strongstream.cc
2762- src/transport/coll_net.cc
2763- src/transport/net.cc
2764- src/transport/net_ib.cc
2765- src/transport/net_socket.cc
2766- src/transport/p2p.cc
2767- src/transport/shm.cc
2768- src/transport.cc
2769- src/debug.cc
2770- src/group.cc
2771- src/bootstrap.cc
2772- src/proxy.cc
2773- src/net.cc
2774- src/enqueue.cc
2775- ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
2776-
2777-foreach(filename ${CC_SOURCES})
2778- list(APPEND CPP_SOURCES ${filename})
2779-endforeach(filename)
2780-
2781-add_library(rccl ${CPP_SOURCES})
2782-
2783-# Create a custom target that creates/updates git_version.cpp
2784-# that executes whenever rccl is built
2785+if (ENABLE_MSCCLPP)
2786+ set(MSCCLPP_SOURCES
2787+ src/include/mscclpp/mscclpp_nccl.h
2788+ src/misc/mscclpp/mscclpp_nccl.cc
2789+ )
2790+ list(APPEND SRC_FILES ${MSCCLPP_SOURCES})
2791+endif()
2792+
2793+# Hipify source files (copy of source generated into hipify directory)
2794+#==================================================================================================
2795+find_program(hipify-perl_executable hipify-perl)
2796+if(NOT hipify-perl_executable)
2797+ message(FATAL_ERROR "hipify-perl not found")
2798+endif()
2799+set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
2800+
2801+## Loop over each source file: validate it, derive its hipified output path, and register the hipify command
2802+foreach(SRC_FILE ${SRC_FILES})
2803+ # Check that file exists
2804+ if (NOT EXISTS ${CMAKE_SOURCE_DIR}/${SRC_FILE})
2805+ message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${CMAKE_SOURCE_DIR}/${SRC_FILE}")
2806+ endif()
2807+
2808+ # Establish hipified copy of the source file (HIP_FILE_DIR is taken from the path before any renaming below)
2809+ set(HIP_FILE "${HIPIFY_DIR}/${SRC_FILE}")
2810+ get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)
2811+
2812+ # Make sure the file name is unique and there is no duplicate
2813+ add_file_unique(HIP_SOURCES ${HIP_FILE})
2814+
2815+ # Rename only the trailing extension (.cuh -> .h, .cu -> .cu.cpp); anchored so a ".cu" inside a directory name is never rewritten
2816+ string(REGEX REPLACE "\\.cuh$" ".h" HIP_FILE "${HIP_FILE}")
2817+ string(REGEX REPLACE "\\.cu$" ".cu.cpp" HIP_FILE "${HIP_FILE}")
2818+ list(APPEND HIP_SOURCES ${HIP_FILE})
2819+
2820+ # Create a custom command to create hipified source code (add_faults.sh is only run when FAULT_INJECTION is set)
2821+ if (FAULT_INJECTION)
2822+ add_custom_command(
2823+ OUTPUT ${HIP_FILE}
2824+ COMMAND mkdir -p ${HIP_FILE_DIR}
2825+ && ${hipify-perl_executable} -quiet-warnings ${CMAKE_SOURCE_DIR}/${SRC_FILE} -o ${HIP_FILE}
2826+ && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_unroll.sh ${HIP_FILE}
2827+ && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_faults.sh ${HIP_FILE}
2828+ MAIN_DEPENDENCY ${SRC_FILE}
2829+ COMMENT "Hipifying ${SRC_FILE} -> ${HIP_FILE}"
2830+ )
2831+ else()
2832+ add_custom_command(
2833+ OUTPUT ${HIP_FILE}
2834+ COMMAND mkdir -p ${HIP_FILE_DIR}
2835+ && ${hipify-perl_executable} -quiet-warnings ${CMAKE_SOURCE_DIR}/${SRC_FILE} -o ${HIP_FILE}
2836+ && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_unroll.sh ${HIP_FILE}
2837+ MAIN_DEPENDENCY ${SRC_FILE}
2838+ COMMENT "Hipifying ${SRC_FILE} -> ${HIP_FILE}"
2839+ )
2840+ endif()
2841+endforeach()
2842+
2843+# Adding custom target to hipify all the source files
2844+# This is required to make sure that all the hipified source files are
2845+# available before compiling the unit tests executable(s)
2846+add_custom_target(hipify_all DEPENDS ${HIP_SOURCES})
2847+
2848+if (BUILD_TESTS)
2849+ if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
2850+ ## Set definition for exposing rccl static function
2851+ add_definitions(-DRCCL_EXPOSE_STATIC)
2852+
2853+ set(HIPIFY_SRC_DIR "${PROJECT_BINARY_DIR}/hipify/src")
2854+ set(REPLACE_SCRIPT "${CMAKE_SOURCE_DIR}/tools/scripts/replace_static.sh")
2855+ message ("Replacing static functions in ${HIPIFY_SRC_DIR} with ${REPLACE_SCRIPT} for unit tests")
2856+ # Create a list of files that need to be modified to remove static
2857+ set(TEST_NONSTATIC_SOURCE_FILES
2858+ ${HIPIFY_SRC_DIR}/misc/alt_rsmi.cc
2859+ ${HIPIFY_SRC_DIR}/register/coll_reg.cc
2860+ ${HIPIFY_SRC_DIR}/transport/shm.cc
2861+ ${HIPIFY_SRC_DIR}/transport/p2p.cc
2862+ )
2863+
2864+ set(EXCLUDE_STATIC_FILE "${CMAKE_SOURCE_DIR}/tools/scripts/exclude_static_list.txt")
2865+ # Read the exclude list file into a CMake variable
2866+ file(READ "${EXCLUDE_STATIC_FILE}" EXCLUDE_STATIC_CONTENTS)
2867+ string(REPLACE "\n" ";" EXCLUDE_STATIC_LINES "${EXCLUDE_STATIC_CONTENTS}")
2868+ # Create a mapping from full/relative filename to exclude list
2869+ unset(EXCLUDE_MAP)
2870+ foreach(line ${EXCLUDE_STATIC_LINES})
2871+ if(line MATCHES "^([a-zA-Z0-9_./-]+):([a-zA-Z0-9_,]*)")
2872+ set(fname "${CMAKE_MATCH_1}")
2873+ set(exlist "${CMAKE_MATCH_2}")
2874+ # Map both the basename and the full/relative path for flexibility
2875+ get_filename_component(basename "${fname}" NAME)
2876+ set(EXCLUDE_MAP_${fname} "${exlist}")
2877+ set(EXCLUDE_MAP_${basename} "${exlist}")
2878+ endif()
2879+ endforeach()
2880+
2881+ # Now, for each file, get the exclude list and pass to the script
2882+ # Create a custom command to backup the original files and remove static
2883+ # Always run replace script on hipified files, but preserve original backups
2884+ foreach(srcfile ${TEST_NONSTATIC_SOURCE_FILES})
2885+ # Try to match using the full/relative path first, then fallback to basename
2886+ set(exclude_list "")
2887+ if(DEFINED EXCLUDE_MAP_${srcfile})
2888+ set(exclude_list "${EXCLUDE_MAP_${srcfile}}")
2889+ else()
2890+ get_filename_component(basename "${srcfile}" NAME)
2891+ if(DEFINED EXCLUDE_MAP_${basename})
2892+ set(exclude_list "${EXCLUDE_MAP_${basename}}")
2893+ endif()
2894+ endif()
2895+ add_custom_command(
2896+ OUTPUT "${srcfile}.staticbak"
2897+ COMMAND bash -c "\
2898+ ${CMAKE_COMMAND} -E echo 'Processing ${srcfile} for static replacement' && \
2899+ if [ ! -f '${srcfile}.staticbak' ]; then \
2900+ ${CMAKE_COMMAND} -E copy '${srcfile}' '${srcfile}.staticbak' && \
2901+ ${CMAKE_COMMAND} -E echo 'Created backup: ${srcfile}.staticbak'; \
2902+ fi && \
2903+ ${CMAKE_COMMAND} -E echo 'Exposing internal functions/variables in ${srcfile}' && \
2904+ ${REPLACE_SCRIPT} ${srcfile} --replace-vars --exclude-list=${exclude_list} 2>&1 \
2905+ "
2906+ DEPENDS "${srcfile}" "${REPLACE_SCRIPT}" hipify_all
2907+ COMMENT "Removing static from ${srcfile} (backup preserved at ${srcfile}.staticbak)"
2908+ VERBATIM
2909+ )
2910+ list(APPEND STATIC_BAK_FILES "${srcfile}.staticbak")
2911+ endforeach()
2912+ add_custom_target(replace_static_in_hipify ALL DEPENDS ${STATIC_BAK_FILES})
2913+ add_dependencies(replace_static_in_hipify hipify_all)
2914+ else()
2915+ # Restore originals in the build directory if backup exists (done at configure time: copy the .staticbak over the file, then delete the backup)
2916+ foreach(srcfile ${HIPIFY_SRC_FILES}) # NOTE(review): HIPIFY_SRC_FILES is not assigned in this section (the branch above uses TEST_NONSTATIC_SOURCE_FILES) - confirm it is set, otherwise this loop does nothing
2917+ if(EXISTS "${srcfile}.staticbak")
2918+ execute_process(
2919+ COMMAND ${CMAKE_COMMAND} -E copy "${srcfile}.staticbak" "${srcfile}"
2920+ )
2921+ execute_process(
2922+ COMMAND ${CMAKE_COMMAND} -E remove "${srcfile}.staticbak"
2923+ )
2924+ endif()
2925+ endforeach()
2926+ endif()
2927+endif()
2928+
2929+# Generate device/host tables and all the collective functions that are going to be in librccl.so
2930+#==================================================================================================
2931+find_package(Python3 COMPONENTS Interpreter REQUIRED)
2932+if (NOT Python3_FOUND) # defensive fallback: REQUIRED above already stops configuration when Python3 is missing
2933+ message(FATAL_ERROR "RCCL requires Python3 for generating host/device tables")
2934+endif()
2935+
2936+set(GEN_DIR "${HIPIFY_DIR}/gensrc")
2937+set(GEN_SYM_DIR "${GEN_DIR}/symmetric")
2938+
2939+if(ONLY_FUNCS)
2940+ message(WARNING "Using ONLY_FUNCS = ${ONLY_FUNCS}. Not meant for release builds.")
2941+endif()
2942+
2943+# Execute the python script (at configure time) to generate required collective functions into GEN_DIR
2944+execute_process(
2945+ COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/generate.py ${GEN_DIR} ${IFC_ENABLED} ${COLLTRACE} ${ENABLE_MSCCL_KERNEL} ${BUILD_LOCAL_GPU_TARGET_ONLY} ${ONLY_FUNCS} # positional and unquoted: an empty variable is dropped and shifts the arguments after it
2946+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
2947+ RESULT_VARIABLE gen_py_result
2948+ ERROR_VARIABLE gen_py_error
2949+)
2950+if (gen_py_result) # non-zero exit status (or an error string) from the script
2951+ message(SEND_ERROR "Error: ${gen_py_error}")
2952+ message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed")
2953+endif()
2954+
2955+if (GENERATE_SYM_KERNELS)
2956+ # Execute the python script to generate required symmetric memory kernels
2957+ execute_process(
2958+ COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
2959+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
2960+ RESULT_VARIABLE gen_sym_py_result
2961+ ERROR_VARIABLE gen_sym_py_error
2962+ )
2963+ if (gen_sym_py_result)
2964+ message(SEND_ERROR "Error: ${gen_sym_py_error}")
2965+ message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
2966+ endif()
2967+endif()
2968+
2969+# Find the generated files in the output directory
2970+file(GLOB_RECURSE GENERATED_FILES "${GEN_DIR}/*")
2971+
2972+# Append all found generated files to the list
2973+foreach(file ${GENERATED_FILES})
2974+ list(APPEND HIP_SOURCES ${file})
2975+endforeach()
2976+
2977+# Create an initial git_version.cpp file (that will be updated with latest git version)
2978+#==================================================================================================
2979+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")
2980+list(APPEND HIP_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
2981+
2982+# Create a custom target that updates git_version.cpp and executes whenever rccl is built
2983 add_custom_target(git_version_check
2984 COMMENT "Updating git_version.cpp if necessary"
2985- COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/git_version.cmake
2986+ COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/git_version.cmake
2987 VERBATIM
2988 )
2989
2990-# Create a dummy git_version.cpp file in case it doesn't exist
2991-configure_file(src/nccl.h.in ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
2992
2993-# Execute git_version_check whenever rccl library is built
2994-add_dependencies(rccl git_version_check)
2995+# Set up RCCL library
2996+#==================================================================================================
2997+## Set RCCL source files
2998+add_library(rccl ${HIP_SOURCES})
2999
3000-if(TRACE)
3001- add_definitions(-DENABLE_TRACE)
3002-endif()
3003+## Set RCCL dependencies
3004+add_dependencies(rccl git_version_check) # Execute git_version_check during build
3005
3006-if(PROFILE)
3007- add_definitions(-DENABLE_PROFILING)
3008+if (BUILD_TESTS AND ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
3009+ ## Set static replacement dependency for fixture unit tests
3010+ add_dependencies(rccl replace_static_in_hipify)
3011 endif()
3012
3013-if(NPKIT_FLAGS)
3014- add_definitions(${NPKIT_FLAGS})
3015+## Set RCCL include directories (all PRIVATE: used to build rccl only, not propagated to consumers)
3016+target_include_directories(rccl PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
3017+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src) # for hipified headers
3018+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
3019+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
3020+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
3021+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
3022+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
3023+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
3024+target_include_directories(rccl PRIVATE ${HSA_INCLUDE_PATH})
3025+target_include_directories(rccl PRIVATE ${ROCM_SMI_INCLUDE_DIR})
3026+if(DEMANGLE_DIR) # only added when DEMANGLE_DIR is set
3027+ target_include_directories(rccl PRIVATE ${DEMANGLE_DIR})
3028 endif()
3029
3030-set(COLLTRACE 1 CACHE BOOL "Collective Trace Option")
3031+## Set RCCL compile definitions
3032 if(COLLTRACE)
3033- add_definitions(-DENABLE_COLLTRACE)
3034+ target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
3035+endif()
3036+if(ENABLE_MSCCL_KERNEL)
3037+ target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
3038+endif()
3039+if(ENABLE_MSCCLPP)
3040+ target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
3041+endif()
3042+if(HAVE_ROCM_SMI64CONFIG)
3043+ target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
3044+endif()
3045+if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
3046+ target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
3047 endif()
3048
3049-enable_language(C)
3050-CHECK_INCLUDE_FILES(bfd.h HAVE_BFD)
3051-if (HAVE_BFD)
3052- add_definitions(-DHAVE_BFD)
3053- message ("-- Found BFD")
3054- CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
3055+# NPKit flags
3056+## May be better to move these to a separate file
3057+if(ENABLE_NPKIT)
3058+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT)
3059+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
3060+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
3061+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
3062+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
3063+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
3064+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT)
3065+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
3066+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT)
3067+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
3068+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
3069+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY)
3070+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT)
3071+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY)
3072+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT)
3073+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY)
3074+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT)
3075+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
3076+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
3077+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY)
3078+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT)
3079+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY)
3080+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT)
3081+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_ENTRY)
3082+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_EXIT)
3083+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
3084+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
3085+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
3086+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
3087+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
3088+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
3089+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
3090+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
3091+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_ENTRY)
3092+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_EXIT)
3093+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
3094+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
3095+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY)
3096+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT)
3097+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
3098+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
3099+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
3100+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
3101+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY)
3102+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
3103+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY)
3104+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT)
3105+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY)
3106+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT)
3107+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_NET_SEND_ENTRY)
3108+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
3109+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_NET_TEST_ENTRY)
3110+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_NET_TEST_EXIT)
3111+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_NET_RECV_ENTRY)
3112+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
3113+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
3114+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
3115+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY)
3116+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
3117+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
3118+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
3119+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY)
3120+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
3121+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
3122+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
3123+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY)
3124+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT)
3125+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY)
3126+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT)
3127+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY)
3128+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
3129+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY)
3130+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT)
3131+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY)
3132+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT)
3133+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
3134+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
3135+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
3136+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
3137+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
3138+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
3139+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY)
3140+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT)
3141+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY)
3142+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT)
3143+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY)
3144+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT)
3145+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY)
3146+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
3147+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
3148+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
3149+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY)
3150+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT)
3151+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY)
3152+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT)
3153+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY)
3154+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT)
3155+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
3156+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
3157+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY)
3158+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT)
3159+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY)
3160+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT)
3161+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY)
3162+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT)
3163+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY)
3164+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT)
3165+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY)
3166+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT)
3167+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY)
3168+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT)
3169+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY)
3170+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT)
3171+ target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
3172+endif()
3173+
3174+if(PROFILE)
3175+ target_compile_definitions(rccl PRIVATE ENABLE_PROFILING)
3176+endif()
3177+if(ROCTX_ENABLE)
3178+ target_compile_definitions(rccl PRIVATE ROCTX_ENABLE)
3179+else()
3180+ target_compile_definitions(rccl PRIVATE NVTX_NO_IMPL)
3181+ target_compile_definitions(rccl PRIVATE NVTX_DISABLE)
3182+endif()
3183+if(TRACE)
3184+ target_compile_definitions(rccl PRIVATE ENABLE_TRACE)
3185+endif()
3186+if(HIP_CONTIGUOUS_MEMORY) # test the variable by name (as the sibling option checks do) instead of expanding it before if() evaluates it
3187+ target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
3188+ message(STATUS "HIP_CONTIGUOUS_MEMORY enabled")
3189+else()
3190+ message(STATUS "HIP_CONTIGUOUS_MEMORY disabled")
3191+endif()
3192+if("${hip_version_string}" VERSION_GREATER_EQUAL "5.7.31920")
3193+ target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
3194+ message(STATUS "HIP_UNCACHED_MEMORY enabled")
3195+else()
3196+ message(STATUS "HIP_UNCACHED_MEMORY disabled - requires HIP version >= 5.7.31920")
3197+ # keep --hipcc-func-supp on older HIP and compiler
3198+ if(NOT IFC_ENABLED)
3199+ target_compile_options(rccl PRIVATE --hipcc-func-supp)
3200+ message(STATUS "--hipcc-func-supp enabled")
3201+ else()
3202+ message(STATUS "--hipcc-func-supp disabled")
3203+ endif()
3204+endif()
3205+if (HIP_HOST_UNCACHED_MEMORY)
3206+ target_compile_definitions(rccl PRIVATE HIP_HOST_UNCACHED_MEMORY)
3207+ message(STATUS "HIP_HOST_UNCACHED_MEMORY enabled")
3208+else()
3209+ message(STATUS "HIP_HOST_UNCACHED_MEMORY disabled")
3210+endif()
3211+if (BUILD_BFD)
3212+ if (HAVE_BFD)
3213+ target_compile_definitions(rccl PRIVATE HAVE_BFD)
3214+ endif()
3215 if (HAVE_DECL_BFD_GET_SECTION_FLAGS)
3216- add_definitions(-DHAVE_DECL_BFD_GET_SECTION_FLAGS)
3217+ target_compile_definitions(rccl PRIVATE HAVE_DECL_BFD_GET_SECTION_FLAGS)
3218 endif()
3219- CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
3220 if (HAVE_DECL_BFD_GET_SECTION_VMA)
3221- add_definitions(-DHAVE_DECL_BFD_GET_SECTION_VMA)
3222+ target_compile_definitions(rccl PRIVATE HAVE_DECL_BFD_GET_SECTION_VMA)
3223 endif()
3224- CHECK_CXX_SOURCE_COMPILES(
3225- "#include <bfd.h>
3226-
3227- int main (int argc, char **argv) {
3228- bfd_size_type size;
3229- bfd abfd;
3230- asection sec;
3231- size = bfd_section_size(&abfd, &sec);
3232- return (int)(size);
3233- }"
3234- HAVE_TWO_ARG_BFD_SECTION_SIZE)
3235 if (HAVE_TWO_ARG_BFD_SECTION_SIZE)
3236- add_definitions(-DHAVE_TWO_ARG_BFD_SECTION_SIZE)
3237- endif()
3238- find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
3239- if(NOT DEMANGLE_HEADER)
3240- message("Could not find demangle.h ${DEMANGLE_HEADER}")
3241- else()
3242- add_definitions(-DHAVE_CPLUS_DEMANGLE)
3243- message("Found demangle.h in ${DEMANGLE_HEADER}")
3244- set (HAVE_CPLUS_DEMANGLE 1)
3245- set (HAVE_DECL_BASENAME "1")
3246- INCLUDE_DIRECTORIES(${DEMANGLE_HEADER})
3247+ target_compile_definitions(rccl PRIVATE HAVE_TWO_ARG_BFD_SECTION_SIZE)
3248 endif()
3249 endif()
3250+if (IFC_ENABLED)
3251+ target_compile_definitions(rccl PRIVATE USE_INDIRECT_FUNCTION_CALL)
3252+endif()
3253+if(DEMANGLE_DIR)
3254+ target_compile_definitions(rccl PRIVATE "HAVE_CPLUS_DEMANGLE=1")
3255+ target_compile_definitions(rccl PRIVATE "HAVE_DECL_BASENAME=1")
3256+endif()
3257+if(LL128_ENABLED)
3258+ target_compile_definitions(rccl PRIVATE ENABLE_LL128)
3259+endif()
3260
3261-find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
3262-if (rocm_smi_FOUND)
3263- message ("-- Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
3264- CHECK_INCLUDE_FILE_CXX("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
3265+## Set RCCL compile options
3266+if (HAVE_PARALLEL_JOBS)
3267+ target_compile_options(rccl PRIVATE -parallel-jobs=12)
3268+endif()
3269+
3270+if (ROCM_VERSION VERSION_GREATER_EQUAL "60200")
3271+ target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time.
3272+ target_link_libraries(rccl PRIVATE --offload-compress) # Compress GPU code at link time.
3273+ message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0")
3274 else()
3275- message ("-- Checking old include directory structure for rocm_smi")
3276- set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
3277- set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
3278- set(ROCM_SMI_LIBRARIES rocm_smi64)
3279- CHECK_INCLUDE_FILE_CXX("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
3280+ message(STATUS "--offload-compress disabled - ROCm version < 6.2.0")
3281 endif()
3282-IF(HAVE_ROCM_SMI64CONFIG)
3283- add_definitions(-DUSE_ROCM_SMI64CONFIG)
3284-ENDIF()
3285
3286-foreach(target ${AMDGPU_TARGETS})
3287- target_link_libraries(rccl PRIVATE --amdgpu-target=${target})
3288-endforeach()
3289+target_compile_options(rccl PRIVATE -Werror=uninitialized)
3290+target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized)
3291+target_compile_options(rccl PRIVATE -Wall)
3292+target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy)
3293+target_compile_options(rccl PRIVATE -Wno-format-nonliteral)
3294+target_compile_options(rccl PRIVATE -Wno-unused-function)
3295+target_compile_options(rccl PRIVATE -fgpu-rdc)
3296
3297-if("${HIP_COMPILER}" MATCHES "clang")
3298- target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp)
3299- foreach(target ${AMDGPU_TARGETS})
3300- target_compile_options(rccl PRIVATE -fgpu-rdc)
3301- endforeach()
3302- target_link_libraries(rccl PRIVATE -fgpu-rdc)
3303- target_include_directories(rccl PRIVATE ${ROCM_PATH}/hsa/include)
3304- find_program( hipcc_executable hipcc )
3305- execute_process(COMMAND bash "-c" "${hipcc_executable} -help | grep 'parallel-jobs'" OUTPUT_VARIABLE hipcc_parallel_jobs)
3306- if("${hipcc_parallel_jobs}" MATCHES "parallel-jobs")
3307- target_compile_options(rccl PRIVATE -parallel-jobs=8 PRIVATE -Wno-format-nonliteral)
3308- target_link_libraries(rccl PRIVATE -parallel-jobs=8)
3309+## Set RCCL compile and linker options for unit tests and code coverage
3310+if(ENABLE_CODE_COVERAGE)
3311+ if(NOT CMAKE_BUILD_TYPE MATCHES "Debug")
3312+ message(FATAL_ERROR "Code coverage is enabled, but the build type is '${CMAKE_BUILD_TYPE}'. "
3313+ "Code coverage requires 'Debug' build types to expose internal symbols. "
3314+ "Please set CMAKE_BUILD_TYPE to 'Debug' and reconfigure.")
3315 endif()
3316
3317- # RCCL static lib uses -fgpu-rdc which requires hipcc as the linker and archiver
3318- if(BUILD_STATIC)
3319- target_link_libraries(rccl PRIVATE --emit-static-lib)
3320- set(CMAKE_AR "${hipcc_executable}")
3321- get_property(link_libraries TARGET rccl PROPERTY LINK_LIBRARIES)
3322- string (REPLACE ";" " " LINK_PROPS "${link_libraries}")
3323- set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> -o <TARGET> ${LINK_PROPS} <LINK_FLAGS> <OBJECTS>")
3324+ message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.")
3325+
3326+ target_compile_options(rccl PRIVATE
3327+ -fvisibility=default -Xarch_host -fprofile-instr-generate
3328+ -Xarch_host -fcoverage-mapping)
3329+
3330+ set(COVERAGE_SHARED_LINKER_FLAGS
3331+ -fprofile-generate
3332+ -Wl,--enable-new-dtags,--build-id=sha1,--rpath,$ORIGIN
3333+ )
3334+
3335+ set(COVERAGE_EXE_LINKER_FLAGS
3336+ -fprofile-generate
3337+ -Wl,--enable-new-dtags,--build-id=sha1,--rpath,$ORIGIN/../lib
3338+ )
3339+
3340+ target_link_options(rccl PRIVATE ${COVERAGE_SHARED_LINKER_FLAGS})
3341+ target_link_options(rccl PRIVATE ${COVERAGE_EXE_LINKER_FLAGS})
3342+elseif(BUILD_TESTS) # Enable default/hidden visibility based on build type and ROCM_VERSION
3343+ if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
3344+ target_compile_options(rccl PRIVATE -fvisibility=default)
3345+ else()
3346+ target_compile_options(rccl PRIVATE -fvisibility=hidden)
3347 endif()
3348+else() # Enable hidden visibility for library without tests/code coverage enabled
3349+ target_compile_options(rccl PRIVATE -fvisibility=hidden)
3350 endif()
3351
3352-if("${HIP_COMPILER}" MATCHES "hcc")
3353- find_program( hcc_executable hcc )
3354- execute_process(COMMAND bash "-c" "${hcc_executable} --version | sed -e '1!d' -e 's/.*based on HCC\\s*//'" OUTPUT_VARIABLE hcc_version_string)
3355- execute_process(COMMAND bash "-c" "echo \"${hcc_version_string}\" | awk -F\".\" '{ printf $1}'" OUTPUT_VARIABLE hcc_major_version)
3356- execute_process(COMMAND bash "-c" "echo \"${hcc_version_string}\" | awk -F\".\" '{ printf $2}'" OUTPUT_VARIABLE hcc_minor_version)
3357- if ("${hcc_major_version}.${hcc_minor_version}" VERSION_LESS "4.0")
3358- target_link_libraries(rccl PRIVATE -hc-function-calls)
3359- endif()
3360+if (HAVE_KERNARG_PRELOAD)
3361+ target_compile_options(rccl PRIVATE -mllvm --amdgpu-kernarg-preload-count=16)
3362 endif()
3363
3364-target_include_directories(rccl PRIVATE ${ROCM_SMI_INCLUDE_DIR})
3365-target_link_libraries(rccl PRIVATE hip::device dl -l${ROCM_SMI_LIBRARIES} -L${ROCM_SMI_LIB_DIR})
3366-target_link_libraries(rccl INTERFACE hip::host)
3367+if (DUMP_ASM) # Save temporary files from kernel compilation
3368+ message(STATUS "Disassembling librccl.so to asm")
3369+ # Maintain symbols but without changing code. Keep additional data in dwarf section of binary.
3370+ target_compile_options(rccl PRIVATE -gline-tables-only)
3371+ set(OBJ_DUMP ${ROCM_PATH}/llvm/bin/llvm-objdump)
3372+
3373+ add_custom_command(TARGET rccl POST_BUILD
3374+ COMMENT "Disassembling RCCL library"
3375+ COMMAND /bin/bash -c "${OBJ_DUMP} --offload-fatbin librccl.so"
3376+ VERBATIM
3377+ )
3378+ foreach(GPUARCH ${GPU_TARGETS})
3379+ add_custom_command(TARGET rccl POST_BUILD
3380+ COMMENT "Disassembling RCCL library to dump assembly for ${GPUARCH}"
3381+ COMMAND /bin/bash -c "${OBJ_DUMP} -d -l --source --symbolize-operands librccl.so.0.hipv4-amdgcn-amd-amdhsa--${GPUARCH} > librccl.${GPUARCH}.s"
3382+ VERBATIM
3383+ )
3384+ endforeach()
3385+endif()
3386+
3387+## NOTE: This is currently being handled by rocm-cmake, however may need to be re-enabled in the future
3388+#foreach(target ${GPU_TARGETS})
3389+# target_compile_options(rccl PRIVATE --offload-arch=${target})
3390+#endforeach()
3391+
3392+if(BUILD_ADDRESS_SANITIZER)
3393+ target_compile_options(rccl PRIVATE -fsanitize=address -shared-libasan)
3394+endif()
3395+if(TIMETRACE)
3396+ target_compile_options(rccl PRIVATE -ftime-trace)
3397+endif()
3398+if (FAULT_INJECTION)
3399+ target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
3400+ message(STATUS "Fault injection enabled")
3401+endif()
3402+
3403+## Set RCCL linked library directories
3404+target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
3405
3406-if(HAVE_BFD)
3407- target_link_libraries(rccl PRIVATE bfd dl z)
3408- find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/
3409- PATH_SUFFIXES x86_64-linux-gnu)
3410+if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
3411+ option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
3412+else()
3413+ if(RCCL_ROCPROFILER_REGISTER)
3414+ message(AUTHOR_WARNING "RCCL_ROCPROFILER_REGISTER is not valid option for ROCm < 6.2. Current ROCm version: ${ROCM_VERSION}")
3415+ endif()
3416+ set(RCCL_ROCPROFILER_REGISTER OFF CACHE BOOL "" FORCE)
3417+endif()
3418+if(RCCL_ROCPROFILER_REGISTER)
3419+ find_package(rocprofiler-register REQUIRED)
3420+ target_compile_definitions(rccl PRIVATE RCCL_ROCPROFILER_REGISTER=1)
3421+ target_link_libraries(
3422+ rccl PRIVATE rocprofiler-register::rocprofiler-register)
3423+endif()
3424+
3425+## Set RCCL linked libraries
3426+if (HAVE_BFD)
3427+ target_link_libraries(rccl PRIVATE bfd)
3428 if(HAVE_IBERTY)
3429- message("iberty found @ ${HAVE_IBERTY} ")
3430- target_link_libraries(rccl PRIVATE iberty dl z)
3431+ target_link_libraries(rccl PRIVATE iberty z)
3432+ endif()
3433+endif()
3434+if (ROCTX_ENABLE)
3435+ target_link_libraries(rccl PRIVATE -lroctx64)
3436+endif()
3437+target_link_libraries(rccl PRIVATE -fgpu-rdc) # Required when linking relocatable device code
3438+target_link_libraries(rccl PRIVATE Threads::Threads)
3439+target_link_libraries(rccl INTERFACE hip::host)
3440+target_link_libraries(rccl PRIVATE hip::device)
3441+target_link_libraries(rccl PRIVATE dl)
3442+target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
3443+target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
3444+if(ENABLE_MSCCLPP)
3445+ target_link_libraries(rccl PRIVATE mscclpp_nccl)
3446+endif()
3447+
3448+## Set RCCL link options
3449+## Find out available memory
3450+execute_process(
3451+ COMMAND bash "-c" "cat /sys/fs/cgroup/memory.max"
3452+ OUTPUT_VARIABLE memory_max_string)
3453+if (${memory_max_string} MATCHES "^[0-9]+")
3454+ math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024 * 1024)")
3455+else()
3456+ execute_process(
3457+ COMMAND bash "-c" "free | grep -o '[[:digit:]]*' | head -1"
3458+ OUTPUT_VARIABLE memory_max_string)
3459+ ## memory_max_string holds the total memory in KB (the first number printed by `free`, i.e. the "Mem:" total column)
3460+ if (${memory_max_string} MATCHES "^[0-9]+")
3461+ math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024)") ## KB to GB conversion
3462+ else()
3463+ cmake_host_system_information(RESULT memory_max_string QUERY AVAILABLE_PHYSICAL_MEMORY )
3464+ math(EXPR memory_in_gb "${memory_max_string} / 1024")
3465 endif()
3466 endif()
3467+## Reserve 16GB for each linker job. Limit max number of linker jobs to 16
3468+if (HAVE_PARALLEL_JOBS)
3469+ math(EXPR num_linker_jobs "(${memory_in_gb} + 15) / 16")
3470+ if (${num_linker_jobs} GREATER_EQUAL "16")
3471+ set(num_linker_jobs "16")
3472+ endif()
3473+ message(STATUS "Use ${num_linker_jobs} jobs for linking")
3474+ target_link_options(rccl PRIVATE -parallel-jobs=${num_linker_jobs}) # Use multiple threads to link
3475+endif()
3476+if(BUILD_ADDRESS_SANITIZER)
3477+ target_link_options(rccl PRIVATE -fuse-ld=lld)
3478+endif()
3479+if(TIMETRACE)
3480+ target_link_options(rccl PRIVATE -ftime-trace)
3481+endif()
3482+
3483+if(NOT BUILD_SHARED_LIBS)
3484+ message(STATUS "Building static RCCL library")
3485+else()
3486+ message(STATUS "Building shared RCCL library")
3487+endif()
3488+if (HAVE_KERNARG_PRELOAD)
3489+ target_link_options(rccl PRIVATE "SHELL:-Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16")
3490+endif()
3491+
3492+if(ENABLE_MSCCLPP)
3493+ include(cmake/MSCCLPP.cmake)
3494+endif()
3495
3496-#Setup librccl.so version
3497+## Track linking time
3498+set_property(TARGET rccl PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")
3499+
3500+## Setup librccl.so version
3501 rocm_set_soversion(rccl "1.0")
3502
3503-rocm_install_targets(TARGETS
3504- rccl
3505- )
3506-rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/nccl_net.h
3507- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
3508-
3509-rocm_export_targets(NAMESPACE
3510- roc::
3511- TARGETS
3512- rccl
3513- DEPENDS
3514- hip)
3515-if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
3516- #Create wrapper files
3517- rocm_wrap_header_dir( "${PROJECT_BINARY_DIR}/include/rccl"
3518- PATTERNS "rccl.h"
3519- GUARDS SYMLINK WRAPPER
3520- WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR} rccl/${CMAKE_INSTALL_INCLUDEDIR})
3521- #install the wrapper header file to package
3522- rocm_install( FILES ${PROJECT_BINARY_DIR}/rccl/include/rccl.h src/include/nccl_net.h
3523- DESTINATION "./rccl/${CMAKE_INSTALL_INCLUDEDIR}/" )
3524- rocm_install( FILES ${PROJECT_BINARY_DIR}/include/rccl.h src/include/nccl_net.h
3525- DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/" )
3526-endif()
3527-
3528-rocm_package_add_dependencies(DEPENDS "hip-rocclr >= 3.5.0" "rocm-smi-lib >= 4.0.0")
3529+if(NOT BUILD_SHARED_LIBS)
3530+ # To create a static lib with `-fgpu-rdc`, you need `--emit-static-lib` and `--hip-link`.
3531+ # You also need to invoke amdclang++ again to trigger GPU code generation.
3532+ set(static_link_flags
3533+ ${CXXFLAGS}
3534+ --hip-link
3535+ -fgpu-rdc
3536+ --emit-static-lib
3537+ )
3538+
3539+ # Find all the libraries we need to link at link time to include them in the clang link
3540+ # command line.
3541+ get_target_property(rccl_libs rccl LINK_LIBRARIES)
3542+ foreach(target ${rccl_libs})
3543+ if(TARGET ${target})
3544+ get_target_property(location ${target} LOCATION)
3545+ if(location)
3546+ LIST(APPEND static_link_flags -l${location})
3547+ endif()
3548+ endif()
3549+ endforeach()
3550+
3551+ foreach(target ${GPU_TARGETS})
3552+ list(APPEND static_link_flags --offload-arch=${target})
3553+ endforeach()
3554+ list(JOIN static_link_flags " " flags_str)
3555+
3556+ # Invoking amdclang++ this way will produce a static archive, so just override ARCHIVE_CREATE.
3557+ set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_CXX_COMPILER> ${flags_str} -o <TARGET> <OBJECTS>")
3558+endif()
3559+
3560+# Install settings
3561+#==================================================================================================
3562+## Specify install targets
3563+rocm_install_targets(TARGETS rccl)
3564+rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/plugin/nccl_net.h
3565+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
3566+rocm_install(FILES src/include/api_trace.h
3567+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)
3568+file(COPY tools/msccl-algorithms DESTINATION ${PROJECT_BINARY_DIR})
3569+file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
3570+## Install Algorithm files under share folder
3571+rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
3572+rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
3573+
3574+rocm_export_targets(
3575+ NAMESPACE roc::
3576+ TARGETS rccl
3577+ DEPENDS hip)
3578+
3579+## Set package dependencies
3580+if(BUILD_ADDRESS_SANITIZER)
3581+ set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
3582+else()
3583+ set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
3584+endif()
3585+rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0")
3586+set(CPACK_DEB_COMPONENT_INSTALL ON)
3587 set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
3588+set(CPACK_RPM_COMPONENT_INSTALL ON)
3589 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "${ROCM_PATH}")
3590
3591 find_file (DEBIAN debian_version debconf.conf PATHS /etc)
3592@@ -385,13 +1383,14 @@ if(DEBIAN)
3593 file(WRITE "${CMAKE_BINARY_DIR}/copyright"
3594 "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
3595 Upstream-Name: rccl
3596-Source: https://github.com/ROCmSoftwarePlatform/rccl
3597+Source: https://github.com/ROCm/rccl
3598
3599 Files: *
3600 Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
3601-Modifications Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
3602+Modifications Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
3603+Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
3604 License: See LICENSE.txt for license information\n")
3605- install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
3606+ rocm_install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
3607 # Write changelog file
3608 find_program( date_executable date )
3609 execute_process(COMMAND ${date_executable} -R OUTPUT_VARIABLE TIMESTAMP)
3610@@ -402,24 +1401,32 @@ License: See LICENSE.txt for license information\n")
3611
3612 -- RCCL Maintainer <rccl-maintainer@amd.com> ${TIMESTAMP}\n")
3613 find_program( gzip_executable gzip )
3614- execute_process(COMMAND bash "-c" "${gzip_executable} -9 -c ${CMAKE_BINARY_DIR}/changelog"
3615+ execute_process(COMMAND bash "-c" "${gzip_executable} -9 -c -n ${CMAKE_BINARY_DIR}/changelog"
3616 WORKING_DIRECTORY ${CMAKE_BINARY_DIR} OUTPUT_FILE "${CMAKE_BINARY_DIR}/changelog.Debian.gz")
3617- install(FILES "${CMAKE_BINARY_DIR}/changelog.Debian.gz" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
3618+ rocm_install(FILES "${CMAKE_BINARY_DIR}/changelog.Debian.gz" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
3619 set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "ROCm Communication Collectives Library
3620 Optimized primitives for collective multi-GPU communication")
3621 endif()
3622
3623+## Building RCCL RAS
3624+include(cmake/rcclRAS.cmake)
3625+
3626 if(BUILD_TESTS)
3627 rocm_package_setup_component(clients)
3628- rocm_package_setup_client_component(tests)
3629+ rocm_package_setup_client_component(tests PACKAGE_NAME unittests)
3630 add_subdirectory(test)
3631+
3632+ if(BUILD_SHARED_LIBS)
3633+ add_custom_command(TARGET rccl POST_BUILD
3634+ COMMENT "Extracting metadata from librccl.so"
3635+ COMMAND COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/extract_metadata.cmake
3636+ VERBATIM
3637+ )
3638+ endif()
3639 endif()
3640
3641 rocm_create_package(
3642- NAME
3643- rccl
3644- DESCRIPTION
3645- "ROCm Communication Collectives Library"
3646- MAINTAINER
3647- "RCCL Maintainer <rccl-maintainer@amd.com>"
3648+ NAME rccl
3649+ DESCRIPTION "ROCm Communication Collectives Library"
3650+ MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
3651 LDCONFIG)
3652diff --git a/LICENSE.txt b/LICENSE.txt
3653index 11b343f..da5cc8f 100644
3654--- a/LICENSE.txt
3655+++ b/LICENSE.txt
3656@@ -4,41 +4,43 @@ Attributions
3657 Contains contributions from NVIDIA.
3658
3659 Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3660-Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
3661-
3662- Redistribution and use in source and binary forms, with or without
3663- modification, are permitted provided that the following conditions
3664- are met:
3665- * Redistributions of source code must retain the above copyright
3666- notice, this list of conditions and the following disclaimer.
3667- * Redistributions in binary form must reproduce the above copyright
3668- notice, this list of conditions and the following disclaimer in the
3669- documentation and/or other materials provided with the distribution.
3670- * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
3671- Laboratory, the U.S. Department of Energy, nor the names of their
3672- contributors may be used to endorse or promote products derived
3673- from this software without specific prior written permission.
3674-
3675- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
3676- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3677- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3678- PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
3679- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3680- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3681- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3682- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3683- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3684- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
3685- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3686-
3687- The U.S. Department of Energy funded the development of this software
3688- under subcontract 7078610 with Lawrence Berkeley National Laboratory.
3689+Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
3690+Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
3691+
3692+Redistribution and use in source and binary forms, with or without
3693+modification, are permitted provided that the following conditions
3694+are met:
3695+
3696+* Redistributions of source code must retain the above copyright
3697+ notice, this list of conditions and the following disclaimer.
3698+* Redistributions in binary form must reproduce the above copyright
3699+ notice, this list of conditions and the following disclaimer in the
3700+ documentation and/or other materials provided with the distribution.
3701+* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
3702+ Laboratory, the U.S. Department of Energy, nor the names of their
3703+ contributors may be used to endorse or promote products derived
3704+ from this software without specific prior written permission.
3705+
3706+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
3707+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3708+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3709+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
3710+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3711+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3712+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3713+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3714+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3715+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
3716+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3717+
3718+The U.S. Department of Energy funded the development of this software
3719+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
3720
3721
3722 This code also includes files from the NVIDIA Tools Extension SDK project.
3723
3724 See:
3725
3726- https://github.com/NVIDIA/NVTX
3727+https://github.com/NVIDIA/NVTX
3728
3729 for more information and license details.
3730diff --git a/NOTICES.txt b/NOTICES.txt
3731index 75794b0..372651b 100644
3732--- a/NOTICES.txt
3733+++ b/NOTICES.txt
3734@@ -1,66 +1,128 @@
3735 Notices and Licenses file
3736 _______________________________________________________________
3737
3738-Dependencies on nvidia-nccl v2.3.7-1 (BSD3)
3739-Copyright (c) 2015-2018, NVIDIA CORPORATION.
3740-Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc.
3741-
3742-Redistribution and use in source and binary forms, with or without
3743-modification, are permitted provided that the following conditions
3744-are met:
3745- * Redistributions of source code must retain the above copyright
3746- notice, this list of conditions and the following disclaimer.
3747- * Redistributions in binary form must reproduce the above copyright
3748- notice, this list of conditions and the following disclaimer in the
3749- documentation and/or other materials provided with the distribution.
3750- * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
3751- Laboratory, the U.S. Department of Energy, nor the names of their
3752- contributors may be used to endorse or promote products derived
3753- from this software without specific prior written permission.
3754-
3755-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
3756-EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3757-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3758-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
3759-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3760-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3761-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3762-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3763-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3764-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
3765-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3766-
3767-The U.S. Department of Energy funded the development of this software
3768-under subcontract 7078610 with Lawrence Berkeley National Laboratory.
3769-
3770-
3771-nvidia-nccl v2.3.7-1 (BSD2)
3772-Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
3773-
3774-Redistribution and use in source and binary forms, with or without
3775-modification, are permitted provided that the following conditions
3776-are met:
3777- * Redistributions of source code must retain the above copyright
3778- notice, this list of conditions and the following disclaimer.
3779- * Redistributions in binary form must reproduce the above copyright
3780- notice, this list of conditions and the following disclaimer in the
3781- documentation and/or other materials provided with the distribution.
3782- * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
3783- Laboratory, the U.S. Department of Energy, nor the names of their
3784- contributors may be used to endorse or promote products derived
3785- from this software without specific prior written permission.
3786-
3787-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
3788-EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3789-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3790-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
3791-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3792-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3793-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3794-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3795-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3796-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
3797-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3798-
3799-The U.S. Department of Energy funded the development of this software
3800-under subcontract 7078610 with Lawrence Berkeley National Laboratory.
3801+Dependencies on nvidia-nccl v2.27.3-1 (BSD3)
3802+
3803+Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
3804+Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
3805+Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
3806+
3807+ Redistribution and use in source and binary forms, with or without
3808+ modification, are permitted provided that the following conditions
3809+ are met:
3810+ * Redistributions of source code must retain the above copyright
3811+ notice, this list of conditions and the following disclaimer.
3812+ * Redistributions in binary form must reproduce the above copyright
3813+ notice, this list of conditions and the following disclaimer in the
3814+ documentation and/or other materials provided with the distribution.
3815+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
3816+ Laboratory, the U.S. Department of Energy, nor the names of their
3817+ contributors may be used to endorse or promote products derived
3818+ from this software without specific prior written permission.
3819+
3820+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
3821+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3822+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3823+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
3824+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3825+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3826+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3827+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3828+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3829+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
3830+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3831+
3832+ The U.S. Department of Energy funded the development of this software
3833+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
3834+
3835+
3836+This code also includes files from the NVIDIA Tools Extension SDK project.
3837+
3838+See:
3839+
3840+ https://github.com/NVIDIA/NVTX
3841+
3842+for more information and license details.
3843+
3844+_______________________________________________________________
3845+
3846+Dependencies on NPKit (MIT License)
3847+
3848+ Copyright (c) Microsoft Corporation.
3849+
3850+ Permission is hereby granted, free of charge, to any person obtaining a copy
3851+ of this software and associated documentation files (the "Software"), to deal
3852+ in the Software without restriction, including without limitation the rights
3853+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3854+ copies of the Software, and to permit persons to whom the Software is
3855+ furnished to do so, subject to the following conditions:
3856+
3857+ The above copyright notice and this permission notice shall be included in all
3858+ copies or substantial portions of the Software.
3859+
3860+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3861+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3862+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3863+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3864+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3865+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3866+ SOFTWARE
3867+
3868+_______________________________________________________________
3869+
3870+Dependencies on MSCCL++ (MIT License)
3871+
3872+ Copyright (c) Microsoft Corporation.
3873+
3874+ Permission is hereby granted, free of charge, to any person obtaining a copy
3875+ of this software and associated documentation files (the "Software"), to deal
3876+ in the Software without restriction, including without limitation the rights
3877+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3878+ copies of the Software, and to permit persons to whom the Software is
3879+ furnished to do so, subject to the following conditions:
3880+
3881+ The above copyright notice and this permission notice shall be included in all
3882+ copies or substantial portions of the Software.
3883+
3884+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3885+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3886+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3887+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3888+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3889+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3890+ SOFTWARE.
3891+
3892+See:
3893+
3894+ https://github.com/microsoft/mscclpp
3895+
3896+for more information and license details.
3897+
3898+_______________________________________________________________
3899+
3900+Dependencies on Latency Profiler (MIT License)
3901+
3902+ Copyright (c) Meta Platforms, Inc. and affiliates.
3903+
3904+ Permission is hereby granted, free of charge, to any person obtaining a copy
3905+ of this software and associated documentation files (the "Software"), to deal
3906+ in the Software without restriction, including without limitation the rights
3907+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3908+ copies of the Software, and to permit persons to whom the Software is
3909+ furnished to do so, subject to the following conditions:
3910+
3911+ The above copyright notice and this permission notice shall be included in all
3912+ copies or substantial portions of the Software.
3913+
3914+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3915+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3916+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3917+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3918+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3919+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3920+ SOFTWARE.
3921+
3922+See:
3923+
3924+ src/include/latency_profiler
3925+ src/misc/latency_profiler
3926diff --git a/README.md b/README.md
3927index 8bdce66..d25f661 100644
3928--- a/README.md
3929+++ b/README.md
3930@@ -2,6 +2,11 @@
3931
3932 ROCm Communication Collectives Library
3933
3934+[![RCCL](https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/status%2Frccl?repoName=ROCm%2Frccl&branchName=develop)](https://dev.azure.com/ROCm-CI/ROCm-CI/_build/latest?definitionId=107&repoName=ROCm%2Frccl&branchName=develop)
3935+[![TheRock CI](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml/badge.svg?branch=develop&event=push)](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml)
3936+
3937+> **Note:** The published documentation is available at [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/index.html) in an organized easy-to-read format that includes a table of contents and search functionality. The documentation source files reside in the [rccl/docs](https://github.com/ROCm/rccl/tree/develop/docs) folder in this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
3938+
3939 ## Introduction
3940
3941 RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
3942@@ -11,44 +16,81 @@ The collective operations are implemented using ring and tree algorithms and hav
3943 ## Requirements
3944
3945 1. ROCm supported GPUs
3946-2. ROCm stack installed on the system (HIP runtime & HCC or HIP-Clang)
3947+2. ROCm stack installed on the system (HIP runtime & HIP-Clang)
3948
3949 ## Quickstart RCCL Build
3950
3951-RCCL directly depends on HIP runtime, plus the HCC C++ compiler or the HIP-Clang compiler which are part of the ROCm software stack.
3952-For ROCm installation instructions, see https://github.com/RadeonOpenCompute/ROCm.
3953+RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
3954+For ROCm installation instructions, see https://github.com/ROCm/ROCm.
3955+
3956+The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL.
3957+
3958+### To build the library using the install script:
3959+
3960+```shell
3961+./install.sh
3962+```
3963
3964-The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install.
3965+For more information on the build options/flags available when using the install script, run `./install.sh --help`:
3966+```shell
3967+./install.sh --help
3968+RCCL build & installation helper script
3969+ Options:
3970+ --address-sanitizer Build with address sanitizer enabled
3971+ -c|--enable-code-coverage Enable code coverage
3972+ -d|--dependencies Install RCCL dependencies
3973+ --debug Build debug library
3974+ --enable_backtrace Build with custom backtrace support
3975+ --disable-colltrace Build without collective trace
3976+ --disable-msccl-kernel Build without MSCCL kernels
3977+ --enable-mscclpp Build with MSCCL++ support
3978+ --enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
3979+ --disable-roctx Build without ROCTX logging
3980+ -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
3981+ -h|--help Prints this help message
3982+ -i|--install Install RCCL library (see --prefix argument below)
3983+ -j|--jobs Specify how many parallel compilation jobs to run ($nproc by default)
3984+ -l|--local_gpu_only Only compile for local GPU architecture
3985+ --amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
3986+ --no_clean Don't delete files if they already exist
3987+ --npkit-enable Compile with npkit enabled
3988+ --log-trace Build with log trace enabled (i.e. NCCL_DEBUG=TRACE)
3989+ --openmp-test-enable Enable OpenMP in rccl unit tests
3990+ -p|--package_build Build RCCL package
3991+ --prefix Specify custom directory to install RCCL to (default: `/opt/rocm`)
3992+ --run_tests_all Run all rccl unit tests (must be built already)
3993+ -r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
3994+ --static Build RCCL as a static library instead of shared library
3995+ -t|--tests_build Build rccl unit tests, but do not run
3996+ --time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system)
3997+ --verbose Show compile commands
3998+```
3999
4000-* `./install.sh` -- builds library including unit tests
4001-* `./install.sh -i` -- builds and installs the library to /opt/rocm/rccl; installation path can be changed with --prefix argument (see below.)
4002-* `./install.sh -d` -- installs all necessary dependencies for RCCL. Should be re-invoked if the build folder is removed.
4003-* `./install.sh -h` -- shows help
4004-* `./install.sh -t` -- builds library including unit tests
4005-* `./install.sh -r` -- runs unit tests (must be already built)
4006-* `./install.sh -p` -- builds RCCL package
4007-* `./install.sh -s` -- builds RCCL as a static library (default: shared)
4008-* `./install.sh -hcc` -- builds RCCL with hcc compiler; note that hcc is now deprecated. (default:hip-clang)
4009-* `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm)
4010+By default, RCCL builds for all GPU targets defined in `DEFAULT_GPUS` in `CMakeLists.txt`. To target specific GPU(s), and potentially reduce build time, use `--amdgpu_targets` as a `;` separated string listing GPU(s) to target.
4011
4012 ## Manual build
4013-#### To build the library :
4014+
4015+### To build the library using CMake:
4016
4017 ```shell
4018-$ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
4019+$ git clone --recursive https://github.com/ROCm/rccl.git
4020 $ cd rccl
4021 $ mkdir build
4022 $ cd build
4023-$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ ..
4024-$ make -j
4025+$ cmake ..
4026+$ make -j 16 # Or some other suitable number of parallel jobs
4027 ```
4028-You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
4029+If you have already cloned the repository without `--recursive`, you can check out the external submodules manually:
4030 ```shell
4031-$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
4032+$ git submodule update --init --recursive --depth=1
4033+```
4034+You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example:
4035+```shell
4036+$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release ..
4037 ```
4038 Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
4039
4040-#### To build the RCCL package and install package :
4041+### To build the RCCL package and install package :
4042
4043 Assuming you have already cloned this repository and built the library as shown in the previous section:
4044
4045@@ -58,49 +100,48 @@ $ make package
4046 $ sudo dpkg -i *.deb
4047 ```
4048
4049-RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so.
4050+RCCL package install requires sudo/root access because it installs under `/opt/rocm/`. This is an optional step as RCCL can instead be used directly by including the path containing `librccl.so`.
4051+
4052+## Docker build
4053
4054-## Enabling peer-to-peer transport
4055-In order to enable peer-to-peer access on machines with PCIe-connected GPUs, the HSA environment variable HSA_FORCE_FINE_GRAIN_PCIE=1 is required to be set, on top of requiring GPUs that support peer-to-peer access and proper large BAR addressing support.
4056+Refer to [docker/README.md](docker/README.md "docker/README.md")
4057
4058 ## Tests
4059
4060-There are unit tests implemented with the Googletest framework in RCCL. The unit tests require Googletest 1.10 or higher to build and execute properly (installed with the -d option to install.sh).
4061-To invoke the unit tests, go to the build folder, then the test subfolder, and execute the appropriate unit test executable(s).
4062+There are rccl unit tests implemented with the Googletest framework in RCCL. The rccl unit tests require Googletest 1.10 or higher to build and execute properly (installed with the -d option to install.sh).
4063+To invoke the rccl unit tests, go to the build folder, then the test subfolder, and execute the appropriate rccl unit test executable(s).
4064
4065-Unit test names are now of the format:
4066+rccl unit test names are now of the format:
4067
4068 CollectiveCall.[Type of test]
4069
4070-Filtering of unit tests should be done with environment variable and by passing the --gtest_filter command line flag, for example:
4071+Filtering of rccl unit tests should be done with environment variables and by passing the `--gtest_filter` command line flag, for example:
4072
4073 ```shell
4074-UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./UnitTests --gtest_filter="AllReduce.C*"
4075+UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
4076 ```
4077-will run only AllReduce correctness tests with float16 datatype. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://chromium.googlesource.com/external/github.com/google/googletest/+/HEAD/googletest/docs/advanced.md for more information on how to form more advanced filters.
4078
4079+will run only AllReduce correctness tests with the bfloat16 datatype. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests for more information on how to form more advanced filters.
4080
4081-There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests.
4082+There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCm/rccl-tests.
4083 See the rccl-tests README for more information on how to build and run those tests.
4084
4085-## NPKit
4086-
4087-RCCL integrates [NPKit](https://github.com/microsoft/npkit), a profiler framework that enables collecting fine-grained trace events in RCCL components, especially in giant collective GPU kernels.
4088-
4089-Please check [NPKit sample workflow for RCCL](https://github.com/microsoft/NPKit/tree/main/rccl_samples) as a fully automated usage example. It also provides good templates for the following manual instructions.
4090-
4091-To manually build RCCL with NPKit enabled, pass `-DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_...(other NPKit compile-time switches)"` with cmake command. All NPKit compile-time switches are declared in the RCCL code base as macros with prefix `ENABLE_NPKIT_`, and they control which information will be collected. Also note that currently NPKit only supports collecting non-overlapped events on GPU, and `-DNPKIT_FLAGS` should follow this rule.
4092+## Library and API Documentation
4093
4094-To manually run RCCL with NPKit enabled, environment variable `NPKIT_DUMP_DIR` needs to be set as the NPKit event dump directory. Also note that currently NPKit only supports 1 GPU per process.
4095+Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation.
4096
4097-To manually analyze NPKit dump results, please leverage [npkit_trace_generator.py](https://github.com/microsoft/NPKit/blob/main/rccl_samples/npkit_trace_generator.py).
4098+### How to build documentation
4099
4100-## Library and API Documentation
4101+Run the steps below to build documentation locally.
4102
4103-Please refer to the [Library documentation](https://rccl.readthedocs.io/) for current documentation.
4104+```shell
4105+cd docs
4106+pip3 install -r sphinx/requirements.txt
4107+python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
4108+```
4109
4110 ## Copyright
4111
4112-All source code and accompanying documentation is copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
4113+All source code and accompanying documentation is copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
4114
4115-All modifications are copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
4116+All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
4117diff --git a/cmake/CheckSymbolExistsNoWarn.cmake b/cmake/CheckSymbolExistsNoWarn.cmake
4118new file mode 100644
4119index 0000000..b478f59
4120--- /dev/null
4121+++ b/cmake/CheckSymbolExistsNoWarn.cmake
4122@@ -0,0 +1,40 @@
4123+# MIT License
4124+#
4125+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4126+#
4127+# Permission is hereby granted, free of charge, to any person obtaining a copy
4128+# of this software and associated documentation files (the "Software"), to deal
4129+# in the Software without restriction, including without limitation the rights
4130+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4131+# copies of the Software, and to permit persons to whom the Software is
4132+# furnished to do so, subject to the following conditions:
4133+#
4134+# The above copyright notice and this permission notice shall be included in all
4135+# copies or substantial portions of the Software.
4136+#
4137+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4138+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4139+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4140+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4141+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4142+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4143+# SOFTWARE.
4144+
4145+# These overrides exist because CMake's CHECK_SYMBOL_EXISTS modifies CMAKE_CXX_FLAGS to do a test compile,
4146+# while ROCMChecks emits a warning if this variable is modified manually without a target.
4147+
4148+# We choose to disable the ROCMChecks warning for this one case only (see the DISABLE_ROCM_CHECK toggle below).
4149+
4150+set(DISABLE_ROCM_CHECK OFF)  # Toggle read by rocm_check_toolchain_var(); OFF means the check runs
4151+
4152+function(rocm_check_toolchain_var var access value list_file)  # Wrapper that forwards to the underscore-prefixed command unless the toggle is ON
4153+ if(NOT DISABLE_ROCM_CHECK)
4154+ _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")  # NOTE(review): presumably the prior ROCMChecks definition, reachable under the "_" name after redefinition - confirm
4155+ endif()
4156+endfunction()
4157+
4158+macro(CHECK_SYMBOL_EXISTS)  # Wrapper that switches the toggle ON only for the duration of the forwarded call
4159+ set(DISABLE_ROCM_CHECK ON)
4160+ _check_symbol_exists(${ARGN})  # Forwards all arguments unchanged; NOTE(review): presumably CMake's original check_symbol_exists - confirm
4161+ set(DISABLE_ROCM_CHECK OFF)  # Restore the default so later toolchain-variable checks run again
4162+endmacro()
4163diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
4164index 124c268..fc084ec 100644
4165--- a/cmake/Dependencies.cmake
4166+++ b/cmake/Dependencies.cmake
4167@@ -32,6 +32,8 @@
4168 # For downloading, building, and installing required dependencies
4169 include(cmake/DownloadProject.cmake)
4170
4171+include(FetchContent)
4172+
4173 if(NOT INSTALL_DEPENDENCIES)
4174 find_package(GTest 1.11)
4175 endif()
4176@@ -50,7 +52,7 @@ if(NOT GTest_FOUND AND BUILD_TESTS OR INSTALL_DEPENDENCIES)
4177
4178 download_project(PROJ googletest
4179 GIT_REPOSITORY https://github.com/google/googletest.git
4180- GIT_TAG release-1.11.0
4181+ GIT_TAG release-1.12.0
4182 INSTALL_DIR ${GTEST_ROOT}
4183 CMAKE_ARGS -DBUILD_GTEST=ON -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${COMPILER_OVERRIDE} -DBUILD_SHARED_LIBS=OFF
4184 LOG_DOWNLOAD TRUE
4185@@ -60,31 +62,37 @@ if(NOT GTest_FOUND AND BUILD_TESTS OR INSTALL_DEPENDENCIES)
4186 UPDATE_DISCONNECTED TRUE
4187 )
4188 set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")
4189+ set(GMOCK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gmock/include CACHE PATH "")
4190 if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib)
4191 set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest_main.a CACHE PATH "")
4192+ set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock_main.a CACHE PATH "")
4193 elseif(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64)
4194 set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest_main.a CACHE PATH "")
4195+ set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock_main.a CACHE PATH "")
4196 else()
4197 message(FATAL_ERROR "Cannot find gtest library installation path.")
4198 find_package(GTest REQUIRED CONFIG PATHS ${GTEST_ROOT})
4199+ find_package(GMock REQUIRED CONFIG PATHS ${GTEST_ROOT})
4200 endif()
4201+elseif(GTest_FOUND AND BUILD_TESTS)
4202+ set(GTEST_BOTH_LIBRARIES "GTest::gtest;GTest::gtest_main")
4203+ set(GMOCK_BOTH_LIBRARIES "GTest::gmock;GTest::gmock_main")
4204 endif()
4205
4206-
4207 # Find or download/install rocm-cmake project
4208 set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
4209 find_package(ROCM 0.7.3 QUIET CONFIG PATHS /opt/rocm)
4210 if(NOT ROCM_FOUND)
4211 set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
4212 file(
4213- DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip
4214+ DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
4215 ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
4216 STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
4217 )
4218 list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
4219 if(rocm_cmake_download_error_code)
4220 message(FATAL_ERROR "Error: downloading "
4221- "https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
4222+ "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
4223 "error_code: ${rocm_cmake_download_error_code} "
4224 "log: ${rocm_cmake_download_log} "
4225 )
4226@@ -106,6 +114,74 @@ if(NOT ROCM_FOUND)
4227 find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
4228 endif()
4229
4230+set(CMAKE_INSTALL_LIBDIR lib CACHE STRING "Define install directory for libraries" FORCE)
4231+
4232+# Find or download/install fmt: use an already-installed fmt package if find_package locates one, otherwise fetch a pinned commit with FetchContent
4233+find_package(fmt QUIET)
4234+if(NOT fmt_FOUND)
4235+ set(FMT_INSTALL OFF)  # NOTE(review): presumably turns off fmt's own install rules for the fetched copy - confirm against fmt's CMakeLists.txt
4236+ message(STATUS "fmt not found, fetching from source...")
4237+ FetchContent_Declare(
4238+ fmt
4239+ GIT_REPOSITORY https://github.com/fmtlib/fmt
4240+ GIT_TAG e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1
4241+ )
4242+ FetchContent_MakeAvailable(fmt)
4243+else()
4244+ message(STATUS "Using system fmt")
4245+ get_target_property(FMT_INCLUDE_DIRS fmt::fmt-header-only INTERFACE_INCLUDE_DIRECTORIES)  # Read the include path from the header-only target; used here only for the status message below
4246+ message(STATUS "fmt include directories: ${FMT_INCLUDE_DIRS}")
4247+endif()
4248+
4249+# Find available local ROCM targets: sets <VARIABLE> in the caller's scope to the de-duplicated list of agents printed by rocm_agent_enumerator (excluding gfx000), or to "NOTFOUND"
4250+# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
4251+function(rocm_local_targets VARIABLE)
4252+ set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)  # Default result; overwritten below only when at least one agent is collected
4253+ find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
4254+ if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")  # Proceed only if find_program located the tool
4255+ execute_process(
4256+ COMMAND "${_rocm_agent_enumerator}"
4257+ RESULT_VARIABLE _found_agents  # Exit status of the tool; 0 is treated as success below
4258+ OUTPUT_VARIABLE _rocm_agents  # Raw stdout of the tool
4259+ ERROR_QUIET
4260+ )
4261+ if (_found_agents EQUAL 0)
4262+ string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")  # Turn newline-separated output into a CMake list
4263+ unset(result)  # Start from an empty accumulator
4264+ foreach (agent IN LISTS _rocm_agents)
4265+ if (NOT agent STREQUAL "gfx000")  # Skip gfx000 entries; NOTE(review): presumably the CPU/host agent - confirm
4266+ list(APPEND result "${agent}")
4267+ endif()
4268+ endforeach()
4269+ if(result)  # Leave the "NOTFOUND" default in place when nothing was collected
4270+ list(REMOVE_DUPLICATES result)
4271+ set(${VARIABLE} "${result}" PARENT_SCOPE)
4272+ endif()
4273+ endif()
4274+ endif()
4275+endfunction()
4276+
4277+# add_file_unique(<FILE_LIST> <FILE>): if <FILE>'s base name already occurs in the list named by <FILE_LIST>, set HIP_FILE in the caller's scope to "<dir>/<name>_tmp<ext>"; otherwise HIP_FILE is left untouched
4278+# NOTE: This works around a compiler bug with '--save-temps' and can be removed when a fix is available
4279+function(add_file_unique FILE_LIST FILE)
4280+ get_filename_component(FILE_NAME "${FILE}" NAME)
4281+
4282+ # Iterate over whatever is in the list so far
4283+ foreach(curr_file IN LISTS ${FILE_LIST})
4284+ get_filename_component(curr_file_name "${curr_file}" NAME)  # Quoted so empty entries or paths with spaces/semicolons stay a single argument
4285+
4286+ # Check if duplicate (operands quoted so a file name is compared as a string, not dereferenced as a variable name; relies on policy CMP0054 being NEW)
4287+ if("${FILE_NAME}" STREQUAL "${curr_file_name}")
4288+ get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
4289+ get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
4290+ get_filename_component(FILE_EXT "${FILE}" EXT)
4291+
4292+ # Construct a new file name by adding _tmp before the extension and publish it to the caller as HIP_FILE
4293+ set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)
4294+ endif()
4295+ endforeach()
4296+endfunction()
4297+
4298 include(ROCMSetupVersion)
4299 include(ROCMCreatePackage)
4300 include(ROCMInstallTargets)
4301@@ -113,4 +189,4 @@ include(ROCMPackageConfigHelpers)
4302 include(ROCMInstallSymlinks)
4303 include(ROCMCheckTargetIds)
4304 include(ROCMClients)
4305-include( ROCMHeaderWrapper )
4306+include(ROCMHeaderWrapper)
4307diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject.cmake
4308index 110bbd5..765b1f6 100644
4309--- a/cmake/DownloadProject.cmake
4310+++ b/cmake/DownloadProject.cmake
4311@@ -158,7 +158,7 @@ function(download_project)
4312 if(result)
4313 message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}")
4314 endif()
4315- execute_process(COMMAND ${CMAKE_COMMAND} --build .
4316+ execute_process(COMMAND ${CMAKE_COMMAND} --build . -j16
4317 RESULT_VARIABLE result
4318 ${OUTPUT_QUIET}
4319 WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
4320@@ -167,4 +167,4 @@ function(download_project)
4321 message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}")
4322 endif()
4323
4324-endfunction()
4325\ No newline at end of file
4326+endfunction()
4327diff --git a/cmake/FindIBVerbs.cmake b/cmake/FindIBVerbs.cmake
4328new file mode 100644
4329index 0000000..d02c0e1
4330--- /dev/null
4331+++ b/cmake/FindIBVerbs.cmake
4332@@ -0,0 +1,39 @@
4333+# MIT License
4334+#
4335+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4336+#
4337+# Permission is hereby granted, free of charge, to any person obtaining a copy
4338+# of this software and associated documentation files (the "Software"), to deal
4339+# in the Software without restriction, including without limitation the rights
4340+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4341+# copies of the Software, and to permit persons to whom the Software is
4342+# furnished to do so, subject to the following conditions:
4343+#
4344+# The above copyright notice and this permission notice shall be included in all
4345+# copies or substantial portions of the Software.
4346+#
4347+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4348+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4349+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4350+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4351+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4352+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4353+# SOFTWARE.
4354+
4355+find_path(IBVERBS_INCLUDE_DIRS  # Cache entry receiving the directory that contains infiniband/verbs.h
4356+ NAMES infiniband/verbs.h
4357+ HINTS
4358+ ${IBVERBS_INCLUDE_DIR}
4359+ ${IBVERBS_ROOT_DIR}
4360+ ${IBVERBS_ROOT_DIR}/include)
4361+
4362+find_library(IBVERBS_LIBRARIES  # Cache entry receiving the full path of the ibverbs library
4363+ NAMES ibverbs
4364+ HINTS
4365+ ${IBVERBS_LIB_DIR}
4366+ ${IBVERBS_ROOT_DIR}
4367+ ${IBVERBS_ROOT_DIR}/lib)
4368+
4369+include(FindPackageHandleStandardArgs)
4370+find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)  # Both results are required for the package to count as found
4371+mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)  # Mark the cache entries actually created above (IBVERBS_INCLUDE_DIR is only an input hint)
4372diff --git a/cmake/Findmscclpp_nccl.cmake b/cmake/Findmscclpp_nccl.cmake
4373new file mode 100644
4374index 0000000..313b0c6
4375--- /dev/null
4376+++ b/cmake/Findmscclpp_nccl.cmake
4377@@ -0,0 +1,36 @@
4378+# MIT License
4379+#
4380+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4381+#
4382+# Permission is hereby granted, free of charge, to any person obtaining a copy
4383+# of this software and associated documentation files (the "Software"), to deal
4384+# in the Software without restriction, including without limitation the rights
4385+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4386+# copies of the Software, and to permit persons to whom the Software is
4387+# furnished to do so, subject to the following conditions:
4388+#
4389+# The above copyright notice and this permission notice shall be included in all
4390+# copies or substantial portions of the Software.
4391+#
4392+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4393+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4394+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4395+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4396+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4397+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4398+# SOFTWARE.
4399+
4400+find_path(MSCCLPP_INCLUDE_DIRS  # Directory containing mscclpp/gpu.hpp, hinted at ${MSCCLPP_ROOT}/include
4401+ NAMES mscclpp/gpu.hpp
4402+ HINTS
4403+ ${MSCCLPP_ROOT}/include)
4404+
4405+find_library(MSCCLPP_LIBRARIES  # Full path of the mscclpp_nccl library, hinted at ${MSCCLPP_ROOT}/lib
4406+ NAMES mscclpp_nccl
4407+ HINTS
4408+ ${MSCCLPP_ROOT}/lib)
4409+
4410+include (FindPackageHandleStandardArgs)
4411+find_package_handle_standard_args(mscclpp_nccl DEFAULT_MSG MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)  # Both the header directory and the library are passed as required variables
4412+mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
4413+
4414\ No newline at end of file
4415diff --git a/cmake/MSCCLPP.cmake b/cmake/MSCCLPP.cmake
4416new file mode 100644
4417index 0000000..21fbc64
4418--- /dev/null
4419+++ b/cmake/MSCCLPP.cmake
4420@@ -0,0 +1,229 @@
4421+# MIT License
4422+#
4423+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
4424+#
4425+# Permission is hereby granted, free of charge, to any person obtaining a copy
4426+# of this software and associated documentation files (the "Software"), to deal
4427+# in the Software without restriction, including without limitation the rights
4428+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4429+# copies of the Software, and to permit persons to whom the Software is
4430+# furnished to do so, subject to the following conditions:
4431+#
4432+# The above copyright notice and this permission notice shall be included in all
4433+# copies or substantial portions of the Software.
4434+#
4435+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4436+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4437+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4438+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4439+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4440+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4441+# SOFTWARE.
4442+
4443+# Dependencies
4444+
4445+# HIP dependency is handled earlier in the project cmake file
4446+# when VerifyCompiler.cmake is included.
4447+
4448+# GIT
4449+
4450+# Test dependencies
4451+
4452+# For downloading, building, and installing required dependencies
4453+include(cmake/DownloadProject.cmake)
4454+
4455+if(ENABLE_MSCCLPP)
4456+ # Try to find the mscclpp install
4457+ set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")
4458+ execute_process(
4459+ COMMAND mkdir -p ${MSCCLPP_ROOT}
4460+ )
4461+ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
4462+ find_package(mscclpp_nccl)
4463+
4464+ #if(NOT mscclpp_nccl_FOUND)
4465+ # Ensure the source code is checked out
4466+ set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
4467+ set(JSON_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json CACHE PATH "")
4468+ if((NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt) OR (NOT EXISTS ${JSON_SOURCE}/CMakeLists.txt))
4469+ message(STATUS "Checking out external code")
4470+ execute_process(
4471+ COMMAND git submodule update --init --recursive
4472+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
4473+ )
4474+ endif()
4475+
4476+ execute_process(
4477+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
4478+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4479+ )
4480+
4481+ execute_process(
4482+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
4483+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4484+ )
4485+
4486+ execute_process(
4487+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
4488+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4489+ )
4490+
4491+ execute_process(
4492+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
4493+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4494+ )
4495+
4496+ execute_process(
4497+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
4498+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4499+ )
4500+
4501+ execute_process(
4502+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch
4503+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4504+ )
4505+
4506+ execute_process(
4507+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch
4508+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4509+ )
4510+
4511+ execute_process(
4512+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch
4513+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4514+ )
4515+
4516+ execute_process(
4517+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/device-flag.patch
4518+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4519+ )
4520+
4521+ execute_process(
4522+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/remove-clip.patch
4523+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4524+ )
4525+
4526+ execute_process(
4527+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-executor.patch
4528+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4529+ )
4530+
4531+ execute_process(
4532+ COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-format-checks.patch
4533+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4534+ )
4535+
4536+ set(CMAKE_INHERITED_ARGS "")
4537+ set(CMAKE_ARGS_LIST "CMAKE_PREFIX_PATH;CMAKE_INSTALL_RPATH_USE_LINK_PATH;HIP_COMPILER")
4538+ foreach(arg IN LISTS CMAKE_ARGS_LIST)
4539+ if(DEFINED ${arg})
4540+ string(REPLACE ";" "%" ARG_VALUE "${${arg}}") # Replace ; with new list separator symbol % to avoid CMake errors
4541+ string(STRIP "${ARG_VALUE}" ARG_VALUE) # Eliminate whitespace, reducing to empty string if necessary
4542+
4543+ # Only add a cmake argument if it has a value
4544+ if("${ARG_VALUE}" STREQUAL "")
4545+ continue()
4546+ endif()
4547+ string(APPEND CMAKE_INHERITED_ARGS "-D${arg}=\"${ARG_VALUE}\" ")
4548+ endif()
4549+ endforeach()
4550+
4551+ if(NOT DEFINED CACHE{MSCCLPP_GPU_TARGETS})
4552+ message(STATUS "Building MSCCL++ only for supported variants: gfx942;gfx950")
4553+ set(MSCCLPP_GPU_TARGETS "gfx942;gfx950")
4554+ if(BUILD_ADDRESS_SANITIZER)
4555+ set(MSCCLPP_GPU_TARGETS "gfx942:xnack+;gfx950:xnack+")
4556+ endif()
4557+ else()
4558+ message(STATUS "Building MSCCL++ for ${MSCCLPP_GPU_TARGETS}")
4559+ endif()
4560+
4561+ string(REPLACE ";" "%" MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}")
4562+
4563+ download_project(PROJ mscclpp_nccl
4564+ #GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
4565+ #GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
4566+ INSTALL_DIR ${MSCCLPP_ROOT}
4567+ LIST_SEPARATOR %
4568+ CMAKE_ARGS "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}" -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP} -DMSCCLPP_ENABLE_EXECUTOR=${ENABLE_MSCCLPP_EXECUTOR} -DMSCCLPP_ENABLE_FORMAT_CHECKS=${ENABLE_MSCCLPP_FORMAT_CHECKS} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INHERITED_ARGS}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
4569+ LOG_DOWNLOAD FALSE
4570+ LOG_CONFIGURE FALSE
4571+ LOG_BUILD FALSE
4572+ LOG_INSTALL FALSE
4573+ UPDATE_DISCONNECTED TRUE
4574+ SOURCE_DIR ${MSCCLPP_SOURCE}
4575+ )
4576+
4577+ find_package(mscclpp_nccl REQUIRED)
4578+
4579+ execute_process(
4580+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-format-checks.patch
4581+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4582+ )
4583+
4584+ execute_process(
4585+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-executor.patch
4586+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4587+ )
4588+
4589+ execute_process(
4590+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/remove-clip.patch
4591+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4592+ )
4593+
4594+ execute_process(
4595+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/device-flag.patch
4596+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4597+ )
4598+
4599+ execute_process(
4600+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch
4601+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4602+ )
4603+
4604+ execute_process(
4605+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch
4606+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4607+ )
4608+
4609+ execute_process(
4610+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch
4611+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4612+ )
4613+
4614+ execute_process(
4615+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
4616+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4617+ )
4618+
4619+ execute_process(
4620+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
4621+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4622+ )
4623+
4624+ execute_process(
4625+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
4626+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4627+ )
4628+
4629+ execute_process(
4630+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
4631+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4632+ )
4633+
4634+ execute_process(
4635+ COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
4636+ WORKING_DIRECTORY ${MSCCLPP_SOURCE}
4637+ )
4638+
4639+ #endif()
4640+
4641+ execute_process(COMMAND objcopy
4642+ --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
4643+ "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a"
4644+ "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a"
4645+ )
4646+ add_library(mscclpp_nccl STATIC IMPORTED)
4647+ set_target_properties(mscclpp_nccl PROPERTIES IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a)
4648+
4649+endif()
4650diff --git a/cmake/rcclRAS.cmake b/cmake/rcclRAS.cmake
4651new file mode 100644
4652index 0000000..255623e
4653--- /dev/null
4654+++ b/cmake/rcclRAS.cmake
4655@@ -0,0 +1,24 @@
4656+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
4657+
4658+cmake_minimum_required(VERSION 3.16)
4659+
4660+message("Building rccl RAS client executable")
4661+
4662+add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc")
4663+
4664+target_include_directories(rcclras PRIVATE ${PROJECT_BINARY_DIR}/include)
4665+target_include_directories(rcclras PRIVATE ${HIPIFY_DIR}/src)
4666+target_include_directories(rcclras PRIVATE ${HIPIFY_DIR}/src/include)
4667+
4668+target_link_libraries(rcclras PRIVATE hip::host)
4669+target_link_libraries(rcclras PRIVATE dl)
4670+
4671+if(BUILD_SHARED_LIBS)
4672+ target_link_libraries(rcclras PRIVATE rccl hip::device)
4673+else()
4674+ add_dependencies(rcclras rccl)
4675+ target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
4676+endif()
4677+
4678+
4679+rocm_install(TARGETS rcclras)
4680diff --git a/cmake/scripts/add_faults.sh b/cmake/scripts/add_faults.sh
4681new file mode 100755
4682index 0000000..5d6c59f
4683--- /dev/null
4684+++ b/cmake/scripts/add_faults.sh
4685@@ -0,0 +1,27 @@
4686+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4687+#
4688+# Permission is hereby granted, free of charge, to any person obtaining a copy
4689+# of this software and associated documentation files (the "Software"), to deal
4690+# in the Software without restriction, including without limitation the rights
4691+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4692+# copies of the Software, and to permit persons to whom the Software is
4693+# furnished to do so, subject to the following conditions:
4694+#
4695+# The above copyright notice and this permission notice shall be included in all
4696+# copies or substantial portions of the Software.
4697+#
4698+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4699+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4700+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4701+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4702+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4703+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4704+# SOFTWARE.
4705+
4706+HIP_FILE=$1
4707+
4708+if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
4709+ sed -i "s/__syncthreads()/__syncthreads(); insert_random_delay_per_warp()/" "$HIP_FILE"
4710+
4711+ echo "Added fault injection to $HIP_FILE"
4712+fi
4713\ No newline at end of file
4714diff --git a/cmake/scripts/add_unroll.sh b/cmake/scripts/add_unroll.sh
4715new file mode 100755
4716index 0000000..3b324b3
4717--- /dev/null
4718+++ b/cmake/scripts/add_unroll.sh
4719@@ -0,0 +1,42 @@
4720+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4721+#
4722+# Permission is hereby granted, free of charge, to any person obtaining a copy
4723+# of this software and associated documentation files (the "Software"), to deal
4724+# in the Software without restriction, including without limitation the rights
4725+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4726+# copies of the Software, and to permit persons to whom the Software is
4727+# furnished to do so, subject to the following conditions:
4728+#
4729+# The above copyright notice and this permission notice shall be included in all
4730+# copies or substantial portions of the Software.
4731+#
4732+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4733+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4734+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4735+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4736+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4737+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4738+# SOFTWARE.
4739+
4740+HIP_FILE=$1
4741+
4742+if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
4743+ perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?)(, bool isNetOffload.*?)?>/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
4744+ perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?(?:, int RCCLMetadata)?)(, bool isNetOffload.*?)?>/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
4745+ perl -pi -e 's/(ProtoSimple<[^,]*?,[^,]+?)>/\1, USE_ACC, COLL_UNROLL>/g' "$HIP_FILE"
4746+ perl -pi -e 's/(runRing<T.*?)((, (true|false))?>\()/\1, USE_ACC, COLL_UNROLL\2/g' "$HIP_FILE"
4747+ perl -pi -e 's/(runTreeUpDown<T.*?)>\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
4748+ perl -pi -e 's/(runTreeSplit<T.*?)>\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
4749+
4750+ perl -pi -e 's/(runTreeSplit<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
4751+ perl -pi -e 's/(runTreeUpDown<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
4752+ perl -pi -e 's/(runRing<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
4753+ perl -pi -e 's/(runRing<T, RedOp, (ProtoLL|ProtoLL128), (RCCL_ONE_NODE_RING_SIMPLE|RCCL_METADATA_EMPTY), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
4754+
4755+ perl -pi -e 's/(runRing<T, RedOp, Proto, (RCCL_ONE_NODE_RING_SIMPLE|RCCL_METADATA_EMPTY), USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
4756+ perl -pi -e 's/(runRing<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
4757+ perl -pi -e 's/(runTreeSplit<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
4758+ perl -pi -e 's/(runTreeUpDown<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
4759+ sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
4760+ sed -i "s/\\(RunWorkColl<[^,]*,[^,]*,[^,]*,[^,]*,[^>]*\\)>/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
4761+fi
4762\ No newline at end of file
4763diff --git a/cmake/scripts/extract_metadata.cmake b/cmake/scripts/extract_metadata.cmake
4764new file mode 100644
4765index 0000000..7e70124
4766--- /dev/null
4767+++ b/cmake/scripts/extract_metadata.cmake
4768@@ -0,0 +1,81 @@
4769+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4770+#
4771+# Permission is hereby granted, free of charge, to any person obtaining a copy
4772+# of this software and associated documentation files (the "Software"), to deal
4773+# in the Software without restriction, including without limitation the rights
4774+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4775+# copies of the Software, and to permit persons to whom the Software is
4776+# furnished to do so, subject to the following conditions:
4777+#
4778+# The above copyright notice and this permission notice shall be included in all
4779+# copies or substantial portions of the Software.
4780+#
4781+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4782+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4783+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4784+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4785+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4786+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4787+# SOFTWARE.
4788+
4789+set(EXTRACT_TIMEOUT 5 CACHE STRING "Timeout in seconds for roc-obj-* calls")
4790+
4791+## List the objects for each gfx architecture
4792+execute_process( COMMAND roc-obj-ls librccl.so
4793+ RESULT_VARIABLE list_result
4794+ OUTPUT_VARIABLE cmd_output
4795+ ERROR_VARIABLE cmd_error
4796+ OUTPUT_STRIP_TRAILING_WHITESPACE
4797+ ERROR_STRIP_TRAILING_WHITESPACE
4798+ TIMEOUT ${EXTRACT_TIMEOUT}
4799+)
4800+
4801+if(list_result EQUAL 0)
4802+ ## Convert cmd output to list of lines
4803+ string(REGEX REPLACE "\n$" "" cmd_output "${cmd_output}")
4804+ string(REPLACE "\n" ";" cmd_output "${cmd_output}")
4805+
4806+ ## Extract file paths for the selected gfx archs
4807+ foreach(line ${cmd_output})
4808+ if(line MATCHES "(gfx90a|gfx942|gfx950)")
4809+ string(REGEX MATCH "\\file://(.*)" file_match ${line})
4810+ if(file_match)
4811+ list(APPEND file_paths ${file_match})
4812+ endif()
4813+ endif()
4814+ endforeach()
4815+
4816+ ## Extract objects from files
4817+ foreach(file ${file_paths})
4818+ execute_process(
4819+ COMMAND roc-obj-extract ${file}
4820+ RESULT_VARIABLE extraction_result
4821+ ERROR_VARIABLE extraction_error
4822+ OUTPUT_STRIP_TRAILING_WHITESPACE
4823+ ERROR_STRIP_TRAILING_WHITESPACE
4824+ TIMEOUT ${EXTRACT_TIMEOUT}
4825+ )
4826+ if(extraction_result STREQUAL "TIMEOUT")
4827+ message(
4828+ WARNING
4829+ "[Timeout] Extraction of '${file}' did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${extraction_error}.
4830+ Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc."
4831+ )
4832+ elseif(NOT extraction_result EQUAL 0)
4833+ message(
4834+ WARNING
4835+ "[Error ${extraction_result}] Could not extract objects from '${file}'. stderr: ${extraction_error}"
4836+ )
4837+ endif()
4838+ endforeach()
4839+
4840+elseif(list_result STREQUAL "TIMEOUT")
4841+ message(
4842+ WARNING
4843+ "[Timeout] roc-obj-ls did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${cmd_error}.
4844+ Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc"
4845+ )
4846+else()
4847+ ## We don't want to stop building unit-tests if this command fails.
4848+ message(WARNING "[Error ${list_result}] roc-obj-ls failed. stderr: ${cmd_error}")
4849+endif()
4850diff --git a/cmake/git_version.cmake b/cmake/scripts/git_version.cmake
4851similarity index 99%
4852rename from cmake/git_version.cmake
4853rename to cmake/scripts/git_version.cmake
4854index e070739..fde9485 100755
4855--- a/cmake/git_version.cmake
4856+++ b/cmake/scripts/git_version.cmake
4857@@ -19,6 +19,7 @@
4858 # SOFTWARE.
4859
4860 # Attempt to collect the latest git hash
4861+set(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
4862 execute_process(COMMAND git log --pretty=format:'%h' -n 1
4863 OUTPUT_VARIABLE GIT_REV
4864 ERROR_QUIET)
4865diff --git a/custom.properties b/custom.properties
4866deleted file mode 100644
4867index 45e5972..0000000
4868--- a/custom.properties
4869+++ /dev/null
4870@@ -1,3 +0,0 @@
4871-booktitle=RCCL API Guide
4872-spreadsheet.xml=docs/classification-map.xml
4873-document.locale=enus
4874\ No newline at end of file
4875diff --git a/debian/changelog b/debian/changelog
4876index fcb999b..c1a6a4f 100644
4877--- a/debian/changelog
4878+++ b/debian/changelog
4879@@ -1,3 +1,32 @@
4880+rccl (7.1.0-0ubuntu2) resolute; urgency=medium
4881+
4882+ * d/control: add libamdhip64-dev (>= 7.1~) as a build dep
4883+
4884+ -- Bojan Aleksovski <bojan.aleksovski@canonical.com> Thu, 05 Feb 2026 14:15:33 +0100
4885+
4886+rccl (7.1.0-0ubuntu1) resolute; urgency=medium
4887+
4888+ * d/p/0004: [Temporary] Patch rccl to remove rocm-core as build-dep
4889+ * d/control: Update build deps for ROCm v7.1.0
4890+ * d/control: update maintainer to ubuntu devel
4891+ * d/rules: align ROCM_ISAS with expected list syntax
4892+ * d/librccl1.install: add rcclras install
4893+ * d/librccl1-tests.install: fix path and add msccl-unit-test-algorithms
4894+ * d/librccl-doc.install: add doc install for msccl-algorithms
4895+ * d/{librccl1.symbols,shlibs}: remove symbols file and instead add shlibs
4896+ * d/patches: add new fix-install-libdir.patch
4897+ * d/control: remove ppc64el build
4898+ * d/rules: [Temporary] remove gfx908 and gfx90a ROCm ISAs
4899+
4900+ -- Bojan Aleksovski <bojan.aleksovski@canonical.com> Fri, 30 Jan 2026 10:07:24 +0100
4901+
4902+rccl (7.1.0-0ubuntu1~exp1) resolute; urgency=medium
4903+
4904+ * new upstream 7.1.0 release
4905+ * drop 0002-expand-architecture-support.patch as it was accepted upstream
4906+
4907+ -- Tim Flink <Tim.Flink@amd.com> Mon, 19 Jan 2026 17:42:17 -0600
4908+
4909 rccl (5.4.3-3ubuntu2) resolute; urgency=medium
4910
4911 * No change rebuild for rocm-hipamd.
4912diff --git a/debian/control b/debian/control
4913index 12c6691..c0c8d8e 100644
4914--- a/debian/control
4915+++ b/debian/control
4916@@ -5,24 +5,32 @@ Priority: optional
4917 Standards-Version: 4.6.2
4918 Vcs-Git: https://salsa.debian.org/rocm-team/rccl.git
4919 Vcs-Browser: https://salsa.debian.org/rocm-team/rccl
4920-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
4921+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
4922+XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
4923 Uploaders: Cordell Bloor <cgmb@slerp.xyz>,
4924 Christian Kastner <ckk@debian.org>,
4925+ Tim Flink <Tim.Flink@amd.com>
4926 Build-Depends: debhelper-compat (= 13),
4927 cmake,
4928- hipcc (>= 5.6.1~),
4929- libamd-comgr-dev (>= 6.0~),
4930- libhsa-runtime-dev (>= 5.7.1~),
4931+ hipcc (>= 7.1~),
4932+ libamd-comgr-dev (>= 7.1~),
4933+ libhsa-runtime-dev (>= 7.1~),
4934+ libamdhip64-dev (>= 7.1~),
4935 rocm-cmake,
4936 librocm-smi-dev,
4937 liboam-dev,
4938 chrpath <!nocheck>,
4939+ pkg-rocm-tools (>= 0.9.4),
4940+ libfmt-dev,
4941+ hipify-perl (>= 7.0.0),
4942+ libdrm-dev,
4943 libgtest-dev <!nocheck>
4944 Build-Depends-Indep: dh-sequence-sphinxdoc <!nodoc>,
4945 doxygen <!nodoc>,
4946 python3-breathe <!nodoc>,
4947 python3-sphinx <!nodoc>,
4948 python3-sphinx-rtd-theme <!nodoc>,
4949+ python3-rocm-docs (>= 1.20.0-1~) <!nodoc>,
4950 libjs-jquery <!nodoc>,
4951 libjs-mathjax <!nodoc>,
4952 libjs-sphinxdoc <!nodoc>,
4953@@ -31,7 +39,7 @@ Rules-Requires-Root: no
4954
4955 Package: librccl1
4956 Section: libs
4957-Architecture: amd64 arm64 ppc64el
4958+Architecture: amd64 arm64
4959 Depends: ${misc:Depends}, ${shlibs:Depends},
4960 Description: ROCm Communication Collectives Library - library
4961 RCCL (pronounced "Rickle") is a library of collective communication routines
4962@@ -45,7 +53,7 @@ Description: ROCm Communication Collectives Library - library
4963
4964 Package: librccl-dev
4965 Section: libdevel
4966-Architecture: amd64 arm64 ppc64el
4967+Architecture: amd64 arm64
4968 Depends: librccl1 (= ${binary:Version}),${misc:Depends}, ${shlibs:Depends},
4969 libamdhip64-dev,
4970 Suggests: librccl-doc
4971@@ -61,7 +69,7 @@ Description: ROCm Communication Collectives Library - headers
4972
4973 Package: librccl1-tests
4974 Section: libdevel
4975-Architecture: amd64 arm64 ppc64el
4976+Architecture: amd64 arm64
4977 Depends: librccl1 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends},
4978 Build-Profiles: <!nocheck>
4979 Description: ROCm Communication Collectives Library - tests
4980diff --git a/debian/librccl-doc.install b/debian/librccl-doc.install
4981new file mode 100644
4982index 0000000..865d42f
4983--- /dev/null
4984+++ b/debian/librccl-doc.install
4985@@ -0,0 +1 @@
4986+usr/share/rccl/msccl-algorithms/*
4987diff --git a/debian/librccl1-tests.install b/debian/librccl1-tests.install
4988index 0e38109..623ee95 100644
4989--- a/debian/librccl1-tests.install
4990+++ b/debian/librccl1-tests.install
4991@@ -1,2 +1,3 @@
4992-usr/bin/UnitTests usr/libexec/rocm/librccl1-tests
4993+usr/bin/rccl-UnitTests usr/libexec/rocm/librccl1-tests
4994+usr/share/rccl/msccl-unit-test-algorithms/*
4995
4996diff --git a/debian/librccl1.install b/debian/librccl1.install
4997index b97befa..8080107 100644
4998--- a/debian/librccl1.install
4999+++ b/debian/librccl1.install
5000@@ -1 +1,2 @@
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches