diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/.github/workflows/llvm-project-tests.yml llvm-toolchain-18-18.1.0-rc1/.github/workflows/llvm-project-tests.yml --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/.github/workflows/llvm-project-tests.yml 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/.github/workflows/llvm-project-tests.yml 2024-01-30 07:40:26.000000000 +0000 @@ -14,7 +14,7 @@ required: false os_list: required: false - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-13"]' python_version: required: false type: string @@ -38,9 +38,7 @@ type: string # Use windows-2019 due to: # https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317 - # We're using a specific version of macOS due to: - # https://github.com/actions/virtual-environments/issues/5900 - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-13"]' python_version: required: false @@ -95,7 +93,8 @@ # run creates a new cache entry so we want to ensure that we have # enough cache space for all the tests to run at once and still # fit under the 10 GB limit. - max-size: 500M + # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174 + max-size: 2G key: ${{ matrix.os }} variant: sccache - name: Build and Test diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/.github/workflows/llvm-tests.yml llvm-toolchain-18-18.1.0-rc1/.github/workflows/llvm-tests.yml --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/.github/workflows/llvm-tests.yml 2023-12-03 09:16:39.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/.github/workflows/llvm-tests.yml 2024-01-30 07:40:26.000000000 +0000 @@ -27,31 +27,13 @@ cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - check_all: + check-all: if: github.repository_owner == 'llvm' - name: Test llvm,clang,libclc + name: Build and Test uses: ./.github/workflows/llvm-project-tests.yml with: build_target: check-all - projects: clang;libclc - - # These need to be separate from the check_all job, becuase there is not enough disk - # space to build all these projects on Windows. - build_lldb: - if: github.repository_owner == 'llvm' - name: Build lldb - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: '' - projects: clang;lldb - - check_lld: - if: github.repository_owner == 'llvm' - name: Test lld - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-lld - projects: lld + projects: clang;lld;libclc;lldb abi-dump-setup: if: github.repository_owner == 'llvm' @@ -143,7 +125,7 @@ else touch llvm.symbols fi - abi-dumper "$EXTRA_ARGS" -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so + abi-dumper $EXTRA_ARGS -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so # Remove symbol versioning from dumps, so we can compare across major versions. sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file @@ -193,7 +175,7 @@ # FIXME: Reading of gzip'd abi files on the GitHub runners stop # working some time in March of 2021, likely due to a change in the # runner's environment. - abi-compliance-checker "$EXTRA_ARGS" -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" + abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() uses: actions/upload-artifact@v3 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/include/clang/Basic/BuiltinsAMDGPU.def llvm-toolchain-18-18.1.0-rc1/clang/include/clang/Basic/BuiltinsAMDGPU.def --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/include/clang/Basic/BuiltinsAMDGPU.def 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/include/clang/Basic/BuiltinsAMDGPU.def 2024-01-30 07:40:21.000000000 +0000 @@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +//===----------------------------------------------------------------------===// +// WMMA builtins. +// Postfix w32 indicates the builtin requires wavefront size of 32. +// Postfix w64 indicates the builtin requires wavefront size of 64. +// +// Some of these are very similar to their GFX11 counterparts, but they don't +// require replication of the A,B matrices, so they use fewer vector elements. +// Therefore, we add an "_gfx12" suffix to distinguish them from the existing +// builtins. +//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") + #undef BUILTIN #undef TARGET_BUILTIN diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/include/clang/Basic/CodeGenOptions.def llvm-toolchain-18-18.1.0-rc1/clang/include/clang/Basic/CodeGenOptions.def --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/include/clang/Basic/CodeGenOptions.def 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/include/clang/Basic/CodeGenOptions.def 2024-01-30 07:40:26.000000000 +0000 @@ -369,6 +369,9 @@ /// The default TLS model to use. ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel) +/// Whether to enable TLSDESC. AArch64 enables TLSDESC regardless of this value. +CODEGENOPT(EnableTLSDESC, 1, 0) + /// Bit size of immediate TLS offsets (0 == use the default). VALUE_CODEGENOPT(TLSSize, 8, 0) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/include/clang/Driver/Options.td llvm-toolchain-18-18.1.0-rc1/clang/include/clang/Driver/Options.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/include/clang/Driver/Options.td 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/include/clang/Driver/Options.td 2024-01-30 07:40:28.000000000 +0000 @@ -4419,6 +4419,8 @@ HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): " "12 (for 4KB) | 24 (for 16MB, default) | 32 (for 4GB) | 48 (for 256TB, needs -mcmodel=large)">, MarshallingInfoInt>; +def mtls_dialect_EQ : Joined<["-"], "mtls-dialect=">, Group, + Flags<[TargetSpecific]>, HelpText<"Which thread-local storage dialect to use for dynamic accesses of TLS variables">; def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group; def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group; def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group; @@ -7066,6 +7068,9 @@ Values<"disabled,enabled,forced">, NormalizedValues<["Disabled","Enabled","Forced"]>, MarshallingInfoEnum, "Enabled">; +def enable_tlsdesc : Flag<["-"], "enable-tlsdesc">, + MarshallingInfoFlag>; + } // let Visibility = [CC1Option] //===----------------------------------------------------------------------===// diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/AST/TemplateBase.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/AST/TemplateBase.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/AST/TemplateBase.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/AST/TemplateBase.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -450,7 +450,8 @@ getAsIntegral() == Other.getAsIntegral(); case StructuralValue: { - if (getStructuralValueType() != Other.getStructuralValueType()) + if (getStructuralValueType().getCanonicalType() != + Other.getStructuralValueType().getCanonicalType()) return false; llvm::FoldingSetNodeID A, B; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/CodeGen/BackendUtil.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/CodeGen/BackendUtil.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/CodeGen/BackendUtil.cpp 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/CodeGen/BackendUtil.cpp 2024-01-30 07:40:26.000000000 +0000 @@ -401,6 +401,7 @@ Options.UniqueBasicBlockSectionNames = CodeGenOpts.UniqueBasicBlockSectionNames; Options.TLSSize = CodeGenOpts.TLSSize; + Options.EnableTLSDESC = CodeGenOpts.EnableTLSDESC; Options.EmulatedTLS = CodeGenOpts.EmulatedTLS; Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning(); Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/CodeGen/CGBuiltin.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/CodeGen/CGBuiltin.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/CodeGen/CGBuiltin.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/CodeGen/CGBuiltin.cpp 2024-01-30 07:40:28.000000000 +0000 @@ -18279,65 +18279,216 @@ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: - case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: { + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: { // These operations perform a matrix multiplication and accumulation of // the form: // D = A * B + C - // The return type always matches the type of matrix C. - unsigned ArgForMatchingRetType; + // We need to specify one type for matrices AB and one for matrices CD. + // Sparse matrix operations can have different types for A and B as well as + // an additional type for sparsity index. + // Destination type should be put before types used for source operands. + SmallVector ArgsForMatchingMatrixTypes; + // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. + // There is no need for the variable opsel argument, so always set it to + // "false". + bool AppendFalseForOpselArg = false; unsigned BuiltinWMMAOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; + break; } SmallVector Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); + if (AppendFalseForOpselArg) + Args.push_back(Builder.getFalse()); - Function *F = CGM.getIntrinsic(BuiltinWMMAOp, - {Args[ArgForMatchingRetType]->getType()}); + SmallVector ArgTypes; + for (auto ArgIdx : ArgsForMatchingMatrixTypes) + ArgTypes.push_back(Args[ArgIdx]->getType()); + Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/Driver.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/Driver.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/Driver.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/Driver.cpp 2024-01-30 07:40:28.000000000 +0000 @@ -4764,9 +4764,9 @@ case phases::Backend: { if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) { types::ID Output; - if (Args.hasArg(options::OPT_ffat_lto_objects)) - Output = Args.hasArg(options::OPT_emit_llvm) ? types::TY_LTO_IR - : types::TY_PP_Asm; + if (Args.hasArg(options::OPT_ffat_lto_objects) && + !Args.hasArg(options::OPT_emit_llvm)) + Output = types::TY_PP_Asm; else if (Args.hasArg(options::OPT_S)) Output = types::TY_LTO_IR; else diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/ToolChains/Clang.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/ToolChains/Clang.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/ToolChains/Clang.cpp 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/ToolChains/Clang.cpp 2024-01-30 07:40:28.000000000 +0000 @@ -5822,6 +5822,9 @@ Args.AddLastArg(CmdArgs, options::OPT_mtls_size_EQ); } + if (isTLSDESCEnabled(TC, Args)) + CmdArgs.push_back("-enable-tlsdesc"); + // Add the target cpu std::string CPU = getCPUName(D, Args, Triple, /*FromAs*/ false); if (!CPU.empty()) { diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/ToolChains/CommonArgs.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/ToolChains/CommonArgs.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/ToolChains/CommonArgs.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/ToolChains/CommonArgs.cpp 2024-01-30 07:40:28.000000000 +0000 @@ -729,6 +729,33 @@ return Triple.isPS(); } +bool tools::isTLSDESCEnabled(const ToolChain &TC, + const llvm::opt::ArgList &Args) { + const llvm::Triple &Triple = TC.getEffectiveTriple(); + Arg *A = Args.getLastArg(options::OPT_mtls_dialect_EQ); + if (!A) + return Triple.hasDefaultTLSDESC(); + StringRef V = A->getValue(); + bool SupportedArgument = false, EnableTLSDESC = false; + bool Unsupported = !Triple.isOSBinFormatELF(); + if (Triple.isRISCV()) { + SupportedArgument = V == "desc" || V == "trad"; + EnableTLSDESC = V == "desc"; + } else if (Triple.isX86()) { + SupportedArgument = V == "gnu"; + } else { + Unsupported = true; + } + if (Unsupported) { + TC.getDriver().Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << Triple.getTriple(); + } else if (!SupportedArgument) { + TC.getDriver().Diag(diag::err_drv_unsupported_option_argument_for_target) + << A->getSpelling() << V << Triple.getTriple(); + } + return EnableTLSDESC; +} + void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { @@ -783,6 +810,28 @@ "-generate-arange-section")); } + // Pass vector library arguments to LTO. + Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib); + if (ArgVecLib && ArgVecLib->getNumValues() == 1) { + // Map the vector library names from clang front-end to opt front-end. The + // values are taken from the TargetLibraryInfo class command line options. + std::optional OptVal = + llvm::StringSwitch>(ArgVecLib->getValue()) + .Case("Accelerate", "Accelerate") + .Case("LIBMVEC", "LIBMVEC-X86") + .Case("MASSV", "MASSV") + .Case("SVML", "SVML") + .Case("SLEEF", "sleefgnuabi") + .Case("Darwin_libsystem_m", "Darwin_libsystem_m") + .Case("ArmPL", "ArmPL") + .Case("none", "none") + .Default(std::nullopt); + + if (OptVal) + CmdArgs.push_back(Args.MakeArgString( + Twine(PluginOptPrefix) + "-vector-library=" + OptVal.value())); + } + // Try to pass driver level flags relevant to LTO code generation down to // the plugin. @@ -988,6 +1037,9 @@ CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-emulated-tls")); } + if (isTLSDESCEnabled(ToolChain, Args)) + CmdArgs.push_back( + Args.MakeArgString(Twine(PluginOptPrefix) + "-enable-tlsdesc")); if (Args.hasFlag(options::OPT_fstack_size_section, options::OPT_fno_stack_size_section, false)) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/ToolChains/CommonArgs.h llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/ToolChains/CommonArgs.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Driver/ToolChains/CommonArgs.h 2023-12-03 09:16:48.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/Driver/ToolChains/CommonArgs.h 2024-01-30 07:40:26.000000000 +0000 @@ -144,6 +144,9 @@ bool areOptimizationsEnabled(const llvm::opt::ArgList &Args); bool isUseSeparateSections(const llvm::Triple &Triple); +// Parse -mtls-dialect=. Return true if the target supports both general-dynamic +// and TLSDESC, and TLSDESC is requested. +bool isTLSDESCEnabled(const ToolChain &TC, const llvm::opt::ArgList &Args); /// \p EnvVar is split by system delimiter for environment variables. /// If \p ArgName is "-I", "-L", or an empty string, each entry from \p EnvVar diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Sema/SemaInit.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/Sema/SemaInit.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Sema/SemaInit.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/Sema/SemaInit.cpp 2024-01-30 07:40:28.000000000 +0000 @@ -4200,7 +4200,7 @@ /// \param IsListInit Is this list-initialization? /// \param IsInitListCopy Is this non-list-initialization resulting from a /// list-initialization from {x} where x is the same -/// aggregate type as the entity? +/// type as the entity? static void TryConstructorInitialization(Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, @@ -4230,14 +4230,6 @@ Entity.getKind() != InitializedEntity::EK_LambdaToBlockConversionBlockElement); - bool CopyElisionPossible = false; - auto ElideConstructor = [&] { - // Convert qualifications if necessary. - Sequence.AddQualificationConversionStep(DestType, VK_PRValue); - if (ILE) - Sequence.RewrapReferenceInitList(DestType, ILE); - }; - // C++17 [dcl.init]p17: // - If the initializer expression is a prvalue and the cv-unqualified // version of the source type is the same class as the class of the @@ -4250,17 +4242,11 @@ if (S.getLangOpts().CPlusPlus17 && !RequireActualConstructor && UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isPRValue() && S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) { - if (ILE && !DestType->isAggregateType()) { - // CWG2311: T{ prvalue_of_type_T } is not eligible for copy elision - // Make this an elision if this won't call an initializer-list - // constructor. (Always on an aggregate type or check constructors first.) - assert(!IsInitListCopy && - "IsInitListCopy only possible with aggregate types"); - CopyElisionPossible = true; - } else { - ElideConstructor(); - return; - } + // Convert qualifications if necessary. + Sequence.AddQualificationConversionStep(DestType, VK_PRValue); + if (ILE) + Sequence.RewrapReferenceInitList(DestType, ILE); + return; } const RecordType *DestRecordType = DestType->getAs(); @@ -4305,12 +4291,6 @@ S, Kind.getLocation(), Args, CandidateSet, DestType, Ctors, Best, CopyInitialization, AllowExplicit, /*OnlyListConstructors=*/true, IsListInit, RequireActualConstructor); - - if (CopyElisionPossible && Result == OR_No_Viable_Function) { - // No initializer list candidate - ElideConstructor(); - return; - } } // C++11 [over.match.list]p1: @@ -4592,9 +4572,9 @@ return; } - // C++11 [dcl.init.list]p3, per DR1467 and DR2137: - // - If T is an aggregate class and the initializer list has a single element - // of type cv U, where U is T or a class derived from T, the object is + // C++11 [dcl.init.list]p3, per DR1467: + // - If T is a class type and the initializer list has a single element of + // type cv U, where U is T or a class derived from T, the object is // initialized from that element (by copy-initialization for // copy-list-initialization, or by direct-initialization for // direct-list-initialization). @@ -4605,7 +4585,7 @@ // - Otherwise, if T is an aggregate, [...] (continue below). if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1 && !IsDesignatedInit) { - if (DestType->isRecordType() && DestType->isAggregateType()) { + if (DestType->isRecordType()) { QualType InitType = InitList->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, DestType) || S.IsDerivedFrom(InitList->getBeginLoc(), InitType, DestType)) { diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Sema/SemaOverload.cpp llvm-toolchain-18-18.1.0-rc1/clang/lib/Sema/SemaOverload.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/lib/Sema/SemaOverload.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/lib/Sema/SemaOverload.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -1568,37 +1568,19 @@ // called for those cases. if (CXXConstructorDecl *Constructor = dyn_cast(ICS.UserDefined.ConversionFunction)) { - QualType FromType; - SourceLocation FromLoc; - // C++11 [over.ics.list]p6, per DR2137: - // C++17 [over.ics.list]p6: - // If C is not an initializer-list constructor and the initializer list - // has a single element of type cv U, where U is X or a class derived - // from X, the implicit conversion sequence has Exact Match rank if U is - // X, or Conversion rank if U is derived from X. - if (const auto *InitList = dyn_cast(From); - InitList && InitList->getNumInits() == 1 && - !S.isInitListConstructor(Constructor)) { - const Expr *SingleInit = InitList->getInit(0); - FromType = SingleInit->getType(); - FromLoc = SingleInit->getBeginLoc(); - } else { - FromType = From->getType(); - FromLoc = From->getBeginLoc(); - } - QualType FromCanon = - S.Context.getCanonicalType(FromType.getUnqualifiedType()); + QualType FromCanon + = S.Context.getCanonicalType(From->getType().getUnqualifiedType()); QualType ToCanon = S.Context.getCanonicalType(ToType).getUnqualifiedType(); if (Constructor->isCopyConstructor() && (FromCanon == ToCanon || - S.IsDerivedFrom(FromLoc, FromCanon, ToCanon))) { + S.IsDerivedFrom(From->getBeginLoc(), FromCanon, ToCanon))) { // Turn this into a "standard" conversion sequence, so that it // gets ranked with standard conversion sequences. DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction; ICS.setStandard(); ICS.Standard.setAsIdentityConversion(); - ICS.Standard.setFromType(FromType); + ICS.Standard.setFromType(From->getType()); ICS.Standard.setAllToTypes(ToType); ICS.Standard.CopyConstructor = Constructor; ICS.Standard.FoundCopyConstructor = Found; @@ -5324,18 +5306,18 @@ IsDesignatedInit) return Result; - // Per DR1467 and DR2137: - // If the parameter type is an aggregate class X and the initializer list - // has a single element of type cv U, where U is X or a class derived from - // X, the implicit conversion sequence is the one required to convert the - // element to the parameter type. + // Per DR1467: + // If the parameter type is a class X and the initializer list has a single + // element of type cv U, where U is X or a class derived from X, the + // implicit conversion sequence is the one required to convert the element + // to the parameter type. // // Otherwise, if the parameter type is a character array [... ] // and the initializer list has a single element that is an // appropriately-typed string literal (8.5.2 [dcl.init.string]), the // implicit conversion sequence is the identity conversion. if (From->getNumInits() == 1 && !IsDesignatedInit) { - if (ToType->isRecordType() && ToType->isAggregateType()) { + if (ToType->isRecordType()) { QualType InitType = From->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, ToType) || S.IsDerivedFrom(From->getBeginLoc(), InitType, ToType)) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CXX/drs/dr14xx.cpp llvm-toolchain-18-18.1.0-rc1/clang/test/CXX/drs/dr14xx.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CXX/drs/dr14xx.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CXX/drs/dr14xx.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -488,6 +488,16 @@ } } // nonaggregate + namespace SelfInitIsNotListInit { + struct S { + S(); + explicit S(S &); + S(const S &); + }; + S s1; + S s2 = {s1}; // ok, not list-initialization so we pick the non-explicit constructor + } + struct NestedInit { int a, b, c; }; NestedInit ni[1] = {{NestedInit{1, 2, 3}}}; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CXX/drs/dr21xx.cpp llvm-toolchain-18-18.1.0-rc1/clang/test/CXX/drs/dr21xx.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CXX/drs/dr21xx.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CXX/drs/dr21xx.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -11,16 +11,6 @@ // cxx98-error@-1 {{variadic macros are a C99 feature}} #endif -namespace std { - __extension__ typedef __SIZE_TYPE__ size_t; - - template struct initializer_list { - const E *p; size_t n; - initializer_list(const E *p, size_t n); - initializer_list(); - }; -} - namespace dr2100 { // dr2100: 12 template struct X {}; template struct A { @@ -142,41 +132,6 @@ #endif } -namespace dr2137 { // dr2137: 18 -#if __cplusplus >= 201103L - struct Q { - Q(); - Q(Q&&); - Q(std::initializer_list) = delete; // #dr2137-Qcons - }; - - Q x = Q { Q() }; - // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} - // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} - - int f(Q); // #dr2137-f - int y = f({ Q() }); - // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} - // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} - // since-cxx11-note@#dr2137-f {{passing argument to parameter here}} - - struct U { - U(); - U(const U&); - }; - - struct Derived : U { - Derived(); - Derived(const Derived&); - } d; - - int g(Derived); - int g(U(&&)[1]) = delete; - - int z = g({ d }); -#endif -} - namespace dr2140 { // dr2140: 9 #if __cplusplus >= 201103L union U { int a; decltype(nullptr) b; }; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CXX/drs/dr23xx.cpp llvm-toolchain-18-18.1.0-rc1/clang/test/CXX/drs/dr23xx.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CXX/drs/dr23xx.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CXX/drs/dr23xx.cpp 2024-01-30 07:40:28.000000000 +0000 @@ -6,16 +6,6 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s -namespace std { - __extension__ typedef __SIZE_TYPE__ size_t; - - template struct initializer_list { - const E *p; size_t n; - initializer_list(const E *p, size_t n); - initializer_list(); - }; -} - #if __cplusplus >= 201103L namespace dr2303 { // dr2303: 12 template @@ -57,81 +47,6 @@ } //namespace dr2303 #endif -namespace dr2311 { // dr2311: 18 open -#if __cplusplus >= 201707L -template -void test() { - // Ensure none of these try to call a move constructor. - T a = T{T(0)}; - T b{T(0)}; - auto c{T(0)}; - T d = {T(0)}; - auto e = {T(0)}; -#if __cplusplus >= 202302L - auto f = auto{T(0)}; -#endif - void(*fn)(T); - fn({T(0)}); -} - -struct NonMovable { - NonMovable(int); - NonMovable(NonMovable&&) = delete; -}; -struct NonMovableNonApplicableIList { - NonMovableNonApplicableIList(int); - NonMovableNonApplicableIList(NonMovableNonApplicableIList&&) = delete; - NonMovableNonApplicableIList(std::initializer_list); -}; -struct ExplicitMovable { - ExplicitMovable(int); - explicit ExplicitMovable(ExplicitMovable&&); -}; -struct ExplicitNonMovable { - ExplicitNonMovable(int); - explicit ExplicitNonMovable(ExplicitNonMovable&&) = delete; -}; -struct ExplicitNonMovableNonApplicableIList { - ExplicitNonMovableNonApplicableIList(int); - explicit ExplicitNonMovableNonApplicableIList(ExplicitNonMovableNonApplicableIList&&) = delete; - ExplicitNonMovableNonApplicableIList(std::initializer_list); -}; -struct CopyOnly { - CopyOnly(int); - CopyOnly(const CopyOnly&); - CopyOnly(CopyOnly&&) = delete; -}; -struct ExplicitCopyOnly { - ExplicitCopyOnly(int); - explicit ExplicitCopyOnly(const ExplicitCopyOnly&); - explicit ExplicitCopyOnly(ExplicitCopyOnly&&) = delete; -}; - -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); - -struct any { - template - any(T&&); -}; - -template -struct X { - X(); - X(T) = delete; // #dr2311-X -}; - -X> x{ X>() }; -// since-cxx17-error@-1 {{call to deleted constructor of 'X>'}} -// since-cxx17-note@#dr2311-X {{'X' has been explicitly marked deleted here}} -#endif -} - // dr2331: na // dr2335 is in dr2335.cxx diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGen/RISCV/tls-dialect.c llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGen/RISCV/tls-dialect.c --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGen/RISCV/tls-dialect.c 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGen/RISCV/tls-dialect.c 2024-01-30 07:40:26.000000000 +0000 @@ -0,0 +1,14 @@ +// REQUIRES: riscv-registered-target +/// cc1 -enable-tlsdesc (due to -mtls-dialect=desc) enables TLSDESC. +// RUN: %clang_cc1 -triple riscv64 -S -mrelocation-model pic -pic-level 1 -enable-tlsdesc %s -o - | FileCheck %s --check-prefix=DESC +// RUN: %clang_cc1 -triple riscv64 -S -mrelocation-model pic -pic-level 1 %s -o - | FileCheck %s --check-prefix=NODESC + +__thread int x; + +// DESC: %tlsdesc_hi +// DESC-NOT: %tls_gd_pcrel_hi +// NODESC: %tls_gd_pcrel_hi +// NODESC-NOT: %tlsdesc_hi +int use() { + return x; +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/amdgpu-features.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/amdgpu-features.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/amdgpu-features.cl 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/amdgpu-features.cl 2024-01-30 07:40:21.000000000 +0000 @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl 2024-01-30 07:40:21.000000000 +0000 @@ -1,59 +1,60 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); -// CHECK-GFX940-LABEL: @test_cvt_f32_bf8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) +// CHECK-LABEL: @test_cvt_f32_bf8 +// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) void test_cvt_f32_bf8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_bf8(a, 0); } -// CHECK-GFX940-LABEL: @test_cvt_f32_fp8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) +// CHECK-LABEL: @test_cvt_f32_fp8 +// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) void test_cvt_f32_fp8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8(a, 1); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) +// CHECK-LABEL: @test_cvt_pk_f32_bf8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) void test_cvt_pk_f32_bf8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) +// CHECK-LABEL: @test_cvt_pk_f32_fp8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) void test_cvt_pk_f32_fp8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true); } -// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) +// CHECK-LABEL: @test_cvt_pk_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) +// CHECK-LABEL: @test_cvt_pk_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true); } -// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) +// CHECK-LABEL: @test_cvt_sr_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2); } -// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) +// CHECK-LABEL: @test_cvt_sr_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3); diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,156 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,155 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl 2024-01-30 07:40:21.000000000 +0000 @@ -16,7 +16,7 @@ // Wave32 -void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, global v16h* out16h, v16h a16h, v16h b16h, v16h c16h, global v16s* out16s, v2i a2i, v2i b2i, v16s c16s, global v8i* out8i, v4i a4i, v4i b4i, v8i c8i) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl 2024-01-30 07:40:21.000000000 +0000 @@ -2,7 +2,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 -typedef float v4f __attribute__((ext_vector_type(4))); typedef float v8f __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); @@ -20,7 +19,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -35,7 +34,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -50,7 +49,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -65,7 +64,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -80,7 +79,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -95,7 +94,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -110,7 +109,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -125,7 +124,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl 2024-01-30 07:40:21.000000000 +0000 @@ -3,12 +3,10 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 typedef float v4f __attribute__((ext_vector_type(4))); -typedef float v8f __attribute__((ext_vector_type(8))); typedef half v8h __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); -typedef int v8i __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); typedef short v16s __attribute__((ext_vector_type(16))); @@ -22,7 +20,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -37,7 +35,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -52,7 +50,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -67,7 +65,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -82,7 +80,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -97,7 +95,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -112,7 +110,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -127,7 +125,7 @@ // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/Driver/fat-lto-objects.c llvm-toolchain-18-18.1.0-rc1/clang/test/Driver/fat-lto-objects.c --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/Driver/fat-lto-objects.c 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/Driver/fat-lto-objects.c 2024-01-30 07:40:21.000000000 +0000 @@ -23,11 +23,17 @@ // CHECK-CC-S-EL-LTO-SAME: -emit-llvm // CHECK-CC-S-EL-LTO-SAME: -ffat-lto-objects -/// When fat LTO is enabled wihtout -S we expect native object output and -ffat-lto-object to be passed to cc1. +/// When fat LTO is enabled without -S we expect native object output and -ffat-lto-object to be passed to cc1. // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-LTO // CHECK-CC-C-LTO: -cc1 -// CHECK-CC-C-LTO: -emit-obj -// CHECK-CC-C-LTO: -ffat-lto-objects +// CHECK-CC-C-LTO-SAME: -emit-obj +// CHECK-CC-C-LTO-SAME: -ffat-lto-objects + +/// When fat LTO is enabled with -c and -emit-llvm we expect bitcode output and -ffat-lto-object to be passed to cc1. +// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c -emit-llvm 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-EL-LTO +// CHECK-CC-C-EL-LTO: -cc1 +// CHECK-CC-C-EL-LTO-SAME: -emit-llvm-bc +// CHECK-CC-C-EL-LTO-SAME: -ffat-lto-objects /// Make sure we don't have a warning for -ffat-lto-objects being unused // RUN: %clang --target=x86_64-unknown-linux-gnu -ffat-lto-objects -fdriver-only -Werror -v %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-NOLTO diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/Driver/fveclib.c llvm-toolchain-18-18.1.0-rc1/clang/test/Driver/fveclib.c --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/Driver/fveclib.c 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/Driver/fveclib.c 2024-01-30 07:40:21.000000000 +0000 @@ -31,3 +31,21 @@ // RUN: %clang -fveclib=Accelerate %s -nodefaultlibs -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK-NODEFAULTLIBS %s // CHECK-LINK-NODEFAULTLIBS-NOT: "-framework" "Accelerate" + + +/* Verify that the correct vector library is passed to LTO flags. */ + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC %s +// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86" + +// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-MASSV %s +// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SVML %s +// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML" + +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s +// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi" + +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s +// CHECK-LTO-ARMPL: "-plugin-opt=-vector-library=ArmPL" diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/Driver/tls-dialect.c llvm-toolchain-18-18.1.0-rc1/clang/test/Driver/tls-dialect.c --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/Driver/tls-dialect.c 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/Driver/tls-dialect.c 2024-01-30 07:40:26.000000000 +0000 @@ -0,0 +1,25 @@ +// RUN: %clang -### --target=riscv64-freebsd -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=DESC %s +// RUN: %clang -### --target=riscv64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s +// RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s +// RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s + +/// LTO +// RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s +// RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s + +/// Unsupported target +/// GCC supports -mtls-dialect= for AArch64, but we just unsupport it for AArch64 as it is very rarely used. +// RUN: not %clang --target=aarch64-linux -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-TARGET %s +// RUN: not %clang --target=x86_64-apple-macos -mtls-dialect=desc -flto %s 2>&1 | FileCheck -check-prefix=UNSUPPORTED-TARGET %s + +/// Unsupported argument +// RUN: not %clang -### --target=riscv64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s +// RUN: not %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s + +// DESC: "-cc1" {{.*}}"-enable-tlsdesc" +// NODESC-NOT: "-enable-tlsdesc" +// LTO-DESC: "-plugin-opt=-enable-tlsdesc" +// LTO-NODESC-NOT: "-plugin-opt=-enable-tlsdesc" + +// UNSUPPORTED-TARGET: error: unsupported option '-mtls-dialect=' for target +// UNSUPPORTED-ARG: error: unsupported argument 'gnu2' to option '-mtls-dialect=' for target diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp llvm-toolchain-18-18.1.0-rc1/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp 2024-01-26 08:58:40.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -336,3 +336,21 @@ (b.operator Tbar(), ...); } } + +namespace ReportedRegression1 { + const char kt[] = "dummy"; + + template + class SomeTempl { }; + + template + class SomeTempl { + public: + int exit_code() const { return 0; } + }; + + int use() { + SomeTempl dummy; + return dummy.exit_code(); + } +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,107 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,110 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/debian/changelog llvm-toolchain-18-18.1.0-rc1/debian/changelog --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/debian/changelog 2024-01-26 08:44:56.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/debian/changelog 2024-01-30 18:40:57.000000000 +0000 @@ -1,3 +1,16 @@ +llvm-toolchain-18 (1:18.1.0-rc1-1~exp1ubuntu1) noble; urgency=medium + + [ Sylvestre Ledru ] + * hwasan_symbolize is now built also on i386 + + -- Matthias Klose Tue, 30 Jan 2024 19:40:57 +0100 + +llvm-toolchain-18 (1:18.1.0-rc1-1~exp1) experimental; urgency=medium + + * First RC + + -- Sylvestre Ledru Tue, 30 Jan 2024 09:01:06 +0100 + llvm-toolchain-18 (1:18.1.0~++20240126095841+0991d3c7b53d-1~exp1) experimental; urgency=medium * Branching of 18 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/debian/clang-tools-X.Y.install.in llvm-toolchain-18-18.1.0-rc1/debian/clang-tools-X.Y.install.in --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/debian/clang-tools-X.Y.install.in 2024-01-24 12:58:34.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/debian/clang-tools-X.Y.install.in 2024-01-30 18:40:31.000000000 +0000 @@ -62,7 +62,7 @@ usr/lib/llvm-@LLVM_VERSION@/libexec/intercept-cc # See compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake / HWASAN_SUPPORTED_ARCH -[amd64 arm64 riscv64] usr/lib/llvm-@LLVM_VERSION@/lib/clang/@LLVM_VERSION@/bin/hwasan_symbolize +[amd64 arm64 riscv64 i386] usr/lib/llvm-@LLVM_VERSION@/lib/clang/@LLVM_VERSION@/bin/hwasan_symbolize clang/tools/scan-build-@LLVM_VERSION@ usr/share/clang/ clang/tools/scan-view-@LLVM_VERSION@ usr/share/clang/ diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp llvm-toolchain-18-18.1.0-rc1/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -121,26 +121,7 @@ test_pair_rv(); test_pair_rv(); - /* For ExplicitTypes::CopyOnly, two of the viable candidates for initializing from a non-const xvalue are: - * pair(const pair&); // (defaulted copy constructor) - * template explicit pair(const pair&&); [U1 = ExplicitTypes::CopyOnly, U2 = int] - * This results in diverging behavior for test_convertible which uses copy-list-initialization - * Prior to CWG2137, this would have selected the first (non-explicit) ctor as explicit ctors would not be considered - * Afterwards, it should select the second since it is a better match, and then failed because it is explicit - * - * This may change with future defect reports, and some compilers only have partial support for CWG2137, - * so use std::is_convertible directly to avoid a copy-list-initialization - */ - { - using P1 = std::pair; - using P2 = std::pair; - using UP1 = std::pair&&; - using UP2 = std::pair&&; - static_assert(std::is_constructible::value, ""); - static_assert(std::is_convertible::value, ""); - static_assert(std::is_constructible::value, ""); - static_assert(std::is_convertible::value, ""); - } + test_pair_rv(); test_pair_rv(); test_pair_rv(); diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/ELF/Arch/RISCV.cpp llvm-toolchain-18-18.1.0-rc1/lld/ELF/Arch/RISCV.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/ELF/Arch/RISCV.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/lld/ELF/Arch/RISCV.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -61,6 +61,7 @@ AUIPC = 0x17, JALR = 0x67, LD = 0x3003, + LUI = 0x37, LW = 0x2003, SRLI = 0x5013, SUB = 0x40000033, @@ -73,6 +74,7 @@ X_T0 = 5, X_T1 = 6, X_T2 = 7, + X_A0 = 10, X_T3 = 28, }; @@ -102,6 +104,26 @@ (extractBits(imm, 4, 0) << 7); } +namespace { +struct SymbolAnchor { + uint64_t offset; + Defined *d; + bool end; // true for the anchor of st_value+st_size +}; +} // namespace + +struct elf::RISCVRelaxAux { + // This records symbol start and end offsets which will be adjusted according + // to the nearest relocDeltas element. + SmallVector anchors; + // For relocations[i], the actual offset is + // r_offset - (i ? relocDeltas[i-1] : 0). + std::unique_ptr relocDeltas; + // For relocations[i], the actual type is relocTypes[i]. + std::unique_ptr relocTypes; + SmallVector writes; +}; + RISCV::RISCV() { copyRel = R_RISCV_COPY; pltRel = R_RISCV_JUMP_SLOT; @@ -119,6 +141,7 @@ tlsGotRel = R_RISCV_TLS_TPREL32; } gotRel = symbolicRel; + tlsDescRel = R_RISCV_TLSDESC; // .got[0] = _DYNAMIC gotHeaderEntriesNum = 1; @@ -187,6 +210,8 @@ case R_RISCV_JUMP_SLOT: // These relocations are defined as not having an implicit addend. return 0; + case R_RISCV_TLSDESC: + return config->is64 ? read64le(buf + 8) : read32le(buf + 4); } } @@ -295,6 +320,12 @@ case R_RISCV_PCREL_LO12_I: case R_RISCV_PCREL_LO12_S: return R_RISCV_PC_INDIRECT; + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: + return R_TLSDESC_PC; + case R_RISCV_TLSDESC_CALL: + return R_TLSDESC_CALL; case R_RISCV_TLS_GD_HI20: return R_TLSGD_PC; case R_RISCV_TLS_GOT_HI20: @@ -419,6 +450,7 @@ case R_RISCV_GOT_HI20: case R_RISCV_PCREL_HI20: + case R_RISCV_TLSDESC_HI20: case R_RISCV_TLS_GD_HI20: case R_RISCV_TLS_GOT_HI20: case R_RISCV_TPREL_HI20: @@ -430,6 +462,8 @@ } case R_RISCV_PCREL_LO12_I: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: case R_RISCV_TPREL_LO12_I: case R_RISCV_LO12_I: { uint64_t hi = (val + 0x800) >> 12; @@ -513,32 +547,133 @@ break; case R_RISCV_RELAX: - return; // Ignored (for now) - + return; + case R_RISCV_TLSDESC: + // The addend is stored in the second word. + if (config->is64) + write64le(loc + 8, val); + else + write32le(loc + 4, val); + break; default: llvm_unreachable("unknown relocation"); } } +static bool relaxable(ArrayRef relocs, size_t i) { + return i + 1 != relocs.size() && relocs[i + 1].type == R_RISCV_RELAX; +} + +static void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { + switch (rel.type) { + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + write32le(loc, 0x00000013); // nop + break; + case R_RISCV_TLSDESC_ADD_LO12: + write32le(loc, utype(AUIPC, X_A0, hi20(val))); // auipc a0, + break; + case R_RISCV_TLSDESC_CALL: + if (config->is64) + write32le(loc, itype(LD, X_A0, X_A0, lo12(val))); // ld a0,(a0) + else + write32le(loc, itype(LW, X_A0, X_A0, lo12(val))); // lw a0,(a0) + break; + default: + llvm_unreachable("unsupported relocation for TLSDESC to IE"); + } +} + +static void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { + switch (rel.type) { + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + write32le(loc, 0x00000013); // nop + return; + case R_RISCV_TLSDESC_ADD_LO12: + if (isInt<12>(val)) + write32le(loc, 0x00000013); // nop + else + write32le(loc, utype(LUI, X_A0, hi20(val))); // lui a0, + return; + case R_RISCV_TLSDESC_CALL: + if (isInt<12>(val)) + write32le(loc, itype(ADDI, X_A0, 0, val)); // addi a0,zero, + else + write32le(loc, itype(ADDI, X_A0, X_A0, lo12(val))); // addi a0,a0, + return; + default: + llvm_unreachable("unsupported relocation for TLSDESC to LE"); + } +} + void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { uint64_t secAddr = sec.getOutputSection()->addr; if (auto *s = dyn_cast(&sec)) secAddr += s->outSecOff; else if (auto *ehIn = dyn_cast(&sec)) secAddr += ehIn->getParent()->outSecOff; - for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) { - const Relocation &rel = sec.relocs()[i]; + uint64_t tlsdescVal = 0; + bool tlsdescRelax = false, isToLe = false; + const ArrayRef relocs = sec.relocs(); + for (size_t i = 0, size = relocs.size(); i != size; ++i) { + const Relocation &rel = relocs[i]; uint8_t *loc = buf + rel.offset; - const uint64_t val = + uint64_t val = sec.getRelocTargetVA(sec.file, rel.type, rel.addend, secAddr + rel.offset, *rel.sym, rel.expr); switch (rel.expr) { case R_RELAX_HINT: + continue; + case R_TLSDESC_PC: + // For R_RISCV_TLSDESC_HI20, store &got(sym)-PC to be used by the + // following two instructions L[DW] and ADDI. + if (rel.type == R_RISCV_TLSDESC_HI20) + tlsdescVal = val; + else + val = tlsdescVal; break; + case R_RELAX_TLS_GD_TO_IE: + // Only R_RISCV_TLSDESC_HI20 reaches here. tlsdescVal will be finalized + // after we see R_RISCV_TLSDESC_ADD_LO12 in the R_RELAX_TLS_GD_TO_LE case. + // The net effect is that tlsdescVal will be smaller than `val` to take + // into account of NOP instructions (in the absence of R_RISCV_RELAX) + // before AUIPC. + tlsdescVal = val + rel.offset; + isToLe = false; + tlsdescRelax = relaxable(relocs, i); + if (!tlsdescRelax) + tlsdescToIe(loc, rel, val); + continue; + case R_RELAX_TLS_GD_TO_LE: + // See the comment in handleTlsRelocation. For TLSDESC=>IE, + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12,CALL} also reach here. If isToIe is + // true, this is actually TLSDESC=>IE optimization. + if (rel.type == R_RISCV_TLSDESC_HI20) { + tlsdescVal = val; + isToLe = true; + tlsdescRelax = relaxable(relocs, i); + } else { + if (!isToLe && rel.type == R_RISCV_TLSDESC_ADD_LO12) + tlsdescVal -= rel.offset; + val = tlsdescVal; + } + // When NOP conversion is eligible and relaxation applies, don't write a + // NOP in case an unrelated instruction follows the current instruction. + if (tlsdescRelax && + (rel.type == R_RISCV_TLSDESC_HI20 || + rel.type == R_RISCV_TLSDESC_LOAD_LO12 || + (rel.type == R_RISCV_TLSDESC_ADD_LO12 && isToLe && !hi20(val)))) + continue; + if (isToLe) + tlsdescToLe(loc, rel, val); + else + tlsdescToIe(loc, rel, val); + continue; case R_RISCV_LEB128: if (i + 1 < size) { - const Relocation &rel1 = sec.relocs()[i + 1]; + const Relocation &rel1 = relocs[i + 1]; if (rel.type == R_RISCV_SET_ULEB128 && rel1.type == R_RISCV_SUB_ULEB128 && rel.offset == rel1.offset) { auto val = rel.sym->getVA(rel.addend) - rel1.sym->getVA(rel1.addend); @@ -554,32 +689,12 @@ ": R_RISCV_SET_ULEB128 not paired with R_RISCV_SUB_SET128"); return; default: - relocate(loc, rel, val); break; } + relocate(loc, rel, val); } } -namespace { -struct SymbolAnchor { - uint64_t offset; - Defined *d; - bool end; // true for the anchor of st_value+st_size -}; -} // namespace - -struct elf::RISCVRelaxAux { - // This records symbol start and end offsets which will be adjusted according - // to the nearest relocDeltas element. - SmallVector anchors; - // For relocations[i], the actual offset is r_offset - (i ? relocDeltas[i-1] : - // 0). - std::unique_ptr relocDeltas; - // For relocations[i], the actual type is relocTypes[i]. - std::unique_ptr relocTypes; - SmallVector writes; -}; - static void initSymbolAnchors() { SmallVector storage; for (OutputSection *osec : outputSections) { @@ -715,14 +830,16 @@ static bool relax(InputSection &sec) { const uint64_t secAddr = sec.getVA(); + const MutableArrayRef relocs = sec.relocs(); auto &aux = *sec.relaxAux; bool changed = false; ArrayRef sa = ArrayRef(aux.anchors); uint64_t delta = 0; + bool tlsdescRelax = false, toLeShortForm = false; - std::fill_n(aux.relocTypes.get(), sec.relocs().size(), R_RISCV_NONE); + std::fill_n(aux.relocTypes.get(), relocs.size(), R_RISCV_NONE); aux.writes.clear(); - for (auto [i, r] : llvm::enumerate(sec.relocs())) { + for (auto [i, r] : llvm::enumerate(relocs)) { const uint64_t loc = secAddr + r.offset - delta; uint32_t &cur = aux.relocDeltas[i], remove = 0; switch (r.type) { @@ -743,25 +860,37 @@ } case R_RISCV_CALL: case R_RISCV_CALL_PLT: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxCall(sec, i, loc, r, remove); break; case R_RISCV_TPREL_HI20: case R_RISCV_TPREL_ADD: case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxTlsLe(sec, i, loc, r, remove); break; case R_RISCV_HI20: case R_RISCV_LO12_I: case R_RISCV_LO12_S: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxHi20Lo12(sec, i, loc, r, remove); break; + case R_RISCV_TLSDESC_HI20: + // For TLSDESC=>LE, we can use the short form if hi20 is zero. + tlsdescRelax = relaxable(relocs, i); + toLeShortForm = tlsdescRelax && r.expr == R_RELAX_TLS_GD_TO_LE && + !hi20(r.sym->getVA(r.addend)); + [[fallthrough]]; + case R_RISCV_TLSDESC_LOAD_LO12: + // For TLSDESC=>LE/IE, AUIPC and L[DW] are removed if relaxable. + if (tlsdescRelax && r.expr != R_TLSDESC_PC) + remove = 4; + break; + case R_RISCV_TLSDESC_ADD_LO12: + if (toLeShortForm) + remove = 4; + break; } // For all anchors whose offsets are <= r.offset, they are preceded by diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/ELF/Relocations.cpp llvm-toolchain-18-18.1.0-rc1/lld/ELF/Relocations.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/ELF/Relocations.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/lld/ELF/Relocations.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -1274,29 +1274,34 @@ if (config->emachine == EM_MIPS) return handleMipsTlsRelocation(type, sym, c, offset, addend, expr); + bool isRISCV = config->emachine == EM_RISCV; if (oneof(expr) && config->shared) { + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a label. Do not + // set NEEDS_TLSDESC on the label. if (expr != R_TLSDESC_CALL) { - sym.setFlags(NEEDS_TLSDESC); + if (!isRISCV || type == R_RISCV_TLSDESC_HI20) + sym.setFlags(NEEDS_TLSDESC); c.addReloc({expr, type, offset, addend, &sym}); } return 1; } // ARM, Hexagon, LoongArch and RISC-V do not support GD/LD to IE/LE - // relaxation. + // optimizations. + // RISC-V supports TLSDESC to IE/LE optimizations. // For PPC64, if the file has missing R_PPC64_TLSGD/R_PPC64_TLSLD, disable - // relaxation as well. - bool toExecRelax = !config->shared && config->emachine != EM_ARM && - config->emachine != EM_HEXAGON && - config->emachine != EM_LOONGARCH && - config->emachine != EM_RISCV && - !c.file->ppc64DisableTLSRelax; + // optimization as well. + bool execOptimize = + !config->shared && config->emachine != EM_ARM && + config->emachine != EM_HEXAGON && config->emachine != EM_LOONGARCH && + !(isRISCV && expr != R_TLSDESC_PC && expr != R_TLSDESC_CALL) && + !c.file->ppc64DisableTLSRelax; // If we are producing an executable and the symbol is non-preemptable, it - // must be defined and the code sequence can be relaxed to use Local-Exec. + // must be defined and the code sequence can be optimized to use Local-Exec. // // ARM and RISC-V do not support any relaxations for TLS relocations, however, // we can omit the DTPMOD dynamic relocations and resolve them at link time @@ -1309,8 +1314,8 @@ // module index, with a special value of 0 for the current module. GOT[e1] is // unused. There only needs to be one module index entry. if (oneof(expr)) { - // Local-Dynamic relocs can be relaxed to Local-Exec. - if (toExecRelax) { + // Local-Dynamic relocs can be optimized to Local-Exec. + if (execOptimize) { c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), type, offset, addend, &sym}); return target->getTlsGdRelaxSkip(type); @@ -1322,16 +1327,17 @@ return 1; } - // Local-Dynamic relocs can be relaxed to Local-Exec. + // Local-Dynamic relocs can be optimized to Local-Exec. if (expr == R_DTPREL) { - if (toExecRelax) + if (execOptimize) expr = target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE); c.addReloc({expr, type, offset, addend, &sym}); return 1; } // Local-Dynamic sequence where offset of tls variable relative to dynamic - // thread pointer is stored in the got. This cannot be relaxed to Local-Exec. + // thread pointer is stored in the got. This cannot be optimized to + // Local-Exec. if (expr == R_TLSLD_GOT_OFF) { sym.setFlags(NEEDS_GOT_DTPREL); c.addReloc({expr, type, offset, addend, &sym}); @@ -1341,14 +1347,18 @@ if (oneof(expr)) { - if (!toExecRelax) { + if (!execOptimize) { sym.setFlags(NEEDS_TLSGD); c.addReloc({expr, type, offset, addend, &sym}); return 1; } - // Global-Dynamic relocs can be relaxed to Initial-Exec or Local-Exec + // Global-Dynamic/TLSDESC can be optimized to Initial-Exec or Local-Exec // depending on the symbol being locally defined or not. + // + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a non-preemptible + // label, so the LE optimization will be categorized as + // R_RELAX_TLS_GD_TO_LE. We fix the categorization in RISCV::relocateAlloc. if (sym.isPreemptible) { sym.setFlags(NEEDS_TLSGD_TO_IE); c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, @@ -1363,9 +1373,9 @@ if (oneof(expr)) { ctx.hasTlsIe.store(true, std::memory_order_relaxed); - // Initial-Exec relocs can be relaxed to Local-Exec if the symbol is locally - // defined. - if (toExecRelax && isLocalInExecutable) { + // Initial-Exec relocs can be optimized to Local-Exec if the symbol is + // locally defined. + if (execOptimize && isLocalInExecutable) { c.addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym}); } else if (expr != R_TLSIE_HINT) { sym.setFlags(NEEDS_TLSIE); @@ -1463,7 +1473,7 @@ in.got->hasGotOffRel.store(true, std::memory_order_relaxed); } - // Process TLS relocations, including relaxing TLS relocations. Note that + // Process TLS relocations, including TLS optimizations. Note that // R_TPREL and R_TPREL_NEG relocations are resolved in processAux. if (sym.isTls()) { if (unsigned processed = diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/test/ELF/riscv-tlsdesc-gd-mixed.s llvm-toolchain-18-18.1.0-rc1/lld/test/ELF/riscv-tlsdesc-gd-mixed.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/test/ELF/riscv-tlsdesc-gd-mixed.s 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/lld/test/ELF/riscv-tlsdesc-gd-mixed.s 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,26 @@ +# REQUIRES: riscv +# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.o +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readobj -r %t.so | FileCheck %s --check-prefix=RELA + +## Both TLSDESC and DTPMOD64/DTPREL64 should be present. +# RELA: .rela.dyn { +# RELA-NEXT: 0x[[#%X,ADDR:]] R_RISCV_TLSDESC a 0x0 +# RELA-NEXT: 0x[[#ADDR+16]] R_RISCV_TLS_DTPMOD64 a 0x0 +# RELA-NEXT: 0x[[#ADDR+24]] R_RISCV_TLS_DTPREL64 a 0x0 +# RELA-NEXT: } + + la.tls.gd a0,a + call __tls_get_addr@plt + +.Ltlsdesc_hi0: + auipc a2, %tlsdesc_hi(a) + ld a3, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a2) + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi0) + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi0) + +.section .tbss,"awT",@nobits +.globl a +.zero 8 +a: +.zero 4 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/test/ELF/riscv-tlsdesc-relax.s llvm-toolchain-18-18.1.0-rc1/lld/test/ELF/riscv-tlsdesc-relax.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/test/ELF/riscv-tlsdesc-relax.s 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/lld/test/ELF/riscv-tlsdesc-relax.s 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,189 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym PAD=0 -mattr=+c,+relax a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym PAD=5000 -mattr=+c,+relax a.s -o aa.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c,+relax c.s -o c.64.o +# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.so | FileCheck %s --check-prefix=GD64 + +## Test the TLSDESC to LE optimization. Also check --emit-relocs. +# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le -z separate-code --emit-relocs +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -hdr a.64.le | FileCheck %s --check-prefix=LE64 +# RUN: ld.lld -e 0 -z now aa.64.o c.64.o -o aa.64.le -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d aa.64.le | FileCheck %s --check-prefix=LE64A + +## Test the TLSDESC to IE optimization. +# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.ie | FileCheck %s --check-prefix=IE64 + +# GD64: .got 00000018 00000000000020c0 +# GD64-LABEL: <_start>: +# GD64-NEXT: jal {{.*}} +# GD64-LABEL: : +## &.got[c]-. = 0x20c0+8 - 0x1004 = 0x10c4 +# GD64: 1004: auipc a2, 0x1 +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: ld a3, 0xc4(a2) +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: addi a0, a2, 0xc4 +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: jalr t0, 0x0(a3) +# GD64-NEXT: c.add a0, tp +# GD64-NEXT: jal {{.*}} +## &.got[c]-. = 0x20c0+8 - 0x1020 = 0x10a8 +# GD64-NEXT: 1020: auipc a4, 0x1 +# GD64-NEXT: ld a5, 0xa8(a4) +# GD64-NEXT: addi a0, a4, 0xa8 +# GD64-NEXT: jalr t0, 0x0(a5) +# GD64-NEXT: c.add a0, tp +## &.got[c]-. = 0x20c0+8 - 0x1032 = 0x1096 +# GD64-NEXT: 1032: auipc a6, 0x1 +# GD64-NEXT: ld a7, 0x96(a6) +# GD64-NEXT: addi a0, a6, 0x96 +# GD64-NEXT: jalr t0, 0x0(a7) +# GD64-NEXT: c.add a0, tp + +# LE64-LABEL: <_start>: +# LE64-NEXT: jal {{.*}} +# LE64-LABEL: : +# LE64-NEXT: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: 11008: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: c.add a0, tp +# LE64-NEXT: jal {{.*}} +# LE64-NEXT: R_RISCV_JAL foo +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi1 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi1 +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi1 +# LE64-NEXT: c.add a0, tp +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi2 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi2 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi2 +# LE64-NEXT: c.add a0, tp + +# LE64A-LABEL: <_start>: +# LE64A-NEXT: jal {{.*}} +# LE64A-LABEL: : +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: 11008: lui a0, 0x2 +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp +# LE64A-NEXT: jal {{.*}} +# LE64A-NEXT: lui a0, 0x2 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp +# LE64A-NEXT: addi zero, zero, 0x0 +# LE64A-NEXT: addi zero, zero, 0x0 +# LE64A-NEXT: lui a0, 0x2 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp + +# IE64: .got 00000010 00000000000120e0 +# IE64-LABEL: <_start>: +# IE64-NEXT: jal {{.*}} +# IE64-LABEL: : +# IE64-NEXT: c.add a7, a7 +# IE64-NEXT: c.add a7, a7 +## &.got[c]-. = 0x120e0+8 - 0x11008 = 0x10e0 +# IE64-NEXT: 11008: auipc a0, 0x1 +# IE64-NEXT: c.add a7, a7 +# IE64-NEXT: ld a0, 0xe0(a0) +# IE64-NEXT: c.add a0, tp +# IE64-NEXT: jal {{.*}} +## &.got[c]-. = 0x120e0+8 - 0x11018 = 0x10d0 +# IE64-NEXT: 11018: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xd0(a0) +# IE64-NEXT: c.add a0, tp +## &.got[c]-. = 0x120e0+8 - 0x1102a = 0x10be +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: 1102a: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xbe(a0) +# IE64-NEXT: c.add a0, tp + +#--- a.s +.globl _start +_start: +.balign 16 + call foo + +foo: +.Ltlsdesc_hi0: +.option norelax +## All 4 instructions have an R_RISCV_RELAX. +## Check that optimization/relaxation are not affected by irrelevant instructions. + auipc a2, %tlsdesc_hi(b) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + ld a3, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a2) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi0) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi0) + .reloc .-4, R_RISCV_RELAX, 0 + add a0, a0, tp +.option relax + + call foo + +.Ltlsdesc_hi1: +.option norelax +## AUIPC has an R_RISCV_RELAX. We perform relaxation, ignoring whether other +## instructions have R_RISCV_RELAX. + auipc a4, %tlsdesc_hi(b) + .reloc .-4, R_RISCV_RELAX, 0 + ld a5, %tlsdesc_load_lo(.Ltlsdesc_hi1)(a4) + addi a0, a4, %tlsdesc_add_lo(.Ltlsdesc_hi1) + jalr t0, 0(a5), %tlsdesc_call(.Ltlsdesc_hi1) + add a0, a0, tp +.option relax + +.Ltlsdesc_hi2: +.option norelax +## AUIPC does not have R_RISCV_RELAX. No relaxation. + auipc a6, %tlsdesc_hi(b) + ld a7, %tlsdesc_load_lo(.Ltlsdesc_hi2)(a6) + .reloc .-4, R_RISCV_RELAX, 0 + addi a0, a6, %tlsdesc_add_lo(.Ltlsdesc_hi2) + .reloc .-4, R_RISCV_RELAX, 0 + jalr t0, 0(a7), %tlsdesc_call(.Ltlsdesc_hi2) + add a0, a0, tp +.option relax + +.section .tbss +.globl a +.zero 8 +a: +.zero 2039+PAD ## Place b at 0x7ff+PAD + +#--- c.s +.tbss +.globl b +b: +.zero 4 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/test/ELF/riscv-tlsdesc.s llvm-toolchain-18-18.1.0-rc1/lld/test/ELF/riscv-tlsdesc.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/lld/test/ELF/riscv-tlsdesc.s 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/lld/test/ELF/riscv-tlsdesc.s 2024-01-30 07:40:21.000000000 +0000 @@ -0,0 +1,194 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=riscv64 a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 c.s -o c.64.o +# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so +# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym ELF32=1 a.s -o a.32.o +# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym ELF32=1 c.s -o c.32.o +# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so +# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.so | FileCheck %s --check-prefix=GD64 + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel +# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s + +# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le +# RUN: llvm-readelf -r a.64.le | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.le | FileCheck %s --check-prefix=LE64 + +# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie +# RUN: llvm-readobj -r a.64.ie | FileCheck --check-prefix=IE64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.ie | FileCheck %s --check-prefix=IE64 + +## 32-bit code is mostly the same. We only test a few variants. The IE optimization uses the LW instruction. + +# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel +# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s +# RUN: ld.lld -e 0 -z now a.32.o c.32.so -o a.32.ie +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.32.ie | FileCheck %s --check-prefix=IE32 + +# GD64-RELA: .rela.dyn { +# GD64-RELA-NEXT: 0x2408 R_RISCV_TLSDESC - 0x7FF +# GD64-RELA-NEXT: 0x23E8 R_RISCV_TLSDESC a 0x0 +# GD64-RELA-NEXT: 0x23F8 R_RISCV_TLSDESC c 0x0 +# GD64-RELA-NEXT: } +# GD64-RELA: Hex dump of section '.got': +# GD64-RELA-NEXT: 0x000023e0 20230000 00000000 00000000 00000000 # +# GD64-RELA-NEXT: 0x000023f0 00000000 00000000 00000000 00000000 . + +# GD64-REL: .rel.dyn { +# GD64-REL-NEXT: 0x23F0 R_RISCV_TLSDESC - +# GD64-REL-NEXT: 0x23D0 R_RISCV_TLSDESC a +# GD64-REL-NEXT: 0x23E0 R_RISCV_TLSDESC c +# GD64-REL-NEXT: } +# GD64-REL: Hex dump of section '.got': +# GD64-REL-NEXT: 0x000023c8 08230000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023d8 00000000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023e8 00000000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023f8 ff070000 00000000 . + +# GD32-REL: .rel.dyn { +# GD32-REL-NEXT: 0x2274 R_RISCV_TLSDESC - +# GD32-REL-NEXT: 0x2264 R_RISCV_TLSDESC a +# GD32-REL-NEXT: 0x226C R_RISCV_TLSDESC c +# GD32-REL-NEXT: } +# GD32-REL: Hex dump of section '.got': +# GD32-REL-NEXT: 0x00002260 00220000 00000000 00000000 00000000 . +# GD32-REL-NEXT: 0x00002270 00000000 00000000 ff070000 . + +# GD64: .got 00000038 00000000000023e0 + +## &.got[a]-. = 0x23e0+8 - 0x12e0 = 0x1108 +# GD64: 12e0: auipc a0, 0x1 +# GD64-NEXT: ld a1, 0x108(a0) +# GD64-NEXT: addi a0, a0, 0x108 +# GD64-NEXT: jalr t0, 0x0(a1) +# GD64-NEXT: add a0, a0, tp + +## &.got[b]-. = 0x23e0+40 - 0x12f4 = 0x1114 +# GD64-NEXT: 12f4: auipc a2, 0x1 +# GD64-NEXT: ld a3, 0x114(a2) +# GD64-NEXT: addi a0, a2, 0x114 +# GD64-NEXT: jalr t0, 0x0(a3) +# GD64-NEXT: add a0, a0, tp + +## &.got[c]-. = 0x23e0+24 - 0x1308 = 0x10f0 +# GD64-NEXT: 1308: auipc a4, 0x1 +# GD64-NEXT: ld a5, 0xf0(a4) +# GD64-NEXT: addi a0, a4, 0xf0 +# GD64-NEXT: jalr t0, 0x0(a5) +# GD64-NEXT: add a0, a0, tp + +# NOREL: no relocations + +# LE64-LABEL: <.text>: +## st_value(a) = 8 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi a0, zero, 0x8 +# LE64-NEXT: add a0, a0, tp +## st_value(b) = 2047 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: add a0, a0, tp +## st_value(c) = 2048 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: lui a0, 0x1 +# LE64-NEXT: addi a0, a0, -0x800 +# LE64-NEXT: add a0, a0, tp + +# IE64-RELA: .rela.dyn { +# IE64-RELA-NEXT: 0x123B0 R_RISCV_TLS_TPREL64 c 0x0 +# IE64-RELA-NEXT: } + +# IE64: .got 00000010 00000000000123a8 + +## a and b are optimized to use LE. c is optimized to IE. +# IE64-LABEL: <.text>: +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi a0, zero, 0x8 +# IE64-NEXT: add a0, a0, tp +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi a0, zero, 0x7ff +# IE64-NEXT: add a0, a0, tp +## &.got[c]-. = 0x123a8+8 - 0x112b8 = 0x10f8 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: 112b8: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xf8(a0) +# IE64-NEXT: add a0, a0, tp + +# IE32: .got 00000008 00012248 + +# IE32-LABEL: <.text>: +## st_value(a) = 8 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi a0, zero, 0x8 +# IE32-NEXT: add a0, a0, tp +## st_value(b) = 2047 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi a0, zero, 0x7ff +# IE32-NEXT: add a0, a0, tp +## &.got[c]-. = 0x12248+4 - 0x111cc = 0x1080 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: 111cc: auipc a0, 0x1 +# IE32-NEXT: lw a0, 0x80(a0) +# IE32-NEXT: add a0, a0, tp + +#--- a.s +.macro load dst, src +.ifdef ELF32 +lw \dst, \src +.else +ld \dst, \src +.endif +.endm + +.Ltlsdesc_hi0: + auipc a0, %tlsdesc_hi(a) + load a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) + addi a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0) + jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) + add a0, a0, tp + +.Ltlsdesc_hi1: + auipc a2, %tlsdesc_hi(b) + load a3, %tlsdesc_load_lo(.Ltlsdesc_hi1)(a2) + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi1) + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi1) + add a0, a0, tp + +.Ltlsdesc_hi2: + auipc a4, %tlsdesc_hi(c) + load a5, %tlsdesc_load_lo(.Ltlsdesc_hi2)(a4) + addi a0, a4, %tlsdesc_add_lo(.Ltlsdesc_hi2) + jalr t0, 0(a5), %tlsdesc_call(.Ltlsdesc_hi2) + add a0, a0, tp + +.section .tbss +.globl a +.zero 8 +a: +.zero 2039 ## Place b at 0x7ff +b: +.zero 1 + +#--- c.s +.tbss +.globl c +c: .zero 4 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/cmake/modules/LLVMConfig.cmake.in llvm-toolchain-18-18.1.0-rc1/llvm/cmake/modules/LLVMConfig.cmake.in --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/cmake/modules/LLVMConfig.cmake.in 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/cmake/modules/LLVMConfig.cmake.in 2024-01-30 07:40:21.000000000 +0000 @@ -90,6 +90,11 @@ find_package(CURL) endif() +set(LLVM_ENABLE_HTTPLIB @LLVM_ENABLE_HTTPLIB@) +if(LLVM_ENABLE_HTTPLIB) + find_package(httplib) +endif() + set(LLVM_WITH_Z3 @LLVM_WITH_Z3@) set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm-toolchain-18-18.1.0-rc1/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 2024-01-30 07:40:29.000000000 +0000 @@ -2601,6 +2601,11 @@ [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; +def int_amdgcn_s_wait_event_export_ready : + ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] +>; + // WMMA (Wave Matrix Multiply-Accumulate) intrinsics // // These operations perform a matrix multiplication and accumulation of @@ -2608,10 +2613,10 @@ class AMDGPUWmmaIntrinsic : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] @@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsicOPSEL : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C - llvm_i1_ty, // %high + llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 ], [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; class AMDGPUWmmaIntrinsicIU : Intrinsic< - [CD], // %D + [CD], // %D [ llvm_i1_ty, // %A_sign AB, // %A llvm_i1_ty, // %B_sign - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C llvm_i1_ty, // %clamp ], [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; -// The regular, untied f16/bf16 wmma intrinsics only write to one half -// of the registers (set via the op_sel bit). -// The content of the other 16-bit of the registers is undefined. -def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; -// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix -// registers to the input accumulator registers. -// Essentially, the content of the other 16-bit is preserved from the input. -def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; -def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; +// WMMA GFX11Only -def int_amdgcn_s_wait_event_export_ready : - ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, - Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] ->; +// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. +// The content of the other 16-bit half is preserved from the input. +def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; + +// WMMA GFX11Plus + +def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; +def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; + +// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The content of the other 16-bit half is undefined. +// GFX12: The op_sel bit must be 0. +def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; //===----------------------------------------------------------------------===// // GFX12 Intrinsics @@ -2681,6 +2687,65 @@ [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; + +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C . + +// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. +def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic; +// A and B are <16 x iu4>. +def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU; + +// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics +// +// These operations perform a sparse matrix multiplication and accumulation of +// the form: D = A * B + C. +// A is sparse matrix, half the size of B, and is expanded using sparsity index. + +class AMDGPUSWmmacIntrinsicIdx : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + LLVMMatchType<0>, // %C + Index // %Sparsity index for A + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +class AMDGPUSWmmacIntrinsicIUIdx : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_sign + A, // %A + llvm_i1_ty, // %B_sign + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>] +>; + +def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx; + def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn; @@ -2712,6 +2777,10 @@ def int_amdgcn_global_load_tr : AMDGPULoadTr; +// i32 @llvm.amdgcn.wave.id() +def int_amdgcn_wave_id : + DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/include/llvm/Target/TargetInstrPredicate.td llvm-toolchain-18-18.1.0-rc1/llvm/include/llvm/Target/TargetInstrPredicate.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/include/llvm/Target/TargetInstrPredicate.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/include/llvm/Target/TargetInstrPredicate.td 2024-01-30 07:40:21.000000000 +0000 @@ -152,6 +152,34 @@ string ImmVal = Value; } +// Check that the operand at position `Index` is less than `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandLT : CheckOperandBase { + int ImmVal = Imm; +} + +// Check that the operand at position `Index` is greater than `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandGT : CheckOperandBase { + int ImmVal = Imm; +} + +// Check that the operand at position `Index` is less than or equal to `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandLE : CheckNot>; + +// Check that the operand at position `Index` is greater than or equal to `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandGE : CheckNot>; + // Expands to a call to `FunctionMapper` if field `FunctionMapper` is set. // Otherwise, it expands to a CheckNot>. class CheckRegOperandSimple : CheckOperandBase; @@ -203,6 +231,12 @@ class CheckAny Sequence> : CheckPredicateSequence; +// Check that the operand at position `Index` is in range [Start, End]. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against range [Start, End]. +class CheckImmOperandRange + : CheckAll<[CheckImmOperandGE, CheckImmOperandLE]>; // Used to expand the body of a function predicate. See the definition of // TIIPredicate below. diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/include/llvm/TargetParser/Triple.h llvm-toolchain-18-18.1.0-rc1/llvm/include/llvm/TargetParser/Triple.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/include/llvm/TargetParser/Triple.h 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/include/llvm/TargetParser/Triple.h 2024-01-30 07:40:26.000000000 +0000 @@ -1033,11 +1033,11 @@ isWindowsCygwinEnvironment() || isOHOSFamily(); } - /// Tests whether the target uses TLS Descriptor by default. + /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC + /// is enabled by default. bool hasDefaultTLSDESC() const { // TODO: Improve check for other platforms, like Android, and RISC-V - // Note: This is currently only used on RISC-V. - return isOSBinFormatELF() && isAArch64(); + return false; } /// Tests whether the target uses -data-sections as default. diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 2024-01-30 07:40:26.000000000 +0000 @@ -4098,16 +4098,6 @@ return MI.getOperand(Idx); } -const MachineOperand & -AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case AArch64::LDRBBroX: - return MI.getOperand(4); - } -} - static const TargetRegisterClass *getRegClass(const MachineInstr &MI, Register Reg) { if (MI.getParent() == nullptr) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AArch64/AArch64InstrInfo.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AArch64/AArch64InstrInfo.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AArch64/AArch64InstrInfo.h 2024-01-03 13:06:20.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AArch64/AArch64InstrInfo.h 2024-01-30 07:40:26.000000000 +0000 @@ -111,9 +111,6 @@ /// Returns the immediate offset operator of a load/store. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); - /// Returns the shift amount operator of a load/store. - static const MachineOperand &getLdStAmountOp(const MachineInstr &MI); - /// Returns whether the instruction is FP or NEON. static bool isFpOrNEON(const MachineInstr &MI); diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp 2024-01-11 22:50:54.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp 2024-01-30 07:40:26.000000000 +0000 @@ -62,8 +62,6 @@ "Number of load/store from unscaled generated"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -STATISTIC(NumConstOffsetFolded, - "Number of const offset of index address folded"); DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); @@ -77,11 +75,6 @@ static cl::opt UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); -// The LdStConstLimit limits how far we search for const offset instructions -// when we form index address load/store instructions. -static cl::opt LdStConstLimit("aarch64-load-store-const-scan-limit", - cl::init(10), cl::Hidden); - // Enable register renaming to find additional store pairing opportunities. static cl::opt EnableRenaming("aarch64-load-store-renaming", cl::init(true), cl::Hidden); @@ -178,13 +171,6 @@ findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit); - // Scan the instruction list to find a register assigned with a const - // value that can be combined with the current instruction (a load or store) - // using base addressing with writeback. Scan forwards. - MachineBasicBlock::iterator - findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit, - unsigned &Offset); - // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan backwards. @@ -196,19 +182,11 @@ bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); - bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI, - unsigned IndexReg, unsigned &Offset); - // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx); - MachineBasicBlock::iterator - mergeConstOffsetInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update, unsigned Offset, - int Scale); - // Find and merge zero store instructions. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); @@ -221,9 +199,6 @@ // Find and merge a base register updates before or after a ld/st instruction. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); - // Find and merge a index ldr/st instructions into a base ld/st instruction. - bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); - bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -506,16 +481,6 @@ } } -static unsigned getBaseAddressOpcode(unsigned Opc) { - // TODO: Add more index address loads/stores. - switch (Opc) { - default: - llvm_unreachable("Opcode has no base address equivalent!"); - case AArch64::LDRBBroX: - return AArch64::LDRBBui; - } -} - static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: @@ -757,20 +722,6 @@ } } -// Make sure this is a reg+reg Ld/St -static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - return false; - // Scaled instructions. - // TODO: Add more index address loads/stores. - case AArch64::LDRBBroX: - Scale = 1; - return true; - } -} - static bool isRewritableImplicitDef(unsigned Opc) { switch (Opc) { default: @@ -2097,63 +2048,6 @@ return NextI; } -MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update, - unsigned Offset, int Scale) { - assert((Update->getOpcode() == AArch64::MOVKWi) && - "Unexpected const mov instruction to merge!"); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator NextI = next_nodbg(I, E); - MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); - MachineInstr &MemMI = *I; - unsigned Mask = (1 << 12) * Scale - 1; - unsigned Low = Offset & Mask; - unsigned High = Offset - Low; - Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); - Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); - MachineInstrBuilder AddMIB, MemMIB; - - // Add IndexReg, BaseReg, High (the BaseReg may be SP) - AddMIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) - .addDef(IndexReg) - .addUse(BaseReg) - .addImm(High >> 12) // shifted value - .addImm(12); // shift 12 - (void)AddMIB; - // Ld/St DestReg, IndexReg, Imm12 - unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); - MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .add(getLdStRegOp(MemMI)) - .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) - .addImm(Low / Scale) - .setMemRefs(I->memoperands()) - .setMIFlags(I->mergeFlagsWith(*Update)); - (void)MemMIB; - - ++NumConstOffsetFolded; - LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); - LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); - LLVM_DEBUG(PrevI->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(Update->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(I->print(dbgs())); - LLVM_DEBUG(dbgs() << " with instruction:\n "); - LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); - LLVM_DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - PrevI->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset) { @@ -2201,31 +2095,6 @@ return false; } -bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, - MachineInstr &MI, - unsigned IndexReg, - unsigned &Offset) { - // The update instruction source and destination register must be the - // same as the load/store index register. - if (MI.getOpcode() == AArch64::MOVKWi && - TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { - - // movz + movk hold a large offset of a Ld/St instruction. - MachineBasicBlock::iterator B = MI.getParent()->begin(); - MachineBasicBlock::iterator MBBI = &MI; - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MovzMI = *MBBI; - if (MovzMI.getOpcode() == AArch64::MOVZWi) { - unsigned Low = MovzMI.getOperand(1).getImm(); - unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); - Offset = High + Low; - // 12-bit optionally shifted immediates are legal for adds. - return Offset >> 24 == 0; - } - } - return false; -} - MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -2381,60 +2250,6 @@ return E; } -MachineBasicBlock::iterator -AArch64LoadStoreOpt::findMatchingConstOffsetBackward( - MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) { - MachineBasicBlock::iterator B = I->getParent()->begin(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr &MemMI = *I; - MachineBasicBlock::iterator MBBI = I; - - // If the load is the first instruction in the block, there's obviously - // not any matching load or store. - if (MBBI == B) - return E; - - // Make sure the IndexReg is killed and the shift amount is zero. - // TODO: Relex this restriction to extend, simplify processing now. - if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() || - !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() || - (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0)) - return E; - - Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); - - // Track which register units have been modified and used between the first - // insn (inclusive) and the second insn. - ModifiedRegUnits.clear(); - UsedRegUnits.clear(); - unsigned Count = 0; - do { - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MI = *MBBI; - - // Don't count transient instructions towards the search limit since there - // may be different numbers of them if e.g. debug information is present. - if (!MI.isTransient()) - ++Count; - - // If we found a match, return it. - if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { - return MBBI; - } - - // Update the status of what the instruction clobbered and used. - LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); - - // Otherwise, if the index register is used or modified, we have no match, - // so return early. - if (!ModifiedRegUnits.available(IndexReg) || - !UsedRegUnits.available(IndexReg)) - return E; - - } while (MBBI != B && Count < Limit); - return E; -} - bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -2619,34 +2434,6 @@ return false; } -bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, - int Scale) { - MachineInstr &MI = *MBBI; - MachineBasicBlock::iterator E = MI.getParent()->end(); - MachineBasicBlock::iterator Update; - - // Don't know how to handle unscaled pre/post-index versions below, so bail. - if (TII->hasUnscaledLdStOffset(MI.getOpcode())) - return false; - - // Look back to try to find a const offset for index LdSt instruction. For - // example, - // mov x8, #LargeImm ; = a * (1<<12) + imm12 - // ldr x1, [x0, x8] - // merged into: - // add x8, x0, a * (1<<12) - // ldr x1, [x8, imm12] - unsigned Offset; - Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset); - if (Update != E && (Offset & (Scale - 1)) == 0) { - // Merge the imm12 into the ld/st. - MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale); - return true; - } - - return false; -} - bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { @@ -2723,22 +2510,6 @@ Modified = true; else ++MBBI; - } - - // 5) Find a register assigned with a const value that can be combined with - // into the load or store. e.g., - // mov x8, #LargeImm ; = a * (1<<12) + imm12 - // ldr x1, [x0, x8] - // ; becomes - // add x8, x0, a * (1<<12) - // ldr x1, [x8, imm12] - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - int Scale; - if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale)) - Modified = true; - else - ++MBBI; } return Modified; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPU.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPU.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPU.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPU.td 2024-01-30 07:40:29.000000000 +0000 @@ -1506,6 +1506,7 @@ FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, + FeatureFP8ConversionInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUGISel.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUGISel.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUGISel.td 2024-01-30 07:40:21.000000000 +0000 @@ -59,6 +59,30 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_wmmavisrc : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16Neg : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16NegAbs : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex8 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex16 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -3048,6 +3048,336 @@ return true; } +static MachineSDNode *buildRegSequence32(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + unsigned DstRegClass; + EVT DstTy; + switch (Elts.size()) { + case 8: + DstRegClass = AMDGPU::VReg_256RegClassID; + DstTy = MVT::v8i32; + break; + case 4: + DstRegClass = AMDGPU::VReg_128RegClassID; + DstTy = MVT::v4i32; + break; + case 2: + DstRegClass = AMDGPU::VReg_64RegClassID; + DstTy = MVT::v2i32; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + SmallVector Ops; + Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32)); + for (unsigned i = 0; i < Elts.size(); ++i) { + Ops.push_back(Elts[i]); + Ops.push_back(CurDAG->getTargetConstant( + SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32)); + } + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops); +} + +static MachineSDNode *buildRegSequence16(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + SmallVector PackedElts; + assert("unhandled Reg sequence size" && + (Elts.size() == 8 || Elts.size() == 16)); + + // Pack 16-bit elements in pairs into 32-bit register. If both elements are + // unpacked from 32-bit source use it, otherwise pack them using v_perm. + for (unsigned i = 0; i < Elts.size(); i += 2) { + SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i])); + SDValue HiSrc; + if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) { + PackedElts.push_back(HiSrc); + } else { + SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32); + MachineSDNode *Packed = + CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32, + {Elts[i + 1], Elts[i], PackLoLo}); + PackedElts.push_back(SDValue(Packed, 0)); + } + } + + return buildRegSequence32(PackedElts, CurDAG, DL); +} + +static MachineSDNode *buildRegSequence(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL, unsigned ElementSize) { + if (ElementSize == 16) + return buildRegSequence16(Elts, CurDAG, DL); + if (ElementSize == 32) + return buildRegSequence32(Elts, CurDAG, DL); + llvm_unreachable("Unhandled element size"); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, SDValue &Src, + llvm::SelectionDAG *CurDAG, const SDLoc &DL, + unsigned ElementSize) { + if (ModOpcode == ISD::FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + if (El.getOpcode() != ISD::FABS) + break; + NegAbsElts.push_back(El->getOperand(0)); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0); + } + } else { + assert(ModOpcode == ISD::FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } +} + +// Check all f16 elements for modifiers while looking through b32 and v2b16 +// build vector, stop if element does not satisfy ModifierCheck. +static void +checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, + std::function ModifierCheck) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + if (auto *F16Pair = + dyn_cast(stripBitcast(BV->getOperand(i)))) { + for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) { + SDValue ElF16 = stripBitcast(F16Pair->getOperand(i)); + if (!ModifierCheck(ElF16)) + break; + } + } + } +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + + checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool { + if (Element.getOpcode() != ISD::FNEG) + return false; + EltsF16.push_back(Element.getOperand(0)); + return true; + }); + + // All elements have neg modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) { + Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (ElV2f16.getOpcode() != ISD::FNEG) + break; + EltsV2F16.push_back(ElV2f16.getOperand(0)); + } + + // All pairs of elements have neg modifier + if (BV->getNumOperands() == EltsV2F16.size()) { + Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool { + // Based on first element decide which mod we match, neg or abs + if (EltsF16.empty()) + ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF16.getOpcode() != ModOpcode) + return false; + EltsF16.push_back(ElF16.getOperand(0)); + return true; + }); + + // All elements have ModOpcode modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In), + 16); + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElV2f16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2f16->getOperand(0)); + } + + // All elements have ModOpcode modifier + if (BV->getNumOperands() == EltsV2F16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (auto *BV = dyn_cast(stripBitcast(In))) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElF32 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF32.getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32.getOperand(0)); + } + + // All elements had ModOpcode modifier + if (BV->getNumOperands() == EltsF32.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const { + if (auto *BV = dyn_cast(In)) { + BitVector UndefElements; + if (SDValue Splat = BV->getSplatValue(&UndefElements)) + if (isInlineImmediate(Splat.getNode())) { + if (const ConstantSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getAPIntValue().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + if (const ConstantFPSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + llvm_unreachable("unhandled Constant node"); + } + } + + // 16 bit splat + SDValue SplatSrc32 = stripBitcast(In); + if (auto *SplatSrc32BV = dyn_cast(SplatSrc32)) { + if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) { + SDValue SplatSrc16 = stripBitcast(Splat32); + if (auto *SplatSrc16BV = dyn_cast(SplatSrc16)) { + if (SDValue Splat = SplatSrc16BV->getSplatValue()) { + + // f16 + if (isInlineImmediate(Splat.getNode())) { + const ConstantFPSDNode *C = dyn_cast(Splat); + int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16); + return true; + } + + // bf16 + if (const ConstantSDNode *C = dyn_cast(Splat)) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + APInt BF16Value = C->getAPIntValue(); + APInt F32Value = BF16Value.zext(32).shl(16); + if (TII->isInlineConstant(F32Value)) { + int64_t Imm = F32Value.getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + } + } + } + } + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() % 8 == 0) { + Key = ShiftAmt->getZExtValue() / 8; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() == 16) { + Key = 1; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h 2024-01-30 07:40:21.000000000 +0000 @@ -240,6 +240,16 @@ bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAVISrc(SDValue In, SDValue &Src) const; + + bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -3956,6 +3956,219 @@ }}; } +static Register buildRegSequence(SmallVectorImpl &Elts, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + const TargetRegisterClass *DstRegClass; + switch (Elts.size()) { + case 8: + DstRegClass = &AMDGPU::VReg_256RegClass; + break; + case 4: + DstRegClass = &AMDGPU::VReg_128RegClass; + break; + case 2: + DstRegClass = &AMDGPU::VReg_64RegClass; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + MachineIRBuilder B(*InsertPt); + auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(MRI.createVirtualRegister(DstRegClass)); + for (unsigned i = 0; i < Elts.size(); ++i) { + MIB.addReg(Elts[i]); + MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); + } + return MIB->getOperand(0).getReg(); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, Register &Src, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + if (ModOpcode == TargetOpcode::G_FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + Register FabsSrc; + if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) + break; + NegAbsElts.push_back(FabsSrc); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = buildRegSequence(Elts, InsertPt, MRI); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(NegAbsElts, InsertPt, MRI); + } + } else { + assert(ModOpcode == TargetOpcode::G_FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(Elts, InsertPt, MRI); + } +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (GBuildVector *BV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < BV->getNumSources(); ++i) { + MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElF32->getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (BV->getNumSources() == EltsF32.size()) { + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + Register FNegSrc; + if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) + break; + EltsV2F16.push_back(FNegSrc); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElV2F16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + MachineIRBuilder B(*Root.getParent()); + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { + std::optional FPValReg; + if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { + if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) { + return {{[=](MachineInstrBuilder &MIB) { + MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); + }}}; + } + // Non-inlineable splat floats should not fall-through for integer immediate + // checks. + return {}; + } + + APInt ICst; + if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { + if (TII.isInlineConstant(ICst)) { + return { + {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; + } + } + + return {}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() % 8 == 0) { + Key = ShiftAmt->Value.getZExtValue() / 8; + Src = ShiftSrc; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { + + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() == 16) { + Src = ShiftSrc; + Key = 1; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h 2024-01-30 07:40:21.000000000 +0000 @@ -200,6 +200,19 @@ selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectWMMAModsF32NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16Neg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAVISrc(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex8(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex16(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -4178,10 +4178,45 @@ Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); - const ArgDescriptor *Arg; + const ArgDescriptor *Arg = nullptr; const TargetRegisterClass *ArgRC; LLT ArgTy; - std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (ArgType) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Arg = &WorkGroupIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Arg = &WorkGroupIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Arg = &WorkGroupIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Arg) + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); if (!Arg) { if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { @@ -6848,6 +6883,21 @@ return true; } +bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, + MachineIRBuilder &B) const { + // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. + if (!ST.hasArchitectedSGPRs()) + return false; + LLT S32 = LLT::scalar(32); + Register DstReg = MI.getOperand(0).getReg(); + auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); + auto LSB = B.buildConstant(S32, 25); + auto Width = B.buildConstant(S32, 5); + B.buildUbfx(DstReg, TTMP8, LSB, Width); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -6970,6 +7020,8 @@ case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return legalizeWaveID(MI, B); case Intrinsic::amdgcn_lds_kernel_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::LDS_KERNEL_ID); @@ -7134,6 +7186,29 @@ return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + Register Index = MI.getOperand(7).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h 2024-01-30 07:40:21.000000000 +0000 @@ -212,6 +212,7 @@ bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -4505,6 +4505,22 @@ case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 2024-01-30 07:40:21.000000000 +0000 @@ -414,6 +414,22 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -151,6 +151,8 @@ ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyIndexKey8bit, + ImmTyIndexKey16bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -383,6 +385,8 @@ bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } + bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } + bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -656,6 +660,14 @@ return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16); + } + + bool isVISrc_64B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + bool isVISrc_64B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); } @@ -672,6 +684,14 @@ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); } + bool isVISrc_256B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_256F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + bool isVISrc_256B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); } @@ -1047,6 +1067,8 @@ case ImmTyOffset1: OS << "Offset1"; break; case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break; case ImmTyCPol: OS << "CPol"; break; + case ImmTyIndexKey8bit: OS << "index_key"; break; + case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1604,6 +1626,11 @@ ParseStatus parseRegWithFPInputMods(OperandVector &Operands); ParseStatus parseRegWithIntInputMods(OperandVector &Operands); ParseStatus parseVReg32OrOff(OperandVector &Operands); + ParseStatus tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy); + ParseStatus parseIndexKey8bit(OperandVector &Operands); + ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, @@ -1784,6 +1811,8 @@ void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -3500,6 +3529,9 @@ return !isInlineConstant(Inst, OpIdx); } else if (MO.isReg()) { auto Reg = MO.getReg(); + if (!Reg) { + return false; + } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); auto PReg = mc2PseudoReg(Reg); return isSGPR(PReg, TRI) && PReg != SGPR_NULL; @@ -4364,7 +4396,11 @@ uint64_t TSFlags = MII.get(Opc).TSFlags; // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2) - if (!(TSFlags & SIInstrFlags::IsDOT)) + // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1) + // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1) + // other wmma/swmmac instructions don't have neg_lo/neg_hi operand. + if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) && + !(TSFlags & SIInstrFlags::IsSWMMAC)) return true; int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); @@ -6465,6 +6501,33 @@ return true; } +ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy) { + const char *Pref = "index_key"; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + auto Res = parseIntWithPrefix(Pref, ImmVal); + if (!Res.isSuccess()) + return Res; + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy)); + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit); +} + +ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -8303,12 +8366,20 @@ const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || - Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) { + Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods Inst.addOperand(Inst.getOperand(0)); } - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) { + // Adding vdst_in operand is already covered for these DPP instructions in + // cvtVOP3DPP. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); } @@ -8329,10 +8400,12 @@ } int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); - if (NegLoIdx != -1) { + if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegHiIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); - } const int Ops[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, @@ -8352,11 +8425,11 @@ if (OpSelHiIdx != -1) OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); - if (NegLoIdx != -1) { - int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegLoIdx != -1) NegLo = Inst.getOperand(NegLoIdx).getImm(); + + if (NegHiIdx != -1) NegHi = Inst.getOperand(NegHiIdx).getImm(); - } for (int J = 0; J < 3; ++J) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); @@ -8392,6 +8465,43 @@ cvtVOP3P(Inst, Operands, OptIdx); } +static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands, + unsigned i, unsigned Opc, unsigned OpName) { + if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2); + else + ((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1); +} + +void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers); + addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers); + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef + ((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2 + + OptionalImmIndexMap OptIdx; + for (unsigned i = 5; i < Operands.size(); ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + OptIdx[Op.getImmTy()] = i; + } + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey8bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey16bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + + cvtVOP3P(Inst, Operands, OptIdx); +} + //===----------------------------------------------------------------------===// // VOPD //===----------------------------------------------------------------------===// @@ -8770,6 +8880,22 @@ } } + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + if (VdstInIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(Inst.getOperand(0)); + } + + bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; + if (IsVOP3CvtSrDpp) { + if (Src2ModIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(0)); + } + } + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); if (TiedTo != -1) { diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) @@ -704,6 +708,10 @@ break; Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS); } while (false); if (Res && AMDGPU::isMAC(MI.getOpcode())) { @@ -712,6 +720,13 @@ AMDGPU::OpName::src2_modifiers); } + if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + // Insert dummy unused src2_modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); @@ -942,6 +957,7 @@ // first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || @@ -951,6 +967,15 @@ if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { @@ -977,6 +1002,15 @@ if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -1716,14 +1716,14 @@ } bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { - if (!SIInstrInfo::isWMMA(*MI)) + if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { - if (!SIInstrInfo::isWMMA(I)) + auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) return false; // Src0 or Src1 of the current wmma instruction overlaps with the dest of @@ -1753,6 +1753,7 @@ const MachineOperand *Src2Mods = TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); const bool NoSrc2Mods = + !Src2Mods || (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; // Exception: there is no hazard if the wmma instructions are of the same // type and there is no input modifier on src2 of the current instruction. @@ -1760,6 +1761,18 @@ TII->pseudoToMCOpcode(MI->getOpcode()))); } + // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) + // but Index can't overlap with PrevDstReg. + if (AMDGPU::isGFX12Plus(ST)) { + if (SIInstrInfo::isSWMMAC(*MI)) { + const Register CurIndex = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(PrevDstReg, CurIndex)) + return true; + } + return false; + } + return false; }; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -1275,6 +1275,23 @@ (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Print three values of neg/opsel for wmma instructions (prints 0 when there + // is no src_modifier operand instead of not printing anything). + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC || + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) { + NumOps = 0; + int DefaultValue = Mod == SISrcMods::OP_SEL_1; + for (int OpName : + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx != -1) + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + else + Ops[NumOps++] = DefaultValue; + } + } + const bool HasDstSel = NumOps > 0 && Mod == SISrcMods::OP_SEL_0 && @@ -1305,6 +1322,16 @@ const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Opc = MI->getOpcode(); + if (isCvt_F32_Fp8_Bf8_e64(Opc)) { + auto SrcMod = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + unsigned Mod = MI->getOperand(SrcMod).getImm(); + unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0); + unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1); + if (Index0 || Index1) + O << " op_sel:[" << Index0 << ',' << Index1 << ']'; + return; + } if (isPermlane16(Opc)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); @@ -1336,6 +1363,26 @@ printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } +void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + +void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h 2024-01-30 07:40:21.000000000 +0000 @@ -139,6 +139,10 @@ const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIDefines.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIDefines.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIDefines.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIDefines.h 2024-01-30 07:40:21.000000000 +0000 @@ -167,6 +167,9 @@ // ds_gws_* instructions. GWS = UINT64_C(1) << 62, + + // Is a SWMMAC instruction. + IsSWMMAC = UINT64_C(1) << 63, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -208,6 +208,7 @@ assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || + (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) || (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -2072,11 +2072,45 @@ const SIMachineFunctionInfo &MFI, EVT VT, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { - const ArgDescriptor *Reg; + const ArgDescriptor *Reg = nullptr; const TargetRegisterClass *RC; LLT Ty; - std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); + CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (PVID) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Reg = &WorkGroupIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Reg = &WorkGroupIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Reg = &WorkGroupIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Reg) + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); if (!Reg) { if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { // It's possible for a kernarg intrinsic call to appear in a kernel with @@ -2505,28 +2539,24 @@ } } - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (!HasArchitectedSGPRs) { + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDZ()) { + Register Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); - - CCInfo.AllocateReg(Reg); + CCInfo.AllocateReg(Reg); + } } if (Info.hasWorkGroupInfo()) { @@ -7890,6 +7920,17 @@ return Loads[0]; } +SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { + // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. + if (!Subtarget->hasArchitectedSGPRs()) + return {}; + SDLoc SL(Op); + MVT VT = MVT::i32; + SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); + return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, + DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); +} + SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &Arg) const { @@ -8060,6 +8101,8 @@ case Intrinsic::amdgcn_workgroup_id_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return lowerWaveID(DAG, Op); case Intrinsic::amdgcn_lds_kernel_id: { if (MFI->isEntryFunction()) return getLDSKernelId(DAG, DL); @@ -8242,6 +8285,36 @@ SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi32); + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + if (Op.getOperand(6).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKeyi32, Op.getOperand(7)}); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIISelLowering.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIISelLowering.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIISelLowering.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIISelLowering.h 2024-01-30 07:40:21.000000000 +0000 @@ -80,6 +80,7 @@ SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; + SDValue lowerWaveID(SelectionDAG &DAG, SDValue Op) const; SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &ArgDesc) const; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIInstrFormats.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIInstrFormats.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIInstrFormats.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIInstrFormats.td 2024-01-30 07:40:21.000000000 +0000 @@ -161,6 +161,9 @@ // ds_gws_* instructions. field bit GWS = 0; + // This bit indicates that this is one of SWMMAC instructions. + field bit IsSWMMAC = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -248,6 +251,8 @@ let TSFlags{62} = GWS; + let TSFlags{63} = IsSWMMAC; + let SchedRW = [Write32Bit]; let AsmVariantName = AMDGPUAsmVariants.Default; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIInstrInfo.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIInstrInfo.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIInstrInfo.h 2024-01-30 07:40:29.000000000 +0000 @@ -802,6 +802,14 @@ return isMFMA(MI) || isWMMA(MI); } + static bool isSWMMAC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC; + } + + bool isSWMMAC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC; + } + bool isDOT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIInstrInfo.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIInstrInfo.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIInstrInfo.td 2024-01-30 07:40:29.000000000 +0000 @@ -1088,6 +1088,9 @@ def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey16bit : CustomOperand; +def IndexKey8bit : CustomOperand; + def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; @@ -1344,6 +1347,13 @@ def VOP3PModsNeg : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; +def WMMAModsF32NegAbs : ComplexPattern; +def WMMAModsF16Neg : ComplexPattern; +def WMMAModsF16NegAbs : ComplexPattern; +def WMMAVISrc : ComplexPattern; +def SWMMACIndex8 : ComplexPattern; +def SWMMACIndex16 : ComplexPattern; + def VOP3OpSel : ComplexPattern; def VOP3OpSelMods : ComplexPattern; @@ -1684,8 +1694,9 @@ !if(HasOMod, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp, omod0:$omod), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod0:$clamp)) + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0))) /* else */, // VOP1 without modifiers !if (HasClamp, @@ -2278,6 +2289,9 @@ field bit IsDOT = 0; field bit IsSingle = 0; field bit IsWMMA = 0; + field bit IsSWMMAC = 0; + + field bit IsFP8 = 0; field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 2024-01-30 07:40:21.000000000 +0000 @@ -751,35 +751,21 @@ } // Add system SGPRs. - Register addWorkGroupIDX(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR(); - ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDX() { + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - Register addWorkGroupIDY(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u; - ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDY() { + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - Register addWorkGroupIDZ(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u; - ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDZ() { + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIRegisterInfo.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/SIRegisterInfo.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/SIRegisterInfo.td 2024-01-30 07:40:29.000000000 +0000 @@ -1341,9 +1341,14 @@ // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// +def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">; def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">; +def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">; def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 2024-01-30 07:40:21.000000000 +0000 @@ -529,6 +529,17 @@ Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { + return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12; +} + bool isGenericAtomic(unsigned Opc) { return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h 2024-01-30 07:40:21.000000000 +0000 @@ -535,6 +535,9 @@ LLVM_READNONE bool isGenericAtomic(unsigned Opc); +LLVM_READNONE +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc); + namespace VOPD { enum Component : unsigned { diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOP1Instructions.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOP1Instructions.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOP1Instructions.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOP1Instructions.td 2024-01-30 07:40:29.000000000 +0000 @@ -571,6 +571,7 @@ } // End SubtargetPredicate = isGFX9Only class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F { + let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; let HasExt = 1; @@ -599,6 +600,7 @@ (inst_sdwa 0, $src, 0, 0, index) >; +let SubtargetPredicate = isGFX9Only in { let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; @@ -617,6 +619,7 @@ def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } +} // End SubtargetPredicate = isGFX9Only class Cvt_PK_F32_F8_Pat : GCNPat< @@ -626,11 +629,77 @@ (inst_e32 $src)) >; -foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat; - def : Cvt_PK_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat; + def : Cvt_PK_F32_F8_Pat; + } +} + + +// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. +def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { + let HasOpSel = 1; + let HasExtVOP3DPP = 0; +} + +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> { + let HasOpSel = 1; + let HasExtDPP = 1; + let HasExtVOP3DPP = 1; + let IsFP8 = 1; + let HasClamp = 0; + let HasOMod = 0; + let HasModifiers = 1; + let Src1VOP3DPP = Src1RC64; +} + +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; + defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; +} + +class Cvt_F32_F8_Pat_OpSel index, + VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< + (f32 (node i32:$src, index)), + !if (index, + (inst_e64 !if(index{0}, + !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), + SRCMODS.OP_SEL_0), + !if(index{1}, SRCMODS.OP_SEL_1, 0)), + $src, 0), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, 1, 2, 3] in { + def : Cvt_F32_F8_Pat_OpSel; + def : Cvt_F32_F8_Pat_OpSel; + } +} + +class Cvt_PK_F32_F8_Pat_OpSel : GCNPat< + (v2f32 (node i32:$src, index)), + !if (index, + (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat_OpSel; + def : Cvt_PK_F32_F8_Pat_OpSel; + } } let SubtargetPredicate = isGFX10Plus in { @@ -853,6 +922,20 @@ VOP3_Real_with_name; +// Define VOP1 instructions using the pseudo instruction with its old profile and +// VOP3 using the OpSel profile for the pseudo instruction. +defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">; +defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name; + +defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name; + defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOP3Instructions.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOP3Instructions.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOP3Instructions.td 2024-01-30 07:40:29.000000000 +0000 @@ -520,8 +520,26 @@ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, FP32InputMods:$src1_modifiers, Src1RC64:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel); + let InsVOP3DPP = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); + let HasClamp = 0; - let HasExtVOP3DPP = 0; + let HasExtVOP3DPP = 1; } def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, @@ -530,14 +548,36 @@ FP32InputMods:$src1_modifiers, Src1RC64:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel); + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; + let HasExtVOP3DPP = 1; + let HasOpSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let HasExtVOP3DPP = 0; + let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP16.ret>.ret); + let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP8.ret>.ret); } def IsPow2Plus1: PatLeaf<(i32 imm), [{ @@ -618,13 +658,13 @@ class Cvt_PK_F8_F32_Pat : GCNPat< (i32 (node f32:$src0, f32:$src1, i32:$old, index)), - (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0)) + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0) >; class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> : GCNPat< (i32 (node f32:$src0, i32:$src1, i32:$old, index)), (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, - !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0)) + !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0) >; foreach Index = [0, -1] in { @@ -998,6 +1038,11 @@ defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>; +defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>; +defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>; + //===----------------------------------------------------------------------===// // GFX11, GFX12 //===----------------------------------------------------------------------===// diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOP3PInstructions.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 2024-01-30 07:40:29.000000000 +0000 @@ -936,16 +936,19 @@ !cast(NAME # _threeaddr # Suffix)>; } - if !eq(Type, WMMAOpSel) then { - def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; - } else if !eq(Type, WMMAUIClamp) then { - def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; - } else { - def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + let SubtargetPredicate = isGFX11Only in { + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } } } + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>; defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; @@ -969,6 +972,398 @@ } +class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, + bit _IsIU, bit _IsFP8BF8> + : VOP3P_Profile> { + bit IsIU = _IsIU; + bit IsFP8BF8 = _IsFP8BF8; + bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + + int IndexType = _IndexType; + + let IsPacked = 1; + let IsWMMA = !not(_IsSWMMAC); + let IsSWMMAC = _IsSWMMAC; + + bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); + bit IsAB_BF16 = !and(IsF16BF16, isIntType.ret); + bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); + + bit NegLo01 = !or(IsF16BF16, IsIU); + bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegHi01 = IsF16BF16; + bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLoAny = !or(NegLo01, NegLo2); + bit NegHiAny = !or(NegHi01, NegHi2); + + let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256, + !eq(ArgTy[0], v8i32): VDst_256, + !eq(ArgTy[0], v8f16): VDst_128, + !eq(ArgTy[0], v8i16): VDst_128, + !eq(ArgTy[0], v4f32): VDst_128, + !eq(ArgTy[0], v4i32): VDst_128, + !eq(ArgTy[0], v4f16): VDst_64, + !eq(ArgTy[0], v4i16): VDst_64); + let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v8i16): VRegSrc_128, + !eq(ArgTy[1], v4i32): VRegSrc_128, + !eq(ArgTy[1], v2i32): VRegSrc_64, + !eq(ArgTy[1], i32) : VRegSrc_32); + let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256, + !eq(ArgTy[2], v16i16): VRegSrc_256, + !eq(ArgTy[2], v8f16): VRegSrc_128, + !eq(ArgTy[2], v8i16): VRegSrc_128, + !eq(ArgTy[2], v4i32): VRegSrc_128, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[2], v2i32): VRegSrc_64, + !eq(ArgTy[2], i32) : VRegSrc_32); + let Src2RC64 = !if(IsSWMMAC, DstRC, + !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32, + !eq(ArgTy[3], v8i32): VISrc_256_b32, + !eq(ArgTy[3], v8f16): VISrc_128_f16, + !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16 + !eq(ArgTy[3], v4f16): VISrc_64_f16, + !eq(ArgTy[3], v4i16): VISrc_64_b32, + !eq(ArgTy[3], v4i32): VISrc_128_b32, + !eq(ArgTy[3], v4f32): VISrc_128_f32)); + + // For f16 and bf16 matrices A and B, each element can be modified by + // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each + // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + + // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index + // --------------------------------------------------------------------------- + // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) + // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16) + // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16) + // --------------------------------------------------------------------------- + // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for + // | neg_lo = 1 i4/i8(sext) | i32 matrices + // --------------------------------------------------------------------------- + // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) + // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix + // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix + // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + + // pseudo + + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, + // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 + // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. + + dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag IndexKey = !cond(!eq(IndexType, 0) : (ins), + !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); + dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins)); + dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), + !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), + !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); + + let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), + !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), + IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), + Clamp, Neg); + + // asm + + string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", + !eq(IndexType, 8) : "$index_key_8bit", + !eq(IndexType, 16) : "$index_key_16bit"); + string ClampAsm = !if(IsIU, "$clamp", ""); + string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", + !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", + !and(!not(NegLoAny), !not(NegHiAny)) : ""); + + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + + // isel patterns + + dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16 : (ins (i32 8), Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); + dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); + dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + + + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + + // wmma pattern where src2 is inline imm uses _threeaddr pseudo, + // can't use _twoaddr since it would violate src2 tied to vdst constraint. + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); +} + +multiclass WMMAInstGFX12 { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + } + def : WMMAOpcodeMapping(NAME # _twoaddr), + !cast(NAME # _threeaddr)>; +} + +multiclass SWMMACInstGFX12 { + def _twoaddr : VOP3P_Pseudo{ + let Mnemonic = Instr; + let PseudoInstr = Instr#PseudoInstrSuffix; + let mayRaiseFPException = 0; + let ReadsModeReg = 0; + let AsmMatchConverter = "cvtSWMMAC"; + + let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; + } +} + +// First argument in Profile is types for matrices D, A, B and C (D = A * B + C) +// as used by llvm ir, types are vectors(with matrix elements) +// wave32: +// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts, +// for 16 x 32 16 elts and for 16 x 64 lanes have 32 elts. +// wave64: +// lanes will have half the size of elements in lanes compared to wave32 with +// exception of 16x16_iu4: lanes0-31 will have 8xi4, remaining lanes are ignored + +// general idea on element distribution differences: +// wave32: lane n has 8 matrix elements +// wave64: lane n has first 4, lane n+32 has other 4 elements + +// index size, for each 2 elements in lane you need 4bits in index + +// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s. +// Original type for them is in comment on the right and refers to A and B. + +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 +def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 + +def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 + +// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored +// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used +// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored +// for matrix A, index is i16; Matrix B uses all lanes + +let WaveSizePredicate = isWave32 in { +defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +} + +let WaveSizePredicate = isWave64 in { +defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">; +defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">; +defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +} + +// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that +// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0). +multiclass WMMAPat { + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaOutPat, !cast(Inst#"_twoaddr")))>; + let AddedComplexity = 4 in + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast(Inst#"_threeaddr")))>; +} + +class SWMMACPat : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>; + +class SWMMACPat_w64 : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{ + let WaveSizePredicate = isWave64; + } + +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)), + (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + + //===----------------------------------------------------------------------===// // Begin Real Encodings //===----------------------------------------------------------------------===// @@ -1005,6 +1400,99 @@ VOP3Pe_gfx11_gfx12(backing_ps_name).Pfl>; } +class VOP3PeWmma op, VOPProfile P, VOP3PWMMA_Profile WMMAP> + : VOP3Pe_gfx11_gfx12{ + // opsel + let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, + !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); + let Inst{13} = 0; + // opsel_hi + let Inst{59} = 1; + let Inst{60} = 1; + let Inst{14} = 1; + // neg_lo + let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); + let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); + let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0); + // neg_hi + let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0); + let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); + let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); + // clamp + let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); +} + +multiclass VOP3P_WMMA_Real_Base op, VOP3PWMMA_Profile WMMAP, + string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + def Gen.Suffix : + VOP3P_Real_Gen(backing_ps_name), Gen, asmName>, + VOP3PeWmma(backing_ps_name).Pfl, WMMAP>; +} + +multiclass VOP3P_Real_WMMA_gfx12 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +multiclass VOP3P_Real_WMMA_gfx12w64 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; +defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; +defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; + +defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>; +defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>; +defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; + + +defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; +defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; + multiclass VOP3P_Real_with_name op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> { diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOPInstructions.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOPInstructions.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/AMDGPU/VOPInstructions.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/AMDGPU/VOPInstructions.td 2024-01-30 07:40:21.000000000 +0000 @@ -124,6 +124,7 @@ let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; let IsWMMA = P.IsWMMA; + let IsSWMMAC = P.IsSWMMAC; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -305,6 +306,11 @@ class VOP3OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx10; +class VOP3FP8OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10 { + let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0); +} + class VOP3DotOpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx11_gfx12{ let Inst{11} = ?; let Inst{12} = ?; @@ -378,6 +384,8 @@ bits<4> src2_modifiers; bits<9> src2; bits<1> clamp; + bits<2> index_key_8bit; + bits<1> index_key_16bit; let Inst{7-0} = vdst; let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 @@ -738,7 +746,7 @@ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); - let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?); let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); let Inst{15} = !if(P.HasClamp, clamp, 0); @@ -1406,14 +1414,20 @@ defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; - if !not(ps.Pfl.HasOpSel) then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsFP8 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3FP8OpSel_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } } def Gen.Suffix#"_VOP3_alias" : MnemonicAlias, Requires<[Gen.AssemblerPredicate]>, LetDummies; } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/CMakeLists.txt llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/CMakeLists.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/CMakeLists.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/CMakeLists.txt 2024-01-30 07:40:22.000000000 +0000 @@ -5,6 +5,7 @@ tablegen(LLVM RISCVGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter) +tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred) tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel) tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info) @@ -43,7 +44,6 @@ RISCVISelDAGToDAG.cpp RISCVISelLowering.cpp RISCVMachineFunctionInfo.cpp - RISCVMacroFusion.cpp RISCVMergeBaseOffset.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCV.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCV.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCV.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCV.td 2024-01-30 07:40:22.000000000 +0000 @@ -31,6 +31,12 @@ include "GISel/RISCVRegisterBanks.td" //===----------------------------------------------------------------------===// +// RISC-V macro fusions. +//===----------------------------------------------------------------------===// + +include "RISCVMacroFusion.td" + +//===----------------------------------------------------------------------===// // RISC-V Scheduling Models //===----------------------------------------------------------------------===// diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVFeatures.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVFeatures.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVFeatures.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVFeatures.td 2024-01-30 07:40:29.000000000 +0000 @@ -1044,30 +1044,6 @@ : SubtargetFeature<"dlen-factor-2", "DLenFactor2", "true", "Vector unit DLEN(data path width) is half of VLEN">; -def TuneLUIADDIFusion - : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion", - "true", "Enable LUI+ADDI macrofusion">; - -def TuneAUIPCADDIFusion - : SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion", - "true", "Enable AUIPC+ADDI macrofusion">; - -def TuneZExtHFusion - : SubtargetFeature<"zexth-fusion", "HasZExtHFusion", - "true", "Enable SLLI+SRLI to be fused to zero extension of halfword">; - -def TuneZExtWFusion - : SubtargetFeature<"zextw-fusion", "HasZExtWFusion", - "true", "Enable SLLI+SRLI to be fused to zero extension of word">; - -def TuneShiftedZExtWFusion - : SubtargetFeature<"shifted-zextw-fusion", "HasShiftedZExtWFusion", - "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension of word">; - -def TuneLDADDFusion - : SubtargetFeature<"ld-add-fusion", "HasLDADDFusion", - "true", "Enable LD+ADD macrofusion.">; - def TuneNoDefaultUnroll : SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false", "Disable default unroll preference.">; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,210 +0,0 @@ -//===- RISCVMacroFusion.cpp - RISC-V Macro Fusion -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains the RISC-V implementation of the DAG scheduling -/// mutation to pair instructions back to back. -// -//===----------------------------------------------------------------------===// -// -#include "RISCVMacroFusion.h" -#include "RISCVSubtarget.h" -#include "llvm/CodeGen/MacroFusion.h" -#include "llvm/CodeGen/TargetInstrInfo.h" - -using namespace llvm; - -static bool checkRegisters(Register FirstDest, const MachineInstr &SecondMI) { - if (!SecondMI.getOperand(1).isReg()) - return false; - - if (SecondMI.getOperand(1).getReg() != FirstDest) - return false; - - // If the input is virtual make sure this is the only user. - if (FirstDest.isVirtual()) { - auto &MRI = SecondMI.getMF()->getRegInfo(); - return MRI.hasOneNonDBGUse(FirstDest); - } - - return SecondMI.getOperand(0).getReg() == FirstDest; -} - -// Fuse load with add: -// add rd, rs1, rs2 -// ld rd, 0(rd) -static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::LD) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 0) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::ADD) - return true; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse zero extension of halfword: -// slli rd, rs1, 48 -// srli rd, rd, 48 -static bool isZExtH(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 48) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 48) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse zero extension of word: -// slli rd, rs1, 32 -// srli rd, rd, 32 -static bool isZExtW(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 32) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 32) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse shifted zero extension of word: -// slli rd, rs1, 32 -// srli rd, rd, x -// where 0 <= x < 32 -static bool isShiftedZExtW(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - unsigned SRLIImm = SecondMI.getOperand(2).getImm(); - if (SRLIImm >= 32) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 32) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse AUIPC followed by ADDI -// auipc rd, imm20 -// addi rd, rd, imm12 -static bool isAUIPCADDI(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::ADDI) - return false; - // Assume the 1st instr to be a wildcard if it is unspecified. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::AUIPC) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse LUI followed by ADDI or ADDIW. -// rd = imm[31:0] which decomposes to -// lui rd, imm[31:12] -// addi(w) rd, rd, imm[11:0] -static bool isLUIADDI(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::ADDI && - SecondMI.getOpcode() != RISCV::ADDIW) - return false; - // Assume the 1st instr to be a wildcard if it is unspecified. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::LUI) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const RISCVSubtarget &ST = static_cast(TSI); - - if (ST.hasLUIADDIFusion() && isLUIADDI(FirstMI, SecondMI)) - return true; - - if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI)) - return true; - - if (ST.hasZExtHFusion() && isZExtH(FirstMI, SecondMI)) - return true; - - if (ST.hasZExtWFusion() && isZExtW(FirstMI, SecondMI)) - return true; - - if (ST.hasShiftedZExtWFusion() && isShiftedZExtW(FirstMI, SecondMI)) - return true; - - if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI)) - return true; - - return false; -} - -std::unique_ptr llvm::createRISCVMacroFusionDAGMutation() { - return createMacroFusionDAGMutation(shouldScheduleAdjacent); -} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVMacroFusion.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVMacroFusion.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVMacroFusion.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVMacroFusion.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -//===- RISCVMacroFusion.h - RISC-V Macro Fusion -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains the RISC-V definition of the DAG scheduling -/// mutation to pair instructions back to back. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H -#define LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H - -#include "llvm/CodeGen/MachineScheduler.h" - -namespace llvm { - -/// Note that you have to add: -/// DAG.addMutation(createRISCVMacroFusionDAGMutation()); -/// to RISCVPassConfig::createMachineScheduler() to have an effect. -std::unique_ptr createRISCVMacroFusionDAGMutation(); - -} // namespace llvm - -#endif diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVMacroFusion.td llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVMacroFusion.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVMacroFusion.td 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVMacroFusion.td 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,93 @@ +//==----- RISCVMacroFusion.td - Macro Fusion Definitions -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the macro fusion predicators. + +// Fuse LUI followed by ADDI or ADDIW: +// rd = imm[31:0] which decomposes to +// lui rd, imm[31:12] +// addi(w) rd, rd, imm[11:0] +def TuneLUIADDIFusion + : SimpleFusion<"lui-addi-fusion", "HasLUIADDIFusion", + "Enable LUI+ADDI macro fusion", + CheckOpcode<[LUI]>, + CheckOpcode<[ADDI, ADDIW]>>; + +// Fuse AUIPC followed by ADDI: +// auipc rd, imm20 +// addi rd, rd, imm12 +def TuneAUIPCADDIFusion + : SimpleFusion<"auipc-addi-fusion", "HasAUIPCADDIFusion", + "Enable AUIPC+ADDI macrofusion", + CheckOpcode<[AUIPC]>, + CheckOpcode<[ADDI]>>; + +// Fuse zero extension of halfword: +// slli rd, rs1, 48 +// srli rd, rd, 48 +def TuneZExtHFusion + : SimpleFusion<"zexth-fusion", "HasZExtHFusion", + "Enable SLLI+SRLI to be fused to zero extension of halfword", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 48> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 48> + ]>>; + +// Fuse zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, 32 +def TuneZExtWFusion + : SimpleFusion<"zextw-fusion", "HasZExtWFusion", + "Enable SLLI+SRLI to be fused to zero extension of word", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>>; + +// Fuse shifted zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, x +// where 0 <= x < 32 +def TuneShiftedZExtWFusion + : SimpleFusion<"shifted-zextw-fusion", "HasShiftedZExtWFusion", + "Enable SLLI+SRLI to be fused when computing (shifted) word zero extension", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperandRange<2, 0, 31> + ]>>; + +// Fuse load with add: +// add rd, rs1, rs2 +// ld rd, 0(rd) +def TuneLDADDFusion + : SimpleFusion<"ld-add-fusion", "HasLDADDFusion", "Enable LD+ADD macrofusion", + CheckOpcode<[ADD]>, + CheckAll<[ + CheckOpcode<[LD]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 0> + ]>>; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVSubtarget.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVSubtarget.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVSubtarget.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVSubtarget.cpp 2024-01-30 07:40:22.000000000 +0000 @@ -16,8 +16,9 @@ #include "GISel/RISCVRegisterBankInfo.h" #include "RISCV.h" #include "RISCVFrameLowering.h" -#include "RISCVMacroFusion.h" #include "RISCVTargetMachine.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" @@ -29,6 +30,9 @@ #define GET_SUBTARGETINFO_CTOR #include "RISCVGenSubtargetInfo.inc" +#define GET_RISCV_MACRO_FUSION_PRED_IMPL +#include "RISCVGenMacroFusion.inc" + namespace llvm::RISCVTuneInfoTable { #define GET_RISCVTuneInfoTable_IMPL @@ -187,7 +191,7 @@ void RISCVSubtarget::getPostRAMutations( std::vector> &Mutations) const { - Mutations.push_back(createRISCVMacroFusionDAGMutation()); + Mutations.push_back(createMacroFusionDAGMutation(getMacroFusions())); } /// Enable use of alias analysis during code generation (during MI diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVSubtarget.h llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVSubtarget.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVSubtarget.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVSubtarget.h 2024-01-30 07:40:22.000000000 +0000 @@ -27,6 +27,9 @@ #include "llvm/Target/TargetMachine.h" #include +#define GET_RISCV_MACRO_FUSION_PRED_DECL +#include "RISCVGenMacroFusion.inc" + #define GET_SUBTARGETINFO_HEADER #include "RISCVGenSubtargetInfo.inc" @@ -196,11 +199,6 @@ return UserReservedRegister[i]; } - bool hasMacroFusion() const { - return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasZExtHFusion() || - hasZExtWFusion() || hasShiftedZExtWFusion() || hasLDADDFusion(); - } - // Vector codegen related methods. bool hasVInstructions() const { return HasStdExtZve32x; } bool hasVInstructionsI64() const { return HasStdExtZve64x; } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp 2024-01-30 07:40:22.000000000 +0000 @@ -14,7 +14,6 @@ #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h" #include "RISCVMachineFunctionInfo.h" -#include "RISCVMacroFusion.h" #include "RISCVTargetObjectFile.h" #include "RISCVTargetTransformInfo.h" #include "TargetInfo/RISCVTargetInfo.h" @@ -26,6 +25,8 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -361,9 +362,10 @@ DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); } - if (ST.hasMacroFusion()) { + const auto &MacroFusions = ST.getMacroFusions(); + if (!MacroFusions.empty()) { DAG = DAG ? DAG : createGenericSchedLive(C); - DAG->addMutation(createRISCVMacroFusionDAGMutation()); + DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); } return DAG; } @@ -371,9 +373,10 @@ ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const RISCVSubtarget &ST = C->MF->getSubtarget(); - if (ST.hasMacroFusion()) { + const auto &MacroFusions = ST.getMacroFusions(); + if (!MacroFusions.empty()) { ScheduleDAGMI *DAG = createGenericSchedPostRA(C); - DAG->addMutation(createRISCVMacroFusionDAGMutation()); + DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); return DAG; } return nullptr; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/X86/X86AsmPrinter.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/X86/X86AsmPrinter.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Target/X86/X86AsmPrinter.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Target/X86/X86AsmPrinter.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -877,7 +877,6 @@ OutStreamer->emitInt32(FeatureFlagsAnd); // data emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding - OutStreamer->endSection(Nt); OutStreamer->switchSection(Cur); } } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/TargetParser/TargetParser.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/TargetParser/TargetParser.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/TargetParser/TargetParser.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/TargetParser/TargetParser.cpp 2024-01-30 07:40:22.000000000 +0000 @@ -294,6 +294,7 @@ Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; + Features["fp8-conversion-insts"] = true; break; case GK_GFX1151: case GK_GFX1150: diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm-toolchain-18-18.1.0-rc1/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 2024-01-30 07:40:29.000000000 +0000 @@ -1957,6 +1957,8 @@ bool CostTooHigh = false; const bool AddBranchWeights; + Loop *OuterLoop = nullptr; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL, @@ -2053,6 +2055,9 @@ DT->eraseNode(SCEVCheckBlock); LI->removeBlock(SCEVCheckBlock); } + + // Outer loop is used as part of the later cost calculations. + OuterLoop = L->getParentLoop(); } InstructionCost getCost() { @@ -2076,16 +2081,61 @@ LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); RTCheckCost += C; } - if (MemCheckBlock) + if (MemCheckBlock) { + InstructionCost MemCheckCost = 0; for (Instruction &I : *MemCheckBlock) { if (MemCheckBlock->getTerminator() == &I) continue; InstructionCost C = TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); - RTCheckCost += C; + MemCheckCost += C; } + // If the runtime memory checks are being created inside an outer loop + // we should find out if these checks are outer loop invariant. If so, + // the checks will likely be hoisted out and so the effective cost will + // reduce according to the outer loop trip count. + if (OuterLoop) { + ScalarEvolution *SE = MemCheckExp.getSE(); + // TODO: If profitable, we could refine this further by analysing every + // individual memory check, since there could be a mixture of loop + // variant and invariant checks that mean the final condition is + // variant. + const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); + if (SE->isLoopInvariant(Cond, OuterLoop)) { + // It seems reasonable to assume that we can reduce the effective + // cost of the checks even when we know nothing about the trip + // count. Assume that the outer loop executes at least twice. + unsigned BestTripCount = 2; + + // If exact trip count is known use that. + if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop)) + BestTripCount = SmallTC; + else if (LoopVectorizeWithBlockFrequency) { + // Else use profile data if available. + if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop)) + BestTripCount = *EstimatedTC; + } + + InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; + + // Let's ensure the cost is always at least 1. + NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), + (InstructionCost::CostType)1); + + LLVM_DEBUG(dbgs() + << "We expect runtime memory checks to be hoisted " + << "out of the outer loop. Cost reduced from " + << MemCheckCost << " to " << NewMemCheckCost << '\n'); + + MemCheckCost = NewMemCheckCost; + } + } + + RTCheckCost += MemCheckCost; + } + if (SCEVCheckBlock || MemCheckBlock) LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost << "\n"); @@ -2144,8 +2194,8 @@ BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); // Create new preheader for vector loop. - if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) - PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); SCEVCheckBlock->getTerminator()->eraseFromParent(); SCEVCheckBlock->moveBefore(LoopVectorPreHeader); @@ -2179,8 +2229,8 @@ DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); - if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) - PL->addBasicBlockToLoop(MemCheckBlock, *LI); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll 2024-01-30 07:40:22.000000000 +0000 @@ -63,52 +63,140 @@ ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) + %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) store <16 x half> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) + %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) +define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1 -declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 -declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1 +declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 +declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AArch64/arm64-addrmode.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AArch64/arm64-addrmode.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AArch64/arm64-addrmode.ll 2024-01-03 13:06:20.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AArch64/arm64-addrmode.ll 2024-01-30 07:40:26.000000000 +0000 @@ -214,8 +214,9 @@ define i8 @LdOffset_i8(ptr %a) { ; CHECK-LABEL: LdOffset_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -226,8 +227,9 @@ define i32 @LdOffset_i8_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -253,8 +255,9 @@ define i64 @LdOffset_i8_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir 2024-01-03 13:06:20.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir 2024-01-30 07:40:26.000000000 +0000 @@ -14,8 +14,9 @@ ; CHECK-LABEL: name: LdOffset ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12 - ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 + ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 + ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 ; CHECK-NEXT: RET undef $lr, implicit $w0 renamable $w8 = MOVZWi 56952, 0 renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,504 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8 +; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll 2024-01-30 07:40:22.000000000 +0000 @@ -55,7 +55,6 @@ ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) + ret float %ret +} + +define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 1) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) +declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) +declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) +declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) + +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: test_cvt_f32_bf8_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte0 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_bf8_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_fp8_byte3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_fp8_byte3 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_pk_bf8_f32_word0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_bf8_f32_word0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_BF8_F32_e64 0, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_pk_fp8_f32_word1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_fp8_f32_word1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_sr_bf8_f32_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_bf8_f32_byte0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_BF8_F32_e64 0, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_sr_fp8_f32_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_fp8_f32_byte2 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], 0, [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll 2024-01-30 07:40:22.000000000 +0000 @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) @@ -9,182 +11,524 @@ declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_bf8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_fp8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_fp8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_fp8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0: -; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1: -; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0: -; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1: -; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-SDAG-ERR %s +; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +; GFX9-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.wave.id +; GFX9-GISEL-ERR: LLVM ERROR: unable to legalize instruction: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.wave.id) + +define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_wave_id: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id_callable: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bfe_u32 s34, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_wave_id_callable: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.wave.id() diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_kernel void @workgroup_ids_kernel() { +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_kernel: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_ids_kernel: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_ids_kernel: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_kernel void @caller() { +; GFX9-SDAG-LABEL: caller: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: caller: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7 +; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call void @callee(i32 %idx) #0 + ret void +} + +declare void @callee(i32) #0 + +define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_device_func: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_device_func: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s4, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_device_func: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s5, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_device_func: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: v_mov_b32_e32 v8, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) + +attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9ARCH: {{.*}} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_cs void @_amdgpu_cs_main() { +; GFX9-LABEL: _amdgpu_cs_main: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: _amdgpu_cs_main: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @caller() { +; GFX9-LABEL: caller: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call amdgpu_gfx void @callee(i32 %idx) + ret void +} + +declare amdgpu_gfx void @callee(i32) + +define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_gfx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-LABEL: workgroup_ids_gfx: +; GFX9ARCH: ; %bb.0: +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_gfx: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll 1970-01-01 00:00:00.000000000 +0000 @@ -1,128 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s - -define amdgpu_cs void @_amdgpu_cs_main() { -; GFX9-SDAG-LABEL: _amdgpu_cs_main: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: _amdgpu_cs_main: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: _amdgpu_cs_main: -; GFX12-SDAG: ; %bb.0: ; %.entry -; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: _amdgpu_cs_main: -; GFX12-GISEL: ; %bb.0: ; %.entry -; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm -.entry: - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - %idy = call i32 @llvm.amdgcn.workgroup.id.y() - %idz = call i32 @llvm.amdgcn.workgroup.id.z() - %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 - %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 - %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 - call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_cs void @caller() { -; GFX9-SDAG-LABEL: caller: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s0 -; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: caller: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9-GISEL-NEXT: s_mov_b32 s8, s0 -; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-GISEL-NEXT: s_endpgm - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - call amdgpu_gfx void @callee(i32 %idx) - ret void -} - -declare amdgpu_gfx void @callee(i32) - -declare i32 @llvm.amdgcn.workgroup.id.x() -declare i32 @llvm.amdgcn.workgroup.id.y() -declare i32 @llvm.amdgcn.workgroup.id.z() -declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12: {{.*}} -; GFX9: {{.*}} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,499 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16 +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 +; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 +; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,431 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v11, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,456 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,373 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir 2024-01-30 07:40:29.000000000 +0000 @@ -0,0 +1,354 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. +# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir 2024-01-30 07:40:29.000000000 +0000 @@ -0,0 +1,355 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. +# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll 2024-01-30 07:40:22.000000000 +0000 @@ -5,43 +5,25 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GFX9-SDAG-LABEL: workgroup_id_x: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: workgroup_id_x: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: workgroup_id_x: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: workgroup_id_x: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_x: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -52,23 +34,23 @@ ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -81,37 +63,21 @@ } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GFX9-SDAG-LABEL: workgroup_id_xyz: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_xyz: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: @@ -119,15 +85,15 @@ ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -144,3 +110,8 @@ declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/SystemZ/zos-ppa2.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/SystemZ/zos-ppa2.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/SystemZ/zos-ppa2.ll 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/SystemZ/zos-ppa2.ll 2024-01-30 07:40:26.000000000 +0000 @@ -24,7 +24,7 @@ ; CHECK: .byte 0 ; CHECK: .byte 3 ; CHECK: .short 30 -; CHECK: .ascii "\323\323\345\324@@@@@@\361\370\360\360\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" +; CHECK: .ascii "\323\323\345\324@@@@@@{{((\\3[0-7]{2}){4})}}\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" define void @void_test() { entry: ret void diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll 2024-01-30 07:40:26.000000000 +0000 @@ -0,0 +1,30 @@ +; RUN: llc -mtriple x86_64-unknown-linux-gnu %s -o %t.o -filetype=obj +; RUN: llvm-readobj -n %t.o | FileCheck %s + +module asm ".pushsection \22.note.gnu.property\22,\22a\22,@note" +module asm " .p2align 3" +module asm " .long 1f - 0f" +module asm " .long 4f - 1f" +module asm " .long 5" +module asm "0: .asciz \22GNU\22" +module asm "1: .p2align 3" +module asm " .long 0xc0008002" +module asm " .long 3f - 2f" +module asm "2: .long ((1U << 0) | 0 | 0 | 0)" +module asm "3: .p2align 3" +module asm "4:" +module asm " .popsection" + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 4, !"cf-protection-return", i32 1} +!1 = !{i32 4, !"cf-protection-branch", i32 1} + +; CHECK: Type: NT_GNU_PROPERTY_TYPE_0 +; CHECK-NEXT: Property [ +; CHECK-NEXT: x86 feature: IBT, SHSTK +; CHECK-NEXT: ] +; CHECK: Type: NT_GNU_PROPERTY_TYPE_0 +; CHECK-NEXT: Property [ +; CHECK-NEXT: x86 ISA needed: x86-64-baseline +; CHECK-NEXT: ] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s 2024-01-30 07:40:22.000000000 +0000 @@ -397,6 +397,51 @@ v_ctz_i32_b32 v255, 0xaf123456 // GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e32 v1, s3 +// GFX12: encoding: [0x03,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, 3 +// GFX12: encoding: [0x83,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, v3 +// GFX12: encoding: [0x03,0xdb,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, s3 +// GFX12: encoding: [0x03,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, 3 +// GFX12: encoding: [0x83,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, v3 +// GFX12: encoding: [0x03,0xd9,0x02,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], s5 +// GFX12: encoding: [0x05,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], 3 +// GFX12: encoding: [0x83,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], v3 +// GFX12: encoding: [0x03,0xdf,0x06,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdd,0x04,0x7e] + v_cvt_f16_f32 v5, v1 // GFX12: encoding: [0x01,0x15,0x0a,0x7e] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s 2024-01-30 07:40:22.000000000 +0000 @@ -337,6 +337,18 @@ v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] +v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] + +v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] + v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s 2024-01-30 07:40:22.000000000 +0000 @@ -73,6 +73,18 @@ v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] + v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s 2024-01-30 07:40:22.000000000 +0000 @@ -1099,6 +1099,42 @@ v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 // GFX12: encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + v_cvt_pk_i16_f32 v5, v1, v2 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s 2024-01-30 07:40:22.000000000 +0000 @@ -1015,6 +1015,114 @@ v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s 2024-01-30 07:40:22.000000000 +0000 @@ -570,6 +570,54 @@ v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX12: encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s 2024-01-30 07:40:22.000000000 +0000 @@ -396,6 +396,144 @@ v_ctz_i32_b32_e64 v255, 0xaf123456 // GFX12: encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + v_cvt_f16_f32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s 2024-01-30 07:40:22.000000000 +0000 @@ -336,6 +336,18 @@ v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +V_CVT_F32_FP8_e64_dpp v5, v1 quad_perm:[3,1,2,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0x27,0x00,0x2d] + +V_CVT_F32_FP8_e64_dpp v1, v3 quad_perm:[2,1,0,3] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0xc6,0x00,0x5e] + +V_CVT_F32_BF8_e64_dpp v5, v1 quad_perm:[0,3,2,1] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0x6c,0x00,0x2d] + +V_CVT_F32_BF8_e64_dpp v1, v3 quad_perm:[0,1,3,2] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0xb4,0x00,0x5e] + v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s 2024-01-30 07:40:22.000000000 +0000 @@ -84,6 +84,18 @@ v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + + + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt 2024-01-30 07:40:22.000000000 +0000 @@ -397,6 +397,42 @@ # GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e] +0x03,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e] +0x83,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e] +0x03,0xdb,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e] +0x03,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e] +0x83,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xd9,0x02,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] +0x03,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] +0x83,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] +0x03,0xdf,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] +0x03,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] +0x83,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] +0x03,0xdd,0x04,0x7e + # GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e] 0x01,0x15,0x0a,0x7e diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt 2024-01-30 07:40:22.000000000 +0000 @@ -337,6 +337,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e + # GFX12: v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt 2024-01-30 07:40:22.000000000 +0000 @@ -49,6 +49,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt 2024-01-30 07:40:22.000000000 +0000 @@ -993,6 +993,42 @@ # GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] 0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20 + +# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 + # GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt 2024-01-30 07:40:22.000000000 +0000 @@ -825,6 +825,114 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt 2024-01-30 07:40:22.000000000 +0000 @@ -495,6 +495,54 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt 2024-01-30 07:40:22.000000000 +0000 @@ -396,6 +396,42 @@ # GFX12: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 + # GFX12: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt 2024-01-30 07:40:22.000000000 +0000 @@ -336,6 +336,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt 2024-01-30 07:40:22.000000000 +0000 @@ -72,6 +72,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18] + + + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18] + + + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] + +[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18] + + + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt 2024-01-30 07:40:22.000000000 +0000 @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18] + + + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + + + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18] diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/TableGen/address-space-patfrags.td llvm-toolchain-18-18.1.0-rc1/llvm/test/TableGen/address-space-patfrags.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/TableGen/address-space-patfrags.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/TableGen/address-space-patfrags.td 2024-01-30 07:40:22.000000000 +0000 @@ -46,7 +46,7 @@ let InOperandList = (ins GPR32:$src0, GPR32:$src1); } -// SDAG: case 1: { +// SDAG: case 0: { // SDAG-NEXT: // Predicate_pat_frag_b // SDAG-NEXT: // Predicate_truncstorei16_addrspace // SDAG-NEXT: SDNode *N = Node; @@ -69,7 +69,7 @@ >; -// SDAG: case 6: { +// SDAG: case 4: { // SDAG: // Predicate_pat_frag_a // SDAG-NEXT: SDNode *N = Node; // SDAG-NEXT: (void)N; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll llvm-toolchain-18-18.1.0-rc1/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll 1970-01-01 00:00:00.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll 2024-01-30 07:40:26.000000000 +0000 @@ -0,0 +1,217 @@ +; REQUIRES: asserts +; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop' +; CHECK: Calculating cost of runtime checks: +; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. +; CHECK: Total cost of runtime checks: 4 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %off + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + ret void +} + +define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3 +; CHECK: Total cost of runtime checks: 3 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond27.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond27.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 3 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1 +; CHECK: Total cost of runtime checks: 1 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 64 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0 + +outer.exit: + ret void +} + + +define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] + %0 = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] + %1 = add nuw nsw i64 %iv.inner, %0 + %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1 + %2 = load i32, ptr %arrayidx.us, align 4 + %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1 + %3 = load i32, ptr %arrayidx8.us, align 4 + %add9.us = add nsw i32 %3, %2 + store i32 %add9.us, ptr %arrayidx8.us, align 4 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %inner.exit.cond = icmp eq i64 %iv.inner.next, %n + br i1 %inner.exit.cond, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %outer.exit.cond = icmp eq i64 %outer.iv.next, 3 + br i1 %outer.exit.cond, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +!0 = !{!"branch_weights", i32 10, i32 20} diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp llvm-toolchain-18-18.1.0-rc1/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp 2024-01-30 07:40:22.000000000 +0000 @@ -57,7 +57,8 @@ // We de-duplicate the predicates by code string, and use this map to track // all the patterns with "identical" predicates. - StringMap> NodePredicatesByCodeToRun; + MapVector, StringMap> + NodePredicatesByCodeToRun; std::vector PatternPredicates; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/utils/TableGen/PredicateExpander.cpp llvm-toolchain-18-18.1.0-rc1/llvm/utils/TableGen/PredicateExpander.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/utils/TableGen/PredicateExpander.cpp 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/utils/TableGen/PredicateExpander.cpp 2024-01-30 07:40:22.000000000 +0000 @@ -59,6 +59,30 @@ OS << ")"; } +void PredicateExpander::expandCheckImmOperandLT(raw_ostream &OS, int OpIndex, + int ImmVal, + StringRef FunctionMapper) { + if (!FunctionMapper.empty()) + OS << FunctionMapper << "("; + OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex + << ").getImm()"; + if (!FunctionMapper.empty()) + OS << ")"; + OS << (shouldNegate() ? " >= " : " < ") << ImmVal; +} + +void PredicateExpander::expandCheckImmOperandGT(raw_ostream &OS, int OpIndex, + int ImmVal, + StringRef FunctionMapper) { + if (!FunctionMapper.empty()) + OS << FunctionMapper << "("; + OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex + << ").getImm()"; + if (!FunctionMapper.empty()) + OS << ")"; + OS << (shouldNegate() ? " <= " : " > ") << ImmVal; +} + void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg, StringRef FunctionMapper) { @@ -352,6 +376,16 @@ Rec->getValueAsString("ImmVal"), Rec->getValueAsString("FunctionMapper")); + if (Rec->isSubClassOf("CheckImmOperandLT")) + return expandCheckImmOperandLT(OS, Rec->getValueAsInt("OpIndex"), + Rec->getValueAsInt("ImmVal"), + Rec->getValueAsString("FunctionMapper")); + + if (Rec->isSubClassOf("CheckImmOperandGT")) + return expandCheckImmOperandGT(OS, Rec->getValueAsInt("OpIndex"), + Rec->getValueAsInt("ImmVal"), + Rec->getValueAsString("FunctionMapper")); + if (Rec->isSubClassOf("CheckImmOperandSimple")) return expandCheckImmOperandSimple(OS, Rec->getValueAsInt("OpIndex"), Rec->getValueAsString("FunctionMapper")); diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/utils/TableGen/PredicateExpander.h llvm-toolchain-18-18.1.0-rc1/llvm/utils/TableGen/PredicateExpander.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/llvm/utils/TableGen/PredicateExpander.h 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/llvm/utils/TableGen/PredicateExpander.h 2024-01-30 07:40:22.000000000 +0000 @@ -61,6 +61,10 @@ StringRef FunctionMapperer); void expandCheckImmOperandSimple(raw_ostream &OS, int OpIndex, StringRef FunctionMapper); + void expandCheckImmOperandLT(raw_ostream &OS, int OpIndex, int ImmVal, + StringRef FunctionMapper); + void expandCheckImmOperandGT(raw_ostream &OS, int OpIndex, int ImmVal, + StringRef FunctionMapper); void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg, StringRef FunctionMapper); void expandCheckRegOperandSimple(raw_ostream &OS, int OpIndex, diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td llvm-toolchain-18-18.1.0-rc1/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td 2024-01-30 07:40:22.000000000 +0000 @@ -253,22 +253,23 @@ //===---------------------------------------------------------------------===// // WMMA intrinsics -class ROCDL_Wmma_IntrOp traits = []> : +class ROCDL_Wmma_IntrOp overloadedOperands, + list traits = []> : LLVM_IntrOpBase, + [0], overloadedOperands, traits, 1>, Arguments<(ins Variadic:$args)> { let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)"; } // Available on RDNA3 -def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">; -def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">; -def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">; -def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">; -def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">; -def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">; +def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>; +def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>; +def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>; +def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>; +def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>; +def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>; //===---------------------------------------------------------------------===// // Operations on raw buffer resources (stride of 0, bounds checks either off or in diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/include/mlir/IR/Dialect.h llvm-toolchain-18-18.1.0-rc1/mlir/include/mlir/IR/Dialect.h --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/include/mlir/IR/Dialect.h 2023-12-03 09:16:44.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/mlir/include/mlir/IR/Dialect.h 2024-01-30 07:40:26.000000000 +0000 @@ -281,7 +281,11 @@ /// Register a set of type classes with this dialect. template void addTypes() { - (addType(), ...); + // This initializer_list argument pack expansion is essentially equal to + // using a fold expression with a comma operator. Clang however, refuses + // to compile a fold expression with a depth of more than 256 by default. + // There seem to be no such limitations for initializer_list. + (void)std::initializer_list{0, (addType(), 0)...}; } /// Register a type instance with this dialect. @@ -292,7 +296,11 @@ /// Register a set of attribute classes with this dialect. template void addAttributes() { - (addAttribute(), ...); + // This initializer_list argument pack expansion is essentially equal to + // using a fold expression with a comma operator. Clang however, refuses + // to compile a fold expression with a depth of more than 256 by default. + // There seem to be no such limitations for initializer_list. + (void)std::initializer_list{0, (addAttribute(), 0)...}; } /// Register an attribute instance with this dialect. diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp llvm-toolchain-18-18.1.0-rc1/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp 2024-01-09 22:37:53.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp 2024-01-30 07:40:26.000000000 +0000 @@ -529,7 +529,8 @@ /*alignment=*/0); for (auto [index, arg] : llvm::enumerate(args)) { Value ptr = rewriter.create( - loc, ptrType, structType, tempAlloc, ArrayRef{0, index}); + loc, ptrType, structType, tempAlloc, + ArrayRef{0, static_cast(index)}); rewriter.create(loc, arg, ptr); } std::array printfArgs = {stringStart, tempAlloc}; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp llvm-toolchain-18-18.1.0-rc1/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp 2024-01-30 07:40:26.000000000 +0000 @@ -1041,13 +1041,14 @@ auto arrayPtr = builder.create( loc, llvmPointerType, llvmPointerType, arraySize, /*alignment=*/0); for (const auto &en : llvm::enumerate(arguments)) { + const auto index = static_cast(en.index()); Value fieldPtr = builder.create(loc, llvmPointerType, structType, structPtr, - ArrayRef{0, en.index()}); + ArrayRef{0, index}); builder.create(loc, en.value(), fieldPtr); - auto elementPtr = builder.create( - loc, llvmPointerType, llvmPointerType, arrayPtr, - ArrayRef{en.index()}); + auto elementPtr = + builder.create(loc, llvmPointerType, llvmPointerType, + arrayPtr, ArrayRef{index}); builder.create(loc, fieldPtr, elementPtr); } return arrayPtr; diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp llvm-toolchain-18-18.1.0-rc1/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp 2024-01-24 12:45:36.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp 2024-01-30 07:40:26.000000000 +0000 @@ -488,7 +488,8 @@ // Other patterns will turn this into a type-consistent GEP. auto gepOp = rewriter.create( loc, address.getType(), rewriter.getI8Type(), address, - ArrayRef{storeOffset + index * elementSize}); + ArrayRef{ + static_cast(storeOffset + index * elementSize)}); rewriter.create(loc, extractOp, gepOp); } @@ -524,9 +525,9 @@ // We create an `i8` indexed GEP here as that is the easiest (offset is // already known). Other patterns turn this into a type-consistent GEP. - auto gepOp = - rewriter.create(loc, address.getType(), rewriter.getI8Type(), - address, ArrayRef{currentOffset}); + auto gepOp = rewriter.create( + loc, address.getType(), rewriter.getI8Type(), address, + ArrayRef{static_cast(currentOffset)}); rewriter.create(loc, valueToStore, gepOp); // No need to care about padding here since we already checked previously diff -Nru llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/test/Target/LLVMIR/rocdl.mlir llvm-toolchain-18-18.1.0-rc1/mlir/test/Target/LLVMIR/rocdl.mlir --- llvm-toolchain-18-18.1.0~++20240126095841+0991d3c7b53d/mlir/test/Target/LLVMIR/rocdl.mlir 2024-01-26 08:58:41.000000000 +0000 +++ llvm-toolchain-18-18.1.0-rc1/mlir/test/Target/LLVMIR/rocdl.mlir 2024-01-30 07:40:22.000000000 +0000 @@ -248,53 +248,53 @@ // ---- Wave32 ----- // f16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) %r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32> // bf16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) %r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) %r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) %r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32> // ---- Wave64 ----- // f16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) %r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32> // bf16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) %r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) %r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) %r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32> llvm.return %r0 : vector<8xf32>