Merge ~bullwinkle-team/ubuntu/+source/rocm-smi-lib:bullwinkle/ubuntu/devel into ubuntu/+source/rocm-smi-lib:ubuntu/devel
- Git
- lp:~bullwinkle-team/ubuntu/+source/rocm-smi-lib
- bullwinkle/ubuntu/devel
- Merge into ubuntu/devel
| Status: | Merged | ||||
|---|---|---|---|---|---|
| Merged at revision: | 401aba6b66ff49ce5c298d30193f938209edf0d7 | ||||
| Proposed branch: | ~bullwinkle-team/ubuntu/+source/rocm-smi-lib:bullwinkle/ubuntu/devel | ||||
| Merge into: | ubuntu/+source/rocm-smi-lib:ubuntu/devel | ||||
| Diff against target: |
7412 lines (+2083/-1573) 140 files modified
.azuredevops/rocm-ci.yml (+4/-2) .github/CODEOWNERS (+1/-1) .github/palamida.yml (+5/-0) .github/workflows/kws-caller.yml (+15/-0) .github/workflows/rocm_ci_caller.yml (+16/-10) CHANGELOG.md (+89/-0) CMakeLists.txt (+26/-42) LICENSE.md (+1/-2) README.md (+1/-1) cmake_modules/help_package.cmake (+1/-1) cmake_modules/utils.cmake (+1/-1) debian/changelog (+36/-0) debian/control (+8/-6) debian/liboam7.install (+2/-0) debian/liboam7.symbols.amd64 (+1/-1) debian/librocm-smi64-7.install (+2/-0) debian/librocm-smi64-7.symbols.amd64 (+1/-1) debian/not-installed (+1/-1) debian/patches/0002-add-version-script-to-control-exposed-symbols.patch (+1/-1) debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch (+45/-0) debian/patches/series (+1/-0) debian/rules (+7/-0) dev/null (+0/-200) docs/conf.py (+1/-1) docs/license.md (+1/-1) include/rocm_smi/rocm_smi.h (+34/-18) include/rocm_smi/rocm_smi_common.h (+1/-1) include/rocm_smi/rocm_smi_counters.h (+1/-1) include/rocm_smi/rocm_smi_device.h (+1/-1) include/rocm_smi/rocm_smi_exception.h (+1/-1) include/rocm_smi/rocm_smi_gpu_metrics.h (+182/-64) include/rocm_smi/rocm_smi_io_link.h (+1/-1) include/rocm_smi/rocm_smi_kfd.h (+1/-1) include/rocm_smi/rocm_smi_logger.h (+1/-1) include/rocm_smi/rocm_smi_main.h (+1/-1) include/rocm_smi/rocm_smi_monitor.h (+1/-1) include/rocm_smi/rocm_smi_power_mon.h (+1/-1) include/rocm_smi/rocm_smi_properties.h (+1/-1) include/rocm_smi/rocm_smi_utils.h (+4/-1) oam/CMakeLists.txt (+4/-2) oam/src/oamConfig.in (+2/-2) python_smi_tools/README.md (+1/-1) python_smi_tools/rocm_smi.py (+176/-88) python_smi_tools/rsmiBindings.py (+28/-5) python_smi_tools/rsmiBindings.py.in (+1/-1) python_smi_tools/rsmiBindingsInit.py.in (+1/-1) rocm_smi/CMakeLists.txt (+3/-5) rocm_smi/example/rocm_smi_example.cc (+50/-1) src/rocm_smi.cc (+397/-156) src/rocm_smi64Config.in (+1/-1) src/rocm_smi_counters.cc (+1/-1) src/rocm_smi_device.cc (+9/-7) src/rocm_smi_gpu_metrics.cc (+565/-744) src/rocm_smi_io_link.cc 
(+1/-1) src/rocm_smi_kfd.cc (+3/-3) src/rocm_smi_logger.cc (+1/-1) src/rocm_smi_main.cc (+1/-1) src/rocm_smi_monitor.cc (+10/-3) src/rocm_smi_properties.cc (+1/-1) src/rocm_smi_utils.cc (+56/-25) tests/rocm_smi_test/functional/api_support_read.cc (+1/-1) tests/rocm_smi_test/functional/api_support_read.h (+1/-1) tests/rocm_smi_test/functional/computepartition_read_write.cc (+1/-1) tests/rocm_smi_test/functional/computepartition_read_write.h (+1/-1) tests/rocm_smi_test/functional/err_cnt_read.cc (+1/-1) tests/rocm_smi_test/functional/err_cnt_read.h (+1/-1) tests/rocm_smi_test/functional/evt_notif_read_write.cc (+1/-1) tests/rocm_smi_test/functional/evt_notif_read_write.h (+1/-1) tests/rocm_smi_test/functional/fan_read.cc (+7/-8) tests/rocm_smi_test/functional/fan_read.h (+1/-1) tests/rocm_smi_test/functional/fan_read_write.cc (+5/-1) tests/rocm_smi_test/functional/fan_read_write.h (+1/-1) tests/rocm_smi_test/functional/frequencies_read.cc (+1/-1) tests/rocm_smi_test/functional/frequencies_read.h (+1/-1) tests/rocm_smi_test/functional/frequencies_read_write.cc (+1/-1) tests/rocm_smi_test/functional/frequencies_read_write.h (+1/-1) tests/rocm_smi_test/functional/gpu_busy_read.cc (+3/-2) tests/rocm_smi_test/functional/gpu_busy_read.h (+1/-1) tests/rocm_smi_test/functional/gpu_metrics_read.cc (+46/-2) tests/rocm_smi_test/functional/gpu_metrics_read.h (+1/-1) tests/rocm_smi_test/functional/hw_topology_read.cc (+1/-1) tests/rocm_smi_test/functional/hw_topology_read.h (+1/-1) tests/rocm_smi_test/functional/id_info_read.cc (+1/-1) tests/rocm_smi_test/functional/id_info_read.h (+1/-1) tests/rocm_smi_test/functional/init_shutdown_refcount.cc (+1/-1) tests/rocm_smi_test/functional/init_shutdown_refcount.h (+1/-1) tests/rocm_smi_test/functional/measure_api_execution_time.cc (+1/-1) tests/rocm_smi_test/functional/measure_api_execution_time.h (+1/-1) tests/rocm_smi_test/functional/mem_page_info_read.cc (+1/-1) tests/rocm_smi_test/functional/mem_page_info_read.h (+1/-1) 
tests/rocm_smi_test/functional/mem_util_read.cc (+23/-11) tests/rocm_smi_test/functional/mem_util_read.h (+1/-1) tests/rocm_smi_test/functional/memorypartition_read_write.cc (+1/-1) tests/rocm_smi_test/functional/memorypartition_read_write.h (+1/-1) tests/rocm_smi_test/functional/metrics_counter_read.cc (+1/-1) tests/rocm_smi_test/functional/metrics_counter_read.h (+1/-1) tests/rocm_smi_test/functional/mutual_exclusion.cc (+1/-1) tests/rocm_smi_test/functional/mutual_exclusion.h (+1/-1) tests/rocm_smi_test/functional/overdrive_read.cc (+1/-1) tests/rocm_smi_test/functional/overdrive_read.h (+1/-1) tests/rocm_smi_test/functional/overdrive_read_write.cc (+1/-1) tests/rocm_smi_test/functional/overdrive_read_write.h (+1/-1) tests/rocm_smi_test/functional/pci_read_write.cc (+1/-1) tests/rocm_smi_test/functional/pci_read_write.h (+1/-1) tests/rocm_smi_test/functional/perf_cntr_read_write.cc (+1/-1) tests/rocm_smi_test/functional/perf_cntr_read_write.h (+1/-1) tests/rocm_smi_test/functional/perf_determinism.cc (+9/-1) tests/rocm_smi_test/functional/perf_determinism.h (+1/-1) tests/rocm_smi_test/functional/perf_level_read.cc (+10/-5) tests/rocm_smi_test/functional/perf_level_read.h (+1/-1) tests/rocm_smi_test/functional/perf_level_read_write.cc (+10/-5) tests/rocm_smi_test/functional/perf_level_read_write.h (+1/-1) tests/rocm_smi_test/functional/power_cap_read_write.cc (+14/-7) tests/rocm_smi_test/functional/power_cap_read_write.h (+1/-1) tests/rocm_smi_test/functional/power_read.cc (+26/-11) tests/rocm_smi_test/functional/power_read.h (+1/-1) tests/rocm_smi_test/functional/power_read_write.cc (+8/-15) tests/rocm_smi_test/functional/power_read_write.h (+1/-1) tests/rocm_smi_test/functional/process_info_read.cc (+1/-1) tests/rocm_smi_test/functional/process_info_read.h (+1/-1) tests/rocm_smi_test/functional/sys_info_read.cc (+11/-5) tests/rocm_smi_test/functional/sys_info_read.h (+1/-1) tests/rocm_smi_test/functional/temp_read.cc (+2/-2) 
tests/rocm_smi_test/functional/temp_read.h (+1/-1) tests/rocm_smi_test/functional/version_read.cc (+1/-1) tests/rocm_smi_test/functional/version_read.h (+1/-1) tests/rocm_smi_test/functional/volt_freq_curv_read.cc (+1/-1) tests/rocm_smi_test/functional/volt_freq_curv_read.h (+1/-1) tests/rocm_smi_test/functional/volt_read.cc (+2/-2) tests/rocm_smi_test/functional/volt_read.h (+1/-1) tests/rocm_smi_test/functional/xgmi_read_write.cc (+1/-1) tests/rocm_smi_test/functional/xgmi_read_write.h (+1/-1) tests/rocm_smi_test/main.cc (+1/-1) tests/rocm_smi_test/test_base.cc (+22/-13) tests/rocm_smi_test/test_base.h (+10/-3) tests/rocm_smi_test/test_common.cc (+2/-1) tests/rocm_smi_test/test_common.h (+1/-1) tests/rocm_smi_test/test_utils.cc (+1/-1) tests/rocm_smi_test/test_utils.h (+1/-1) third_party/shared_mutex/shared_mutex.cc (+1/-1) |
||||
| Related bugs: |
|
| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Frank Heimes (community) | Approve | ||
| Igor Luppi (community) | Approve | ||
| Andreas Hasenack | Pending | ||
| Ubuntu Sponsors | Pending | ||
| Talha Can Havadar | Pending | ||
|
Review via email:
|
|||
This proposal supersedes a proposal from 2025-12-04.
Commit message
Description of the change
New upstream version 7.1.0
Tested this package in:
https:/
And here in this ppa where I experimented with upstream llvm:
https:/
Also built the llama.cpp snap (which indirectly depends on this package):
https:/
Functionally, things seem to be working OK.
| Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal | # |
| Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal | # |
| Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal | # |
| Andreas Hasenack (ahasenack) wrote : Posted in a previous version of this proposal | # |
| Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal | # |
| Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal | # |
| Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal | # |
Why do we have this change in rocm-smi-lib?
d/control:
```
-Architecture: linux-any
+Architecture: amd64 arm64
```
| Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal | # |
Also about b/debian/
Based on our previous discussions, having an ubuntu suffix in the version would block the sync from Debian, but I don't see a real need to block the sync from Debian for rocm-smi-lib (it doesn't depend on the AMD LLVM fork).
| Igor Luppi (igorluppi) wrote : Posted in a previous version of this proposal | # |
Please @Talha, take a second review. I have changed the patch and also the soversion. Thanks!!
| Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal | # |
| Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal | # |
I just updated the branch with the fix and gonna merge this to bullwinkle/
| Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal | # |
Igor, can you change the proposed branch to bullwinkle/
| Igor Luppi (igorluppi) : | # |
| Talha Can Havadar (tchavadar) wrote : | # |
created https:/
| Frank Heimes (fheimes) wrote : | # |
First of all thanks for this significant work!
I have a few / the following thoughts:
- I am surprised that this ROCm package also builds for platforms other than amd64, amd64v3
I hadn't expected that - it also builds for arm64, armhf, ppc64el and s390x
(The previous version even for i386 and riscv64.)
Does that make sense? Especially the librocm-"smi64" armhf ?
In d/control I see that the architecture is 'linux-any' - might make sense to limit this more.
(But if all these arch. are upstream supported and work fine, ignore my ignorance, but then the symbols
file would need to be available for more than amd64.)
- I bumped into the above because there is only a liboam7.symbols file for amd64.
(Hence lintian, running on the binary DEBs, complains about the missing symbols file for all other
architectures.)
- Then I believe that version 7.1.0-0ubuntu"1" was never uploaded outside of the PPA, right (since it's also marked with 'UNRELEASED')?
So we could also combine/squash the changelog entries for 0ubuntu1 and 0ubuntu2 to a new 0ubuntu1
and upload this. - just a thought.
(It would be a bit cleaner for the archive, but I believe uploading an 0ubuntu2 would also be ok).
The only issue that I found was when I tried to get the orig-tarball, what I usually do using uscan.
But that didn't work, because the watch file seems to be broken:
$ uscan
uscan warn: debian/watch is an obsolete version 1 watch file;
please upgrade to a higher version
(see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
please upgrade to a higher version
(see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
please upgrade to a higher version
(see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
please upgrade to a higher version
(see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
please upgrade to a higher version
(see uscan(1) for details).
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Version: 5
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Source: https:/
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Matching-Pattern: https:/
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Filenamemangle: s%.*/rocm-
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, ...
| Frank Heimes (fheimes) wrote : | # |
Good point, the issue with the watch file is likely because it's a v5, and I tried it on a noble system (rather than in a resolute container) -- good catch.
| Talha Can Havadar (tchavadar) wrote : | # |
Hello Frank,
Thank you very much for your time, yeah you are absolutely correct on questioning arch changes.
Please see the original content of the debian/control of the package here: https:/
I believe there was a glitch or bug in our Debian sync bot/tool which caused all architectures to be added to this package during import; it probably replaces every `linux-any` with an explicit list of all architectures.
So I believe it should stay as `linux-any`.
About 0ubuntu1: yes, it was only released in the PPA. I don't think anybody really installed the package from the PPA, but it is a public PPA anyway, so to avoid breaking the update path I would like to keep 0ubuntu2. Do you want me to merge these changelog entries, keep the version as 0ubuntu2, and add a comment in the entry to state this?
| Talha Can Havadar (tchavadar) wrote (last edit ): | # |
For uscan I think v5 only works on questing and above. So I usually use a resolute container or find a newer version of uscan somewhere and use that locally to run v5 watch scripts
| Frank Heimes (fheimes) wrote : | # |
Hi Talha,
thanks for the reference to salsa - looks like it was always linux-any.
Yes, the watch file version is 5, hence it works fine on a resolute system.
(Funnily enough it would not have helped me, since it of course fetches the latest version, which is 7.1.1 - but this is a version bump to 7.1.0.)
No, if you want to stick with 0ubuntu2, I think there is no need for an update and we can go as is.
| Talha Can Havadar (tchavadar) wrote : | # |
Thank you very much Frank!
Preview Diff
| 1 | diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml |
| 2 | index dcb8198..430c585 100644 |
| 3 | --- a/.azuredevops/rocm-ci.yml |
| 4 | +++ b/.azuredevops/rocm-ci.yml |
| 5 | @@ -14,26 +14,28 @@ trigger: |
| 6 | branches: |
| 7 | include: |
| 8 | - amd-staging |
| 9 | + - amd-mainline |
| 10 | paths: |
| 11 | exclude: |
| 12 | - .github |
| 13 | - docs |
| 14 | - '.*.y*ml' |
| 15 | - '*.md' |
| 16 | - - License.txt |
| 17 | + - LICENSE |
| 18 | |
| 19 | pr: |
| 20 | autoCancel: true |
| 21 | branches: |
| 22 | include: |
| 23 | - amd-staging |
| 24 | + - amd-mainline |
| 25 | paths: |
| 26 | exclude: |
| 27 | - .github |
| 28 | - docs |
| 29 | - '.*.y*ml' |
| 30 | - '*.md' |
| 31 | - - License.txt |
| 32 | + - LICENSE |
| 33 | drafts: false |
| 34 | |
| 35 | jobs: |
| 36 | diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS |
| 37 | index 50f8b9f..75ee317 100644 |
| 38 | --- a/.github/CODEOWNERS |
| 39 | +++ b/.github/CODEOWNERS |
| 40 | @@ -1,4 +1,4 @@ |
| 41 | -* @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan |
| 42 | +* @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan @marifamd @gabrpham |
| 43 | |
| 44 | docs/* @ROCm/rocm-documentation |
| 45 | *.md @ROCm/rocm-documentation |
| 46 | diff --git a/.github/palamida.yml b/.github/palamida.yml |
| 47 | new file mode 100644 |
| 48 | index 0000000..47bd57a |
| 49 | --- /dev/null |
| 50 | +++ b/.github/palamida.yml |
| 51 | @@ -0,0 +1,5 @@ |
| 52 | +disabled: false |
| 53 | +scmId: gh-emu-rocm |
| 54 | +branchesToScan: |
| 55 | + - amd-staging |
| 56 | + - amd-mainline |
| 57 | \ No newline at end of file |
| 58 | diff --git a/.github/workflows/kws-caller.yml b/.github/workflows/kws-caller.yml |
| 59 | new file mode 100644 |
| 60 | index 0000000..c0f4f26 |
| 61 | --- /dev/null |
| 62 | +++ b/.github/workflows/kws-caller.yml |
| 63 | @@ -0,0 +1,15 @@ |
| 64 | +name: Rocm Validation Suite KWS |
| 65 | +on: |
| 66 | + push: |
| 67 | + branches: [amd-staging, amd-mainline] |
| 68 | + pull_request: |
| 69 | + types: [opened, synchronize, reopened] |
| 70 | + workflow_dispatch: |
| 71 | +jobs: |
| 72 | + kws: |
| 73 | + if: ${{ github.event_name == 'pull_request' }} |
| 74 | + uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/kws.yml@mainline |
| 75 | + secrets: inherit |
| 76 | + with: |
| 77 | + pr_number: ${{github.event.pull_request.number}} |
| 78 | + base_branch: ${{github.base_ref}} |
| 79 | diff --git a/.github/workflows/rocm_ci_caller.yml b/.github/workflows/rocm_ci_caller.yml |
| 80 | index c3a28cc..9643cdf 100644 |
| 81 | --- a/.github/workflows/rocm_ci_caller.yml |
| 82 | +++ b/.github/workflows/rocm_ci_caller.yml |
| 83 | @@ -1,19 +1,25 @@ |
| 84 | -name: ROCm CI Caller |
| 85 | -on: |
| 86 | +name: ROCm CI Caller |
| 87 | +on: |
| 88 | pull_request: |
| 89 | - branches: [release/rocm-rel-6.4] |
| 90 | + branches: [amd-staging, release/rocm-rel-*, amd-mainline] |
| 91 | types: [opened, reopened, synchronize] |
| 92 | + push: |
| 93 | + branches: [amd-mainline] |
| 94 | workflow_dispatch: |
| 95 | + issue_comment: |
| 96 | + types: [created] |
| 97 | |
| 98 | jobs: |
| 99 | call-workflow: |
| 100 | - uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline |
| 101 | + if: github.event_name != 'issue_comment' ||(github.event_name == 'issue_comment' && github.event.issue.pull_request && (startsWith(github.event.comment.body, '!verify') || startsWith(github.event.comment.body, '!verify release') || startsWith(github.event.comment.body, '!verify retest'))) |
| 102 | + uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline |
| 103 | secrets: inherit |
| 104 | with: |
| 105 | - input_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |
| 106 | - input_pr_num: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 0 }} |
| 107 | - input_pr_url: ${{ github.event_name == 'pull_request' && github.event.pull_request.html_url || '' }} |
| 108 | - input_pr_title: ${{ github.event_name == 'pull_request' && github.event.pull_request.title || '' }} |
| 109 | + input_sha: ${{github.event_name == 'pull_request' && github.event.pull_request.head.sha || (github.event_name == 'push' && github.sha) || (github.event_name == 'issue_comment' && github.event.issue.pull_request.head.sha) || github.sha}} |
| 110 | + input_pr_num: ${{github.event_name == 'pull_request' && github.event.pull_request.number || (github.event_name == 'issue_comment' && github.event.issue.number) || 0}} |
| 111 | + input_pr_url: ${{github.event_name == 'pull_request' && github.event.pull_request.html_url || (github.event_name == 'issue_comment' && github.event.issue.pull_request.html_url) || ''}} |
| 112 | + input_pr_title: ${{github.event_name == 'pull_request' && github.event.pull_request.title || (github.event_name == 'issue_comment' && github.event.issue.pull_request.title) || ''}} |
| 113 | repository_name: ${{ github.repository }} |
| 114 | - base_ref: ${{ github.event_name == 'pull_request' && github.base_ref || github.ref }} |
| 115 | - trigger_event_type: ${{ github.event_name }} |
| 116 | + base_ref: ${{github.event_name == 'pull_request' && github.event.pull_request.base.ref || (github.event_name == 'issue_comment' && github.event.issue.pull_request.base.ref) || github.ref}} |
| 117 | + trigger_event_type: ${{ github.event_name }} |
| 118 | + comment_text: ${{ github.event_name == 'issue_comment' && github.event.comment.body || '' }} |
| 119 | diff --git a/CHANGELOG.md b/CHANGELOG.md |
| 120 | index 683e654..ab2eac0 100644 |
| 121 | --- a/CHANGELOG.md |
| 122 | +++ b/CHANGELOG.md |
| 123 | @@ -4,6 +4,95 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] |
| 124 | |
| 125 | ***All information listed below is for reference and subject to change.*** |
| 126 | |
| 127 | +## rocm_smi_lib for ROCm 7.0.0 |
| 128 | + |
| 129 | +### Added |
| 130 | + |
| 131 | +- **Added support for GPU metrics 1.8**. |
| 132 | + - Added new fields for `rsmi_gpu_metrics_t` including: |
| 133 | + - Adding the following metrics to allow new calculations for violation status: |
| 134 | + - Per XCP metrics `gfx_below_host_limit_ppt_acc[XCP][MAX_XCC]` - GFX Clock Host limit Package Power Tracking violation counts |
| 135 | + - Per XCP metrics `gfx_below_host_limit_thm_acc[XCP][MAX_XCC]` - GFX Clock Host limit Thermal (TVIOL) violation counts |
| 136 | + - Per XCP metrics `gfx_low_utilization_acc[XCP][MAX_XCC]` - violation counts for how did low utilization caused the GPU to be below application clocks. |
| 137 | + - Per XCP metrics `gfx_below_host_limit_total_acc[XCP][MAX_XCC]`- violation counts for how long GPU was held below application clocks any limiter (see above new violation metrics). |
| 138 | + - Increasing available JPEG engines to 40. |
| 139 | + Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI. |
| 140 | + |
| 141 | +### Changed |
| 142 | + |
| 143 | +- N/A |
| 144 | + |
| 145 | +### Removed |
| 146 | + |
| 147 | +- **Removed backwards compatibility `rsmi_dev_gpu_metrics_info_get()`'s `jpeg_activity` or `vcn_activity` fields: use `xcp_stats.jpeg_busy` or `xcp_stats.vcn_busy`** |
| 148 | + - Backwards compability is removed for `jpeg_activity` and `vcn_activity` fields, if the `jpeg_busy` or `vcn_busy` field is available. |
| 149 | + - <i>Reasons for this change</i>: |
| 150 | + - Providing both `vcn_activity`/`jpeg_activity` and XCP (partition) stats `vcn_busy`/`jpeg_busy` caused confusion for users about which field to use. By removing backward compatibility, it is easier to identify the relevant field. |
| 151 | + - The `jpeg_busy` field increased in size (for supported ASICs), making backward compatibility unable to fully copy the structure into `jpeg_activity`. |
| 152 | + |
| 153 | + See below for comparison of updated CLI outputs: |
| 154 | + |
| 155 | + Original output: |
| 156 | + ```shell |
| 157 | + $ rocm-smi --showmetrics |
| 158 | + GPU[0] : vcn_activity (%): [0, 'N/A', 'N/A', 'N/A'] |
| 159 | + GPU[0] : jpeg_activity (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 160 | + GPU[0] XCP[0] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 161 | + GPU[0] XCP[1] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 162 | + GPU[0] XCP[2] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 163 | + GPU[0] XCP[3] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 164 | + GPU[0] XCP[4] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 165 | + GPU[0] XCP[5] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 166 | + GPU[0] XCP[6] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 167 | + GPU[0] XCP[7] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 168 | + GPU[0] XCP[0] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 169 | + GPU[0] XCP[1] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 170 | + GPU[0] XCP[2] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 171 | + GPU[0] XCP[3] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 172 | + GPU[0] XCP[4] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 173 | + GPU[0] XCP[5] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 174 | + GPU[0] XCP[6] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 175 | + GPU[0] XCP[7] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 176 | + ``` |
| 177 | + New output: |
| 178 | + ```shell |
| 179 | + $ rocm-smi --showmetrics |
| 180 | + GPU[0] : vcn_activity (%): ['N/A', 'N/A', 'N/A', 'N/A'] |
| 181 | + GPU[0] : jpeg_activity (%): ['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 182 | + GPU[0] XCP[0] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 183 | + GPU[0] XCP[1] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 184 | + GPU[0] XCP[2] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 185 | + GPU[0] XCP[3] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 186 | + GPU[0] XCP[4] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 187 | + GPU[0] XCP[5] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 188 | + GPU[0] XCP[6] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 189 | + GPU[0] XCP[7] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] |
| 190 | + GPU[0] XCP[0] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 191 | + GPU[0] XCP[1] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 192 | + GPU[0] XCP[2] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 193 | + GPU[0] XCP[3] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 194 | + GPU[0] XCP[4] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 195 | + GPU[0] XCP[5] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 196 | + GPU[0] XCP[6] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 197 | + GPU[0] XCP[7] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A'] |
| 198 | + ``` |
| 199 | + |
| 200 | +### Optimized |
| 201 | + |
| 202 | +- N/A |
| 203 | + |
| 204 | +### Resolved issues |
| 205 | + |
| 206 | +- N/A |
| 207 | + |
| 208 | +### Upcoming changes |
| 209 | + |
| 210 | +- N/A |
| 211 | + |
| 212 | +### Known issues |
| 213 | + |
| 214 | +- N/A |
| 215 | + |
| 216 | ## rocm_smi_lib for ROCm 6.4.1 |
| 217 | |
| 218 | ### Added |
| 219 | diff --git a/CMakeLists.txt b/CMakeLists.txt |
| 220 | old mode 100755 |
| 221 | new mode 100644 |
| 222 | index a374078..327cb30 |
| 223 | --- a/CMakeLists.txt |
| 224 | +++ b/CMakeLists.txt |
| 225 | @@ -5,15 +5,13 @@ message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") |
| 226 | message(" CMake ROCm SMI (Library) [root] ") |
| 227 | message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") |
| 228 | cmake_minimum_required(VERSION 3.14) |
| 229 | +project(rocm_smi_lib) |
| 230 | |
| 231 | set(ROCM_SMI_LIBS_TARGET "rocm_smi_libraries") |
| 232 | |
| 233 | set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or not.") |
| 234 | |
| 235 | -## Set default module path if not already set |
| 236 | -if(NOT DEFINED CMAKE_MODULE_PATH) |
| 237 | - set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/") |
| 238 | -endif() |
| 239 | +list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") |
| 240 | ## Include common cmake modules |
| 241 | include(utils) |
| 242 | |
| 243 | @@ -23,7 +21,7 @@ find_package(PkgConfig) |
| 244 | set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory") |
| 245 | |
| 246 | if (NOT DEFINED CPACK_RESOURCE_FILE_LICENSE) |
| 247 | - set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/License.txt") |
| 248 | + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md") |
| 249 | endif() |
| 250 | |
| 251 | set(ROCM_SMI "rocm_smi") |
| 252 | @@ -41,10 +39,12 @@ set(SHARE_INSTALL_PREFIX |
| 253 | # provide git to utilities |
| 254 | find_program (GIT NAMES git) |
| 255 | |
| 256 | +# sets DRM_INCLUDE_DIRS |
| 257 | +pkg_check_modules(DRM REQUIRED libdrm) |
| 258 | |
| 259 | ## Setup the package version based on git tags. |
| 260 | set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") |
| 261 | -get_package_version_number("7.6.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) |
| 262 | +get_package_version_number("7.8.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) |
| 263 | message("Package version: ${PKG_VERSION_STR}") |
| 264 | set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") |
| 265 | set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") |
| 266 | @@ -101,7 +101,7 @@ set(CMAKE_CXX_FLAGS |
| 267 | |
| 268 | # Clang does not set the build-id |
| 269 | if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") |
| 270 | - set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--build-id=sha1") |
| 271 | + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--build-id=sha1") |
| 272 | endif() |
| 273 | |
| 274 | # Use this instead of above for 32 bit |
| 275 | @@ -135,10 +135,16 @@ else () |
| 276 | set(CMAKE_CXX_FLAGS |
| 277 | "${CMAKE_CXX_FLAGS} -DFORTIFY_SOURCE=2 -fstack-protector-all -Wcast-align") |
| 278 | ## More security breach mitigation flags |
| 279 | - set(CMAKE_CXX_FLAGS |
| 280 | - "${CMAKE_CXX_FLAGS} -Wl,-z,noexecstack -Wl,-znoexecheap -Wl,-z,relro ") |
| 281 | - set(CMAKE_CXX_FLAGS |
| 282 | - "${CMAKE_CXX_FLAGS} -Wtrampolines -Wl,-z,now") |
| 283 | + set(HARDENING_LDFLAGS |
| 284 | + "${HARDENING_LDFLAGS} -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") |
| 285 | + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${HARDENING_LDFLAGS}") |
| 286 | + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HARDENING_LDFLAGS}") |
| 287 | + |
| 288 | + include(CheckCXXCompilerFlag) |
| 289 | + check_cxx_compiler_flag("-Wtrampolines" CXX_SUPPORTS_WTRAMPOLINES) |
| 290 | + if (CXX_SUPPORTS_WTRAMPOLINES) |
| 291 | + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wtrampolines") |
| 292 | + endif () |
| 293 | endif () |
| 294 | |
| 295 | set(COMMON_SRC_DIR "${PROJECT_SOURCE_DIR}/src") |
| 296 | @@ -197,16 +203,15 @@ set(CPACK_RPM_COMPONENT_INSTALL ON) |
| 297 | # python doesn't need to be asan |
| 298 | set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6, python3") |
| 299 | set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}") |
| 300 | +set(CPACK_RPM_PACKAGE_REQUIRES "python3") |
| 301 | +set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}") |
| 302 | # Only add dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON is given |
| 303 | if(ROCM_DEP_ROCMCORE) |
| 304 | - set(CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}, rocm-core") |
| 305 | - # rocm-core needs to be asan |
| 306 | - # override original variable because CPACK_DEBIAN_PACKAGE_DEPENDS changed |
| 307 | - set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}-asan") |
| 308 | + string(APPEND CPACK_DEBIAN_PACKAGE_DEPENDS ", rocm-core") |
| 309 | + string(APPEND CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS ", rocm-core-asan") |
| 310 | + string(APPEND CPACK_RPM_PACKAGE_REQUIRES ", rocm-core") |
| 311 | + string(APPEND CPACK_RPM_ASAN_PACKAGE_REQUIRES ", rocm-core-asan") |
| 312 | endif() |
| 313 | -# carefully reuse DEB's "DEPENDS" for RPM's "REQUIRES" |
| 314 | -set(CPACK_RPM_PACKAGE_REQUIRES "python3") |
| 315 | -set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}") |
| 316 | |
| 317 | #Component Specific Configuration/Flags |
| 318 | set(CPACK_DEBIAN_DEV_PACKAGE_NAME ${ROCM_SMI_PACKAGE}) |
| 319 | @@ -221,33 +226,12 @@ set(CPACK_RPM_STATIC_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-static-devel) |
| 320 | add_subdirectory("rocm_smi") |
| 321 | add_subdirectory("oam") |
| 322 | |
| 323 | -option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF) |
| 324 | - |
| 325 | # Add tests |
| 326 | if(BUILD_TESTS) |
| 327 | set(TESTS_COMPONENT "tests") |
| 328 | add_subdirectory("tests/rocm_smi_test") |
| 329 | endif() |
| 330 | |
| 331 | -if(FILE_REORG_BACKWARD_COMPATIBILITY) |
| 332 | -# To enable/disable #error in wrapper header files |
| 333 | - if(NOT DEFINED ROCM_HEADER_WRAPPER_WERROR) |
| 334 | - if(DEFINED ENV{ROCM_HEADER_WRAPPER_WERROR}) |
| 335 | - set(ROCM_HEADER_WRAPPER_WERROR "$ENV{ROCM_HEADER_WRAPPER_WERROR}" |
| 336 | - CACHE STRING "Header wrapper warnings as errors.") |
| 337 | - else() |
| 338 | - set(ROCM_HEADER_WRAPPER_WERROR "OFF" CACHE STRING "Header wrapper warnings as errors.") |
| 339 | - endif() |
| 340 | - endif() |
| 341 | - if(ROCM_HEADER_WRAPPER_WERROR) |
| 342 | - set(deprecated_error 1) |
| 343 | - else() |
| 344 | - set(deprecated_error 0) |
| 345 | - endif() |
| 346 | - |
| 347 | - include(rocm_smi-backward-compat.cmake) |
| 348 | -endif() |
| 349 | - |
| 350 | include(CMakePackageConfigHelpers) |
| 351 | |
| 352 | set(LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}") |
| 353 | @@ -293,14 +277,14 @@ install(EXPORT rocm_smiTargets |
| 354 | |
| 355 | #License file |
| 356 | set(CPACK_RPM_PACKAGE_LICENSE "NCSA") |
| 357 | -# install license file in share/doc/rocm_smi-asan folder |
| 358 | +# install license file in share/doc/rocm-smi-lib-asan folder |
| 359 | if( ENABLE_ASAN_PACKAGING ) |
| 360 | install(FILES ${CPACK_RESOURCE_FILE_LICENSE} |
| 361 | - DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI}-asan RENAME LICENSE.txt |
| 362 | + DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI_PACKAGE}-asan RENAME LICENSE.md |
| 363 | COMPONENT asan) |
| 364 | endif() |
| 365 | install( FILES ${CPACK_RESOURCE_FILE_LICENSE} |
| 366 | - DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI} RENAME LICENSE.txt |
| 367 | + DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI_PACKAGE} RENAME LICENSE.md |
| 368 | COMPONENT dev) |
| 369 | |
| 370 | ########################### |
| 371 | diff --git a/License.txt b/LICENSE.md |
| 372 | similarity index 94% |
| 373 | rename from License.txt |
| 374 | rename to LICENSE.md |
| 375 | index 31f9503..4d43ac8 100644 |
| 376 | --- a/License.txt |
| 377 | +++ b/LICENSE.md |
| 378 | @@ -1,6 +1,6 @@ |
| 379 | MIT License |
| 380 | |
| 381 | -Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. |
| 382 | +Copyright (C) Advanced Micro Devices, Inc. |
| 383 | |
| 384 | Permission is hereby granted, free of charge, to any person obtaining a copy |
| 385 | of this software and associated documentation files (the "Software"), to deal |
| 386 | @@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 387 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 388 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 389 | SOFTWARE. |
| 390 | - |
| 391 | diff --git a/README.md b/README.md |
| 392 | old mode 100755 |
| 393 | new mode 100644 |
| 394 | index 13a593a..5ef16eb |
| 395 | --- a/README.md |
| 396 | +++ b/README.md |
| 397 | @@ -1,6 +1,6 @@ |
| 398 | # 🛠️ Maintenance Mode Notice 🛠️ |
| 399 | |
| 400 | -Starting with ROCm 6.5, only critical bug fixes will be applied to ROCm-SMI. |
| 401 | +Starting with ROCm 7.0, only critical bug fixes will be applied to ROCm-SMI. |
| 402 | For a seamless experience and continued support, please switch to [AMD-SMI](https://github.com/ROCm/amdsmi). |
| 403 | |
| 404 | ## Use C++ in ROCm SMI |
| 405 | diff --git a/cmake_modules/help_package.cmake b/cmake_modules/help_package.cmake |
| 406 | index 94f71ce..9bfb07e 100644 |
| 407 | --- a/cmake_modules/help_package.cmake |
| 408 | +++ b/cmake_modules/help_package.cmake |
| 409 | @@ -85,7 +85,7 @@ function(generic_package) |
| 410 | "${CMAKE_INSTALL_PREFIX}" |
| 411 | CACHE STRING "Default packaging prefix.") |
| 412 | set(CPACK_RESOURCE_FILE_LICENSE |
| 413 | - "${CMAKE_CURRENT_SOURCE_DIR}/License.txt" |
| 414 | + "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" |
| 415 | CACHE STRING "") |
| 416 | set(CPACK_RPM_PACKAGE_LICENSE |
| 417 | "MIT" |
| 418 | diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake |
| 419 | old mode 100755 |
| 420 | new mode 100644 |
| 421 | index 7131761..77aadee |
| 422 | --- a/cmake_modules/utils.cmake |
| 423 | +++ b/cmake_modules/utils.cmake |
| 424 | @@ -3,7 +3,7 @@ |
| 425 | ## The University of Illinois/NCSA |
| 426 | ## Open Source License (NCSA) |
| 427 | ## |
| 428 | -## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved. |
| 429 | +## Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. |
| 430 | ## |
| 431 | ## Developed by: |
| 432 | ## |
| 433 | diff --git a/debian/changelog b/debian/changelog |
| 434 | index 41c6f20..09f56d1 100644 |
| 435 | --- a/debian/changelog |
| 436 | +++ b/debian/changelog |
| 437 | @@ -1,3 +1,39 @@ |
| 438 | +rocm-smi-lib (7.1.0-0ubuntu2) resolute; urgency=medium |
| 439 | + |
| 440 | + [ Talha Can Havadar ] |
| 441 | + * New upstream version 7.1.0 (LP: #2138653) |
| 442 | + * d/p/0002-add-version-script-to-control-exposed-symbols.patch: |
| 443 | + Fix FTBFS, error bad value for '-march=' switch |
| 444 | + |
| 445 | + [ Gennaro Oliva ] |
| 446 | + * d/control: add missing libdrm-dev for librocm-smi-lib (Closes: #1121159) |
| 447 | + |
| 448 | + -- Talha Can Havadar <talha.can.havadar@canonical.com> Fri, 19 Dec 2025 08:45:45 +0100 |
| 449 | + |
| 450 | +rocm-smi-lib (7.1.0-0ubuntu1) UNRELEASED; urgency=medium |
| 451 | + |
| 452 | + [ Igor Luppi ] |
| 453 | + * d/{control,liboam1*,librocm-smi64-1*}: Update SOVERSION from 1 to 7 |
| 454 | + * d/p/0002-add-version-script-to-control-exposed-symbols.patch: fix soversion |
| 455 | + * d/rules: fix soversion |
| 456 | + |
| 457 | + [ Zhai Zhaoxuan ] |
| 458 | + * d/control: add pkg-config in build-depends |
| 459 | + * d/not-installed: update the name of duplicated LICENSE file |
| 460 | + |
| 461 | + [ Talha Can Havadar ] |
| 462 | + * d/rules: make symbol checking strict for builds |
| 463 | + * d/patches: fix so version problem due to missing git |
| 464 | + |
| 465 | + -- Igor Luppi <igor.luppi@canonical.com> Wed, 17 Dec 2025 16:21:06 -0300 |
| 466 | + |
| 467 | +rocm-smi-lib (6.4.3-0ubuntu1) questing; urgency=medium |
| 468 | + |
| 469 | + * New upstream version 6.4.3 |
| 470 | + * d/control: update maintainer information |
| 471 | + |
| 472 | + -- Igor Luppi <igor.luppi@canonical.com> Mon, 22 Sep 2025 15:52:03 -0300 |
| 473 | + |
| 474 | rocm-smi-lib (6.4.1-1.1) unstable; urgency=medium |
| 475 | |
| 476 | * Non-maintainer upload. |
| 477 | diff --git a/debian/control b/debian/control |
| 478 | index b3d0850..eba2d34 100644 |
| 479 | --- a/debian/control |
| 480 | +++ b/debian/control |
| 481 | @@ -1,5 +1,6 @@ |
| 482 | Source: rocm-smi-lib |
| 483 | -Maintainer: Debian ROCm Team <debian-ai@lists.debian.org> |
| 484 | +Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com> |
| 485 | +XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org> |
| 486 | Uploaders: Mo Zhou <lumin@debian.org>, |
| 487 | Étienne Mollier <emollier@debian.org>, |
| 488 | Xuanteng Huang <xuanteng.huang@outlook.com>, |
| 489 | @@ -9,6 +10,7 @@ Priority: optional |
| 490 | Build-Depends: debhelper-compat (= 13), |
| 491 | cmake, |
| 492 | libdrm-dev, |
| 493 | + pkgconf, |
| 494 | Standards-Version: 4.7.2 |
| 495 | Vcs-Browser: https://salsa.debian.org/rocm-team/rocm-smi-lib |
| 496 | Vcs-Git: https://salsa.debian.org/rocm-team/rocm-smi-lib.git |
| 497 | @@ -19,12 +21,12 @@ Architecture: linux-any |
| 498 | Section: utils |
| 499 | Depends: ${misc:Depends}, |
| 500 | python3:any, |
| 501 | - librocm-smi64-1 (= ${binary:Version}) |
| 502 | + librocm-smi64-7 (= ${binary:Version}) |
| 503 | Description: ROCm System Management Interface (ROCm SMI) command-line interface |
| 504 | This is the reference implementation from AMD, exposing the ROCm SMI library |
| 505 | to the user. It presents a Python executable, `rocm-smi`. |
| 506 | |
| 507 | -Package: librocm-smi64-1 |
| 508 | +Package: librocm-smi64-7 |
| 509 | Architecture: linux-any |
| 510 | Section: libs |
| 511 | Depends: ${misc:Depends}, |
| 512 | @@ -39,9 +41,9 @@ Package: librocm-smi-dev |
| 513 | Architecture: linux-any |
| 514 | Section: libdevel |
| 515 | Depends: ${misc:Depends}, |
| 516 | - librocm-smi64-1 (= ${binary:Version}), |
| 517 | liboam-dev (= ${binary:Version}), |
| 518 | libdrm-dev, |
| 519 | + librocm-smi64-7 (= ${binary:Version}) |
| 520 | Description: ROCm System Management Interface (ROCm SMI) library headers |
| 521 | ROCm SMI is part of the ROCm software stack. It is a C library for Linux |
| 522 | that provides a user-space interface for applications to monitor and |
| 523 | @@ -49,7 +51,7 @@ Description: ROCm System Management Interface (ROCm SMI) library headers |
| 524 | . |
| 525 | This package contains the development headers. |
| 526 | |
| 527 | -Package: liboam1 |
| 528 | +Package: liboam7 |
| 529 | Architecture: linux-any |
| 530 | Section: libs |
| 531 | Depends: ${misc:Depends}, |
| 532 | @@ -64,7 +66,7 @@ Package: liboam-dev |
| 533 | Architecture: linux-any |
| 534 | Section: libdevel |
| 535 | Depends: ${misc:Depends}, |
| 536 | - liboam1 (= ${binary:Version}) |
| 537 | + liboam7 (= ${binary:Version}) |
| 538 | Description: Datacenter flavor of a GPU system-management API headers |
| 539 | OCP Accelerator Module (OAM), is an Open Compute Project (OCP) hardware |
| 540 | standard, used in datacenters and high-performance-computing (HPC) clusters. |
| 541 | diff --git a/debian/liboam1.install b/debian/liboam1.install |
| 542 | deleted file mode 100644 |
| 543 | index d2ebc4e..0000000 |
| 544 | --- a/debian/liboam1.install |
| 545 | +++ /dev/null |
| 546 | @@ -1,2 +0,0 @@ |
| 547 | -usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.1 usr/lib/${DEB_HOST_MULTIARCH}/ |
| 548 | -usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.1.* usr/lib/${DEB_HOST_MULTIARCH}/ |
| 549 | diff --git a/debian/liboam7.install b/debian/liboam7.install |
| 550 | new file mode 100644 |
| 551 | index 0000000..9c6ccd3 |
| 552 | --- /dev/null |
| 553 | +++ b/debian/liboam7.install |
| 554 | @@ -0,0 +1,2 @@ |
| 555 | +usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.7 usr/lib/${DEB_HOST_MULTIARCH}/ |
| 556 | +usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.7.* usr/lib/${DEB_HOST_MULTIARCH}/ |
| 557 | diff --git a/debian/liboam1.symbols.amd64 b/debian/liboam7.symbols.amd64 |
| 558 | similarity index 99% |
| 559 | rename from debian/liboam1.symbols.amd64 |
| 560 | rename to debian/liboam7.symbols.amd64 |
| 561 | index 08d444b..9348cc4 100644 |
| 562 | --- a/debian/liboam1.symbols.amd64 |
| 563 | +++ b/debian/liboam7.symbols.amd64 |
| 564 | @@ -1,4 +1,4 @@ |
| 565 | -liboam.so.1 liboam1 #MINVER# |
| 566 | +liboam.so.7 liboam7 #MINVER# |
| 567 | * Build-Depends-Package: liboam-dev |
| 568 | amdoam_discover_devices@Base 4.5.2 |
| 569 | amdoam_free@Base 4.5.2 |
| 570 | diff --git a/debian/librocm-smi64-1.install b/debian/librocm-smi64-1.install |
| 571 | deleted file mode 100644 |
| 572 | index 9ea6b8c..0000000 |
| 573 | --- a/debian/librocm-smi64-1.install |
| 574 | +++ /dev/null |
| 575 | @@ -1,2 +0,0 @@ |
| 576 | -usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.1 usr/lib/${DEB_HOST_MULTIARCH}/ |
| 577 | -usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.1.* usr/lib/${DEB_HOST_MULTIARCH}/ |
| 578 | diff --git a/debian/librocm-smi64-7.install b/debian/librocm-smi64-7.install |
| 579 | new file mode 100644 |
| 580 | index 0000000..13c97fe |
| 581 | --- /dev/null |
| 582 | +++ b/debian/librocm-smi64-7.install |
| 583 | @@ -0,0 +1,2 @@ |
| 584 | +usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.7 usr/lib/${DEB_HOST_MULTIARCH}/ |
| 585 | +usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.7.* usr/lib/${DEB_HOST_MULTIARCH}/ |
| 586 | diff --git a/debian/librocm-smi64-1.symbols.amd64 b/debian/librocm-smi64-7.symbols.amd64 |
| 587 | similarity index 99% |
| 588 | rename from debian/librocm-smi64-1.symbols.amd64 |
| 589 | rename to debian/librocm-smi64-7.symbols.amd64 |
| 590 | index 4de3a4d..77834e3 100644 |
| 591 | --- a/debian/librocm-smi64-1.symbols.amd64 |
| 592 | +++ b/debian/librocm-smi64-7.symbols.amd64 |
| 593 | @@ -1,4 +1,4 @@ |
| 594 | -librocm_smi64.so.1 librocm-smi64-1 #MINVER# |
| 595 | +librocm_smi64.so.7 librocm-smi64-7 #MINVER# |
| 596 | * Build-Depends-Package: librocm-smi64-dev |
| 597 | devInfoTypesStrings@Base 6.1.2 |
| 598 | logFileName@Base 6.1.2 |
| 599 | diff --git a/debian/librocm-smi64-1.version b/debian/librocm-smi64-7.version |
| 600 | similarity index 100% |
| 601 | rename from debian/librocm-smi64-1.version |
| 602 | rename to debian/librocm-smi64-7.version |
| 603 | diff --git a/debian/not-installed b/debian/not-installed |
| 604 | index 1ad2705..d87845d 100644 |
| 605 | --- a/debian/not-installed |
| 606 | +++ b/debian/not-installed |
| 607 | @@ -2,4 +2,4 @@ |
| 608 | usr/oam/* |
| 609 | usr/rocm_smi/* |
| 610 | # duplicate license file |
| 611 | -usr/share/doc/rocm_smi/LICENSE.txt |
| 612 | +usr/share/doc/rocm-smi-lib/LICENSE.md |
| 613 | diff --git a/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch b/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch |
| 614 | index 4e9e6af..f21bf27 100644 |
| 615 | --- a/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch |
| 616 | +++ b/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch |
| 617 | @@ -16,7 +16,7 @@ Forwarded: not-needed |
| 618 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") |
| 619 | endif () |
| 620 | |
| 621 | -+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--version-script=${CMAKE_SOURCE_DIR}/debian/librocm-smi64-1.version") |
| 622 | ++set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--version-script=${CMAKE_SOURCE_DIR}/debian/librocm-smi64-7.version") |
| 623 | + |
| 624 | ## Address Sanitize Flag |
| 625 | if (${ADDRESS_SANITIZER}) |
| 626 | diff --git a/debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch b/debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch |
| 627 | new file mode 100644 |
| 628 | index 0000000..4a54a1c |
| 629 | --- /dev/null |
| 630 | +++ b/debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch |
| 631 | @@ -0,0 +1,45 @@ |
| 632 | +From: Talha Can Havadar <talha.can.havadar@canonical.com> |
| 633 | +Date: Wed, 17 Dec 2025 18:14:30 +0100 |
| 634 | +Subject: oam: rocm_smi: fix version string issue when no git available |
| 635 | + |
| 636 | +Git is not available in some build environments, and this makes the SO version |
| 637 | +default to 1, which is not correct. In such cases we can fall back to the |
| 638 | +CPACK version that is already set correctly. |
| 639 | + |
| 640 | +Bug: https://github.com/ROCm/rocm-systems/pull/2361 |
| 641 | +--- |
| 642 | + oam/CMakeLists.txt | 4 ++-- |
| 643 | + rocm_smi/CMakeLists.txt | 5 +++-- |
| 644 | + 2 files changed, 5 insertions(+), 4 deletions(-) |
| 645 | + |
| 646 | +diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt |
| 647 | +index 7aa1b5f..c74523e 100644 |
| 648 | +--- a/oam/CMakeLists.txt |
| 649 | ++++ b/oam/CMakeLists.txt |
| 650 | +@@ -36,8 +36,8 @@ set(SO_VERSION_GIT_TAG_PREFIX "oam_so_ver") |
| 651 | + message("Package version: ${PKG_VERSION_STR}") |
| 652 | + |
| 653 | + # Debian package specific variables |
| 654 | +-# Set a default value for the package version |
| 655 | +-get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) |
| 656 | ++# Set a default value for the package version - use the main package version as fallback |
| 657 | ++get_version_from_tag("${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ${SO_VERSION_GIT_TAG_PREFIX} GIT) |
| 658 | + |
| 659 | + # VERSION_* variables should be set by get_version_from_tag |
| 660 | + if ( ${ROCM_PATCH_VERSION} ) |
| 661 | +diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt |
| 662 | +index 68c90ec..39c8848 100644 |
| 663 | +--- a/rocm_smi/CMakeLists.txt |
| 664 | ++++ b/rocm_smi/CMakeLists.txt |
| 665 | +@@ -38,8 +38,9 @@ set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver") |
| 666 | + message("Package version: ${PKG_VERSION_STR}") |
| 667 | + |
| 668 | + # Debian package specific variables |
| 669 | +-# Set a default value for the package version |
| 670 | +-get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) |
| 671 | ++# Set a default value for the package version - use the main package version as fallback |
| 672 | ++get_version_from_tag("${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ${SO_VERSION_GIT_TAG_PREFIX} GIT) |
| 673 | ++ |
| 674 | + |
| 675 | + # VERSION_* variables should be set by get_version_from_tag |
| 676 | + if ( ${ROCM_PATCH_VERSION} ) |
| 677 | diff --git a/debian/patches/series b/debian/patches/series |
| 678 | index 7b38bbb..be3307d 100644 |
| 679 | --- a/debian/patches/series |
| 680 | +++ b/debian/patches/series |
| 681 | @@ -1,3 +1,4 @@ |
| 682 | 0002-add-version-script-to-control-exposed-symbols.patch |
| 683 | 0003-remove-example-target-using-internal-apis.patch |
| 684 | 0004-revert-remove-reset-partition.patch |
| 685 | +0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch |
| 686 | diff --git a/debian/rules b/debian/rules |
| 687 | index 1c468a9..ceeaa2f 100755 |
| 688 | --- a/debian/rules |
| 689 | +++ b/debian/rules |
| 690 | @@ -13,3 +13,10 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all |
| 691 | |
| 692 | execute_before_dh_missing-indep: |
| 693 | rm -vf $(CURDIR)/debian/tmp/usr/bin/rocm_smi.py |
| 694 | + |
| 695 | +# see https://manpages.debian.org/testing/debhelper/dh_makeshlibs.1.en.html |
| 696 | +# and https://manpages.debian.org/testing/dpkg-dev/dpkg-gensymbols.1.en.html |
| 697 | +# Pass -c4 to dpkg-gensymbols for stricter checking, so that mismatches fail the build |
| 698 | +override_dh_makeshlibs: |
| 699 | + dh_makeshlibs -V -plibrocm-smi64-7 -- -c4 |
| 700 | + dh_makeshlibs -V -pliboam7 -- -c4 |
| 701 | diff --git a/docs/conf.py b/docs/conf.py |
| 702 | old mode 100755 |
| 703 | new mode 100644 |
| 704 | index cfd8d87..cba57b5 |
| 705 | --- a/docs/conf.py |
| 706 | +++ b/docs/conf.py |
| 707 | @@ -29,7 +29,7 @@ shutil.copy2('../CHANGELOG.md','./CHANGELOG.md') |
| 708 | # for PDF output on Read the Docs |
| 709 | project = "ROCm SMI LIB Documentation" |
| 710 | author = "Advanced Micro Devices, Inc." |
| 711 | -copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." |
| 712 | +copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." |
| 713 | version = version_number |
| 714 | release = version_number |
| 715 | |
| 716 | diff --git a/docs/index.rst b/docs/index.rst |
| 717 | old mode 100755 |
| 718 | new mode 100644 |
| 719 | diff --git a/docs/license.md b/docs/license.md |
| 720 | index 234cd49..aaf95ff 100644 |
| 721 | --- a/docs/license.md |
| 722 | +++ b/docs/license.md |
| 723 | @@ -1,4 +1,4 @@ |
| 724 | # License |
| 725 | |
| 726 | -```{include} ../License.txt |
| 727 | +```{include} ../LICENSE.md |
| 728 | ``` |
| 729 | diff --git a/include/rocm_smi/kfd_ioctl.h b/include/rocm_smi/kfd_ioctl.h |
| 730 | old mode 100755 |
| 731 | new mode 100644 |
| 732 | diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h |
| 733 | old mode 100755 |
| 734 | new mode 100644 |
| 735 | index aa9ea97..933b35a |
| 736 | --- a/include/rocm_smi/rocm_smi.h |
| 737 | +++ b/include/rocm_smi/rocm_smi.h |
| 738 | @@ -3,7 +3,7 @@ |
| 739 | * The University of Illinois/NCSA |
| 740 | * Open Source License (NCSA) |
| 741 | * |
| 742 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 743 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 744 | * All rights reserved. |
| 745 | * |
| 746 | * Developed by: |
| 747 | @@ -522,9 +522,10 @@ typedef enum { |
| 748 | typedef enum { |
| 749 | RSMI_VOLT_TYPE_FIRST = 0, |
| 750 | |
| 751 | - RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU |
| 752 | - //!< voltage |
| 753 | - RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX, |
| 754 | + RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage |
| 755 | + RSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD |
| 756 | + |
| 757 | + RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDBOARD, |
| 758 | RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type |
| 759 | } rsmi_voltage_type_t; |
| 760 | |
| 761 | @@ -957,6 +958,11 @@ typedef struct metrics_table_header_t metrics_table_header_t; |
| 762 | #define RSMI_MAX_NUM_JPEG_ENGS 32 |
| 763 | |
| 764 | /** |
| 765 | + * @brief This should match kRSMI_MAX_NUM_JPEG_ENG_V1 |
| 766 | + */ |
| 767 | +#define RSMI_MAX_NUM_JPEG_ENG_V1 40 |
| 768 | + |
| 769 | +/** |
| 770 | * @brief This should match kRSMI_MAX_NUM_CLKS |
| 771 | */ |
| 772 | #define RSMI_MAX_NUM_CLKS 4 |
| 773 | @@ -1003,7 +1009,7 @@ struct amdgpu_xcp_metrics_t { |
| 774 | */ |
| 775 | /* Utilization Instantaneous (%) */ |
| 776 | uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC]; |
| 777 | - uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS]; |
| 778 | + uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENG_V1]; |
| 779 | uint16_t vcn_busy[RSMI_MAX_NUM_VCNS]; |
| 780 | |
| 781 | /* Utilization Accumulated (%) */ |
| 782 | @@ -1014,6 +1020,14 @@ struct amdgpu_xcp_metrics_t { |
| 783 | */ |
| 784 | /* Total App Clock Counter Accumulated */ |
| 785 | uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC]; |
| 786 | + |
| 787 | + /** |
| 788 | + * v1.8 additions |
| 789 | + */ |
| 790 | + uint64_t gfx_below_host_limit_ppt_acc[RSMI_MAX_NUM_XCC]; |
| 791 | + uint64_t gfx_below_host_limit_thm_acc[RSMI_MAX_NUM_XCC]; |
| 792 | + uint64_t gfx_low_utilization_acc[RSMI_MAX_NUM_XCC]; |
| 793 | + uint64_t gfx_below_host_limit_total_acc[RSMI_MAX_NUM_XCC]; |
| 794 | }; |
| 795 | |
| 796 | typedef struct { |
| 797 | @@ -1220,7 +1234,7 @@ typedef struct { |
| 798 | /* |
| 799 | * v1.7 additions |
| 800 | */ |
| 801 | - /* VRAM max bandwidth at max memory clock (GB/s) */ |
| 802 | + /* VRAM max bandwidth at max memory clock */ |
| 803 | uint64_t vram_max_bandwidth; |
| 804 | |
| 805 | /* XGMI link status(up/down) */ |
| 806 | @@ -1367,8 +1381,8 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices); |
| 807 | * @brief Get the device id associated with the device with provided device |
| 808 | * index. |
| 809 | * |
| 810 | - * @details Given a device index @p dv_ind and a pointer to a uint32_t @p id, |
| 811 | - * this function will write the device id value to the uint64_t pointed to by |
| 812 | + * @details Given a device index @p dv_ind and a pointer to a uint16_t @p id, |
| 813 | + * this function will write the device id value to the uint16_t pointed to by |
| 814 | * @p id. This ID is an identification of the type of device, so calling this |
| 815 | * function for different devices will give the same value if they are the same kind |
| 816 | * of device. Consequently, this function should not be used to distinguish |
| 817 | @@ -1377,7 +1391,7 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices); |
| 818 | * |
| 819 | * @param[in] dv_ind a device index |
| 820 | * |
| 821 | - * @param[inout] id a pointer to uint64_t to which the device id will be |
| 822 | + * @param[inout] id a pointer to uint16_t to which the device id will be |
| 823 | * written |
| 824 | * If this parameter is nullptr, this function will return |
| 825 | * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, |
| 826 | @@ -1395,12 +1409,12 @@ rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id); |
| 827 | /** |
| 828 | * @brief Get the device revision associated with the device |
| 829 | * |
| 830 | - * @details Given a device index @p dv_ind and a pointer to a uint32_t to |
| 831 | + * @details Given a device index @p dv_ind and a pointer to a uint16_t to |
| 832 | * which the revision will be written |
| 833 | * |
| 834 | * @param[in] dv_ind a device index |
| 835 | * |
| 836 | - * @param[inout] revision a pointer to uint32_t to which the device revision |
| 837 | + * @param[inout] revision a pointer to uint16_t to which the device revision |
| 838 | * will be written |
| 839 | * |
| 840 | * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. |
| 841 | @@ -1412,14 +1426,14 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); |
| 842 | * @brief Get the SKU for a desired device associated with the device with |
| 843 | * provided device index. |
| 844 | * |
| 845 | - * @details Given a device index @p dv_ind and a pointer to a char @p sku, |
| 846 | + * @details Given a device index @p dv_ind and a pointer to a uint16_t @p sku, |
| 847 | * this function will attempt to obtain the SKU from the Product Information |
| 848 | * FRU chip, present on server ASICs. It will write the sku value to the |
| 849 | - * char array pointed to by @p sku. |
| 850 | + * uint16_t pointed to by @p sku. |
| 851 | * |
| 852 | * @param[in] dv_ind a device index |
| 853 | * |
| 854 | - * @param[inout] sku a pointer to char to which the sku will be written |
| 855 | + * @param[inout] sku a pointer to uint16_t to which the sku will be written |
| 856 | * |
| 857 | * If this parameter is nullptr, this function will return |
| 858 | * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, |
| 859 | @@ -1438,13 +1452,13 @@ rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku); |
| 860 | * @brief Get the device vendor id associated with the device with provided |
| 861 | * device index. |
| 862 | * |
| 863 | - * @details Given a device index @p dv_ind and a pointer to a uint32_t @p id, |
| 864 | - * this function will write the device vendor id value to the uint64_t pointed |
| 865 | + * @details Given a device index @p dv_ind and a pointer to a uint16_t @p id, |
| 866 | + * this function will write the device vendor id value to the uint16_t pointed |
| 867 | * to by @p id. |
| 868 | * |
| 869 | * @param[in] dv_ind a device index |
| 870 | * |
| 871 | - * @param[inout] id a pointer to uint64_t to which the device vendor id will |
| 872 | + * @param[inout] id a pointer to uint16_t to which the device vendor id will |
| 873 | * be written |
| 874 | * If this parameter is nullptr, this function will return |
| 875 | * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, |
| 876 | @@ -2962,7 +2976,9 @@ rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind); |
| 877 | * If this parameter is nullptr, this function will return |
| 878 | * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, |
| 879 | * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the |
| 880 | - * provided arguments. |
| 881 | + * provided arguments. In the event that some values are missing from |
| 882 | + * or not available on the device, the respective values will be set to |
| 883 | + * UINT64_MAX. |
| 884 | * |
| 885 | * @retval ::RSMI_STATUS_SUCCESS call was successful |
| 886 | * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not |
| 887 | diff --git a/include/rocm_smi/rocm_smi_common.h b/include/rocm_smi/rocm_smi_common.h |
| 888 | old mode 100755 |
| 889 | new mode 100644 |
| 890 | index 618d1d6..67264f6 |
| 891 | --- a/include/rocm_smi/rocm_smi_common.h |
| 892 | +++ b/include/rocm_smi/rocm_smi_common.h |
| 893 | @@ -5,7 +5,7 @@ |
| 894 | * The University of Illinois/NCSA |
| 895 | * Open Source License (NCSA) |
| 896 | * |
| 897 | - * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. |
| 898 | + * Copyright (c) 2018-2025, Advanced Micro Devices, Inc. |
| 899 | * All rights reserved. |
| 900 | * |
| 901 | * Developed by: |
| 902 | diff --git a/include/rocm_smi/rocm_smi_counters.h b/include/rocm_smi/rocm_smi_counters.h |
| 903 | old mode 100755 |
| 904 | new mode 100644 |
| 905 | index 091c89d..1447df6 |
| 906 | --- a/include/rocm_smi/rocm_smi_counters.h |
| 907 | +++ b/include/rocm_smi/rocm_smi_counters.h |
| 908 | @@ -5,7 +5,7 @@ |
| 909 | * The University of Illinois/NCSA |
| 910 | * Open Source License (NCSA) |
| 911 | * |
| 912 | - * Copyright (c) 2019, Advanced Micro Devices, Inc. |
| 913 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 914 | * All rights reserved. |
| 915 | * |
| 916 | * Developed by: |
| 917 | diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h |
| 918 | old mode 100755 |
| 919 | new mode 100644 |
| 920 | index a891a66..d00d037 |
| 921 | --- a/include/rocm_smi/rocm_smi_device.h |
| 922 | +++ b/include/rocm_smi/rocm_smi_device.h |
| 923 | @@ -3,7 +3,7 @@ |
| 924 | * The University of Illinois/NCSA |
| 925 | * Open Source License (NCSA) |
| 926 | * |
| 927 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 928 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 929 | * All rights reserved. |
| 930 | * |
| 931 | * Developed by: |
| 932 | diff --git a/include/rocm_smi/rocm_smi_exception.h b/include/rocm_smi/rocm_smi_exception.h |
| 933 | old mode 100755 |
| 934 | new mode 100644 |
| 935 | index 7c898fb..70e4949 |
| 936 | --- a/include/rocm_smi/rocm_smi_exception.h |
| 937 | +++ b/include/rocm_smi/rocm_smi_exception.h |
| 938 | @@ -5,7 +5,7 @@ |
| 939 | * The University of Illinois/NCSA |
| 940 | * Open Source License (NCSA) |
| 941 | * |
| 942 | - * Copyright (c) 2018, Advanced Micro Devices, Inc. |
| 943 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 944 | * All rights reserved. |
| 945 | * |
| 946 | * Developed by: |
| 947 | diff --git a/include/rocm_smi/rocm_smi_gpu_metrics.h b/include/rocm_smi/rocm_smi_gpu_metrics.h |
| 948 | index 5712ea4..d9325cf 100644 |
| 949 | --- a/include/rocm_smi/rocm_smi_gpu_metrics.h |
| 950 | +++ b/include/rocm_smi/rocm_smi_gpu_metrics.h |
| 951 | @@ -1,44 +1,23 @@ |
| 952 | /* |
| 953 | - * ============================================================================= |
| 954 | - * The University of Illinois/NCSA |
| 955 | - * Open Source License (NCSA) |
| 956 | - * |
| 957 | - * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. |
| 958 | - * All rights reserved. |
| 959 | - * |
| 960 | - * Developed by: |
| 961 | - * |
| 962 | - * AMD Research and AMD ROC Software Development |
| 963 | - * |
| 964 | - * Advanced Micro Devices, Inc. |
| 965 | - * |
| 966 | - * www.amd.com |
| 967 | + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. |
| 968 | * |
| 969 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 970 | - * of this software and associated documentation files (the "Software"), to |
| 971 | - * deal with the Software without restriction, including without limitation |
| 972 | - * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 973 | - * and/or sell copies of the Software, and to permit persons to whom the |
| 974 | - * Software is furnished to do so, subject to the following conditions: |
| 975 | + * of this software and associated documentation files (the "Software"), to deal |
| 976 | + * in the Software without restriction, including without limitation the rights |
| 977 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 978 | + * copies of the Software, and to permit persons to whom the Software is |
| 979 | + * furnished to do so, subject to the following conditions: |
| 980 | * |
| 981 | - * - Redistributions of source code must retain the above copyright notice, |
| 982 | - * this list of conditions and the following disclaimers. |
| 983 | - * - Redistributions in binary form must reproduce the above copyright |
| 984 | - * notice, this list of conditions and the following disclaimers in |
| 985 | - * the documentation and/or other materials provided with the distribution. |
| 986 | - * - Neither the names of <Name of Development Group, Name of Institution>, |
| 987 | - * nor the names of its contributors may be used to endorse or promote |
| 988 | - * products derived from this Software without specific prior written |
| 989 | - * permission. |
| 990 | + * The above copyright notice and this permission notice shall be included in |
| 991 | + * all copies or substantial portions of the Software. |
| 992 | * |
| 993 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 994 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 995 | - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 996 | - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 997 | - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| 998 | - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
| 999 | - * DEALINGS WITH THE SOFTWARE. |
| 1000 | - * |
| 1001 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 1002 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 1003 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 1004 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 1005 | + * THE SOFTWARE. |
| 1006 | */ |
| 1007 | |
| 1008 | #ifndef ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ |
| 1009 | @@ -52,9 +31,12 @@ |
| 1010 | #include <cassert> |
| 1011 | #include <cstdint> |
| 1012 | #include <cstring> |
| 1013 | +#include <iostream> |
| 1014 | #include <string> |
| 1015 | #include <map> |
| 1016 | #include <memory> |
| 1017 | +#include <mutex> |
| 1018 | +#include <thread> |
| 1019 | #include <type_traits> |
| 1020 | #include <tuple> |
| 1021 | #include <variant> |
| 1022 | @@ -72,10 +54,11 @@ constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_1 = 1; |
| 1023 | constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_2 = 2; |
| 1024 | constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_3 = 3; |
| 1025 | constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4 = 4; |
| 1026 | +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_8 = 8; |
| 1027 | constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MAJOR_VER |
| 1028 | = kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1; |
| 1029 | -constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINON_VER |
| 1030 | - = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4; |
| 1031 | +constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINOR_VER |
| 1032 | + = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_8; |
| 1033 | |
| 1034 | |
| 1035 | // Note: This *must* match NUM_HBM_INSTANCES |
| 1036 | @@ -96,6 +79,10 @@ constexpr uint32_t kRSMI_MAX_NUM_VCNS = 4; |
| 1037 | // Note: This *must* match NUM_JPEG_ENG |
| 1038 | constexpr uint32_t kRSMI_MAX_JPEG_ENGINES = 32; |
| 1039 | |
| 1040 | +// Note: Updated for amdgpu_xcp_metrics_v1_2. |
| 1041 | +// Document provides NUM_JPEG_ENG_V1 but will rename to kRSMI_MAX_NUM_JPEG_ENG_V1 |
| 1042 | +constexpr uint32_t kRSMI_MAX_NUM_JPEG_ENG_V1 = 40; |
| 1043 | + |
| 1044 | // Note: This *must* match MAX_XCC |
| 1045 | constexpr uint32_t kRSMI_MAX_NUM_XCC = 8; |
| 1046 | |
| 1047 | @@ -108,6 +95,15 @@ struct AMDGpuMetricsHeader_v1_t { |
| 1048 | uint8_t m_format_revision; |
| 1049 | uint8_t m_content_revision; |
| 1050 | }; |
| 1051 | +struct amdgpu_xcp_metrics { |
| 1052 | + /* Utilization Instantaneous (%) */ |
| 1053 | + uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; |
| 1054 | + uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES]; |
| 1055 | + uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS]; |
| 1056 | + |
| 1057 | + /* Utilization Accumulated (%) */ |
| 1058 | + uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC]; |
| 1059 | +}; |
| 1060 | |
| 1061 | struct amdgpu_xcp_metrics_v1_1 { |
| 1062 | /* Utilization Instantaneous (%) */ |
| 1063 | @@ -122,14 +118,21 @@ struct amdgpu_xcp_metrics_v1_1 { |
| 1064 | uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC]; |
| 1065 | }; |
| 1066 | |
| 1067 | -struct amdgpu_xcp_metrics { |
| 1068 | +/* new for gpu metrics v1.8 */ |
| 1069 | +struct amdgpu_xcp_metrics_v1_2 { |
| 1070 | /* Utilization Instantaneous (%) */ |
| 1071 | uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; |
| 1072 | - uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES]; |
| 1073 | + uint16_t jpeg_busy[kRSMI_MAX_NUM_JPEG_ENG_V1]; |
| 1074 | uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS]; |
| 1075 | |
| 1076 | /* Utilization Accumulated (%) */ |
| 1077 | uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC]; |
| 1078 | + |
| 1079 | + /* Total App Clock Counter Accumulated */ |
| 1080 | + uint64_t gfx_below_host_limit_ppt_acc[kRSMI_MAX_NUM_XCC]; |
| 1081 | + uint64_t gfx_below_host_limit_thm_acc[kRSMI_MAX_NUM_XCC]; |
| 1082 | + uint64_t gfx_low_utilization_acc[kRSMI_MAX_NUM_XCC]; |
| 1083 | + uint64_t gfx_below_host_limit_total_acc[kRSMI_MAX_NUM_XCC]; |
| 1084 | }; |
| 1085 | |
| 1086 | struct AMDGpuMetricsBase_t { |
| 1087 | @@ -602,7 +605,7 @@ struct AMDGpuMetrics_v17_t { |
| 1088 | uint16_t m_average_gfx_activity; |
| 1089 | uint16_t m_average_umc_activity; // memory controller |
| 1090 | |
| 1091 | - /* VRAM max bandwidth at max memory clock */ |
| 1092 | + /* VRAM max bandwidth at max memory clock (GB/s) */ |
| 1093 | uint64_t m_vram_max_bandwidth; // new for 1.7 |
| 1094 | |
| 1095 | /* Energy (15.259uJ (2^-16) units) */ |
| 1096 | @@ -685,7 +688,107 @@ struct AMDGpuMetrics_v17_t { |
| 1097 | /* PCIE other end recovery counter */ |
| 1098 | uint32_t m_pcie_lc_perf_other_end_recovery; |
| 1099 | }; |
| 1100 | -using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t; |
| 1101 | + |
| 1102 | +struct AMDGpuMetrics_v18_t { |
| 1103 | + ~AMDGpuMetrics_v18_t() = default; |
| 1104 | + struct AMDGpuMetricsHeader_v1_t m_common_header; |
| 1105 | + |
| 1106 | + /* Temperature (Celsius) */ |
| 1107 | + uint16_t m_temperature_hotspot; |
| 1108 | + uint16_t m_temperature_mem; |
| 1109 | + uint16_t m_temperature_vrsoc; |
| 1110 | + |
| 1111 | + /* Power (Watts) */ |
| 1112 | + uint16_t m_current_socket_power; |
| 1113 | + |
| 1114 | + /* Utilization (%) */ |
| 1115 | + uint16_t m_average_gfx_activity; |
| 1116 | + uint16_t m_average_umc_activity; // memory controller |
| 1117 | + |
| 1118 | + /* VRAM max bandwidth (in GB/sec) at max memory clock */ |
| 1119 | + uint64_t m_mem_max_bandwidth; |
| 1120 | + |
| 1121 | + /* Energy (15.259uJ (2^-16) units) */ |
| 1122 | + uint64_t m_energy_accumulator; |
| 1123 | + |
| 1124 | + /* Driver attached timestamp (in ns) */ |
| 1125 | + uint64_t m_system_clock_counter; |
| 1126 | + |
| 1127 | + /* Accumulation cycle counter */ |
| 1128 | + uint32_t m_accumulation_counter; |
| 1129 | + |
| 1130 | + /* Accumulated throttler residencies */ |
| 1131 | + uint32_t m_prochot_residency_acc; |
| 1132 | + uint32_t m_ppt_residency_acc; |
| 1133 | + uint32_t m_socket_thm_residency_acc; |
| 1134 | + uint32_t m_vr_thm_residency_acc; |
| 1135 | + uint32_t m_hbm_thm_residency_acc; |
| 1136 | + |
| 1137 | + /* Clock Lock Status. Each bit corresponds to clock instance */ |
| 1138 | + uint32_t m_gfxclk_lock_status; |
| 1139 | + |
| 1140 | + /* Link width (number of lanes) and speed (in 0.1 GT/s) */ |
| 1141 | + uint16_t m_pcie_link_width; |
| 1142 | + uint16_t m_pcie_link_speed; |
| 1143 | + |
| 1144 | + /* XGMI bus width and bitrate (in Gbps) */ |
| 1145 | + uint16_t m_xgmi_link_width; |
| 1146 | + uint16_t m_xgmi_link_speed; |
| 1147 | + |
| 1148 | + /* Utilization Accumulated (%) */ |
| 1149 | + uint32_t m_gfx_activity_acc; |
| 1150 | + uint32_t m_mem_activity_acc; |
| 1151 | + |
| 1152 | + /*PCIE accumulated bandwidth (GB/sec) */ |
| 1153 | + uint64_t m_pcie_bandwidth_acc; |
| 1154 | + |
| 1155 | + /*PCIE instantaneous bandwidth (GB/sec) */ |
| 1156 | + uint64_t m_pcie_bandwidth_inst; |
| 1157 | + |
| 1158 | + /* PCIE L0 to recovery state transition accumulated count */ |
| 1159 | + uint64_t m_pcie_l0_to_recov_count_acc; |
| 1160 | + |
| 1161 | + /* PCIE replay accumulated count */ |
| 1162 | + uint64_t m_pcie_replay_count_acc; |
| 1163 | + |
| 1164 | + /* PCIE replay rollover accumulated count */ |
| 1165 | + uint64_t m_pcie_replay_rover_count_acc; |
| 1166 | + |
| 1167 | + /* PCIE NAK sent accumulated count */ |
| 1168 | + uint32_t m_pcie_nak_sent_count_acc; |
| 1169 | + |
| 1170 | + /* PCIE NAK received accumulated count */ |
| 1171 | + uint32_t m_pcie_nak_rcvd_count_acc; |
| 1172 | + |
| 1173 | + /* XGMI accumulated data transfer size(KiloBytes) */ |
| 1174 | + uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; |
| 1175 | + uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; |
| 1176 | + |
| 1177 | + /* XGMI link status(active/inactive) */ |
| 1178 | + uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS]; |
| 1179 | + |
| 1180 | + uint16_t m_padding; |
| 1181 | + |
| 1182 | + /* PMFW attached timestamp (10ns resolution) */ |
| 1183 | + uint64_t m_firmware_timestamp; |
| 1184 | + |
| 1185 | + /* Current clocks (MHz) */ |
| 1186 | + uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS]; |
| 1187 | + uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS]; |
| 1188 | + uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS]; |
| 1189 | + uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS]; |
| 1190 | + uint16_t m_current_uclk; |
| 1191 | + |
| 1192 | + /* Number of current partition */ |
| 1193 | + uint16_t m_num_partition; |
| 1194 | + |
| 1195 | + /* XCP metrics stats */ |
| 1196 | + struct amdgpu_xcp_metrics_v1_2 m_xcp_stats[kRSMI_MAX_NUM_XCP]; |
| 1197 | + |
| 1198 | + /* PCIE other end recovery counter */ |
| 1199 | + uint32_t m_pcie_lc_perf_other_end_recovery; |
| 1200 | +}; |
| 1201 | +using AMGpuMetricsLatest_t = AMDGpuMetrics_v18_t; |
| 1202 | |
| 1203 | /** |
| 1204 | * This is GPU Metrics version that gets to public access. |
| 1205 | @@ -900,11 +1003,18 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t |
| 1206 | kMetricJpegBusy, // v1.6 |
| 1207 | kMetricVcnBusy, // v1.6 |
| 1208 | kMetricGfxBusyAcc, // v1.6 |
| 1209 | + kMetricGfxBelowHostLimitAccumulator, // v1.7 |
| 1210 | + |
| 1211 | kMetricPcieLCPerfOtherEndRecov, // v1.6 |
| 1212 | |
| 1213 | kMetricVramMaxBandwidth, // v1.7 |
| 1214 | kMetricXgmiLinkStatus, // v1.7 |
| 1215 | - kMetricGfxBelowHostLimitAccumulator, // v1.7 |
| 1216 | + |
| 1217 | + kMetricGfxBelowHostLimitPptAcc, // v1.8 |
| 1218 | + kMetricGfxBelowHostLimitThmAcc, // v1.8 |
| 1219 | + kMetricGfxBelowHostLimitTotalAcc, // v1.8 |
| 1220 | + kMetricGfxLowUtilitizationAcc, // v1.8 |
| 1221 | + |
| 1222 | }; |
| 1223 | using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map<AMDGpuMetricsUnitType_t, std::string>; |
| 1224 | |
| 1225 | @@ -943,6 +1053,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t |
| 1226 | kGpuMetricV15 = (0x1 << 5), |
| 1227 | kGpuMetricV16 = (0x1 << 6), |
| 1228 | kGpuMetricV17 = (0x1 << 7), |
| 1229 | + kGpuMetricV18 = (0x1 << 8), // Added new version flag |
| 1230 | }; |
| 1231 | using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>; |
| 1232 | using GpuMetricTypePtr_t = std::shared_ptr<void>; |
| 1233 | @@ -952,27 +1063,24 @@ class GpuMetricsBase_t { |
| 1234 | virtual ~GpuMetricsBase_t() = default; |
| 1235 | virtual size_t sizeof_metric_table() = 0; |
| 1236 | virtual GpuMetricTypePtr_t get_metrics_table() = 0; |
| 1237 | - virtual void dump_internal_metrics_table() = 0; |
| 1238 | virtual AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() = 0; |
| 1239 | virtual rsmi_status_t populate_metrics_dynamic_tbl() = 0; |
| 1240 | virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0; |
| 1241 | virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; } |
| 1242 | virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; } |
| 1243 | virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() { |
| 1244 | - return m_metrics_dynamic_tbl; |
| 1245 | + return m_base_metrics_dynamic_tbl; |
| 1246 | } |
| 1247 | |
| 1248 | protected: |
| 1249 | - AMDGpuDynamicMetricsTbl_t m_metrics_dynamic_tbl; |
| 1250 | + AMDGpuDynamicMetricsTbl_t m_base_metrics_dynamic_tbl; |
| 1251 | uint64_t m_metrics_timestamp; |
| 1252 | uint32_t m_device_id; |
| 1253 | uint32_t m_partition_id; |
| 1254 | - |
| 1255 | }; |
| 1256 | using GpuMetricsBasePtr = std::shared_ptr<GpuMetricsBase_t>; |
| 1257 | using AMDGpuMetricFactories_t = const std::map<AMDGpuMetricVersionFlags_t, GpuMetricsBasePtr>; |
| 1258 | |
| 1259 | - |
| 1260 | class GpuMetricsBase_v11_t final : public GpuMetricsBase_t { |
| 1261 | public: |
| 1262 | virtual ~GpuMetricsBase_v11_t() = default; |
| 1263 | @@ -989,10 +1097,6 @@ class GpuMetricsBase_v11_t final : public GpuMetricsBase_t { |
| 1264 | return m_gpu_metric_ptr; |
| 1265 | } |
| 1266 | |
| 1267 | - void dump_internal_metrics_table() override { |
| 1268 | - return; |
| 1269 | - } |
| 1270 | - |
| 1271 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1272 | return AMDGpuMetricVersionFlags_t::kGpuMetricV11; |
| 1273 | } |
| 1274 | @@ -1022,10 +1126,6 @@ class GpuMetricsBase_v12_t final : public GpuMetricsBase_t { |
| 1275 | return m_gpu_metric_ptr; |
| 1276 | } |
| 1277 | |
| 1278 | - void dump_internal_metrics_table() override { |
| 1279 | - return; |
| 1280 | - } |
| 1281 | - |
| 1282 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1283 | return AMDGpuMetricVersionFlags_t::kGpuMetricV12; |
| 1284 | } |
| 1285 | @@ -1054,8 +1154,6 @@ class GpuMetricsBase_v13_t final : public GpuMetricsBase_t { |
| 1286 | return (m_gpu_metric_ptr); |
| 1287 | } |
| 1288 | |
| 1289 | - void dump_internal_metrics_table() override; |
| 1290 | - |
| 1291 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1292 | return AMDGpuMetricVersionFlags_t::kGpuMetricV13; |
| 1293 | } |
| 1294 | @@ -1085,8 +1183,6 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t { |
| 1295 | return m_gpu_metric_ptr; |
| 1296 | } |
| 1297 | |
| 1298 | - void dump_internal_metrics_table() override; |
| 1299 | - |
| 1300 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1301 | return AMDGpuMetricVersionFlags_t::kGpuMetricV14; |
| 1302 | } |
| 1303 | @@ -1116,8 +1212,6 @@ class GpuMetricsBase_v15_t final : public GpuMetricsBase_t { |
| 1304 | return m_gpu_metric_ptr; |
| 1305 | } |
| 1306 | |
| 1307 | - void dump_internal_metrics_table() override; |
| 1308 | - |
| 1309 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1310 | return AMDGpuMetricVersionFlags_t::kGpuMetricV15; |
| 1311 | } |
| 1312 | @@ -1147,8 +1241,6 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t { |
| 1313 | return m_gpu_metric_ptr; |
| 1314 | } |
| 1315 | |
| 1316 | - void dump_internal_metrics_table() override; |
| 1317 | - |
| 1318 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1319 | return AMDGpuMetricVersionFlags_t::kGpuMetricV16; |
| 1320 | } |
| 1321 | @@ -1177,8 +1269,6 @@ class GpuMetricsBase_v17_t final : public GpuMetricsBase_t { |
| 1322 | return m_gpu_metric_ptr; |
| 1323 | } |
| 1324 | |
| 1325 | - void dump_internal_metrics_table() override; |
| 1326 | - |
| 1327 | AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1328 | return AMDGpuMetricVersionFlags_t::kGpuMetricV17; |
| 1329 | } |
| 1330 | @@ -1191,6 +1281,34 @@ class GpuMetricsBase_v17_t final : public GpuMetricsBase_t { |
| 1331 | std::shared_ptr<AMDGpuMetrics_v17_t> m_gpu_metric_ptr; |
| 1332 | }; |
| 1333 | |
| 1334 | +class GpuMetricsBase_v18_t final : public GpuMetricsBase_t { |
| 1335 | + public: |
| 1336 | + ~GpuMetricsBase_v18_t() = default; |
| 1337 | + |
| 1338 | + size_t sizeof_metric_table() override { |
| 1339 | + return sizeof(AMDGpuMetrics_v18_t); |
| 1340 | + } |
| 1341 | + |
| 1342 | + GpuMetricTypePtr_t get_metrics_table() override { |
| 1343 | + if (!m_gpu_metric_ptr) { |
| 1344 | + m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v18_t*){}); |
| 1345 | + } |
| 1346 | + assert(m_gpu_metric_ptr != nullptr); |
| 1347 | + return m_gpu_metric_ptr; |
| 1348 | + } |
| 1349 | + |
| 1350 | + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { |
| 1351 | + return AMDGpuMetricVersionFlags_t::kGpuMetricV18; |
| 1352 | + } |
| 1353 | + |
| 1354 | + rsmi_status_t populate_metrics_dynamic_tbl() override; |
| 1355 | + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; |
| 1356 | + |
| 1357 | + private: |
| 1358 | + AMDGpuMetrics_v18_t m_gpu_metrics_tbl; |
| 1359 | + std::shared_ptr<AMDGpuMetrics_v18_t> m_gpu_metric_ptr; |
| 1360 | +}; |
| 1361 | + |
| 1362 | template<typename T> |
| 1363 | rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, |
| 1364 | AMDGpuMetricsUnitType_t metric_counter, T& metric_value); |
| 1365 | diff --git a/include/rocm_smi/rocm_smi_io_link.h b/include/rocm_smi/rocm_smi_io_link.h |
| 1366 | old mode 100755 |
| 1367 | new mode 100644 |
| 1368 | index 5903ab9..b1265ba |
| 1369 | --- a/include/rocm_smi/rocm_smi_io_link.h |
| 1370 | +++ b/include/rocm_smi/rocm_smi_io_link.h |
| 1371 | @@ -3,7 +3,7 @@ |
| 1372 | * The University of Illinois/NCSA |
| 1373 | * Open Source License (NCSA) |
| 1374 | * |
| 1375 | - * Copyright (c) 2020, Advanced Micro Devices, Inc. |
| 1376 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 1377 | * All rights reserved. |
| 1378 | * |
| 1379 | * Developed by: |
| 1380 | diff --git a/include/rocm_smi/rocm_smi_kfd.h b/include/rocm_smi/rocm_smi_kfd.h |
| 1381 | old mode 100755 |
| 1382 | new mode 100644 |
| 1383 | index 2759dfd..68e7403 |
| 1384 | --- a/include/rocm_smi/rocm_smi_kfd.h |
| 1385 | +++ b/include/rocm_smi/rocm_smi_kfd.h |
| 1386 | @@ -3,7 +3,7 @@ |
| 1387 | * The University of Illinois/NCSA |
| 1388 | * Open Source License (NCSA) |
| 1389 | * |
| 1390 | - * Copyright (c) 2019, Advanced Micro Devices, Inc. |
| 1391 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 1392 | * All rights reserved. |
| 1393 | * |
| 1394 | * Developed by: |
| 1395 | diff --git a/include/rocm_smi/rocm_smi_logger.h b/include/rocm_smi/rocm_smi_logger.h |
| 1396 | index f83240f..d51e487 100644 |
| 1397 | --- a/include/rocm_smi/rocm_smi_logger.h |
| 1398 | +++ b/include/rocm_smi/rocm_smi_logger.h |
| 1399 | @@ -3,7 +3,7 @@ |
| 1400 | * The University of Illinois/NCSA |
| 1401 | * Open Source License (NCSA) |
| 1402 | * |
| 1403 | - * Copyright (c) 2023, Advanced Micro Devices, Inc. |
| 1404 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 1405 | * All rights reserved. |
| 1406 | * |
| 1407 | * Developed by: |
| 1408 | diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h |
| 1409 | old mode 100755 |
| 1410 | new mode 100644 |
| 1411 | index 1d639d7..045ab2e |
| 1412 | --- a/include/rocm_smi/rocm_smi_main.h |
| 1413 | +++ b/include/rocm_smi/rocm_smi_main.h |
| 1414 | @@ -5,7 +5,7 @@ |
| 1415 | * The University of Illinois/NCSA |
| 1416 | * Open Source License (NCSA) |
| 1417 | * |
| 1418 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 1419 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 1420 | * All rights reserved. |
| 1421 | * |
| 1422 | * Developed by: |
| 1423 | diff --git a/include/rocm_smi/rocm_smi_monitor.h b/include/rocm_smi/rocm_smi_monitor.h |
| 1424 | old mode 100755 |
| 1425 | new mode 100644 |
| 1426 | index ad28464..f52144a |
| 1427 | --- a/include/rocm_smi/rocm_smi_monitor.h |
| 1428 | +++ b/include/rocm_smi/rocm_smi_monitor.h |
| 1429 | @@ -5,7 +5,7 @@ |
| 1430 | * The University of Illinois/NCSA |
| 1431 | * Open Source License (NCSA) |
| 1432 | * |
| 1433 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 1434 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 1435 | * All rights reserved. |
| 1436 | * |
| 1437 | * Developed by: |
| 1438 | diff --git a/include/rocm_smi/rocm_smi_power_mon.h b/include/rocm_smi/rocm_smi_power_mon.h |
| 1439 | old mode 100755 |
| 1440 | new mode 100644 |
| 1441 | index 71e4c08..0a8a2f4 |
| 1442 | --- a/include/rocm_smi/rocm_smi_power_mon.h |
| 1443 | +++ b/include/rocm_smi/rocm_smi_power_mon.h |
| 1444 | @@ -5,7 +5,7 @@ |
| 1445 | * The University of Illinois/NCSA |
| 1446 | * Open Source License (NCSA) |
| 1447 | * |
| 1448 | - * Copyright (c) 2017, Advanced Micro Devices, Inc. |
| 1449 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 1450 | * All rights reserved. |
| 1451 | * |
| 1452 | * Developed by: |
| 1453 | diff --git a/include/rocm_smi/rocm_smi_properties.h b/include/rocm_smi/rocm_smi_properties.h |
| 1454 | index 67d285c..0aa4f4a 100644 |
| 1455 | --- a/include/rocm_smi/rocm_smi_properties.h |
| 1456 | +++ b/include/rocm_smi/rocm_smi_properties.h |
| 1457 | @@ -3,7 +3,7 @@ |
| 1458 | * The University of Illinois/NCSA |
| 1459 | * Open Source License (NCSA) |
| 1460 | * |
| 1461 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 1462 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 1463 | * All rights reserved. |
| 1464 | * |
| 1465 | * Developed by: |
| 1466 | diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h |
| 1467 | old mode 100755 |
| 1468 | new mode 100644 |
| 1469 | index 5263d35..42d009d |
| 1470 | --- a/include/rocm_smi/rocm_smi_utils.h |
| 1471 | +++ b/include/rocm_smi/rocm_smi_utils.h |
| 1472 | @@ -3,7 +3,7 @@ |
| 1473 | * The University of Illinois/NCSA |
| 1474 | * Open Source License (NCSA) |
| 1475 | * |
| 1476 | - * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. |
| 1477 | + * Copyright (c) 2018-2025, Advanced Micro Devices, Inc. |
| 1478 | * All rights reserved. |
| 1479 | * |
| 1480 | * Developed by: |
| 1481 | @@ -136,6 +136,8 @@ rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, |
| 1482 | std::string leftTrim(const std::string &s); |
| 1483 | std::string rightTrim(const std::string &s); |
| 1484 | std::string trim(const std::string &s); |
| 1485 | +std::string trimAllWhiteSpace(const std::string &s); |
| 1486 | +std::string removeWhitespace(const std::string &s); |
| 1487 | std::string removeNewLines(const std::string &s); |
| 1488 | |
| 1489 | std::string removeString(const std::string origStr, |
| 1490 | @@ -144,6 +146,7 @@ void system_wait(int milli_seconds); |
| 1491 | int countDigit(uint64_t n); |
| 1492 | std::string find_file_in_folder(const std::string& folder, |
| 1493 | const std::string& regex); |
| 1494 | +uint64_t get_multiplier_from_char(char units_char); |
| 1495 | template <typename T> |
| 1496 | std::string print_int_as_hex(T i, bool showHexNotation = true, |
| 1497 | int overloadBitSize = 0) { |
| 1498 | diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt |
| 1499 | index 181ee1e..7aa1b5f 100644 |
| 1500 | --- a/oam/CMakeLists.txt |
| 1501 | +++ b/oam/CMakeLists.txt |
| 1502 | @@ -94,14 +94,16 @@ endif () |
| 1503 | # use the target_include_directories() command to specify the include directories for the target |
| 1504 | target_include_directories(${OAM_TARGET} |
| 1505 | PUBLIC |
| 1506 | + "$<BUILD_INTERFACE:${DRM_INCLUDE_DIRS}>" |
| 1507 | + "$<BUILD_INTERFACE:${AMDGPU_DRM_INCLUDE_DIRS}>" |
| 1508 | "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" |
| 1509 | "$<INSTALL_INTERFACE:{OAM_NAME}/include>") |
| 1510 | |
| 1511 | ## Add the install directives for the runtime library. |
| 1512 | install(TARGETS ${OAM_TARGET} |
| 1513 | EXPORT rocm_smiTargets |
| 1514 | - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} |
| 1515 | - COMPONENT dev) |
| 1516 | + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev |
| 1517 | + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev) |
| 1518 | install(TARGETS ${OAM_TARGET} |
| 1519 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} |
| 1520 | COMPONENT asan) |
| 1521 | diff --git a/oam/include/oam/amd_oam.h b/oam/include/oam/amd_oam.h |
| 1522 | old mode 100755 |
| 1523 | new mode 100644 |
| 1524 | diff --git a/oam/include/oam/oam_mapi.h b/oam/include/oam/oam_mapi.h |
| 1525 | old mode 100755 |
| 1526 | new mode 100644 |
| 1527 | diff --git a/oam/src/amd_oam.cc b/oam/src/amd_oam.cc |
| 1528 | old mode 100755 |
| 1529 | new mode 100644 |
| 1530 | diff --git a/oam/src/oamConfig.in b/oam/src/oamConfig.in |
| 1531 | old mode 100755 |
| 1532 | new mode 100644 |
| 1533 | index bde279c..5f5b96b |
| 1534 | --- a/oam/src/oamConfig.in |
| 1535 | +++ b/oam/src/oamConfig.in |
| 1536 | @@ -5,7 +5,7 @@ |
| 1537 | * The University of Illinois/NCSA |
| 1538 | * Open Source License (NCSA) |
| 1539 | * |
| 1540 | - * Copyright (c) 2017, Advanced Micro Devices, Inc. |
| 1541 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 1542 | * All rights reserved. |
| 1543 | * |
| 1544 | * Developed by: |
| 1545 | @@ -53,4 +53,4 @@ |
| 1546 | #define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@ |
| 1547 | #define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@" |
| 1548 | |
| 1549 | -#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_ |
| 1550 | \ No newline at end of file |
| 1551 | +#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_ |
| 1552 | diff --git a/python_smi_tools/README.md b/python_smi_tools/README.md |
| 1553 | index 81d9175..0247e40 100644 |
| 1554 | --- a/python_smi_tools/README.md |
| 1555 | +++ b/python_smi_tools/README.md |
| 1556 | @@ -456,4 +456,4 @@ The information contained herein is for informational purposes only, and is subj |
| 1557 | |
| 1558 | AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. |
| 1559 | |
| 1560 | -Copyright (c) 2014-2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1561 | +Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All rights reserved. |
| 1562 | diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py |
| 1563 | index 68ab1b8..6eb22a2 100755 |
| 1564 | --- a/python_smi_tools/rocm_smi.py |
| 1565 | +++ b/python_smi_tools/rocm_smi.py |
| 1566 | @@ -17,10 +17,11 @@ import logging |
| 1567 | import os |
| 1568 | import sys |
| 1569 | import subprocess |
| 1570 | -import _thread |
| 1571 | +import threading |
| 1572 | import time |
| 1573 | import multiprocessing |
| 1574 | import trace |
| 1575 | +from os.path import exists |
| 1576 | from io import StringIO |
| 1577 | from time import ctime |
| 1578 | from subprocess import check_output |
| 1579 | @@ -49,7 +50,7 @@ except ImportError: |
| 1580 | # Minor version - Increment when adding a new feature, set to 0 when major is incremented |
| 1581 | # Patch version - Increment when adding a fix, set to 0 when minor is incremented |
| 1582 | # Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi |
| 1583 | -SMI_MAJ = 3 |
| 1584 | +SMI_MAJ = 4 |
| 1585 | SMI_MIN = 0 |
| 1586 | SMI_PAT = 0 |
| 1587 | # SMI_HASH is provided by rsmiBindings |
| 1588 | @@ -86,6 +87,9 @@ validClockNames = clk_type_names[1:-2] |
| 1589 | validClockNames.append('pcie') |
| 1590 | validClockNames.sort() |
| 1591 | |
| 1592 | +# Thread stop condition |
| 1593 | +stop_threads = False |
| 1594 | + |
| 1595 | def driverInitialized(): |
| 1596 | """ Returns true if amdgpu is found in the list of initialized modules |
| 1597 | """ |
| 1598 | @@ -472,7 +476,7 @@ def getAllocatedMemoryPercent(device): |
| 1599 | mem_use_pct = 0 |
| 1600 | if vram_used is None: |
| 1601 | return allocated_memory_vram |
| 1602 | - if vram_used != None and vram_total != None and float(vram_total) != 0: |
| 1603 | + if vram_used is not None and vram_total is not None and float(vram_total) != 0: |
| 1604 | # take floor of result (round down to nearest integer) |
| 1605 | mem_use_pct = (100 * (float(vram_used) / float(vram_total))) // 1 |
| 1606 | allocated_memory_vram['value'] = mem_use_pct |
| 1607 | @@ -527,7 +531,7 @@ def getProcessName(pid): |
| 1608 | except subprocess.CalledProcessError as e: |
| 1609 | pName = 'UNKNOWN' |
| 1610 | |
| 1611 | - if pName == None: |
| 1612 | + if pName is None: |
| 1613 | pName = 'UNKNOWN' |
| 1614 | |
| 1615 | # Remove the substrings surrounding from process name (b' and \n') |
| 1616 | @@ -866,13 +870,16 @@ def printEventList(device, delay, eventList): |
| 1617 | if not rsmi_ret_ok(ret, device, 'set_event_notification_mask'): |
| 1618 | printErrLog(device, 'Unable to set event notification mask.') |
| 1619 | return |
| 1620 | - while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' |
| 1621 | + while not stop_threads: # Exit condition from user keyboard input of 'q' or 'ctrl + c' |
| 1622 | num_elements = c_uint32(1) |
| 1623 | data = rsmi_evt_notification_data_t(1) |
| 1624 | rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data)) |
| 1625 | if len(data.message) > 0: |
| 1626 | print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1], |
| 1627 | data.message.decode('utf8') + '\r']]) |
| 1628 | + ret = rocmsmi.rsmi_event_notification_stop(device) |
| 1629 | + if not rsmi_ret_ok(ret, device, 'stop_event_notification'): |
| 1630 | + printErrLog(device, 'Unable to end event notifications.') |
| 1631 | |
| 1632 | def printLog(device, metricName, value=None, extraSpace=False, useItalics=False, xcp=None): |
| 1633 | """ Print out to the SMI log |
| 1634 | @@ -915,8 +922,8 @@ def printLog(device, metricName, value=None, extraSpace=False, useItalics=False, |
| 1635 | |
| 1636 | # Handle non UTF-8 locale |
| 1637 | try: |
| 1638 | - print(logstr + '\n', end='') |
| 1639 | - except UnicodeEncodeError: |
| 1640 | + print(logstr.encode('utf-8', 'ignore').decode('utf-8')) |
| 1641 | + except UnicodeError: |
| 1642 | print(logstr.encode('ascii', 'ignore').decode('ascii')) |
| 1643 | |
| 1644 | sys.stdout.flush() |
| 1645 | @@ -1086,18 +1093,12 @@ def resetClocks(deviceList): |
| 1646 | ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(0)) |
| 1647 | if rsmi_ret_ok(ret, device, 'set_overdrive_level'): |
| 1648 | printLog(device, 'OverDrive set to 0', None) |
| 1649 | - else: |
| 1650 | - printLog(device, 'Unable to reset OverDrive', None) |
| 1651 | ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0)) |
| 1652 | if rsmi_ret_ok(ret, device, 'set_perf_level'): |
| 1653 | printLog(device, 'Successfully reset clocks', None) |
| 1654 | - else: |
| 1655 | - printLog(device, 'Unable to reset clocks', None) |
| 1656 | ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0)) |
| 1657 | if rsmi_ret_ok(ret, device, 'set_perf_level'): |
| 1658 | printLog(device, 'Performance level reset to auto', None) |
| 1659 | - else: |
| 1660 | - printLog(device, 'Unable to reset performance level to auto', None) |
| 1661 | |
| 1662 | |
| 1663 | def resetFans(deviceList): |
| 1664 | @@ -1111,8 +1112,6 @@ def resetFans(deviceList): |
| 1665 | ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind) |
| 1666 | if rsmi_ret_ok(ret, device, silent=True): |
| 1667 | printLog(device, 'Successfully reset fan speed to driver control', None) |
| 1668 | - else: |
| 1669 | - printLog(device, 'Not supported on the given system', None) |
| 1670 | printLogSpacer() |
| 1671 | |
| 1672 | |
| 1673 | @@ -1134,13 +1133,6 @@ def resetProfile(deviceList): |
| 1674 | ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString('BOOTUP DEFAULT')) |
| 1675 | if rsmi_ret_ok(ret, device, 'set_power_profile'): |
| 1676 | printLog(device, 'Successfully reset Power Profile', None) |
| 1677 | - else: |
| 1678 | - printErrLog(device, 'Unable to reset Power Profile') |
| 1679 | - ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0)) |
| 1680 | - if rsmi_ret_ok(ret, device, 'set_perf_level'): |
| 1681 | - printLog(device, 'Successfully reset Performance Level', None) |
| 1682 | - else: |
| 1683 | - printErrLog(device, 'Unable to reset Performance Level') |
| 1684 | printLogSpacer() |
| 1685 | |
| 1686 | |
| 1687 | @@ -1154,8 +1146,6 @@ def resetXgmiErr(deviceList): |
| 1688 | ret = rocmsmi.rsmi_dev_xgmi_error_reset(device) |
| 1689 | if rsmi_ret_ok(ret, device, 'reset xgmi'): |
| 1690 | printLog(device, 'Successfully reset XGMI Error count', None) |
| 1691 | - else: |
| 1692 | - logging.error('GPU[%s]\t\t: Unable to reset XGMI error count', device) |
| 1693 | printLogSpacer() |
| 1694 | |
| 1695 | |
| 1696 | @@ -1169,8 +1159,6 @@ def resetPerfDeterminism(deviceList): |
| 1697 | ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0)) |
| 1698 | if rsmi_ret_ok(ret, device, 'disable performance determinism'): |
| 1699 | printLog(device, 'Successfully disabled performance determinism', None) |
| 1700 | - else: |
| 1701 | - logging.error('GPU[%s]\t\t: Unable to disable performance determinism', device) |
| 1702 | printLogSpacer() |
| 1703 | |
| 1704 | |
| 1705 | @@ -1203,10 +1191,10 @@ def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond): |
| 1706 | if rsmi_ret_ok(ret, device, silent=True): |
| 1707 | printLog(device, 'Successfully set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue), None) |
| 1708 | else: |
| 1709 | - printErrLog(device, 'Unable to set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue)) |
| 1710 | - RETCODE = 1 |
| 1711 | if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: |
| 1712 | printLog(device, 'Setting %s range is not supported for this device.' % (clkType), None) |
| 1713 | + else: |
| 1714 | + RETCODE = 1 |
| 1715 | |
| 1716 | def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond): |
| 1717 | """ Set the range for the specified clktype in the PowerPlay table for a list of devices. |
| 1718 | @@ -1247,10 +1235,10 @@ def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond): |
| 1719 | if rsmi_ret_ok(ret, device, silent=True): |
| 1720 | printLog(device, 'Successfully set %s %s to %s(MHz)' % (level, clkType, clkValue), None) |
| 1721 | else: |
| 1722 | - printErrLog(device, 'Unable to set %s %s to %s(MHz)' % (level, clkType, clkValue)) |
| 1723 | - RETCODE = 1 |
| 1724 | if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: |
| 1725 | printLog(device, 'Setting %s %s clock is not supported for this device.' % (level, clkType), None) |
| 1726 | + else: |
| 1727 | + RETCODE = 1 |
| 1728 | |
| 1729 | |
| 1730 | def setVoltageCurve(deviceList, point, clk, volt, autoRespond): |
| 1731 | @@ -1276,9 +1264,6 @@ def setVoltageCurve(deviceList, point, clk, volt, autoRespond): |
| 1732 | ret = rocmsmi.rsmi_dev_od_volt_info_set(device, int(point), int(clk), int(volt)) |
| 1733 | if rsmi_ret_ok(ret, device, 'set_voltage_curve'): |
| 1734 | printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None) |
| 1735 | - else: |
| 1736 | - printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt)) |
| 1737 | - RETCODE = 1 |
| 1738 | |
| 1739 | |
| 1740 | def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond): |
| 1741 | @@ -1309,7 +1294,6 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond): |
| 1742 | if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)): |
| 1743 | printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None) |
| 1744 | else: |
| 1745 | - printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt)) |
| 1746 | RETCODE = 1 |
| 1747 | elif clkType == 'mclk': |
| 1748 | ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(int(point)), int(clk), |
| 1749 | @@ -1317,7 +1301,6 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond): |
| 1750 | if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)): |
| 1751 | printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None) |
| 1752 | else: |
| 1753 | - printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt)) |
| 1754 | RETCODE = 1 |
| 1755 | else: |
| 1756 | printErrLog(device, 'Unable to set %s range' % (clkType)) |
| 1757 | @@ -1357,8 +1340,6 @@ def setClockOverDrive(deviceList, clktype, value, autoRespond): |
| 1758 | ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3)) |
| 1759 | if rsmi_ret_ok(ret, device, 'set_perf_level_manual_' + str(clktype)): |
| 1760 | printLog(device, 'Performance level set to manual', None) |
| 1761 | - else: |
| 1762 | - printErrLog(device, 'Unable to set performance level to manual') |
| 1763 | if clktype == 'mclk': |
| 1764 | fsFile = os.path.join('/sys/class/drm', 'card%d' % (device), 'device', 'pp_mclk_od') |
| 1765 | if not os.path.isfile(fsFile): |
| 1766 | @@ -1432,14 +1413,13 @@ def setClocks(deviceList, clktype, clk): |
| 1767 | if rsmi_ret_ok(ret, device, 'set_perf_level_manual'): |
| 1768 | printLog(device, 'Performance level was set to manual', None) |
| 1769 | else: |
| 1770 | - printErrLog(device, 'Unable to set performance level to manual') |
| 1771 | RETCODE = 1 |
| 1772 | return |
| 1773 | if clktype != 'pcie': |
| 1774 | # Validate frequency bitmask |
| 1775 | freq = rsmi_frequencies_t() |
| 1776 | ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clktype], byref(freq)) |
| 1777 | - if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)) == False: |
| 1778 | + if not rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)): |
| 1779 | RETCODE = 1 |
| 1780 | return |
| 1781 | # The freq_bitmask should be less than 2^(freqs.num_supported) |
| 1782 | @@ -1453,13 +1433,12 @@ def setClocks(deviceList, clktype, clk): |
| 1783 | if rsmi_ret_ok(ret, device, 'set_gpu_clk_freq_' + str(clktype)): |
| 1784 | printLog(device, 'Successfully set %s bitmask to' % (clktype), hex(freq_bitmask)) |
| 1785 | else: |
| 1786 | - printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask))) |
| 1787 | RETCODE = 1 |
| 1788 | else: |
| 1789 | # Validate the bandwidth bitmask |
| 1790 | bw = rsmi_pcie_bandwidth_t() |
| 1791 | ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) |
| 1792 | - if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth') == False: |
| 1793 | + if not rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'): |
| 1794 | RETCODE = 1 |
| 1795 | return |
| 1796 | # The freq_bitmask should be less than 2^(bw.transfer_rate.num_supported) |
| 1797 | @@ -1473,7 +1452,6 @@ def setClocks(deviceList, clktype, clk): |
| 1798 | if rsmi_ret_ok(ret, device, 'set_PCIe_bandwidth'): |
| 1799 | printLog(device, 'Successfully set %s to level bitmask' % (clktype), hex(freq_bitmask)) |
| 1800 | else: |
| 1801 | - printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask))) |
| 1802 | RETCODE = 1 |
| 1803 | printLogSpacer() |
| 1804 | |
| 1805 | @@ -1498,7 +1476,6 @@ def setPerfDeterminism(deviceList, clkvalue): |
| 1806 | if rsmi_ret_ok(ret, device, 'set_perf_determinism'): |
| 1807 | printLog(device, 'Successfully enabled performance determinism and set GFX clock frequency', str(clkvalue)) |
| 1808 | else: |
| 1809 | - printErrLog(device, 'Unable to set performance determinism and clock frequency to %s' % (str(clkvalue))) |
| 1810 | RETCODE = 1 |
| 1811 | |
| 1812 | |
| 1813 | @@ -1521,9 +1498,6 @@ def resetGpu(device): |
| 1814 | ret = rocmsmi.rsmi_dev_gpu_reset(resetDev) |
| 1815 | if rsmi_ret_ok(ret, resetDev, 'reset_gpu'): |
| 1816 | printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None) |
| 1817 | - else: |
| 1818 | - printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev)) |
| 1819 | - logging.debug('GPU reset failed with return value of %d' % ret) |
| 1820 | printLogSpacer() |
| 1821 | |
| 1822 | |
| 1823 | @@ -1690,7 +1664,7 @@ def setPowerOverDrive(deviceList, value, autoRespond): |
| 1824 | new_power_cap.value = int(value) * 1000000 |
| 1825 | |
| 1826 | ret = rocmsmi.rsmi_dev_power_cap_range_get(device, 0, byref(power_cap_max), byref(power_cap_min)) |
| 1827 | - if rsmi_ret_ok(ret, device, 'get_power_cap_range') == False: |
| 1828 | + if not rsmi_ret_ok(ret, device, 'get_power_cap_range'): |
| 1829 | printErrLog(device, 'Unable to parse Power OverDrive range') |
| 1830 | RETCODE = 1 |
| 1831 | continue |
| 1832 | @@ -1897,7 +1871,6 @@ def setMemoryPartition(deviceList, memoryPartition, autoRespond): |
| 1833 | printLog(device, 'Issue reloading driver, please check dmsg for errors', |
| 1834 | None, addExtraLine) |
| 1835 | else: |
| 1836 | - rsmi_ret_ok(ret, device, 'set_memory_partition') |
| 1837 | printErrLog(device, 'Failed to set memory partition, even though device supports it.') |
| 1838 | printLogSpacer() |
| 1839 | |
| 1840 | @@ -2380,7 +2353,7 @@ def getCoarseGrainUtil(device, typeName=None): |
| 1841 | """ |
| 1842 | timestamp = c_uint64(0) |
| 1843 | |
| 1844 | - if typeName != None: |
| 1845 | + if typeName is not None: |
| 1846 | |
| 1847 | try: |
| 1848 | i = utilization_counter_name.index(typeName) |
| 1849 | @@ -2589,6 +2562,26 @@ def showPcieBw(deviceList): |
| 1850 | max_pkt_sz = c_uint64() |
| 1851 | printLogSpacer(' Measured PCIe Bandwidth ') |
| 1852 | for device in deviceList: |
| 1853 | + # Get BW from GPU metrics from version >= 1.5 |
| 1854 | + header = metrics_table_header_t() |
| 1855 | + ret_version = rocmsmi.rsmi_dev_metrics_header_info_get(device, byref(header)) |
| 1856 | + if rsmi_ret_ok(ret_version, device, 'get_metrics_header', True): |
| 1857 | + if header.format_revision >= 1 and header.content_revision >= 5: |
| 1858 | + gpu_metrics = rsmi_gpu_metrics_t() |
| 1859 | + ret = rocmsmi.rsmi_dev_gpu_metrics_info_get(device, byref(gpu_metrics)) |
| 1860 | + if rsmi_ret_ok(ret, device, "get_gpu_metrics", True): |
| 1861 | + metric_bw = gpu_metrics.pcie_bandwidth_inst |
| 1862 | + if metric_bw != ctypes.c_uint64(-1).value and metric_bw > 0: |
| 1863 | + bandwidth_mbps = metric_bw / 8.0 # Convert megabits to megabytes |
| 1864 | + bwstr = f"{bandwidth_mbps:.3f}" |
| 1865 | + printLog(device, "Current PCIe bandwidth (MB/s)", bwstr) |
| 1866 | + continue |
| 1867 | + else: |
| 1868 | + printLog(device, "GPU metrics pcie_bandwidth_inst is invalid", None) |
| 1869 | + else: |
| 1870 | + printLog(device, "Failed to get GPU metrics info", None) |
| 1871 | + |
| 1872 | + # Use legacy API (For GPU metric version < 1.5 or failed) |
| 1873 | ret = rocmsmi.rsmi_dev_pci_throughput_get(device, byref(sent), byref(received), byref(max_pkt_sz)) |
| 1874 | if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'): |
| 1875 | # Use 1024.0 to ensure that the result is a float and not integer division |
| 1876 | @@ -2707,8 +2700,6 @@ def showPower(deviceList): |
| 1877 | elif checkIfSecondaryDie(device): |
| 1878 | printLog(device, 'Average Graphics Package Power (W)', "N/A (Secondary die)") |
| 1879 | secondaryPresent=True |
| 1880 | - else: |
| 1881 | - printErrLog(device, 'Unable to get Average or Current Socket Graphics Package Power Consumption') |
| 1882 | if secondaryPresent: |
| 1883 | printLog(None, "\n\t\tPrimary die (usually one above or below the secondary) shows total (primary + secondary) socket power information", None) |
| 1884 | printLogSpacer() |
| 1885 | @@ -2823,13 +2814,20 @@ def showRange(deviceList, rangeType): |
| 1886 | return |
| 1887 | printLogSpacer(' Show Valid %s Range ' % (rangeType)) |
| 1888 | odvf = rsmi_od_volt_freq_data_t() |
| 1889 | + uint64_max = UIntegerTypes.UINT64_T |
| 1890 | for device in deviceList: |
| 1891 | ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf)) |
| 1892 | if rsmi_ret_ok(ret, device, 'get_od_volt', silent=False): |
| 1893 | if rangeType == 'sclk': |
| 1894 | + if odvf.curr_sclk_range.lower_bound == uint64_max or odvf.curr_sclk_range.upper_bound == uint64_max: |
| 1895 | + printLog(device, 'Unable to display %s range' % (rangeType), None) |
| 1896 | + continue |
| 1897 | printLog(device, 'Valid sclk range: %sMhz - %sMhz' % ( |
| 1898 | int(odvf.curr_sclk_range.lower_bound / 1000000), int(odvf.curr_sclk_range.upper_bound / 1000000)), None) |
| 1899 | if rangeType == 'mclk': |
| 1900 | + if odvf.curr_mclk_range.lower_bound == uint64_max or odvf.curr_mclk_range.upper_bound == uint64_max: |
| 1901 | + printLog(device, 'Unable to display %s range' % (rangeType), None) |
| 1902 | + continue |
| 1903 | printLog(device, 'Valid mclk range: %sMhz - %sMhz' % ( |
| 1904 | int(odvf.curr_mclk_range.lower_bound / 1000000), int(odvf.curr_mclk_range.upper_bound / 1000000)), None) |
| 1905 | if rangeType == 'voltage': |
| 1906 | @@ -2996,8 +2994,9 @@ def showEvents(deviceList, eventTypes): |
| 1907 | :param eventTypes: List of event type names (can be a single-item list) |
| 1908 | """ |
| 1909 | printLogSpacer(' Show Events ') |
| 1910 | - printLog(None, 'press \'q\' or \'ctrl + c\' to quit', None) |
| 1911 | + printLog(None, 'press \'q\' or \'ctrl + c\' and then \'Enter\' to quit', None) |
| 1912 | eventTypeList = [] |
| 1913 | + thread_list = [] |
| 1914 | for event in eventTypes: # Cleaning list from wrong values |
| 1915 | if event.replace(',', '').upper() in notification_type_names: |
| 1916 | eventTypeList.append(event.replace(',', '').upper()) |
| 1917 | @@ -3009,22 +3008,23 @@ def showEvents(deviceList, eventTypes): |
| 1918 | # Create a separate thread for each GPU |
| 1919 | for device in deviceList: |
| 1920 | try: |
| 1921 | - _thread.start_new_thread(printEventList, (device, 1000, eventTypeList)) |
| 1922 | + thread = threading.Thread(target=printEventList, args=(device, 1000, eventTypeList)) |
| 1923 | + thread_list.append(thread) |
| 1924 | + thread.start() |
| 1925 | time.sleep(0.25) |
| 1926 | except Exception as e: |
| 1927 | printErrLog(device, 'Unable to start new thread. %s' % (e)) |
| 1928 | return |
| 1929 | - while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' |
| 1930 | - getch = _Getch() |
| 1931 | - user_input = getch() |
| 1932 | + while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' and then 'Enter' |
| 1933 | + user_input = input() |
| 1934 | # Catch user input for q or Ctrl + c |
| 1935 | if user_input == 'q' or user_input == '\x03': |
| 1936 | - for device in deviceList: |
| 1937 | - ret = rocmsmi.rsmi_event_notification_stop(device) |
| 1938 | - if not rsmi_ret_ok(ret, device, 'stop_event_notification'): |
| 1939 | - printErrLog(device, 'Unable to end event notifications.') |
| 1940 | + global stop_threads |
| 1941 | + stop_threads = True |
| 1942 | print('\r') |
| 1943 | break |
| 1944 | + for thread in thread_list: |
| 1945 | + thread.join() |
| 1946 | |
| 1947 | |
| 1948 | def printTempGraph(deviceList, delay, temp_type): |
| 1949 | @@ -3037,7 +3037,7 @@ def printTempGraph(deviceList, delay, temp_type): |
| 1950 | for i in range(devices): |
| 1951 | printEmptyLine() |
| 1952 | originalTerminalWidth = os.get_terminal_size()[0] |
| 1953 | - while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' |
| 1954 | + while not stop_threads: # Exit condition from user keyboard input of 'q' or 'ctrl + c' |
| 1955 | terminalWidth = os.get_terminal_size()[0] |
| 1956 | printStrings = list() |
| 1957 | for device in deviceList: |
| 1958 | @@ -3117,19 +3117,26 @@ def showTempGraph(deviceList): |
| 1959 | deviceList.sort() |
| 1960 | temp_type = getTemperatureLabel(deviceList) |
| 1961 | printLogSpacer(' Temperature Graph ' + temp_type.capitalize() + ' ') |
| 1962 | + thread_list = [] |
| 1963 | # Start a thread for constantly printing |
| 1964 | try: |
| 1965 | # Create a thread (call print function, devices, delay in ms) |
| 1966 | - _thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type)) |
| 1967 | + thread = threading.Thread(target=printTempGraph, args=(deviceList, 150, temp_type)) |
| 1968 | + thread.start() |
| 1969 | + thread_list.append(thread) |
| 1970 | except Exception as e: |
| 1971 | printErrLog(device, 'Unable to start new thread. %s' % (e)) |
| 1972 | # Catch user input for program termination |
| 1973 | while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' |
| 1974 | getch = _Getch() |
| 1975 | user_input = getch() |
| 1976 | + global stop_threads |
| 1977 | + stop_threads = True; |
| 1978 | # Catch user input for q or Ctrl + c |
| 1979 | if user_input == 'q' or user_input == '\x03': |
| 1980 | break |
| 1981 | + for thread in thread_list: |
| 1982 | + thread.join() |
| 1983 | # Reset color to default before exit |
| 1984 | print('\033[A\x1b[0m\r') |
| 1985 | printLogSpacer() |
| 1986 | @@ -3178,8 +3185,6 @@ def showVoltageCurve(deviceList): |
| 1987 | printLog(device, 'Voltage point %d: %sMhz %smV' % ( |
| 1988 | position, int(list(odvf.curve.vc_points)[position].frequency / 1000000), |
| 1989 | int(list(odvf.curve.vc_points)[position].voltage)), None) |
| 1990 | - else: |
| 1991 | - printErrLog(device, 'Voltage curve Points unsupported.', is_warning=True) |
| 1992 | printLogSpacer() |
| 1993 | |
| 1994 | |
| 1995 | @@ -3232,8 +3237,6 @@ def showAccessibleTopology(deviceList): |
| 1996 | ret = rocmsmi.rsmi_is_P2P_accessible(srcdevice, destdevice, byref(accessible)) |
| 1997 | if rsmi_ret_ok(ret, metric='is_P2P_accessible'): |
| 1998 | gpu_links_type[srcdevice][destdevice] = accessible.value |
| 1999 | - else: |
| 2000 | - printErrLog(srcdevice, 'Cannot read link accessibility: Unsupported on this machine') |
| 2001 | if PRINT_JSON: |
| 2002 | formatMatrixToJSON(deviceList, gpu_links_type, "(Topology) Link accessibility between DRM devices {} and {}") |
| 2003 | return |
| 2004 | @@ -3272,7 +3275,6 @@ def showWeightTopology(deviceList): |
| 2005 | if rsmi_ret_ok(ret, metric='get_link_weight_topology'): |
| 2006 | gpu_links_weight[srcdevice][destdevice] = weight |
| 2007 | else: |
| 2008 | - printErrLog(srcdevice, 'Cannot read Link Weight: Not supported on this machine') |
| 2009 | gpu_links_weight[srcdevice][destdevice] = None |
| 2010 | |
| 2011 | |
| 2012 | @@ -3291,7 +3293,7 @@ def showWeightTopology(deviceList): |
| 2013 | for gpu2 in deviceList: |
| 2014 | if (gpu1 == gpu2): |
| 2015 | printTableRow('%-12s', '0') |
| 2016 | - elif (gpu_links_weight[gpu1][gpu2] == None): |
| 2017 | + elif (gpu_links_weight[gpu1][gpu2] is None): |
| 2018 | printTableRow('%-12s', 'N/A') |
| 2019 | else: |
| 2020 | printTableRow('%-12s', gpu_links_weight[gpu1][gpu2].value) |
| 2021 | @@ -3319,7 +3321,6 @@ def showHopsTopology(deviceList): |
| 2022 | if rsmi_ret_ok(ret, metric='get_link_type_topology'): |
| 2023 | gpu_links_hops[srcdevice][destdevice] = hops |
| 2024 | else: |
| 2025 | - printErrLog(srcdevice, 'Cannot read Link Hops: Not supported on this machine') |
| 2026 | gpu_links_hops[srcdevice][destdevice] = None |
| 2027 | |
| 2028 | if PRINT_JSON: |
| 2029 | @@ -3337,7 +3338,7 @@ def showHopsTopology(deviceList): |
| 2030 | for gpu2 in deviceList: |
| 2031 | if (gpu1 == gpu2): |
| 2032 | printTableRow('%-12s', '0') |
| 2033 | - elif (gpu_links_hops[gpu1][gpu2] == None): |
| 2034 | + elif (gpu_links_hops[gpu1][gpu2] is None): |
| 2035 | printTableRow('%-12s', 'N/A') |
| 2036 | else: |
| 2037 | printTableRow('%-12s', gpu_links_hops[gpu1][gpu2].value) |
| 2038 | @@ -3370,7 +3371,6 @@ def showTypeTopology(deviceList): |
| 2039 | else: |
| 2040 | gpu_links_type[srcdevice][destdevice] = "XXXX" |
| 2041 | else: |
| 2042 | - printErrLog(srcdevice, 'Cannot read Link Type: Not supported on this machine') |
| 2043 | gpu_links_type[srcdevice][destdevice] = "XXXX" |
| 2044 | |
| 2045 | if PRINT_JSON: |
| 2046 | @@ -3406,14 +3406,10 @@ def showNumaTopology(deviceList): |
| 2047 | ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_numbers)) |
| 2048 | if rsmi_ret_ok(ret, device, 'get_numa_node_number'): |
| 2049 | printLog(device, "(Topology) Numa Node", numa_numbers.value) |
| 2050 | - else: |
| 2051 | - printErrLog(device, "Cannot read Numa Node") |
| 2052 | |
| 2053 | ret = rocmsmi.rsmi_topo_numa_affinity_get(device, byref(numa_numbers)) |
| 2054 | if rsmi_ret_ok(ret, metric='get_numa_affinity_topology'): |
| 2055 | printLog(device, "(Topology) Numa Affinity", numa_numbers.value) |
| 2056 | - else: |
| 2057 | - printErrLog(device, 'Cannot read Numa Affinity') |
| 2058 | |
| 2059 | |
| 2060 | def showHwTopology(deviceList): |
| 2061 | @@ -3496,8 +3492,7 @@ def showComputePartition(deviceList): |
| 2062 | elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: |
| 2063 | printLog(device, 'Not supported on the given system', None) |
| 2064 | else: |
| 2065 | - rsmi_ret_ok(ret, device, 'get_compute_partition') |
| 2066 | - printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.') |
| 2067 | + printLog(device, 'Failed to retrieve compute partition, even though device supports it.') |
| 2068 | printLogSpacer() |
| 2069 | |
| 2070 | def showMemoryPartition(deviceList): |
| 2071 | @@ -3514,8 +3509,7 @@ def showMemoryPartition(deviceList): |
| 2072 | elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: |
| 2073 | printLog(device, 'Not supported on the given system', None) |
| 2074 | else: |
| 2075 | - rsmi_ret_ok(ret, device, 'get_memory_partition') |
| 2076 | - printErrLog(device, 'Failed to retrieve current memory partition, even though device supports it.') |
| 2077 | + printLog(device, 'Failed to retrieve current memory partition, even though device supports it.') |
| 2078 | printLogSpacer() |
| 2079 | |
| 2080 | class UIntegerTypes(IntEnum): |
| 2081 | @@ -3799,6 +3793,38 @@ def showGPUMetrics(deviceList): |
| 2082 | }, |
| 2083 | "xcp_stats.gfx_below_host_limit_acc": { |
| 2084 | "value": gpu_metrics.xcp_stats, |
| 2085 | + "unit": count, |
| 2086 | + }, |
| 2087 | + "xcp_stats.gfx_below_host_limit_ppt_acc": { |
| 2088 | + "value": gpu_metrics.xcp_stats, |
| 2089 | + "unit": count, |
| 2090 | + }, |
| 2091 | + "xcp_stats.gfx_below_host_limit_thm_acc": { |
| 2092 | + "value": gpu_metrics.xcp_stats, |
| 2093 | + "unit": count, |
| 2094 | + }, |
| 2095 | + "xcp_stats.gfx_low_utilization_acc": { |
| 2096 | + "value": gpu_metrics.xcp_stats, |
| 2097 | + "unit": count, |
| 2098 | + }, |
| 2099 | + "xcp_stats.gfx_below_host_limit_total_acc": { |
| 2100 | + "value": gpu_metrics.xcp_stats, |
| 2101 | + "unit": count, |
| 2102 | + }, |
| 2103 | + "xcp_stats.gfx_below_host_limit_ppt_acc": { |
| 2104 | + "value": gpu_metrics.xcp_stats, |
| 2105 | + "unit": percent_unit, |
| 2106 | + }, |
| 2107 | + "xcp_stats.gfx_below_host_limit_thm_acc": { |
| 2108 | + "value": gpu_metrics.xcp_stats, |
| 2109 | + "unit": percent_unit, |
| 2110 | + }, |
| 2111 | + "xcp_stats.gfx_low_utilization_acc": { |
| 2112 | + "value": gpu_metrics.xcp_stats, |
| 2113 | + "unit": percent_unit, |
| 2114 | + }, |
| 2115 | + "xcp_stats.gfx_below_host_limit_total_acc": { |
| 2116 | + "value": gpu_metrics.xcp_stats, |
| 2117 | "unit": percent_unit, |
| 2118 | }, |
| 2119 | } |
| 2120 | @@ -3841,14 +3867,37 @@ def showGPUMetrics(deviceList): |
| 2121 | for _, val in enumerate(item.gfx_below_host_limit_acc): |
| 2122 | print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) |
| 2123 | printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) |
| 2124 | + if 'xcp_stats.gfx_below_host_limit_ppt_acc' in k: |
| 2125 | + for curr_xcp, item in enumerate(v['value']): |
| 2126 | + print_xcp_detail = [] |
| 2127 | + for _, val in enumerate(item.gfx_below_host_limit_ppt_acc): |
| 2128 | + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) |
| 2129 | + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) |
| 2130 | + if 'xcp_stats.gfx_below_host_limit_thm_acc' in k: |
| 2131 | + for curr_xcp, item in enumerate(v['value']): |
| 2132 | + print_xcp_detail = [] |
| 2133 | + for _, val in enumerate(item.gfx_below_host_limit_thm_acc): |
| 2134 | + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) |
| 2135 | + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) |
| 2136 | + if 'xcp_stats.gfx_low_utilization_acc' in k: |
| 2137 | + for curr_xcp, item in enumerate(v['value']): |
| 2138 | + print_xcp_detail = [] |
| 2139 | + for _, val in enumerate(item.gfx_low_utilization_acc): |
| 2140 | + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) |
| 2141 | + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) |
| 2142 | + if 'xcp_stats.gfx_below_host_limit_total_acc' in k: |
| 2143 | + for curr_xcp, item in enumerate(v['value']): |
| 2144 | + print_xcp_detail = [] |
| 2145 | + for _, val in enumerate(item.gfx_below_host_limit_total_acc): |
| 2146 | + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) |
| 2147 | + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) |
| 2148 | |
| 2149 | if int(device) < (len(deviceList) - 1): |
| 2150 | printLogSpacer() |
| 2151 | elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: |
| 2152 | printLog(device, 'Not supported on the given system', None) |
| 2153 | else: |
| 2154 | - rsmi_ret_ok(ret, device, 'get_gpu_metrics') |
| 2155 | - printErrLog(device, 'Failed to retrieve GPU metrics, metric version may not be supported for this device.') |
| 2156 | + printLog(device, 'Failed to retrieve GPU metrics, metric version may not be supported for this device.') |
| 2157 | printLogSpacer() |
| 2158 | |
| 2159 | def checkAmdGpus(deviceList): |
| 2160 | @@ -3863,6 +3912,44 @@ def checkAmdGpus(deviceList): |
| 2161 | return False |
| 2162 | |
| 2163 | |
| 2164 | +def check_runtime_status() -> bool: |
| 2165 | + """Check the runtime status of all AMD GPU devices managed by the amdgpu driver. |
| 2166 | + |
| 2167 | + This function scans the directories under the specified path to verify the |
| 2168 | + runtime power management status of each device. It checks the "runtime_status" |
| 2169 | + file for each device to determine if the device is in an "active" state. If any |
| 2170 | + device is not in an "active" state it returns False. If the file is inaccessible, |
| 2171 | + this may be due to a system that does not support runtime power management. |
| 2172 | + Some GPUs support runtime power management, while others may not. This is why the default status |
| 2173 | + is set to True. |
| 2174 | + |
| 2175 | + bool: False if any device is not in "active" state, True otherwise. |
| 2176 | + """ |
| 2177 | + base_path = "/sys/class/drm" |
| 2178 | + status = True # Default to True, assuming active unless proven otherwise |
| 2179 | + for device in os.listdir(base_path): |
| 2180 | + if os.path.isdir(os.path.join(base_path, device)): |
| 2181 | + runtime_status_path = os.path.join(base_path, device, "power", "runtime_status") |
| 2182 | + try: |
| 2183 | + with open(runtime_status_path, 'r') as file: |
| 2184 | + current_status = file.read().strip() |
| 2185 | + if current_status != "active": |
| 2186 | + status = False |
| 2187 | + continue |
| 2188 | + else: |
| 2189 | + logging.debug(f"Runtime status for {device}: {current_status}") |
| 2190 | + status = True |
| 2191 | + except FileNotFoundError: |
| 2192 | + # File does not exist, skip this device |
| 2193 | + continue |
| 2194 | + except PermissionError as e: |
| 2195 | + # Handle permission errors gracefully |
| 2196 | + logging.debug(f"Permission denied while accessing {runtime_status_path} \nError: {e}") |
| 2197 | + continue |
| 2198 | + else: |
| 2199 | + pass |
| 2200 | + return status |
| 2201 | + |
| 2202 | def component_str(component): |
| 2203 | """ Returns the component String value |
| 2204 | |
| 2205 | @@ -4374,7 +4461,7 @@ if __name__ == '__main__': |
| 2206 | |
| 2207 | if not PRINT_JSON: |
| 2208 | print('\n') |
| 2209 | - if not isConciseInfoRequested(args) and args.showhw == False: |
| 2210 | + if not isConciseInfoRequested(args) and not args.showhw: |
| 2211 | printLogSpacer(headerString) |
| 2212 | |
| 2213 | if args.showallinfo: |
| 2214 | @@ -4429,7 +4516,8 @@ if __name__ == '__main__': |
| 2215 | |
| 2216 | if not checkAmdGpus(deviceList): |
| 2217 | logging.warning('No AMD GPUs specified') |
| 2218 | - |
| 2219 | + if not check_runtime_status(): |
| 2220 | + logging.warning('AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n') |
| 2221 | if isConciseInfoRequested(args): |
| 2222 | showAllConcise(deviceList) |
| 2223 | if args.showhw: |
| 2224 | @@ -4482,7 +4570,7 @@ if __name__ == '__main__': |
| 2225 | showPcieReplayCount(deviceList) |
| 2226 | if args.showserial: |
| 2227 | showSerialNumber(deviceList) |
| 2228 | - if args.showpids != None: |
| 2229 | + if args.showpids is not None: |
| 2230 | showPids(args.showpids) |
| 2231 | if args.showpidgpus or str(args.showpidgpus) == '[]': |
| 2232 | showGpusByPid(args.showpidgpus) |
| 2233 | @@ -4626,10 +4714,10 @@ if __name__ == '__main__': |
| 2234 | devCsv = '' |
| 2235 | sysCsv = '' |
| 2236 | # JSON won't have any 'system' data without one of these flags |
| 2237 | - if args.showdriverversion and args.showallinfo == False: |
| 2238 | + if args.showdriverversion and not args.showallinfo: |
| 2239 | sysCsv = formatCsv(['system']) |
| 2240 | print('%s' % (sysCsv)) |
| 2241 | - elif args.showallinfo is True: |
| 2242 | + elif args.showallinfo: |
| 2243 | sysCsv = formatCsv(['system']) |
| 2244 | devCsv = formatCsv(deviceList) |
| 2245 | print('%s\n%s' % (sysCsv, devCsv)) |
| 2246 | @@ -4637,8 +4725,8 @@ if __name__ == '__main__': |
| 2247 | devCsv = formatCsv(deviceList) |
| 2248 | print(devCsv) |
| 2249 | |
| 2250 | - if not isConciseInfoRequested(args) and args.showhw == False: |
| 2251 | + if not isConciseInfoRequested(args) and not args.showhw: |
| 2252 | printLogSpacer(footerString) |
| 2253 | |
| 2254 | rsmi_ret_ok(rocmsmi.rsmi_shut_down()) |
| 2255 | - exit(RETCODE) |
| 2256 | \ No newline at end of file |
| 2257 | + exit(RETCODE) |
| 2258 | diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py |
| 2259 | index 3a8d11a..69c7860 100644 |
| 2260 | --- a/python_smi_tools/rsmiBindings.py |
| 2261 | +++ b/python_smi_tools/rsmiBindings.py |
| 2262 | @@ -108,7 +108,19 @@ class rsmi_dev_perf_level_t(c_int): |
| 2263 | RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 |
| 2264 | |
| 2265 | |
| 2266 | -notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG'] |
| 2267 | +notification_type_names = [ |
| 2268 | + 'VM_FAULT', |
| 2269 | + 'THERMAL_THROTTLE', |
| 2270 | + 'GPU_PRE_RESET', |
| 2271 | + 'GPU_POST_RESET', |
| 2272 | + 'MIGRATE_START', |
| 2273 | + 'MIGRATE_END', |
| 2274 | + 'PAGE_FAULT_START', |
| 2275 | + 'PAGE_FAULT_END', |
| 2276 | + 'QUEUE_EVICTION', |
| 2277 | + 'QUEUE_RESTORE', |
| 2278 | + 'UNMAP_FROM_GPU' |
| 2279 | +] |
| 2280 | |
| 2281 | |
| 2282 | class rsmi_evt_notification_type_t(c_int): |
| 2283 | @@ -118,8 +130,14 @@ class rsmi_evt_notification_type_t(c_int): |
| 2284 | RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 |
| 2285 | RSMI_EVT_NOTIF_GPU_PRE_RESET = 3 |
| 2286 | RSMI_EVT_NOTIF_GPU_POST_RESET = 4 |
| 2287 | - RSMI_EVT_NOTIF_RING_HANG = 5 |
| 2288 | - RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG |
| 2289 | + RSMI_EVT_NOTIF_MIGRATE_START = 5 |
| 2290 | + RSMI_EVT_NOTIF_MIGRATE_END = 6 |
| 2291 | + RSMI_EVT_NOTIF_PAGE_FAULT_START = 7 |
| 2292 | + RSMI_EVT_NOTIF_PAGE_FAULT_END = 8 |
| 2293 | + RSMI_EVT_NOTIF_QUEUE_EVICTION = 9 |
| 2294 | + RSMI_EVT_NOTIF_QUEUE_RESTORE = 10 |
| 2295 | + RSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11 |
| 2296 | + RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_UNMAP_FROM_GPU |
| 2297 | |
| 2298 | |
| 2299 | class rsmi_voltage_metric_t(c_int): |
| 2300 | @@ -545,11 +563,12 @@ class rsmi_error_count_t(Structure): |
| 2301 | _fields_ = [('correctable_err', c_uint64), |
| 2302 | ('uncorrectable_err', c_uint64)] |
| 2303 | |
| 2304 | +MAX_EVENT_NOTIFICATION_MSG_SIZE = 96 |
| 2305 | |
| 2306 | class rsmi_evt_notification_data_t(Structure): |
| 2307 | _fields_ = [('dv_ind', c_uint32), |
| 2308 | ('event', rsmi_evt_notification_type_t), |
| 2309 | - ('message', c_char*64)] |
| 2310 | + ('message', c_char*MAX_EVENT_NOTIFICATION_MSG_SIZE)] |
| 2311 | |
| 2312 | |
| 2313 | class rsmi_process_info_t(Structure): |
| 2314 | @@ -666,10 +685,14 @@ class amdgpu_xcp_metrics_t(Structure): |
| 2315 | # amdgpu_xcp_metrics_t._pack_ = 1 # source:False |
| 2316 | amdgpu_xcp_metrics_t._fields_ = [ |
| 2317 | ('gfx_busy_inst', c_uint32 * 8), |
| 2318 | - ('jpeg_busy', c_uint16 * 32), |
| 2319 | + ('jpeg_busy', c_uint16 * 40), |
| 2320 | ('vcn_busy', c_uint16 * 4), |
| 2321 | ('gfx_busy_acc', c_uint64 * 8), |
| 2322 | ('gfx_below_host_limit_acc', c_uint64 * 8), |
| 2323 | + ('gfx_below_host_limit_ppt_acc', c_uint64 * 8), |
| 2324 | + ('gfx_below_host_limit_thm_acc', c_uint64 * 8), |
| 2325 | + ('gfx_low_utilization_acc', c_uint64 * 8), |
| 2326 | + ('gfx_below_host_limit_total_acc', c_uint64 * 8), |
| 2327 | ] |
| 2328 | xcp_stats_t = amdgpu_xcp_metrics_t |
| 2329 | |
| 2330 | diff --git a/python_smi_tools/rsmiBindings.py.in b/python_smi_tools/rsmiBindings.py.in |
| 2331 | index 18a8535..aaed228 100644 |
| 2332 | --- a/python_smi_tools/rsmiBindings.py.in |
| 2333 | +++ b/python_smi_tools/rsmiBindings.py.in |
| 2334 | @@ -23,7 +23,7 @@ def initRsmiBindings(silent=False): |
| 2335 | print(args) |
| 2336 | |
| 2337 | rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') |
| 2338 | - if (rocm_smi_lib_path != None): |
| 2339 | + if (rocm_smi_lib_path is not None): |
| 2340 | path_librocm = rocm_smi_lib_path |
| 2341 | else: |
| 2342 | path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' |
| 2343 | diff --git a/python_smi_tools/rsmiBindingsInit.py.in b/python_smi_tools/rsmiBindingsInit.py.in |
| 2344 | index 12b9218..7c75c4a 100644 |
| 2345 | --- a/python_smi_tools/rsmiBindingsInit.py.in |
| 2346 | +++ b/python_smi_tools/rsmiBindingsInit.py.in |
| 2347 | @@ -23,7 +23,7 @@ def initRsmiBindings(silent=False): |
| 2348 | print(args) |
| 2349 | |
| 2350 | rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') |
| 2351 | - if (rocm_smi_lib_path != None): |
| 2352 | + if (rocm_smi_lib_path is not None): |
| 2353 | path_librocm = rocm_smi_lib_path |
| 2354 | else: |
| 2355 | path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' |
| 2356 | diff --git a/rocm_smi-backward-compat.cmake b/rocm_smi-backward-compat.cmake |
| 2357 | deleted file mode 100644 |
| 2358 | index d53542b..0000000 |
| 2359 | --- a/rocm_smi-backward-compat.cmake |
| 2360 | +++ /dev/null |
| 2361 | @@ -1,200 +0,0 @@ |
| 2362 | -# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. |
| 2363 | -# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2364 | -# of this software and associated documentation files (the "Software"), to deal |
| 2365 | -# in the Software without restriction, including without limitation the rights |
| 2366 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 2367 | -# copies of the Software, and to permit persons to whom the Software is |
| 2368 | -# furnished to do so, subject to the following conditions: |
| 2369 | -# |
| 2370 | -# The above copyright notice and this permission notice shall be included in |
| 2371 | -# all copies or substantial portions of the Software. |
| 2372 | -# |
| 2373 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 2374 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 2375 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 2376 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 2377 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 2378 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 2379 | -# THE SOFTWARE. |
| 2380 | - |
| 2381 | -cmake_minimum_required(VERSION 3.16.8) |
| 2382 | - |
| 2383 | -set(RSMI_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) |
| 2384 | -set(RSMI_WRAPPER_DIR ${RSMI_BUILD_DIR}/wrapper_dir) |
| 2385 | -set(RSMI_WRAPPER_INC_DIR ${RSMI_WRAPPER_DIR}/include/${ROCM_SMI}) |
| 2386 | -set(OAM_TARGET_NAME "oam") |
| 2387 | -set(OAM_WRAPPER_INC_DIR ${RSMI_WRAPPER_DIR}/include/${OAM_TARGET_NAME}) |
| 2388 | -set(RSMI_WRAPPER_LIB_DIR ${RSMI_WRAPPER_DIR}/${ROCM_SMI}/lib) |
| 2389 | -set(OAM_WRAPPER_LIB_DIR ${RSMI_WRAPPER_DIR}/${OAM_TARGET_NAME}/lib) |
| 2390 | -## package headers |
| 2391 | -set(PUBLIC_RSMI_HEADERS |
| 2392 | - rocm_smi.h |
| 2393 | - ${ROCM_SMI_TARGET}Config.h |
| 2394 | - kfd_ioctl.h) |
| 2395 | -set(OAM_HEADERS |
| 2396 | - oam_mapi.h |
| 2397 | - amd_oam.h) |
| 2398 | - |
| 2399 | -#Function to generate header template file |
| 2400 | -function(create_header_template) |
| 2401 | - file(WRITE ${RSMI_WRAPPER_DIR}/header.hpp.in "/* |
| 2402 | - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
| 2403 | - |
| 2404 | - Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2405 | - of this software and associated documentation files (the \"Software\"), to deal |
| 2406 | - in the Software without restriction, including without limitation the rights |
| 2407 | - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 2408 | - copies of the Software, and to permit persons to whom the Software is |
| 2409 | - furnished to do so, subject to the following conditions: |
| 2410 | - |
| 2411 | - The above copyright notice and this permission notice shall be included in |
| 2412 | - all copies or substantial portions of the Software. |
| 2413 | - |
| 2414 | - THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 2415 | - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 2416 | - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 2417 | - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 2418 | - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 2419 | - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 2420 | - THE SOFTWARE. |
| 2421 | - */ |
| 2422 | - |
| 2423 | -#ifndef @include_guard@ |
| 2424 | -#define @include_guard@ |
| 2425 | - |
| 2426 | -#ifndef ROCM_HEADER_WRAPPER_WERROR |
| 2427 | -#define ROCM_HEADER_WRAPPER_WERROR @deprecated_error@ |
| 2428 | -#endif |
| 2429 | -#if ROCM_HEADER_WRAPPER_WERROR /* ROCM_HEADER_WRAPPER_WERROR 1 */ |
| 2430 | -#error \"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with @prefix_name@\" |
| 2431 | -#else /* ROCM_HEADER_WRAPPER_WERROR 0 */ |
| 2432 | -#if defined(__GNUC__) |
| 2433 | -#warning \"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with @prefix_name@\" |
| 2434 | -#else |
| 2435 | -#pragma message(\"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with @prefix_name@\") |
| 2436 | -#endif |
| 2437 | -#endif /* ROCM_HEADER_WRAPPER_WERROR */ |
| 2438 | - |
| 2439 | -@include_statements@ |
| 2440 | - |
| 2441 | -#endif") |
| 2442 | -endfunction() |
| 2443 | - |
| 2444 | -#use header template file and generate wrapper header files |
| 2445 | -function(generate_wrapper_header) |
| 2446 | - file(MAKE_DIRECTORY ${RSMI_WRAPPER_INC_DIR}) |
| 2447 | - set(prefix_name "${prefix_name}${ROCM_SMI}") |
| 2448 | - #Generate wrapper header files from the list |
| 2449 | - foreach(header_file ${PUBLIC_RSMI_HEADERS}) |
| 2450 | - # set include guard |
| 2451 | - get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE) |
| 2452 | - string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME) |
| 2453 | - set(include_guard "${include_guard}COMGR_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") |
| 2454 | - #set #include statement |
| 2455 | - get_filename_component(file_name ${header_file} NAME) |
| 2456 | - set(include_statements "${include_statements}#include \"../../../${CMAKE_INSTALL_INCLUDEDIR}/${ROCM_SMI}/${file_name}\"\n") |
| 2457 | - configure_file(${RSMI_WRAPPER_DIR}/header.hpp.in ${RSMI_WRAPPER_INC_DIR}/${file_name}) |
| 2458 | - unset(include_guard) |
| 2459 | - unset(include_statements) |
| 2460 | - endforeach() |
| 2461 | - unset(prefix_name) |
| 2462 | - |
| 2463 | -#OAM Wrpper Header file generation |
| 2464 | - file(MAKE_DIRECTORY ${OAM_WRAPPER_INC_DIR}) |
| 2465 | - set(prefix_name "${prefix_name}${OAM_TARGET_NAME}") |
| 2466 | - #Generate wrapper header files from the list |
| 2467 | - foreach(header_file ${OAM_HEADERS}) |
| 2468 | - # set include guard |
| 2469 | - get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE) |
| 2470 | - string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME) |
| 2471 | - set(include_guard "${include_guard}COMGR_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") |
| 2472 | - #set #include statement |
| 2473 | - get_filename_component(file_name ${header_file} NAME) |
| 2474 | - set(include_statements "${include_statements}#include \"../../../${CMAKE_INSTALL_INCLUDEDIR}/${OAM_TARGET_NAME}/${file_name}\"\n") |
| 2475 | - configure_file(${RSMI_WRAPPER_DIR}/header.hpp.in ${OAM_WRAPPER_INC_DIR}/${file_name}) |
| 2476 | - unset(include_guard) |
| 2477 | - unset(include_statements) |
| 2478 | - endforeach() |
| 2479 | - unset(prefix_name) |
| 2480 | - |
| 2481 | -endfunction() |
| 2482 | - |
| 2483 | -#function to create symlink to libraries |
| 2484 | -function(create_library_symlink) |
| 2485 | - |
| 2486 | - file(MAKE_DIRECTORY ${RSMI_WRAPPER_LIB_DIR}) |
| 2487 | - if(BUILD_SHARED_LIBS) |
| 2488 | - |
| 2489 | - #get rsmi lib versions |
| 2490 | - set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver") |
| 2491 | - get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) |
| 2492 | - if(${ROCM_PATCH_VERSION}) |
| 2493 | - set(VERSION_PATCH ${ROCM_PATCH_VERSION}) |
| 2494 | - set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") |
| 2495 | - else() |
| 2496 | - set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") |
| 2497 | - endif() |
| 2498 | - |
| 2499 | - #link RSMI library files |
| 2500 | - set(LIB_RSMI "${ROCM_SMI_LIB_NAME}.so") |
| 2501 | - set(library_files "${LIB_RSMI}" "${LIB_RSMI}.${VERSION_MAJOR}" "${LIB_RSMI}.${SO_VERSION_STRING}") |
| 2502 | - else() |
| 2503 | - set(LIB_RSMI "${ROCM_SMI_LIB_NAME}.a") |
| 2504 | - set(library_files "${LIB_RSMI}") |
| 2505 | - endif() |
| 2506 | - |
| 2507 | - foreach(file_name ${library_files}) |
| 2508 | - add_custom_target(link_${file_name} ALL |
| 2509 | - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} |
| 2510 | - COMMAND ${CMAKE_COMMAND} -E create_symlink |
| 2511 | - ../../${CMAKE_INSTALL_LIBDIR}/${file_name} ${RSMI_WRAPPER_LIB_DIR}/${file_name}) |
| 2512 | - endforeach() |
| 2513 | - |
| 2514 | - file(MAKE_DIRECTORY ${OAM_WRAPPER_LIB_DIR}) |
| 2515 | - if(BUILD_SHARED_LIBS) |
| 2516 | - |
| 2517 | - #get OAM lib versions |
| 2518 | - set(SO_VERSION_GIT_TAG_PREFIX "oam_so_ver") |
| 2519 | - get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) |
| 2520 | - if(${ROCM_PATCH_VERSION}) |
| 2521 | - set(VERSION_PATCH ${ROCM_PATCH_VERSION}) |
| 2522 | - set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") |
| 2523 | - else() |
| 2524 | - set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") |
| 2525 | - endif() |
| 2526 | - |
| 2527 | - #link OAM library files |
| 2528 | - set(LIB_OAM "lib${OAM_TARGET_NAME}.so") |
| 2529 | - set(library_files "${LIB_OAM}" "${LIB_OAM}.${VERSION_MAJOR}" "${LIB_OAM}.${SO_VERSION_STRING}") |
| 2530 | - else() |
| 2531 | - set(LIB_OAM "lib${OAM_TARGET_NAME}.a") |
| 2532 | - set(library_files "${LIB_OAM}") |
| 2533 | - endif() |
| 2534 | - |
| 2535 | - foreach(file_name ${library_files}) |
| 2536 | - add_custom_target(link_${file_name} ALL |
| 2537 | - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} |
| 2538 | - COMMAND ${CMAKE_COMMAND} -E create_symlink |
| 2539 | - ../../${CMAKE_INSTALL_LIBDIR}/${file_name} ${OAM_WRAPPER_LIB_DIR}/${file_name}) |
| 2540 | - endforeach() |
| 2541 | - |
| 2542 | -endfunction() |
| 2543 | - |
| 2544 | -#Creater a template for header file |
| 2545 | -create_header_template() |
| 2546 | -#Use template header file and generater wrapper header files |
| 2547 | -generate_wrapper_header() |
| 2548 | -install(DIRECTORY ${RSMI_WRAPPER_INC_DIR} |
| 2549 | - DESTINATION ${ROCM_SMI}/include |
| 2550 | - COMPONENT dev) |
| 2551 | -install(DIRECTORY ${OAM_WRAPPER_INC_DIR} |
| 2552 | - DESTINATION ${OAM_TARGET_NAME}/include |
| 2553 | - COMPONENT dev) |
| 2554 | -# Create symlink to library files |
| 2555 | -create_library_symlink() |
| 2556 | -install(DIRECTORY ${RSMI_WRAPPER_LIB_DIR} |
| 2557 | - DESTINATION ${ROCM_SMI} |
| 2558 | - COMPONENT dev) |
| 2559 | -install(DIRECTORY ${OAM_WRAPPER_LIB_DIR} |
| 2560 | - DESTINATION ${OAM_TARGET_NAME} |
| 2561 | - COMPONENT dev ) |
| 2562 | diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt |
| 2563 | old mode 100755 |
| 2564 | new mode 100644 |
| 2565 | index 257309b..23485ae |
| 2566 | --- a/rocm_smi/CMakeLists.txt |
| 2567 | +++ b/rocm_smi/CMakeLists.txt |
| 2568 | @@ -88,15 +88,13 @@ target_include_directories(${ROCM_SMI_TARGET} PRIVATE |
| 2569 | # use the target_include_directories() command to specify the include directories for the target |
| 2570 | target_include_directories(${ROCM_SMI_TARGET} |
| 2571 | PUBLIC |
| 2572 | + "$<BUILD_INTERFACE:${DRM_INCLUDE_DIRS}>" |
| 2573 | + "$<BUILD_INTERFACE:${AMDGPU_DRM_INCLUDE_DIRS}>" |
| 2574 | "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" |
| 2575 | "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>" |
| 2576 | ) |
| 2577 | |
| 2578 | -if(FILE_REORG_BACKWARD_COMPATIBILITY) |
| 2579 | - target_include_directories(${ROCM_SMI_TARGET} |
| 2580 | - PUBLIC |
| 2581 | - "$<INSTALL_INTERFACE:${ROCM_SMI}/include>") |
| 2582 | -endif() |
| 2583 | +target_include_directories(${ROCM_SMI_TARGET} INTERFACE ${DRM_INCLUDE_DIRS}) |
| 2584 | |
| 2585 | ## Set the VERSION and SOVERSION values |
| 2586 | set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY |
| 2587 | diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc |
| 2588 | old mode 100755 |
| 2589 | new mode 100644 |
| 2590 | index f6a3b8b..a370abe |
| 2591 | --- a/rocm_smi/example/rocm_smi_example.cc |
| 2592 | +++ b/rocm_smi/example/rocm_smi_example.cc |
| 2593 | @@ -5,7 +5,7 @@ |
| 2594 | * The University of Illinois/NCSA |
| 2595 | * Open Source License (NCSA) |
| 2596 | * |
| 2597 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 2598 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 2599 | * All rights reserved. |
| 2600 | * |
| 2601 | * Developed by: |
| 2602 | @@ -991,6 +991,11 @@ int main() { |
| 2603 | std::cout << "\t -> " << std::dec << dclk << "\n"; |
| 2604 | } |
| 2605 | |
| 2606 | + std::cout << "\t**.jpeg_activity[] : " << std::dec << "\n"; |
| 2607 | + for (const auto& jpeg : gpu_metrics.jpeg_activity) { |
| 2608 | + std::cout << "\t -> " << std::dec << jpeg << "\n"; |
| 2609 | + } |
| 2610 | + |
| 2611 | std::cout << std::dec << "xcp_stats.gfx_busy_inst = \n"; |
| 2612 | auto xcp = 0; |
| 2613 | for (auto& row : gpu_metrics.xcp_stats) { |
| 2614 | @@ -1046,6 +1051,50 @@ int main() { |
| 2615 | xcp++; |
| 2616 | } |
| 2617 | |
| 2618 | + xcp = 0; |
| 2619 | + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_ppt_acc = \n"; // new for 1.8 |
| 2620 | + for (auto& row : gpu_metrics.xcp_stats) { |
| 2621 | + std::cout << "XCP[" << xcp << "] = " << "[ "; |
| 2622 | + std::copy(std::begin(row.gfx_below_host_limit_ppt_acc), |
| 2623 | + std::end(row.gfx_below_host_limit_ppt_acc), |
| 2624 | + amd::smi::make_ostream_joiner(&std::cout, ", ")); |
| 2625 | + std::cout << " ]\n"; |
| 2626 | + xcp++; |
| 2627 | + } |
| 2628 | + |
| 2629 | + xcp = 0; |
| 2630 | + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_thm_acc = \n"; // new for 1.8 |
| 2631 | + for (auto& row : gpu_metrics.xcp_stats) { |
| 2632 | + std::cout << "XCP[" << xcp << "] = " << "[ "; |
| 2633 | + std::copy(std::begin(row.gfx_below_host_limit_thm_acc), |
| 2634 | + std::end(row.gfx_below_host_limit_thm_acc), |
| 2635 | + amd::smi::make_ostream_joiner(&std::cout, ", ")); |
| 2636 | + std::cout << " ]\n"; |
| 2637 | + xcp++; |
| 2638 | + } |
| 2639 | + |
| 2640 | + xcp = 0; |
| 2641 | + std::cout << std::dec << "xcp_stats.gfx_low_utilization_acc = \n"; |
| 2642 | + for (auto& row : gpu_metrics.xcp_stats) { |
| 2643 | + std::cout << "XCP[" << xcp << "] = " << "[ "; |
| 2644 | + std::copy(std::begin(row.gfx_low_utilization_acc), |
| 2645 | + std::end(row.gfx_low_utilization_acc), |
| 2646 | + amd::smi::make_ostream_joiner(&std::cout, ", ")); |
| 2647 | + std::cout << " ]\n"; |
| 2648 | + xcp++; |
| 2649 | + } |
| 2650 | + |
| 2651 | + xcp = 0; |
| 2652 | + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_total_acc = \n"; |
| 2653 | + for (auto& row : gpu_metrics.xcp_stats) { |
| 2654 | + std::cout << "XCP[" << xcp << "] = " << "[ "; |
| 2655 | + std::copy(std::begin(row.gfx_below_host_limit_total_acc), |
| 2656 | + std::end(row.gfx_below_host_limit_total_acc), |
| 2657 | + amd::smi::make_ostream_joiner(&std::cout, ", ")); |
| 2658 | + std::cout << " ]\n"; |
| 2659 | + xcp++; |
| 2660 | + } |
| 2661 | + |
| 2662 | std::cout << "\n"; |
| 2663 | std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; |
| 2664 | constexpr uint16_t kMAX_ITER_TEST = 10; |
| 2665 | diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc |
| 2666 | old mode 100755 |
| 2667 | new mode 100644 |
| 2668 | index 758ef3a..561798a |
| 2669 | --- a/src/rocm_smi.cc |
| 2670 | +++ b/src/rocm_smi.cc |
| 2671 | @@ -3,7 +3,7 @@ |
| 2672 | * The University of Illinois/NCSA |
| 2673 | * Open Source License (NCSA) |
| 2674 | * |
| 2675 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 2676 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 2677 | * All rights reserved. |
| 2678 | * |
| 2679 | * Developed by: |
| 2680 | @@ -48,6 +48,7 @@ |
| 2681 | #include <fcntl.h> |
| 2682 | #include <poll.h> |
| 2683 | #include <pthread.h> |
| 2684 | +#include <inttypes.h> |
| 2685 | |
| 2686 | #include <cstddef> |
| 2687 | #include <string> |
| 2688 | @@ -198,39 +199,6 @@ static uint64_t freq_string_to_int(const std::vector<std::string> &freq_lines, |
| 2689 | return static_cast<uint64_t>(freq*multiplier); |
| 2690 | } |
| 2691 | |
| 2692 | -static void freq_volt_string_to_point(std::string in_line, |
| 2693 | - rsmi_od_vddc_point_t *pt) { |
| 2694 | - std::istringstream fs_vlt(in_line); |
| 2695 | - |
| 2696 | - assert(pt != nullptr); |
| 2697 | - THROW_IF_NULLPTR_DEREF(pt) |
| 2698 | - |
| 2699 | - uint32_t ind; |
| 2700 | - float freq; |
| 2701 | - float volts; |
| 2702 | - std::string junk; |
| 2703 | - std::string freq_units_str; |
| 2704 | - std::string volts_units_str; |
| 2705 | - |
| 2706 | - fs_vlt >> ind; |
| 2707 | - fs_vlt >> junk; // colon |
| 2708 | - fs_vlt >> freq; |
| 2709 | - fs_vlt >> freq_units_str; |
| 2710 | - fs_vlt >> volts; |
| 2711 | - fs_vlt >> volts_units_str; |
| 2712 | - |
| 2713 | - if (freq < 0) { |
| 2714 | - throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__); |
| 2715 | - } |
| 2716 | - |
| 2717 | - long double multiplier = get_multiplier_from_str(freq_units_str[0]); |
| 2718 | - |
| 2719 | - pt->frequency = static_cast<uint64_t>(freq*multiplier); |
| 2720 | - |
| 2721 | - multiplier = get_multiplier_from_str(volts_units_str[0]); |
| 2722 | - pt->voltage = static_cast<uint64_t>(volts*multiplier); |
| 2723 | -} |
| 2724 | - |
| 2725 | static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { |
| 2726 | std::istringstream fs_rng(in_line); |
| 2727 | |
| 2728 | @@ -318,6 +286,7 @@ static rsmi_status_t get_dev_value_str(amd::smi::DevInfoTypes type, |
| 2729 | |
| 2730 | return amd::smi::ErrnoToRsmiStatus(ret); |
| 2731 | } |
| 2732 | + |
| 2733 | static rsmi_status_t get_dev_value_int(amd::smi::DevInfoTypes type, |
| 2734 | uint32_t dv_ind, uint64_t *val_int) { |
| 2735 | assert(val_int != nullptr); |
| 2736 | @@ -369,9 +338,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, |
| 2737 | return amd::smi::ErrnoToRsmiStatus(ret); |
| 2738 | } |
| 2739 | |
| 2740 | + if (val_str.empty()) { |
| 2741 | + std::ostringstream ss; |
| 2742 | + ss << __PRETTY_FUNCTION__ |
| 2743 | + << " | ======= end ======= " |
| 2744 | + << " | Fail " |
| 2745 | + << " | Device #: " << dv_ind |
| 2746 | + << " | Type: " << monitorTypesToString.at(type) |
| 2747 | + << " | Cause: SYSFS read was empty" |
| 2748 | + << " | Returning = " |
| 2749 | + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; |
| 2750 | + LOG_INFO(ss); |
| 2751 | + return RSMI_STATUS_UNEXPECTED_DATA; |
| 2752 | + } |
| 2753 | + |
| 2754 | if (!amd::smi::IsInteger(val_str)) { |
| 2755 | - std::cerr << "Expected integer value from monitor," |
| 2756 | - " but got \"" << val_str << "\"" << std::endl; |
| 2757 | + std::ostringstream ss; |
| 2758 | + ss << __PRETTY_FUNCTION__ |
| 2759 | + << " | ======= end ======= " |
| 2760 | + << " | Fail " |
| 2761 | + << " | Device #: " << dv_ind |
| 2762 | + << " | Type: " << monitorTypesToString.at(type) |
| 2763 | + << " | Cause: Expected integer value from monitor, but got "<< val_str |
| 2764 | + << " | Returning = " |
| 2765 | + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; |
| 2766 | + LOG_INFO(ss); |
| 2767 | return RSMI_STATUS_UNEXPECTED_DATA; |
| 2768 | } |
| 2769 | |
| 2770 | @@ -398,9 +389,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, |
| 2771 | return amd::smi::ErrnoToRsmiStatus(ret); |
| 2772 | } |
| 2773 | |
| 2774 | + if (val_str.empty()) { |
| 2775 | + std::ostringstream ss; |
| 2776 | + ss << __PRETTY_FUNCTION__ |
| 2777 | + << " | ======= end ======= " |
| 2778 | + << " | Fail " |
| 2779 | + << " | Device #: " << dv_ind |
| 2780 | + << " | Type: " << monitorTypesToString.at(type) |
| 2781 | + << " | Cause: SYSFS read was empty" |
| 2782 | + << " | Returning = " |
| 2783 | + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; |
| 2784 | + LOG_INFO(ss); |
| 2785 | + return RSMI_STATUS_UNEXPECTED_DATA; |
| 2786 | + } |
| 2787 | + |
| 2788 | if (!amd::smi::IsInteger(val_str)) { |
| 2789 | - std::cerr << "Expected integer value from monitor," |
| 2790 | - " but got \"" << val_str << "\"" << std::endl; |
| 2791 | + std::ostringstream ss; |
| 2792 | + ss << __PRETTY_FUNCTION__ |
| 2793 | + << " | ======= end ======= " |
| 2794 | + << " | Fail " |
| 2795 | + << " | Device #: " << dv_ind |
| 2796 | + << " | Type: " << monitorTypesToString.at(type) |
| 2797 | + << " | Cause: Expected integer value from monitor, but got "<< val_str |
| 2798 | + << " | Returning = " |
| 2799 | + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; |
| 2800 | + LOG_INFO(ss); |
| 2801 | return RSMI_STATUS_UNEXPECTED_DATA; |
| 2802 | } |
| 2803 | |
| 2804 | @@ -735,7 +748,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, |
| 2805 | fs2 >> ec->correctable_err; |
| 2806 | |
| 2807 | ss << __PRETTY_FUNCTION__ << " | ======= end =======" |
| 2808 | - << ", reporting " << amd::smi::getRSMIStatusString(ret);; |
| 2809 | + << ", reporting " << amd::smi::getRSMIStatusString(ret); |
| 2810 | LOG_TRACE(ss); |
| 2811 | return ret; |
| 2812 | CATCH |
| 2813 | @@ -795,11 +808,15 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { |
| 2814 | TRY |
| 2815 | rsmi_status_t ret; |
| 2816 | |
| 2817 | - CHK_SUPPORT_NAME_ONLY(numa_node) |
| 2818 | - |
| 2819 | DEVICE_MUTEX |
| 2820 | + if (!numa_node) { |
| 2821 | + return RSMI_STATUS_INVALID_ARGS; |
| 2822 | + } |
| 2823 | std::string str_val; |
| 2824 | ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); |
| 2825 | + if (ret != RSMI_STATUS_SUCCESS) { |
| 2826 | + return ret; |
| 2827 | + } |
| 2828 | *numa_node = std::stoi(str_val, nullptr); |
| 2829 | |
| 2830 | return ret; |
| 2831 | @@ -846,12 +863,46 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { |
| 2832 | rsmi_status_t ret; |
| 2833 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 2834 | LOG_TRACE(ss); |
| 2835 | + if (id == nullptr) { |
| 2836 | + return RSMI_STATUS_INVALID_ARGS; |
| 2837 | + } |
| 2838 | CHK_SUPPORT_NAME_ONLY(id) |
| 2839 | + // Set the device ID to max value |
| 2840 | + *id = std::numeric_limits<uint16_t>::max(); |
| 2841 | |
| 2842 | + // Get the device ID from KGD |
| 2843 | ret = get_id(dv_ind, amd::smi::kDevDevID, id); |
| 2844 | - ss << __PRETTY_FUNCTION__ << " | ======= end =======" |
| 2845 | - << ", reporting " << amd::smi::getRSMIStatusString(ret); |
| 2846 | LOG_TRACE(ss); |
| 2847 | + ss << __PRETTY_FUNCTION__ |
| 2848 | + << (ret == RSMI_STATUS_SUCCESS ? |
| 2849 | + " | No fall back needed retrieved from KGD" : " | fall back needed") |
| 2850 | + << " | Device #: " << std::to_string(dv_ind) |
| 2851 | + << " | Data: device_id = " << std::to_string(*id) |
| 2852 | + << " | ret = " << getRSMIStatusString(ret, false); |
| 2853 | + LOG_DEBUG(ss); |
| 2854 | + // If the device ID is not supported, use KFD's device ID |
| 2855 | + if (ret != RSMI_STATUS_SUCCESS) { |
| 2856 | + GET_DEV_AND_KFDNODE_FROM_INDX |
| 2857 | + uint32_t node_id; |
| 2858 | + uint64_t kfd_device_id; |
| 2859 | + int ret_kfd = kfd_node->get_node_id(&node_id); |
| 2860 | + ret_kfd = amd::smi::read_node_properties(node_id, "device_id", &kfd_device_id); |
| 2861 | + if (ret_kfd == 0) { |
| 2862 | + *id = static_cast<uint16_t>(kfd_device_id); |
| 2863 | + ret = RSMI_STATUS_SUCCESS; |
| 2864 | + } else { |
| 2865 | + *id = std::numeric_limits<uint16_t>::max(); |
| 2866 | + ret = RSMI_STATUS_NOT_SUPPORTED; |
| 2867 | + } |
| 2868 | + ss << __PRETTY_FUNCTION__ |
| 2869 | + << " | Issue: Could not read device from sysfs, falling back to KFD" << "\n" |
| 2870 | + << " ; Device #: " << std::to_string(dv_ind) << "\n" |
| 2871 | + << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n" |
| 2872 | + << " ; node: " << std::to_string(node_id) << "\n" |
| 2873 | + << " ; Data: device_id (from KFD)= " << std::to_string(*id) << "\n" |
| 2874 | + << " ; ret = " << getRSMIStatusString(ret, false); |
| 2875 | + LOG_DEBUG(ss); |
| 2876 | + } |
| 2877 | return ret; |
| 2878 | } |
| 2879 | |
| 2880 | @@ -862,6 +913,7 @@ rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) { |
| 2881 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 2882 | LOG_TRACE(ss); |
| 2883 | CHK_SUPPORT_NAME_ONLY(id) |
| 2884 | + *id = std::numeric_limits<uint16_t>::max(); |
| 2885 | |
| 2886 | ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id); |
| 2887 | ss << __PRETTY_FUNCTION__ << " | ======= end =======" |
| 2888 | @@ -907,16 +959,54 @@ rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) { |
| 2889 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 2890 | LOG_TRACE(ss); |
| 2891 | CHK_SUPPORT_NAME_ONLY(id) |
| 2892 | - return get_id(dv_ind, amd::smi::kDevSubSysDevID, id); |
| 2893 | + auto ret = get_id(dv_ind, amd::smi::kDevSubSysDevID, id); |
| 2894 | + ss << __PRETTY_FUNCTION__ << " | ======= end =======" |
| 2895 | + << ", reporting " << amd::smi::getRSMIStatusString(ret, false); |
| 2896 | + LOG_INFO(ss); |
| 2897 | + return ret; |
| 2898 | } |
| 2899 | |
| 2900 | rsmi_status_t |
| 2901 | rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) { |
| 2902 | + TRY |
| 2903 | std::ostringstream ss; |
| 2904 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 2905 | LOG_TRACE(ss); |
| 2906 | + if (!id) { |
| 2907 | + return RSMI_STATUS_INVALID_ARGS; |
| 2908 | + } |
| 2909 | CHK_SUPPORT_NAME_ONLY(id) |
| 2910 | - return get_id(dv_ind, amd::smi::kDevVendorID, id); |
| 2911 | + int ret_kfd = 0; |
| 2912 | + uint32_t node_id; |
| 2913 | + rsmi_status_t ret = get_id(dv_ind, amd::smi::kDevVendorID, id); |
| 2914 | + bool need_fallback = false; |
| 2915 | + if (ret != RSMI_STATUS_SUCCESS) { |
| 2916 | + need_fallback = true; |
| 2917 | + } |
| 2918 | + if (ret != RSMI_STATUS_SUCCESS) { |
| 2919 | + GET_DEV_AND_KFDNODE_FROM_INDX |
| 2920 | + uint64_t kfd_vendor_id; |
| 2921 | + ret_kfd = kfd_node->get_node_id(&node_id); |
| 2922 | + ret_kfd = amd::smi::read_node_properties(node_id, "vendor_id", &kfd_vendor_id); |
| 2923 | + if (ret_kfd == 0) { |
| 2924 | + *id = static_cast<uint16_t>(kfd_vendor_id); |
| 2925 | + ret = RSMI_STATUS_SUCCESS; |
| 2926 | + } else { |
| 2927 | + *id = std::numeric_limits<uint16_t>::max(); |
| 2928 | + ret = RSMI_STATUS_NOT_SUPPORTED; |
| 2929 | + } |
| 2930 | + } |
| 2931 | + ss << __PRETTY_FUNCTION__ |
| 2932 | + << (need_fallback ? " | Needed to fallback to use KFD to read vendor_id" : |
| 2933 | + " | Read through SYSFS to read vendor_id") << "\n" |
| 2934 | + << " ; Device #: " << std::to_string(dv_ind) << "\n" |
| 2935 | + << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n" |
| 2936 | + << " ; node: " << std::to_string(node_id) << "\n" |
| 2937 | + << " ; Data: vendor_id: " << std::to_string(*id) << "\n" |
| 2938 | + << " ; ret = " << getRSMIStatusString(ret, false); |
| 2939 | + LOG_INFO(ss); |
| 2940 | + return ret; |
| 2941 | + CATCH |
| 2942 | } |
| 2943 | |
| 2944 | rsmi_status_t |
| 2945 | @@ -936,8 +1026,11 @@ rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) { |
| 2946 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 2947 | LOG_TRACE(ss); |
| 2948 | |
| 2949 | - CHK_SUPPORT_NAME_ONLY(perf) |
| 2950 | DEVICE_MUTEX |
| 2951 | + if (!perf) { |
| 2952 | + return RSMI_STATUS_INVALID_ARGS; |
| 2953 | + } |
| 2954 | + CHK_SUPPORT_NAME_ONLY(perf) |
| 2955 | |
| 2956 | rsmi_status_t ret = get_dev_value_str(amd::smi::kDevPerfLevel, dv_ind, |
| 2957 | &val_str); |
| 2958 | @@ -1006,6 +1099,11 @@ rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { |
| 2959 | CHK_SUPPORT_NAME_ONLY(od) |
| 2960 | DEVICE_MUTEX |
| 2961 | |
| 2962 | + // Bare Metal only feature |
| 2963 | + if (amd::smi::is_vm_guest()) { |
| 2964 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 2965 | + } |
| 2966 | + |
| 2967 | rsmi_status_t ret = get_dev_value_str(amd::smi::kDevOverDriveLevel, dv_ind, |
| 2968 | &val_str); |
| 2969 | if (ret != RSMI_STATUS_SUCCESS) { |
| 2970 | @@ -1075,6 +1173,12 @@ rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) { |
| 2971 | if (od > kMaxOverdriveLevel) { |
| 2972 | return RSMI_STATUS_INVALID_ARGS; |
| 2973 | } |
| 2974 | + |
| 2975 | + // Bare Metal only feature |
| 2976 | + if (amd::smi::is_vm_guest()) { |
| 2977 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 2978 | + } |
| 2979 | + |
| 2980 | DEVICE_MUTEX |
| 2981 | return set_dev_value(amd::smi::kDevOverDriveLevel, dv_ind, od); |
| 2982 | CATCH |
| 2983 | @@ -1116,7 +1220,7 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ |
| 2984 | return RSMI_STATUS_INVALID_ARGS; |
| 2985 | } |
| 2986 | memset(f, 0, sizeof(rsmi_frequencies_t)); |
| 2987 | - f->current=0; |
| 2988 | + f->current = 0; |
| 2989 | |
| 2990 | ret = GetDevValueVec(type, dv_ind, &val_vec); |
| 2991 | if (ret != RSMI_STATUS_SUCCESS) { |
| 2992 | @@ -1284,6 +1388,12 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, |
| 2993 | return RSMI_STATUS_INVALID_ARGS; |
| 2994 | } |
| 2995 | |
| 2996 | + // fill out rsmi_od_volt_freq_data_t p with default max values to indicate no valid data |
| 2997 | + p->curr_sclk_range.lower_bound = UINT64_MAX; |
| 2998 | + p->curr_sclk_range.upper_bound = UINT64_MAX; |
| 2999 | + p->curr_mclk_range.lower_bound = UINT64_MAX; |
| 3000 | + p->curr_mclk_range.upper_bound = UINT64_MAX; |
| 3001 | + |
| 3002 | ret = GetDevValueVec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec); |
| 3003 | if (ret != RSMI_STATUS_SUCCESS) { |
| 3004 | return ret; |
| 3005 | @@ -1311,13 +1421,6 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, |
| 3006 | .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN) |
| 3007 | .structure_content(); |
| 3008 | |
| 3009 | - // |
| 3010 | - // Note: We must have minimum of 'GFXCLK:' && 'MCLK:' OR: |
| 3011 | - // 'OD_SCLK:' && 'OD_MCLK:' tags. |
| 3012 | - if (txt_power_dev_od_voltage.get_title_size() < kMIN_VALID_LINES) { |
| 3013 | - return rsmi_status_t::RSMI_STATUS_NO_DATA; |
| 3014 | - } |
| 3015 | - |
| 3016 | // Note: For debug builds/purposes only. |
| 3017 | assert(txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) || |
| 3018 | txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)); |
| 3019 | @@ -1338,47 +1441,60 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, |
| 3020 | return std::vector<std::string>{upper_bound_data}; |
| 3021 | }; |
| 3022 | |
| 3023 | - // Validates 'OD_SCLK' is in the structure |
| 3024 | - if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK, |
| 3025 | + // track the number of keys found, if this goes down to 0 then that means that there is no valid data |
| 3026 | + const uint8_t kNumStructuredKeysToCheck = 6; |
| 3027 | + uint8_t structured_key_counter = kNumStructuredKeysToCheck; |
| 3028 | + // Validates 'OD_SCLK' is in the structure |
| 3029 | + if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK, |
| 3030 | KTAG_FIRST_FREQ_IDX)) { |
| 3031 | p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); |
| 3032 | p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); |
| 3033 | - |
| 3034 | + } |
| 3035 | + else |
| 3036 | + structured_key_counter--; |
| 3037 | // Validates 'OD_MCLK' is in the structure |
| 3038 | - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK, |
| 3039 | - KTAG_FIRST_FREQ_IDX)) { |
| 3040 | - p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); |
| 3041 | - p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); |
| 3042 | - } |
| 3043 | + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK, |
| 3044 | + KTAG_FIRST_FREQ_IDX)) { |
| 3045 | + p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); |
| 3046 | + p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); |
| 3047 | + } |
| 3048 | + else |
| 3049 | + structured_key_counter--; |
| 3050 | |
| 3051 | - // Validates 'OD_RANGE' is in the structure |
| 3052 | - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, |
| 3053 | - KTAG_SCLK)) { |
| 3054 | - od_value_pair_str_to_range(txt_power_dev_od_voltage |
| 3055 | - .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_SCLK), |
| 3056 | - &p->sclk_freq_limits); |
| 3057 | - } |
| 3058 | - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, |
| 3059 | - KTAG_MCLK)) { |
| 3060 | - od_value_pair_str_to_range(txt_power_dev_od_voltage |
| 3061 | - .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_MCLK), |
| 3062 | - &p->mclk_freq_limits); |
| 3063 | - } |
| 3064 | - } |
| 3065 | - // Validates 'GFXCLK' is in the structure |
| 3066 | - else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, |
| 3067 | - KTAG_FIRST_FREQ_IDX)) { |
| 3068 | - p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0); |
| 3069 | - p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0); |
| 3070 | - |
| 3071 | - // Validates 'MCLK' is in the structure |
| 3072 | - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK, |
| 3073 | - KTAG_FIRST_FREQ_IDX)) { |
| 3074 | - p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0); |
| 3075 | - p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0); |
| 3076 | - } |
| 3077 | - } |
| 3078 | - else { |
| 3079 | + // Validates 'OD_RANGE' is in the structure |
| 3080 | + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, |
| 3081 | + KTAG_SCLK)) { |
| 3082 | + od_value_pair_str_to_range(txt_power_dev_od_voltage |
| 3083 | + .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_SCLK), |
| 3084 | + &p->sclk_freq_limits); |
| 3085 | + } |
| 3086 | + else |
| 3087 | + structured_key_counter--; |
| 3088 | + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, |
| 3089 | + KTAG_MCLK)) { |
| 3090 | + od_value_pair_str_to_range(txt_power_dev_od_voltage |
| 3091 | + .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_MCLK), |
| 3092 | + &p->mclk_freq_limits); |
| 3093 | + } |
| 3094 | + else |
| 3095 | + structured_key_counter--; |
| 3096 | + // Validates 'GFXCLK' is in the structure |
| 3097 | + if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, |
| 3098 | + KTAG_FIRST_FREQ_IDX)) { |
| 3099 | + p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0); |
| 3100 | + p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0); |
| 3101 | + } |
| 3102 | + else |
| 3103 | + structured_key_counter--; |
| 3104 | + // Validates 'MCLK' is in the structure |
| 3105 | + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK, |
| 3106 | + KTAG_FIRST_FREQ_IDX)) { |
| 3107 | + p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0); |
| 3108 | + p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0); |
| 3109 | + } |
| 3110 | + else |
| 3111 | + structured_key_counter--; |
| 3112 | + if (structured_key_counter == 0) { |
| 3113 | return RSMI_STATUS_NOT_YET_IMPLEMENTED; |
| 3114 | } |
| 3115 | |
| 3116 | @@ -1450,7 +1566,20 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, |
| 3117 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 3118 | LOG_TRACE(ss); |
| 3119 | |
| 3120 | - assert(minclkvalue < maxclkvalue); |
| 3121 | + if (minclkvalue >= maxclkvalue) { |
| 3122 | + return RSMI_STATUS_INVALID_ARGS; |
| 3123 | + } |
| 3124 | + |
| 3125 | + // Bare Metal only feature |
| 3126 | + if (amd::smi::is_vm_guest()) { |
| 3127 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3128 | + } |
| 3129 | + |
| 3130 | + // Can only set the clock type for sys and mem type |
| 3131 | + if (clkType != RSMI_CLK_TYPE_SYS && clkType != RSMI_CLK_TYPE_MEM) { |
| 3132 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3133 | + } |
| 3134 | + |
| 3135 | std::string min_sysvalue; |
| 3136 | std::string max_sysvalue; |
| 3137 | std::map<rsmi_clk_type_t, std::string> clk_char_map = { |
| 3138 | @@ -1848,6 +1977,11 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, |
| 3139 | return RSMI_STATUS_INVALID_ARGS; |
| 3140 | } |
| 3141 | |
| 3142 | + // Bare Metal only feature |
| 3143 | + if (amd::smi::is_vm_guest()) { |
| 3144 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3145 | + } |
| 3146 | + |
| 3147 | ret = rsmi_dev_gpu_clk_freq_get(dv_ind, clk_type, &freqs); |
| 3148 | |
| 3149 | if (ret != RSMI_STATUS_SUCCESS) { |
| 3150 | @@ -1893,7 +2027,7 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, |
| 3151 | // will have read-only perms, and the OS will deny access, before the request hits the driver level |
| 3152 | if (status == RSMI_STATUS_PERMISSION){ |
| 3153 | bool read_only = false; |
| 3154 | - int perms = amd::smi::isReadOnlyForAll(dev->path(), &read_only); |
| 3155 | + amd::smi::isReadOnlyForAll(dev->path(), &read_only); |
| 3156 | if(read_only){ |
| 3157 | return RSMI_STATUS_NOT_SUPPORTED; |
| 3158 | } |
| 3159 | @@ -1903,6 +2037,7 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, |
| 3160 | |
| 3161 | CATCH |
| 3162 | } |
| 3163 | + |
| 3164 | static std::vector<std::string> pci_name_files = { |
| 3165 | "/usr/share/misc/pci.ids", |
| 3166 | "/usr/share/hwdata/pci.ids", |
| 3167 | @@ -2184,17 +2319,17 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) { |
| 3168 | std::ostringstream ss; |
| 3169 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 3170 | LOG_TRACE(ss); |
| 3171 | - CHK_SUPPORT_NAME_ONLY(name) |
| 3172 | |
| 3173 | - if (len == 0) { |
| 3174 | + if (len == 0 || !name) { |
| 3175 | return RSMI_STATUS_INVALID_ARGS; |
| 3176 | } |
| 3177 | + CHK_SUPPORT_NAME_ONLY(name) |
| 3178 | |
| 3179 | DEVICE_MUTEX |
| 3180 | |
| 3181 | ret = get_dev_name_from_file(dv_ind, name, len); |
| 3182 | |
| 3183 | - if (ret || name[0] == '\0' || !isprint(name[0]) ) { |
| 3184 | + if (ret || name[0] == '\0' || !isprint(name[0])) { |
| 3185 | ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_DEVICE); |
| 3186 | } |
| 3187 | |
| 3188 | @@ -2327,12 +2462,12 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) { |
| 3189 | std::ostringstream ss; |
| 3190 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 3191 | LOG_TRACE(ss); |
| 3192 | + if (name == nullptr || len == 0) { |
| 3193 | + return RSMI_STATUS_INVALID_ARGS; |
| 3194 | + } |
| 3195 | CHK_SUPPORT_NAME_ONLY(name) |
| 3196 | |
| 3197 | assert(len > 0); |
| 3198 | - if (len == 0) { |
| 3199 | - return RSMI_STATUS_INVALID_ARGS; |
| 3200 | - } |
| 3201 | |
| 3202 | DEVICE_MUTEX |
| 3203 | ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_VENDOR); |
| 3204 | @@ -2470,25 +2605,25 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { |
| 3205 | return ret; |
| 3206 | } |
| 3207 | |
| 3208 | - // Hardcode based on PCIe specification: https://en.wikipedia.org/wiki/PCI_Express |
| 3209 | + // Hardcode based on PCIe specification: search PCI_Express on wikipedia |
| 3210 | const uint32_t link_width[] = {1, 2, 4, 8, 12, 16}; |
| 3211 | const uint32_t link_speed[] = {25, 50, 80, 160}; // 0.1 Ghz |
| 3212 | const uint32_t WIDTH_DATA_LENGTH = sizeof(link_width)/sizeof(uint32_t); |
| 3213 | const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t); |
| 3214 | |
| 3215 | // Calculate the index |
| 3216 | - uint32_t width_index = -1; |
| 3217 | - uint32_t speed_index = -1; |
| 3218 | + int32_t width_index = -1; |
| 3219 | + int32_t speed_index = -1; |
| 3220 | uint32_t cur_index = 0; |
| 3221 | for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) { |
| 3222 | if (link_width[cur_index] == gpu_metrics.pcie_link_width) { |
| 3223 | - width_index = cur_index; |
| 3224 | + width_index = static_cast<int32_t>(cur_index); |
| 3225 | break; |
| 3226 | } |
| 3227 | } |
| 3228 | for (cur_index = 0; cur_index < SPEED_DATA_LENGTH; cur_index++) { |
| 3229 | if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) { |
| 3230 | - speed_index = cur_index; |
| 3231 | + speed_index = static_cast<int32_t>(cur_index); |
| 3232 | break; |
| 3233 | } |
| 3234 | } |
| 3235 | @@ -2497,7 +2632,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { |
| 3236 | } |
| 3237 | // Set possible lanes and frequencies |
| 3238 | b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; |
| 3239 | - b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index; |
| 3240 | + b->transfer_rate.current = static_cast<uint32_t>(speed_index)*WIDTH_DATA_LENGTH + static_cast<uint32_t>(width_index); |
| 3241 | for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { |
| 3242 | b->transfer_rate.frequency[cur_index] = |
| 3243 | static_cast<long>(link_speed[cur_index/WIDTH_DATA_LENGTH]) * 100 * 1000000L; |
| 3244 | @@ -2530,6 +2665,10 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { |
| 3245 | LOG_TRACE(ss); |
| 3246 | REQUIRE_ROOT_ACCESS |
| 3247 | DEVICE_MUTEX |
| 3248 | + // Bare Metal only feature |
| 3249 | + if (amd::smi::is_vm_guest()) { |
| 3250 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3251 | + } |
| 3252 | ret = rsmi_dev_pci_bandwidth_get(dv_ind, &bws); |
| 3253 | |
| 3254 | if (ret != RSMI_STATUS_SUCCESS) { |
| 3255 | @@ -2557,7 +2696,10 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { |
| 3256 | |
| 3257 | int32_t ret_i; |
| 3258 | ret_i = dev->writeDevInfo(amd::smi::kDevPCIEClk, freq_enable_str); |
| 3259 | - |
| 3260 | + // |
| 3261 | + // NOTE: kDevPCIEClk sysfs file maybe not exist for all cases. |
| 3262 | + // If it doesn't exist (pp_dpm_pcie), it shouldn't be an error |
| 3263 | + // and will get translated to RSMI_STATUS_NOT_SUPPORTED. |
| 3264 | return amd::smi::ErrnoToRsmiStatus(ret_i); |
| 3265 | |
| 3266 | CATCH |
| 3267 | @@ -2598,6 +2740,10 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, |
| 3268 | fs_rng >> *max_pkt_sz; |
| 3269 | } |
| 3270 | |
| 3271 | + if ((sent && *sent == UINT64_MAX) || (received && *received == UINT64_MAX)){ |
| 3272 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3273 | + } |
| 3274 | + |
| 3275 | return RSMI_STATUS_SUCCESS; |
| 3276 | CATCH |
| 3277 | } |
| 3278 | @@ -2908,6 +3054,11 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) { |
| 3279 | REQUIRE_ROOT_ACCESS |
| 3280 | DEVICE_MUTEX |
| 3281 | |
| 3282 | + // Bare Metal only feature |
| 3283 | + if (amd::smi::is_vm_guest()) { |
| 3284 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3285 | + } |
| 3286 | + |
| 3287 | ret = rsmi_dev_fan_speed_max_get(dv_ind, sensor_ind, &max_speed); |
| 3288 | |
| 3289 | if (ret != RSMI_STATUS_SUCCESS) { |
| 3290 | @@ -2974,13 +3125,17 @@ rsmi_dev_gpu_reset(uint32_t dv_ind) { |
| 3291 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 3292 | LOG_TRACE(ss); |
| 3293 | REQUIRE_ROOT_ACCESS |
| 3294 | - DEVICE_MUTEX |
| 3295 | + // No longer using DEVICE_MUTEX as it blocks long running processes |
| 3296 | + // DEVICE_MUTEX |
| 3297 | |
| 3298 | rsmi_status_t ret; |
| 3299 | uint64_t status_code = 0; |
| 3300 | |
| 3301 | // Read amdgpu_gpu_recover to reset it |
| 3302 | ret = get_dev_value_int(amd::smi::kDevGpuReset, dv_ind, &status_code); |
| 3303 | + ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " |
| 3304 | + << getRSMIStatusString(ret, false); |
| 3305 | + LOG_INFO(ss); |
| 3306 | return ret; |
| 3307 | |
| 3308 | CATCH |
| 3309 | @@ -3235,6 +3390,9 @@ rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap) { |
| 3310 | LOG_TRACE(ss); |
| 3311 | |
| 3312 | ++sensor_ind; // power sysfs files have 1-based indices |
| 3313 | + if (!cap) { |
| 3314 | + return RSMI_STATUS_INVALID_ARGS; |
| 3315 | + } |
| 3316 | CHK_SUPPORT_SUBVAR_ONLY(cap, sensor_ind) |
| 3317 | |
| 3318 | rsmi_status_t ret; |
| 3319 | @@ -3255,6 +3413,9 @@ rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind, |
| 3320 | LOG_TRACE(ss); |
| 3321 | |
| 3322 | ++sensor_ind; // power sysfs files have 1-based indices |
| 3323 | + if (max == nullptr || min == nullptr) { |
| 3324 | + return RSMI_STATUS_INVALID_ARGS; |
| 3325 | + } |
| 3326 | CHK_SUPPORT_SUBVAR_ONLY((min == nullptr || max == nullptr ?nullptr : min), |
| 3327 | sensor_ind) |
| 3328 | rsmi_status_t ret; |
| 3329 | @@ -3283,6 +3444,11 @@ rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) { |
| 3330 | REQUIRE_ROOT_ACCESS |
| 3331 | DEVICE_MUTEX |
| 3332 | |
| 3333 | + // Bare Metal only feature |
| 3334 | + if (amd::smi::is_vm_guest()) { |
| 3335 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3336 | + } |
| 3337 | + |
| 3338 | ret = rsmi_dev_power_cap_range_get(dv_ind, sensor_ind, &max, &min); |
| 3339 | if (ret != RSMI_STATUS_SUCCESS) { |
| 3340 | return ret; |
| 3341 | @@ -3332,6 +3498,10 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy, |
| 3342 | |
| 3343 | (void)dummy; |
| 3344 | DEVICE_MUTEX |
| 3345 | + // Bare Metal only feature |
| 3346 | + if (amd::smi::is_vm_guest()) { |
| 3347 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3348 | + } |
| 3349 | rsmi_status_t ret = set_power_profile(dv_ind, profile); |
| 3350 | |
| 3351 | return ret; |
| 3352 | @@ -3369,6 +3539,8 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, |
| 3353 | } |
| 3354 | |
| 3355 | DEVICE_MUTEX |
| 3356 | + *total = 0; // Initialize total to 0 |
| 3357 | + // This is needed to avoid returning garbage value in case of failure |
| 3358 | ret = get_dev_value_int(mem_type_file, dv_ind, total); |
| 3359 | |
| 3360 | // Fallback to KFD reported memory if VRAM total is 0 |
| 3361 | @@ -3396,6 +3568,7 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, |
| 3362 | return ret; |
| 3363 | CATCH |
| 3364 | } |
| 3365 | + |
| 3366 | rsmi_status_t |
| 3367 | rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, |
| 3368 | uint64_t *used) { |
| 3369 | @@ -3427,6 +3600,8 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, |
| 3370 | } |
| 3371 | |
| 3372 | DEVICE_MUTEX |
| 3373 | + *used = 0; // Initialize used to 0 |
| 3374 | + // This is needed to avoid returning garbage value in case of failure |
| 3375 | ret = get_dev_value_int(mem_type_file, dv_ind, used); |
| 3376 | |
| 3377 | // Fallback to KFD reported memory if no VRAM |
| 3378 | @@ -3613,6 +3788,19 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) { |
| 3379 | "the call from completing successfully"; |
| 3380 | break; |
| 3381 | |
| 3382 | + case RSMI_STATUS_DRM_ERROR: |
| 3383 | + *status_string = "RSMI_STATUS_DRM_ERROR: An error occurred when calling " |
| 3384 | + "libdrm"; |
| 3385 | + break; |
| 3386 | + case RSMI_STATUS_FAIL_LOAD_MODULE: |
| 3387 | + *status_string = "RSMI_STATUS_FAIL_LOAD_MODULE: Failed to load the " |
| 3388 | + "required module"; |
| 3389 | + break; |
| 3390 | + case RSMI_STATUS_FAIL_LOAD_SYMBOL: |
| 3391 | + *status_string = "RSMI_STATUS_FAIL_LOAD_SYMBOL: Failed to load the " |
| 3392 | + "required symbol"; |
| 3393 | + break; |
| 3394 | + |
| 3395 | default: |
| 3396 | *status_string = "RSMI_STATUS_UNKNOWN_ERROR: An unknown error occurred"; |
| 3397 | return RSMI_STATUS_UNKNOWN_ERROR; |
| 3398 | @@ -3964,10 +4152,8 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) { |
| 3399 | ss << __PRETTY_FUNCTION__ << "| ======= start ======="; |
| 3400 | LOG_TRACE(ss); |
| 3401 | |
| 3402 | - CHK_SUPPORT_NAME_ONLY(unique_id) |
| 3403 | - |
| 3404 | DEVICE_MUTEX |
| 3405 | - if (unique_id == nullptr) { |
| 3406 | + if (!unique_id) { |
| 3407 | return RSMI_STATUS_INVALID_ARGS; |
| 3408 | } |
| 3409 | *unique_id = std::numeric_limits<uint64_t>::max(); |
| 3410 | @@ -4134,14 +4320,17 @@ rsmi_counter_available_counters_get(uint32_t dv_ind, |
| 3411 | TRY |
| 3412 | CHK_SUPPORT_VAR(available, grp) |
| 3413 | DEVICE_MUTEX |
| 3414 | - uint64_t val; |
| 3415 | + uint64_t val = 0; |
| 3416 | |
| 3417 | switch (grp) { |
| 3418 | case RSMI_EVNT_GRP_XGMI: |
| 3419 | case RSMI_EVNT_GRP_XGMI_DATA_OUT: |
| 3420 | |
| 3421 | ret = get_dev_value_int(amd::smi::kDevDFCountersAvailable, dv_ind, &val); |
| 3422 | - assert(val < UINT32_MAX); |
| 3423 | + if (ret != RSMI_STATUS_SUCCESS) |
| 3424 | + return ret; |
| 3425 | + if (val == UINT32_MAX) |
| 3426 | + return RSMI_STATUS_NOT_SUPPORTED; |
| 3427 | *available = static_cast<uint32_t>(val); |
| 3428 | break; |
| 3429 | |
| 3430 | @@ -5009,6 +5198,61 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, |
| 3431 | CATCH |
| 3432 | } |
| 3433 | |
| 3434 | +rsmi_status_t rsmi_dev_compute_partition_capabilities_get( |
| 3435 | + uint32_t dv_ind, char *compute_partition_caps, uint32_t len) { |
| 3436 | + TRY |
| 3437 | + std::ostringstream ss; |
| 3438 | + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; |
| 3439 | + LOG_TRACE(ss); |
| 3440 | + DEVICE_MUTEX |
| 3441 | + std::string availableComputePartitions; |
| 3442 | + rsmi_status_t ret = |
| 3443 | + get_dev_value_line(amd::smi::kDevAvailableComputePartition, |
| 3444 | + dv_ind, &availableComputePartitions); |
| 3445 | + if (ret != RSMI_STATUS_SUCCESS) { |
| 3446 | + ss << __PRETTY_FUNCTION__ |
| 3447 | + << " | ======= end ======= " |
| 3448 | + << " | FAIL " |
| 3449 | + << " | Device #: " << dv_ind |
| 3450 | + << " | Type: " |
| 3451 | + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) |
| 3452 | + << " | Data: could not retrieve requested data" |
| 3453 | + << " | Returning = " |
| 3454 | + << getRSMIStatusString(ret) << " |"; |
| 3455 | + LOG_ERROR(ss); |
| 3456 | + return ret; |
| 3457 | + } |
| 3458 | + |
| 3459 | + std::size_t length = availableComputePartitions.copy(compute_partition_caps, len-1); |
| 3460 | + compute_partition_caps[length]='\0'; |
| 3461 | + |
| 3462 | + if (len < (availableComputePartitions.size() + 1)) { |
| 3463 | + ss << __PRETTY_FUNCTION__ |
| 3464 | + << " | ======= end ======= " |
| 3465 | + << " | Fail " |
| 3466 | + << " | Device #: " << dv_ind |
| 3467 | + << " | Type: " |
| 3468 | + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) |
| 3469 | + << " | Cause: requested size was insufficient" |
| 3470 | + << " | Returning = " |
| 3471 | + << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |"; |
| 3472 | + LOG_ERROR(ss); |
| 3473 | + return RSMI_STATUS_INSUFFICIENT_SIZE; |
| 3474 | + } |
| 3475 | + ss << __PRETTY_FUNCTION__ |
| 3476 | + << " | ======= end ======= " |
| 3477 | + << " | Success " |
| 3478 | + << " | Device #: " << dv_ind |
| 3479 | + << " | Type: " |
| 3480 | + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) |
| 3481 | + << " | Data: " << compute_partition_caps |
| 3482 | + << " | Returning = " |
| 3483 | + << getRSMIStatusString(ret) << " |"; |
| 3484 | + LOG_TRACE(ss); |
| 3485 | + return ret; |
| 3486 | + CATCH |
| 3487 | +} |
| 3488 | + |
| 3489 | static rsmi_status_t get_memory_partition(uint32_t dv_ind, |
| 3490 | std::string &memory_partition) { |
| 3491 | TRY |
| 3492 | @@ -5054,10 +5298,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, |
| 3493 | REQUIRE_ROOT_ACCESS |
| 3494 | DEVICE_MUTEX |
| 3495 | const int k1000_MS_WAIT = 1000; |
| 3496 | - const uint32_t kMaxBoardLength = 128; |
| 3497 | - bool isCorrectDevice = false; |
| 3498 | - char boardName[kMaxBoardLength]; |
| 3499 | - boardName[0] = '\0'; |
| 3500 | |
| 3501 | const uint32_t kMaxMemoryCapabilitiesSize = 30; |
| 3502 | char available_memory_capabilities[kMaxMemoryCapabilitiesSize]; |
| 3503 | @@ -5574,16 +5814,16 @@ rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id) { |
| 3504 | << " | Device #: " << dv_ind; |
| 3505 | LOG_TRACE(ss); |
| 3506 | GET_DEV_AND_KFDNODE_FROM_INDX |
| 3507 | - uint32_t kgd_node_id = std::numeric_limits<uint32_t>::max(); |
| 3508 | + uint32_t kfd_node_id = std::numeric_limits<uint32_t>::max(); |
| 3509 | rsmi_status_t resp = RSMI_STATUS_NOT_SUPPORTED; |
| 3510 | - int ret = kfd_node->KFDNode::get_node_id(&kgd_node_id); |
| 3511 | + int ret = kfd_node->KFDNode::get_node_id(&kfd_node_id); |
| 3512 | resp = amd::smi::ErrnoToRsmiStatus(ret); |
| 3513 | |
| 3514 | if (node_id == nullptr) { |
| 3515 | resp = RSMI_STATUS_INVALID_ARGS; |
| 3516 | } else { |
| 3517 | - *node_id = kgd_node_id; |
| 3518 | - if (kgd_node_id == std::numeric_limits<uint32_t>::max()) { |
| 3519 | + *node_id = kfd_node_id; |
| 3520 | + if (kfd_node_id == std::numeric_limits<uint32_t>::max()) { |
| 3521 | resp = RSMI_STATUS_NOT_SUPPORTED; |
| 3522 | } |
| 3523 | } |
| 3524 | @@ -5987,7 +6227,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3525 | |
| 3526 | uint32_t event; |
| 3527 | char event_in[MAX_EVENT_NOTIFICATION_MSG_SIZE]; |
| 3528 | - memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3529 | + memset(event_in, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3530 | while (fgets(event_in, MAX_EVENT_NOTIFICATION_MSG_SIZE, anon_fp)) { |
| 3531 | /* Output is in format as "event_number message_information\n" |
| 3532 | * Both event are expressed in hex. |
| 3533 | @@ -6000,20 +6240,20 @@ rsmi_event_notification_get(int timeout_ms, |
| 3534 | // parse message based on event received |
| 3535 | switch (event){ |
| 3536 | case RSMI_EVT_NOTIF_NONE: |
| 3537 | - strcpy(reinterpret_cast<char *>(&data_item->message), "Event type None received"); |
| 3538 | + strncpy(reinterpret_cast<char *>(&data_item->message), "Event type None received", MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3539 | break; |
| 3540 | case RSMI_EVT_NOTIF_VMFAULT: |
| 3541 | { |
| 3542 | uint32_t pid; |
| 3543 | char task_name[MAX_EVENT_NOTIFICATION_MSG_SIZE]; |
| 3544 | - memcpy(reinterpret_cast<char *>(task_name), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3545 | + memset(task_name, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3546 | |
| 3547 | sscanf(message, "%x:%s\n", &pid, task_name); |
| 3548 | std::stringstream final_message; |
| 3549 | - final_message << "pid: " << std::to_string(pid).c_str() |
| 3550 | + final_message << "PID: " << std::to_string(pid).c_str() |
| 3551 | << " task name: " << task_name; |
| 3552 | |
| 3553 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3554 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3555 | } |
| 3556 | break; |
| 3557 | case RSMI_EVT_NOTIF_THERMAL_THROTTLE: |
| 3558 | @@ -6021,37 +6261,38 @@ rsmi_event_notification_get(int timeout_ms, |
| 3559 | uint64_t bitmask; |
| 3560 | uint64_t counter; |
| 3561 | |
| 3562 | - sscanf(message, "%llx:%llx\n", &bitmask, &counter); |
| 3563 | + sscanf(message, "%" PRIx64 ":%" PRIx64 "\n", &bitmask, &counter); |
| 3564 | std::stringstream final_message; |
| 3565 | final_message << "bitmask: 0x" << std::hex << bitmask |
| 3566 | << " counter: 0x" << std::hex << counter; |
| 3567 | |
| 3568 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3569 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3570 | } |
| 3571 | break; |
| 3572 | case RSMI_EVT_NOTIF_GPU_PRE_RESET: |
| 3573 | { |
| 3574 | uint32_t reset_seq_num; |
| 3575 | char reset_cause[MAX_EVENT_NOTIFICATION_MSG_SIZE]; |
| 3576 | - memcpy(reinterpret_cast<char *>(reset_cause), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3577 | + memset(reset_cause, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3578 | |
| 3579 | sscanf(message, "%x %[^\n]\n", &reset_seq_num, reset_cause); |
| 3580 | std::stringstream final_message; |
| 3581 | final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str() |
| 3582 | << " reset cause: " << reset_cause; |
| 3583 | |
| 3584 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3585 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3586 | } |
| 3587 | break; |
| 3588 | case RSMI_EVT_NOTIF_GPU_POST_RESET: |
| 3589 | { |
| 3590 | uint32_t reset_seq_num; |
| 3591 | |
| 3592 | - sscanf(message, "%x %[^\n]\n", &reset_seq_num); |
| 3593 | + char tmp[MAX_EVENT_NOTIFICATION_MSG_SIZE]; |
| 3594 | + sscanf(message, "%x %[^\n]\n", &reset_seq_num, tmp); |
| 3595 | std::stringstream final_message; |
| 3596 | - final_message << " reset sequence number: " << std::to_string(reset_seq_num).c_str(); |
| 3597 | + final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str(); |
| 3598 | |
| 3599 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3600 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3601 | } |
| 3602 | break; |
| 3603 | case RSMI_EVT_NOTIF_EVENT_MIGRATE_START: |
| 3604 | @@ -6060,15 +6301,15 @@ rsmi_event_notification_get(int timeout_ms, |
| 3605 | int32_t pid; |
| 3606 | uint32_t start; |
| 3607 | uint32_t size; |
| 3608 | - uint16_t from; |
| 3609 | - uint16_t to; |
| 3610 | - uint16_t prefetch_loc; |
| 3611 | - uint16_t preferred_loc; |
| 3612 | + uint32_t from; |
| 3613 | + uint32_t to; |
| 3614 | + uint32_t prefetch_loc; |
| 3615 | + uint32_t preferred_loc; |
| 3616 | int32_t migrate_trigger; |
| 3617 | |
| 3618 | - sscanf(message, "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", &ns, &pid, &start, &size, &from, &to, &prefetch_loc, &preferred_loc, &migrate_trigger); |
| 3619 | + sscanf(message, "%" PRId64 " -%d @%" PRIu32 "(%" PRIu32 ") %x->%x %x:%x %d\n", &ns, &pid, &start, &size, &from, &to, &prefetch_loc, &preferred_loc, &migrate_trigger); |
| 3620 | std::stringstream final_message; |
| 3621 | - final_message << "ns: " << std::to_string(ns).c_str() |
| 3622 | + final_message << "nd: " << std::to_string(ns).c_str() |
| 3623 | << " pid: " << std::to_string(pid).c_str() |
| 3624 | << " start: 0x" << std::hex << start |
| 3625 | << " size: 0x" << std::hex << size |
| 3626 | @@ -6078,7 +6319,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3627 | << " preferred_loc: 0x" << std::hex << preferred_loc |
| 3628 | << " migrate_trigger: " << std::to_string(migrate_trigger).c_str(); |
| 3629 | |
| 3630 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3631 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3632 | } |
| 3633 | break; |
| 3634 | case RSMI_EVT_NOTIF_EVENT_MIGRATE_END: |
| 3635 | @@ -6092,9 +6333,9 @@ rsmi_event_notification_get(int timeout_ms, |
| 3636 | uint32_t migrate_trigger; |
| 3637 | uint32_t error_code; |
| 3638 | |
| 3639 | - sscanf(message, "%lld -%d @%lx(%lx) %x->%x %d %d\n", &ns, &pid, &start, &size, &from, &to, &migrate_trigger, &error_code); |
| 3640 | + sscanf(message, "%" PRId64 " -%d @%" PRIu32 "(%" PRIu32 ") %x->%x %d %d\n", &ns, &pid, &start, &size, &from, &to, &migrate_trigger, &error_code); |
| 3641 | std::stringstream final_message; |
| 3642 | - final_message << "ns: " << std::to_string(ns).c_str() |
| 3643 | + final_message << "nd: " << std::to_string(ns).c_str() |
| 3644 | << " pid: " << std::to_string(pid).c_str() |
| 3645 | << " start: 0x" << std::hex << start |
| 3646 | << " size: 0x" << std::hex << size |
| 3647 | @@ -6103,7 +6344,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3648 | << " migrate_trigger: " << std::to_string(migrate_trigger).c_str() |
| 3649 | << " error_code: " << std::to_string(error_code).c_str(); |
| 3650 | |
| 3651 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3652 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3653 | } |
| 3654 | break; |
| 3655 | case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START: |
| 3656 | @@ -6112,9 +6353,9 @@ rsmi_event_notification_get(int timeout_ms, |
| 3657 | int32_t pid; |
| 3658 | uint32_t addr; |
| 3659 | uint32_t node; |
| 3660 | - char *rw; |
| 3661 | + char *rw = "\0"; |
| 3662 | |
| 3663 | - sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, rw); |
| 3664 | + sscanf(message, "%" PRId64 " -%d @%" PRIx32 "(%x) %c\n", &ns, &pid, &addr, &node, rw); |
| 3665 | std::stringstream final_message; |
| 3666 | final_message << "ns: " << std::to_string(ns).c_str() |
| 3667 | << " pid: " << std::to_string(pid).c_str() |
| 3668 | @@ -6122,7 +6363,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3669 | << " node: 0x" << std::hex << node |
| 3670 | << " rw: " << rw; |
| 3671 | |
| 3672 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3673 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3674 | } |
| 3675 | break; |
| 3676 | case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END: |
| 3677 | @@ -6131,9 +6372,9 @@ rsmi_event_notification_get(int timeout_ms, |
| 3678 | int32_t pid; |
| 3679 | uint32_t addr; |
| 3680 | uint32_t node; |
| 3681 | - char *migrate_update; |
| 3682 | + char *migrate_update = "\0"; |
| 3683 | |
| 3684 | - sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, migrate_update); |
| 3685 | + sscanf(message, "%" PRId64 " -%d @%" PRIx32 "(%x) %c\n", &ns, &pid, &addr, &node, migrate_update); |
| 3686 | std::stringstream final_message; |
| 3687 | final_message << "ns: " << std::to_string(ns).c_str() |
| 3688 | << " pid: " << std::to_string(pid).c_str() |
| 3689 | @@ -6141,7 +6382,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3690 | << " node: 0x" << std::hex << node |
| 3691 | << " migrate_udpate: " << migrate_update; |
| 3692 | |
| 3693 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3694 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3695 | } |
| 3696 | break; |
| 3697 | case RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION: |
| 3698 | @@ -6151,14 +6392,14 @@ rsmi_event_notification_get(int timeout_ms, |
| 3699 | uint32_t node; |
| 3700 | uint32_t evict_trigger; |
| 3701 | |
| 3702 | - sscanf(message, "%lld -%d %x %d\n", &ns, &pid, &node, &evict_trigger); |
| 3703 | + sscanf(message, "%" PRId64 "-%d %x %d\n", &ns, &pid, &node, &evict_trigger); |
| 3704 | std::stringstream final_message; |
| 3705 | final_message << "ns: " << std::to_string(ns).c_str() |
| 3706 | << " pid: " << std::to_string(pid).c_str() |
| 3707 | << " node: 0x" << std::hex << node |
| 3708 | << " evict_trigger: " << std::to_string(evict_trigger).c_str(); |
| 3709 | |
| 3710 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3711 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3712 | } |
| 3713 | break; |
| 3714 | case RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE: |
| 3715 | @@ -6166,16 +6407,16 @@ rsmi_event_notification_get(int timeout_ms, |
| 3716 | int64_t ns; |
| 3717 | int32_t pid; |
| 3718 | uint32_t node; |
| 3719 | - char *rescheduled; |
| 3720 | + char *rescheduled = "\0"; |
| 3721 | |
| 3722 | - sscanf(message, "%lld -%d %x %c\n", &ns, &pid, &node, rescheduled); |
| 3723 | + sscanf(message, "%" PRId64 "-%d %x %c\n", &ns, &pid, &node, rescheduled); |
| 3724 | std::stringstream final_message; |
| 3725 | final_message << "ns: " << std::to_string(ns).c_str() |
| 3726 | << " pid: " << std::to_string(pid).c_str() |
| 3727 | << " node: 0x" << std::hex << node |
| 3728 | << " rescheduled: " << rescheduled; |
| 3729 | |
| 3730 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3731 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3732 | } |
| 3733 | break; |
| 3734 | case RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU: |
| 3735 | @@ -6187,7 +6428,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3736 | uint32_t node; |
| 3737 | uint32_t unmap_trigger; |
| 3738 | |
| 3739 | - sscanf(message, "%lld -%d @%lx(%lx) %x %d\n", &ns, &pid, &addr, &size, &node, &unmap_trigger); |
| 3740 | + sscanf(message, "%" PRId64 " -%d @%" PRIx32 "(%" PRIx32 ") %x %d\n", &ns, &pid, &addr, &size, &node, &unmap_trigger); |
| 3741 | std::stringstream final_message; |
| 3742 | final_message << "ns: " << std::to_string(ns).c_str() |
| 3743 | << " pid: " << std::to_string(pid).c_str() |
| 3744 | @@ -6196,11 +6437,11 @@ rsmi_event_notification_get(int timeout_ms, |
| 3745 | << " node: 0x" << std::hex << node |
| 3746 | << " unmap_trigger: " << std::to_string(unmap_trigger).c_str(); |
| 3747 | |
| 3748 | - strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str()); |
| 3749 | + strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3750 | } |
| 3751 | break; |
| 3752 | default: |
| 3753 | - strcpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received"); |
| 3754 | + strncpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received", MAX_EVENT_NOTIFICATION_MSG_SIZE-1); |
| 3755 | break; |
| 3756 | } |
| 3757 | data_item->event = (rsmi_evt_notification_type_t)event; |
| 3758 | @@ -6208,7 +6449,7 @@ rsmi_event_notification_get(int timeout_ms, |
| 3759 | ++(*num_elem); |
| 3760 | |
| 3761 | // zero out event_in after each use |
| 3762 | - memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3763 | + memset(event_in, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE); |
| 3764 | |
| 3765 | if (*num_elem >= buffer_size) { |
| 3766 | break; |
| 3767 | diff --git a/src/rocm_smi64Config.in b/src/rocm_smi64Config.in |
| 3768 | old mode 100755 |
| 3769 | new mode 100644 |
| 3770 | index a3b2631..e3c5903 |
| 3771 | --- a/src/rocm_smi64Config.in |
| 3772 | +++ b/src/rocm_smi64Config.in |
| 3773 | @@ -5,7 +5,7 @@ |
| 3774 | * The University of Illinois/NCSA |
| 3775 | * Open Source License (NCSA) |
| 3776 | * |
| 3777 | - * Copyright (c) 2017, Advanced Micro Devices, Inc. |
| 3778 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 3779 | * All rights reserved. |
| 3780 | * |
| 3781 | * Developed by: |
| 3782 | diff --git a/src/rocm_smi_counters.cc b/src/rocm_smi_counters.cc |
| 3783 | old mode 100755 |
| 3784 | new mode 100644 |
| 3785 | index a088195..185ed0e |
| 3786 | --- a/src/rocm_smi_counters.cc |
| 3787 | +++ b/src/rocm_smi_counters.cc |
| 3788 | @@ -3,7 +3,7 @@ |
| 3789 | * The University of Illinois/NCSA |
| 3790 | * Open Source License (NCSA) |
| 3791 | * |
| 3792 | - * Copyright (c) 2019, Advanced Micro Devices, Inc. |
| 3793 | + * Copyright (c) 2025, Advanced Micro Devices, Inc. |
| 3794 | * All rights reserved. |
| 3795 | * |
| 3796 | * Developed by: |
| 3797 | diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc |
| 3798 | old mode 100755 |
| 3799 | new mode 100644 |
| 3800 | index 62ced13..cf8cbf7 |
| 3801 | --- a/src/rocm_smi_device.cc |
| 3802 | +++ b/src/rocm_smi_device.cc |
| 3803 | @@ -3,7 +3,7 @@ |
| 3804 | * The University of Illinois/NCSA |
| 3805 | * Open Source License (NCSA) |
| 3806 | * |
| 3807 | - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. |
| 3808 | + * Copyright (c) 2017-2025, Advanced Micro Devices, Inc. |
| 3809 | * All rights reserved. |
| 3810 | * |
| 3811 | * Developed by: |
| 3812 | @@ -509,10 +509,12 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = { |
| 3813 | {"rsmi_dev_counter_create", {{}, {}}}, |
| 3814 | {"rsmi_dev_xgmi_error_status", {{kDevXGMIErrorFName}, {}}}, |
| 3815 | {"rsmi_dev_xgmi_error_reset", {{kDevXGMIErrorFName}, {}}}, |
| 3816 | - {"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}}, |
| 3817 | {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, |
| 3818 | {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, |
| 3819 | {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, |
| 3820 | + {"rsmi_dev_energy_count_get", {{kDevGpuMetricsFName}, {}}}, |
| 3821 | + {"rsmi_dev_current_socket_power_get", {{kDevGpuMetricsFName}, {}}}, |
| 3822 | + |
| 3823 | {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, |
| 3824 | {"rsmi_dev_compute_partition_set", {{kDevComputePartitionFName}, {}}}, |
| 3825 | {"rsmi_dev_memory_partition_get", {{kDevMemoryPartitionFName}, {}}}, |
| 3826 | @@ -763,8 +765,8 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { |
| 3827 | ret = openDebugFileStream(type, &fs); |
| 3828 | if (ret != 0) { |
| 3829 | ss << "Could not read debugInfoStr for DevInfoType (" |
| 3830 | - << get_type_string(type) << "), returning " |
| 3831 | - << std::to_string(ret); |
| 3832 | + << get_type_string(type) << "), returning " |
| 3833 | + << std::to_string(ret); |
| 3834 | LOG_ERROR(ss); |
| 3835 | return ret; |
| 3836 | } |
| 3837 | @@ -960,7 +962,7 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { |
| 3838 | << get_type_string(type) << "), returning *line = " |
| 3839 | << *line; |
| 3840 | LOG_INFO(ss); |
| 3841 | - |
| 3842 | + fs.close(); |
| 3843 | return 0; |
| 3844 | } |
| 3845 | |
| 3846 | @@ -1042,6 +1044,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, |
| 3847 | while (std::getline(fs, line)) { |
| 3848 | retVec->push_back(line); |
| 3849 | } |
| 3850 | + fs.close(); |
| 3851 | |
| 3852 | if (retVec->empty()) { |
| 3853 | ss << "Read devInfoMultiLineStr for DevInfoType (" |
| 3854 | @@ -1422,7 +1425,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { |
| 3855 | bool success = false; |
| 3856 | std::string out; |
| 3857 | bool wasGdmServiceActive = false; |
| 3858 | - bool restartInProgress = true; |
| 3859 | bool isRestartInProgress = true; |
| 3860 | bool isAMDGPUModuleLive = false; |
| 3861 | bool restartGDM = false; |
| 3862 | @@ -1508,7 +1510,6 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress, |
| 3863 | bool *isAMDGPUModuleLive) { |
| 3864 | REQUIRE_ROOT_ACCESS |
| 3865 | std::ostringstream ss; |
| 3866 | - bool restartSuccessful = true; |
| 3867 | bool success = false; |
| 3868 | std::string out; |
| 3869 | bool deviceRestartInProgress = true; // Assume in progress, we intend to disprove |
| 3870 | @@ -1718,3 +1719,4 @@ rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id, |
| 3871 | #undef RET_IF_NONZERO |
| 3872 | } // namespace smi |
| 3873 | } // namespace amd |
| 3874 | + |
| 3875 | diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc |
| 3876 | old mode 100755 |
| 3877 | new mode 100644 |
| 3878 | index 5962477..0722f89 |
| 3879 | --- a/src/rocm_smi_gpu_metrics.cc |
| 3880 | +++ b/src/rocm_smi_gpu_metrics.cc |
| 3881 | @@ -1,44 +1,23 @@ |
| 3882 | /* |
| 3883 | - * ============================================================================= |
| 3884 | - * The University of Illinois/NCSA |
| 3885 | - * Open Source License (NCSA) |
| 3886 | - * |
| 3887 | - * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. |
| 3888 | - * All rights reserved. |
| 3889 | - * |
| 3890 | - * Developed by: |
| 3891 | - * |
| 3892 | - * AMD Research and AMD ROC Software Development |
| 3893 | - * |
| 3894 | - * Advanced Micro Devices, Inc. |
| 3895 | - * |
| 3896 | - * www.amd.com |
| 3897 | + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. |
| 3898 | * |
| 3899 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3900 | - * of this software and associated documentation files (the "Software"), to |
| 3901 | - * deal with the Software without restriction, including without limitation |
| 3902 | - * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 3903 | - * and/or sell copies of the Software, and to permit persons to whom the |
| 3904 | - * Software is furnished to do so, subject to the following conditions: |
| 3905 | + * of this software and associated documentation files (the "Software"), to deal |
| 3906 | + * in the Software without restriction, including without limitation the rights |
| 3907 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 3908 | + * copies of the Software, and to permit persons to whom the Software is |
| 3909 | + * furnished to do so, subject to the following conditions: |
| 3910 | * |
| 3911 | - * - Redistributions of source code must retain the above copyright notice, |
| 3912 | - * this list of conditions and the following disclaimers. |
| 3913 | - * - Redistributions in binary form must reproduce the above copyright |
| 3914 | - * notice, this list of conditions and the following disclaimers in |
| 3915 | - * the documentation and/or other materials provided with the distribution. |
| 3916 | - * - Neither the names of <Name of Development Group, Name of Institution>, |
| 3917 | - * nor the names of its contributors may be used to endorse or promote |
| 3918 | - * products derived from this Software without specific prior written |
| 3919 | - * permission. |
| 3920 | + * The above copyright notice and this permission notice shall be included in |
| 3921 | + * all copies or substantial portions of the Software. |
| 3922 | * |
| 3923 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 3924 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 3925 | - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 3926 | - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 3927 | - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| 3928 | - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
| 3929 | - * DEALINGS WITH THE SOFTWARE. |
| 3930 | - * |
| 3931 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 3932 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 3933 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 3934 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 3935 | + * THE SOFTWARE. |
| 3936 | */ |
| 3937 | |
| 3938 | #include "rocm_smi/rocm_smi_gpu_metrics.h" |
| 3939 | @@ -52,6 +31,7 @@ |
| 3940 | |
| 3941 | #include <dirent.h> |
| 3942 | #include <pthread.h> |
| 3943 | +#include <unistd.h> |
| 3944 | |
| 3945 | #include <algorithm> |
| 3946 | #include <array> |
| 3947 | @@ -84,7 +64,7 @@ namespace amd::smi |
| 3948 | |
| 3949 | constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev) |
| 3950 | { |
| 3951 | - return (format_rev << 8 | content_rev); |
| 3952 | + return static_cast<uint16_t>((format_rev << 8 | content_rev)); |
| 3953 | } |
| 3954 | |
| 3955 | constexpr uint16_t join_metrics_version(const AMDGpuMetricsHeader_v1_t& metrics_header) |
| 3956 | @@ -168,6 +148,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl |
| 3957 | {join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15}, |
| 3958 | {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16}, |
| 3959 | {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17}, |
| 3960 | + {join_metrics_version(1, 8), AMDGpuMetricVersionFlags_t::kGpuMetricV18}, |
| 3961 | }; |
| 3962 | |
| 3963 | /** |
| 3964 | @@ -277,22 +258,27 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation |
| 3965 | {AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, "HBMThmResidencyAccumulator"}, /* v1.6 */ |
| 3966 | |
| 3967 | // kGpuMetricPartition |
| 3968 | - {AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, "numPartition"}, /* v1.6 */ |
| 3969 | + {AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, "numPartition"}, /* v1.6 */ |
| 3970 | |
| 3971 | // kGpuMetricXcpStats |
| 3972 | - {AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, "GfxBusyInst"}, /* v1.6 */ |
| 3973 | - {AMDGpuMetricsUnitType_t::kMetricJpegBusy, "JpegBusy"}, /* v1.6 */ |
| 3974 | - {AMDGpuMetricsUnitType_t::kMetricVcnBusy, "VcnBusy"}, /* v1.6 */ |
| 3975 | - {AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "GfxBusyAcc"}, /* v1.6 */ |
| 3976 | + {AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, "GfxBusyInst"}, /* v1.6 */ |
| 3977 | + {AMDGpuMetricsUnitType_t::kMetricJpegBusy, "JpegBusy"}, /* v1.6 */ |
| 3978 | + {AMDGpuMetricsUnitType_t::kMetricVcnBusy, "VcnBusy"}, /* v1.6 */ |
| 3979 | + {AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "GfxBusyAcc"}, /* v1.6 */ |
| 3980 | |
| 3981 | // kGpuMetricLinkWidthSpeed |
| 3982 | - {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */ |
| 3983 | + {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */ |
| 3984 | + |
| 3985 | |
| 3986 | + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */ |
| 3987 | + {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */ |
| 3988 | + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,"GfxBelowHostLimitAccumulator"}, /* v1.7 */ |
| 3989 | |
| 3990 | - {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */ |
| 3991 | - {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */ |
| 3992 | - {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, |
| 3993 | - "GfxBelowHostLimitAccumulator"}, /* v1.7 */ |
| 3994 | + // kGpuMetricXcpStats v1.8 |
| 3995 | + {AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, "GfxLowUtilitizationAcc"}, /* v1.8 */ |
| 3996 | + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, "GfxBelowHostLimitTotalAcc"}, /* v1.8 */ |
| 3997 | + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, "GfxBelowHostLimitPptAcc"}, /* v1.8 */ |
| 3998 | + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, "GfxBelowHostLimitThmAcc"}, /* v1.8 */ |
| 3999 | }; |
| 4000 | |
| 4001 | |
| 4002 | @@ -382,6 +368,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table |
| 4003 | {AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared<GpuMetricsBase_v15_t>(GpuMetricsBase_v15_t{})}, |
| 4004 | {AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared<GpuMetricsBase_v16_t>(GpuMetricsBase_v16_t{})}, |
| 4005 | {AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared<GpuMetricsBase_v17_t>(GpuMetricsBase_v17_t{})}, |
| 4006 | + {AMDGpuMetricVersionFlags_t::kGpuMetricV18, std::make_shared<GpuMetricsBase_v18_t>(GpuMetricsBase_v18_t{})}, |
| 4007 | }; |
| 4008 | |
| 4009 | GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) |
| 4010 | @@ -500,381 +487,269 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str |
| 4011 | return multi_values; |
| 4012 | } |
| 4013 | |
| 4014 | -void GpuMetricsBase_v17_t::dump_internal_metrics_table() |
| 4015 | -{ |
| 4016 | - std::ostringstream ss; |
| 4017 | - auto idx = uint64_t(0); |
| 4018 | - auto idy = uint64_t(0); |
| 4019 | - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; |
| 4020 | - ss << __PRETTY_FUNCTION__ |
| 4021 | - << " | ======= DEBUG ======= " |
| 4022 | - << " | Metric Version: " |
| 4023 | - << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) |
| 4024 | - << " | Size: " |
| 4025 | - << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) |
| 4026 | - << " |" |
| 4027 | - << "\n"; |
| 4028 | - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" |
| 4029 | - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" |
| 4030 | - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" |
| 4031 | - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" |
| 4032 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4033 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; |
| 4034 | - |
| 4035 | - ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n" // new for v1.7 |
| 4036 | - << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" |
| 4037 | - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" |
| 4038 | - << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n" |
| 4039 | - << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n" |
| 4040 | - << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n" |
| 4041 | - << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n" |
| 4042 | - << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n" |
| 4043 | - << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n" |
| 4044 | - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" |
| 4045 | - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" |
| 4046 | - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" |
| 4047 | - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" |
| 4048 | - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" |
| 4049 | - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" |
| 4050 | - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" |
| 4051 | - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" |
| 4052 | - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" |
| 4053 | - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" |
| 4054 | - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" |
| 4055 | - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" |
| 4056 | - << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" |
| 4057 | - << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n" |
| 4058 | - << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" |
| 4059 | - << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" |
| 4060 | - << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n" |
| 4061 | - << " pcie_lc_perf_other_end_recovery: " |
| 4062 | - << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n"; |
| 4063 | - idx = 0; |
| 4064 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_link_status) { // new for v1.7 |
| 4065 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4066 | - ++idx; |
| 4067 | - } |
| 4068 | - |
| 4069 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { |
| 4070 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4071 | - ++idx; |
| 4072 | - } |
| 4073 | - |
| 4074 | - ss << " xgmi_write_data_acc: " << "\n"; |
| 4075 | - idx = 0; |
| 4076 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { |
| 4077 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4078 | - ++idx; |
| 4079 | - } |
| 4080 | |
| 4081 | - ss << " current_gfxclk: " << "\n"; |
| 4082 | - idx = 0; |
| 4083 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { |
| 4084 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4085 | - ++idx; |
| 4086 | - } |
| 4087 | +rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() { |
| 4088 | + std::ostringstream ss; |
| 4089 | + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); |
| 4090 | + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; |
| 4091 | + LOG_TRACE(ss); |
| 4092 | |
| 4093 | - ss << " current_socclk: " << "\n"; |
| 4094 | - idx = 0; |
| 4095 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { |
| 4096 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4097 | - ++idx; |
| 4098 | - } |
| 4099 | + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; |
| 4100 | + // |
| 4101 | + // Note: Any metric treatment/changes (if any) should happen before they |
| 4102 | + // get written to internal/external tables. |
| 4103 | + // |
| 4104 | + auto run_metric_adjustments_v18 = [&]() { |
| 4105 | + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; |
| 4106 | + const auto gpu_metrics_version = |
| 4107 | + translate_flag_to_metric_version(get_gpu_metrics_version_used()); |
| 4108 | + ss << __PRETTY_FUNCTION__ << " | ======= info ======= " |
| 4109 | + << " | Applying adjustments " |
| 4110 | + << " | Metric Version: " |
| 4111 | + << stringfy_metric_header_version(disjoin_metrics_version(gpu_metrics_version)) << " |"; |
| 4112 | + LOG_TRACE(ss); |
| 4113 | |
| 4114 | - ss << " current_vclk0: " << "\n"; |
| 4115 | - idx = 0; |
| 4116 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { |
| 4117 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4118 | - ++idx; |
| 4119 | - } |
| 4120 | + // firmware_timestamp is at 10ns resolution |
| 4121 | + ss << __PRETTY_FUNCTION__ << " | ======= Changes ======= " |
| 4122 | + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp |
| 4123 | + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); |
| 4124 | + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); |
| 4125 | + LOG_DEBUG(ss); |
| 4126 | + }; |
| 4127 | |
| 4128 | - ss << " current_dclk0: " << "\n"; |
| 4129 | - idx = 0; |
| 4130 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { |
| 4131 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4132 | - ++idx; |
| 4133 | - } |
| 4134 | + run_metric_adjustments_v18(); |
| 4135 | |
| 4136 | - idx = 0; |
| 4137 | - idy = 0; |
| 4138 | - ss << " xcp_stats.gfx_busy_inst: " << "\n"; |
| 4139 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4140 | - if (idx == 0) { |
| 4141 | - ss << "\t [ "; |
| 4142 | - } |
| 4143 | - for (auto& col : row.gfx_busy_inst) { |
| 4144 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4145 | - if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) { |
| 4146 | - ss << ", "; |
| 4147 | - } |
| 4148 | - if (idx + 1 != |
| 4149 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4150 | - ss << "\n"; |
| 4151 | - } else { |
| 4152 | - ss << "]\n"; |
| 4153 | - } |
| 4154 | - idy++; |
| 4155 | - } |
| 4156 | - idx++; |
| 4157 | - } |
| 4158 | + // Temperature Info |
| 4159 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] |
| 4160 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, |
| 4161 | + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, |
| 4162 | + "temperature_hotspot"))); |
| 4163 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] |
| 4164 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, |
| 4165 | + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, |
| 4166 | + "temperature_mem"))); |
| 4167 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] |
| 4168 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, |
| 4169 | + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, |
| 4170 | + "temperature_vrsoc"))); |
| 4171 | |
| 4172 | - idx = 0; |
| 4173 | - idy = 0; |
| 4174 | - ss << " xcp_stats.vcn_busy: " << "\n"; |
| 4175 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4176 | - if (idx == 0) { |
| 4177 | - ss << "\t [ "; |
| 4178 | - } |
| 4179 | - for (auto& col : row.vcn_busy) { |
| 4180 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4181 | - if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) { |
| 4182 | - ss << ", "; |
| 4183 | - } |
| 4184 | - if (idx + 1 != |
| 4185 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4186 | - ss << "\n"; |
| 4187 | - } else { |
| 4188 | - ss << "]\n"; |
| 4189 | - } |
| 4190 | - idy++; |
| 4191 | - } |
| 4192 | - idx++; |
| 4193 | - } |
| 4194 | + // Power/Energy Info |
| 4195 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] |
| 4196 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, |
| 4197 | + format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, |
| 4198 | + "curr_socket_power"))); |
| 4199 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] |
| 4200 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, |
| 4201 | + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, |
| 4202 | + "energy_acc"))); |
| 4203 | |
| 4204 | - idx = 0; |
| 4205 | - idy = 0; |
| 4206 | - ss << " xcp_stats.jpeg_busy: " << "\n"; |
| 4207 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4208 | - if (idx == 0) { |
| 4209 | - ss << "\t [ "; |
| 4210 | - } |
| 4211 | - for (auto& col : row.jpeg_busy) { |
| 4212 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4213 | - if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) { |
| 4214 | - ss << ", "; |
| 4215 | - } |
| 4216 | - if (idx + 1 != |
| 4217 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4218 | - ss << "\n"; |
| 4219 | - } else { |
| 4220 | - ss << "]\n"; |
| 4221 | - } |
| 4222 | - idy++; |
| 4223 | - } |
| 4224 | - idx++; |
| 4225 | - } |
| 4226 | + // Utilization Info |
| 4227 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] |
| 4228 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, |
| 4229 | + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, |
| 4230 | + "average_gfx_activity"))); |
| 4231 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] |
| 4232 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, |
| 4233 | + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, |
| 4234 | + "average_umc_activity"))); |
| 4235 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] |
| 4236 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, |
| 4237 | + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, |
| 4238 | + "gfx_activity_acc"))); |
| 4239 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] |
| 4240 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, |
| 4241 | + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, |
| 4242 | + "mem_activity_acc"))); |
| 4243 | |
| 4244 | - idx = 0; |
| 4245 | - idy = 0; |
| 4246 | - ss << " xcp_stats.gfx_busy_acc: " << "\n"; |
| 4247 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4248 | - if (idx == 0) { |
| 4249 | - ss << "\t [ "; |
| 4250 | - } |
| 4251 | - for (auto& col : row.gfx_busy_acc) { |
| 4252 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4253 | - if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) { |
| 4254 | - ss << ", "; |
| 4255 | - } |
| 4256 | - if (idx + 1 != |
| 4257 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4258 | - ss << "\n"; |
| 4259 | - } else { |
| 4260 | - ss << "]\n"; |
| 4261 | - } |
| 4262 | - idy++; |
| 4263 | - } |
| 4264 | - idx++; |
| 4265 | - } |
| 4266 | + // GfxLock Info |
| 4267 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] |
| 4268 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, |
| 4269 | + format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, |
| 4270 | + "gfxclk_lock_status"))); |
| 4271 | |
| 4272 | - LOG_DEBUG(ss); |
| 4273 | -} |
| 4274 | + // Timestamp Info |
| 4275 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] |
| 4276 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, |
| 4277 | + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, |
| 4278 | + "firmware_timestamp"))); |
| 4279 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] |
| 4280 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, |
| 4281 | + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, |
| 4282 | + "system_clock_counter"))); |
| 4283 | |
| 4284 | + // Link/Width/Speed Info |
| 4285 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4286 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, |
| 4287 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, |
| 4288 | + "pcie_link_width"))); |
| 4289 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4290 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, |
| 4291 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, |
| 4292 | + "pcie_link_speed"))); |
| 4293 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4294 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, |
| 4295 | + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, |
| 4296 | + "xgmi_link_width"))); |
| 4297 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4298 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, |
| 4299 | + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, |
| 4300 | + "xgmi_link_speed"))); |
| 4301 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4302 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, |
| 4303 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, |
| 4304 | + "pcie_bandwidth_acc"))); |
| 4305 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4306 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, |
| 4307 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, |
| 4308 | + "pcie_bandwidth_inst"))); |
| 4309 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4310 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, |
| 4311 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, |
| 4312 | + "pcie_l0_recov_count_acc"))); |
| 4313 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4314 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, |
| 4315 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, |
| 4316 | + "pcie_replay_count_acc"))); |
| 4317 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4318 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, |
| 4319 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, |
| 4320 | + "pcie_replay_rollover_count_acc"))); |
| 4321 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4322 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, |
| 4323 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, |
| 4324 | + "pcie_nak_sent_count_acc"))); |
| 4325 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4326 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, |
| 4327 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, |
| 4328 | + "pcie_nak_rcvd_count_acc"))); |
| 4329 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4330 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, |
| 4331 | + format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, |
| 4332 | + "[xgmi_read_data_acc]"))); |
| 4333 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4334 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, |
| 4335 | + format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, |
| 4336 | + "[xgmi_write_data_acc]"))); |
| 4337 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4338 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, |
| 4339 | + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status, |
| 4340 | + "[xgmi_link_status]"))); |
| 4341 | |
| 4342 | -void GpuMetricsBase_v16_t::dump_internal_metrics_table() |
| 4343 | -{ |
| 4344 | - std::ostringstream ss; |
| 4345 | - auto idx = uint64_t(0); |
| 4346 | - auto idy = uint64_t(0); |
| 4347 | - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; |
| 4348 | - ss << __PRETTY_FUNCTION__ |
| 4349 | - << " | ======= DEBUG ======= " |
| 4350 | - << " | Metric Version: " |
| 4351 | - << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) |
| 4352 | - << " | Size: " |
| 4353 | - << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) |
| 4354 | - << " |" |
| 4355 | - << "\n"; |
| 4356 | - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" |
| 4357 | - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" |
| 4358 | - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" |
| 4359 | - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" |
| 4360 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4361 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; |
| 4362 | - |
| 4363 | - ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" |
| 4364 | - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" |
| 4365 | - << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n" |
| 4366 | - << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n" |
| 4367 | - << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n" |
| 4368 | - << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n" |
| 4369 | - << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n" |
| 4370 | - << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n" |
| 4371 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4372 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" |
| 4373 | - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" |
| 4374 | - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" |
| 4375 | - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" |
| 4376 | - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" |
| 4377 | - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" |
| 4378 | - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" |
| 4379 | - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" |
| 4380 | - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" |
| 4381 | - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" |
| 4382 | - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" |
| 4383 | - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" |
| 4384 | - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" |
| 4385 | - << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" |
| 4386 | - << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n" |
| 4387 | - << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" |
| 4388 | - << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" |
| 4389 | - << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n" |
| 4390 | - << " pcie_lc_perf_other_end_recovery: " |
| 4391 | - << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n"; |
| 4392 | - idx = 0; |
| 4393 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { |
| 4394 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4395 | - ++idx; |
| 4396 | - } |
| 4397 | + // CurrentClock Info |
| 4398 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] |
| 4399 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, |
| 4400 | + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, |
| 4401 | + "[current_gfxclk]"))); |
| 4402 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] |
| 4403 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, |
| 4404 | + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, |
| 4405 | + "[current_socclk]"))); |
| 4406 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] |
| 4407 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, |
| 4408 | + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, |
| 4409 | + "[current_vclk0]"))); |
| 4410 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] |
| 4411 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, |
| 4412 | + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, |
| 4413 | + "[current_dclk0]"))); |
| 4414 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] |
| 4415 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, |
| 4416 | + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, |
| 4417 | + "current_uclk"))); |
| 4418 | |
| 4419 | - ss << " xgmi_write_data_acc: " << "\n"; |
| 4420 | - idx = 0; |
| 4421 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { |
| 4422 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4423 | - ++idx; |
| 4424 | - } |
| 4425 | + /* Accumulation cycle counter */ |
| 4426 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4427 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, |
| 4428 | + format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter, |
| 4429 | + "accumulation_counter"))); |
| 4430 | |
| 4431 | - ss << " current_gfxclk: " << "\n"; |
| 4432 | - idx = 0; |
| 4433 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { |
| 4434 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4435 | - ++idx; |
| 4436 | - } |
| 4437 | + /* Accumulated throttler residencies */ |
| 4438 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4439 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, |
| 4440 | + format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc, |
| 4441 | + "prochot_residency_acc"))); |
| 4442 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4443 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, |
| 4444 | + format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc, |
| 4445 | + "ppt_residency_acc"))); |
| 4446 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4447 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, |
| 4448 | + format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc, |
| 4449 | + "socket_thm_residency_acc"))); |
| 4450 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4451 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, |
| 4452 | + format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc, |
| 4453 | + "vr_thm_residency_acc"))); |
| 4454 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4455 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, |
| 4456 | + format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc, |
| 4457 | + "hbm_thm_residency_acc"))); |
| 4458 | |
| 4459 | - ss << " current_socclk: " << "\n"; |
| 4460 | - idx = 0; |
| 4461 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { |
| 4462 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4463 | - ++idx; |
| 4464 | - } |
| 4465 | + /* Partition info */ |
| 4466 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition] |
| 4467 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, |
| 4468 | + format_metric_row(m_gpu_metrics_tbl.m_num_partition, |
| 4469 | + "num_partition"))); |
| 4470 | |
| 4471 | - ss << " current_vclk0: " << "\n"; |
| 4472 | - idx = 0; |
| 4473 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { |
| 4474 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4475 | - ++idx; |
| 4476 | - } |
| 4477 | + /* xcp_stats info */ |
| 4478 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4479 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, |
| 4480 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst, |
| 4481 | + "xcp_stats->gfx_busy_inst"))); |
| 4482 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4483 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnBusy, |
| 4484 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->vcn_busy, |
| 4485 | + "xcp_stats->vcn_busy"))); |
| 4486 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4487 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegBusy, |
| 4488 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy, |
| 4489 | + "xcp_stats->jpeg_busy"))); |
| 4490 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4491 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, |
| 4492 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, |
| 4493 | + "xcp_stats->gfx_busy_acc"))); |
| 4494 | |
| 4495 | - ss << " current_dclk0: " << "\n"; |
| 4496 | - idx = 0; |
| 4497 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { |
| 4498 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4499 | - ++idx; |
| 4500 | - } |
| 4501 | + /* gpu metrics v1.8 xcp_stats info */ |
| 4502 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4503 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, |
| 4504 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_total_acc, |
| 4505 | + "xcp_stats->gfx_below_host_limit_total_acc"))); |
| 4506 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4507 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, |
| 4508 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_ppt_acc, |
| 4509 | + "xcp_stats->gfx_below_host_limit_ppt_acc"))); |
| 4510 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4511 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, |
| 4512 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_thm_acc, |
| 4513 | + "xcp_stats->gfx_below_host_limit_thm_acc"))); |
| 4514 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] |
| 4515 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, |
| 4516 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_low_utilization_acc, |
| 4517 | + "xcp_stats->gfx_low_utilization_acc"))); |
| 4518 | |
| 4519 | - idx = 0; |
| 4520 | - idy = 0; |
| 4521 | - ss << " xcp_stats.gfx_busy_inst: " << "\n"; |
| 4522 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4523 | - if (idx == 0) { |
| 4524 | - ss << "\t [ "; |
| 4525 | - } |
| 4526 | - for (auto& col : row.gfx_busy_inst) { |
| 4527 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4528 | - if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) { |
| 4529 | - ss << ", "; |
| 4530 | - } |
| 4531 | - if (idx + 1 != |
| 4532 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4533 | - ss << "\n"; |
| 4534 | - } else { |
| 4535 | - ss << "]\n"; |
| 4536 | - } |
| 4537 | - idy++; |
| 4538 | - } |
| 4539 | - idx++; |
| 4540 | - } |
| 4541 | + /* PCIE other end recovery counter info */ |
| 4542 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4543 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, |
| 4544 | + format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery, |
| 4545 | + "pcie_lc_perf_other_end_recovery"))); |
| 4546 | |
| 4547 | - idx = 0; |
| 4548 | - idy = 0; |
| 4549 | - ss << " xcp_stats.vcn_busy: " << "\n"; |
| 4550 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4551 | - if (idx == 0) { |
| 4552 | - ss << "\t [ "; |
| 4553 | - } |
| 4554 | - for (auto& col : row.vcn_busy) { |
| 4555 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4556 | - if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) { |
| 4557 | - ss << ", "; |
| 4558 | - } |
| 4559 | - if (idx + 1 != |
| 4560 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4561 | - ss << "\n"; |
| 4562 | - } else { |
| 4563 | - ss << "]\n"; |
| 4564 | - } |
| 4565 | - idy++; |
| 4566 | - } |
| 4567 | - idx++; |
| 4568 | - } |
| 4569 | + /* VRAM max bandwidth (in GB/sec) at max memory clock */ |
| 4570 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4571 | + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, |
| 4572 | + format_metric_row(m_gpu_metrics_tbl.m_mem_max_bandwidth, |
| 4573 | + "vram_max_bandwidth"))); |
| 4574 | |
| 4575 | - idx = 0; |
| 4576 | - idy = 0; |
| 4577 | - ss << " xcp_stats.jpeg_busy: " << "\n"; |
| 4578 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4579 | - if (idx == 0) { |
| 4580 | - ss << "\t [ "; |
| 4581 | - } |
| 4582 | - for (auto& col : row.jpeg_busy) { |
| 4583 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4584 | - if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) { |
| 4585 | - ss << ", "; |
| 4586 | - } |
| 4587 | - if (idx + 1 != |
| 4588 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4589 | - ss << "\n"; |
| 4590 | - } else { |
| 4591 | - ss << "]\n"; |
| 4592 | - } |
| 4593 | - idy++; |
| 4594 | - } |
| 4595 | - idx++; |
| 4596 | - } |
| 4597 | + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " |
| 4598 | + << " | Success " |
| 4599 | + << " | Returning = " << getRSMIStatusString(status_code) << " |"; |
| 4600 | + LOG_TRACE(ss); |
| 4601 | |
| 4602 | - idx = 0; |
| 4603 | - idy = 0; |
| 4604 | - ss << " xcp_stats.gfx_busy_acc: " << "\n"; |
| 4605 | - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { |
| 4606 | - if (idx == 0) { |
| 4607 | - ss << "\t [ "; |
| 4608 | - } |
| 4609 | - for (auto& col : row.gfx_busy_acc) { |
| 4610 | - ss << "\t [" << idx << "] [" << idy << "]: " << col; |
| 4611 | - if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) { |
| 4612 | - ss << ", "; |
| 4613 | - } |
| 4614 | - if (idx + 1 != |
| 4615 | - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { |
| 4616 | - ss << "\n"; |
| 4617 | - } else { |
| 4618 | - ss << "]\n"; |
| 4619 | - } |
| 4620 | - idy++; |
| 4621 | - } |
| 4622 | - idx++; |
| 4623 | - } |
| 4624 | + // Copy to base class |
| 4625 | + std::copy(m_metrics_dynamic_tbl.begin(), m_metrics_dynamic_tbl.end(), |
| 4626 | + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, |
| 4627 | + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); |
| 4628 | |
| 4629 | - LOG_DEBUG(ss); |
| 4630 | + return status_code; |
| 4631 | } |
| 4632 | |
| 4633 | rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { |
| 4634 | @@ -883,10 +758,7 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { |
| 4635 | ss << __PRETTY_FUNCTION__ << " | ======= start ======="; |
| 4636 | LOG_TRACE(ss); |
| 4637 | |
| 4638 | - if (!m_metrics_dynamic_tbl.empty()) { |
| 4639 | - m_metrics_dynamic_tbl.clear(); |
| 4640 | - } |
| 4641 | - |
| 4642 | + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; |
| 4643 | // |
| 4644 | // Note: Any metric treatment/changes (if any) should happen before they |
| 4645 | // get written to internal/external tables. |
| 4646 | @@ -1106,6 +978,11 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { |
| 4647 | format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, |
| 4648 | "xcp_stats->gfx_busy_acc"))); |
| 4649 | |
| 4650 | + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats].insert( |
| 4651 | + std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, |
| 4652 | + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc, |
| 4653 | + "xcp_stats->gfx_below_host_limit_acc"))); |
| 4654 | + |
| 4655 | /* PCIE other end recovery counter info */ |
| 4656 | m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] |
| 4657 | .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, |
| 4658 | @@ -1118,12 +995,6 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { |
| 4659 | format_metric_row(m_gpu_metrics_tbl.m_vram_max_bandwidth, |
| 4660 | "vram_max_bandwidth"))); |
| 4661 | |
| 4662 | - /* Total App Clock Counter Accumulated */ |
| 4663 | - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] |
| 4664 | - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, |
| 4665 | - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc, |
| 4666 | - "gfx_below_host_limit_acc"))); |
| 4667 | - |
| 4668 | ss << __PRETTY_FUNCTION__ |
| 4669 | << " | ======= end ======= " |
| 4670 | << " | Success " |
| 4671 | @@ -1131,6 +1002,12 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { |
| 4672 | << " |"; |
| 4673 | LOG_TRACE(ss); |
| 4674 | |
| 4675 | + // Copy to base class |
| 4676 | + std::copy(m_metrics_dynamic_tbl.begin(), |
| 4677 | + m_metrics_dynamic_tbl.end(), |
| 4678 | + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, |
| 4679 | + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); |
| 4680 | + |
| 4681 | return status_code; |
| 4682 | } |
| 4683 | |
| 4684 | @@ -1140,10 +1017,7 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { |
| 4685 | ss << __PRETTY_FUNCTION__ << " | ======= start ======="; |
| 4686 | LOG_TRACE(ss); |
| 4687 | |
| 4688 | - if (!m_metrics_dynamic_tbl.empty()) { |
| 4689 | - m_metrics_dynamic_tbl.clear(); |
| 4690 | - } |
| 4691 | - |
| 4692 | + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; |
| 4693 | // |
| 4694 | // Note: Any metric treatment/changes (if any) should happen before they |
| 4695 | // get written to internal/external tables. |
| 4696 | @@ -1371,127 +1245,22 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { |
| 4697 | << " |"; |
| 4698 | LOG_TRACE(ss); |
| 4699 | |
| 4700 | - return status_code; |
| 4701 | -} |
| 4702 | - |
| 4703 | -void GpuMetricsBase_v15_t::dump_internal_metrics_table() |
| 4704 | -{ |
| 4705 | - std::ostringstream ss; |
| 4706 | - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; |
| 4707 | - ss << __PRETTY_FUNCTION__ |
| 4708 | - << " | ======= DEBUG ======= " |
| 4709 | - << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) |
| 4710 | - << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) |
| 4711 | - << " |" |
| 4712 | - << "\n"; |
| 4713 | - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" |
| 4714 | - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" |
| 4715 | - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" |
| 4716 | - |
| 4717 | - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" |
| 4718 | - |
| 4719 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4720 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; |
| 4721 | - |
| 4722 | - ss << " vcn_activity: " << "\n"; |
| 4723 | - auto idx = uint64_t(0); |
| 4724 | - for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) { |
| 4725 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4726 | - ++idx; |
| 4727 | - } |
| 4728 | - |
| 4729 | - ss << " jpeg_activity: " << "\n"; |
| 4730 | - idx = 0; |
| 4731 | - for (const auto& temp : m_gpu_metrics_tbl.m_jpeg_activity) { |
| 4732 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4733 | - ++idx; |
| 4734 | - } |
| 4735 | - |
| 4736 | - ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" |
| 4737 | - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" |
| 4738 | - |
| 4739 | - << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n" |
| 4740 | - |
| 4741 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4742 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" |
| 4743 | - |
| 4744 | - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" |
| 4745 | - |
| 4746 | - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" |
| 4747 | - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" |
| 4748 | - |
| 4749 | - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" |
| 4750 | - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" |
| 4751 | + // Copy to base class |
| 4752 | + std::copy(m_metrics_dynamic_tbl.begin(), |
| 4753 | + m_metrics_dynamic_tbl.end(), |
| 4754 | + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, |
| 4755 | + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); |
| 4756 | |
| 4757 | - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" |
| 4758 | - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" |
| 4759 | - |
| 4760 | - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" |
| 4761 | - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" |
| 4762 | - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" |
| 4763 | - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" |
| 4764 | - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" |
| 4765 | - << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" |
| 4766 | - << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"; |
| 4767 | - |
| 4768 | - ss << " xgmi_read_data_acc: " << "\n"; |
| 4769 | - idx = 0; |
| 4770 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { |
| 4771 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4772 | - ++idx; |
| 4773 | - } |
| 4774 | - |
| 4775 | - ss << " xgmi_write_data_acc: " << "\n"; |
| 4776 | - idx = 0; |
| 4777 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { |
| 4778 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4779 | - ++idx; |
| 4780 | - } |
| 4781 | - |
| 4782 | - ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"; |
| 4783 | - |
| 4784 | - ss << " current_gfxclk: " << "\n"; |
| 4785 | - idx = 0; |
| 4786 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { |
| 4787 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4788 | - ++idx; |
| 4789 | - } |
| 4790 | - |
| 4791 | - ss << " current_socclk: " << "\n"; |
| 4792 | - idx = 0; |
| 4793 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { |
| 4794 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4795 | - ++idx; |
| 4796 | - } |
| 4797 | - |
| 4798 | - ss << " current_vclk0: " << "\n"; |
| 4799 | - idx = 0; |
| 4800 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { |
| 4801 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4802 | - ++idx; |
| 4803 | - } |
| 4804 | - |
| 4805 | - ss << " current_dclk0: " << "\n"; |
| 4806 | - idx = 0; |
| 4807 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { |
| 4808 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4809 | - ++idx; |
| 4810 | - } |
| 4811 | - |
| 4812 | - ss << " padding: " << m_gpu_metrics_tbl.m_padding << "\n"; |
| 4813 | - LOG_DEBUG(ss); |
| 4814 | + return status_code; |
| 4815 | } |
| 4816 | |
| 4817 | rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() { |
| 4818 | std::ostringstream ss; |
| 4819 | auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); |
| 4820 | ss << __PRETTY_FUNCTION__ << " | ======= start ======="; |
| 4821 | - LOG_TRACE(ss); |
| 4822 | - |
| 4823 | - if (!m_metrics_dynamic_tbl.empty()) { |
| 4824 | - m_metrics_dynamic_tbl.clear(); |
| 4825 | - } |
| 4826 | + LOG_TRACE(ss); |
| 4827 | |
| 4828 | + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; |
| 4829 | // |
| 4830 | // Note: Any metric treatment/changes (if any) should happen before they |
| 4831 | // get written to internal/external tables. |
| 4832 | @@ -1708,107 +1477,13 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() { |
| 4833 | << " |"; |
| 4834 | LOG_TRACE(ss); |
| 4835 | |
| 4836 | - return status_code; |
| 4837 | -} |
| 4838 | - |
| 4839 | - |
| 4840 | -void GpuMetricsBase_v14_t::dump_internal_metrics_table() |
| 4841 | -{ |
| 4842 | - std::ostringstream ss; |
| 4843 | - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; |
| 4844 | - ss << __PRETTY_FUNCTION__ |
| 4845 | - << " | ======= DEBUG ======= " |
| 4846 | - << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) |
| 4847 | - << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) |
| 4848 | - << " |" |
| 4849 | - << "\n"; |
| 4850 | - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" |
| 4851 | - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" |
| 4852 | - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" |
| 4853 | - |
| 4854 | - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" |
| 4855 | - |
| 4856 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4857 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; |
| 4858 | - |
| 4859 | - ss << " vcn_activity: " << "\n"; |
| 4860 | - auto idx = uint64_t(0); |
| 4861 | - for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) { |
| 4862 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4863 | - ++idx; |
| 4864 | - } |
| 4865 | - |
| 4866 | - ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" |
| 4867 | - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" |
| 4868 | - |
| 4869 | - << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n" |
| 4870 | - |
| 4871 | - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" |
| 4872 | - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" |
| 4873 | - |
| 4874 | - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" |
| 4875 | - |
| 4876 | - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" |
| 4877 | - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" |
| 4878 | - |
| 4879 | - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" |
| 4880 | - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" |
| 4881 | - |
| 4882 | - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" |
| 4883 | - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" |
| 4884 | + // Copy to base class |
| 4885 | + std::copy(m_metrics_dynamic_tbl.begin(), |
| 4886 | + m_metrics_dynamic_tbl.end(), |
| 4887 | + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, |
| 4888 | + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); |
| 4889 | |
| 4890 | - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" |
| 4891 | - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" |
| 4892 | - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" |
| 4893 | - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" |
| 4894 | - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"; |
| 4895 | - |
| 4896 | - ss << " xgmi_read_data_acc: " << "\n"; |
| 4897 | - idx = 0; |
| 4898 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { |
| 4899 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4900 | - ++idx; |
| 4901 | - } |
| 4902 | - |
| 4903 | - ss << " xgmi_write_data_acc: " << "\n"; |
| 4904 | - idx = 0; |
| 4905 | - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { |
| 4906 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4907 | - ++idx; |
| 4908 | - } |
| 4909 | - |
| 4910 | - ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"; |
| 4911 | - |
| 4912 | - ss << " current_gfxclk: " << "\n"; |
| 4913 | - idx = 0; |
| 4914 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { |
| 4915 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4916 | - ++idx; |
| 4917 | - } |
| 4918 | - |
| 4919 | - ss << " current_socclk: " << "\n"; |
| 4920 | - idx = 0; |
| 4921 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { |
| 4922 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4923 | - ++idx; |
| 4924 | - } |
| 4925 | - |
| 4926 | - ss << " current_vclk0: " << "\n"; |
| 4927 | - idx = 0; |
| 4928 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { |
| 4929 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4930 | - ++idx; |
| 4931 | - } |
| 4932 | - |
| 4933 | - ss << " current_dclk0: " << "\n"; |
| 4934 | - idx = 0; |
| 4935 | - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { |
| 4936 | - ss << "\t [" << idx << "]: " << temp << "\n"; |
| 4937 | - ++idx; |
| 4938 | - } |
| 4939 | - |
| 4940 | - ss << " padding: " << m_gpu_metrics_tbl.m_padding << "\n"; |
| 4941 | - LOG_DEBUG(ss); |
| 4942 | + return status_code; |
| 4943 | } |
| 4944 | |
| 4945 | rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() { |
| 4946 | @@ -1817,10 +1492,7 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() { |
| 4947 | ss << __PRETTY_FUNCTION__ << " | ======= start ======="; |
| 4948 | LOG_TRACE(ss); |
| 4949 | |
| 4950 | - if (!m_metrics_dynamic_tbl.empty()) { |
| 4951 | - m_metrics_dynamic_tbl.clear(); |
| 4952 | - } |
| 4953 | - |
| 4954 | + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; |
| 4955 | // |
| 4956 | // Note: Any metric treatment/changes (if any) should happen before they |
| 4957 | // get written to internal/external tables. |
| 4958 | @@ -2022,6 +1694,12 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() { |
| 4959 | << " |"; |
| 4960 | LOG_TRACE(ss); |
| 4961 | |
| 4962 | + // Copy to base class |
| 4963 | + std::copy(m_metrics_dynamic_tbl.begin(), |
| 4964 | + m_metrics_dynamic_tbl.end(), |
| 4965 | + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, |
| 4966 | + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); |
| 4967 | + |
| 4968 | return status_code; |
| 4969 | } |
| 4970 | |
| 4971 | @@ -2125,6 +1803,7 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m |
| 4972 | |
| 4973 | rsmi_gpu_metrics.pcie_nak_sent_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_sent_count_acc)>(); |
| 4974 | rsmi_gpu_metrics.pcie_nak_rcvd_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_rcvd_count_acc)>(); |
| 4975 | + |
| 4976 | rsmi_gpu_metrics.accumulation_counter = init_max_uint_types<decltype(rsmi_gpu_metrics.accumulation_counter)>(); |
| 4977 | rsmi_gpu_metrics.prochot_residency_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.prochot_residency_acc)>(); |
| 4978 | rsmi_gpu_metrics.ppt_residency_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.ppt_residency_acc)>(); |
| 4979 | @@ -2148,6 +1827,14 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m |
| 4980 | init_max_uint_types<std::uint64_t>()); |
| 4981 | std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc), |
| 4982 | init_max_uint_types<std::uint64_t>()); |
| 4983 | + std::fill(std::begin(row.gfx_below_host_limit_ppt_acc), std::end(row.gfx_below_host_limit_ppt_acc), |
| 4984 | + init_max_uint_types<std::uint64_t>()); |
| 4985 | + std::fill(std::begin(row.gfx_below_host_limit_thm_acc), std::end(row.gfx_below_host_limit_thm_acc), |
| 4986 | + init_max_uint_types<std::uint64_t>()); |
| 4987 | + std::fill(std::begin(row.gfx_low_utilization_acc), std::end(row.gfx_low_utilization_acc), |
| 4988 | + init_max_uint_types<std::uint64_t>()); |
| 4989 | + std::fill(std::begin(row.gfx_below_host_limit_total_acc), std::end(row.gfx_below_host_limit_total_acc), |
| 4990 | + init_max_uint_types<std::uint64_t>()); |
| 4991 | } |
| 4992 | |
| 4993 | ss << __PRETTY_FUNCTION__ |
| 4994 | @@ -2160,6 +1847,213 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m |
| 4995 | return status_code; |
| 4996 | } |
| 4997 | |
| 4998 | +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics() |
| 4999 | +{ |
| 5000 | + std::ostringstream ss; |

005-use-debian-version-when-available.patch
Didn't we have a similar problem with another package a while back?
Getting the version from d/changelog sounds wrong, since the upstream code should have the version number somewhere.
But since the opportunity came up: you should not parse the changelog manually; there is dpkg-parsechangelog for that. For example:
$ dpkg-parsechangelog -Sversion
7.1.0-0ubuntu1