Merge ~bullwinkle-team/ubuntu/+source/rocm-smi-lib:bullwinkle/ubuntu/devel into ubuntu/+source/rocm-smi-lib:ubuntu/devel

Proposed by Talha Can Havadar
Status: Merged
Merged at revision: 401aba6b66ff49ce5c298d30193f938209edf0d7
Proposed branch: ~bullwinkle-team/ubuntu/+source/rocm-smi-lib:bullwinkle/ubuntu/devel
Merge into: ubuntu/+source/rocm-smi-lib:ubuntu/devel
Diff against target: 7412 lines (+2083/-1573)
140 files modified
.azuredevops/rocm-ci.yml (+4/-2)
.github/CODEOWNERS (+1/-1)
.github/palamida.yml (+5/-0)
.github/workflows/kws-caller.yml (+15/-0)
.github/workflows/rocm_ci_caller.yml (+16/-10)
CHANGELOG.md (+89/-0)
CMakeLists.txt (+26/-42)
LICENSE.md (+1/-2)
README.md (+1/-1)
cmake_modules/help_package.cmake (+1/-1)
cmake_modules/utils.cmake (+1/-1)
debian/changelog (+36/-0)
debian/control (+8/-6)
debian/liboam7.install (+2/-0)
debian/liboam7.symbols.amd64 (+1/-1)
debian/librocm-smi64-7.install (+2/-0)
debian/librocm-smi64-7.symbols.amd64 (+1/-1)
debian/not-installed (+1/-1)
debian/patches/0002-add-version-script-to-control-exposed-symbols.patch (+1/-1)
debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch (+45/-0)
debian/patches/series (+1/-0)
debian/rules (+7/-0)
dev/null (+0/-200)
docs/conf.py (+1/-1)
docs/license.md (+1/-1)
include/rocm_smi/rocm_smi.h (+34/-18)
include/rocm_smi/rocm_smi_common.h (+1/-1)
include/rocm_smi/rocm_smi_counters.h (+1/-1)
include/rocm_smi/rocm_smi_device.h (+1/-1)
include/rocm_smi/rocm_smi_exception.h (+1/-1)
include/rocm_smi/rocm_smi_gpu_metrics.h (+182/-64)
include/rocm_smi/rocm_smi_io_link.h (+1/-1)
include/rocm_smi/rocm_smi_kfd.h (+1/-1)
include/rocm_smi/rocm_smi_logger.h (+1/-1)
include/rocm_smi/rocm_smi_main.h (+1/-1)
include/rocm_smi/rocm_smi_monitor.h (+1/-1)
include/rocm_smi/rocm_smi_power_mon.h (+1/-1)
include/rocm_smi/rocm_smi_properties.h (+1/-1)
include/rocm_smi/rocm_smi_utils.h (+4/-1)
oam/CMakeLists.txt (+4/-2)
oam/src/oamConfig.in (+2/-2)
python_smi_tools/README.md (+1/-1)
python_smi_tools/rocm_smi.py (+176/-88)
python_smi_tools/rsmiBindings.py (+28/-5)
python_smi_tools/rsmiBindings.py.in (+1/-1)
python_smi_tools/rsmiBindingsInit.py.in (+1/-1)
rocm_smi/CMakeLists.txt (+3/-5)
rocm_smi/example/rocm_smi_example.cc (+50/-1)
src/rocm_smi.cc (+397/-156)
src/rocm_smi64Config.in (+1/-1)
src/rocm_smi_counters.cc (+1/-1)
src/rocm_smi_device.cc (+9/-7)
src/rocm_smi_gpu_metrics.cc (+565/-744)
src/rocm_smi_io_link.cc (+1/-1)
src/rocm_smi_kfd.cc (+3/-3)
src/rocm_smi_logger.cc (+1/-1)
src/rocm_smi_main.cc (+1/-1)
src/rocm_smi_monitor.cc (+10/-3)
src/rocm_smi_properties.cc (+1/-1)
src/rocm_smi_utils.cc (+56/-25)
tests/rocm_smi_test/functional/api_support_read.cc (+1/-1)
tests/rocm_smi_test/functional/api_support_read.h (+1/-1)
tests/rocm_smi_test/functional/computepartition_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/computepartition_read_write.h (+1/-1)
tests/rocm_smi_test/functional/err_cnt_read.cc (+1/-1)
tests/rocm_smi_test/functional/err_cnt_read.h (+1/-1)
tests/rocm_smi_test/functional/evt_notif_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/evt_notif_read_write.h (+1/-1)
tests/rocm_smi_test/functional/fan_read.cc (+7/-8)
tests/rocm_smi_test/functional/fan_read.h (+1/-1)
tests/rocm_smi_test/functional/fan_read_write.cc (+5/-1)
tests/rocm_smi_test/functional/fan_read_write.h (+1/-1)
tests/rocm_smi_test/functional/frequencies_read.cc (+1/-1)
tests/rocm_smi_test/functional/frequencies_read.h (+1/-1)
tests/rocm_smi_test/functional/frequencies_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/frequencies_read_write.h (+1/-1)
tests/rocm_smi_test/functional/gpu_busy_read.cc (+3/-2)
tests/rocm_smi_test/functional/gpu_busy_read.h (+1/-1)
tests/rocm_smi_test/functional/gpu_metrics_read.cc (+46/-2)
tests/rocm_smi_test/functional/gpu_metrics_read.h (+1/-1)
tests/rocm_smi_test/functional/hw_topology_read.cc (+1/-1)
tests/rocm_smi_test/functional/hw_topology_read.h (+1/-1)
tests/rocm_smi_test/functional/id_info_read.cc (+1/-1)
tests/rocm_smi_test/functional/id_info_read.h (+1/-1)
tests/rocm_smi_test/functional/init_shutdown_refcount.cc (+1/-1)
tests/rocm_smi_test/functional/init_shutdown_refcount.h (+1/-1)
tests/rocm_smi_test/functional/measure_api_execution_time.cc (+1/-1)
tests/rocm_smi_test/functional/measure_api_execution_time.h (+1/-1)
tests/rocm_smi_test/functional/mem_page_info_read.cc (+1/-1)
tests/rocm_smi_test/functional/mem_page_info_read.h (+1/-1)
tests/rocm_smi_test/functional/mem_util_read.cc (+23/-11)
tests/rocm_smi_test/functional/mem_util_read.h (+1/-1)
tests/rocm_smi_test/functional/memorypartition_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/memorypartition_read_write.h (+1/-1)
tests/rocm_smi_test/functional/metrics_counter_read.cc (+1/-1)
tests/rocm_smi_test/functional/metrics_counter_read.h (+1/-1)
tests/rocm_smi_test/functional/mutual_exclusion.cc (+1/-1)
tests/rocm_smi_test/functional/mutual_exclusion.h (+1/-1)
tests/rocm_smi_test/functional/overdrive_read.cc (+1/-1)
tests/rocm_smi_test/functional/overdrive_read.h (+1/-1)
tests/rocm_smi_test/functional/overdrive_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/overdrive_read_write.h (+1/-1)
tests/rocm_smi_test/functional/pci_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/pci_read_write.h (+1/-1)
tests/rocm_smi_test/functional/perf_cntr_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/perf_cntr_read_write.h (+1/-1)
tests/rocm_smi_test/functional/perf_determinism.cc (+9/-1)
tests/rocm_smi_test/functional/perf_determinism.h (+1/-1)
tests/rocm_smi_test/functional/perf_level_read.cc (+10/-5)
tests/rocm_smi_test/functional/perf_level_read.h (+1/-1)
tests/rocm_smi_test/functional/perf_level_read_write.cc (+10/-5)
tests/rocm_smi_test/functional/perf_level_read_write.h (+1/-1)
tests/rocm_smi_test/functional/power_cap_read_write.cc (+14/-7)
tests/rocm_smi_test/functional/power_cap_read_write.h (+1/-1)
tests/rocm_smi_test/functional/power_read.cc (+26/-11)
tests/rocm_smi_test/functional/power_read.h (+1/-1)
tests/rocm_smi_test/functional/power_read_write.cc (+8/-15)
tests/rocm_smi_test/functional/power_read_write.h (+1/-1)
tests/rocm_smi_test/functional/process_info_read.cc (+1/-1)
tests/rocm_smi_test/functional/process_info_read.h (+1/-1)
tests/rocm_smi_test/functional/sys_info_read.cc (+11/-5)
tests/rocm_smi_test/functional/sys_info_read.h (+1/-1)
tests/rocm_smi_test/functional/temp_read.cc (+2/-2)
tests/rocm_smi_test/functional/temp_read.h (+1/-1)
tests/rocm_smi_test/functional/version_read.cc (+1/-1)
tests/rocm_smi_test/functional/version_read.h (+1/-1)
tests/rocm_smi_test/functional/volt_freq_curv_read.cc (+1/-1)
tests/rocm_smi_test/functional/volt_freq_curv_read.h (+1/-1)
tests/rocm_smi_test/functional/volt_read.cc (+2/-2)
tests/rocm_smi_test/functional/volt_read.h (+1/-1)
tests/rocm_smi_test/functional/xgmi_read_write.cc (+1/-1)
tests/rocm_smi_test/functional/xgmi_read_write.h (+1/-1)
tests/rocm_smi_test/main.cc (+1/-1)
tests/rocm_smi_test/test_base.cc (+22/-13)
tests/rocm_smi_test/test_base.h (+10/-3)
tests/rocm_smi_test/test_common.cc (+2/-1)
tests/rocm_smi_test/test_common.h (+1/-1)
tests/rocm_smi_test/test_utils.cc (+1/-1)
tests/rocm_smi_test/test_utils.h (+1/-1)
third_party/shared_mutex/shared_mutex.cc (+1/-1)
Reviewer Review Type Date Requested Status
Frank Heimes (community) Approve
Igor Luppi (community) Approve
Andreas Hasenack Pending
Ubuntu Sponsors Pending
Talha Can Havadar Pending
Review via email: mp+498272@code.launchpad.net

This proposal supersedes a proposal from 2025-12-04.

Description of the change

New upstream version 7.1.0

Tested this package in:
https://launchpad.net/~bullwinkle-team/+archive/ubuntu/rocm-devel
And here in this ppa where I experimented with upstream llvm:
https://launchpad.net/~tchavadar/+archive/ubuntu/rocm-with-llvm-21

Also built the llama.cpp snap (which indirectly depends on this package):
https://github.com/talhaHavadar/snap-llama.cpp

Functionally, things seem to be working OK.

To post a comment you must log in.
Revision history for this message
Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal
Revision history for this message
Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal
Revision history for this message
Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal
Revision history for this message
Andreas Hasenack (ahasenack) wrote : Posted in a previous version of this proposal

005-use-debian-version-when-available.patch

Didn't we have a similar problem with another package a while back?

Getting the version from d/changelog sounds wrong, since the upstream code should have the version number somewhere.

But just as an opportunity, you should not parse the changelog manually, there is dpkg-parsechangelog for that. For example:

$ dpkg-parsechangelog -Sversion
7.1.0-0ubuntu1

review: Needs Fixing
Revision history for this message
Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal
Revision history for this message
Andreas Hasenack (ahasenack) : Posted in a previous version of this proposal
Revision history for this message
Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal

Why do we have this change in rocm-smi-lib?

d/control:
```
-Architecture: linux-any
+Architecture: amd64 arm64
```

review: Needs Information
Revision history for this message
Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal

Also, about b/debian/patches/0005-use-debian-version-when-available.patch: should we make sure Debian has this fix? Other than this change, we don't have a significant diff with https://salsa.debian.org/rocm-team/rocm-smi-lib/-/tree/debian/unstable/debian?ref_type=heads

Based on our previous discussions, having an ubuntu suffix in the version would block the sync from Debian, but I don't see a real need to block the sync from Debian for rocm-smi-lib (it doesn't depend on the AMD LLVM fork).

review: Needs Information
Revision history for this message
Igor Luppi (igorluppi) wrote : Posted in a previous version of this proposal

Please @Talha, take a second review. I have changed the patch and also the soversion. Thanks!!

Revision history for this message
Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal
Revision history for this message
Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal

I just updated the branch with the fix and gonna merge this to bullwinkle/ubuntu/devel

Revision history for this message
Talha Can Havadar (tchavadar) wrote : Posted in a previous version of this proposal

Igor, can you change the proposed branch to bullwinkle/ubuntu/devel instead of bullwinkle/ubuntu/devel-7.1.0-0ubuntu1?

Revision history for this message
Igor Luppi (igorluppi) :
review: Approve
Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Created the https://launchpad.net/~tchavadar/+archive/ubuntu/lp2138653 PPA just for this package, with proposed enabled as a dependency and all architectures included, so as not to miss any possible issues.

Revision history for this message
Frank Heimes (fheimes) wrote :
Download full text (3.7 KiB)

First of all thanks for this significant work!

I have a few / the following thoughts:
- I am surprised that this ROCm package also builds for platforms other than amd64, amd64v3
  I hadn't expected that - it also builds for arm64, armhf, ppc64el and s390x
  (The previous version even for i386 and riscv64.)
  Does that make sense? Especially the librocm-"smi64" armhf ?
  In d/control I see that the architecture is 'linux-any' - might make sense to limit this more.
  (But if all these arch. are upstream supported and work fine, ignore my ignorance, but then the symbols
   file would need to be available for more than amd64.)
- I bumped into the above because there is only a liboam7.symbols file for amd64.
  (Hence lintian, running on the binary DEBs, complains about the missing symbols file for all other
   architectures.)
- Then I believe that version 7.1.0-0ubuntu"1" was never uploaded outside of the PPA, right (since it's also marked with 'UNRELEASED')?
  So we could also combine/squash the changelog entries for 0ubuntu1 and 0ubuntu2 to a new 0ubuntu1
  and upload this. - just a thought.
  (It would be a bit cleaner for the archive, but I believe uploading an 0ubuntu2 would also be ok).

The only issue that I found was when I tried to get the orig-tarball, what I usually do using uscan.
But that didn't work, because the watch file seems to be broken:
$ uscan
uscan warn: debian/watch is an obsolete version 1 watch file;
   please upgrade to a higher version
   (see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
   please upgrade to a higher version
   (see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
   please upgrade to a higher version
   (see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
   please upgrade to a higher version
   (see uscan(1) for details).
uscan warn: debian/watch is an obsolete version 1 watch file;
   please upgrade to a higher version
   (see uscan(1) for details).
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Version: 5
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Source: https://api.github.com/repos/ROCm/rocm_smi_lib/tags?per_page=100
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Matching-Pattern: https://api.github.com/repos/ROCm/rocm_smi_lib/tarball/refs/tags/rocm-(?:[-_]?[Vv]?(\d[\-+\.:\~\da-zA-Z]*))
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, perhaps?
Skipping the line: Filenamemangle: s%.*/rocm-(?:[-_]?[Vv]?(\d[\-+\.:\~\da-zA-Z]*))%rocm-smi-lib-$1.tar.gz%
uscan warn: there appears to be a version 2 format line in
the version 1 watch file debian/watch;
Have you forgotten a 'version=2' line at the start, ...

Read more...

review: Needs Information
Revision history for this message
Frank Heimes (fheimes) wrote :

Good point, the issue with the watch file is likely because it's a v5, and I tried it on a noble system (rather than in a resolute container) -- good catch.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Hello Frank,

Thank you very much for your time. Yes, you are absolutely right to question the arch changes.

Please see the original content of the debian/control of the package here: https://salsa.debian.org/rocm-team/rocm-smi-lib/-/blob/debian/unstable/debian/control?ref_type=heads

I believe there was a glitch or bug in our Debian sync bot/tool which caused all architectures to be added to this package during import. It probably replaces every `linux-any` with an explicit list of all archs.

so I believe it should stay as any.

About 0ubuntu1: yes, it was only released in the PPA. I don't think anybody really installed the package from the PPA, but it is a public PPA anyway, so to avoid breaking the update path I would like to keep 0ubuntu2. Do you want me to merge these changelog entries, keep the version as 0ubuntu2, and add a comment in the entry to state this?

Revision history for this message
Talha Can Havadar (tchavadar) wrote (last edit ):

For uscan, I think v5 only works on questing and above. So I usually use a resolute container, or find a newer version of uscan somewhere and use that locally to run v5 watch scripts.

Revision history for this message
Frank Heimes (fheimes) wrote :

Hi Talha,
thanks for the reference to salsa - looks like it was always linux-any.

Yes, the watch file version is 5, hence it works fine on a resolute system.
(Funnily enough, it would not have helped me, since it of course fetches the latest version, which is 7.1.1 - but this is a version bump to 7.1.0.)

No, if you want to stick with 0ubuntu2, I think there is no need for an update and we can go as is.

review: Approve
Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Thank you very much Frank!

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
2index dcb8198..430c585 100644
3--- a/.azuredevops/rocm-ci.yml
4+++ b/.azuredevops/rocm-ci.yml
5@@ -14,26 +14,28 @@ trigger:
6 branches:
7 include:
8 - amd-staging
9+ - amd-mainline
10 paths:
11 exclude:
12 - .github
13 - docs
14 - '.*.y*ml'
15 - '*.md'
16- - License.txt
17+ - LICENSE
18
19 pr:
20 autoCancel: true
21 branches:
22 include:
23 - amd-staging
24+ - amd-mainline
25 paths:
26 exclude:
27 - .github
28 - docs
29 - '.*.y*ml'
30 - '*.md'
31- - License.txt
32+ - LICENSE
33 drafts: false
34
35 jobs:
36diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
37index 50f8b9f..75ee317 100644
38--- a/.github/CODEOWNERS
39+++ b/.github/CODEOWNERS
40@@ -1,4 +1,4 @@
41-* @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan
42+* @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan @marifamd @gabrpham
43
44 docs/* @ROCm/rocm-documentation
45 *.md @ROCm/rocm-documentation
46diff --git a/.github/palamida.yml b/.github/palamida.yml
47new file mode 100644
48index 0000000..47bd57a
49--- /dev/null
50+++ b/.github/palamida.yml
51@@ -0,0 +1,5 @@
52+disabled: false
53+scmId: gh-emu-rocm
54+branchesToScan:
55+ - amd-staging
56+ - amd-mainline
57\ No newline at end of file
58diff --git a/.github/workflows/kws-caller.yml b/.github/workflows/kws-caller.yml
59new file mode 100644
60index 0000000..c0f4f26
61--- /dev/null
62+++ b/.github/workflows/kws-caller.yml
63@@ -0,0 +1,15 @@
64+name: Rocm Validation Suite KWS
65+on:
66+ push:
67+ branches: [amd-staging, amd-mainline]
68+ pull_request:
69+ types: [opened, synchronize, reopened]
70+ workflow_dispatch:
71+jobs:
72+ kws:
73+ if: ${{ github.event_name == 'pull_request' }}
74+ uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/kws.yml@mainline
75+ secrets: inherit
76+ with:
77+ pr_number: ${{github.event.pull_request.number}}
78+ base_branch: ${{github.base_ref}}
79diff --git a/.github/workflows/rocm_ci_caller.yml b/.github/workflows/rocm_ci_caller.yml
80index c3a28cc..9643cdf 100644
81--- a/.github/workflows/rocm_ci_caller.yml
82+++ b/.github/workflows/rocm_ci_caller.yml
83@@ -1,19 +1,25 @@
84-name: ROCm CI Caller
85-on:
86+name: ROCm CI Caller
87+on:
88 pull_request:
89- branches: [release/rocm-rel-6.4]
90+ branches: [amd-staging, release/rocm-rel-*, amd-mainline]
91 types: [opened, reopened, synchronize]
92+ push:
93+ branches: [amd-mainline]
94 workflow_dispatch:
95+ issue_comment:
96+ types: [created]
97
98 jobs:
99 call-workflow:
100- uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline
101+ if: github.event_name != 'issue_comment' ||(github.event_name == 'issue_comment' && github.event.issue.pull_request && (startsWith(github.event.comment.body, '!verify') || startsWith(github.event.comment.body, '!verify release') || startsWith(github.event.comment.body, '!verify retest')))
102+ uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline
103 secrets: inherit
104 with:
105- input_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
106- input_pr_num: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 0 }}
107- input_pr_url: ${{ github.event_name == 'pull_request' && github.event.pull_request.html_url || '' }}
108- input_pr_title: ${{ github.event_name == 'pull_request' && github.event.pull_request.title || '' }}
109+ input_sha: ${{github.event_name == 'pull_request' && github.event.pull_request.head.sha || (github.event_name == 'push' && github.sha) || (github.event_name == 'issue_comment' && github.event.issue.pull_request.head.sha) || github.sha}}
110+ input_pr_num: ${{github.event_name == 'pull_request' && github.event.pull_request.number || (github.event_name == 'issue_comment' && github.event.issue.number) || 0}}
111+ input_pr_url: ${{github.event_name == 'pull_request' && github.event.pull_request.html_url || (github.event_name == 'issue_comment' && github.event.issue.pull_request.html_url) || ''}}
112+ input_pr_title: ${{github.event_name == 'pull_request' && github.event.pull_request.title || (github.event_name == 'issue_comment' && github.event.issue.pull_request.title) || ''}}
113 repository_name: ${{ github.repository }}
114- base_ref: ${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}
115- trigger_event_type: ${{ github.event_name }}
116+ base_ref: ${{github.event_name == 'pull_request' && github.event.pull_request.base.ref || (github.event_name == 'issue_comment' && github.event.issue.pull_request.base.ref) || github.ref}}
117+ trigger_event_type: ${{ github.event_name }}
118+ comment_text: ${{ github.event_name == 'issue_comment' && github.event.comment.body || '' }}
119diff --git a/CHANGELOG.md b/CHANGELOG.md
120index 683e654..ab2eac0 100644
121--- a/CHANGELOG.md
122+++ b/CHANGELOG.md
123@@ -4,6 +4,95 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/]
124
125 ***All information listed below is for reference and subject to change.***
126
127+## rocm_smi_lib for ROCm 7.0.0
128+
129+### Added
130+
131+- **Added support for GPU metrics 1.8**.
132+ - Added new fields for `rsmi_gpu_metrics_t` including:
133+ - Adding the following metrics to allow new calculations for violation status:
134+ - Per XCP metrics `gfx_below_host_limit_ppt_acc[XCP][MAX_XCC]` - GFX Clock Host limit Package Power Tracking violation counts
135+ - Per XCP metrics `gfx_below_host_limit_thm_acc[XCP][MAX_XCC]` - GFX Clock Host limit Thermal (TVIOL) violation counts
136+ - Per XCP metrics `gfx_low_utilization_acc[XCP][MAX_XCC]` - violation counts for how did low utilization caused the GPU to be below application clocks.
137+ - Per XCP metrics `gfx_below_host_limit_total_acc[XCP][MAX_XCC]`- violation counts for how long GPU was held below application clocks any limiter (see above new violation metrics).
138+ - Increasing available JPEG engines to 40.
139+ Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI.
140+
141+### Changed
142+
143+- N/A
144+
145+### Removed
146+
147+- **Removed backwards compatibility `rsmi_dev_gpu_metrics_info_get()`'s `jpeg_activity` or `vcn_activity` fields: use `xcp_stats.jpeg_busy` or `xcp_stats.vcn_busy`**
148+ - Backwards compability is removed for `jpeg_activity` and `vcn_activity` fields, if the `jpeg_busy` or `vcn_busy` field is available.
149+ - <i>Reasons for this change</i>:
150+ - Providing both `vcn_activity`/`jpeg_activity` and XCP (partition) stats `vcn_busy`/`jpeg_busy` caused confusion for users about which field to use. By removing backward compatibility, it is easier to identify the relevant field.
151+ - The `jpeg_busy` field increased in size (for supported ASICs), making backward compatibility unable to fully copy the structure into `jpeg_activity`.
152+
153+ See below for comparison of updated CLI outputs:
154+
155+ Original output:
156+ ```shell
157+ $ rocm-smi --showmetrics
158+ GPU[0] : vcn_activity (%): [0, 'N/A', 'N/A', 'N/A']
159+ GPU[0] : jpeg_activity (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
160+ GPU[0] XCP[0] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
161+ GPU[0] XCP[1] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
162+ GPU[0] XCP[2] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
163+ GPU[0] XCP[3] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
164+ GPU[0] XCP[4] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
165+ GPU[0] XCP[5] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
166+ GPU[0] XCP[6] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
167+ GPU[0] XCP[7] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
168+ GPU[0] XCP[0] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
169+ GPU[0] XCP[1] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
170+ GPU[0] XCP[2] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
171+ GPU[0] XCP[3] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
172+ GPU[0] XCP[4] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
173+ GPU[0] XCP[5] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
174+ GPU[0] XCP[6] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
175+ GPU[0] XCP[7] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
176+ ```
177+ New output:
178+ ```shell
179+ $ rocm-smi --showmetrics
180+ GPU[0] : vcn_activity (%): ['N/A', 'N/A', 'N/A', 'N/A']
181+ GPU[0] : jpeg_activity (%): ['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
182+ GPU[0] XCP[0] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
183+ GPU[0] XCP[1] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
184+ GPU[0] XCP[2] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
185+ GPU[0] XCP[3] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
186+ GPU[0] XCP[4] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
187+ GPU[0] XCP[5] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
188+ GPU[0] XCP[6] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
189+ GPU[0] XCP[7] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
190+ GPU[0] XCP[0] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
191+ GPU[0] XCP[1] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
192+ GPU[0] XCP[2] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
193+ GPU[0] XCP[3] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
194+ GPU[0] XCP[4] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
195+ GPU[0] XCP[5] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
196+ GPU[0] XCP[6] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
197+ GPU[0] XCP[7] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
198+ ```
199+
200+### Optimized
201+
202+- N/A
203+
204+### Resolved issues
205+
206+- N/A
207+
208+### Upcoming changes
209+
210+- N/A
211+
212+### Known issues
213+
214+- N/A
215+
216 ## rocm_smi_lib for ROCm 6.4.1
217
218 ### Added
219diff --git a/CMakeLists.txt b/CMakeLists.txt
220old mode 100755
221new mode 100644
222index a374078..327cb30
223--- a/CMakeLists.txt
224+++ b/CMakeLists.txt
225@@ -5,15 +5,13 @@ message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
226 message(" CMake ROCm SMI (Library) [root] ")
227 message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
228 cmake_minimum_required(VERSION 3.14)
229+project(rocm_smi_lib)
230
231 set(ROCM_SMI_LIBS_TARGET "rocm_smi_libraries")
232
233 set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or not.")
234
235-## Set default module path if not already set
236-if(NOT DEFINED CMAKE_MODULE_PATH)
237- set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/")
238-endif()
239+list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
240 ## Include common cmake modules
241 include(utils)
242
243@@ -23,7 +21,7 @@ find_package(PkgConfig)
244 set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory")
245
246 if (NOT DEFINED CPACK_RESOURCE_FILE_LICENSE)
247- set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/License.txt")
248+ set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
249 endif()
250
251 set(ROCM_SMI "rocm_smi")
252@@ -41,10 +39,12 @@ set(SHARE_INSTALL_PREFIX
253 # provide git to utilities
254 find_program (GIT NAMES git)
255
256+# sets DRM_INCLUDE_DIRS
257+pkg_check_modules(DRM REQUIRED libdrm)
258
259 ## Setup the package version based on git tags.
260 set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver")
261-get_package_version_number("7.6.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
262+get_package_version_number("7.8.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
263 message("Package version: ${PKG_VERSION_STR}")
264 set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
265 set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
266@@ -101,7 +101,7 @@ set(CMAKE_CXX_FLAGS
267
268 # Clang does not set the build-id
269 if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
270- set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--build-id=sha1")
271+ set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--build-id=sha1")
272 endif()
273
274 # Use this instead of above for 32 bit
275@@ -135,10 +135,16 @@ else ()
276 set(CMAKE_CXX_FLAGS
277 "${CMAKE_CXX_FLAGS} -DFORTIFY_SOURCE=2 -fstack-protector-all -Wcast-align")
278 ## More security breach mitigation flags
279- set(CMAKE_CXX_FLAGS
280- "${CMAKE_CXX_FLAGS} -Wl,-z,noexecstack -Wl,-znoexecheap -Wl,-z,relro ")
281- set(CMAKE_CXX_FLAGS
282- "${CMAKE_CXX_FLAGS} -Wtrampolines -Wl,-z,now")
283+ set(HARDENING_LDFLAGS
284+ "${HARDENING_LDFLAGS} -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
285+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${HARDENING_LDFLAGS}")
286+ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HARDENING_LDFLAGS}")
287+
288+ include(CheckCXXCompilerFlag)
289+ check_cxx_compiler_flag("-Wtrampolines" CXX_SUPPORTS_WTRAMPOLINES)
290+ if (CXX_SUPPORTS_WTRAMPOLINES)
291+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wtrampolines")
292+ endif ()
293 endif ()
294
295 set(COMMON_SRC_DIR "${PROJECT_SOURCE_DIR}/src")
296@@ -197,16 +203,15 @@ set(CPACK_RPM_COMPONENT_INSTALL ON)
297 # python doesn't need to be asan
298 set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6, python3")
299 set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}")
300+set(CPACK_RPM_PACKAGE_REQUIRES "python3")
301+set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}")
302 # Only add dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON is given
303 if(ROCM_DEP_ROCMCORE)
304- set(CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}, rocm-core")
305- # rocm-core needs to be asan
306- # override original variable because CPACK_DEBIAN_PACKAGE_DEPENDS changed
307- set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}-asan")
308+ string(APPEND CPACK_DEBIAN_PACKAGE_DEPENDS ", rocm-core")
309+ string(APPEND CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS ", rocm-core-asan")
310+ string(APPEND CPACK_RPM_PACKAGE_REQUIRES ", rocm-core")
311+ string(APPEND CPACK_RPM_ASAN_PACKAGE_REQUIRES ", rocm-core-asan")
312 endif()
313-# carefully reuse DEB's "DEPENDS" for RPM's "REQUIRES"
314-set(CPACK_RPM_PACKAGE_REQUIRES "python3")
315-set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}")
316
317 #Component Specific Configuration/Flags
318 set(CPACK_DEBIAN_DEV_PACKAGE_NAME ${ROCM_SMI_PACKAGE})
319@@ -221,33 +226,12 @@ set(CPACK_RPM_STATIC_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-static-devel)
320 add_subdirectory("rocm_smi")
321 add_subdirectory("oam")
322
323-option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF)
324-
325 # Add tests
326 if(BUILD_TESTS)
327 set(TESTS_COMPONENT "tests")
328 add_subdirectory("tests/rocm_smi_test")
329 endif()
330
331-if(FILE_REORG_BACKWARD_COMPATIBILITY)
332-# To enable/disable #error in wrapper header files
333- if(NOT DEFINED ROCM_HEADER_WRAPPER_WERROR)
334- if(DEFINED ENV{ROCM_HEADER_WRAPPER_WERROR})
335- set(ROCM_HEADER_WRAPPER_WERROR "$ENV{ROCM_HEADER_WRAPPER_WERROR}"
336- CACHE STRING "Header wrapper warnings as errors.")
337- else()
338- set(ROCM_HEADER_WRAPPER_WERROR "OFF" CACHE STRING "Header wrapper warnings as errors.")
339- endif()
340- endif()
341- if(ROCM_HEADER_WRAPPER_WERROR)
342- set(deprecated_error 1)
343- else()
344- set(deprecated_error 0)
345- endif()
346-
347- include(rocm_smi-backward-compat.cmake)
348-endif()
349-
350 include(CMakePackageConfigHelpers)
351
352 set(LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}")
353@@ -293,14 +277,14 @@ install(EXPORT rocm_smiTargets
354
355 #License file
356 set(CPACK_RPM_PACKAGE_LICENSE "NCSA")
357-# install license file in share/doc/rocm_smi-asan folder
358+# install license file in share/doc/rocm-smi-lib-asan folder
359 if( ENABLE_ASAN_PACKAGING )
360 install(FILES ${CPACK_RESOURCE_FILE_LICENSE}
361- DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI}-asan RENAME LICENSE.txt
362+ DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI_PACKAGE}-asan RENAME LICENSE.md
363 COMPONENT asan)
364 endif()
365 install( FILES ${CPACK_RESOURCE_FILE_LICENSE}
366- DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI} RENAME LICENSE.txt
367+ DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI_PACKAGE} RENAME LICENSE.md
368 COMPONENT dev)
369
370 ###########################
371diff --git a/License.txt b/LICENSE.md
372similarity index 94%
373rename from License.txt
374rename to LICENSE.md
375index 31f9503..4d43ac8 100644
376--- a/License.txt
377+++ b/LICENSE.md
378@@ -1,6 +1,6 @@
379 MIT License
380
381-Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
382+Copyright (C) Advanced Micro Devices, Inc.
383
384 Permission is hereby granted, free of charge, to any person obtaining a copy
385 of this software and associated documentation files (the "Software"), to deal
386@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
387 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
388 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
389 SOFTWARE.
390-
391diff --git a/README.md b/README.md
392old mode 100755
393new mode 100644
394index 13a593a..5ef16eb
395--- a/README.md
396+++ b/README.md
397@@ -1,6 +1,6 @@
398 # 🛠️ Maintenance Mode Notice 🛠️
399
400-Starting with ROCm 6.5, only critical bug fixes will be applied to ROCm-SMI.
401+Starting with ROCm 7.0, only critical bug fixes will be applied to ROCm-SMI.
402 For a seamless experience and continued support, please switch to [AMD-SMI](https://github.com/ROCm/amdsmi).
403
404 ## Use C++ in ROCm SMI
405diff --git a/cmake_modules/help_package.cmake b/cmake_modules/help_package.cmake
406index 94f71ce..9bfb07e 100644
407--- a/cmake_modules/help_package.cmake
408+++ b/cmake_modules/help_package.cmake
409@@ -85,7 +85,7 @@ function(generic_package)
410 "${CMAKE_INSTALL_PREFIX}"
411 CACHE STRING "Default packaging prefix.")
412 set(CPACK_RESOURCE_FILE_LICENSE
413- "${CMAKE_CURRENT_SOURCE_DIR}/License.txt"
414+ "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md"
415 CACHE STRING "")
416 set(CPACK_RPM_PACKAGE_LICENSE
417 "MIT"
418diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake
419old mode 100755
420new mode 100644
421index 7131761..77aadee
422--- a/cmake_modules/utils.cmake
423+++ b/cmake_modules/utils.cmake
424@@ -3,7 +3,7 @@
425 ## The University of Illinois/NCSA
426 ## Open Source License (NCSA)
427 ##
428-## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved.
429+## Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
430 ##
431 ## Developed by:
432 ##
433diff --git a/debian/changelog b/debian/changelog
434index 41c6f20..09f56d1 100644
435--- a/debian/changelog
436+++ b/debian/changelog
437@@ -1,3 +1,39 @@
438+rocm-smi-lib (7.1.0-0ubuntu2) resolute; urgency=medium
439+
440+ [ Talha Can Havadar ]
441+ * New upstream version 7.1.0 (LP: #2138653)
442+ * d/p/0002-add-version-script-to-control-exposed-symbols.patch:
443+ Fix FTBFS, error bad value for '-march=' switch
444+
445+ [ Gennaro Oliva ]
446+ * d/control: add missing libdrm-dev for librocm-smi-lib (Closes: #1121159)
447+
448+ -- Talha Can Havadar <talha.can.havadar@canonical.com> Fri, 19 Dec 2025 08:45:45 +0100
449+
450+rocm-smi-lib (7.1.0-0ubuntu1) UNRELEASED; urgency=medium
451+
452+ [ Igor Luppi ]
453+ * d/{control,liboam1*,librocm-smi64-1*}: Update SOVERSION from 1 to 7
454+ * d/p/0002-add-version-script-to-control-exposed-symbols.patch: fix soversion
455+ * d/rules: fix soversion
456+
457+ [ Zhai Zhaoxuan ]
458+ * d/control: add pkg-config in build-depends
459+ * d/not-installed: update the name of duplicated LICENSE file
460+
461+ [ Talha Can Havadar ]
462+ * d/rules: make symbol checking strict for builds
463+ * d/patches: fix so version problem due to missing git
464+
465+ -- Igor Luppi <igor.luppi@canonical.com> Wed, 17 Dec 2025 16:21:06 -0300
466+
467+rocm-smi-lib (6.4.3-0ubuntu1) questing; urgency=medium
468+
469+ * New upstream version 6.4.3
470+ * d/control: update maintainer information
471+
472+ -- Igor Luppi <igor.luppi@canonical.com> Mon, 22 Sep 2025 15:52:03 -0300
473+
474 rocm-smi-lib (6.4.1-1.1) unstable; urgency=medium
475
476 * Non-maintainer upload.
477diff --git a/debian/control b/debian/control
478index b3d0850..eba2d34 100644
479--- a/debian/control
480+++ b/debian/control
481@@ -1,5 +1,6 @@
482 Source: rocm-smi-lib
483-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
484+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
485+XSBC-Original-Maintainer: Debian ROCm Team <debian-ai@lists.debian.org>
486 Uploaders: Mo Zhou <lumin@debian.org>,
487 Étienne Mollier <emollier@debian.org>,
488 Xuanteng Huang <xuanteng.huang@outlook.com>,
489@@ -9,6 +10,7 @@ Priority: optional
490 Build-Depends: debhelper-compat (= 13),
491 cmake,
492 libdrm-dev,
493+ pkgconf,
494 Standards-Version: 4.7.2
495 Vcs-Browser: https://salsa.debian.org/rocm-team/rocm-smi-lib
496 Vcs-Git: https://salsa.debian.org/rocm-team/rocm-smi-lib.git
497@@ -19,12 +21,12 @@ Architecture: linux-any
498 Section: utils
499 Depends: ${misc:Depends},
500 python3:any,
501- librocm-smi64-1 (= ${binary:Version})
502+ librocm-smi64-7 (= ${binary:Version})
503 Description: ROCm System Management Interface (ROCm SMI) command-line interface
504 This is the reference implementation from AMD, exposing the ROCm SMI library
505 to the user. It presents a Python executable, `rocm-smi`.
506
507-Package: librocm-smi64-1
508+Package: librocm-smi64-7
509 Architecture: linux-any
510 Section: libs
511 Depends: ${misc:Depends},
512@@ -39,9 +41,9 @@ Package: librocm-smi-dev
513 Architecture: linux-any
514 Section: libdevel
515 Depends: ${misc:Depends},
516- librocm-smi64-1 (= ${binary:Version}),
517 liboam-dev (= ${binary:Version}),
518 libdrm-dev,
519+ librocm-smi64-7 (= ${binary:Version})
520 Description: ROCm System Management Interface (ROCm SMI) library headers
521 ROCm SMI is part of the ROCm software stack. It is a C library for Linux
522 that provides a user-space interface for applications to monitor and
523@@ -49,7 +51,7 @@ Description: ROCm System Management Interface (ROCm SMI) library headers
524 .
525 This package contains the development headers.
526
527-Package: liboam1
528+Package: liboam7
529 Architecture: linux-any
530 Section: libs
531 Depends: ${misc:Depends},
532@@ -64,7 +66,7 @@ Package: liboam-dev
533 Architecture: linux-any
534 Section: libdevel
535 Depends: ${misc:Depends},
536- liboam1 (= ${binary:Version})
537+ liboam7 (= ${binary:Version})
538 Description: Datacenter flavor of a GPU system-management API headers
539 OCP Accelerator Module (OAM), is an Open Compute Project (OCP) hardware
540 standard, used in datacenters and high-performance-computing (HPC) clusters.
541diff --git a/debian/liboam1.install b/debian/liboam1.install
542deleted file mode 100644
543index d2ebc4e..0000000
544--- a/debian/liboam1.install
545+++ /dev/null
546@@ -1,2 +0,0 @@
547-usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.1 usr/lib/${DEB_HOST_MULTIARCH}/
548-usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.1.* usr/lib/${DEB_HOST_MULTIARCH}/
549diff --git a/debian/liboam7.install b/debian/liboam7.install
550new file mode 100644
551index 0000000..9c6ccd3
552--- /dev/null
553+++ b/debian/liboam7.install
554@@ -0,0 +1,2 @@
555+usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.7 usr/lib/${DEB_HOST_MULTIARCH}/
556+usr/lib/${DEB_HOST_MULTIARCH}/liboam.so.7.* usr/lib/${DEB_HOST_MULTIARCH}/
557diff --git a/debian/liboam1.symbols.amd64 b/debian/liboam7.symbols.amd64
558similarity index 99%
559rename from debian/liboam1.symbols.amd64
560rename to debian/liboam7.symbols.amd64
561index 08d444b..9348cc4 100644
562--- a/debian/liboam1.symbols.amd64
563+++ b/debian/liboam7.symbols.amd64
564@@ -1,4 +1,4 @@
565-liboam.so.1 liboam1 #MINVER#
566+liboam.so.7 liboam7 #MINVER#
567 * Build-Depends-Package: liboam-dev
568 amdoam_discover_devices@Base 4.5.2
569 amdoam_free@Base 4.5.2
570diff --git a/debian/librocm-smi64-1.install b/debian/librocm-smi64-1.install
571deleted file mode 100644
572index 9ea6b8c..0000000
573--- a/debian/librocm-smi64-1.install
574+++ /dev/null
575@@ -1,2 +0,0 @@
576-usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.1 usr/lib/${DEB_HOST_MULTIARCH}/
577-usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.1.* usr/lib/${DEB_HOST_MULTIARCH}/
578diff --git a/debian/librocm-smi64-7.install b/debian/librocm-smi64-7.install
579new file mode 100644
580index 0000000..13c97fe
581--- /dev/null
582+++ b/debian/librocm-smi64-7.install
583@@ -0,0 +1,2 @@
584+usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.7 usr/lib/${DEB_HOST_MULTIARCH}/
585+usr/lib/${DEB_HOST_MULTIARCH}/librocm_smi64.so.7.* usr/lib/${DEB_HOST_MULTIARCH}/
586diff --git a/debian/librocm-smi64-1.symbols.amd64 b/debian/librocm-smi64-7.symbols.amd64
587similarity index 99%
588rename from debian/librocm-smi64-1.symbols.amd64
589rename to debian/librocm-smi64-7.symbols.amd64
590index 4de3a4d..77834e3 100644
591--- a/debian/librocm-smi64-1.symbols.amd64
592+++ b/debian/librocm-smi64-7.symbols.amd64
593@@ -1,4 +1,4 @@
594-librocm_smi64.so.1 librocm-smi64-1 #MINVER#
595+librocm_smi64.so.7 librocm-smi64-7 #MINVER#
596 * Build-Depends-Package: librocm-smi64-dev
597 devInfoTypesStrings@Base 6.1.2
598 logFileName@Base 6.1.2
599diff --git a/debian/librocm-smi64-1.version b/debian/librocm-smi64-7.version
600similarity index 100%
601rename from debian/librocm-smi64-1.version
602rename to debian/librocm-smi64-7.version
603diff --git a/debian/not-installed b/debian/not-installed
604index 1ad2705..d87845d 100644
605--- a/debian/not-installed
606+++ b/debian/not-installed
607@@ -2,4 +2,4 @@
608 usr/oam/*
609 usr/rocm_smi/*
610 # duplicate license file
611-usr/share/doc/rocm_smi/LICENSE.txt
612+usr/share/doc/rocm-smi-lib/LICENSE.md
613diff --git a/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch b/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch
614index 4e9e6af..f21bf27 100644
615--- a/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch
616+++ b/debian/patches/0002-add-version-script-to-control-exposed-symbols.patch
617@@ -16,7 +16,7 @@ Forwarded: not-needed
618 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG")
619 endif ()
620
621-+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--version-script=${CMAKE_SOURCE_DIR}/debian/librocm-smi64-1.version")
622++set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--version-script=${CMAKE_SOURCE_DIR}/debian/librocm-smi64-7.version")
623 +
624 ## Address Sanitize Flag
625 if (${ADDRESS_SANITIZER})
626diff --git a/debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch b/debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch
627new file mode 100644
628index 0000000..4a54a1c
629--- /dev/null
630+++ b/debian/patches/0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch
631@@ -0,0 +1,45 @@
632+From: Talha Can Havadar <talha.can.havadar@canonical.com>
633+Date: Wed, 17 Dec 2025 18:14:30 +0100
634+Subject: oam: rocm_smi: fix version string issue when no git available
635+
636+Git is not available in some build environments, so the SO version
637+incorrectly defaults to 1. In such cases we can fall back to the
638+CPACK version, which is already set correctly.
639+
640+Bug: https://github.com/ROCm/rocm-systems/pull/2361
641+---
642+ oam/CMakeLists.txt | 4 ++--
643+ rocm_smi/CMakeLists.txt | 5 +++--
644+ 2 files changed, 5 insertions(+), 4 deletions(-)
645+
646+diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt
647+index 7aa1b5f..c74523e 100644
648+--- a/oam/CMakeLists.txt
649++++ b/oam/CMakeLists.txt
650+@@ -36,8 +36,8 @@ set(SO_VERSION_GIT_TAG_PREFIX "oam_so_ver")
651+ message("Package version: ${PKG_VERSION_STR}")
652+
653+ # Debian package specific variables
654+-# Set a default value for the package version
655+-get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
656++# Set a default value for the package version - use the main package version as fallback
657++get_version_from_tag("${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
658+
659+ # VERSION_* variables should be set by get_version_from_tag
660+ if ( ${ROCM_PATCH_VERSION} )
661+diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt
662+index 68c90ec..39c8848 100644
663+--- a/rocm_smi/CMakeLists.txt
664++++ b/rocm_smi/CMakeLists.txt
665+@@ -38,8 +38,9 @@ set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver")
666+ message("Package version: ${PKG_VERSION_STR}")
667+
668+ # Debian package specific variables
669+-# Set a default value for the package version
670+-get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
671++# Set a default value for the package version - use the main package version as fallback
672++get_version_from_tag("${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
673++
674+
675+ # VERSION_* variables should be set by get_version_from_tag
676+ if ( ${ROCM_PATCH_VERSION} )
677diff --git a/debian/patches/series b/debian/patches/series
678index 7b38bbb..be3307d 100644
679--- a/debian/patches/series
680+++ b/debian/patches/series
681@@ -1,3 +1,4 @@
682 0002-add-version-script-to-control-exposed-symbols.patch
683 0003-remove-example-target-using-internal-apis.patch
684 0004-revert-remove-reset-partition.patch
685+0005-oam-rocm_smi-fix-version-string-issue-when-no-git-av.patch
686diff --git a/debian/rules b/debian/rules
687index 1c468a9..ceeaa2f 100755
688--- a/debian/rules
689+++ b/debian/rules
690@@ -13,3 +13,10 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all
691
692 execute_before_dh_missing-indep:
693 rm -vf $(CURDIR)/debian/tmp/usr/bin/rocm_smi.py
694+
695+# see https://manpages.debian.org/testing/debhelper/dh_makeshlibs.1.en.html
696+# and https://manpages.debian.org/testing/dpkg-dev/dpkg-gensymbols.1.en.html
697+# Pass -c4 (the strictest level) to dpkg-gensymbols so symbol mismatches fail the build
698+override_dh_makeshlibs:
699+ dh_makeshlibs -V -plibrocm-smi64-7 -- -c4
700+ dh_makeshlibs -V -pliboam7 -- -c4
701diff --git a/docs/conf.py b/docs/conf.py
702old mode 100755
703new mode 100644
704index cfd8d87..cba57b5
705--- a/docs/conf.py
706+++ b/docs/conf.py
707@@ -29,7 +29,7 @@ shutil.copy2('../CHANGELOG.md','./CHANGELOG.md')
708 # for PDF output on Read the Docs
709 project = "ROCm SMI LIB Documentation"
710 author = "Advanced Micro Devices, Inc."
711-copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
712+copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
713 version = version_number
714 release = version_number
715
716diff --git a/docs/index.rst b/docs/index.rst
717old mode 100755
718new mode 100644
719diff --git a/docs/license.md b/docs/license.md
720index 234cd49..aaf95ff 100644
721--- a/docs/license.md
722+++ b/docs/license.md
723@@ -1,4 +1,4 @@
724 # License
725
726-```{include} ../License.txt
727+```{include} ../LICENSE.md
728 ```
729diff --git a/include/rocm_smi/kfd_ioctl.h b/include/rocm_smi/kfd_ioctl.h
730old mode 100755
731new mode 100644
732diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h
733old mode 100755
734new mode 100644
735index aa9ea97..933b35a
736--- a/include/rocm_smi/rocm_smi.h
737+++ b/include/rocm_smi/rocm_smi.h
738@@ -3,7 +3,7 @@
739 * The University of Illinois/NCSA
740 * Open Source License (NCSA)
741 *
742- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
743+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
744 * All rights reserved.
745 *
746 * Developed by:
747@@ -522,9 +522,10 @@ typedef enum {
748 typedef enum {
749 RSMI_VOLT_TYPE_FIRST = 0,
750
751- RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU
752- //!< voltage
753- RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX,
754+ RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
755+ RSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
756+
757+ RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDBOARD,
758 RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
759 } rsmi_voltage_type_t;
760
761@@ -957,6 +958,11 @@ typedef struct metrics_table_header_t metrics_table_header_t;
762 #define RSMI_MAX_NUM_JPEG_ENGS 32
763
764 /**
765+ * @brief This should match kRSMI_MAX_NUM_JPEG_ENG_V1
766+ */
767+#define RSMI_MAX_NUM_JPEG_ENG_V1 40
768+
769+/**
770 * @brief This should match kRSMI_MAX_NUM_CLKS
771 */
772 #define RSMI_MAX_NUM_CLKS 4
773@@ -1003,7 +1009,7 @@ struct amdgpu_xcp_metrics_t {
774 */
775 /* Utilization Instantaneous (%) */
776 uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC];
777- uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS];
778+ uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENG_V1];
779 uint16_t vcn_busy[RSMI_MAX_NUM_VCNS];
780
781 /* Utilization Accumulated (%) */
782@@ -1014,6 +1020,14 @@ struct amdgpu_xcp_metrics_t {
783 */
784 /* Total App Clock Counter Accumulated */
785 uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC];
786+
787+ /**
788+ * v1.8 additions
789+ */
790+ uint64_t gfx_below_host_limit_ppt_acc[RSMI_MAX_NUM_XCC];
791+ uint64_t gfx_below_host_limit_thm_acc[RSMI_MAX_NUM_XCC];
792+ uint64_t gfx_low_utilization_acc[RSMI_MAX_NUM_XCC];
793+ uint64_t gfx_below_host_limit_total_acc[RSMI_MAX_NUM_XCC];
794 };
795
796 typedef struct {
797@@ -1220,7 +1234,7 @@ typedef struct {
798 /*
799 * v1.7 additions
800 */
801- /* VRAM max bandwidth at max memory clock (GB/s) */
802+ /* VRAM max bandwidth at max memory clock */
803 uint64_t vram_max_bandwidth;
804
805 /* XGMI link status(up/down) */
806@@ -1367,8 +1381,8 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices);
807 * @brief Get the device id associated with the device with provided device
808 * index.
809 *
810- * @details Given a device index @p dv_ind and a pointer to a uint32_t @p id,
811- * this function will write the device id value to the uint64_t pointed to by
812+ * @details Given a device index @p dv_ind and a pointer to a uint16_t @p id,
813+ * this function will write the device id value to the uint16_t pointed to by
814 * @p id. This ID is an identification of the type of device, so calling this
815 * function for different devices will give the same value if they are kind
816 * of device. Consequently, this function should not be used to distinguish
817@@ -1377,7 +1391,7 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices);
818 *
819 * @param[in] dv_ind a device index
820 *
821- * @param[inout] id a pointer to uint64_t to which the device id will be
822+ * @param[inout] id a pointer to uint16_t to which the device id will be
823 * written
824 * If this parameter is nullptr, this function will return
825 * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided,
826@@ -1395,12 +1409,12 @@ rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id);
827 /**
828 * @brief Get the device revision associated with the device
829 *
830- * @details Given a device index @p dv_ind and a pointer to a uint32_t to
831+ * @details Given a device index @p dv_ind and a pointer to a uint16_t to
832 * which the revision will be written
833 *
834 * @param[in] dv_ind a device index
835 *
836- * @param[inout] revision a pointer to uint32_t to which the device revision
837+ * @param[inout] revision a pointer to uint16_t to which the device revision
838 * will be written
839 *
840 * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
841@@ -1412,14 +1426,14 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision);
842 * @brief Get the SKU for a desired device associated with the device with
843 * provided device index.
844 *
845- * @details Given a device index @p dv_ind and a pointer to a char @p sku,
846+ * @details Given a device index @p dv_ind and a pointer to a uint16_t @p sku,
847 * this function will attempt to obtain the SKU from the Product Information
848 * FRU chip, present on server ASICs. It will write the sku value to the
849- * char array pointed to by @p sku.
850+ * uint16_t pointed to by @p sku.
851 *
852 * @param[in] dv_ind a device index
853 *
854- * @param[inout] sku a pointer to char to which the sku will be written
855+ * @param[inout] sku a pointer to uint16_t to which the sku will be written
856 *
857 * If this parameter is nullptr, this function will return
858 * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided,
859@@ -1438,13 +1452,13 @@ rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku);
860 * @brief Get the device vendor id associated with the device with provided
861 * device index.
862 *
863- * @details Given a device index @p dv_ind and a pointer to a uint32_t @p id,
864- * this function will write the device vendor id value to the uint64_t pointed
865+ * @details Given a device index @p dv_ind and a pointer to a uint16_t @p id,
866+ * this function will write the device vendor id value to the uint16_t pointed
867 * to by @p id.
868 *
869 * @param[in] dv_ind a device index
870 *
871- * @param[inout] id a pointer to uint64_t to which the device vendor id will
872+ * @param[inout] id a pointer to uint16_t to which the device vendor id will
873 * be written
874 * If this parameter is nullptr, this function will return
875 * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided,
876@@ -2962,7 +2976,9 @@ rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind);
877 * If this parameter is nullptr, this function will return
878 * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided,
879 * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the
880- * provided arguments.
881+ * provided arguments. If some values are missing from or not available on
882+ * the device, the respective values will be set to
883+ * UINT64_MAX.
884 *
885 * @retval ::RSMI_STATUS_SUCCESS call was successful
886 * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
887diff --git a/include/rocm_smi/rocm_smi_common.h b/include/rocm_smi/rocm_smi_common.h
888old mode 100755
889new mode 100644
890index 618d1d6..67264f6
891--- a/include/rocm_smi/rocm_smi_common.h
892+++ b/include/rocm_smi/rocm_smi_common.h
893@@ -5,7 +5,7 @@
894 * The University of Illinois/NCSA
895 * Open Source License (NCSA)
896 *
897- * Copyright (c) 2018-2023, Advanced Micro Devices, Inc.
898+ * Copyright (c) 2018-2025, Advanced Micro Devices, Inc.
899 * All rights reserved.
900 *
901 * Developed by:
902diff --git a/include/rocm_smi/rocm_smi_counters.h b/include/rocm_smi/rocm_smi_counters.h
903old mode 100755
904new mode 100644
905index 091c89d..1447df6
906--- a/include/rocm_smi/rocm_smi_counters.h
907+++ b/include/rocm_smi/rocm_smi_counters.h
908@@ -5,7 +5,7 @@
909 * The University of Illinois/NCSA
910 * Open Source License (NCSA)
911 *
912- * Copyright (c) 2019, Advanced Micro Devices, Inc.
913+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
914 * All rights reserved.
915 *
916 * Developed by:
917diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h
918old mode 100755
919new mode 100644
920index a891a66..d00d037
921--- a/include/rocm_smi/rocm_smi_device.h
922+++ b/include/rocm_smi/rocm_smi_device.h
923@@ -3,7 +3,7 @@
924 * The University of Illinois/NCSA
925 * Open Source License (NCSA)
926 *
927- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
928+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
929 * All rights reserved.
930 *
931 * Developed by:
932diff --git a/include/rocm_smi/rocm_smi_exception.h b/include/rocm_smi/rocm_smi_exception.h
933old mode 100755
934new mode 100644
935index 7c898fb..70e4949
936--- a/include/rocm_smi/rocm_smi_exception.h
937+++ b/include/rocm_smi/rocm_smi_exception.h
938@@ -5,7 +5,7 @@
939 * The University of Illinois/NCSA
940 * Open Source License (NCSA)
941 *
942- * Copyright (c) 2018, Advanced Micro Devices, Inc.
943+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
944 * All rights reserved.
945 *
946 * Developed by:
947diff --git a/include/rocm_smi/rocm_smi_gpu_metrics.h b/include/rocm_smi/rocm_smi_gpu_metrics.h
948index 5712ea4..d9325cf 100644
949--- a/include/rocm_smi/rocm_smi_gpu_metrics.h
950+++ b/include/rocm_smi/rocm_smi_gpu_metrics.h
951@@ -1,44 +1,23 @@
952 /*
953- * =============================================================================
954- * The University of Illinois/NCSA
955- * Open Source License (NCSA)
956- *
957- * Copyright (c) 2017-2024, Advanced Micro Devices, Inc.
958- * All rights reserved.
959- *
960- * Developed by:
961- *
962- * AMD Research and AMD ROC Software Development
963- *
964- * Advanced Micro Devices, Inc.
965- *
966- * www.amd.com
967+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
968 *
969 * Permission is hereby granted, free of charge, to any person obtaining a copy
970- * of this software and associated documentation files (the "Software"), to
971- * deal with the Software without restriction, including without limitation
972- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
973- * and/or sell copies of the Software, and to permit persons to whom the
974- * Software is furnished to do so, subject to the following conditions:
975+ * of this software and associated documentation files (the "Software"), to deal
976+ * in the Software without restriction, including without limitation the rights
977+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
978+ * copies of the Software, and to permit persons to whom the Software is
979+ * furnished to do so, subject to the following conditions:
980 *
981- * - Redistributions of source code must retain the above copyright notice,
982- * this list of conditions and the following disclaimers.
983- * - Redistributions in binary form must reproduce the above copyright
984- * notice, this list of conditions and the following disclaimers in
985- * the documentation and/or other materials provided with the distribution.
986- * - Neither the names of <Name of Development Group, Name of Institution>,
987- * nor the names of its contributors may be used to endorse or promote
988- * products derived from this Software without specific prior written
989- * permission.
990+ * The above copyright notice and this permission notice shall be included in
991+ * all copies or substantial portions of the Software.
992 *
993 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
994 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
995- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
996- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
997- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
998- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
999- * DEALINGS WITH THE SOFTWARE.
1000- *
1001+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1002+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1003+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1004+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1005+ * THE SOFTWARE.
1006 */
1007
1008 #ifndef ROCM_SMI_ROCM_SMI_GPU_METRICS_H_
1009@@ -52,9 +31,12 @@
1010 #include <cassert>
1011 #include <cstdint>
1012 #include <cstring>
1013+#include <iostream>
1014 #include <string>
1015 #include <map>
1016 #include <memory>
1017+#include <mutex>
1018+#include <thread>
1019 #include <type_traits>
1020 #include <tuple>
1021 #include <variant>
1022@@ -72,10 +54,11 @@ constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_1 = 1;
1023 constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_2 = 2;
1024 constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_3 = 3;
1025 constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4 = 4;
1026+constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_8 = 8;
1027 constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MAJOR_VER
1028 = kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1;
1029-constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINON_VER
1030- = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4;
1031+constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINOR_VER
1032+ = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_8;
1033
1034
1035 // Note: This *must* match NUM_HBM_INSTANCES
1036@@ -96,6 +79,10 @@ constexpr uint32_t kRSMI_MAX_NUM_VCNS = 4;
1037 // Note: This *must* match NUM_JPEG_ENG
1038 constexpr uint32_t kRSMI_MAX_JPEG_ENGINES = 32;
1039
1040+// Note: Updated for amdgpu_xcp_metrics_v1_2.
1041+// The document names this NUM_JPEG_ENG_V1; it is renamed here to kRSMI_MAX_NUM_JPEG_ENG_V1.
1042+constexpr uint32_t kRSMI_MAX_NUM_JPEG_ENG_V1 = 40;
1043+
1044 // Note: This *must* match MAX_XCC
1045 constexpr uint32_t kRSMI_MAX_NUM_XCC = 8;
1046
1047@@ -108,6 +95,15 @@ struct AMDGpuMetricsHeader_v1_t {
1048 uint8_t m_format_revision;
1049 uint8_t m_content_revision;
1050 };
1051+struct amdgpu_xcp_metrics {
1052+ /* Utilization Instantaneous (%) */
1053+ uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
1054+ uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES];
1055+ uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS];
1056+
1057+ /* Utilization Accumulated (%) */
1058+ uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC];
1059+};
1060
1061 struct amdgpu_xcp_metrics_v1_1 {
1062 /* Utilization Instantaneous (%) */
1063@@ -122,14 +118,21 @@ struct amdgpu_xcp_metrics_v1_1 {
1064 uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC];
1065 };
1066
1067-struct amdgpu_xcp_metrics {
1068+/* new for gpu metrics v1.8 */
1069+struct amdgpu_xcp_metrics_v1_2 {
1070 /* Utilization Instantaneous (%) */
1071 uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
1072- uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES];
1073+ uint16_t jpeg_busy[kRSMI_MAX_NUM_JPEG_ENG_V1];
1074 uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS];
1075
1076 /* Utilization Accumulated (%) */
1077 uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC];
1078+
1079+ /* Total App Clock Counter Accumulated */
1080+ uint64_t gfx_below_host_limit_ppt_acc[kRSMI_MAX_NUM_XCC];
1081+ uint64_t gfx_below_host_limit_thm_acc[kRSMI_MAX_NUM_XCC];
1082+ uint64_t gfx_low_utilization_acc[kRSMI_MAX_NUM_XCC];
1083+ uint64_t gfx_below_host_limit_total_acc[kRSMI_MAX_NUM_XCC];
1084 };
1085
1086 struct AMDGpuMetricsBase_t {
1087@@ -602,7 +605,7 @@ struct AMDGpuMetrics_v17_t {
1088 uint16_t m_average_gfx_activity;
1089 uint16_t m_average_umc_activity; // memory controller
1090
1091- /* VRAM max bandwidth at max memory clock */
1092+ /* VRAM max bandwidth at max memory clock (GB/s) */
1093 uint64_t m_vram_max_bandwidth; // new for 1.7
1094
1095 /* Energy (15.259uJ (2^-16) units) */
1096@@ -685,7 +688,107 @@ struct AMDGpuMetrics_v17_t {
1097 /* PCIE other end recovery counter */
1098 uint32_t m_pcie_lc_perf_other_end_recovery;
1099 };
1100-using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t;
1101+
1102+struct AMDGpuMetrics_v18_t {
1103+ ~AMDGpuMetrics_v18_t() = default;
1104+ struct AMDGpuMetricsHeader_v1_t m_common_header;
1105+
1106+ /* Temperature (Celsius) */
1107+ uint16_t m_temperature_hotspot;
1108+ uint16_t m_temperature_mem;
1109+ uint16_t m_temperature_vrsoc;
1110+
1111+ /* Power (Watts) */
1112+ uint16_t m_current_socket_power;
1113+
1114+ /* Utilization (%) */
1115+ uint16_t m_average_gfx_activity;
1116+ uint16_t m_average_umc_activity; // memory controller
1117+
1118+ /* VRAM max bandwidth (in GB/sec) at max memory clock */
1119+ uint64_t m_mem_max_bandwidth;
1120+
1121+ /* Energy (15.259uJ (2^-16) units) */
1122+ uint64_t m_energy_accumulator;
1123+
1124+ /* Driver attached timestamp (in ns) */
1125+ uint64_t m_system_clock_counter;
1126+
1127+ /* Accumulation cycle counter */
1128+ uint32_t m_accumulation_counter;
1129+
1130+ /* Accumulated throttler residencies */
1131+ uint32_t m_prochot_residency_acc;
1132+ uint32_t m_ppt_residency_acc;
1133+ uint32_t m_socket_thm_residency_acc;
1134+ uint32_t m_vr_thm_residency_acc;
1135+ uint32_t m_hbm_thm_residency_acc;
1136+
1137+ /* Clock Lock Status. Each bit corresponds to clock instance */
1138+ uint32_t m_gfxclk_lock_status;
1139+
1140+ /* Link width (number of lanes) and speed (in 0.1 GT/s) */
1141+ uint16_t m_pcie_link_width;
1142+ uint16_t m_pcie_link_speed;
1143+
1144+ /* XGMI bus width and bitrate (in Gbps) */
1145+ uint16_t m_xgmi_link_width;
1146+ uint16_t m_xgmi_link_speed;
1147+
1148+ /* Utilization Accumulated (%) */
1149+ uint32_t m_gfx_activity_acc;
1150+ uint32_t m_mem_activity_acc;
1151+
1152+ /*PCIE accumulated bandwidth (GB/sec) */
1153+ uint64_t m_pcie_bandwidth_acc;
1154+
1155+ /*PCIE instantaneous bandwidth (GB/sec) */
1156+ uint64_t m_pcie_bandwidth_inst;
1157+
1158+ /* PCIE L0 to recovery state transition accumulated count */
1159+ uint64_t m_pcie_l0_to_recov_count_acc;
1160+
1161+ /* PCIE replay accumulated count */
1162+ uint64_t m_pcie_replay_count_acc;
1163+
1164+ /* PCIE replay rollover accumulated count */
1165+ uint64_t m_pcie_replay_rover_count_acc;
1166+
1167+ /* PCIE NAK sent accumulated count */
1168+ uint32_t m_pcie_nak_sent_count_acc;
1169+
1170+ /* PCIE NAK received accumulated count */
1171+ uint32_t m_pcie_nak_rcvd_count_acc;
1172+
1173+ /* XGMI accumulated data transfer size(KiloBytes) */
1174+ uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
1175+ uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
1176+
1177+ /* XGMI link status(active/inactive) */
1178+ uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS];
1179+
1180+ uint16_t m_padding;
1181+
1182+ /* PMFW attached timestamp (10ns resolution) */
1183+ uint64_t m_firmware_timestamp;
1184+
1185+ /* Current clocks (MHz) */
1186+ uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS];
1187+ uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS];
1188+ uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS];
1189+ uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS];
1190+ uint16_t m_current_uclk;
1191+
1192+ /* Number of current partition */
1193+ uint16_t m_num_partition;
1194+
1195+ /* XCP metrics stats */
1196+ struct amdgpu_xcp_metrics_v1_2 m_xcp_stats[kRSMI_MAX_NUM_XCP];
1197+
1198+ /* PCIE other end recovery counter */
1199+ uint32_t m_pcie_lc_perf_other_end_recovery;
1200+};
1201+using AMGpuMetricsLatest_t = AMDGpuMetrics_v18_t;
1202
1203 /**
1204 * This is GPU Metrics version that gets to public access.
1205@@ -900,11 +1003,18 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
1206 kMetricJpegBusy, // v1.6
1207 kMetricVcnBusy, // v1.6
1208 kMetricGfxBusyAcc, // v1.6
1209+ kMetricGfxBelowHostLimitAccumulator, // v1.7
1210+
1211 kMetricPcieLCPerfOtherEndRecov, // v1.6
1212
1213 kMetricVramMaxBandwidth, // v1.7
1214 kMetricXgmiLinkStatus, // v1.7
1215- kMetricGfxBelowHostLimitAccumulator, // v1.7
1216+
1217+ kMetricGfxBelowHostLimitPptAcc, // v1.8
1218+ kMetricGfxBelowHostLimitThmAcc, // v1.8
1219+ kMetricGfxBelowHostLimitTotalAcc, // v1.8
1220+ kMetricGfxLowUtilitizationAcc, // v1.8
1221+
1222 };
1223 using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map<AMDGpuMetricsUnitType_t, std::string>;
1224
1225@@ -943,6 +1053,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
1226 kGpuMetricV15 = (0x1 << 5),
1227 kGpuMetricV16 = (0x1 << 6),
1228 kGpuMetricV17 = (0x1 << 7),
1229+ kGpuMetricV18 = (0x1 << 8), // Added new version flag
1230 };
1231 using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
1232 using GpuMetricTypePtr_t = std::shared_ptr<void>;
1233@@ -952,27 +1063,24 @@ class GpuMetricsBase_t {
1234 virtual ~GpuMetricsBase_t() = default;
1235 virtual size_t sizeof_metric_table() = 0;
1236 virtual GpuMetricTypePtr_t get_metrics_table() = 0;
1237- virtual void dump_internal_metrics_table() = 0;
1238 virtual AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() = 0;
1239 virtual rsmi_status_t populate_metrics_dynamic_tbl() = 0;
1240 virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0;
1241 virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; }
1242 virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; }
1243 virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() {
1244- return m_metrics_dynamic_tbl;
1245+ return m_base_metrics_dynamic_tbl;
1246 }
1247
1248 protected:
1249- AMDGpuDynamicMetricsTbl_t m_metrics_dynamic_tbl;
1250+ AMDGpuDynamicMetricsTbl_t m_base_metrics_dynamic_tbl;
1251 uint64_t m_metrics_timestamp;
1252 uint32_t m_device_id;
1253 uint32_t m_partition_id;
1254-
1255 };
1256 using GpuMetricsBasePtr = std::shared_ptr<GpuMetricsBase_t>;
1257 using AMDGpuMetricFactories_t = const std::map<AMDGpuMetricVersionFlags_t, GpuMetricsBasePtr>;
1258
1259-
1260 class GpuMetricsBase_v11_t final : public GpuMetricsBase_t {
1261 public:
1262 virtual ~GpuMetricsBase_v11_t() = default;
1263@@ -989,10 +1097,6 @@ class GpuMetricsBase_v11_t final : public GpuMetricsBase_t {
1264 return m_gpu_metric_ptr;
1265 }
1266
1267- void dump_internal_metrics_table() override {
1268- return;
1269- }
1270-
1271 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1272 return AMDGpuMetricVersionFlags_t::kGpuMetricV11;
1273 }
1274@@ -1022,10 +1126,6 @@ class GpuMetricsBase_v12_t final : public GpuMetricsBase_t {
1275 return m_gpu_metric_ptr;
1276 }
1277
1278- void dump_internal_metrics_table() override {
1279- return;
1280- }
1281-
1282 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1283 return AMDGpuMetricVersionFlags_t::kGpuMetricV12;
1284 }
1285@@ -1054,8 +1154,6 @@ class GpuMetricsBase_v13_t final : public GpuMetricsBase_t {
1286 return (m_gpu_metric_ptr);
1287 }
1288
1289- void dump_internal_metrics_table() override;
1290-
1291 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1292 return AMDGpuMetricVersionFlags_t::kGpuMetricV13;
1293 }
1294@@ -1085,8 +1183,6 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t {
1295 return m_gpu_metric_ptr;
1296 }
1297
1298- void dump_internal_metrics_table() override;
1299-
1300 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1301 return AMDGpuMetricVersionFlags_t::kGpuMetricV14;
1302 }
1303@@ -1116,8 +1212,6 @@ class GpuMetricsBase_v15_t final : public GpuMetricsBase_t {
1304 return m_gpu_metric_ptr;
1305 }
1306
1307- void dump_internal_metrics_table() override;
1308-
1309 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1310 return AMDGpuMetricVersionFlags_t::kGpuMetricV15;
1311 }
1312@@ -1147,8 +1241,6 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t {
1313 return m_gpu_metric_ptr;
1314 }
1315
1316- void dump_internal_metrics_table() override;
1317-
1318 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1319 return AMDGpuMetricVersionFlags_t::kGpuMetricV16;
1320 }
1321@@ -1177,8 +1269,6 @@ class GpuMetricsBase_v17_t final : public GpuMetricsBase_t {
1322 return m_gpu_metric_ptr;
1323 }
1324
1325- void dump_internal_metrics_table() override;
1326-
1327 AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1328 return AMDGpuMetricVersionFlags_t::kGpuMetricV17;
1329 }
1330@@ -1191,6 +1281,34 @@ class GpuMetricsBase_v17_t final : public GpuMetricsBase_t {
1331 std::shared_ptr<AMDGpuMetrics_v17_t> m_gpu_metric_ptr;
1332 };
1333
1334+class GpuMetricsBase_v18_t final : public GpuMetricsBase_t {
1335+ public:
1336+ ~GpuMetricsBase_v18_t() = default;
1337+
1338+ size_t sizeof_metric_table() override {
1339+ return sizeof(AMDGpuMetrics_v18_t);
1340+ }
1341+
1342+ GpuMetricTypePtr_t get_metrics_table() override {
1343+ if (!m_gpu_metric_ptr) {
1344+ m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v18_t*){});
1345+ }
1346+ assert(m_gpu_metric_ptr != nullptr);
1347+ return m_gpu_metric_ptr;
1348+ }
1349+
1350+ AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
1351+ return AMDGpuMetricVersionFlags_t::kGpuMetricV18;
1352+ }
1353+
1354+ rsmi_status_t populate_metrics_dynamic_tbl() override;
1355+ AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;
1356+
1357+ private:
1358+ AMDGpuMetrics_v18_t m_gpu_metrics_tbl;
1359+ std::shared_ptr<AMDGpuMetrics_v18_t> m_gpu_metric_ptr;
1360+};
1361+
1362 template<typename T>
1363 rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind,
1364 AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
1365diff --git a/include/rocm_smi/rocm_smi_io_link.h b/include/rocm_smi/rocm_smi_io_link.h
1366old mode 100755
1367new mode 100644
1368index 5903ab9..b1265ba
1369--- a/include/rocm_smi/rocm_smi_io_link.h
1370+++ b/include/rocm_smi/rocm_smi_io_link.h
1371@@ -3,7 +3,7 @@
1372 * The University of Illinois/NCSA
1373 * Open Source License (NCSA)
1374 *
1375- * Copyright (c) 2020, Advanced Micro Devices, Inc.
1376+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
1377 * All rights reserved.
1378 *
1379 * Developed by:
1380diff --git a/include/rocm_smi/rocm_smi_kfd.h b/include/rocm_smi/rocm_smi_kfd.h
1381old mode 100755
1382new mode 100644
1383index 2759dfd..68e7403
1384--- a/include/rocm_smi/rocm_smi_kfd.h
1385+++ b/include/rocm_smi/rocm_smi_kfd.h
1386@@ -3,7 +3,7 @@
1387 * The University of Illinois/NCSA
1388 * Open Source License (NCSA)
1389 *
1390- * Copyright (c) 2019, Advanced Micro Devices, Inc.
1391+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
1392 * All rights reserved.
1393 *
1394 * Developed by:
1395diff --git a/include/rocm_smi/rocm_smi_logger.h b/include/rocm_smi/rocm_smi_logger.h
1396index f83240f..d51e487 100644
1397--- a/include/rocm_smi/rocm_smi_logger.h
1398+++ b/include/rocm_smi/rocm_smi_logger.h
1399@@ -3,7 +3,7 @@
1400 * The University of Illinois/NCSA
1401 * Open Source License (NCSA)
1402 *
1403- * Copyright (c) 2023, Advanced Micro Devices, Inc.
1404+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
1405 * All rights reserved.
1406 *
1407 * Developed by:
1408diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h
1409old mode 100755
1410new mode 100644
1411index 1d639d7..045ab2e
1412--- a/include/rocm_smi/rocm_smi_main.h
1413+++ b/include/rocm_smi/rocm_smi_main.h
1414@@ -5,7 +5,7 @@
1415 * The University of Illinois/NCSA
1416 * Open Source License (NCSA)
1417 *
1418- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
1419+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
1420 * All rights reserved.
1421 *
1422 * Developed by:
1423diff --git a/include/rocm_smi/rocm_smi_monitor.h b/include/rocm_smi/rocm_smi_monitor.h
1424old mode 100755
1425new mode 100644
1426index ad28464..f52144a
1427--- a/include/rocm_smi/rocm_smi_monitor.h
1428+++ b/include/rocm_smi/rocm_smi_monitor.h
1429@@ -5,7 +5,7 @@
1430 * The University of Illinois/NCSA
1431 * Open Source License (NCSA)
1432 *
1433- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
1434+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
1435 * All rights reserved.
1436 *
1437 * Developed by:
1438diff --git a/include/rocm_smi/rocm_smi_power_mon.h b/include/rocm_smi/rocm_smi_power_mon.h
1439old mode 100755
1440new mode 100644
1441index 71e4c08..0a8a2f4
1442--- a/include/rocm_smi/rocm_smi_power_mon.h
1443+++ b/include/rocm_smi/rocm_smi_power_mon.h
1444@@ -5,7 +5,7 @@
1445 * The University of Illinois/NCSA
1446 * Open Source License (NCSA)
1447 *
1448- * Copyright (c) 2017, Advanced Micro Devices, Inc.
1449+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
1450 * All rights reserved.
1451 *
1452 * Developed by:
1453diff --git a/include/rocm_smi/rocm_smi_properties.h b/include/rocm_smi/rocm_smi_properties.h
1454index 67d285c..0aa4f4a 100644
1455--- a/include/rocm_smi/rocm_smi_properties.h
1456+++ b/include/rocm_smi/rocm_smi_properties.h
1457@@ -3,7 +3,7 @@
1458 * The University of Illinois/NCSA
1459 * Open Source License (NCSA)
1460 *
1461- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
1462+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
1463 * All rights reserved.
1464 *
1465 * Developed by:
1466diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h
1467old mode 100755
1468new mode 100644
1469index 5263d35..42d009d
1470--- a/include/rocm_smi/rocm_smi_utils.h
1471+++ b/include/rocm_smi/rocm_smi_utils.h
1472@@ -3,7 +3,7 @@
1473 * The University of Illinois/NCSA
1474 * Open Source License (NCSA)
1475 *
1476- * Copyright (c) 2018-2023, Advanced Micro Devices, Inc.
1477+ * Copyright (c) 2018-2025, Advanced Micro Devices, Inc.
1478 * All rights reserved.
1479 *
1480 * Developed by:
1481@@ -136,6 +136,8 @@ rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind,
1482 std::string leftTrim(const std::string &s);
1483 std::string rightTrim(const std::string &s);
1484 std::string trim(const std::string &s);
1485+std::string trimAllWhiteSpace(const std::string &s);
1486+std::string removeWhitespace(const std::string &s);
1487 std::string removeNewLines(const std::string &s);
1488
1489 std::string removeString(const std::string origStr,
1490@@ -144,6 +146,7 @@ void system_wait(int milli_seconds);
1491 int countDigit(uint64_t n);
1492 std::string find_file_in_folder(const std::string& folder,
1493 const std::string& regex);
1494+uint64_t get_multiplier_from_char(char units_char);
1495 template <typename T>
1496 std::string print_int_as_hex(T i, bool showHexNotation = true,
1497 int overloadBitSize = 0) {
1498diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt
1499index 181ee1e..7aa1b5f 100644
1500--- a/oam/CMakeLists.txt
1501+++ b/oam/CMakeLists.txt
1502@@ -94,14 +94,16 @@ endif ()
1503 # use the target_include_directories() command to specify the include directories for the target
1504 target_include_directories(${OAM_TARGET}
1505 PUBLIC
1506+ "$<BUILD_INTERFACE:${DRM_INCLUDE_DIRS}>"
1507+ "$<BUILD_INTERFACE:${AMDGPU_DRM_INCLUDE_DIRS}>"
1508 "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
1509 "$<INSTALL_INTERFACE:{OAM_NAME}/include>")
1510
1511 ## Add the install directives for the runtime library.
1512 install(TARGETS ${OAM_TARGET}
1513 EXPORT rocm_smiTargets
1514- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
1515- COMPONENT dev)
1516+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev
1517+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev)
1518 install(TARGETS ${OAM_TARGET}
1519 LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
1520 COMPONENT asan)
1521diff --git a/oam/include/oam/amd_oam.h b/oam/include/oam/amd_oam.h
1522old mode 100755
1523new mode 100644
1524diff --git a/oam/include/oam/oam_mapi.h b/oam/include/oam/oam_mapi.h
1525old mode 100755
1526new mode 100644
1527diff --git a/oam/src/amd_oam.cc b/oam/src/amd_oam.cc
1528old mode 100755
1529new mode 100644
1530diff --git a/oam/src/oamConfig.in b/oam/src/oamConfig.in
1531old mode 100755
1532new mode 100644
1533index bde279c..5f5b96b
1534--- a/oam/src/oamConfig.in
1535+++ b/oam/src/oamConfig.in
1536@@ -5,7 +5,7 @@
1537 * The University of Illinois/NCSA
1538 * Open Source License (NCSA)
1539 *
1540- * Copyright (c) 2017, Advanced Micro Devices, Inc.
1541+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
1542 * All rights reserved.
1543 *
1544 * Developed by:
1545@@ -53,4 +53,4 @@
1546 #define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
1547 #define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
1548
1549-#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
1550\ No newline at end of file
1551+#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
1552diff --git a/python_smi_tools/README.md b/python_smi_tools/README.md
1553index 81d9175..0247e40 100644
1554--- a/python_smi_tools/README.md
1555+++ b/python_smi_tools/README.md
1556@@ -456,4 +456,4 @@ The information contained herein is for informational purposes only, and is subj
1557
1558 AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
1559
1560-Copyright (c) 2014-2024 Advanced Micro Devices, Inc. All rights reserved.
1561+Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All rights reserved.
1562diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py
1563index 68ab1b8..6eb22a2 100755
1564--- a/python_smi_tools/rocm_smi.py
1565+++ b/python_smi_tools/rocm_smi.py
1566@@ -17,10 +17,11 @@ import logging
1567 import os
1568 import sys
1569 import subprocess
1570-import _thread
1571+import threading
1572 import time
1573 import multiprocessing
1574 import trace
1575+from os.path import exists
1576 from io import StringIO
1577 from time import ctime
1578 from subprocess import check_output
1579@@ -49,7 +50,7 @@ except ImportError:
1580 # Minor version - Increment when adding a new feature, set to 0 when major is incremented
1581 # Patch version - Increment when adding a fix, set to 0 when minor is incremented
1582 # Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi
1583-SMI_MAJ = 3
1584+SMI_MAJ = 4
1585 SMI_MIN = 0
1586 SMI_PAT = 0
1587 # SMI_HASH is provided by rsmiBindings
1588@@ -86,6 +87,9 @@ validClockNames = clk_type_names[1:-2]
1589 validClockNames.append('pcie')
1590 validClockNames.sort()
1591
1592+# Thread stop condition
1593+stop_threads = False
1594+
1595 def driverInitialized():
1596 """ Returns true if amdgpu is found in the list of initialized modules
1597 """
1598@@ -472,7 +476,7 @@ def getAllocatedMemoryPercent(device):
1599 mem_use_pct = 0
1600 if vram_used is None:
1601 return allocated_memory_vram
1602- if vram_used != None and vram_total != None and float(vram_total) != 0:
1603+ if vram_used is not None and vram_total is not None and float(vram_total) != 0:
1604 # take floor of result (round down to nearest integer)
1605 mem_use_pct = (100 * (float(vram_used) / float(vram_total))) // 1
1606 allocated_memory_vram['value'] = mem_use_pct
1607@@ -527,7 +531,7 @@ def getProcessName(pid):
1608 except subprocess.CalledProcessError as e:
1609 pName = 'UNKNOWN'
1610
1611- if pName == None:
1612+ if pName is None:
1613 pName = 'UNKNOWN'
1614
1615 # Remove the substrings surrounding from process name (b' and \n')
1616@@ -866,13 +870,16 @@ def printEventList(device, delay, eventList):
1617 if not rsmi_ret_ok(ret, device, 'set_event_notification_mask'):
1618 printErrLog(device, 'Unable to set event notification mask.')
1619 return
1620- while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
1621+ while not stop_threads: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
1622 num_elements = c_uint32(1)
1623 data = rsmi_evt_notification_data_t(1)
1624 rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
1625 if len(data.message) > 0:
1626 print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1],
1627 data.message.decode('utf8') + '\r']])
1628+ ret = rocmsmi.rsmi_event_notification_stop(device)
1629+ if not rsmi_ret_ok(ret, device, 'stop_event_notification'):
1630+ printErrLog(device, 'Unable to end event notifications.')
1631
1632 def printLog(device, metricName, value=None, extraSpace=False, useItalics=False, xcp=None):
1633 """ Print out to the SMI log
1634@@ -915,8 +922,8 @@ def printLog(device, metricName, value=None, extraSpace=False, useItalics=False,
1635
1636 # Handle non UTF-8 locale
1637 try:
1638- print(logstr + '\n', end='')
1639- except UnicodeEncodeError:
1640+ print(logstr.encode('utf-8', 'ignore').decode('utf-8'))
1641+ except UnicodeError:
1642 print(logstr.encode('ascii', 'ignore').decode('ascii'))
1643
1644 sys.stdout.flush()
1645@@ -1086,18 +1093,12 @@ def resetClocks(deviceList):
1646 ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(0))
1647 if rsmi_ret_ok(ret, device, 'set_overdrive_level'):
1648 printLog(device, 'OverDrive set to 0', None)
1649- else:
1650- printLog(device, 'Unable to reset OverDrive', None)
1651 ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
1652 if rsmi_ret_ok(ret, device, 'set_perf_level'):
1653 printLog(device, 'Successfully reset clocks', None)
1654- else:
1655- printLog(device, 'Unable to reset clocks', None)
1656 ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
1657 if rsmi_ret_ok(ret, device, 'set_perf_level'):
1658 printLog(device, 'Performance level reset to auto', None)
1659- else:
1660- printLog(device, 'Unable to reset performance level to auto', None)
1661
1662
1663 def resetFans(deviceList):
1664@@ -1111,8 +1112,6 @@ def resetFans(deviceList):
1665 ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
1666 if rsmi_ret_ok(ret, device, silent=True):
1667 printLog(device, 'Successfully reset fan speed to driver control', None)
1668- else:
1669- printLog(device, 'Not supported on the given system', None)
1670 printLogSpacer()
1671
1672
1673@@ -1134,13 +1133,6 @@ def resetProfile(deviceList):
1674 ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString('BOOTUP DEFAULT'))
1675 if rsmi_ret_ok(ret, device, 'set_power_profile'):
1676 printLog(device, 'Successfully reset Power Profile', None)
1677- else:
1678- printErrLog(device, 'Unable to reset Power Profile')
1679- ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
1680- if rsmi_ret_ok(ret, device, 'set_perf_level'):
1681- printLog(device, 'Successfully reset Performance Level', None)
1682- else:
1683- printErrLog(device, 'Unable to reset Performance Level')
1684 printLogSpacer()
1685
1686
1687@@ -1154,8 +1146,6 @@ def resetXgmiErr(deviceList):
1688 ret = rocmsmi.rsmi_dev_xgmi_error_reset(device)
1689 if rsmi_ret_ok(ret, device, 'reset xgmi'):
1690 printLog(device, 'Successfully reset XGMI Error count', None)
1691- else:
1692- logging.error('GPU[%s]\t\t: Unable to reset XGMI error count', device)
1693 printLogSpacer()
1694
1695
1696@@ -1169,8 +1159,6 @@ def resetPerfDeterminism(deviceList):
1697 ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
1698 if rsmi_ret_ok(ret, device, 'disable performance determinism'):
1699 printLog(device, 'Successfully disabled performance determinism', None)
1700- else:
1701- logging.error('GPU[%s]\t\t: Unable to disable performance determinism', device)
1702 printLogSpacer()
1703
1704
1705@@ -1203,10 +1191,10 @@ def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
1706 if rsmi_ret_ok(ret, device, silent=True):
1707 printLog(device, 'Successfully set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue), None)
1708 else:
1709- printErrLog(device, 'Unable to set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue))
1710- RETCODE = 1
1711 if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
1712 printLog(device, 'Setting %s range is not supported for this device.' % (clkType), None)
1713+ else:
1714+ RETCODE = 1
1715
1716 def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond):
1717 """ Set the range for the specified clktype in the PowerPlay table for a list of devices.
1718@@ -1247,10 +1235,10 @@ def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond):
1719 if rsmi_ret_ok(ret, device, silent=True):
1720 printLog(device, 'Successfully set %s %s to %s(MHz)' % (level, clkType, clkValue), None)
1721 else:
1722- printErrLog(device, 'Unable to set %s %s to %s(MHz)' % (level, clkType, clkValue))
1723- RETCODE = 1
1724 if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
1725 printLog(device, 'Setting %s %s clock is not supported for this device.' % (level, clkType), None)
1726+ else:
1727+ RETCODE = 1
1728
1729
1730 def setVoltageCurve(deviceList, point, clk, volt, autoRespond):
1731@@ -1276,9 +1264,6 @@ def setVoltageCurve(deviceList, point, clk, volt, autoRespond):
1732 ret = rocmsmi.rsmi_dev_od_volt_info_set(device, int(point), int(clk), int(volt))
1733 if rsmi_ret_ok(ret, device, 'set_voltage_curve'):
1734 printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
1735- else:
1736- printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
1737- RETCODE = 1
1738
1739
1740 def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
1741@@ -1309,7 +1294,6 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
1742 if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
1743 printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
1744 else:
1745- printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
1746 RETCODE = 1
1747 elif clkType == 'mclk':
1748 ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(int(point)), int(clk),
1749@@ -1317,7 +1301,6 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
1750 if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
1751 printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
1752 else:
1753- printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
1754 RETCODE = 1
1755 else:
1756 printErrLog(device, 'Unable to set %s range' % (clkType))
1757@@ -1357,8 +1340,6 @@ def setClockOverDrive(deviceList, clktype, value, autoRespond):
1758 ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
1759 if rsmi_ret_ok(ret, device, 'set_perf_level_manual_' + str(clktype)):
1760 printLog(device, 'Performance level set to manual', None)
1761- else:
1762- printErrLog(device, 'Unable to set performance level to manual')
1763 if clktype == 'mclk':
1764 fsFile = os.path.join('/sys/class/drm', 'card%d' % (device), 'device', 'pp_mclk_od')
1765 if not os.path.isfile(fsFile):
1766@@ -1432,14 +1413,13 @@ def setClocks(deviceList, clktype, clk):
1767 if rsmi_ret_ok(ret, device, 'set_perf_level_manual'):
1768 printLog(device, 'Performance level was set to manual', None)
1769 else:
1770- printErrLog(device, 'Unable to set performance level to manual')
1771 RETCODE = 1
1772 return
1773 if clktype != 'pcie':
1774 # Validate frequency bitmask
1775 freq = rsmi_frequencies_t()
1776 ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clktype], byref(freq))
1777- if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)) == False:
1778+ if not rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)):
1779 RETCODE = 1
1780 return
1781 # The freq_bitmask should be less than 2^(freqs.num_supported)
1782@@ -1453,13 +1433,12 @@ def setClocks(deviceList, clktype, clk):
1783 if rsmi_ret_ok(ret, device, 'set_gpu_clk_freq_' + str(clktype)):
1784 printLog(device, 'Successfully set %s bitmask to' % (clktype), hex(freq_bitmask))
1785 else:
1786- printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
1787 RETCODE = 1
1788 else:
1789 # Validate the bandwidth bitmask
1790 bw = rsmi_pcie_bandwidth_t()
1791 ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
1792- if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth') == False:
1793+ if not rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'):
1794 RETCODE = 1
1795 return
1796 # The freq_bitmask should be less than 2^(bw.transfer_rate.num_supported)
1797@@ -1473,7 +1452,6 @@ def setClocks(deviceList, clktype, clk):
1798 if rsmi_ret_ok(ret, device, 'set_PCIe_bandwidth'):
1799 printLog(device, 'Successfully set %s to level bitmask' % (clktype), hex(freq_bitmask))
1800 else:
1801- printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
1802 RETCODE = 1
1803 printLogSpacer()
1804
1805@@ -1498,7 +1476,6 @@ def setPerfDeterminism(deviceList, clkvalue):
1806 if rsmi_ret_ok(ret, device, 'set_perf_determinism'):
1807 printLog(device, 'Successfully enabled performance determinism and set GFX clock frequency', str(clkvalue))
1808 else:
1809- printErrLog(device, 'Unable to set performance determinism and clock frequency to %s' % (str(clkvalue)))
1810 RETCODE = 1
1811
1812
1813@@ -1521,9 +1498,6 @@ def resetGpu(device):
1814 ret = rocmsmi.rsmi_dev_gpu_reset(resetDev)
1815 if rsmi_ret_ok(ret, resetDev, 'reset_gpu'):
1816 printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None)
1817- else:
1818- printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev))
1819- logging.debug('GPU reset failed with return value of %d' % ret)
1820 printLogSpacer()
1821
1822
1823@@ -1690,7 +1664,7 @@ def setPowerOverDrive(deviceList, value, autoRespond):
1824 new_power_cap.value = int(value) * 1000000
1825
1826 ret = rocmsmi.rsmi_dev_power_cap_range_get(device, 0, byref(power_cap_max), byref(power_cap_min))
1827- if rsmi_ret_ok(ret, device, 'get_power_cap_range') == False:
1828+ if not rsmi_ret_ok(ret, device, 'get_power_cap_range'):
1829 printErrLog(device, 'Unable to parse Power OverDrive range')
1830 RETCODE = 1
1831 continue
1832@@ -1897,7 +1871,6 @@ def setMemoryPartition(deviceList, memoryPartition, autoRespond):
1833 printLog(device, 'Issue reloading driver, please check dmsg for errors',
1834 None, addExtraLine)
1835 else:
1836- rsmi_ret_ok(ret, device, 'set_memory_partition')
1837 printErrLog(device, 'Failed to set memory partition, even though device supports it.')
1838 printLogSpacer()
1839
1840@@ -2380,7 +2353,7 @@ def getCoarseGrainUtil(device, typeName=None):
1841 """
1842 timestamp = c_uint64(0)
1843
1844- if typeName != None:
1845+ if typeName is not None:
1846
1847 try:
1848 i = utilization_counter_name.index(typeName)
1849@@ -2589,6 +2562,26 @@ def showPcieBw(deviceList):
1850 max_pkt_sz = c_uint64()
1851 printLogSpacer(' Measured PCIe Bandwidth ')
1852 for device in deviceList:
1853+ # Get BW from GPU metrics from version >= 1.5
1854+ header = metrics_table_header_t()
1855+ ret_version = rocmsmi.rsmi_dev_metrics_header_info_get(device, byref(header))
1856+ if rsmi_ret_ok(ret_version, device, 'get_metrics_header', True):
1857+ if header.format_revision >= 1 and header.content_revision >= 5:
1858+ gpu_metrics = rsmi_gpu_metrics_t()
1859+ ret = rocmsmi.rsmi_dev_gpu_metrics_info_get(device, byref(gpu_metrics))
1860+ if rsmi_ret_ok(ret, device, "get_gpu_metrics", True):
1861+ metric_bw = gpu_metrics.pcie_bandwidth_inst
1862+ if metric_bw != ctypes.c_uint64(-1).value and metric_bw > 0:
1863+ bandwidth_mbps = metric_bw / 8.0 # Convert megabits to megabytes
1864+ bwstr = f"{bandwidth_mbps:.3f}"
1865+ printLog(device, "Current PCIe bandwidth (MB/s)", bwstr)
1866+ continue
1867+ else:
1868+ printLog(device, "GPU metrics pcie_bandwidth_inst is invalid", None)
1869+ else:
1870+ printLog(device, "Failed to get GPU metrics info", None)
1871+
1872+ # Use legacy API (For GPU metric version < 1.5 or failed)
1873 ret = rocmsmi.rsmi_dev_pci_throughput_get(device, byref(sent), byref(received), byref(max_pkt_sz))
1874 if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'):
1875 # Use 1024.0 to ensure that the result is a float and not integer division
1876@@ -2707,8 +2700,6 @@ def showPower(deviceList):
1877 elif checkIfSecondaryDie(device):
1878 printLog(device, 'Average Graphics Package Power (W)', "N/A (Secondary die)")
1879 secondaryPresent=True
1880- else:
1881- printErrLog(device, 'Unable to get Average or Current Socket Graphics Package Power Consumption')
1882 if secondaryPresent:
1883 printLog(None, "\n\t\tPrimary die (usually one above or below the secondary) shows total (primary + secondary) socket power information", None)
1884 printLogSpacer()
1885@@ -2823,13 +2814,20 @@ def showRange(deviceList, rangeType):
1886 return
1887 printLogSpacer(' Show Valid %s Range ' % (rangeType))
1888 odvf = rsmi_od_volt_freq_data_t()
1889+ uint64_max = UIntegerTypes.UINT64_T
1890 for device in deviceList:
1891 ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
1892 if rsmi_ret_ok(ret, device, 'get_od_volt', silent=False):
1893 if rangeType == 'sclk':
1894+ if odvf.curr_sclk_range.lower_bound == uint64_max or odvf.curr_sclk_range.upper_bound == uint64_max:
1895+ printLog(device, 'Unable to display %s range' % (rangeType), None)
1896+ continue
1897 printLog(device, 'Valid sclk range: %sMhz - %sMhz' % (
1898 int(odvf.curr_sclk_range.lower_bound / 1000000), int(odvf.curr_sclk_range.upper_bound / 1000000)), None)
1899 if rangeType == 'mclk':
1900+ if odvf.curr_mclk_range.lower_bound == uint64_max or odvf.curr_mclk_range.upper_bound == uint64_max:
1901+ printLog(device, 'Unable to display %s range' % (rangeType), None)
1902+ continue
1903 printLog(device, 'Valid mclk range: %sMhz - %sMhz' % (
1904 int(odvf.curr_mclk_range.lower_bound / 1000000), int(odvf.curr_mclk_range.upper_bound / 1000000)), None)
1905 if rangeType == 'voltage':
1906@@ -2996,8 +2994,9 @@ def showEvents(deviceList, eventTypes):
1907 :param eventTypes: List of event type names (can be a single-item list)
1908 """
1909 printLogSpacer(' Show Events ')
1910- printLog(None, 'press \'q\' or \'ctrl + c\' to quit', None)
1911+ printLog(None, 'press \'q\' or \'ctrl + c\' and then \'Enter\' to quit', None)
1912 eventTypeList = []
1913+ thread_list = []
1914 for event in eventTypes: # Cleaning list from wrong values
1915 if event.replace(',', '').upper() in notification_type_names:
1916 eventTypeList.append(event.replace(',', '').upper())
1917@@ -3009,22 +3008,23 @@ def showEvents(deviceList, eventTypes):
1918 # Create a separate thread for each GPU
1919 for device in deviceList:
1920 try:
1921- _thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
1922+ thread = threading.Thread(target=printEventList, args=(device, 1000, eventTypeList))
1923+ thread_list.append(thread)
1924+ thread.start()
1925 time.sleep(0.25)
1926 except Exception as e:
1927 printErrLog(device, 'Unable to start new thread. %s' % (e))
1928 return
1929- while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
1930- getch = _Getch()
1931- user_input = getch()
1932+ while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' and then 'Enter'
1933+ user_input = input()
1934 # Catch user input for q or Ctrl + c
1935 if user_input == 'q' or user_input == '\x03':
1936- for device in deviceList:
1937- ret = rocmsmi.rsmi_event_notification_stop(device)
1938- if not rsmi_ret_ok(ret, device, 'stop_event_notification'):
1939- printErrLog(device, 'Unable to end event notifications.')
1940+ global stop_threads
1941+ stop_threads = True
1942 print('\r')
1943 break
1944+ for thread in thread_list:
1945+ thread.join()
1946
1947
1948 def printTempGraph(deviceList, delay, temp_type):
1949@@ -3037,7 +3037,7 @@ def printTempGraph(deviceList, delay, temp_type):
1950 for i in range(devices):
1951 printEmptyLine()
1952 originalTerminalWidth = os.get_terminal_size()[0]
1953- while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
1954+ while not stop_threads: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
1955 terminalWidth = os.get_terminal_size()[0]
1956 printStrings = list()
1957 for device in deviceList:
1958@@ -3117,19 +3117,26 @@ def showTempGraph(deviceList):
1959 deviceList.sort()
1960 temp_type = getTemperatureLabel(deviceList)
1961 printLogSpacer(' Temperature Graph ' + temp_type.capitalize() + ' ')
1962+ thread_list = []
1963 # Start a thread for constantly printing
1964 try:
1965 # Create a thread (call print function, devices, delay in ms)
1966- _thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type))
1967+ thread = threading.Thread(target=printTempGraph, args=(deviceList, 150, temp_type))
1968+ thread.start()
1969+ thread_list.append(thread)
1970 except Exception as e:
1971 printErrLog(device, 'Unable to start new thread. %s' % (e))
1972 # Catch user input for program termination
1973 while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
1974 getch = _Getch()
1975 user_input = getch()
1976+ global stop_threads
1977+ stop_threads = True;
1978 # Catch user input for q or Ctrl + c
1979 if user_input == 'q' or user_input == '\x03':
1980 break
1981+ for thread in thread_list:
1982+ thread.join()
1983 # Reset color to default before exit
1984 print('\033[A\x1b[0m\r')
1985 printLogSpacer()
1986@@ -3178,8 +3185,6 @@ def showVoltageCurve(deviceList):
1987 printLog(device, 'Voltage point %d: %sMhz %smV' % (
1988 position, int(list(odvf.curve.vc_points)[position].frequency / 1000000),
1989 int(list(odvf.curve.vc_points)[position].voltage)), None)
1990- else:
1991- printErrLog(device, 'Voltage curve Points unsupported.', is_warning=True)
1992 printLogSpacer()
1993
1994
1995@@ -3232,8 +3237,6 @@ def showAccessibleTopology(deviceList):
1996 ret = rocmsmi.rsmi_is_P2P_accessible(srcdevice, destdevice, byref(accessible))
1997 if rsmi_ret_ok(ret, metric='is_P2P_accessible'):
1998 gpu_links_type[srcdevice][destdevice] = accessible.value
1999- else:
2000- printErrLog(srcdevice, 'Cannot read link accessibility: Unsupported on this machine')
2001 if PRINT_JSON:
2002 formatMatrixToJSON(deviceList, gpu_links_type, "(Topology) Link accessibility between DRM devices {} and {}")
2003 return
2004@@ -3272,7 +3275,6 @@ def showWeightTopology(deviceList):
2005 if rsmi_ret_ok(ret, metric='get_link_weight_topology'):
2006 gpu_links_weight[srcdevice][destdevice] = weight
2007 else:
2008- printErrLog(srcdevice, 'Cannot read Link Weight: Not supported on this machine')
2009 gpu_links_weight[srcdevice][destdevice] = None
2010
2011
2012@@ -3291,7 +3293,7 @@ def showWeightTopology(deviceList):
2013 for gpu2 in deviceList:
2014 if (gpu1 == gpu2):
2015 printTableRow('%-12s', '0')
2016- elif (gpu_links_weight[gpu1][gpu2] == None):
2017+ elif (gpu_links_weight[gpu1][gpu2] is None):
2018 printTableRow('%-12s', 'N/A')
2019 else:
2020 printTableRow('%-12s', gpu_links_weight[gpu1][gpu2].value)
2021@@ -3319,7 +3321,6 @@ def showHopsTopology(deviceList):
2022 if rsmi_ret_ok(ret, metric='get_link_type_topology'):
2023 gpu_links_hops[srcdevice][destdevice] = hops
2024 else:
2025- printErrLog(srcdevice, 'Cannot read Link Hops: Not supported on this machine')
2026 gpu_links_hops[srcdevice][destdevice] = None
2027
2028 if PRINT_JSON:
2029@@ -3337,7 +3338,7 @@ def showHopsTopology(deviceList):
2030 for gpu2 in deviceList:
2031 if (gpu1 == gpu2):
2032 printTableRow('%-12s', '0')
2033- elif (gpu_links_hops[gpu1][gpu2] == None):
2034+ elif (gpu_links_hops[gpu1][gpu2] is None):
2035 printTableRow('%-12s', 'N/A')
2036 else:
2037 printTableRow('%-12s', gpu_links_hops[gpu1][gpu2].value)
2038@@ -3370,7 +3371,6 @@ def showTypeTopology(deviceList):
2039 else:
2040 gpu_links_type[srcdevice][destdevice] = "XXXX"
2041 else:
2042- printErrLog(srcdevice, 'Cannot read Link Type: Not supported on this machine')
2043 gpu_links_type[srcdevice][destdevice] = "XXXX"
2044
2045 if PRINT_JSON:
2046@@ -3406,14 +3406,10 @@ def showNumaTopology(deviceList):
2047 ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_numbers))
2048 if rsmi_ret_ok(ret, device, 'get_numa_node_number'):
2049 printLog(device, "(Topology) Numa Node", numa_numbers.value)
2050- else:
2051- printErrLog(device, "Cannot read Numa Node")
2052
2053 ret = rocmsmi.rsmi_topo_numa_affinity_get(device, byref(numa_numbers))
2054 if rsmi_ret_ok(ret, metric='get_numa_affinity_topology'):
2055 printLog(device, "(Topology) Numa Affinity", numa_numbers.value)
2056- else:
2057- printErrLog(device, 'Cannot read Numa Affinity')
2058
2059
2060 def showHwTopology(deviceList):
2061@@ -3496,8 +3492,7 @@ def showComputePartition(deviceList):
2062 elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
2063 printLog(device, 'Not supported on the given system', None)
2064 else:
2065- rsmi_ret_ok(ret, device, 'get_compute_partition')
2066- printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
2067+ printLog(device, 'Failed to retrieve compute partition, even though device supports it.')
2068 printLogSpacer()
2069
2070 def showMemoryPartition(deviceList):
2071@@ -3514,8 +3509,7 @@ def showMemoryPartition(deviceList):
2072 elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
2073 printLog(device, 'Not supported on the given system', None)
2074 else:
2075- rsmi_ret_ok(ret, device, 'get_memory_partition')
2076- printErrLog(device, 'Failed to retrieve current memory partition, even though device supports it.')
2077+ printLog(device, 'Failed to retrieve current memory partition, even though device supports it.')
2078 printLogSpacer()
2079
2080 class UIntegerTypes(IntEnum):
2081@@ -3799,6 +3793,38 @@ def showGPUMetrics(deviceList):
2082 },
2083 "xcp_stats.gfx_below_host_limit_acc": {
2084 "value": gpu_metrics.xcp_stats,
2085+ "unit": count,
2086+ },
2087+ "xcp_stats.gfx_below_host_limit_ppt_acc": {
2088+ "value": gpu_metrics.xcp_stats,
2089+ "unit": count,
2090+ },
2091+ "xcp_stats.gfx_below_host_limit_thm_acc": {
2092+ "value": gpu_metrics.xcp_stats,
2093+ "unit": count,
2094+ },
2095+ "xcp_stats.gfx_low_utilization_acc": {
2096+ "value": gpu_metrics.xcp_stats,
2097+ "unit": count,
2098+ },
2099+ "xcp_stats.gfx_below_host_limit_total_acc": {
2100+ "value": gpu_metrics.xcp_stats,
2101+ "unit": count,
2102+ },
2103+ "xcp_stats.gfx_below_host_limit_ppt_acc": {
2104+ "value": gpu_metrics.xcp_stats,
2105+ "unit": percent_unit,
2106+ },
2107+ "xcp_stats.gfx_below_host_limit_thm_acc": {
2108+ "value": gpu_metrics.xcp_stats,
2109+ "unit": percent_unit,
2110+ },
2111+ "xcp_stats.gfx_low_utilization_acc": {
2112+ "value": gpu_metrics.xcp_stats,
2113+ "unit": percent_unit,
2114+ },
2115+ "xcp_stats.gfx_below_host_limit_total_acc": {
2116+ "value": gpu_metrics.xcp_stats,
2117 "unit": percent_unit,
2118 },
2119 }
2120@@ -3841,14 +3867,37 @@ def showGPUMetrics(deviceList):
2121 for _, val in enumerate(item.gfx_below_host_limit_acc):
2122 print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
2123 printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
2124+ if 'xcp_stats.gfx_below_host_limit_ppt_acc' in k:
2125+ for curr_xcp, item in enumerate(v['value']):
2126+ print_xcp_detail = []
2127+ for _, val in enumerate(item.gfx_below_host_limit_ppt_acc):
2128+ print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
2129+ printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
2130+ if 'xcp_stats.gfx_below_host_limit_thm_acc' in k:
2131+ for curr_xcp, item in enumerate(v['value']):
2132+ print_xcp_detail = []
2133+ for _, val in enumerate(item.gfx_below_host_limit_thm_acc):
2134+ print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
2135+ printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
2136+ if 'xcp_stats.gfx_low_utilization_acc' in k:
2137+ for curr_xcp, item in enumerate(v['value']):
2138+ print_xcp_detail = []
2139+ for _, val in enumerate(item.gfx_low_utilization_acc):
2140+ print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
2141+ printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
2142+ if 'xcp_stats.gfx_below_host_limit_total_acc' in k:
2143+ for curr_xcp, item in enumerate(v['value']):
2144+ print_xcp_detail = []
2145+ for _, val in enumerate(item.gfx_below_host_limit_total_acc):
2146+ print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
2147+ printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
2148
2149 if int(device) < (len(deviceList) - 1):
2150 printLogSpacer()
2151 elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
2152 printLog(device, 'Not supported on the given system', None)
2153 else:
2154- rsmi_ret_ok(ret, device, 'get_gpu_metrics')
2155- printErrLog(device, 'Failed to retrieve GPU metrics, metric version may not be supported for this device.')
2156+ printLog(device, 'Failed to retrieve GPU metrics, metric version may not be supported for this device.')
2157 printLogSpacer()
2158
2159 def checkAmdGpus(deviceList):
2160@@ -3863,6 +3912,44 @@ def checkAmdGpus(deviceList):
2161 return False
2162
2163
2164+def check_runtime_status() -> bool:
2165+ """Check the runtime status of all AMD GPU devices managed by the amdgpu driver.
2166+
2167+ This function scans the directories under the specified path to verify the
2168+ runtime power management status of each device. It checks the "runtime_status"
2169+ file for each device to determine if the device is in an "active" state. If any
2170+ device is not in an "active" state it returns False. If the file is inaccessible,
2171+ this may be due to a system that does not support runtime power management.
2172+ Some GPUs support runtime power management, while others may not. This is why the default status
2173+ is set to True.
2174+
2175+ bool: False if any device is not in "active" state, True otherwise.
2176+ """
2177+ base_path = "/sys/class/drm"
2178+ status = True # Default to True, assuming active unless proven otherwise
2179+ for device in os.listdir(base_path):
2180+ if os.path.isdir(os.path.join(base_path, device)):
2181+ runtime_status_path = os.path.join(base_path, device, "power", "runtime_status")
2182+ try:
2183+ with open(runtime_status_path, 'r') as file:
2184+ current_status = file.read().strip()
2185+ if current_status != "active":
2186+ status = False
2187+ continue
2188+ else:
2189+ logging.debug(f"Runtime status for {device}: {current_status}")
2190+ status = True
2191+ except FileNotFoundError:
2192+ # File does not exist, skip this device
2193+ continue
2194+ except PermissionError as e:
2195+ # Handle permission errors gracefully
2196+ logging.debug(f"Permission denied while accessing {runtime_status_path} \nError: {e}")
2197+ continue
2198+ else:
2199+ pass
2200+ return status
2201+
2202 def component_str(component):
2203 """ Returns the component String value
2204
2205@@ -4374,7 +4461,7 @@ if __name__ == '__main__':
2206
2207 if not PRINT_JSON:
2208 print('\n')
2209- if not isConciseInfoRequested(args) and args.showhw == False:
2210+ if not isConciseInfoRequested(args) and not args.showhw:
2211 printLogSpacer(headerString)
2212
2213 if args.showallinfo:
2214@@ -4429,7 +4516,8 @@ if __name__ == '__main__':
2215
2216 if not checkAmdGpus(deviceList):
2217 logging.warning('No AMD GPUs specified')
2218-
2219+ if not check_runtime_status():
2220+ logging.warning('AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n')
2221 if isConciseInfoRequested(args):
2222 showAllConcise(deviceList)
2223 if args.showhw:
2224@@ -4482,7 +4570,7 @@ if __name__ == '__main__':
2225 showPcieReplayCount(deviceList)
2226 if args.showserial:
2227 showSerialNumber(deviceList)
2228- if args.showpids != None:
2229+ if args.showpids is not None:
2230 showPids(args.showpids)
2231 if args.showpidgpus or str(args.showpidgpus) == '[]':
2232 showGpusByPid(args.showpidgpus)
2233@@ -4626,10 +4714,10 @@ if __name__ == '__main__':
2234 devCsv = ''
2235 sysCsv = ''
2236 # JSON won't have any 'system' data without one of these flags
2237- if args.showdriverversion and args.showallinfo == False:
2238+ if args.showdriverversion and not args.showallinfo:
2239 sysCsv = formatCsv(['system'])
2240 print('%s' % (sysCsv))
2241- elif args.showallinfo is True:
2242+ elif args.showallinfo:
2243 sysCsv = formatCsv(['system'])
2244 devCsv = formatCsv(deviceList)
2245 print('%s\n%s' % (sysCsv, devCsv))
2246@@ -4637,8 +4725,8 @@ if __name__ == '__main__':
2247 devCsv = formatCsv(deviceList)
2248 print(devCsv)
2249
2250- if not isConciseInfoRequested(args) and args.showhw == False:
2251+ if not isConciseInfoRequested(args) and not args.showhw:
2252 printLogSpacer(footerString)
2253
2254 rsmi_ret_ok(rocmsmi.rsmi_shut_down())
2255- exit(RETCODE)
2256\ No newline at end of file
2257+ exit(RETCODE)
2258diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py
2259index 3a8d11a..69c7860 100644
2260--- a/python_smi_tools/rsmiBindings.py
2261+++ b/python_smi_tools/rsmiBindings.py
2262@@ -108,7 +108,19 @@ class rsmi_dev_perf_level_t(c_int):
2263 RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
2264
2265
2266-notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG']
2267+notification_type_names = [
2268+ 'VM_FAULT',
2269+ 'THERMAL_THROTTLE',
2270+ 'GPU_PRE_RESET',
2271+ 'GPU_POST_RESET',
2272+ 'MIGRATE_START',
2273+ 'MIGRATE_END',
2274+ 'PAGE_FAULT_START',
2275+ 'PAGE_FAULT_END',
2276+ 'QUEUE_EVICTION',
2277+ 'QUEUE_RESTORE',
2278+ 'UNMAP_FROM_GPU'
2279+]
2280
2281
2282 class rsmi_evt_notification_type_t(c_int):
2283@@ -118,8 +130,14 @@ class rsmi_evt_notification_type_t(c_int):
2284 RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
2285 RSMI_EVT_NOTIF_GPU_PRE_RESET = 3
2286 RSMI_EVT_NOTIF_GPU_POST_RESET = 4
2287- RSMI_EVT_NOTIF_RING_HANG = 5
2288- RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG
2289+ RSMI_EVT_NOTIF_MIGRATE_START = 5
2290+ RSMI_EVT_NOTIF_MIGRATE_END = 6
2291+ RSMI_EVT_NOTIF_PAGE_FAULT_START = 7
2292+ RSMI_EVT_NOTIF_PAGE_FAULT_END = 8
2293+ RSMI_EVT_NOTIF_QUEUE_EVICTION = 9
2294+ RSMI_EVT_NOTIF_QUEUE_RESTORE = 10
2295+ RSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11
2296+ RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_UNMAP_FROM_GPU
2297
2298
2299 class rsmi_voltage_metric_t(c_int):
2300@@ -545,11 +563,12 @@ class rsmi_error_count_t(Structure):
2301 _fields_ = [('correctable_err', c_uint64),
2302 ('uncorrectable_err', c_uint64)]
2303
2304+MAX_EVENT_NOTIFICATION_MSG_SIZE = 96
2305
2306 class rsmi_evt_notification_data_t(Structure):
2307 _fields_ = [('dv_ind', c_uint32),
2308 ('event', rsmi_evt_notification_type_t),
2309- ('message', c_char*64)]
2310+ ('message', c_char*MAX_EVENT_NOTIFICATION_MSG_SIZE)]
2311
2312
2313 class rsmi_process_info_t(Structure):
2314@@ -666,10 +685,14 @@ class amdgpu_xcp_metrics_t(Structure):
2315 # amdgpu_xcp_metrics_t._pack_ = 1 # source:False
2316 amdgpu_xcp_metrics_t._fields_ = [
2317 ('gfx_busy_inst', c_uint32 * 8),
2318- ('jpeg_busy', c_uint16 * 32),
2319+ ('jpeg_busy', c_uint16 * 40),
2320 ('vcn_busy', c_uint16 * 4),
2321 ('gfx_busy_acc', c_uint64 * 8),
2322 ('gfx_below_host_limit_acc', c_uint64 * 8),
2323+ ('gfx_below_host_limit_ppt_acc', c_uint64 * 8),
2324+ ('gfx_below_host_limit_thm_acc', c_uint64 * 8),
2325+ ('gfx_low_utilization_acc', c_uint64 * 8),
2326+ ('gfx_below_host_limit_total_acc', c_uint64 * 8),
2327 ]
2328 xcp_stats_t = amdgpu_xcp_metrics_t
2329
2330diff --git a/python_smi_tools/rsmiBindings.py.in b/python_smi_tools/rsmiBindings.py.in
2331index 18a8535..aaed228 100644
2332--- a/python_smi_tools/rsmiBindings.py.in
2333+++ b/python_smi_tools/rsmiBindings.py.in
2334@@ -23,7 +23,7 @@ def initRsmiBindings(silent=False):
2335 print(args)
2336
2337 rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
2338- if (rocm_smi_lib_path != None):
2339+ if (rocm_smi_lib_path is not None):
2340 path_librocm = rocm_smi_lib_path
2341 else:
2342 path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
2343diff --git a/python_smi_tools/rsmiBindingsInit.py.in b/python_smi_tools/rsmiBindingsInit.py.in
2344index 12b9218..7c75c4a 100644
2345--- a/python_smi_tools/rsmiBindingsInit.py.in
2346+++ b/python_smi_tools/rsmiBindingsInit.py.in
2347@@ -23,7 +23,7 @@ def initRsmiBindings(silent=False):
2348 print(args)
2349
2350 rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
2351- if (rocm_smi_lib_path != None):
2352+ if (rocm_smi_lib_path is not None):
2353 path_librocm = rocm_smi_lib_path
2354 else:
2355 path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
2356diff --git a/rocm_smi-backward-compat.cmake b/rocm_smi-backward-compat.cmake
2357deleted file mode 100644
2358index d53542b..0000000
2359--- a/rocm_smi-backward-compat.cmake
2360+++ /dev/null
2361@@ -1,200 +0,0 @@
2362-# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
2363-# Permission is hereby granted, free of charge, to any person obtaining a copy
2364-# of this software and associated documentation files (the "Software"), to deal
2365-# in the Software without restriction, including without limitation the rights
2366-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
2367-# copies of the Software, and to permit persons to whom the Software is
2368-# furnished to do so, subject to the following conditions:
2369-#
2370-# The above copyright notice and this permission notice shall be included in
2371-# all copies or substantial portions of the Software.
2372-#
2373-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2374-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2375-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2376-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2377-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2378-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2379-# THE SOFTWARE.
2380-
2381-cmake_minimum_required(VERSION 3.16.8)
2382-
2383-set(RSMI_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
2384-set(RSMI_WRAPPER_DIR ${RSMI_BUILD_DIR}/wrapper_dir)
2385-set(RSMI_WRAPPER_INC_DIR ${RSMI_WRAPPER_DIR}/include/${ROCM_SMI})
2386-set(OAM_TARGET_NAME "oam")
2387-set(OAM_WRAPPER_INC_DIR ${RSMI_WRAPPER_DIR}/include/${OAM_TARGET_NAME})
2388-set(RSMI_WRAPPER_LIB_DIR ${RSMI_WRAPPER_DIR}/${ROCM_SMI}/lib)
2389-set(OAM_WRAPPER_LIB_DIR ${RSMI_WRAPPER_DIR}/${OAM_TARGET_NAME}/lib)
2390-## package headers
2391-set(PUBLIC_RSMI_HEADERS
2392- rocm_smi.h
2393- ${ROCM_SMI_TARGET}Config.h
2394- kfd_ioctl.h)
2395-set(OAM_HEADERS
2396- oam_mapi.h
2397- amd_oam.h)
2398-
2399-#Function to generate header template file
2400-function(create_header_template)
2401- file(WRITE ${RSMI_WRAPPER_DIR}/header.hpp.in "/*
2402- Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
2403-
2404- Permission is hereby granted, free of charge, to any person obtaining a copy
2405- of this software and associated documentation files (the \"Software\"), to deal
2406- in the Software without restriction, including without limitation the rights
2407- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
2408- copies of the Software, and to permit persons to whom the Software is
2409- furnished to do so, subject to the following conditions:
2410-
2411- The above copyright notice and this permission notice shall be included in
2412- all copies or substantial portions of the Software.
2413-
2414- THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2415- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2416- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2417- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2418- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2419- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2420- THE SOFTWARE.
2421- */
2422-
2423-#ifndef @include_guard@
2424-#define @include_guard@
2425-
2426-#ifndef ROCM_HEADER_WRAPPER_WERROR
2427-#define ROCM_HEADER_WRAPPER_WERROR @deprecated_error@
2428-#endif
2429-#if ROCM_HEADER_WRAPPER_WERROR /* ROCM_HEADER_WRAPPER_WERROR 1 */
2430-#error \"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with @prefix_name@\"
2431-#else /* ROCM_HEADER_WRAPPER_WERROR 0 */
2432-#if defined(__GNUC__)
2433-#warning \"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with @prefix_name@\"
2434-#else
2435-#pragma message(\"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with @prefix_name@\")
2436-#endif
2437-#endif /* ROCM_HEADER_WRAPPER_WERROR */
2438-
2439-@include_statements@
2440-
2441-#endif")
2442-endfunction()
2443-
2444-#use header template file and generate wrapper header files
2445-function(generate_wrapper_header)
2446- file(MAKE_DIRECTORY ${RSMI_WRAPPER_INC_DIR})
2447- set(prefix_name "${prefix_name}${ROCM_SMI}")
2448- #Generate wrapper header files from the list
2449- foreach(header_file ${PUBLIC_RSMI_HEADERS})
2450- # set include guard
2451- get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE)
2452- string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME)
2453- set(include_guard "${include_guard}COMGR_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H")
2454- #set #include statement
2455- get_filename_component(file_name ${header_file} NAME)
2456- set(include_statements "${include_statements}#include \"../../../${CMAKE_INSTALL_INCLUDEDIR}/${ROCM_SMI}/${file_name}\"\n")
2457- configure_file(${RSMI_WRAPPER_DIR}/header.hpp.in ${RSMI_WRAPPER_INC_DIR}/${file_name})
2458- unset(include_guard)
2459- unset(include_statements)
2460- endforeach()
2461- unset(prefix_name)
2462-
2463-#OAM Wrpper Header file generation
2464- file(MAKE_DIRECTORY ${OAM_WRAPPER_INC_DIR})
2465- set(prefix_name "${prefix_name}${OAM_TARGET_NAME}")
2466- #Generate wrapper header files from the list
2467- foreach(header_file ${OAM_HEADERS})
2468- # set include guard
2469- get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE)
2470- string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME)
2471- set(include_guard "${include_guard}COMGR_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H")
2472- #set #include statement
2473- get_filename_component(file_name ${header_file} NAME)
2474- set(include_statements "${include_statements}#include \"../../../${CMAKE_INSTALL_INCLUDEDIR}/${OAM_TARGET_NAME}/${file_name}\"\n")
2475- configure_file(${RSMI_WRAPPER_DIR}/header.hpp.in ${OAM_WRAPPER_INC_DIR}/${file_name})
2476- unset(include_guard)
2477- unset(include_statements)
2478- endforeach()
2479- unset(prefix_name)
2480-
2481-endfunction()
2482-
2483-#function to create symlink to libraries
2484-function(create_library_symlink)
2485-
2486- file(MAKE_DIRECTORY ${RSMI_WRAPPER_LIB_DIR})
2487- if(BUILD_SHARED_LIBS)
2488-
2489- #get rsmi lib versions
2490- set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver")
2491- get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
2492- if(${ROCM_PATCH_VERSION})
2493- set(VERSION_PATCH ${ROCM_PATCH_VERSION})
2494- set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
2495- else()
2496- set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
2497- endif()
2498-
2499- #link RSMI library files
2500- set(LIB_RSMI "${ROCM_SMI_LIB_NAME}.so")
2501- set(library_files "${LIB_RSMI}" "${LIB_RSMI}.${VERSION_MAJOR}" "${LIB_RSMI}.${SO_VERSION_STRING}")
2502- else()
2503- set(LIB_RSMI "${ROCM_SMI_LIB_NAME}.a")
2504- set(library_files "${LIB_RSMI}")
2505- endif()
2506-
2507- foreach(file_name ${library_files})
2508- add_custom_target(link_${file_name} ALL
2509- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
2510- COMMAND ${CMAKE_COMMAND} -E create_symlink
2511- ../../${CMAKE_INSTALL_LIBDIR}/${file_name} ${RSMI_WRAPPER_LIB_DIR}/${file_name})
2512- endforeach()
2513-
2514- file(MAKE_DIRECTORY ${OAM_WRAPPER_LIB_DIR})
2515- if(BUILD_SHARED_LIBS)
2516-
2517- #get OAM lib versions
2518- set(SO_VERSION_GIT_TAG_PREFIX "oam_so_ver")
2519- get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
2520- if(${ROCM_PATCH_VERSION})
2521- set(VERSION_PATCH ${ROCM_PATCH_VERSION})
2522- set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
2523- else()
2524- set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
2525- endif()
2526-
2527- #link OAM library files
2528- set(LIB_OAM "lib${OAM_TARGET_NAME}.so")
2529- set(library_files "${LIB_OAM}" "${LIB_OAM}.${VERSION_MAJOR}" "${LIB_OAM}.${SO_VERSION_STRING}")
2530- else()
2531- set(LIB_OAM "lib${OAM_TARGET_NAME}.a")
2532- set(library_files "${LIB_OAM}")
2533- endif()
2534-
2535- foreach(file_name ${library_files})
2536- add_custom_target(link_${file_name} ALL
2537- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
2538- COMMAND ${CMAKE_COMMAND} -E create_symlink
2539- ../../${CMAKE_INSTALL_LIBDIR}/${file_name} ${OAM_WRAPPER_LIB_DIR}/${file_name})
2540- endforeach()
2541-
2542-endfunction()
2543-
2544-#Creater a template for header file
2545-create_header_template()
2546-#Use template header file and generater wrapper header files
2547-generate_wrapper_header()
2548-install(DIRECTORY ${RSMI_WRAPPER_INC_DIR}
2549- DESTINATION ${ROCM_SMI}/include
2550- COMPONENT dev)
2551-install(DIRECTORY ${OAM_WRAPPER_INC_DIR}
2552- DESTINATION ${OAM_TARGET_NAME}/include
2553- COMPONENT dev)
2554-# Create symlink to library files
2555-create_library_symlink()
2556-install(DIRECTORY ${RSMI_WRAPPER_LIB_DIR}
2557- DESTINATION ${ROCM_SMI}
2558- COMPONENT dev)
2559-install(DIRECTORY ${OAM_WRAPPER_LIB_DIR}
2560- DESTINATION ${OAM_TARGET_NAME}
2561- COMPONENT dev )
2562diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt
2563old mode 100755
2564new mode 100644
2565index 257309b..23485ae
2566--- a/rocm_smi/CMakeLists.txt
2567+++ b/rocm_smi/CMakeLists.txt
2568@@ -88,15 +88,13 @@ target_include_directories(${ROCM_SMI_TARGET} PRIVATE
2569 # use the target_include_directories() command to specify the include directories for the target
2570 target_include_directories(${ROCM_SMI_TARGET}
2571 PUBLIC
2572+ "$<BUILD_INTERFACE:${DRM_INCLUDE_DIRS}>"
2573+ "$<BUILD_INTERFACE:${AMDGPU_DRM_INCLUDE_DIRS}>"
2574 "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
2575 "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
2576 )
2577
2578-if(FILE_REORG_BACKWARD_COMPATIBILITY)
2579- target_include_directories(${ROCM_SMI_TARGET}
2580- PUBLIC
2581- "$<INSTALL_INTERFACE:${ROCM_SMI}/include>")
2582-endif()
2583+target_include_directories(${ROCM_SMI_TARGET} INTERFACE ${DRM_INCLUDE_DIRS})
2584
2585 ## Set the VERSION and SOVERSION values
2586 set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY
2587diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc
2588old mode 100755
2589new mode 100644
2590index f6a3b8b..a370abe
2591--- a/rocm_smi/example/rocm_smi_example.cc
2592+++ b/rocm_smi/example/rocm_smi_example.cc
2593@@ -5,7 +5,7 @@
2594 * The University of Illinois/NCSA
2595 * Open Source License (NCSA)
2596 *
2597- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
2598+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
2599 * All rights reserved.
2600 *
2601 * Developed by:
2602@@ -991,6 +991,11 @@ int main() {
2603 std::cout << "\t -> " << std::dec << dclk << "\n";
2604 }
2605
2606+ std::cout << "\t**.jpeg_activity[] : " << std::dec << "\n";
2607+ for (const auto& jpeg : gpu_metrics.jpeg_activity) {
2608+ std::cout << "\t -> " << std::dec << jpeg << "\n";
2609+ }
2610+
2611 std::cout << std::dec << "xcp_stats.gfx_busy_inst = \n";
2612 auto xcp = 0;
2613 for (auto& row : gpu_metrics.xcp_stats) {
2614@@ -1046,6 +1051,50 @@ int main() {
2615 xcp++;
2616 }
2617
2618+ xcp = 0;
2619+ std::cout << std::dec << "xcp_stats.gfx_below_host_limit_ppt_acc = \n"; // new for 1.8
2620+ for (auto& row : gpu_metrics.xcp_stats) {
2621+ std::cout << "XCP[" << xcp << "] = " << "[ ";
2622+ std::copy(std::begin(row.gfx_below_host_limit_ppt_acc),
2623+ std::end(row.gfx_below_host_limit_ppt_acc),
2624+ amd::smi::make_ostream_joiner(&std::cout, ", "));
2625+ std::cout << " ]\n";
2626+ xcp++;
2627+ }
2628+
2629+ xcp = 0;
2630+ std::cout << std::dec << "xcp_stats.gfx_below_host_limit_thm_acc = \n"; // new for 1.8
2631+ for (auto& row : gpu_metrics.xcp_stats) {
2632+ std::cout << "XCP[" << xcp << "] = " << "[ ";
2633+ std::copy(std::begin(row.gfx_below_host_limit_thm_acc),
2634+ std::end(row.gfx_below_host_limit_thm_acc),
2635+ amd::smi::make_ostream_joiner(&std::cout, ", "));
2636+ std::cout << " ]\n";
2637+ xcp++;
2638+ }
2639+
2640+ xcp = 0;
2641+ std::cout << std::dec << "xcp_stats.gfx_low_utilization_acc = \n";
2642+ for (auto& row : gpu_metrics.xcp_stats) {
2643+ std::cout << "XCP[" << xcp << "] = " << "[ ";
2644+ std::copy(std::begin(row.gfx_low_utilization_acc),
2645+ std::end(row.gfx_low_utilization_acc),
2646+ amd::smi::make_ostream_joiner(&std::cout, ", "));
2647+ std::cout << " ]\n";
2648+ xcp++;
2649+ }
2650+
2651+ xcp = 0;
2652+ std::cout << std::dec << "xcp_stats.gfx_below_host_limit_total_acc = \n";
2653+ for (auto& row : gpu_metrics.xcp_stats) {
2654+ std::cout << "XCP[" << xcp << "] = " << "[ ";
2655+ std::copy(std::begin(row.gfx_below_host_limit_total_acc),
2656+ std::end(row.gfx_below_host_limit_total_acc),
2657+ amd::smi::make_ostream_joiner(&std::cout, ", "));
2658+ std::cout << " ]\n";
2659+ xcp++;
2660+ }
2661+
2662 std::cout << "\n";
2663 std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
2664 constexpr uint16_t kMAX_ITER_TEST = 10;
2665diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc
2666old mode 100755
2667new mode 100644
2668index 758ef3a..561798a
2669--- a/src/rocm_smi.cc
2670+++ b/src/rocm_smi.cc
2671@@ -3,7 +3,7 @@
2672 * The University of Illinois/NCSA
2673 * Open Source License (NCSA)
2674 *
2675- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
2676+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
2677 * All rights reserved.
2678 *
2679 * Developed by:
2680@@ -48,6 +48,7 @@
2681 #include <fcntl.h>
2682 #include <poll.h>
2683 #include <pthread.h>
2684+#include <inttypes.h>
2685
2686 #include <cstddef>
2687 #include <string>
2688@@ -198,39 +199,6 @@ static uint64_t freq_string_to_int(const std::vector<std::string> &freq_lines,
2689 return static_cast<uint64_t>(freq*multiplier);
2690 }
2691
2692-static void freq_volt_string_to_point(std::string in_line,
2693- rsmi_od_vddc_point_t *pt) {
2694- std::istringstream fs_vlt(in_line);
2695-
2696- assert(pt != nullptr);
2697- THROW_IF_NULLPTR_DEREF(pt)
2698-
2699- uint32_t ind;
2700- float freq;
2701- float volts;
2702- std::string junk;
2703- std::string freq_units_str;
2704- std::string volts_units_str;
2705-
2706- fs_vlt >> ind;
2707- fs_vlt >> junk; // colon
2708- fs_vlt >> freq;
2709- fs_vlt >> freq_units_str;
2710- fs_vlt >> volts;
2711- fs_vlt >> volts_units_str;
2712-
2713- if (freq < 0) {
2714- throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__);
2715- }
2716-
2717- long double multiplier = get_multiplier_from_str(freq_units_str[0]);
2718-
2719- pt->frequency = static_cast<uint64_t>(freq*multiplier);
2720-
2721- multiplier = get_multiplier_from_str(volts_units_str[0]);
2722- pt->voltage = static_cast<uint64_t>(volts*multiplier);
2723-}
2724-
2725 static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) {
2726 std::istringstream fs_rng(in_line);
2727
2728@@ -318,6 +286,7 @@ static rsmi_status_t get_dev_value_str(amd::smi::DevInfoTypes type,
2729
2730 return amd::smi::ErrnoToRsmiStatus(ret);
2731 }
2732+
2733 static rsmi_status_t get_dev_value_int(amd::smi::DevInfoTypes type,
2734 uint32_t dv_ind, uint64_t *val_int) {
2735 assert(val_int != nullptr);
2736@@ -369,9 +338,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
2737 return amd::smi::ErrnoToRsmiStatus(ret);
2738 }
2739
2740+ if (val_str.empty()) {
2741+ std::ostringstream ss;
2742+ ss << __PRETTY_FUNCTION__
2743+ << " | ======= end ======= "
2744+ << " | Fail "
2745+ << " | Device #: " << dv_ind
2746+ << " | Type: " << monitorTypesToString.at(type)
2747+ << " | Cause: SYSFS read was empty"
2748+ << " | Returning = "
2749+ << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
2750+ LOG_INFO(ss);
2751+ return RSMI_STATUS_UNEXPECTED_DATA;
2752+ }
2753+
2754 if (!amd::smi::IsInteger(val_str)) {
2755- std::cerr << "Expected integer value from monitor,"
2756- " but got \"" << val_str << "\"" << std::endl;
2757+ std::ostringstream ss;
2758+ ss << __PRETTY_FUNCTION__
2759+ << " | ======= end ======= "
2760+ << " | Fail "
2761+ << " | Device #: " << dv_ind
2762+ << " | Type: " << monitorTypesToString.at(type)
2763+ << " | Cause: Expected integer value from monitor, but got "<< val_str
2764+ << " | Returning = "
2765+ << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
2766+ LOG_INFO(ss);
2767 return RSMI_STATUS_UNEXPECTED_DATA;
2768 }
2769
2770@@ -398,9 +389,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
2771 return amd::smi::ErrnoToRsmiStatus(ret);
2772 }
2773
2774+ if (val_str.empty()) {
2775+ std::ostringstream ss;
2776+ ss << __PRETTY_FUNCTION__
2777+ << " | ======= end ======= "
2778+ << " | Fail "
2779+ << " | Device #: " << dv_ind
2780+ << " | Type: " << monitorTypesToString.at(type)
2781+ << " | Cause: SYSFS read was empty"
2782+ << " | Returning = "
2783+ << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
2784+ LOG_INFO(ss);
2785+ return RSMI_STATUS_UNEXPECTED_DATA;
2786+ }
2787+
2788 if (!amd::smi::IsInteger(val_str)) {
2789- std::cerr << "Expected integer value from monitor,"
2790- " but got \"" << val_str << "\"" << std::endl;
2791+ std::ostringstream ss;
2792+ ss << __PRETTY_FUNCTION__
2793+ << " | ======= end ======= "
2794+ << " | Fail "
2795+ << " | Device #: " << dv_ind
2796+ << " | Type: " << monitorTypesToString.at(type)
2797+ << " | Cause: Expected integer value from monitor, but got "<< val_str
2798+ << " | Returning = "
2799+ << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
2800+ LOG_INFO(ss);
2801 return RSMI_STATUS_UNEXPECTED_DATA;
2802 }
2803
2804@@ -735,7 +748,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
2805 fs2 >> ec->correctable_err;
2806
2807 ss << __PRETTY_FUNCTION__ << " | ======= end ======="
2808- << ", reporting " << amd::smi::getRSMIStatusString(ret);;
2809+ << ", reporting " << amd::smi::getRSMIStatusString(ret);
2810 LOG_TRACE(ss);
2811 return ret;
2812 CATCH
2813@@ -795,11 +808,15 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) {
2814 TRY
2815 rsmi_status_t ret;
2816
2817- CHK_SUPPORT_NAME_ONLY(numa_node)
2818-
2819 DEVICE_MUTEX
2820+ if (!numa_node) {
2821+ return RSMI_STATUS_INVALID_ARGS;
2822+ }
2823 std::string str_val;
2824 ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val);
2825+ if (ret != RSMI_STATUS_SUCCESS) {
2826+ return ret;
2827+ }
2828 *numa_node = std::stoi(str_val, nullptr);
2829
2830 return ret;
2831@@ -846,12 +863,46 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
2832 rsmi_status_t ret;
2833 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
2834 LOG_TRACE(ss);
2835+ if (id == nullptr) {
2836+ return RSMI_STATUS_INVALID_ARGS;
2837+ }
2838 CHK_SUPPORT_NAME_ONLY(id)
2839+ // Set the device ID to max value
2840+ *id = std::numeric_limits<uint16_t>::max();
2841
2842+ // Get the device ID from KGD
2843 ret = get_id(dv_ind, amd::smi::kDevDevID, id);
2844- ss << __PRETTY_FUNCTION__ << " | ======= end ======="
2845- << ", reporting " << amd::smi::getRSMIStatusString(ret);
2846 LOG_TRACE(ss);
2847+ ss << __PRETTY_FUNCTION__
2848+ << (ret == RSMI_STATUS_SUCCESS ?
2849+ " | No fall back needed retrieved from KGD" : " | fall back needed")
2850+ << " | Device #: " << std::to_string(dv_ind)
2851+ << " | Data: device_id = " << std::to_string(*id)
2852+ << " | ret = " << getRSMIStatusString(ret, false);
2853+ LOG_DEBUG(ss);
2854+ // If the device ID is not supported, use KFD's device ID
2855+ if (ret != RSMI_STATUS_SUCCESS) {
2856+ GET_DEV_AND_KFDNODE_FROM_INDX
2857+ uint32_t node_id;
2858+ uint64_t kfd_device_id;
2859+ int ret_kfd = kfd_node->get_node_id(&node_id);
2860+ ret_kfd = amd::smi::read_node_properties(node_id, "device_id", &kfd_device_id);
2861+ if (ret_kfd == 0) {
2862+ *id = static_cast<uint16_t>(kfd_device_id);
2863+ ret = RSMI_STATUS_SUCCESS;
2864+ } else {
2865+ *id = std::numeric_limits<uint16_t>::max();
2866+ ret = RSMI_STATUS_NOT_SUPPORTED;
2867+ }
2868+ ss << __PRETTY_FUNCTION__
2869+ << " | Issue: Could not read device from sysfs, falling back to KFD" << "\n"
2870+ << " ; Device #: " << std::to_string(dv_ind) << "\n"
2871+ << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n"
2872+ << " ; node: " << std::to_string(node_id) << "\n"
2873+ << " ; Data: device_id (from KFD)= " << std::to_string(*id) << "\n"
2874+ << " ; ret = " << getRSMIStatusString(ret, false);
2875+ LOG_DEBUG(ss);
2876+ }
2877 return ret;
2878 }
2879
2880@@ -862,6 +913,7 @@ rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) {
2881 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
2882 LOG_TRACE(ss);
2883 CHK_SUPPORT_NAME_ONLY(id)
2884+ *id = std::numeric_limits<uint16_t>::max();
2885
2886 ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id);
2887 ss << __PRETTY_FUNCTION__ << " | ======= end ======="
2888@@ -907,16 +959,54 @@ rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) {
2889 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
2890 LOG_TRACE(ss);
2891 CHK_SUPPORT_NAME_ONLY(id)
2892- return get_id(dv_ind, amd::smi::kDevSubSysDevID, id);
2893+ auto ret = get_id(dv_ind, amd::smi::kDevSubSysDevID, id);
2894+ ss << __PRETTY_FUNCTION__ << " | ======= end ======="
2895+ << ", reporting " << amd::smi::getRSMIStatusString(ret, false);
2896+ LOG_INFO(ss);
2897+ return ret;
2898 }
2899
2900 rsmi_status_t
2901 rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) {
2902+ TRY
2903 std::ostringstream ss;
2904 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
2905 LOG_TRACE(ss);
2906+ if (!id) {
2907+ return RSMI_STATUS_INVALID_ARGS;
2908+ }
2909 CHK_SUPPORT_NAME_ONLY(id)
2910- return get_id(dv_ind, amd::smi::kDevVendorID, id);
2911+ int ret_kfd = 0;
2912+ uint32_t node_id;
2913+ rsmi_status_t ret = get_id(dv_ind, amd::smi::kDevVendorID, id);
2914+ bool need_fallback = false;
2915+ if (ret != RSMI_STATUS_SUCCESS) {
2916+ need_fallback = true;
2917+ }
2918+ if (ret != RSMI_STATUS_SUCCESS) {
2919+ GET_DEV_AND_KFDNODE_FROM_INDX
2920+ uint64_t kfd_vendor_id;
2921+ ret_kfd = kfd_node->get_node_id(&node_id);
2922+ ret_kfd = amd::smi::read_node_properties(node_id, "vendor_id", &kfd_vendor_id);
2923+ if (ret_kfd == 0) {
2924+ *id = static_cast<uint16_t>(kfd_vendor_id);
2925+ ret = RSMI_STATUS_SUCCESS;
2926+ } else {
2927+ *id = std::numeric_limits<uint16_t>::max();
2928+ ret = RSMI_STATUS_NOT_SUPPORTED;
2929+ }
2930+ }
2931+ ss << __PRETTY_FUNCTION__
2932+ << (need_fallback ? " | Needed to fallback to use KFD to read vendor_id" :
2933+ " | Read through SYSFS to read vendor_id") << "\n"
2934+ << " ; Device #: " << std::to_string(dv_ind) << "\n"
2935+ << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n"
2936+ << " ; node: " << std::to_string(node_id) << "\n"
2937+ << " ; Data: vendor_id: " << std::to_string(*id) << "\n"
2938+ << " ; ret = " << getRSMIStatusString(ret, false);
2939+ LOG_INFO(ss);
2940+ return ret;
2941+ CATCH
2942 }
2943
2944 rsmi_status_t
2945@@ -936,8 +1026,11 @@ rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) {
2946 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
2947 LOG_TRACE(ss);
2948
2949- CHK_SUPPORT_NAME_ONLY(perf)
2950 DEVICE_MUTEX
2951+ if (!perf) {
2952+ return RSMI_STATUS_INVALID_ARGS;
2953+ }
2954+ CHK_SUPPORT_NAME_ONLY(perf)
2955
2956 rsmi_status_t ret = get_dev_value_str(amd::smi::kDevPerfLevel, dv_ind,
2957 &val_str);
2958@@ -1006,6 +1099,11 @@ rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od) {
2959 CHK_SUPPORT_NAME_ONLY(od)
2960 DEVICE_MUTEX
2961
2962+ // Bare Metal only feature
2963+ if (amd::smi::is_vm_guest()) {
2964+ return RSMI_STATUS_NOT_SUPPORTED;
2965+ }
2966+
2967 rsmi_status_t ret = get_dev_value_str(amd::smi::kDevOverDriveLevel, dv_ind,
2968 &val_str);
2969 if (ret != RSMI_STATUS_SUCCESS) {
2970@@ -1075,6 +1173,12 @@ rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) {
2971 if (od > kMaxOverdriveLevel) {
2972 return RSMI_STATUS_INVALID_ARGS;
2973 }
2974+
2975+ // Bare Metal only feature
2976+ if (amd::smi::is_vm_guest()) {
2977+ return RSMI_STATUS_NOT_SUPPORTED;
2978+ }
2979+
2980 DEVICE_MUTEX
2981 return set_dev_value(amd::smi::kDevOverDriveLevel, dv_ind, od);
2982 CATCH
2983@@ -1116,7 +1220,7 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_
2984 return RSMI_STATUS_INVALID_ARGS;
2985 }
2986 memset(f, 0, sizeof(rsmi_frequencies_t));
2987- f->current=0;
2988+ f->current = 0;
2989
2990 ret = GetDevValueVec(type, dv_ind, &val_vec);
2991 if (ret != RSMI_STATUS_SUCCESS) {
2992@@ -1284,6 +1388,12 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
2993 return RSMI_STATUS_INVALID_ARGS;
2994 }
2995
2996+ // Fill the current sclk/mclk ranges of p with UINT64_MAX to indicate no valid data
2997+ p->curr_sclk_range.lower_bound = UINT64_MAX;
2998+ p->curr_sclk_range.upper_bound = UINT64_MAX;
2999+ p->curr_mclk_range.lower_bound = UINT64_MAX;
3000+ p->curr_mclk_range.upper_bound = UINT64_MAX;
3001+
3002 ret = GetDevValueVec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec);
3003 if (ret != RSMI_STATUS_SUCCESS) {
3004 return ret;
3005@@ -1311,13 +1421,6 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
3006 .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN)
3007 .structure_content();
3008
3009- //
3010- // Note: We must have minimum of 'GFXCLK:' && 'MCLK:' OR:
3011- // 'OD_SCLK:' && 'OD_MCLK:' tags.
3012- if (txt_power_dev_od_voltage.get_title_size() < kMIN_VALID_LINES) {
3013- return rsmi_status_t::RSMI_STATUS_NO_DATA;
3014- }
3015-
3016 // Note: For debug builds/purposes only.
3017 assert(txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) ||
3018 txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK));
3019@@ -1338,47 +1441,60 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
3020 return std::vector<std::string>{upper_bound_data};
3021 };
3022
3023- // Validates 'OD_SCLK' is in the structure
3024- if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK,
3025+ // Track the number of keys found; if this drops to 0, there is no valid data
3026+ const uint8_t kNumStructuredKeysToCheck = 6;
3027+ uint8_t structured_key_counter = kNumStructuredKeysToCheck;
3028+ // Validates 'OD_SCLK' is in the structure
3029+ if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK,
3030 KTAG_FIRST_FREQ_IDX)) {
3031 p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_OD_SCLK), nullptr, nullptr, 0);
3032 p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_OD_SCLK), nullptr, nullptr, 0);
3033-
3034+ }
3035+ else
3036+ structured_key_counter--;
3037 // Validates 'OD_MCLK' is in the structure
3038- if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK,
3039- KTAG_FIRST_FREQ_IDX)) {
3040- p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0);
3041- p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0);
3042- }
3043+ if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK,
3044+ KTAG_FIRST_FREQ_IDX)) {
3045+ p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0);
3046+ p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0);
3047+ }
3048+ else
3049+ structured_key_counter--;
3050
3051- // Validates 'OD_RANGE' is in the structure
3052- if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE,
3053- KTAG_SCLK)) {
3054- od_value_pair_str_to_range(txt_power_dev_od_voltage
3055- .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_SCLK),
3056- &p->sclk_freq_limits);
3057- }
3058- if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE,
3059- KTAG_MCLK)) {
3060- od_value_pair_str_to_range(txt_power_dev_od_voltage
3061- .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_MCLK),
3062- &p->mclk_freq_limits);
3063- }
3064- }
3065- // Validates 'GFXCLK' is in the structure
3066- else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK,
3067- KTAG_FIRST_FREQ_IDX)) {
3068- p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0);
3069- p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0);
3070-
3071- // Validates 'MCLK' is in the structure
3072- if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK,
3073- KTAG_FIRST_FREQ_IDX)) {
3074- p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0);
3075- p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0);
3076- }
3077- }
3078- else {
3079+ // Validates 'OD_RANGE' is in the structure
3080+ if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE,
3081+ KTAG_SCLK)) {
3082+ od_value_pair_str_to_range(txt_power_dev_od_voltage
3083+ .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_SCLK),
3084+ &p->sclk_freq_limits);
3085+ }
3086+ else
3087+ structured_key_counter--;
3088+ if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE,
3089+ KTAG_MCLK)) {
3090+ od_value_pair_str_to_range(txt_power_dev_od_voltage
3091+ .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_MCLK),
3092+ &p->mclk_freq_limits);
3093+ }
3094+ else
3095+ structured_key_counter--;
3096+ // Validates 'GFXCLK' is in the structure
3097+ if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK,
3098+ KTAG_FIRST_FREQ_IDX)) {
3099+ p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0);
3100+ p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0);
3101+ }
3102+ else
3103+ structured_key_counter--;
3104+ // Validates 'MCLK' is in the structure
3105+ if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK,
3106+ KTAG_FIRST_FREQ_IDX)) {
3107+ p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0);
3108+ p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0);
3109+ }
3110+ else
3111+ structured_key_counter--;
3112+ if (structured_key_counter == 0) {
3113 return RSMI_STATUS_NOT_YET_IMPLEMENTED;
3114 }
3115
3116@@ -1450,7 +1566,20 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue,
3117 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
3118 LOG_TRACE(ss);
3119
3120- assert(minclkvalue < maxclkvalue);
3121+ if (minclkvalue >= maxclkvalue) {
3122+ return RSMI_STATUS_INVALID_ARGS;
3123+ }
3124+
3125+ // Bare Metal only feature
3126+ if (amd::smi::is_vm_guest()) {
3127+ return RSMI_STATUS_NOT_SUPPORTED;
3128+ }
3129+
3130+ // Can only set the clock type for sys and mem type
3131+ if (clkType != RSMI_CLK_TYPE_SYS && clkType != RSMI_CLK_TYPE_MEM) {
3132+ return RSMI_STATUS_NOT_SUPPORTED;
3133+ }
3134+
3135 std::string min_sysvalue;
3136 std::string max_sysvalue;
3137 std::map<rsmi_clk_type_t, std::string> clk_char_map = {
3138@@ -1848,6 +1977,11 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
3139 return RSMI_STATUS_INVALID_ARGS;
3140 }
3141
3142+ // Bare Metal only feature
3143+ if (amd::smi::is_vm_guest()) {
3144+ return RSMI_STATUS_NOT_SUPPORTED;
3145+ }
3146+
3147 ret = rsmi_dev_gpu_clk_freq_get(dv_ind, clk_type, &freqs);
3148
3149 if (ret != RSMI_STATUS_SUCCESS) {
3150@@ -1893,7 +2027,7 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
3151 // will have read-only perms, and the OS will deny access, before the request hits the driver level
3152 if (status == RSMI_STATUS_PERMISSION){
3153 bool read_only = false;
3154- int perms = amd::smi::isReadOnlyForAll(dev->path(), &read_only);
3155+ amd::smi::isReadOnlyForAll(dev->path(), &read_only);
3156 if(read_only){
3157 return RSMI_STATUS_NOT_SUPPORTED;
3158 }
3159@@ -1903,6 +2037,7 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
3160
3161 CATCH
3162 }
3163+
3164 static std::vector<std::string> pci_name_files = {
3165 "/usr/share/misc/pci.ids",
3166 "/usr/share/hwdata/pci.ids",
3167@@ -2184,17 +2319,17 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) {
3168 std::ostringstream ss;
3169 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
3170 LOG_TRACE(ss);
3171- CHK_SUPPORT_NAME_ONLY(name)
3172
3173- if (len == 0) {
3174+ if (len == 0 || !name) {
3175 return RSMI_STATUS_INVALID_ARGS;
3176 }
3177+ CHK_SUPPORT_NAME_ONLY(name)
3178
3179 DEVICE_MUTEX
3180
3181 ret = get_dev_name_from_file(dv_ind, name, len);
3182
3183- if (ret || name[0] == '\0' || !isprint(name[0]) ) {
3184+ if (ret || name[0] == '\0' || !isprint(name[0])) {
3185 ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_DEVICE);
3186 }
3187
3188@@ -2327,12 +2462,12 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) {
3189 std::ostringstream ss;
3190 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
3191 LOG_TRACE(ss);
3192+ if (name == nullptr || len == 0) {
3193+ return RSMI_STATUS_INVALID_ARGS;
3194+ }
3195 CHK_SUPPORT_NAME_ONLY(name)
3196
3197 assert(len > 0);
3198- if (len == 0) {
3199- return RSMI_STATUS_INVALID_ARGS;
3200- }
3201
3202 DEVICE_MUTEX
3203 ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_VENDOR);
3204@@ -2470,25 +2605,25 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) {
3205 return ret;
3206 }
3207
3208- // Hardcode based on PCIe specification: https://en.wikipedia.org/wiki/PCI_Express
3209+ // Hardcoded based on the PCIe specification: see the PCI Express article on Wikipedia
3210 const uint32_t link_width[] = {1, 2, 4, 8, 12, 16};
3211 const uint32_t link_speed[] = {25, 50, 80, 160}; // 0.1 Ghz
3212 const uint32_t WIDTH_DATA_LENGTH = sizeof(link_width)/sizeof(uint32_t);
3213 const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t);
3214
3215 // Calculate the index
3216- uint32_t width_index = -1;
3217- uint32_t speed_index = -1;
3218+ int32_t width_index = -1;
3219+ int32_t speed_index = -1;
3220 uint32_t cur_index = 0;
3221 for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) {
3222 if (link_width[cur_index] == gpu_metrics.pcie_link_width) {
3223- width_index = cur_index;
3224+ width_index = static_cast<int32_t>(cur_index);
3225 break;
3226 }
3227 }
3228 for (cur_index = 0; cur_index < SPEED_DATA_LENGTH; cur_index++) {
3229 if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) {
3230- speed_index = cur_index;
3231+ speed_index = static_cast<int32_t>(cur_index);
3232 break;
3233 }
3234 }
3235@@ -2497,7 +2632,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) {
3236 }
3237 // Set possible lanes and frequencies
3238 b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH;
3239- b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index;
3240+ b->transfer_rate.current = static_cast<uint32_t>(speed_index)*WIDTH_DATA_LENGTH + static_cast<uint32_t>(width_index);
3241 for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) {
3242 b->transfer_rate.frequency[cur_index] =
3243 static_cast<long>(link_speed[cur_index/WIDTH_DATA_LENGTH]) * 100 * 1000000L;
3244@@ -2530,6 +2665,10 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) {
3245 LOG_TRACE(ss);
3246 REQUIRE_ROOT_ACCESS
3247 DEVICE_MUTEX
3248+ // Bare Metal only feature
3249+ if (amd::smi::is_vm_guest()) {
3250+ return RSMI_STATUS_NOT_SUPPORTED;
3251+ }
3252 ret = rsmi_dev_pci_bandwidth_get(dv_ind, &bws);
3253
3254 if (ret != RSMI_STATUS_SUCCESS) {
3255@@ -2557,7 +2696,10 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) {
3256
3257 int32_t ret_i;
3258 ret_i = dev->writeDevInfo(amd::smi::kDevPCIEClk, freq_enable_str);
3259-
3260+ //
3261+ // NOTE: The kDevPCIEClk sysfs file (pp_dpm_pcie) may not exist in all cases.
3262+ // If it doesn't exist, it shouldn't be an error
3263+ // and will get translated to RSMI_STATUS_NOT_SUPPORTED.
3264 return amd::smi::ErrnoToRsmiStatus(ret_i);
3265
3266 CATCH
3267@@ -2598,6 +2740,10 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
3268 fs_rng >> *max_pkt_sz;
3269 }
3270
3271+ if ((sent && *sent == UINT64_MAX) || (received && *received == UINT64_MAX)){
3272+ return RSMI_STATUS_NOT_SUPPORTED;
3273+ }
3274+
3275 return RSMI_STATUS_SUCCESS;
3276 CATCH
3277 }
3278@@ -2908,6 +3054,11 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) {
3279 REQUIRE_ROOT_ACCESS
3280 DEVICE_MUTEX
3281
3282+ // Bare Metal only feature
3283+ if (amd::smi::is_vm_guest()) {
3284+ return RSMI_STATUS_NOT_SUPPORTED;
3285+ }
3286+
3287 ret = rsmi_dev_fan_speed_max_get(dv_ind, sensor_ind, &max_speed);
3288
3289 if (ret != RSMI_STATUS_SUCCESS) {
3290@@ -2974,13 +3125,17 @@ rsmi_dev_gpu_reset(uint32_t dv_ind) {
3291 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
3292 LOG_TRACE(ss);
3293 REQUIRE_ROOT_ACCESS
3294- DEVICE_MUTEX
3295+ // No longer using DEVICE_MUTEX as it blocks long running processes
3296+ // DEVICE_MUTEX
3297
3298 rsmi_status_t ret;
3299 uint64_t status_code = 0;
3300
3301 // Read amdgpu_gpu_recover to reset it
3302 ret = get_dev_value_int(amd::smi::kDevGpuReset, dv_ind, &status_code);
3303+ ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning "
3304+ << getRSMIStatusString(ret, false);
3305+ LOG_INFO(ss);
3306 return ret;
3307
3308 CATCH
3309@@ -3235,6 +3390,9 @@ rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap) {
3310 LOG_TRACE(ss);
3311
3312 ++sensor_ind; // power sysfs files have 1-based indices
3313+ if (!cap) {
3314+ return RSMI_STATUS_INVALID_ARGS;
3315+ }
3316 CHK_SUPPORT_SUBVAR_ONLY(cap, sensor_ind)
3317
3318 rsmi_status_t ret;
3319@@ -3255,6 +3413,9 @@ rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind,
3320 LOG_TRACE(ss);
3321
3322 ++sensor_ind; // power sysfs files have 1-based indices
3323+ if (max == nullptr || min == nullptr) {
3324+ return RSMI_STATUS_INVALID_ARGS;
3325+ }
3326 CHK_SUPPORT_SUBVAR_ONLY((min == nullptr || max == nullptr ?nullptr : min),
3327 sensor_ind)
3328 rsmi_status_t ret;
3329@@ -3283,6 +3444,11 @@ rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) {
3330 REQUIRE_ROOT_ACCESS
3331 DEVICE_MUTEX
3332
3333+ // Bare Metal only feature
3334+ if (amd::smi::is_vm_guest()) {
3335+ return RSMI_STATUS_NOT_SUPPORTED;
3336+ }
3337+
3338 ret = rsmi_dev_power_cap_range_get(dv_ind, sensor_ind, &max, &min);
3339 if (ret != RSMI_STATUS_SUCCESS) {
3340 return ret;
3341@@ -3332,6 +3498,10 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy,
3342
3343 (void)dummy;
3344 DEVICE_MUTEX
3345+ // Bare Metal only feature
3346+ if (amd::smi::is_vm_guest()) {
3347+ return RSMI_STATUS_NOT_SUPPORTED;
3348+ }
3349 rsmi_status_t ret = set_power_profile(dv_ind, profile);
3350
3351 return ret;
3352@@ -3369,6 +3539,8 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
3353 }
3354
3355 DEVICE_MUTEX
3356+ *total = 0; // Initialize total to 0
3357+ // This is needed to avoid returning a garbage value in case of failure
3358 ret = get_dev_value_int(mem_type_file, dv_ind, total);
3359
3360 // Fallback to KFD reported memory if VRAM total is 0
3361@@ -3396,6 +3568,7 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
3362 return ret;
3363 CATCH
3364 }
3365+
3366 rsmi_status_t
3367 rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
3368 uint64_t *used) {
3369@@ -3427,6 +3600,8 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
3370 }
3371
3372 DEVICE_MUTEX
3373+ *used = 0; // Initialize used to 0
3374+ // This is needed to avoid returning a garbage value in case of failure
3375 ret = get_dev_value_int(mem_type_file, dv_ind, used);
3376
3377 // Fallback to KFD reported memory if no VRAM
3378@@ -3613,6 +3788,19 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) {
3379 "the call from completing successfully";
3380 break;
3381
3382+ case RSMI_STATUS_DRM_ERROR:
3383+ *status_string = "RSMI_STATUS_DRM_ERROR: An error occurred when calling "
3384+ "libdrm";
3385+ break;
3386+ case RSMI_STATUS_FAIL_LOAD_MODULE:
3387+ *status_string = "RSMI_STATUS_FAIL_LOAD_MODULE: Failed to load the "
3388+ "required module";
3389+ break;
3390+ case RSMI_STATUS_FAIL_LOAD_SYMBOL:
3391+ *status_string = "RSMI_STATUS_FAIL_LOAD_SYMBOL: Failed to load the "
3392+ "required symbol";
3393+ break;
3394+
3395 default:
3396 *status_string = "RSMI_STATUS_UNKNOWN_ERROR: An unknown error occurred";
3397 return RSMI_STATUS_UNKNOWN_ERROR;
3398@@ -3964,10 +4152,8 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
3399 ss << __PRETTY_FUNCTION__ << "| ======= start =======";
3400 LOG_TRACE(ss);
3401
3402- CHK_SUPPORT_NAME_ONLY(unique_id)
3403-
3404 DEVICE_MUTEX
3405- if (unique_id == nullptr) {
3406+ if (!unique_id) {
3407 return RSMI_STATUS_INVALID_ARGS;
3408 }
3409 *unique_id = std::numeric_limits<uint64_t>::max();
3410@@ -4134,14 +4320,17 @@ rsmi_counter_available_counters_get(uint32_t dv_ind,
3411 TRY
3412 CHK_SUPPORT_VAR(available, grp)
3413 DEVICE_MUTEX
3414- uint64_t val;
3415+ uint64_t val = 0;
3416
3417 switch (grp) {
3418 case RSMI_EVNT_GRP_XGMI:
3419 case RSMI_EVNT_GRP_XGMI_DATA_OUT:
3420
3421 ret = get_dev_value_int(amd::smi::kDevDFCountersAvailable, dv_ind, &val);
3422- assert(val < UINT32_MAX);
3423+ if (ret != RSMI_STATUS_SUCCESS)
3424+ return ret;
3425+ if (val == UINT32_MAX)
3426+ return RSMI_STATUS_NOT_SUPPORTED;
3427 *available = static_cast<uint32_t>(val);
3428 break;
3429
3430@@ -5009,6 +5198,61 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
3431 CATCH
3432 }
3433
3434+rsmi_status_t rsmi_dev_compute_partition_capabilities_get(
3435+ uint32_t dv_ind, char *compute_partition_caps, uint32_t len) {
3436+ TRY
3437+ std::ostringstream ss;
3438+ ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
3439+ LOG_TRACE(ss);
3440+ DEVICE_MUTEX
3441+ std::string availableComputePartitions;
3442+ rsmi_status_t ret =
3443+ get_dev_value_line(amd::smi::kDevAvailableComputePartition,
3444+ dv_ind, &availableComputePartitions);
3445+ if (ret != RSMI_STATUS_SUCCESS) {
3446+ ss << __PRETTY_FUNCTION__
3447+ << " | ======= end ======= "
3448+ << " | FAIL "
3449+ << " | Device #: " << dv_ind
3450+ << " | Type: "
3451+ << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition)
3452+ << " | Data: could not retrieve requested data"
3453+ << " | Returning = "
3454+ << getRSMIStatusString(ret) << " |";
3455+ LOG_ERROR(ss);
3456+ return ret;
3457+ }
3458+
3459+ std::size_t length = availableComputePartitions.copy(compute_partition_caps, len-1);
3460+ compute_partition_caps[length]='\0';
3461+
3462+ if (len < (availableComputePartitions.size() + 1)) {
3463+ ss << __PRETTY_FUNCTION__
3464+ << " | ======= end ======= "
3465+ << " | Fail "
3466+ << " | Device #: " << dv_ind
3467+ << " | Type: "
3468+ << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition)
3469+ << " | Cause: requested size was insufficient"
3470+ << " | Returning = "
3471+ << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |";
3472+ LOG_ERROR(ss);
3473+ return RSMI_STATUS_INSUFFICIENT_SIZE;
3474+ }
3475+ ss << __PRETTY_FUNCTION__
3476+ << " | ======= end ======= "
3477+ << " | Success "
3478+ << " | Device #: " << dv_ind
3479+ << " | Type: "
3480+ << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition)
3481+ << " | Data: " << compute_partition_caps
3482+ << " | Returning = "
3483+ << getRSMIStatusString(ret) << " |";
3484+ LOG_TRACE(ss);
3485+ return ret;
3486+ CATCH
3487+}
3488+
3489 static rsmi_status_t get_memory_partition(uint32_t dv_ind,
3490 std::string &memory_partition) {
3491 TRY
3492@@ -5054,10 +5298,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
3493 REQUIRE_ROOT_ACCESS
3494 DEVICE_MUTEX
3495 const int k1000_MS_WAIT = 1000;
3496- const uint32_t kMaxBoardLength = 128;
3497- bool isCorrectDevice = false;
3498- char boardName[kMaxBoardLength];
3499- boardName[0] = '\0';
3500
3501 const uint32_t kMaxMemoryCapabilitiesSize = 30;
3502 char available_memory_capabilities[kMaxMemoryCapabilitiesSize];
3503@@ -5574,16 +5814,16 @@ rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id) {
3504 << " | Device #: " << dv_ind;
3505 LOG_TRACE(ss);
3506 GET_DEV_AND_KFDNODE_FROM_INDX
3507- uint32_t kgd_node_id = std::numeric_limits<uint32_t>::max();
3508+ uint32_t kfd_node_id = std::numeric_limits<uint32_t>::max();
3509 rsmi_status_t resp = RSMI_STATUS_NOT_SUPPORTED;
3510- int ret = kfd_node->KFDNode::get_node_id(&kgd_node_id);
3511+ int ret = kfd_node->KFDNode::get_node_id(&kfd_node_id);
3512 resp = amd::smi::ErrnoToRsmiStatus(ret);
3513
3514 if (node_id == nullptr) {
3515 resp = RSMI_STATUS_INVALID_ARGS;
3516 } else {
3517- *node_id = kgd_node_id;
3518- if (kgd_node_id == std::numeric_limits<uint32_t>::max()) {
3519+ *node_id = kfd_node_id;
3520+ if (kfd_node_id == std::numeric_limits<uint32_t>::max()) {
3521 resp = RSMI_STATUS_NOT_SUPPORTED;
3522 }
3523 }
3524@@ -5987,7 +6227,7 @@ rsmi_event_notification_get(int timeout_ms,
3525
3526 uint32_t event;
3527 char event_in[MAX_EVENT_NOTIFICATION_MSG_SIZE];
3528- memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
3529+ memset(event_in, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE);
3530 while (fgets(event_in, MAX_EVENT_NOTIFICATION_MSG_SIZE, anon_fp)) {
3531 /* Output is in format as "event_number message_information\n"
3532 * Both event are expressed in hex.
3533@@ -6000,20 +6240,20 @@ rsmi_event_notification_get(int timeout_ms,
3534 // parse message based on event received
3535 switch (event){
3536 case RSMI_EVT_NOTIF_NONE:
3537- strcpy(reinterpret_cast<char *>(&data_item->message), "Event type None received");
3538+ strncpy(reinterpret_cast<char *>(&data_item->message), "Event type None received", MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3539 break;
3540 case RSMI_EVT_NOTIF_VMFAULT:
3541 {
3542 uint32_t pid;
3543 char task_name[MAX_EVENT_NOTIFICATION_MSG_SIZE];
3544- memcpy(reinterpret_cast<char *>(task_name), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
3545+ memset(task_name, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE);
3546
3547 sscanf(message, "%x:%s\n", &pid, task_name);
3548 std::stringstream final_message;
3549- final_message << "pid: " << std::to_string(pid).c_str()
3550+ final_message << "PID: " << std::to_string(pid).c_str()
3551 << " task name: " << task_name;
3552
3553- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3554+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3555 }
3556 break;
3557 case RSMI_EVT_NOTIF_THERMAL_THROTTLE:
3558@@ -6021,37 +6261,38 @@ rsmi_event_notification_get(int timeout_ms,
3559 uint64_t bitmask;
3560 uint64_t counter;
3561
3562- sscanf(message, "%llx:%llx\n", &bitmask, &counter);
3563+ sscanf(message, "%" PRIx64 ":%" PRIx64 "\n", &bitmask, &counter);
3564 std::stringstream final_message;
3565 final_message << "bitmask: 0x" << std::hex << bitmask
3566 << " counter: 0x" << std::hex << counter;
3567
3568- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3569+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3570 }
3571 break;
3572 case RSMI_EVT_NOTIF_GPU_PRE_RESET:
3573 {
3574 uint32_t reset_seq_num;
3575 char reset_cause[MAX_EVENT_NOTIFICATION_MSG_SIZE];
3576- memcpy(reinterpret_cast<char *>(reset_cause), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
3577+ memset(reset_cause, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE);
3578
3579 sscanf(message, "%x %[^\n]\n", &reset_seq_num, reset_cause);
3580 std::stringstream final_message;
3581 final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str()
3582 << " reset cause: " << reset_cause;
3583
3584- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3585+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3586 }
3587 break;
3588 case RSMI_EVT_NOTIF_GPU_POST_RESET:
3589 {
3590 uint32_t reset_seq_num;
3591
3592- sscanf(message, "%x %[^\n]\n", &reset_seq_num);
3593+ char tmp[MAX_EVENT_NOTIFICATION_MSG_SIZE];
3594+ sscanf(message, "%x %[^\n]\n", &reset_seq_num, tmp);
3595 std::stringstream final_message;
3596- final_message << " reset sequence number: " << std::to_string(reset_seq_num).c_str();
3597+ final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str();
3598
3599- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3600+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3601 }
3602 break;
3603 case RSMI_EVT_NOTIF_EVENT_MIGRATE_START:
3604@@ -6060,15 +6301,15 @@ rsmi_event_notification_get(int timeout_ms,
3605 int32_t pid;
3606 uint32_t start;
3607 uint32_t size;
3608- uint16_t from;
3609- uint16_t to;
3610- uint16_t prefetch_loc;
3611- uint16_t preferred_loc;
3612+ uint32_t from;
3613+ uint32_t to;
3614+ uint32_t prefetch_loc;
3615+ uint32_t preferred_loc;
3616 int32_t migrate_trigger;
3617
3618- sscanf(message, "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", &ns, &pid, &start, &size, &from, &to, &prefetch_loc, &preferred_loc, &migrate_trigger);
3619+ sscanf(message, "%" PRId64 " -%d @%" PRIu32 "(%" PRIu32 ") %x->%x %x:%x %d\n", &ns, &pid, &start, &size, &from, &to, &prefetch_loc, &preferred_loc, &migrate_trigger);
3620 std::stringstream final_message;
3621- final_message << "ns: " << std::to_string(ns).c_str()
3622+ final_message << "nd: " << std::to_string(ns).c_str()
3623 << " pid: " << std::to_string(pid).c_str()
3624 << " start: 0x" << std::hex << start
3625 << " size: 0x" << std::hex << size
3626@@ -6078,7 +6319,7 @@ rsmi_event_notification_get(int timeout_ms,
3627 << " preferred_loc: 0x" << std::hex << preferred_loc
3628 << " migrate_trigger: " << std::to_string(migrate_trigger).c_str();
3629
3630- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3631+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3632 }
3633 break;
3634 case RSMI_EVT_NOTIF_EVENT_MIGRATE_END:
3635@@ -6092,9 +6333,9 @@ rsmi_event_notification_get(int timeout_ms,
3636 uint32_t migrate_trigger;
3637 uint32_t error_code;
3638
3639- sscanf(message, "%lld -%d @%lx(%lx) %x->%x %d %d\n", &ns, &pid, &start, &size, &from, &to, &migrate_trigger, &error_code);
3640+ sscanf(message, "%" PRId64 " -%d @%" PRIu32 "(%" PRIu32 ") %x->%x %d %d\n", &ns, &pid, &start, &size, &from, &to, &migrate_trigger, &error_code);
3641 std::stringstream final_message;
3642- final_message << "ns: " << std::to_string(ns).c_str()
3643+ final_message << "nd: " << std::to_string(ns).c_str()
3644 << " pid: " << std::to_string(pid).c_str()
3645 << " start: 0x" << std::hex << start
3646 << " size: 0x" << std::hex << size
3647@@ -6103,7 +6344,7 @@ rsmi_event_notification_get(int timeout_ms,
3648 << " migrate_trigger: " << std::to_string(migrate_trigger).c_str()
3649 << " error_code: " << std::to_string(error_code).c_str();
3650
3651- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3652+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3653 }
3654 break;
3655 case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START:
3656@@ -6112,9 +6353,9 @@ rsmi_event_notification_get(int timeout_ms,
3657 int32_t pid;
3658 uint32_t addr;
3659 uint32_t node;
3660- char *rw;
3661+ char *rw = "\0";
3662
3663- sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, rw);
3664+ sscanf(message, "%" PRId64 " -%d @%" PRIx32 "(%x) %c\n", &ns, &pid, &addr, &node, rw);
3665 std::stringstream final_message;
3666 final_message << "ns: " << std::to_string(ns).c_str()
3667 << " pid: " << std::to_string(pid).c_str()
3668@@ -6122,7 +6363,7 @@ rsmi_event_notification_get(int timeout_ms,
3669 << " node: 0x" << std::hex << node
3670 << " rw: " << rw;
3671
3672- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3673+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3674 }
3675 break;
3676 case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END:
3677@@ -6131,9 +6372,9 @@ rsmi_event_notification_get(int timeout_ms,
3678 int32_t pid;
3679 uint32_t addr;
3680 uint32_t node;
3681- char *migrate_update;
3682+ char *migrate_update = "\0";
3683
3684- sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, migrate_update);
3685+ sscanf(message, "%" PRId64 " -%d @%" PRIx32 "(%x) %c\n", &ns, &pid, &addr, &node, migrate_update);
3686 std::stringstream final_message;
3687 final_message << "ns: " << std::to_string(ns).c_str()
3688 << " pid: " << std::to_string(pid).c_str()
3689@@ -6141,7 +6382,7 @@ rsmi_event_notification_get(int timeout_ms,
3690 << " node: 0x" << std::hex << node
3691 << " migrate_udpate: " << migrate_update;
3692
3693- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3694+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3695 }
3696 break;
3697 case RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION:
3698@@ -6151,14 +6392,14 @@ rsmi_event_notification_get(int timeout_ms,
3699 uint32_t node;
3700 uint32_t evict_trigger;
3701
3702- sscanf(message, "%lld -%d %x %d\n", &ns, &pid, &node, &evict_trigger);
3703+ sscanf(message, "%" PRId64 "-%d %x %d\n", &ns, &pid, &node, &evict_trigger);
3704 std::stringstream final_message;
3705 final_message << "ns: " << std::to_string(ns).c_str()
3706 << " pid: " << std::to_string(pid).c_str()
3707 << " node: 0x" << std::hex << node
3708 << " evict_trigger: " << std::to_string(evict_trigger).c_str();
3709
3710- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3711+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3712 }
3713 break;
3714 case RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE:
3715@@ -6166,16 +6407,16 @@ rsmi_event_notification_get(int timeout_ms,
3716 int64_t ns;
3717 int32_t pid;
3718 uint32_t node;
3719- char *rescheduled;
3720+ char *rescheduled = "\0";
3721
3722- sscanf(message, "%lld -%d %x %c\n", &ns, &pid, &node, rescheduled);
3723+ sscanf(message, "%" PRId64 "-%d %x %c\n", &ns, &pid, &node, rescheduled);
3724 std::stringstream final_message;
3725 final_message << "ns: " << std::to_string(ns).c_str()
3726 << " pid: " << std::to_string(pid).c_str()
3727 << " node: 0x" << std::hex << node
3728 << " rescheduled: " << rescheduled;
3729
3730- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3731+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3732 }
3733 break;
3734 case RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU:
3735@@ -6187,7 +6428,7 @@ rsmi_event_notification_get(int timeout_ms,
3736 uint32_t node;
3737 uint32_t unmap_trigger;
3738
3739- sscanf(message, "%lld -%d @%lx(%lx) %x %d\n", &ns, &pid, &addr, &size, &node, &unmap_trigger);
3740+ sscanf(message, "%" PRId64 " -%d @%" PRIx32 "(%" PRIx32 ") %x %d\n", &ns, &pid, &addr, &size, &node, &unmap_trigger);
3741 std::stringstream final_message;
3742 final_message << "ns: " << std::to_string(ns).c_str()
3743 << " pid: " << std::to_string(pid).c_str()
3744@@ -6196,11 +6437,11 @@ rsmi_event_notification_get(int timeout_ms,
3745 << " node: 0x" << std::hex << node
3746 << " unmap_trigger: " << std::to_string(unmap_trigger).c_str();
3747
3748- strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
3749+ strncpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str(), MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3750 }
3751 break;
3752 default:
3753- strcpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received");
3754+ strncpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received", MAX_EVENT_NOTIFICATION_MSG_SIZE-1);
3755 break;
3756 }
3757 data_item->event = (rsmi_evt_notification_type_t)event;
3758@@ -6208,7 +6449,7 @@ rsmi_event_notification_get(int timeout_ms,
3759 ++(*num_elem);
3760
3761 // zero out event_in after each use
3762- memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
3763+ memset(event_in, '\0', MAX_EVENT_NOTIFICATION_MSG_SIZE);
3764
3765 if (*num_elem >= buffer_size) {
3766 break;
3767diff --git a/src/rocm_smi64Config.in b/src/rocm_smi64Config.in
3768old mode 100755
3769new mode 100644
3770index a3b2631..e3c5903
3771--- a/src/rocm_smi64Config.in
3772+++ b/src/rocm_smi64Config.in
3773@@ -5,7 +5,7 @@
3774 * The University of Illinois/NCSA
3775 * Open Source License (NCSA)
3776 *
3777- * Copyright (c) 2017, Advanced Micro Devices, Inc.
3778+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
3779 * All rights reserved.
3780 *
3781 * Developed by:
3782diff --git a/src/rocm_smi_counters.cc b/src/rocm_smi_counters.cc
3783old mode 100755
3784new mode 100644
3785index a088195..185ed0e
3786--- a/src/rocm_smi_counters.cc
3787+++ b/src/rocm_smi_counters.cc
3788@@ -3,7 +3,7 @@
3789 * The University of Illinois/NCSA
3790 * Open Source License (NCSA)
3791 *
3792- * Copyright (c) 2019, Advanced Micro Devices, Inc.
3793+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
3794 * All rights reserved.
3795 *
3796 * Developed by:
3797diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc
3798old mode 100755
3799new mode 100644
3800index 62ced13..cf8cbf7
3801--- a/src/rocm_smi_device.cc
3802+++ b/src/rocm_smi_device.cc
3803@@ -3,7 +3,7 @@
3804 * The University of Illinois/NCSA
3805 * Open Source License (NCSA)
3806 *
3807- * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
3808+ * Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
3809 * All rights reserved.
3810 *
3811 * Developed by:
3812@@ -509,10 +509,12 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
3813 {"rsmi_dev_counter_create", {{}, {}}},
3814 {"rsmi_dev_xgmi_error_status", {{kDevXGMIErrorFName}, {}}},
3815 {"rsmi_dev_xgmi_error_reset", {{kDevXGMIErrorFName}, {}}},
3816- {"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
3817 {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
3818 {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
3819 {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
3820+ {"rsmi_dev_energy_count_get", {{kDevGpuMetricsFName}, {}}},
3821+ {"rsmi_dev_current_socket_power_get", {{kDevGpuMetricsFName}, {}}},
3822+
3823 {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}},
3824 {"rsmi_dev_compute_partition_set", {{kDevComputePartitionFName}, {}}},
3825 {"rsmi_dev_memory_partition_get", {{kDevMemoryPartitionFName}, {}}},
3826@@ -763,8 +765,8 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
3827 ret = openDebugFileStream(type, &fs);
3828 if (ret != 0) {
3829 ss << "Could not read debugInfoStr for DevInfoType ("
3830- << get_type_string(type) << "), returning "
3831- << std::to_string(ret);
3832+ << get_type_string(type) << "), returning "
3833+ << std::to_string(ret);
3834 LOG_ERROR(ss);
3835 return ret;
3836 }
3837@@ -960,7 +962,7 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) {
3838 << get_type_string(type) << "), returning *line = "
3839 << *line;
3840 LOG_INFO(ss);
3841-
3842+ fs.close();
3843 return 0;
3844 }
3845
3846@@ -1042,6 +1044,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
3847 while (std::getline(fs, line)) {
3848 retVec->push_back(line);
3849 }
3850+ fs.close();
3851
3852 if (retVec->empty()) {
3853 ss << "Read devInfoMultiLineStr for DevInfoType ("
3854@@ -1422,7 +1425,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
3855 bool success = false;
3856 std::string out;
3857 bool wasGdmServiceActive = false;
3858- bool restartInProgress = true;
3859 bool isRestartInProgress = true;
3860 bool isAMDGPUModuleLive = false;
3861 bool restartGDM = false;
3862@@ -1508,7 +1510,6 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
3863 bool *isAMDGPUModuleLive) {
3864 REQUIRE_ROOT_ACCESS
3865 std::ostringstream ss;
3866- bool restartSuccessful = true;
3867 bool success = false;
3868 std::string out;
3869 bool deviceRestartInProgress = true; // Assume in progress, we intend to disprove
3870@@ -1718,3 +1719,4 @@ rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id,
3871 #undef RET_IF_NONZERO
3872 } // namespace smi
3873 } // namespace amd
3874+
3875diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc
3876old mode 100755
3877new mode 100644
3878index 5962477..0722f89
3879--- a/src/rocm_smi_gpu_metrics.cc
3880+++ b/src/rocm_smi_gpu_metrics.cc
3881@@ -1,44 +1,23 @@
3882 /*
3883- * =============================================================================
3884- * The University of Illinois/NCSA
3885- * Open Source License (NCSA)
3886- *
3887- * Copyright (c) 2017-2024, Advanced Micro Devices, Inc.
3888- * All rights reserved.
3889- *
3890- * Developed by:
3891- *
3892- * AMD Research and AMD ROC Software Development
3893- *
3894- * Advanced Micro Devices, Inc.
3895- *
3896- * www.amd.com
3897+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
3898 *
3899 * Permission is hereby granted, free of charge, to any person obtaining a copy
3900- * of this software and associated documentation files (the "Software"), to
3901- * deal with the Software without restriction, including without limitation
3902- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
3903- * and/or sell copies of the Software, and to permit persons to whom the
3904- * Software is furnished to do so, subject to the following conditions:
3905+ * of this software and associated documentation files (the "Software"), to deal
3906+ * in the Software without restriction, including without limitation the rights
3907+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3908+ * copies of the Software, and to permit persons to whom the Software is
3909+ * furnished to do so, subject to the following conditions:
3910 *
3911- * - Redistributions of source code must retain the above copyright notice,
3912- * this list of conditions and the following disclaimers.
3913- * - Redistributions in binary form must reproduce the above copyright
3914- * notice, this list of conditions and the following disclaimers in
3915- * the documentation and/or other materials provided with the distribution.
3916- * - Neither the names of <Name of Development Group, Name of Institution>,
3917- * nor the names of its contributors may be used to endorse or promote
3918- * products derived from this Software without specific prior written
3919- * permission.
3920+ * The above copyright notice and this permission notice shall be included in
3921+ * all copies or substantial portions of the Software.
3922 *
3923 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3924 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3925- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
3926- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
3927- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
3928- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
3929- * DEALINGS WITH THE SOFTWARE.
3930- *
3931+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3932+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3933+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3934+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
3935+ * THE SOFTWARE.
3936 */
3937
3938 #include "rocm_smi/rocm_smi_gpu_metrics.h"
3939@@ -52,6 +31,7 @@
3940
3941 #include <dirent.h>
3942 #include <pthread.h>
3943+#include <unistd.h>
3944
3945 #include <algorithm>
3946 #include <array>
3947@@ -84,7 +64,7 @@ namespace amd::smi
3948
3949 constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev)
3950 {
3951- return (format_rev << 8 | content_rev);
3952+ return static_cast<uint16_t>((format_rev << 8 | content_rev));
3953 }
3954
3955 constexpr uint16_t join_metrics_version(const AMDGpuMetricsHeader_v1_t& metrics_header)
3956@@ -168,6 +148,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl
3957 {join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15},
3958 {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16},
3959 {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17},
3960+ {join_metrics_version(1, 8), AMDGpuMetricVersionFlags_t::kGpuMetricV18},
3961 };
3962
3963 /**
3964@@ -277,22 +258,27 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
3965 {AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, "HBMThmResidencyAccumulator"}, /* v1.6 */
3966
3967 // kGpuMetricPartition
3968- {AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, "numPartition"}, /* v1.6 */
3969+ {AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, "numPartition"}, /* v1.6 */
3970
3971 // kGpuMetricXcpStats
3972- {AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, "GfxBusyInst"}, /* v1.6 */
3973- {AMDGpuMetricsUnitType_t::kMetricJpegBusy, "JpegBusy"}, /* v1.6 */
3974- {AMDGpuMetricsUnitType_t::kMetricVcnBusy, "VcnBusy"}, /* v1.6 */
3975- {AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "GfxBusyAcc"}, /* v1.6 */
3976+ {AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, "GfxBusyInst"}, /* v1.6 */
3977+ {AMDGpuMetricsUnitType_t::kMetricJpegBusy, "JpegBusy"}, /* v1.6 */
3978+ {AMDGpuMetricsUnitType_t::kMetricVcnBusy, "VcnBusy"}, /* v1.6 */
3979+ {AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "GfxBusyAcc"}, /* v1.6 */
3980
3981 // kGpuMetricLinkWidthSpeed
3982- {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */
3983+ {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */
3984+
3985
3986+ {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */
3987+ {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */
3988+ {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,"GfxBelowHostLimitAccumulator"}, /* v1.7 */
3989
3990- {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */
3991- {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */
3992- {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,
3993- "GfxBelowHostLimitAccumulator"}, /* v1.7 */
3994+ // kGpuMetricXcpStats v1.8
3995+ {AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, "GfxLowUtilitizationAcc"}, /* v1.8 */
3996+ {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, "GfxBelowHostLimitTotalAcc"}, /* v1.8 */
3997+ {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, "GfxBelowHostLimitPptAcc"}, /* v1.8 */
3998+ {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, "GfxBelowHostLimitThmAcc"}, /* v1.8 */
3999 };
4000
4001
4002@@ -382,6 +368,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table
4003 {AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared<GpuMetricsBase_v15_t>(GpuMetricsBase_v15_t{})},
4004 {AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared<GpuMetricsBase_v16_t>(GpuMetricsBase_v16_t{})},
4005 {AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared<GpuMetricsBase_v17_t>(GpuMetricsBase_v17_t{})},
4006+ {AMDGpuMetricVersionFlags_t::kGpuMetricV18, std::make_shared<GpuMetricsBase_v18_t>(GpuMetricsBase_v18_t{})},
4007 };
4008
4009 GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version)
4010@@ -500,381 +487,269 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str
4011 return multi_values;
4012 }
4013
4014-void GpuMetricsBase_v17_t::dump_internal_metrics_table()
4015-{
4016- std::ostringstream ss;
4017- auto idx = uint64_t(0);
4018- auto idy = uint64_t(0);
4019- std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
4020- ss << __PRETTY_FUNCTION__
4021- << " | ======= DEBUG ======= "
4022- << " | Metric Version: "
4023- << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
4024- << " | Size: "
4025- << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
4026- << " |"
4027- << "\n";
4028- ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
4029- << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
4030- << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
4031- << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
4032- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4033- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";
4034-
4035- ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n" // new for v1.7
4036- << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
4037- << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
4038- << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n"
4039- << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n"
4040- << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n"
4041- << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n"
4042- << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n"
4043- << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n"
4044- << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
4045- << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
4046- << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
4047- << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
4048- << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
4049- << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
4050- << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
4051- << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
4052- << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
4053- << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
4054- << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
4055- << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
4056- << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
4057- << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"
4058- << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"
4059- << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n"
4060- << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n"
4061- << " pcie_lc_perf_other_end_recovery: "
4062- << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n";
4063- idx = 0;
4064- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_link_status) { // new for v1.7
4065- ss << "\t [" << idx << "]: " << temp << "\n";
4066- ++idx;
4067- }
4068-
4069- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) {
4070- ss << "\t [" << idx << "]: " << temp << "\n";
4071- ++idx;
4072- }
4073-
4074- ss << " xgmi_write_data_acc: " << "\n";
4075- idx = 0;
4076- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) {
4077- ss << "\t [" << idx << "]: " << temp << "\n";
4078- ++idx;
4079- }
4080
4081- ss << " current_gfxclk: " << "\n";
4082- idx = 0;
4083- for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) {
4084- ss << "\t [" << idx << "]: " << temp << "\n";
4085- ++idx;
4086- }
4087+rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() {
4088+ std::ostringstream ss;
4089+ auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
4090+ ss << __PRETTY_FUNCTION__ << " | ======= start =======";
4091+ LOG_TRACE(ss);
4092
4093- ss << " current_socclk: " << "\n";
4094- idx = 0;
4095- for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) {
4096- ss << "\t [" << idx << "]: " << temp << "\n";
4097- ++idx;
4098- }
4099+ auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
4100+ //
4101+ // Note: Any metric treatment/changes (if any) should happen before they
4102+ // get written to internal/external tables.
4103+ //
4104+ auto run_metric_adjustments_v18 = [&]() {
4105+ ss << __PRETTY_FUNCTION__ << " | ======= start =======";
4106+ const auto gpu_metrics_version =
4107+ translate_flag_to_metric_version(get_gpu_metrics_version_used());
4108+ ss << __PRETTY_FUNCTION__ << " | ======= info ======= "
4109+ << " | Applying adjustments "
4110+ << " | Metric Version: "
4111+ << stringfy_metric_header_version(disjoin_metrics_version(gpu_metrics_version)) << " |";
4112+ LOG_TRACE(ss);
4113
4114- ss << " current_vclk0: " << "\n";
4115- idx = 0;
4116- for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) {
4117- ss << "\t [" << idx << "]: " << temp << "\n";
4118- ++idx;
4119- }
4120+ // firmware_timestamp is at 10ns resolution
4121+ ss << __PRETTY_FUNCTION__ << " | ======= Changes ======= "
4122+ << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
4123+ << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
4124+ m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
4125+ LOG_DEBUG(ss);
4126+ };
4127
4128- ss << " current_dclk0: " << "\n";
4129- idx = 0;
4130- for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) {
4131- ss << "\t [" << idx << "]: " << temp << "\n";
4132- ++idx;
4133- }
4134+ run_metric_adjustments_v18();
4135
4136- idx = 0;
4137- idy = 0;
4138- ss << " xcp_stats.gfx_busy_inst: " << "\n";
4139- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4140- if (idx == 0) {
4141- ss << "\t [ ";
4142- }
4143- for (auto& col : row.gfx_busy_inst) {
4144- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4145- if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) {
4146- ss << ", ";
4147- }
4148- if (idx + 1 !=
4149- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4150- ss << "\n";
4151- } else {
4152- ss << "]\n";
4153- }
4154- idy++;
4155- }
4156- idx++;
4157- }
4158+ // Temperature Info
4159+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
4160+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot,
4161+ format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot,
4162+ "temperature_hotspot")));
4163+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
4164+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem,
4165+ format_metric_row(m_gpu_metrics_tbl.m_temperature_mem,
4166+ "temperature_mem")));
4167+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
4168+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc,
4169+ format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc,
4170+ "temperature_vrsoc")));
4171
4172- idx = 0;
4173- idy = 0;
4174- ss << " xcp_stats.vcn_busy: " << "\n";
4175- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4176- if (idx == 0) {
4177- ss << "\t [ ";
4178- }
4179- for (auto& col : row.vcn_busy) {
4180- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4181- if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) {
4182- ss << ", ";
4183- }
4184- if (idx + 1 !=
4185- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4186- ss << "\n";
4187- } else {
4188- ss << "]\n";
4189- }
4190- idy++;
4191- }
4192- idx++;
4193- }
4194+ // Power/Energy Info
4195+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy]
4196+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower,
4197+ format_metric_row(m_gpu_metrics_tbl.m_current_socket_power,
4198+ "curr_socket_power")));
4199+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy]
4200+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator,
4201+ format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator,
4202+ "energy_acc")));
4203
4204- idx = 0;
4205- idy = 0;
4206- ss << " xcp_stats.jpeg_busy: " << "\n";
4207- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4208- if (idx == 0) {
4209- ss << "\t [ ";
4210- }
4211- for (auto& col : row.jpeg_busy) {
4212- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4213- if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) {
4214- ss << ", ";
4215- }
4216- if (idx + 1 !=
4217- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4218- ss << "\n";
4219- } else {
4220- ss << "]\n";
4221- }
4222- idy++;
4223- }
4224- idx++;
4225- }
4226+ // Utilization Info
4227+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
4228+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity,
4229+ format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity,
4230+ "average_gfx_activity")));
4231+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
4232+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity,
4233+ format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity,
4234+ "average_umc_activity")));
4235+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
4236+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator,
4237+ format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc,
4238+ "gfx_activity_acc")));
4239+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
4240+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator,
4241+ format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc,
4242+ "mem_activity_acc")));
4243
4244- idx = 0;
4245- idy = 0;
4246- ss << " xcp_stats.gfx_busy_acc: " << "\n";
4247- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4248- if (idx == 0) {
4249- ss << "\t [ ";
4250- }
4251- for (auto& col : row.gfx_busy_acc) {
4252- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4253- if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) {
4254- ss << ", ";
4255- }
4256- if (idx + 1 !=
4257- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4258- ss << "\n";
4259- } else {
4260- ss << "]\n";
4261- }
4262- idy++;
4263- }
4264- idx++;
4265- }
4266+ // GfxLock Info
4267+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus]
4268+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus,
4269+ format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status,
4270+ "gfxclk_lock_status")));
4271
4272- LOG_DEBUG(ss);
4273-}
4274+ // Timestamp Info
4275+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp]
4276+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware,
4277+ format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp,
4278+ "firmware_timestamp")));
4279+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp]
4280+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter,
4281+ format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter,
4282+ "system_clock_counter")));
4283
4284+ // Link/Width/Speed Info
4285+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4286+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth,
4287+ format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width,
4288+ "pcie_link_width")));
4289+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4290+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed,
4291+ format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed,
4292+ "pcie_link_speed")));
4293+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4294+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth,
4295+ format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width,
4296+ "xgmi_link_width")));
4297+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4298+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed,
4299+ format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed,
4300+ "xgmi_link_speed")));
4301+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4302+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator,
4303+ format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc,
4304+ "pcie_bandwidth_acc")));
4305+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4306+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst,
4307+ format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst,
4308+ "pcie_bandwidth_inst")));
4309+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4310+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator,
4311+ format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc,
4312+ "pcie_l0_recov_count_acc")));
4313+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4314+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator,
4315+ format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc,
4316+ "pcie_replay_count_acc")));
4317+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4318+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator,
4319+ format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc,
4320+ "pcie_replay_rollover_count_acc")));
4321+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4322+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator,
4323+ format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc,
4324+ "pcie_nak_sent_count_acc")));
4325+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4326+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator,
4327+ format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc,
4328+ "pcie_nak_rcvd_count_acc")));
4329+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4330+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator,
4331+ format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc,
4332+ "[xgmi_read_data_acc]")));
4333+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4334+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator,
4335+ format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc,
4336+ "[xgmi_write_data_acc]")));
4337+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4338+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus,
4339+ format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status,
4340+ "[xgmi_link_status]")));
4341
4342-void GpuMetricsBase_v16_t::dump_internal_metrics_table()
4343-{
4344- std::ostringstream ss;
4345- auto idx = uint64_t(0);
4346- auto idy = uint64_t(0);
4347- std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
4348- ss << __PRETTY_FUNCTION__
4349- << " | ======= DEBUG ======= "
4350- << " | Metric Version: "
4351- << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
4352- << " | Size: "
4353- << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
4354- << " |"
4355- << "\n";
4356- ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
4357- << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
4358- << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
4359- << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
4360- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4361- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";
4362-
4363- ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
4364- << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
4365- << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n"
4366- << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n"
4367- << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n"
4368- << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n"
4369- << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n"
4370- << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n"
4371- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4372- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"
4373- << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
4374- << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
4375- << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
4376- << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
4377- << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
4378- << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
4379- << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
4380- << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
4381- << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
4382- << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
4383- << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
4384- << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
4385- << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
4386- << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"
4387- << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"
4388- << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n"
4389- << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n"
4390- << " pcie_lc_perf_other_end_recovery: "
4391- << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n";
4392- idx = 0;
4393- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) {
4394- ss << "\t [" << idx << "]: " << temp << "\n";
4395- ++idx;
4396- }
4397+ // CurrentClock Info
4398+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
4399+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock,
4400+ format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk,
4401+ "[current_gfxclk]")));
4402+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
4403+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock,
4404+ format_metric_row(m_gpu_metrics_tbl.m_current_socclk,
4405+ "[current_socclk]")));
4406+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
4407+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0,
4408+ format_metric_row(m_gpu_metrics_tbl.m_current_vclk0,
4409+ "[current_vclk0]")));
4410+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
4411+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0,
4412+ format_metric_row(m_gpu_metrics_tbl.m_current_dclk0,
4413+ "[current_dclk0]")));
4414+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
4415+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock,
4416+ format_metric_row(m_gpu_metrics_tbl.m_current_uclk,
4417+ "current_uclk")));
4418
4419- ss << " xgmi_write_data_acc: " << "\n";
4420- idx = 0;
4421- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) {
4422- ss << "\t [" << idx << "]: " << temp << "\n";
4423- ++idx;
4424- }
4425+ /* Accumulation cycle counter */
4426+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4427+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter,
4428+ format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter,
4429+ "accumulation_counter")));
4430
4431- ss << " current_gfxclk: " << "\n";
4432- idx = 0;
4433- for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) {
4434- ss << "\t [" << idx << "]: " << temp << "\n";
4435- ++idx;
4436- }
4437+ /* Accumulated throttler residencies */
4438+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4439+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator,
4440+ format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc,
4441+ "prochot_residency_acc")));
4442+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4443+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator,
4444+ format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc,
4445+ "ppt_residency_acc")));
4446+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4447+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator,
4448+ format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc,
4449+ "socket_thm_residency_acc")));
4450+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4451+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator,
4452+ format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc,
4453+ "vr_thm_residency_acc")));
4454+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4455+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator,
4456+ format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc,
4457+ "hbm_thm_residency_acc")));
4458
4459- ss << " current_socclk: " << "\n";
4460- idx = 0;
4461- for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) {
4462- ss << "\t [" << idx << "]: " << temp << "\n";
4463- ++idx;
4464- }
4465+ /* Partition info */
4466+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition]
4467+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition,
4468+ format_metric_row(m_gpu_metrics_tbl.m_num_partition,
4469+ "num_partition")));
4470
4471- ss << " current_vclk0: " << "\n";
4472- idx = 0;
4473- for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) {
4474- ss << "\t [" << idx << "]: " << temp << "\n";
4475- ++idx;
4476- }
4477+ /* xcp_stats info */
4478+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4479+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst,
4480+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst,
4481+ "xcp_stats->gfx_busy_inst")));
4482+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4483+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnBusy,
4484+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->vcn_busy,
4485+ "xcp_stats->vcn_busy")));
4486+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4487+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegBusy,
4488+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy,
4489+ "xcp_stats->jpeg_busy")));
4490+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4491+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc,
4492+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc,
4493+ "xcp_stats->gfx_busy_acc")));
4494
4495- ss << " current_dclk0: " << "\n";
4496- idx = 0;
4497- for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) {
4498- ss << "\t [" << idx << "]: " << temp << "\n";
4499- ++idx;
4500- }
4501+ /* gpu metrics v1.8 xcp_stats info */
4502+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4503+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc,
4504+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_total_acc,
4505+ "xcp_stats->gfx_below_host_limit_total_acc")));
4506+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4507+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc,
4508+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_ppt_acc,
4509+ "xcp_stats->gfx_below_host_limit_ppt_acc")));
4510+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4511+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc,
4512+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_thm_acc,
4513+ "xcp_stats->gfx_below_host_limit_thm_acc")));
4514+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats]
4515+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc,
4516+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_low_utilization_acc,
4517+ "xcp_stats->gfx_low_utilization_acc")));
4518
4519- idx = 0;
4520- idy = 0;
4521- ss << " xcp_stats.gfx_busy_inst: " << "\n";
4522- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4523- if (idx == 0) {
4524- ss << "\t [ ";
4525- }
4526- for (auto& col : row.gfx_busy_inst) {
4527- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4528- if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) {
4529- ss << ", ";
4530- }
4531- if (idx + 1 !=
4532- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4533- ss << "\n";
4534- } else {
4535- ss << "]\n";
4536- }
4537- idy++;
4538- }
4539- idx++;
4540- }
4541+ /* PCIE other end recovery counter info */
4542+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4543+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov,
4544+ format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery,
4545+ "pcie_lc_perf_other_end_recovery")));
4546
4547- idx = 0;
4548- idy = 0;
4549- ss << " xcp_stats.vcn_busy: " << "\n";
4550- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4551- if (idx == 0) {
4552- ss << "\t [ ";
4553- }
4554- for (auto& col : row.vcn_busy) {
4555- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4556- if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) {
4557- ss << ", ";
4558- }
4559- if (idx + 1 !=
4560- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4561- ss << "\n";
4562- } else {
4563- ss << "]\n";
4564- }
4565- idy++;
4566- }
4567- idx++;
4568- }
4569+ /* VRAM max bandwidth (in GB/sec) at max memory clock */
4570+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4571+ .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth,
4572+ format_metric_row(m_gpu_metrics_tbl.m_mem_max_bandwidth,
4573+ "vram_max_bandwidth")));
4574
4575- idx = 0;
4576- idy = 0;
4577- ss << " xcp_stats.jpeg_busy: " << "\n";
4578- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4579- if (idx == 0) {
4580- ss << "\t [ ";
4581- }
4582- for (auto& col : row.jpeg_busy) {
4583- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4584- if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) {
4585- ss << ", ";
4586- }
4587- if (idx + 1 !=
4588- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4589- ss << "\n";
4590- } else {
4591- ss << "]\n";
4592- }
4593- idy++;
4594- }
4595- idx++;
4596- }
4597+ ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
4598+ << " | Success "
4599+ << " | Returning = " << getRSMIStatusString(status_code) << " |";
4600+ LOG_TRACE(ss);
4601
4602- idx = 0;
4603- idy = 0;
4604- ss << " xcp_stats.gfx_busy_acc: " << "\n";
4605- for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
4606- if (idx == 0) {
4607- ss << "\t [ ";
4608- }
4609- for (auto& col : row.gfx_busy_acc) {
4610- ss << "\t [" << idx << "] [" << idy << "]: " << col;
4611- if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) {
4612- ss << ", ";
4613- }
4614- if (idx + 1 !=
4615- (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
4616- ss << "\n";
4617- } else {
4618- ss << "]\n";
4619- }
4620- idy++;
4621- }
4622- idx++;
4623- }
4624+ // Copy to base class
4625+ std::copy(m_metrics_dynamic_tbl.begin(), m_metrics_dynamic_tbl.end(),
4626+ std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
4627+ GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
4628
4629- LOG_DEBUG(ss);
4630+ return status_code;
4631 }
4632
4633 rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
4634@@ -883,10 +758,7 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
4635 ss << __PRETTY_FUNCTION__ << " | ======= start =======";
4636 LOG_TRACE(ss);
4637
4638- if (!m_metrics_dynamic_tbl.empty()) {
4639- m_metrics_dynamic_tbl.clear();
4640- }
4641-
4642+ auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
4643 //
4644 // Note: Any metric treatment/changes (if any) should happen before they
4645 // get written to internal/external tables.
4646@@ -1106,6 +978,11 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
4647 format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc,
4648 "xcp_stats->gfx_busy_acc")));
4649
4650+ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats].insert(
4651+ std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,
4652+ format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc,
4653+ "xcp_stats->gfx_below_host_limit_acc")));
4654+
4655 /* PCIE other end recovery counter info */
4656 m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
4657 .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov,
4658@@ -1118,12 +995,6 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
4659 format_metric_row(m_gpu_metrics_tbl.m_vram_max_bandwidth,
4660 "vram_max_bandwidth")));
4661
4662- /* Total App Clock Counter Accumulated */
4663- m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency]
4664- .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,
4665- format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc,
4666- "gfx_below_host_limit_acc")));
4667-
4668 ss << __PRETTY_FUNCTION__
4669 << " | ======= end ======= "
4670 << " | Success "
4671@@ -1131,6 +1002,12 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
4672 << " |";
4673 LOG_TRACE(ss);
4674
4675+ // Copy to base class
4676+ std::copy(m_metrics_dynamic_tbl.begin(),
4677+ m_metrics_dynamic_tbl.end(),
4678+ std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
4679+ GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
4680+
4681 return status_code;
4682 }
4683
4684@@ -1140,10 +1017,7 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
4685 ss << __PRETTY_FUNCTION__ << " | ======= start =======";
4686 LOG_TRACE(ss);
4687
4688- if (!m_metrics_dynamic_tbl.empty()) {
4689- m_metrics_dynamic_tbl.clear();
4690- }
4691-
4692+ auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
4693 //
4694 // Note: Any metric treatment/changes (if any) should happen before they
4695 // get written to internal/external tables.
4696@@ -1371,127 +1245,22 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
4697 << " |";
4698 LOG_TRACE(ss);
4699
4700- return status_code;
4701-}
4702-
4703-void GpuMetricsBase_v15_t::dump_internal_metrics_table()
4704-{
4705- std::ostringstream ss;
4706- std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
4707- ss << __PRETTY_FUNCTION__
4708- << " | ======= DEBUG ======= "
4709- << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
4710- << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
4711- << " |"
4712- << "\n";
4713- ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
4714- << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
4715- << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
4716-
4717- << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
4718-
4719- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4720- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";
4721-
4722- ss << " vcn_activity: " << "\n";
4723- auto idx = uint64_t(0);
4724- for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) {
4725- ss << "\t [" << idx << "]: " << temp << "\n";
4726- ++idx;
4727- }
4728-
4729- ss << " jpeg_activity: " << "\n";
4730- idx = 0;
4731- for (const auto& temp : m_gpu_metrics_tbl.m_jpeg_activity) {
4732- ss << "\t [" << idx << "]: " << temp << "\n";
4733- ++idx;
4734- }
4735-
4736- ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
4737- << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
4738-
4739- << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n"
4740-
4741- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4742- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"
4743-
4744- << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
4745-
4746- << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
4747- << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
4748-
4749- << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
4750- << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
4751+ // Copy to base class
4752+ std::copy(m_metrics_dynamic_tbl.begin(),
4753+ m_metrics_dynamic_tbl.end(),
4754+ std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
4755+ GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
4756
4757- << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
4758- << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
4759-
4760- << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
4761- << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
4762- << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
4763- << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
4764- << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
4765- << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
4766- << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n";
4767-
4768- ss << " xgmi_read_data_acc: " << "\n";
4769- idx = 0;
4770- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) {
4771- ss << "\t [" << idx << "]: " << temp << "\n";
4772- ++idx;
4773- }
4774-
4775- ss << " xgmi_write_data_acc: " << "\n";
4776- idx = 0;
4777- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) {
4778- ss << "\t [" << idx << "]: " << temp << "\n";
4779- ++idx;
4780- }
4781-
4782- ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n";
4783-
4784- ss << " current_gfxclk: " << "\n";
4785- idx = 0;
4786- for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) {
4787- ss << "\t [" << idx << "]: " << temp << "\n";
4788- ++idx;
4789- }
4790-
4791- ss << " current_socclk: " << "\n";
4792- idx = 0;
4793- for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) {
4794- ss << "\t [" << idx << "]: " << temp << "\n";
4795- ++idx;
4796- }
4797-
4798- ss << " current_vclk0: " << "\n";
4799- idx = 0;
4800- for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) {
4801- ss << "\t [" << idx << "]: " << temp << "\n";
4802- ++idx;
4803- }
4804-
4805- ss << " current_dclk0: " << "\n";
4806- idx = 0;
4807- for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) {
4808- ss << "\t [" << idx << "]: " << temp << "\n";
4809- ++idx;
4810- }
4811-
4812- ss << " padding: " << m_gpu_metrics_tbl.m_padding << "\n";
4813- LOG_DEBUG(ss);
4814+ return status_code;
4815 }
4816
4817 rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() {
4818 std::ostringstream ss;
4819 auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
4820 ss << __PRETTY_FUNCTION__ << " | ======= start =======";
4821- LOG_TRACE(ss);
4822-
4823- if (!m_metrics_dynamic_tbl.empty()) {
4824- m_metrics_dynamic_tbl.clear();
4825- }
4826+ LOG_TRACE(ss);
4827
4828+ auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
4829 //
4830 // Note: Any metric treatment/changes (if any) should happen before they
4831 // get written to internal/external tables.
4832@@ -1708,107 +1477,13 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() {
4833 << " |";
4834 LOG_TRACE(ss);
4835
4836- return status_code;
4837-}
4838-
4839-
4840-void GpuMetricsBase_v14_t::dump_internal_metrics_table()
4841-{
4842- std::ostringstream ss;
4843- std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
4844- ss << __PRETTY_FUNCTION__
4845- << " | ======= DEBUG ======= "
4846- << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
4847- << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
4848- << " |"
4849- << "\n";
4850- ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
4851- << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
4852- << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
4853-
4854- << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
4855-
4856- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4857- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";
4858-
4859- ss << " vcn_activity: " << "\n";
4860- auto idx = uint64_t(0);
4861- for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) {
4862- ss << "\t [" << idx << "]: " << temp << "\n";
4863- ++idx;
4864- }
4865-
4866- ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
4867- << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
4868-
4869- << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n"
4870-
4871- << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
4872- << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"
4873-
4874- << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
4875-
4876- << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
4877- << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
4878-
4879- << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
4880- << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
4881-
4882- << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
4883- << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
4884+ // Copy to base class
4885+ std::copy(m_metrics_dynamic_tbl.begin(),
4886+ m_metrics_dynamic_tbl.end(),
4887+ std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
4888+ GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
4889
4890- << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
4891- << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
4892- << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
4893- << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
4894- << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n";
4895-
4896- ss << " xgmi_read_data_acc: " << "\n";
4897- idx = 0;
4898- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) {
4899- ss << "\t [" << idx << "]: " << temp << "\n";
4900- ++idx;
4901- }
4902-
4903- ss << " xgmi_write_data_acc: " << "\n";
4904- idx = 0;
4905- for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) {
4906- ss << "\t [" << idx << "]: " << temp << "\n";
4907- ++idx;
4908- }
4909-
4910- ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n";
4911-
4912- ss << " current_gfxclk: " << "\n";
4913- idx = 0;
4914- for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) {
4915- ss << "\t [" << idx << "]: " << temp << "\n";
4916- ++idx;
4917- }
4918-
4919- ss << " current_socclk: " << "\n";
4920- idx = 0;
4921- for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) {
4922- ss << "\t [" << idx << "]: " << temp << "\n";
4923- ++idx;
4924- }
4925-
4926- ss << " current_vclk0: " << "\n";
4927- idx = 0;
4928- for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) {
4929- ss << "\t [" << idx << "]: " << temp << "\n";
4930- ++idx;
4931- }
4932-
4933- ss << " current_dclk0: " << "\n";
4934- idx = 0;
4935- for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) {
4936- ss << "\t [" << idx << "]: " << temp << "\n";
4937- ++idx;
4938- }
4939-
4940- ss << " padding: " << m_gpu_metrics_tbl.m_padding << "\n";
4941- LOG_DEBUG(ss);
4942+ return status_code;
4943 }
4944
4945 rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
4946@@ -1817,10 +1492,7 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
4947 ss << __PRETTY_FUNCTION__ << " | ======= start =======";
4948 LOG_TRACE(ss);
4949
4950- if (!m_metrics_dynamic_tbl.empty()) {
4951- m_metrics_dynamic_tbl.clear();
4952- }
4953-
4954+ auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
4955 //
4956 // Note: Any metric treatment/changes (if any) should happen before they
4957 // get written to internal/external tables.
4958@@ -2022,6 +1694,12 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
4959 << " |";
4960 LOG_TRACE(ss);
4961
4962+ // Copy to base class
4963+ std::copy(m_metrics_dynamic_tbl.begin(),
4964+ m_metrics_dynamic_tbl.end(),
4965+ std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
4966+ GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
4967+
4968 return status_code;
4969 }
4970
4971@@ -2125,6 +1803,7 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
4972
4973 rsmi_gpu_metrics.pcie_nak_sent_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_sent_count_acc)>();
4974 rsmi_gpu_metrics.pcie_nak_rcvd_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_rcvd_count_acc)>();
4975+
4976 rsmi_gpu_metrics.accumulation_counter = init_max_uint_types<decltype(rsmi_gpu_metrics.accumulation_counter)>();
4977 rsmi_gpu_metrics.prochot_residency_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.prochot_residency_acc)>();
4978 rsmi_gpu_metrics.ppt_residency_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.ppt_residency_acc)>();
4979@@ -2148,6 +1827,14 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
4980 init_max_uint_types<std::uint64_t>());
4981 std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc),
4982 init_max_uint_types<std::uint64_t>());
4983+ std::fill(std::begin(row.gfx_below_host_limit_ppt_acc), std::end(row.gfx_below_host_limit_ppt_acc),
4984+ init_max_uint_types<std::uint64_t>());
4985+ std::fill(std::begin(row.gfx_below_host_limit_thm_acc), std::end(row.gfx_below_host_limit_thm_acc),
4986+ init_max_uint_types<std::uint64_t>());
4987+ std::fill(std::begin(row.gfx_low_utilization_acc), std::end(row.gfx_low_utilization_acc),
4988+ init_max_uint_types<std::uint64_t>());
4989+ std::fill(std::begin(row.gfx_below_host_limit_total_acc), std::end(row.gfx_below_host_limit_total_acc),
4990+ init_max_uint_types<std::uint64_t>());
4991 }
4992
4993 ss << __PRETTY_FUNCTION__
4994@@ -2160,6 +1847,213 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
4995 return status_code;
4996 }
4997
4998+AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics()
4999+{
5000+ std::ostringstream ss;
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches