Merge ~bullwinkle-team/ubuntu/+source/rocalution:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocalution:ubuntu/devel

Proposed by Bruno Bernardo de Moura
Status: Merged
Approved by: Andreas Hasenack
Approved revision: 6e4d78c86cfc6891e351f45f3b378b525d77c661
Merged at revision: 6e4d78c86cfc6891e351f45f3b378b525d77c661
Proposed branch: ~bullwinkle-team/ubuntu/+source/rocalution:bullwinkle/llvm-21/ubuntu/devel
Merge into: ubuntu/+source/rocalution:ubuntu/devel
Diff against target: 107187 lines (+64996/-17596)
296 files modified
.azuredevops/rocm-ci.yml (+44/-0)
.githooks/pre-commit (+1/-1)
.github/CODEOWNERS (+6/-1)
.github/CONTRIBUTING.md (+160/-0)
.github/dependabot.yml (+8/-1)
.gitignore (+0/-11)
.jenkins/codecov.groovy (+4/-6)
.jenkins/common.groovy (+18/-12)
.jenkins/debug.groovy (+1/-1)
.jenkins/extended.groovy (+1/-1)
.jenkins/precheckin.groovy (+1/-1)
.jenkins/staticanalysis.groovy (+21/-3)
.jenkins/staticlibrary.groovy (+3/-3)
.readthedocs.yaml (+7/-3)
CHANGELOG.md (+272/-108)
CMakeLists.txt (+67/-28)
LICENSE.md (+1/-1)
README.md (+82/-71)
clients/benchmarks/CMakeLists.txt (+3/-2)
clients/benchmarks/rocalution_arguments_config.cpp (+53/-1)
clients/benchmarks/rocalution_bench_solver_parameters.cpp (+5/-1)
clients/benchmarks/rocalution_bench_solver_parameters.hpp (+18/-1)
clients/benchmarks/rocalution_driver_itsolver.hpp (+82/-13)
clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp (+2/-2)
clients/benchmarks/rocalution_enum_itilu0_alg.cpp (+99/-0)
clients/benchmarks/rocalution_enum_itilu0_alg.hpp (+92/-0)
clients/benchmarks/rocalution_enum_preconditioner.hpp (+2/-1)
clients/include/common.hpp (+875/-31)
clients/include/testing_bicgstab.hpp (+24/-10)
clients/include/testing_bicgstabl.hpp (+18/-10)
clients/include/testing_cg.hpp (+17/-9)
clients/include/testing_chebyshev.hpp (+218/-0)
clients/include/testing_cr.hpp (+17/-9)
clients/include/testing_fcg.hpp (+17/-9)
clients/include/testing_fgmres.hpp (+18/-10)
clients/include/testing_global_matrix.hpp (+7/-11)
clients/include/testing_gmres.hpp (+19/-11)
clients/include/testing_idr.hpp (+19/-11)
clients/include/testing_inversion.hpp (+32/-13)
clients/include/testing_itsolver.hpp (+165/-0)
clients/include/testing_local_matrix.hpp (+2356/-28)
clients/include/testing_local_matrix_itsolve.hpp (+473/-0)
clients/include/testing_local_matrix_multicoloring.hpp (+128/-0)
clients/include/testing_local_matrix_solve.hpp (+461/-0)
clients/include/testing_local_vector.hpp (+1323/-15)
clients/include/testing_lu.hpp (+31/-13)
clients/include/testing_mixed_precision.hpp (+135/-0)
clients/include/testing_pairwise_amg.hpp (+282/-4)
clients/include/testing_preconditioner.hpp (+378/-0)
clients/include/testing_qmrcgstab.hpp (+17/-9)
clients/include/testing_qr.hpp (+31/-13)
clients/include/testing_ruge_stueben_amg.hpp (+35/-18)
clients/include/testing_saamg.hpp (+16/-33)
clients/include/testing_uaamg.hpp (+15/-9)
clients/include/utility.hpp (+65/-20)
clients/include/validate.hpp (+108/-0)
clients/samples/CMakeLists.txt (+10/-2)
clients/samples/cg-rsamg.cpp (+18/-10)
clients/samples/cg-rsamg_mpi.cpp (+179/-0)
clients/samples/cg-saamg_mpi.cpp (+177/-0)
clients/samples/cg-uaamg_mpi.cpp (+176/-0)
clients/samples/itsolve.cpp (+161/-0)
clients/samples/laplace_2d_weak_scaling.cpp (+163/-0)
clients/samples/laplace_3d_weak_scaling.cpp (+164/-0)
clients/tests/CMakeLists.txt (+9/-1)
clients/tests/test_backend.cpp (+18/-2)
clients/tests/test_bicgstab.cpp (+65/-6)
clients/tests/test_bicgstabl.cpp (+69/-8)
clients/tests/test_cg.cpp (+63/-6)
clients/tests/test_chebyshev.cpp (+99/-0)
clients/tests/test_cr.cpp (+65/-6)
clients/tests/test_fcg.cpp (+64/-6)
clients/tests/test_fgmres.cpp (+69/-7)
clients/tests/test_global_matrix.cpp (+8/-1)
clients/tests/test_global_vector.cpp (+8/-1)
clients/tests/test_gmres.cpp (+85/-12)
clients/tests/test_idr.cpp (+68/-7)
clients/tests/test_inversion.cpp (+71/-6)
clients/tests/test_itersolver.cpp (+93/-0)
clients/tests/test_local_matrix.cpp (+388/-3)
clients/tests/test_local_matrix_itsolve.cpp (+193/-0)
clients/tests/test_local_matrix_multicoloring.cpp (+79/-0)
clients/tests/test_local_matrix_solve.cpp (+197/-0)
clients/tests/test_local_stencil.cpp (+8/-1)
clients/tests/test_local_vector.cpp (+374/-1)
clients/tests/test_lu.cpp (+70/-6)
clients/tests/test_mixed_precision.cpp (+91/-0)
clients/tests/test_pairwise_amg.cpp (+108/-10)
clients/tests/test_parallel_manager.cpp (+8/-1)
clients/tests/test_preconditioner.cpp (+114/-0)
clients/tests/test_qmrcgstab.cpp (+64/-6)
clients/tests/test_qr.cpp (+69/-6)
clients/tests/test_ruge_stueben_amg.cpp (+105/-19)
clients/tests/test_saamg.cpp (+99/-13)
clients/tests/test_uaamg.cpp (+99/-13)
cmake/Dependencies.cmake (+20/-29)
debian/changelog (+32/-0)
debian/control (+14/-10)
debian/librocalution-doc.doc-base (+0/-2)
debian/librocalution-doc.docs (+1/-1)
debian/librocalution1-tests.install (+1/-0)
debian/patches/series (+0/-5)
debian/rules (+38/-4)
debian/shlibs (+1/-1)
debian/tests/control (+2/-2)
dev/null (+0/-20)
docs/.gitignore (+5/-0)
docs/conceptual/rocALUTION-design.rst (+35/-0)
docs/conceptual/rocALUTION-organization.rst (+130/-0)
docs/conf.py (+22/-2)
docs/doxygen/Doxyfile (+14/-44)
docs/doxygen/bibliography.bib (+207/-0)
docs/how-to/extending-rocALUTION.rst (+90/-74)
docs/how-to/include-rocALUTION.rst (+31/-0)
docs/index.rst (+58/-10)
docs/install/rocALUTION-general-install.rst (+14/-0)
docs/install/rocALUTION-linux-build-and-install.rst (+58/-0)
docs/install/rocALUTION-prerequisites.rst (+27/-0)
docs/install/rocALUTION-windows-build-and-install.rst (+59/-0)
docs/license.rst (+4/-0)
docs/reference/rocALUTION-accelerators.rst (+97/-0)
docs/reference/rocALUTION-api-basics.rst (+208/-0)
docs/reference/rocALUTION-api.rst (+8/-8)
docs/reference/rocALUTION-functionality-table.rst (+16/-6)
docs/reference/rocALUTION-library-notes.rst (+12/-4)
docs/reference/rocALUTION-multi-node-comp.rst (+39/-28)
docs/reference/rocALUTION-preconditioners.rst (+64/-27)
docs/reference/rocALUTION-single-node-comp.rst (+161/-103)
docs/reference/rocALUTION-solvers.rst (+60/-30)
docs/sphinx/_toc.yml.in (+48/-0)
docs/sphinx/requirements.in (+1/-0)
docs/sphinx/requirements.txt (+277/-0)
docs/tutorials/rocALUTION-client-examples.rst (+65/-0)
docs/what-is-rocalution.rst (+15/-26)
install.sh (+38/-42)
rmake.py (+22/-9)
rtest.py (+27/-6)
rtest.xml (+9/-0)
src/CMakeLists.txt (+103/-48)
src/base/backend_manager.cpp (+91/-16)
src/base/backend_manager.hpp (+92/-21)
src/base/base_matrix.cpp (+597/-55)
src/base/base_matrix.hpp (+511/-199)
src/base/base_rocalution.cpp (+18/-3)
src/base/base_rocalution.hpp (+2/-1)
src/base/base_stencil.hpp (+17/-17)
src/base/base_vector.cpp (+27/-4)
src/base/base_vector.hpp (+88/-74)
src/base/global_matrix.cpp (+3195/-554)
src/base/global_matrix.hpp (+161/-59)
src/base/global_vector.cpp (+57/-35)
src/base/global_vector.hpp (+66/-20)
src/base/hip/CMakeLists.txt (+2/-1)
src/base/hip/backend_hip.cpp (+207/-73)
src/base/hip/backend_hip.hpp (+26/-8)
src/base/hip/hip_allocate_free.cpp (+297/-75)
src/base/hip/hip_allocate_free.hpp (+26/-5)
src/base/hip/hip_blas.cpp (+176/-12)
src/base/hip/hip_blas.hpp (+25/-3)
src/base/hip/hip_conversion.cpp (+571/-583)
src/base/hip/hip_conversion.hpp (+87/-89)
src/base/hip/hip_kernels_conversion.hpp (+56/-55)
src/base/hip/hip_kernels_csr.hpp (+2369/-1362)
src/base/hip/hip_kernels_general.hpp (+3/-3)
src/base/hip/hip_kernels_rsamg_csr.hpp (+2121/-0)
src/base/hip/hip_kernels_vector.hpp (+62/-15)
src/base/hip/hip_matrix_bcsr.cpp (+183/-220)
src/base/hip/hip_matrix_bcsr.hpp (+4/-10)
src/base/hip/hip_matrix_coo.cpp (+155/-281)
src/base/hip/hip_matrix_coo.hpp (+3/-3)
src/base/hip/hip_matrix_csr.cpp (+7502/-2314)
src/base/hip/hip_matrix_csr.hpp (+301/-22)
src/base/hip/hip_matrix_dense.cpp (+74/-137)
src/base/hip/hip_matrix_dia.cpp (+114/-185)
src/base/hip/hip_matrix_dia.hpp (+4/-4)
src/base/hip/hip_matrix_ell.cpp (+87/-153)
src/base/hip/hip_matrix_ell.hpp (+3/-3)
src/base/hip/hip_matrix_hyb.cpp (+176/-358)
src/base/hip/hip_matrix_hyb.hpp (+7/-6)
src/base/hip/hip_matrix_mcsr.cpp (+359/-447)
src/base/hip/hip_matrix_mcsr.hpp (+3/-3)
src/base/hip/hip_rand.hpp (+1/-33)
src/base/hip/hip_rand_normal.hpp (+2/-1)
src/base/hip/hip_rand_uniform.hpp (+14/-17)
src/base/hip/hip_rsamg_csr.cpp (+1422/-0)
src/base/hip/hip_sparse.cpp (+742/-1)
src/base/hip/hip_sparse.hpp (+101/-1)
src/base/hip/hip_unordered_map.hpp (+8/-4)
src/base/hip/hip_unordered_set.hpp (+4/-2)
src/base/hip/hip_utils.hpp (+28/-124)
src/base/hip/hip_vector.cpp (+571/-504)
src/base/hip/hip_vector.hpp (+30/-22)
src/base/host/CMakeLists.txt (+3/-1)
src/base/host/host_affinity.cpp (+3/-1)
src/base/host/host_conversion.cpp (+700/-702)
src/base/host/host_conversion.hpp (+122/-122)
src/base/host/host_ilut_driver_csr.cpp (+386/-0)
src/base/host/host_ilut_driver_csr.hpp (+163/-0)
src/base/host/host_io.cpp (+3910/-94)
src/base/host/host_io.hpp (+163/-17)
src/base/host/host_matrix_bcsr.cpp (+120/-78)
src/base/host/host_matrix_bcsr.hpp (+6/-9)
src/base/host/host_matrix_coo.cpp (+110/-150)
src/base/host/host_matrix_coo.hpp (+6/-3)
src/base/host/host_matrix_csr.cpp (+8640/-4208)
src/base/host/host_matrix_csr.hpp (+310/-27)
src/base/host/host_matrix_dense.cpp (+74/-57)
src/base/host/host_matrix_dense.hpp (+4/-1)
src/base/host/host_matrix_dia.cpp (+87/-63)
src/base/host/host_matrix_dia.hpp (+7/-4)
src/base/host/host_matrix_ell.cpp (+82/-67)
src/base/host/host_matrix_ell.hpp (+6/-3)
src/base/host/host_matrix_hyb.cpp (+129/-99)
src/base/host/host_matrix_hyb.hpp (+10/-6)
src/base/host/host_matrix_mcsr.cpp (+91/-72)
src/base/host/host_matrix_mcsr.hpp (+6/-3)
src/base/host/host_sparse.cpp (+563/-0)
src/base/host/host_sparse.hpp (+92/-0)
src/base/host/host_stencil_laplace2d.cpp (+4/-2)
src/base/host/host_stencil_laplace2d.hpp (+2/-2)
src/base/host/host_vector.cpp (+446/-209)
src/base/host/host_vector.hpp (+33/-25)
src/base/local_matrix.cpp (+2849/-931)
src/base/local_matrix.hpp (+342/-91)
src/base/local_stencil.cpp (+9/-5)
src/base/local_stencil.hpp (+16/-6)
src/base/local_vector.cpp (+146/-55)
src/base/local_vector.hpp (+348/-40)
src/base/matrix_formats.hpp (+5/-4)
src/base/matrix_formats_ind.hpp (+3/-3)
src/base/operator.cpp (+20/-10)
src/base/operator.hpp (+14/-11)
src/base/parallel_manager.cpp (+929/-93)
src/base/parallel_manager.hpp (+130/-51)
src/base/vector.cpp (+64/-27)
src/base/vector.hpp (+28/-20)
src/solvers/chebyshev.cpp (+3/-1)
src/solvers/direct/inversion.cpp (+3/-1)
src/solvers/direct/lu.cpp (+9/-6)
src/solvers/direct/qr.cpp (+3/-1)
src/solvers/iter_ctrl.cpp (+7/-3)
src/solvers/iter_ctrl.hpp (+5/-4)
src/solvers/krylov/bicgstab.cpp (+3/-1)
src/solvers/krylov/bicgstabl.cpp (+3/-1)
src/solvers/krylov/cg.cpp (+3/-3)
src/solvers/krylov/cr.cpp (+3/-3)
src/solvers/krylov/fcg.cpp (+3/-3)
src/solvers/krylov/fgmres.cpp (+3/-4)
src/solvers/krylov/gmres.cpp (+3/-3)
src/solvers/krylov/idr.cpp (+7/-5)
src/solvers/krylov/qmrcgstab.cpp (+3/-1)
src/solvers/mixed_precision.cpp (+8/-4)
src/solvers/multigrid/base_amg.cpp (+64/-48)
src/solvers/multigrid/base_amg.hpp (+6/-9)
src/solvers/multigrid/base_multigrid.cpp (+13/-25)
src/solvers/multigrid/base_multigrid.hpp (+8/-6)
src/solvers/multigrid/multigrid.cpp (+3/-3)
src/solvers/multigrid/pairwise_amg.cpp (+15/-21)
src/solvers/multigrid/pairwise_amg.hpp (+22/-7)
src/solvers/multigrid/ruge_stueben_amg.cpp (+59/-82)
src/solvers/multigrid/ruge_stueben_amg.hpp (+6/-27)
src/solvers/multigrid/smoothed_amg.cpp (+72/-72)
src/solvers/multigrid/smoothed_amg.hpp (+6/-7)
src/solvers/multigrid/unsmoothed_amg.cpp (+51/-63)
src/solvers/multigrid/unsmoothed_amg.hpp (+6/-7)
src/solvers/preconditioners/preconditioner.cpp (+256/-30)
src/solvers/preconditioners/preconditioner.hpp (+96/-1)
src/solvers/preconditioners/preconditioner_ai.cpp (+9/-1)
src/solvers/preconditioners/preconditioner_as.cpp (+12/-5)
src/solvers/preconditioners/preconditioner_blockjacobi.cpp (+3/-1)
src/solvers/preconditioners/preconditioner_blockprecond.cpp (+3/-1)
src/solvers/preconditioners/preconditioner_multicolored_gs.cpp (+5/-1)
src/solvers/preconditioners/preconditioner_multicolored_ilu.cpp (+3/-1)
src/solvers/preconditioners/preconditioner_multicolored_ilu.hpp (+2/-2)
src/solvers/preconditioners/preconditioner_multielimination.cpp (+3/-1)
src/solvers/preconditioners/preconditioner_multielimination.hpp (+11/-5)
src/solvers/preconditioners/preconditioner_saddlepoint.cpp (+6/-2)
src/solvers/solver.cpp (+110/-3)
src/solvers/solver.hpp (+143/-11)
src/utils/CMakeLists.txt (+2/-2)
src/utils/allocate_free.cpp (+138/-45)
src/utils/allocate_free.hpp (+57/-6)
src/utils/communicator.cpp (+296/-165)
src/utils/communicator.hpp (+26/-6)
src/utils/def.hpp (+0/-2)
src/utils/log.cpp (+5/-1)
src/utils/log.hpp (+3/-1)
src/utils/math_functions.cpp (+5/-1)
src/utils/math_functions.hpp (+18/-15)
src/utils/rocsparseio.cpp (+1456/-0)
src/utils/rocsparseio.h (+1033/-0)
src/utils/rocsparseio.hpp (+2351/-0)
src/utils/type_traits.hpp (+39/-6)
src/utils/types.hpp.in (+6/-14)
toolchain-linux.cmake (+11/-6)
toolchain-windows.cmake (+5/-1)
Reviewer Review Type Date Requested Status
Andreas Hasenack Approve
Ubuntu Sponsors Pending
Review via email: mp+499090@code.launchpad.net

Description of the change

Update to new upstream version 7.1.0

Link for build reference: https://launchpad.net/~bruno-bdmoura/+archive/ubuntu/lp-2138877

To post a comment you must log in.
Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

Uploaded package to this ppa: https://launchpad.net/~bruno-bdmoura/+archive/ubuntu/lp-2138877

(-proposed and all archs enabled)

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Hello Bruno, I see we have segfaults for arm64 in the ppa you shared, do you happen to know the reason?

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

Unfortunately, I do not know the reason why this happens. However, the test that's causing the segmentation fault is the same one that failed when I was mistakenly trying to build this package using clang instead of the default upstream compiler.

I have asked Cory on matrix regarding this topic, let's see what he has to say.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

Can you point me to the source code where it fails? Is there an obvious null pointer dereference somewhere?

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

The error occurs on this test function: https://git.launchpad.net/~bullwinkle-team/ubuntu/+source/rocalution/tree/clients/tests/test_local_vector.cpp?h=bullwinkle/llvm-21/ubuntu/devel#n439.

This snippet then calls the testing_extract_coarse_mapping function, located here: https://git.launchpad.net/~bullwinkle-team/ubuntu/+source/rocalution/tree/clients/include/testing_local_vector.hpp?h=bullwinkle/llvm-21/ubuntu/devel#n1403, which is where the test fails.

The method invoked by the custom vector can be seen here: https://git.launchpad.net/~bullwinkle-team/ubuntu/+source/rocalution/tree/src/base/local_vector.cpp?h=bullwinkle/llvm-21/ubuntu/devel#n1210

I honestly don't know exactly how different compilers treat c++ template usage, but I would guess that's where the error might be coming from between the two platforms.

Besides that I don't see any obvious errors in that function.

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

Also, I just have to say I'm by no means a C++ expert, so there might in fact still be some intricate details in the above implementations that I'm missing.

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

Talha, just as an update, I got access to the hinyari machine, and even though I had to skip the symbols file verification using the -c0 flag, the package builds and runs the tests without breaking, even the test that's breaking on launchpad.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

OK, that is suspicious. Can you check what the revision of the ARM CPU in hinyari is, and maybe which extensions it supports? It may be different from the builders we have in Launchpad, in which case we can ask the Launchpad team.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

It is strange, but maybe some vector instructions are not available on the builders we have in LP. Best to be sure.

Revision history for this message
Talha Can Havadar (tchavadar) wrote :

That said, I think we need more information about the failure. I see some log functions called on entry to each function, but I can't see their output anywhere in the build log. Maybe there is a way to get that output, so we can see at which step it fails.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Don't delete the previous changelog entry from resolute:
diff --git a/debian/changelog b/debian/changelog
index 2a0b776..00fd507 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,8 +1,32 @@
-rocalution (5.7.1-3build1) resolute; urgency=medium
-
- * No-change mass rebuild for Ubuntu 26.04 (LP: #2132257)
-
- -- Sebastien Bacher <email address hidden> Thu, 29 Jan 2026 21:42:09 +0100
+rocalution (7.1.0-0ubuntu2) resolute; urgency=medium
+
+ * d/rules: skip failing local_vector_test.extract_coarse_mapping_int test
+
+ -- Bruno Bernardo de Moura <email address hidden> Tue, 03 Feb 2026 16:40:38 -0300
+
+rocalution (7.1.0-0ubuntu1) resolute; urgency=medium
+
+ * New upstream release 7.1.0.
+ * d/p/series: drop outdated and unused patches for 7.1.0
+ - drop d/p/spelling.patch
+ - drop d/p/use-readthedocs-theme.patch
+ - drop d/p/docs-table-of-contents.patch
+ - drop d/p/use-local-mathjax.patch.
+ - drop d/p/fix-host-and-backend-function-documentation.patch
+ * d/control: update standards version to 4.7.2
+ * d/control: update maintainer field for ubuntu archive
+ * d/control: update build-depends for rocm stack
+ * d/control: update SOVERSION (0 -> 1) after API changes
+ * d/rules: fix docs build step with rocm-docs-build
+ * d/rules: fix lintian warnings and improve docs cleanup
+ * d/{librocalution-doc.doc-base,librocalution-doc.docs}:
+ fix doc generation paths
+ * d/shlibs: drop auto generated file
+ * d/{t/control,librocalution0-tests.install}:
+ update test files SOVERSION (0 -> 1)
+ * d/librocalution1.symbols: add symbols file for 7.1.0
+
+ -- Bruno Bernardo de Moura <email address hidden> Wed, 10 Dec 2025 14:27:06 -0300

 rocalution (5.7.1-3) unstable; urgency=medium

review: Needs Fixing
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+ * d/shlibs: drop auto generated file

...
diff --git a/debian/librocalution1.symbols b/debian/librocalution1.symbols
new file mode 100644
index 0000000..2f69eb9
--- /dev/null
+++ b/debian/librocalution1.symbols
@@ -0,0 +1,12060 @@
+librocalution.so.1 librocalution1 #MINVER#
+* Build-Depends-Package: librocalution-dev
+ _ZN10rocalution10BaseMatrixISt7complexIdEE10ItLAnalyseEb@Base 7.1.0
+ _ZN10rocalution10BaseMatrixISt7complexIdEE10ItUAnalyseEb@Base 7.1.0
+ _ZN10rocalution10BaseMatrixISt7complexIdEE10MatMatMultERKS3_S5_@Base 7.1.0
...

These are c++ symbols, and should be demangled with c++filt if you want to go down this route. Note that c++ symbols are notoriously difficult to manage, and that's why the d/shlibs file was there in the first place. Are you sure you want to make these changes?

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

d/rules:
@@ -17,16 +17,24 @@ override_dh_auto_configure-arch:

 override_dh_auto_test-arch:
 ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
- obj-$(DEB_HOST_GNU_TYPE)/clients/staging/rocalution-test
+ # As described in (https://github.com/ROCm/rocALUTION/issues/322), the test
+ # `local_vector_test.extract_coarse_mapping_int` causes a segmentation fault on
+ # ARM systems. It consistently triggers a SIGSEGV in the extract_coarse_mapping_int
+ # function, even though the same test runs successfully on AMD64 platforms.
+ # Therefore, it is filtered here to enable successful package builds.
+ obj-$(DEB_HOST_GNU_TYPE)/clients/staging/rocalution-test \
+ --gtest_filter=-local_vector_test.extract_coarse_mapping_int

A test is supposedly correctly showing a problem, and you are covering it up. Just saying that the test passes on amd64 is not a reason: architectures are indeed different, and just because it passes on one architecture doesn't immediately mean it's ok to fail on another.

I see you filed a bug upstream, and there is no response yet. I think this needs a bit more digging, or I didn't quite understand why it's ok to build and release the package on arm64 with a segfault in one of the tests.

review: Needs Information
Revision history for this message
Igor Luppi (igorluppi) wrote :

Oh, after years someone rebuilt v5 of rocalution just this week hehe (https://git.launchpad.net/ubuntu/+source/rocalution/commit/?id=9ae3859003b9b41bff59f3eb8182030193f7ce8a)

It was not a deletion, just a merge conflict because ubuntu/devel diverged after the MP was opened. What bad luck!

Revision history for this message
Igor Luppi (igorluppi) wrote :

Rebased, now it should be fine :)

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

@Andreas, thanks for the comments!

Now addressing each of them:

1 - The changelog skip was due to the merge conflict with the rebuild of v5, so it's now already fixed

2- Related to the symbols file: you are absolutely correct, I simply added it due to mistakenly deleting the d/shlibs file after not updating the SONAME version of the package on it.

I've just tested locally and by simply dropping the symbols file and updating the SONAME version on d/shlibs lintian gives no warnings related to symbols. This way, the symbols file will be removed, and the d/shlibs updated accordingly

3- Finally, related to the crashing test on ARM: the AMD team agreed to skip that test due to it not being a mandatory package for the stack itself.

Also, if needed, instead of being a filter on the debian/rules file, the skipping of the test could also be performed as a patch that applies to its file and only runs that specific crashing function in case the architecture is x86 (as is currently performed in hipblaslt, for example). Although this other approach would at least guarantee that the test is passing on x86, the intent of it would be practically the same.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

3) Yes, skipping the test only on arm is less bad for now. But please change the comments, they make it sound like the test is invalid because it passes on amd64. Different architectures behave differently.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

1) ok

2) waiting for the change

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

@Andreas, back to the topics:

2 - Change done
3 - Transformed it to a patch and description enhanced as dep3 header

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

--- rocalution.orig/clients/tests/test_local_vector.cpp
+++ rocalution/clients/tests/test_local_vector.cpp
@@ -435,6 +435,7 @@ GENERATE_P_TEST_CASES(parameterized_loca
                       testing_set_continuous_values,
                       set_continuous_values)

+#if defined(__x86_64__) || defined(__i386__)
 // Test for LocalVector::ExtractCoarseMapping
 TEST_F(local_vector_test, extract_coarse_mapping_int)
 {
@@ -444,6 +445,7 @@ TEST_F(local_vector_test, extract_coarse
     testing_extract_coarse_mapping<int>();
     stop_rocalution();
 }
+#endif

 GENERATE_P_TEST_CASES(parameterized_local_vector_test,
                       testing_move_to_host_async,

I was expecting excluding the test via d/rules, by checking the arch there and calling the test command with or without the gtest_filter argument.

A patch works too, of course, so this part I don't mind that much, keeping in mind the patch might not apply anymore if the surrounding code changes.

But instead of excluding the arch where the test fails, you are listing two arches where it should run (and we don't even build for i386). At the same time we have "Arch any" in d/control, so the above patch will exclude the test in ppc64el, s390x, armhf, arm64, riscv64, thus reducing coverage. Or was the test also not passing there?

Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

@Andreas,

Updated the rules file to skip that test on arm64 instead of applying a patch.

Additionally, the builds can be seen on this ppa: https://launchpad.net/~bruno-bdmoura/+archive/ubuntu/rocm-llvm21-all-archs/+packages

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

I see armhf failed to build, but that is already the case with the current package in the archive[1], so no regression.

+# On ARM64, the test local_vector_test.extract_coarse_mapping_int causes
+# segmentation fault, triggering a SIGSEGV in the extract_coarse_mapping
+# function. Since different compilers handle memory allocation differently
+# from one another, the test is being skipped to allow for successfull builds
+# on those ARM plaforms.
+ifeq ($(DEB_HOST_ARCH),arm64)
+ GTEST_FILTER_VAL := *:-local_vector_test.extract_coarse_mapping_int
+endif

Please include the link you had before to the upstream issue about this: https://github.com/ROCm/rocALUTION/issues/322

1. https://launchpad.net/ubuntu/+source/rocalution/5.7.1-3build1

review: Needs Fixing
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

This looks wrong:

-override_dh_auto_build-indep:
+execute_after_dh_auto_build-indep: export http_proxy=127.0.0.1:9
+execute_after_dh_auto_build-indep: export https_proxy=127.0.0.1:9
+execute_after_dh_auto_build-indep:
 ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
- cd docs/.doxygen; doxygen
- sphinx-build -b html docs html
+ perl -pi -e 's/WARN_AS_ERROR.*=.*YES/WARN_AS_ERROR = NO/' docs/doxygen/Doxyfile
+ rocm-docs-build
 endif

End result in d/rules:

execute_after_dh_auto_build-indep: export http_proxy=127.0.0.1:9
execute_after_dh_auto_build-indep: export https_proxy=127.0.0.1:9
execute_after_dh_auto_build-indep:
ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
    perl -pi -e 's/WARN_AS_ERROR.*=.*YES/WARN_AS_ERROR = NO/' docs/doxygen/Doxyfile
    rocm-docs-build
endif

Perhaps take *inspiration* on the rocrand d/rules section. Other rocm packages also had to address this:

# python3-rocm-docs can only build offline with the help of some envvars
override_dh_auto_build-indep:
ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
    http_proxy='127.0.0.1:9' \
    https_proxy='127.0.0.1:9' \
    rocm-docs-build -r $(shell dpkg-parsechangelog -S Version | sed 's/[+-].*//')
    rm -rf build/html/_static/fonts
    rm -rf build/html/_static/vendor
endif

Note the "\" continuation marks at the end of each line, and how the proxy variable is being set in the same shell command. Remember that each line is a shell invocation on its own, so if you set a variable in one line, without the "\" continuation, it won't exist in the next line, which is a new shell invocation.

review: Needs Fixing
Revision history for this message
Bruno Bernardo de Moura (bruno-bdmoura) wrote :

@Andreas, thanks for the explanation!

The `execute_after_dh_auto_build-indep` rule is now fixed as suggested.

Revision history for this message
Andreas Hasenack (ahasenack) wrote :

+1 thanks

review: Approve
Revision history for this message
Andreas Hasenack (ahasenack) wrote :

Sponsored:

Uploading rocalution_7.1.0-0ubuntu2.dsc
Uploading rocalution_7.1.0.orig.tar.gz
Uploading rocalution_7.1.0-0ubuntu2.debian.tar.xz
Uploading rocalution_7.1.0-0ubuntu2_source.buildinfo
Uploading rocalution_7.1.0-0ubuntu2_source.changes

This has binary NEW packages, due to the soname change, so it will require an archive admin approval before the binaries land in resolute-proposed.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
2new file mode 100644
3index 0000000..b615df9
4--- /dev/null
5+++ b/.azuredevops/rocm-ci.yml
6@@ -0,0 +1,44 @@
7+resources:
8+ repositories:
9+ - repository: pipelines_repo
10+ type: github
11+ endpoint: ROCm
12+ name: ROCm/ROCm
13+
14+variables:
15+- group: common
16+- template: /.azuredevops/variables-global.yml@pipelines_repo
17+
18+trigger:
19+ batch: true
20+ branches:
21+ include:
22+ - develop
23+ - mainline
24+ paths:
25+ exclude:
26+ - .githooks
27+ - .github
28+ - .jenkins
29+ - docs
30+ - '.*.y*ml'
31+ - '*.md'
32+
33+pr:
34+ autoCancel: true
35+ branches:
36+ include:
37+ - develop
38+ - mainline
39+ paths:
40+ exclude:
41+ - .githooks
42+ - .github
43+ - .jenkins
44+ - docs
45+ - '.*.y*ml'
46+ - '*.md'
47+ drafts: false
48+
49+jobs:
50+ - template: ${{ variables.CI_COMPONENT_PATH }}/rocALUTION.yml@pipelines_repo
51diff --git a/.githooks/pre-commit b/.githooks/pre-commit
52index 04dabeb..7d23219 100755
53--- a/.githooks/pre-commit
54+++ b/.githooks/pre-commit
55@@ -35,7 +35,7 @@ fi
56 for file in $files; do
57 if [[ -e $file ]]; then
58 /usr/bin/perl -pi -e 'INIT { exit 1 if !-f $ARGV[0] || -B $ARGV[0]; $year = (localtime)[5] + 1900 }
59- s/^([*\/#[:space:]]*)Copyright\s+(?:\(C\)\s*)?(\d+)(?:\s*-\s*\d+)?/qq($1Copyright (c) $2@{[$year != $2 ? "-$year" : ""]})/ie
60+ s/^([*\/#[:space:]]*)Copyright\s+(?:\(C\)\s*)?(\d+)(?:\s*-\s*\d+)?/qq($1Copyright (C) $2@{[$year != $2 ? "-$year" : ""]})/ie
61 if $. < 10' "$file" && git add -u "$file"
62 fi
63 done
64diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
65old mode 100644
66new mode 100755
67index 4b7a84c..300b638
68--- a/.github/CODEOWNERS
69+++ b/.github/CODEOWNERS
70@@ -1 +1,6 @@
71-* @ntrost57 @YvanMokwinski @jsandham
72+* @ntrost57 @YvanMokwinski @jsandham @kliegeois
73+# Documentation files
74+docs/* @ROCm/rocm-documentation
75+*.md @ROCm/rocm-documentation
76+*.rst @ROCm/rocm-documentation
77+.readthedocs.yaml @ROCm/rocm-documentation
78diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
79new file mode 100644
80index 0000000..2ab06f0
81--- /dev/null
82+++ b/.github/CONTRIBUTING.md
83@@ -0,0 +1,160 @@
84+<head>
85+ <meta charset="UTF-8">
86+ <meta name="description" content="Contributing to rocALUTION">
87+ <meta name="keywords" content="ROCm, contributing, rocALUTION">
88+</head>
89+
90+# Contributing to rocALUTION #
91+
92+AMD welcomes contributions to rocALUTION from the community. Whether those contributions are bug reports, bug fixes, documentation additions, performance notes, or other improvements, we value collaboration with our users. We can build better solutions together. Please follow these details to help ensure your contributions will be successfully accepted.
93+
94+Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code.
95+
96+## Issue Discussion ##
97+
98+Please use the GitHub Issues tab to notify us of issues.
99+
100+* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
101+ comment or post to provide additional details, such as how you reproduced this issue.
102+* If you're not sure if your issue is the same, err on the side of caution and file your issue.
103+ You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
104+ your issue as being the same as the existing issue, we'll close the duplicate.
105+* If your issue doesn't exist, use the issue template to file a new issue.
106+ * When filing an issue, be sure to provide as much information as possible, including script output so
107+ we can collect information about your configuration. This helps reduce the time required to
108+ reproduce your issue.
109+ * Check your issue regularly, as we may require additional information to successfully reproduce the
110+ issue.
111+* You may also open an issue to ask questions to the maintainers about whether a proposed change
112+ meets the acceptance criteria, or to discuss an idea pertaining to the library.
113+
114+## Acceptance Criteria ##
115+
116+rocALUTION is a sparse linear algebra library with focus on exploring fine-grained parallelism on top of the AMD ROCm runtime and toolchains, targeting modern CPU and GPU platforms. Based on C++ and HIP, it provides a portable, generic and flexible design that allows seamless integration with other scientific software packages.
117+
118+In rocALUTION we are interested in contributions that:
119+* Fix bugs, improve documentation, enhance testing, reduce complexity.
120+* Improve the performance of existing routines.
121+* Add missing functionality such as new multigrid solvers, iterative solvers, direct solvers, or preconditioners.
122+* Extending new or existing functionality to work with MPI or accelerators (such as GPU devices).
123+
124+We encourage contributors to leverage the GitHub "Issues" tab to discuss possible additions they would like to add.
125+
126+### Exceptions ###
127+
128+rocALUTION places a heavy emphasis on being high performance. Because of this, contributions that add new routines (or that modify existing routines) must do so from the perspective that they offer high performance in relation to the hardware they are run on. Furthermore, all routines added to rocALUTION must have at a minimum a host solution as all routines must have the ability to fall back to a host solution if a GPU accelerator is not available. Because compile times, binary sizes, and general library complexity are important considerations, we reserve the right to make decisions on whether a proposed routine is too niche or specialized to be worth including.
129+
130+## Code Structure ##
131+
132+The following is the structure of the rocALUTION library in the GitHub repository. A more detailed description of the directory structure can be found in the rocALUTION [documentation](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/design/orga.html).
133+
134+The `src/` directory contains the library source code. This is broken up into three sub-directories:
135+* `src/base`
136+* `src/solvers`
137+* `src/utils`
138+
139+The `src/base` directory contains source code related to rocALUTION's vector, matrix, and stencil operator types as well as classes related to parallel management. This directory is further broken up into:
140+* `src/base/hip` Contains HIP implementations of vector, matrix, and stencil operators.
141+* `src/base/host` Contains host implementations of vector, matrix, and stencil operators.
142+
143+The `src/solvers` directory contains all the source code related to direct (`src/solvers/direct`), krylov (`src/solvers/krylov`), and multigrid (`src/solvers/multigrid`) solvers.
144+
145+The `src/utils` directory contains source code related to logging, memory allocation, math and timing functions.
146+
147+The `clients/` directory contains the testing and benchmarking code as well as all the samples demonstrating rocALUTION usage.
148+
149+The `docs/` directory contains all of the documentation files.
150+
151+## Coding Style ##
152+
153+In general, follow the style of the surrounding code. C and C++ code is formatted using `clang-format`. Use the clang-format version installed with ROCm (found in the `/opt/rocm/llvm/bin` directory). Please do not use your system's built-in `clang-format`, as this is a different version that may result in incorrect results.
154+
155+To format a file, use:
156+
157+```
158+/opt/rocm/llvm/bin/clang-format -style=file -i <path-to-source-file>
159+```
160+
161+To format all files, run the following script in rocALUTION directory:
162+
163+```
164+#!/bin/bash
165+git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i
166+```
167+
168+Also, githooks can be installed to format the code per-commit:
169+
170+```
171+./.githooks/install
172+```
173+
174+## Pull Request Guidelines ##
175+
176+When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch, which serves as our integration branch.
177+
178+By submitting a pull request, you acknowledge and agree with the CLA below:
179+
180+Contribution License Agreement
181+1. The code I am contributing is mine, and I have the right to license it.
182+2. By submitting a pull request for this project I am granting you a license to distribute said code under the MIT License for the project.
183+
184+### Deliverables ###
185+
186+When raising a PR in rocALUTION here are some important things to include:
187+
188+1. For each new file in the repository, please include the licensing header
189+```
190+/* ************************************************************************
191+* Copyright (C) 20xx Advanced Micro Devices, Inc. All rights Reserved.
192+*
193+* Permission is hereby granted, free of charge, to any person obtaining a copy
194+* of this software and associated documentation files (the "Software"), to deal
195+* in the Software without restriction, including without limitation the rights
196+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
197+* copies of the Software, and to permit persons to whom the Software is
198+* furnished to do so, subject to the following conditions:
199+*
200+* The above copyright notice and this permission notice shall be included in
201+* all copies or substantial portions of the Software.
202+*
203+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
204+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
205+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
206+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
207+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
208+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
209+* THE SOFTWARE.
210+*
211+* ************************************************************************ */
212+```
213+and adjust the date to the current year. When simply modifying a file, the date should automatically be updated pre-commit as long as the githook has been installed (./.githooks/install).
214+
215+2. When adding a new routine, please make sure you are also adding appropriate testing code. These new unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/primer.md). This typically involves adding the following files:
216+
217+* testing_<routine_name>.hpp file in the directory `clients/include/`
218+* test_<routine_name>.cpp file in directory `clients/tests/`
219+
220+See existing tests for guidance when adding your own.
221+
222+3. When modifying an existing routine, add appropriate testing to test_<routine_name>.cpp file in directory `clients/tests/`.
223+
224+4. Tests must have good code coverage.
225+
226+5. At a minimum, rocALUTION must have a host solution for each direct, iterative, multigrid, or preconditioner. If you add an accelerator solution (say using HIP targeting GPU devices) please also add a fall back host solution.
227+
228+6. Ensure code builds successfully. This includes making sure that the code can compile, that the code is properly formatted, and that all tests pass.
229+
230+7. Do not break existing test cases
231+
232+### Process ###
233+
234+When a PR is raised targeting the develop branch in rocALUTION, CI will be automatically triggered. This will:
235+
236+* Test that the PR passes static analysis (i.e., ensure clang formatting rules have been followed).
237+* Test that the documentation can be properly built
238+* Ensure that the PR compiles on different OS and GPU device architecture combinations
239+* Ensure that all tests pass on different OS and GPU device architecture combinations
240+
241+Feel free to ask questions on your PR regarding any CI failures you encounter.
242+
243+* Reviewers are listed in the CODEOWNERS file
244diff --git a/.github/dependabot.yml b/.github/dependabot.yml
245index 9cdf2d6..047fac7 100644
246--- a/.github/dependabot.yml
247+++ b/.github/dependabot.yml
248@@ -6,7 +6,14 @@
249 version: 2
250 updates:
251 - package-ecosystem: "pip" # See documentation for possible values
252- directory: "/docs/.sphinx" # Location of package manifests
253+ directory: "/docs/sphinx" # Location of package manifests
254 open-pull-requests-limit: 10
255 schedule:
256 interval: "daily"
257+ target-branch: "develop"
258+ labels:
259+ - "documentation"
260+ - "dependencies"
261+ - "ci:docs-only"
262+ reviewers:
263+ - "samjwu"
264diff --git a/.gitignore b/.gitignore
265index 15ec500..50bd106 100644
266--- a/.gitignore
267+++ b/.gitignore
268@@ -38,20 +38,9 @@ tags
269 # build-in-source directory
270 build
271
272-# doc
273-docBin
274-_build
275-
276 # Visual Studio stuff
277 *.vcxproj.user
278 *.suo
279 *.sdf
280 *.pdb
281 *.opensdf
282-
283-# documentation artifacts
284-_build/
285-_images/
286-_static/
287-_templates/
288-_toc.yml
289diff --git a/.jenkins/codecov.groovy b/.jenkins/codecov.groovy
290index 175da9a..76162e8 100644
291--- a/.jenkins/codecov.groovy
292+++ b/.jenkins/codecov.groovy
293@@ -18,7 +18,7 @@ def runCI =
294 prj.paths.build_command = './install.sh -cg --codecoverage'
295 prj.compiler.compiler_name = 'c++'
296 prj.compiler.compiler_path = 'c++'
297- prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND']
298+ prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND']
299 prj.defaults.ccache = false
300
301 // Define test architectures, optional rocm version argument is available
302@@ -52,12 +52,10 @@ def runCI =
303 ci: {
304 String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
305
306- def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])],
307- "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])] ]
308+ def propertyList = []
309 propertyList = auxiliary.appendPropertyList(propertyList)
310
311- def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu18:['gfx900']]),
312- "compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])]
313+ def jobNameList = []
314 jobNameList = auxiliary.appendJobNameList(jobNameList)
315
316 propertyList.each
317@@ -81,7 +79,7 @@ ci: {
318 {
319 properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
320 stage(urlJobName) {
321- runCI([ubuntu18:['gfx900']], urlJobName)
322+ runCI([], urlJobName)
323 }
324 }
325 }
326diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
327index 11e8586..1e973d1 100644
328--- a/.jenkins/common.groovy
329+++ b/.jenkins/common.groovy
330@@ -12,7 +12,7 @@ def runCompileCommand(platform, project, boolean sameOrg=false)
331 {
332 project.libraryDependencies.each
333 { libraryName ->
334- getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, sameOrg)
335+ getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, 'develop', sameOrg)
336 }
337 }
338
339@@ -47,23 +47,29 @@ def runTestCommand (platform, project, gfilter)
340 """
341
342 platform.runCommand(this, command)
343- junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml"
344 }
345
346 def runCoverageCommand (platform, project, gfilter, String dirmode = "release")
347 {
348- //Temporary workaround due to bug in container
349- String centos7Workaround = platform.jenkinsLabel.contains('centos7') ? 'export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib64/' : ''
350+ String commitSha
351+ String repoUrl
352+ (commitSha, repoUrl) = util.getGitHubCommitInformation(project.paths.project_src_prefix)
353
354- def command = """#!/usr/bin/env bash
355- set -x
356- cd ${project.paths.project_build_prefix}/build/${dirmode}
357- export LD_LIBRARY_PATH=/opt/rocm/lib/
358- ${centos7Workaround}
359- GTEST_LISTENER=NO_PASS_LINE_IN_LOG make coverage_cleanup coverage GTEST_FILTER=${gfilter}-*known_bug*
360- """
361+ withCredentials([string(credentialsId: "mathlibs-codecov-token-rocalution", variable: 'CODECOV_TOKEN')])
362+ {
363+ def command = """#!/usr/bin/env bash
364+ set -x
365+ cd ${project.paths.project_build_prefix}/build/${dirmode}
366+ export LD_LIBRARY_PATH=/opt/rocm/lib/
367+ export ROCALUTION_CODE_COVERAGE=1
368+ GTEST_LISTENER=NO_PASS_LINE_IN_LOG make coverage_cleanup coverage GTEST_FILTER=${gfilter}-*known_bug*
369+ curl -Os https://uploader.codecov.io/latest/linux/codecov
370+ chmod +x codecov
371+ ./codecov -v -U \$http_proxy -t ${CODECOV_TOKEN} --file lcoverage/main_coverage.info --name rocALUTION --sha ${commitSha}
372+ """
373
374- platform.runCommand(this, command)
375+ platform.runCommand(this, command)
376+ }
377
378 publishHTML([allowMissing: false,
379 alwaysLinkToLastBuild: false,
380diff --git a/.jenkins/debug.groovy b/.jenkins/debug.groovy
381index 15abd79..fd45e14 100644
382--- a/.jenkins/debug.groovy
383+++ b/.jenkins/debug.groovy
384@@ -18,7 +18,7 @@ def runCI =
385 prj.paths.build_command = buildCommand
386 prj.compiler.compiler_name = 'c++'
387 prj.compiler.compiler_path = 'c++'
388- prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND']
389+ prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND']
390
391 // Define test architectures, optional rocm version argument is available
392 def nodes = new dockerNodes(nodeDetails, jobName, prj)
393diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy
394index e40fe5e..d588769 100644
395--- a/.jenkins/extended.groovy
396+++ b/.jenkins/extended.groovy
397@@ -18,7 +18,7 @@ def runCI =
398 prj.paths.build_command = buildCommand
399 prj.compiler.compiler_name = 'c++'
400 prj.compiler.compiler_path = 'c++'
401- prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND']
402+ prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND']
403
404 // Define test architectures, optional rocm version argument is available
405 def nodes = new dockerNodes(nodeDetails, jobName, prj)
406diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
407index 5c309cb..c198213 100644
408--- a/.jenkins/precheckin.groovy
409+++ b/.jenkins/precheckin.groovy
410@@ -18,7 +18,7 @@ def runCI =
411 prj.paths.build_command = buildCommand
412 prj.compiler.compiler_name = 'c++'
413 prj.compiler.compiler_path = 'c++'
414- prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND']
415+ prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND']
416
417 // Define test architectures, optional rocm version argument is available
418 def nodes = new dockerNodes(nodeDetails, jobName, prj)
419diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
420index 8f5c593..4cfcd5e 100644
421--- a/.jenkins/staticanalysis.groovy
422+++ b/.jenkins/staticanalysis.groovy
423@@ -39,8 +39,26 @@ def runCI =
424 ci: {
425 String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
426
427- properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
428- stage(urlJobName) {
429- runCI([ubuntu20:['cpu']], urlJobName)
430+ def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
431+ "rocm-docker":[]]
432+ propertyList = auxiliary.appendPropertyList(propertyList)
433+
434+ def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]]
435+ jobNameList = auxiliary.appendJobNameList(jobNameList)
436+
437+ propertyList.each
438+ {
439+ jobName, property->
440+ if (urlJobName == jobName)
441+ properties(auxiliary.addCommonProperties(property))
442+ }
443+
444+ jobNameList.each
445+ {
446+ jobName, nodeDetails->
447+ if (urlJobName == jobName)
448+ stage(jobName) {
449+ runCI(nodeDetails, jobName)
450+ }
451 }
452 }
453diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy
454index da565db..b06385d 100644
455--- a/.jenkins/staticlibrary.groovy
456+++ b/.jenkins/staticlibrary.groovy
457@@ -16,9 +16,9 @@ def runCI =
458 def prj = new rocProject('rocALUTION', 'Static Library PreCheckin')
459 // customize for project
460 prj.paths.build_command = buildCommand
461- prj.compiler.compiler_name = 'hipcc'
462- prj.compiler.compiler_path = '/opt/rocm/bin/hipcc'
463- prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND']
464+ prj.compiler.compiler_name = 'amdclang++'
465+ prj.compiler.compiler_path = '/opt/rocm/bin/amdclang++'
466+ prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND']
467
468 // Define test architectures, optional rocm version argument is available
469 def nodes = new dockerNodes(nodeDetails, jobName, prj)
470diff --git a/.readthedocs.yaml b/.readthedocs.yaml
471index 43a0890..4e3a24d 100644
472--- a/.readthedocs.yaml
473+++ b/.readthedocs.yaml
474@@ -6,9 +6,13 @@ version: 2
475 sphinx:
476 configuration: docs/conf.py
477
478-formats: [htmlzip]
479+formats: [htmlzip, epub]
480
481 python:
482- version: "3.8"
483 install:
484- - requirements: docs/.sphinx/requirements.txt
485+ - requirements: docs/sphinx/requirements.txt
486+
487+build:
488+ os: ubuntu-22.04
489+ tools:
490+ python: "3.10"
491diff --git a/CHANGELOG.md b/CHANGELOG.md
492index e168a58..fe92d69 100644
493--- a/CHANGELOG.md
494+++ b/CHANGELOG.md
495@@ -1,142 +1,306 @@
496-# Change Log for rocALUTION
497+# Changelog for rocALUTION
498
499-Full documentation for rocALUTION is available at [rocalution.readthedocs.io](https://rocalution.readthedocs.io/en/latest/).
500+Full documentation for rocALUTION is available at [https://rocm.docs.amd.com/projects/rocALUTION/en/latest/](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/).
501+
502+## rocALUTION 4.0.1 for ROCm 7.1.0
503
504-## rocALUTION 2.1.11 for ROCm 5.7.0
505 ### Added
506-- Added support for gfx940, gfx941 and gfx942
507-### Improved
508-- Fixed OpenMP runtime issue with Windows toolchain
509+* Added support for gfx950.
510
511-## rocALUTION 2.1.9 for ROCm 5.6.0
512-### Improved
513-- Fixed synchronization issues in level 1 routines
514+### Changed
515+* Updated the default build standard to C++17 when compiling rocALUTION from source (previously C++14).
516+
517+### Optimized
518+* Improved and expanded user documentation.
519+
520+### Resolved issues
521+* Fixed a bug in the GPU hashing algorithm that occurred when not compiling with -O2/-O3.
522+* Fixed an issue with the SPAI preconditioner when using complex numbers.
523+
524+## rocALUTION 3.2.3 for ROCm 6.4.1
525
526-## rocALUTION 2.1.8 for ROCm 5.5.0
527 ### Added
528-- Added build support for Navi32
529-### Improved
530-- Fixed a typo in MPI backend
531-- Fixed a bug with the backend when HIP support is disabled
532-- Fixed a bug in SAAMG hierarchy building on HIP backend
533-- Improved SAAMG hierarchy build performance on HIP backend
534+* The `-a` option has been added to the `rmake.py` build script. This option provides a way to select specific architectures when building on Windows.
535+
536+### Resolved issues
537+* Fixed an issue where the `HIP_PATH` environment variable was being ignored when compiling on Windows.
538+
539+## rocALUTION 3.2.2 for ROCm 6.4.0
540+
541 ### Changed
542-- LocalVector::GetIndexValues(ValueType\*) is deprecated, use LocalVector::GetIndexValues(const LocalVector&, LocalVector\*) instead
543-- LocalVector::SetIndexValues(const ValueType\*) is deprecated, use LocalVector::SetIndexValues(const LocalVector&, const LocalVector&) instead
544-- LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix\*, LocalMatrix\*) is deprecated, use LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix\*) instead
545-- LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, float, LocalMatrix\*, LocalMatrix\*) is deprecated, use LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, LocalMatrix\*) instead
546-- LocalMatrix::RugeStueben() is deprecated
547-- LocalMatrix::AMGSmoothedAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix\*, LocalMatrix\*, int) is deprecated, use LocalMatrix::AMGAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix\*, int) instead
548-- LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix\*, LocalMatrix\*) is deprecated, use LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix\*) instead
549+* Improved documentation
550+
551+## rocALUTION 3.2.1 for ROCm 6.3.0
552
553-## rocALUTION 2.1.3 for ROCm 5.4.0
554 ### Added
555-- Added build support for Navi31 and Navi33
556-- Added support for non-squared global matrices
557-### Improved
558-- Fixed a memory leak in MatrixMult on HIP backend
559-- Global structures can now be used with a single process
560+
561+* Support for gfx1200, gfx1201, and gfx1151.
562+
563 ### Changed
564-- Switched GTest death test style to 'threadsafe'
565-- GlobalVector::GetGhostSize() is deprecated and will be removed
566-- ParallelManager::GetGlobalSize(), ParallelManager::GetLocalSize(), ParallelManager::SetGlobalSize() and ParallelManager::SetLocalSize() are deprecated and will be removed
567-- Vector::GetGhostSize() is deprecated and will be removed
568-- Multigrid::SetOperatorFormat(unsigned int) is deprecated and will be removed, use Multigrid::SetOperatorFormat(unsigned int, int) instead
569-- RugeStuebenAMG::SetCouplingStrength(ValueType) is deprecated and will be removed, use SetStrengthThreshold(float) instead
570+
571+* Changed the default compiler from `hipcc` to `amdclang` in the installation script and cmake files.
572+* Changed the address sanitizer build targets. Now only `gfx908:xnack+`, `gfx90a:xnack+`, `gfx940:xnack+`, `gfx941:xnack+`, and `gfx942:xnack+` are built with `BUILD_ADDRESS_SANITIZER=ON`.
573+
574+### Resolved issues
575+
576+* Fix hang in `RS-AMG` for Navi on some specific matrix sparsity patterns.
577+* Fix wrong results in `Apply` on multi-GPU setups.
578+
579+## rocALUTION 3.2.0 for ROCm 6.2.0
580+
581+### Additions
582+* New file I/O based on rocsparse I/O format
583+* `GetConvergenceHistory` for ItILU0 preconditioner
584+
585+### Deprecations
586+* `LocalMatrix::ReadFileCSR`
587+* `LocalMatrix::WriteFileCSR`
588+* `GlobalMatrix::ReadFileCSR`
589+* `GlobalMatrix::WriteFileCSR`
590+
591+## rocALUTION 3.1.1 for ROCm 6.1.0
592+
593+### Additions
594+
595+* `TripleMatrixProduct` functionality for `GlobalMatrix`
596+* Multi-Node/GPU support for `UA-AMG`, `SA-AMG` and `RS-AMG`
597+* Iterative ILU0 preconditioner `ItILU0`
598+* Iterative triangular solve, selectable via `SolverDecr` class
599+
600+### Deprecations
601+
602+* `LocalMatrix::AMGConnect`
603+* `LocalMatrix::AMGAggregate`
604+* `LocalMatrix::AMGPMISAggregate`
605+* `LocalMatrix::AMGSmoothedAggregation`
606+* `LocalMatrix::AMGAggregation`
607+* `PairwiseAMG`
608+
609+### Known Issues
610+* `PairwiseAMG` does currently not support matrix sizes that exceed int32 range
611+* `PairwiseAMG` might fail building the hierarchy on certain input matrices
612+
613+## rocALUTION 3.0.3 for ROCm 6.0.0
614+
615+### Additions
616+
617+* Support for 64bit integer vectors
618+* Inclusive and exclusive sum functionality for vector classes
619+* Transpose functionality for `GlobalMatrix` and `LocalMatrix`
620+* `TripleMatrixProduct` functionality for `LocalMatrix`
621+* `Sort()` function for `LocalVector` class
622+* Multiple stream support to the HIP backend
623+
624+### Optimizations
625+
626+* `GlobalMatrix::Apply()` now uses multiple streams to better hide communication
627+
628+### Changes
629+
630+* Matrix dimensions and number of non-zeros are now stored using 64-bit integers
631+* Improved the ILUT preconditioner
632+
633+### Deprecations
634+
635+* `LocalVector::GetIndexValues(ValueType*)`
636+* `LocalVector::SetIndexValues(const ValueType*)`
637+* `LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*)`
638+* `LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, float, LocalMatrix*, LocalMatrix*)`
639+* `LocalMatrix::RugeStueben()`
640+* `LocalMatrix::AMGSmoothedAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*, int)`
641+* `LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix*, LocalMatrix*)`
642+
643+### Fixes
644+
645+* Unit tests no longer ignore BCSR block dimension
646+* Fixed documentation typos
647+* Bug in multi-coloring for non-symmetric matrix patterns
648+
649+## rocALUTION 2.1.11 for ROCm 5.7.0
650+
651+### Additions
652+
653+* Support for gfx940, gfx941, and gfx942
654+
655+### Fixes
656+
657+* OpenMP runtime issue with Windows toolchain
658+
659+## rocALUTION 2.1.9 for ROCm 5.6.0
660+
661+### Fixes
662+
663+* Synchronization issues in level 1 routines
664+
665+## rocALUTION 2.1.8 for ROCm 5.5.0
666+
667+### Additions
668+
669+* Build support for Navi32
670+
671+### Fixes
672+
673+* Typo in MPI backend
674+* Bug with the backend when HIP support is disabled
675+* Bug in SAAMG hierarchy building on the HIP backend
676+* Improved SAAMG hierarchy build performance on the HIP backend
677+
678+### Deprecations
679+
680+* `LocalVector::GetIndexValues(ValueType*)`: use
681+ `LocalVector::GetIndexValues(const LocalVector&, LocalVector*)` instead
682+* `LocalVector::SetIndexValues(const ValueType*)`: use
683+ `LocalVector::SetIndexValues(const LocalVector&, const LocalVector&)` instead
684+* `LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*)`:
685+ use `LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix*)`
686+ instead
687+* `LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, float, LocalMatrix*, LocalMatrix*)`:
688+ use `LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, LocalMatrix*)`
689+ instead
690+* `LocalMatrix::RugeStueben()`
691+* `LocalMatrix::AMGSmoothedAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*, int)`:
692+ use `LocalMatrix::AMGAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix*, int)`
693+ instead
694+* `LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix*, LocalMatrix*)`: use
695+ `LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix*)` instead
696+
697+## rocALUTION 2.1.3 for ROCm 5.4.0
698+
699+### Additions
700+
701+* Build support for Navi31 and Navi33
702+* Support for non-squared global matrices
703+
704+### Fixes
705+
706+* Memory leak in MatrixMult on HIP backend
707+* Global structures can now be used with a single process
708+
709+### Changes
710+
711+* Switched GTest death test style to 'threadsafe'
712+* Removed the native compiler option that was used during default library compilation
713+
714+### Deprecations
715+
716+* `GlobalVector::GetGhostSize()`
717+* `ParallelManager::GetGlobalSize()`, `ParallelManager::GetLocalSize()`, `ParallelManager::SetGlobalSize()`,
718+ and `ParallelManager::SetLocalSize()`
719+* `Vector::GetGhostSize()`
720+* `Multigrid::SetOperatorFormat(unsigned int)`: use `Multigrid::SetOperatorFormat(unsigned int, int)`
721+ instead
722+* `RugeStuebenAMG::SetCouplingStrength(ValueType)`: use `SetStrengthThreshold(float)` instead
723
724 ## rocALUTION 2.1.0 for ROCm 5.3.0
725-### Added
726-- Benchmarking tool
727-- Ext+I Interpolation with sparsify strategies added for RS-AMG
728-### Improved
729-- ParallelManager
730+
731+### Additions
732+
733+* Benchmarking tool
734+* Ext+I Interpolation with sparsify strategies added for RS-AMG
735+
736+### Optimizations
737+
738+* ParallelManager
739
740 ## rocALUTION 2.0.3 for ROCm 5.2.0
741-### Added
742-- Packages for test and benchmark executables on all supported OSes using CPack.
743+
744+### Additions
745+
746+* New packages for test and benchmark executables on all supported operating systems using CPack
747
748 ## rocALUTION 2.0.2 for ROCm 5.1.0
749-### Added
750-- Added out-of-place matrix transpose functionality
751-- Added LocalVector<bool>
752+
753+### Additions
754+
755+* Added out-of-place matrix transpose functionality
756+* Added `LocalVector<bool>`
757
758 ## rocALUTION 2.0.1 for ROCm 5.0.0
759-### Changed
760-- Removed deprecated GlobalPairwiseAMG class, please use PairwiseAMG instead.
761-- Changed to C++ 14 Standard
762-### Improved
763-- Added sanitizer option
764-- Improved documentation
765+
766+### Changes
767+
768+* Changed to C++ 14 Standard
769+* Added sanitizer option
770+* Improved documentation
771+
772+### Deprecations
773+
774+* `GlobalPairwiseAMG` class: use `PairwiseAMG` instead
775
776 ## rocALUTION 1.13.2 for ROCm 4.5.0
777-### Added
778-- Address sanitizer build option added
779-- Enabling beta support for Windows 10
780-### Changed
781-- Deprecated GlobalPairwiseAMG, please use PairwiseAMG instead. GlobalPairwiseAMG will be removed in a future major release.
782-- Packaging split into a runtime package called rocalution and a development package called rocalution-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release.
783-### Improved
784-- (A)MG smoothing and convergence rate improvement
785-- Improved IDR Gram-Schmidt process
786-- (A)MG solving phase optimization
787+
788+### Additions
789+
790+* AddressSanitizer build option
791+* Enabled beta support for Windows 10
792+
793+### Changes
794+
795+* Packaging has been split into a runtime package (`rocalution`) and a development package
796+ (`rocalution-devel`):
797+ The development package depends on the runtime package. When installing the runtime package,
798+ the package manager will suggest the installation of the development package to aid users
799+ transitioning from the previous version's combined package. The package manager makes this
800+ suggestion on all supported operating systems (except CentOS 7). The `suggestion`
801+ feature in the runtime package is introduced as a deprecated feature and will be removed in a future
802+ ROCm release.
803+
804+### Deprecations
805+
806+* `GlobalPairwiseAMG`: use `PairwiseAMG` instead
807+
808+### Optimizations
809+
810+* Improved (A)MG smoothing and convergence rate
811+* Improved IDR Gram-Schmidt process
812+* Optimized (A)MG solving phase
813
814 ## rocALUTION 1.12.1 for ROCm 4.3.0
815-### Added
816-- support for gfx90a target
817-- support for gfx1030 target
818-### Improved
819-- install script
820-### Known Issues
821-- none
822
823-## rocALUTION 1.11.5 for ROCm 4.2.0
824-### Added
825-- none
826-### Known Issues
827-- none
828+### Additions
829
830-## rocALUTION 1.11.5 for ROCm 4.1.0
831-### Added
832-- none
833-### Known Issues
834-- none
835+* Support for gfx90a target
836+* Support for gfx1030 target
837+
838+### Optimizations
839+
840+* Install script
841
842 ## rocALUTION 1.11.5 for ROCm 4.0.0
843-### Added
844-- Add changelog
845-- Fixing NaN issues
846-- update to debian package name
847-- bcsr format support.
848-- cmake files adjustments.
849+
850+### Additions
851+
852+* Changelog
853+* Block compressed sparse row (BCSR) format support
854+
855+### Changes
856+
857+* Update to the Debian package name
858+* CMake file adjustments
859+
860+### Fixes
861+
862+* NaN issues
863
864 ## rocALUTION 1.10 for ROCm 3.9
865-### Added
866-- rocRAND to support GPU sampling of random data.
867-### Known Issues
868-- none
869+
870+### Additions
871+
872+* rocRAND support for GPU sampling of random data
873
874 ## rocALUTION 1.9.3 for ROCm 3.8
875-### Added
876-- csr2dense and dense2csr to HIP backend.
877-### Known Issues
878-- none
879
880-## rocALUTION 1.9.1 for ROCm 3.7
881-### Added
882-- none
883-### Known Issues
884-- none
885+### Additions
886
887-## rocALUTION 1.9.1 for ROCm 3.6
888-### Added
889-- none
890-### Known Issues
891-- none
892+* `csr2dense` and `dense2csr` to HIP backend
893
894 ## rocALUTION 1.9.1 for ROCm 3.5
895-### Added
896-- static build
897-- BCSR matrix format for SpMV
898-- Bug fixing in conversion from CSR to HYB format.
899-### Known Issues
900-- none
901+
902+### Additions
903+
904+* Static build
905+* BCSR matrix format for SpMV
906+
907+### Fixes
908+
909+* Bug in conversion from CSR to HYB format
910diff --git a/CMakeLists.txt b/CMakeLists.txt
911index 62e94be..8a622f4 100644
912--- a/CMakeLists.txt
913+++ b/CMakeLists.txt
914@@ -1,5 +1,5 @@
915 # ########################################################################
916-# Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
917+# Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
918 #
919 # Permission is hereby granted, free of charge, to any person obtaining a copy
920 # of this software and associated documentation files (the "Software"), to deal
921@@ -38,7 +38,8 @@ list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/llvm ${ROCM_PATH})
922 list(APPEND CMAKE_MODULE_PATH
923 ${CMAKE_CURRENT_SOURCE_DIR}/cmake
924 ${ROCM_PATH}/lib/cmake/hip
925- ${ROCM_PATH}/hip/cmake)
926+ ${ROCM_PATH}/hip/cmake
927+ ${ROCM_PATH}/cmake)
928
929 # Set a default build type if none was specified
930 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
931@@ -56,9 +57,12 @@ endif()
932 project(rocalution LANGUAGES CXX)
933
934 # Build flags
935-set(CMAKE_CXX_STANDARD 14)
936+set(CMAKE_CXX_STANDARD 17)
937 set(CMAKE_CXX_STANDARD_REQUIRED ON)
938 set(CMAKE_CXX_EXTENSIONS OFF)
939+set(CMAKE_HIP_STANDARD 17)
940+set(CMAKE_HIP_STANDARD_REQUIRED ON)
941+set(CMAKE_HIP_EXTENSIONS OFF)
942
943 # Build options
944 option(BUILD_SHARED_LIBS "Build rocALUTION as a shared library" ON)
945@@ -68,31 +72,67 @@ option(BUILD_CLIENTS_SAMPLES "Build examples" ON)
946 option(BUILD_VERBOSE "Output additional build information" OFF)
947 option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF)
948 option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
949-option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON)
950+option(BUILD_GLOBALTYPE_64 "Support global number of rows / columns exceeding 32 bits" OFF)
951+option(BUILD_LOCALTYPE_64 "Support local number of rows / columns exceeding 32 bits" OFF)
952+option(BUILD_PTRTYPE_64 "Support local number of non-zeros exceeding 32 bits" OFF)
953+option(BUILD_OPTCPU "Enable all instruction subsets supported by the local machine" OFF)
954+option(BUILD_SUPPORT_COMPLEX "Enable complex number support" ON)
955
956 # Dependencies
957 include(cmake/Dependencies.cmake)
958
959-# Availability of rocm_check_target_ids command assures that we can also build
960-# for gfx90a target
961-if(COMMAND rocm_check_target_ids)
962- set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1100;gfx1101;gfx1102")
963+if(BUILD_ADDRESS_SANITIZER)
964+ set(DEFAULT_AMDGPU_TARGETS
965+ gfx908:xnack+
966+ gfx90a:xnack+
967+ gfx942:xnack+
968+ )
969 else()
970- set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx1100;gfx1101;gfx1102")
971+ set(DEFAULT_AMDGPU_TARGETS
972+ gfx803
973+ gfx900:xnack-
974+ gfx906:xnack-
975+ gfx908:xnack-
976+ gfx90a:xnack-
977+ gfx90a:xnack+
978+ gfx942
979+ gfx950
980+ gfx1030
981+ gfx1100
982+ gfx1101
983+ gfx1102
984+ gfx1151
985+ gfx1200
986+ gfx1201
987+ )
988 endif()
989-set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
990-
991-# Find HIP package
992-find_package(HIP)
993-
994-if (NOT HIP_FOUND)
995- message("-- HIP not found. Compiling WITHOUT HIP support.")
996+if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
997+ message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." )
998+endif()
999+set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)")
1000+# Don't force, users should be able to override GPU_TARGETS at the command line if desired
1001+set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for")
1002+
1003+include(CheckLanguage)
1004+include(CMakeDependentOption)
1005+check_language(HIP)
1006+cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)
1007+if(USE_HIPCXX)
1008+ enable_language(HIP)
1009 else()
1010- option(SUPPORT_HIP "Compile WITH HIP support." ON)
1011+ find_package(HIP MODULE) # hip_add_library is only provided by the find module
1012+ if(NOT HIP_FOUND)
1013+ message("-- HIP not found. Compiling WITHOUT HIP support.")
1014+ endif()
1015 endif()
1016
1017+cmake_dependent_option(SUPPORT_HIP "Compile WITH HIP support" ON "USE_HIPCXX OR HIP_FOUND" OFF)
1018+
1019 # HIP related library dependencies
1020 if(SUPPORT_HIP)
1021+ if( CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
1022+ find_package( hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm )
1023+ endif( )
1024 find_package(rocblas REQUIRED)
1025 find_package(rocsparse REQUIRED)
1026 find_package(rocprim REQUIRED)
1027@@ -112,9 +152,7 @@ endif()
1028
1029
1030 # Setup version
1031-set(VERSION_STRING "2.1.11")
1032-rocm_setup_version(VERSION ${VERSION_STRING})
1033-set(rocalution_SOVERSION 0.1)
1034+rocm_setup_version(VERSION "4.0.1")
1035
1036 if(BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS)
1037 set( BUILD_CLIENTS ON )
1038@@ -137,7 +175,12 @@ endif()
1039
1040 # Package specific CPACK vars
1041 if(SUPPORT_HIP)
1042- rocm_package_add_dependencies(DEPENDS "hip-rocclr >= 4.0.0" "rocsparse >= 1.12.10" "rocblas >= 2.22.0" "rocrand >= 0.0.1")
1043+ if(BUILD_ADDRESS_SANITIZER)
1044+ set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan >= 4.5.0" )
1045+ else()
1046+ set(DEPENDS_HIP_RUNTIME "hip-runtime-amd >= 4.5.0" )
1047+ endif()
1048+ rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME}" "rocsparse >= 1.12.10" "rocblas >= 2.22.0" "rocrand >= 2.1.0")
1049 endif()
1050
1051 set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
1052@@ -166,7 +209,7 @@ set(ROCALUTION_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LI
1053
1054 rocm_create_package(
1055 NAME ${package_name}
1056- DESCRIPTION "Radeon Open Compute library for sparse linear systems"
1057+ DESCRIPTION "ROCm library for sparse linear systems"
1058 MAINTAINER "rocALUTION Maintainer <rocalution-maintainer@amd.com>"
1059 LDCONFIG
1060 LDCONFIG_DIR ${ROCALUTION_CONFIG_DIR}
1061@@ -202,18 +245,14 @@ if(BUILD_CODE_COVERAGE)
1062 add_custom_target(coverage_output
1063 DEPENDS coverage_analysis
1064 COMMAND mkdir -p lcoverage
1065- COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh
1066- COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh
1067- COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh
1068- COMMAND chmod +x llvm-gcov.sh
1069 )
1070
1071 #
1072 # Generate coverage output.
1073 #
1074 add_custom_command(TARGET coverage_output
1075- COMMAND lcov --directory . --base-directory . --gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info
1076- COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info
1077+ COMMAND lcov --directory . --base-directory . --capture -o lcoverage/raw_main_coverage.info
1078+ COMMAND lcov --remove lcoverage/raw_main_coverage.info "'${CMAKE_SOURCE_DIR}/src/utils/*'" "'${CMAKE_SOURCE_DIR}/src/base/host/host_io.*'" "'${CMAKE_SOURCE_DIR}/clients/*'" "'${CMAKE_SOURCE_DIR}/build/*'" "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info
1079 COMMAND genhtml lcoverage/main_coverage.info --output-directory lcoverage
1080 )
1081
1082diff --git a/LICENSE.md b/LICENSE.md
1083index 7a44e61..e41c5d8 100644
1084--- a/LICENSE.md
1085+++ b/LICENSE.md
1086@@ -1,6 +1,6 @@
1087 MIT License
1088
1089-Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
1090+Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
1091
1092 Permission is hereby granted, free of charge, to any person obtaining a copy
1093 of this software and associated documentation files (the "Software"), to deal
1094diff --git a/README.md b/README.md
1095index b9b6929..d63704f 100644
1096--- a/README.md
1097+++ b/README.md
1098@@ -1,28 +1,52 @@
1099 # rocALUTION
1100-rocALUTION is a sparse linear algebra library with focus on exploring fine-grained parallelism on top of AMD's Radeon Open eCosystem Platform [ROCm][] runtime and toolchains, targeting modern CPU and GPU platforms. Based on C++ and [HIP][], it provides a portable, generic and flexible design that allows seamless integration with other scientific software packages.
1101+
1102+> [!NOTE]
1103+> The published documentation is available at [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
1104+
1105+rocALUTION is a sparse linear algebra library that can be used to explore fine-grained parallelism on
1106+top of the [ROCm](https://github.com/ROCm/ROCm) platform runtime and toolchains.
1107+Based on C++ and [HIP](https://github.com/ROCm/HIP/), rocALUTION
1108+provides a portable, generic, and flexible design that allows seamless integration with other scientific
1109+software packages.
1110+
1111+rocALUTION offers various backends for different (parallel) hardware:
1112+
1113+* Host
1114+* [OpenMP](http://www.openmp.org/): Designed for multi-core CPUs
1115+* [HIP](https://github.com/ROCm/HIP/): Designed for ROCm-compatible devices
1116+* [MPI](https://www.open-mpi.org/): Designed for multi-node clusters and multi-GPU setups
1117+
1118+## Requirements
1119+
1120+To use rocALUTION on GPU devices, you must first install the
1121+[rocBLAS](https://github.com/ROCm/rocBLAS),
1122+[rocSPARSE](https://github.com/ROCm/rocSPARSE), and
1123+[rocRAND](https://github.com/ROCm/rocRAND) libraries. You can install these from
1124+the ROCm repository, the GitHub 'releases' tab, or you can manually compile them.
1125
1126 ## Documentation
1127-The latest rocALUTION documentation and API description can be found [here][].
1128
1129-### How to build documentation
1130+Documentation for rocALUTION is available at
1131+[https://rocm.docs.amd.com/projects/rocALUTION/en/latest/](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/).
1132
1133-Run the following steps to build documentation.
1134+To build our documentation locally, use the following code:
1135
1136-```
1137+```bash
1138 cd docs
1139
1140-pip3 install -r .sphinx/requirements.txt
1141+pip3 install -r sphinx/requirements.txt
1142
1143 python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
1144 ```
1145
1146-## Quickstart rocALUTION build
1147+## Build
1148
1149-#### CMake 3.5 or later
1150-All compiler specifications are determined automatically. The compilation process can be performed by
1151-```
1152+You can compile rocALUTION using CMake 3.5 or later. Note that all compiler specifications are
1153+determined automatically.
1154+
1155+```bash
1156 # Clone rocALUTION using git
1157-git clone https://github.com/ROCmSoftwarePlatform/rocALUTION.git
1158+git clone https://github.com/ROCm/rocALUTION.git
1159
1160 # Go to rocALUTION directory, create and change to build directory
1161 cd rocALUTION; mkdir build; cd build
1162@@ -40,69 +64,56 @@ cmake .. -DSUPPORT_HIP=ON -DROCM_PATH=/opt/rocm/
1163 make
1164 ```
1165
1166-#### Simple test
1167-You can test the installation by running a CG solver on a Laplace matrix:
1168-```
1169+To test your installation, run a CG solver on a Laplacian matrix:
1170+
1171+```bash
1172 cd rocALUTION; cd build
1173 wget ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/laplace/gr_30_30.mtx.gz
1174 gzip -d gr_30_30.mtx.gz
1175 ./clients/staging/cg gr_30_30.mtx
1176 ```
1177
1178-## Overview
1179-
1180-#### Backends
1181-rocALUTION offers various backends for different (parallel) hardware.
1182-* Host
1183-* [OpenMP][] - designed for multi-core CPUs
1184-* [HIP][] - designed for ROCm compatible devices
1185-* [MPI][] - designed for multi-node clusters and multi-GPU setups
1186-
1187-#### Easy to use
1188-Syntax and structure of the library provide fast learning curves. With the help of the examples, anyone can try out the library - no knowledge in multi-core or GPU programming is required.
1189-
1190-#### Requirements
1191-There are no hardware requirements to install and run rocALUTION. If GPU devices are available, rocALUTION will use them.
1192-In order to use rocALUTION on GPU devices, you will need to make sure that [rocBLAS][], [rocSPARSE][] and [rocRAND][] libraries are installed on your system. You can install them from ROCm repository, from github releases tab or manually compile them yourself.
1193-
1194-#### Iterative solvers
1195-* Fixed-Point iteration schemes - Jacobi, (Symmetric) Gauss-Seidel, SOR, SSOR
1196-* Krylov subspace methods - CR, CG, BiCGStab, BiCGStab(*l*), GMRES, IDR, QMRCGSTAB, Flexible CG/GMRES
1197-* Mixed-precision defect-correction scheme
1198-* Chebyshev iteration scheme
1199-* Multigrid - geometric and algebraic
1200-
1201-#### Preconditioners
1202-* Matrix splitting schemes - Jacobi, (multi-colored) (symmetric) Gauss-Seidel, SOR, SSOR
1203-* Factorization schemes - ILU(*0*), ILU(*p*) (based on levels), ILU(*p,q*) (power(*q*)-pattern method), multi-elimination ILU (nested/recursive), ILUT (based on threshold), IC(*0*)
1204-* Approximate Inverses - Chebyshev matrix-valued polynomial, SPAI, FSAI, TNS
1205-* Diagonal-based preconditioner for Saddle-point problems
1206-* Block-type of sub-preconditioners/solvers
1207-* (Restricted) Additive Schwarz
1208-* Variable type of preconditioners
1209-
1210-#### Sparse matrix formats
1211-* Compressed Sparse Row (CSR)
1212-* Modified Compressed Sparse Row (MCSR)
1213-* Dense (DENSE)
1214-* Coordinate (COO)
1215-* ELL
1216-* Diagonal (DIA)
1217-* Hybrid ELL+COO (HYB)
1218-
1219-#### Generic and robust design
1220-rocALUTION is based on a generic and robust design, allowing expansion in the direction of new solvers and preconditioners and support for various hardware types. Furthermore, the design of the library allows the use of all solvers as preconditioners in other solvers, for example you can define a CG solver with a multi-elimination preconditioner, where the last-block is preconditioned with another Chebyshev iteration method which is preconditioned with a multi-colored symmetric Gauss-Seidel scheme.
1221-
1222-#### Portable code and results
1223-All code based on rocALUTION is portable and independent of the hardware, it will compile and run on any supported platform. All solvers and preconditioners are based on a single source code implementation, which delivers portable results across all backends (variations are possible due to different rounding modes on the hardware). The only difference which you can see for a hardware change is the performance variation.
1224-
1225-
1226-
1227-[ROCm]: https://github.com/RadeonOpenCompute/ROCm
1228-[HIP]: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/
1229-[OpenMP]: http://www.openmp.org/
1230-[MPI]: https://www.open-mpi.org/
1231-[rocBLAS]: https://github.com/ROCmSoftwarePlatform/rocBLAS
1232-[rocSPARSE]: https://github.com/ROCmSoftwarePlatform/rocSPARSE
1233-[rocRAND]: https://github.com/ROCmSoftwarePlatform/rocRAND
1234-[here]: https://rocalution.readthedocs.io
1235+## General information
1236+
1237+rocALUTION is based on a generic and robust design that allows expansion in the direction of new
1238+solvers and preconditioners with support for various hardware types. The library's design allows the
1239+use of all solvers as preconditioners in other solvers. For example, you can define a CG solver with a
1240+multi-elimination preconditioner, in which the last-block is preconditioned with another Chebyshev
1241+iteration method that itself is preconditioned with a multi-colored symmetric Gauss-Seidel scheme.
1242+
1243+### Iterative solvers
1244+
1245+* Fixed-point iteration schemes: Jacobi, (Symmetric) Gauss-Seidel, SOR, SSOR
1246+* Krylov subspace methods: CR, CG, BiCGStab, BiCGStab(*l*), GMRES, IDR, QMRCGSTAB,
1247+ Flexible CG/GMRES
1248+* Mixed-precision defect correction scheme
1249+* Chebyshev iteration scheme
1250+* Multigrid: Geometric and algebraic
1251+
1252+### Preconditioners
1253+
1254+* Matrix splitting schemes: Jacobi, (multi-colored) (symmetric) Gauss-Seidel, SOR, SSOR
1255+* Factorization schemes: ILU(*0*), ILU(*p*) (based on levels), ILU(*p,q*) (power(*q*)-pattern method),
1256+ multi-elimination ILU (nested/recursive), ILUT (based on threshold), IC(*0*)
1257+* Approximate Inverses: Chebyshev matrix-valued polynomial, SPAI, FSAI, TNS
1258+* Diagonal-based preconditioner for Saddle-point problems
1259+* Block-type of sub-preconditioners/solvers
1260+* (Restricted) Additive Schwarz
1261+* Variable type of preconditioners
1262+
1263+### Sparse matrix formats
1264+
1265+* Compressed Sparse Row (CSR)
1266+* Modified Compressed Sparse Row (MCSR)
1267+* Dense (DENSE)
1268+* Coordinate (COO)
1269+* ELL
1270+* Diagonal (DIA)
1271+* Hybrid ELL+COO (HYB)
1272+
1273+## Portability
1274+
1275+All code based on rocALUTION is portable and hardware-independent. It compiles and runs on any
1276+supported platform. All solvers and preconditioners are based on a single source code implementation
1277+that delivers portable results across all backends (note that variations are possible due to different
1278+hardware rounding modes). The only visible difference between hardware is performance variation.
1279diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt
1280index 781a24c..aa66ca7 100644
1281--- a/clients/benchmarks/CMakeLists.txt
1282+++ b/clients/benchmarks/CMakeLists.txt
1283@@ -1,5 +1,5 @@
1284 # ########################################################################
1285-# Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1286+# Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1287 #
1288 # Permission is hereby granted, free of charge, to any person obtaining a copy
1289 # of this software and associated documentation files (the "Software"), to deal
1290@@ -29,6 +29,7 @@ set(ROCALUTION_BENCHMARK_SOURCES
1291 rocalution_bench_solver_results.cpp
1292 rocalution_enum_coarsening_strategy.cpp
1293 rocalution_enum_directsolver.cpp
1294+ rocalution_enum_itilu0_alg.cpp
1295 rocalution_enum_itsolver.cpp
1296 rocalution_enum_matrix_init.cpp
1297 rocalution_enum_preconditioner.cpp
1298@@ -40,7 +41,7 @@ set(ROCALUTION_BENCHMARK_SOURCES
1299 add_executable(rocalution-bench ${ROCALUTION_BENCHMARK_SOURCES} ${ROCALUTION_CLIENTS_COMMON})
1300
1301 # Target compile options
1302-target_compile_options(rocalution-bench PRIVATE -Wno-unused-command-line-argument -Wall)
1303+target_compile_options(rocalution-bench PRIVATE -Wall)
1304
1305 # Internal common header
1306 target_include_directories(rocalution-bench PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>)
1307diff --git a/clients/benchmarks/rocalution_arguments_config.cpp b/clients/benchmarks/rocalution_arguments_config.cpp
1308index 61cf6eb..692f1ea 100644
1309--- a/clients/benchmarks/rocalution_arguments_config.cpp
1310+++ b/clients/benchmarks/rocalution_arguments_config.cpp
1311@@ -1,6 +1,6 @@
1312 /*! \file */
1313 /* ************************************************************************
1314-* Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1315+* Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1316 *
1317 * Permission is hereby granted, free of charge, to any person obtaining a copy
1318 * of this software and associated documentation files (the "Software"), to deal
1319@@ -82,6 +82,16 @@ void rocalution_arguments_config::set_description(options_description& desc)
1320 ADD_OPTION(double, e, 0.05, "ilut tolerance");
1321 break;
1322 }
1323+ case rocalution_bench_solver_parameters::itsolve_tol:
1324+ {
1325+ ADD_OPTION(double, e, 1e-03, "iterative solve tolerance (see --iterative_solve).");
1326+ break;
1327+ }
1328+ case rocalution_bench_solver_parameters::itilu0_tol:
1329+ {
1330+ ADD_OPTION(double, e, 2e-07, "itilu0 tolerance");
1331+ break;
1332+ }
1333 case rocalution_bench_solver_parameters::mcgs_relax:
1334 {
1335 ADD_OPTION(double, e, 1.0, "relaxation coefficient");
1336@@ -155,6 +165,27 @@ void rocalution_arguments_config::set_description(options_description& desc)
1337 break;
1338 }
1339
1340+ case rocalution_bench_solver_parameters::itsolve_max_iter:
1341+ {
1342+ ADD_OPTION(int,
1343+ e,
1344+ 30,
1345+ "iterative solve maximum number of iterations (see "
1346+ "--iterative_solve).");
1347+ break;
1348+ }
1349+ case rocalution_bench_solver_parameters::itilu0_max_iter:
1350+ {
1351+ ADD_OPTION(int, e, 1000, "itilu0 maximum number of iterations.");
1352+ break;
1353+ }
1354+
1355+ case rocalution_bench_solver_parameters::itilu0_options:
1356+ {
1357+ ADD_OPTION(int, e, 1000, "itilu0 combination of options.");
1358+ break;
1359+ }
1360+
1361 case rocalution_bench_solver_parameters::mcilu_p:
1362 {
1363 ADD_OPTION(int, e, 0, "multicolored ilu parameter p.");
1364@@ -219,6 +250,11 @@ void rocalution_arguments_config::set_description(options_description& desc)
1365 ADD_OPTION(std::string, e, "", "coarsening strategy");
1366 break;
1367 }
1368+ case rocalution_bench_solver_parameters::itilu0_alg:
1369+ {
1370+ ADD_OPTION(std::string, e, "Default", "ItILU0 algorithm");
1371+ break;
1372+ }
1373 case rocalution_bench_solver_parameters::matrix:
1374 {
1375 ADD_OPTION(std::string, e, "", "matrix initialization");
1376@@ -243,6 +279,11 @@ void rocalution_arguments_config::set_description(options_description& desc)
1377 {
1378 switch(e)
1379 {
1380+ case rocalution_bench_solver_parameters::iterative_solve:
1381+ {
1382+ ADD_OPTION(bool, e, false, "perform triangular iterative solve during solving step.");
1383+ break;
1384+ }
1385 case rocalution_bench_solver_parameters::verbose:
1386 {
1387 ADD_OPTION(bool, e, false, "verbose");
1388@@ -335,6 +376,17 @@ int rocalution_arguments_config::parse(int& argc, char**& argv, options_descript
1389
1390 break;
1391 }
1392+ case rocalution_bench_solver_parameters::itilu0_alg:
1393+ {
1394+
1395+ auto itilu0_alg_string = this->Get(rocalution_bench_solver_parameters::itilu0_alg);
1396+ if(itilu0_alg_string != "")
1397+ {
1398+ this->m_enum_itilu0_alg(itilu0_alg_string.c_str());
1399+ }
1400+
1401+ break;
1402+ }
1403
1404 case rocalution_bench_solver_parameters::matrix:
1405 {
1406diff --git a/clients/benchmarks/rocalution_bench_solver_parameters.cpp b/clients/benchmarks/rocalution_bench_solver_parameters.cpp
1407index d48975d..e6f4268 100644
1408--- a/clients/benchmarks/rocalution_bench_solver_parameters.cpp
1409+++ b/clients/benchmarks/rocalution_bench_solver_parameters.cpp
1410@@ -1,5 +1,5 @@
1411 /* ************************************************************************
1412- * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1413+ * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1414 *
1415 * Permission is hereby granted, free of charge, to any person obtaining a copy
1416 * of this software and associated documentation files (the "Software"), to deal
1417@@ -35,6 +35,10 @@ rocalution_enum_preconditioner rocalution_bench_solver_parameters::GetEnumPrecon
1418 {
1419 return this->m_enum_preconditioner;
1420 };
1421+rocalution_enum_itilu0_alg rocalution_bench_solver_parameters::GetEnumItILU0Algorithm() const
1422+{
1423+ return this->m_enum_itilu0_alg;
1424+};
1425 rocalution_enum_itsolver rocalution_bench_solver_parameters::GetEnumIterativeSolver() const
1426 {
1427 return this->m_enum_itsolver;
1428diff --git a/clients/benchmarks/rocalution_bench_solver_parameters.hpp b/clients/benchmarks/rocalution_bench_solver_parameters.hpp
1429index 67d5721..debe199 100644
1430--- a/clients/benchmarks/rocalution_bench_solver_parameters.hpp
1431+++ b/clients/benchmarks/rocalution_bench_solver_parameters.hpp
1432@@ -1,5 +1,5 @@
1433 /* ************************************************************************
1434- * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1435+ * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1436 *
1437 * Permission is hereby granted, free of charge, to any person obtaining a copy
1438 * of this software and associated documentation files (the "Software"), to deal
1439@@ -25,6 +25,7 @@
1440
1441 #include "rocalution_enum_coarsening_strategy.hpp"
1442 #include "rocalution_enum_directsolver.hpp"
1443+#include "rocalution_enum_itilu0_alg.hpp"
1444 #include "rocalution_enum_itsolver.hpp"
1445 #include "rocalution_enum_matrix_init.hpp"
1446 #include "rocalution_enum_preconditioner.hpp"
1447@@ -44,6 +45,11 @@ protected:
1448 rocalution_enum_matrix_init m_enum_matrix_init{};
1449
1450 //
1451+ // @brief Which ItILU0 algorithm.
1452+ //
1453+ rocalution_enum_itilu0_alg m_enum_itilu0_alg{};
1454+
1455+ //
1456 // @brief Which iterative solver.
1457 //
1458 rocalution_enum_itsolver m_enum_itsolver{};
1459@@ -90,6 +96,10 @@ public:
1460 //
1461 rocalution_enum_itsolver GetEnumIterativeSolver() const;
1462 //
1463+ // @brief Get which ItILU0 algorithm.
1464+ //
1465+ rocalution_enum_itilu0_alg GetEnumItILU0Algorithm() const;
1466+ //
1467 // @brief Get which matrix initialization
1468 //
1469 rocalution_enum_matrix_init GetEnumMatrixInit() const;
1470@@ -100,6 +110,7 @@ public:
1471
1472 // clang-format off
1473 #define PBOOL_TRANSFORM_EACH \
1474+ PBOOL_TRANSFORM(iterative_solve) \
1475 PBOOL_TRANSFORM(verbose) \
1476 PBOOL_TRANSFORM(mcilu_use_level)
1477 // clang-format on
1478@@ -122,6 +133,9 @@ public:
1479 PINT_TRANSFORM(krylov_basis) \
1480 PINT_TRANSFORM(ndim) \
1481 PINT_TRANSFORM(ilut_n) \
1482+ PINT_TRANSFORM(itsolve_max_iter) \
1483+ PINT_TRANSFORM(itilu0_max_iter) \
1484+ PINT_TRANSFORM(itilu0_options) \
1485 PINT_TRANSFORM(mcilu_p) \
1486 PINT_TRANSFORM(mcilu_q) \
1487 PINT_TRANSFORM(max_iter) \
1488@@ -153,6 +167,7 @@ public:
1489 PSTRING_TRANSFORM(coarsening_strategy) \
1490 PSTRING_TRANSFORM(direct_solver) \
1491 PSTRING_TRANSFORM(iterative_solver) \
1492+ PSTRING_TRANSFORM(itilu0_alg) \
1493 PSTRING_TRANSFORM(matrix) \
1494 PSTRING_TRANSFORM(matrix_filename) \
1495 PSTRING_TRANSFORM(preconditioner) \
1496@@ -197,6 +212,8 @@ public:
1497 PDOUBLE_TRANSFORM(div_tol) \
1498 PDOUBLE_TRANSFORM(residual_tol) \
1499 PDOUBLE_TRANSFORM(ilut_tol) \
1500+ PDOUBLE_TRANSFORM(itsolve_tol) \
1501+ PDOUBLE_TRANSFORM(itilu0_tol) \
1502 PDOUBLE_TRANSFORM(mcgs_relax) \
1503 PDOUBLE_TRANSFORM(solver_over_interp) \
1504 PDOUBLE_TRANSFORM(solver_coupling_strength) \
1505diff --git a/clients/benchmarks/rocalution_driver_itsolver.hpp b/clients/benchmarks/rocalution_driver_itsolver.hpp
1506index 9bf4e36..1dc7ba4 100644
1507--- a/clients/benchmarks/rocalution_driver_itsolver.hpp
1508+++ b/clients/benchmarks/rocalution_driver_itsolver.hpp
1509@@ -1,5 +1,5 @@
1510 /* ************************************************************************
1511- * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1512+ * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1513 *
1514 * Permission is hereby granted, free of charge, to any person obtaining a copy
1515 * of this software and associated documentation files (the "Software"), to deal
1516@@ -256,7 +256,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1517 AIChebyshev<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>;
1518 p->Set(3, lambda_max / 7.0, lambda_max);
1519 this->m_preconditioner = p;
1520- return true;
1521+ break;
1522 }
1523
1524 case rocalution_enum_preconditioner::FSAI:
1525@@ -264,7 +264,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1526 auto* p
1527 = new rocalution::FSAI<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>;
1528 this->m_preconditioner = p;
1529- return true;
1530+ break;
1531 }
1532
1533 case rocalution_enum_preconditioner::SPAI:
1534@@ -272,14 +272,14 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1535 auto* p
1536 = new rocalution::SPAI<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>;
1537 this->m_preconditioner = p;
1538- return true;
1539+ break;
1540 }
1541 case rocalution_enum_preconditioner::TNS:
1542 {
1543 auto* p
1544 = new rocalution::TNS<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>;
1545 this->m_preconditioner = p;
1546- return true;
1547+ break;
1548 }
1549
1550 case rocalution_enum_preconditioner::Jacobi:
1551@@ -290,7 +290,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1552 // no specific parameters
1553 //
1554 this->m_preconditioner = p;
1555- return true;
1556+ break;
1557 }
1558
1559 case rocalution_enum_preconditioner::GS:
1560@@ -300,7 +300,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1561 // no specific parameters
1562 //
1563 this->m_preconditioner = p;
1564- return true;
1565+ break;
1566 }
1567
1568 case rocalution_enum_preconditioner::SGS:
1569@@ -311,7 +311,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1570 // no specific parameters
1571 //
1572 this->m_preconditioner = p;
1573- return true;
1574+ break;
1575 }
1576
1577 case rocalution_enum_preconditioner::ILU:
1578@@ -322,6 +322,53 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1579 // no specific parameters
1580 //
1581 this->m_preconditioner = p;
1582+ break;
1583+ }
1584+ case rocalution_enum_preconditioner::ItILU0:
1585+ {
1586+ auto enum_itilu0_alg = parameters.GetEnumItILU0Algorithm();
1587+ if(enum_itilu0_alg.is_invalid())
1588+ {
1589+ rocalution_bench_errmsg << "enum_itilu0_alg is invalid." << std::endl;
1590+ return false;
1591+ }
1592+
1593+ auto* p
1594+ = new rocalution::ItILU0<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>;
1595+ p->SetMaxIter(parameters.Get(params_t::itilu0_max_iter));
1596+ p->SetTolerance(parameters.Get(params_t::itilu0_tol));
1597+ p->SetOptions(parameters.Get(params_t::itilu0_options));
1598+
1599+ switch(enum_itilu0_alg.value)
1600+ {
1601+ case rocalution_enum_itilu0_alg::Default:
1602+ {
1603+ p->SetAlgorithm(ItILU0Algorithm::Default);
1604+ break;
1605+ }
1606+ case rocalution_enum_itilu0_alg::AsyncInPlace:
1607+ {
1608+ p->SetAlgorithm(ItILU0Algorithm::AsyncInPlace);
1609+ break;
1610+ }
1611+ case rocalution_enum_itilu0_alg::AsyncSplit:
1612+ {
1613+ p->SetAlgorithm(ItILU0Algorithm::AsyncSplit);
1614+ break;
1615+ }
1616+ case rocalution_enum_itilu0_alg::SyncSplit:
1617+ {
1618+ p->SetAlgorithm(ItILU0Algorithm::SyncSplit);
1619+ break;
1620+ }
1621+ case rocalution_enum_itilu0_alg::SyncSplitFusion:
1622+ {
1623+ p->SetAlgorithm(ItILU0Algorithm::SyncSplitFusion);
1624+ break;
1625+ }
1626+ }
1627+
1628+ this->m_preconditioner = p;
1629 return true;
1630 }
1631 case rocalution_enum_preconditioner::ILUT:
1632@@ -331,7 +378,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1633 p->Set(parameters.Get(params_t::ilut_tol), parameters.Get(params_t::ilut_n));
1634
1635 this->m_preconditioner = p;
1636- return true;
1637+ break;
1638 }
1639 case rocalution_enum_preconditioner::IC:
1640 {
1641@@ -340,7 +387,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1642 // no specific parameters
1643 //
1644 this->m_preconditioner = p;
1645- return true;
1646+ break;
1647 }
1648 case rocalution_enum_preconditioner::MCGS:
1649 {
1650@@ -348,7 +395,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1651 MultiColoredGS<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>;
1652 p->SetRelaxation(parameters.Get(params_t::mcgs_relax));
1653 this->m_preconditioner = p;
1654- return true;
1655+ break;
1656 }
1657 case rocalution_enum_preconditioner::MCSGS:
1658 {
1659@@ -358,7 +405,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1660 // no specific parameters
1661 //
1662 this->m_preconditioner = p;
1663- return true;
1664+ break;
1665 }
1666 case rocalution_enum_preconditioner::MCILU:
1667 {
1668@@ -370,8 +417,30 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO
1669 parameters.Get(params_t::mcilu_use_level));
1670
1671 this->m_preconditioner = p;
1672- return true;
1673+ break;
1674+ }
1675 }
1676+
1677+ if(this->m_preconditioner != nullptr)
1678+ {
1679+ const auto itsolve = parameters.Get(params_t::iterative_solve);
1680+
1681+ rocalution::SolverDescr descr;
1682+
1683+ if(itsolve)
1684+ {
1685+ descr.SetTriSolverAlg(TriSolverAlg_Iterative);
1686+ descr.SetIterativeSolverMaxIteration(parameters.Get(params_t::itsolve_max_iter));
1687+ descr.SetIterativeSolverTolerance(parameters.Get(params_t::itsolve_tol));
1688+ }
1689+ else
1690+ {
1691+ descr.SetTriSolverAlg(TriSolverAlg_Default);
1692+ }
1693+
1694+ this->m_preconditioner->SetSolverDescriptor(descr);
1695+
1696+ return true;
1697 }
1698
1699 return false;
1700diff --git a/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp b/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp
1701index 7d83c9c..e513a85 100644
1702--- a/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp
1703+++ b/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp
1704@@ -1,5 +1,5 @@
1705 /* ************************************************************************
1706- * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1707+ * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1708 *
1709 * Permission is hereby granted, free of charge, to any person obtaining a copy
1710 * of this software and associated documentation files (the "Software"), to deal
1711@@ -181,7 +181,7 @@ struct rocalution_driver_itsolver<rocalution_enum_itsolver::uaamg, T>
1712 preconditioner->SetSolver(cgs);
1713 preconditioner->SetSmootherPreIter(pre_smooth);
1714 preconditioner->SetSmootherPostIter(post_smooth);
1715- preconditioner->SetOperatorFormat(format);
1716+ preconditioner->SetOperatorFormat(format, parameters.Get(params_t::blockdim));
1717 preconditioner->InitMaxIter(1);
1718 preconditioner->Verbose(0);
1719
1720diff --git a/clients/benchmarks/rocalution_enum_itilu0_alg.cpp b/clients/benchmarks/rocalution_enum_itilu0_alg.cpp
1721new file mode 100644
1722index 0000000..c13ba38
1723--- /dev/null
1724+++ b/clients/benchmarks/rocalution_enum_itilu0_alg.cpp
1725@@ -0,0 +1,99 @@
1726+/*! \file */
1727+/* ************************************************************************
1728+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights Reserved.
1729+*
1730+* Permission is hereby granted, free of charge, to any person obtaining a copy
1731+* of this software and associated documentation files (the "Software"), to deal
1732+* in the Software without restriction, including without limitation the rights
1733+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1734+* copies of the Software, and to permit persons to whom the Software is
1735+* furnished to do so, subject to the following conditions:
1736+*
1737+* The above copyright notice and this permission notice shall be included in
1738+* all copies or substantial portions of the Software.
1739+*
1740+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1741+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1742+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1743+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1744+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1745+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1746+* THE SOFTWARE.
1747+*
1748+* ************************************************************************ */
1749+#include "rocalution_enum_itilu0_alg.hpp"
1750+#include <iostream>
1751+constexpr const char* rocalution_enum_itilu0_alg::names[rocalution_enum_itilu0_alg::size];
1752+constexpr rocalution_enum_itilu0_alg::value_type rocalution_enum_itilu0_alg::all[];
1753+
1754+const char* rocalution_enum_itilu0_alg::to_string() const
1755+{
1756+ return rocalution_enum_itilu0_alg::to_string(this->value);
1757+}
1758+
1759+bool rocalution_enum_itilu0_alg::is_invalid() const
1760+{
1761+ for(auto v : all)
1762+ {
1763+ if(this->value == v)
1764+ {
1765+ return false;
1766+ }
1767+ }
1768+ return true;
1769+}
1770+
1771+rocalution_enum_itilu0_alg::rocalution_enum_itilu0_alg(const char* itilu0_alg_name)
1772+{
1773+ this->value = (value_type)-1;
1774+ for(auto v : all)
1775+ {
1776+ const char* str = names[v];
1777+ if(!strcmp(itilu0_alg_name, str))
1778+ {
1779+ this->value = v;
1780+ return;
1781+ }
1782+ }
1783+
1784+ rocalution_bench_errmsg << "ItILU0 algorithm '" << itilu0_alg_name
1785+ << "' is invalid, the list of valid ItILU0 algorithms is" << std::endl;
1786+ for(auto v : all)
1787+ {
1788+ const char* str = names[v];
1789+ rocalution_bench_errmsg << " - '" << str << "'" << std::endl;
1790+ }
1791+ throw false;
1792+}
1793+
1794+//
1795+// Default constructor.
1796+//
1797+rocalution_enum_itilu0_alg::rocalution_enum_itilu0_alg()
1798+ : value((value_type)-1){};
1799+
1800+//
1801+//
1802+//
1803+rocalution_enum_itilu0_alg& rocalution_enum_itilu0_alg::operator()(const char* itilu0_alg_name)
1804+{
1805+ this->value = (value_type)-1;
1806+ for(auto v : all)
1807+ {
1808+ const char* str = names[v];
1809+ if(!strcmp(itilu0_alg_name, str))
1810+ {
1811+ this->value = v;
1812+ return *this;
1813+ }
1814+ }
1815+
1816+ rocalution_bench_errmsg << "ItILU0 algorithm '" << itilu0_alg_name
1817+ << "' is invalid, the list of valid ItILU0 algorithms is" << std::endl;
1818+ for(auto v : all)
1819+ {
1820+ const char* str = names[v];
1821+ rocalution_bench_errmsg << " - '" << str << "'" << std::endl;
1822+ }
1823+ throw false;
1824+}
1825diff --git a/clients/benchmarks/rocalution_enum_itilu0_alg.hpp b/clients/benchmarks/rocalution_enum_itilu0_alg.hpp
1826new file mode 100644
1827index 0000000..eaa6863
1828--- /dev/null
1829+++ b/clients/benchmarks/rocalution_enum_itilu0_alg.hpp
1830@@ -0,0 +1,92 @@
1831+/*! \file */
1832+/* ************************************************************************
1833+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights Reserved.
1834+*
1835+* Permission is hereby granted, free of charge, to any person obtaining a copy
1836+* of this software and associated documentation files (the "Software"), to deal
1837+* in the Software without restriction, including without limitation the rights
1838+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1839+* copies of the Software, and to permit persons to whom the Software is
1840+* furnished to do so, subject to the following conditions:
1841+*
1842+* The above copyright notice and this permission notice shall be included in
1843+* all copies or substantial portions of the Software.
1844+*
1845+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1846+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1847+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1848+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1849+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1850+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1851+* THE SOFTWARE.
1852+*
1853+* ************************************************************************ */
1854+#pragma once
1855+#include "utility.hpp"
1856+#include <cstring>
1857+
1858+//
1859+// List the enumeration values.
1860+//
1861+
1862+// clang-format off
1863+#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH \
1864+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(Default) \
1865+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(AsyncInPlace) \
1866+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(AsyncSplit) \
1867+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(SyncSplit) \
1868+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(SyncSplitFusion)
1869+// clang-format on
1870+
1871+struct rocalution_enum_itilu0_alg
1872+{
1873+public:
1874+#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(x_) x_,
1875+ typedef enum rocalution_enum_itilu0_alg__ : int
1876+ {
1877+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH
1878+ } value_type;
1879+ static constexpr value_type all[] = {ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH};
1880+#undef ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM
1881+ static constexpr std::size_t size = countof(all);
1882+ value_type value{};
1883+
1884+private:
1885+#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(x_) #x_,
1886+ static constexpr const char* names[size]{ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH};
1887+#undef ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM
1888+public:
1889+ operator value_type() const
1890+ {
1891+ return this->value;
1892+ };
1893+ rocalution_enum_itilu0_alg();
1894+ rocalution_enum_itilu0_alg& operator()(const char* function);
1895+ rocalution_enum_itilu0_alg(const char* function);
1896+ const char* to_string() const;
1897+ bool is_invalid() const;
1898+ static inline const char* to_string(rocalution_enum_itilu0_alg::value_type value)
1899+ {
1900+ //
1901+ // switch for checking inconsistency.
1902+ //
1903+ switch(value)
1904+ {
1905+ ///
1906+#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(x_) \
1907+ case x_: \
1908+ { \
1909+ if(strcmp(#x_, names[value])) \
1910+ return nullptr; \
1911+ break; \
1912+ }
1913+
1914+ ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH;
1915+
1916+#undef ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM
1917+ ///
1918+ }
1919+
1920+ return names[value];
1921+ }
1922+};
1923diff --git a/clients/benchmarks/rocalution_enum_preconditioner.hpp b/clients/benchmarks/rocalution_enum_preconditioner.hpp
1924index 2eaacf5..2063476 100644
1925--- a/clients/benchmarks/rocalution_enum_preconditioner.hpp
1926+++ b/clients/benchmarks/rocalution_enum_preconditioner.hpp
1927@@ -1,5 +1,5 @@
1928 /* ************************************************************************
1929- * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
1930+ * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
1931 *
1932 * Permission is hereby granted, free of charge, to any person obtaining a copy
1933 * of this software and associated documentation files (the "Software"), to deal
1934@@ -40,6 +40,7 @@ struct rocalution_enum_preconditioner
1935 ENUM_PRECONDITIONER(GS) \
1936 ENUM_PRECONDITIONER(SGS) \
1937 ENUM_PRECONDITIONER(ILU) \
1938+ ENUM_PRECONDITIONER(ItILU0) \
1939 ENUM_PRECONDITIONER(ILUT) \
1940 ENUM_PRECONDITIONER(IC) \
1941 ENUM_PRECONDITIONER(MCGS) \
1942diff --git a/clients/include/common.hpp b/clients/include/common.hpp
1943index 5cdb250..00acd2f 100644
1944--- a/clients/include/common.hpp
1945+++ b/clients/include/common.hpp
1946@@ -1,5 +1,5 @@
1947 /* ************************************************************************
1948- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
1949+ * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved.
1950 *
1951 * Permission is hereby granted, free of charge, to any person obtaining a copy
1952 * of this software and associated documentation files (the "Software"), to deal
1953@@ -22,11 +22,36 @@
1954 * ************************************************************************ */
1955
1956 #include <cstring>
1957+#include <map>
1958 #include <mpi.h>
1959 #include <rocalution/rocalution.hpp>
1960+#include <set>
1961
1962 using namespace rocalution;
1963
1964+static void my_irecv(int* buf, int count, int source, int tag, MPI_Comm comm, MPI_Request* request)
1965+{
1966+ MPI_Irecv(buf, count, MPI_INT, source, tag, comm, request);
1967+}
1968+
1969+static void
1970+ my_irecv(int64_t* buf, int count, int source, int tag, MPI_Comm comm, MPI_Request* request)
1971+{
1972+ MPI_Irecv(buf, count, MPI_INT64_T, source, tag, comm, request);
1973+}
1974+
1975+static void
1976+ my_isend(const int* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request)
1977+{
1978+ MPI_Isend(buf, count, MPI_INT, dest, tag, comm, request);
1979+}
1980+
1981+static void
1982+ my_isend(const int64_t* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request)
1983+{
1984+ MPI_Isend(buf, count, MPI_INT64_T, dest, tag, comm, request);
1985+}
1986+
1987 template <typename ValueType>
1988 void distribute_matrix(const MPI_Comm* comm,
1989 LocalMatrix<ValueType>* lmat,
1990@@ -39,11 +64,11 @@ void distribute_matrix(const MPI_Comm* comm,
1991 MPI_Comm_rank(*comm, &rank);
1992 MPI_Comm_size(*comm, &num_procs);
1993
1994- size_t global_nrow = lmat->GetM();
1995- size_t global_ncol = lmat->GetN();
1996- size_t global_nnz = lmat->GetNnz();
1997+ int64_t global_nrow = lmat->GetM();
1998+ int64_t global_ncol = lmat->GetN();
1999+ int64_t global_nnz = lmat->GetNnz();
2000
2001- int* global_row_offset = NULL;
2002+ PtrType* global_row_offset = NULL;
2003 int* global_col = NULL;
2004 ValueType* global_val = NULL;
2005
2006@@ -74,14 +99,14 @@ void distribute_matrix(const MPI_Comm* comm,
2007
2008 if(global_nrow % num_procs != 0)
2009 {
2010- for(size_t i = 0; i < global_nrow % num_procs; ++i)
2011+ for(int i = 0; i < global_nrow % num_procs; ++i)
2012 {
2013 ++local_size[i];
2014 }
2015 }
2016
2017 // Compute index offsets
2018- std::vector<int> index_offset(num_procs + 1);
2019+ std::vector<PtrType> index_offset(num_procs + 1);
2020 index_offset[0] = 0;
2021 for(int i = 0; i < num_procs; ++i)
2022 {
2023@@ -89,10 +114,10 @@ void distribute_matrix(const MPI_Comm* comm,
2024 }
2025
2026 // Read sub matrix - row_offset
2027- int local_nrow = local_size[rank];
2028- std::vector<int> local_row_offset(local_nrow + 1);
2029+ int local_nrow = local_size[rank];
2030+ std::vector<PtrType> local_row_offset(local_nrow + 1);
2031
2032- for(int i = index_offset[rank], k = 0; k < local_nrow + 1; ++i, ++k)
2033+ for(PtrType i = index_offset[rank], k = 0; k < local_nrow + 1; ++i, ++k)
2034 {
2035 local_row_offset[k] = global_row_offset[i];
2036 }
2037@@ -100,11 +125,11 @@ void distribute_matrix(const MPI_Comm* comm,
2038 free_host(&global_row_offset);
2039
2040 // Read sub matrix - col and val
2041- int local_nnz = local_row_offset[local_nrow] - local_row_offset[0];
2042+ PtrType local_nnz = local_row_offset[local_nrow] - local_row_offset[0];
2043 std::vector<int> local_col(local_nnz);
2044 std::vector<ValueType> local_val(local_nnz);
2045
2046- for(int i = local_row_offset[0], k = 0; k < local_nnz; ++i, ++k)
2047+ for(PtrType i = local_row_offset[0], k = 0; k < local_nnz; ++i, ++k)
2048 {
2049 local_col[k] = global_col[i];
2050 local_val[k] = global_val[i];
2051@@ -120,18 +145,18 @@ void distribute_matrix(const MPI_Comm* comm,
2052 local_row_offset[i] -= shift;
2053 }
2054
2055- int interior_nnz = 0;
2056- int ghost_nnz = 0;
2057- int boundary_nnz = 0;
2058- int neighbors = 0;
2059+ PtrType interior_nnz = 0;
2060+ PtrType ghost_nnz = 0;
2061+ int boundary_nnz = 0;
2062+ int neighbors = 0;
2063
2064- std::vector<std::vector<int>> boundary(num_procs, std::vector<int>());
2065- std::vector<bool> neighbor(num_procs, false);
2066- std::vector<std::map<int, bool>> checked(num_procs, std::map<int, bool>());
2067+ std::vector<std::vector<PtrType>> boundary(num_procs, std::vector<PtrType>());
2068+ std::vector<bool> neighbor(num_procs, false);
2069+ std::vector<std::map<int, bool>> checked(num_procs, std::map<int, bool>());
2070
2071 for(int i = 0; i < local_nrow; ++i)
2072 {
2073- for(int j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j)
2074+ for(PtrType j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j)
2075 {
2076
2077 // Interior point
2078@@ -214,7 +239,7 @@ void distribute_matrix(const MPI_Comm* comm,
2079 if(neighbor[i] == true)
2080 {
2081 // Receive size of boundary from rank i to current rank
2082- MPI_Irecv(&(boundary_size[n]), 1, MPI_INT, i, 0, *comm, &mpi_req[n]);
2083+ my_irecv(&(boundary_size[n]), 1, i, 0, *comm, &mpi_req[n]);
2084 ++n;
2085 }
2086 }
2087@@ -227,7 +252,7 @@ void distribute_matrix(const MPI_Comm* comm,
2088 {
2089 int size = boundary[i].size();
2090 // Send size of boundary from current rank to rank i
2091- MPI_Isend(&size, 1, MPI_INT, i, 0, *comm, &mpi_req[n]);
2092+ my_isend(&size, 1, i, 0, *comm, &mpi_req[n]);
2093 ++n;
2094 }
2095 }
2096@@ -256,7 +281,7 @@ void distribute_matrix(const MPI_Comm* comm,
2097 }
2098
2099 // Array to hold boundary for each interface
2100- std::vector<std::vector<int>> local_boundary(neighbors);
2101+ std::vector<std::vector<PtrType>> local_boundary(neighbors);
2102 for(int i = 0; i < neighbors; ++i)
2103 {
2104 local_boundary[i].resize(boundary_size[i]);
2105@@ -269,8 +294,7 @@ void distribute_matrix(const MPI_Comm* comm,
2106 if(neighbor[i] == true)
2107 {
2108 // Receive boundary from rank i to current rank
2109- MPI_Irecv(
2110- local_boundary[n].data(), boundary_size[n], MPI_INT, i, 0, *comm, &mpi_req[n]);
2111+ my_irecv(local_boundary[n].data(), boundary_size[n], i, 0, *comm, &mpi_req[n]);
2112 ++n;
2113 }
2114 }
2115@@ -282,7 +306,7 @@ void distribute_matrix(const MPI_Comm* comm,
2116 if(boundary[i].size() > 0)
2117 {
2118 // Send boundary from current rank to rank i
2119- MPI_Isend(&(boundary[i][0]), boundary[i].size(), MPI_INT, i, 0, *comm, &mpi_req[n]);
2120+ my_isend(&(boundary[i][0]), boundary[i].size(), i, 0, *comm, &mpi_req[n]);
2121 ++n;
2122 }
2123 }
2124@@ -305,13 +329,13 @@ void distribute_matrix(const MPI_Comm* comm,
2125 {
2126 for(unsigned int j = 0; j < boundary[i].size(); ++j)
2127 {
2128- bnd[k] = boundary[i][j] - index_offset[rank];
2129+ bnd[k] = static_cast<int>(boundary[i][j] - index_offset[rank]);
2130 ++k;
2131 }
2132 }
2133
2134 // Create boundary index array
2135- std::vector<int> boundary_index(nnz_boundary);
2136+ std::vector<PtrType> boundary_index(nnz_boundary);
2137
2138 k = 0;
2139 for(int i = 0; i < neighbors; ++i)
2140@@ -340,11 +364,11 @@ void distribute_matrix(const MPI_Comm* comm,
2141 memset(ghost_col, 0, sizeof(int) * ghost_nnz);
2142 memset(ghost_val, 0, sizeof(ValueType) * ghost_nnz);
2143
2144- int* row_offset = new int[local_nrow + 1];
2145+ PtrType* row_offset = new PtrType[local_nrow + 1];
2146 int* col = new int[interior_nnz];
2147 ValueType* val = new ValueType[interior_nnz];
2148
2149- memset(row_offset, 0, sizeof(int) * (local_nrow + 1));
2150+ memset(row_offset, 0, sizeof(PtrType) * (local_nrow + 1));
2151 memset(col, 0, sizeof(int) * interior_nnz);
2152 memset(val, 0, sizeof(ValueType) * interior_nnz);
2153
2154@@ -353,7 +377,7 @@ void distribute_matrix(const MPI_Comm* comm,
2155 int l = 0;
2156 for(int i = 0; i < local_nrow; ++i)
2157 {
2158- for(int j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j)
2159+ for(PtrType j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j)
2160 {
2161
2162 // Boundary point -- create ghost part
2163@@ -403,4 +427,824 @@ void distribute_matrix(const MPI_Comm* comm,
2164 gmat->SetParallelManager(*pm);
2165 gmat->SetLocalDataPtrCSR(&row_offset, &col, &val, "mat", interior_nnz);
2166 gmat->SetGhostDataPtrCOO(&ghost_row, &ghost_col, &ghost_val, "ghost", ghost_nnz);
2167+ gmat->Sort();
2168+}
2169+
2170+// This function computes all prime factors of a given number n
2171+static void compute_prime_factors(int n, std::vector<int>& p)
2172+{
2173+ int factor = 2;
2174+
2175+ // Factorize
2176+ while(n > 1)
2177+ {
2178+ while(n % factor == 0)
2179+ {
2180+ p.push_back(factor);
2181+ n /= factor;
2182+ }
2183+
2184+ ++factor;
2185+ }
2186+}
2187+
2188+// This function computes the process distribution for each dimension
2189+static void compute_2d_process_distribution(int nprocs, int& nprocx, int& nprocy)
2190+{
2191+ // Compute prime factors
2192+ std::vector<int> p;
2193+ compute_prime_factors(nprocs, p);
2194+
2195+ // Compute number of processes in each dimension
2196+ nprocx = 1;
2197+ nprocy = 1;
2198+
2199+ if(p.size() == 0)
2200+ {
2201+ // No entry, this means we have exactly one process
2202+ }
2203+ else if(p.size() == 1)
2204+ {
2205+ // If we have a single prime number, this is going to be our x dimension
2206+ nprocx = p[0];
2207+ }
2208+ else if(p.size() == 2)
2209+ {
2210+ // For two prime numbers, setup x and y
2211+ nprocx = p[1];
2212+ nprocy = p[0];
2213+ }
2214+ else
2215+ {
2216+ // More than two prime numbers
2217+
2218+ // #prime numbers
2219+ int idx = 0;
2220+ size_t nprime = p.size();
2221+
2222+ // square root
2223+ double sqroot = std::sqrt(nprocs);
2224+
2225+ // Determine x dimension
2226+ nprocx = p[nprime-- - 1];
2227+
2228+ while(nprocx < sqroot && idx < nprime)
2229+ {
2230+ nprocx *= p[idx++];
2231+ }
2232+
2233+ // Determine y dimension
2234+ while(idx < nprime)
2235+ {
2236+ nprocy *= p[idx++];
2237+ }
2238+ }
2239+
2240+ // Number of processes must match
2241+ assert(nprocx * nprocy == nprocs);
2242+}
2243+
2244+// This function computes the process distribution for each dimension
2245+static void compute_3d_process_distribution(int nprocs, int& nprocx, int& nprocy, int& nprocz)
2246+{
2247+ // Compute prime factors
2248+ std::vector<int> p;
2249+ compute_prime_factors(nprocs, p);
2250+
2251+ // Compute number of processes in each dimension
2252+ nprocx = 1;
2253+ nprocy = 1;
2254+ nprocz = 1;
2255+
2256+ if(p.size() == 0)
2257+ {
2258+ // No entry, this means we have exactly one process
2259+ }
2260+ else if(p.size() == 1)
2261+ {
2262+ // If we have a single prime number, this is going to be our x dimension
2263+ nprocx = p[0];
2264+ }
2265+ else if(p.size() == 2)
2266+ {
2267+ // For two prime numbers, setup x and y
2268+ nprocx = p[1];
2269+ nprocy = p[0];
2270+ }
2271+ else if(p.size() == 3)
2272+ {
2273+ // Three prime numbers
2274+ nprocx = p[2];
2275+ nprocy = p[1];
2276+ nprocz = p[0];
2277+ }
2278+ else
2279+ {
2280+ // More than three prime numbers
2281+
2282+ // #prime numbers
2283+ int idx = 0;
2284+ size_t nprime = p.size();
2285+
2286+ // cubic root
2287+ double qroot = std::cbrt(nprocs);
2288+
2289+ // Determine x dimension
2290+ nprocx = p[nprime-- - 1];
2291+
2292+ while(nprocx < qroot && idx < nprime)
2293+ {
2294+ nprocx *= p[idx++];
2295+ }
2296+
2297+ // Determine y dimension
2298+ double sqroot = std::sqrt(nprocs / nprocx);
2299+
2300+ nprocy = p[nprime-- - 1];
2301+
2302+ while(nprocy < sqroot && idx < nprime)
2303+ {
2304+ nprocy *= p[idx++];
2305+ }
2306+
2307+ // Determine z dimension
2308+ while(idx < nprime)
2309+ {
2310+ nprocz *= p[idx++];
2311+ }
2312+ }
2313+
2314+ // Number of processes must match
2315+ assert(nprocx * nprocy * nprocz == nprocs);
2316+}
2317+
2318+template <typename ValueType>
2319+void generate_2d_laplacian(int local_dimx,
2320+ int local_dimy,
2321+ const MPI_Comm* comm,
2322+ GlobalMatrix<ValueType>* mat,
2323+ ParallelManager* pm,
2324+ int rank,
2325+ int nprocs,
2326+ int nsten = 9)
2327+{
2328+ assert(nsten == 5 || nsten == 9);
2329+
2330+ // First, we need to determine process pattern for the unit square
2331+ int nproc_x;
2332+ int nproc_y;
2333+
2334+ compute_2d_process_distribution(nprocs, nproc_x, nproc_y);
2335+
2336+ // Next, determine process index into the unit square
2337+ int iproc_y = rank / nproc_x;
2338+ int iproc_x = rank % nproc_x;
2339+
2340+ // Global sizes
2341+ int64_t global_dimx = static_cast<int64_t>(nproc_x) * local_dimx;
2342+ int64_t global_dimy = static_cast<int64_t>(nproc_y) * local_dimy;
2343+
2344+ // Global process entry points
2345+ int64_t global_iproc_x = iproc_x * local_dimx;
2346+ int64_t global_iproc_y = iproc_y * local_dimy;
2347+
2348+ // Number of rows (global and local)
2349+ int64_t local_nrow = local_dimx * local_dimy;
2350+ int64_t global_nrow = global_dimx * global_dimy;
2351+
2352+ // Assemble local CSR matrix row offset pointers
2353+ PtrType* global_csr_row_ptr = NULL;
2354+ int64_t* global_csr_col_ind = NULL;
2355+ int64_t* local2global = NULL;
2356+
2357+ allocate_host(local_nrow + 1, &global_csr_row_ptr);
2358+ allocate_host(local_nrow * nsten, &global_csr_col_ind);
2359+ allocate_host(local_nrow, &local2global);
2360+
2361+ std::map<int64_t, int> global2local;
2362+
2363+ PtrType nnz = 0;
2364+ global_csr_row_ptr[0] = 0;
2365+
2366+ // Loop over y dimension
2367+ for(int local_y = 0; local_y < local_dimy; ++local_y)
2368+ {
2369+ // Global index into y
2370+ int64_t global_y = global_iproc_y + local_y;
2371+
2372+ // Loop over x dimension
2373+ for(int local_x = 0; local_x < local_dimx; ++local_x)
2374+ {
2375+ // Global index into x
2376+ int64_t global_x = global_iproc_x + local_x;
2377+
2378+ // Local row
2379+ int local_row = local_y * local_dimx + local_x;
2380+
2381+ // Global row
2382+ int64_t global_row = global_y * global_dimx + global_x;
2383+
2384+ // Fill l2g and g2l map
2385+ local2global[local_row] = global_row;
2386+ global2local[global_row] = local_row;
2387+
2388+ // 5pt stencil
2389+ if(nsten == 5)
2390+ {
2391+ // Fixed x (leaving out i == j)
2392+ for(int by = -1; by <= 1; ++by)
2393+ {
2394+ if(global_y + by > -1 && global_y + by < global_dimy && by != 0)
2395+ {
2396+ // Global column
2397+ int64_t global_col = global_row + by * global_dimx;
2398+
2399+ // Fill global CSR column indices
2400+ global_csr_col_ind[nnz++] = global_col;
2401+ }
2402+ }
2403+
2404+ // Fixed y
2405+ for(int bx = -1; bx <= 1; ++bx)
2406+ {
2407+ if(global_x + bx > -1 && global_x + bx < global_dimx)
2408+ {
2409+ // Global column
2410+ int64_t global_col = global_row + bx;
2411+
2412+ // Fill global CSR column indices
2413+ global_csr_col_ind[nnz++] = global_col;
2414+ }
2415+ }
2416+ }
2417+
2418+ // 9 pt stencil
2419+ if(nsten == 9)
2420+ {
2421+ // Check if current y vertex is on the boundary
2422+ for(int by = -1; by <= 1; ++by)
2423+ {
2424+ if(global_y + by > -1 && global_y + by < global_dimy)
2425+ {
2426+ // Check if current x vertex is on the boundary
2427+ for(int bx = -1; bx <= 1; ++bx)
2428+ {
2429+ if(global_x + bx > -1 && global_x + bx < global_dimx)
2430+ {
2431+ // Global column
2432+ int64_t global_col = global_row + by * global_dimx + bx;
2433+
2434+ // Fill global CSR column indices
2435+ global_csr_col_ind[nnz++] = global_col;
2436+ }
2437+ }
2438+ }
2439+ }
2440+ }
2441+
2442+ global_csr_row_ptr[local_row + 1] = nnz;
2443+ }
2444+ }
2445+
2446+ // Local number of non-zero entries - need to use long long int to make the communication work
2447+ int64_t local_nnz = global_csr_row_ptr[local_nrow];
2448+
2449+ // Total number of non-zeros
2450+ int64_t global_nnz;
2451+ MPI_Allreduce(&local_nnz, &global_nnz, 1, MPI_INT64_T, MPI_SUM, *comm);
2452+
2453+ // Now, we need to setup the communication pattern
2454+ std::map<int, std::set<int64_t>> recv_indices;
2455+ std::map<int, std::set<int64_t>> send_indices;
2456+
2457+ // CSR matrix row pointers
2458+ PtrType* int_csr_row_ptr = NULL;
2459+ PtrType* gst_csr_row_ptr = NULL;
2460+
2461+ allocate_host(local_nrow + 1, &int_csr_row_ptr);
2462+ allocate_host(local_nrow + 1, &gst_csr_row_ptr);
2463+
2464+ int_csr_row_ptr[0] = 0;
2465+ gst_csr_row_ptr[0] = 0;
2466+
2467+ // Determine, which vertices need to be sent / received
2468+ for(int i = 0; i < local_nrow; ++i)
2469+ {
2470+ int_csr_row_ptr[i + 1] = int_csr_row_ptr[i];
2471+ gst_csr_row_ptr[i + 1] = gst_csr_row_ptr[i];
2472+
2473+ int64_t global_row = local2global[i];
2474+
2475+ for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j)
2476+ {
2477+ int64_t global_col = global_csr_col_ind[j];
2478+
2479+ // Determine which process owns the vertex
2480+ int64_t idx_y = global_col / global_dimx;
2481+ int64_t idx_x = global_col % global_dimx;
2482+
2483+ int idx_proc_y = idx_y / local_dimy;
2484+ int idx_proc_x = idx_x / local_dimx;
2485+
2486+ int owner = idx_proc_x + idx_proc_y * nproc_x;
2487+
2488+ // If we do not own it, we need to receive it from our neighbor
2489+ // and also send the current vertex to this neighbor
2490+ if(owner != rank)
2491+ {
2492+ // Store the global column and row id that we have to receive / send from / to a neighbor
2493+ // We need a set here to eliminate duplicates
2494+ recv_indices[owner].insert(global_col);
2495+ send_indices[owner].insert(global_row);
2496+
2497+ ++gst_csr_row_ptr[i + 1];
2498+ }
2499+ else
2500+ {
2501+ ++int_csr_row_ptr[i + 1];
2502+ }
2503+ }
2504+ }
2505+
2506+ // Number of processes we communicate with
2507+ int nrecv = recv_indices.size();
2508+ int nsend = send_indices.size();
2509+
2510+ // Process ids we communicate with
2511+ std::vector<int> recvs;
2512+ std::vector<int> sends;
2513+
2514+ recvs.reserve(nrecv);
2515+ sends.reserve(nsend);
2516+
2517+ // Index offsets for each neighbor
2518+ std::vector<int> recv_index_offset;
2519+ std::vector<int> send_index_offset;
2520+
2521+ recv_index_offset.reserve(nrecv + 1);
2522+ send_index_offset.reserve(nsend + 1);
2523+
2524+ recv_index_offset.push_back(0);
2525+ send_index_offset.push_back(0);
2526+
2527+ int cnt = 0;
2528+ std::map<int64_t, int> global2ghost;
2529+
2530+ // Go through the recv data
2531+ for(std::map<int, std::set<int64_t>>::iterator it = recv_indices.begin();
2532+ it != recv_indices.end();
2533+ ++it)
2534+ {
2535+ recvs.push_back(it->first);
2536+ recv_index_offset.push_back(it->second.size());
2537+
2538+ for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit)
2539+ {
2540+ global2ghost[*iit] = cnt++;
2541+ }
2542+ }
2543+
2544+ // Go through the send data
2545+ int boundary_nnz = 0;
2546+ for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin();
2547+ it != send_indices.end();
2548+ ++it)
2549+ {
2550+ sends.push_back(it->first);
2551+ send_index_offset.push_back(it->second.size());
2552+ boundary_nnz += it->second.size();
2553+ }
2554+
2555+ // Exclusive sum
2556+ for(int i = 0; i < nrecv; ++i)
2557+ {
2558+ recv_index_offset[i + 1] += recv_index_offset[i];
2559+ }
2560+
2561+ for(int i = 0; i < nsend; ++i)
2562+ {
2563+ send_index_offset[i + 1] += send_index_offset[i];
2564+ }
2565+
2566+ // Boundary indices
2567+ std::vector<int> boundary;
2568+ boundary.reserve(boundary_nnz);
2569+
2570+ for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin();
2571+ it != send_indices.end();
2572+ ++it)
2573+ {
2574+ for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit)
2575+ {
2576+ boundary.push_back(global2local[*iit]);
2577+ }
2578+ }
2579+
2580+ // Initialize manager
2581+ pm->SetMPICommunicator(comm);
2582+ pm->SetGlobalNrow(global_nrow);
2583+ pm->SetGlobalNcol(global_nrow);
2584+ pm->SetLocalNrow(local_nrow);
2585+ pm->SetLocalNcol(local_nrow);
2586+
2587+ if(nprocs > 1)
2588+ {
2589+ pm->SetBoundaryIndex(boundary_nnz, boundary.data());
2590+ pm->SetReceivers(nrecv, recvs.data(), recv_index_offset.data());
2591+ pm->SetSenders(nsend, sends.data(), send_index_offset.data());
2592+ }
2593+
2594+ mat->SetParallelManager(*pm);
2595+
2596+ // Generate local and ghost matrices
2597+ local_nnz = int_csr_row_ptr[local_nrow];
2598+ int64_t ghost_nnz = gst_csr_row_ptr[local_nrow];
2599+
2600+ int* int_csr_col_ind = NULL;
2601+ int* gst_csr_col_ind = NULL;
2602+ ValueType* int_csr_val = NULL;
2603+ ValueType* gst_csr_val = NULL;
2604+
2605+ allocate_host(local_nnz, &int_csr_col_ind);
2606+ allocate_host(local_nnz, &int_csr_val);
2607+ allocate_host(ghost_nnz, &gst_csr_col_ind);
2608+ allocate_host(ghost_nnz, &gst_csr_val);
2609+
2610+ // Convert global matrix columns to local columns
2611+ for(int i = 0; i < local_nrow; ++i)
2612+ {
2613+ PtrType local_idx = int_csr_row_ptr[i];
2614+ PtrType ghost_idx = gst_csr_row_ptr[i];
2615+
2616+ int64_t global_row = local2global[i];
2617+
2618+ for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j)
2619+ {
2620+ int64_t global_col = global_csr_col_ind[j];
2621+
2622+ // Determine which process owns the vertex
2623+ int64_t idx_y = global_col / global_dimx;
2624+ int64_t idx_x = global_col % global_dimx;
2625+
2626+ int idx_proc_y = idx_y / local_dimy;
2627+ int idx_proc_x = idx_x / local_dimx;
2628+
2629+ int owner = idx_proc_x + idx_proc_y * nproc_x;
2630+
2631+ // If we do not own it, we need to receive it from our neighbor
2632+ // and also send the current vertex to this neighbor
2633+ if(owner != rank)
2634+ {
2635+ // Store the global column and row id that we have to receive / send from / to a neighbor
2636+ // We need a set here to eliminate duplicates
2637+ recv_indices[owner].insert(global_col);
2638+ send_indices[owner].insert(global_row);
2639+
2640+ gst_csr_col_ind[ghost_idx] = global2ghost[global_col];
2641+ gst_csr_val[ghost_idx] = -1.0;
2642+ ++ghost_idx;
2643+ }
2644+ else
2645+ {
2646+ // This is our part
2647+ int_csr_col_ind[local_idx] = global2local[global_col];
2648+ int_csr_val[local_idx] = (global_col == global_row) ? (nsten - 1.0) : -1.0;
2649+ ++local_idx;
2650+ }
2651+ }
2652+ }
2653+
2654+ free_host(&global_csr_row_ptr);
2655+ free_host(&global_csr_col_ind);
2656+ free_host(&local2global);
2657+
2658+ mat->SetLocalDataPtrCSR(&int_csr_row_ptr, &int_csr_col_ind, &int_csr_val, "mat", local_nnz);
2659+ mat->SetGhostDataPtrCSR(&gst_csr_row_ptr, &gst_csr_col_ind, &gst_csr_val, "gst", ghost_nnz);
2660+ mat->Sort();
2661+}
2662+
2663+template <typename ValueType>
2664+void generate_3d_laplacian(int local_dimx,
2665+ int local_dimy,
2666+ int local_dimz,
2667+ const MPI_Comm* comm,
2668+ GlobalMatrix<ValueType>* mat,
2669+ ParallelManager* pm,
2670+ int rank,
2671+ int nprocs)
2672+{
2673+ // First, we need to determine process pattern for the unit cube
2674+ int nproc_x;
2675+ int nproc_y;
2676+ int nproc_z;
2677+
2678+ compute_3d_process_distribution(nprocs, nproc_x, nproc_y, nproc_z);
2679+
2680+ // Next, determine process index into the unit cube
2681+ int iproc_z = rank / (nproc_x * nproc_y);
2682+ int iproc_y = (rank - iproc_z * nproc_x * nproc_y) / nproc_x;
2683+ int iproc_x = rank % nproc_x;
2684+
2685+ // Global sizes
2686+ int64_t global_dimx = static_cast<int64_t>(nproc_x) * local_dimx;
2687+ int64_t global_dimy = static_cast<int64_t>(nproc_y) * local_dimy;
2688+ int64_t global_dimz = static_cast<int64_t>(nproc_z) * local_dimz;
2689+
2690+ // Global process entry points
2691+ int64_t global_iproc_x = iproc_x * local_dimx;
2692+ int64_t global_iproc_y = iproc_y * local_dimy;
2693+ int64_t global_iproc_z = iproc_z * local_dimz;
2694+
2695+ // Number of rows (global and local)
2696+ int64_t local_nrow = local_dimx * local_dimy * local_dimz;
2697+ int64_t global_nrow = global_dimx * global_dimy * global_dimz;
2698+
2699+ // Assemble local CSR matrix row offset pointers
2700+ std::vector<PtrType> global_csr_row_ptr(local_nrow + 1);
2701+ std::vector<int64_t> global_csr_col_ind(local_nrow * 27);
2702+
2703+ std::vector<int64_t> local2global(local_nrow);
2704+ std::map<int64_t, int> global2local;
2705+
2706+ PtrType nnz = 0;
2707+ global_csr_row_ptr[0] = 0;
2708+
2709+ // Loop over z dimension
2710+ for(int local_z = 0; local_z < local_dimz; ++local_z)
2711+ {
2712+ // Global index into z
2713+ int64_t global_z = global_iproc_z + local_z;
2714+
2715+ // Loop over y dimension
2716+ for(int local_y = 0; local_y < local_dimy; ++local_y)
2717+ {
2718+ // Global index into y
2719+ int64_t global_y = global_iproc_y + local_y;
2720+
2721+ // Loop over x dimension
2722+ for(int local_x = 0; local_x < local_dimx; ++local_x)
2723+ {
2724+ // Global index into x
2725+ int64_t global_x = global_iproc_x + local_x;
2726+
2727+ // Local row
2728+ int local_row = local_z * local_dimx * local_dimy + local_y * local_dimx + local_x;
2729+
2730+ // Global row
2731+ int64_t global_row
2732+ = global_z * global_dimx * global_dimy + global_y * global_dimx + global_x;
2733+
2734+ // Fill l2g and g2l map
2735+ local2global[local_row] = global_row;
2736+ global2local[global_row] = local_row;
2737+
2738+ // Check if current z vertex is on the boundary
2739+ for(int bz = -1; bz <= 1; ++bz)
2740+ {
2741+ if(global_z + bz > -1 && global_z + bz < global_dimz)
2742+ {
2743+ // Check if current y vertex is on the boundary
2744+ for(int by = -1; by <= 1; ++by)
2745+ {
2746+ if(global_y + by > -1 && global_y + by < global_dimy)
2747+ {
2748+ // Check if current x vertex is on the boundary
2749+ for(int bx = -1; bx <= 1; ++bx)
2750+ {
2751+ if(global_x + bx > -1 && global_x + bx < global_dimx)
2752+ {
2753+ // Global column
2754+ int64_t global_col = global_row
2755+ + bz * global_dimx * global_dimy
2756+ + by * global_dimx + bx;
2757+
2758+ // Fill global CSR column indices
2759+ global_csr_col_ind[nnz++] = global_col;
2760+ }
2761+ }
2762+ }
2763+ }
2764+ }
2765+ }
2766+
2767+ global_csr_row_ptr[local_row + 1] = nnz;
2768+ }
2769+ }
2770+ }
2771+
2772+ // Local number of non-zero entries - need to use long long int to make the communication work
2773+ int64_t local_nnz = global_csr_row_ptr[local_nrow];
2774+
2775+ // Total number of non-zeros
2776+ int64_t global_nnz;
2777+ MPI_Allreduce(&local_nnz, &global_nnz, 1, MPI_INT64_T, MPI_SUM, *comm);
2778+
2779+ // Now, we need to setup the communication pattern
2780+ std::map<int, std::set<int64_t>> recv_indices;
2781+ std::map<int, std::set<int64_t>> send_indices;
2782+
2783+ // CSR matrix row pointers
2784+ PtrType* int_csr_row_ptr = NULL;
2785+ PtrType* gst_csr_row_ptr = NULL;
2786+
2787+ allocate_host(local_nrow + 1, &int_csr_row_ptr);
2788+ allocate_host(local_nrow + 1, &gst_csr_row_ptr);
2789+
2790+ int_csr_row_ptr[0] = 0;
2791+ gst_csr_row_ptr[0] = 0;
2792+
2793+ // Determine, which vertices need to be sent / received
2794+ for(int i = 0; i < local_nrow; ++i)
2795+ {
2796+ int_csr_row_ptr[i + 1] = int_csr_row_ptr[i];
2797+ gst_csr_row_ptr[i + 1] = gst_csr_row_ptr[i];
2798+
2799+ int64_t global_row = local2global[i];
2800+
2801+ for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j)
2802+ {
2803+ int64_t global_col = global_csr_col_ind[j];
2804+
2805+ // Determine which process owns the vertex
2806+ int64_t idx_z = global_col / (global_dimx * global_dimy);
2807+ int64_t idx_y = (global_col - idx_z * global_dimy * global_dimx) / global_dimx;
2808+ int64_t idx_x = global_col % global_dimx;
2809+
2810+ int idx_proc_z = idx_z / local_dimz;
2811+ int idx_proc_y = idx_y / local_dimy;
2812+ int idx_proc_x = idx_x / local_dimx;
2813+
2814+ int owner = idx_proc_x + idx_proc_y * nproc_x + idx_proc_z * nproc_y * nproc_x;
2815+
2816+ // If we do not own it, we need to receive it from our neighbor
2817+ // and also send the current vertex to this neighbor
2818+ if(owner != rank)
2819+ {
2820+ // Store the global column and row id that we have to receive / send from / to a neighbor
2821+ // We need a set here to eliminate duplicates
2822+ recv_indices[owner].insert(global_col);
2823+ send_indices[owner].insert(global_row);
2824+
2825+ ++gst_csr_row_ptr[i + 1];
2826+ }
2827+ else
2828+ {
2829+ ++int_csr_row_ptr[i + 1];
2830+ }
2831+ }
2832+ }
2833+
2834+ // Number of processes we communicate with
2835+ int nrecv = recv_indices.size();
2836+ int nsend = send_indices.size();
2837+
2838+ // Process ids we communicate with
2839+ std::vector<int> recvs;
2840+ std::vector<int> sends;
2841+
2842+ recvs.reserve(nrecv);
2843+ sends.reserve(nsend);
2844+
2845+ // Index offsets for each neighbor
2846+ std::vector<int> recv_index_offset;
2847+ std::vector<int> send_index_offset;
2848+
2849+ recv_index_offset.reserve(nrecv + 1);
2850+ send_index_offset.reserve(nsend + 1);
2851+
2852+ recv_index_offset.push_back(0);
2853+ send_index_offset.push_back(0);
2854+
2855+ int cnt = 0;
2856+ std::map<int64_t, int> global2ghost;
2857+
2858+ // Go through the recv data
2859+ for(std::map<int, std::set<int64_t>>::iterator it = recv_indices.begin();
2860+ it != recv_indices.end();
2861+ ++it)
2862+ {
2863+ recvs.push_back(it->first);
2864+ recv_index_offset.push_back(it->second.size());
2865+
2866+ for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit)
2867+ {
2868+ global2ghost[*iit] = cnt++;
2869+ }
2870+ }
2871+
2872+ // Go through the send data
2873+ int boundary_nnz = 0;
2874+ for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin();
2875+ it != send_indices.end();
2876+ ++it)
2877+ {
2878+ sends.push_back(it->first);
2879+ send_index_offset.push_back(it->second.size());
2880+ boundary_nnz += it->second.size();
2881+ }
2882+
2883+ // Exclusive sum
2884+ for(int i = 0; i < nrecv; ++i)
2885+ {
2886+ recv_index_offset[i + 1] += recv_index_offset[i];
2887+ }
2888+
2889+ for(int i = 0; i < nsend; ++i)
2890+ {
2891+ send_index_offset[i + 1] += send_index_offset[i];
2892+ }
2893+
2894+ // Boundary indices
2895+ std::vector<int> boundary;
2896+ boundary.reserve(boundary_nnz);
2897+
2898+ for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin();
2899+ it != send_indices.end();
2900+ ++it)
2901+ {
2902+ for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit)
2903+ {
2904+ boundary.push_back(global2local[*iit]);
2905+ }
2906+ }
2907+
2908+ // Initialize manager
2909+ pm->SetMPICommunicator(comm);
2910+ pm->SetGlobalNrow(global_nrow);
2911+ pm->SetGlobalNcol(global_nrow);
2912+ pm->SetLocalNrow(local_nrow);
2913+ pm->SetLocalNcol(local_nrow);
2914+
2915+ if(nprocs > 1)
2916+ {
2917+ pm->SetBoundaryIndex(boundary_nnz, boundary.data());
2918+ pm->SetReceivers(nrecv, recvs.data(), recv_index_offset.data());
2919+ pm->SetSenders(nsend, sends.data(), send_index_offset.data());
2920+ }
2921+
2922+ mat->SetParallelManager(*pm);
2923+
2924+ // Generate local and ghost matrices
2925+ local_nnz = int_csr_row_ptr[local_nrow];
2926+ int64_t ghost_nnz = gst_csr_row_ptr[local_nrow];
2927+
2928+ int* int_csr_col_ind = NULL;
2929+ int* gst_csr_col_ind = NULL;
2930+ ValueType* int_csr_val = NULL;
2931+ ValueType* gst_csr_val = NULL;
2932+
2933+ allocate_host(local_nnz, &int_csr_col_ind);
2934+ allocate_host(local_nnz, &int_csr_val);
2935+ allocate_host(ghost_nnz, &gst_csr_col_ind);
2936+ allocate_host(ghost_nnz, &gst_csr_val);
2937+
2938+ // Convert global matrix columns to local columns
2939+ for(int i = 0; i < local_nrow; ++i)
2940+ {
2941+ PtrType local_idx = int_csr_row_ptr[i];
2942+ PtrType ghost_idx = gst_csr_row_ptr[i];
2943+
2944+ int64_t global_row = local2global[i];
2945+
2946+ for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j)
2947+ {
2948+ int64_t global_col = global_csr_col_ind[j];
2949+
2950+ // Determine which process owns the vertex
2951+ int64_t idx_z = global_col / (global_dimx * global_dimy);
2952+ int64_t idx_y = (global_col - idx_z * global_dimy * global_dimx) / global_dimx;
2953+ int64_t idx_x = global_col % global_dimx;
2954+
2955+ int idx_proc_z = idx_z / local_dimz;
2956+ int idx_proc_y = idx_y / local_dimy;
2957+ int idx_proc_x = idx_x / local_dimx;
2958+
2959+ int owner = idx_proc_x + idx_proc_y * nproc_x + idx_proc_z * nproc_y * nproc_x;
2960+
2961+ // If we do not own it, we need to receive it from our neighbor
2962+ // and also send the current vertex to this neighbor
2963+ if(owner != rank)
2964+ {
2965+ // Store the global column and row id that we have to receive / send from / to a neighbor
2966+ // We need a set here to eliminate duplicates
2967+ recv_indices[owner].insert(global_col);
2968+ send_indices[owner].insert(global_row);
2969+
2970+ gst_csr_col_ind[ghost_idx] = global2ghost[global_col];
2971+ gst_csr_val[ghost_idx] = -1.0;
2972+ ++ghost_idx;
2973+ }
2974+ else
2975+ {
2976+ // This is our part
2977+ int_csr_col_ind[local_idx] = global2local[global_col];
2978+ int_csr_val[local_idx] = (global_col == global_row) ? 26.0 : -1.0;
2979+ ++local_idx;
2980+ }
2981+ }
2982+ }
2983+
2984+ mat->SetLocalDataPtrCSR(&int_csr_row_ptr, &int_csr_col_ind, &int_csr_val, "mat", local_nnz);
2985+ mat->SetGhostDataPtrCSR(&gst_csr_row_ptr, &gst_csr_col_ind, &gst_csr_val, "gst", ghost_nnz);
2986+ mat->Sort();
2987 }
2988diff --git a/clients/include/testing_bicgstab.hpp b/clients/include/testing_bicgstab.hpp
2989index 924295c..dbee12e 100644
2990--- a/clients/include/testing_bicgstab.hpp
2991+++ b/clients/include/testing_bicgstab.hpp
2992@@ -1,5 +1,5 @@
2993 /* ************************************************************************
2994- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
2995+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
2996 *
2997 * Permission is hereby granted, free of charge, to any person obtaining a copy
2998 * of this software and associated documentation files (the "Software"), to deal
2999@@ -44,11 +44,13 @@ static bool check_residual(double res)
3000 template <typename T>
3001 bool testing_bicgstab(Arguments argus)
3002 {
3003- int ndim = argus.size;
3004- std::string precond = argus.precond;
3005- unsigned int format = argus.format;
3006+ int ndim = argus.size;
3007+ std::string precond = argus.precond;
3008+ unsigned int format = argus.format;
3009+ bool disable_accelerator = !argus.use_acc;
3010
3011 // Initialize rocALUTION platform
3012+ disable_accelerator_rocalution(disable_accelerator);
3013 set_device_rocalution(device);
3014 init_rocalution();
3015
3016@@ -69,10 +71,13 @@ bool testing_bicgstab(Arguments argus)
3017 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3018
3019 // Move data to accelerator
3020- A.MoveToAccelerator();
3021- x.MoveToAccelerator();
3022- b.MoveToAccelerator();
3023- e.MoveToAccelerator();
3024+ if(!disable_accelerator)
3025+ {
3026+ A.MoveToAccelerator();
3027+ x.MoveToAccelerator();
3028+ b.MoveToAccelerator();
3029+ e.MoveToAccelerator();
3030+ }
3031
3032 // Allocate x, b and e
3033 x.Allocate("x", A.GetN());
3034@@ -124,6 +129,8 @@ bool testing_bicgstab(Arguments argus)
3035 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3036 else if(precond == "ILU")
3037 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3038+ else if(precond == "ItILU0")
3039+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3040 else if(precond == "ILUT")
3041 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3042 else if(precond == "IC")
3043@@ -146,14 +153,20 @@ bool testing_bicgstab(Arguments argus)
3044 ls.SetPreconditioner(*p);
3045 }
3046
3047- ls.Init(1e-8, 0.0, 1e+8, 10000);
3048+ ls.Init(1e-8, 0.0, 1e+8, 0, 10000);
3049+ ls.RecordResidualHistory();
3050+ auto n_iter = ls.GetIterationCount();
3051 ls.Build();
3052
3053 // Matrix format
3054- A.ConvertTo(format, format == BCSR ? 3 : 1);
3055+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3056
3057 ls.Solve(b, &x);
3058
3059+ const std::string filename = get_temp_dir() + "test_recorded_history.txt";
3060+ ls.RecordHistory(filename);
3061+ std::remove(filename.c_str());
3062+
3063 // Verify solution
3064 x.ScaleAdd(-1.0, e);
3065 T nrm2 = x.Norm();
3066@@ -169,6 +182,7 @@ bool testing_bicgstab(Arguments argus)
3067
3068 // Stop rocALUTION platform
3069 stop_rocalution();
3070+ disable_accelerator_rocalution(false);
3071
3072 return success;
3073 }
3074diff --git a/clients/include/testing_bicgstabl.hpp b/clients/include/testing_bicgstabl.hpp
3075index 20f7c5d..dc4dfb3 100644
3076--- a/clients/include/testing_bicgstabl.hpp
3077+++ b/clients/include/testing_bicgstabl.hpp
3078@@ -1,5 +1,5 @@
3079 /* ************************************************************************
3080- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3081+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3082 *
3083 * Permission is hereby granted, free of charge, to any person obtaining a copy
3084 * of this software and associated documentation files (the "Software"), to deal
3085@@ -44,12 +44,14 @@ static bool check_residual(double res)
3086 template <typename T>
3087 bool testing_bicgstabl(Arguments argus)
3088 {
3089- int ndim = argus.size;
3090- std::string precond = argus.precond;
3091- unsigned int format = argus.format;
3092- int l = argus.index;
3093+ int ndim = argus.size;
3094+ std::string precond = argus.precond;
3095+ unsigned int format = argus.format;
3096+ int l = argus.index;
3097+ bool disable_accelerator = !argus.use_acc;
3098
3099 // Initialize rocALUTION platform
3100+ disable_accelerator_rocalution(disable_accelerator);
3101 set_device_rocalution(device);
3102 init_rocalution();
3103
3104@@ -70,10 +72,13 @@ bool testing_bicgstabl(Arguments argus)
3105 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3106
3107 // Move data to accelerator
3108- A.MoveToAccelerator();
3109- x.MoveToAccelerator();
3110- b.MoveToAccelerator();
3111- e.MoveToAccelerator();
3112+ if(!disable_accelerator)
3113+ {
3114+ A.MoveToAccelerator();
3115+ x.MoveToAccelerator();
3116+ b.MoveToAccelerator();
3117+ e.MoveToAccelerator();
3118+ }
3119
3120 // Allocate x, b and e
3121 x.Allocate("x", A.GetN());
3122@@ -125,6 +130,8 @@ bool testing_bicgstabl(Arguments argus)
3123 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3124 else if(precond == "ILU")
3125 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3126+ else if(precond == "ItILU0")
3127+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3128 else if(precond == "ILUT")
3129 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3130 else if(precond == "IC")
3131@@ -152,7 +159,7 @@ bool testing_bicgstabl(Arguments argus)
3132 ls.Build();
3133
3134 // Matrix format
3135- A.ConvertTo(format, format == BCSR ? 3 : 1);
3136+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3137
3138 ls.Solve(b, &x);
3139
3140@@ -171,6 +178,7 @@ bool testing_bicgstabl(Arguments argus)
3141
3142 // Stop rocALUTION platform
3143 stop_rocalution();
3144+ disable_accelerator_rocalution(false);
3145
3146 return success;
3147 }
3148diff --git a/clients/include/testing_cg.hpp b/clients/include/testing_cg.hpp
3149index 7c43512..b7404fd 100644
3150--- a/clients/include/testing_cg.hpp
3151+++ b/clients/include/testing_cg.hpp
3152@@ -1,5 +1,5 @@
3153 /* ************************************************************************
3154- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3155+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3156 *
3157 * Permission is hereby granted, free of charge, to any person obtaining a copy
3158 * of this software and associated documentation files (the "Software"), to deal
3159@@ -44,11 +44,13 @@ static bool check_residual(double res)
3160 template <typename T>
3161 bool testing_cg(Arguments argus)
3162 {
3163- int ndim = argus.size;
3164- std::string precond = argus.precond;
3165- unsigned int format = argus.format;
3166+ int ndim = argus.size;
3167+ std::string precond = argus.precond;
3168+ unsigned int format = argus.format;
3169+ bool disable_accelerator = !argus.use_acc;
3170
3171 // Initialize rocALUTION platform
3172+ disable_accelerator_rocalution(disable_accelerator);
3173 set_device_rocalution(device);
3174 init_rocalution();
3175
3176@@ -69,10 +71,13 @@ bool testing_cg(Arguments argus)
3177 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3178
3179 // Move data to accelerator
3180- A.MoveToAccelerator();
3181- x.MoveToAccelerator();
3182- b.MoveToAccelerator();
3183- e.MoveToAccelerator();
3184+ if(!disable_accelerator)
3185+ {
3186+ A.MoveToAccelerator();
3187+ x.MoveToAccelerator();
3188+ b.MoveToAccelerator();
3189+ e.MoveToAccelerator();
3190+ }
3191
3192 // Allocate x, b and e
3193 x.Allocate("x", A.GetN());
3194@@ -124,6 +129,8 @@ bool testing_cg(Arguments argus)
3195 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3196 else if(precond == "ILU")
3197 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3198+ else if(precond == "ItILU0")
3199+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3200 else if(precond == "ILUT")
3201 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3202 else if(precond == "IC")
3203@@ -150,7 +157,7 @@ bool testing_cg(Arguments argus)
3204 ls.Build();
3205
3206 // Matrix format
3207- A.ConvertTo(format, format == BCSR ? 3 : 1);
3208+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3209
3210 ls.Solve(b, &x);
3211
3212@@ -169,6 +176,7 @@ bool testing_cg(Arguments argus)
3213
3214 // Stop rocALUTION platform
3215 stop_rocalution();
3216+ disable_accelerator_rocalution(false);
3217
3218 return success;
3219 }
3220diff --git a/clients/include/testing_chebyshev.hpp b/clients/include/testing_chebyshev.hpp
3221new file mode 100644
3222index 0000000..a252f24
3223--- /dev/null
3224+++ b/clients/include/testing_chebyshev.hpp
3225@@ -0,0 +1,218 @@
3226+/* ************************************************************************
3227+ * Copyright (C) 2025 Advanced Micro Devices, Inc. All rights Reserved.
3228+ *
3229+ * Permission is hereby granted, free of charge, to any person obtaining a copy
3230+ * of this software and associated documentation files (the "Software"), to deal
3231+ * in the Software without restriction, including without limitation the rights
3232+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3233+ * copies of the Software, and to permit persons to whom the Software is
3234+ * furnished to do so, subject to the following conditions:
3235+ *
3236+ * The above copyright notice and this permission notice shall be included in
3237+ * all copies or substantial portions of the Software.
3238+ *
3239+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3240+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3241+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3242+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3243+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3244+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
3245+ * THE SOFTWARE.
3246+ *
3247+ * ************************************************************************ */
3248+
3249+#pragma once
3250+
3251+#include "utility.hpp"
3252+
3253+#include <rocalution/rocalution.hpp>
3254+
3255+template <typename T>
3256+bool testing_chebyshev(Arguments argus)
3257+{
3258+ using namespace rocalution;
3259+
3260+ int ndim = argus.size;
3261+ std::string precond = argus.precond;
3262+ unsigned int format = argus.format;
3263+ std::string matrix_type = argus.matrix_type;
3264+ bool rebuildnumeric = argus.rebuildnumeric;
3265+ bool disable_accelerator = !argus.use_acc;
3266+
3267+ // Initialize rocALUTION platform
3268+ disable_accelerator_rocalution(disable_accelerator);
3269+ set_device_rocalution(device);
3270+ init_rocalution();
3271+
3272+ // rocALUTION structures
3273+ LocalMatrix<T> A;
3274+ LocalVector<T> x;
3275+ LocalVector<T> b;
3276+ LocalVector<T> b_old;
3277+ LocalVector<T>* b_k;
3278+ LocalVector<T>* b_k1;
3279+ LocalVector<T>* b_tmp;
3280+ LocalVector<T> e;
3281+ LocalVector<T> rhs;
3282+
3283+ // Generate A
3284+ int* csr_ptr = NULL;
3285+ int* csr_col = NULL;
3286+ T* csr_val = NULL;
3287+
3288+ int nrow = 0;
3289+ int ncol = 0;
3290+ if(matrix_type == "Laplacian2D")
3291+ {
3292+ nrow = gen_2d_laplacian(ndim, &csr_ptr, &csr_col, &csr_val);
3293+ ncol = nrow;
3294+ }
3295+ else
3296+ {
3297+ stop_rocalution();
3298+ disable_accelerator_rocalution(false);
3299+ return true;
3300+ }
3301+ int nnz = csr_ptr[nrow];
3302+
3303+ T* csr_val2 = NULL;
3304+ if(rebuildnumeric)
3305+ {
3306+ csr_val2 = new T[nnz];
3307+ for(int i = 0; i < nnz; i++)
3308+ {
3309+ csr_val2[i] = csr_val[i];
3310+ }
3311+ }
3312+
3313+ A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3314+
3315+ // Move data to accelerator
3316+ if(!disable_accelerator)
3317+ {
3318+ A.MoveToAccelerator();
3319+ x.MoveToAccelerator();
3320+ rhs.MoveToAccelerator();
3321+ e.MoveToAccelerator();
3322+ }
3323+
3324+ // Allocate x, b and e
3325+ x.Allocate("x", A.GetN());
3326+ rhs.Allocate("b", A.GetM());
3327+ e.Allocate("e", A.GetN());
3328+
3329+ T lambda_min;
3330+ T lambda_max;
3331+
3332+ A.Gershgorin(lambda_min, lambda_max);
3333+
3334+ // Chebyshev iteration
3335+ Chebyshev<LocalMatrix<T>, LocalVector<T>, T> ls;
3336+
3337+ // Initialize rhs such that A 1 = rhs
3338+ e.Ones();
3339+ A.Apply(e, &rhs);
3340+
3341+ // Initial zero guess
3342+ x.Zeros();
3343+
3344+ // Preconditioner
3345+ Preconditioner<LocalMatrix<T>, LocalVector<T>, T>* p;
3346+
3347+ if(precond == "None")
3348+ p = NULL;
3349+ else if(precond == "Chebyshev")
3350+ {
3351+ // Chebyshev preconditioner
3352+
3353+ // Determine min and max eigenvalues
3354+ T lambda_min;
3355+ T lambda_max;
3356+
3357+ A.Gershgorin(lambda_min, lambda_max);
3358+
3359+ AIChebyshev<LocalMatrix<T>, LocalVector<T>, T>* cheb
3360+ = new AIChebyshev<LocalMatrix<T>, LocalVector<T>, T>;
3361+ cheb->Set(3, lambda_max / 7.0, lambda_max);
3362+
3363+ p = cheb;
3364+ }
3365+ else if(precond == "FSAI")
3366+ p = new FSAI<LocalMatrix<T>, LocalVector<T>, T>;
3367+ else if(precond == "SPAI")
3368+ p = new SPAI<LocalMatrix<T>, LocalVector<T>, T>;
3369+ else if(precond == "TNS")
3370+ p = new TNS<LocalMatrix<T>, LocalVector<T>, T>;
3371+ else if(precond == "Jacobi")
3372+ p = new Jacobi<LocalMatrix<T>, LocalVector<T>, T>;
3373+ else if(precond == "GS")
3374+ p = new GS<LocalMatrix<T>, LocalVector<T>, T>;
3375+ else if(precond == "SGS")
3376+ p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3377+ else if(precond == "ILU")
3378+ p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3379+ else if(precond == "ItILU0")
3380+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3381+ else if(precond == "ILUT")
3382+ p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3383+ else if(precond == "IC")
3384+ p = new IC<LocalMatrix<T>, LocalVector<T>, T>;
3385+ else if(precond == "MCGS")
3386+ p = new MultiColoredGS<LocalMatrix<T>, LocalVector<T>, T>;
3387+ else if(precond == "MCSGS")
3388+ p = new MultiColoredSGS<LocalMatrix<T>, LocalVector<T>, T>;
3389+ else if(precond == "MCILU")
3390+ p = new MultiColoredILU<LocalMatrix<T>, LocalVector<T>, T>;
3391+ else
3392+ return false;
3393+
3394+ // Set solver operator
3395+ ls.SetOperator(A);
3396+
3397+ ls.Verbose(0);
3398+ ls.SetOperator(A);
3399+
3400+ // Set preconditioner
3401+ if(p != NULL)
3402+ {
3403+ ls.SetPreconditioner(*p);
3404+ }
3405+
3406+ // Set eigenvalues
3407+ ls.Set(lambda_min, lambda_max);
3408+
3409+ // Build solver
3410+ ls.Build();
3411+
3412+ if(rebuildnumeric)
3413+ {
3414+ A.UpdateValuesCSR(csr_val2);
3415+ delete[] csr_val2;
3416+
3417+ A.Apply(e, &rhs);
3418+
3419+ ls.ReBuildNumeric();
3420+ ls.Set(lambda_min, lambda_max);
3421+ }
3422+
3423+ // Solve A x = rhs
3424+ ls.Solve(rhs, &x);
3425+
3426+ // Clear solver
3427+ ls.Clear();
3428+ if(p != NULL)
3429+ {
3430+ delete p;
3431+ }
3432+
3433+ // Compute error L2 norm
3434+ e.ScaleAdd(-1.0, x);
3435+ T error = e.Norm();
3436+ std::cout << "Chebyshev iteration ||e - x||_2 = " << error << std::endl;
3437+
3438+ // Stop rocALUTION platform
3439+ stop_rocalution();
3440+ disable_accelerator_rocalution(false);
3441+
3442+ return true;
3443+}
3444\ No newline at end of file
3445diff --git a/clients/include/testing_cr.hpp b/clients/include/testing_cr.hpp
3446index 41658b6..3497337 100644
3447--- a/clients/include/testing_cr.hpp
3448+++ b/clients/include/testing_cr.hpp
3449@@ -1,5 +1,5 @@
3450 /* ************************************************************************
3451- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3452+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3453 *
3454 * Permission is hereby granted, free of charge, to any person obtaining a copy
3455 * of this software and associated documentation files (the "Software"), to deal
3456@@ -44,11 +44,13 @@ static bool check_residual(double res)
3457 template <typename T>
3458 bool testing_cr(Arguments argus)
3459 {
3460- int ndim = argus.size;
3461- std::string precond = argus.precond;
3462- unsigned int format = argus.format;
3463+ int ndim = argus.size;
3464+ std::string precond = argus.precond;
3465+ unsigned int format = argus.format;
3466+ bool disable_accelerator = !argus.use_acc;
3467
3468 // Initialize rocALUTION platform
3469+ disable_accelerator_rocalution(disable_accelerator);
3470 set_device_rocalution(device);
3471 init_rocalution();
3472
3473@@ -69,10 +71,13 @@ bool testing_cr(Arguments argus)
3474 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3475
3476 // Move data to accelerator
3477- A.MoveToAccelerator();
3478- x.MoveToAccelerator();
3479- b.MoveToAccelerator();
3480- e.MoveToAccelerator();
3481+ if(!disable_accelerator)
3482+ {
3483+ A.MoveToAccelerator();
3484+ x.MoveToAccelerator();
3485+ b.MoveToAccelerator();
3486+ e.MoveToAccelerator();
3487+ }
3488
3489 // Allocate x, b and e
3490 x.Allocate("x", A.GetN());
3491@@ -124,6 +129,8 @@ bool testing_cr(Arguments argus)
3492 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3493 else if(precond == "ILU")
3494 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3495+ else if(precond == "ItILU0")
3496+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3497 else if(precond == "ILUT")
3498 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3499 else if(precond == "IC")
3500@@ -150,7 +157,7 @@ bool testing_cr(Arguments argus)
3501 ls.Build();
3502
3503 // Matrix format
3504- A.ConvertTo(format, format == BCSR ? 3 : 1);
3505+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3506
3507 ls.Solve(b, &x);
3508
3509@@ -169,6 +176,7 @@ bool testing_cr(Arguments argus)
3510
3511 // Stop rocALUTION platform
3512 stop_rocalution();
3513+ disable_accelerator_rocalution(false);
3514
3515 return success;
3516 }
3517diff --git a/clients/include/testing_fcg.hpp b/clients/include/testing_fcg.hpp
3518index 57e37bf..9aade14 100644
3519--- a/clients/include/testing_fcg.hpp
3520+++ b/clients/include/testing_fcg.hpp
3521@@ -1,5 +1,5 @@
3522 /* ************************************************************************
3523- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3524+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3525 *
3526 * Permission is hereby granted, free of charge, to any person obtaining a copy
3527 * of this software and associated documentation files (the "Software"), to deal
3528@@ -44,11 +44,13 @@ static bool check_residual(double res)
3529 template <typename T>
3530 bool testing_fcg(Arguments argus)
3531 {
3532- int ndim = argus.size;
3533- std::string precond = argus.precond;
3534- unsigned int format = argus.format;
3535+ int ndim = argus.size;
3536+ std::string precond = argus.precond;
3537+ unsigned int format = argus.format;
3538+ bool disable_accelerator = !argus.use_acc;
3539
3540 // Initialize rocALUTION platform
3541+ disable_accelerator_rocalution(disable_accelerator);
3542 set_device_rocalution(device);
3543 init_rocalution();
3544
3545@@ -69,10 +71,13 @@ bool testing_fcg(Arguments argus)
3546 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3547
3548 // Move data to accelerator
3549- A.MoveToAccelerator();
3550- x.MoveToAccelerator();
3551- b.MoveToAccelerator();
3552- e.MoveToAccelerator();
3553+ if(!disable_accelerator)
3554+ {
3555+ A.MoveToAccelerator();
3556+ x.MoveToAccelerator();
3557+ b.MoveToAccelerator();
3558+ e.MoveToAccelerator();
3559+ }
3560
3561 // Allocate x, b and e
3562 x.Allocate("x", A.GetN());
3563@@ -124,6 +129,8 @@ bool testing_fcg(Arguments argus)
3564 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3565 else if(precond == "ILU")
3566 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3567+ else if(precond == "ItILU0")
3568+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3569 else if(precond == "ILUT")
3570 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3571 else if(precond == "IC")
3572@@ -150,7 +157,7 @@ bool testing_fcg(Arguments argus)
3573 ls.Build();
3574
3575 // Matrix format
3576- A.ConvertTo(format, format == BCSR ? 3 : 1);
3577+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3578
3579 ls.Solve(b, &x);
3580
3581@@ -169,6 +176,7 @@ bool testing_fcg(Arguments argus)
3582
3583 // Stop rocALUTION platform
3584 stop_rocalution();
3585+ disable_accelerator_rocalution(false);
3586
3587 return success;
3588 }
3589diff --git a/clients/include/testing_fgmres.hpp b/clients/include/testing_fgmres.hpp
3590index c92f5af..cd39930 100644
3591--- a/clients/include/testing_fgmres.hpp
3592+++ b/clients/include/testing_fgmres.hpp
3593@@ -1,5 +1,5 @@
3594 /* ************************************************************************
3595- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3596+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3597 *
3598 * Permission is hereby granted, free of charge, to any person obtaining a copy
3599 * of this software and associated documentation files (the "Software"), to deal
3600@@ -34,12 +34,14 @@ using namespace rocalution;
3601 template <typename T>
3602 bool testing_fgmres(Arguments argus)
3603 {
3604- int ndim = argus.size;
3605- int basis = argus.index;
3606- std::string precond = argus.precond;
3607- unsigned int format = argus.format;
3608+ int ndim = argus.size;
3609+ int basis = argus.index;
3610+ std::string precond = argus.precond;
3611+ unsigned int format = argus.format;
3612+ bool disable_accelerator = !argus.use_acc;
3613
3614 // Initialize rocALUTION platform
3615+ disable_accelerator_rocalution(disable_accelerator);
3616 set_device_rocalution(device);
3617 init_rocalution();
3618
3619@@ -60,10 +62,13 @@ bool testing_fgmres(Arguments argus)
3620 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3621
3622 // Move data to accelerator
3623- A.MoveToAccelerator();
3624- x.MoveToAccelerator();
3625- b.MoveToAccelerator();
3626- e.MoveToAccelerator();
3627+ if(!disable_accelerator)
3628+ {
3629+ A.MoveToAccelerator();
3630+ x.MoveToAccelerator();
3631+ b.MoveToAccelerator();
3632+ e.MoveToAccelerator();
3633+ }
3634
3635 // Allocate x, b and e
3636 x.Allocate("x", A.GetN());
3637@@ -115,6 +120,8 @@ bool testing_fgmres(Arguments argus)
3638 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3639 else if(precond == "ILU")
3640 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3641+ else if(precond == "ItILU0")
3642+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3643 else if(precond == "ILUT")
3644 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3645 else if(precond == "IC")
3646@@ -143,7 +150,7 @@ bool testing_fgmres(Arguments argus)
3647 ls.Build();
3648
3649 // Matrix format
3650- A.ConvertTo(format, format == BCSR ? 3 : 1);
3651+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3652
3653 ls.Solve(b, &x);
3654
3655@@ -162,6 +169,7 @@ bool testing_fgmres(Arguments argus)
3656
3657 // Stop rocALUTION platform
3658 stop_rocalution();
3659+ disable_accelerator_rocalution(false);
3660
3661 return success;
3662 }
3663diff --git a/clients/include/testing_global_matrix.hpp b/clients/include/testing_global_matrix.hpp
3664index de9d7d2..439d789 100644
3665--- a/clients/include/testing_global_matrix.hpp
3666+++ b/clients/include/testing_global_matrix.hpp
3667@@ -1,5 +1,5 @@
3668 /* ************************************************************************
3669- * Copyright (C) 2018-2020 Advanced Micro Devices, Inc. All rights Reserved.
3670+ * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved.
3671 *
3672 * Permission is hereby granted, free of charge, to any person obtaining a copy
3673 * of this software and associated documentation files (the "Software"), to deal
3674@@ -355,17 +355,13 @@ void testing_global_matrix_bad_args(void)
3675 ParallelManager pm;
3676 LocalVector<int> lvint;
3677 GlobalMatrix<T>* null_mat = nullptr;
3678- ParallelManager* null_pm = nullptr;
3679 int* null_int = nullptr;
3680- ASSERT_DEATH(mat.CoarsenOperator(
3681- null_mat, &pm, safe_size, safe_size, lvint, safe_size, idata, safe_size),
3682- ".*Assertion.*Ac != (NULL|__null)*");
3683- ASSERT_DEATH(mat.CoarsenOperator(
3684- &Ac, null_pm, safe_size, safe_size, lvint, safe_size, idata, safe_size),
3685- ".*Assertion.*pm != (NULL|__null)*");
3686- ASSERT_DEATH(mat.CoarsenOperator(
3687- &Ac, &pm, safe_size, safe_size, lvint, safe_size, null_int, safe_size),
3688- ".*Assertion.*rG != (NULL|__null)*");
3689+ ASSERT_DEATH(
3690+ mat.CoarsenOperator(null_mat, safe_size, safe_size, lvint, safe_size, idata, safe_size),
3691+ ".*Assertion.*Ac != (NULL|__null)*");
3692+ ASSERT_DEATH(
3693+ mat.CoarsenOperator(&Ac, safe_size, safe_size, lvint, safe_size, null_int, safe_size),
3694+ ".*Assertion.*rG != (NULL|__null)*");
3695 }
3696
3697 free_host(&idata);
3698diff --git a/clients/include/testing_gmres.hpp b/clients/include/testing_gmres.hpp
3699index 2d875b8..1790cb9 100644
3700--- a/clients/include/testing_gmres.hpp
3701+++ b/clients/include/testing_gmres.hpp
3702@@ -1,5 +1,5 @@
3703 /* ************************************************************************
3704- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3705+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3706 *
3707 * Permission is hereby granted, free of charge, to any person obtaining a copy
3708 * of this software and associated documentation files (the "Software"), to deal
3709@@ -34,13 +34,15 @@ using namespace rocalution;
3710 template <typename T>
3711 bool testing_gmres(Arguments argus, bool expectConvergence = true)
3712 {
3713- int ndim = argus.size;
3714- int basis = argus.index;
3715- std::string matrix = argus.matrix;
3716- std::string precond = argus.precond;
3717- unsigned int format = argus.format;
3718+ int ndim = argus.size;
3719+ int basis = argus.index;
3720+ std::string matrix = argus.matrix;
3721+ std::string precond = argus.precond;
3722+ unsigned int format = argus.format;
3723+ bool disable_accelerator = !argus.use_acc;
3724
3725 // Initialize rocALUTION platform
3726+ disable_accelerator_rocalution(disable_accelerator);
3727 set_device_rocalution(device);
3728 init_rocalution();
3729
3730@@ -68,10 +70,13 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true)
3731 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3732
3733 // Move data to accelerator
3734- A.MoveToAccelerator();
3735- x.MoveToAccelerator();
3736- b.MoveToAccelerator();
3737- e.MoveToAccelerator();
3738+ if(!disable_accelerator)
3739+ {
3740+ A.MoveToAccelerator();
3741+ x.MoveToAccelerator();
3742+ b.MoveToAccelerator();
3743+ e.MoveToAccelerator();
3744+ }
3745
3746 // Allocate x, b and e
3747 x.Allocate("x", A.GetN());
3748@@ -123,6 +128,8 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true)
3749 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3750 else if(precond == "ILU")
3751 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3752+ else if(precond == "ItILU0")
3753+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3754 else if(precond == "ILUT")
3755 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3756 else if(precond == "IC")
3757@@ -151,7 +158,7 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true)
3758 ls.Build();
3759
3760 // Matrix format
3761- A.ConvertTo(format, format == BCSR ? 3 : 1);
3762+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3763
3764 ls.Solve(b, &x);
3765
3766@@ -170,6 +177,7 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true)
3767
3768 // Stop rocALUTION platform
3769 stop_rocalution();
3770+ disable_accelerator_rocalution(false);
3771
3772 return success;
3773 }
3774diff --git a/clients/include/testing_idr.hpp b/clients/include/testing_idr.hpp
3775index 000b8af..35efa3a 100644
3776--- a/clients/include/testing_idr.hpp
3777+++ b/clients/include/testing_idr.hpp
3778@@ -1,5 +1,5 @@
3779 /* ************************************************************************
3780- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
3781+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
3782 *
3783 * Permission is hereby granted, free of charge, to any person obtaining a copy
3784 * of this software and associated documentation files (the "Software"), to deal
3785@@ -44,12 +44,14 @@ static bool check_residual(double res)
3786 template <typename T>
3787 bool testing_idr(Arguments argus)
3788 {
3789- int ndim = argus.size;
3790- std::string precond = argus.precond;
3791- unsigned int format = argus.format;
3792- int l = argus.index;
3793+ int ndim = argus.size;
3794+ std::string precond = argus.precond;
3795+ unsigned int format = argus.format;
3796+ int l = argus.index;
3797+ bool disable_accelerator = !argus.use_acc;
3798
3799 // Initialize rocALUTION platform
3800+ disable_accelerator_rocalution(disable_accelerator);
3801 set_device_rocalution(device);
3802 init_rocalution();
3803
3804@@ -70,10 +72,13 @@ bool testing_idr(Arguments argus)
3805 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3806
3807 // Move data to accelerator
3808- A.MoveToAccelerator();
3809- x.MoveToAccelerator();
3810- b.MoveToAccelerator();
3811- e.MoveToAccelerator();
3812+ if(!disable_accelerator)
3813+ {
3814+ A.MoveToAccelerator();
3815+ x.MoveToAccelerator();
3816+ b.MoveToAccelerator();
3817+ e.MoveToAccelerator();
3818+ }
3819
3820 // Allocate x, b and e
3821 x.Allocate("x", A.GetN());
3822@@ -85,7 +90,7 @@ bool testing_idr(Arguments argus)
3823 A.Apply(e, &b);
3824
3825 // Random initial guess
3826- x.SetRandomUniform(12345ULL, -4.0, 6.0);
3827+ x.SetRandomUniform(123456ULL, -4.0, 6.0);
3828
3829 // Solver
3830 IDR<LocalMatrix<T>, LocalVector<T>, T> ls;
3831@@ -125,6 +130,8 @@ bool testing_idr(Arguments argus)
3832 p = new SGS<LocalMatrix<T>, LocalVector<T>, T>;
3833 else if(precond == "ILU")
3834 p = new ILU<LocalMatrix<T>, LocalVector<T>, T>;
3835+ else if(precond == "ItILU0")
3836+ p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>;
3837 else if(precond == "ILUT")
3838 p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>;
3839 else if(precond == "IC")
3840@@ -153,7 +160,7 @@ bool testing_idr(Arguments argus)
3841 ls.Build();
3842
3843 // Matrix format
3844- A.ConvertTo(format, format == BCSR ? 3 : 1);
3845+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3846
3847 ls.Solve(b, &x);
3848
3849@@ -172,6 +179,7 @@ bool testing_idr(Arguments argus)
3850
3851 // Stop rocALUTION platform
3852 stop_rocalution();
3853+ disable_accelerator_rocalution(false);
3854
3855 return success;
3856 }
3857diff --git a/clients/include/testing_inversion.hpp b/clients/include/testing_inversion.hpp
3858index 65cf850..a328af9 100644
3859--- a/clients/include/testing_inversion.hpp
3860+++ b/clients/include/testing_inversion.hpp
3861@@ -1,5 +1,5 @@
3862 /* ************************************************************************
3863- * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
3864+ * Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights Reserved.
3865 *
3866 * Permission is hereby granted, free of charge, to any person obtaining a copy
3867 * of this software and associated documentation files (the "Software"), to deal
3868@@ -44,9 +44,10 @@ static bool check_residual(double res)
3869 template <typename T>
3870 bool testing_inversion(Arguments argus)
3871 {
3872- int ndim = argus.size;
3873- unsigned int format = argus.format;
3874- std::string matrix_type = argus.matrix_type;
3875+ int ndim = argus.size;
3876+ unsigned int format = argus.format;
3877+ std::string matrix_type = argus.matrix_type;
3878+ const bool use_host_and_acc = argus.use_acc;
3879
3880 // Initialize rocALUTION platform
3881 set_device_rocalution(device);
3882@@ -83,12 +84,6 @@ bool testing_inversion(Arguments argus)
3883
3884 A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow);
3885
3886- // Move data to accelerator
3887- A.MoveToAccelerator();
3888- x.MoveToAccelerator();
3889- b.MoveToAccelerator();
3890- e.MoveToAccelerator();
3891-
3892 // Allocate x, b and e
3893 x.Allocate("x", A.GetN());
3894 b.Allocate("b", A.GetM());
3895@@ -110,15 +105,39 @@ bool testing_inversion(Arguments argus)
3896 dls.Build();
3897
3898 // Matrix format
3899- A.ConvertTo(format, format == BCSR ? 3 : 1);
3900+ A.ConvertTo(format, format == BCSR ? argus.blockdim : 1);
3901+
3902+ // Move data to accelerator
3903+ dls.MoveToAccelerator();
3904+ A.MoveToAccelerator();
3905+ x.MoveToAccelerator();
3906+ b.MoveToAccelerator();
3907+ e.MoveToAccelerator();
3908
3909 dls.Solve(b, &x);
3910
3911 // Verify solution
3912 x.ScaleAdd(-1.0, e);
3913- T nrm2 = x.Norm();
3914+ T nrm2_acc = x.Norm();
3915
3916- bool success = check_residual(nrm2);
3917+ bool success = check_residual(nrm2_acc);
3918+
3919+ if(use_host_and_acc)
3920+ {
3921+ dls.MoveToHost();
3922+ A.MoveToHost();
3923+ x.MoveToHost();
3924+ e.MoveToHost();
3925+ b.MoveToHost();
3926+
3927+ dls.Solve(b, &x);
3928+
3929+ // Verify solution
3930+ x.ScaleAdd(-1.0, e);
3931+ T nrm2_host = x.Norm();
3932+
3933+ success = success && check_residual(nrm2_host);
3934+ }
3935
3936 // Clean up
3937 dls.Clear();
3938diff --git a/clients/include/testing_itsolver.hpp b/clients/include/testing_itsolver.hpp
3939new file mode 100644
3940index 0000000..7d76972
3941--- /dev/null
3942+++ b/clients/include/testing_itsolver.hpp
3943@@ -0,0 +1,165 @@
3944+/* ************************************************************************
3945+ * Copyright (C) 2025 Advanced Micro Devices, Inc. All rights Reserved.
3946+ *
3947+ * Permission is hereby granted, free of charge, to any person obtaining a copy
3948+ * of this software and associated documentation files (the "Software"), to deal
3949+ * in the Software without restriction, including without limitation the rights
3950+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
3951+ * copies of the Software, and to permit persons to whom the Software is
3952+ * furnished to do so, subject to the following conditions:
3953+ *
3954+ * The above copyright notice and this permission notice shall be included in
3955+ * all copies or substantial portions of the Software.
3956+ *
3957+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
3958+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
3959+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
3960+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
3961+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
3962+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
3963+ * THE SOFTWARE.
3964+ *
3965+ * ************************************************************************ */
3966+
3967+#pragma once
3968+
3969+#include "utility.hpp"
3970+
3971+#include <rocalution/rocalution.hpp>
3972+
3973+template <typename T>
3974+bool testing_itsolver(Arguments argus)
3975+{
3976+ using namespace rocalution;
3977+ // Smoke test: solves A x = b with FixedPoint preconditioned by ItILU0 using an iterative triangular solve; every return path yields true.
3978+ int ndim = argus.size;
3979+ unsigned int format = argus.format; // NOTE(review): not referenced again in this function
3980+ std::string matrix_type = argus.matrix_type;
3981+ bool disable_accelerator = !argus.use_acc;
3982+
3983+ // Initialize rocALUTION platform (the disable-accelerator flag is set from argus.use_acc before init)
3984+ disable_accelerator_rocalution(disable_accelerator);
3985+ set_device_rocalution(device);
3986+ init_rocalution();
3987+
3988+ // rocALUTION structures: operator A, solution x, right-hand side b, reference vector e
3989+ LocalMatrix<T> A;
3990+ LocalVector<T> x;
3991+ LocalVector<T> b;
3992+ LocalVector<T> e;
3993+
3994+ // Generate A as CSR arrays, selected by argus.matrix_type
3995+ int* csr_ptr = NULL;
3996+ int* csr_col = NULL;
3997+ T* csr_val = NULL;
3998+
3999+ int nrow = 0;
4000+ int ncol = 0; // NOTE(review): assigned in both branches below but never read
4001+ if(matrix_type == "Laplacian2D")
4002+ {
4003+ nrow = gen_2d_laplacian(ndim, &csr_ptr, &csr_col, &csr_val);
4004+ ncol = nrow;
4005+ }
4006+ else if(matrix_type == "PermutedIdentity")
4007+ {
4008+ nrow = gen_permuted_identity(ndim, &csr_ptr, &csr_col, &csr_val);
4009+ ncol = nrow;
4010+ }
4011+ else
4012+ {
4013+ stop_rocalution();
4014+ disable_accelerator_rocalution(false);
4015+ return true; // unrecognized matrix_type: platform is stopped and the test reports success without solving
4016+ }
4017+ int nnz = csr_ptr[nrow]; // the row-pointer entry at index nrow is used as the non-zero count
4018+
4019+ A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); // nrow is passed for both dimensions
4020+
4021+ // Move data to accelerator (skipped when the accelerator is disabled)
4022+ if(!disable_accelerator)
4023+ {
4024+ A.MoveToAccelerator();
4025+ x.MoveToAccelerator();
4026+ b.MoveToAccelerator();
4027+ e.MoveToAccelerator();
4028+ }
4029+
4030+ // Allocate x, b and e with sizes taken from A.GetN() / A.GetM()
4031+ x.Allocate("x", A.GetN());
4032+ b.Allocate("b", A.GetM());
4033+ e.Allocate("e", A.GetN());
4034+
4035+ // Linear Solver
4036+ FixedPoint<LocalMatrix<T>, LocalVector<T>, T> fp;
4037+
4038+ // Preconditioner
4039+ ItILU0<LocalMatrix<T>, LocalVector<T>, T> p;
4040+
4041+ // Set iterative ILU stopping criteria (tolerance 1e-8, at most 50 iterations)
4042+ p.SetTolerance(1e-8);
4043+ p.SetMaxIter(50);
4044+
4045+ p.SetAlgorithm(ItILU0Algorithm::SyncSplit);
4046+
4047+ // Set up iterative triangular solve (at most 30 iterations, tolerance 1e-8)
4048+ SolverDescr descr;
4049+ descr.SetTriSolverAlg(TriSolverAlg_Iterative);
4050+ descr.SetIterativeSolverMaxIteration(30);
4051+ descr.SetIterativeSolverTolerance(1e-8);
4052+
4053+ descr.DisableIterativeSolverTolerance();
4054+ descr.EnableIterativeSolverTolerance(); // called right after the disable; NOTE(review): presumably only to exercise both setters - confirm
4055+ SolverDescr descr_new(descr); // Copy the descriptor; the copy is what gets handed to the preconditioner
4056+
4057+ p.SetSolverDescriptor(descr_new);
4058+
4059+ // Initialize b such that A 1 = b (e is set to ones and kept as the reference solution)
4060+ e.Ones();
4061+ A.Apply(e, &b);
4062+
4063+ // Initial zero guess
4064+ x.Zeros();
4065+
4066+ // Set solver operator
4067+ fp.SetOperator(A);
4068+ // Set solver preconditioner
4069+ fp.SetPreconditioner(p);
4070+
4071+ // Build solver
4072+ fp.Build();
4073+
4074+ // Verbosity output
4075+ fp.Verbose(1);
4076+
4077+ fp.InitMinIter(1);
4078+ fp.InitMaxIter(1000);
4079+ fp.InitTol(1e-8, 1e-8, 1e-8);
4080+
4081+ // Print matrix info
4082+ A.Info();
4083+
4084+ // Solve A x = b
4085+ fp.Solve(b, &x);
4086+ // Query preconditioner and solver state; NOTE(review): none of the values fetched below are checked or printed
4087+ int niter_preconditioner;
4088+ const double* history = p.GetConvergenceHistory(&niter_preconditioner);
4089+
4090+ auto res_final = fp.GetCurrentResidual();
4091+ //auto res_init = fp.GetInitialResidual();
4092+ //auto niter = fp.GetNumIterations();
4093+ auto status_solver = fp.GetSolverStatus();
4094+ auto ind = fp.GetAmaxResidualIndex();
4095+ // Clear solver
4096+ fp.Clear();
4097+
4098+ // Compute error L2 norm (e is updated in place; the norm is printed, not asserted)
4099+ e.ScaleAdd(-1.0, x);
4100+ T error = e.Norm();
4101+ std::cout << "||e - x||_2 = " << error << std::endl;
4102+
4103+ // Stop rocALUTION platform and reset the disable-accelerator flag to false
4104+ stop_rocalution();
4105+ disable_accelerator_rocalution(false);
4106+
4107+ return true;
4108+}
4109\ No newline at end of file
4110diff --git a/clients/include/testing_local_matrix.hpp b/clients/include/testing_local_matrix.hpp
4111index 3145323..82e21e7 100644
4112--- a/clients/include/testing_local_matrix.hpp
4113+++ b/clients/include/testing_local_matrix.hpp
4114@@ -1,5 +1,5 @@
4115 /* ************************************************************************
4116- * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved.
4117+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved.
4118 *
4119 * Permission is hereby granted, free of charge, to any person obtaining a copy
4120 * of this software and associated documentation files (the "Software"), to deal
4121@@ -41,10 +41,12 @@ void testing_local_matrix_bad_args(void)
4122 set_device_rocalution(device);
4123 init_rocalution();
4124
4125- LocalMatrix<T> mat1;
4126- LocalMatrix<T> mat2;
4127- LocalVector<T> vec1;
4128- LocalVector<int> int1;
4129+ LocalMatrix<T> mat1;
4130+ LocalMatrix<T> mat2;
4131+ LocalVector<T> vec1;
4132+ LocalVector<bool> bool1;
4133+ LocalVector<int> int1;
4134+ LocalVector<int64_t> int641;
4135
4136 // null pointers
4137 int* null_int = nullptr;
4138@@ -57,6 +59,10 @@ void testing_local_matrix_bad_args(void)
4139 allocate_host(safe_size, &vint);
4140 allocate_host(safe_size, &vdata);
4141
4142+ // Valid matrices
4143+ LocalMatrix<T> mat3;
4144+ mat3.AllocateCSR("valid", safe_size, safe_size, safe_size);
4145+
4146 // ExtractSubMatrix, ExtractSubMatrices, Extract(Inverse)Diagonal, ExtractL/U
4147 {
4148 LocalMatrix<T>* mat_null = nullptr;
4149@@ -114,6 +120,23 @@ void testing_local_matrix_bad_args(void)
4150 ASSERT_DEATH(mat1.QRSolve(vec1, null_vec), ".*Assertion.*out != (NULL|__null)*");
4151 }
4152
4153+ // ItLSolve, ItUSolve, ItLLSolve, ItLUSolve
4154+ {
4155+ LocalVector<T>* null_vec = nullptr;
4156+ int max_iter = 1;
4157+ double tol = 0;
4158+ ASSERT_DEATH(mat1.ItLSolve(max_iter, tol, true, vec1, null_vec),
4159+ ".*Assertion.*out != (NULL|__null)*");
4160+ ASSERT_DEATH(mat1.ItUSolve(max_iter, tol, true, vec1, null_vec),
4161+ ".*Assertion.*out != (NULL|__null)*");
4162+ ASSERT_DEATH(mat1.ItLLSolve(max_iter, tol, true, vec1, null_vec),
4163+ ".*Assertion.*out != (NULL|__null)*");
4164+ ASSERT_DEATH(mat1.ItLLSolve(max_iter, tol, true, vec1, vec1, null_vec),
4165+ ".*Assertion.*out != (NULL|__null)*");
4166+ ASSERT_DEATH(mat1.ItLUSolve(max_iter, tol, true, vec1, null_vec),
4167+ ".*Assertion.*out != (NULL|__null)*");
4168+ }
4169+
4170 // ICFactorize, Householder
4171 {
4172 T val;
4173@@ -127,12 +150,12 @@ void testing_local_matrix_bad_args(void)
4174 ASSERT_DEATH(mat1.UpdateValuesCSR(null_data), ".*Assertion.*val != (NULL|__null)*");
4175 ASSERT_DEATH(mat1.CopyFromCSR(null_int, vint, vdata),
4176 ".*Assertion.*row_offsets != (NULL|__null)*");
4177- ASSERT_DEATH(mat1.CopyFromCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*");
4178- ASSERT_DEATH(mat1.CopyFromCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*");
4179+ ASSERT_DEATH(mat3.CopyFromCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*");
4180+ ASSERT_DEATH(mat3.CopyFromCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*");
4181 ASSERT_DEATH(mat1.CopyToCSR(null_int, vint, vdata),
4182 ".*Assertion.*row_offsets != (NULL|__null)*");
4183- ASSERT_DEATH(mat1.CopyToCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*");
4184- ASSERT_DEATH(mat1.CopyToCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*");
4185+ ASSERT_DEATH(mat3.CopyToCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*");
4186+ ASSERT_DEATH(mat3.CopyToCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*");
4187 ASSERT_DEATH(mat1.CopyFromCOO(null_int, vint, vdata), ".*Assertion.*row != (NULL|__null)*");
4188 ASSERT_DEATH(mat1.CopyFromCOO(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*");
4189 ASSERT_DEATH(mat1.CopyFromCOO(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*");
4190@@ -173,24 +196,27 @@ void testing_local_matrix_bad_args(void)
4191
4192 // AMG
4193 {
4194+ int val;
4195+ LocalVector<bool>* bool_null_vec = nullptr;
4196+ LocalVector<int64_t>* int64_null_vec = nullptr;
4197+ ASSERT_DEATH(mat1.AMGGreedyAggregate(0.1, bool_null_vec, &int641, &int641),
4198+ ".*Assertion.*connections != (NULL|__null)*");
4199+ ASSERT_DEATH(mat1.AMGGreedyAggregate(0.1, &bool1, int64_null_vec, &int641),
4200+ ".*Assertion.*aggregates != (NULL|__null)*");
4201+ ASSERT_DEATH(mat1.AMGGreedyAggregate(0.1, &bool1, &int641, int64_null_vec),
4202+ ".*Assertion.*aggregate_root_nodes != (NULL|__null)*");
4203+
4204+ LocalMatrix<T>* null_mat = nullptr;
4205+ ASSERT_DEATH(mat1.AMGSmoothedAggregation(0.1, bool1, int641, int641, null_mat),
4206+ ".*Assertion.*prolong != (NULL|__null)*");
4207+ }
4208+
4209+ {
4210 int val;
4211 LocalVector<int>* null_vec = nullptr;
4212 LocalMatrix<T>* null_mat = nullptr;
4213- ASSERT_DEATH(mat1.AMGConnect(0.1, null_vec), ".*Assertion.*connections != (NULL|__null)*");
4214- ASSERT_DEATH(mat1.AMGAggregate(int1, null_vec),
4215- ".*Assertion.*aggregates != (NULL|__null)*");
4216- ASSERT_DEATH(mat1.AMGSmoothedAggregation(0.1, int1, int1, null_mat, &mat2),
4217- ".*Assertion.*prolong != (NULL|__null)*");
4218- ASSERT_DEATH(mat1.AMGSmoothedAggregation(0.1, int1, int1, &mat2, null_mat),
4219- ".*Assertion.*restrict != (NULL|__null)*");
4220- ASSERT_DEATH(mat1.AMGAggregation(int1, null_mat, &mat2),
4221+ ASSERT_DEATH(mat1.AMGUnsmoothedAggregation(int641, int641, null_mat),
4222 ".*Assertion.*prolong != (NULL|__null)*");
4223- ASSERT_DEATH(mat1.AMGAggregation(int1, &mat2, null_mat),
4224- ".*Assertion.*restrict != (NULL|__null)*");
4225- ASSERT_DEATH(mat1.RugeStueben(0.1, null_mat, &mat2),
4226- ".*Assertion.*prolong != (NULL|__null)*");
4227- ASSERT_DEATH(mat1.RugeStueben(0.1, &mat2, null_mat),
4228- ".*Assertion.*restrict != (NULL|__null)*");
4229 ASSERT_DEATH(mat1.InitialPairwiseAggregation(0.1, val, null_vec, val, &null_int, val, 0),
4230 ".*Assertion.*G != (NULL|__null)*");
4231 ASSERT_DEATH(mat1.InitialPairwiseAggregation(0.1, val, &int1, val, &vint, val, 0),
4232@@ -208,12 +234,11 @@ void testing_local_matrix_bad_args(void)
4233 ".*Assertion.*G != (NULL|__null)*");
4234 ASSERT_DEATH(mat1.FurtherPairwiseAggregation(mat2, 0.1, val, &int1, val, &null_int, val, 0),
4235 ".*Assertion.*rG != (NULL|__null)*");
4236- ASSERT_DEATH(mat1.CoarsenOperator(
4237- null_mat, nullptr, safe_size, safe_size, int1, safe_size, vint, safe_size),
4238- ".*Assertion.*Ac != (NULL|__null)*");
4239 ASSERT_DEATH(
4240- mat1.CoarsenOperator(
4241- &mat2, nullptr, safe_size, safe_size, int1, safe_size, null_int, safe_size),
4242+ mat1.CoarsenOperator(null_mat, safe_size, safe_size, int1, safe_size, vint, safe_size),
4243+ ".*Assertion.*Ac != (NULL|__null)*");
4244+ ASSERT_DEATH(
4245+ mat1.CoarsenOperator(&mat2, safe_size, safe_size, int1, safe_size, null_int, safe_size),
4246 ".*Assertion.*rG != (NULL|__null)*");
4247 }
4248
4249@@ -478,16 +503,2319 @@ bool testing_local_matrix_allocations(Arguments argus)
4250 LocalMatrix<T> D;
4251 D.AllocateDIA("D", nnz, m, n, ndiag);
4252
4253+ LocalMatrix<T> E;
4254+ E.AllocateMCSR("E", nnz, m, n);
4255+
4256 LocalMatrix<T> F;
4257 F.AllocateELL("F", ell_nnz, m, n, ell_max_row);
4258
4259 LocalMatrix<T> G;
4260 G.AllocateHYB("G", ell_nnz, coo_nnz, ell_max_row, m, n);
4261
4262+ LocalMatrix<T> H;
4263+ H.AllocateDENSE("H", m, n);
4264+
4265+ // Stop rocALUTION platform
4266+ stop_rocalution();
4267+
4268+ return true;
4269+}
4270+
4271+template <typename T>
4272+bool testing_local_matrix_zero(Arguments argus)
4273+{
4274+ int size = argus.size;
4275+ int blockdim = argus.blockdim;
4276+ // Square size x size matrix; mb/nb round the dimensions up to whole blocks. NOTE(review): mb and nb are not used below
4277+ int m = size;
4278+ int n = size;
4279+ int mb = (m + blockdim - 1) / blockdim;
4280+ int nb = (n + blockdim - 1) / blockdim;
4281+
4282+ int nnz = 0.05 * m * n; // 5% of m*n, converted from double to int
4283+ if(nnz == 0) // the 5% estimate came out as zero: use m*n instead
4284+ {
4285+ nnz = m * n;
4286+ }
4287+
4288+ // Initialize rocALUTION
4289+ set_device_rocalution(device);
4290+ init_rocalution();
4291+
4292+ // Testing Zeros: allocate a CSR matrix with nnz entries, call Zeros(), then call Info()
4293+ LocalMatrix<T> A;
4294+ A.AllocateCSR("A", nnz, m, n);
4295+
4296+ A.Zeros();
4297+
4298+ A.Info();
4299+
4300+ // Stop rocALUTION platform
4301+ stop_rocalution();
4302+
4303+ return true; // always reports success; no matrix contents are checked in this function
4304+}
4305+
4306+template <typename T>
4307+bool testing_local_matrix_set_data_ptr(Arguments argus)
4308+{
4309+ int size = argus.size;
4310+ int blockdim = argus.blockdim;
4311+
4312+ int m = size;
4313+ int n = size;
4314+ int mb = (m + blockdim - 1) / blockdim;
4315+ int nb = (n + blockdim - 1) / blockdim;
4316+
4317+ int nnz = 0.05 * m * n;
4318+ if(nnz == 0)
4319+ {
4320+ nnz = m * n;
4321+ }
4322+
4323+ int nnzb = 0.01 * mb * nb;
4324+ if(nnzb == 0)
4325+ {
4326+ nnzb = mb * nb;
4327+ }
4328+
4329+ // Initialize rocALUTION
4330+ set_device_rocalution(device);
4331+ init_rocalution();
4332+
4333+ int ndiag = 5;
4334+ int ell_max_row = 6;
4335+ int ell_nnz = ell_max_row * m;
4336+ int coo_nnz = (nnz - ell_nnz) < 0 ? 0 : nnz - ell_nnz;
4337+
4338+ // Testing allocating matrix types
4339+ {
4340+ LocalMatrix<T> A;
4341+ int* row_offset = NULL;
4342+ int* col = NULL;
4343+ T* val = NULL;
4344+
4345+ allocate_host(m + 1, &row_offset);
4346+ allocate_host(nnz, &col);
4347+ allocate_host(nnz, &val);
4348+
4349+ set_to_zero_host(m + 1, row_offset);
4350+ set_to_zero_host(nnz, col);
4351+ set_to_zero_host(nnz, val);
4352+
4353+ A.SetDataPtrCSR(&row_offset, &col, &val, "A", nnz, m, n);
4354+ A.LeaveDataPtrCSR(&row_offset, &col, &val);
4355+
4356+ free_host(&row_offset);
4357+ free_host(&col);
4358+ free_host(&val);
4359+ }
4360+
4361+ {
4362+ LocalMatrix<T> B;
4363+ int* row_offset = NULL;
4364+ int* col = NULL;
4365+ T* val = NULL;
4366+
4367+ allocate_host(mb + 1, &row_offset);
4368+ allocate_host(nnzb, &col);
4369+ allocate_host(nnzb, &val);
4370+
4371+ set_to_zero_host(mb + 1, row_offset);
4372+ set_to_zero_host(nnzb, col);
4373+ set_to_zero_host(nnzb, val);
4374+
4375+ B.SetDataPtrBCSR(&row_offset, &col, &val, "C", nnzb, mb, nb, blockdim);
4376+ B.LeaveDataPtrBCSR(&row_offset, &col, &val, blockdim);
4377+
4378+ free_host(&row_offset);
4379+ free_host(&col);
4380+ free_host(&val);
4381+ }
4382+
4383+ {
4384+ LocalMatrix<T> C;
4385+ int* row = NULL;
4386+ int* col = NULL;
4387+ T* val = NULL;
4388+
4389+ allocate_host(nnz, &row);
4390+ allocate_host(nnz, &col);
4391+ allocate_host(nnz, &val);
4392+
4393+ set_to_zero_host(nnz, row);
4394+ set_to_zero_host(nnz, col);
4395+ set_to_zero_host(nnz, val);
4396+
4397+ C.SetDataPtrCOO(&row, &col, &val, "C", nnz, m, n);
4398+ C.LeaveDataPtrCOO(&row, &col, &val);
4399+
4400+ free_host(&row);
4401+ free_host(&col);
4402+ free_host(&val);
4403+ }
4404+
4405+ {
4406+ LocalMatrix<T> E;
4407+ int* row_offset = NULL;
4408+ int* col = NULL;
4409+ T* val = NULL;
4410+
4411+ allocate_host(m + 1, &row_offset);
4412+ allocate_host(nnz, &col);
4413+ allocate_host(nnz, &val);
4414+
4415+ set_to_zero_host(m + 1, row_offset);
4416+ set_to_zero_host(nnz, col);
4417+ set_to_zero_host(nnz, val);
4418+
4419+ E.SetDataPtrMCSR(&row_offset, &col, &val, "C", nnz, m, n);
4420+ E.LeaveDataPtrMCSR(&row_offset, &col, &val);
4421+
4422+ free_host(&row_offset);
4423+ free_host(&col);
4424+ free_host(&val);
4425+ }
4426+
4427+ {
4428+ LocalMatrix<T> F;
4429+ int* col = NULL;
4430+ T* val = NULL;
4431+
4432+ allocate_host(ell_nnz, &col);
4433+ allocate_host(ell_nnz, &val);
4434+
4435+ set_to_zero_host(ell_nnz, col);
4436+ set_to_zero_host(ell_nnz, val);
4437+
4438+ F.SetDataPtrELL(&col, &val, "C", ell_nnz, m, n, ell_max_row);
4439+ F.LeaveDataPtrELL(&col, &val, ell_max_row);
4440+
4441+ free_host(&col);
4442+ free_host(&val);
4443+ }
4444+
4445+ {
4446+ LocalMatrix<T> H;
4447+ T* val = NULL;
4448+
4449+ allocate_host(m * n, &val);
4450+
4451+ set_to_zero_host(m * n, val);
4452+
4453+ H.SetDataPtrDENSE(&val, "C", m, n);
4454+ H.LeaveDataPtrDENSE(&val);
4455+
4456+ free_host(&val);
4457+ }
4458+
4459 // Stop rocALUTION platform
4460 stop_rocalution();
4461
4462 return true;
4463 }
4464
4465+template <typename T>
4466+LocalMatrix<T> getTestMatrix()
4467+{
4468+ // Create a simple 2x2 CSR matrix
4469+ LocalMatrix<T> matrix;
4470+ matrix.AllocateCSR("TestMatrix", 4, 2, 2);
4471+
4472+ int row_offsets[3] = {0, 2, 4};
4473+ int col_indices[4] = {0, 1, 0, 1};
4474+ T values[4] = {1.0, 2.0, 3.0, 4.0};
4475+ matrix.CopyFromCSR(row_offsets, col_indices, values);
4476+
4477+ return matrix;
4478+}
4479+
4480+template <typename T>
4481+void getTestMatrix(Arguments argus, LocalMatrix<T>& matrix, bool& is_invertible)
4482+{
4483+ int size = argus.size;
4484+ int blockdim = argus.blockdim;
4485+ std::string matrix_type = argus.matrix_type;
4486+
4487+ // Generate A
4488+ int* csr_ptr = NULL;
4489+ int* csr_col = NULL;
4490+ T* csr_val = NULL;
4491+
4492+ int nrow = 0;
4493+ int ncol = 0;
4494+ if(matrix_type == "Laplacian2D")
4495+ {
4496+ nrow = gen_2d_laplacian(size, &csr_ptr, &csr_col, &csr_val);
4497+ ncol = nrow;
4498+
4499+ is_invertible = true;
4500+ }
4501+ else if(matrix_type == "PermutedIdentity")
4502+ {
4503+ nrow = gen_permuted_identity(size, &csr_ptr, &csr_col, &csr_val);
4504+ ncol = nrow;
4505+
4506+ is_invertible = true;
4507+ }
4508+ else if(matrix_type == "Random")
4509+ {
4510+ nrow = gen_random(100 * size, 50 * size, 6, &csr_ptr, &csr_col, &csr_val);
4511+ ncol = 50 * size;
4512+
4513+ is_invertible = false;
4514+ }
4515+ else
4516+ {
4517+ is_invertible = true;
4518+
4519+ matrix = getTestMatrix<T>();
4520+ return;
4521+ }
4522+
4523+ int nnz = csr_ptr[nrow];
4524+
4525+ matrix.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "TestMatrix", nnz, nrow, ncol);
4526+}
4527+
4528+template <typename T>
4529+void getTestMatrix(Arguments argus, LocalMatrix<T>& matrix)
4530+{
4531+ bool is_invertible;
4532+ getTestMatrix<T>(argus, matrix, is_invertible);
4533+}
4534+
4535+template <typename T>
4536+void getMatrixVal(const LocalMatrix<T>& matrix, T* values)
4537+{
4538+ // Copy the values from the matrix to the provided array
4539+ int64_t m = matrix.GetM();
4540+ int64_t nnz = matrix.GetNnz();
4541+
4542+ int* row_offsets = new int[m + 1];
4543+ int* col_indices = new int[nnz];
4544+ T* matrix_values = new T[nnz];
4545+
4546+ matrix.CopyToCSR(row_offsets, col_indices, matrix_values);
4547+ for(int i = 0; i < nnz; ++i)
4548+ {
4549+ values[i] = matrix_values[i];
4550+ }
4551+
4552+ delete[] row_offsets;
4553+ delete[] col_indices;
4554+ delete[] matrix_values;
4555+}
4556+
4557+template <typename T>
4558+void getMatrixDiagVal(const LocalMatrix<T>& matrix, T* values)
4559+{
4560+ // Copy the values from the matrix to the provided array
4561+ int64_t m = matrix.GetM();
4562+ int64_t nnz = matrix.GetNnz();
4563+
4564+ int* row_offsets = new int[m + 1];
4565+ int* col_indices = new int[nnz];
4566+ T* matrix_values = new T[nnz];
4567+
4568+ matrix.CopyToCSR(row_offsets, col_indices, matrix_values);
4569+ for(int row = 0; row < m; ++row)
4570+ {
4571+ int start = row_offsets[row];
4572+ int end = row_offsets[row + 1];
4573+ for(int i = start; i < end; ++i)
4574+ {
4575+ if(col_indices[i] == row) // Diagonal element
4576+ {
4577+ values[row] = matrix_values[i];
4578+ break; // Only one diagonal element per row
4579+ }
4580+ }
4581+ }
4582+
4583+ delete[] row_offsets;
4584+ delete[] col_indices;
4585+ delete[] matrix_values;
4586+}
4587+
4588+void checkPermutation(const LocalVector<int>& permutation)
4589+{
4590+ // Check that permutation is a valid permutation of 0..N-1
4591+ std::vector<int> seen(permutation.GetSize(), 0);
4592+ for(int i = 0; i < permutation.GetSize(); ++i)
4593+ {
4594+ int idx = permutation[i];
4595+ EXPECT_GE(idx, 0);
4596+ EXPECT_LT(idx, permutation.GetSize());
4597+ seen[idx]++;
4598+ }
4599+ for(int i = 0; i < seen.size(); ++i)
4600+ {
4601+ EXPECT_EQ(seen[i], 1); // Each index appears exactly once
4602+ }
4603+}
4604+
4605+template <typename T>
4606+T getTolerance()
4607+{
4608+ // Set tolerance based on the type
4609+ if(std::is_same<T, float>::value)
4610+ {
4611+ return 1e-5f; // Tolerance for float
4612+ }
4613+ else
4614+ {
4615+ return 1e-10; // Default tolerance for other types
4616+ }
4617+}
4618+
4619+// Helper to extract dense matrix from LocalMatrix<T>
4620+template <typename T>
4621+std::vector<std::vector<T>> extract_dense_matrix(const LocalMatrix<T>& matrix)
4622+{
4623+ int m = matrix.GetM();
4624+ int n = matrix.GetN();
4625+ int nnz = matrix.GetNnz();
4626+ std::vector<std::vector<T>> dense(m, std::vector<T>(n, static_cast<T>(0)));
4627+ std::vector<int> row_offsets(m + 1);
4628+ std::vector<int> col_indices(nnz);
4629+ std::vector<T> values(nnz);
4630+
4631+ matrix.CopyToCSR(row_offsets.data(), col_indices.data(), values.data());
4632+ for(int row = 0; row < m; ++row)
4633+ {
4634+ for(int idx = row_offsets[row]; idx < row_offsets[row + 1]; ++idx)
4635+ {
4636+ int col = col_indices[idx];
4637+ dense[row][col] = values[idx];
4638+ }
4639+ }
4640+ return dense;
4641+}
4642+
4643+template <typename T>
4644+void testing_local_allocate()
4645+{
4646+ // Test AllocateCSR
4647+ {
4648+ LocalMatrix<T> mat;
4649+ EXPECT_NO_THROW(mat.AllocateCSR("AllocatedMatrix", 4, 2, 2));
4650+ EXPECT_EQ(mat.GetNnz(), 4);
4651+ EXPECT_EQ(mat.GetM(), 2);
4652+ EXPECT_EQ(mat.GetN(), 2);
4653+ }
4654+
4655+ // Test AllocateCOO
4656+ {
4657+ LocalMatrix<T> mat;
4658+ mat.AllocateCOO("AllocatedMatrix", 4, 2, 2);
4659+ EXPECT_EQ(mat.GetNnz(), 4);
4660+ EXPECT_EQ(mat.GetM(), 2);
4661+ EXPECT_EQ(mat.GetN(), 2);
4662+ }
4663+
4664+ // Test AllocateBCSR
4665+ {
4666+ LocalMatrix<T> mat;
4667+ int nnzb = 4, mb = 2, nb = 2, blockdim = 2;
4668+ EXPECT_NO_THROW(mat.AllocateBCSR("BCSR", nnzb, mb, nb, blockdim));
4669+ EXPECT_EQ(mat.GetNnz(), nnzb * blockdim * blockdim);
4670+ EXPECT_EQ(mat.GetM(), mb * blockdim);
4671+ EXPECT_EQ(mat.GetN(), nb * blockdim);
4672+ }
4673+
4674+ // Test AllocateDIA
4675+ {
4676+ LocalMatrix<T> mat;
4677+ int nnz = 6, m = 3, n = 3, ndiag = 2;
4678+ EXPECT_NO_THROW(mat.AllocateDIA("DIA", nnz, m, n, ndiag));
4679+ EXPECT_EQ(mat.GetNnz(), nnz);
4680+ EXPECT_EQ(mat.GetM(), m);
4681+ EXPECT_EQ(mat.GetN(), n);
4682+ }
4683+
4684+ // Test AllocateMCSR
4685+ {
4686+ LocalMatrix<T> mat;
4687+ int nnz = 5, m = 3, n = 3;
4688+ EXPECT_NO_THROW(mat.AllocateMCSR("MCSR", nnz, m, n));
4689+ EXPECT_EQ(mat.GetNnz(), nnz);
4690+ EXPECT_EQ(mat.GetM(), m);
4691+ EXPECT_EQ(mat.GetN(), n);
4692+ }
4693+
4694+ // Test AllocateELL
4695+ {
4696+ LocalMatrix<T> mat;
4697+ int ell_nnz = 6, m = 3, n = 3, ell_max_row = 2;
4698+ EXPECT_NO_THROW(mat.AllocateELL("ELL", ell_nnz, m, n, ell_max_row));
4699+ EXPECT_EQ(mat.GetNnz(), ell_nnz);
4700+ EXPECT_EQ(mat.GetM(), m);
4701+ EXPECT_EQ(mat.GetN(), n);
4702+ }
4703+
4704+ // Test AllocateHYB
4705+ {
4706+ LocalMatrix<T> mat;
4707+ int m = 3, n = 3;
4708+ int ell_max_row = 2;
4709+ int ell_nnz = ell_max_row * m; // 2 * 3 = 6
4710+ int coo_nnz = 2;
4711+
4712+ EXPECT_NO_THROW(mat.AllocateHYB("HYB", ell_nnz, coo_nnz, ell_max_row, m, n));
4713+ EXPECT_EQ(mat.GetNnz(), ell_nnz + coo_nnz); // 8
4714+ EXPECT_EQ(mat.GetM(), m);
4715+ EXPECT_EQ(mat.GetN(), n);
4716+ }
4717+
4718+ // Test AllocateDENSE
4719+ {
4720+ LocalMatrix<T> mat;
4721+ int m = 3, n = 3;
4722+ EXPECT_NO_THROW(mat.AllocateDENSE("DENSE", m, n));
4723+ EXPECT_EQ(mat.GetNnz(), m * n);
4724+ EXPECT_EQ(mat.GetM(), m);
4725+ EXPECT_EQ(mat.GetN(), n);
4726+ }
4727+}
4728+
4729+template <typename T>
4730+void testing_check_with_empty_matrix()
4731+{
4732+ LocalMatrix<T> empty_matrix;
4733+ // Check should pass without any issues
4734+ EXPECT_NO_THROW(empty_matrix.Check());
4735+ // Info should not throw an error
4736+ EXPECT_NO_THROW(empty_matrix.Info());
4737+}
4738+
4739+template <typename T>
4740+void testing_local_copy_from_async()
4741+{
4742+ auto matrix = getTestMatrix<T>();
4743+ LocalMatrix<T> copy_matrix;
4744+ // CopyFromAsync should copy the matrix asynchronously (if supported)
4745+ EXPECT_NO_THROW(copy_matrix.CopyFromAsync(matrix));
4746+ EXPECT_NO_THROW(matrix.Sync());
4747+ EXPECT_EQ(copy_matrix.GetM(), matrix.GetM());
4748+ EXPECT_EQ(copy_matrix.GetN(), matrix.GetN());
4749+ EXPECT_EQ(copy_matrix.GetNnz(), matrix.GetNnz());
4750+
4751+ // Compare dense representations
4752+ auto dense_orig = extract_dense_matrix(matrix);
4753+ auto dense_copy = extract_dense_matrix(copy_matrix);
4754+
4755+ EXPECT_EQ(dense_orig.size(), dense_copy.size());
4756+ for(size_t i = 0; i < dense_orig.size(); ++i)
4757+ {
4758+ EXPECT_EQ(dense_orig[i].size(), dense_copy[i].size());
4759+ for(size_t j = 0; j < dense_orig[i].size(); ++j)
4760+ {
4761+ EXPECT_EQ(dense_orig[i][j], dense_copy[i][j]);
4762+ }
4763+ }
4764+}
4765+
4766+template <typename T>
4767+void testing_local_update_values_csr()
4768+{
4769+ auto matrix = getTestMatrix<T>();
4770+
4771+ int64_t nnz = matrix.GetNnz();
4772+
4773+ // Use std::vector instead of raw arrays
4774+ std::vector<T> new_values(nnz);
4775+ for(int64_t i = 0; i < nnz; ++i)
4776+ {
4777+ new_values[i] = static_cast<T>(i + 10); // Fill with some values
4778+ }
4779+
4780+ // UpdateValuesCSR should update the values in the matrix
4781+ EXPECT_NO_THROW(matrix.UpdateValuesCSR(new_values.data()));
4782+
4783+ std::vector<T> check_values(nnz);
4784+ getMatrixVal(matrix, check_values.data());
4785+ for(int64_t i = 0; i < nnz; ++i)
4786+ {
4787+ EXPECT_EQ(check_values[i], new_values[i]);
4788+ }
4789+}
4790+
4791+template <typename T>
4792+void testing_local_move_to_accelerator()
4793+{
4794+ auto matrix = getTestMatrix<T>();
4795+
4796+ for(int i = 0; i < 10; i++)
4797+ {
4798+ EXPECT_NO_THROW(matrix.MoveToAccelerator());
4799+ }
4800+ for(int i = 0; i < 10; i++)
4801+ {
4802+ EXPECT_NO_THROW(matrix.MoveToHost());
4803+ }
4804+ for(int i = 0; i < 10; i++)
4805+ {
4806+ EXPECT_NO_THROW(matrix.MoveToHost());
4807+ EXPECT_NO_THROW(matrix.MoveToAccelerator());
4808+ }
4809+ EXPECT_EQ(matrix.Check(), true);
4810+}
4811+
4812+template <typename T>
4813+void testing_local_move_to_accelerator_async()
4814+{
4815+ auto matrix = getTestMatrix<T>();
4816+ // MoveToAcceleratorAsync should move the matrix asynchronously
4817+ for(int i = 0; i < 10; i++)
4818+ {
4819+ EXPECT_NO_THROW(matrix.MoveToAcceleratorAsync());
4820+ }
4821+ for(int i = 0; i < 10; i++)
4822+ {
4823+ EXPECT_NO_THROW(matrix.MoveToHostAsync());
4824+ }
4825+ for(int i = 0; i < 10; i++)
4826+ {
4827+ EXPECT_NO_THROW(matrix.MoveToHostAsync());
4828+ EXPECT_NO_THROW(matrix.MoveToAcceleratorAsync());
4829+ }
4830+ EXPECT_NO_THROW(matrix.Sync());
4831+ EXPECT_EQ(matrix.Check(), true);
4832+}
4833+
4834+template <typename T>
4835+void testing_local_move_to_host_async()
4836+{
4837+ auto matrix = getTestMatrix<T>();
4838+ // MoveToHostAsync should move the matrix asynchronously to host
4839+ EXPECT_NO_THROW(matrix.MoveToHostAsync());
4840+ EXPECT_NO_THROW(matrix.Sync());
4841+ EXPECT_EQ(matrix.GetM(), 2);
4842+ EXPECT_EQ(matrix.GetN(), 2);
4843+}
4844+
4845+template <typename T>
4846+void testing_local_clear(Arguments argus)
4847+{
4848+ LocalMatrix<T> matrix;
4849+ getTestMatrix<T>(argus, matrix);
4850+
4851+ // Clear should remove all data from the matrix
4852+ // This test checks if the Clear operation is valid
4853+ // by checking if the number of non-zero entries (nnz) is zero
4854+ // and the dimensions (m, n) are also zero.
4855+
4856+ matrix.Clear();
4857+ EXPECT_EQ(matrix.GetNnz(), 0);
4858+ EXPECT_EQ(matrix.GetM(), 0);
4859+ EXPECT_EQ(matrix.GetN(), 0);
4860+}
4861+
4862+template <typename T>
4863+void testing_local_zeros(Arguments argus)
4864+{
4865+ LocalMatrix<T> matrix;
4866+ getTestMatrix<T>(argus, matrix);
4867+
4868+ // Zeros should set all values in the matrix to zero
4869+ // This test checks if the Zeros operation is valid
4870+ // by checking if all values in the matrix are zero after the operation.
4871+
4872+ matrix.Zeros();
4873+ matrix.Check();
4874+ matrix.Info();
4875+
4876+ int64_t nnz = matrix.GetNnz();
4877+ T* check_values = new T[nnz];
4878+ getMatrixVal(matrix, check_values);
4879+ for(int64_t i = 0; i < nnz; ++i)
4880+ {
4881+ EXPECT_EQ(check_values[i], static_cast<T>(0));
4882+ }
4883+ delete[] check_values;
4884+}
4885+
4886+template <typename T>
4887+void testing_local_copy(Arguments argus)
4888+{
4889+ LocalMatrix<T> matrix;
4890+ getTestMatrix<T>(argus, matrix);
4891+
4892+ // CopyFrom should create a copy of the matrix
4893+ // This test checks if the copy operation is valid
4894+ // by comparing the original matrix with the copied matrix.
4895+
4896+ LocalMatrix<T> copy_matrix;
4897+ copy_matrix.CopyFrom(matrix);
4898+ EXPECT_EQ(copy_matrix.GetNnz(), matrix.GetNnz());
4899+ EXPECT_EQ(copy_matrix.GetM(), matrix.GetM());
4900+ EXPECT_EQ(copy_matrix.GetN(), matrix.GetN());
4901+
4902+ // Compare dense representations
4903+ auto dense_orig = extract_dense_matrix(matrix);
4904+ auto dense_copy = extract_dense_matrix(copy_matrix);
4905+
4906+ EXPECT_EQ(dense_orig.size(), dense_copy.size());
4907+ for(size_t i = 0; i < dense_orig.size(); ++i)
4908+ {
4909+ EXPECT_EQ(dense_orig[i].size(), dense_copy[i].size());
4910+ for(size_t j = 0; j < dense_orig[i].size(); ++j)
4911+ {
4912+ EXPECT_EQ(dense_orig[i][j], dense_copy[i][j]);
4913+ }
4914+ }
4915+}
4916+
4917+template <typename T>
4918+void testing_local_scale(Arguments argus)
4919+{
4920+ LocalMatrix<T> matrix;
4921+ getTestMatrix<T>(argus, matrix);
4922+
4923+ // Scale should multiply all values in the matrix by a scalar
4924+ // This test checks if the scaling operation is valid
4925+ // by comparing the scaled values with the expected values.
4926+ // The expected values are obtained by multiplying the original values
4927+ // by the scaling factor.
4928+
4929+ // Save original dense matrix
4930+ auto orig_dense = extract_dense_matrix(matrix);
4931+
4932+ // Scale the matrix by 2.0
4933+ matrix.Scale(2.0);
4934+
4935+ // Extract new dense matrix
4936+ auto new_dense = extract_dense_matrix(matrix);
4937+
4938+ // Compare each value
4939+ int m = matrix.GetM();
4940+ int n = matrix.GetN();
4941+ for(int i = 0; i < m; ++i)
4942+ for(int j = 0; j < n; ++j)
4943+ EXPECT_EQ(new_dense[i][j], orig_dense[i][j] * 2.0);
4944+}
4945+
4946+template <typename T>
4947+void testing_local_extract_diagonal(Arguments argus)
4948+{
4949+ if(argus.matrix_type != "Laplacian2D")
4950+ {
4951+ return;
4952+ }
4953+
4954+ LocalMatrix<T> matrix;
4955+ getTestMatrix<T>(argus, matrix);
4956+
4957+ // ExtractDiagonal should extract the diagonal of the matrix
4958+ // and store it in a LocalVector
4959+ // This test checks if the diagonal extraction is correct
4960+ // by comparing the extracted diagonal with the expected values.
4961+ // The expected values are obtained by iterating through the matrix
4962+ // and checking the diagonal elements.
4963+
4964+ LocalVector<T> diag;
4965+ matrix.ExtractDiagonal(&diag);
4966+
4967+ int64_t m = matrix.GetM();
4968+ T* check_values = new T[m];
4969+ getMatrixDiagVal(matrix, check_values);
4970+
4971+ EXPECT_EQ(diag.GetSize(), m);
4972+ for(int i = 0; i < m; ++i)
4973+ {
4974+ EXPECT_EQ(diag[i], check_values[i]);
4975+ }
4976+ delete[] check_values;
4977+}
4978+
4979+template <typename T>
4980+void testing_local_extract_inverse_diagonal(Arguments argus)
4981+{
4982+ if(argus.matrix_type != "Laplacian2D")
4983+ {
4984+ return;
4985+ }
4986+
4987+ LocalMatrix<T> matrix;
4988+ getTestMatrix<T>(argus, matrix);
4989+
4990+ // ExtractInverseDiagonal should extract the inverse diagonal of the matrix
4991+ // and store it in a LocalVector
4992+ // This test checks if the inverse diagonal extraction is correct
4993+ // by comparing the extracted inverse diagonal with the expected values.
4994+ // The expected values are obtained by taking the reciprocal of the diagonal elements.
4995+
4996+ LocalVector<T> diag;
4997+ matrix.ExtractDiagonal(&diag);
4998+ LocalVector<T> inv_diag;
4999+ matrix.ExtractInverseDiagonal(&inv_diag);
5000+ EXPECT_EQ(inv_diag.GetSize(), diag.GetSize());
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches