Merge ~bullwinkle-team/ubuntu/+source/rocalution:bullwinkle/llvm-21/ubuntu/devel into ubuntu/+source/rocalution:ubuntu/devel
- Git
- lp:~bullwinkle-team/ubuntu/+source/rocalution
- bullwinkle/llvm-21/ubuntu/devel
- Merge into ubuntu/devel
| Status: | Merged | ||||
|---|---|---|---|---|---|
| Approved by: | Andreas Hasenack | ||||
| Approved revision: | 6e4d78c86cfc6891e351f45f3b378b525d77c661 | ||||
| Merged at revision: | 6e4d78c86cfc6891e351f45f3b378b525d77c661 | ||||
| Proposed branch: | ~bullwinkle-team/ubuntu/+source/rocalution:bullwinkle/llvm-21/ubuntu/devel | ||||
| Merge into: | ubuntu/+source/rocalution:ubuntu/devel | ||||
| Diff against target: |
107187 lines (+64996/-17596) 296 files modified
.azuredevops/rocm-ci.yml (+44/-0) .githooks/pre-commit (+1/-1) .github/CODEOWNERS (+6/-1) .github/CONTRIBUTING.md (+160/-0) .github/dependabot.yml (+8/-1) .gitignore (+0/-11) .jenkins/codecov.groovy (+4/-6) .jenkins/common.groovy (+18/-12) .jenkins/debug.groovy (+1/-1) .jenkins/extended.groovy (+1/-1) .jenkins/precheckin.groovy (+1/-1) .jenkins/staticanalysis.groovy (+21/-3) .jenkins/staticlibrary.groovy (+3/-3) .readthedocs.yaml (+7/-3) CHANGELOG.md (+272/-108) CMakeLists.txt (+67/-28) LICENSE.md (+1/-1) README.md (+82/-71) clients/benchmarks/CMakeLists.txt (+3/-2) clients/benchmarks/rocalution_arguments_config.cpp (+53/-1) clients/benchmarks/rocalution_bench_solver_parameters.cpp (+5/-1) clients/benchmarks/rocalution_bench_solver_parameters.hpp (+18/-1) clients/benchmarks/rocalution_driver_itsolver.hpp (+82/-13) clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp (+2/-2) clients/benchmarks/rocalution_enum_itilu0_alg.cpp (+99/-0) clients/benchmarks/rocalution_enum_itilu0_alg.hpp (+92/-0) clients/benchmarks/rocalution_enum_preconditioner.hpp (+2/-1) clients/include/common.hpp (+875/-31) clients/include/testing_bicgstab.hpp (+24/-10) clients/include/testing_bicgstabl.hpp (+18/-10) clients/include/testing_cg.hpp (+17/-9) clients/include/testing_chebyshev.hpp (+218/-0) clients/include/testing_cr.hpp (+17/-9) clients/include/testing_fcg.hpp (+17/-9) clients/include/testing_fgmres.hpp (+18/-10) clients/include/testing_global_matrix.hpp (+7/-11) clients/include/testing_gmres.hpp (+19/-11) clients/include/testing_idr.hpp (+19/-11) clients/include/testing_inversion.hpp (+32/-13) clients/include/testing_itsolver.hpp (+165/-0) clients/include/testing_local_matrix.hpp (+2356/-28) clients/include/testing_local_matrix_itsolve.hpp (+473/-0) clients/include/testing_local_matrix_multicoloring.hpp (+128/-0) clients/include/testing_local_matrix_solve.hpp (+461/-0) clients/include/testing_local_vector.hpp (+1323/-15) clients/include/testing_lu.hpp (+31/-13) 
clients/include/testing_mixed_precision.hpp (+135/-0) clients/include/testing_pairwise_amg.hpp (+282/-4) clients/include/testing_preconditioner.hpp (+378/-0) clients/include/testing_qmrcgstab.hpp (+17/-9) clients/include/testing_qr.hpp (+31/-13) clients/include/testing_ruge_stueben_amg.hpp (+35/-18) clients/include/testing_saamg.hpp (+16/-33) clients/include/testing_uaamg.hpp (+15/-9) clients/include/utility.hpp (+65/-20) clients/include/validate.hpp (+108/-0) clients/samples/CMakeLists.txt (+10/-2) clients/samples/cg-rsamg.cpp (+18/-10) clients/samples/cg-rsamg_mpi.cpp (+179/-0) clients/samples/cg-saamg_mpi.cpp (+177/-0) clients/samples/cg-uaamg_mpi.cpp (+176/-0) clients/samples/itsolve.cpp (+161/-0) clients/samples/laplace_2d_weak_scaling.cpp (+163/-0) clients/samples/laplace_3d_weak_scaling.cpp (+164/-0) clients/tests/CMakeLists.txt (+9/-1) clients/tests/test_backend.cpp (+18/-2) clients/tests/test_bicgstab.cpp (+65/-6) clients/tests/test_bicgstabl.cpp (+69/-8) clients/tests/test_cg.cpp (+63/-6) clients/tests/test_chebyshev.cpp (+99/-0) clients/tests/test_cr.cpp (+65/-6) clients/tests/test_fcg.cpp (+64/-6) clients/tests/test_fgmres.cpp (+69/-7) clients/tests/test_global_matrix.cpp (+8/-1) clients/tests/test_global_vector.cpp (+8/-1) clients/tests/test_gmres.cpp (+85/-12) clients/tests/test_idr.cpp (+68/-7) clients/tests/test_inversion.cpp (+71/-6) clients/tests/test_itersolver.cpp (+93/-0) clients/tests/test_local_matrix.cpp (+388/-3) clients/tests/test_local_matrix_itsolve.cpp (+193/-0) clients/tests/test_local_matrix_multicoloring.cpp (+79/-0) clients/tests/test_local_matrix_solve.cpp (+197/-0) clients/tests/test_local_stencil.cpp (+8/-1) clients/tests/test_local_vector.cpp (+374/-1) clients/tests/test_lu.cpp (+70/-6) clients/tests/test_mixed_precision.cpp (+91/-0) clients/tests/test_pairwise_amg.cpp (+108/-10) clients/tests/test_parallel_manager.cpp (+8/-1) clients/tests/test_preconditioner.cpp (+114/-0) clients/tests/test_qmrcgstab.cpp (+64/-6) 
clients/tests/test_qr.cpp (+69/-6) clients/tests/test_ruge_stueben_amg.cpp (+105/-19) clients/tests/test_saamg.cpp (+99/-13) clients/tests/test_uaamg.cpp (+99/-13) cmake/Dependencies.cmake (+20/-29) debian/changelog (+32/-0) debian/control (+14/-10) debian/librocalution-doc.doc-base (+0/-2) debian/librocalution-doc.docs (+1/-1) debian/librocalution1-tests.install (+1/-0) debian/patches/series (+0/-5) debian/rules (+38/-4) debian/shlibs (+1/-1) debian/tests/control (+2/-2) dev/null (+0/-20) docs/.gitignore (+5/-0) docs/conceptual/rocALUTION-design.rst (+35/-0) docs/conceptual/rocALUTION-organization.rst (+130/-0) docs/conf.py (+22/-2) docs/doxygen/Doxyfile (+14/-44) docs/doxygen/bibliography.bib (+207/-0) docs/how-to/extending-rocALUTION.rst (+90/-74) docs/how-to/include-rocALUTION.rst (+31/-0) docs/index.rst (+58/-10) docs/install/rocALUTION-general-install.rst (+14/-0) docs/install/rocALUTION-linux-build-and-install.rst (+58/-0) docs/install/rocALUTION-prerequisites.rst (+27/-0) docs/install/rocALUTION-windows-build-and-install.rst (+59/-0) docs/license.rst (+4/-0) docs/reference/rocALUTION-accelerators.rst (+97/-0) docs/reference/rocALUTION-api-basics.rst (+208/-0) docs/reference/rocALUTION-api.rst (+8/-8) docs/reference/rocALUTION-functionality-table.rst (+16/-6) docs/reference/rocALUTION-library-notes.rst (+12/-4) docs/reference/rocALUTION-multi-node-comp.rst (+39/-28) docs/reference/rocALUTION-preconditioners.rst (+64/-27) docs/reference/rocALUTION-single-node-comp.rst (+161/-103) docs/reference/rocALUTION-solvers.rst (+60/-30) docs/sphinx/_toc.yml.in (+48/-0) docs/sphinx/requirements.in (+1/-0) docs/sphinx/requirements.txt (+277/-0) docs/tutorials/rocALUTION-client-examples.rst (+65/-0) docs/what-is-rocalution.rst (+15/-26) install.sh (+38/-42) rmake.py (+22/-9) rtest.py (+27/-6) rtest.xml (+9/-0) src/CMakeLists.txt (+103/-48) src/base/backend_manager.cpp (+91/-16) src/base/backend_manager.hpp (+92/-21) src/base/base_matrix.cpp (+597/-55) 
src/base/base_matrix.hpp (+511/-199) src/base/base_rocalution.cpp (+18/-3) src/base/base_rocalution.hpp (+2/-1) src/base/base_stencil.hpp (+17/-17) src/base/base_vector.cpp (+27/-4) src/base/base_vector.hpp (+88/-74) src/base/global_matrix.cpp (+3195/-554) src/base/global_matrix.hpp (+161/-59) src/base/global_vector.cpp (+57/-35) src/base/global_vector.hpp (+66/-20) src/base/hip/CMakeLists.txt (+2/-1) src/base/hip/backend_hip.cpp (+207/-73) src/base/hip/backend_hip.hpp (+26/-8) src/base/hip/hip_allocate_free.cpp (+297/-75) src/base/hip/hip_allocate_free.hpp (+26/-5) src/base/hip/hip_blas.cpp (+176/-12) src/base/hip/hip_blas.hpp (+25/-3) src/base/hip/hip_conversion.cpp (+571/-583) src/base/hip/hip_conversion.hpp (+87/-89) src/base/hip/hip_kernels_conversion.hpp (+56/-55) src/base/hip/hip_kernels_csr.hpp (+2369/-1362) src/base/hip/hip_kernels_general.hpp (+3/-3) src/base/hip/hip_kernels_rsamg_csr.hpp (+2121/-0) src/base/hip/hip_kernels_vector.hpp (+62/-15) src/base/hip/hip_matrix_bcsr.cpp (+183/-220) src/base/hip/hip_matrix_bcsr.hpp (+4/-10) src/base/hip/hip_matrix_coo.cpp (+155/-281) src/base/hip/hip_matrix_coo.hpp (+3/-3) src/base/hip/hip_matrix_csr.cpp (+7502/-2314) src/base/hip/hip_matrix_csr.hpp (+301/-22) src/base/hip/hip_matrix_dense.cpp (+74/-137) src/base/hip/hip_matrix_dia.cpp (+114/-185) src/base/hip/hip_matrix_dia.hpp (+4/-4) src/base/hip/hip_matrix_ell.cpp (+87/-153) src/base/hip/hip_matrix_ell.hpp (+3/-3) src/base/hip/hip_matrix_hyb.cpp (+176/-358) src/base/hip/hip_matrix_hyb.hpp (+7/-6) src/base/hip/hip_matrix_mcsr.cpp (+359/-447) src/base/hip/hip_matrix_mcsr.hpp (+3/-3) src/base/hip/hip_rand.hpp (+1/-33) src/base/hip/hip_rand_normal.hpp (+2/-1) src/base/hip/hip_rand_uniform.hpp (+14/-17) src/base/hip/hip_rsamg_csr.cpp (+1422/-0) src/base/hip/hip_sparse.cpp (+742/-1) src/base/hip/hip_sparse.hpp (+101/-1) src/base/hip/hip_unordered_map.hpp (+8/-4) src/base/hip/hip_unordered_set.hpp (+4/-2) src/base/hip/hip_utils.hpp (+28/-124) 
src/base/hip/hip_vector.cpp (+571/-504) src/base/hip/hip_vector.hpp (+30/-22) src/base/host/CMakeLists.txt (+3/-1) src/base/host/host_affinity.cpp (+3/-1) src/base/host/host_conversion.cpp (+700/-702) src/base/host/host_conversion.hpp (+122/-122) src/base/host/host_ilut_driver_csr.cpp (+386/-0) src/base/host/host_ilut_driver_csr.hpp (+163/-0) src/base/host/host_io.cpp (+3910/-94) src/base/host/host_io.hpp (+163/-17) src/base/host/host_matrix_bcsr.cpp (+120/-78) src/base/host/host_matrix_bcsr.hpp (+6/-9) src/base/host/host_matrix_coo.cpp (+110/-150) src/base/host/host_matrix_coo.hpp (+6/-3) src/base/host/host_matrix_csr.cpp (+8640/-4208) src/base/host/host_matrix_csr.hpp (+310/-27) src/base/host/host_matrix_dense.cpp (+74/-57) src/base/host/host_matrix_dense.hpp (+4/-1) src/base/host/host_matrix_dia.cpp (+87/-63) src/base/host/host_matrix_dia.hpp (+7/-4) src/base/host/host_matrix_ell.cpp (+82/-67) src/base/host/host_matrix_ell.hpp (+6/-3) src/base/host/host_matrix_hyb.cpp (+129/-99) src/base/host/host_matrix_hyb.hpp (+10/-6) src/base/host/host_matrix_mcsr.cpp (+91/-72) src/base/host/host_matrix_mcsr.hpp (+6/-3) src/base/host/host_sparse.cpp (+563/-0) src/base/host/host_sparse.hpp (+92/-0) src/base/host/host_stencil_laplace2d.cpp (+4/-2) src/base/host/host_stencil_laplace2d.hpp (+2/-2) src/base/host/host_vector.cpp (+446/-209) src/base/host/host_vector.hpp (+33/-25) src/base/local_matrix.cpp (+2849/-931) src/base/local_matrix.hpp (+342/-91) src/base/local_stencil.cpp (+9/-5) src/base/local_stencil.hpp (+16/-6) src/base/local_vector.cpp (+146/-55) src/base/local_vector.hpp (+348/-40) src/base/matrix_formats.hpp (+5/-4) src/base/matrix_formats_ind.hpp (+3/-3) src/base/operator.cpp (+20/-10) src/base/operator.hpp (+14/-11) src/base/parallel_manager.cpp (+929/-93) src/base/parallel_manager.hpp (+130/-51) src/base/vector.cpp (+64/-27) src/base/vector.hpp (+28/-20) src/solvers/chebyshev.cpp (+3/-1) src/solvers/direct/inversion.cpp (+3/-1) src/solvers/direct/lu.cpp (+9/-6) 
src/solvers/direct/qr.cpp (+3/-1) src/solvers/iter_ctrl.cpp (+7/-3) src/solvers/iter_ctrl.hpp (+5/-4) src/solvers/krylov/bicgstab.cpp (+3/-1) src/solvers/krylov/bicgstabl.cpp (+3/-1) src/solvers/krylov/cg.cpp (+3/-3) src/solvers/krylov/cr.cpp (+3/-3) src/solvers/krylov/fcg.cpp (+3/-3) src/solvers/krylov/fgmres.cpp (+3/-4) src/solvers/krylov/gmres.cpp (+3/-3) src/solvers/krylov/idr.cpp (+7/-5) src/solvers/krylov/qmrcgstab.cpp (+3/-1) src/solvers/mixed_precision.cpp (+8/-4) src/solvers/multigrid/base_amg.cpp (+64/-48) src/solvers/multigrid/base_amg.hpp (+6/-9) src/solvers/multigrid/base_multigrid.cpp (+13/-25) src/solvers/multigrid/base_multigrid.hpp (+8/-6) src/solvers/multigrid/multigrid.cpp (+3/-3) src/solvers/multigrid/pairwise_amg.cpp (+15/-21) src/solvers/multigrid/pairwise_amg.hpp (+22/-7) src/solvers/multigrid/ruge_stueben_amg.cpp (+59/-82) src/solvers/multigrid/ruge_stueben_amg.hpp (+6/-27) src/solvers/multigrid/smoothed_amg.cpp (+72/-72) src/solvers/multigrid/smoothed_amg.hpp (+6/-7) src/solvers/multigrid/unsmoothed_amg.cpp (+51/-63) src/solvers/multigrid/unsmoothed_amg.hpp (+6/-7) src/solvers/preconditioners/preconditioner.cpp (+256/-30) src/solvers/preconditioners/preconditioner.hpp (+96/-1) src/solvers/preconditioners/preconditioner_ai.cpp (+9/-1) src/solvers/preconditioners/preconditioner_as.cpp (+12/-5) src/solvers/preconditioners/preconditioner_blockjacobi.cpp (+3/-1) src/solvers/preconditioners/preconditioner_blockprecond.cpp (+3/-1) src/solvers/preconditioners/preconditioner_multicolored_gs.cpp (+5/-1) src/solvers/preconditioners/preconditioner_multicolored_ilu.cpp (+3/-1) src/solvers/preconditioners/preconditioner_multicolored_ilu.hpp (+2/-2) src/solvers/preconditioners/preconditioner_multielimination.cpp (+3/-1) src/solvers/preconditioners/preconditioner_multielimination.hpp (+11/-5) src/solvers/preconditioners/preconditioner_saddlepoint.cpp (+6/-2) src/solvers/solver.cpp (+110/-3) src/solvers/solver.hpp (+143/-11) src/utils/CMakeLists.txt (+2/-2) 
src/utils/allocate_free.cpp (+138/-45) src/utils/allocate_free.hpp (+57/-6) src/utils/communicator.cpp (+296/-165) src/utils/communicator.hpp (+26/-6) src/utils/def.hpp (+0/-2) src/utils/log.cpp (+5/-1) src/utils/log.hpp (+3/-1) src/utils/math_functions.cpp (+5/-1) src/utils/math_functions.hpp (+18/-15) src/utils/rocsparseio.cpp (+1456/-0) src/utils/rocsparseio.h (+1033/-0) src/utils/rocsparseio.hpp (+2351/-0) src/utils/type_traits.hpp (+39/-6) src/utils/types.hpp.in (+6/-14) toolchain-linux.cmake (+11/-6) toolchain-windows.cmake (+5/-1) |
||||
| Related bugs: |
|
| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Andreas Hasenack | Approve | ||
| Ubuntu Sponsors | Pending | ||
|
Review via email:
|
|||
Commit message
Description of the change
Update to new upstream version 7.1.0
Link for build reference: https:/
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
| Talha Can Havadar (tchavadar) wrote : | # |
Hello Bruno, I see we have segfaults for arm64 in the ppa you shared, do you happen to know the reason?
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
Unfortunately, I do not know why this happens. However, the test that's causing the segmentation fault is the same one that failed whenever I mistakenly tried to build this package using clang instead of the default upstream compiler.
I have asked Cory on matrix regarding this topic, let's see what he has to say.
| Talha Can Havadar (tchavadar) wrote : | # |
Can you point me to the source code where it fails? Is there an obvious null pointer dereference somewhere?
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
The error occurs on this test function: https:/
This snippet then calls the testing_
The method invoked by the custom vector can be seen here: https:/
I honestly don't know exactly how different compilers treat c++ template usage, but I would guess that's where the error might be coming from between the two platforms.
Besides that I don't see any obvious errors in that function.
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
Also, I have to say I'm by no means a C++ expert, so there might in fact still be some intricate details in the above implementations that I'm missing.
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
Talha, just as an update, I got access to the hinyari machine, and even though I had to skip the symbols file verification using the -c0 flag, the package builds and runs the tests without breaking, even the test that's breaking on launchpad.
| Talha Can Havadar (tchavadar) wrote : | # |
OK, that is suspicious. Can you check what the ARM revision is in hinyari? Maybe it is different from the builders we have in Launchpad. Please also check the extensions supported by the ARM CPU in hinyari, so we can ask the Launchpad team.
| Talha Can Havadar (tchavadar) wrote : | # |
It is strange, but maybe some vector instructions are not available on the builders we have in LP; best to be sure.
| Talha Can Havadar (tchavadar) wrote : | # |
That said, I think we need more information about the failure. I see some log functions called on entry to each function, but I can't see their output anywhere in the build log; maybe there is a way to enable them so we can see at which step it fails.
| Andreas Hasenack (ahasenack) wrote : | # |
Don't delete the previous changelog entry from resolute:
diff --git a/debian/changelog b/debian/changelog
index 2a0b776..00fd507 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,8 +1,32 @@
-rocalution (5.7.1-3build1) resolute; urgency=medium
-
- * No-change mass rebuild for Ubuntu 26.04 (LP: #2132257)
-
- -- Sebastien Bacher <email address hidden> Thu, 29 Jan 2026 21:42:09 +0100
+rocalution (7.1.0-0ubuntu2) resolute; urgency=medium
+
+ * d/rules: skip failing local_vector_
+
+ -- Bruno Bernardo de Moura <email address hidden> Tue, 03 Feb 2026 16:40:38 -0300
+
+rocalution (7.1.0-0ubuntu1) resolute; urgency=medium
+
+ * New upstream release 7.1.0.
+ * d/p/series: drop outdated and unused patches for 7.1.0
+ - drop d/p/spelling.patch
+ - drop d/p/use-
+ - drop d/p/docs-
+ - drop d/p/use-
+ - drop d/p/fix-
+ * d/control: update standards version to 4.7.2
+ * d/control: update maintainer field for ubuntu archive
+ * d/control: update build-depends for rocm stack
+ * d/control: update SOVERSION (0 -> 1) after API changes
+ * d/rules: fix docs build step with rocm-docs-build
+ * d/rules: fix lintian warnings and improve docs cleanup
+ * d/{librocalutio
+ fix doc generation paths
+ * d/shlibs: drop auto generated file
+ * d/{t/control,
+ update test files SOVERSION (0 -> 1)
+ * d/librocalution
+
+ -- Bruno Bernardo de Moura <email address hidden> Wed, 10 Dec 2025 14:27:06 -0300
rocalution (5.7.1-3) unstable; urgency=medium
| Andreas Hasenack (ahasenack) wrote : | # |
+ * d/shlibs: drop auto generated file
...
diff --git a/debian/
new file mode 100644
index 0000000..2f69eb9
--- /dev/null
+++ b/debian/
@@ -0,0 +1,12060 @@
+librocalution.so.1 librocalution1 #MINVER#
+* Build-Depends-
+ _ZN10rocalution
+ _ZN10rocalution
+ _ZN10rocalution
...
These are c++ symbols, and should be demangled with c++filt if you want to go down this route. Note that c++ symbols are notoriously difficult to manage, and that's why the d/shlibs file was there in the first place. Are you sure you want to make these changes?
| Andreas Hasenack (ahasenack) wrote : | # |
d/rules:
@@ -17,16 +17,24 @@ override_
override_
ifeq (,$(filter nocheck,
- obj-$(DEB_
+ # As described in (https:/
+ # `local_
+ # ARM systems. It consistently triggers a SIGSEGV in the extract_
+ # function, even though the same test runs successfully on AMD64 platforms.
+ # Therefore, it is filtered here to enable successful package builds.
+ obj-$(DEB_
+ --gtest_
A test is supposedly correctly showing a problem, and you are covering it up. Just saying that the test passes on amd64 is not a reason: architectures are indeed different, and just because it passes on one architecture doesn't immediately mean it's OK for it to fail on another.
I see you filed a bug upstream, and there is no response yet. I think this needs a bit more digging, or I didn't quite understand why it's ok to build and release the package on arm64 with a segfault in one of the tests.
| Igor Luppi (igorluppi) wrote : | # |
Oh, after years someone rebuilt v5 of rocalution just this week, hehe (https:/
It was not a deletion, just a merge conflict because ubuntu/devel diverged after the MP was opened. What bad luck!
| Igor Luppi (igorluppi) wrote : | # |
Rebased, now it should be fine :)
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
@Andreas, thanks for the comments!
Now addressing each of them:
1 - The changelog skip was due to the merge conflict with the rebuild of v5, so it's now already fixed
2- Related to the symbols file: you are absolutely correct, I simply added it due to mistakenly deleting the d/shlibs file after not updating the SONAME version of the package on it.
I've just tested locally, and by simply dropping the symbols file and updating the SONAME version in d/shlibs, lintian gives no warnings related to symbols. This way, the symbols file will be removed and d/shlibs updated accordingly.
3- Finally, related to the crashing test on ARM: the AMD team agreed to skip that test due to it not being a mandatory package for the stack itself.
Also, if needed, instead of being a filter on the debian/rules file, the skipping of the test could also be performed as a patch that applies to its file and only runs that specific crashing function in case the architecture is x86 (as is currently performed in hipblaslt, for example). Although this other approach would at least guarantee that the test is passing on x86, the intent of it would be practically the same.
| Andreas Hasenack (ahasenack) wrote : | # |
3) Yes, skipping the test only on arm is less bad for now. But please change the comments, they make it sound like the test is invalid because it passes on amd64. Different architectures behave differently.
| Andreas Hasenack (ahasenack) wrote : | # |
1) ok
2) waiting for the change
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
@Andreas, back to the topics:
2 - Change done
3 - Transformed it to a patch and description enhanced as dep3 header
| Andreas Hasenack (ahasenack) wrote : | # |
--- rocalution.
+++ rocalution/
@@ -435,6 +435,7 @@ GENERATE_
+#if defined(__x86_64__) || defined(__i386__)
// Test for LocalVector:
TEST_F(
{
@@ -444,6 +445,7 @@ TEST_F(
testing_
stop_
}
+#endif
GENERATE_
I was expecting excluding the test via d/rules, by checking the arch there and calling the test command with or without the gtest_filter argument.
A patch works too, of course, so this part I don't mind that much, keeping in mind the patch might not apply anymore if the surrounding code changes.
But instead of excluding the arch where the test fails, you are listing two arches where it should run (and we don't even build for i386). At the same time we have "Arch any" in d/control, so the above patch will exclude the test in ppc64el, s390x, armhf, arm64, riscv64, thus reducing coverage. Or was the test also not passing there?
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
@Andreas,
Updated the rules file to skip that test on arm64 instead of applying a patch.
Additionally, the builds can be seen on this ppa: https:/
| Andreas Hasenack (ahasenack) wrote : | # |
I see armhf failed to build, but that is already the case with the current package in the archive[1], so no regression.
+# On ARM64, the test local_vector_
+# segmentation fault, triggering a SIGSEGV in the extract_
+# function. Since different compilers handle memory allocation differently
+# from one another, the test is being skipped to allow for successful builds
+# on those ARM platforms.
+ifeq ($(DEB_
+ GTEST_FILTER_VAL := *:-local_
+endif
Please include the link you had before to the upstream issue about this: https:/
1. https:/
| Andreas Hasenack (ahasenack) wrote : | # |
This looks wrong:
-override_
+execute_
+execute_
+execute_
ifeq (,$(filter nodoc,$
- cd docs/.doxygen; doxygen
- sphinx-build -b html docs html
+ perl -pi -e 's/WARN_
+ rocm-docs-build
endif
End result in d/rules:
execute_
execute_
execute_
ifeq (,$(filter nodoc,$
perl -pi -e 's/WARN_
rocm-docs-build
endif
Perhaps take *inspiration* on the rocrand d/rules section. Other rocm packages also had to address this:
# python3-rocm-docs can only build offline with the help of some envvars
override_
ifeq (,$(filter nodoc,$
http_
https_
rocm-docs-build -r $(shell dpkg-parsechangelog -S Version | sed 's/[+-].*//')
rm -rf build/html/
rm -rf build/html/
endif
Note the "\" continuation marks at the end of each line, and how the proxy variable is being set in the same shell command. Remember that each line is a shell invocation on its own, so if you set a variable in one line, without the "\" continuation, it won't exist in the next line, which is a new shell invocation.
| Bruno Bernardo de Moura (bruno-bdmoura) wrote : | # |
@Andreas, thanks for the explanation!
The `execute_
| Andreas Hasenack (ahasenack) wrote : | # |
Sponsored:
Uploading rocalution_
Uploading rocalution_
Uploading rocalution_
Uploading rocalution_
Uploading rocalution_
This has binary NEW packages, due to the soname change, so it will require an archive admin approval before the binaries land in resolute-proposed.
Preview Diff
| 1 | diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml |
| 2 | new file mode 100644 |
| 3 | index 0000000..b615df9 |
| 4 | --- /dev/null |
| 5 | +++ b/.azuredevops/rocm-ci.yml |
| 6 | @@ -0,0 +1,44 @@ |
| 7 | +resources: |
| 8 | + repositories: |
| 9 | + - repository: pipelines_repo |
| 10 | + type: github |
| 11 | + endpoint: ROCm |
| 12 | + name: ROCm/ROCm |
| 13 | + |
| 14 | +variables: |
| 15 | +- group: common |
| 16 | +- template: /.azuredevops/variables-global.yml@pipelines_repo |
| 17 | + |
| 18 | +trigger: |
| 19 | + batch: true |
| 20 | + branches: |
| 21 | + include: |
| 22 | + - develop |
| 23 | + - mainline |
| 24 | + paths: |
| 25 | + exclude: |
| 26 | + - .githooks |
| 27 | + - .github |
| 28 | + - .jenkins |
| 29 | + - docs |
| 30 | + - '.*.y*ml' |
| 31 | + - '*.md' |
| 32 | + |
| 33 | +pr: |
| 34 | + autoCancel: true |
| 35 | + branches: |
| 36 | + include: |
| 37 | + - develop |
| 38 | + - mainline |
| 39 | + paths: |
| 40 | + exclude: |
| 41 | + - .githooks |
| 42 | + - .github |
| 43 | + - .jenkins |
| 44 | + - docs |
| 45 | + - '.*.y*ml' |
| 46 | + - '*.md' |
| 47 | + drafts: false |
| 48 | + |
| 49 | +jobs: |
| 50 | + - template: ${{ variables.CI_COMPONENT_PATH }}/rocALUTION.yml@pipelines_repo |
| 51 | diff --git a/.githooks/pre-commit b/.githooks/pre-commit |
| 52 | index 04dabeb..7d23219 100755 |
| 53 | --- a/.githooks/pre-commit |
| 54 | +++ b/.githooks/pre-commit |
| 55 | @@ -35,7 +35,7 @@ fi |
| 56 | for file in $files; do |
| 57 | if [[ -e $file ]]; then |
| 58 | /usr/bin/perl -pi -e 'INIT { exit 1 if !-f $ARGV[0] || -B $ARGV[0]; $year = (localtime)[5] + 1900 } |
| 59 | - s/^([*\/#[:space:]]*)Copyright\s+(?:\(C\)\s*)?(\d+)(?:\s*-\s*\d+)?/qq($1Copyright (c) $2@{[$year != $2 ? "-$year" : ""]})/ie |
| 60 | + s/^([*\/#[:space:]]*)Copyright\s+(?:\(C\)\s*)?(\d+)(?:\s*-\s*\d+)?/qq($1Copyright (C) $2@{[$year != $2 ? "-$year" : ""]})/ie |
| 61 | if $. < 10' "$file" && git add -u "$file" |
| 62 | fi |
| 63 | done |
| 64 | diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS |
| 65 | old mode 100644 |
| 66 | new mode 100755 |
| 67 | index 4b7a84c..300b638 |
| 68 | --- a/.github/CODEOWNERS |
| 69 | +++ b/.github/CODEOWNERS |
| 70 | @@ -1 +1,6 @@ |
| 71 | -* @ntrost57 @YvanMokwinski @jsandham |
| 72 | +* @ntrost57 @YvanMokwinski @jsandham @kliegeois |
| 73 | +# Documentation files |
| 74 | +docs/* @ROCm/rocm-documentation |
| 75 | +*.md @ROCm/rocm-documentation |
| 76 | +*.rst @ROCm/rocm-documentation |
| 77 | +.readthedocs.yaml @ROCm/rocm-documentation |
| 78 | diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md |
| 79 | new file mode 100644 |
| 80 | index 0000000..2ab06f0 |
| 81 | --- /dev/null |
| 82 | +++ b/.github/CONTRIBUTING.md |
| 83 | @@ -0,0 +1,160 @@ |
| 84 | +<head> |
| 85 | + <meta charset="UTF-8"> |
| 86 | + <meta name="description" content="Contributing to rocALUTION"> |
| 87 | + <meta name="keywords" content="ROCm, contributing, rocALUTION"> |
| 88 | +</head> |
| 89 | + |
| 90 | +# Contributing to rocALUTION # |
| 91 | + |
| 92 | +AMD welcomes contributions to rocALUTION from the community. Whether those contributions are bug reports, bug fixes, documentation additions, performance notes, or other improvements, we value collaboration with our users. We can build better solutions together. Please follow these details to help ensure your contributions will be successfully accepted. |
| 93 | + |
| 94 | +Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. |
| 95 | + |
| 96 | +## Issue Discussion ## |
| 97 | + |
| 98 | +Please use the GitHub Issues tab to notify us of issues. |
| 99 | + |
| 100 | +* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and |
| 101 | + comment or post to provide additional details, such as how you reproduced this issue. |
| 102 | +* If you're not sure if your issue is the same, err on the side of caution and file your issue. |
| 103 | + You can add a comment to include the issue number (and link) for the similar issue. If we evaluate |
| 104 | + your issue as being the same as the existing issue, we'll close the duplicate. |
| 105 | +* If your issue doesn't exist, use the issue template to file a new issue. |
| 106 | + * When filing an issue, be sure to provide as much information as possible, including script output so |
| 107 | + we can collect information about your configuration. This helps reduce the time required to |
| 108 | + reproduce your issue. |
| 109 | + * Check your issue regularly, as we may require additional information to successfully reproduce the |
| 110 | + issue. |
| 111 | +* You may also open an issue to ask questions to the maintainers about whether a proposed change |
| 112 | + meets the acceptance criteria, or to discuss an idea pertaining to the library. |
| 113 | + |
| 114 | +## Acceptance Criteria ## |
| 115 | + |
| 116 | +rocALUTION is a sparse linear algebra library with focus on exploring fine-grained parallelism on top of the AMD ROCm runtime and toolchains, targeting modern CPU and GPU platforms. Based on C++ and HIP, it provides a portable, generic and flexible design that allows seamless integration with other scientific software packages. |
| 117 | + |
| 118 | +In rocALUTION we are interested in contributions that: |
| 119 | +* Fix bugs, improve documentation, enhance testing, reduce complexity. |
| 120 | +* Improve the performance of existing routines. |
| 121 | +* Add missing functionality such as new multigrid solvers, iterative solvers, direct solvers, or preconditioners. |
| 122 | +* Extending new or existing functionality to work with MPI or accelerators (such as GPU devices). |
| 123 | + |
| 124 | +We encourage contributors to leverage the GitHub "Issues" tab to discuss possible additions they would like to add. |
| 125 | + |
| 126 | +### Exceptions ### |
| 127 | + |
| 128 | +rocALUTION places a heavy emphasis on being high performance. Because of this, contributions that add new routines (or that modify existing routines) must do so from the perspective that they offer high performance in relation to the hardware they are run on. Furthermore, all routines added to rocALUTION must have at a minimum a host solution as all routines must have the ability to fall back to a host solution if a GPU accelerator is not available. Because compile times, binary sizes, and general library complexity are important considerations, we reserve the right to make decisions on whether a proposed routine is too niche or specialized to be worth including. |
| 129 | + |
| 130 | +## Code Structure ## |
| 131 | + |
| 132 | +The following is the structure of the rocALUTION library in the GitHub repository. A more detailed description of the directory structure can be found in the rocALUTION [documentation](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/design/orga.html). |
| 133 | + |
| 134 | +The `src/` directory contains the library source code. This is broken up into three sub-directories: |
| 135 | +* `src/base` |
| 136 | +* `src/solvers` |
| 137 | +* `src/utils` |
| 138 | + |
| 139 | +The `src/base` directory contains source code related to rocALUTION's vector, matrix, and stencil operator types as well as classes related to parallel management. This directory is further broken up into: |
| 140 | +* `src/base/hip` Contains HIP implementations of vector, matrix, and stencil operators. |
| 141 | +* `src/base/host` Contains host implementations of vector, matrix, and stencil operators. |
| 142 | + |
| 143 | +The `src/solvers` directory contains all the source code related to direct (`src/solvers/direct`), Krylov (`src/solvers/krylov`), and multigrid (`src/solvers/multigrid`) solvers. |
| 144 | + |
| 145 | +The `src/utils` directory contains source code related to logging, memory allocation, math and timing functions. |
| 146 | + |
| 147 | +The `clients/` directory contains the testing and benchmarking code as well as all the samples demonstrating rocALUTION usage. |
| 148 | + |
| 149 | +The `docs/` directory contains all of the documentation files. |
| 150 | + |
| 151 | +## Coding Style ## |
| 152 | + |
| 153 | +In general, follow the style of the surrounding code. C and C++ code is formatted using `clang-format`. Use the clang-format version installed with ROCm (found in the `/opt/rocm/llvm/bin` directory). Please do not use your system's built-in `clang-format`, as this is a different version that may produce incorrect formatting. |
| 154 | + |
| 155 | +To format a file, use: |
| 156 | + |
| 157 | +``` |
| 158 | +/opt/rocm/llvm/bin/clang-format -style=file -i <path-to-source-file> |
| 159 | +``` |
| 160 | + |
| 161 | +To format all files, run the following script in the rocALUTION directory: |
| 162 | + |
| 163 | +``` |
| 164 | +#!/bin/bash |
| 165 | +git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i |
| 166 | +``` |
| 167 | + |
| 168 | +Also, githooks can be installed to format the code per-commit: |
| 169 | + |
| 170 | +``` |
| 171 | +./.githooks/install |
| 172 | +``` |
| 173 | + |
| 174 | +## Pull Request Guidelines ## |
| 175 | + |
| 176 | +When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch, which serves as our integration branch. |
| 177 | + |
| 178 | +By submitting a pull request, you acknowledge and agree with the CLA below: |
| 179 | + |
| 180 | +Contribution License Agreement |
| 181 | +1. The code I am contributing is mine, and I have the right to license it. |
| 182 | +2. By submitting a pull request for this project I am granting you a license to distribute said code under the MIT License for the project. |
| 183 | + |
| 184 | +### Deliverables ### |
| 185 | + |
| 186 | +When raising a PR in rocALUTION here are some important things to include: |
| 187 | + |
| 188 | +1. For each new file in the repository, please include the licensing header |
| 189 | +``` |
| 190 | +/* ************************************************************************ |
| 191 | +* Copyright (C) 20xx Advanced Micro Devices, Inc. All rights Reserved. |
| 192 | +* |
| 193 | +* Permission is hereby granted, free of charge, to any person obtaining a copy |
| 194 | +* of this software and associated documentation files (the "Software"), to deal |
| 195 | +* in the Software without restriction, including without limitation the rights |
| 196 | +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 197 | +* copies of the Software, and to permit persons to whom the Software is |
| 198 | +* furnished to do so, subject to the following conditions: |
| 199 | +* |
| 200 | +* The above copyright notice and this permission notice shall be included in |
| 201 | +* all copies or substantial portions of the Software. |
| 202 | +* |
| 203 | +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 204 | +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 205 | +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 206 | +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 207 | +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 208 | +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 209 | +* THE SOFTWARE. |
| 210 | +* |
| 211 | +* ************************************************************************ */ |
| 212 | +``` |
| 213 | +and adjust the date to the current year. When simply modifying a file, the date should automatically be updated pre-commit as long as the githook has been installed (./.githooks/install). |
| 214 | + |
| 215 | +2. When adding a new routine, please make sure you are also adding appropriate testing code. These new unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/primer.md). This typically involves adding the following files: |
| 216 | + |
| 217 | +* testing_<routine_name>.hpp file in the directory `clients/include/` |
| 218 | +* test_<routine_name>.cpp file in directory `clients/tests/` |
| 219 | + |
| 220 | +See existing tests for guidance when adding your own. |
| 221 | + |
| 222 | +3. When modifying an existing routine, add appropriate testing to test_<routine_name>.cpp file in directory `clients/tests/`. |
| 223 | + |
| 224 | +4. Tests must have good code coverage. |
| 225 | + |
| 226 | +5. At a minimum, rocALUTION must have a host solution for each direct solver, iterative solver, multigrid solver, or preconditioner. If you add an accelerator solution (say, using HIP targeting GPU devices), please also add a fallback host solution. |
| 227 | + |
| 228 | +6. Ensure code builds successfully. This includes making sure that the code can compile, that the code is properly formatted, and that all tests pass. |
| 229 | + |
| 230 | +7. Do not break existing test cases |
| 231 | + |
| 232 | +### Process ### |
| 233 | + |
| 234 | +When a PR is raised targeting the develop branch in rocALUTION, CI will be automatically triggered. This will: |
| 235 | + |
| 236 | +* Test that the PR passes static analysis (i.e., ensure clang formatting rules have been followed). |
| 237 | +* Test that the documentation can be properly built |
| 238 | +* Ensure that the PR compiles on different OS and GPU device architecture combinations |
| 239 | +* Ensure that all tests pass on different OS and GPU device architecture combinations |
| 240 | + |
| 241 | +Feel free to ask questions on your PR regarding any CI failures you encounter. |
| 242 | + |
| 243 | +* Reviewers are listed in the CODEOWNERS file |
| 244 | diff --git a/.github/dependabot.yml b/.github/dependabot.yml |
| 245 | index 9cdf2d6..047fac7 100644 |
| 246 | --- a/.github/dependabot.yml |
| 247 | +++ b/.github/dependabot.yml |
| 248 | @@ -6,7 +6,14 @@ |
| 249 | version: 2 |
| 250 | updates: |
| 251 | - package-ecosystem: "pip" # See documentation for possible values |
| 252 | - directory: "/docs/.sphinx" # Location of package manifests |
| 253 | + directory: "/docs/sphinx" # Location of package manifests |
| 254 | open-pull-requests-limit: 10 |
| 255 | schedule: |
| 256 | interval: "daily" |
| 257 | + target-branch: "develop" |
| 258 | + labels: |
| 259 | + - "documentation" |
| 260 | + - "dependencies" |
| 261 | + - "ci:docs-only" |
| 262 | + reviewers: |
| 263 | + - "samjwu" |
| 264 | diff --git a/.gitignore b/.gitignore |
| 265 | index 15ec500..50bd106 100644 |
| 266 | --- a/.gitignore |
| 267 | +++ b/.gitignore |
| 268 | @@ -38,20 +38,9 @@ tags |
| 269 | # build-in-source directory |
| 270 | build |
| 271 | |
| 272 | -# doc |
| 273 | -docBin |
| 274 | -_build |
| 275 | - |
| 276 | # Visual Studio stuff |
| 277 | *.vcxproj.user |
| 278 | *.suo |
| 279 | *.sdf |
| 280 | *.pdb |
| 281 | *.opensdf |
| 282 | - |
| 283 | -# documentation artifacts |
| 284 | -_build/ |
| 285 | -_images/ |
| 286 | -_static/ |
| 287 | -_templates/ |
| 288 | -_toc.yml |
| 289 | diff --git a/.jenkins/codecov.groovy b/.jenkins/codecov.groovy |
| 290 | index 175da9a..76162e8 100644 |
| 291 | --- a/.jenkins/codecov.groovy |
| 292 | +++ b/.jenkins/codecov.groovy |
| 293 | @@ -18,7 +18,7 @@ def runCI = |
| 294 | prj.paths.build_command = './install.sh -cg --codecoverage' |
| 295 | prj.compiler.compiler_name = 'c++' |
| 296 | prj.compiler.compiler_path = 'c++' |
| 297 | - prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND'] |
| 298 | + prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND'] |
| 299 | prj.defaults.ccache = false |
| 300 | |
| 301 | // Define test architectures, optional rocm version argument is available |
| 302 | @@ -52,12 +52,10 @@ def runCI = |
| 303 | ci: { |
| 304 | String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 305 | |
| 306 | - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], |
| 307 | - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])] ] |
| 308 | + def propertyList = [] |
| 309 | propertyList = auxiliary.appendPropertyList(propertyList) |
| 310 | |
| 311 | - def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu18:['gfx900']]), |
| 312 | - "compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] |
| 313 | + def jobNameList = [] |
| 314 | jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 315 | |
| 316 | propertyList.each |
| 317 | @@ -81,7 +79,7 @@ ci: { |
| 318 | { |
| 319 | properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) |
| 320 | stage(urlJobName) { |
| 321 | - runCI([ubuntu18:['gfx900']], urlJobName) |
| 322 | + runCI([], urlJobName) |
| 323 | } |
| 324 | } |
| 325 | } |
| 326 | diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy |
| 327 | index 11e8586..1e973d1 100644 |
| 328 | --- a/.jenkins/common.groovy |
| 329 | +++ b/.jenkins/common.groovy |
| 330 | @@ -12,7 +12,7 @@ def runCompileCommand(platform, project, boolean sameOrg=false) |
| 331 | { |
| 332 | project.libraryDependencies.each |
| 333 | { libraryName -> |
| 334 | - getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, sameOrg) |
| 335 | + getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, 'develop', sameOrg) |
| 336 | } |
| 337 | } |
| 338 | |
| 339 | @@ -47,23 +47,29 @@ def runTestCommand (platform, project, gfilter) |
| 340 | """ |
| 341 | |
| 342 | platform.runCommand(this, command) |
| 343 | - junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" |
| 344 | } |
| 345 | |
| 346 | def runCoverageCommand (platform, project, gfilter, String dirmode = "release") |
| 347 | { |
| 348 | - //Temporary workaround due to bug in container |
| 349 | - String centos7Workaround = platform.jenkinsLabel.contains('centos7') ? 'export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib64/' : '' |
| 350 | + String commitSha |
| 351 | + String repoUrl |
| 352 | + (commitSha, repoUrl) = util.getGitHubCommitInformation(project.paths.project_src_prefix) |
| 353 | |
| 354 | - def command = """#!/usr/bin/env bash |
| 355 | - set -x |
| 356 | - cd ${project.paths.project_build_prefix}/build/${dirmode} |
| 357 | - export LD_LIBRARY_PATH=/opt/rocm/lib/ |
| 358 | - ${centos7Workaround} |
| 359 | - GTEST_LISTENER=NO_PASS_LINE_IN_LOG make coverage_cleanup coverage GTEST_FILTER=${gfilter}-*known_bug* |
| 360 | - """ |
| 361 | + withCredentials([string(credentialsId: "mathlibs-codecov-token-rocalution", variable: 'CODECOV_TOKEN')]) |
| 362 | + { |
| 363 | + def command = """#!/usr/bin/env bash |
| 364 | + set -x |
| 365 | + cd ${project.paths.project_build_prefix}/build/${dirmode} |
| 366 | + export LD_LIBRARY_PATH=/opt/rocm/lib/ |
| 367 | + export ROCALUTION_CODE_COVERAGE=1 |
| 368 | + GTEST_LISTENER=NO_PASS_LINE_IN_LOG make coverage_cleanup coverage GTEST_FILTER=${gfilter}-*known_bug* |
| 369 | + curl -Os https://uploader.codecov.io/latest/linux/codecov |
| 370 | + chmod +x codecov |
| 371 | + ./codecov -v -U \$http_proxy -t ${CODECOV_TOKEN} --file lcoverage/main_coverage.info --name rocALUTION --sha ${commitSha} |
| 372 | + """ |
| 373 | |
| 374 | - platform.runCommand(this, command) |
| 375 | + platform.runCommand(this, command) |
| 376 | + } |
| 377 | |
| 378 | publishHTML([allowMissing: false, |
| 379 | alwaysLinkToLastBuild: false, |
| 380 | diff --git a/.jenkins/debug.groovy b/.jenkins/debug.groovy |
| 381 | index 15abd79..fd45e14 100644 |
| 382 | --- a/.jenkins/debug.groovy |
| 383 | +++ b/.jenkins/debug.groovy |
| 384 | @@ -18,7 +18,7 @@ def runCI = |
| 385 | prj.paths.build_command = buildCommand |
| 386 | prj.compiler.compiler_name = 'c++' |
| 387 | prj.compiler.compiler_path = 'c++' |
| 388 | - prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND'] |
| 389 | + prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND'] |
| 390 | |
| 391 | // Define test architectures, optional rocm version argument is available |
| 392 | def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 393 | diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy |
| 394 | index e40fe5e..d588769 100644 |
| 395 | --- a/.jenkins/extended.groovy |
| 396 | +++ b/.jenkins/extended.groovy |
| 397 | @@ -18,7 +18,7 @@ def runCI = |
| 398 | prj.paths.build_command = buildCommand |
| 399 | prj.compiler.compiler_name = 'c++' |
| 400 | prj.compiler.compiler_path = 'c++' |
| 401 | - prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND'] |
| 402 | + prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND'] |
| 403 | |
| 404 | // Define test architectures, optional rocm version argument is available |
| 405 | def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 406 | diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy |
| 407 | index 5c309cb..c198213 100644 |
| 408 | --- a/.jenkins/precheckin.groovy |
| 409 | +++ b/.jenkins/precheckin.groovy |
| 410 | @@ -18,7 +18,7 @@ def runCI = |
| 411 | prj.paths.build_command = buildCommand |
| 412 | prj.compiler.compiler_name = 'c++' |
| 413 | prj.compiler.compiler_path = 'c++' |
| 414 | - prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND'] |
| 415 | + prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND'] |
| 416 | |
| 417 | // Define test architectures, optional rocm version argument is available |
| 418 | def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 419 | diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy |
| 420 | index 8f5c593..4cfcd5e 100644 |
| 421 | --- a/.jenkins/staticanalysis.groovy |
| 422 | +++ b/.jenkins/staticanalysis.groovy |
| 423 | @@ -39,8 +39,26 @@ def runCI = |
| 424 | ci: { |
| 425 | String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) |
| 426 | |
| 427 | - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])])) |
| 428 | - stage(urlJobName) { |
| 429 | - runCI([ubuntu20:['cpu']], urlJobName) |
| 430 | + def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], |
| 431 | + "rocm-docker":[]] |
| 432 | + propertyList = auxiliary.appendPropertyList(propertyList) |
| 433 | + |
| 434 | + def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]] |
| 435 | + jobNameList = auxiliary.appendJobNameList(jobNameList) |
| 436 | + |
| 437 | + propertyList.each |
| 438 | + { |
| 439 | + jobName, property-> |
| 440 | + if (urlJobName == jobName) |
| 441 | + properties(auxiliary.addCommonProperties(property)) |
| 442 | + } |
| 443 | + |
| 444 | + jobNameList.each |
| 445 | + { |
| 446 | + jobName, nodeDetails-> |
| 447 | + if (urlJobName == jobName) |
| 448 | + stage(jobName) { |
| 449 | + runCI(nodeDetails, jobName) |
| 450 | + } |
| 451 | } |
| 452 | } |
| 453 | diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy |
| 454 | index da565db..b06385d 100644 |
| 455 | --- a/.jenkins/staticlibrary.groovy |
| 456 | +++ b/.jenkins/staticlibrary.groovy |
| 457 | @@ -16,9 +16,9 @@ def runCI = |
| 458 | def prj = new rocProject('rocALUTION', 'Static Library PreCheckin') |
| 459 | // customize for project |
| 460 | prj.paths.build_command = buildCommand |
| 461 | - prj.compiler.compiler_name = 'hipcc' |
| 462 | - prj.compiler.compiler_path = '/opt/rocm/bin/hipcc' |
| 463 | - prj.libraryDependencies = ['rocPRIM', 'rocBLAS-internal', 'rocSPARSE-internal', 'rocRAND'] |
| 464 | + prj.compiler.compiler_name = 'amdclang++' |
| 465 | + prj.compiler.compiler_path = '/opt/rocm/bin/amdclang++' |
| 466 | + prj.libraryDependencies = ['rocPRIM', 'hipBLAS-common', 'hipBLASLt', 'rocBLAS', 'rocSPARSE', 'rocRAND'] |
| 467 | |
| 468 | // Define test architectures, optional rocm version argument is available |
| 469 | def nodes = new dockerNodes(nodeDetails, jobName, prj) |
| 470 | diff --git a/.readthedocs.yaml b/.readthedocs.yaml |
| 471 | index 43a0890..4e3a24d 100644 |
| 472 | --- a/.readthedocs.yaml |
| 473 | +++ b/.readthedocs.yaml |
| 474 | @@ -6,9 +6,13 @@ version: 2 |
| 475 | sphinx: |
| 476 | configuration: docs/conf.py |
| 477 | |
| 478 | -formats: [htmlzip] |
| 479 | +formats: [htmlzip, epub] |
| 480 | |
| 481 | python: |
| 482 | - version: "3.8" |
| 483 | install: |
| 484 | - - requirements: docs/.sphinx/requirements.txt |
| 485 | + - requirements: docs/sphinx/requirements.txt |
| 486 | + |
| 487 | +build: |
| 488 | + os: ubuntu-22.04 |
| 489 | + tools: |
| 490 | + python: "3.10" |
| 491 | diff --git a/CHANGELOG.md b/CHANGELOG.md |
| 492 | index e168a58..fe92d69 100644 |
| 493 | --- a/CHANGELOG.md |
| 494 | +++ b/CHANGELOG.md |
| 495 | @@ -1,142 +1,306 @@ |
| 496 | -# Change Log for rocALUTION |
| 497 | +# Changelog for rocALUTION |
| 498 | |
| 499 | -Full documentation for rocALUTION is available at [rocalution.readthedocs.io](https://rocalution.readthedocs.io/en/latest/). |
| 500 | +Full documentation for rocALUTION is available at [https://rocm.docs.amd.com/projects/rocALUTION/en/latest/](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/). |
| 501 | + |
| 502 | +## rocALUTION 4.0.1 for ROCm 7.1.0 |
| 503 | |
| 504 | -## rocALUTION 2.1.11 for ROCm 5.7.0 |
| 505 | ### Added |
| 506 | -- Added support for gfx940, gfx941 and gfx942 |
| 507 | -### Improved |
| 508 | -- Fixed OpenMP runtime issue with Windows toolchain |
| 509 | +* Added support for gfx950. |
| 510 | |
| 511 | -## rocALUTION 2.1.9 for ROCm 5.6.0 |
| 512 | -### Improved |
| 513 | -- Fixed synchronization issues in level 1 routines |
| 514 | +### Changed |
| 515 | +* Updated the default build standard to C++17 when compiling rocALUTION from source (previously C++14). |
| 516 | + |
| 517 | +### Optimized |
| 518 | +* Improved and expanded user documentation. |
| 519 | + |
| 520 | +### Resolved issues |
| 521 | +* Fixed a bug in the GPU hashing algorithm that occurred when not compiling with -O2/-O3. |
| 522 | +* Fixed an issue with the SPAI preconditioner when using complex numbers. |
| 523 | + |
| 524 | +## rocALUTION 3.2.3 for ROCm 6.4.1 |
| 525 | |
| 526 | -## rocALUTION 2.1.8 for ROCm 5.5.0 |
| 527 | ### Added |
| 528 | -- Added build support for Navi32 |
| 529 | -### Improved |
| 530 | -- Fixed a typo in MPI backend |
| 531 | -- Fixed a bug with the backend when HIP support is disabled |
| 532 | -- Fixed a bug in SAAMG hierarchy building on HIP backend |
| 533 | -- Improved SAAMG hierarchy build performance on HIP backend |
| 534 | +* The `-a` option has been added to the `rmake.py` build script. This option provides a way to select specific architectures when building on Windows. |
| 535 | + |
| 536 | +### Resolved issues |
| 537 | +* Fixed an issue where the `HIP_PATH` environment variable was being ignored when compiling on Windows. |
| 538 | + |
| 539 | +## rocALUTION 3.2.2 for ROCm 6.4.0 |
| 540 | + |
| 541 | ### Changed |
| 542 | -- LocalVector::GetIndexValues(ValueType\*) is deprecated, use LocalVector::GetIndexValues(const LocalVector&, LocalVector\*) instead |
| 543 | -- LocalVector::SetIndexValues(const ValueType\*) is deprecated, use LocalVector::SetIndexValues(const LocalVector&, const LocalVector&) instead |
| 544 | -- LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix\*, LocalMatrix\*) is deprecated, use LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix\*) instead |
| 545 | -- LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, float, LocalMatrix\*, LocalMatrix\*) is deprecated, use LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, LocalMatrix\*) instead |
| 546 | -- LocalMatrix::RugeStueben() is deprecated |
| 547 | -- LocalMatrix::AMGSmoothedAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix\*, LocalMatrix\*, int) is deprecated, use LocalMatrix::AMGAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix\*, int) instead |
| 548 | -- LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix\*, LocalMatrix\*) is deprecated, use LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix\*) instead |
| 549 | +* Improved documentation |
| 550 | + |
| 551 | +## rocALUTION 3.2.1 for ROCm 6.3.0 |
| 552 | |
| 553 | -## rocALUTION 2.1.3 for ROCm 5.4.0 |
| 554 | ### Added |
| 555 | -- Added build support for Navi31 and Navi33 |
| 556 | -- Added support for non-squared global matrices |
| 557 | -### Improved |
| 558 | -- Fixed a memory leak in MatrixMult on HIP backend |
| 559 | -- Global structures can now be used with a single process |
| 560 | + |
| 561 | +* Support for gfx1200, gfx1201, and gfx1151. |
| 562 | + |
| 563 | ### Changed |
| 564 | -- Switched GTest death test style to 'threadsafe' |
| 565 | -- GlobalVector::GetGhostSize() is deprecated and will be removed |
| 566 | -- ParallelManager::GetGlobalSize(), ParallelManager::GetLocalSize(), ParallelManager::SetGlobalSize() and ParallelManager::SetLocalSize() are deprecated and will be removed |
| 567 | -- Vector::GetGhostSize() is deprecated and will be removed |
| 568 | -- Multigrid::SetOperatorFormat(unsigned int) is deprecated and will be removed, use Multigrid::SetOperatorFormat(unsigned int, int) instead |
| 569 | -- RugeStuebenAMG::SetCouplingStrength(ValueType) is deprecated and will be removed, use SetStrengthThreshold(float) instead |
| 570 | + |
| 571 | +* Changed the default compiler from `hipcc` to `amdclang` in the installation script and cmake files. |
| 572 | +* Changed the address sanitizer build targets. Now only `gfx908:xnack+`, `gfx90a:xnack+`, `gfx940:xnack+`, `gfx941:xnack+`, and `gfx942:xnack+` are built with `BUILD_ADDRESS_SANITIZER=ON`. |
| 573 | + |
| 574 | +### Resolved issues |
| 575 | + |
| 576 | +* Fix hang in `RS-AMG` for Navi on some specific matrix sparsity patterns. |
| 577 | +* Fix wrong results in `Apply` on multi-GPU setups. |
| 578 | + |
| 579 | +## rocALUTION 3.2.0 for ROCm 6.2.0 |
| 580 | + |
| 581 | +### Additions |
| 582 | +* New file I/O based on rocsparse I/O format |
| 583 | +* `GetConvergenceHistory` for ItILU0 preconditioner |
| 584 | + |
| 585 | +### Deprecations |
| 586 | +* `LocalMatrix::ReadFileCSR` |
| 587 | +* `LocalMatrix::WriteFileCSR` |
| 588 | +* `GlobalMatrix::ReadFileCSR` |
| 589 | +* `GlobalMatrix::WriteFileCSR` |
| 590 | + |
| 591 | +## rocALUTION 3.1.1 for ROCm 6.1.0 |
| 592 | + |
| 593 | +### Additions |
| 594 | + |
| 595 | +* `TripleMatrixProduct` functionality for `GlobalMatrix` |
| 596 | +* Multi-Node/GPU support for `UA-AMG`, `SA-AMG` and `RS-AMG` |
| 597 | +* Iterative ILU0 preconditioner `ItILU0` |
| 598 | +* Iterative triangular solve, selectable via `SolverDecr` class |
| 599 | + |
| 600 | +### Deprecations |
| 601 | + |
| 602 | +* `LocalMatrix::AMGConnect` |
| 603 | +* `LocalMatrix::AMGAggregate` |
| 604 | +* `LocalMatrix::AMGPMISAggregate` |
| 605 | +* `LocalMatrix::AMGSmoothedAggregation` |
| 606 | +* `LocalMatrix::AMGAggregation` |
| 607 | +* `PairwiseAMG` |
| 608 | + |
| 609 | +### Known Issues |
| 610 | +* `PairwiseAMG` currently does not support matrix sizes that exceed int32 range |
| 611 | +* `PairwiseAMG` might fail building the hierarchy on certain input matrices |
| 612 | + |
| 613 | +## rocALUTION 3.0.3 for ROCm 6.0.0 |
| 614 | + |
| 615 | +### Additions |
| 616 | + |
| 617 | +* Support for 64bit integer vectors |
| 618 | +* Inclusive and exclusive sum functionality for vector classes |
| 619 | +* Transpose functionality for `GlobalMatrix` and `LocalMatrix` |
| 620 | +* `TripleMatrixProduct` functionality for `LocalMatrix` |
| 621 | +* `Sort()` function for `LocalVector` class |
| 622 | +* Multiple stream support to the HIP backend |
| 623 | + |
| 624 | +### Optimizations |
| 625 | + |
| 626 | +* `GlobalMatrix::Apply()` now uses multiple streams to better hide communication |
| 627 | + |
| 628 | +### Changes |
| 629 | + |
| 630 | +* Matrix dimensions and number of non-zeros are now stored using 64-bit integers |
| 631 | +* Improved the ILUT preconditioner |
| 632 | + |
| 633 | +### Deprecations |
| 634 | + |
| 635 | +* `LocalVector::GetIndexValues(ValueType*)` |
| 636 | +* `LocalVector::SetIndexValues(const ValueType*)` |
| 637 | +* `LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*)` |
| 638 | +* `LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, float, LocalMatrix*, LocalMatrix*)` |
| 639 | +* `LocalMatrix::RugeStueben()` |
| 640 | +* `LocalMatrix::AMGSmoothedAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*, int)` |
| 641 | +* `LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix*, LocalMatrix*)` |
| 642 | + |
| 643 | +### Fixes |
| 644 | + |
| 645 | +* Unit tests no longer ignore BCSR block dimension |
| 646 | +* Fixed documentation typos |
| 647 | +* Bug in multi-coloring for non-symmetric matrix patterns |
| 648 | + |
| 649 | +## rocALUTION 2.1.11 for ROCm 5.7.0 |
| 650 | + |
| 651 | +### Additions |
| 652 | + |
| 653 | +* Support for gfx940, gfx941, and gfx942 |
| 654 | + |
| 655 | +### Fixes |
| 656 | + |
| 657 | +* OpenMP runtime issue with Windows toolchain |
| 658 | + |
| 659 | +## rocALUTION 2.1.9 for ROCm 5.6.0 |
| 660 | + |
| 661 | +### Fixes |
| 662 | + |
| 663 | +* Synchronization issues in level 1 routines |
| 664 | + |
| 665 | +## rocALUTION 2.1.8 for ROCm 5.5.0 |
| 666 | + |
| 667 | +### Additions |
| 668 | + |
| 669 | +* Build support for Navi32 |
| 670 | + |
| 671 | +### Fixes |
| 672 | + |
| 673 | +* Typo in MPI backend |
| 674 | +* Bug with the backend when HIP support is disabled |
| 675 | +* Bug in SAAMG hierarchy building on the HIP backend |
| 676 | +* Improved SAAMG hierarchy build performance on the HIP backend |
| 677 | + |
| 678 | +### Deprecations |
| 679 | + |
| 680 | +* `LocalVector::GetIndexValues(ValueType*)`: use |
| 681 | + `LocalVector::GetIndexValues(const LocalVector&, LocalVector*)` instead |
| 682 | +* `LocalVector::SetIndexValues(const ValueType*)`: use |
| 683 | + `LocalVector::SetIndexValues(const LocalVector&, const LocalVector&)` instead |
| 684 | +* `LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*)`: |
| 685 | + use `LocalMatrix::RSDirectInterpolation(const LocalVector&, const LocalVector&, LocalMatrix*)` |
| 686 | + instead |
| 687 | +* `LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, float, LocalMatrix*, LocalMatrix*)`: |
| 688 | + use `LocalMatrix::RSExtPIInterpolation(const LocalVector&, const LocalVector&, bool, LocalMatrix*)` |
| 689 | + instead |
| 690 | +* `LocalMatrix::RugeStueben()` |
| 691 | +* `LocalMatrix::AMGSmoothedAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix*, LocalMatrix*, int)`: |
| 692 | + use `LocalMatrix::AMGAggregation(ValueType, const LocalVector&, const LocalVector&, LocalMatrix*, int)` |
| 693 | + instead |
| 694 | +* `LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix*, LocalMatrix*)`: use |
| 695 | + `LocalMatrix::AMGAggregation(const LocalVector&, LocalMatrix*)` instead |
| 696 | + |
| 697 | +## rocALUTION 2.1.3 for ROCm 5.4.0 |
| 698 | + |
| 699 | +### Additions |
| 700 | + |
| 701 | +* Build support for Navi31 and Navi33 |
| 702 | +* Support for non-squared global matrices |
| 703 | + |
| 704 | +### Fixes |
| 705 | + |
| 706 | +* Memory leak in MatrixMult on HIP backend |
| 707 | +* Global structures can now be used with a single process |
| 708 | + |
| 709 | +### Changes |
| 710 | + |
| 711 | +* Switched GTest death test style to 'threadsafe' |
| 712 | +* Removed the native compiler option that was used during default library compilation |
| 713 | + |
| 714 | +### Deprecations |
| 715 | + |
| 716 | +* `GlobalVector::GetGhostSize()` |
| 717 | +* `ParallelManager::GetGlobalSize()`, `ParallelManager::GetLocalSize()`, `ParallelManager::SetGlobalSize()`, |
| 718 | + and `ParallelManager::SetLocalSize()` |
| 719 | +* `Vector::GetGhostSize()` |
| 720 | +* `Multigrid::SetOperatorFormat(unsigned int)`: use `Multigrid::SetOperatorFormat(unsigned int, int)` |
| 721 | + instead |
| 722 | +* `RugeStuebenAMG::SetCouplingStrength(ValueType)`: use `SetStrengthThreshold(float)` instead |
| 723 | |
| 724 | ## rocALUTION 2.1.0 for ROCm 5.3.0 |
| 725 | -### Added |
| 726 | -- Benchmarking tool |
| 727 | -- Ext+I Interpolation with sparsify strategies added for RS-AMG |
| 728 | -### Improved |
| 729 | -- ParallelManager |
| 730 | + |
| 731 | +### Additions |
| 732 | + |
| 733 | +* Benchmarking tool |
| 734 | +* Ext+I Interpolation with sparsify strategies added for RS-AMG |
| 735 | + |
| 736 | +### Optimizations |
| 737 | + |
| 738 | +* ParallelManager |
| 739 | |
| 740 | ## rocALUTION 2.0.3 for ROCm 5.2.0 |
| 741 | -### Added |
| 742 | -- Packages for test and benchmark executables on all supported OSes using CPack. |
| 743 | + |
| 744 | +### Additions |
| 745 | + |
| 746 | +* New packages for test and benchmark executables on all supported operating systems using CPack |
| 747 | |
| 748 | ## rocALUTION 2.0.2 for ROCm 5.1.0 |
| 749 | -### Added |
| 750 | -- Added out-of-place matrix transpose functionality |
| 751 | -- Added LocalVector<bool> |
| 752 | + |
| 753 | +### Additions |
| 754 | + |
| 755 | +* Added out-of-place matrix transpose functionality |
| 756 | +* Added LocalVector<bool> |
| 757 | |
| 758 | ## rocALUTION 2.0.1 for ROCm 5.0.0 |
| 759 | -### Changed |
| 760 | -- Removed deprecated GlobalPairwiseAMG class, please use PairwiseAMG instead. |
| 761 | -- Changed to C++ 14 Standard |
| 762 | -### Improved |
| 763 | -- Added sanitizer option |
| 764 | -- Improved documentation |
| 765 | + |
| 766 | +### Changes |
| 767 | + |
| 768 | +* Changed to C++ 14 Standard |
| 769 | +* Added sanitizer option |
| 770 | +* Improved documentation |
| 771 | + |
| 772 | +### Deprecations |
| 773 | + |
| 774 | +* `GlobalPairwiseAMG` class: use `PairwiseAMG` instead |
| 775 | |
| 776 | ## rocALUTION 1.13.2 for ROCm 4.5.0 |
| 777 | -### Added |
| 778 | -- Address sanitizer build option added |
| 779 | -- Enabling beta support for Windows 10 |
| 780 | -### Changed |
| 781 | -- Deprecated GlobalPairwiseAMG, please use PairwiseAMG instead. GlobalPairwiseAMG will be removed in a future major release. |
| 782 | -- Packaging split into a runtime package called rocalution and a development package called rocalution-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release. |
| 783 | -### Improved |
| 784 | -- (A)MG smoothing and convergence rate improvement |
| 785 | -- Improved IDR Gram-Schmidt process |
| 786 | -- (A)MG solving phase optimization |
| 787 | + |
| 788 | +### Additions |
| 789 | + |
| 790 | +* AddressSanitizer build option |
| 791 | +* Enabled beta support for Windows 10 |
| 792 | + |
| 793 | +### Changes |
| 794 | + |
| 795 | +* Packaging has been split into a runtime package (`rocalution`) and a development package |
| 796 | + (`rocalution-devel`): |
| 797 | + The development package depends on the runtime package. When installing the runtime package, |
| 798 | + the package manager will suggest the installation of the development package to aid users |
| 799 | + transitioning from the previous version's combined package. This suggestion by the package manager is |
| 800 | + for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion` |
| 801 | + feature in the runtime package is introduced as a deprecated feature and will be removed in a future |
| 802 | + ROCm release. |
| 803 | + |
| 804 | +### Deprecations |
| 805 | + |
| 806 | +* `GlobalPairwiseAMG`: use `PairwiseAMG` instead |
| 807 | + |
| 808 | +### Optimizations |
| 809 | + |
| 810 | +* Improved (A)MG smoothing and convergence rate |
| 811 | +* Improved IDR Gram-Schmidt process |
| 812 | +* Optimized (A)MG solving phase |
| 813 | |
| 814 | ## rocALUTION 1.12.1 for ROCm 4.3.0 |
| 815 | -### Added |
| 816 | -- support for gfx90a target |
| 817 | -- support for gfx1030 target |
| 818 | -### Improved |
| 819 | -- install script |
| 820 | -### Known Issues |
| 821 | -- none |
| 822 | |
| 823 | -## rocALUTION 1.11.5 for ROCm 4.2.0 |
| 824 | -### Added |
| 825 | -- none |
| 826 | -### Known Issues |
| 827 | -- none |
| 828 | +### Additions |
| 829 | |
| 830 | -## rocALUTION 1.11.5 for ROCm 4.1.0 |
| 831 | -### Added |
| 832 | -- none |
| 833 | -### Known Issues |
| 834 | -- none |
| 835 | +* Support for gfx90a target |
| 836 | +* Support for gfx1030 target |
| 837 | + |
| 838 | +### Optimizations |
| 839 | + |
| 840 | +* Install script |
| 841 | |
| 842 | ## rocALUTION 1.11.5 for ROCm 4.0.0 |
| 843 | -### Added |
| 844 | -- Add changelog |
| 845 | -- Fixing NaN issues |
| 846 | -- update to debian package name |
| 847 | -- bcsr format support. |
| 848 | -- cmake files adjustments. |
| 849 | + |
| 850 | +### Additions |
| 851 | + |
| 852 | +* Changelog |
| 853 | +* Block compressed sparse row (BCSR) format support |
| 854 | + |
| 855 | +### Changes |
| 856 | + |
| 857 | +* Update to the Debian package name |
| 858 | +* CMake file adjustments |
| 859 | + |
| 860 | +### Fixes |
| 861 | + |
| 862 | +* NaN issues |
| 863 | |
| 864 | ## rocALUTION 1.10 for ROCm 3.9 |
| 865 | -### Added |
| 866 | -- rocRAND to support GPU sampling of random data. |
| 867 | -### Known Issues |
| 868 | -- none |
| 869 | + |
| 870 | +### Additions |
| 871 | + |
| 872 | +* rocRAND support for GPU sampling of random data |
| 873 | |
| 874 | ## rocALUTION 1.9.3 for ROCm 3.8 |
| 875 | -### Added |
| 876 | -- csr2dense and dense2csr to HIP backend. |
| 877 | -### Known Issues |
| 878 | -- none |
| 879 | |
| 880 | -## rocALUTION 1.9.1 for ROCm 3.7 |
| 881 | -### Added |
| 882 | -- none |
| 883 | -### Known Issues |
| 884 | -- none |
| 885 | +### Additions |
| 886 | |
| 887 | -## rocALUTION 1.9.1 for ROCm 3.6 |
| 888 | -### Added |
| 889 | -- none |
| 890 | -### Known Issues |
| 891 | -- none |
| 892 | +* `csr2dense` and `dense2csr` to HIP backend |
| 893 | |
| 894 | ## rocALUTION 1.9.1 for ROCm 3.5 |
| 895 | -### Added |
| 896 | -- static build |
| 897 | -- BCSR matrix format for SpMV |
| 898 | -- Bug fixing in conversion from CSR to HYB format. |
| 899 | -### Known Issues |
| 900 | -- none |
| 901 | + |
| 902 | +### Additions |
| 903 | + |
| 904 | +* Static build |
| 905 | +* BCSR matrix format for SpMV |
| 906 | + |
| 907 | +### Fixes |
| 908 | + |
| 909 | +* Bug in conversion from CSR to HYB format |
| 910 | diff --git a/CMakeLists.txt b/CMakeLists.txt |
| 911 | index 62e94be..8a622f4 100644 |
| 912 | --- a/CMakeLists.txt |
| 913 | +++ b/CMakeLists.txt |
| 914 | @@ -1,5 +1,5 @@ |
| 915 | # ######################################################################## |
| 916 | -# Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 917 | +# Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 918 | # |
| 919 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 920 | # of this software and associated documentation files (the "Software"), to deal |
| 921 | @@ -38,7 +38,8 @@ list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/llvm ${ROCM_PATH}) |
| 922 | list(APPEND CMAKE_MODULE_PATH |
| 923 | ${CMAKE_CURRENT_SOURCE_DIR}/cmake |
| 924 | ${ROCM_PATH}/lib/cmake/hip |
| 925 | - ${ROCM_PATH}/hip/cmake) |
| 926 | + ${ROCM_PATH}/hip/cmake |
| 927 | + ${ROCM_PATH}/cmake) |
| 928 | |
| 929 | # Set a default build type if none was specified |
| 930 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) |
| 931 | @@ -56,9 +57,12 @@ endif() |
| 932 | project(rocalution LANGUAGES CXX) |
| 933 | |
| 934 | # Build flags |
| 935 | -set(CMAKE_CXX_STANDARD 14) |
| 936 | +set(CMAKE_CXX_STANDARD 17) |
| 937 | set(CMAKE_CXX_STANDARD_REQUIRED ON) |
| 938 | set(CMAKE_CXX_EXTENSIONS OFF) |
| 939 | +set(CMAKE_HIP_STANDARD 17) |
| 940 | +set(CMAKE_HIP_STANDARD_REQUIRED ON) |
| 941 | +set(CMAKE_HIP_EXTENSIONS OFF) |
| 942 | |
| 943 | # Build options |
| 944 | option(BUILD_SHARED_LIBS "Build rocALUTION as a shared library" ON) |
| 945 | @@ -68,31 +72,67 @@ option(BUILD_CLIENTS_SAMPLES "Build examples" ON) |
| 946 | option(BUILD_VERBOSE "Output additional build information" OFF) |
| 947 | option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF) |
| 948 | option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) |
| 949 | -option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON) |
| 950 | +option(BUILD_GLOBALTYPE_64 "Support global number of rows / columns exceeding 32 bits" OFF) |
| 951 | +option(BUILD_LOCALTYPE_64 "Support local number of rows / columns exceeding 32 bits" OFF) |
| 952 | +option(BUILD_PTRTYPE_64 "Support local number of non-zeros exceeding 32 bits" OFF) |
| 953 | +option(BUILD_OPTCPU "Enable all instruction subsets supported by the local machine" OFF) |
| 954 | +option(BUILD_SUPPORT_COMPLEX "Enable complex number support" ON) |
| 955 | |
| 956 | # Dependencies |
| 957 | include(cmake/Dependencies.cmake) |
| 958 | |
| 959 | -# Availability of rocm_check_target_ids command assures that we can also build |
| 960 | -# for gfx90a target |
| 961 | -if(COMMAND rocm_check_target_ids) |
| 962 | - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1100;gfx1101;gfx1102") |
| 963 | +if(BUILD_ADDRESS_SANITIZER) |
| 964 | + set(DEFAULT_AMDGPU_TARGETS |
| 965 | + gfx908:xnack+ |
| 966 | + gfx90a:xnack+ |
| 967 | + gfx942:xnack+ |
| 968 | + ) |
| 969 | else() |
| 970 | - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx1100;gfx1101;gfx1102") |
| 971 | + set(DEFAULT_AMDGPU_TARGETS |
| 972 | + gfx803 |
| 973 | + gfx900:xnack- |
| 974 | + gfx906:xnack- |
| 975 | + gfx908:xnack- |
| 976 | + gfx90a:xnack- |
| 977 | + gfx90a:xnack+ |
| 978 | + gfx942 |
| 979 | + gfx950 |
| 980 | + gfx1030 |
| 981 | + gfx1100 |
| 982 | + gfx1101 |
| 983 | + gfx1102 |
| 984 | + gfx1151 |
| 985 | + gfx1200 |
| 986 | + gfx1201 |
| 987 | + ) |
| 988 | endif() |
| 989 | -set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") |
| 990 | - |
| 991 | -# Find HIP package |
| 992 | -find_package(HIP) |
| 993 | - |
| 994 | -if (NOT HIP_FOUND) |
| 995 | - message("-- HIP not found. Compiling WITHOUT HIP support.") |
| 996 | +if(AMDGPU_TARGETS AND NOT GPU_TARGETS) |
| 997 | + message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." ) |
| 998 | +endif() |
| 999 | +set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)") |
| 1000 | +# Don't force, users should be able to override GPU_TARGETS at the command line if desired |
| 1001 | +set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for") |
| 1002 | + |
| 1003 | +include(CheckLanguage) |
| 1004 | +include(CMakeDependentOption) |
| 1005 | +check_language(HIP) |
| 1006 | +cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF) |
| 1007 | +if(USE_HIPCXX) |
| 1008 | + enable_language(HIP) |
| 1009 | else() |
| 1010 | - option(SUPPORT_HIP "Compile WITH HIP support." ON) |
| 1011 | + find_package(HIP MODULE) # hip_add_library is only provided by the find module |
| 1012 | + if(NOT HIP_FOUND) |
| 1013 | + message("-- HIP not found. Compiling WITHOUT HIP support.") |
| 1014 | + endif() |
| 1015 | endif() |
| 1016 | |
| 1017 | +cmake_dependent_option(SUPPORT_HIP "Compile WITH HIP support" ON "USE_HIPCXX OR HIP_FOUND" OFF) |
| 1018 | + |
| 1019 | # HIP related library dependencies |
| 1020 | if(SUPPORT_HIP) |
| 1021 | + if( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) |
| 1022 | + find_package( hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm ) |
| 1023 | + endif( ) |
| 1024 | find_package(rocblas REQUIRED) |
| 1025 | find_package(rocsparse REQUIRED) |
| 1026 | find_package(rocprim REQUIRED) |
| 1027 | @@ -112,9 +152,7 @@ endif() |
| 1028 | |
| 1029 | |
| 1030 | # Setup version |
| 1031 | -set(VERSION_STRING "2.1.11") |
| 1032 | -rocm_setup_version(VERSION ${VERSION_STRING}) |
| 1033 | -set(rocalution_SOVERSION 0.1) |
| 1034 | +rocm_setup_version(VERSION "4.0.1") |
| 1035 | |
| 1036 | if(BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) |
| 1037 | set( BUILD_CLIENTS ON ) |
| 1038 | @@ -137,7 +175,12 @@ endif() |
| 1039 | |
| 1040 | # Package specific CPACK vars |
| 1041 | if(SUPPORT_HIP) |
| 1042 | - rocm_package_add_dependencies(DEPENDS "hip-rocclr >= 4.0.0" "rocsparse >= 1.12.10" "rocblas >= 2.22.0" "rocrand >= 0.0.1") |
| 1043 | + if(BUILD_ADDRESS_SANITIZER) |
| 1044 | + set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan >= 4.5.0" ) |
| 1045 | + else() |
| 1046 | + set(DEPENDS_HIP_RUNTIME "hip-runtime-amd >= 4.5.0" ) |
| 1047 | + endif() |
| 1048 | + rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME}" "rocsparse >= 1.12.10" "rocblas >= 2.22.0" "rocrand >= 2.1.0") |
| 1049 | endif() |
| 1050 | |
| 1051 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md") |
| 1052 | @@ -166,7 +209,7 @@ set(ROCALUTION_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LI |
| 1053 | |
| 1054 | rocm_create_package( |
| 1055 | NAME ${package_name} |
| 1056 | - DESCRIPTION "Radeon Open Compute library for sparse linear systems" |
| 1057 | + DESCRIPTION "ROCm library for sparse linear systems" |
| 1058 | MAINTAINER "rocALUTION Maintainer <rocalution-maintainer@amd.com>" |
| 1059 | LDCONFIG |
| 1060 | LDCONFIG_DIR ${ROCALUTION_CONFIG_DIR} |
| 1061 | @@ -202,18 +245,14 @@ if(BUILD_CODE_COVERAGE) |
| 1062 | add_custom_target(coverage_output |
| 1063 | DEPENDS coverage_analysis |
| 1064 | COMMAND mkdir -p lcoverage |
| 1065 | - COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh |
| 1066 | - COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh |
| 1067 | - COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh |
| 1068 | - COMMAND chmod +x llvm-gcov.sh |
| 1069 | ) |
| 1070 | |
| 1071 | # |
| 1072 | # Generate coverage output. |
| 1073 | # |
| 1074 | add_custom_command(TARGET coverage_output |
| 1075 | - COMMAND lcov --directory . --base-directory . --gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info |
| 1076 | - COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info |
| 1077 | + COMMAND lcov --directory . --base-directory . --capture -o lcoverage/raw_main_coverage.info |
| 1078 | + COMMAND lcov --remove lcoverage/raw_main_coverage.info "'${CMAKE_SOURCE_DIR}/src/utils/*'" "'${CMAKE_SOURCE_DIR}/src/base/host/host_io.*'" "'${CMAKE_SOURCE_DIR}/clients/*'" "'${CMAKE_SOURCE_DIR}/build/*'" "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info |
| 1079 | COMMAND genhtml lcoverage/main_coverage.info --output-directory lcoverage |
| 1080 | ) |
| 1081 | |
| 1082 | diff --git a/LICENSE.md b/LICENSE.md |
| 1083 | index 7a44e61..e41c5d8 100644 |
| 1084 | --- a/LICENSE.md |
| 1085 | +++ b/LICENSE.md |
| 1086 | @@ -1,6 +1,6 @@ |
| 1087 | MIT License |
| 1088 | |
| 1089 | -Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. |
| 1090 | +Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. |
| 1091 | |
| 1092 | Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1093 | of this software and associated documentation files (the "Software"), to deal |
| 1094 | diff --git a/README.md b/README.md |
| 1095 | index b9b6929..d63704f 100644 |
| 1096 | --- a/README.md |
| 1097 | +++ b/README.md |
| 1098 | @@ -1,28 +1,52 @@ |
| 1099 | # rocALUTION |
| 1100 | -rocALUTION is a sparse linear algebra library with focus on exploring fine-grained parallelism on top of AMD's Radeon Open eCosystem Platform [ROCm][] runtime and toolchains, targeting modern CPU and GPU platforms. Based on C++ and [HIP][], it provides a portable, generic and flexible design that allows seamless integration with other scientific software packages. |
| 1101 | + |
| 1102 | +> [!NOTE] |
| 1103 | +> The published documentation is available at [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). |
| 1104 | + |
| 1105 | +rocALUTION is a sparse linear algebra library that can be used to explore fine-grained parallelism on |
| 1106 | +top of the [ROCm](https://github.com/ROCm/ROCm) platform runtime and toolchains. |
| 1107 | +Based on C++ and [HIP](https://github.com/ROCm/HIP/), rocALUTION |
| 1108 | +provides a portable, generic, and flexible design that allows seamless integration with other scientific |
| 1109 | +software packages. |
| 1110 | + |
| 1111 | +rocALUTION offers various backends for different (parallel) hardware: |
| 1112 | + |
| 1113 | +* Host |
| 1114 | +* [OpenMP](http://www.openmp.org/): Designed for multi-core CPUs |
| 1115 | +* [HIP](https://github.com/ROCm/HIP/): Designed for ROCm-compatible devices |
| 1116 | +* [MPI](https://www.open-mpi.org/): Designed for multi-node clusters and multi-GPU setups |
| 1117 | + |
| 1118 | +## Requirements |
| 1119 | + |
| 1120 | +To use rocALUTION on GPU devices, you must first install the |
| 1121 | +[rocBLAS](https://github.com/ROCm/rocBLAS), |
| 1122 | +[rocSPARSE](https://github.com/ROCm/rocSPARSE), and |
| 1123 | +[rocRAND](https://github.com/ROCm/rocRAND) libraries. You can install these from |
| 1124 | +the ROCm repository, the GitHub 'releases' tab, or you can manually compile them. |
| 1125 | |
| 1126 | ## Documentation |
| 1127 | -The latest rocALUTION documentation and API description can be found [here][]. |
| 1128 | |
| 1129 | -### How to build documentation |
| 1130 | +Documentation for rocALUTION is available at |
| 1131 | +[https://rocm.docs.amd.com/projects/rocALUTION/en/latest/](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/). |
| 1132 | |
| 1133 | -Run the following steps to build documentation. |
| 1134 | +To build our documentation locally, use the following code: |
| 1135 | |
| 1136 | -``` |
| 1137 | +```bash |
| 1138 | cd docs |
| 1139 | |
| 1140 | -pip3 install -r .sphinx/requirements.txt |
| 1141 | +pip3 install -r sphinx/requirements.txt |
| 1142 | |
| 1143 | python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html |
| 1144 | ``` |
| 1145 | |
| 1146 | -## Quickstart rocALUTION build |
| 1147 | +## Build |
| 1148 | |
| 1149 | -#### CMake 3.5 or later |
| 1150 | -All compiler specifications are determined automatically. The compilation process can be performed by |
| 1151 | -``` |
| 1152 | +You can compile rocALUTION using CMake 3.5 or later. Note that all compiler specifications are |
| 1153 | +determined automatically. |
| 1154 | + |
| 1155 | +```bash |
| 1156 | # Clone rocALUTION using git |
| 1157 | -git clone https://github.com/ROCmSoftwarePlatform/rocALUTION.git |
| 1158 | +git clone https://github.com/ROCm/rocALUTION.git |
| 1159 | |
| 1160 | # Go to rocALUTION directory, create and change to build directory |
| 1161 | cd rocALUTION; mkdir build; cd build |
| 1162 | @@ -40,69 +64,56 @@ cmake .. -DSUPPORT_HIP=ON -DROCM_PATH=/opt/rocm/ |
| 1163 | make |
| 1164 | ``` |
| 1165 | |
| 1166 | -#### Simple test |
| 1167 | -You can test the installation by running a CG solver on a Laplace matrix: |
| 1168 | -``` |
| 1169 | +To test your installation, run a CG solver on a Laplacian matrix: |
| 1170 | + |
| 1171 | +```bash |
| 1172 | cd rocALUTION; cd build |
| 1173 | wget ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/laplace/gr_30_30.mtx.gz |
| 1174 | gzip -d gr_30_30.mtx.gz |
| 1175 | ./clients/staging/cg gr_30_30.mtx |
| 1176 | ``` |
| 1177 | |
| 1178 | -## Overview |
| 1179 | - |
| 1180 | -#### Backends |
| 1181 | -rocALUTION offers various backends for different (parallel) hardware. |
| 1182 | -* Host |
| 1183 | -* [OpenMP][] - designed for multi-core CPUs |
| 1184 | -* [HIP][] - designed for ROCm compatible devices |
| 1185 | -* [MPI][] - designed for multi-node clusters and multi-GPU setups |
| 1186 | - |
| 1187 | -#### Easy to use |
| 1188 | -Syntax and structure of the library provide fast learning curves. With the help of the examples, anyone can try out the library - no knowledge in multi-core or GPU programming is required. |
| 1189 | - |
| 1190 | -#### Requirements |
| 1191 | -There are no hardware requirements to install and run rocALUTION. If GPU devices are available, rocALUTION will use them. |
| 1192 | -In order to use rocALUTION on GPU devices, you will need to make sure that [rocBLAS][], [rocSPARSE][] and [rocRAND][] libraries are installed on your system. You can install them from ROCm repository, from github releases tab or manually compile them yourself. |
| 1193 | - |
| 1194 | -#### Iterative solvers |
| 1195 | -* Fixed-Point iteration schemes - Jacobi, (Symmetric) Gauss-Seidel, SOR, SSOR |
| 1196 | -* Krylov subspace methods - CR, CG, BiCGStab, BiCGStab(*l*), GMRES, IDR, QMRCGSTAB, Flexible CG/GMRES |
| 1197 | -* Mixed-precision defect-correction scheme |
| 1198 | -* Chebyshev iteration scheme |
| 1199 | -* Multigrid - geometric and algebraic |
| 1200 | - |
| 1201 | -#### Preconditioners |
| 1202 | -* Matrix splitting schemes - Jacobi, (multi-colored) (symmetric) Gauss-Seidel, SOR, SSOR |
| 1203 | -* Factorization schemes - ILU(*0*), ILU(*p*) (based on levels), ILU(*p,q*) (power(*q*)-pattern method), multi-elimination ILU (nested/recursive), ILUT (based on threshold), IC(*0*) |
| 1204 | -* Approximate Inverses - Chebyshev matrix-valued polynomial, SPAI, FSAI, TNS |
| 1205 | -* Diagonal-based preconditioner for Saddle-point problems |
| 1206 | -* Block-type of sub-preconditioners/solvers |
| 1207 | -* (Restricted) Additive Schwarz |
| 1208 | -* Variable type of preconditioners |
| 1209 | - |
| 1210 | -#### Sparse matrix formats |
| 1211 | -* Compressed Sparse Row (CSR) |
| 1212 | -* Modified Compressed Sparse Row (MCSR) |
| 1213 | -* Dense (DENSE) |
| 1214 | -* Coordinate (COO) |
| 1215 | -* ELL |
| 1216 | -* Diagonal (DIA) |
| 1217 | -* Hybrid ELL+COO (HYB) |
| 1218 | - |
| 1219 | -#### Generic and robust design |
| 1220 | -rocALUTION is based on a generic and robust design, allowing expansion in the direction of new solvers and preconditioners and support for various hardware types. Furthermore, the design of the library allows the use of all solvers as preconditioners in other solvers, for example you can define a CG solver with a multi-elimination preconditioner, where the last-block is preconditioned with another Chebyshev iteration method which is preconditioned with a multi-colored symmetric Gauss-Seidel scheme. |
| 1221 | - |
| 1222 | -#### Portable code and results |
| 1223 | -All code based on rocALUTION is portable and independent of the hardware, it will compile and run on any supported platform. All solvers and preconditioners are based on a single source code implementation, which delivers portable results across all backends (variations are possible due to different rounding modes on the hardware). The only difference which you can see for a hardware change is the performance variation. |
| 1224 | - |
| 1225 | - |
| 1226 | - |
| 1227 | -[ROCm]: https://github.com/RadeonOpenCompute/ROCm |
| 1228 | -[HIP]: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/ |
| 1229 | -[OpenMP]: http://www.openmp.org/ |
| 1230 | -[MPI]: https://www.open-mpi.org/ |
| 1231 | -[rocBLAS]: https://github.com/ROCmSoftwarePlatform/rocBLAS |
| 1232 | -[rocSPARSE]: https://github.com/ROCmSoftwarePlatform/rocSPARSE |
| 1233 | -[rocRAND]: https://github.com/ROCmSoftwarePlatform/rocRAND |
| 1234 | -[here]: https://rocalution.readthedocs.io |
| 1235 | +## General information |
| 1236 | + |
| 1237 | +rocALUTION is based on a generic and robust design that allows expansion in the direction of new |
| 1238 | +solvers and preconditioners with support for various hardware types. The library's design allows the |
| 1239 | +use of all solvers as preconditioners in other solvers. For example, you can define a CG solver with a |
| 1240 | +multi-elimination preconditioner, in which the last-block is preconditioned with another Chebyshev |
| 1241 | +iteration method that itself is preconditioned with a multi-colored symmetric Gauss-Seidel scheme. |
| 1242 | + |
| 1243 | +### Iterative solvers |
| 1244 | + |
| 1245 | +* Fixed-point iteration schemes: Jacobi, (Symmetric) Gauss-Seidel, SOR, SSOR |
| 1246 | +* Krylov subspace methods: CR, CG, BiCGStab, BiCGStab(*l*), GMRES, IDR, QMRCGSTAB, |
| 1247 | + Flexible CG/GMRES |
| 1248 | +* Mixed-precision defect correction scheme |
| 1249 | +* Chebyshev iteration scheme |
| 1250 | +* Multigrid: Geometric and algebraic |
| 1251 | + |
| 1252 | +### Preconditioners |
| 1253 | + |
| 1254 | +* Matrix splitting schemes: Jacobi, (multi-colored) (symmetric) Gauss-Seidel, SOR, SSOR |
| 1255 | +* Factorization schemes: ILU(*0*), ILU(*p*) (based on levels), ILU(*p,q*) (power(*q*)-pattern method), |
| 1256 | + multi-elimination ILU (nested/recursive), ILUT (based on threshold), IC(*0*) |
| 1257 | +* Approximate Inverses: Chebyshev matrix-valued polynomial, SPAI, FSAI, TNS |
| 1258 | +* Diagonal-based preconditioner for Saddle-point problems |
| 1259 | +* Block-type of sub-preconditioners/solvers |
| 1260 | +* Additive Schwarz (restricted) |
| 1261 | +* Variable type of preconditioners |
| 1262 | + |
| 1263 | +### Sparse matrix formats |
| 1264 | + |
| 1265 | +* Compressed Sparse Row (CSR) |
| 1266 | +* Modified Compressed Sparse Row (MCSR) |
| 1267 | +* Dense (DENSE) |
| 1268 | +* Coordinate (COO) |
| 1269 | +* ELL |
| 1270 | +* Diagonal (DIA) |
| 1271 | +* Hybrid ELL+COO (HYB) |
| 1272 | + |
| 1273 | +## Portability |
| 1274 | + |
| 1275 | +All code based on rocALUTION is portable and hardware-independent. It compiles and runs on any |
| 1276 | +supported platform. All solvers and preconditioners are based on a single source code implementation |
| 1277 | +that delivers portable results across all backends (note that variations are possible due to different |
| 1278 | +hardware rounding modes). The only visible difference between hardware is performance variation. |
| 1279 | diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt |
| 1280 | index 781a24c..aa66ca7 100644 |
| 1281 | --- a/clients/benchmarks/CMakeLists.txt |
| 1282 | +++ b/clients/benchmarks/CMakeLists.txt |
| 1283 | @@ -1,5 +1,5 @@ |
| 1284 | # ######################################################################## |
| 1285 | -# Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1286 | +# Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1287 | # |
| 1288 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1289 | # of this software and associated documentation files (the "Software"), to deal |
| 1290 | @@ -29,6 +29,7 @@ set(ROCALUTION_BENCHMARK_SOURCES |
| 1291 | rocalution_bench_solver_results.cpp |
| 1292 | rocalution_enum_coarsening_strategy.cpp |
| 1293 | rocalution_enum_directsolver.cpp |
| 1294 | + rocalution_enum_itilu0_alg.cpp |
| 1295 | rocalution_enum_itsolver.cpp |
| 1296 | rocalution_enum_matrix_init.cpp |
| 1297 | rocalution_enum_preconditioner.cpp |
| 1298 | @@ -40,7 +41,7 @@ set(ROCALUTION_BENCHMARK_SOURCES |
| 1299 | add_executable(rocalution-bench ${ROCALUTION_BENCHMARK_SOURCES} ${ROCALUTION_CLIENTS_COMMON}) |
| 1300 | |
| 1301 | # Target compile options |
| 1302 | -target_compile_options(rocalution-bench PRIVATE -Wno-unused-command-line-argument -Wall) |
| 1303 | +target_compile_options(rocalution-bench PRIVATE -Wall) |
| 1304 | |
| 1305 | # Internal common header |
| 1306 | target_include_directories(rocalution-bench PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>) |
| 1307 | diff --git a/clients/benchmarks/rocalution_arguments_config.cpp b/clients/benchmarks/rocalution_arguments_config.cpp |
| 1308 | index 61cf6eb..692f1ea 100644 |
| 1309 | --- a/clients/benchmarks/rocalution_arguments_config.cpp |
| 1310 | +++ b/clients/benchmarks/rocalution_arguments_config.cpp |
| 1311 | @@ -1,6 +1,6 @@ |
| 1312 | /*! \file */ |
| 1313 | /* ************************************************************************ |
| 1314 | -* Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1315 | +* Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1316 | * |
| 1317 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1318 | * of this software and associated documentation files (the "Software"), to deal |
| 1319 | @@ -82,6 +82,16 @@ void rocalution_arguments_config::set_description(options_description& desc) |
| 1320 | ADD_OPTION(double, e, 0.05, "ilut tolerance"); |
| 1321 | break; |
| 1322 | } |
| 1323 | + case rocalution_bench_solver_parameters::itsolve_tol: |
| 1324 | + { |
| 1325 | + ADD_OPTION(double, e, 1e-03, "iterative solve tolerance (see --iterative_solve)."); |
| 1326 | + break; |
| 1327 | + } |
| 1328 | + case rocalution_bench_solver_parameters::itilu0_tol: |
| 1329 | + { |
| 1330 | + ADD_OPTION(double, e, 2e-07, "itilu0 tolerance"); |
| 1331 | + break; |
| 1332 | + } |
| 1333 | case rocalution_bench_solver_parameters::mcgs_relax: |
| 1334 | { |
| 1335 | ADD_OPTION(double, e, 1.0, "relaxation coefficient"); |
| 1336 | @@ -155,6 +165,27 @@ void rocalution_arguments_config::set_description(options_description& desc) |
| 1337 | break; |
| 1338 | } |
| 1339 | |
| 1340 | + case rocalution_bench_solver_parameters::itsolve_max_iter: |
| 1341 | + { |
| 1342 | + ADD_OPTION(int, |
| 1343 | + e, |
| 1344 | + 30, |
| 1345 | + "iterative solve maximum number of iterations (see " |
| 1346 | + "--iterative_solve)."); |
| 1347 | + break; |
| 1348 | + } |
| 1349 | + case rocalution_bench_solver_parameters::itilu0_max_iter: |
| 1350 | + { |
| 1351 | + ADD_OPTION(int, e, 1000, "itilu0 maximum number of iterations."); |
| 1352 | + break; |
| 1353 | + } |
| 1354 | + |
| 1355 | + case rocalution_bench_solver_parameters::itilu0_options: |
| 1356 | + { |
| 1357 | + ADD_OPTION(int, e, 1000, "itilu0 combination of options."); |
| 1358 | + break; |
| 1359 | + } |
| 1360 | + |
| 1361 | case rocalution_bench_solver_parameters::mcilu_p: |
| 1362 | { |
| 1363 | ADD_OPTION(int, e, 0, "multicolored ilu parameter p."); |
| 1364 | @@ -219,6 +250,11 @@ void rocalution_arguments_config::set_description(options_description& desc) |
| 1365 | ADD_OPTION(std::string, e, "", "coarsening strategy"); |
| 1366 | break; |
| 1367 | } |
| 1368 | + case rocalution_bench_solver_parameters::itilu0_alg: |
| 1369 | + { |
| 1370 | + ADD_OPTION(std::string, e, "Default", "ItILU0 algorithm"); |
| 1371 | + break; |
| 1372 | + } |
| 1373 | case rocalution_bench_solver_parameters::matrix: |
| 1374 | { |
| 1375 | ADD_OPTION(std::string, e, "", "matrix initialization"); |
| 1376 | @@ -243,6 +279,11 @@ void rocalution_arguments_config::set_description(options_description& desc) |
| 1377 | { |
| 1378 | switch(e) |
| 1379 | { |
| 1380 | + case rocalution_bench_solver_parameters::iterative_solve: |
| 1381 | + { |
| 1382 | + ADD_OPTION(bool, e, false, "perform triangular iterative solve during solving step."); |
| 1383 | + break; |
| 1384 | + } |
| 1385 | case rocalution_bench_solver_parameters::verbose: |
| 1386 | { |
| 1387 | ADD_OPTION(bool, e, false, "verbose"); |
| 1388 | @@ -335,6 +376,17 @@ int rocalution_arguments_config::parse(int& argc, char**& argv, options_descript |
| 1389 | |
| 1390 | break; |
| 1391 | } |
| 1392 | + case rocalution_bench_solver_parameters::itilu0_alg: |
| 1393 | + { |
| 1394 | + |
| 1395 | + auto itilu0_alg_string = this->Get(rocalution_bench_solver_parameters::itilu0_alg); |
| 1396 | + if(itilu0_alg_string != "") |
| 1397 | + { |
| 1398 | + this->m_enum_itilu0_alg(itilu0_alg_string.c_str()); |
| 1399 | + } |
| 1400 | + |
| 1401 | + break; |
| 1402 | + } |
| 1403 | |
| 1404 | case rocalution_bench_solver_parameters::matrix: |
| 1405 | { |
| 1406 | diff --git a/clients/benchmarks/rocalution_bench_solver_parameters.cpp b/clients/benchmarks/rocalution_bench_solver_parameters.cpp |
| 1407 | index d48975d..e6f4268 100644 |
| 1408 | --- a/clients/benchmarks/rocalution_bench_solver_parameters.cpp |
| 1409 | +++ b/clients/benchmarks/rocalution_bench_solver_parameters.cpp |
| 1410 | @@ -1,5 +1,5 @@ |
| 1411 | /* ************************************************************************ |
| 1412 | - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1413 | + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1414 | * |
| 1415 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1416 | * of this software and associated documentation files (the "Software"), to deal |
| 1417 | @@ -35,6 +35,10 @@ rocalution_enum_preconditioner rocalution_bench_solver_parameters::GetEnumPrecon |
| 1418 | { |
| 1419 | return this->m_enum_preconditioner; |
| 1420 | }; |
| 1421 | +rocalution_enum_itilu0_alg rocalution_bench_solver_parameters::GetEnumItILU0Algorithm() const |
| 1422 | +{ |
| 1423 | + return this->m_enum_itilu0_alg; |
| 1424 | +}; |
| 1425 | rocalution_enum_itsolver rocalution_bench_solver_parameters::GetEnumIterativeSolver() const |
| 1426 | { |
| 1427 | return this->m_enum_itsolver; |
| 1428 | diff --git a/clients/benchmarks/rocalution_bench_solver_parameters.hpp b/clients/benchmarks/rocalution_bench_solver_parameters.hpp |
| 1429 | index 67d5721..debe199 100644 |
| 1430 | --- a/clients/benchmarks/rocalution_bench_solver_parameters.hpp |
| 1431 | +++ b/clients/benchmarks/rocalution_bench_solver_parameters.hpp |
| 1432 | @@ -1,5 +1,5 @@ |
| 1433 | /* ************************************************************************ |
| 1434 | - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1435 | + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1436 | * |
| 1437 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1438 | * of this software and associated documentation files (the "Software"), to deal |
| 1439 | @@ -25,6 +25,7 @@ |
| 1440 | |
| 1441 | #include "rocalution_enum_coarsening_strategy.hpp" |
| 1442 | #include "rocalution_enum_directsolver.hpp" |
| 1443 | +#include "rocalution_enum_itilu0_alg.hpp" |
| 1444 | #include "rocalution_enum_itsolver.hpp" |
| 1445 | #include "rocalution_enum_matrix_init.hpp" |
| 1446 | #include "rocalution_enum_preconditioner.hpp" |
| 1447 | @@ -44,6 +45,11 @@ protected: |
| 1448 | rocalution_enum_matrix_init m_enum_matrix_init{}; |
| 1449 | |
| 1450 | // |
| 1451 | + // @brief Which ItILU0 algorithm. |
| 1452 | + // |
| 1453 | + rocalution_enum_itilu0_alg m_enum_itilu0_alg{}; |
| 1454 | + |
| 1455 | + // |
| 1456 | // @brief Which iterative solver. |
| 1457 | // |
| 1458 | rocalution_enum_itsolver m_enum_itsolver{}; |
| 1459 | @@ -90,6 +96,10 @@ public: |
| 1460 | // |
| 1461 | rocalution_enum_itsolver GetEnumIterativeSolver() const; |
| 1462 | // |
| 1463 | + // @brief Get which ItILU0 algorithm. |
| 1464 | + // |
| 1465 | + rocalution_enum_itilu0_alg GetEnumItILU0Algorithm() const; |
| 1466 | + // |
| 1467 | // @brief Get which matrix initialization |
| 1468 | // |
| 1469 | rocalution_enum_matrix_init GetEnumMatrixInit() const; |
| 1470 | @@ -100,6 +110,7 @@ public: |
| 1471 | |
| 1472 | // clang-format off |
| 1473 | #define PBOOL_TRANSFORM_EACH \ |
| 1474 | + PBOOL_TRANSFORM(iterative_solve) \ |
| 1475 | PBOOL_TRANSFORM(verbose) \ |
| 1476 | PBOOL_TRANSFORM(mcilu_use_level) |
| 1477 | // clang-format on |
| 1478 | @@ -122,6 +133,9 @@ public: |
| 1479 | PINT_TRANSFORM(krylov_basis) \ |
| 1480 | PINT_TRANSFORM(ndim) \ |
| 1481 | PINT_TRANSFORM(ilut_n) \ |
| 1482 | + PINT_TRANSFORM(itsolve_max_iter) \ |
| 1483 | + PINT_TRANSFORM(itilu0_max_iter) \ |
| 1484 | + PINT_TRANSFORM(itilu0_options) \ |
| 1485 | PINT_TRANSFORM(mcilu_p) \ |
| 1486 | PINT_TRANSFORM(mcilu_q) \ |
| 1487 | PINT_TRANSFORM(max_iter) \ |
| 1488 | @@ -153,6 +167,7 @@ public: |
| 1489 | PSTRING_TRANSFORM(coarsening_strategy) \ |
| 1490 | PSTRING_TRANSFORM(direct_solver) \ |
| 1491 | PSTRING_TRANSFORM(iterative_solver) \ |
| 1492 | + PSTRING_TRANSFORM(itilu0_alg) \ |
| 1493 | PSTRING_TRANSFORM(matrix) \ |
| 1494 | PSTRING_TRANSFORM(matrix_filename) \ |
| 1495 | PSTRING_TRANSFORM(preconditioner) \ |
| 1496 | @@ -197,6 +212,8 @@ public: |
| 1497 | PDOUBLE_TRANSFORM(div_tol) \ |
| 1498 | PDOUBLE_TRANSFORM(residual_tol) \ |
| 1499 | PDOUBLE_TRANSFORM(ilut_tol) \ |
| 1500 | + PDOUBLE_TRANSFORM(itsolve_tol) \ |
| 1501 | + PDOUBLE_TRANSFORM(itilu0_tol) \ |
| 1502 | PDOUBLE_TRANSFORM(mcgs_relax) \ |
| 1503 | PDOUBLE_TRANSFORM(solver_over_interp) \ |
| 1504 | PDOUBLE_TRANSFORM(solver_coupling_strength) \ |
| 1505 | diff --git a/clients/benchmarks/rocalution_driver_itsolver.hpp b/clients/benchmarks/rocalution_driver_itsolver.hpp |
| 1506 | index 9bf4e36..1dc7ba4 100644 |
| 1507 | --- a/clients/benchmarks/rocalution_driver_itsolver.hpp |
| 1508 | +++ b/clients/benchmarks/rocalution_driver_itsolver.hpp |
| 1509 | @@ -1,5 +1,5 @@ |
| 1510 | /* ************************************************************************ |
| 1511 | - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1512 | + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1513 | * |
| 1514 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1515 | * of this software and associated documentation files (the "Software"), to deal |
| 1516 | @@ -256,7 +256,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1517 | AIChebyshev<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>; |
| 1518 | p->Set(3, lambda_max / 7.0, lambda_max); |
| 1519 | this->m_preconditioner = p; |
| 1520 | - return true; |
| 1521 | + break; |
| 1522 | } |
| 1523 | |
| 1524 | case rocalution_enum_preconditioner::FSAI: |
| 1525 | @@ -264,7 +264,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1526 | auto* p |
| 1527 | = new rocalution::FSAI<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>; |
| 1528 | this->m_preconditioner = p; |
| 1529 | - return true; |
| 1530 | + break; |
| 1531 | } |
| 1532 | |
| 1533 | case rocalution_enum_preconditioner::SPAI: |
| 1534 | @@ -272,14 +272,14 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1535 | auto* p |
| 1536 | = new rocalution::SPAI<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>; |
| 1537 | this->m_preconditioner = p; |
| 1538 | - return true; |
| 1539 | + break; |
| 1540 | } |
| 1541 | case rocalution_enum_preconditioner::TNS: |
| 1542 | { |
| 1543 | auto* p |
| 1544 | = new rocalution::TNS<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>; |
| 1545 | this->m_preconditioner = p; |
| 1546 | - return true; |
| 1547 | + break; |
| 1548 | } |
| 1549 | |
| 1550 | case rocalution_enum_preconditioner::Jacobi: |
| 1551 | @@ -290,7 +290,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1552 | // no specific parameters |
| 1553 | // |
| 1554 | this->m_preconditioner = p; |
| 1555 | - return true; |
| 1556 | + break; |
| 1557 | } |
| 1558 | |
| 1559 | case rocalution_enum_preconditioner::GS: |
| 1560 | @@ -300,7 +300,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1561 | // no specific parameters |
| 1562 | // |
| 1563 | this->m_preconditioner = p; |
| 1564 | - return true; |
| 1565 | + break; |
| 1566 | } |
| 1567 | |
| 1568 | case rocalution_enum_preconditioner::SGS: |
| 1569 | @@ -311,7 +311,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1570 | // no specific parameters |
| 1571 | // |
| 1572 | this->m_preconditioner = p; |
| 1573 | - return true; |
| 1574 | + break; |
| 1575 | } |
| 1576 | |
| 1577 | case rocalution_enum_preconditioner::ILU: |
| 1578 | @@ -322,6 +322,53 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1579 | // no specific parameters |
| 1580 | // |
| 1581 | this->m_preconditioner = p; |
| 1582 | + break; |
| 1583 | + } |
| 1584 | + case rocalution_enum_preconditioner::ItILU0: |
| 1585 | + { |
| 1586 | + auto enum_itilu0_alg = parameters.GetEnumItILU0Algorithm(); |
| 1587 | + if(enum_itilu0_alg.is_invalid()) |
| 1588 | + { |
| 1589 | + rocalution_bench_errmsg << "enum_itilu0_alg is invalid." << std::endl; |
| 1590 | + return false; |
| 1591 | + } |
| 1592 | + |
| 1593 | + auto* p |
| 1594 | + = new rocalution::ItILU0<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>; |
| 1595 | + p->SetMaxIter(parameters.Get(params_t::itilu0_max_iter)); |
| 1596 | + p->SetTolerance(parameters.Get(params_t::itilu0_tol)); |
| 1597 | + p->SetOptions(parameters.Get(params_t::itilu0_options)); |
| 1598 | + |
| 1599 | + switch(enum_itilu0_alg.value) |
| 1600 | + { |
| 1601 | + case rocalution_enum_itilu0_alg::Default: |
| 1602 | + { |
| 1603 | + p->SetAlgorithm(ItILU0Algorithm::Default); |
| 1604 | + break; |
| 1605 | + } |
| 1606 | + case rocalution_enum_itilu0_alg::AsyncInPlace: |
| 1607 | + { |
| 1608 | + p->SetAlgorithm(ItILU0Algorithm::AsyncInPlace); |
| 1609 | + break; |
| 1610 | + } |
| 1611 | + case rocalution_enum_itilu0_alg::AsyncSplit: |
| 1612 | + { |
| 1613 | + p->SetAlgorithm(ItILU0Algorithm::AsyncSplit); |
| 1614 | + break; |
| 1615 | + } |
| 1616 | + case rocalution_enum_itilu0_alg::SyncSplit: |
| 1617 | + { |
| 1618 | + p->SetAlgorithm(ItILU0Algorithm::SyncSplit); |
| 1619 | + break; |
| 1620 | + } |
| 1621 | + case rocalution_enum_itilu0_alg::SyncSplitFusion: |
| 1622 | + { |
| 1623 | + p->SetAlgorithm(ItILU0Algorithm::SyncSplitFusion); |
| 1624 | + break; |
| 1625 | + } |
| 1626 | + } |
| 1627 | + |
| 1628 | + this->m_preconditioner = p; |
| 1629 | return true; |
| 1630 | } |
| 1631 | case rocalution_enum_preconditioner::ILUT: |
| 1632 | @@ -331,7 +378,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1633 | p->Set(parameters.Get(params_t::ilut_tol), parameters.Get(params_t::ilut_n)); |
| 1634 | |
| 1635 | this->m_preconditioner = p; |
| 1636 | - return true; |
| 1637 | + break; |
| 1638 | } |
| 1639 | case rocalution_enum_preconditioner::IC: |
| 1640 | { |
| 1641 | @@ -340,7 +387,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1642 | // no specific parameters |
| 1643 | // |
| 1644 | this->m_preconditioner = p; |
| 1645 | - return true; |
| 1646 | + break; |
| 1647 | } |
| 1648 | case rocalution_enum_preconditioner::MCGS: |
| 1649 | { |
| 1650 | @@ -348,7 +395,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1651 | MultiColoredGS<rocalution::LocalMatrix<T>, rocalution::LocalVector<T>, T>; |
| 1652 | p->SetRelaxation(parameters.Get(params_t::mcgs_relax)); |
| 1653 | this->m_preconditioner = p; |
| 1654 | - return true; |
| 1655 | + break; |
| 1656 | } |
| 1657 | case rocalution_enum_preconditioner::MCSGS: |
| 1658 | { |
| 1659 | @@ -358,7 +405,7 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1660 | // no specific parameters |
| 1661 | // |
| 1662 | this->m_preconditioner = p; |
| 1663 | - return true; |
| 1664 | + break; |
| 1665 | } |
| 1666 | case rocalution_enum_preconditioner::MCILU: |
| 1667 | { |
| 1668 | @@ -370,8 +417,30 @@ struct rocalution_driver_itsolver_default : rocalution_driver_itsolver_base<ITSO |
| 1669 | parameters.Get(params_t::mcilu_use_level)); |
| 1670 | |
| 1671 | this->m_preconditioner = p; |
| 1672 | - return true; |
| 1673 | + break; |
| 1674 | + } |
| 1675 | } |
| 1676 | + |
| 1677 | + if(this->m_preconditioner != nullptr) |
| 1678 | + { |
| 1679 | + const auto itsolve = parameters.Get(params_t::iterative_solve); |
| 1680 | + |
| 1681 | + rocalution::SolverDescr descr; |
| 1682 | + |
| 1683 | + if(itsolve) |
| 1684 | + { |
| 1685 | + descr.SetTriSolverAlg(TriSolverAlg_Iterative); |
| 1686 | + descr.SetIterativeSolverMaxIteration(parameters.Get(params_t::itsolve_max_iter)); |
| 1687 | + descr.SetIterativeSolverTolerance(parameters.Get(params_t::itsolve_tol)); |
| 1688 | + } |
| 1689 | + else |
| 1690 | + { |
| 1691 | + descr.SetTriSolverAlg(TriSolverAlg_Default); |
| 1692 | + } |
| 1693 | + |
| 1694 | + this->m_preconditioner->SetSolverDescriptor(descr); |
| 1695 | + |
| 1696 | + return true; |
| 1697 | } |
| 1698 | |
| 1699 | return false; |
| 1700 | diff --git a/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp b/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp |
| 1701 | index 7d83c9c..e513a85 100644 |
| 1702 | --- a/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp |
| 1703 | +++ b/clients/benchmarks/rocalution_driver_itsolver_uaamg.hpp |
| 1704 | @@ -1,5 +1,5 @@ |
| 1705 | /* ************************************************************************ |
| 1706 | - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1707 | + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1708 | * |
| 1709 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1710 | * of this software and associated documentation files (the "Software"), to deal |
| 1711 | @@ -181,7 +181,7 @@ struct rocalution_driver_itsolver<rocalution_enum_itsolver::uaamg, T> |
| 1712 | preconditioner->SetSolver(cgs); |
| 1713 | preconditioner->SetSmootherPreIter(pre_smooth); |
| 1714 | preconditioner->SetSmootherPostIter(post_smooth); |
| 1715 | - preconditioner->SetOperatorFormat(format); |
| 1716 | + preconditioner->SetOperatorFormat(format, parameters.Get(params_t::blockdim)); |
| 1717 | preconditioner->InitMaxIter(1); |
| 1718 | preconditioner->Verbose(0); |
| 1719 | |
| 1720 | diff --git a/clients/benchmarks/rocalution_enum_itilu0_alg.cpp b/clients/benchmarks/rocalution_enum_itilu0_alg.cpp |
| 1721 | new file mode 100644 |
| 1722 | index 0000000..c13ba38 |
| 1723 | --- /dev/null |
| 1724 | +++ b/clients/benchmarks/rocalution_enum_itilu0_alg.cpp |
| 1725 | @@ -0,0 +1,99 @@ |
| 1726 | +/*! \file */ |
| 1727 | +/* ************************************************************************ |
| 1728 | +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1729 | +* |
| 1730 | +* Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1731 | +* of this software and associated documentation files (the "Software"), to deal |
| 1732 | +* in the Software without restriction, including without limitation the rights |
| 1733 | +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 1734 | +* copies of the Software, and to permit persons to whom the Software is |
| 1735 | +* furnished to do so, subject to the following conditions: |
| 1736 | +* |
| 1737 | +* The above copyright notice and this permission notice shall be included in |
| 1738 | +* all copies or substantial portions of the Software. |
| 1739 | +* |
| 1740 | +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 1741 | +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 1742 | +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 1743 | +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 1744 | +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 1745 | +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 1746 | +* THE SOFTWARE. |
| 1747 | +* |
| 1748 | +* ************************************************************************ */ |
| 1749 | +#include "rocalution_enum_itilu0_alg.hpp" |
| 1750 | +#include <iostream> |
| 1751 | +constexpr const char* rocalution_enum_itilu0_alg::names[rocalution_enum_itilu0_alg::size]; |
| 1752 | +constexpr rocalution_enum_itilu0_alg::value_type rocalution_enum_itilu0_alg::all[]; |
| 1753 | + |
| 1754 | +const char* rocalution_enum_itilu0_alg::to_string() const |
| 1755 | +{ |
| 1756 | + return rocalution_enum_itilu0_alg::to_string(this->value); |
| 1757 | +} |
| 1758 | + |
| 1759 | +bool rocalution_enum_itilu0_alg::is_invalid() const |
| 1760 | +{ |
| 1761 | + for(auto v : all) |
| 1762 | + { |
| 1763 | + if(this->value == v) |
| 1764 | + { |
| 1765 | + return false; |
| 1766 | + } |
| 1767 | + } |
| 1768 | + return true; |
| 1769 | +} |
| 1770 | + |
| 1771 | +rocalution_enum_itilu0_alg::rocalution_enum_itilu0_alg(const char* itilu0_alg_name) |
| 1772 | +{ |
| 1773 | + this->value = (value_type)-1; |
| 1774 | + for(auto v : all) |
| 1775 | + { |
| 1776 | + const char* str = names[v]; |
| 1777 | + if(!strcmp(itilu0_alg_name, str)) |
| 1778 | + { |
| 1779 | + this->value = v; |
| 1780 | + return; |
| 1781 | + } |
| 1782 | + } |
| 1783 | + |
| 1784 | + rocalution_bench_errmsg << "ItILU0 algorithm '" << itilu0_alg_name |
| 1785 | + << "' is invalid, the list of valid ItILU0 algorithms is" << std::endl; |
| 1786 | + for(auto v : all) |
| 1787 | + { |
| 1788 | + const char* str = names[v]; |
| 1789 | + rocalution_bench_errmsg << " - '" << str << "'" << std::endl; |
| 1790 | + } |
| 1791 | + throw false; |
| 1792 | +} |
| 1793 | + |
| 1794 | +// |
| 1795 | +// Default constructor. |
| 1796 | +// |
| 1797 | +rocalution_enum_itilu0_alg::rocalution_enum_itilu0_alg() |
| 1798 | + : value((value_type)-1){}; |
| 1799 | + |
| 1800 | +// |
| 1801 | +// |
| 1802 | +// |
| 1803 | +rocalution_enum_itilu0_alg& rocalution_enum_itilu0_alg::operator()(const char* itilu0_alg_name) |
| 1804 | +{ |
| 1805 | + this->value = (value_type)-1; |
| 1806 | + for(auto v : all) |
| 1807 | + { |
| 1808 | + const char* str = names[v]; |
| 1809 | + if(!strcmp(itilu0_alg_name, str)) |
| 1810 | + { |
| 1811 | + this->value = v; |
| 1812 | + return *this; |
| 1813 | + } |
| 1814 | + } |
| 1815 | + |
| 1816 | + rocalution_bench_errmsg << "ItILU0 algorithm '" << itilu0_alg_name |
| 1817 | + << "' is invalid, the list of valid ItILU0 algorithms is" << std::endl; |
| 1818 | + for(auto v : all) |
| 1819 | + { |
| 1820 | + const char* str = names[v]; |
| 1821 | + rocalution_bench_errmsg << " - '" << str << "'" << std::endl; |
| 1822 | + } |
| 1823 | + throw false; |
| 1824 | +} |
| 1825 | diff --git a/clients/benchmarks/rocalution_enum_itilu0_alg.hpp b/clients/benchmarks/rocalution_enum_itilu0_alg.hpp |
| 1826 | new file mode 100644 |
| 1827 | index 0000000..eaa6863 |
| 1828 | --- /dev/null |
| 1829 | +++ b/clients/benchmarks/rocalution_enum_itilu0_alg.hpp |
| 1830 | @@ -0,0 +1,92 @@ |
| 1831 | +/*! \file */ |
| 1832 | +/* ************************************************************************ |
| 1833 | +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1834 | +* |
| 1835 | +* Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1836 | +* of this software and associated documentation files (the "Software"), to deal |
| 1837 | +* in the Software without restriction, including without limitation the rights |
| 1838 | +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 1839 | +* copies of the Software, and to permit persons to whom the Software is |
| 1840 | +* furnished to do so, subject to the following conditions: |
| 1841 | +* |
| 1842 | +* The above copyright notice and this permission notice shall be included in |
| 1843 | +* all copies or substantial portions of the Software. |
| 1844 | +* |
| 1845 | +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 1846 | +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 1847 | +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 1848 | +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 1849 | +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 1850 | +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 1851 | +* THE SOFTWARE. |
| 1852 | +* |
| 1853 | +* ************************************************************************ */ |
| 1854 | +#pragma once |
| 1855 | +#include "utility.hpp" |
| 1856 | +#include <cstring> |
| 1857 | + |
| 1858 | +// |
| 1859 | +// List the enumeration values. |
| 1860 | +// |
| 1861 | + |
| 1862 | +// clang-format off |
| 1863 | +#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH \ |
| 1864 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(Default) \ |
| 1865 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(AsyncInPlace) \ |
| 1866 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(AsyncSplit) \ |
| 1867 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(SyncSplit) \ |
| 1868 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(SyncSplitFusion) |
| 1869 | +// clang-format on |
| 1870 | + |
| 1871 | +struct rocalution_enum_itilu0_alg |
| 1872 | +{ |
| 1873 | +public: |
| 1874 | +#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(x_) x_, |
| 1875 | + typedef enum rocalution_enum_itilu0_alg__ : int |
| 1876 | + { |
| 1877 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH |
| 1878 | + } value_type; |
| 1879 | + static constexpr value_type all[] = {ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH}; |
| 1880 | +#undef ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM |
| 1881 | + static constexpr std::size_t size = countof(all); |
| 1882 | + value_type value{}; |
| 1883 | + |
| 1884 | +private: |
| 1885 | +#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(x_) #x_, |
| 1886 | + static constexpr const char* names[size]{ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH}; |
| 1887 | +#undef ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM |
| 1888 | +public: |
| 1889 | + operator value_type() const |
| 1890 | + { |
| 1891 | + return this->value; |
| 1892 | + }; |
| 1893 | + rocalution_enum_itilu0_alg(); |
| 1894 | + rocalution_enum_itilu0_alg& operator()(const char* function); |
| 1895 | + rocalution_enum_itilu0_alg(const char* function); |
| 1896 | + const char* to_string() const; |
| 1897 | + bool is_invalid() const; |
| 1898 | + static inline const char* to_string(rocalution_enum_itilu0_alg::value_type value) |
| 1899 | + { |
| 1900 | + // |
| 1901 | + // switch for checking inconsistency. |
| 1902 | + // |
| 1903 | + switch(value) |
| 1904 | + { |
| 1905 | + /// |
| 1906 | +#define ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM(x_) \ |
| 1907 | + case x_: \ |
| 1908 | + { \ |
| 1909 | + if(strcmp(#x_, names[value])) \ |
| 1910 | + return nullptr; \ |
| 1911 | + break; \ |
| 1912 | + } |
| 1913 | + |
| 1914 | + ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM_EACH; |
| 1915 | + |
| 1916 | +#undef ROCALUTION_ENUM_ITILU0_ALG_TRANSFORM |
| 1917 | + /// |
| 1918 | + } |
| 1919 | + |
| 1920 | + return names[value]; |
| 1921 | + } |
| 1922 | +}; |
| 1923 | diff --git a/clients/benchmarks/rocalution_enum_preconditioner.hpp b/clients/benchmarks/rocalution_enum_preconditioner.hpp |
| 1924 | index 2eaacf5..2063476 100644 |
| 1925 | --- a/clients/benchmarks/rocalution_enum_preconditioner.hpp |
| 1926 | +++ b/clients/benchmarks/rocalution_enum_preconditioner.hpp |
| 1927 | @@ -1,5 +1,5 @@ |
| 1928 | /* ************************************************************************ |
| 1929 | - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1930 | + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1931 | * |
| 1932 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1933 | * of this software and associated documentation files (the "Software"), to deal |
| 1934 | @@ -40,6 +40,7 @@ struct rocalution_enum_preconditioner |
| 1935 | ENUM_PRECONDITIONER(GS) \ |
| 1936 | ENUM_PRECONDITIONER(SGS) \ |
| 1937 | ENUM_PRECONDITIONER(ILU) \ |
| 1938 | + ENUM_PRECONDITIONER(ItILU0) \ |
| 1939 | ENUM_PRECONDITIONER(ILUT) \ |
| 1940 | ENUM_PRECONDITIONER(IC) \ |
| 1941 | ENUM_PRECONDITIONER(MCGS) \ |
| 1942 | diff --git a/clients/include/common.hpp b/clients/include/common.hpp |
| 1943 | index 5cdb250..00acd2f 100644 |
| 1944 | --- a/clients/include/common.hpp |
| 1945 | +++ b/clients/include/common.hpp |
| 1946 | @@ -1,5 +1,5 @@ |
| 1947 | /* ************************************************************************ |
| 1948 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 1949 | + * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 1950 | * |
| 1951 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 1952 | * of this software and associated documentation files (the "Software"), to deal |
| 1953 | @@ -22,11 +22,36 @@ |
| 1954 | * ************************************************************************ */ |
| 1955 | |
| 1956 | #include <cstring> |
| 1957 | +#include <map> |
| 1958 | #include <mpi.h> |
| 1959 | #include <rocalution/rocalution.hpp> |
| 1960 | +#include <set> |
| 1961 | |
| 1962 | using namespace rocalution; |
| 1963 | |
| 1964 | +static void my_irecv(int* buf, int count, int source, int tag, MPI_Comm comm, MPI_Request* request) |
| 1965 | +{ |
| 1966 | + MPI_Irecv(buf, count, MPI_INT, source, tag, comm, request); |
| 1967 | +} |
| 1968 | + |
| 1969 | +static void |
| 1970 | + my_irecv(int64_t* buf, int count, int source, int tag, MPI_Comm comm, MPI_Request* request) |
| 1971 | +{ |
| 1972 | + MPI_Irecv(buf, count, MPI_INT64_T, source, tag, comm, request); |
| 1973 | +} |
| 1974 | + |
| 1975 | +static void |
| 1976 | + my_isend(const int* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request) |
| 1977 | +{ |
| 1978 | + MPI_Isend(buf, count, MPI_INT, dest, tag, comm, request); |
| 1979 | +} |
| 1980 | + |
| 1981 | +static void |
| 1982 | + my_isend(const int64_t* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request) |
| 1983 | +{ |
| 1984 | + MPI_Isend(buf, count, MPI_INT64_T, dest, tag, comm, request); |
| 1985 | +} |
| 1986 | + |
| 1987 | template <typename ValueType> |
| 1988 | void distribute_matrix(const MPI_Comm* comm, |
| 1989 | LocalMatrix<ValueType>* lmat, |
| 1990 | @@ -39,11 +64,11 @@ void distribute_matrix(const MPI_Comm* comm, |
| 1991 | MPI_Comm_rank(*comm, &rank); |
| 1992 | MPI_Comm_size(*comm, &num_procs); |
| 1993 | |
| 1994 | - size_t global_nrow = lmat->GetM(); |
| 1995 | - size_t global_ncol = lmat->GetN(); |
| 1996 | - size_t global_nnz = lmat->GetNnz(); |
| 1997 | + int64_t global_nrow = lmat->GetM(); |
| 1998 | + int64_t global_ncol = lmat->GetN(); |
| 1999 | + int64_t global_nnz = lmat->GetNnz(); |
| 2000 | |
| 2001 | - int* global_row_offset = NULL; |
| 2002 | + PtrType* global_row_offset = NULL; |
| 2003 | int* global_col = NULL; |
| 2004 | ValueType* global_val = NULL; |
| 2005 | |
| 2006 | @@ -74,14 +99,14 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2007 | |
| 2008 | if(global_nrow % num_procs != 0) |
| 2009 | { |
| 2010 | - for(size_t i = 0; i < global_nrow % num_procs; ++i) |
| 2011 | + for(int i = 0; i < global_nrow % num_procs; ++i) |
| 2012 | { |
| 2013 | ++local_size[i]; |
| 2014 | } |
| 2015 | } |
| 2016 | |
| 2017 | // Compute index offsets |
| 2018 | - std::vector<int> index_offset(num_procs + 1); |
| 2019 | + std::vector<PtrType> index_offset(num_procs + 1); |
| 2020 | index_offset[0] = 0; |
| 2021 | for(int i = 0; i < num_procs; ++i) |
| 2022 | { |
| 2023 | @@ -89,10 +114,10 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2024 | } |
| 2025 | |
| 2026 | // Read sub matrix - row_offset |
| 2027 | - int local_nrow = local_size[rank]; |
| 2028 | - std::vector<int> local_row_offset(local_nrow + 1); |
| 2029 | + int local_nrow = local_size[rank]; |
| 2030 | + std::vector<PtrType> local_row_offset(local_nrow + 1); |
| 2031 | |
| 2032 | - for(int i = index_offset[rank], k = 0; k < local_nrow + 1; ++i, ++k) |
| 2033 | + for(PtrType i = index_offset[rank], k = 0; k < local_nrow + 1; ++i, ++k) |
| 2034 | { |
| 2035 | local_row_offset[k] = global_row_offset[i]; |
| 2036 | } |
| 2037 | @@ -100,11 +125,11 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2038 | free_host(&global_row_offset); |
| 2039 | |
| 2040 | // Read sub matrix - col and val |
| 2041 | - int local_nnz = local_row_offset[local_nrow] - local_row_offset[0]; |
| 2042 | + PtrType local_nnz = local_row_offset[local_nrow] - local_row_offset[0]; |
| 2043 | std::vector<int> local_col(local_nnz); |
| 2044 | std::vector<ValueType> local_val(local_nnz); |
| 2045 | |
| 2046 | - for(int i = local_row_offset[0], k = 0; k < local_nnz; ++i, ++k) |
| 2047 | + for(PtrType i = local_row_offset[0], k = 0; k < local_nnz; ++i, ++k) |
| 2048 | { |
| 2049 | local_col[k] = global_col[i]; |
| 2050 | local_val[k] = global_val[i]; |
| 2051 | @@ -120,18 +145,18 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2052 | local_row_offset[i] -= shift; |
| 2053 | } |
| 2054 | |
| 2055 | - int interior_nnz = 0; |
| 2056 | - int ghost_nnz = 0; |
| 2057 | - int boundary_nnz = 0; |
| 2058 | - int neighbors = 0; |
| 2059 | + PtrType interior_nnz = 0; |
| 2060 | + PtrType ghost_nnz = 0; |
| 2061 | + int boundary_nnz = 0; |
| 2062 | + int neighbors = 0; |
| 2063 | |
| 2064 | - std::vector<std::vector<int>> boundary(num_procs, std::vector<int>()); |
| 2065 | - std::vector<bool> neighbor(num_procs, false); |
| 2066 | - std::vector<std::map<int, bool>> checked(num_procs, std::map<int, bool>()); |
| 2067 | + std::vector<std::vector<PtrType>> boundary(num_procs, std::vector<PtrType>()); |
| 2068 | + std::vector<bool> neighbor(num_procs, false); |
| 2069 | + std::vector<std::map<int, bool>> checked(num_procs, std::map<int, bool>()); |
| 2070 | |
| 2071 | for(int i = 0; i < local_nrow; ++i) |
| 2072 | { |
| 2073 | - for(int j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j) |
| 2074 | + for(PtrType j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j) |
| 2075 | { |
| 2076 | |
| 2077 | // Interior point |
| 2078 | @@ -214,7 +239,7 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2079 | if(neighbor[i] == true) |
| 2080 | { |
| 2081 | // Receive size of boundary from rank i to current rank |
| 2082 | - MPI_Irecv(&(boundary_size[n]), 1, MPI_INT, i, 0, *comm, &mpi_req[n]); |
| 2083 | + my_irecv(&(boundary_size[n]), 1, i, 0, *comm, &mpi_req[n]); |
| 2084 | ++n; |
| 2085 | } |
| 2086 | } |
| 2087 | @@ -227,7 +252,7 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2088 | { |
| 2089 | int size = boundary[i].size(); |
| 2090 | // Send size of boundary from current rank to rank i |
| 2091 | - MPI_Isend(&size, 1, MPI_INT, i, 0, *comm, &mpi_req[n]); |
| 2092 | + my_isend(&size, 1, i, 0, *comm, &mpi_req[n]); |
| 2093 | ++n; |
| 2094 | } |
| 2095 | } |
| 2096 | @@ -256,7 +281,7 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2097 | } |
| 2098 | |
| 2099 | // Array to hold boundary for each interface |
| 2100 | - std::vector<std::vector<int>> local_boundary(neighbors); |
| 2101 | + std::vector<std::vector<PtrType>> local_boundary(neighbors); |
| 2102 | for(int i = 0; i < neighbors; ++i) |
| 2103 | { |
| 2104 | local_boundary[i].resize(boundary_size[i]); |
| 2105 | @@ -269,8 +294,7 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2106 | if(neighbor[i] == true) |
| 2107 | { |
| 2108 | // Receive boundary from rank i to current rank |
| 2109 | - MPI_Irecv( |
| 2110 | - local_boundary[n].data(), boundary_size[n], MPI_INT, i, 0, *comm, &mpi_req[n]); |
| 2111 | + my_irecv(local_boundary[n].data(), boundary_size[n], i, 0, *comm, &mpi_req[n]); |
| 2112 | ++n; |
| 2113 | } |
| 2114 | } |
| 2115 | @@ -282,7 +306,7 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2116 | if(boundary[i].size() > 0) |
| 2117 | { |
| 2118 | // Send boundary from current rank to rank i |
| 2119 | - MPI_Isend(&(boundary[i][0]), boundary[i].size(), MPI_INT, i, 0, *comm, &mpi_req[n]); |
| 2120 | + my_isend(&(boundary[i][0]), boundary[i].size(), i, 0, *comm, &mpi_req[n]); |
| 2121 | ++n; |
| 2122 | } |
| 2123 | } |
| 2124 | @@ -305,13 +329,13 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2125 | { |
| 2126 | for(unsigned int j = 0; j < boundary[i].size(); ++j) |
| 2127 | { |
| 2128 | - bnd[k] = boundary[i][j] - index_offset[rank]; |
| 2129 | + bnd[k] = static_cast<int>(boundary[i][j] - index_offset[rank]); |
| 2130 | ++k; |
| 2131 | } |
| 2132 | } |
| 2133 | |
| 2134 | // Create boundary index array |
| 2135 | - std::vector<int> boundary_index(nnz_boundary); |
| 2136 | + std::vector<PtrType> boundary_index(nnz_boundary); |
| 2137 | |
| 2138 | k = 0; |
| 2139 | for(int i = 0; i < neighbors; ++i) |
| 2140 | @@ -340,11 +364,11 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2141 | memset(ghost_col, 0, sizeof(int) * ghost_nnz); |
| 2142 | memset(ghost_val, 0, sizeof(ValueType) * ghost_nnz); |
| 2143 | |
| 2144 | - int* row_offset = new int[local_nrow + 1]; |
| 2145 | + PtrType* row_offset = new PtrType[local_nrow + 1]; |
| 2146 | int* col = new int[interior_nnz]; |
| 2147 | ValueType* val = new ValueType[interior_nnz]; |
| 2148 | |
| 2149 | - memset(row_offset, 0, sizeof(int) * (local_nrow + 1)); |
| 2150 | + memset(row_offset, 0, sizeof(PtrType) * (local_nrow + 1)); |
| 2151 | memset(col, 0, sizeof(int) * interior_nnz); |
| 2152 | memset(val, 0, sizeof(ValueType) * interior_nnz); |
| 2153 | |
| 2154 | @@ -353,7 +377,7 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2155 | int l = 0; |
| 2156 | for(int i = 0; i < local_nrow; ++i) |
| 2157 | { |
| 2158 | - for(int j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j) |
| 2159 | + for(PtrType j = local_row_offset[i]; j < local_row_offset[i + 1]; ++j) |
| 2160 | { |
| 2161 | |
| 2162 | // Boundary point -- create ghost part |
| 2163 | @@ -403,4 +427,824 @@ void distribute_matrix(const MPI_Comm* comm, |
| 2164 | gmat->SetParallelManager(*pm); |
| 2165 | gmat->SetLocalDataPtrCSR(&row_offset, &col, &val, "mat", interior_nnz); |
| 2166 | gmat->SetGhostDataPtrCOO(&ghost_row, &ghost_col, &ghost_val, "ghost", ghost_nnz); |
| 2167 | + gmat->Sort(); |
| 2168 | +} |
| 2169 | + |
| 2170 | +// This function computes all prime factors of a given number n |
| 2171 | +static void compute_prime_factors(int n, std::vector<int>& p) |
| 2172 | +{ |
| 2173 | + int factor = 2; |
| 2174 | + |
| 2175 | + // Factorize |
| 2176 | + while(n > 1) |
| 2177 | + { |
| 2178 | + while(n % factor == 0) |
| 2179 | + { |
| 2180 | + p.push_back(factor); |
| 2181 | + n /= factor; |
| 2182 | + } |
| 2183 | + |
| 2184 | + ++factor; |
| 2185 | + } |
| 2186 | +} |
| 2187 | + |
| 2188 | +// This function computes the process distribution for each dimension |
| 2189 | +static void compute_2d_process_distribution(int nprocs, int& nprocx, int& nprocy) |
| 2190 | +{ |
| 2191 | + // Compute prime factors |
| 2192 | + std::vector<int> p; |
| 2193 | + compute_prime_factors(nprocs, p); |
| 2194 | + |
| 2195 | + // Compute number of processes in each dimension |
| 2196 | + nprocx = 1; |
| 2197 | + nprocy = 1; |
| 2198 | + |
| 2199 | + if(p.size() == 0) |
| 2200 | + { |
| 2201 | + // No entry, this means we have exactly one process |
| 2202 | + } |
| 2203 | + else if(p.size() == 1) |
| 2204 | + { |
| 2205 | + // If we have a single prime number, this is going to be our x dimension |
| 2206 | + nprocx = p[0]; |
| 2207 | + } |
| 2208 | + else if(p.size() == 2) |
| 2209 | + { |
| 2210 | + // For two prime numbers, setup x and y |
| 2211 | + nprocx = p[1]; |
| 2212 | + nprocy = p[0]; |
| 2213 | + } |
| 2214 | + else |
| 2215 | + { |
| 2216 | + // More than two prime numbers |
| 2217 | + |
| 2218 | + // #prime numbers |
| 2219 | + int idx = 0; |
| 2220 | + size_t nprime = p.size(); |
| 2221 | + |
| 2222 | + // square root |
| 2223 | + double sqroot = std::sqrt(nprocs); |
| 2224 | + |
| 2225 | + // Determine x dimension |
| 2226 | + nprocx = p[nprime-- - 1]; |
| 2227 | + |
| 2228 | + while(nprocx < sqroot && idx < nprime) |
| 2229 | + { |
| 2230 | + nprocx *= p[idx++]; |
| 2231 | + } |
| 2232 | + |
| 2233 | + // Determine y dimension |
| 2234 | + while(idx < nprime) |
| 2235 | + { |
| 2236 | + nprocy *= p[idx++]; |
| 2237 | + } |
| 2238 | + } |
| 2239 | + |
| 2240 | + // Number of processes must match |
| 2241 | + assert(nprocx * nprocy == nprocs); |
| 2242 | +} |
| 2243 | + |
| 2244 | +// This function computes the process distribution for each dimension |
| 2245 | +static void compute_3d_process_distribution(int nprocs, int& nprocx, int& nprocy, int& nprocz) |
| 2246 | +{ |
| 2247 | + // Compute prime factors |
| 2248 | + std::vector<int> p; |
| 2249 | + compute_prime_factors(nprocs, p); |
| 2250 | + |
| 2251 | + // Compute number of processes in each dimension |
| 2252 | + nprocx = 1; |
| 2253 | + nprocy = 1; |
| 2254 | + nprocz = 1; |
| 2255 | + |
| 2256 | + if(p.size() == 0) |
| 2257 | + { |
| 2258 | + // No entry, this means we have exactly one process |
| 2259 | + } |
| 2260 | + else if(p.size() == 1) |
| 2261 | + { |
| 2262 | + // If we have a single prime number, this is going to be our x dimension |
| 2263 | + nprocx = p[0]; |
| 2264 | + } |
| 2265 | + else if(p.size() == 2) |
| 2266 | + { |
| 2267 | + // For two prime numbers, setup x and y |
| 2268 | + nprocx = p[1]; |
| 2269 | + nprocy = p[0]; |
| 2270 | + } |
| 2271 | + else if(p.size() == 3) |
| 2272 | + { |
| 2273 | + // Three prime numbers |
| 2274 | + nprocx = p[2]; |
| 2275 | + nprocy = p[1]; |
| 2276 | + nprocz = p[0]; |
| 2277 | + } |
| 2278 | + else |
| 2279 | + { |
| 2280 | + // More than three prime numbers |
| 2281 | + |
| 2282 | + // #prime numbers |
| 2283 | + int idx = 0; |
| 2284 | + size_t nprime = p.size(); |
| 2285 | + |
| 2286 | + // cubic root |
| 2287 | + double qroot = std::cbrt(nprocs); |
| 2288 | + |
| 2289 | + // Determine x dimension |
| 2290 | + nprocx = p[nprime-- - 1]; |
| 2291 | + |
| 2292 | + while(nprocx < qroot && idx < nprime) |
| 2293 | + { |
| 2294 | + nprocx *= p[idx++]; |
| 2295 | + } |
| 2296 | + |
| 2297 | + // Determine y dimension |
| 2298 | + double sqroot = std::sqrt(nprocs / nprocx); |
| 2299 | + |
| 2300 | + nprocy = p[nprime-- - 1]; |
| 2301 | + |
| 2302 | + while(nprocy < sqroot && idx < nprime) |
| 2303 | + { |
| 2304 | + nprocy *= p[idx++]; |
| 2305 | + } |
| 2306 | + |
| 2307 | + // Determine z dimension |
| 2308 | + while(idx < nprime) |
| 2309 | + { |
| 2310 | + nprocz *= p[idx++]; |
| 2311 | + } |
| 2312 | + } |
| 2313 | + |
| 2314 | + // Number of processes must match |
| 2315 | + assert(nprocx * nprocy * nprocz == nprocs); |
| 2316 | +} |
| 2317 | + |
| 2318 | +template <typename ValueType> |
| 2319 | +void generate_2d_laplacian(int local_dimx, |
| 2320 | + int local_dimy, |
| 2321 | + const MPI_Comm* comm, |
| 2322 | + GlobalMatrix<ValueType>* mat, |
| 2323 | + ParallelManager* pm, |
| 2324 | + int rank, |
| 2325 | + int nprocs, |
| 2326 | + int nsten = 9) |
| 2327 | +{ |
| 2328 | + assert(nsten == 5 || nsten == 9); |
| 2329 | + |
| 2330 | + // First, we need to determine process pattern for the unit square |
| 2331 | + int nproc_x; |
| 2332 | + int nproc_y; |
| 2333 | + |
| 2334 | + compute_2d_process_distribution(nprocs, nproc_x, nproc_y); |
| 2335 | + |
| 2336 | + // Next, determine process index into the unit square |
| 2337 | + int iproc_y = rank / nproc_x; |
| 2338 | + int iproc_x = rank % nproc_x; |
| 2339 | + |
| 2340 | + // Global sizes |
| 2341 | + int64_t global_dimx = static_cast<int64_t>(nproc_x) * local_dimx; |
| 2342 | + int64_t global_dimy = static_cast<int64_t>(nproc_y) * local_dimy; |
| 2343 | + |
| 2344 | + // Global process entry points |
| 2345 | + int64_t global_iproc_x = iproc_x * local_dimx; |
| 2346 | + int64_t global_iproc_y = iproc_y * local_dimy; |
| 2347 | + |
| 2348 | + // Number of rows (global and local) |
| 2349 | + int64_t local_nrow = local_dimx * local_dimy; |
| 2350 | + int64_t global_nrow = global_dimx * global_dimy; |
| 2351 | + |
| 2352 | + // Assemble local CSR matrix row offset pointers |
| 2353 | + PtrType* global_csr_row_ptr = NULL; |
| 2354 | + int64_t* global_csr_col_ind = NULL; |
| 2355 | + int64_t* local2global = NULL; |
| 2356 | + |
| 2357 | + allocate_host(local_nrow + 1, &global_csr_row_ptr); |
| 2358 | + allocate_host(local_nrow * nsten, &global_csr_col_ind); |
| 2359 | + allocate_host(local_nrow, &local2global); |
| 2360 | + |
| 2361 | + std::map<int64_t, int> global2local; |
| 2362 | + |
| 2363 | + PtrType nnz = 0; |
| 2364 | + global_csr_row_ptr[0] = 0; |
| 2365 | + |
| 2366 | + // Loop over y dimension |
| 2367 | + for(int local_y = 0; local_y < local_dimy; ++local_y) |
| 2368 | + { |
| 2369 | + // Global index into y |
| 2370 | + int64_t global_y = global_iproc_y + local_y; |
| 2371 | + |
| 2372 | + // Loop over x dimension |
| 2373 | + for(int local_x = 0; local_x < local_dimx; ++local_x) |
| 2374 | + { |
| 2375 | + // Global index into x |
| 2376 | + int64_t global_x = global_iproc_x + local_x; |
| 2377 | + |
| 2378 | + // Local row |
| 2379 | + int local_row = local_y * local_dimx + local_x; |
| 2380 | + |
| 2381 | + // Global row |
| 2382 | + int64_t global_row = global_y * global_dimx + global_x; |
| 2383 | + |
| 2384 | + // Fill l2g and g2l map |
| 2385 | + local2global[local_row] = global_row; |
| 2386 | + global2local[global_row] = local_row; |
| 2387 | + |
| 2388 | + // 5pt stencil |
| 2389 | + if(nsten == 5) |
| 2390 | + { |
| 2391 | + // Fixed x (leaving out i == j) |
| 2392 | + for(int by = -1; by <= 1; ++by) |
| 2393 | + { |
| 2394 | + if(global_y + by > -1 && global_y + by < global_dimy && by != 0) |
| 2395 | + { |
| 2396 | + // Global column |
| 2397 | + int64_t global_col = global_row + by * global_dimx; |
| 2398 | + |
| 2399 | + // Fill global CSR column indices |
| 2400 | + global_csr_col_ind[nnz++] = global_col; |
| 2401 | + } |
| 2402 | + } |
| 2403 | + |
| 2404 | + // Fixed y |
| 2405 | + for(int bx = -1; bx <= 1; ++bx) |
| 2406 | + { |
| 2407 | + if(global_x + bx > -1 && global_x + bx < global_dimx) |
| 2408 | + { |
| 2409 | + // Global column |
| 2410 | + int64_t global_col = global_row + bx; |
| 2411 | + |
| 2412 | + // Fill global CSR column indices |
| 2413 | + global_csr_col_ind[nnz++] = global_col; |
| 2414 | + } |
| 2415 | + } |
| 2416 | + } |
| 2417 | + |
| 2418 | + // 9 pt stencil |
| 2419 | + if(nsten == 9) |
| 2420 | + { |
| 2421 | + // Check if current y vertex is on the boundary |
| 2422 | + for(int by = -1; by <= 1; ++by) |
| 2423 | + { |
| 2424 | + if(global_y + by > -1 && global_y + by < global_dimy) |
| 2425 | + { |
| 2426 | + // Check if current x vertex is on the boundary |
| 2427 | + for(int bx = -1; bx <= 1; ++bx) |
| 2428 | + { |
| 2429 | + if(global_x + bx > -1 && global_x + bx < global_dimx) |
| 2430 | + { |
| 2431 | + // Global column |
| 2432 | + int64_t global_col = global_row + by * global_dimx + bx; |
| 2433 | + |
| 2434 | + // Fill global CSR column indices |
| 2435 | + global_csr_col_ind[nnz++] = global_col; |
| 2436 | + } |
| 2437 | + } |
| 2438 | + } |
| 2439 | + } |
| 2440 | + } |
| 2441 | + |
| 2442 | + global_csr_row_ptr[local_row + 1] = nnz; |
| 2443 | + } |
| 2444 | + } |
| 2445 | + |
| 2446 | + // Local number of non-zero entries - stored as a 64-bit integer (int64_t) so that it matches MPI_INT64_T in the reduction below |
| 2447 | + int64_t local_nnz = global_csr_row_ptr[local_nrow]; |
| 2448 | + |
| 2449 | + // Total number of non-zeros |
| 2450 | + int64_t global_nnz; |
| 2451 | + MPI_Allreduce(&local_nnz, &global_nnz, 1, MPI_INT64_T, MPI_SUM, *comm); |
| 2452 | + |
| 2453 | + // Now, we need to setup the communication pattern |
| 2454 | + std::map<int, std::set<int64_t>> recv_indices; |
| 2455 | + std::map<int, std::set<int64_t>> send_indices; |
| 2456 | + |
| 2457 | + // CSR matrix row pointers |
| 2458 | + PtrType* int_csr_row_ptr = NULL; |
| 2459 | + PtrType* gst_csr_row_ptr = NULL; |
| 2460 | + |
| 2461 | + allocate_host(local_nrow + 1, &int_csr_row_ptr); |
| 2462 | + allocate_host(local_nrow + 1, &gst_csr_row_ptr); |
| 2463 | + |
| 2464 | + int_csr_row_ptr[0] = 0; |
| 2465 | + gst_csr_row_ptr[0] = 0; |
| 2466 | + |
| 2467 | + // Determine which vertices need to be sent / received |
| 2468 | + for(int i = 0; i < local_nrow; ++i) |
| 2469 | + { |
| 2470 | + int_csr_row_ptr[i + 1] = int_csr_row_ptr[i]; |
| 2471 | + gst_csr_row_ptr[i + 1] = gst_csr_row_ptr[i]; |
| 2472 | + |
| 2473 | + int64_t global_row = local2global[i]; |
| 2474 | + |
| 2475 | + for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j) |
| 2476 | + { |
| 2477 | + int64_t global_col = global_csr_col_ind[j]; |
| 2478 | + |
| 2479 | + // Determine which process owns the vertex |
| 2480 | + int64_t idx_y = global_col / global_dimx; |
| 2481 | + int64_t idx_x = global_col % global_dimx; |
| 2482 | + |
| 2483 | + int idx_proc_y = idx_y / local_dimy; |
| 2484 | + int idx_proc_x = idx_x / local_dimx; |
| 2485 | + |
| 2486 | + int owner = idx_proc_x + idx_proc_y * nproc_x; |
| 2487 | + |
| 2488 | + // If we do not own it, we need to receive it from our neighbor |
| 2489 | + // and also send the current vertex to this neighbor |
| 2490 | + if(owner != rank) |
| 2491 | + { |
| 2492 | + // Store the global column and row id that we have to receive / send from / to a neighbor |
| 2493 | + // We need a set here to eliminate duplicates |
| 2494 | + recv_indices[owner].insert(global_col); |
| 2495 | + send_indices[owner].insert(global_row); |
| 2496 | + |
| 2497 | + ++gst_csr_row_ptr[i + 1]; |
| 2498 | + } |
| 2499 | + else |
| 2500 | + { |
| 2501 | + ++int_csr_row_ptr[i + 1]; |
| 2502 | + } |
| 2503 | + } |
| 2504 | + } |
| 2505 | + |
| 2506 | + // Number of processes we communicate with |
| 2507 | + int nrecv = recv_indices.size(); |
| 2508 | + int nsend = send_indices.size(); |
| 2509 | + |
| 2510 | + // Process ids we communicate with |
| 2511 | + std::vector<int> recvs; |
| 2512 | + std::vector<int> sends; |
| 2513 | + |
| 2514 | + recvs.reserve(nrecv); |
| 2515 | + sends.reserve(nsend); |
| 2516 | + |
| 2517 | + // Index offsets for each neighbor |
| 2518 | + std::vector<int> recv_index_offset; |
| 2519 | + std::vector<int> send_index_offset; |
| 2520 | + |
| 2521 | + recv_index_offset.reserve(nrecv + 1); |
| 2522 | + send_index_offset.reserve(nsend + 1); |
| 2523 | + |
| 2524 | + recv_index_offset.push_back(0); |
| 2525 | + send_index_offset.push_back(0); |
| 2526 | + |
| 2527 | + int cnt = 0; |
| 2528 | + std::map<int64_t, int> global2ghost; |
| 2529 | + |
| 2530 | + // Go through the recv data |
| 2531 | + for(std::map<int, std::set<int64_t>>::iterator it = recv_indices.begin(); |
| 2532 | + it != recv_indices.end(); |
| 2533 | + ++it) |
| 2534 | + { |
| 2535 | + recvs.push_back(it->first); |
| 2536 | + recv_index_offset.push_back(it->second.size()); |
| 2537 | + |
| 2538 | + for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit) |
| 2539 | + { |
| 2540 | + global2ghost[*iit] = cnt++; |
| 2541 | + } |
| 2542 | + } |
| 2543 | + |
| 2544 | + // Go through the send data |
| 2545 | + int boundary_nnz = 0; |
| 2546 | + for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin(); |
| 2547 | + it != send_indices.end(); |
| 2548 | + ++it) |
| 2549 | + { |
| 2550 | + sends.push_back(it->first); |
| 2551 | + send_index_offset.push_back(it->second.size()); |
| 2552 | + boundary_nnz += it->second.size(); |
| 2553 | + } |
| 2554 | + |
| 2555 | + // Exclusive sum |
| 2556 | + for(int i = 0; i < nrecv; ++i) |
| 2557 | + { |
| 2558 | + recv_index_offset[i + 1] += recv_index_offset[i]; |
| 2559 | + } |
| 2560 | + |
| 2561 | + for(int i = 0; i < nsend; ++i) |
| 2562 | + { |
| 2563 | + send_index_offset[i + 1] += send_index_offset[i]; |
| 2564 | + } |
| 2565 | + |
| 2566 | + // Boundary indices |
| 2567 | + std::vector<int> boundary; |
| 2568 | + boundary.reserve(boundary_nnz); |
| 2569 | + |
| 2570 | + for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin(); |
| 2571 | + it != send_indices.end(); |
| 2572 | + ++it) |
| 2573 | + { |
| 2574 | + for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit) |
| 2575 | + { |
| 2576 | + boundary.push_back(global2local[*iit]); |
| 2577 | + } |
| 2578 | + } |
| 2579 | + |
| 2580 | + // Initialize manager |
| 2581 | + pm->SetMPICommunicator(comm); |
| 2582 | + pm->SetGlobalNrow(global_nrow); |
| 2583 | + pm->SetGlobalNcol(global_nrow); |
| 2584 | + pm->SetLocalNrow(local_nrow); |
| 2585 | + pm->SetLocalNcol(local_nrow); |
| 2586 | + |
| 2587 | + if(nprocs > 1) |
| 2588 | + { |
| 2589 | + pm->SetBoundaryIndex(boundary_nnz, boundary.data()); |
| 2590 | + pm->SetReceivers(nrecv, recvs.data(), recv_index_offset.data()); |
| 2591 | + pm->SetSenders(nsend, sends.data(), send_index_offset.data()); |
| 2592 | + } |
| 2593 | + |
| 2594 | + mat->SetParallelManager(*pm); |
| 2595 | + |
| 2596 | + // Generate local and ghost matrices |
| 2597 | + local_nnz = int_csr_row_ptr[local_nrow]; |
| 2598 | + int64_t ghost_nnz = gst_csr_row_ptr[local_nrow]; |
| 2599 | + |
| 2600 | + int* int_csr_col_ind = NULL; |
| 2601 | + int* gst_csr_col_ind = NULL; |
| 2602 | + ValueType* int_csr_val = NULL; |
| 2603 | + ValueType* gst_csr_val = NULL; |
| 2604 | + |
| 2605 | + allocate_host(local_nnz, &int_csr_col_ind); |
| 2606 | + allocate_host(local_nnz, &int_csr_val); |
| 2607 | + allocate_host(ghost_nnz, &gst_csr_col_ind); |
| 2608 | + allocate_host(ghost_nnz, &gst_csr_val); |
| 2609 | + |
| 2610 | + // Convert global matrix columns to local columns |
| 2611 | + for(int i = 0; i < local_nrow; ++i) |
| 2612 | + { |
| 2613 | + PtrType local_idx = int_csr_row_ptr[i]; |
| 2614 | + PtrType ghost_idx = gst_csr_row_ptr[i]; |
| 2615 | + |
| 2616 | + int64_t global_row = local2global[i]; |
| 2617 | + |
| 2618 | + for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j) |
| 2619 | + { |
| 2620 | + int64_t global_col = global_csr_col_ind[j]; |
| 2621 | + |
| 2622 | + // Determine which process owns the vertex |
| 2623 | + int64_t idx_y = global_col / global_dimx; |
| 2624 | + int64_t idx_x = global_col % global_dimx; |
| 2625 | + |
| 2626 | + int idx_proc_y = idx_y / local_dimy; |
| 2627 | + int idx_proc_x = idx_x / local_dimx; |
| 2628 | + |
| 2629 | + int owner = idx_proc_x + idx_proc_y * nproc_x; |
| 2630 | + |
| 2631 | + // If we do not own it, we need to receive it from our neighbor |
| 2632 | + // and also send the current vertex to this neighbor |
| 2633 | + if(owner != rank) |
| 2634 | + { |
| 2635 | + // Store the global column and row id that we have to receive / send from / to a neighbor |
| 2636 | + // We need a set here to eliminate duplicates |
| 2637 | + recv_indices[owner].insert(global_col); |
| 2638 | + send_indices[owner].insert(global_row); |
| 2639 | + |
| 2640 | + gst_csr_col_ind[ghost_idx] = global2ghost[global_col]; |
| 2641 | + gst_csr_val[ghost_idx] = -1.0; |
| 2642 | + ++ghost_idx; |
| 2643 | + } |
| 2644 | + else |
| 2645 | + { |
| 2646 | + // This is our part |
| 2647 | + int_csr_col_ind[local_idx] = global2local[global_col]; |
| 2648 | + int_csr_val[local_idx] = (global_col == global_row) ? (nsten - 1.0) : -1.0; |
| 2649 | + ++local_idx; |
| 2650 | + } |
| 2651 | + } |
| 2652 | + } |
| 2653 | + |
| 2654 | + free_host(&global_csr_row_ptr); |
| 2655 | + free_host(&global_csr_col_ind); |
| 2656 | + free_host(&local2global); |
| 2657 | + |
| 2658 | + mat->SetLocalDataPtrCSR(&int_csr_row_ptr, &int_csr_col_ind, &int_csr_val, "mat", local_nnz); |
| 2659 | + mat->SetGhostDataPtrCSR(&gst_csr_row_ptr, &gst_csr_col_ind, &gst_csr_val, "gst", ghost_nnz); |
| 2660 | + mat->Sort(); |
| 2661 | +} |
| 2662 | + |
| 2663 | +template <typename ValueType> |
| 2664 | +void generate_3d_laplacian(int local_dimx, |
| 2665 | + int local_dimy, |
| 2666 | + int local_dimz, |
| 2667 | + const MPI_Comm* comm, |
| 2668 | + GlobalMatrix<ValueType>* mat, |
| 2669 | + ParallelManager* pm, |
| 2670 | + int rank, |
| 2671 | + int nprocs) |
| 2672 | +{ |
| 2673 | + // First, we need to determine process pattern for the unit cube |
| 2674 | + int nproc_x; |
| 2675 | + int nproc_y; |
| 2676 | + int nproc_z; |
| 2677 | + |
| 2678 | + compute_3d_process_distribution(nprocs, nproc_x, nproc_y, nproc_z); |
| 2679 | + |
| 2680 | + // Next, determine process index into the unit cube |
| 2681 | + int iproc_z = rank / (nproc_x * nproc_y); |
| 2682 | + int iproc_y = (rank - iproc_z * nproc_x * nproc_y) / nproc_x; |
| 2683 | + int iproc_x = rank % nproc_x; |
| 2684 | + |
| 2685 | + // Global sizes |
| 2686 | + int64_t global_dimx = static_cast<int64_t>(nproc_x) * local_dimx; |
| 2687 | + int64_t global_dimy = static_cast<int64_t>(nproc_y) * local_dimy; |
| 2688 | + int64_t global_dimz = static_cast<int64_t>(nproc_z) * local_dimz; |
| 2689 | + |
| 2690 | + // Global process entry points |
| 2691 | + int64_t global_iproc_x = iproc_x * local_dimx; |
| 2692 | + int64_t global_iproc_y = iproc_y * local_dimy; |
| 2693 | + int64_t global_iproc_z = iproc_z * local_dimz; |
| 2694 | + |
| 2695 | + // Number of rows (global and local) |
| 2696 | + int64_t local_nrow = local_dimx * local_dimy * local_dimz; |
| 2697 | + int64_t global_nrow = global_dimx * global_dimy * global_dimz; |
| 2698 | + |
| 2699 | + // Assemble local CSR matrix row offset pointers |
| 2700 | + std::vector<PtrType> global_csr_row_ptr(local_nrow + 1); |
| 2701 | + std::vector<int64_t> global_csr_col_ind(local_nrow * 27); |
| 2702 | + |
| 2703 | + std::vector<int64_t> local2global(local_nrow); |
| 2704 | + std::map<int64_t, int> global2local; |
| 2705 | + |
| 2706 | + PtrType nnz = 0; |
| 2707 | + global_csr_row_ptr[0] = 0; |
| 2708 | + |
| 2709 | + // Loop over z dimension |
| 2710 | + for(int local_z = 0; local_z < local_dimz; ++local_z) |
| 2711 | + { |
| 2712 | + // Global index into z |
| 2713 | + int64_t global_z = global_iproc_z + local_z; |
| 2714 | + |
| 2715 | + // Loop over y dimension |
| 2716 | + for(int local_y = 0; local_y < local_dimy; ++local_y) |
| 2717 | + { |
| 2718 | + // Global index into y |
| 2719 | + int64_t global_y = global_iproc_y + local_y; |
| 2720 | + |
| 2721 | + // Loop over x dimension |
| 2722 | + for(int local_x = 0; local_x < local_dimx; ++local_x) |
| 2723 | + { |
| 2724 | + // Global index into x |
| 2725 | + int64_t global_x = global_iproc_x + local_x; |
| 2726 | + |
| 2727 | + // Local row |
| 2728 | + int local_row = local_z * local_dimx * local_dimy + local_y * local_dimx + local_x; |
| 2729 | + |
| 2730 | + // Global row |
| 2731 | + int64_t global_row |
| 2732 | + = global_z * global_dimx * global_dimy + global_y * global_dimx + global_x; |
| 2733 | + |
| 2734 | + // Fill l2g and g2l map |
| 2735 | + local2global[local_row] = global_row; |
| 2736 | + global2local[global_row] = local_row; |
| 2737 | + |
| 2738 | + // Check if current z vertex is on the boundary |
| 2739 | + for(int bz = -1; bz <= 1; ++bz) |
| 2740 | + { |
| 2741 | + if(global_z + bz > -1 && global_z + bz < global_dimz) |
| 2742 | + { |
| 2743 | + // Check if current y vertex is on the boundary |
| 2744 | + for(int by = -1; by <= 1; ++by) |
| 2745 | + { |
| 2746 | + if(global_y + by > -1 && global_y + by < global_dimy) |
| 2747 | + { |
| 2748 | + // Check if current x vertex is on the boundary |
| 2749 | + for(int bx = -1; bx <= 1; ++bx) |
| 2750 | + { |
| 2751 | + if(global_x + bx > -1 && global_x + bx < global_dimx) |
| 2752 | + { |
| 2753 | + // Global column |
| 2754 | + int64_t global_col = global_row |
| 2755 | + + bz * global_dimx * global_dimy |
| 2756 | + + by * global_dimx + bx; |
| 2757 | + |
| 2758 | + // Fill global CSR column indices |
| 2759 | + global_csr_col_ind[nnz++] = global_col; |
| 2760 | + } |
| 2761 | + } |
| 2762 | + } |
| 2763 | + } |
| 2764 | + } |
| 2765 | + } |
| 2766 | + |
| 2767 | + global_csr_row_ptr[local_row + 1] = nnz; |
| 2768 | + } |
| 2769 | + } |
| 2770 | + } |
| 2771 | + |
| 2772 | + // Local number of non-zero entries - stored as a 64-bit integer (int64_t) so that it matches MPI_INT64_T in the reduction below |
| 2773 | + int64_t local_nnz = global_csr_row_ptr[local_nrow]; |
| 2774 | + |
| 2775 | + // Total number of non-zeros |
| 2776 | + int64_t global_nnz; |
| 2777 | + MPI_Allreduce(&local_nnz, &global_nnz, 1, MPI_INT64_T, MPI_SUM, *comm); |
| 2778 | + |
| 2779 | + // Now, we need to setup the communication pattern |
| 2780 | + std::map<int, std::set<int64_t>> recv_indices; |
| 2781 | + std::map<int, std::set<int64_t>> send_indices; |
| 2782 | + |
| 2783 | + // CSR matrix row pointers |
| 2784 | + PtrType* int_csr_row_ptr = NULL; |
| 2785 | + PtrType* gst_csr_row_ptr = NULL; |
| 2786 | + |
| 2787 | + allocate_host(local_nrow + 1, &int_csr_row_ptr); |
| 2788 | + allocate_host(local_nrow + 1, &gst_csr_row_ptr); |
| 2789 | + |
| 2790 | + int_csr_row_ptr[0] = 0; |
| 2791 | + gst_csr_row_ptr[0] = 0; |
| 2792 | + |
| 2793 | + // Determine which vertices need to be sent / received |
| 2794 | + for(int i = 0; i < local_nrow; ++i) |
| 2795 | + { |
| 2796 | + int_csr_row_ptr[i + 1] = int_csr_row_ptr[i]; |
| 2797 | + gst_csr_row_ptr[i + 1] = gst_csr_row_ptr[i]; |
| 2798 | + |
| 2799 | + int64_t global_row = local2global[i]; |
| 2800 | + |
| 2801 | + for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j) |
| 2802 | + { |
| 2803 | + int64_t global_col = global_csr_col_ind[j]; |
| 2804 | + |
| 2805 | + // Determine which process owns the vertex |
| 2806 | + int64_t idx_z = global_col / (global_dimx * global_dimy); |
| 2807 | + int64_t idx_y = (global_col - idx_z * global_dimy * global_dimx) / global_dimx; |
| 2808 | + int64_t idx_x = global_col % global_dimx; |
| 2809 | + |
| 2810 | + int idx_proc_z = idx_z / local_dimz; |
| 2811 | + int idx_proc_y = idx_y / local_dimy; |
| 2812 | + int idx_proc_x = idx_x / local_dimx; |
| 2813 | + |
| 2814 | + int owner = idx_proc_x + idx_proc_y * nproc_x + idx_proc_z * nproc_y * nproc_x; |
| 2815 | + |
| 2816 | + // If we do not own it, we need to receive it from our neighbor |
| 2817 | + // and also send the current vertex to this neighbor |
| 2818 | + if(owner != rank) |
| 2819 | + { |
| 2820 | + // Store the global column and row id that we have to receive / send from / to a neighbor |
| 2821 | + // We need a set here to eliminate duplicates |
| 2822 | + recv_indices[owner].insert(global_col); |
| 2823 | + send_indices[owner].insert(global_row); |
| 2824 | + |
| 2825 | + ++gst_csr_row_ptr[i + 1]; |
| 2826 | + } |
| 2827 | + else |
| 2828 | + { |
| 2829 | + ++int_csr_row_ptr[i + 1]; |
| 2830 | + } |
| 2831 | + } |
| 2832 | + } |
| 2833 | + |
| 2834 | + // Number of processes we communicate with |
| 2835 | + int nrecv = recv_indices.size(); |
| 2836 | + int nsend = send_indices.size(); |
| 2837 | + |
| 2838 | + // Process ids we communicate with |
| 2839 | + std::vector<int> recvs; |
| 2840 | + std::vector<int> sends; |
| 2841 | + |
| 2842 | + recvs.reserve(nrecv); |
| 2843 | + sends.reserve(nsend); |
| 2844 | + |
| 2845 | + // Index offsets for each neighbor |
| 2846 | + std::vector<int> recv_index_offset; |
| 2847 | + std::vector<int> send_index_offset; |
| 2848 | + |
| 2849 | + recv_index_offset.reserve(nrecv + 1); |
| 2850 | + send_index_offset.reserve(nsend + 1); |
| 2851 | + |
| 2852 | + recv_index_offset.push_back(0); |
| 2853 | + send_index_offset.push_back(0); |
| 2854 | + |
| 2855 | + int cnt = 0; |
| 2856 | + std::map<int64_t, int> global2ghost; |
| 2857 | + |
| 2858 | + // Go through the recv data |
| 2859 | + for(std::map<int, std::set<int64_t>>::iterator it = recv_indices.begin(); |
| 2860 | + it != recv_indices.end(); |
| 2861 | + ++it) |
| 2862 | + { |
| 2863 | + recvs.push_back(it->first); |
| 2864 | + recv_index_offset.push_back(it->second.size()); |
| 2865 | + |
| 2866 | + for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit) |
| 2867 | + { |
| 2868 | + global2ghost[*iit] = cnt++; |
| 2869 | + } |
| 2870 | + } |
| 2871 | + |
| 2872 | + // Go through the send data |
| 2873 | + int boundary_nnz = 0; |
| 2874 | + for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin(); |
| 2875 | + it != send_indices.end(); |
| 2876 | + ++it) |
| 2877 | + { |
| 2878 | + sends.push_back(it->first); |
| 2879 | + send_index_offset.push_back(it->second.size()); |
| 2880 | + boundary_nnz += it->second.size(); |
| 2881 | + } |
| 2882 | + |
| 2883 | + // Exclusive sum |
| 2884 | + for(int i = 0; i < nrecv; ++i) |
| 2885 | + { |
| 2886 | + recv_index_offset[i + 1] += recv_index_offset[i]; |
| 2887 | + } |
| 2888 | + |
| 2889 | + for(int i = 0; i < nsend; ++i) |
| 2890 | + { |
| 2891 | + send_index_offset[i + 1] += send_index_offset[i]; |
| 2892 | + } |
| 2893 | + |
| 2894 | + // Boundary indices |
| 2895 | + std::vector<int> boundary; |
| 2896 | + boundary.reserve(boundary_nnz); |
| 2897 | + |
| 2898 | + for(std::map<int, std::set<int64_t>>::iterator it = send_indices.begin(); |
| 2899 | + it != send_indices.end(); |
| 2900 | + ++it) |
| 2901 | + { |
| 2902 | + for(std::set<int64_t>::iterator iit = it->second.begin(); iit != it->second.end(); ++iit) |
| 2903 | + { |
| 2904 | + boundary.push_back(global2local[*iit]); |
| 2905 | + } |
| 2906 | + } |
| 2907 | + |
| 2908 | + // Initialize manager |
| 2909 | + pm->SetMPICommunicator(comm); |
| 2910 | + pm->SetGlobalNrow(global_nrow); |
| 2911 | + pm->SetGlobalNcol(global_nrow); |
| 2912 | + pm->SetLocalNrow(local_nrow); |
| 2913 | + pm->SetLocalNcol(local_nrow); |
| 2914 | + |
| 2915 | + if(nprocs > 1) |
| 2916 | + { |
| 2917 | + pm->SetBoundaryIndex(boundary_nnz, boundary.data()); |
| 2918 | + pm->SetReceivers(nrecv, recvs.data(), recv_index_offset.data()); |
| 2919 | + pm->SetSenders(nsend, sends.data(), send_index_offset.data()); |
| 2920 | + } |
| 2921 | + |
| 2922 | + mat->SetParallelManager(*pm); |
| 2923 | + |
| 2924 | + // Generate local and ghost matrices |
| 2925 | + local_nnz = int_csr_row_ptr[local_nrow]; |
| 2926 | + int64_t ghost_nnz = gst_csr_row_ptr[local_nrow]; |
| 2927 | + |
| 2928 | + int* int_csr_col_ind = NULL; |
| 2929 | + int* gst_csr_col_ind = NULL; |
| 2930 | + ValueType* int_csr_val = NULL; |
| 2931 | + ValueType* gst_csr_val = NULL; |
| 2932 | + |
| 2933 | + allocate_host(local_nnz, &int_csr_col_ind); |
| 2934 | + allocate_host(local_nnz, &int_csr_val); |
| 2935 | + allocate_host(ghost_nnz, &gst_csr_col_ind); |
| 2936 | + allocate_host(ghost_nnz, &gst_csr_val); |
| 2937 | + |
| 2938 | + // Convert global matrix columns to local columns |
| 2939 | + for(int i = 0; i < local_nrow; ++i) |
| 2940 | + { |
| 2941 | + PtrType local_idx = int_csr_row_ptr[i]; |
| 2942 | + PtrType ghost_idx = gst_csr_row_ptr[i]; |
| 2943 | + |
| 2944 | + int64_t global_row = local2global[i]; |
| 2945 | + |
| 2946 | + for(PtrType j = global_csr_row_ptr[i]; j < global_csr_row_ptr[i + 1]; ++j) |
| 2947 | + { |
| 2948 | + int64_t global_col = global_csr_col_ind[j]; |
| 2949 | + |
| 2950 | + // Determine which process owns the vertex |
| 2951 | + int64_t idx_z = global_col / (global_dimx * global_dimy); |
| 2952 | + int64_t idx_y = (global_col - idx_z * global_dimy * global_dimx) / global_dimx; |
| 2953 | + int64_t idx_x = global_col % global_dimx; |
| 2954 | + |
| 2955 | + int idx_proc_z = idx_z / local_dimz; |
| 2956 | + int idx_proc_y = idx_y / local_dimy; |
| 2957 | + int idx_proc_x = idx_x / local_dimx; |
| 2958 | + |
| 2959 | + int owner = idx_proc_x + idx_proc_y * nproc_x + idx_proc_z * nproc_y * nproc_x; |
| 2960 | + |
| 2961 | + // If we do not own it, we need to receive it from our neighbor |
| 2962 | + // and also send the current vertex to this neighbor |
| 2963 | + if(owner != rank) |
| 2964 | + { |
| 2965 | + // Store the global column and row id that we have to receive / send from / to a neighbor |
| 2966 | + // We need a set here to eliminate duplicates |
| 2967 | + recv_indices[owner].insert(global_col); |
| 2968 | + send_indices[owner].insert(global_row); |
| 2969 | + |
| 2970 | + gst_csr_col_ind[ghost_idx] = global2ghost[global_col]; |
| 2971 | + gst_csr_val[ghost_idx] = -1.0; |
| 2972 | + ++ghost_idx; |
| 2973 | + } |
| 2974 | + else |
| 2975 | + { |
| 2976 | + // This is our part |
| 2977 | + int_csr_col_ind[local_idx] = global2local[global_col]; |
| 2978 | + int_csr_val[local_idx] = (global_col == global_row) ? 26.0 : -1.0; |
| 2979 | + ++local_idx; |
| 2980 | + } |
| 2981 | + } |
| 2982 | + } |
| 2983 | + |
| 2984 | + mat->SetLocalDataPtrCSR(&int_csr_row_ptr, &int_csr_col_ind, &int_csr_val, "mat", local_nnz); |
| 2985 | + mat->SetGhostDataPtrCSR(&gst_csr_row_ptr, &gst_csr_col_ind, &gst_csr_val, "gst", ghost_nnz); |
| 2986 | + mat->Sort(); |
| 2987 | } |
| 2988 | diff --git a/clients/include/testing_bicgstab.hpp b/clients/include/testing_bicgstab.hpp |
| 2989 | index 924295c..dbee12e 100644 |
| 2990 | --- a/clients/include/testing_bicgstab.hpp |
| 2991 | +++ b/clients/include/testing_bicgstab.hpp |
| 2992 | @@ -1,5 +1,5 @@ |
| 2993 | /* ************************************************************************ |
| 2994 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 2995 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 2996 | * |
| 2997 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 2998 | * of this software and associated documentation files (the "Software"), to deal |
| 2999 | @@ -44,11 +44,13 @@ static bool check_residual(double res) |
| 3000 | template <typename T> |
| 3001 | bool testing_bicgstab(Arguments argus) |
| 3002 | { |
| 3003 | - int ndim = argus.size; |
| 3004 | - std::string precond = argus.precond; |
| 3005 | - unsigned int format = argus.format; |
| 3006 | + int ndim = argus.size; |
| 3007 | + std::string precond = argus.precond; |
| 3008 | + unsigned int format = argus.format; |
| 3009 | + bool disable_accelerator = !argus.use_acc; |
| 3010 | |
| 3011 | // Initialize rocALUTION platform |
| 3012 | + disable_accelerator_rocalution(disable_accelerator); |
| 3013 | set_device_rocalution(device); |
| 3014 | init_rocalution(); |
| 3015 | |
| 3016 | @@ -69,10 +71,13 @@ bool testing_bicgstab(Arguments argus) |
| 3017 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3018 | |
| 3019 | // Move data to accelerator |
| 3020 | - A.MoveToAccelerator(); |
| 3021 | - x.MoveToAccelerator(); |
| 3022 | - b.MoveToAccelerator(); |
| 3023 | - e.MoveToAccelerator(); |
| 3024 | + if(!disable_accelerator) |
| 3025 | + { |
| 3026 | + A.MoveToAccelerator(); |
| 3027 | + x.MoveToAccelerator(); |
| 3028 | + b.MoveToAccelerator(); |
| 3029 | + e.MoveToAccelerator(); |
| 3030 | + } |
| 3031 | |
| 3032 | // Allocate x, b and e |
| 3033 | x.Allocate("x", A.GetN()); |
| 3034 | @@ -124,6 +129,8 @@ bool testing_bicgstab(Arguments argus) |
| 3035 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3036 | else if(precond == "ILU") |
| 3037 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3038 | + else if(precond == "ItILU0") |
| 3039 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3040 | else if(precond == "ILUT") |
| 3041 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3042 | else if(precond == "IC") |
| 3043 | @@ -146,14 +153,20 @@ bool testing_bicgstab(Arguments argus) |
| 3044 | ls.SetPreconditioner(*p); |
| 3045 | } |
| 3046 | |
| 3047 | - ls.Init(1e-8, 0.0, 1e+8, 10000); |
| 3048 | + ls.Init(1e-8, 0.0, 1e+8, 0, 10000); |
| 3049 | + ls.RecordResidualHistory(); |
| 3050 | + auto n_iter = ls.GetIterationCount(); |
| 3051 | ls.Build(); |
| 3052 | |
| 3053 | // Matrix format |
| 3054 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3055 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3056 | |
| 3057 | ls.Solve(b, &x); |
| 3058 | |
| 3059 | + const std::string filename = get_temp_dir() + "test_recorded_history.txt"; |
| 3060 | + ls.RecordHistory(filename); |
| 3061 | + std::remove(filename.c_str()); |
| 3062 | + |
| 3063 | // Verify solution |
| 3064 | x.ScaleAdd(-1.0, e); |
| 3065 | T nrm2 = x.Norm(); |
| 3066 | @@ -169,6 +182,7 @@ bool testing_bicgstab(Arguments argus) |
| 3067 | |
| 3068 | // Stop rocALUTION platform |
| 3069 | stop_rocalution(); |
| 3070 | + disable_accelerator_rocalution(false); |
| 3071 | |
| 3072 | return success; |
| 3073 | } |
| 3074 | diff --git a/clients/include/testing_bicgstabl.hpp b/clients/include/testing_bicgstabl.hpp |
| 3075 | index 20f7c5d..dc4dfb3 100644 |
| 3076 | --- a/clients/include/testing_bicgstabl.hpp |
| 3077 | +++ b/clients/include/testing_bicgstabl.hpp |
| 3078 | @@ -1,5 +1,5 @@ |
| 3079 | /* ************************************************************************ |
| 3080 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3081 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3082 | * |
| 3083 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3084 | * of this software and associated documentation files (the "Software"), to deal |
| 3085 | @@ -44,12 +44,14 @@ static bool check_residual(double res) |
| 3086 | template <typename T> |
| 3087 | bool testing_bicgstabl(Arguments argus) |
| 3088 | { |
| 3089 | - int ndim = argus.size; |
| 3090 | - std::string precond = argus.precond; |
| 3091 | - unsigned int format = argus.format; |
| 3092 | - int l = argus.index; |
| 3093 | + int ndim = argus.size; |
| 3094 | + std::string precond = argus.precond; |
| 3095 | + unsigned int format = argus.format; |
| 3096 | + int l = argus.index; |
| 3097 | + bool disable_accelerator = !argus.use_acc; |
| 3098 | |
| 3099 | // Initialize rocALUTION platform |
| 3100 | + disable_accelerator_rocalution(disable_accelerator); |
| 3101 | set_device_rocalution(device); |
| 3102 | init_rocalution(); |
| 3103 | |
| 3104 | @@ -70,10 +72,13 @@ bool testing_bicgstabl(Arguments argus) |
| 3105 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3106 | |
| 3107 | // Move data to accelerator |
| 3108 | - A.MoveToAccelerator(); |
| 3109 | - x.MoveToAccelerator(); |
| 3110 | - b.MoveToAccelerator(); |
| 3111 | - e.MoveToAccelerator(); |
| 3112 | + if(!disable_accelerator) |
| 3113 | + { |
| 3114 | + A.MoveToAccelerator(); |
| 3115 | + x.MoveToAccelerator(); |
| 3116 | + b.MoveToAccelerator(); |
| 3117 | + e.MoveToAccelerator(); |
| 3118 | + } |
| 3119 | |
| 3120 | // Allocate x, b and e |
| 3121 | x.Allocate("x", A.GetN()); |
| 3122 | @@ -125,6 +130,8 @@ bool testing_bicgstabl(Arguments argus) |
| 3123 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3124 | else if(precond == "ILU") |
| 3125 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3126 | + else if(precond == "ItILU0") |
| 3127 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3128 | else if(precond == "ILUT") |
| 3129 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3130 | else if(precond == "IC") |
| 3131 | @@ -152,7 +159,7 @@ bool testing_bicgstabl(Arguments argus) |
| 3132 | ls.Build(); |
| 3133 | |
| 3134 | // Matrix format |
| 3135 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3136 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3137 | |
| 3138 | ls.Solve(b, &x); |
| 3139 | |
| 3140 | @@ -171,6 +178,7 @@ bool testing_bicgstabl(Arguments argus) |
| 3141 | |
| 3142 | // Stop rocALUTION platform |
| 3143 | stop_rocalution(); |
| 3144 | + disable_accelerator_rocalution(false); |
| 3145 | |
| 3146 | return success; |
| 3147 | } |
| 3148 | diff --git a/clients/include/testing_cg.hpp b/clients/include/testing_cg.hpp |
| 3149 | index 7c43512..b7404fd 100644 |
| 3150 | --- a/clients/include/testing_cg.hpp |
| 3151 | +++ b/clients/include/testing_cg.hpp |
| 3152 | @@ -1,5 +1,5 @@ |
| 3153 | /* ************************************************************************ |
| 3154 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3155 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3156 | * |
| 3157 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3158 | * of this software and associated documentation files (the "Software"), to deal |
| 3159 | @@ -44,11 +44,13 @@ static bool check_residual(double res) |
| 3160 | template <typename T> |
| 3161 | bool testing_cg(Arguments argus) |
| 3162 | { |
| 3163 | - int ndim = argus.size; |
| 3164 | - std::string precond = argus.precond; |
| 3165 | - unsigned int format = argus.format; |
| 3166 | + int ndim = argus.size; |
| 3167 | + std::string precond = argus.precond; |
| 3168 | + unsigned int format = argus.format; |
| 3169 | + bool disable_accelerator = !argus.use_acc; |
| 3170 | |
| 3171 | // Initialize rocALUTION platform |
| 3172 | + disable_accelerator_rocalution(disable_accelerator); |
| 3173 | set_device_rocalution(device); |
| 3174 | init_rocalution(); |
| 3175 | |
| 3176 | @@ -69,10 +71,13 @@ bool testing_cg(Arguments argus) |
| 3177 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3178 | |
| 3179 | // Move data to accelerator |
| 3180 | - A.MoveToAccelerator(); |
| 3181 | - x.MoveToAccelerator(); |
| 3182 | - b.MoveToAccelerator(); |
| 3183 | - e.MoveToAccelerator(); |
| 3184 | + if(!disable_accelerator) |
| 3185 | + { |
| 3186 | + A.MoveToAccelerator(); |
| 3187 | + x.MoveToAccelerator(); |
| 3188 | + b.MoveToAccelerator(); |
| 3189 | + e.MoveToAccelerator(); |
| 3190 | + } |
| 3191 | |
| 3192 | // Allocate x, b and e |
| 3193 | x.Allocate("x", A.GetN()); |
| 3194 | @@ -124,6 +129,8 @@ bool testing_cg(Arguments argus) |
| 3195 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3196 | else if(precond == "ILU") |
| 3197 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3198 | + else if(precond == "ItILU0") |
| 3199 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3200 | else if(precond == "ILUT") |
| 3201 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3202 | else if(precond == "IC") |
| 3203 | @@ -150,7 +157,7 @@ bool testing_cg(Arguments argus) |
| 3204 | ls.Build(); |
| 3205 | |
| 3206 | // Matrix format |
| 3207 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3208 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3209 | |
| 3210 | ls.Solve(b, &x); |
| 3211 | |
| 3212 | @@ -169,6 +176,7 @@ bool testing_cg(Arguments argus) |
| 3213 | |
| 3214 | // Stop rocALUTION platform |
| 3215 | stop_rocalution(); |
| 3216 | + disable_accelerator_rocalution(false); |
| 3217 | |
| 3218 | return success; |
| 3219 | } |
| 3220 | diff --git a/clients/include/testing_chebyshev.hpp b/clients/include/testing_chebyshev.hpp |
| 3221 | new file mode 100644 |
| 3222 | index 0000000..a252f24 |
| 3223 | --- /dev/null |
| 3224 | +++ b/clients/include/testing_chebyshev.hpp |
| 3225 | @@ -0,0 +1,218 @@ |
| 3226 | +/* ************************************************************************ |
| 3227 | + * Copyright (C) 2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3228 | + * |
| 3229 | + * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3230 | + * of this software and associated documentation files (the "Software"), to deal |
| 3231 | + * in the Software without restriction, including without limitation the rights |
| 3232 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 3233 | + * copies of the Software, and to permit persons to whom the Software is |
| 3234 | + * furnished to do so, subject to the following conditions: |
| 3235 | + * |
| 3236 | + * The above copyright notice and this permission notice shall be included in |
| 3237 | + * all copies or substantial portions of the Software. |
| 3238 | + * |
| 3239 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 3240 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 3241 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 3242 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 3243 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 3244 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 3245 | + * THE SOFTWARE. |
| 3246 | + * |
| 3247 | + * ************************************************************************ */ |
| 3248 | + |
| 3249 | +#pragma once |
| 3250 | + |
| 3251 | +#include "utility.hpp" |
| 3252 | + |
| 3253 | +#include <rocalution/rocalution.hpp> |
| 3254 | + |
| 3255 | +template <typename T> |
| 3256 | +bool testing_chebyshev(Arguments argus) |
| 3257 | +{ |
| 3258 | + using namespace rocalution; |
| 3259 | + |
| 3260 | + int ndim = argus.size; |
| 3261 | + std::string precond = argus.precond; |
| 3262 | + unsigned int format = argus.format; |
| 3263 | + std::string matrix_type = argus.matrix_type; |
| 3264 | + bool rebuildnumeric = argus.rebuildnumeric; |
| 3265 | + bool disable_accelerator = !argus.use_acc; |
| 3266 | + |
| 3267 | + // Initialize rocALUTION platform |
| 3268 | + disable_accelerator_rocalution(disable_accelerator); |
| 3269 | + set_device_rocalution(device); |
| 3270 | + init_rocalution(); |
| 3271 | + |
| 3272 | + // rocALUTION structures |
| 3273 | + LocalMatrix<T> A; |
| 3274 | + LocalVector<T> x; |
| 3275 | + LocalVector<T> b; |
| 3276 | + LocalVector<T> b_old; |
| 3277 | + LocalVector<T>* b_k; |
| 3278 | + LocalVector<T>* b_k1; |
| 3279 | + LocalVector<T>* b_tmp; |
| 3280 | + LocalVector<T> e; |
| 3281 | + LocalVector<T> rhs; |
| 3282 | + |
| 3283 | + // Generate A |
| 3284 | + int* csr_ptr = NULL; |
| 3285 | + int* csr_col = NULL; |
| 3286 | + T* csr_val = NULL; |
| 3287 | + |
| 3288 | + int nrow = 0; |
| 3289 | + int ncol = 0; |
| 3290 | + if(matrix_type == "Laplacian2D") |
| 3291 | + { |
| 3292 | + nrow = gen_2d_laplacian(ndim, &csr_ptr, &csr_col, &csr_val); |
| 3293 | + ncol = nrow; |
| 3294 | + } |
| 3295 | + else |
| 3296 | + { |
| 3297 | + stop_rocalution(); |
| 3298 | + disable_accelerator_rocalution(false); |
| 3299 | + return true; |
| 3300 | + } |
| 3301 | + int nnz = csr_ptr[nrow]; |
| 3302 | + |
| 3303 | + T* csr_val2 = NULL; |
| 3304 | + if(rebuildnumeric) |
| 3305 | + { |
| 3306 | + csr_val2 = new T[nnz]; |
| 3307 | + for(int i = 0; i < nnz; i++) |
| 3308 | + { |
| 3309 | + csr_val2[i] = csr_val[i]; |
| 3310 | + } |
| 3311 | + } |
| 3312 | + |
| 3313 | + A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3314 | + |
| 3315 | + // Move data to accelerator |
| 3316 | + if(!disable_accelerator) |
| 3317 | + { |
| 3318 | + A.MoveToAccelerator(); |
| 3319 | + x.MoveToAccelerator(); |
| 3320 | + rhs.MoveToAccelerator(); |
| 3321 | + e.MoveToAccelerator(); |
| 3322 | + } |
| 3323 | + |
| 3324 | + // Allocate x, b and e |
| 3325 | + x.Allocate("x", A.GetN()); |
| 3326 | + rhs.Allocate("b", A.GetM()); |
| 3327 | + e.Allocate("e", A.GetN()); |
| 3328 | + |
| 3329 | + T lambda_min; |
| 3330 | + T lambda_max; |
| 3331 | + |
| 3332 | + A.Gershgorin(lambda_min, lambda_max); |
| 3333 | + |
| 3334 | + // Chebyshev iteration |
| 3335 | + Chebyshev<LocalMatrix<T>, LocalVector<T>, T> ls; |
| 3336 | + |
| 3337 | + // Initialize rhs such that A 1 = rhs |
| 3338 | + e.Ones(); |
| 3339 | + A.Apply(e, &rhs); |
| 3340 | + |
| 3341 | + // Initial zero guess |
| 3342 | + x.Zeros(); |
| 3343 | + |
| 3344 | + // Preconditioner |
| 3345 | + Preconditioner<LocalMatrix<T>, LocalVector<T>, T>* p; |
| 3346 | + |
| 3347 | + if(precond == "None") |
| 3348 | + p = NULL; |
| 3349 | + else if(precond == "Chebyshev") |
| 3350 | + { |
| 3351 | + // Chebyshev preconditioner |
| 3352 | + |
| 3353 | + // Determine min and max eigenvalues |
| 3354 | + T lambda_min; |
| 3355 | + T lambda_max; |
| 3356 | + |
| 3357 | + A.Gershgorin(lambda_min, lambda_max); |
| 3358 | + |
| 3359 | + AIChebyshev<LocalMatrix<T>, LocalVector<T>, T>* cheb |
| 3360 | + = new AIChebyshev<LocalMatrix<T>, LocalVector<T>, T>; |
| 3361 | + cheb->Set(3, lambda_max / 7.0, lambda_max); |
| 3362 | + |
| 3363 | + p = cheb; |
| 3364 | + } |
| 3365 | + else if(precond == "FSAI") |
| 3366 | + p = new FSAI<LocalMatrix<T>, LocalVector<T>, T>; |
| 3367 | + else if(precond == "SPAI") |
| 3368 | + p = new SPAI<LocalMatrix<T>, LocalVector<T>, T>; |
| 3369 | + else if(precond == "TNS") |
| 3370 | + p = new TNS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3371 | + else if(precond == "Jacobi") |
| 3372 | + p = new Jacobi<LocalMatrix<T>, LocalVector<T>, T>; |
| 3373 | + else if(precond == "GS") |
| 3374 | + p = new GS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3375 | + else if(precond == "SGS") |
| 3376 | + p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3377 | + else if(precond == "ILU") |
| 3378 | + p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3379 | + else if(precond == "ItILU0") |
| 3380 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3381 | + else if(precond == "ILUT") |
| 3382 | + p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3383 | + else if(precond == "IC") |
| 3384 | + p = new IC<LocalMatrix<T>, LocalVector<T>, T>; |
| 3385 | + else if(precond == "MCGS") |
| 3386 | + p = new MultiColoredGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3387 | + else if(precond == "MCSGS") |
| 3388 | + p = new MultiColoredSGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3389 | + else if(precond == "MCILU") |
| 3390 | + p = new MultiColoredILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3391 | + else |
| 3392 | + return false; |
| 3393 | + |
| 3394 | + // Set solver operator |
| 3395 | + ls.SetOperator(A); |
| 3396 | + |
| 3397 | + ls.Verbose(0); |
| 3398 | + ls.SetOperator(A); |
| 3399 | + |
| 3400 | + // Set preconditioner |
| 3401 | + if(p != NULL) |
| 3402 | + { |
| 3403 | + ls.SetPreconditioner(*p); |
| 3404 | + } |
| 3405 | + |
| 3406 | + // Set eigenvalues |
| 3407 | + ls.Set(lambda_min, lambda_max); |
| 3408 | + |
| 3409 | + // Build solver |
| 3410 | + ls.Build(); |
| 3411 | + |
| 3412 | + if(rebuildnumeric) |
| 3413 | + { |
| 3414 | + A.UpdateValuesCSR(csr_val2); |
| 3415 | + delete[] csr_val2; |
| 3416 | + |
| 3417 | + A.Apply(e, &rhs); |
| 3418 | + |
| 3419 | + ls.ReBuildNumeric(); |
| 3420 | + ls.Set(lambda_min, lambda_max); |
| 3421 | + } |
| 3422 | + |
| 3423 | + // Solve A x = rhs |
| 3424 | + ls.Solve(rhs, &x); |
| 3425 | + |
| 3426 | + // Clear solver |
| 3427 | + ls.Clear(); |
| 3428 | + if(p != NULL) |
| 3429 | + { |
| 3430 | + delete p; |
| 3431 | + } |
| 3432 | + |
| 3433 | + // Compute error L2 norm |
| 3434 | + e.ScaleAdd(-1.0, x); |
| 3435 | + T error = e.Norm(); |
| 3436 | + std::cout << "Chebyshev iteration ||e - x||_2 = " << error << std::endl; |
| 3437 | + |
| 3438 | + // Stop rocALUTION platform |
| 3439 | + stop_rocalution(); |
| 3440 | + disable_accelerator_rocalution(false); |
| 3441 | + |
| 3442 | + return true; |
| 3443 | +} |
| 3444 | \ No newline at end of file |
| 3445 | diff --git a/clients/include/testing_cr.hpp b/clients/include/testing_cr.hpp |
| 3446 | index 41658b6..3497337 100644 |
| 3447 | --- a/clients/include/testing_cr.hpp |
| 3448 | +++ b/clients/include/testing_cr.hpp |
| 3449 | @@ -1,5 +1,5 @@ |
| 3450 | /* ************************************************************************ |
| 3451 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3452 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3453 | * |
| 3454 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3455 | * of this software and associated documentation files (the "Software"), to deal |
| 3456 | @@ -44,11 +44,13 @@ static bool check_residual(double res) |
| 3457 | template <typename T> |
| 3458 | bool testing_cr(Arguments argus) |
| 3459 | { |
| 3460 | - int ndim = argus.size; |
| 3461 | - std::string precond = argus.precond; |
| 3462 | - unsigned int format = argus.format; |
| 3463 | + int ndim = argus.size; |
| 3464 | + std::string precond = argus.precond; |
| 3465 | + unsigned int format = argus.format; |
| 3466 | + bool disable_accelerator = !argus.use_acc; |
| 3467 | |
| 3468 | // Initialize rocALUTION platform |
| 3469 | + disable_accelerator_rocalution(disable_accelerator); |
| 3470 | set_device_rocalution(device); |
| 3471 | init_rocalution(); |
| 3472 | |
| 3473 | @@ -69,10 +71,13 @@ bool testing_cr(Arguments argus) |
| 3474 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3475 | |
| 3476 | // Move data to accelerator |
| 3477 | - A.MoveToAccelerator(); |
| 3478 | - x.MoveToAccelerator(); |
| 3479 | - b.MoveToAccelerator(); |
| 3480 | - e.MoveToAccelerator(); |
| 3481 | + if(!disable_accelerator) |
| 3482 | + { |
| 3483 | + A.MoveToAccelerator(); |
| 3484 | + x.MoveToAccelerator(); |
| 3485 | + b.MoveToAccelerator(); |
| 3486 | + e.MoveToAccelerator(); |
| 3487 | + } |
| 3488 | |
| 3489 | // Allocate x, b and e |
| 3490 | x.Allocate("x", A.GetN()); |
| 3491 | @@ -124,6 +129,8 @@ bool testing_cr(Arguments argus) |
| 3492 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3493 | else if(precond == "ILU") |
| 3494 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3495 | + else if(precond == "ItILU0") |
| 3496 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3497 | else if(precond == "ILUT") |
| 3498 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3499 | else if(precond == "IC") |
| 3500 | @@ -150,7 +157,7 @@ bool testing_cr(Arguments argus) |
| 3501 | ls.Build(); |
| 3502 | |
| 3503 | // Matrix format |
| 3504 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3505 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3506 | |
| 3507 | ls.Solve(b, &x); |
| 3508 | |
| 3509 | @@ -169,6 +176,7 @@ bool testing_cr(Arguments argus) |
| 3510 | |
| 3511 | // Stop rocALUTION platform |
| 3512 | stop_rocalution(); |
| 3513 | + disable_accelerator_rocalution(false); |
| 3514 | |
| 3515 | return success; |
| 3516 | } |
| 3517 | diff --git a/clients/include/testing_fcg.hpp b/clients/include/testing_fcg.hpp |
| 3518 | index 57e37bf..9aade14 100644 |
| 3519 | --- a/clients/include/testing_fcg.hpp |
| 3520 | +++ b/clients/include/testing_fcg.hpp |
| 3521 | @@ -1,5 +1,5 @@ |
| 3522 | /* ************************************************************************ |
| 3523 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3524 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3525 | * |
| 3526 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3527 | * of this software and associated documentation files (the "Software"), to deal |
| 3528 | @@ -44,11 +44,13 @@ static bool check_residual(double res) |
| 3529 | template <typename T> |
| 3530 | bool testing_fcg(Arguments argus) |
| 3531 | { |
| 3532 | - int ndim = argus.size; |
| 3533 | - std::string precond = argus.precond; |
| 3534 | - unsigned int format = argus.format; |
| 3535 | + int ndim = argus.size; |
| 3536 | + std::string precond = argus.precond; |
| 3537 | + unsigned int format = argus.format; |
| 3538 | + bool disable_accelerator = !argus.use_acc; |
| 3539 | |
| 3540 | // Initialize rocALUTION platform |
| 3541 | + disable_accelerator_rocalution(disable_accelerator); |
| 3542 | set_device_rocalution(device); |
| 3543 | init_rocalution(); |
| 3544 | |
| 3545 | @@ -69,10 +71,13 @@ bool testing_fcg(Arguments argus) |
| 3546 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3547 | |
| 3548 | // Move data to accelerator |
| 3549 | - A.MoveToAccelerator(); |
| 3550 | - x.MoveToAccelerator(); |
| 3551 | - b.MoveToAccelerator(); |
| 3552 | - e.MoveToAccelerator(); |
| 3553 | + if(!disable_accelerator) |
| 3554 | + { |
| 3555 | + A.MoveToAccelerator(); |
| 3556 | + x.MoveToAccelerator(); |
| 3557 | + b.MoveToAccelerator(); |
| 3558 | + e.MoveToAccelerator(); |
| 3559 | + } |
| 3560 | |
| 3561 | // Allocate x, b and e |
| 3562 | x.Allocate("x", A.GetN()); |
| 3563 | @@ -124,6 +129,8 @@ bool testing_fcg(Arguments argus) |
| 3564 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3565 | else if(precond == "ILU") |
| 3566 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3567 | + else if(precond == "ItILU0") |
| 3568 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3569 | else if(precond == "ILUT") |
| 3570 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3571 | else if(precond == "IC") |
| 3572 | @@ -150,7 +157,7 @@ bool testing_fcg(Arguments argus) |
| 3573 | ls.Build(); |
| 3574 | |
| 3575 | // Matrix format |
| 3576 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3577 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3578 | |
| 3579 | ls.Solve(b, &x); |
| 3580 | |
| 3581 | @@ -169,6 +176,7 @@ bool testing_fcg(Arguments argus) |
| 3582 | |
| 3583 | // Stop rocALUTION platform |
| 3584 | stop_rocalution(); |
| 3585 | + disable_accelerator_rocalution(false); |
| 3586 | |
| 3587 | return success; |
| 3588 | } |
| 3589 | diff --git a/clients/include/testing_fgmres.hpp b/clients/include/testing_fgmres.hpp |
| 3590 | index c92f5af..cd39930 100644 |
| 3591 | --- a/clients/include/testing_fgmres.hpp |
| 3592 | +++ b/clients/include/testing_fgmres.hpp |
| 3593 | @@ -1,5 +1,5 @@ |
| 3594 | /* ************************************************************************ |
| 3595 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3596 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3597 | * |
| 3598 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3599 | * of this software and associated documentation files (the "Software"), to deal |
| 3600 | @@ -34,12 +34,14 @@ using namespace rocalution; |
| 3601 | template <typename T> |
| 3602 | bool testing_fgmres(Arguments argus) |
| 3603 | { |
| 3604 | - int ndim = argus.size; |
| 3605 | - int basis = argus.index; |
| 3606 | - std::string precond = argus.precond; |
| 3607 | - unsigned int format = argus.format; |
| 3608 | + int ndim = argus.size; |
| 3609 | + int basis = argus.index; |
| 3610 | + std::string precond = argus.precond; |
| 3611 | + unsigned int format = argus.format; |
| 3612 | + bool disable_accelerator = !argus.use_acc; |
| 3613 | |
| 3614 | // Initialize rocALUTION platform |
| 3615 | + disable_accelerator_rocalution(disable_accelerator); |
| 3616 | set_device_rocalution(device); |
| 3617 | init_rocalution(); |
| 3618 | |
| 3619 | @@ -60,10 +62,13 @@ bool testing_fgmres(Arguments argus) |
| 3620 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3621 | |
| 3622 | // Move data to accelerator |
| 3623 | - A.MoveToAccelerator(); |
| 3624 | - x.MoveToAccelerator(); |
| 3625 | - b.MoveToAccelerator(); |
| 3626 | - e.MoveToAccelerator(); |
| 3627 | + if(!disable_accelerator) |
| 3628 | + { |
| 3629 | + A.MoveToAccelerator(); |
| 3630 | + x.MoveToAccelerator(); |
| 3631 | + b.MoveToAccelerator(); |
| 3632 | + e.MoveToAccelerator(); |
| 3633 | + } |
| 3634 | |
| 3635 | // Allocate x, b and e |
| 3636 | x.Allocate("x", A.GetN()); |
| 3637 | @@ -115,6 +120,8 @@ bool testing_fgmres(Arguments argus) |
| 3638 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3639 | else if(precond == "ILU") |
| 3640 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3641 | + else if(precond == "ItILU0") |
| 3642 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3643 | else if(precond == "ILUT") |
| 3644 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3645 | else if(precond == "IC") |
| 3646 | @@ -143,7 +150,7 @@ bool testing_fgmres(Arguments argus) |
| 3647 | ls.Build(); |
| 3648 | |
| 3649 | // Matrix format |
| 3650 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3651 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3652 | |
| 3653 | ls.Solve(b, &x); |
| 3654 | |
| 3655 | @@ -162,6 +169,7 @@ bool testing_fgmres(Arguments argus) |
| 3656 | |
| 3657 | // Stop rocALUTION platform |
| 3658 | stop_rocalution(); |
| 3659 | + disable_accelerator_rocalution(false); |
| 3660 | |
| 3661 | return success; |
| 3662 | } |
| 3663 | diff --git a/clients/include/testing_global_matrix.hpp b/clients/include/testing_global_matrix.hpp |
| 3664 | index de9d7d2..439d789 100644 |
| 3665 | --- a/clients/include/testing_global_matrix.hpp |
| 3666 | +++ b/clients/include/testing_global_matrix.hpp |
| 3667 | @@ -1,5 +1,5 @@ |
| 3668 | /* ************************************************************************ |
| 3669 | - * Copyright (C) 2018-2020 Advanced Micro Devices, Inc. All rights Reserved. |
| 3670 | + * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. |
| 3671 | * |
| 3672 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3673 | * of this software and associated documentation files (the "Software"), to deal |
| 3674 | @@ -355,17 +355,13 @@ void testing_global_matrix_bad_args(void) |
| 3675 | ParallelManager pm; |
| 3676 | LocalVector<int> lvint; |
| 3677 | GlobalMatrix<T>* null_mat = nullptr; |
| 3678 | - ParallelManager* null_pm = nullptr; |
| 3679 | int* null_int = nullptr; |
| 3680 | - ASSERT_DEATH(mat.CoarsenOperator( |
| 3681 | - null_mat, &pm, safe_size, safe_size, lvint, safe_size, idata, safe_size), |
| 3682 | - ".*Assertion.*Ac != (NULL|__null)*"); |
| 3683 | - ASSERT_DEATH(mat.CoarsenOperator( |
| 3684 | - &Ac, null_pm, safe_size, safe_size, lvint, safe_size, idata, safe_size), |
| 3685 | - ".*Assertion.*pm != (NULL|__null)*"); |
| 3686 | - ASSERT_DEATH(mat.CoarsenOperator( |
| 3687 | - &Ac, &pm, safe_size, safe_size, lvint, safe_size, null_int, safe_size), |
| 3688 | - ".*Assertion.*rG != (NULL|__null)*"); |
| 3689 | + ASSERT_DEATH( |
| 3690 | + mat.CoarsenOperator(null_mat, safe_size, safe_size, lvint, safe_size, idata, safe_size), |
| 3691 | + ".*Assertion.*Ac != (NULL|__null)*"); |
| 3692 | + ASSERT_DEATH( |
| 3693 | + mat.CoarsenOperator(&Ac, safe_size, safe_size, lvint, safe_size, null_int, safe_size), |
| 3694 | + ".*Assertion.*rG != (NULL|__null)*"); |
| 3695 | } |
| 3696 | |
| 3697 | free_host(&idata); |
| 3698 | diff --git a/clients/include/testing_gmres.hpp b/clients/include/testing_gmres.hpp |
| 3699 | index 2d875b8..1790cb9 100644 |
| 3700 | --- a/clients/include/testing_gmres.hpp |
| 3701 | +++ b/clients/include/testing_gmres.hpp |
| 3702 | @@ -1,5 +1,5 @@ |
| 3703 | /* ************************************************************************ |
| 3704 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3705 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3706 | * |
| 3707 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3708 | * of this software and associated documentation files (the "Software"), to deal |
| 3709 | @@ -34,13 +34,15 @@ using namespace rocalution; |
| 3710 | template <typename T> |
| 3711 | bool testing_gmres(Arguments argus, bool expectConvergence = true) |
| 3712 | { |
| 3713 | - int ndim = argus.size; |
| 3714 | - int basis = argus.index; |
| 3715 | - std::string matrix = argus.matrix; |
| 3716 | - std::string precond = argus.precond; |
| 3717 | - unsigned int format = argus.format; |
| 3718 | + int ndim = argus.size; |
| 3719 | + int basis = argus.index; |
| 3720 | + std::string matrix = argus.matrix; |
| 3721 | + std::string precond = argus.precond; |
| 3722 | + unsigned int format = argus.format; |
| 3723 | + bool disable_accelerator = !argus.use_acc; |
| 3724 | |
| 3725 | // Initialize rocALUTION platform |
| 3726 | + disable_accelerator_rocalution(disable_accelerator); |
| 3727 | set_device_rocalution(device); |
| 3728 | init_rocalution(); |
| 3729 | |
| 3730 | @@ -68,10 +70,13 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true) |
| 3731 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3732 | |
| 3733 | // Move data to accelerator |
| 3734 | - A.MoveToAccelerator(); |
| 3735 | - x.MoveToAccelerator(); |
| 3736 | - b.MoveToAccelerator(); |
| 3737 | - e.MoveToAccelerator(); |
| 3738 | + if(!disable_accelerator) |
| 3739 | + { |
| 3740 | + A.MoveToAccelerator(); |
| 3741 | + x.MoveToAccelerator(); |
| 3742 | + b.MoveToAccelerator(); |
| 3743 | + e.MoveToAccelerator(); |
| 3744 | + } |
| 3745 | |
| 3746 | // Allocate x, b and e |
| 3747 | x.Allocate("x", A.GetN()); |
| 3748 | @@ -123,6 +128,8 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true) |
| 3749 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3750 | else if(precond == "ILU") |
| 3751 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3752 | + else if(precond == "ItILU0") |
| 3753 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3754 | else if(precond == "ILUT") |
| 3755 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3756 | else if(precond == "IC") |
| 3757 | @@ -151,7 +158,7 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true) |
| 3758 | ls.Build(); |
| 3759 | |
| 3760 | // Matrix format |
| 3761 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3762 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3763 | |
| 3764 | ls.Solve(b, &x); |
| 3765 | |
| 3766 | @@ -170,6 +177,7 @@ bool testing_gmres(Arguments argus, bool expectConvergence = true) |
| 3767 | |
| 3768 | // Stop rocALUTION platform |
| 3769 | stop_rocalution(); |
| 3770 | + disable_accelerator_rocalution(false); |
| 3771 | |
| 3772 | return success; |
| 3773 | } |
| 3774 | diff --git a/clients/include/testing_idr.hpp b/clients/include/testing_idr.hpp |
| 3775 | index 000b8af..35efa3a 100644 |
| 3776 | --- a/clients/include/testing_idr.hpp |
| 3777 | +++ b/clients/include/testing_idr.hpp |
| 3778 | @@ -1,5 +1,5 @@ |
| 3779 | /* ************************************************************************ |
| 3780 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3781 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3782 | * |
| 3783 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3784 | * of this software and associated documentation files (the "Software"), to deal |
| 3785 | @@ -44,12 +44,14 @@ static bool check_residual(double res) |
| 3786 | template <typename T> |
| 3787 | bool testing_idr(Arguments argus) |
| 3788 | { |
| 3789 | - int ndim = argus.size; |
| 3790 | - std::string precond = argus.precond; |
| 3791 | - unsigned int format = argus.format; |
| 3792 | - int l = argus.index; |
| 3793 | + int ndim = argus.size; |
| 3794 | + std::string precond = argus.precond; |
| 3795 | + unsigned int format = argus.format; |
| 3796 | + int l = argus.index; |
| 3797 | + bool disable_accelerator = !argus.use_acc; |
| 3798 | |
| 3799 | // Initialize rocALUTION platform |
| 3800 | + disable_accelerator_rocalution(disable_accelerator); |
| 3801 | set_device_rocalution(device); |
| 3802 | init_rocalution(); |
| 3803 | |
| 3804 | @@ -70,10 +72,13 @@ bool testing_idr(Arguments argus) |
| 3805 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3806 | |
| 3807 | // Move data to accelerator |
| 3808 | - A.MoveToAccelerator(); |
| 3809 | - x.MoveToAccelerator(); |
| 3810 | - b.MoveToAccelerator(); |
| 3811 | - e.MoveToAccelerator(); |
| 3812 | + if(!disable_accelerator) |
| 3813 | + { |
| 3814 | + A.MoveToAccelerator(); |
| 3815 | + x.MoveToAccelerator(); |
| 3816 | + b.MoveToAccelerator(); |
| 3817 | + e.MoveToAccelerator(); |
| 3818 | + } |
| 3819 | |
| 3820 | // Allocate x, b and e |
| 3821 | x.Allocate("x", A.GetN()); |
| 3822 | @@ -85,7 +90,7 @@ bool testing_idr(Arguments argus) |
| 3823 | A.Apply(e, &b); |
| 3824 | |
| 3825 | // Random initial guess |
| 3826 | - x.SetRandomUniform(12345ULL, -4.0, 6.0); |
| 3827 | + x.SetRandomUniform(123456ULL, -4.0, 6.0); |
| 3828 | |
| 3829 | // Solver |
| 3830 | IDR<LocalMatrix<T>, LocalVector<T>, T> ls; |
| 3831 | @@ -125,6 +130,8 @@ bool testing_idr(Arguments argus) |
| 3832 | p = new SGS<LocalMatrix<T>, LocalVector<T>, T>; |
| 3833 | else if(precond == "ILU") |
| 3834 | p = new ILU<LocalMatrix<T>, LocalVector<T>, T>; |
| 3835 | + else if(precond == "ItILU0") |
| 3836 | + p = new ItILU0<LocalMatrix<T>, LocalVector<T>, T>; |
| 3837 | else if(precond == "ILUT") |
| 3838 | p = new ILUT<LocalMatrix<T>, LocalVector<T>, T>; |
| 3839 | else if(precond == "IC") |
| 3840 | @@ -153,7 +160,7 @@ bool testing_idr(Arguments argus) |
| 3841 | ls.Build(); |
| 3842 | |
| 3843 | // Matrix format |
| 3844 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3845 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3846 | |
| 3847 | ls.Solve(b, &x); |
| 3848 | |
| 3849 | @@ -172,6 +179,7 @@ bool testing_idr(Arguments argus) |
| 3850 | |
| 3851 | // Stop rocALUTION platform |
| 3852 | stop_rocalution(); |
| 3853 | + disable_accelerator_rocalution(false); |
| 3854 | |
| 3855 | return success; |
| 3856 | } |
| 3857 | diff --git a/clients/include/testing_inversion.hpp b/clients/include/testing_inversion.hpp |
| 3858 | index 65cf850..a328af9 100644 |
| 3859 | --- a/clients/include/testing_inversion.hpp |
| 3860 | +++ b/clients/include/testing_inversion.hpp |
| 3861 | @@ -1,5 +1,5 @@ |
| 3862 | /* ************************************************************************ |
| 3863 | - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 3864 | + * Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3865 | * |
| 3866 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3867 | * of this software and associated documentation files (the "Software"), to deal |
| 3868 | @@ -44,9 +44,10 @@ static bool check_residual(double res) |
| 3869 | template <typename T> |
| 3870 | bool testing_inversion(Arguments argus) |
| 3871 | { |
| 3872 | - int ndim = argus.size; |
| 3873 | - unsigned int format = argus.format; |
| 3874 | - std::string matrix_type = argus.matrix_type; |
| 3875 | + int ndim = argus.size; |
| 3876 | + unsigned int format = argus.format; |
| 3877 | + std::string matrix_type = argus.matrix_type; |
| 3878 | + const bool use_host_and_acc = argus.use_acc; |
| 3879 | |
| 3880 | // Initialize rocALUTION platform |
| 3881 | set_device_rocalution(device); |
| 3882 | @@ -83,12 +84,6 @@ bool testing_inversion(Arguments argus) |
| 3883 | |
| 3884 | A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 3885 | |
| 3886 | - // Move data to accelerator |
| 3887 | - A.MoveToAccelerator(); |
| 3888 | - x.MoveToAccelerator(); |
| 3889 | - b.MoveToAccelerator(); |
| 3890 | - e.MoveToAccelerator(); |
| 3891 | - |
| 3892 | // Allocate x, b and e |
| 3893 | x.Allocate("x", A.GetN()); |
| 3894 | b.Allocate("b", A.GetM()); |
| 3895 | @@ -110,15 +105,39 @@ bool testing_inversion(Arguments argus) |
| 3896 | dls.Build(); |
| 3897 | |
| 3898 | // Matrix format |
| 3899 | - A.ConvertTo(format, format == BCSR ? 3 : 1); |
| 3900 | + A.ConvertTo(format, format == BCSR ? argus.blockdim : 1); |
| 3901 | + |
| 3902 | + // Move data to accelerator |
| 3903 | + dls.MoveToAccelerator(); |
| 3904 | + A.MoveToAccelerator(); |
| 3905 | + x.MoveToAccelerator(); |
| 3906 | + b.MoveToAccelerator(); |
| 3907 | + e.MoveToAccelerator(); |
| 3908 | |
| 3909 | dls.Solve(b, &x); |
| 3910 | |
| 3911 | // Verify solution |
| 3912 | x.ScaleAdd(-1.0, e); |
| 3913 | - T nrm2 = x.Norm(); |
| 3914 | + T nrm2_acc = x.Norm(); |
| 3915 | |
| 3916 | - bool success = check_residual(nrm2); |
| 3917 | + bool success = check_residual(nrm2_acc); |
| 3918 | + |
| 3919 | + if(use_host_and_acc) |
| 3920 | + { |
| 3921 | + dls.MoveToHost(); |
| 3922 | + A.MoveToHost(); |
| 3923 | + x.MoveToHost(); |
| 3924 | + e.MoveToHost(); |
| 3925 | + b.MoveToHost(); |
| 3926 | + |
| 3927 | + dls.Solve(b, &x); |
| 3928 | + |
| 3929 | + // Verify solution |
| 3930 | + x.ScaleAdd(-1.0, e); |
| 3931 | + T nrm2_host = x.Norm(); |
| 3932 | + |
| 3933 | + success = success && check_residual(nrm2_host); |
| 3934 | + } |
| 3935 | |
| 3936 | // Clean up |
| 3937 | dls.Clear(); |
| 3938 | diff --git a/clients/include/testing_itsolver.hpp b/clients/include/testing_itsolver.hpp |
| 3939 | new file mode 100644 |
| 3940 | index 0000000..7d76972 |
| 3941 | --- /dev/null |
| 3942 | +++ b/clients/include/testing_itsolver.hpp |
| 3943 | @@ -0,0 +1,165 @@ |
| 3944 | +/* ************************************************************************ |
| 3945 | + * Copyright (C) 2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 3946 | + * |
| 3947 | + * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 3948 | + * of this software and associated documentation files (the "Software"), to deal |
| 3949 | + * in the Software without restriction, including without limitation the rights |
| 3950 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 3951 | + * copies of the Software, and to permit persons to whom the Software is |
| 3952 | + * furnished to do so, subject to the following conditions: |
| 3953 | + * |
| 3954 | + * The above copyright notice and this permission notice shall be included in |
| 3955 | + * all copies or substantial portions of the Software. |
| 3956 | + * |
| 3957 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 3958 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 3959 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 3960 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 3961 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 3962 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 3963 | + * THE SOFTWARE. |
| 3964 | + * |
| 3965 | + * ************************************************************************ */ |
| 3966 | + |
| 3967 | +#pragma once |
| 3968 | + |
| 3969 | +#include "utility.hpp" |
| 3970 | + |
| 3971 | +#include <rocalution/rocalution.hpp> |
| 3972 | + |
| 3973 | +template <typename T> |
| 3974 | +bool testing_itsolver(Arguments argus) |
| 3975 | +{ |
| 3976 | + using namespace rocalution; |
| 3977 | + |
| 3978 | + int ndim = argus.size; |
| 3979 | + unsigned int format = argus.format; |
| 3980 | + std::string matrix_type = argus.matrix_type; |
| 3981 | + bool disable_accelerator = !argus.use_acc; |
| 3982 | + |
| 3983 | + // Initialize rocALUTION platform |
| 3984 | + disable_accelerator_rocalution(disable_accelerator); |
| 3985 | + set_device_rocalution(device); |
| 3986 | + init_rocalution(); |
| 3987 | + |
| 3988 | + // rocALUTION structures |
| 3989 | + LocalMatrix<T> A; |
| 3990 | + LocalVector<T> x; |
| 3991 | + LocalVector<T> b; |
| 3992 | + LocalVector<T> e; |
| 3993 | + |
| 3994 | + // Generate A |
| 3995 | + int* csr_ptr = NULL; |
| 3996 | + int* csr_col = NULL; |
| 3997 | + T* csr_val = NULL; |
| 3998 | + |
| 3999 | + int nrow = 0; |
| 4000 | + int ncol = 0; |
| 4001 | + if(matrix_type == "Laplacian2D") |
| 4002 | + { |
| 4003 | + nrow = gen_2d_laplacian(ndim, &csr_ptr, &csr_col, &csr_val); |
| 4004 | + ncol = nrow; |
| 4005 | + } |
| 4006 | + else if(matrix_type == "PermutedIdentity") |
| 4007 | + { |
| 4008 | + nrow = gen_permuted_identity(ndim, &csr_ptr, &csr_col, &csr_val); |
| 4009 | + ncol = nrow; |
| 4010 | + } |
| 4011 | + else |
| 4012 | + { |
| 4013 | + stop_rocalution(); |
| 4014 | + disable_accelerator_rocalution(false); |
| 4015 | + return true; |
| 4016 | + } |
| 4017 | + int nnz = csr_ptr[nrow]; |
| 4018 | + |
| 4019 | + A.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "A", nnz, nrow, nrow); |
| 4020 | + |
| 4021 | + // Move data to accelerator |
| 4022 | + if(!disable_accelerator) |
| 4023 | + { |
| 4024 | + A.MoveToAccelerator(); |
| 4025 | + x.MoveToAccelerator(); |
| 4026 | + b.MoveToAccelerator(); |
| 4027 | + e.MoveToAccelerator(); |
| 4028 | + } |
| 4029 | + |
| 4030 | + // Allocate x, b and e |
| 4031 | + x.Allocate("x", A.GetN()); |
| 4032 | + b.Allocate("b", A.GetM()); |
| 4033 | + e.Allocate("e", A.GetN()); |
| 4034 | + |
| 4035 | + // Linear Solver |
| 4036 | + FixedPoint<LocalMatrix<T>, LocalVector<T>, T> fp; |
| 4037 | + |
| 4038 | + // Preconditioner |
| 4039 | + ItILU0<LocalMatrix<T>, LocalVector<T>, T> p; |
| 4040 | + |
| 4041 | + // Set iterative ILU stopping criteria |
| 4042 | + p.SetTolerance(1e-8); |
| 4043 | + p.SetMaxIter(50); |
| 4044 | + |
| 4045 | + p.SetAlgorithm(ItILU0Algorithm::SyncSplit); |
| 4046 | + |
| 4047 | + // Set up iterative triangular solve |
| 4048 | + SolverDescr descr; |
| 4049 | + descr.SetTriSolverAlg(TriSolverAlg_Iterative); |
| 4050 | + descr.SetIterativeSolverMaxIteration(30); |
| 4051 | + descr.SetIterativeSolverTolerance(1e-8); |
| 4052 | + |
| 4053 | + descr.DisableIterativeSolverTolerance(); |
| 4054 | + descr.EnableIterativeSolverTolerance(); |
| 4055 | + SolverDescr descr_new(descr); // Copy the descriptor |
| 4056 | + |
| 4057 | + p.SetSolverDescriptor(descr_new); |
| 4058 | + |
| 4059 | + // Initialize b such that A 1 = b |
| 4060 | + e.Ones(); |
| 4061 | + A.Apply(e, &b); |
| 4062 | + |
| 4063 | + // Initial zero guess |
| 4064 | + x.Zeros(); |
| 4065 | + |
| 4066 | + // Set solver operator |
| 4067 | + fp.SetOperator(A); |
| 4068 | + // Set solver preconditioner |
| 4069 | + fp.SetPreconditioner(p); |
| 4070 | + |
| 4071 | + // Build solver |
| 4072 | + fp.Build(); |
| 4073 | + |
| 4074 | + // Verbosity output |
| 4075 | + fp.Verbose(1); |
| 4076 | + |
| 4077 | + fp.InitMinIter(1); |
| 4078 | + fp.InitMaxIter(1000); |
| 4079 | + fp.InitTol(1e-8, 1e-8, 1e-8); |
| 4080 | + |
| 4081 | + // Print matrix info |
| 4082 | + A.Info(); |
| 4083 | + |
| 4084 | + // Solve A x = b |
| 4085 | + fp.Solve(b, &x); |
| 4086 | + |
| 4087 | + int niter_preconditioner; |
| 4088 | + const double* history = p.GetConvergenceHistory(&niter_preconditioner); |
| 4089 | + |
| 4090 | + auto res_final = fp.GetCurrentResidual(); |
| 4091 | + //auto res_init = fp.GetInitialResidual(); |
| 4092 | + //auto niter = fp.GetNumIterations(); |
| 4093 | + auto status_solver = fp.GetSolverStatus(); |
| 4094 | + auto ind = fp.GetAmaxResidualIndex(); |
| 4095 | + // Clear solver |
| 4096 | + fp.Clear(); |
| 4097 | + |
| 4098 | + // Compute error L2 norm |
| 4099 | + e.ScaleAdd(-1.0, x); |
| 4100 | + T error = e.Norm(); |
| 4101 | + std::cout << "||e - x||_2 = " << error << std::endl; |
| 4102 | + |
| 4103 | + // Stop rocALUTION platform |
| 4104 | + stop_rocalution(); |
| 4105 | + disable_accelerator_rocalution(false); |
| 4106 | + |
| 4107 | + return true; |
| 4108 | +} |
| 4109 | \ No newline at end of file |
| 4110 | diff --git a/clients/include/testing_local_matrix.hpp b/clients/include/testing_local_matrix.hpp |
| 4111 | index 3145323..82e21e7 100644 |
| 4112 | --- a/clients/include/testing_local_matrix.hpp |
| 4113 | +++ b/clients/include/testing_local_matrix.hpp |
| 4114 | @@ -1,5 +1,5 @@ |
| 4115 | /* ************************************************************************ |
| 4116 | - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. |
| 4117 | + * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights Reserved. |
| 4118 | * |
| 4119 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4120 | * of this software and associated documentation files (the "Software"), to deal |
| 4121 | @@ -41,10 +41,12 @@ void testing_local_matrix_bad_args(void) |
| 4122 | set_device_rocalution(device); |
| 4123 | init_rocalution(); |
| 4124 | |
| 4125 | - LocalMatrix<T> mat1; |
| 4126 | - LocalMatrix<T> mat2; |
| 4127 | - LocalVector<T> vec1; |
| 4128 | - LocalVector<int> int1; |
| 4129 | + LocalMatrix<T> mat1; |
| 4130 | + LocalMatrix<T> mat2; |
| 4131 | + LocalVector<T> vec1; |
| 4132 | + LocalVector<bool> bool1; |
| 4133 | + LocalVector<int> int1; |
| 4134 | + LocalVector<int64_t> int641; |
| 4135 | |
| 4136 | // null pointers |
| 4137 | int* null_int = nullptr; |
| 4138 | @@ -57,6 +59,10 @@ void testing_local_matrix_bad_args(void) |
| 4139 | allocate_host(safe_size, &vint); |
| 4140 | allocate_host(safe_size, &vdata); |
| 4141 | |
| 4142 | + // Valid matrices |
| 4143 | + LocalMatrix<T> mat3; |
| 4144 | + mat3.AllocateCSR("valid", safe_size, safe_size, safe_size); |
| 4145 | + |
| 4146 | // ExtractSubMatrix, ExtractSubMatrices, Extract(Inverse)Diagonal, ExtractL/U |
| 4147 | { |
| 4148 | LocalMatrix<T>* mat_null = nullptr; |
| 4149 | @@ -114,6 +120,23 @@ void testing_local_matrix_bad_args(void) |
| 4150 | ASSERT_DEATH(mat1.QRSolve(vec1, null_vec), ".*Assertion.*out != (NULL|__null)*"); |
| 4151 | } |
| 4152 | |
| 4153 | + // ItLSolve, ItUSolve, ItLLSolve, ItLUSolve |
| 4154 | + { |
| 4155 | + LocalVector<T>* null_vec = nullptr; |
| 4156 | + int max_iter = 1; |
| 4157 | + double tol = 0; |
| 4158 | + ASSERT_DEATH(mat1.ItLSolve(max_iter, tol, true, vec1, null_vec), |
| 4159 | + ".*Assertion.*out != (NULL|__null)*"); |
| 4160 | + ASSERT_DEATH(mat1.ItUSolve(max_iter, tol, true, vec1, null_vec), |
| 4161 | + ".*Assertion.*out != (NULL|__null)*"); |
| 4162 | + ASSERT_DEATH(mat1.ItLLSolve(max_iter, tol, true, vec1, null_vec), |
| 4163 | + ".*Assertion.*out != (NULL|__null)*"); |
| 4164 | + ASSERT_DEATH(mat1.ItLLSolve(max_iter, tol, true, vec1, vec1, null_vec), |
| 4165 | + ".*Assertion.*out != (NULL|__null)*"); |
| 4166 | + ASSERT_DEATH(mat1.ItLUSolve(max_iter, tol, true, vec1, null_vec), |
| 4167 | + ".*Assertion.*out != (NULL|__null)*"); |
| 4168 | + } |
| 4169 | + |
| 4170 | // ICFactorize, Householder |
| 4171 | { |
| 4172 | T val; |
| 4173 | @@ -127,12 +150,12 @@ void testing_local_matrix_bad_args(void) |
| 4174 | ASSERT_DEATH(mat1.UpdateValuesCSR(null_data), ".*Assertion.*val != (NULL|__null)*"); |
| 4175 | ASSERT_DEATH(mat1.CopyFromCSR(null_int, vint, vdata), |
| 4176 | ".*Assertion.*row_offsets != (NULL|__null)*"); |
| 4177 | - ASSERT_DEATH(mat1.CopyFromCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*"); |
| 4178 | - ASSERT_DEATH(mat1.CopyFromCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*"); |
| 4179 | + ASSERT_DEATH(mat3.CopyFromCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*"); |
| 4180 | + ASSERT_DEATH(mat3.CopyFromCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*"); |
| 4181 | ASSERT_DEATH(mat1.CopyToCSR(null_int, vint, vdata), |
| 4182 | ".*Assertion.*row_offsets != (NULL|__null)*"); |
| 4183 | - ASSERT_DEATH(mat1.CopyToCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*"); |
| 4184 | - ASSERT_DEATH(mat1.CopyToCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*"); |
| 4185 | + ASSERT_DEATH(mat3.CopyToCSR(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*"); |
| 4186 | + ASSERT_DEATH(mat3.CopyToCSR(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*"); |
| 4187 | ASSERT_DEATH(mat1.CopyFromCOO(null_int, vint, vdata), ".*Assertion.*row != (NULL|__null)*"); |
| 4188 | ASSERT_DEATH(mat1.CopyFromCOO(vint, null_int, vdata), ".*Assertion.*col != (NULL|__null)*"); |
| 4189 | ASSERT_DEATH(mat1.CopyFromCOO(vint, vint, null_data), ".*Assertion.*val != (NULL|__null)*"); |
| 4190 | @@ -173,24 +196,27 @@ void testing_local_matrix_bad_args(void) |
| 4191 | |
| 4192 | // AMG |
| 4193 | { |
| 4194 | + int val; |
| 4195 | + LocalVector<bool>* bool_null_vec = nullptr; |
| 4196 | + LocalVector<int64_t>* int64_null_vec = nullptr; |
| 4197 | + ASSERT_DEATH(mat1.AMGGreedyAggregate(0.1, bool_null_vec, &int641, &int641), |
| 4198 | + ".*Assertion.*connections != (NULL|__null)*"); |
| 4199 | + ASSERT_DEATH(mat1.AMGGreedyAggregate(0.1, &bool1, int64_null_vec, &int641), |
| 4200 | + ".*Assertion.*aggregates != (NULL|__null)*"); |
| 4201 | + ASSERT_DEATH(mat1.AMGGreedyAggregate(0.1, &bool1, &int641, int64_null_vec), |
| 4202 | + ".*Assertion.*aggregate_root_nodes != (NULL|__null)*"); |
| 4203 | + |
| 4204 | + LocalMatrix<T>* null_mat = nullptr; |
| 4205 | + ASSERT_DEATH(mat1.AMGSmoothedAggregation(0.1, bool1, int641, int641, null_mat), |
| 4206 | + ".*Assertion.*prolong != (NULL|__null)*"); |
| 4207 | + } |
| 4208 | + |
| 4209 | + { |
| 4210 | int val; |
| 4211 | LocalVector<int>* null_vec = nullptr; |
| 4212 | LocalMatrix<T>* null_mat = nullptr; |
| 4213 | - ASSERT_DEATH(mat1.AMGConnect(0.1, null_vec), ".*Assertion.*connections != (NULL|__null)*"); |
| 4214 | - ASSERT_DEATH(mat1.AMGAggregate(int1, null_vec), |
| 4215 | - ".*Assertion.*aggregates != (NULL|__null)*"); |
| 4216 | - ASSERT_DEATH(mat1.AMGSmoothedAggregation(0.1, int1, int1, null_mat, &mat2), |
| 4217 | - ".*Assertion.*prolong != (NULL|__null)*"); |
| 4218 | - ASSERT_DEATH(mat1.AMGSmoothedAggregation(0.1, int1, int1, &mat2, null_mat), |
| 4219 | - ".*Assertion.*restrict != (NULL|__null)*"); |
| 4220 | - ASSERT_DEATH(mat1.AMGAggregation(int1, null_mat, &mat2), |
| 4221 | + ASSERT_DEATH(mat1.AMGUnsmoothedAggregation(int641, int641, null_mat), |
| 4222 | ".*Assertion.*prolong != (NULL|__null)*"); |
| 4223 | - ASSERT_DEATH(mat1.AMGAggregation(int1, &mat2, null_mat), |
| 4224 | - ".*Assertion.*restrict != (NULL|__null)*"); |
| 4225 | - ASSERT_DEATH(mat1.RugeStueben(0.1, null_mat, &mat2), |
| 4226 | - ".*Assertion.*prolong != (NULL|__null)*"); |
| 4227 | - ASSERT_DEATH(mat1.RugeStueben(0.1, &mat2, null_mat), |
| 4228 | - ".*Assertion.*restrict != (NULL|__null)*"); |
| 4229 | ASSERT_DEATH(mat1.InitialPairwiseAggregation(0.1, val, null_vec, val, &null_int, val, 0), |
| 4230 | ".*Assertion.*G != (NULL|__null)*"); |
| 4231 | ASSERT_DEATH(mat1.InitialPairwiseAggregation(0.1, val, &int1, val, &vint, val, 0), |
| 4232 | @@ -208,12 +234,11 @@ void testing_local_matrix_bad_args(void) |
| 4233 | ".*Assertion.*G != (NULL|__null)*"); |
| 4234 | ASSERT_DEATH(mat1.FurtherPairwiseAggregation(mat2, 0.1, val, &int1, val, &null_int, val, 0), |
| 4235 | ".*Assertion.*rG != (NULL|__null)*"); |
| 4236 | - ASSERT_DEATH(mat1.CoarsenOperator( |
| 4237 | - null_mat, nullptr, safe_size, safe_size, int1, safe_size, vint, safe_size), |
| 4238 | - ".*Assertion.*Ac != (NULL|__null)*"); |
| 4239 | ASSERT_DEATH( |
| 4240 | - mat1.CoarsenOperator( |
| 4241 | - &mat2, nullptr, safe_size, safe_size, int1, safe_size, null_int, safe_size), |
| 4242 | + mat1.CoarsenOperator(null_mat, safe_size, safe_size, int1, safe_size, vint, safe_size), |
| 4243 | + ".*Assertion.*Ac != (NULL|__null)*"); |
| 4244 | + ASSERT_DEATH( |
| 4245 | + mat1.CoarsenOperator(&mat2, safe_size, safe_size, int1, safe_size, null_int, safe_size), |
| 4246 | ".*Assertion.*rG != (NULL|__null)*"); |
| 4247 | } |
| 4248 | |
| 4249 | @@ -478,16 +503,2319 @@ bool testing_local_matrix_allocations(Arguments argus) |
| 4250 | LocalMatrix<T> D; |
| 4251 | D.AllocateDIA("D", nnz, m, n, ndiag); |
| 4252 | |
| 4253 | + LocalMatrix<T> E; |
| 4254 | + E.AllocateMCSR("E", nnz, m, n); |
| 4255 | + |
| 4256 | LocalMatrix<T> F; |
| 4257 | F.AllocateELL("F", ell_nnz, m, n, ell_max_row); |
| 4258 | |
| 4259 | LocalMatrix<T> G; |
| 4260 | G.AllocateHYB("G", ell_nnz, coo_nnz, ell_max_row, m, n); |
| 4261 | |
| 4262 | + LocalMatrix<T> H; |
| 4263 | + H.AllocateDENSE("H", m, n); |
| 4264 | + |
| 4265 | + // Stop rocALUTION platform |
| 4266 | + stop_rocalution(); |
| 4267 | + |
| 4268 | + return true; |
| 4269 | +} |
| 4270 | + |
| 4271 | +template <typename T> |
| 4272 | +bool testing_local_matrix_zero(Arguments argus) |
| 4273 | +{ |
| 4274 | + int size = argus.size; |
| 4275 | + int blockdim = argus.blockdim; |
| 4276 | + |
| 4277 | + int m = size; |
| 4278 | + int n = size; |
| 4279 | + int mb = (m + blockdim - 1) / blockdim; |
| 4280 | + int nb = (n + blockdim - 1) / blockdim; |
| 4281 | + |
| 4282 | + int nnz = 0.05 * m * n; |
| 4283 | + if(nnz == 0) |
| 4284 | + { |
| 4285 | + nnz = m * n; |
| 4286 | + } |
| 4287 | + |
| 4288 | + // Initialize rocALUTION |
| 4289 | + set_device_rocalution(device); |
| 4290 | + init_rocalution(); |
| 4291 | + |
| 4292 | + // Testing Zeros |
| 4293 | + LocalMatrix<T> A; |
| 4294 | + A.AllocateCSR("A", nnz, m, n); |
| 4295 | + |
| 4296 | + A.Zeros(); |
| 4297 | + |
| 4298 | + A.Info(); |
| 4299 | + |
| 4300 | + // Stop rocALUTION platform |
| 4301 | + stop_rocalution(); |
| 4302 | + |
| 4303 | + return true; |
| 4304 | +} |
| 4305 | + |
| 4306 | +template <typename T> |
| 4307 | +bool testing_local_matrix_set_data_ptr(Arguments argus) |
| 4308 | +{ |
| 4309 | + int size = argus.size; |
| 4310 | + int blockdim = argus.blockdim; |
| 4311 | + |
| 4312 | + int m = size; |
| 4313 | + int n = size; |
| 4314 | + int mb = (m + blockdim - 1) / blockdim; |
| 4315 | + int nb = (n + blockdim - 1) / blockdim; |
| 4316 | + |
| 4317 | + int nnz = 0.05 * m * n; |
| 4318 | + if(nnz == 0) |
| 4319 | + { |
| 4320 | + nnz = m * n; |
| 4321 | + } |
| 4322 | + |
| 4323 | + int nnzb = 0.01 * mb * nb; |
| 4324 | + if(nnzb == 0) |
| 4325 | + { |
| 4326 | + nnzb = mb * nb; |
| 4327 | + } |
| 4328 | + |
| 4329 | + // Initialize rocALUTION |
| 4330 | + set_device_rocalution(device); |
| 4331 | + init_rocalution(); |
| 4332 | + |
| 4333 | + int ndiag = 5; |
| 4334 | + int ell_max_row = 6; |
| 4335 | + int ell_nnz = ell_max_row * m; |
| 4336 | + int coo_nnz = (nnz - ell_nnz) < 0 ? 0 : nnz - ell_nnz; |
| 4337 | + |
| 4338 | + // Testing allocating matrix types |
| 4339 | + { |
| 4340 | + LocalMatrix<T> A; |
| 4341 | + int* row_offset = NULL; |
| 4342 | + int* col = NULL; |
| 4343 | + T* val = NULL; |
| 4344 | + |
| 4345 | + allocate_host(m + 1, &row_offset); |
| 4346 | + allocate_host(nnz, &col); |
| 4347 | + allocate_host(nnz, &val); |
| 4348 | + |
| 4349 | + set_to_zero_host(m + 1, row_offset); |
| 4350 | + set_to_zero_host(nnz, col); |
| 4351 | + set_to_zero_host(nnz, val); |
| 4352 | + |
| 4353 | + A.SetDataPtrCSR(&row_offset, &col, &val, "A", nnz, m, n); |
| 4354 | + A.LeaveDataPtrCSR(&row_offset, &col, &val); |
| 4355 | + |
| 4356 | + free_host(&row_offset); |
| 4357 | + free_host(&col); |
| 4358 | + free_host(&val); |
| 4359 | + } |
| 4360 | + |
| 4361 | + { |
| 4362 | + LocalMatrix<T> B; |
| 4363 | + int* row_offset = NULL; |
| 4364 | + int* col = NULL; |
| 4365 | + T* val = NULL; |
| 4366 | + |
| 4367 | + allocate_host(mb + 1, &row_offset); |
| 4368 | + allocate_host(nnzb, &col); |
| 4369 | + allocate_host(nnzb, &val); |
| 4370 | + |
| 4371 | + set_to_zero_host(mb + 1, row_offset); |
| 4372 | + set_to_zero_host(nnzb, col); |
| 4373 | + set_to_zero_host(nnzb, val); |
| 4374 | + |
| 4375 | + B.SetDataPtrBCSR(&row_offset, &col, &val, "C", nnzb, mb, nb, blockdim); |
| 4376 | + B.LeaveDataPtrBCSR(&row_offset, &col, &val, blockdim); |
| 4377 | + |
| 4378 | + free_host(&row_offset); |
| 4379 | + free_host(&col); |
| 4380 | + free_host(&val); |
| 4381 | + } |
| 4382 | + |
| 4383 | + { |
| 4384 | + LocalMatrix<T> C; |
| 4385 | + int* row = NULL; |
| 4386 | + int* col = NULL; |
| 4387 | + T* val = NULL; |
| 4388 | + |
| 4389 | + allocate_host(nnz, &row); |
| 4390 | + allocate_host(nnz, &col); |
| 4391 | + allocate_host(nnz, &val); |
| 4392 | + |
| 4393 | + set_to_zero_host(nnz, row); |
| 4394 | + set_to_zero_host(nnz, col); |
| 4395 | + set_to_zero_host(nnz, val); |
| 4396 | + |
| 4397 | + C.SetDataPtrCOO(&row, &col, &val, "C", nnz, m, n); |
| 4398 | + C.LeaveDataPtrCOO(&row, &col, &val); |
| 4399 | + |
| 4400 | + free_host(&row); |
| 4401 | + free_host(&col); |
| 4402 | + free_host(&val); |
| 4403 | + } |
| 4404 | + |
| 4405 | + { |
| 4406 | + LocalMatrix<T> E; |
| 4407 | + int* row_offset = NULL; |
| 4408 | + int* col = NULL; |
| 4409 | + T* val = NULL; |
| 4410 | + |
| 4411 | + allocate_host(m + 1, &row_offset); |
| 4412 | + allocate_host(nnz, &col); |
| 4413 | + allocate_host(nnz, &val); |
| 4414 | + |
| 4415 | + set_to_zero_host(m + 1, row_offset); |
| 4416 | + set_to_zero_host(nnz, col); |
| 4417 | + set_to_zero_host(nnz, val); |
| 4418 | + |
| 4419 | + E.SetDataPtrMCSR(&row_offset, &col, &val, "C", nnz, m, n); |
| 4420 | + E.LeaveDataPtrMCSR(&row_offset, &col, &val); |
| 4421 | + |
| 4422 | + free_host(&row_offset); |
| 4423 | + free_host(&col); |
| 4424 | + free_host(&val); |
| 4425 | + } |
| 4426 | + |
| 4427 | + { |
| 4428 | + LocalMatrix<T> F; |
| 4429 | + int* col = NULL; |
| 4430 | + T* val = NULL; |
| 4431 | + |
| 4432 | + allocate_host(ell_nnz, &col); |
| 4433 | + allocate_host(ell_nnz, &val); |
| 4434 | + |
| 4435 | + set_to_zero_host(ell_nnz, col); |
| 4436 | + set_to_zero_host(ell_nnz, val); |
| 4437 | + |
| 4438 | + F.SetDataPtrELL(&col, &val, "C", ell_nnz, m, n, ell_max_row); |
| 4439 | + F.LeaveDataPtrELL(&col, &val, ell_max_row); |
| 4440 | + |
| 4441 | + free_host(&col); |
| 4442 | + free_host(&val); |
| 4443 | + } |
| 4444 | + |
| 4445 | + { |
| 4446 | + LocalMatrix<T> H; |
| 4447 | + T* val = NULL; |
| 4448 | + |
| 4449 | + allocate_host(m * n, &val); |
| 4450 | + |
| 4451 | + set_to_zero_host(m * n, val); |
| 4452 | + |
| 4453 | + H.SetDataPtrDENSE(&val, "C", m, n); |
| 4454 | + H.LeaveDataPtrDENSE(&val); |
| 4455 | + |
| 4456 | + free_host(&val); |
| 4457 | + } |
| 4458 | + |
| 4459 | // Stop rocALUTION platform |
| 4460 | stop_rocalution(); |
| 4461 | |
| 4462 | return true; |
| 4463 | } |
| 4464 | |
| 4465 | +template <typename T> |
| 4466 | +LocalMatrix<T> getTestMatrix() |
| 4467 | +{ |
| 4468 | + // Create a simple 2x2 CSR matrix |
| 4469 | + LocalMatrix<T> matrix; |
| 4470 | + matrix.AllocateCSR("TestMatrix", 4, 2, 2); |
| 4471 | + |
| 4472 | + int row_offsets[3] = {0, 2, 4}; |
| 4473 | + int col_indices[4] = {0, 1, 0, 1}; |
| 4474 | + T values[4] = {1.0, 2.0, 3.0, 4.0}; |
| 4475 | + matrix.CopyFromCSR(row_offsets, col_indices, values); |
| 4476 | + |
| 4477 | + return matrix; |
| 4478 | +} |
| 4479 | + |
| 4480 | +template <typename T> |
| 4481 | +void getTestMatrix(Arguments argus, LocalMatrix<T>& matrix, bool& is_invertible) |
| 4482 | +{ |
| 4483 | + int size = argus.size; |
| 4484 | + int blockdim = argus.blockdim; |
| 4485 | + std::string matrix_type = argus.matrix_type; |
| 4486 | + |
| 4487 | + // Generate A |
| 4488 | + int* csr_ptr = NULL; |
| 4489 | + int* csr_col = NULL; |
| 4490 | + T* csr_val = NULL; |
| 4491 | + |
| 4492 | + int nrow = 0; |
| 4493 | + int ncol = 0; |
| 4494 | + if(matrix_type == "Laplacian2D") |
| 4495 | + { |
| 4496 | + nrow = gen_2d_laplacian(size, &csr_ptr, &csr_col, &csr_val); |
| 4497 | + ncol = nrow; |
| 4498 | + |
| 4499 | + is_invertible = true; |
| 4500 | + } |
| 4501 | + else if(matrix_type == "PermutedIdentity") |
| 4502 | + { |
| 4503 | + nrow = gen_permuted_identity(size, &csr_ptr, &csr_col, &csr_val); |
| 4504 | + ncol = nrow; |
| 4505 | + |
| 4506 | + is_invertible = true; |
| 4507 | + } |
| 4508 | + else if(matrix_type == "Random") |
| 4509 | + { |
| 4510 | + nrow = gen_random(100 * size, 50 * size, 6, &csr_ptr, &csr_col, &csr_val); |
| 4511 | + ncol = 50 * size; |
| 4512 | + |
| 4513 | + is_invertible = false; |
| 4514 | + } |
| 4515 | + else |
| 4516 | + { |
| 4517 | + is_invertible = true; |
| 4518 | + |
| 4519 | + matrix = getTestMatrix<T>(); |
| 4520 | + return; |
| 4521 | + } |
| 4522 | + |
| 4523 | + int nnz = csr_ptr[nrow]; |
| 4524 | + |
| 4525 | + matrix.SetDataPtrCSR(&csr_ptr, &csr_col, &csr_val, "TestMatrix", nnz, nrow, ncol); |
| 4526 | +} |
| 4527 | + |
| 4528 | +template <typename T> |
| 4529 | +void getTestMatrix(Arguments argus, LocalMatrix<T>& matrix) |
| 4530 | +{ |
| 4531 | + bool is_invertible; |
| 4532 | + getTestMatrix<T>(argus, matrix, is_invertible); |
| 4533 | +} |
| 4534 | + |
| 4535 | +template <typename T> |
| 4536 | +void getMatrixVal(const LocalMatrix<T>& matrix, T* values) |
| 4537 | +{ |
| 4538 | + // Copy the values from the matrix to the provided array |
| 4539 | + int64_t m = matrix.GetM(); |
| 4540 | + int64_t nnz = matrix.GetNnz(); |
| 4541 | + |
| 4542 | + int* row_offsets = new int[m + 1]; |
| 4543 | + int* col_indices = new int[nnz]; |
| 4544 | + T* matrix_values = new T[nnz]; |
| 4545 | + |
| 4546 | + matrix.CopyToCSR(row_offsets, col_indices, matrix_values); |
| 4547 | + for(int i = 0; i < nnz; ++i) |
| 4548 | + { |
| 4549 | + values[i] = matrix_values[i]; |
| 4550 | + } |
| 4551 | + |
| 4552 | + delete[] row_offsets; |
| 4553 | + delete[] col_indices; |
| 4554 | + delete[] matrix_values; |
| 4555 | +} |
| 4556 | + |
| 4557 | +template <typename T> |
| 4558 | +void getMatrixDiagVal(const LocalMatrix<T>& matrix, T* values) |
| 4559 | +{ |
| 4560 | + // Copy the values from the matrix to the provided array |
| 4561 | + int64_t m = matrix.GetM(); |
| 4562 | + int64_t nnz = matrix.GetNnz(); |
| 4563 | + |
| 4564 | + int* row_offsets = new int[m + 1]; |
| 4565 | + int* col_indices = new int[nnz]; |
| 4566 | + T* matrix_values = new T[nnz]; |
| 4567 | + |
| 4568 | + matrix.CopyToCSR(row_offsets, col_indices, matrix_values); |
| 4569 | + for(int row = 0; row < m; ++row) |
| 4570 | + { |
| 4571 | + int start = row_offsets[row]; |
| 4572 | + int end = row_offsets[row + 1]; |
| 4573 | + for(int i = start; i < end; ++i) |
| 4574 | + { |
| 4575 | + if(col_indices[i] == row) // Diagonal element |
| 4576 | + { |
| 4577 | + values[row] = matrix_values[i]; |
| 4578 | + break; // Only one diagonal element per row |
| 4579 | + } |
| 4580 | + } |
| 4581 | + } |
| 4582 | + |
| 4583 | + delete[] row_offsets; |
| 4584 | + delete[] col_indices; |
| 4585 | + delete[] matrix_values; |
| 4586 | +} |
| 4587 | + |
| 4588 | +void checkPermutation(const LocalVector<int>& permutation) |
| 4589 | +{ |
| 4590 | + // Check that permutation is a valid permutation of 0..N-1 |
| 4591 | + std::vector<int> seen(permutation.GetSize(), 0); |
| 4592 | + for(int i = 0; i < permutation.GetSize(); ++i) |
| 4593 | + { |
| 4594 | + int idx = permutation[i]; |
| 4595 | + EXPECT_GE(idx, 0); |
| 4596 | + EXPECT_LT(idx, permutation.GetSize()); |
| 4597 | + seen[idx]++; |
| 4598 | + } |
| 4599 | + for(int i = 0; i < seen.size(); ++i) |
| 4600 | + { |
| 4601 | + EXPECT_EQ(seen[i], 1); // Each index appears exactly once |
| 4602 | + } |
| 4603 | +} |
| 4604 | + |
| 4605 | +template <typename T> |
| 4606 | +T getTolerance() |
| 4607 | +{ |
| 4608 | + // Set tolerance based on the type |
| 4609 | + if(std::is_same<T, float>::value) |
| 4610 | + { |
| 4611 | + return 1e-5f; // Tolerance for float |
| 4612 | + } |
| 4613 | + else |
| 4614 | + { |
| 4615 | + return 1e-10; // Default tolerance for other types |
| 4616 | + } |
| 4617 | +} |
| 4618 | + |
| 4619 | +// Helper to extract dense matrix from LocalMatrix<T> |
| 4620 | +template <typename T> |
| 4621 | +std::vector<std::vector<T>> extract_dense_matrix(const LocalMatrix<T>& matrix) |
| 4622 | +{ |
| 4623 | + int m = matrix.GetM(); |
| 4624 | + int n = matrix.GetN(); |
| 4625 | + int nnz = matrix.GetNnz(); |
| 4626 | + std::vector<std::vector<T>> dense(m, std::vector<T>(n, static_cast<T>(0))); |
| 4627 | + std::vector<int> row_offsets(m + 1); |
| 4628 | + std::vector<int> col_indices(nnz); |
| 4629 | + std::vector<T> values(nnz); |
| 4630 | + |
| 4631 | + matrix.CopyToCSR(row_offsets.data(), col_indices.data(), values.data()); |
| 4632 | + for(int row = 0; row < m; ++row) |
| 4633 | + { |
| 4634 | + for(int idx = row_offsets[row]; idx < row_offsets[row + 1]; ++idx) |
| 4635 | + { |
| 4636 | + int col = col_indices[idx]; |
| 4637 | + dense[row][col] = values[idx]; |
| 4638 | + } |
| 4639 | + } |
| 4640 | + return dense; |
| 4641 | +} |
| 4642 | + |
| 4643 | +template <typename T> |
| 4644 | +void testing_local_allocate() |
| 4645 | +{ |
| 4646 | + // Test AllocateCSR |
| 4647 | + { |
| 4648 | + LocalMatrix<T> mat; |
| 4649 | + EXPECT_NO_THROW(mat.AllocateCSR("AllocatedMatrix", 4, 2, 2)); |
| 4650 | + EXPECT_EQ(mat.GetNnz(), 4); |
| 4651 | + EXPECT_EQ(mat.GetM(), 2); |
| 4652 | + EXPECT_EQ(mat.GetN(), 2); |
| 4653 | + } |
| 4654 | + |
| 4655 | + // Test AllocateCOO |
| 4656 | + { |
| 4657 | + LocalMatrix<T> mat; |
| 4658 | + mat.AllocateCOO("AllocatedMatrix", 4, 2, 2); |
| 4659 | + EXPECT_EQ(mat.GetNnz(), 4); |
| 4660 | + EXPECT_EQ(mat.GetM(), 2); |
| 4661 | + EXPECT_EQ(mat.GetN(), 2); |
| 4662 | + } |
| 4663 | + |
| 4664 | + // Test AllocateBCSR |
| 4665 | + { |
| 4666 | + LocalMatrix<T> mat; |
| 4667 | + int nnzb = 4, mb = 2, nb = 2, blockdim = 2; |
| 4668 | + EXPECT_NO_THROW(mat.AllocateBCSR("BCSR", nnzb, mb, nb, blockdim)); |
| 4669 | + EXPECT_EQ(mat.GetNnz(), nnzb * blockdim * blockdim); |
| 4670 | + EXPECT_EQ(mat.GetM(), mb * blockdim); |
| 4671 | + EXPECT_EQ(mat.GetN(), nb * blockdim); |
| 4672 | + } |
| 4673 | + |
| 4674 | + // Test AllocateDIA |
| 4675 | + { |
| 4676 | + LocalMatrix<T> mat; |
| 4677 | + int nnz = 6, m = 3, n = 3, ndiag = 2; |
| 4678 | + EXPECT_NO_THROW(mat.AllocateDIA("DIA", nnz, m, n, ndiag)); |
| 4679 | + EXPECT_EQ(mat.GetNnz(), nnz); |
| 4680 | + EXPECT_EQ(mat.GetM(), m); |
| 4681 | + EXPECT_EQ(mat.GetN(), n); |
| 4682 | + } |
| 4683 | + |
| 4684 | + // Test AllocateMCSR |
| 4685 | + { |
| 4686 | + LocalMatrix<T> mat; |
| 4687 | + int nnz = 5, m = 3, n = 3; |
| 4688 | + EXPECT_NO_THROW(mat.AllocateMCSR("MCSR", nnz, m, n)); |
| 4689 | + EXPECT_EQ(mat.GetNnz(), nnz); |
| 4690 | + EXPECT_EQ(mat.GetM(), m); |
| 4691 | + EXPECT_EQ(mat.GetN(), n); |
| 4692 | + } |
| 4693 | + |
| 4694 | + // Test AllocateELL |
| 4695 | + { |
| 4696 | + LocalMatrix<T> mat; |
| 4697 | + int ell_nnz = 6, m = 3, n = 3, ell_max_row = 2; |
| 4698 | + EXPECT_NO_THROW(mat.AllocateELL("ELL", ell_nnz, m, n, ell_max_row)); |
| 4699 | + EXPECT_EQ(mat.GetNnz(), ell_nnz); |
| 4700 | + EXPECT_EQ(mat.GetM(), m); |
| 4701 | + EXPECT_EQ(mat.GetN(), n); |
| 4702 | + } |
| 4703 | + |
| 4704 | + // Test AllocateHYB |
| 4705 | + { |
| 4706 | + LocalMatrix<T> mat; |
| 4707 | + int m = 3, n = 3; |
| 4708 | + int ell_max_row = 2; |
| 4709 | + int ell_nnz = ell_max_row * m; // 2 * 3 = 6 |
| 4710 | + int coo_nnz = 2; |
| 4711 | + |
| 4712 | + EXPECT_NO_THROW(mat.AllocateHYB("HYB", ell_nnz, coo_nnz, ell_max_row, m, n)); |
| 4713 | + EXPECT_EQ(mat.GetNnz(), ell_nnz + coo_nnz); // 8 |
| 4714 | + EXPECT_EQ(mat.GetM(), m); |
| 4715 | + EXPECT_EQ(mat.GetN(), n); |
| 4716 | + } |
| 4717 | + |
| 4718 | + // Test AllocateDENSE |
| 4719 | + { |
| 4720 | + LocalMatrix<T> mat; |
| 4721 | + int m = 3, n = 3; |
| 4722 | + EXPECT_NO_THROW(mat.AllocateDENSE("DENSE", m, n)); |
| 4723 | + EXPECT_EQ(mat.GetNnz(), m * n); |
| 4724 | + EXPECT_EQ(mat.GetM(), m); |
| 4725 | + EXPECT_EQ(mat.GetN(), n); |
| 4726 | + } |
| 4727 | +} |
| 4728 | + |
| 4729 | +template <typename T> |
| 4730 | +void testing_check_with_empty_matrix() |
| 4731 | +{ |
| 4732 | + LocalMatrix<T> empty_matrix; |
| 4733 | + // Check should pass without any issues |
| 4734 | + EXPECT_NO_THROW(empty_matrix.Check()); |
| 4735 | + // Info should not throw an error |
| 4736 | + EXPECT_NO_THROW(empty_matrix.Info()); |
| 4737 | +} |
| 4738 | + |
| 4739 | +template <typename T> |
| 4740 | +void testing_local_copy_from_async() |
| 4741 | +{ |
| 4742 | + auto matrix = getTestMatrix<T>(); |
| 4743 | + LocalMatrix<T> copy_matrix; |
| 4744 | + // CopyFromAsync should copy the matrix asynchronously (if supported) |
| 4745 | + EXPECT_NO_THROW(copy_matrix.CopyFromAsync(matrix)); |
| 4746 | + EXPECT_NO_THROW(matrix.Sync()); |
| 4747 | + EXPECT_EQ(copy_matrix.GetM(), matrix.GetM()); |
| 4748 | + EXPECT_EQ(copy_matrix.GetN(), matrix.GetN()); |
| 4749 | + EXPECT_EQ(copy_matrix.GetNnz(), matrix.GetNnz()); |
| 4750 | + |
| 4751 | + // Compare dense representations |
| 4752 | + auto dense_orig = extract_dense_matrix(matrix); |
| 4753 | + auto dense_copy = extract_dense_matrix(copy_matrix); |
| 4754 | + |
| 4755 | + EXPECT_EQ(dense_orig.size(), dense_copy.size()); |
| 4756 | + for(size_t i = 0; i < dense_orig.size(); ++i) |
| 4757 | + { |
| 4758 | + EXPECT_EQ(dense_orig[i].size(), dense_copy[i].size()); |
| 4759 | + for(size_t j = 0; j < dense_orig[i].size(); ++j) |
| 4760 | + { |
| 4761 | + EXPECT_EQ(dense_orig[i][j], dense_copy[i][j]); |
| 4762 | + } |
| 4763 | + } |
| 4764 | +} |
| 4765 | + |
| 4766 | +template <typename T> |
| 4767 | +void testing_local_update_values_csr() |
| 4768 | +{ |
| 4769 | + auto matrix = getTestMatrix<T>(); |
| 4770 | + |
| 4771 | + int64_t nnz = matrix.GetNnz(); |
| 4772 | + |
| 4773 | + // Use std::vector instead of raw arrays |
| 4774 | + std::vector<T> new_values(nnz); |
| 4775 | + for(int64_t i = 0; i < nnz; ++i) |
| 4776 | + { |
| 4777 | + new_values[i] = static_cast<T>(i + 10); // Fill with some values |
| 4778 | + } |
| 4779 | + |
| 4780 | + // UpdateValuesCSR should update the values in the matrix |
| 4781 | + EXPECT_NO_THROW(matrix.UpdateValuesCSR(new_values.data())); |
| 4782 | + |
| 4783 | + std::vector<T> check_values(nnz); |
| 4784 | + getMatrixVal(matrix, check_values.data()); |
| 4785 | + for(int64_t i = 0; i < nnz; ++i) |
| 4786 | + { |
| 4787 | + EXPECT_EQ(check_values[i], new_values[i]); |
| 4788 | + } |
| 4789 | +} |
| 4790 | + |
| 4791 | +template <typename T> |
| 4792 | +void testing_local_move_to_accelerator() |
| 4793 | +{ |
| 4794 | + auto matrix = getTestMatrix<T>(); |
| 4795 | + |
| 4796 | + for(int i = 0; i < 10; i++) |
| 4797 | + { |
| 4798 | + EXPECT_NO_THROW(matrix.MoveToAccelerator()); |
| 4799 | + } |
| 4800 | + for(int i = 0; i < 10; i++) |
| 4801 | + { |
| 4802 | + EXPECT_NO_THROW(matrix.MoveToHost()); |
| 4803 | + } |
| 4804 | + for(int i = 0; i < 10; i++) |
| 4805 | + { |
| 4806 | + EXPECT_NO_THROW(matrix.MoveToHost()); |
| 4807 | + EXPECT_NO_THROW(matrix.MoveToAccelerator()); |
| 4808 | + } |
| 4809 | + EXPECT_EQ(matrix.Check(), true); |
| 4810 | +} |
| 4811 | + |
| 4812 | +template <typename T> |
| 4813 | +void testing_local_move_to_accelerator_async() |
| 4814 | +{ |
| 4815 | + auto matrix = getTestMatrix<T>(); |
| 4816 | + // MoveToAcceleratorAsync should move the matrix asynchronously |
| 4817 | + for(int i = 0; i < 10; i++) |
| 4818 | + { |
| 4819 | + EXPECT_NO_THROW(matrix.MoveToAcceleratorAsync()); |
| 4820 | + } |
| 4821 | + for(int i = 0; i < 10; i++) |
| 4822 | + { |
| 4823 | + EXPECT_NO_THROW(matrix.MoveToHostAsync()); |
| 4824 | + } |
| 4825 | + for(int i = 0; i < 10; i++) |
| 4826 | + { |
| 4827 | + EXPECT_NO_THROW(matrix.MoveToHostAsync()); |
| 4828 | + EXPECT_NO_THROW(matrix.MoveToAcceleratorAsync()); |
| 4829 | + } |
| 4830 | + EXPECT_NO_THROW(matrix.Sync()); |
| 4831 | + EXPECT_EQ(matrix.Check(), true); |
| 4832 | +} |
| 4833 | + |
| 4834 | +template <typename T> |
| 4835 | +void testing_local_move_to_host_async() |
| 4836 | +{ |
| 4837 | + auto matrix = getTestMatrix<T>(); |
| 4838 | + // MoveToHostAsync should move the matrix asynchronously to host |
| 4839 | + EXPECT_NO_THROW(matrix.MoveToHostAsync()); |
| 4840 | + EXPECT_NO_THROW(matrix.Sync()); |
| 4841 | + EXPECT_EQ(matrix.GetM(), 2); |
| 4842 | + EXPECT_EQ(matrix.GetN(), 2); |
| 4843 | +} |
| 4844 | + |
| 4845 | +template <typename T> |
| 4846 | +void testing_local_clear(Arguments argus) |
| 4847 | +{ |
| 4848 | + LocalMatrix<T> matrix; |
| 4849 | + getTestMatrix<T>(argus, matrix); |
| 4850 | + |
| 4851 | + // Clear should remove all data from the matrix |
| 4852 | + // This test checks if the Clear operation is valid |
| 4853 | + // by checking if the number of non-zero entries (nnz) is zero |
| 4854 | + // and the dimensions (m, n) are also zero. |
| 4855 | + |
| 4856 | + matrix.Clear(); |
| 4857 | + EXPECT_EQ(matrix.GetNnz(), 0); |
| 4858 | + EXPECT_EQ(matrix.GetM(), 0); |
| 4859 | + EXPECT_EQ(matrix.GetN(), 0); |
| 4860 | +} |
| 4861 | + |
| 4862 | +template <typename T> |
| 4863 | +void testing_local_zeros(Arguments argus) |
| 4864 | +{ |
| 4865 | + LocalMatrix<T> matrix; |
| 4866 | + getTestMatrix<T>(argus, matrix); |
| 4867 | + |
| 4868 | + // Zeros should set all values in the matrix to zero |
| 4869 | + // This test checks if the Zeros operation is valid |
| 4870 | + // by checking if all values in the matrix are zero after the operation. |
| 4871 | + |
| 4872 | + matrix.Zeros(); |
| 4873 | + matrix.Check(); |
| 4874 | + matrix.Info(); |
| 4875 | + |
| 4876 | + int64_t nnz = matrix.GetNnz(); |
| 4877 | + T* check_values = new T[nnz]; |
| 4878 | + getMatrixVal(matrix, check_values); |
| 4879 | + for(int64_t i = 0; i < nnz; ++i) |
| 4880 | + { |
| 4881 | + EXPECT_EQ(check_values[i], static_cast<T>(0)); |
| 4882 | + } |
| 4883 | + delete[] check_values; |
| 4884 | +} |
| 4885 | + |
| 4886 | +template <typename T> |
| 4887 | +void testing_local_copy(Arguments argus) |
| 4888 | +{ |
| 4889 | + LocalMatrix<T> matrix; |
| 4890 | + getTestMatrix<T>(argus, matrix); |
| 4891 | + |
| 4892 | + // CopyFrom should create a copy of the matrix |
| 4893 | + // This test checks if the copy operation is valid |
| 4894 | + // by comparing the original matrix with the copied matrix. |
| 4895 | + |
| 4896 | + LocalMatrix<T> copy_matrix; |
| 4897 | + copy_matrix.CopyFrom(matrix); |
| 4898 | + EXPECT_EQ(copy_matrix.GetNnz(), matrix.GetNnz()); |
| 4899 | + EXPECT_EQ(copy_matrix.GetM(), matrix.GetM()); |
| 4900 | + EXPECT_EQ(copy_matrix.GetN(), matrix.GetN()); |
| 4901 | + |
| 4902 | + // Compare dense representations |
| 4903 | + auto dense_orig = extract_dense_matrix(matrix); |
| 4904 | + auto dense_copy = extract_dense_matrix(copy_matrix); |
| 4905 | + |
| 4906 | + EXPECT_EQ(dense_orig.size(), dense_copy.size()); |
| 4907 | + for(size_t i = 0; i < dense_orig.size(); ++i) |
| 4908 | + { |
| 4909 | + EXPECT_EQ(dense_orig[i].size(), dense_copy[i].size()); |
| 4910 | + for(size_t j = 0; j < dense_orig[i].size(); ++j) |
| 4911 | + { |
| 4912 | + EXPECT_EQ(dense_orig[i][j], dense_copy[i][j]); |
| 4913 | + } |
| 4914 | + } |
| 4915 | +} |
| 4916 | + |
| 4917 | +template <typename T> |
| 4918 | +void testing_local_scale(Arguments argus) |
| 4919 | +{ |
| 4920 | + LocalMatrix<T> matrix; |
| 4921 | + getTestMatrix<T>(argus, matrix); |
| 4922 | + |
| 4923 | + // Scale should multiply all values in the matrix by a scalar |
| 4924 | + // This test checks if the scaling operation is valid |
| 4925 | + // by comparing the scaled values with the expected values. |
| 4926 | + // The expected values are obtained by multiplying the original values |
| 4927 | + // by the scaling factor. |
| 4928 | + |
| 4929 | + // Save original dense matrix |
| 4930 | + auto orig_dense = extract_dense_matrix(matrix); |
| 4931 | + |
| 4932 | + // Scale the matrix by 2.0 |
| 4933 | + matrix.Scale(2.0); |
| 4934 | + |
| 4935 | + // Extract new dense matrix |
| 4936 | + auto new_dense = extract_dense_matrix(matrix); |
| 4937 | + |
| 4938 | + // Compare each value |
| 4939 | + int m = matrix.GetM(); |
| 4940 | + int n = matrix.GetN(); |
| 4941 | + for(int i = 0; i < m; ++i) |
| 4942 | + for(int j = 0; j < n; ++j) |
| 4943 | + EXPECT_EQ(new_dense[i][j], orig_dense[i][j] * 2.0); |
| 4944 | +} |
| 4945 | + |
| 4946 | +template <typename T> |
| 4947 | +void testing_local_extract_diagonal(Arguments argus) |
| 4948 | +{ |
| 4949 | + if(argus.matrix_type != "Laplacian2D") |
| 4950 | + { |
| 4951 | + return; |
| 4952 | + } |
| 4953 | + |
| 4954 | + LocalMatrix<T> matrix; |
| 4955 | + getTestMatrix<T>(argus, matrix); |
| 4956 | + |
| 4957 | + // ExtractDiagonal should extract the diagonal of the matrix |
| 4958 | + // and store it in a LocalVector |
| 4959 | + // This test checks if the diagonal extraction is correct |
| 4960 | + // by comparing the extracted diagonal with the expected values. |
| 4961 | + // The expected values are obtained by iterating through the matrix |
| 4962 | + // and checking the diagonal elements. |
| 4963 | + |
| 4964 | + LocalVector<T> diag; |
| 4965 | + matrix.ExtractDiagonal(&diag); |
| 4966 | + |
| 4967 | + int64_t m = matrix.GetM(); |
| 4968 | + T* check_values = new T[m]; |
| 4969 | + getMatrixDiagVal(matrix, check_values); |
| 4970 | + |
| 4971 | + EXPECT_EQ(diag.GetSize(), m); |
| 4972 | + for(int i = 0; i < m; ++i) |
| 4973 | + { |
| 4974 | + EXPECT_EQ(diag[i], check_values[i]); |
| 4975 | + } |
| 4976 | + delete[] check_values; |
| 4977 | +} |
| 4978 | + |
| 4979 | +template <typename T> |
| 4980 | +void testing_local_extract_inverse_diagonal(Arguments argus) |
| 4981 | +{ |
| 4982 | + if(argus.matrix_type != "Laplacian2D") |
| 4983 | + { |
| 4984 | + return; |
| 4985 | + } |
| 4986 | + |
| 4987 | + LocalMatrix<T> matrix; |
| 4988 | + getTestMatrix<T>(argus, matrix); |
| 4989 | + |
| 4990 | + // ExtractInverseDiagonal should extract the inverse diagonal of the matrix |
| 4991 | + // and store it in a LocalVector |
| 4992 | + // This test checks if the inverse diagonal extraction is correct |
| 4993 | + // by comparing the extracted inverse diagonal with the expected values. |
| 4994 | + // The expected values are obtained by taking the reciprocal of the diagonal elements. |
| 4995 | + |
| 4996 | + LocalVector<T> diag; |
| 4997 | + matrix.ExtractDiagonal(&diag); |
| 4998 | + LocalVector<T> inv_diag; |
| 4999 | + matrix.ExtractInverseDiagonal(&inv_diag); |
| 5000 | + EXPECT_EQ(inv_diag.GetSize(), diag.GetSize()); |

Uploaded package to this PPA: https://launchpad.net/~bruno-bdmoura/+archive/ubuntu/lp-2138877
(-proposed and all archs enabled)