diff --git a/src/problem/Formulation.cpp b/src/problem/Formulation.cpp
index 514ef49fc3268ebe4638e407fae100abbc09c37a..6ed735ce58da6fb7fc7a7501461e04a678319b69 100644
--- a/src/problem/Formulation.cpp
+++ b/src/problem/Formulation.cpp
@@ -42,7 +42,7 @@ namespace gmshddm
   template< class T_Scalar >
   Formulation< T_Scalar >::Formulation(const std::string &name, const std::vector< std::vector< unsigned int > > &topology) :
-    _name(name), _volume(), _interfaceFields(), _interfaceMapping(), _useMagma(false), _useGPUMRES(false), _numberOfIterations(0), _physicalCommutator(false), _artificialCommutator(false), _IA(false)
+    _name(name), _volume(), _interfaceFields(), _interfaceMapping(), _useMagma(false), _useGPUMRES(false), _useSparseMult(false), _numberOfIterations(0), _physicalCommutator(false), _artificialCommutator(false), _IA(false)
   {
     _topology.resize(topology.size());
     for(auto i = 0ULL; i < topology.size(); ++i) {
@@ -59,16 +59,19 @@
   }

   template< class T_Scalar >
-  Formulation< T_Scalar >::Formulation(const std::string &name, const std::vector< std::vector< unsigned int > > &topology, bool useMagma, bool useGPUMRES) :
+  Formulation< T_Scalar >::Formulation(const std::string &name, const std::vector< std::vector< unsigned int > > &topology, bool useMagma, bool useGPUMRES, bool useSparseMult) :
     _name(name), _volume(), _interfaceFields(), _interfaceMapping(), _numberOfIterations(0), _physicalCommutator(false), _artificialCommutator(false), _IA(false)
   {
 #if defined(HAVE_MAGMA)
     _useMagma = useMagma;
 #if defined(USE_CUDA) || defined(USE_HIP)
     _useGPUMRES = useGPUMRES;
+    _useSparseMult = useSparseMult;
 #else
     if (useGPUMRES){
       gmshfem::msg::warning << "Neither CUDA nor HIP was found! PETSc GPU support won't be used." << gmshfem::msg::endl;
+      _useGPUMRES = false;
+      _useSparseMult = false;
     }
 #endif
 #ifdef USE_CUDA
@@ -80,7 +83,7 @@
 #endif
     if (_useMagma){
       gmshfem::msg::info << "GMSHDDM has been compiled with MAGMA. MAGMA will be used." << gmshfem::msg::endl;
-      _magmaHelper = new MagmaHelper<T_Scalar>(_useMagma, _useGPUMRES, gmshddm::mpi::getMPIRank());
+      _magmaHelper = new MagmaHelper<T_Scalar>(_useMagma, _useGPUMRES, _useSparseMult, gmshddm::mpi::getMPIRank());
     }
 #else
     if(useMagma) {
@@ -88,6 +91,7 @@
     }
 #endif
     _topology.resize(topology.size());
+    gmshfem::msg::debug << "Topology size: " << topology.size() << gmshfem::msg::endl;
     for(auto i = 0ULL; i < topology.size(); ++i) {
       gmshfem::msg::debug << "Create volume formulation (" << i << ")." << gmshfem::msg::endl;
       _volume.push_back(new gmshfem::problem::Formulation< T_Scalar >("volume_" + std::to_string(i)));
@@ -99,17 +103,18 @@
       }
     }
+    gmshfem::msg::debug << "End of Formulation constructor." << gmshfem::msg::endl;
   }

   template< class T_Scalar >
   Formulation< T_Scalar >::Formulation(const std::string &name, const std::vector< gmshfem::problem::Formulation< T_Scalar > * > &volume, const std::vector< std::map< unsigned int, gmshfem::problem::Formulation< T_Scalar > * > > &surface) :
-    _name(name), _volume(volume), _interfaceFields(), _interfaceMapping(), _useMagma(false), _useGPUMRES(false), _numberOfIterations(0), _physicalCommutator(false), _artificialCommutator(false), _IA(false)
+    _name(name), _volume(volume), _interfaceFields(), _interfaceMapping(), _useMagma(false), _useGPUMRES(false), _useSparseMult(false), _numberOfIterations(0), _physicalCommutator(false), _artificialCommutator(false), _IA(false)
   {
 #warning "Still usable?"
   }

   template< class T_Scalar >
-  Formulation< T_Scalar >::Formulation(const std::string &name, const std::vector< gmshfem::problem::Formulation< T_Scalar > * > &volume, const std::vector< std::map< unsigned int, gmshfem::problem::Formulation< T_Scalar > * > > &surface, bool useMagma, bool useGPUMRES) :
+  Formulation< T_Scalar >::Formulation(const std::string &name, const std::vector< gmshfem::problem::Formulation< T_Scalar > * > &volume, const std::vector< std::map< unsigned int, gmshfem::problem::Formulation< T_Scalar > * > > &surface, bool useMagma, bool useGPUMRES, bool useSparseMult) :
     _name(name), _volume(volume), _interfaceFields(), _interfaceMapping(), _numberOfIterations(0), _physicalCommutator(false), _artificialCommutator(false), _IA(false)
   {
 #warning "Still usable?"
@@ -1211,7 +1216,7 @@
     assemble();

     if (_useGPUMRES){
-      _magmaHelper->createOutgoingGVolumes(algebraicHelpers.fromVolumeToInterfaceScatter, algebraicHelpers.interfaceSizes, algebraicHelpers.interfaceIndices, algebraicHelpers.shiftedIdentity, algebraicHelpers.localGsize,algebraicHelpers.superOutgoingGVolume ,algebraicHelpers.outgoingGVolume);
+      _magmaHelper->createOutgoingGVolumes(algebraicHelpers.fromVolumeToInterfaceScatter, algebraicHelpers.interfaceSizes, algebraicHelpers.interfaceIndices, algebraicHelpers.shiftedIdentity, algebraicHelpers.localGsize, algebraicHelpers.superOutgoingGVolume, algebraicHelpers.outgoingGVolume);
       _magmaHelper->freeMagmaCPUArrays();
     }
     //VecView(algebraicHelpers.superOutgoingGVolume, PETSC_VIEWER_STDOUT_WORLD);
@@ -1370,7 +1375,6 @@
     PetscInt globSize;

     // Create Petsc distributed vector - let Petsc compute the global vector size
-    // THIS IS WHAT BREAKS GPUMRES ???
     if(_useGPUMRES) {
 #ifdef USE_CUDA
       VecCreateMPICUDA(MPI_COMM_WORLD, locSize, PETSC_DETERMINE, &RHS);
@@ -1479,14 +1483,14 @@
     // No need to call allocateGread or setMultRHS after this point.
     if (_useMagma && _useGPUMRES){
-      _magmaHelper->allocateGread(algebraicHelpers.Gread, algebraicHelpers.localGsize);
       for (unsigned long incre = 0; incre < algebraicHelpers.myDoms.size(); ++incre) {
         unsigned long idom = algebraicHelpers.myDoms[incre];
         auto [shift, endIdx] = _offsets[idom];
         _localGsSize[idom] = endIdx - shift;
         _localGsCudaPtr[idom] = shift;
       }
-      _magmaHelper->setMultRHS(_localGsCudaPtr, _localGsSize);
+      _magmaHelper->allocateGread(algebraicHelpers.Gread, algebraicHelpers.localGsize, _localGsCudaPtr, _localGsSize);
+      //_magmaHelper->setMultRHS(_localGsCudaPtr, _localGsSize);
     }

     // Call the AbstractIterativeSolver object
@@ -2240,7 +2244,7 @@
     PetscLogEventRegister("Alg: offsetStuff", 0, &offsetStuff);
     PetscLogEventRegister("Alg: offsetDomain", 0, &offsetDomain);

-    PetscLogEventBegin(scatter, 0, 0, 0, 0); // Scatter Breaks GRead ???
+    PetscLogEventBegin(scatter, 0, 0, 0, 0);
     PetscCall(VecScatterBegin(algebraicHelpers.globalInterfaceScatter, G, algebraicHelpers.Gread, INSERT_VALUES, SCATTER_FORWARD));
     PetscCall(VecScatterEnd(algebraicHelpers.globalInterfaceScatter, G, algebraicHelpers.Gread, INSERT_VALUES, SCATTER_FORWARD));
     PetscLogEventEnd(scatter, 0, 0, 0, 0);
@@ -2249,7 +2253,7 @@
     if (_useGPUMRES){
       PetscLogEventBegin(matmult, 0, 0, 0, 0);
-      _magmaHelper->updateRHS();
+      _magmaHelper->updateRHS(algebraicHelpers.Gread);
       PetscLogEventEnd(matmult, 0, 0, 0, 0);
       PetscFunctionReturn(0);
     }
@@ -2274,7 +2278,7 @@
       _magmaHelper->uploadMultRHS(_localGs);
       PetscLogEventEnd(magmaMRHS, 0, 0, 0, 0);
       PetscLogEventBegin(matmult, 0, 0, 0, 0);
-      _magmaHelper->updateRHS();
+      _magmaHelper->updateRHS(algebraicHelpers.Gread);
       PetscLogEventEnd(matmult, 0, 0, 0, 0);
     } else {
       for(unsigned idom = 0; idom < size(); ++idom) {
@@ -2602,8 +2606,6 @@
   template< class T_Scalar >
   PetscErrorCode Formulation< T_Scalar >::_solveAllVolumeAndExtractRhs()
-  // On ne construit pas explicitement le vecteur B, on le construit directement dans la boucle puis on le lie à la matrice LHS directement
-  // C'est pas comme avec les matrices LHS où on a un gros _mats qui fait plz
   {
     PetscFunctionBeginUser;

diff --git a/src/problem/Formulation.h b/src/problem/Formulation.h
index 53e8f0e3f28cd4c0654666a1e9801807a0efef9a..20f08cca1b9185b91394b3e3ea7892414d6551ce 100644
--- a/src/problem/Formulation.h
+++ b/src/problem/Formulation.h
@@ -82,6 +82,7 @@ namespace gmshddm
   public:
     bool _useMagma;
     bool _useGPUMRES;
+    bool _useSparseMult;
     std::unique_ptr<gmshfem::system::AbstractMonitor> _monitor;
     unsigned int _numberOfIterations;
@@ -175,8 +176,8 @@
   public:
     Formulation(const std::string &name, const std::vector< std::vector< unsigned int > > &topology);
     Formulation(const std::string &name, const std::vector< gmshfem::problem::Formulation< T_Scalar > * > &volume, const std::vector< std::map< unsigned int, gmshfem::problem::Formulation< T_Scalar > * > > &surface);
-    Formulation(const std::string &name, const std::vector< std::vector< unsigned int > > &topology, bool useMagma, bool useGPUMRES);
-    Formulation(const std::string &name, const std::vector< gmshfem::problem::Formulation< T_Scalar > * > &volume, const std::vector< std::map< unsigned int, gmshfem::problem::Formulation< T_Scalar > * > > &surface, bool useMagma, bool useGPUMRES);
+    Formulation(const std::string &name, const std::vector< std::vector< unsigned int > > &topology, bool useMagma, bool useGPUMRES, bool useSparseMult);
+    Formulation(const std::string &name, const std::vector< gmshfem::problem::Formulation< T_Scalar > * > &volume, const std::vector< std::map< unsigned int, gmshfem::problem::Formulation< T_Scalar > * > > &surface, bool useMagma, bool useGPUMRES, bool useSparseMult);
     ~Formulation();

     unsigned int size() const;
diff --git a/src/problem/MagmaHelper.cpp b/src/problem/MagmaHelper.cpp
index 12381013ca5f291ea5f51eb0fa50fddca3015920..796ff8dd5c2da2a3006769c3e3ae7f61b96a96e8 100644
--- a/src/problem/MagmaHelper.cpp
+++ b/src/problem/MagmaHelper.cpp
@@ -14,10 +14,15 @@ namespace gmshddm
   namespace problem
   {
     template< class T_Scalar >
-    MagmaHelper<T_Scalar>::MagmaHelper(bool useMagma, bool GPUMRES, int MPI_RANK)
+    MagmaHelper<T_Scalar>::MagmaHelper(bool useMagma, bool GPUMRES, bool sparseMult, int MPI_RANK)
     {
 #ifdef HAVE_MAGMA
       useGPUMRES = GPUMRES;
+      if (useGPUMRES){
+        useSparseMult = sparseMult;
+      } else {
+        useSparseMult = false;
+      }
       if (useMagma){
        _MPI_RANK = MPI_RANK;
        _totalNDom = 0;
@@ -229,6 +234,7 @@
      //   magma_zprint_gpu( _lddx, _numMatrices, _d_X, _lddx, _queue);
      // }
      VecCreateSeqCUDAWithArray(PETSC_COMM_SELF, 1, _numMatrices * _lddx, (PetscScalar*)_d_X, &outgoingGVolumeVec);
+     _outgoingGVolumeVec = &outgoingGVolumeVec;
 #elif defined(USE_HIP)
      VecCreateSeqHIPWithArray(PETSC_COMM_SELF, 1, _numMatrices * _lddx, (PetscScalar*)_d_X, &outgoingGVolumeVec);
 #else
@@ -308,7 +314,6 @@
      // Allocate device memory
      magma_malloc( (magma_ptr*) &_d_X, _lddx*numRhs*_numMatrices*sizeof(Magma_Scalar) );
      magma_malloc( (void**) &_dX_array, _numMatrices * sizeof(Magma_Scalar*) );
-     magma_malloc( (magma_ptr*) &_d_B, _lddb*numRhs*_numMatrices*sizeof(Magma_Scalar) );
      magma_malloc_cpu( (void**) &_hB_array, _numMatrices * sizeof(Magma_Scalar*) );
      magma_malloc( (void**) &_dB_array, _numMatrices * sizeof(Magma_Scalar*) );
@@ -415,20 +420,6 @@
 #endif
    }

-   template< class T_Scalar >
-   void MagmaHelper<T_Scalar>::setMultRHS(std::unordered_map< unsigned int, unsigned int> _localGsCudaPtr, std::unordered_map< unsigned int, unsigned int> _localGsSize){
-   #ifndef HAVE_MAGMA
-     throw gmshfem::common::Exception("MAGMA is not available");
-   #else
-     for (size_t i = 0; i < _numMatrices; ++i) {
-       auto pointer = reinterpret_cast<Magma_Scalar*>(_d_B + _localGsCudaPtr[_ids[i]]);
-       // Yes, I'm technically doing a out of bounds access and yes I don't care 😎 -> I'm not anymore (I think ???)
-       _hB_array[i] = pointer;
-     }
-     magma_setvector(_numMatrices, sizeof(Magma_Scalar*), _hB_array, 1, _dB_array, 1, _queue);
-   #endif
-   }
-
    template< class T_Scalar >
    void MagmaHelper<T_Scalar>::uploadMultRHS(std::unordered_map< unsigned int, Mat > &_updatedRHS)
    {
@@ -553,17 +544,10 @@
          _rhsMat_n = n;
        }
      }
-
-     std::cout << " - padded size : " << _paddedMatrixSize << std::endl;
-     std::cout << " - rhsMat_m : " << _rhsMat_m << std::endl;
-     std::cout << " - rhsMat_n : " << _rhsMat_n << std::endl;
-
-     if (_rhsMat_m != _paddedMatrixSize) {
-       throw gmshfem::common::Exception("Je comprends pas comment c'est possible HAHAHA Roland tu dois réfléchir !");
-     }
-
-     std::cout << "Allocaiton ..." << std::endl;
-     // Allocate host memory
+     _ldb = _rhsMat_n;
+     _lddb = magma_roundup(_rhsMat_n, 32);
      _ldMult = _rhsMat_m;
      _lddMult = magma_roundup(_rhsMat_m, 32);
      _sizeMult = (unsigned long long)_ldMult * (unsigned long long)_rhsMat_n * (unsigned long long)_numMatrices; // Here the matrix is rectangular!
@@ -578,75 +562,110 @@
      size_t requiredMemory = _sizeMult * sizeof(Magma_Scalar) + _rhsMat_n * _numMatrices * sizeof(Magma_Scalar);
      std::cout << " - Required memory : " << requiredMemory / 1e9 << " GB" << std::endl;

-     _h_Mult = (Magma_Scalar *)calloc(sizeof(Magma_Scalar), _sizeMult);
-     //magma_malloc_cpu( (void**) &_h_Mult, _sizeMult*sizeof(Magma_Scalar) );
-
-     std::cout << "Allocation done" << std::endl;
-     // Fill the arrays
-     size_t sparseCount = 0;
-     for (size_t i = 0; i < _numMatrices; ++i) {
-       auto matrix = rhsMatricesSparsified[_ids[i]]; // Make sure that the order is correct
-       PetscInt m, n; // m = number of rows, n = number of columns
-       MatGetSize(matrix, &m, &n);
-       // Would love to do a MatGetValues directly into _h_Mult but I don't think I can do std::complex -> Magma_Scalar ?
-       Also _h_Mult must be padded pffffff
-       for (PetscInt k = 0; k < n; ++k) {
-         for (PetscInt j = 0; j < m; ++j) {
-           PetscScalar value;
-           MatGetValue(matrix, j, k, &value);
-           if (!(value.real() == 0.0 && value.imag() == 0.0)) {
-             sparseCount++;
-           }
-           if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
-             _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = MAGMA_Z_MAKE(value.real(), value.imag());
-           } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
-             _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = MAGMA_C_MAKE(value.real(), value.imag());
-           } else if constexpr(std::is_same_v<T_Scalar, double>) {
-             _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = value;
-           } else {
-             _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = value;
-           }
-         }
-       }
-     }
+     // Only works with a single right-hand side (numRhs == 1).
+     magma_malloc( (magma_ptr*) &_d_B, _lddb*1*_numMatrices*sizeof(Magma_Scalar) );
+
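+     // Sparse path idea: pack every per-subdomain RHS matrix into one block-diagonal
+     // sparse matrix. Block i occupies rows [i*_lddMult, i*_lddMult + m) and columns
+     // [i*_lddb, i*_lddb + n), so a single MatMult on the concatenated padded RHS
+     // vector applies all subdomain multiplications at once, replacing the batched
+     // dense GEMV path.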
+     if (useSparseMult) { // Sparse path
+       MatCreate(PETSC_COMM_SELF, &_sparseFullMatrix);
+       MatSetType(_sparseFullMatrix, MATSEQAIJCUSPARSE);
+       MatSetSizes(_sparseFullMatrix, _lddMult * _numMatrices, _lddb * _numMatrices, PETSC_DECIDE, PETSC_DECIDE);
+       size_t sparseCount = 0;
+       for (size_t i = 0; i < _numMatrices; ++i) {
+         auto matrix = rhsMatricesSparsified[_ids[i]]; // Make sure that the order is correct
+         PetscInt m, n; // m = number of rows, n = number of columns
+         MatGetSize(matrix, &m, &n);
+         for (PetscInt j = 0; j < m; ++j) {
+           for (PetscInt k = 0; k < n; ++k) {
+             PetscScalar value;
+             MatGetValue(matrix, j, k, &value);
+             if (!(value.real() == 0.0 && value.imag() == 0.0)) {
+               sparseCount++;
+               MatSetValue(_sparseFullMatrix, i * _lddMult + j, i * _lddb + k, value, INSERT_VALUES); // Entry-by-entry insertion; acceptable at setup time
+             }
+           }
+         }
+       }
+       MatAssemblyBegin(_sparseFullMatrix, MAT_FINAL_ASSEMBLY);
+       MatAssemblyEnd(_sparseFullMatrix, MAT_FINAL_ASSEMBLY);
+       VecCreateSeqCUDAWithArray(PETSC_COMM_SELF, 1, _lddb * _numMatrices, (PetscScalar*)_d_B, &_multRHS);
+       std::cout << "Required sparse memory : " << sparseCount * (sizeof(Magma_Scalar) + 2 * sizeof(size_t)) / 1e9 << " GB" << std::endl;
+     } else {
+       if (_rhsMat_m != _paddedMatrixSize) {
+         throw gmshfem::common::Exception("Inconsistent sizes: _rhsMat_m != _paddedMatrixSize");
+       }

+       std::cout << "Allocation ..." << std::endl;
+       // Allocate host memory
+       _h_Mult = (Magma_Scalar *)calloc(sizeof(Magma_Scalar), _sizeMult);
+       //magma_malloc_cpu( (void**) &_h_Mult, _sizeMult*sizeof(Magma_Scalar) );
+       std::cout << "Allocation done" << std::endl;

+       // Fill the arrays
+       size_t sparseCount = 0;
+       for (size_t i = 0; i < _numMatrices; ++i) {
+         auto matrix = rhsMatricesSparsified[_ids[i]]; // Make sure that the order is correct
+         PetscInt m, n; // m = number of rows, n = number of columns
+         MatGetSize(matrix, &m, &n);
+         // Ideally MatGetValues would write directly into _h_Mult, but std::complex cannot be converted to Magma_Scalar that way.
+         // _h_Mult must also be padded.
+         for (PetscInt k = 0; k < n; ++k) {
+           for (PetscInt j = 0; j < m; ++j) {
+             PetscScalar value;
+             MatGetValue(matrix, j, k, &value);
+             if (!(value.real() == 0.0 && value.imag() == 0.0)) {
+               sparseCount++;
+             }
+             if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
+               _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = MAGMA_Z_MAKE(value.real(), value.imag());
+             } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
+               _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = MAGMA_C_MAKE(value.real(), value.imag());
+             } else if constexpr(std::is_same_v<T_Scalar, double>) {
+               _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = value;
+             } else {
+               _h_Mult[i*_ldMult*_rhsMat_n + k*_ldMult + j] = value;
+             }
+           }
+         }
+       }
-     std::cout << "Required sparse memory : " << sparseCount * (sizeof(Magma_Scalar) + 2 * sizeof(size_t)) / 1e9 << std::endl;
+       std::cout << "Required sparse memory : " << sparseCount * (sizeof(Magma_Scalar) + 2 * sizeof(size_t)) / 1e9 << " GB" << std::endl;

-     // Matrices filled
+       // Matrices filled

-     // Allocate device memory
-     magma_malloc( (magma_ptr*) &_d_Mult, _lddMult*_rhsMat_n*_numMatrices*sizeof(Magma_Scalar) );
-     magma_malloc( (void**) &_d_Mult_array, _numMatrices * sizeof(Magma_Scalar*) );
+       // Allocate device memory
+       magma_malloc( (magma_ptr*) &_d_Mult, _lddMult*_rhsMat_n*_numMatrices*sizeof(Magma_Scalar) );
+       magma_malloc( (void**) &_d_Mult_array, _numMatrices * sizeof(Magma_Scalar*) );

-     void* zeroedDMULT = calloc(sizeof(Magma_Scalar), _lddMult*_rhsMat_n*_numMatrices);
-     if (zeroedDMULT == NULL) {
-       throw gmshfem::common::Exception("Failed to allocate memory for zeroedDMULT");
-     }
-     magma_setvector(_lddMult*_rhsMat_n*_numMatrices, sizeof(Magma_Scalar), zeroedDMULT, 1, _d_Mult, 1, _queue);
-     free(zeroedDMULT);
-
-     if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
-       magma_zsetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
-       magma_zset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
-     } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
-       magma_csetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
-       magma_cset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
-     } else if constexpr(std::is_same_v<T_Scalar, double>) {
-       magma_dsetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
-       magma_dset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
-     } else {
-       magma_ssetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
-       magma_sset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
+       void* zeroedDMULT = calloc(sizeof(Magma_Scalar), _lddMult*_rhsMat_n*_numMatrices);
+       if (zeroedDMULT == NULL) {
+         throw gmshfem::common::Exception("Failed to allocate memory for zeroedDMULT");
+       }
+       magma_setvector(_lddMult*_rhsMat_n*_numMatrices, sizeof(Magma_Scalar), zeroedDMULT, 1, _d_Mult, 1, _queue);
+       free(zeroedDMULT);

+       if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
+         magma_zsetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
+         magma_zset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
+       } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
+         magma_csetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
+         magma_cset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
+       } else if constexpr(std::is_same_v<T_Scalar, double>) {
+         magma_dsetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
+         magma_dset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
+       } else {
+         magma_ssetmatrix( _ldMult, _rhsMat_n*_numMatrices, _h_Mult, _ldMult, _d_Mult, _lddMult, _queue );
+         magma_sset_pointer( _d_Mult_array, _d_Mult, _ldMult, 0, 0, _lddMult*_rhsMat_n, _numMatrices, _queue );
+       }
+       std::cout << " - padded size : " << _paddedMatrixSize << std::endl;
+       std::cout << " - rhsMat_m : " << _rhsMat_m << std::endl;
+       std::cout << " - rhsMat_n : " << _rhsMat_n << std::endl;
+       std::cout << "setRHSMatrix end" << std::endl;
+     }
-     std::cout << " - padded size : " << _paddedMatrixSize << std::endl;
-     std::cout << " - rhsMat_m : " << _rhsMat_m << std::endl;
-     std::cout << " - rhsMat_n : " << _rhsMat_n << std::endl;
-     std::cout << "setRHSMatrix end" <<std::endl;
 #endif
    }

    template< class T_Scalar >
-   void MagmaHelper<T_Scalar>::updateRHS()
+   void MagmaHelper<T_Scalar>::updateRHS(Vec& Gread)
    {
 #ifndef HAVE_MAGMA
      throw gmshfem::common::Exception("MAGMA is not available");
 #else
@@ -659,31 +678,38 @@
      //   //magma_zprint_gpu(_lddb, _numRhs*_numMatrices, _d_B, _lddb, _queue);
      // }
+     if (useSparseMult){
+       VecScatterBegin(_fromPetscToPadded, Gread, _multRHS, INSERT_VALUES, SCATTER_FORWARD);
+       VecScatterEnd(_fromPetscToPadded, Gread, _multRHS, INSERT_VALUES, SCATTER_FORWARD);
+       MatMult(_sparseFullMatrix, _multRHS, *_outgoingGVolumeVec);
+       //VecView(*_outgoingGVolumeVec, PETSC_VIEWER_STDOUT_WORLD);
-
-     if (_numRhs == 1){
-       if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
-         magmablas_zgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, MAGMA_Z_MAKE(1.0,0.0), _d_Mult_array, _lddMult, _dB_array, 1, MAGMA_Z_MAKE(0.0,0.0), _dX_array, 1, _numMatrices, _queue);
-       } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
-         magmablas_cgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, MAGMA_C_MAKE(1.0,0.0), _d_Mult_array, _lddMult, _dB_array, 1, MAGMA_C_MAKE(0.0,0.0), _dX_array, 1, _numMatrices, _queue);
-       } else if constexpr(std::is_same_v<T_Scalar, double>) {
-         magmablas_dgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, 1.0, _d_Mult_array, _lddMult, _dB_array, 1, 0.0, _dX_array, 1, _numMatrices, _queue);
-       } else {
-         magmablas_sgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, 1.0, _d_Mult_array, _lddMult, _dB_array, 1, 0.0, _dX_array, 1, _numMatrices, _queue);
-       }
-     } else {
-       if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
-         magma_zgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, MAGMA_Z_MAKE(-1.0,0.0), _d_Mult_array, _lddMult, _dB_array, _lddb, MAGMA_Z_MAKE(0.0,0.0), _dX_array, _lddx, _numMatrices, _queue);
-       } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
-         magma_cgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, MAGMA_C_MAKE(-1.0,0.0), _d_Mult_array, _lddMult, _dB_array, _lddb, MAGMA_C_MAKE(0.0,0.0), _dX_array, _lddx, _numMatrices, _queue);
-       } else if constexpr(std::is_same_v<T_Scalar, double>) {
-         magma_dgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, -1.0, _d_Mult_array, _lddMult, _dB_array, _lddb, 0.0, _dX_array, _lddx, _numMatrices, _queue);
-       } else {
-         magma_sgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, -1.0, _d_Mult_array, _lddMult, _dB_array, _lddb, 0.0, _dX_array, _lddx, _numMatrices, _queue);
-       }
-     }
+     } else { // dense batched multiplication path (an else is required here, otherwise the GEMV below would also run with uninitialized _dB_array in the sparse case)
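+       // Dense batched path: one GEMV (or one GEMM when there are several right-hand
+       // sides) per subdomain block, launched as a single MAGMA batched call.
+       // _d_Mult_array, _dB_array and _dX_array hold the per-block device pointers;
+       // _lddMult is the padded leading dimension.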
+       if (_numRhs == 1){
+         if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
+           magmablas_zgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, MAGMA_Z_MAKE(1.0,0.0), _d_Mult_array, _lddMult, _dB_array, 1, MAGMA_Z_MAKE(0.0,0.0), _dX_array, 1, _numMatrices, _queue);
+         } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
+           magmablas_cgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, MAGMA_C_MAKE(1.0,0.0), _d_Mult_array, _lddMult, _dB_array, 1, MAGMA_C_MAKE(0.0,0.0), _dX_array, 1, _numMatrices, _queue);
+         } else if constexpr(std::is_same_v<T_Scalar, double>) {
+           magmablas_dgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, 1.0, _d_Mult_array, _lddMult, _dB_array, 1, 0.0, _dX_array, 1, _numMatrices, _queue);
+         } else {
+           magmablas_sgemv_batched(MagmaNoTrans, _ldMult, _rhsMat_n, 1.0, _d_Mult_array, _lddMult, _dB_array, 1, 0.0, _dX_array, 1, _numMatrices, _queue);
+         }
+       } else {
+         if constexpr(std::is_same_v<T_Scalar, std::complex<double>>) {
+           magma_zgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, MAGMA_Z_MAKE(-1.0,0.0), _d_Mult_array, _lddMult, _dB_array, _lddb, MAGMA_Z_MAKE(0.0,0.0), _dX_array, _lddx, _numMatrices, _queue);
+         } else if constexpr(std::is_same_v<T_Scalar, std::complex<float>>) {
+           magma_cgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, MAGMA_C_MAKE(-1.0,0.0), _d_Mult_array, _lddMult, _dB_array, _lddb, MAGMA_C_MAKE(0.0,0.0), _dX_array, _lddx, _numMatrices, _queue);
+         } else if constexpr(std::is_same_v<T_Scalar, double>) {
+           magma_dgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, -1.0, _d_Mult_array, _lddMult, _dB_array, _lddb, 0.0, _dX_array, _lddx, _numMatrices, _queue);
+         } else {
+           magma_sgemm_batched(MagmaNoTrans, MagmaNoTrans, _ldMult, _rhsMat_n, _numRhs, -1.0, _d_Mult_array, _lddMult, _dB_array, _lddb, 0.0, _dX_array, _lddx, _numMatrices, _queue);
+         }
+       }
+     }
 #endif
    }

@@ -745,7 +771,7 @@
    // Allocate the Gread vector so that no out-of-bounds access occurs
    template< class T_Scalar >
-   void MagmaHelper<T_Scalar>::allocateGread(Vec& Gread, unsigned localGSize)
+   void MagmaHelper<T_Scalar>::allocateGread(Vec& Gread, unsigned localGSize, std::unordered_map< unsigned int, unsigned int> _localGsCudaPtr, std::unordered_map< unsigned int, unsigned int> _localGsSize)
    {
 #ifndef HAVE_MAGMA
      throw gmshfem::common::Exception("MAGMA is not available");
 #else
@@ -759,17 +785,54 @@
      //std::cout << "allocate Gread HIP" << std::endl;
      VecHIPGetArray(Gread, &petscArray);
 #endif
-     // Copy to _dB
-     magma_copyvector_async(2 * localGSize, sizeof(double), petscArray, 1, _d_B, 1, _queue);
-     // Create new GSize
+     // With the sparse path we need padding -> use _localGsCudaPtr to determine the local sizes
+     if (useSparseMult) { // Sparse
+       // Create new padded Gread
+       // size_t currPos = 0;
+       // for (size_t i = 0; i < _numMatrices; ++i) {
+       //   magma_copyvector_async(2 * _localGsCudaPtr[_ids[i]], sizeof(double), petscArray + currPos, 1, _d_B + i * _lddb, 1, _queue);
+       //   currPos += _localGsSize[_ids[i]];
+       // }
+       // Create the _multRHS to Gread scatter (basically a copy without the padding)
+       std::vector< PetscInt > inputIndices;
+       PetscInt rstart, rend;
+       VecGetOwnershipRange(Gread, &rstart, &rend);
+       PetscInt localSize = rend - rstart;
+       IS outIS;
+       ISCreateStride(MPI_COMM_WORLD, localSize, rstart, 1, &outIS);
+
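+       // Build the index map from the contiguous PETSc layout of Gread to the padded
+       // per-block layout of _multRHS: entry j of block i lands at offset i*_lddb + j,
+       // and the tail of each block up to _lddb is left as zero padding.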
+       for (size_t i = 0; i < _numMatrices; ++i) {
+         for (size_t j = 0; j < _localGsSize[_ids[i]]; ++j) {
+           inputIndices.push_back(i * _lddb + j);
+         }
+       }
+
+       IS indexIS;
+       ISCreateGeneral(MPI_COMM_WORLD, inputIndices.size(), inputIndices.data(), PETSC_COPY_VALUES, &indexIS);
+       VecScatterCreate(Gread, outIS, _multRHS, indexIS, &_fromPetscToPadded);
+       VecScatterCreate(_multRHS, indexIS, Gread, outIS, &_fromPaddedToPetsc);
+     } else {
+       magma_copyvector_async(2 * localGSize, sizeof(double), petscArray, 1, _d_B, 1, _queue);
+       VecDestroy(&Gread);
+       #ifdef USE_CUDA
+       VecCreateMPICUDAWithArray(PETSC_COMM_WORLD, 1, localGSize, PETSC_DECIDE, (PetscScalar*)_d_B, &Gread);
+       #endif
+       #ifdef USE_HIP
+       VecCreateMPIHIPWithArray(PETSC_COMM_WORLD, 1, localGSize, PETSC_DECIDE, (PetscScalar*)_d_B, &Gread);
+       #endif
+       for (size_t i = 0; i < _numMatrices; ++i) {
+         auto pointer = reinterpret_cast<Magma_Scalar*>(_d_B + _localGsCudaPtr[_ids[i]]);
+         // No out-of-bounds access here: _d_B is allocated with the full padding taken
+         // into account. A batched GEMV may still read into the padded region that
+         // belongs to the RHS of other matrices, but that is fine: the matrix is padded
+         // with zeros, so those entries do not contribute to the result.
+         _hB_array[i] = pointer;
+       }
+       magma_setvector(_numMatrices, sizeof(Magma_Scalar*), _hB_array, 1, _dB_array, 1, _queue);
+     }

-     VecDestroy(&Gread);
-     #ifdef USE_CUDA
-     VecCreateMPICUDAWithArray(PETSC_COMM_WORLD, 1, localGSize, PETSC_DECIDE, (PetscScalar*)_d_B, &Gread);
-     #endif
-     #ifdef USE_HIP
-     VecCreateMPIHIPWithArray(PETSC_COMM_WORLD, 1, localGSize, PETSC_DECIDE, (PetscScalar*)_d_B, &Gread); // This doesn't directly break _d_B ... ????
-     #endif
 #endif
      // _d_B has the correct value here
    }
@@ -781,7 +844,9 @@
 #else
      magma_free_cpu(_h_X);
      magma_free_cpu(_h_B);
-     magma_free_cpu(_h_Mult);
+     if (_h_Mult != NULL) {
+       magma_free_cpu(_h_Mult);
+     }
 #endif
    }

diff --git a/src/problem/MagmaHelper.h b/src/problem/MagmaHelper.h
index b662aec696cc92434382265d174cea0a907a3fa8..996dae74ceec5cf4ff90d4dfdeab4f80921d04d1 100644
--- a/src/problem/MagmaHelper.h
+++ b/src/problem/MagmaHelper.h
@@ -56,7 +56,7 @@ namespace gmshddm
   public:
     using Magma_Scalar = typename MagmaTypes<T_Scalar>::Magma_Scalar;
     using Magma_Pointer = typename MagmaTypes<T_Scalar>::Magma_Pointer;
-    MagmaHelper(bool useMagma, bool useGPUMRES, int MPI_RANK);
+    MagmaHelper(bool useMagma, bool useGPUMRES, bool sparseMult, int MPI_RANK);
     ~MagmaHelper();

     // Process unordered map of CRS matrices to be used in batched magma routines
@@ -64,15 +64,15 @@
     void convertRHSVectorToBatchedRHSMatrix(std::unordered_map< unsigned int, Mat > &_rhs, unsigned numRhs);
     void factPlusSolve(); // First solve, which produces the LU factorization of the matrices
     void solveOnly(); // Solve with a new RHS once the LU factorization is available (the "direct" part of the DDM solver)
-    void updateRHS(); // Met à jour le RHS
+    void updateRHS(Vec& Gread); // Updates the RHS (sparse path)
     void uploadMultRHS(std::unordered_map< unsigned, Vec >& _updatedRHS); // Updates the RHS of the multiplication
     void uploadMultRHS(std::unordered_map< unsigned, Mat >& _updatedRHS); // Updates the RHS of the multiplication
     void getSolution(std::unordered_map< unsigned, Vec >& _whereToPutIt); // Retrieves the solution
     void getSolution(std::unordered_map< unsigned, Mat >& _whereToPutIt); // Retrieves the solution
     void setRHSMatrix(std::unordered_map< unsigned, Mat>& rhsMatricesSparsified);
-    void setMultRHS(std::unordered_map< unsigned int, unsigned int> _localGsCudaPtr, std::unordered_map< unsigned int, unsigned int> _localGsSize);
+    //void setMultRHS(std::unordered_map< unsigned int, unsigned int> _localGsCudaPtr, std::unordered_map< unsigned int, unsigned int> _localGsSize);
     void createOutgoingGVolumes(VecScatter &fromVolumeToInterfaceScatter, std::unordered_map< unsigned, unsigned >& _interfaceSizes, std::map<unsigned, IS>& _interfaceIndices, std::map<unsigned, IS> &_shiftedIdentity, unsigned localGSize, Vec &outgoingGVolumeVec, std::unordered_map< unsigned, Vec > &outgoingGVolumes);
-    void allocateGread(Vec& Gread, unsigned localGSize);
+    void allocateGread(Vec& Gread, unsigned localGSize, std::unordered_map< unsigned int, unsigned int> _localGsCudaPtr, std::unordered_map< unsigned int, unsigned int> _localGsSize);
     void freeMagmaCPUArrays();

   private:
@@ -88,13 +88,17 @@
     int _MPI_RANK;
     unsigned _totalNDom;
     bool useGPUMRES;
+    bool useSparseMult;

 #ifdef HAVE_MAGMA
     magma_queue_t _queue;
     // h_A -> Matrix of the systems
     // h_B -> RHS of the systems
     // h_Mult -> Compute RHS
-    Magma_Scalar *_h_A, *_h_B, *_h_X, *_h_Mult;
+    Magma_Scalar *_h_A = NULL;
+    Magma_Scalar *_h_B = NULL;
+    Magma_Scalar *_h_X = NULL;
+    Magma_Scalar *_h_Mult = NULL;
     Magma_Pointer _d_A, _d_B, _d_X, _d_Mult;
     magma_int_t _lda = 0, _ldda = 0, _ldb = 0, _lddb = 0, _ldMult = 0, _lddMult = 0, _ldx = 0, _lddx = 0;
     magma_int_t *_ipiv, *_info;
@@ -103,6 +107,12 @@
     Magma_Scalar **_dB_array = NULL;
     Magma_Scalar **_dX_array = NULL;

+    Mat _sparseFullMatrix;
+    Vec* _outgoingGVolumeVec;
+    Vec _multRHS;
+    VecScatter _fromPaddedToPetsc;
+    VecScatter _fromPetscToPadded;
+
     Magma_Scalar **_d_Mult_array = NULL;
     Magma_Scalar **_hB_array = NULL;

diff --git a/tutorials/helmholtz/waveguide/main.cpp b/tutorials/helmholtz/waveguide/main.cpp
index a374aaae5dabd9e70e4c6a04f5a735a1be738170..605963b8f15ca12b8f556580582d55681c2a08c4 100644
--- a/tutorials/helmholtz/waveguide/main.cpp
+++ b/tutorials/helmholtz/waveguide/main.cpp
@@ -108,6 +108,7 @@ int main(int argc, char **argv)
   int order = 1;
   int magma = 0;
   int gpumres = 0;
+  int sparseMult = 0;
   int savesolution = 1;
   std::string iopath = ".";
   int bSaveFunct = 0;
@@ -124,8 +125,10 @@
   gmshDdm.userDefinedParameter(savesolution, "saveSolution");
   gmshDdm.userDefinedParameter(iopath, "iopath");
   gmshDdm.userDefinedParameter(bSaveFunct, "saveFunction");
+  gmshDdm.userDefinedParameter(sparseMult, "sparseMult");
   bool useMagma = (magma != 0);
   bool useGPUMRES = (gpumres != 0);
+  bool useSparseMult = (sparseMult != 0);
   bool saveSolution = (savesolution != 0);
   bool saveFunction = (bSaveFunct != 0);

@@ -155,6 +158,8 @@
   // DUMP EDGE TAGS
   //gmsh::model::mesh::createEdges();

   // Define domain
   Subdomain omega(nDom);
   Subdomain gammaInf(nDom);
   Subdomain gammaDir(nDom);
   Subdomain border(nDom);
@@ -163,29 +168,68 @@
   Interface sigma(nDom);
   Domain corner = Domain("corner");

+  std::map< std::pair< int, int >, std::vector< std::pair<int, int> > > groupsMap;
+  std::vector< std::pair<int, int> > physGroups;
+  std::vector< std::vector < std::pair<int, int> > > entities;
+
+  gmsh::model::getPhysicalGroupsEntities(physGroups, entities);
+
+  for(unsigned int i = 0; i < physGroups.size(); ++i) {
+    groupsMap[physGroups[i]] = entities[i];
+  }
+
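+  // groupsMap caches the result of a single gmsh::model::getPhysicalGroupsEntities()
+  // call, so the loop below resolves each physical group with one map lookup instead
+  // of one gmsh API query per subdomain.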
+//#pragma omp parallel for
   for(unsigned int i = 0; i < nDom; ++i) {
-    omega(i) = Domain(2, i + 1);
+    omega(i) = Domain(groupsMap[std::make_pair(2, i + 1)]);

     if(i == nDom - 1) {
-      gammaInf(i) = Domain(1, 2);
+      gammaInf(i) = Domain(groupsMap[std::make_pair(1, 2)]);
     }

     if(i == 0) {
-      gammaDir(i) = Domain(1, 1);
+      gammaDir(i) = Domain(groupsMap[std::make_pair(1, 1)]);
     }

-    border(i) = Domain(1, 100 + i);
+    border(i) = Domain(groupsMap[std::make_pair(1, 100 + i)]);

     if(i != 0) {
-      sigma(i, i - 1) = Domain(1, 10000 + i - 1);
+      sigma(i, i - 1) = Domain(groupsMap[std::make_pair(1, 10000 + i - 1)]);
     }
     if(i != nDom - 1) {
-      sigma(i, i + 1) = Domain(1, 10000 + i);
+      sigma(i, i + 1) = Domain(groupsMap[std::make_pair(1, 10000 + i)]);
     }
   }

   // Define topology
   std::vector< std::vector< unsigned int > > topology(nDom);
+//#pragma omp parallel for
   for(unsigned int i = 0; i < nDom; ++i) {
     if(i != 0) {
       topology[i].push_back(i - 1);
@@ -195,32 +239,41 @@
     }
   }

   // Create DDM formulation
-  gmshddm::problem::Formulation< std::complex< double > > formulation("Helmholtz", topology, useMagma, useGPUMRES);
+  gmshddm::problem::Formulation< std::complex< double > > formulation("Helmholtz", topology, useMagma, useGPUMRES, useSparseMult);

   // Create fields
   // SubdomainField< std::complex< double >, Form::Form0 > u("u", omega | gammaDir | gammaInf | border | sigma, FunctionSpaceTypeForm0::HierarchicalH1, order);
-  SubdomainField< std::complex< double >, Form::Form0 > u("u", omega | gammaDir | gammaInf | border | sigma, FunctionSpaceTypeForm0::Lagrange);
+  SubdomainField< std::complex< double >, Form::Form0 > u("u", omega | gammaDir | gammaInf | border | sigma, FunctionSpaceTypeForm0::Lagrange); // Takes time

+//#pragma omp parallel for
   for(unsigned int i = 0; i < nDom; ++i) {
     u(i).addConstraint(border(i), 0.);
   }
   u(0).addConstraint(gammaDir(0), sin< std::complex< double > >(pi * y< std::complex< double > >() / l));

   //InterfaceField< std::complex< double >, Form::Form0 > g("g", sigma, FunctionSpaceTypeForm0::HierarchicalH1, order);
-  InterfaceField< std::complex< double >, Form::Form0 > g("g", sigma, FunctionSpaceTypeForm0::Lagrange);
+  InterfaceField< std::complex< double >, Form::Form0 > g("g", sigma, FunctionSpaceTypeForm0::Lagrange); // Very slow

+//#pragma omp parallel for
   for(unsigned int i = 0; i < nDom; ++i) {
     for(unsigned int jj = 0; jj < topology[i].size(); ++jj) {
       const unsigned int j = topology[i][jj];
       g(i, j).addConstraint(corner, 0.);
     }
-  }
+  } // A bit slow
+  // Most of the setup time is spent before this point.

   std::complex< double > im(0., 1.);

   // Tell the formulation that g is a field that has to be exchanged between subdomains
   formulation.addInterfaceField(g);

   // Add terms to the formulation
+//#pragma omp parallel for
   for(unsigned int i = 0; i < nDom; ++i) {
     // VOLUME TERMS
     formulation(i).integral(grad(dof(u(i))), grad(tf(u(i))), omega(i), gauss);
@@ -248,6 +301,8 @@
     }
   }
+  // Surprisingly fast up to here.
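+  // Usage note: assuming GmshFem's usual "-name value" command-line convention for
+  // userDefinedParameter, the new path would be enabled with something like
+  //   ./waveguide -magma 1 -gpumres 1 -sparseMult 1
+  // (binary name illustrative; sparseMult only takes effect together with magma and gpumres).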

   // Solve the DDM formulation
   formulation.pre();
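As a standalone sketch of the technique this patch introduces (independent of gmshddm and MAGMA; the file name, block sizes and values are invented for the example), the following program builds a block-diagonal PETSc matrix from small dense blocks and applies all of them with a single MatMult. It uses plain MATSEQAIJ on the host, whereas the patch uses MATSEQAIJCUSPARSE so the product stays on the GPU:

// block_diag_matmult_sketch.cpp - illustrative only, not part of the patch.
#include <petscmat.h>

int main(int argc, char **argv)
{
  PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));

  const PetscInt nBlocks = 2, m = 3, n = 2; // two 3x2 dense blocks

  Mat A;
  PetscCall(MatCreate(PETSC_COMM_SELF, &A));
  PetscCall(MatSetType(A, MATSEQAIJ)); // the patch uses MATSEQAIJCUSPARSE on the GPU
  PetscCall(MatSetSizes(A, nBlocks * m, nBlocks * n, PETSC_DECIDE, PETSC_DECIDE));
  PetscCall(MatSetUp(A));

  // Block i occupies rows [i*m, i*m + m) and columns [i*n, i*n + n),
  // mirroring the i*_lddMult / i*_lddb offsets used in setRHSMatrix.
  for (PetscInt i = 0; i < nBlocks; ++i)
    for (PetscInt j = 0; j < m; ++j)
      for (PetscInt k = 0; k < n; ++k) {
        PetscScalar v = (PetscScalar)(1 + i + j + k);
        PetscCall(MatSetValue(A, i * m + j, i * n + k, v, INSERT_VALUES));
      }
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

  // One concatenated right-hand-side vector; a single MatMult applies every
  // block at once, which is what replaces the batched dense GEMV calls.
  Vec x, y;
  PetscCall(MatCreateVecs(A, &x, &y));
  PetscCall(VecSet(x, 1.0));
  PetscCall(MatMult(A, x, y));
  PetscCall(VecView(y, PETSC_VIEWER_STDOUT_SELF));

  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(MatDestroy(&A));
  PetscCall(PetscFinalize());
  return 0;
}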