diff --git a/README.md b/README.md
index 8abc5fda..e1281936 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,18 @@
-**PhasicFlow** is a parallel C++ code for performing DEM simulations. It can run on shared-memory multi-core computational units such as multi-core CPUs or GPUs (for now it works on CUDA-enabled GPUs). The parallelization method mainly relies on loop-level parallelization on a shared-memory computational unit. You can build and run PhasicFlow in serial mode on regular PCs, in parallel mode for multi-core CPUs, or build it for a GPU device to off-load computations to a GPU. In its current statues you can simulate millions of particles (up to 80M particles tested) on a single desktop computer. You can see the [performance tests of PhasicFlow](https://github.com/PhasicFlow/phasicFlow/wiki/Performance-of-phasicFlow) in the wiki page.
+**PhasicFlow** is a parallel C++ code for performing DEM simulations. It can run on shared-memory multi-core computational units such as multi-core CPUs or GPUs (for now it works on CUDA-enabled GPUs). The parallelization method mainly relies on loop-level parallelization on a shared-memory computational unit. You can build and run PhasicFlow in serial mode on regular PCs, in parallel mode for multi-core CPUs, or build it for a GPU device to offload computations to a GPU. In its current status, you can simulate millions of particles (up to 80M particles tested) on a single desktop computer. You can see the [performance tests of PhasicFlow](https://github.com/PhasicFlow/phasicFlow/wiki/Performance-of-phasicFlow) on the wiki page.
+**MPI** parallelization with dynamic load balancing is under development. With this level of parallelization, PhasicFlow can leverage the computational power of **multi-GPU** workstations or distributed-memory CPU clusters.
+In summary, PhasicFlow has six execution modes:
+1. Serial on a single CPU core,
+2. Parallel on a multi-core computer/node (using OpenMP),
+3. Parallel on an NVIDIA GPU (using CUDA),
+4. Parallel on distributed-memory workstations (using MPI),
+5. Parallel on distributed-memory workstations with multi-core nodes (using MPI+OpenMP),
+6. Parallel on workstations with multiple GPUs (using MPI+CUDA).

## How to build?
-You can build PhasicFlow for CPU and GPU executions. [Here is a complete step-by-step procedure](https://github.com/PhasicFlow/phasicFlow/wiki/How-to-Build-PhasicFlow).
+You can build PhasicFlow for CPU and GPU executions. The latest release of PhasicFlow is v-0.1. [Here is a complete step-by-step procedure for building phasicFlow-v-0.1](https://github.com/PhasicFlow/phasicFlow/wiki/How-to-Build-PhasicFlow).
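As a concrete illustration of the loop-level parallelization mentioned above: PhasicFlow writes its loops against Kokkos-style parallel constructs (calls such as `Kokkos::parallel_for` and `Kokkos::parallel_reduce` appear in the source changes below), so one loop body can be compiled for a serial, OpenMP, or CUDA back-end, which is what enables execution modes 1 to 3 from a single source tree. The following is a minimal, hypothetical sketch of such a loop, not PhasicFlow's actual field API:

```cpp
// Minimal sketch (assumed Kokkos usage, not PhasicFlow's own containers):
// the same loop body runs serially, on OpenMP threads, or on a CUDA device,
// depending on the default execution space Kokkos was configured with.
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[])
{
    Kokkos::initialize(argc, argv);
    {
        const int n = 1'000'000;
        Kokkos::View<double*> velocity("velocity", n);

        // loop-level parallelism over all particles
        Kokkos::parallel_for(
            "integrateVelocity",
            n,
            KOKKOS_LAMBDA(const int i) { velocity(i) += 9.8 * 0.0001; }
        );
        Kokkos::fence();
    }
    Kokkos::finalize();
    return 0;
}
```

The back-end is fixed when Kokkos (and hence PhasicFlow) is configured at build time; the MPI-based modes 4 to 6 then add domain decomposition and the processor-boundary data exchange that the changes below implement.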
## Online code documentation You can find a full documentation of the code, its features, and other related materials on [online documentation of the code](https://phasicflow.github.io/phasicFlow/) diff --git a/src/Interaction/contactSearch/ContactSearch/ContactSearch.hpp b/src/Interaction/contactSearch/ContactSearch/ContactSearch.hpp index bb72657d..8906eaa5 100644 --- a/src/Interaction/contactSearch/ContactSearch/ContactSearch.hpp +++ b/src/Interaction/contactSearch/ContactSearch/ContactSearch.hpp @@ -130,10 +130,10 @@ public: csPairContainerType& pwPairs, bool force = false) override { - ppTimer().start(); - - Particles().boundingSphere().updateBoundaries(DataDirection::SlaveToMaster); + + ppTimer().start(); + const auto& position = Particles().pointPosition().deviceViewAll(); const auto& flags = Particles().dynPointStruct().activePointsMaskDevice(); const auto& diam = Particles().boundingSphere().deviceViewAll(); @@ -167,6 +167,7 @@ public: csPairContainerType& pwPairs, bool force = false)override { + Particles().boundingSphere().updateBoundaries(DataDirection::SlaveToMaster); return csBoundaries_[i].broadSearch( iter, t, @@ -176,7 +177,6 @@ public: force); } - bool enterBroadSearch(uint32 iter, real t, real dt)const override { if(ppwContactSearch_) diff --git a/src/Interaction/contactSearch/boundaries/processorBoundaryContactSearch/processorBoundaryContactSearch.cpp b/src/Interaction/contactSearch/boundaries/processorBoundaryContactSearch/processorBoundaryContactSearch.cpp index 9f9384e9..8ab8e61d 100644 --- a/src/Interaction/contactSearch/boundaries/processorBoundaryContactSearch/processorBoundaryContactSearch.cpp +++ b/src/Interaction/contactSearch/boundaries/processorBoundaryContactSearch/processorBoundaryContactSearch.cpp @@ -85,7 +85,7 @@ bool pFlow::processorBoundaryContactSearch::broadSearch { if(masterSearch_) { - /*const auto thisPoints = boundary().thisPoints(); + const auto thisPoints = boundary().thisPoints(); const auto& neighborProcPoints = boundary().neighborProcPoints(); const auto& bDiams = diameter_.BoundaryField(thisBoundaryIndex()); const auto thisDiams = bDiams.thisField(); @@ -96,9 +96,9 @@ bool pFlow::processorBoundaryContactSearch::broadSearch thisPoints, thisDiams, neighborProcPoints, - neighborProcDiams); - - pOutput<<"ppPairs size in boundary"<< ppPairs.size()<::processorBoundarySpher geomMotion ), masterInteraction_(boundary.isBoundaryMaster()) -{ - pOutput<<"Processor boundayrCondition for "<< boundary.name()< bool pFlow::MPI::processorBoundarySphereInteraction::sphereSphereInteraction @@ -43,13 +41,13 @@ bool pFlow::MPI::processorBoundarySphereInteraction::sphereSphereInter const ContactForceModel &cfModel ) { + return true; if(!masterInteraction_) return true; const auto & sphPar = this->sphParticles(); uint32 thisIndex = this->boundary().thisBoundaryIndex(); - const auto& a = sphPar.diameter().BoundaryField(thisIndex).neighborProcField().deviceViewAll(); - - /*pFlow::MPI::processorBoundarySIKernels::sphereSphereInteraction( + pOutput<<"beofre sphereSphereInteraction"<ppPairs(), cfModel, @@ -67,7 +65,9 @@ bool pFlow::MPI::processorBoundarySphereInteraction::sphereSphereInter sphPar.rVelocity().BoundaryField(thisIndex).neighborProcField().deviceViewAll(), sphPar.contactForce().BoundaryField(thisIndex).neighborProcField().deviceViewAll(), sphPar.contactTorque().BoundaryField(thisIndex).neighborProcField().deviceViewAll() - );*/ + ); + + pOutput<<"after sphereSphereInteraction"<::iterate() bool broadSearch = contactSearch_().enterBroadSearch(iter, t, dt); - 
/*sphParticles_.diameter().updateBoundaries(DataDirection::SlaveToMaster); + sphParticles_.diameter().updateBoundaries(DataDirection::SlaveToMaster); sphParticles_.velocity().updateBoundaries(DataDirection::SlaveToMaster); sphParticles_.rVelocity().updateBoundaries(DataDirection::SlaveToMaster); sphParticles_.mass().updateBoundaries(DataDirection::SlaveToMaster); sphParticles_.I().updateBoundaries(DataDirection::SlaveToMaster); - sphParticles_.propertyId().updateBoundaries(DataDirection::SlaveToMaster);*/ + sphParticles_.propertyId().updateBoundaries(DataDirection::SlaveToMaster); if(broadSearch) diff --git a/src/phasicFlow/MPIParallelization/MPI/mpiCommunication.hpp b/src/phasicFlow/MPIParallelization/MPI/mpiCommunication.hpp index 4fd5e260..27d259eb 100644 --- a/src/phasicFlow/MPIParallelization/MPI/mpiCommunication.hpp +++ b/src/phasicFlow/MPIParallelization/MPI/mpiCommunication.hpp @@ -238,6 +238,18 @@ inline auto send(span data, int dest, int tag, Comm comm) comm); } +template +inline auto send(const T& data, int dest, int tag, Comm comm) +{ + return MPI_Send( + &data, + sFactor(), + Type(), + dest, + tag, + comm); +} + template inline auto Isend(span data, int dest, int tag, Comm comm, Request* req) { @@ -277,6 +289,19 @@ inline auto recv(span data, int source, int tag, Comm comm, Status *status) status); } +template +inline auto recv(T& data, int source, int tag, Comm comm, Status *status) +{ + return MPI_Recv( + &data, + sFactor(), + Type(), + source, + tag, + comm, + status); +} + template inline auto Irecv(T& data, int source, int tag, Comm comm, Request* req) { diff --git a/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.cpp b/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.cpp index 2595ebaa..164a2fe6 100644 --- a/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.cpp +++ b/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.cpp @@ -24,13 +24,13 @@ pFlow::MPI::processorBoundaryField::checkDataRecieved() const { if (!dataRecieved_) { - //uint32 nRecv = reciever_.waitComplete(); + uint32 nRecv = reciever_.waitBufferForUse(); dataRecieved_ = true; - /*if (nRecv != this->neighborProcSize()) + if (nRecv != this->neighborProcSize()) { fatalErrorInFunction; fatalExit; - }*/ + } } } @@ -41,7 +41,7 @@ pFlow::MPI::processorBoundaryField::updateBoundary( DataDirection direction ) { - /*if (step == 1) + if (step == 1) { // Isend if (direction == DataDirection::TwoWay || @@ -67,7 +67,7 @@ pFlow::MPI::processorBoundaryField::updateBoundary( { fatalErrorInFunction << "Invalid step number " << step << endl; return false; - }*/ + } return true; } @@ -90,6 +90,8 @@ pFlow::MPI::processorBoundaryField::processorBoundaryField( boundary.mirrorBoundaryIndex() ) { + this->addEvent(message::BNDR_PROCTRANS1). 
+ addEvent(message::BNDR_PROCTRANS2); } template diff --git a/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.hpp b/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.hpp index 5fb0780a..0a6bad28 100644 --- a/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.hpp +++ b/src/phasicFlow/MPIParallelization/pointField/processorBoundaryField.hpp @@ -50,11 +50,11 @@ public: private: - dataSender sender_; + dataSender sender_; - mutable dataReciever reciever_; + dataReciever reciever_; - mutable bool dataRecieved_ = true; + mutable bool dataRecieved_ = true; void checkDataRecieved()const; @@ -82,7 +82,6 @@ public: ProcVectorType& neighborProcField() override; - const ProcVectorType& neighborProcField()const override; bool hearChanges diff --git a/src/phasicFlow/MPIParallelization/pointStructure/boundaries/boundaryProcessor.cpp b/src/phasicFlow/MPIParallelization/pointStructure/boundaries/boundaryProcessor.cpp index 50098e0a..246959b1 100644 --- a/src/phasicFlow/MPIParallelization/pointStructure/boundaries/boundaryProcessor.cpp +++ b/src/phasicFlow/MPIParallelization/pointStructure/boundaries/boundaryProcessor.cpp @@ -21,15 +21,13 @@ Licence: #include "boundaryProcessor.hpp" #include "dictionary.hpp" #include "mpiCommunication.hpp" +#include "boundaryBaseKernels.hpp" +#include "internalPoints.hpp" void pFlow::MPI::boundaryProcessor::checkSize() const { - if (!sizeObtained_) - { - //MPI_Wait(&sizeRequest_, StatusIgnore); - sizeObtained_ = true; - } + } void @@ -37,13 +35,13 @@ pFlow::MPI::boundaryProcessor::checkDataRecieved() const { if (!dataRecieved_) { - //uint32 nRecv = reciever_.waitComplete(); + uint32 nRecv = reciever_.waitBufferForUse(); dataRecieved_ = true; - /*if (nRecv != neighborProcSize()) + if (nRecv != neighborProcSize()) { fatalErrorInFunction; fatalExit; - }*/ + } } } @@ -92,8 +90,7 @@ pFlow::MPI::boundaryProcessor::beforeIteration(uint32 iterNum, real t, real dt) pFlowProcessors().localCommunicator(), MPI_STATUS_IGNORE ); - - sizeObtained_ = false; + MPI_Request_free(&req); return true; } @@ -135,6 +132,105 @@ pFlow::MPI::boundaryProcessor::updataBoundary(int step) return true; } +bool pFlow::MPI::boundaryProcessor::transferData(int step) +{ + if(step==1) + { + uint32 s = size(); + uint32Vector_D transferFlags("transferFlags",s+1, s+1, RESERVE()); + transferFlags.fill(0u); + + const auto& transferD = transferFlags.deviceViewAll(); + auto points = thisPoints(); + auto p = boundaryPlane().infPlane(); + + numToTransfer_ = 0; + + Kokkos::parallel_reduce + ( + "boundaryProcessor::afterIteration", + deviceRPolicyStatic(0,s), + LAMBDA_HD(uint32 i, uint32& transferToUpdate) + { + if(p.pointInNegativeSide(points(i))) + { + transferD(i)=1; + transferToUpdate++; + } + }, + numToTransfer_ + ); + + uint32Vector_D keepIndices("keepIndices"); + if(numToTransfer_ != 0u) + { + pFlow::boundaryBaseKernels::createRemoveKeepIndices + ( + indexList(), + numToTransfer_, + transferFlags, + transferIndices_, + keepIndices, + false + ); + // delete transfer point from this processor + if( !setRemoveKeepIndices(transferIndices_, keepIndices)) + { + fatalErrorInFunction<< + "error in setting transfer and keep points in boundary "<< name()< sender_; - mutable Request sSizeRequest_; + dataReciever reciever_; - int req_=0; + mutable bool dataRecieved_ = true; - mutable bool sizeObtained_ = true; + uint32 numToTransfer_ = 0; - mutable dataSender sender_; + uint32 numToRecieve_ = 0; - mutable dataReciever reciever_; + uint32Vector_D 
transferIndices_{"transferIndices"}; - mutable bool dataRecieved_ = true; + void checkSize() const; - void checkSize()const; + void checkDataRecieved() const; - void checkDataRecieved()const; - - /// @brief Update processor boundary data for this processor - /// @param step It is either 1 or 2 in the input to indicate - /// the update step - /// @return true if successful - /// @details This method is called by boundaryList two times to - /// allow processor boundaries to exchange data in two steps. - /// The first step is a buffered non-blocking send and the second - /// step is non-blocking recieve to get data. - bool updataBoundary(int step)override; + /// @brief Update processor boundary data for this processor + /// @param step It is either 1 or 2 in the input to indicate + /// the update step + /// @return true if successful + /// @details This method is called by boundaryList two times to + /// allow processor boundaries to exchange data in two steps. + /// The first step is a buffered non-blocking send and the second + /// step is non-blocking recieve to get data. + bool updataBoundary(int step) override; -public: + bool transferData(int step) override; - TypeInfo("boundary"); + public: + TypeInfo("boundary"); - boundaryProcessor( - const dictionary& dict, - const plane& bplane, - internalPoints& internal, - boundaryList& bndrs, - uint32 thisIndex - ); + boundaryProcessor( + const dictionary &dict, + const plane &bplane, + internalPoints &internal, + boundaryList &bndrs, + uint32 thisIndex); - ~boundaryProcessor() override = default; + ~boundaryProcessor() override = default; - add_vCtor - ( - boundaryBase, - boundaryProcessor, - dictionary - ); + add_vCtor( + boundaryBase, + boundaryProcessor, + dictionary); - bool beforeIteration(uint32 iterNum, real t, real dt) override; + bool beforeIteration(uint32 iterNum, real t, real dt) override; - bool iterate(uint32 iterNum, real t, real dt) override; + bool iterate(uint32 iterNum, real t, real dt) override; - bool afterIteration(uint32 iterNum, real t, real dt) override; + bool afterIteration(uint32 iterNum, real t, real dt) override; - /// @brief Return number of points in the neighbor processor boundary. - /// This is overriden from boundaryBase. - uint32 neighborProcSize() const override; + /// @brief Return number of points in the neighbor processor boundary. + /// This is overriden from boundaryBase. + uint32 neighborProcSize() const override; - /// @brief Return a reference to point positions in the neighbor - /// processor boundary. - realx3Vector_D& neighborProcPoints() override; + /// @brief Return a reference to point positions in the neighbor + /// processor boundary. + realx3Vector_D &neighborProcPoints() override; - /// @brief Return a const reference to point positions in the - /// neighbor processor boundary. - const realx3Vector_D& neighborProcPoints() const override; - -}; + /// @brief Return a const reference to point positions in the + /// neighbor processor boundary. 
+ const realx3Vector_D &neighborProcPoints() const override; + }; } // namespace pFlow::MPI diff --git a/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataReciever.hpp b/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataReciever.hpp index 13069b2a..962146eb 100644 --- a/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataReciever.hpp +++ b/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataReciever.hpp @@ -27,13 +27,11 @@ private: BufferVectorType buffer_; - std::vector buffer0_; - int fromProc_; int tag_; - Request recvRequest_; + mutable Request recvRequest_ = RequestNull; public: @@ -46,34 +44,40 @@ public: ~dataReciever()=default; + uint32 waitBufferForUse()const + { + if(recvRequest_ != RequestNull) + { + Status status; + MPI_Wait(&recvRequest_, &status); + int count; + CheckMPI(getCount(&status, count), true); + + return static_cast(count); + } + else + return buffer_.size(); + } + void recieveData( const localProcessors& processors, uint32 numToRecv ) { - - buffer0_.clear(); - buffer0_.resize(numToRecv); - MPI_Status status; + waitBufferForUse(); + buffer_.clear(); + buffer_.resize(numToRecv); - /*CheckMPI(recv( - buffer_.getSpan(), - fromProc_, - tag_, - processors.localCommunicator(), - &status), true);*/ - MPI_Recv( - buffer0_.data(), - buffer0_.size(), - realx3Type__, - fromProc_, - tag_, - processors.localCommunicator(), - &status + CheckMPI( + Irecv( + buffer_.getSpan(), + fromProc_, + tag_, + processors.localCommunicator(), + &recvRequest_ + ), + true ); - int c; - getCount(&status, c); - pOutput<<"Number of data recieved "<(&status, count), true); - - return static_cast(count);*/ - return buffer_.size(); - } - }; } diff --git a/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataSender.hpp b/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataSender.hpp index 11c1782f..6342009b 100644 --- a/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataSender.hpp +++ b/src/phasicFlow/MPIParallelization/pointStructure/boundaries/dataSender.hpp @@ -26,15 +26,13 @@ public: private: - //BufferVectorType buffer_; - - std::vector buffer_; + BufferVectorType buffer_; int toProc_; int tag_; - Request sendRequest_ = RequestNull; + mutable Request sendRequest_ = RequestNull; public: @@ -44,7 +42,22 @@ public: tag_(tag) {} - ~dataSender()=default; + ~dataSender() + { + if(sendRequest_ != RequestNull) + { + MPI_Request_free(&sendRequest_); + } + } + + bool waitBufferForUse()const + { + if(sendRequest_ != RequestNull) + { + MPI_Wait(&sendRequest_, StatusesIgnore); + } + return true; + } void sendData( const localProcessors& processors, @@ -52,17 +65,21 @@ public: ) { using RPolicy = Kokkos::RangePolicy< - DefaultExecutionSpace, + execution_space, Kokkos::Schedule, Kokkos::IndexType>; uint32 n = scatterField.size(); + + // make sure the buffer is ready to be used and free + // the previous request (if any). 
+ waitBufferForUse(); // clear the buffer to prevent data copy if capacity increases buffer_.clear(); buffer_.resize(n); - auto* buffView = buffer_.data(); + const auto& buffView = buffer_.deviceViewAll(); Kokkos::parallel_for( "dataSender::sendData", @@ -73,26 +90,20 @@ public: } ); Kokkos::fence(); - auto req = MPI_REQUEST_NULL; - - MPI_Isend( - buffer_.data(), - buffer_.size(), - realx3Type__, - toProc_, - tag_, - processors.localCommunicator(), - &req); - /*CheckMPI(send( - buffer_.getSpan(), - toProc_, - tag_, - processors.localCommunicator(), - MPI_STATUS_IGNORE), true);*/ + CheckMPI( + Isend(buffer_.getSpan(), + toProc_, + tag_, + processors.localCommunicator(), + &sendRequest_ + ), + true + ); + } - /*auto& buffer() + auto& buffer() { return buffer_; } @@ -100,17 +111,20 @@ public: const auto& buffer()const { return buffer_; - }*/ + } bool sendComplete() { - return true; - /*int test; - MPI_Test(&sendRequest_, &test, StatusIgnore); - if(test) - return true; + int test; + if(sendRequest_ != RequestNull) + { + MPI_Test(&sendRequest_, &test, StatusIgnore); + return test; + } else - return false;*/ + { + return true; + } } }; diff --git a/src/phasicFlow/streams/processorOstream/processorOstream.cpp b/src/phasicFlow/streams/processorOstream/processorOstream.cpp index b91950cd..e59c759a 100755 --- a/src/phasicFlow/streams/processorOstream/processorOstream.cpp +++ b/src/phasicFlow/streams/processorOstream/processorOstream.cpp @@ -156,3 +156,9 @@ void pFlow::processorOstream::indent() checkForPrefix(); Ostream::indent(); } + +pFlow::processorOstream &pFlow::processorOstream::setColor(const char *colorCode) +{ + Ostream::write(colorCode); + return *this; +} diff --git a/src/phasicFlow/streams/processorOstream/processorOstream.hpp b/src/phasicFlow/streams/processorOstream/processorOstream.hpp index d448f082..92c77d22 100755 --- a/src/phasicFlow/streams/processorOstream/processorOstream.hpp +++ b/src/phasicFlow/streams/processorOstream/processorOstream.hpp @@ -139,6 +139,8 @@ public: /// Add indentation characters void indent() override; + processorOstream& setColor(const char* colorCode); + }; // processorOstream diff --git a/src/phasicFlow/streams/streams.hpp b/src/phasicFlow/streams/streams.hpp index 56e2f2b6..e34a28cd 100755 --- a/src/phasicFlow/streams/streams.hpp +++ b/src/phasicFlow/streams/streams.hpp @@ -33,7 +33,7 @@ namespace pFlow } -#define INFORMATION pFlow::pOutput< INFO: "< INFO: "<
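The scalar `send()`/`recv()` overloads added to `mpiCommunication.hpp` mirror the existing span-based wrappers but transmit a single value, which is useful for exchanging counts before the actual data. A hedged usage sketch follows; `neighborRank`, the tag value, and the function itself are illustrative, while `CheckMPI`, `Status`, and `pFlowProcessors().localCommunicator()` are used as they appear elsewhere in this diff (the headers declaring `pFlowProcessors()` are assumed to be available):

```cpp
#include "mpiCommunication.hpp" // scalar send()/recv() overloads added in this diff

namespace pFlow::MPI
{

// Hedged sketch: exchange a single counter between two ranks with the new
// scalar overloads. neighborRank and the tag (8) are illustrative values.
void exchangeCounterSketch(int neighborRank, bool isSender)
{
    if (isSender)
    {
        uint32 numToSend = 42u; // value handed to the neighbor processor
        CheckMPI(
            send(numToSend, neighborRank, 8, pFlowProcessors().localCommunicator()),
            true);
    }
    else
    {
        uint32 numRecieved = 0;
        Status status;
        CheckMPI(
            recv(numRecieved, neighborRank, 8, pFlowProcessors().localCommunicator(), &status),
            true);
    }
}

}
```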
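The `updataBoundary(int step)` doc comment and the `dataSender`/`dataReciever` classes describe a two-step, non-blocking exchange between neighboring processors: step 1 posts the send, step 2 posts the receive, and `waitBufferForUse()` blocks only when a buffer is actually reused or read. Below is a self-contained sketch of that pattern in plain MPI; the pFlow wrapper classes are not used here, and the buffer types and tag are illustrative:

```cpp
#include <mpi.h>
#include <vector>

// Hedged, self-contained illustration of the two-step protocol described in
// the updataBoundary() doc comment: a non-blocking send, a non-blocking
// receive, and a wait before the buffers are touched again (the role of
// waitBufferForUse() in dataSender/dataReciever).
void twoStepExchangeSketch(
    const std::vector<double>& sendBuf,      // packed local boundary values
    std::vector<double>&       recvBuf,      // pre-sized to the neighbor's count
    int                        neighborRank,
    MPI_Comm                   comm)
{
    MPI_Request sendReq = MPI_REQUEST_NULL;
    MPI_Request recvReq = MPI_REQUEST_NULL;

    // step 1: non-blocking send of this side's values
    MPI_Isend(sendBuf.data(), static_cast<int>(sendBuf.size()), MPI_DOUBLE,
              neighborRank, /*tag*/ 0, comm, &sendReq);

    // step 2: non-blocking receive of the neighbor's values
    MPI_Irecv(recvBuf.data(), static_cast<int>(recvBuf.size()), MPI_DOUBLE,
              neighborRank, /*tag*/ 0, comm, &recvReq);

    // ... other computation can overlap with the communication here ...

    // equivalent of waitBufferForUse(): complete the requests before the
    // receive buffer is read or the send buffer is refilled
    MPI_Wait(&recvReq, MPI_STATUS_IGNORE);
    MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
}
```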
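The new `processorOstream::setColor()` simply forwards a color code to `Ostream::write()` and returns the stream, so it can be chained ahead of normal `<<` output, presumably for use by the reworked `INFORMATION`/`END_INFO` macros. A hedged usage sketch; `pOutput` is taken to be the global `processorOstream`, and `boldChar`, `magentaColor`, and `defaultColor` are assumed to be the color-code constants the existing macros already reference, not something defined in this diff:

```cpp
// Hedged sketch: chain setColor() before ordinary stream output.
pFlow::pOutput.setColor(boldChar).setColor(magentaColor)
    << "> INFO: " << "contact search completed" << defaultColor << pFlow::endl;
```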