Merge pull request #1 from hamidrezanorouzi/MPIdev

MPIdev
Hamidreza Norouzi 2024-05-05 23:02:16 +03:30 committed by GitHub
commit 30d5349fcf
16 changed files with 362 additions and 176 deletions

View File

@ -5,8 +5,16 @@
**PhasicFlow** is a parallel C++ code for performing DEM simulations. It can run on shared-memory multi-core computational units such as multi-core CPUs or GPUs (for now it works on CUDA-enabled GPUs). The parallelization method mainly relies on loop-level parallelization on a shared-memory computational unit. You can build and run PhasicFlow in serial mode on regular PCs, in parallel mode for multi-core CPUs, or build it for a GPU device to off-load computations to a GPU. In its current status, you can simulate millions of particles (up to 80M particles tested) on a single desktop computer. You can see the [performance tests of PhasicFlow](https://github.com/PhasicFlow/phasicFlow/wiki/Performance-of-phasicFlow) on the wiki page.
**MPI** parallelization with dynamic load balancing is under development. With this level of parallelization, PhasicFlow can leverage the computational power of **multi-GPU** workstations or clusters with distributed-memory CPUs.
In summary, PhasicFlow has six execution modes:
1. Serial on a single CPU core,
2. Parallel on a multi-core computer/node (using OpenMP),
3. Parallel on an NVIDIA GPU (using CUDA),
4. Parallel on distributed-memory workstations (using MPI),
5. Parallel on distributed-memory workstations with multi-core nodes (using MPI+OpenMP),
6. Parallel on workstations with multiple GPUs (using MPI+CUDA).
## How to build?
You can build PhasicFlow for CPU and GPU executions. [Here is a complete step-by-step procedure](https://github.com/PhasicFlow/phasicFlow/wiki/How-to-Build-PhasicFlow).
You can build PhasicFlow for CPU and GPU executions. The latest release of PhasicFlow is v-0.1. [Here is a complete step-by-step procedure for building phasicFlow-v-0.1](https://github.com/PhasicFlow/phasicFlow/wiki/How-to-Build-PhasicFlow).
## Online code documentation
You can find the full documentation of the code, its features, and other related materials in the [online documentation of the code](https://phasicflow.github.io/phasicFlow/).

View File

@ -130,9 +130,9 @@ public:
csPairContainerType& pwPairs,
bool force = false) override
{
ppTimer().start();
Particles().boundingSphere().updateBoundaries(DataDirection::SlaveToMaster);
ppTimer().start();
const auto& position = Particles().pointPosition().deviceViewAll();
const auto& flags = Particles().dynPointStruct().activePointsMaskDevice();
@ -167,6 +167,7 @@ public:
csPairContainerType& pwPairs,
bool force = false)override
{
Particles().boundingSphere().updateBoundaries(DataDirection::SlaveToMaster);
return csBoundaries_[i].broadSearch(
iter,
t,
@ -176,7 +177,6 @@ public:
force);
}
bool enterBroadSearch(uint32 iter, real t, real dt)const override
{
if(ppwContactSearch_)

View File

@ -85,7 +85,7 @@ bool pFlow::processorBoundaryContactSearch::broadSearch
{
if(masterSearch_)
{
/*const auto thisPoints = boundary().thisPoints();
const auto thisPoints = boundary().thisPoints();
const auto& neighborProcPoints = boundary().neighborProcPoints();
const auto& bDiams = diameter_.BoundaryField(thisBoundaryIndex());
const auto thisDiams = bDiams.thisField();
@ -96,9 +96,9 @@ bool pFlow::processorBoundaryContactSearch::broadSearch
thisPoints,
thisDiams,
neighborProcPoints,
neighborProcDiams);
pOutput<<"ppPairs size in boundary"<< ppPairs.size()<<endl; */
neighborProcDiams
);
//pOutput<<"ppSize "<< ppPairs.size()<<endl;
return true;
}else

View File

@ -99,7 +99,7 @@ bool pFlow::twoPartContactSearch::broadSearchPP
ppPairs.increaseCapacityBy(len);
INFORMATION<< "Particle-particle contact pair container capacity increased from "<<
oldCap << " to "<<ppPairs.capacity()<<" in peiodicBoundaryContactSearch."<<END_INFO;
oldCap << " to "<<ppPairs.capacity()<<" in contact search in boundary region."<<END_INFO;
}

View File

@ -32,9 +32,7 @@ pFlow::MPI::processorBoundarySphereInteraction<cFM, gMM>::processorBoundarySpher
geomMotion
),
masterInteraction_(boundary.isBoundaryMaster())
{
pOutput<<"Processor boundayrCondition for "<< boundary.name()<<endl;
}
{}
template <typename cFM, typename gMM>
bool pFlow::MPI::processorBoundarySphereInteraction<cFM, gMM>::sphereSphereInteraction
@ -43,13 +41,13 @@ bool pFlow::MPI::processorBoundarySphereInteraction<cFM, gMM>::sphereSphereInter
const ContactForceModel &cfModel
)
{
return true;
if(!masterInteraction_) return true;
const auto & sphPar = this->sphParticles();
uint32 thisIndex = this->boundary().thisBoundaryIndex();
const auto& a = sphPar.diameter().BoundaryField(thisIndex).neighborProcField().deviceViewAll();
/*pFlow::MPI::processorBoundarySIKernels::sphereSphereInteraction(
pOutput<<"beofre sphereSphereInteraction"<<endl;
pFlow::MPI::processorBoundarySIKernels::sphereSphereInteraction(
dt,
this->ppPairs(),
cfModel,
@ -67,7 +65,9 @@ bool pFlow::MPI::processorBoundarySphereInteraction<cFM, gMM>::sphereSphereInter
sphPar.rVelocity().BoundaryField(thisIndex).neighborProcField().deviceViewAll(),
sphPar.contactForce().BoundaryField(thisIndex).neighborProcField().deviceViewAll(),
sphPar.contactTorque().BoundaryField(thisIndex).neighborProcField().deviceViewAll()
);*/
);
pOutput<<"after sphereSphereInteraction"<<endl;
return true;
}

View File

@ -166,12 +166,12 @@ bool pFlow::sphereInteraction<cFM,gMM, cLT>::iterate()
bool broadSearch = contactSearch_().enterBroadSearch(iter, t, dt);
/*sphParticles_.diameter().updateBoundaries(DataDirection::SlaveToMaster);
sphParticles_.diameter().updateBoundaries(DataDirection::SlaveToMaster);
sphParticles_.velocity().updateBoundaries(DataDirection::SlaveToMaster);
sphParticles_.rVelocity().updateBoundaries(DataDirection::SlaveToMaster);
sphParticles_.mass().updateBoundaries(DataDirection::SlaveToMaster);
sphParticles_.I().updateBoundaries(DataDirection::SlaveToMaster);
sphParticles_.propertyId().updateBoundaries(DataDirection::SlaveToMaster);*/
sphParticles_.propertyId().updateBoundaries(DataDirection::SlaveToMaster);
if(broadSearch)

View File

@ -238,6 +238,18 @@ inline auto send(span<T> data, int dest, int tag, Comm comm)
comm);
}
template<typename T>
inline auto send(const T& data, int dest, int tag, Comm comm)
{
return MPI_Send(
&data,
sFactor<T>(),
Type<T>(),
dest,
tag,
comm);
}
template<typename T>
inline auto Isend(span<T> data, int dest, int tag, Comm comm, Request* req)
{
@ -277,6 +289,19 @@ inline auto recv(span<T> data, int source, int tag, Comm comm, Status *status)
status);
}
template<typename T>
inline auto recv(T& data, int source, int tag, Comm comm, Status *status)
{
return MPI_Recv(
&data,
sFactor<T>(),
Type<T>(),
source,
tag,
comm,
status);
}
template<typename T>
inline auto Irecv(T& data, int source, int tag, Comm comm, Request* req)
{
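The two overloads added in this hunk, `send(const T&, ...)` and `recv(T&, ...)`, wrap `MPI_Send`/`MPI_Recv` for a single value instead of a `span<T>`; this is what `boundaryProcessor::transferData` later uses to exchange point counts before the bulk transfer. A minimal sketch of that usage, assuming the `pFlow::MPI` aliases (`Request`, `Status`, `RequestNull`, `CheckMPI`) from `mpiTypes.hpp` and placeholder `neighborRank`, `tag`, and `comm` values; the scalar `Isend` overload is the one used elsewhere in this PR:

```cpp
// Sketch only (not repository code): exchange scalar counts with a neighbor
// rank before sending the actual data. Mirrors the Isend/recv pairing used in
// boundaryProcessor::transferData; neighborRank, tag and comm are placeholders.
uint32 numToSend = 100;   // e.g. number of outgoing points
uint32 numToRecv = 0;

Request req = RequestNull;
Status  st;

CheckMPI(Isend(numToSend, neighborRank, tag, comm, &req), true); // non-blocking
CheckMPI(recv (numToRecv, neighborRank, tag, comm, &st ), true); // blocking
MPI_Request_free(&req);  // the request completes on its own; same pattern as this PR
```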

View File

@ -24,13 +24,13 @@ pFlow::MPI::processorBoundaryField<T, MemorySpace>::checkDataRecieved() const
{
if (!dataRecieved_)
{
//uint32 nRecv = reciever_.waitComplete();
uint32 nRecv = reciever_.waitBufferForUse();
dataRecieved_ = true;
/*if (nRecv != this->neighborProcSize())
if (nRecv != this->neighborProcSize())
{
fatalErrorInFunction;
fatalExit;
}*/
}
}
}
@ -41,7 +41,7 @@ pFlow::MPI::processorBoundaryField<T, MemorySpace>::updateBoundary(
DataDirection direction
)
{
/*if (step == 1)
if (step == 1)
{
// Isend
if (direction == DataDirection::TwoWay ||
@ -67,7 +67,7 @@ pFlow::MPI::processorBoundaryField<T, MemorySpace>::updateBoundary(
{
fatalErrorInFunction << "Invalid step number " << step << endl;
return false;
}*/
}
return true;
}
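With the commented-out body restored, `updateBoundary` performs the exchange in two externally driven passes: step 1 posts the non-blocking sends and step 2 posts the matching receives, so every processor boundary has its send in flight before any receive is waited on. A hypothetical caller sketch, assuming the full signature is `updateBoundary(int step, DataDirection direction)` (the step checks and the `direction` parameter are both visible in this hunk); the real driver lives in `boundaryList`, which is not part of this diff:

```cpp
// Hypothetical driver, not the actual boundaryList code.
for (auto& bf : procBoundaryFields)                      // placeholder container
    bf.updateBoundary(1, DataDirection::SlaveToMaster);  // step 1: post sends
for (auto& bf : procBoundaryFields)
    bf.updateBoundary(2, DataDirection::SlaveToMaster);  // step 2: post receives
```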
@ -90,6 +90,8 @@ pFlow::MPI::processorBoundaryField<T, MemorySpace>::processorBoundaryField(
boundary.mirrorBoundaryIndex()
)
{
this->addEvent(message::BNDR_PROCTRANS1).
addEvent(message::BNDR_PROCTRANS2);
}
template<class T, class MemorySpace>

View File

@ -50,11 +50,11 @@ public:
private:
dataSender<T, MemorySpace> sender_;
dataSender<T, MemorySpace> sender_;
mutable dataReciever<T, MemorySpace> reciever_;
dataReciever<T, MemorySpace> reciever_;
mutable bool dataRecieved_ = true;
mutable bool dataRecieved_ = true;
void checkDataRecieved()const;
@ -82,7 +82,6 @@ public:
ProcVectorType& neighborProcField() override;
const ProcVectorType& neighborProcField()const override;
bool hearChanges

View File

@ -21,15 +21,13 @@ Licence:
#include "boundaryProcessor.hpp"
#include "dictionary.hpp"
#include "mpiCommunication.hpp"
#include "boundaryBaseKernels.hpp"
#include "internalPoints.hpp"
void
pFlow::MPI::boundaryProcessor::checkSize() const
{
if (!sizeObtained_)
{
//MPI_Wait(&sizeRequest_, StatusIgnore);
sizeObtained_ = true;
}
}
void
@ -37,13 +35,13 @@ pFlow::MPI::boundaryProcessor::checkDataRecieved() const
{
if (!dataRecieved_)
{
//uint32 nRecv = reciever_.waitComplete();
uint32 nRecv = reciever_.waitBufferForUse();
dataRecieved_ = true;
/*if (nRecv != neighborProcSize())
if (nRecv != neighborProcSize())
{
fatalErrorInFunction;
fatalExit;
}*/
}
}
}
@ -92,8 +90,7 @@ pFlow::MPI::boundaryProcessor::beforeIteration(uint32 iterNum, real t, real dt)
pFlowProcessors().localCommunicator(),
MPI_STATUS_IGNORE
);
sizeObtained_ = false;
MPI_Request_free(&req);
return true;
}
@ -135,6 +132,105 @@ pFlow::MPI::boundaryProcessor::updataBoundary(int step)
return true;
}
bool pFlow::MPI::boundaryProcessor::transferData(int step)
{
if(step==1)
{
uint32 s = size();
uint32Vector_D transferFlags("transferFlags",s+1, s+1, RESERVE());
transferFlags.fill(0u);
const auto& transferD = transferFlags.deviceViewAll();
auto points = thisPoints();
auto p = boundaryPlane().infPlane();
numToTransfer_ = 0;
Kokkos::parallel_reduce
(
"boundaryProcessor::afterIteration",
deviceRPolicyStatic(0,s),
LAMBDA_HD(uint32 i, uint32& transferToUpdate)
{
if(p.pointInNegativeSide(points(i)))
{
transferD(i)=1;
transferToUpdate++;
}
},
numToTransfer_
);
uint32Vector_D keepIndices("keepIndices");
if(numToTransfer_ != 0u)
{
pFlow::boundaryBaseKernels::createRemoveKeepIndices
(
indexList(),
numToTransfer_,
transferFlags,
transferIndices_,
keepIndices,
false
);
// delete transfer point from this processor
if( !setRemoveKeepIndices(transferIndices_, keepIndices))
{
fatalErrorInFunction<<
"error in setting transfer and keep points in boundary "<< name()<<endl;
return false;
}
}
else
{
transferIndices_.clear();
}
auto req = RequestNull;
CheckMPI( Isend(
numToTransfer_,
neighborProcessorNo(),
thisBoundaryIndex(),
pFlowProcessors().localCommunicator(),
&req), true );
CheckMPI(recv(
numToRecieve_,
neighborProcessorNo(),
mirrorBoundaryIndex(),
pFlowProcessors().localCommunicator(),
StatusesIgnore), true);
MPI_Request_free(&req);
return true;
}
else if(step ==2 )
{
pointFieldAccessType transferPoints(
transferIndices_.size(),
transferIndices_.deviceViewAll(),
internal().pointPositionDevice());
sender_.sendData(pFlowProcessors(), transferPoints);
return true;
}
else if(step == 3)
{
reciever_.recieveData(pFlowProcessors(), numToRecieve_);
return true;
}
else if(step == 4)
{
reciever_.waitBufferForUse();
//
return false;
}
return false;
}
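`transferData` spreads particle migration across four externally driven steps: step 1 flags the points on the negative side of the boundary plane, removes them locally, and exchanges the outgoing/incoming counts with the neighbor; step 2 packs the flagged positions and posts a non-blocking send; step 3 posts the receive for `numToRecieve_` points; step 4 waits on the receive buffer. A hypothetical driver sketch (the real caller is not part of this diff); completing each step on every boundary before starting the next keeps counts and buffers consistent across ranks:

```cpp
// Hypothetical driver, not repository code.
for (int step = 1; step <= 4; ++step)
{
    for (auto& bndry : procBoundaries)   // placeholder container of boundaryProcessor
    {
        bndry.transferData(step);
    }
}
```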
bool
pFlow::MPI::boundaryProcessor::iterate(uint32 iterNum, real t, real dt)
{
@ -144,5 +240,54 @@ pFlow::MPI::boundaryProcessor::iterate(uint32 iterNum, real t, real dt)
bool
pFlow::MPI::boundaryProcessor::afterIteration(uint32 iterNum, real t, real dt)
{
uint32 s = size();
pOutput<<"size of boundary is "<< s <<endl;
uint32Vector_D transferFlags("transferFlags",s+1, s+1, RESERVE());
transferFlags.fill(0u);
const auto& transferD = transferFlags.deviceViewAll();
auto points = thisPoints();
auto p = boundaryPlane().infPlane();
uint32 numTransfer = 0;
Kokkos::parallel_reduce
(
"boundaryProcessor::afterIteration",
deviceRPolicyStatic(0,s),
LAMBDA_HD(uint32 i, uint32& transferToUpdate)
{
if(p.pointInNegativeSide(points(i)))
{
transferD(i)=1;
transferToUpdate++;
}
},
numTransfer
);
pOutput<<"Numebr to be transfered "<< numTransfer<<endl;
uint32Vector_D transferIndices("transferIndices");
uint32Vector_D keepIndices("keepIndices");
pFlow::boundaryBaseKernels::createRemoveKeepIndices
(
indexList(),
numTransfer,
transferFlags,
transferIndices,
keepIndices
);
// delete transfer point from this processor
if( !setRemoveKeepIndices(transferIndices, keepIndices))
{
fatalErrorInFunction<<
"error in setting transfer and keep points in boundary "<< name()<<endl;
return false;
}
return true;
}
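`afterIteration` uses a flag-and-count idiom: a single `Kokkos::parallel_reduce` both marks the points lying on the negative side of the boundary plane and sums how many were marked, and that count then drives `createRemoveKeepIndices`. Below is a self-contained sketch of the same idiom in plain Kokkos, with hypothetical data; phasicFlow's wrappers such as `uint32Vector_D`, `LAMBDA_HD`, and `deviceRPolicyStatic` are intentionally not used here:

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[])
{
    Kokkos::initialize(argc, argv);
    {
        const int n = 1000;
        Kokkos::View<double*>   x("x", n);
        Kokkos::View<unsigned*> flags("flags", n);   // zero-initialized by Kokkos

        // fill x with some values; entries below zero play the role of
        // points on the negative side of the plane
        Kokkos::parallel_for("fill", n, KOKKOS_LAMBDA(const int i) {
            x(i) = i - 500.0;
        });

        // flag the "negative side" entries and count them in one pass
        unsigned numFlagged = 0;
        Kokkos::parallel_reduce(
            "flagAndCount", n,
            KOKKOS_LAMBDA(const int i, unsigned& local)
            {
                if (x(i) < 0.0)
                {
                    flags(i) = 1u;
                    ++local;
                }
            },
            numFlagged);

        // numFlagged now holds the number of flagged points (500 here)
    }
    Kokkos::finalize();
    return 0;
}
```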

View File

@ -1,7 +1,7 @@
/*------------------------------- phasicFlow ---------------------------------
O C enter of
O O E ngineering and
O O M ultiscale modeling of
O C enter of
O O E ngineering and
O O M ultiscale modeling of
OOOOOOO F luid flow
------------------------------------------------------------------------------
Copyright (C): www.cemf.ir
@ -21,7 +21,6 @@ Licence:
#ifndef __boundaryProcessor_hpp__
#define __boundaryProcessor_hpp__
#include "boundaryBase.hpp"
#include "mpiTypes.hpp"
#include "dataSender.hpp"
@ -30,86 +29,82 @@ Licence:
namespace pFlow::MPI
{
class boundaryProcessor
:
public boundaryBase
{
private:
class boundaryProcessor
: public boundaryBase
{
public:
using pointFieldAccessType = typename boundaryBase::pointFieldAccessType;
uint32 neighborProcNumPoints_ = 0;
private:
uint32 neighborProcNumPoints_ = 0;
uint32 thisNumPoints_;
uint32 thisNumPoints_ = 0;
realx3Vector_D neighborProcPoints_;
realx3Vector_D neighborProcPoints_;
mutable Request sizeRequest_;
dataSender<realx3> sender_;
mutable Request sSizeRequest_;
dataReciever<realx3> reciever_;
int req_=0;
mutable bool dataRecieved_ = true;
mutable bool sizeObtained_ = true;
uint32 numToTransfer_ = 0;
mutable dataSender<realx3> sender_;
uint32 numToRecieve_ = 0;
mutable dataReciever<realx3> reciever_;
uint32Vector_D transferIndices_{"transferIndices"};
mutable bool dataRecieved_ = true;
void checkSize() const;
void checkSize()const;
void checkDataRecieved() const;
void checkDataRecieved()const;
/// @brief Update processor boundary data for this processor
/// @param step It is either 1 or 2 in the input to indicate
/// the update step
/// @return true if successful
/// @details This method is called by boundaryList two times to
/// allow processor boundaries to exchange data in two steps.
/// The first step is a buffered non-blocking send and the second
/// step is a non-blocking receive to get data.
bool updataBoundary(int step) override;
/// @brief Update processor boundary data for this processor
/// @param step It is either 1 or 2 in the input to indicate
/// the update step
/// @return true if successful
/// @details This method is called by boundaryList two times to
/// allow processor boundaries to exchange data in two steps.
/// The first step is a buffered non-blocking send and the second
/// step is a non-blocking receive to get data.
bool updataBoundary(int step)override;
bool transferData(int step) override;
public:
public:
TypeInfo("boundary<processor>");
TypeInfo("boundary<processor>");
boundaryProcessor(
const dictionary &dict,
const plane &bplane,
internalPoints &internal,
boundaryList &bndrs,
uint32 thisIndex);
boundaryProcessor(
const dictionary& dict,
const plane& bplane,
internalPoints& internal,
boundaryList& bndrs,
uint32 thisIndex
);
~boundaryProcessor() override = default;
~boundaryProcessor() override = default;
add_vCtor(
boundaryBase,
boundaryProcessor,
dictionary);
add_vCtor
(
boundaryBase,
boundaryProcessor,
dictionary
);
bool beforeIteration(uint32 iterNum, real t, real dt) override;
bool beforeIteration(uint32 iterNum, real t, real dt) override;
bool iterate(uint32 iterNum, real t, real dt) override;
bool iterate(uint32 iterNum, real t, real dt) override;
bool afterIteration(uint32 iterNum, real t, real dt) override;
bool afterIteration(uint32 iterNum, real t, real dt) override;
/// @brief Return number of points in the neighbor processor boundary.
/// This is overridden from boundaryBase.
uint32 neighborProcSize() const override;
/// @brief Return number of points in the neighbor processor boundary.
/// This is overridden from boundaryBase.
uint32 neighborProcSize() const override;
/// @brief Return a reference to point positions in the neighbor
/// processor boundary.
realx3Vector_D &neighborProcPoints() override;
/// @brief Return a reference to point positions in the neighbor
/// processor boundary.
realx3Vector_D& neighborProcPoints() override;
/// @brief Return a const reference to point positions in the
/// neighbor processor boundary.
const realx3Vector_D& neighborProcPoints() const override;
};
/// @brief Return a const reference to point positions in the
/// neighbor processor boundary.
const realx3Vector_D &neighborProcPoints() const override;
};
} // namespace pFlow::MPI

View File

@ -27,13 +27,11 @@ private:
BufferVectorType buffer_;
std::vector<T> buffer0_;
int fromProc_;
int tag_;
Request recvRequest_;
mutable Request recvRequest_ = RequestNull;
public:
@ -46,34 +44,40 @@ public:
~dataReciever()=default;
uint32 waitBufferForUse()const
{
if(recvRequest_ != RequestNull)
{
Status status;
MPI_Wait(&recvRequest_, &status);
int count;
CheckMPI(getCount<T>(&status, count), true);
return static_cast<uint32>(count);
}
else
return buffer_.size();
}
void recieveData(
const localProcessors& processors,
uint32 numToRecv
)
{
waitBufferForUse();
buffer_.clear();
buffer_.resize(numToRecv);
buffer0_.clear();
buffer0_.resize(numToRecv);
MPI_Status status;
/*CheckMPI(recv(
buffer_.getSpan(),
fromProc_,
tag_,
processors.localCommunicator(),
&status), true);*/
MPI_Recv(
buffer0_.data(),
buffer0_.size(),
realx3Type__,
fromProc_,
tag_,
processors.localCommunicator(),
&status
CheckMPI(
Irecv(
buffer_.getSpan(),
fromProc_,
tag_,
processors.localCommunicator(),
&recvRequest_
),
true
);
int c;
getCount<realx3>(&status, c);
pOutput<<"Number of data recieved "<<c<<endl;
}
auto& buffer()
@ -86,20 +90,6 @@ public:
return buffer_;
}
uint32 waitComplete()
{
/*Status status;
CheckMPI(MPI_Wait(&recvRequest_, &status), true);
int count;
CheckMPI(getCount<T>(&status, count), true);
return static_cast<uint32>(count);*/
return buffer_.size();
}
};
}
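`dataReciever` now posts a non-blocking `Irecv` into its device buffer and defers the wait: `waitBufferForUse` blocks on the outstanding request (if any) and returns the received element count, which is exactly what `checkDataRecieved` in `processorBoundaryField` and `boundaryProcessor` consume earlier in this diff. A hedged usage sketch, following the member names used in `boundaryProcessor`:

```cpp
// Hypothetical usage; reciever_ is a dataReciever<realx3> member and
// numToRecieve_ was exchanged with the neighbor beforehand (see transferData).
reciever_.recieveData(pFlowProcessors(), numToRecieve_);  // posts the Irecv

// ... computation can overlap with the transfer here ...

uint32 nRecv = reciever_.waitBufferForUse();  // blocks, returns element count
const auto& incoming = reciever_.buffer();    // device buffer with nRecv points
```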

View File

@ -26,15 +26,13 @@ public:
private:
//BufferVectorType buffer_;
std::vector<T> buffer_;
BufferVectorType buffer_;
int toProc_;
int tag_;
Request sendRequest_ = RequestNull;
mutable Request sendRequest_ = RequestNull;
public:
@ -44,7 +42,22 @@ public:
tag_(tag)
{}
~dataSender()=default;
~dataSender()
{
if(sendRequest_ != RequestNull)
{
MPI_Request_free(&sendRequest_);
}
}
bool waitBufferForUse()const
{
if(sendRequest_ != RequestNull)
{
MPI_Wait(&sendRequest_, StatusesIgnore);
}
return true;
}
void sendData(
const localProcessors& processors,
@ -52,17 +65,21 @@ public:
)
{
using RPolicy = Kokkos::RangePolicy<
DefaultExecutionSpace,
execution_space,
Kokkos::Schedule<Kokkos::Static>,
Kokkos::IndexType<pFlow::uint32>>;
uint32 n = scatterField.size();
// make sure the buffer is ready to be used and free
// the previous request (if any).
waitBufferForUse();
// clear the buffer to prevent data copy if capacity increases
buffer_.clear();
buffer_.resize(n);
auto* buffView = buffer_.data();
const auto& buffView = buffer_.deviceViewAll();
Kokkos::parallel_for(
"dataSender::sendData",
@ -73,26 +90,20 @@ public:
}
);
Kokkos::fence();
auto req = MPI_REQUEST_NULL;
MPI_Isend(
buffer_.data(),
buffer_.size(),
realx3Type__,
toProc_,
tag_,
processors.localCommunicator(),
&req);
CheckMPI(
Isend(buffer_.getSpan(),
toProc_,
tag_,
processors.localCommunicator(),
&sendRequest_
),
true
);
/*CheckMPI(send(
buffer_.getSpan(),
toProc_,
tag_,
processors.localCommunicator(),
MPI_STATUS_IGNORE), true);*/
}
/*auto& buffer()
auto& buffer()
{
return buffer_;
}
@ -100,17 +111,20 @@ public:
const auto& buffer()const
{
return buffer_;
}*/
}
bool sendComplete()
{
return true;
/*int test;
MPI_Test(&sendRequest_, &test, StatusIgnore);
if(test)
return true;
int test;
if(sendRequest_ != RequestNull)
{
MPI_Test(&sendRequest_, &test, StatusIgnore);
return test;
}
else
return false;*/
{
return true;
}
}
};
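On the send side, `sendData` first calls `waitBufferForUse` so the buffer of a still-in-flight `Isend` from the previous call cannot be overwritten, then gathers the scattered field into the persistent device buffer with a `parallel_for` and posts a single non-blocking `Isend`; `sendComplete` lets the caller poll, and the destructor frees any leftover request. A hedged sketch of pairing it with the receiver (member names follow `boundaryProcessor`; `transferPoints` is the `pointFieldAccessType` built in `transferData`):

```cpp
// Hypothetical pairing, not repository code.
sender_.sendData(pFlowProcessors(), transferPoints);      // wait, gather, Isend
reciever_.recieveData(pFlowProcessors(), numToRecieve_);  // post matching Irecv

// ... overlap computation with communication ...

reciever_.waitBufferForUse();  // incoming data is now safe to read
sender_.waitBufferForUse();    // optional: outgoing buffer may be reused
```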

View File

@ -156,3 +156,9 @@ void pFlow::processorOstream::indent()
checkForPrefix();
Ostream::indent();
}
pFlow::processorOstream &pFlow::processorOstream::setColor(const char *colorCode)
{
Ostream::write(colorCode);
return *this;
}

View File

@ -139,6 +139,8 @@ public:
/// Add indentation characters
void indent() override;
processorOstream& setColor(const char* colorCode);
}; // processorOstream

View File

@ -33,7 +33,7 @@ namespace pFlow
}
#define INFORMATION pFlow::pOutput<<boldChar<<magentaColor<<"> INFO: "<<defaultColor<<magentaColor
#define INFORMATION pFlow::pOutput.setColor(boldChar).setColor(magentaColor)<<"> INFO: "<<defaultColor<<magentaColor
#define END_INFO defaultColor<<pFlow::endl
#define REPORT(n) pFlow::mOutput.space(2*n)
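The `INFORMATION` macro now pushes its color codes through the new `processorOstream::setColor`, so the escape sequences go through the same per-processor prefixing as the message itself; call sites are unchanged. For example, the message emitted by `twoPartContactSearch` earlier in this diff:

```cpp
INFORMATION << "Particle-particle contact pair container capacity increased from "
            << oldCap << " to " << ppPairs.capacity()
            << " in contact search in boundary region." << END_INFO;
```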