Rewrote the previous implementation; it now fully works (verified) and gives the speedup we want. Communication latency is now completely hidden on at least two nodes (8 GPUs). Scaling looks very promising.

2020-04-06 17:28:02 +03:00
parent 37f1c841a3
commit 427a3ac5d8
2 changed files with 93 additions and 57 deletions
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -3,6 +3,7 @@

 #if AC_MPI_ENABLED
 #include <mpi.h>
+#include <stdbool.h>

 #define AC_MPI_UNIDIRECTIONAL_COMM (0)
 #define AC_MPI_RT_PINNING (1)
@@ -14,7 +15,8 @@ typedef struct {

 #if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
    AcReal* data_pinned;
-#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
+    bool pinned; // Set if data was received to pinned memory
+#endif           // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)

 #if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
    MPI_Win win; // MPI window for RMA