MPI transfer for the first corner with 3D blocking now complete. Disabled/enabled some error checking for development

2019-12-27 13:43:22 +02:00
parent bd0cc3ee20
commit e86b082c98
3 changed files with 126 additions and 21 deletions
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -47,8 +47,17 @@ struct device_s {
 #if AC_MPI_ENABLED
    // Declare memory for buffers needed for packed data transfers here
 #define CORNER_000 (0)
 #define CORNER_100 (1)
 #define CORNER_010 (2)
 #define CORNER_110 (3)
 #define CORNER_001 (4)
 #define CORNER_101 (5)
 #define CORNER_011 (6)
 #define CORNER_111 (7)
 #define NUM_CORNERS (8)
    int3 corner_dims;
-    AcReal* packed_corners[NUM_VTXBUF_HANDLES][8];
+    AcReal* packed_corners[NUM_VTXBUF_HANDLES][NUM_CORNERS];
 #define EDGE_X (0) // left to right
 #define EDGE_Y (1) // bottom to top
@@ -68,8 +77,9 @@ struct device_s {
 #define EDGE_100A (9)
 #define EDGE_010A (10)
 #define EDGE_110A (11)
 #define NUM_EDGES (12)
    int3 edge_dims[3];
-    AcReal* packed_edges[NUM_VTXBUF_HANDLES][12];
+    AcReal* packed_edges[NUM_VTXBUF_HANDLES][NUM_EDGES];
 #define SIDE_XY (0) // Front/back
 #define SIDE_XZ (1) // Top/bottom
@@ -83,8 +93,9 @@ struct device_s {
 #define SIDE_LEFT (4)
 #define SIDE_RIGHT (5)
 #define NUM_SIDES (6)
    int3 side_dims[3];
-    AcReal* packed_sides[NUM_VTXBUF_HANDLES][6];
+    AcReal* packed_sides[NUM_VTXBUF_HANDLES][NUM_SIDES];
 #endif
 };
@@ -150,7 +161,9 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
        device->corner_dims       = (int3){NGHOST, NGHOST, NGHOST};
        const size_t corner_bytes = device->corner_dims.x * device->corner_dims.y *
                                    device->corner_dims.z * sizeof(AcReal);
-        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_corners, corner_bytes));
+
        for (int j = 0; j < NUM_CORNERS; ++j)
            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_corners[i][j], corner_bytes));
    }
    // Edges
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -167,19 +180,19 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
            device->edge_dims[EDGE_Y] = (int3){NGHOST, device_config.int_params[AC_ny], NGHOST};
            const size_t edge_bytes   = device->edge_dims[EDGE_Y].x * device->edge_dims[EDGE_Y].y *
                                      device->edge_dims[EDGE_Y].z * sizeof(AcReal);
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][4], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_000U], edge_bytes));
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][5], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_100U], edge_bytes));
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][6], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_001U], edge_bytes));
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][7], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_101U], edge_bytes));
        }
        { // front-back
            device->edge_dims[EDGE_Z] = (int3){NGHOST, NGHOST, device_config.int_params[AC_nz]};
            const size_t edge_bytes   = device->edge_dims[EDGE_Z].x * device->edge_dims[EDGE_Z].y *
                                      device->edge_dims[EDGE_Z].z * sizeof(AcReal);
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][8], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_000A], edge_bytes));
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][9], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_100A], edge_bytes));
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][10], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_010A], edge_bytes));
-            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][11], edge_bytes));
+            ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_110A], edge_bytes));
        }
    }
@@ -251,13 +264,13 @@ acDeviceDestroy(Device device)
 #if AC_MPI_ENABLED
    // Free data required for packed tranfers here (cudaFree)
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        for (int j = 0; j < 8; ++j)
+        for (int j = 0; j < NUM_CORNERS; ++j)
            cudaFree(device->packed_corners[i][j]);
-        for (int j = 0; j < 12; ++j)
+        for (int j = 0; j < NUM_EDGES; ++j)
            cudaFree(device->packed_edges[i][j]);
-        for (int j = 0; j < 6; ++j)
+        for (int j = 0; j < NUM_SIDES; ++j)
            cudaFree(device->packed_sides[i][j]);
    }
 #endif
@@ -1214,11 +1227,92 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
    }
 }
 #include "kernels/packing.cuh"
 static AcResult
 acDeviceCommunicateHalos(const Device device)
 {
    acDeviceSynchronizeStream(device, STREAM_ALL);
    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;
    MPI_Request send_corner_req[NUM_VTXBUF_HANDLES][NUM_CORNERS];
    MPI_Request recv_corner_req[NUM_VTXBUF_HANDLES][NUM_CORNERS];
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    const int3 decomposition = decompose(nprocs);
    int3 pid3d               = getPid3D(pid, decomposition);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        // Recv corners
        {
            /// From 000
            const int recv_pid = getPid1D(pid3d + (int3){1, 1, 1}, decomposition);
            const size_t count = device->corner_dims.x * //
                                 device->corner_dims.y * //
                                 device->corner_dims.z;
            MPI_Irecv(&device->packed_corners[i][CORNER_111], count, datatype, recv_pid, CORNER_000,
                      MPI_COMM_WORLD, &(recv_corner_req[i][CORNER_111]));
        }
        {
            /// 100
        } {
            /// 010
        } { /// 110
        }
    }
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        // Pack data and send corners
        { /// 000
            const int send_pid = getPid1D(pid3d + (int3){-1, -1, -1}, decomposition);
            const size_t count = device->corner_dims.x * //
                                 device->corner_dims.y * //
                                 device->corner_dims.z;
            acKernelPackData(device->streams[STREAM_DEFAULT], device->vba.in[i],
                             (int3){NGHOST, NGHOST, NGHOST}, device->corner_dims,
                             device->packed_corners[i][CORNER_000]);
            MPI_Isend(&device->packed_corners[i][CORNER_000], count, datatype, send_pid, CORNER_000,
                      MPI_COMM_WORLD, &(send_corner_req[i][CORNER_000]));
        }
        {
            /// 100
        } {
            /// 010
        } { /// 110
        }
    }
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        // Unpack data
        {
            /// 000
            const int nx = device->local_config.int_params[AC_nx];
            const int ny = device->local_config.int_params[AC_ny];
            const int nz = device->local_config.int_params[AC_nz];
            MPI_Wait(&(recv_corner_req[i][CORNER_111]), MPI_STATUS_IGNORE);
            acKernelUnpackData(device->streams[STREAM_DEFAULT],
                               device->packed_corners[i][CORNER_111], device->corner_dims,
                               (int3){nx, ny, nz}, device->vba.in[i]);
        }
        {
            /// 100
        } {
            /// 010
        } { /// 110
        }
    }
    // for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
    //    MPI_Waitall(NUM_CORNERS, send_corner_req[i], MPI_STATUSES_IGNORE);
    // for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
    //    MPI_Wait(&(send_corner_req[i][CORNER_000]), MPI_STATUS_IGNORE);
    // TODO
    WARNING("acDeviceCommunicateHalos not yet implemented. Tests will fail (bounds must be "
            "up-to-date before calling acDeviceGatherMeshMPI)");
@@ -1334,7 +1428,7 @@ acDeviceRunMPITest(void)
    // Attempt to enable peer access with all neighbors in the node
    for (int i = 0; i < devices_per_node; ++i) {
        cudaSetDevice(device->id);
-        WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(i, 0));
+        // WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(i, 0)); // TODO RE-ENABLE
    }
    /*
    // Attempt to enable peer access to the most expensive neighbors (sides)
--- a/src/core/kernels/packing.cu
+++ b/src/core/kernels/packing.cu
@@ -27,6 +27,7 @@
 #include "packing.cuh"
 #include "common.cuh"
 #include "src/core/errchk.h"
 __global__ void
 kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 packed_dimensions,
@@ -49,7 +50,8 @@ kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 p
    const int k_unpacked = k_packed + unpacked_start.z;
    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
-    const int packed_idx   = i_packed + j_packed * packed_dimensions.x +
+    const int packed_idx   = i_packed +                     //
                           j_packed * packed_dimensions.x + //
                           k_packed * packed_dimensions.x * packed_dimensions.y;
    packed[packed_idx] = unpacked[unpacked_idx];
@@ -76,13 +78,14 @@ kernel_unpack_data(const AcReal* packed, const int3 packed_dimensions, const int
    const int k_unpacked = k_packed + unpacked_start.z;
    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
-    const int packed_idx   = i_packed + j_packed * packed_dimensions.x +
+    const int packed_idx   = i_packed +                     //
                           j_packed * packed_dimensions.x + //
                           k_packed * packed_dimensions.x * packed_dimensions.y;
    unpacked[unpacked_idx] = packed[packed_idx];
 }
-static AcResult
+AcResult
 acKernelPackData(const cudaStream_t stream, const AcReal* unpacked, const int3 unpacked_start,
                 const int3 packed_dimensions, AcReal* packed)
 {
@@ -92,11 +95,12 @@ acKernelPackData(const cudaStream_t stream, const AcReal* unpacked, const int3 u
                   (unsigned int)ceil(packed_dimensions.z / (float)tpb.z));
    kernel_pack_data<<<bpg, tpb, 0, stream>>>(unpacked, unpacked_start, packed_dimensions, packed);
    ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
    return AC_SUCCESS;
 }
-static AcResult
+AcResult
 acKernelUnpackData(const cudaStream_t stream, const AcReal* packed, const int3 packed_dimensions,
                   const int3 unpacked_start, AcReal* unpacked)
 {
@@ -107,7 +111,7 @@ acKernelUnpackData(const cudaStream_t stream, const AcReal* packed, const int3 p
    kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, packed_dimensions, unpacked_start,
                                                unpacked);
-
+    ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
    return AC_SUCCESS;
 }
--- a/src/core/kernels/packing.cuh
+++ b/src/core/kernels/packing.cuh
@@ -27,6 +27,13 @@
 #pragma once
 #include "astaroth.h"
 AcResult acKernelPackData(const cudaStream_t stream, const AcReal* unpacked,
                          const int3 unpacked_start, const int3 packed_dimensions, AcReal* packed);
 AcResult acKernelUnpackData(const cudaStream_t stream, const AcReal* packed,
                            const int3 packed_dimensions, const int3 unpacked_start,
                            AcReal* unpacked);
 AcResult acKernelPackCorner(void);
 AcResult acKernelUnpackCorner(void);